432 lines
16 KiB
Plaintext
432 lines
16 KiB
Plaintext
|
{
|
||
|
"cells": [
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 1,
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"import pandas as pd\n",
|
||
|
"import numpy as np\n",
|
||
|
"\n",
|
||
|
class Predictor:
    """Common base for the linear models defined in this notebook.

    Each model stores a single weight vector in ``coefficients``: index 0 is
    the intercept and the remaining entries are the per-feature slopes, in the
    column order of the ``X`` that was passed to ``fit``.
    """

    def __init__(self):
        # Stays None until fit() has been called.
        self.coefficients = None

    @staticmethod
    def _design_matrix(X):
        """Return ``X`` (2-D, one row per sample) with a leading column of ones."""
        X = np.asarray(X, dtype=float)
        return np.hstack((np.ones((X.shape[0], 1)), X))

    def fit(self, X, y):
        """Estimate ``coefficients`` from features ``X`` and targets ``y``.

        Raises:
            NotImplementedError: always; subclasses must override this method.
        """
        raise NotImplementedError(f"{type(self).__name__} must implement fit()")

    def predict(self, X):
        """Return ``[1, X] @ coefficients`` for every row of ``X``.

        Raises:
            RuntimeError: if the model has not been fitted yet.
        """
        if self.coefficients is None:
            raise RuntimeError("fit() must be called before predict()")
        return self._design_matrix(X) @ self.coefficients


class LinearRegression(Predictor):
    """Ordinary least squares with an intercept."""

    def fit(self, X, y):
        """Solve ``min_w ||[1, X] w - y||^2``.

        A least-squares solver is used instead of inverting ``X.T @ X`` so that
        collinear (rank-deficient) features yield the minimum-norm solution
        rather than a LinAlgError or numerically meaningless weights.
        """
        design = self._design_matrix(X)
        solution, *_ = np.linalg.lstsq(design, np.asarray(y, dtype=float), rcond=None)
        self.coefficients = solution


class RidgeRegression(Predictor):
    """Least squares with an L2 penalty ``alpha`` on the slopes (not the intercept)."""

    def __init__(self, alpha):
        super().__init__()
        self.alpha = alpha

    def fit(self, X, y):
        """Solve ``([1, X].T [1, X] + alpha * I') w = [1, X].T y``.

        ``I'`` is the identity with its (0, 0) entry zeroed so the intercept is
        left unpenalised. The system is solved directly rather than through an
        explicit matrix inverse.
        """
        design = self._design_matrix(X)
        penalty = self.alpha * np.eye(design.shape[1])
        penalty[0, 0] = 0.0
        gram = design.T @ design + penalty
        self.coefficients = np.linalg.solve(gram, design.T @ np.asarray(y, dtype=float))


class LassoRegression(Predictor):
    """Least squares with an L1 penalty ``alpha`` on the slopes (not the intercept).

    Minimises ``0.5 * ||[1, X] w - y||^2 + alpha * ||w[1:]||_1`` with
    ``num_iters`` proximal-gradient steps of size ``lr``.
    """

    def __init__(self, alpha, num_iters=1000, lr=0.01):
        super().__init__()
        self.alpha = alpha
        self.num_iters = num_iters
        self.lr = lr

    def fit(self, X, y):
        """Run proximal gradient descent from an all-zero starting point.

        Each iteration takes a gradient step on the squared-error term and then
        soft-thresholds the slopes by ``lr * alpha``. Unlike a plain
        ``alpha * sign(w)`` subgradient step, the thresholding sets small
        slopes to exactly 0.0. The gradient is a sum over samples (not a mean),
        so ``lr`` has to shrink as the number of rows grows.
        """
        design = self._design_matrix(X)
        targets = np.asarray(y, dtype=float)
        weights = np.zeros(design.shape[1])
        threshold = self.lr * self.alpha
        for _ in range(self.num_iters):
            gradient = design.T @ (design @ weights - targets)
            weights = weights - self.lr * gradient
            # Proximal step for the L1 term; index 0 (intercept) is skipped.
            slopes = weights[1:]
            weights[1:] = np.sign(slopes) * np.maximum(np.abs(slopes) - threshold, 0.0)
        self.coefficients = weights
|
||
|
],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"source": [
|
||
|
"Data preprocessing and loading"
|
||
|
],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 2,
|
||
|
"metadata": {
|
||
|
"collapsed": true
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
def preprocess(df, target="target", z_thresh=3):
    """Impute, drop outlier rows, and z-score the feature columns.

    Steps, in order:
      1. Missing values in numeric columns are replaced by the column mean.
      2. Rows in which any numeric column (the target included) lies
         ``z_thresh`` or more standard deviations from its mean are dropped.
      3. Every numeric column except ``target`` is standardised using the mean
         and standard deviation of the rows that survived step 2.

    Args:
        df: input DataFrame.
        target: name of the column that is left un-normalised.
        z_thresh: rows are kept only if every |z-score| is strictly below this.

    Returns:
        ``(df_filtered, df_normalized)``: the imputed and filtered frame, and a
        copy of it with standardised features. Both keep the original index.
    """

    def safe_std(frame):
        # A constant column has std 0 and a single-row frame has std NaN. Using
        # 1 there makes the column's z-scores 0 instead of NaN, so it no longer
        # causes every row to be discarded or the normalised output to be NaN.
        return frame.std().replace(0, 1).fillna(1)

    # Handle missing values
    filled = df.fillna(df.mean(numeric_only=True))

    # Remove outliers
    numeric = filled.select_dtypes(include="number")
    z_scores = ((numeric - numeric.mean()) / safe_std(numeric)).abs()
    kept = filled[(z_scores < z_thresh).all(axis=1)]

    # Normalize the data using z-score normalization except for the target column
    normalized = kept.copy()
    feature_cols = [
        column
        for column in kept.select_dtypes(include="number").columns
        if column != target
    ]
    if feature_cols:
        features = kept[feature_cols]
        normalized[feature_cols] = (features - features.mean()) / safe_std(features)

    return kept, normalized
|
||
|
"\n",
|
||
|
def train_test_split(X, y, test_size=0.2, random_state=None):
    """Shuffle the rows of ``X`` and ``y`` together and split them in two.

    Args:
        X: array-like with one row per sample.
        y: array-like with one entry per sample, aligned with ``X``.
        test_size: fraction of rows (0 to 1) placed in the test split.
        random_state: optional seed for a dedicated generator. When None the
            shuffle uses NumPy's global RNG, so ``np.random.seed`` still
            controls it.

    Returns:
        ``X_train, X_test, y_train, y_test`` as NumPy arrays.

    Raises:
        ValueError: if ``X`` and ``y`` differ in length or ``test_size`` is
            outside [0, 1].
    """
    # Positional indexing below is only correct on arrays; pandas objects would
    # be indexed by label instead.
    X = np.asarray(X)
    y = np.asarray(y)
    n_samples = X.shape[0]
    if y.shape[0] != n_samples:
        raise ValueError(
            f"X and y have different lengths: {n_samples} != {y.shape[0]}"
        )
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be within [0, 1], got {test_size}")

    if random_state is None:
        order = np.arange(n_samples)
        np.random.shuffle(order)
    else:
        order = np.random.default_rng(random_state).permutation(n_samples)

    # Round before truncating: (1 - 0.9) * 10 evaluates to 0.9999999999999998,
    # which a bare int() would turn into 0 training rows instead of 1.
    split = int(round((1 - test_size) * n_samples, 9))

    train_rows, test_rows = order[:split], order[split:]
    return X[train_rows], X[test_rows], y[train_rows], y[test_rows]
|
||
|
"\n",
|
||
|
def load_data(normalize=True, path="diabetes.csv", test_size=0.2):
    """Read the diabetes CSV, preprocess it, and return a train/test split.

    Args:
        normalize: if True the features come from the z-scored frame returned
            by ``preprocess``; otherwise from the filtered, un-normalised one.
            ``preprocess`` never normalises the ``target`` column, so ``y`` is
            the same in both cases.
        path: CSV file to read; it must contain a ``target`` column.
        test_size: fraction of rows placed in the test split.

    Returns:
        ``X_train, X_test, y_train, y_test, df`` where ``df`` is the
        preprocessed but un-normalised DataFrame.
    """
    # Load the diabetes dataset
    raw = pd.read_csv(path)

    # Preprocess the dataset
    df, df_norm = preprocess(raw)

    # Split the dataset into train and test sets
    frame = df_norm if normalize else df
    X = frame.drop("target", axis=1).values
    y = frame["target"].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    return X_train, X_test, y_train, y_test, df
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 3,
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": " age sex bmi bp s1 s2 s3 \\\n0 0.794887 1.061173 1.357096 0.459459 -0.917834 -0.734476 -0.958901 \n1 -0.038221 -0.940162 -1.095193 -0.557425 -0.148672 -0.395182 1.714481 \n2 1.779468 1.061173 0.983414 -0.121617 -0.947417 -0.720904 -0.708271 \n3 -1.855910 -0.940162 -0.231053 -0.775328 0.295075 0.561626 -0.791815 \n4 0.113253 -0.940162 -0.768221 0.459459 0.117576 0.358049 0.210704 \n\n s4 s5 s6 target \n0 -0.035628 0.434041 -0.356981 151.0 \n1 -0.856638 -1.429397 -1.923328 75.0 \n2 -0.035628 0.074059 -0.531020 141.0 \n3 0.785382 0.492755 -0.182943 206.0 \n4 -0.035628 -0.661884 -0.966116 135.0 ",
|
||
|
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>age</th>\n <th>sex</th>\n <th>bmi</th>\n <th>bp</th>\n <th>s1</th>\n <th>s2</th>\n <th>s3</th>\n <th>s4</th>\n <th>s5</th>\n <th>s6</th>\n <th>target</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0.794887</td>\n <td>1.061173</td>\n <td>1.357096</td>\n <td>0.459459</td>\n <td>-0.917834</td>\n <td>-0.734476</td>\n <td>-0.958901</td>\n <td>-0.035628</td>\n <td>0.434041</td>\n <td>-0.356981</td>\n <td>151.0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>-0.038221</td>\n <td>-0.940162</td>\n <td>-1.095193</td>\n <td>-0.557425</td>\n <td>-0.148672</td>\n <td>-0.395182</td>\n <td>1.714481</td>\n <td>-0.856638</td>\n <td>-1.429397</td>\n <td>-1.923328</td>\n <td>75.0</td>\n </tr>\n <tr>\n <th>2</th>\n <td>1.779468</td>\n <td>1.061173</td>\n <td>0.983414</td>\n <td>-0.121617</td>\n <td>-0.947417</td>\n <td>-0.720904</td>\n <td>-0.708271</td>\n <td>-0.035628</td>\n <td>0.074059</td>\n <td>-0.531020</td>\n <td>141.0</td>\n </tr>\n <tr>\n <th>3</th>\n <td>-1.855910</td>\n <td>-0.940162</td>\n <td>-0.231053</td>\n <td>-0.775328</td>\n <td>0.295075</td>\n <td>0.561626</td>\n <td>-0.791815</td>\n <td>0.785382</td>\n <td>0.492755</td>\n <td>-0.182943</td>\n <td>206.0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>0.113253</td>\n <td>-0.940162</td>\n <td>-0.768221</td>\n <td>0.459459</td>\n <td>0.117576</td>\n <td>0.358049</td>\n <td>0.210704</td>\n <td>-0.035628</td>\n <td>-0.661884</td>\n <td>-0.966116</td>\n <td>135.0</td>\n </tr>\n </tbody>\n</table>\n</div>"
|
||
|
},
|
||
|
"execution_count": 3,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
# Load the diabetes dataset
df = pd.read_csv("diabetes.csv")
# Preprocess the dataset; `df` is rebound to the imputed and outlier-filtered
# frame, and `df_norm` is its copy with z-scored feature columns.
df, df_norm = preprocess(df)
# A cell renders only its final expression, so the former bare `df.head()` in
# the middle of this cell was evaluated and thrown away. Put `df.head()` in a
# cell of its own to inspect the un-normalised frame.
df_norm.head()
|
||
|
],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"source": [],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"source": [
|
||
|
"Load the data"
|
||
|
],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 4,
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"X_train: (353, 10)\n",
|
||
|
"X_test: (89, 10)\n",
|
||
|
"y_train: (353,)\n",
|
||
|
"y_test: (89,)\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
from sklearn.datasets import load_diabetes

# Seed NumPy's global RNG so the shuffle inside train_test_split (and any
# random initialisation in later cells) gives the same numbers on every
# Restart & Run All.
np.random.seed(42)

normalize = True
# Load the data
# Alternative CSV-based pipeline, currently disabled:
# X_train, X_test, y_train, y_test, df = load_data(normalize=normalize)
data = load_diabetes(scaled=True)
# split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.2)
print("X_train:", X_train.shape)
print("X_test:", X_test.shape)
print("y_train:", y_train.shape)
print("y_test:", y_test.shape)
|
||
|
],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"source": [
|
||
|
"Fit the models"
|
||
|
],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 5,
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
# Ordinary least squares baseline, trained on the training split.
linear_regression = LinearRegression()
linear_regression.fit(X=X_train, y=y_train)
|
||
|
],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 6,
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
# L2-penalised model with penalty strength alpha = 1, trained on the training split.
ridge_regression = RidgeRegression(alpha=1)
ridge_regression.fit(X=X_train, y=y_train)
|
||
|
],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 7,
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
# L1-penalised model fitted iteratively: 10000 steps with learning rate 0.001.
lasso_regression = LassoRegression(
    alpha=1,
    num_iters=10000,
    lr=0.001,
)
lasso_regression.fit(X=X_train, y=y_train)
|
||
|
],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"source": [
|
||
|
"Evaluate the models"
|
||
|
],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 8,
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"Linear Regression MSE train: 2886.03 test: 2801.42\n",
|
||
|
"Ridge Regression MSE train: 3417.29 test: 3129.15\n",
|
||
|
"Lasso Regression MSE train: 2905.98 test: 2812.49\n",
|
||
|
"Linear Regression RMSE train: 53.72 test: 52.93\n",
|
||
|
"Ridge Regression RMSE train: 58.46 test: 55.94\n",
|
||
|
"Lasso Regression RMSE train: 53.91 test: 53.03\n",
|
||
|
"Linear Regression R2 train: 0.51 test: 0.54\n",
|
||
|
"Ridge Regression R2 train: 0.42 test: 0.49\n",
|
||
|
"Lasso Regression R2 train: 0.51 test: 0.54\n",
|
||
|
"Linear Regression features sorted by their coefficients:\n",
|
||
|
"s1: -822.75\n",
|
||
|
"s5: 765.40\n",
|
||
|
"bmi: 514.60\n",
|
||
|
"s2: 424.92\n",
|
||
|
"bp: 355.26\n",
|
||
|
"sex: -241.22\n",
|
||
|
"s4: 230.86\n",
|
||
|
"s3: 129.59\n",
|
||
|
"s6: 40.86\n",
|
||
|
"age: -10.93\n",
|
||
|
"Ridge Regression features sorted by their coefficients:\n",
|
||
|
"bmi: 283.92\n",
|
||
|
"s5: 238.87\n",
|
||
|
"bp: 195.46\n",
|
||
|
"s3: -142.91\n",
|
||
|
"s4: 106.92\n",
|
||
|
"s6: 93.55\n",
|
||
|
"sex: -63.98\n",
|
||
|
"s2: -30.82\n",
|
||
|
"age: 24.08\n",
|
||
|
"s1: 0.97\n",
|
||
|
"Lasso Regression features sorted by their coefficients:\n",
|
||
|
"bmi: 528.98\n",
|
||
|
"s5: 494.25\n",
|
||
|
"bp: 348.14\n",
|
||
|
"sex: -230.35\n",
|
||
|
"s3: -197.76\n",
|
||
|
"s2: -137.48\n",
|
||
|
"s4: 127.48\n",
|
||
|
"s1: -101.61\n",
|
||
|
"s6: 40.95\n",
|
||
|
"age: -7.39\n",
|
||
|
"Linear Regression number of non-zero coefficients: 11\n",
|
||
|
"Ridge Regression number of non-zero coefficients: 11\n",
|
||
|
"Lasso Regression number of non-zero coefficients: 11\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
from sklearn.metrics import r2_score


def _mse(y_true, y_pred):
    """Mean squared error."""
    return np.mean((y_true - y_pred) ** 2)


def _rmse(y_true, y_pred):
    """Root mean squared error."""
    return np.sqrt(_mse(y_true, y_pred))


# Display name -> fitted model, in the order the report lines are printed.
models = {
    "Linear": linear_regression,
    "Ridge": ridge_regression,
    "Lasso": lasso_regression,
}

# Feature names are taken from the dataset the models were trained on rather
# than from the CSV-derived `df` of an earlier cell, so this cell no longer
# depends on diabetes.csv being present or having the same column order.
feature_names = list(data.feature_names)

# Predict once per model and reuse the result for every metric.
predictions = {
    name: (model.predict(X_train), model.predict(X_test))
    for name, model in models.items()
}

# Print each metric for all three models, keeping two decimal digits.
for metric_name, metric in (("MSE", _mse), ("RMSE", _rmse), ("R2", r2_score)):
    for name, (pred_train, pred_test) in predictions.items():
        print("{} Regression {} train: {:.2f} test: {:.2f}".format(
            name, metric_name, metric(y_train, pred_train), metric(y_test, pred_test)
        ))

# Rank features by absolute slope; coefficients[0] is the intercept and is skipped.
for name, model in models.items():
    print(f"{name} Regression features sorted by their coefficients:")
    ranked = sorted(
        zip(feature_names, model.coefficients[1:]),
        key=lambda pair: abs(pair[1]),
        reverse=True,
    )
    for feature, coef in ranked:
        print(f"{feature}: {coef:.2f}")

# print the number of non-zero coefficients (the intercept is included in the count)
for name, model in models.items():
    print(f"{name} Regression number of non-zero coefficients:", np.count_nonzero(model.coefficients))
|
||
|
],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 32,
|
||
|
"outputs": [],
|
||
|
"source": [],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"outputs": [],
|
||
|
"source": [],
|
||
|
"metadata": {
|
||
|
"collapsed": false,
|
||
|
"pycharm": {
|
||
|
"is_executing": true
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
],
|
||
|
"metadata": {
|
||
|
"kernelspec": {
|
||
|
"display_name": "Python 3",
|
||
|
"language": "python",
|
||
|
"name": "python3"
|
||
|
},
|
||
|
"language_info": {
|
||
|
"codemirror_mode": {
|
||
|
"name": "ipython",
|
||
|
"version": 2
|
||
|
},
|
||
|
"file_extension": ".py",
|
||
|
"mimetype": "text/x-python",
|
||
|
"name": "python",
|
||
|
"nbconvert_exporter": "python",
|
||
|
"pygments_lexer": "ipython2",
|
||
|
"version": "2.7.6"
|
||
|
}
|
||
|
},
|
||
|
"nbformat": 4,
|
||
|
"nbformat_minor": 0
|
||
|
}
|