diff --git a/assignment 4/iml_assginment4_solved.ipynb b/assignment 4/iml_assginment4_solved.ipynb new file mode 100644 index 0000000..a620b02 --- /dev/null +++ b/assignment 4/iml_assginment4_solved.ipynb @@ -0,0 +1,431 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "class Predictor:\n", + " def __init__(self):\n", + " self.coefficients = None\n", + "\n", + " def fit(self, X, y):\n", + " pass\n", + "\n", + " def predict(self, X):\n", + " pass\n", + "\n", + "class LinearRegression(Predictor):\n", + " def fit(self, X, y):\n", + " X = np.hstack((np.ones((X.shape[0], 1)), X))\n", + " self.coefficients = np.linalg.inv(X.T @ X) @ X.T @ y\n", + "\n", + " def predict(self, X):\n", + " X = np.hstack((np.ones((X.shape[0], 1)), X))\n", + " return X @ self.coefficients\n", + "\n", + "class RidgeRegression(Predictor):\n", + " def __init__(self, alpha):\n", + " super().__init__()\n", + " self.alpha = alpha\n", + "\n", + " def fit(self, X, y):\n", + " X = np.hstack((np.ones((X.shape[0], 1)), X))\n", + " I = np.eye(X.shape[1])\n", + " I[0, 0] = 0\n", + " self.coefficients = np.linalg.inv(X.T @ X + self.alpha * I) @ X.T @ y\n", + "\n", + " def predict(self, X):\n", + " X = np.hstack((np.ones((X.shape[0], 1)), X))\n", + " return X @ self.coefficients\n", + "\n", + "class LassoRegression(Predictor):\n", + " def __init__(self, alpha, num_iters=1000, lr=0.01):\n", + " super().__init__()\n", + " self.alpha = alpha\n", + " self.num_iters = num_iters\n", + " self.lr = lr\n", + "\n", + " def fit(self, X, y):\n", + " X = np.hstack((np.ones((X.shape[0], 1)), X))\n", + " self.coefficients = np.random.randn(X.shape[1])\n", + " for _ in range(self.num_iters):\n", + " self.coefficients -= self.lr * (X.T @ (X @ self.coefficients - y) + self.alpha * np.sign(self.coefficients))\n", + "\n", + " def predict(self, X):\n", + " X = np.hstack((np.ones((X.shape[0], 1)), X))\n", + " return X @ self.coefficients" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "Data Preprocessing and data loading" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def preprocess(df):\n", + " # Handle missing values\n", + " df = df.fillna(df.mean())\n", + "\n", + " # Remove outliers\n", + " z_scores = np.abs((df - df.mean()) / df.std())\n", + " df = df[(z_scores < 3).all(axis=1)]\n", + "\n", + " df_nomralized = df.copy()\n", + "\n", + " # Normalize the data using z-score normalization except for the target column\n", + " for column in df.columns:\n", + " if column == \"target\":\n", + " continue\n", + " df_nomralized[column] = (df_nomralized[column] - df_nomralized[column].mean()) / df_nomralized[column].std()\n", + "\n", + " return df, df_nomralized\n", + "\n", + "def train_test_split(X, y, test_size=0.2):\n", + " # Split the dataset into train and test sets\n", + " idx = np.arange(X.shape[0])\n", + " np.random.shuffle(idx)\n", + " X = X[idx]\n", + " y = y[idx]\n", + " split = int((1 - test_size) * X.shape[0])\n", + " X_train, X_test = X[:split], X[split:]\n", + " y_train, y_test = y[:split], y[split:]\n", + " return X_train, X_test, y_train, y_test\n", + "\n", + "def load_data(normalize=True):\n", + " # Load the diabetes dataset\n", + " df = pd.read_csv(\"diabetes.csv\")\n", + "\n", + " # Preprocess the dataset\n", + " df, df_norm = preprocess(df)\n", + "\n", 
+ " # Split the dataset into train and test sets\n", + " if normalize:\n", + " X = df_norm.drop(\"target\", axis=1).values\n", + " y = df_norm[\"target\"].values\n", + " else:\n", + " X = df.drop(\"target\", axis=1).values\n", + " y = df[\"target\"].values\n", + " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)\n", + " return X_train, X_test, y_train, y_test, df" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "outputs": [ + { + "data": { + "text/plain": " age sex bmi bp s1 s2 s3 \\\n0 0.794887 1.061173 1.357096 0.459459 -0.917834 -0.734476 -0.958901 \n1 -0.038221 -0.940162 -1.095193 -0.557425 -0.148672 -0.395182 1.714481 \n2 1.779468 1.061173 0.983414 -0.121617 -0.947417 -0.720904 -0.708271 \n3 -1.855910 -0.940162 -0.231053 -0.775328 0.295075 0.561626 -0.791815 \n4 0.113253 -0.940162 -0.768221 0.459459 0.117576 0.358049 0.210704 \n\n s4 s5 s6 target \n0 -0.035628 0.434041 -0.356981 151.0 \n1 -0.856638 -1.429397 -1.923328 75.0 \n2 -0.035628 0.074059 -0.531020 141.0 \n3 0.785382 0.492755 -0.182943 206.0 \n4 -0.035628 -0.661884 -0.966116 135.0 ", + "text/html": "
" + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Load the diabetes dataset\n", + "df = pd.read_csv(\"diabetes.csv\")\n", + "# Preprocess the dataset\n", + "df, df_norm = preprocess(df)\n", + "df.head()\n", + "df_norm.head()" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "Load the data" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 4, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "X_train: (353, 10)\n", + "X_test: (89, 10)\n", + "y_train: (353,)\n", + "y_test: (89,)\n" + ] + } + ], + "source": [ + "from sklearn.datasets import load_diabetes\n", + "\n", + "normalize = True\n", + "# Load the data\n", + "# X_train, X_test, y_train, y_test, df = load_data(normalize=normalize)\n", + "data = load_diabetes(scaled=True)\n", + "# split data into train and test sets\n", + "X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.2)\n", + "print(\"X_train:\", X_train.shape)\n", + "print(\"X_test:\", X_test.shape)\n", + "print(\"y_train:\", y_train.shape)\n", + "print(\"y_test:\", y_test.shape)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "Fit the models" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 5, + "outputs": [], + "source": [ + "# Fit the linear regression\n", + "linear_regression = LinearRegression()\n", + "linear_regression.fit(X_train, y_train)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 6, + "outputs": [], + "source": [ + "# Fit the ridge regression\n", + "ridge_regression = RidgeRegression(alpha=1)\n", + "ridge_regression.fit(X_train, y_train)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 7, + "outputs": [], + "source": [ + "# Fit the lasso regression\n", + "lasso_regression = LassoRegression(alpha=1, num_iters=10000, lr=0.001)\n", + "lasso_regression.fit(X_train, y_train)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "Evaluate the models" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 8, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Linear Regression MSE train: 2886.03 test: 2801.42\n", + "Ridge Regression MSE train: 3417.29 test: 3129.15\n", + "Lasso Regression MSE train: 2905.98 test: 2812.49\n", + "Linear Regression RMSE train: 53.72 test: 52.93\n", + "Ridge Regression RMSE train: 58.46 test: 55.94\n", + "Lasso Regression RMSE train: 53.91 test: 53.03\n", + "Linear Regression R2 train: 0.51 test: 0.54\n", + "Ridge Regression R2 train: 0.42 test: 0.49\n", + "Lasso Regression R2 train: 0.51 test: 0.54\n", + "Linear Regression features sorted by their coefficients:\n", + "s1: -822.75\n", + "s5: 765.40\n", + "bmi: 514.60\n", + "s2: 424.92\n", + "bp: 355.26\n", + "sex: -241.22\n", + "s4: 230.86\n", + "s3: 129.59\n", + "s6: 40.86\n", + "age: -10.93\n", + "Ridge Regression features sorted by their coefficients:\n", + "bmi: 283.92\n", + "s5: 238.87\n", + "bp: 195.46\n", + "s3: -142.91\n", + "s4: 106.92\n", + "s6: 93.55\n", + "sex: -63.98\n", + "s2: -30.82\n", + "age: 24.08\n", + "s1: 0.97\n", + "Lasso Regression features 
sorted by their coefficients:\n", + "bmi: 528.98\n", + "s5: 494.25\n", + "bp: 348.14\n", + "sex: -230.35\n", + "s3: -197.76\n", + "s2: -137.48\n", + "s4: 127.48\n", + "s1: -101.61\n", + "s6: 40.95\n", + "age: -7.39\n", + "Linear Regression number of non-zero coefficients: 11\n", + "Ridge Regression number of non-zero coefficients: 11\n", + "Lasso Regression number of non-zero coefficients: 11\n" + ] + } + ], + "source": [ + "from sklearn.metrics import r2_score\n", + "\n", + "y_pred_linear = linear_regression.predict(X_test)\n", + "y_pred_ridge = ridge_regression.predict(X_test)\n", + "y_pred_lasso = lasso_regression.predict(X_test)\n", + "\n", + "y_pred_train_linear = linear_regression.predict(X_train)\n", + "y_pred_train_ridge = ridge_regression.predict(X_train)\n", + "y_pred_train_lasso = lasso_regression.predict(X_train)\n", + "\n", + "\n", + "# Calculate the mean squared error\n", + "mse_linear_test = np.mean((y_test - y_pred_linear) ** 2)\n", + "mse_linear_train = np.mean((y_train - y_pred_train_linear) ** 2)\n", + "mse_ridge_test = np.mean((y_test - y_pred_ridge) ** 2)\n", + "mse_ridge_train = np.mean((y_train - y_pred_train_ridge) ** 2)\n", + "mse_lasso_test = np.mean((y_test - y_pred_lasso) ** 2)\n", + "mse_lasso_train = np.mean((y_train - y_pred_train_lasso) ** 2)\n", + "\n", + "# Print the mean squared error in a compact way keeping two decimal digits\n", + "print(\"Linear Regression MSE train: {:.2f} test: {:.2f}\".format(mse_linear_train, mse_linear_test))\n", + "print(\"Ridge Regression MSE train: {:.2f} test: {:.2f}\".format(mse_ridge_train, mse_ridge_test))\n", + "print(\"Lasso Regression MSE train: {:.2f} test: {:.2f}\".format(mse_lasso_train, mse_lasso_test))\n", + "\n", + "# calculate the root squared error\n", + "rmse_linear_test = np.sqrt(mse_linear_test)\n", + "rmse_linear_train = np.sqrt(mse_linear_train)\n", + "rmse_ridge_test = np.sqrt(mse_ridge_test)\n", + "rmse_ridge_train = np.sqrt(mse_ridge_train)\n", + "rmse_lasso_test = np.sqrt(mse_lasso_test)\n", + "rmse_lasso_train = np.sqrt(mse_lasso_train)\n", + "\n", + "# print the root squared error in a compact way keeping two decimal digits\n", + "print(\"Linear Regression RMSE train: {:.2f} test: {:.2f}\".format(rmse_linear_train, rmse_linear_test))\n", + "print(\"Ridge Regression RMSE train: {:.2f} test: {:.2f}\".format(rmse_ridge_train, rmse_ridge_test))\n", + "print(\"Lasso Regression RMSE train: {:.2f} test: {:.2f}\".format(rmse_lasso_train, rmse_lasso_test))\n", + "\n", + "# calculate the r-squared score\n", + "r2_linear_test = r2_score(y_test, y_pred_linear)\n", + "r2_linear_train = r2_score(y_train, y_pred_train_linear)\n", + "r2_ridge_test = r2_score(y_test, y_pred_ridge)\n", + "r2_ridge_train = r2_score(y_train, y_pred_train_ridge)\n", + "r2_lasso_test = r2_score(y_test, y_pred_lasso)\n", + "r2_lasso_train = r2_score(y_train, y_pred_train_lasso)\n", + "\n", + "# print the r-squared score in a compact way keeping two decimal digits\n", + "print(\"Linear Regression R2 train: {:.2f} test: {:.2f}\".format(r2_linear_train, r2_linear_test))\n", + "print(\"Ridge Regression R2 train: {:.2f} test: {:.2f}\".format(r2_ridge_train, r2_ridge_test))\n", + "print(\"Lasso Regression R2 train: {:.2f} test: {:.2f}\".format(r2_lasso_train, r2_lasso_test))\n", + "\n", + "print(\"Linear Regression features sorted by their coefficients:\")\n", + "for feature, coef in sorted(list(zip(df.columns[:-1], linear_regression.coefficients[1:])), key=lambda x: abs(x[1]), reverse=True):\n", + " print(f\"{feature}: {coef:.2f}\")\n", + 
"print(\"Ridge Regression features sorted by their coefficients:\")\n", + "for feature, coef in sorted(list(zip(df.columns[:-1], ridge_regression.coefficients[1:])), key=lambda x: abs(x[1]), reverse=True):\n", + " print(f\"{feature}: {coef:.2f}\")\n", + "print(\"Lasso Regression features sorted by their coefficients:\")\n", + "for feature, coef in sorted(list(zip(df.columns[:-1], lasso_regression.coefficients[1:])), key=lambda x: abs(x[1]), reverse=True):\n", + " print(f\"{feature}: {coef:.2f}\")\n", + "\n", + "# print the number of non-zero coefficients\n", + "print(\"Linear Regression number of non-zero coefficients:\", len(linear_regression.coefficients[linear_regression.coefficients != 0]))\n", + "print(\"Ridge Regression number of non-zero coefficients:\", len(ridge_regression.coefficients[ridge_regression.coefficients != 0]))\n", + "print(\"Lasso Regression number of non-zero coefficients:\", len(lasso_regression.coefficients[lasso_regression.coefficients != 0]))\n" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 32, + "outputs": [], + "source": [], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [], + "metadata": { + "collapsed": false, + "pycharm": { + "is_executing": true + } + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}