{ "cells": [ { "cell_type": "markdown", "source": [ "### Solution for Assignment 4 of the course \"Introduction to Machine Learning\" at the University of Leoben.\n", "##### Author: Fotios Lygerakis\n", "##### Semester: SS 2022/2023" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 2, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "\n", "class Predictor:\n", " def __init__(self):\n", " self.coefficients = None\n", "\n", " def fit(self, X, y):\n", " pass\n", "\n", " def predict(self, X):\n", " pass\n", "\n", "class LinearRegression(Predictor):\n", " def fit(self, X, y):\n", " X = np.hstack((np.ones((X.shape[0], 1)), X))\n", " self.coefficients = np.linalg.inv(X.T @ X) @ X.T @ y\n", "\n", " def predict(self, X):\n", " X = np.hstack((np.ones((X.shape[0], 1)), X))\n", " return X @ self.coefficients\n", "\n", "class RidgeRegression(Predictor):\n", " def __init__(self, alpha):\n", " super().__init__()\n", " self.alpha = alpha\n", "\n", " def fit(self, X, y):\n", " X = np.hstack((np.ones((X.shape[0], 1)), X))\n", " I = np.eye(X.shape[1])\n", " I[0, 0] = 0\n", " self.coefficients = np.linalg.inv(X.T @ X + self.alpha * I) @ X.T @ y\n", "\n", " def predict(self, X):\n", " X = np.hstack((np.ones((X.shape[0], 1)), X))\n", " return X @ self.coefficients\n", "\n", "class LassoRegression(Predictor):\n", " def __init__(self, alpha, num_iters=1000, lr=0.01):\n", " super().__init__()\n", " self.alpha = alpha\n", " self.num_iters = num_iters\n", " self.lr = lr\n", "\n", " def fit(self, X, y):\n", " X = np.hstack((np.ones((X.shape[0], 1)), X))\n", " self.coefficients = np.random.randn(X.shape[1])\n", " for _ in range(self.num_iters):\n", " self.coefficients -= self.lr * (X.T @ (X @ self.coefficients - y) + self.alpha * np.sign(self.coefficients))\n", "\n", " def predict(self, X):\n", " X = np.hstack((np.ones((X.shape[0], 1)), X))\n", " return X @ self.coefficients" ], "metadata": { "collapsed": false } }, { "cell_type": "markdown", "source": [ "Data Preprocessing and data loading" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def preprocess(df):\n", " # Handle missing values\n", " df = df.fillna(df.mean())\n", "\n", " # Remove outliers\n", " z_scores = np.abs((df - df.mean()) / df.std())\n", " df = df[(z_scores < 3).all(axis=1)]\n", "\n", " df_nomralized = df.copy()\n", "\n", " # Normalize the data using z-score normalization except for the target column\n", " for column in df.columns:\n", " if column == \"target\":\n", " continue\n", " df_nomralized[column] = (df_nomralized[column] - df_nomralized[column].mean()) / df_nomralized[column].std()\n", "\n", " return df, df_nomralized\n", "\n", "\n", "def train_test_split(X, y, test_size=0.2):\n", " # Split the dataset into train and test sets\n", " idx = np.arange(X.shape[0])\n", " np.random.shuffle(idx)\n", " X = X[idx]\n", " y = y[idx]\n", " split = int((1 - test_size) * X.shape[0])\n", " X_train, X_test = X[:split], X[split:]\n", " y_train, y_test = y[:split], y[split:]\n", " return X_train, X_test, y_train, y_test\n", "\n", "def load_data(normalize=True):\n", " # Load the diabetes dataset\n", " df = pd.read_csv(\"diabetes.csv\")\n", "\n", " # Preprocess the dataset\n", " df, df_norm = preprocess(df)\n", "\n", " # Split the dataset into train and test sets\n", " if normalize:\n", " X = df_norm.drop(\"target\", axis=1).values\n", " y = df_norm[\"target\"].values\n", " else:\n", " X = df.drop(\"target\", axis=1).values\n", " y = df[\"target\"].values\n", " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)\n", " return X_train, X_test, y_train, y_test, df" ] }, { "cell_type": "code", "execution_count": 4, "outputs": [ { "data": { "text/plain": " age sex bmi bp s1 s2 s3 \\\n0 0.794887 1.061173 1.357096 0.459459 -0.917834 -0.734476 -0.958901 \n1 -0.038221 -0.940162 -1.095193 -0.557425 -0.148672 -0.395182 1.714481 \n2 1.779468 1.061173 0.983414 -0.121617 -0.947417 -0.720904 -0.708271 \n3 -1.855910 -0.940162 -0.231053 -0.775328 0.295075 0.561626 -0.791815 \n4 0.113253 -0.940162 -0.768221 0.459459 0.117576 0.358049 0.210704 \n\n s4 s5 s6 target \n0 -0.035628 0.434041 -0.356981 151.0 \n1 -0.856638 -1.429397 -1.923328 75.0 \n2 -0.035628 0.074059 -0.531020 141.0 \n3 0.785382 0.492755 -0.182943 206.0 \n4 -0.035628 -0.661884 -0.966116 135.0 ", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
agesexbmibps1s2s3s4s5s6target
00.7948871.0611731.3570960.459459-0.917834-0.734476-0.958901-0.0356280.434041-0.356981151.0
1-0.038221-0.940162-1.095193-0.557425-0.148672-0.3951821.714481-0.856638-1.429397-1.92332875.0
21.7794681.0611730.983414-0.121617-0.947417-0.720904-0.708271-0.0356280.074059-0.531020141.0
3-1.855910-0.940162-0.231053-0.7753280.2950750.561626-0.7918150.7853820.492755-0.182943206.0
40.113253-0.940162-0.7682210.4594590.1175760.3580490.210704-0.035628-0.661884-0.966116135.0
\n
" }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Load the diabetes dataset\n", "df = pd.read_csv(\"diabetes.csv\")\n", "# Preprocess the dataset\n", "df, df_norm = preprocess(df)\n", "df.head()\n", "df_norm.head()" ], "metadata": { "collapsed": false } }, { "cell_type": "markdown", "source": [], "metadata": { "collapsed": false } }, { "cell_type": "markdown", "source": [ "Load the data" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 10, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "X_train: (344, 10)\n", "X_test: (86, 10)\n", "y_train: (344,)\n", "y_test: (86,)\n" ] } ], "source": [ "from sklearn.datasets import load_diabetes\n", "\n", "normalize = True\n", "# Load the data\n", "X_train, X_test, y_train, y_test, df = load_data(normalize=normalize)\n", "print(\"X_train:\", X_train.shape)\n", "print(\"X_test:\", X_test.shape)\n", "print(\"y_train:\", y_train.shape)\n", "print(\"y_test:\", y_test.shape)" ], "metadata": { "collapsed": false } }, { "cell_type": "markdown", "source": [ "Fit the models" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 11, "outputs": [], "source": [ "# Fit the linear regression\n", "linear_regression = LinearRegression()\n", "linear_regression.fit(X_train, y_train)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 12, "outputs": [], "source": [ "# Fit the ridge regression\n", "ridge_regression = RidgeRegression(alpha=1)\n", "ridge_regression.fit(X_train, y_train)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 13, "outputs": [], "source": [ "# Fit the lasso regression\n", "lasso_regression = LassoRegression(alpha=1, num_iters=10000, lr=0.001)\n", "lasso_regression.fit(X_train, y_train)" ], "metadata": { "collapsed": false } }, { "cell_type": "markdown", "source": [ "Evaluate the models" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 14, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Linear Regression MSE train: 2983.19 test: 2487.62\n", "Ridge Regression MSE train: 2983.79 test: 2495.37\n", "Lasso Regression MSE train: 2983.19 test: 2487.98\n", "Linear Regression RMSE train: 54.62 test: 49.88\n", "Ridge Regression RMSE train: 54.62 test: 49.95\n", "Lasso Regression RMSE train: 54.62 test: 49.88\n", "Linear Regression R2 train: 0.50 test: 0.49\n", "Ridge Regression R2 train: 0.50 test: 0.49\n", "Lasso Regression R2 train: 0.50 test: 0.49\n", "Linear Regression features sorted by their coefficients:\n", "s5: 28.62\n", "bmi: 23.95\n", "s1: -23.18\n", "bp: 17.64\n", "s2: 14.38\n", "sex: -12.17\n", "s4: 4.23\n", "s3: -4.03\n", "s6: 2.06\n", "age: 0.86\n", "Ridge Regression features sorted by their coefficients:\n", "s5: 26.16\n", "bmi: 23.91\n", "bp: 17.60\n", "s1: -16.67\n", "sex: -12.13\n", "s2: 9.11\n", "s3: -6.64\n", "s4: 3.73\n", "s6: 2.12\n", "age: 0.87\n", "Lasso Regression features sorted by their coefficients:\n", "s5: 28.48\n", "bmi: 23.95\n", "s1: -22.80\n", "bp: 17.64\n", "s2: 14.07\n", "sex: -12.17\n", "s4: 4.19\n", "s3: -4.18\n", "s6: 2.06\n", "age: 0.86\n", "Linear Regression number of non-zero coefficients: 11\n", "Ridge Regression number of non-zero coefficients: 11\n", "Lasso Regression number of non-zero coefficients: 11\n" ] } ], "source": [ "from sklearn.metrics import r2_score\n", "\n", "y_pred_linear = linear_regression.predict(X_test)\n", "y_pred_ridge = ridge_regression.predict(X_test)\n", "y_pred_lasso = lasso_regression.predict(X_test)\n", "\n", "y_pred_train_linear = linear_regression.predict(X_train)\n", "y_pred_train_ridge = ridge_regression.predict(X_train)\n", "y_pred_train_lasso = lasso_regression.predict(X_train)\n", "\n", "\n", "# Calculate the mean squared error\n", "mse_linear_test = np.mean((y_test - y_pred_linear) ** 2)\n", "mse_linear_train = np.mean((y_train - y_pred_train_linear) ** 2)\n", "mse_ridge_test = np.mean((y_test - y_pred_ridge) ** 2)\n", "mse_ridge_train = np.mean((y_train - y_pred_train_ridge) ** 2)\n", "mse_lasso_test = np.mean((y_test - y_pred_lasso) ** 2)\n", "mse_lasso_train = np.mean((y_train - y_pred_train_lasso) ** 2)\n", "\n", "# Print the mean squared error in a compact way keeping two decimal digits\n", "print(\"Linear Regression MSE train: {:.2f} test: {:.2f}\".format(mse_linear_train, mse_linear_test))\n", "print(\"Ridge Regression MSE train: {:.2f} test: {:.2f}\".format(mse_ridge_train, mse_ridge_test))\n", "print(\"Lasso Regression MSE train: {:.2f} test: {:.2f}\".format(mse_lasso_train, mse_lasso_test))\n", "\n", "# calculate the root squared error\n", "rmse_linear_test = np.sqrt(mse_linear_test)\n", "rmse_linear_train = np.sqrt(mse_linear_train)\n", "rmse_ridge_test = np.sqrt(mse_ridge_test)\n", "rmse_ridge_train = np.sqrt(mse_ridge_train)\n", "rmse_lasso_test = np.sqrt(mse_lasso_test)\n", "rmse_lasso_train = np.sqrt(mse_lasso_train)\n", "\n", "# print the root squared error in a compact way keeping two decimal digits\n", "print(\"Linear Regression RMSE train: {:.2f} test: {:.2f}\".format(rmse_linear_train, rmse_linear_test))\n", "print(\"Ridge Regression RMSE train: {:.2f} test: {:.2f}\".format(rmse_ridge_train, rmse_ridge_test))\n", "print(\"Lasso Regression RMSE train: {:.2f} test: {:.2f}\".format(rmse_lasso_train, rmse_lasso_test))\n", "\n", "# calculate the r-squared score\n", "r2_linear_test = r2_score(y_test, y_pred_linear)\n", "r2_linear_train = r2_score(y_train, y_pred_train_linear)\n", "r2_ridge_test = r2_score(y_test, y_pred_ridge)\n", "r2_ridge_train = r2_score(y_train, y_pred_train_ridge)\n", "r2_lasso_test = r2_score(y_test, y_pred_lasso)\n", "r2_lasso_train = r2_score(y_train, y_pred_train_lasso)\n", "\n", "# print the r-squared score in a compact way keeping two decimal digits\n", "print(\"Linear Regression R2 train: {:.2f} test: {:.2f}\".format(r2_linear_train, r2_linear_test))\n", "print(\"Ridge Regression R2 train: {:.2f} test: {:.2f}\".format(r2_ridge_train, r2_ridge_test))\n", "print(\"Lasso Regression R2 train: {:.2f} test: {:.2f}\".format(r2_lasso_train, r2_lasso_test))\n", "\n", "print(\"Linear Regression features sorted by their coefficients:\")\n", "for feature, coef in sorted(list(zip(df.columns[:-1], linear_regression.coefficients[1:])), key=lambda x: abs(x[1]), reverse=True):\n", " print(f\"{feature}: {coef:.2f}\")\n", "print(\"Ridge Regression features sorted by their coefficients:\")\n", "for feature, coef in sorted(list(zip(df.columns[:-1], ridge_regression.coefficients[1:])), key=lambda x: abs(x[1]), reverse=True):\n", " print(f\"{feature}: {coef:.2f}\")\n", "print(\"Lasso Regression features sorted by their coefficients:\")\n", "for feature, coef in sorted(list(zip(df.columns[:-1], lasso_regression.coefficients[1:])), key=lambda x: abs(x[1]), reverse=True):\n", " print(f\"{feature}: {coef:.2f}\")\n", "\n", "# print the number of non-zero coefficients\n", "print(\"Linear Regression number of non-zero coefficients:\", len(linear_regression.coefficients[linear_regression.coefficients != 0]))\n", "print(\"Ridge Regression number of non-zero coefficients:\", len(ridge_regression.coefficients[ridge_regression.coefficients != 0]))\n", "print(\"Lasso Regression number of non-zero coefficients:\", len(lasso_regression.coefficients[lasso_regression.coefficients != 0]))\n" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 32, "outputs": [], "source": [], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [], "metadata": { "collapsed": false, "pycharm": { "is_executing": true } } } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.6" } }, "nbformat": 4, "nbformat_minor": 0 }