{
 "cells": [
  {
   "cell_type": "markdown",
   "source": [
    "### Solution for Assignment 4 of the course \"Introduction to Machine Learning\" at the University of Leoben.\n",
    "##### Author: Fotios Lygerakis\n",
    "##### Semester: SS 2022/2023"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "class Predictor:\n",
    "    def __init__(self):\n",
    "        self.coefficients = None\n",
    "\n",
    "    def fit(self, X, y):\n",
    "        pass\n",
    "\n",
    "    def predict(self, X):\n",
    "        pass\n",
    "\n",
    "class LinearRegression(Predictor):\n",
    "    def fit(self, X, y):\n",
    "        X = np.hstack((np.ones((X.shape[0], 1)), X))\n",
    "        self.coefficients = np.linalg.inv(X.T @ X) @ X.T @ y\n",
    "\n",
    "    def predict(self, X):\n",
    "        X = np.hstack((np.ones((X.shape[0], 1)), X))\n",
    "        return X @ self.coefficients\n",
    "\n",
    "class RidgeRegression(Predictor):\n",
    "    def __init__(self, alpha):\n",
    "        super().__init__()\n",
    "        self.alpha = alpha\n",
    "\n",
    "    def fit(self, X, y):\n",
    "        X = np.hstack((np.ones((X.shape[0], 1)), X))\n",
    "        I = np.eye(X.shape[1])\n",
    "        I[0, 0] = 0\n",
    "        self.coefficients = np.linalg.inv(X.T @ X + self.alpha * I) @ X.T @ y\n",
    "\n",
    "    def predict(self, X):\n",
    "        X = np.hstack((np.ones((X.shape[0], 1)), X))\n",
    "        return X @ self.coefficients\n",
    "\n",
    "class LassoRegression(Predictor):\n",
    "    def __init__(self, alpha, num_iters=1000, lr=0.01):\n",
    "        super().__init__()\n",
    "        self.alpha = alpha\n",
    "        self.num_iters = num_iters\n",
    "        self.lr = lr\n",
    "\n",
    "    def fit(self, X, y):\n",
    "        X = np.hstack((np.ones((X.shape[0], 1)), X))\n",
    "        self.coefficients = np.random.randn(X.shape[1])\n",
    "        for _ in range(self.num_iters):\n",
    "            self.coefficients -= self.lr * (X.T @ (X @ self.coefficients - y) + self.alpha * np.sign(self.coefficients))\n",
    "\n",
    "    def predict(self, X):\n",
    "        X = np.hstack((np.ones((X.shape[0], 1)), X))\n",
    "        return X @ self.coefficients"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "Data Preprocessing and data loading"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def preprocess(df):\n",
    "    # Handle missing values\n",
    "    df = df.fillna(df.mean())\n",
    "\n",
    "    # Remove outliers\n",
    "    z_scores = np.abs((df - df.mean()) / df.std())\n",
    "    df = df[(z_scores < 3).all(axis=1)]\n",
    "\n",
    "    df_nomralized = df.copy()\n",
    "\n",
    "    # Normalize the data using z-score normalization except for the target column\n",
    "    for column in df.columns:\n",
    "        if column == \"target\":\n",
    "            continue\n",
    "        df_nomralized[column] = (df_nomralized[column] - df_nomralized[column].mean()) / df_nomralized[column].std()\n",
    "\n",
    "    return df, df_nomralized\n",
    "\n",
    "\n",
    "def train_test_split(X, y, test_size=0.2):\n",
    "    # Split the dataset into train and test sets\n",
    "    idx = np.arange(X.shape[0])\n",
    "    np.random.shuffle(idx)\n",
    "    X = X[idx]\n",
    "    y = y[idx]\n",
    "    split = int((1 - test_size) * X.shape[0])\n",
    "    X_train, X_test = X[:split], X[split:]\n",
    "    y_train, y_test = y[:split], y[split:]\n",
    "    return X_train, X_test, y_train, y_test\n",
    "\n",
    "def load_data(normalize=True):\n",
    "    # Load the diabetes dataset\n",
    "    df = pd.read_csv(\"diabetes.csv\")\n",
    "\n",
    "    # Preprocess the dataset\n",
    "    df, df_norm = preprocess(df)\n",
    "\n",
    "    # Split the dataset into train and test sets\n",
    "    if normalize:\n",
    "        X = df_norm.drop(\"target\", axis=1).values\n",
    "        y = df_norm[\"target\"].values\n",
    "    else:\n",
    "        X = df.drop(\"target\", axis=1).values\n",
    "        y = df[\"target\"].values\n",
    "    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)\n",
    "    return X_train, X_test, y_train, y_test, df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "outputs": [
    {
     "data": {
      "text/plain": "        age       sex       bmi        bp        s1        s2        s3  \\\n0  0.794887  1.061173  1.357096  0.459459 -0.917834 -0.734476 -0.958901   \n1 -0.038221 -0.940162 -1.095193 -0.557425 -0.148672 -0.395182  1.714481   \n2  1.779468  1.061173  0.983414 -0.121617 -0.947417 -0.720904 -0.708271   \n3 -1.855910 -0.940162 -0.231053 -0.775328  0.295075  0.561626 -0.791815   \n4  0.113253 -0.940162 -0.768221  0.459459  0.117576  0.358049  0.210704   \n\n         s4        s5        s6  target  \n0 -0.035628  0.434041 -0.356981   151.0  \n1 -0.856638 -1.429397 -1.923328    75.0  \n2 -0.035628  0.074059 -0.531020   141.0  \n3  0.785382  0.492755 -0.182943   206.0  \n4 -0.035628 -0.661884 -0.966116   135.0  ",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>age</th>\n      <th>sex</th>\n      <th>bmi</th>\n      <th>bp</th>\n      <th>s1</th>\n      <th>s2</th>\n      <th>s3</th>\n      <th>s4</th>\n      <th>s5</th>\n      <th>s6</th>\n      <th>target</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>0.794887</td>\n      <td>1.061173</td>\n      <td>1.357096</td>\n      <td>0.459459</td>\n      <td>-0.917834</td>\n      <td>-0.734476</td>\n      <td>-0.958901</td>\n      <td>-0.035628</td>\n      <td>0.434041</td>\n      <td>-0.356981</td>\n      <td>151.0</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>-0.038221</td>\n      <td>-0.940162</td>\n      <td>-1.095193</td>\n      <td>-0.557425</td>\n      <td>-0.148672</td>\n      <td>-0.395182</td>\n      <td>1.714481</td>\n      <td>-0.856638</td>\n      <td>-1.429397</td>\n      <td>-1.923328</td>\n      <td>75.0</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>1.779468</td>\n      <td>1.061173</td>\n      <td>0.983414</td>\n      <td>-0.121617</td>\n      <td>-0.947417</td>\n      <td>-0.720904</td>\n      <td>-0.708271</td>\n      <td>-0.035628</td>\n      <td>0.074059</td>\n      <td>-0.531020</td>\n      <td>141.0</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>-1.855910</td>\n      <td>-0.940162</td>\n      <td>-0.231053</td>\n      <td>-0.775328</td>\n      <td>0.295075</td>\n      <td>0.561626</td>\n      <td>-0.791815</td>\n      <td>0.785382</td>\n      <td>0.492755</td>\n      <td>-0.182943</td>\n      <td>206.0</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>0.113253</td>\n      <td>-0.940162</td>\n      <td>-0.768221</td>\n      <td>0.459459</td>\n      <td>0.117576</td>\n      <td>0.358049</td>\n      <td>0.210704</td>\n      <td>-0.035628</td>\n      <td>-0.661884</td>\n      <td>-0.966116</td>\n      <td>135.0</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the diabetes dataset\n",
    "df = pd.read_csv(\"diabetes.csv\")\n",
    "# Preprocess the dataset\n",
    "df, df_norm = preprocess(df)\n",
    "df.head()\n",
    "df_norm.head()"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "markdown",
   "source": [],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "Load the data"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "X_train: (344, 10)\n",
      "X_test: (86, 10)\n",
      "y_train: (344,)\n",
      "y_test: (86,)\n"
     ]
    }
   ],
   "source": [
    "from sklearn.datasets import load_diabetes\n",
    "\n",
    "normalize = True\n",
    "# Load the data\n",
    "X_train, X_test, y_train, y_test, df = load_data(normalize=normalize)\n",
    "print(\"X_train:\", X_train.shape)\n",
    "print(\"X_test:\", X_test.shape)\n",
    "print(\"y_train:\", y_train.shape)\n",
    "print(\"y_test:\", y_test.shape)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "Fit the models"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "outputs": [],
   "source": [
    "# Fit the linear regression\n",
    "linear_regression = LinearRegression()\n",
    "linear_regression.fit(X_train, y_train)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "outputs": [],
   "source": [
    "# Fit the ridge regression\n",
    "ridge_regression = RidgeRegression(alpha=1)\n",
    "ridge_regression.fit(X_train, y_train)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "outputs": [],
   "source": [
    "# Fit the lasso regression\n",
    "lasso_regression = LassoRegression(alpha=1, num_iters=10000, lr=0.001)\n",
    "lasso_regression.fit(X_train, y_train)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "Evaluate the models"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Linear Regression MSE train: 2983.19 test: 2487.62\n",
      "Ridge Regression MSE train: 2983.79 test: 2495.37\n",
      "Lasso Regression MSE train: 2983.19 test: 2487.98\n",
      "Linear Regression RMSE train: 54.62 test: 49.88\n",
      "Ridge Regression RMSE train: 54.62 test: 49.95\n",
      "Lasso Regression RMSE train: 54.62 test: 49.88\n",
      "Linear Regression R2 train: 0.50 test: 0.49\n",
      "Ridge Regression R2 train: 0.50 test: 0.49\n",
      "Lasso Regression R2 train: 0.50 test: 0.49\n",
      "Linear Regression features sorted by their coefficients:\n",
      "s5: 28.62\n",
      "bmi: 23.95\n",
      "s1: -23.18\n",
      "bp: 17.64\n",
      "s2: 14.38\n",
      "sex: -12.17\n",
      "s4: 4.23\n",
      "s3: -4.03\n",
      "s6: 2.06\n",
      "age: 0.86\n",
      "Ridge Regression features sorted by their coefficients:\n",
      "s5: 26.16\n",
      "bmi: 23.91\n",
      "bp: 17.60\n",
      "s1: -16.67\n",
      "sex: -12.13\n",
      "s2: 9.11\n",
      "s3: -6.64\n",
      "s4: 3.73\n",
      "s6: 2.12\n",
      "age: 0.87\n",
      "Lasso Regression features sorted by their coefficients:\n",
      "s5: 28.48\n",
      "bmi: 23.95\n",
      "s1: -22.80\n",
      "bp: 17.64\n",
      "s2: 14.07\n",
      "sex: -12.17\n",
      "s4: 4.19\n",
      "s3: -4.18\n",
      "s6: 2.06\n",
      "age: 0.86\n",
      "Linear Regression number of non-zero coefficients: 11\n",
      "Ridge Regression number of non-zero coefficients: 11\n",
      "Lasso Regression number of non-zero coefficients: 11\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import r2_score\n",
    "\n",
    "y_pred_linear = linear_regression.predict(X_test)\n",
    "y_pred_ridge = ridge_regression.predict(X_test)\n",
    "y_pred_lasso = lasso_regression.predict(X_test)\n",
    "\n",
    "y_pred_train_linear = linear_regression.predict(X_train)\n",
    "y_pred_train_ridge = ridge_regression.predict(X_train)\n",
    "y_pred_train_lasso = lasso_regression.predict(X_train)\n",
    "\n",
    "\n",
    "# Calculate the mean squared error\n",
    "mse_linear_test = np.mean((y_test - y_pred_linear) ** 2)\n",
    "mse_linear_train = np.mean((y_train - y_pred_train_linear) ** 2)\n",
    "mse_ridge_test = np.mean((y_test - y_pred_ridge) ** 2)\n",
    "mse_ridge_train = np.mean((y_train - y_pred_train_ridge) ** 2)\n",
    "mse_lasso_test = np.mean((y_test - y_pred_lasso) ** 2)\n",
    "mse_lasso_train = np.mean((y_train - y_pred_train_lasso) ** 2)\n",
    "\n",
    "# Print the mean squared error in a compact way keeping two decimal digits\n",
    "print(\"Linear Regression MSE train: {:.2f} test: {:.2f}\".format(mse_linear_train, mse_linear_test))\n",
    "print(\"Ridge Regression MSE train: {:.2f} test: {:.2f}\".format(mse_ridge_train, mse_ridge_test))\n",
    "print(\"Lasso Regression MSE train: {:.2f} test: {:.2f}\".format(mse_lasso_train, mse_lasso_test))\n",
    "\n",
    "# calculate the root squared error\n",
    "rmse_linear_test = np.sqrt(mse_linear_test)\n",
    "rmse_linear_train = np.sqrt(mse_linear_train)\n",
    "rmse_ridge_test = np.sqrt(mse_ridge_test)\n",
    "rmse_ridge_train = np.sqrt(mse_ridge_train)\n",
    "rmse_lasso_test = np.sqrt(mse_lasso_test)\n",
    "rmse_lasso_train = np.sqrt(mse_lasso_train)\n",
    "\n",
    "# print the root squared error in a compact way keeping two decimal digits\n",
    "print(\"Linear Regression RMSE train: {:.2f} test: {:.2f}\".format(rmse_linear_train, rmse_linear_test))\n",
    "print(\"Ridge Regression RMSE train: {:.2f} test: {:.2f}\".format(rmse_ridge_train, rmse_ridge_test))\n",
    "print(\"Lasso Regression RMSE train: {:.2f} test: {:.2f}\".format(rmse_lasso_train, rmse_lasso_test))\n",
    "\n",
    "# calculate the r-squared score\n",
    "r2_linear_test = r2_score(y_test, y_pred_linear)\n",
    "r2_linear_train = r2_score(y_train, y_pred_train_linear)\n",
    "r2_ridge_test = r2_score(y_test, y_pred_ridge)\n",
    "r2_ridge_train = r2_score(y_train, y_pred_train_ridge)\n",
    "r2_lasso_test = r2_score(y_test, y_pred_lasso)\n",
    "r2_lasso_train = r2_score(y_train, y_pred_train_lasso)\n",
    "\n",
    "# print the r-squared score in a compact way keeping two decimal digits\n",
    "print(\"Linear Regression R2 train: {:.2f} test: {:.2f}\".format(r2_linear_train, r2_linear_test))\n",
    "print(\"Ridge Regression R2 train: {:.2f} test: {:.2f}\".format(r2_ridge_train, r2_ridge_test))\n",
    "print(\"Lasso Regression R2 train: {:.2f} test: {:.2f}\".format(r2_lasso_train, r2_lasso_test))\n",
    "\n",
    "print(\"Linear Regression features sorted by their coefficients:\")\n",
    "for feature, coef in sorted(list(zip(df.columns[:-1], linear_regression.coefficients[1:])), key=lambda x: abs(x[1]), reverse=True):\n",
    "    print(f\"{feature}: {coef:.2f}\")\n",
    "print(\"Ridge Regression features sorted by their coefficients:\")\n",
    "for feature, coef in sorted(list(zip(df.columns[:-1], ridge_regression.coefficients[1:])), key=lambda x: abs(x[1]), reverse=True):\n",
    "    print(f\"{feature}: {coef:.2f}\")\n",
    "print(\"Lasso Regression features sorted by their coefficients:\")\n",
    "for feature, coef in sorted(list(zip(df.columns[:-1], lasso_regression.coefficients[1:])), key=lambda x: abs(x[1]), reverse=True):\n",
    "    print(f\"{feature}: {coef:.2f}\")\n",
    "\n",
    "# print the number of non-zero coefficients\n",
    "print(\"Linear Regression number of non-zero coefficients:\", len(linear_regression.coefficients[linear_regression.coefficients != 0]))\n",
    "print(\"Ridge Regression number of non-zero coefficients:\", len(ridge_regression.coefficients[ridge_regression.coefficients != 0]))\n",
    "print(\"Lasso Regression number of non-zero coefficients:\", len(lasso_regression.coefficients[lasso_regression.coefficients != 0]))\n"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "outputs": [],
   "source": [],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "is_executing": true
    }
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}