{ "cells": [ { "cell_type": "markdown", "source": [ "### Solution for Assignment 4 of the course \"Introduction to Machine Learning\" at the University of Leoben.\n", "##### Author: Fotios Lygerakis\n", "##### Semester: SS 2022/2023" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 2, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "\n", "class Predictor:\n", " def __init__(self):\n", " self.coefficients = None\n", "\n", " def fit(self, X, y):\n", " pass\n", "\n", " def predict(self, X):\n", " pass\n", "\n", "class LinearRegression(Predictor):\n", " def fit(self, X, y):\n", " X = np.hstack((np.ones((X.shape[0], 1)), X))\n", " self.coefficients = np.linalg.inv(X.T @ X) @ X.T @ y\n", "\n", " def predict(self, X):\n", " X = np.hstack((np.ones((X.shape[0], 1)), X))\n", " return X @ self.coefficients\n", "\n", "class RidgeRegression(Predictor):\n", " def __init__(self, alpha):\n", " super().__init__()\n", " self.alpha = alpha\n", "\n", " def fit(self, X, y):\n", " X = np.hstack((np.ones((X.shape[0], 1)), X))\n", " I = np.eye(X.shape[1])\n", " I[0, 0] = 0\n", " self.coefficients = np.linalg.inv(X.T @ X + self.alpha * I) @ X.T @ y\n", "\n", " def predict(self, X):\n", " X = np.hstack((np.ones((X.shape[0], 1)), X))\n", " return X @ self.coefficients\n", "\n", "class LassoRegression(Predictor):\n", " def __init__(self, alpha, num_iters=1000, lr=0.01):\n", " super().__init__()\n", " self.alpha = alpha\n", " self.num_iters = num_iters\n", " self.lr = lr\n", "\n", " def fit(self, X, y):\n", " X = np.hstack((np.ones((X.shape[0], 1)), X))\n", " self.coefficients = np.random.randn(X.shape[1])\n", " for _ in range(self.num_iters):\n", " self.coefficients -= self.lr * (X.T @ (X @ self.coefficients - y) + self.alpha * np.sign(self.coefficients))\n", "\n", " def predict(self, X):\n", " X = np.hstack((np.ones((X.shape[0], 1)), X))\n", " return X @ self.coefficients" ], "metadata": { "collapsed": false } }, { "cell_type": "markdown", "source": [ "Data Preprocessing and data loading" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def preprocess(df):\n", " # Handle missing values\n", " df = df.fillna(df.mean())\n", "\n", " # Remove outliers\n", " z_scores = np.abs((df - df.mean()) / df.std())\n", " df = df[(z_scores < 3).all(axis=1)]\n", "\n", " df_nomralized = df.copy()\n", "\n", " # Normalize the data using z-score normalization except for the target column\n", " for column in df.columns:\n", " if column == \"target\":\n", " continue\n", " df_nomralized[column] = (df_nomralized[column] - df_nomralized[column].mean()) / df_nomralized[column].std()\n", "\n", " return df, df_nomralized\n", "\n", "\n", "def train_test_split(X, y, test_size=0.2):\n", " # Split the dataset into train and test sets\n", " idx = np.arange(X.shape[0])\n", " np.random.shuffle(idx)\n", " X = X[idx]\n", " y = y[idx]\n", " split = int((1 - test_size) * X.shape[0])\n", " X_train, X_test = X[:split], X[split:]\n", " y_train, y_test = y[:split], y[split:]\n", " return X_train, X_test, y_train, y_test\n", "\n", "def load_data(normalize=True):\n", " # Load the diabetes dataset\n", " df = pd.read_csv(\"diabetes.csv\")\n", "\n", " # Preprocess the dataset\n", " df, df_norm = preprocess(df)\n", "\n", " # Split the dataset into train and test sets\n", " if normalize:\n", " X = df_norm.drop(\"target\", axis=1).values\n", " y = df_norm[\"target\"].values\n", " else:\n", " X = df.drop(\"target\", axis=1).values\n", " y = df[\"target\"].values\n", " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)\n", " return X_train, X_test, y_train, y_test, df" ] }, { "cell_type": "code", "execution_count": 4, "outputs": [ { "data": { "text/plain": " age sex bmi bp s1 s2 s3 \\\n0 0.794887 1.061173 1.357096 0.459459 -0.917834 -0.734476 -0.958901 \n1 -0.038221 -0.940162 -1.095193 -0.557425 -0.148672 -0.395182 1.714481 \n2 1.779468 1.061173 0.983414 -0.121617 -0.947417 -0.720904 -0.708271 \n3 -1.855910 -0.940162 -0.231053 -0.775328 0.295075 0.561626 -0.791815 \n4 0.113253 -0.940162 -0.768221 0.459459 0.117576 0.358049 0.210704 \n\n s4 s5 s6 target \n0 -0.035628 0.434041 -0.356981 151.0 \n1 -0.856638 -1.429397 -1.923328 75.0 \n2 -0.035628 0.074059 -0.531020 141.0 \n3 0.785382 0.492755 -0.182943 206.0 \n4 -0.035628 -0.661884 -0.966116 135.0 ", "text/html": "
\n | age | \nsex | \nbmi | \nbp | \ns1 | \ns2 | \ns3 | \ns4 | \ns5 | \ns6 | \ntarget | \n
---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n0.794887 | \n1.061173 | \n1.357096 | \n0.459459 | \n-0.917834 | \n-0.734476 | \n-0.958901 | \n-0.035628 | \n0.434041 | \n-0.356981 | \n151.0 | \n
1 | \n-0.038221 | \n-0.940162 | \n-1.095193 | \n-0.557425 | \n-0.148672 | \n-0.395182 | \n1.714481 | \n-0.856638 | \n-1.429397 | \n-1.923328 | \n75.0 | \n
2 | \n1.779468 | \n1.061173 | \n0.983414 | \n-0.121617 | \n-0.947417 | \n-0.720904 | \n-0.708271 | \n-0.035628 | \n0.074059 | \n-0.531020 | \n141.0 | \n
3 | \n-1.855910 | \n-0.940162 | \n-0.231053 | \n-0.775328 | \n0.295075 | \n0.561626 | \n-0.791815 | \n0.785382 | \n0.492755 | \n-0.182943 | \n206.0 | \n
4 | \n0.113253 | \n-0.940162 | \n-0.768221 | \n0.459459 | \n0.117576 | \n0.358049 | \n0.210704 | \n-0.035628 | \n-0.661884 | \n-0.966116 | \n135.0 | \n