ML_course/assignment 4/iml_assginment4_solved.ipynb

432 lines
16 KiB
Plaintext
Raw Normal View History

2023-05-12 11:14:44 +00:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"class Predictor:\n",
"    \"\"\"Common interface of the regression models defined in this notebook.\n",
"\n",
"    Subclasses override ``fit`` and ``predict``; the versions here do nothing.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self):\n",
"        # Stays None until a subclass's ``fit`` stores the learned weights.\n",
"        self.coefficients = None\n",
"\n",
"    def fit(self, X, y):\n",
"        \"\"\"Learn ``coefficients`` from features ``X`` and targets ``y``.\"\"\"\n",
"        pass\n",
"\n",
"    def predict(self, X):\n",
"        \"\"\"Return predictions for the rows of ``X``.\"\"\"\n",
"        pass\n",
"\n",
"\n",
"class _LinearModel(Predictor):\n",
"    \"\"\"Shared plumbing for models of the form ``intercept + X @ weights``.\n",
"\n",
"    ``coefficients[0]`` is the intercept and ``coefficients[1:]`` are the\n",
"    per-feature weights, in the column order of ``X``.\n",
"    \"\"\"\n",
"\n",
"    @staticmethod\n",
"    def _add_intercept(X):\n",
"        \"\"\"Return the 2-D array ``X`` with a leading column of ones.\"\"\"\n",
"        X = np.asarray(X)\n",
"        return np.hstack((np.ones((X.shape[0], 1)), X))\n",
"\n",
"    def predict(self, X):\n",
"        \"\"\"Return ``intercept + X @ weights`` for every row of ``X``.\n",
"\n",
"        Raises:\n",
"            ValueError: if ``fit`` has not been called yet.\n",
"        \"\"\"\n",
"        if self.coefficients is None:\n",
"            raise ValueError(\"fit must be called before predict\")\n",
"        return self._add_intercept(X) @ self.coefficients\n",
"\n",
"\n",
"class LinearRegression(_LinearModel):\n",
"    \"\"\"Ordinary least squares regression with an intercept.\"\"\"\n",
"\n",
"    def fit(self, X, y):\n",
"        \"\"\"Store the coefficients that minimise the squared error on ``(X, y)``.\"\"\"\n",
"        X = self._add_intercept(X)\n",
"        # lstsq never forms inv(X.T @ X), so collinear (rank-deficient) features\n",
"        # give the minimum-norm solution instead of a LinAlgError or garbage.\n",
"        self.coefficients = np.linalg.lstsq(X, np.asarray(y), rcond=None)[0]\n",
"\n",
"\n",
"class RidgeRegression(_LinearModel):\n",
"    \"\"\"Least squares with an L2 penalty ``alpha * ||weights||^2`` on the feature weights.\"\"\"\n",
"\n",
"    def __init__(self, alpha):\n",
"        super().__init__()\n",
"        self.alpha = alpha\n",
"\n",
"    def fit(self, X, y):\n",
"        \"\"\"Store the closed-form ridge solution for ``(X, y)``.\"\"\"\n",
"        X = self._add_intercept(X)\n",
"        penalty = self.alpha * np.eye(X.shape[1])\n",
"        penalty[0, 0] = 0  # the intercept is not shrunk\n",
"        # Solve the regularised normal equations directly rather than through an\n",
"        # explicit matrix inverse.\n",
"        self.coefficients = np.linalg.solve(X.T @ X + penalty, X.T @ np.asarray(y))\n",
"\n",
"\n",
"class LassoRegression(_LinearModel):\n",
"    \"\"\"Least squares with an L1 penalty, fitted by proximal gradient descent.\n",
"\n",
"    Minimises ``0.5 * ||X @ w - y||^2 + alpha * ||w[1:]||_1``; as in\n",
"    ``RidgeRegression`` the intercept ``w[0]`` is not penalised.\n",
"\n",
"    Args:\n",
"        alpha: strength of the L1 penalty.\n",
"        num_iters: number of gradient steps.\n",
"        lr: step size. The gradient is summed (not averaged) over the samples,\n",
"            so ``lr`` has to shrink as the dataset grows.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, alpha, num_iters=1000, lr=0.01):\n",
"        super().__init__()\n",
"        self.alpha = alpha\n",
"        self.num_iters = num_iters\n",
"        self.lr = lr\n",
"\n",
"    def fit(self, X, y):\n",
"        \"\"\"Run ``num_iters`` soft-thresholded gradient steps on 1-D targets ``y``.\"\"\"\n",
"        X = self._add_intercept(X)\n",
"        y = np.asarray(y, dtype=float)\n",
"        # Per-coefficient shrinkage applied after each step; 0 for the intercept.\n",
"        threshold = np.full(X.shape[1], self.lr * self.alpha)\n",
"        threshold[0] = 0.0\n",
"        # Start from zero so repeated fits give identical results.\n",
"        weights = np.zeros(X.shape[1])\n",
"        for _ in range(self.num_iters):\n",
"            stepped = weights - self.lr * (X.T @ (X @ weights - y))\n",
"            # Soft-thresholding sets small weights to exactly 0, which a plain\n",
"            # sign-based sub-gradient step never does.\n",
"            weights = np.sign(stepped) * np.maximum(np.abs(stepped) - threshold, 0.0)\n",
"        self.coefficients = weights"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"Data preprocessing and data loading"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def preprocess(df):\n",
"    \"\"\"Clean a numeric DataFrame and build a z-score-normalised copy of it.\n",
"\n",
"    Steps:\n",
"      1. Replace missing values with the column mean.\n",
"      2. Keep only rows whose values are all less than 3 standard deviations\n",
"         from their column mean (every column is checked, \"target\" included).\n",
"      3. Standardise every column except \"target\" to zero mean and unit\n",
"         standard deviation, using the rows that survived step 2.\n",
"\n",
"    A constant column (standard deviation 0) is treated as having z-score 0,\n",
"    so it neither discards rows nor becomes NaN when normalised.\n",
"\n",
"    Returns:\n",
"        (df_clean, df_normalized): the cleaned frame in original units and its\n",
"        normalised counterpart.\n",
"    \"\"\"\n",
"    # Handle missing values\n",
"    df = df.fillna(df.mean())\n",
"\n",
"    # Remove outliers. With a std of 0 the z-scores would be NaN (0 / 0); NaN\n",
"    # fails the `< 3` test and would drop every row, so divide by 1 instead.\n",
"    column_std = df.std()\n",
"    column_std = column_std.mask(column_std == 0, 1.0)\n",
"    z_scores = ((df - df.mean()) / column_std).abs()\n",
"    df = df[(z_scores < 3).all(axis=1)]\n",
"\n",
"    df_normalized = df.copy()\n",
"\n",
"    # Normalize the data using z-score normalization except for the target column\n",
"    for column in df.columns:\n",
"        if column == \"target\":\n",
"            continue\n",
"        centered = df[column] - df[column].mean()\n",
"        spread = df[column].std()\n",
"        # Same zero-variance guard as above: the column becomes 0 instead of NaN.\n",
"        df_normalized[column] = centered / (spread if spread != 0 else 1.0)\n",
"\n",
"    return df, df_normalized\n",
"\n",
"def train_test_split(X, y, test_size=0.2, random_state=None):\n",
"    \"\"\"Shuffle ``X`` and ``y`` together and split them into train and test parts.\n",
"\n",
"    Args:\n",
"        X: array-like of samples, indexed along the first axis.\n",
"        y: array-like of targets with the same length as ``X``.\n",
"        test_size: fraction of the samples that goes to the test part.\n",
"        random_state: optional seed for a reproducible shuffle. When None the\n",
"            global NumPy RNG is used, so ``np.random.seed`` still controls it.\n",
"\n",
"    Returns:\n",
"        X_train, X_test, y_train, y_test\n",
"\n",
"    Raises:\n",
"        ValueError: if ``X`` and ``y`` have different lengths.\n",
"    \"\"\"\n",
"    X = np.asarray(X)\n",
"    y = np.asarray(y)\n",
"    if X.shape[0] != y.shape[0]:\n",
"        raise ValueError(\"X and y must contain the same number of samples\")\n",
"\n",
"    idx = np.arange(X.shape[0])\n",
"    if random_state is None:\n",
"        np.random.shuffle(idx)\n",
"    else:\n",
"        np.random.default_rng(random_state).shuffle(idx)\n",
"\n",
"    # One shared permutation keeps every row of X paired with its target.\n",
"    split = int((1 - test_size) * X.shape[0])\n",
"    train_idx, test_idx = idx[:split], idx[split:]\n",
"    return X[train_idx], X[test_idx], y[train_idx], y[test_idx]\n",
"\n",
"def load_data(normalize=True, path=\"diabetes.csv\"):\n",
"    \"\"\"Read the diabetes CSV, preprocess it and split it into train and test sets.\n",
"\n",
"    Args:\n",
"        normalize: if True the features come from the z-score-normalised frame\n",
"            returned by ``preprocess``; otherwise from the cleaned frame.\n",
"        path: location of the CSV file; it must contain a \"target\" column.\n",
"\n",
"    Returns:\n",
"        X_train, X_test, y_train, y_test, df -- ``df`` is the cleaned frame\n",
"        returned by ``preprocess`` (not the normalised one).\n",
"    \"\"\"\n",
"    # Load the diabetes dataset\n",
"    df = pd.read_csv(path)\n",
"\n",
"    # Preprocess the dataset\n",
"    df, df_norm = preprocess(df)\n",
"\n",
"    # Pick the requested variant, then separate features from the target\n",
"    source = df_norm if normalize else df\n",
"    X = source.drop(\"target\", axis=1).values\n",
"    y = source[\"target\"].values\n",
"\n",
"    # Split the dataset into train and test sets\n",
"    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)\n",
"    return X_train, X_test, y_train, y_test, df"
]
},
{
"cell_type": "code",
"execution_count": 3,
"outputs": [
{
"data": {
"text/plain": " age sex bmi bp s1 s2 s3 \\\n0 0.794887 1.061173 1.357096 0.459459 -0.917834 -0.734476 -0.958901 \n1 -0.038221 -0.940162 -1.095193 -0.557425 -0.148672 -0.395182 1.714481 \n2 1.779468 1.061173 0.983414 -0.121617 -0.947417 -0.720904 -0.708271 \n3 -1.855910 -0.940162 -0.231053 -0.775328 0.295075 0.561626 -0.791815 \n4 0.113253 -0.940162 -0.768221 0.459459 0.117576 0.358049 0.210704 \n\n s4 s5 s6 target \n0 -0.035628 0.434041 -0.356981 151.0 \n1 -0.856638 -1.429397 -1.923328 75.0 \n2 -0.035628 0.074059 -0.531020 141.0 \n3 0.785382 0.492755 -0.182943 206.0 \n4 -0.035628 -0.661884 -0.966116 135.0 ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>age</th>\n <th>sex</th>\n <th>bmi</th>\n <th>bp</th>\n <th>s1</th>\n <th>s2</th>\n <th>s3</th>\n <th>s4</th>\n <th>s5</th>\n <th>s6</th>\n <th>target</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0.794887</td>\n <td>1.061173</td>\n <td>1.357096</td>\n <td>0.459459</td>\n <td>-0.917834</td>\n <td>-0.734476</td>\n <td>-0.958901</td>\n <td>-0.035628</td>\n <td>0.434041</td>\n <td>-0.356981</td>\n <td>151.0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>-0.038221</td>\n <td>-0.940162</td>\n <td>-1.095193</td>\n <td>-0.557425</td>\n <td>-0.148672</td>\n <td>-0.395182</td>\n <td>1.714481</td>\n <td>-0.856638</td>\n <td>-1.429397</td>\n <td>-1.923328</td>\n <td>75.0</td>\n </tr>\n <tr>\n <th>2</th>\n <td>1.779468</td>\n <td>1.061173</td>\n <td>0.983414</td>\n <td>-0.121617</td>\n <td>-0.947417</td>\n <td>-0.720904</td>\n <td>-0.708271</td>\n <td>-0.035628</td>\n <td>0.074059</td>\n <td>-0.531020</td>\n <td>141.0</td>\n </tr>\n <tr>\n <th>3</th>\n <td>-1.855910</td>\n <td>-0.940162</td>\n <td>-0.231053</td>\n <td>-0.775328</td>\n <td>0.295075</td>\n <td>0.561626</td>\n <td>-0.791815</td>\n <td>0.785382</td>\n <td>0.492755</td>\n <td>-0.182943</td>\n <td>206.0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>0.113253</td>\n <td>-0.940162</td>\n <td>-0.768221</td>\n <td>0.459459</td>\n <td>0.117576</td>\n <td>0.358049</td>\n <td>0.210704</td>\n <td>-0.035628</td>\n <td>-0.661884</td>\n <td>-0.966116</td>\n <td>135.0</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the diabetes dataset\n",
"df = pd.read_csv(\"diabetes.csv\")\n",
"# Preprocess the dataset\n",
"df, df_norm = preprocess(df)\n",
"# A cell only renders its last expression, so show both previews explicitly\n",
"display(df.head())\n",
"display(df_norm.head())"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"Load the data"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 4,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"X_train: (353, 10)\n",
"X_test: (89, 10)\n",
"y_train: (353,)\n",
"y_test: (89,)\n"
]
}
],
"source": [
"from sklearn.datasets import load_diabetes\n",
"\n",
"# train_test_split shuffles with the global NumPy RNG; seeding it here makes the\n",
"# split, and every metric reported below, identical on each Restart & Run All.\n",
"SEED = 42\n",
"np.random.seed(SEED)\n",
"\n",
"# Load the scaled diabetes dataset bundled with scikit-learn\n",
"data = load_diabetes(scaled=True)\n",
"\n",
"# Split the data into train and test sets\n",
"X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.2)\n",
"print(\"X_train:\", X_train.shape)\n",
"print(\"X_test:\", X_test.shape)\n",
"print(\"y_train:\", y_train.shape)\n",
"print(\"y_test:\", y_test.shape)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"Fit the models"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 5,
"outputs": [],
"source": [
"# Ordinary least squares baseline, trained on the training split\n",
"linear_regression = LinearRegression()\n",
"linear_regression.fit(X=X_train, y=y_train)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 6,
"outputs": [],
"source": [
"# Ridge model with penalty strength alpha=1, trained on the training split\n",
"ridge_regression = RidgeRegression(alpha=1)\n",
"ridge_regression.fit(X=X_train, y=y_train)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 7,
"outputs": [],
"source": [
"# Lasso model: penalty strength, number of gradient steps and step size\n",
"lasso_regression = LassoRegression(alpha=1, num_iters=10000, lr=0.001)\n",
"lasso_regression.fit(X=X_train, y=y_train)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"Evaluate the models"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 8,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Linear Regression MSE train: 2886.03 test: 2801.42\n",
"Ridge Regression MSE train: 3417.29 test: 3129.15\n",
"Lasso Regression MSE train: 2905.98 test: 2812.49\n",
"Linear Regression RMSE train: 53.72 test: 52.93\n",
"Ridge Regression RMSE train: 58.46 test: 55.94\n",
"Lasso Regression RMSE train: 53.91 test: 53.03\n",
"Linear Regression R2 train: 0.51 test: 0.54\n",
"Ridge Regression R2 train: 0.42 test: 0.49\n",
"Lasso Regression R2 train: 0.51 test: 0.54\n",
"Linear Regression features sorted by their coefficients:\n",
"s1: -822.75\n",
"s5: 765.40\n",
"bmi: 514.60\n",
"s2: 424.92\n",
"bp: 355.26\n",
"sex: -241.22\n",
"s4: 230.86\n",
"s3: 129.59\n",
"s6: 40.86\n",
"age: -10.93\n",
"Ridge Regression features sorted by their coefficients:\n",
"bmi: 283.92\n",
"s5: 238.87\n",
"bp: 195.46\n",
"s3: -142.91\n",
"s4: 106.92\n",
"s6: 93.55\n",
"sex: -63.98\n",
"s2: -30.82\n",
"age: 24.08\n",
"s1: 0.97\n",
"Lasso Regression features sorted by their coefficients:\n",
"bmi: 528.98\n",
"s5: 494.25\n",
"bp: 348.14\n",
"sex: -230.35\n",
"s3: -197.76\n",
"s2: -137.48\n",
"s4: 127.48\n",
"s1: -101.61\n",
"s6: 40.95\n",
"age: -7.39\n",
"Linear Regression number of non-zero coefficients: 11\n",
"Ridge Regression number of non-zero coefficients: 11\n",
"Lasso Regression number of non-zero coefficients: 11\n"
]
}
],
"source": [
"from sklearn.metrics import r2_score\n",
"\n",
"\n",
"def mse(y_true, y_pred):\n",
"    \"\"\"Mean squared error between targets and predictions.\"\"\"\n",
"    return np.mean((y_true - y_pred) ** 2)\n",
"\n",
"\n",
"def rmse(y_true, y_pred):\n",
"    \"\"\"Root mean squared error (same units as the target).\"\"\"\n",
"    return np.sqrt(mse(y_true, y_pred))\n",
"\n",
"\n",
"# Fitted models keyed by the label used in the printed report\n",
"models = {\n",
"    \"Linear Regression\": linear_regression,\n",
"    \"Ridge Regression\": ridge_regression,\n",
"    \"Lasso Regression\": lasso_regression,\n",
"}\n",
"\n",
"# Predict once per model and reuse the (train, test) predictions for every metric\n",
"predictions = {\n",
"    name: (model.predict(X_train), model.predict(X_test))\n",
"    for name, model in models.items()\n",
"}\n",
"\n",
"# One group of lines per metric, one line per model, two decimal digits\n",
"for label, metric in ((\"MSE\", mse), (\"RMSE\", rmse), (\"R2\", r2_score)):\n",
"    for name, (pred_train, pred_test) in predictions.items():\n",
"        print(\"{} {} train: {:.2f} test: {:.2f}\".format(\n",
"            name, label, metric(y_train, pred_train), metric(y_test, pred_test)))\n",
"\n",
"# Name the weights after the columns of the dataset the models were trained on\n",
"# (scikit-learn's `data`) rather than after the separately loaded CSV frame.\n",
"feature_names = data.feature_names\n",
"\n",
"for name, model in models.items():\n",
"    print(f\"{name} features sorted by their coefficients:\")\n",
"    # coefficients[0] is the intercept, so the feature weights start at index 1\n",
"    ranked = sorted(zip(feature_names, model.coefficients[1:]), key=lambda pair: abs(pair[1]), reverse=True)\n",
"    for feature, coef in ranked:\n",
"        print(f\"{feature}: {coef:.2f}\")\n",
"\n",
"# print the number of non-zero coefficients (the intercept is included in the count)\n",
"for name, model in models.items():\n",
"    print(f\"{name} number of non-zero coefficients:\", np.count_nonzero(model.coefficients))"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 32,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false,
"pycharm": {
"is_executing": true
}
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}