### Solution for Assignment 4 of the course "Introduction to Machine Learning" at the University of Leoben.
##### Author: Fotios Lygerakis
##### Semester: SS 2022/2023

In [2]:
import pandas as pd
import numpy as np

class Predictor:
    def __init__(self):
        self.coefficients = None

    def fit(self, X, y):
        pass

    def predict(self, X):
        pass

class LinearRegression(Predictor):
    def fit(self, X, y):
        X = np.hstack((np.ones((X.shape[0], 1)), X))
        self.coefficients = np.linalg.inv(X.T @ X) @ X.T @ y

    def predict(self, X):
        X = np.hstack((np.ones((X.shape[0], 1)), X))
        return X @ self.coefficients

class RidgeRegression(Predictor):
    def __init__(self, alpha):
        super().__init__()
        self.alpha = alpha

    def fit(self, X, y):
        X = np.hstack((np.ones((X.shape[0], 1)), X))
        I = np.eye(X.shape[1])
        I[0, 0] = 0
        self.coefficients = np.linalg.inv(X.T @ X + self.alpha * I) @ X.T @ y

    def predict(self, X):
        X = np.hstack((np.ones((X.shape[0], 1)), X))
        return X @ self.coefficients

class LassoRegression(Predictor):
    def __init__(self, alpha, num_iters=1000, lr=0.01):
        super().__init__()
        self.alpha = alpha
        self.num_iters = num_iters
        self.lr = lr

    def fit(self, X, y):
        X = np.hstack((np.ones((X.shape[0], 1)), X))
        self.coefficients = np.random.randn(X.shape[1])
        for _ in range(self.num_iters):
            self.coefficients -= self.lr * (X.T @ (X @ self.coefficients - y) + self.alpha * np.sign(self.coefficients))

    def predict(self, X):
        X = np.hstack((np.ones((X.shape[0], 1)), X))
        return X @ self.coefficients

Data Preprocessing and data loading

In [3]:
def preprocess(df):
    # Handle missing values
    df = df.fillna(df.mean())

    # Remove outliers
    z_scores = np.abs((df - df.mean()) / df.std())
    df = df[(z_scores < 3).all(axis=1)]

    df_nomralized = df.copy()

    # Normalize the data using z-score normalization except for the target column
    for column in df.columns:
        if column == "target":
            continue
        df_nomralized[column] = (df_nomralized[column] - df_nomralized[column].mean()) / df_nomralized[column].std()

    return df, df_nomralized


def train_test_split(X, y, test_size=0.2):
    # Split the dataset into train and test sets
    idx = np.arange(X.shape[0])
    np.random.shuffle(idx)
    X = X[idx]
    y = y[idx]
    split = int((1 - test_size) * X.shape[0])
    X_train, X_test = X[:split], X[split:]
    y_train, y_test = y[:split], y[split:]
    return X_train, X_test, y_train, y_test

def load_data(normalize=True):
    # Load the diabetes dataset
    df = pd.read_csv("diabetes.csv")

    # Preprocess the dataset
    df, df_norm = preprocess(df)

    # Split the dataset into train and test sets
    if normalize:
        X = df_norm.drop("target", axis=1).values
        y = df_norm["target"].values
    else:
        X = df.drop("target", axis=1).values
        y = df["target"].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    return X_train, X_test, y_train, y_test, df

In [4]:
# Load the diabetes dataset
df = pd.read_csv("diabetes.csv")
# Preprocess the dataset
df, df_norm = preprocess(df)
df.head()
df_norm.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.794887,1.061173,1.357096,0.459459,-0.917834,-0.734476,-0.958901,-0.035628,0.434041,-0.356981,151.0
1,-0.038221,-0.940162,-1.095193,-0.557425,-0.148672,-0.395182,1.714481,-0.856638,-1.429397,-1.923328,75.0
2,1.779468,1.061173,0.983414,-0.121617,-0.947417,-0.720904,-0.708271,-0.035628,0.074059,-0.53102,141.0
3,-1.85591,-0.940162,-0.231053,-0.775328,0.295075,0.561626,-0.791815,0.785382,0.492755,-0.182943,206.0
4,0.113253,-0.940162,-0.768221,0.459459,0.117576,0.358049,0.210704,-0.035628,-0.661884,-0.966116,135.0


Load the data

In [10]:
from sklearn.datasets import load_diabetes

normalize = True
# Load the data
X_train, X_test, y_train, y_test, df = load_data(normalize=normalize)
print("X_train:", X_train.shape)
print("X_test:", X_test.shape)
print("y_train:", y_train.shape)
print("y_test:", y_test.shape)

X_train: (344, 10)
X_test: (86, 10)
y_train: (344,)
y_test: (86,)


Fit the models

In [11]:
# Fit the linear regression
linear_regression = LinearRegression()
linear_regression.fit(X_train, y_train)

In [12]:
# Fit the ridge regression
ridge_regression = RidgeRegression(alpha=1)
ridge_regression.fit(X_train, y_train)

In [13]:
# Fit the lasso regression
lasso_regression = LassoRegression(alpha=1, num_iters=10000, lr=0.001)
lasso_regression.fit(X_train, y_train)

Evaluate the models

In [14]:
from sklearn.metrics import r2_score

y_pred_linear = linear_regression.predict(X_test)
y_pred_ridge = ridge_regression.predict(X_test)
y_pred_lasso = lasso_regression.predict(X_test)

y_pred_train_linear = linear_regression.predict(X_train)
y_pred_train_ridge = ridge_regression.predict(X_train)
y_pred_train_lasso = lasso_regression.predict(X_train)


# Calculate the mean squared error
mse_linear_test = np.mean((y_test - y_pred_linear) ** 2)
mse_linear_train = np.mean((y_train - y_pred_train_linear) ** 2)
mse_ridge_test = np.mean((y_test - y_pred_ridge) ** 2)
mse_ridge_train = np.mean((y_train - y_pred_train_ridge) ** 2)
mse_lasso_test = np.mean((y_test - y_pred_lasso) ** 2)
mse_lasso_train = np.mean((y_train - y_pred_train_lasso) ** 2)

# Print the mean squared error in a compact way keeping two decimal digits
print("Linear Regression MSE train: {:.2f} test: {:.2f}".format(mse_linear_train, mse_linear_test))
print("Ridge Regression MSE train: {:.2f} test: {:.2f}".format(mse_ridge_train, mse_ridge_test))
print("Lasso Regression MSE train: {:.2f} test: {:.2f}".format(mse_lasso_train, mse_lasso_test))

# calculate the root squared error
rmse_linear_test = np.sqrt(mse_linear_test)
rmse_linear_train = np.sqrt(mse_linear_train)
rmse_ridge_test = np.sqrt(mse_ridge_test)
rmse_ridge_train = np.sqrt(mse_ridge_train)
rmse_lasso_test = np.sqrt(mse_lasso_test)
rmse_lasso_train = np.sqrt(mse_lasso_train)

# print the root squared error in a compact way keeping two decimal digits
print("Linear Regression RMSE train: {:.2f} test: {:.2f}".format(rmse_linear_train, rmse_linear_test))
print("Ridge Regression RMSE train: {:.2f} test: {:.2f}".format(rmse_ridge_train, rmse_ridge_test))
print("Lasso Regression RMSE train: {:.2f} test: {:.2f}".format(rmse_lasso_train, rmse_lasso_test))

# calculate the r-squared score
r2_linear_test = r2_score(y_test, y_pred_linear)
r2_linear_train = r2_score(y_train, y_pred_train_linear)
r2_ridge_test = r2_score(y_test, y_pred_ridge)
r2_ridge_train = r2_score(y_train, y_pred_train_ridge)
r2_lasso_test = r2_score(y_test, y_pred_lasso)
r2_lasso_train = r2_score(y_train, y_pred_train_lasso)

# print the r-squared score in a compact way keeping two decimal digits
print("Linear Regression R2 train: {:.2f} test: {:.2f}".format(r2_linear_train, r2_linear_test))
print("Ridge Regression R2 train: {:.2f} test: {:.2f}".format(r2_ridge_train, r2_ridge_test))
print("Lasso Regression R2 train: {:.2f} test: {:.2f}".format(r2_lasso_train, r2_lasso_test))

print("Linear Regression features sorted by their coefficients:")
for feature, coef in sorted(list(zip(df.columns[:-1], linear_regression.coefficients[1:])), key=lambda x: abs(x[1]), reverse=True):
    print(f"{feature}: {coef:.2f}")
print("Ridge Regression features sorted by their coefficients:")
for feature, coef in sorted(list(zip(df.columns[:-1], ridge_regression.coefficients[1:])), key=lambda x: abs(x[1]), reverse=True):
    print(f"{feature}: {coef:.2f}")
print("Lasso Regression features sorted by their coefficients:")
for feature, coef in sorted(list(zip(df.columns[:-1], lasso_regression.coefficients[1:])), key=lambda x: abs(x[1]), reverse=True):
    print(f"{feature}: {coef:.2f}")

# print the number of non-zero coefficients
print("Linear Regression number of non-zero coefficients:", len(linear_regression.coefficients[linear_regression.coefficients != 0]))
print("Ridge Regression number of non-zero coefficients:", len(ridge_regression.coefficients[ridge_regression.coefficients != 0]))
print("Lasso Regression number of non-zero coefficients:", len(lasso_regression.coefficients[lasso_regression.coefficients != 0]))


Linear Regression MSE train: 2983.19 test: 2487.62
Ridge Regression MSE train: 2983.79 test: 2495.37
Lasso Regression MSE train: 2983.19 test: 2487.98
Linear Regression RMSE train: 54.62 test: 49.88
Ridge Regression RMSE train: 54.62 test: 49.95
Lasso Regression RMSE train: 54.62 test: 49.88
Linear Regression R2 train: 0.50 test: 0.49
Ridge Regression R2 train: 0.50 test: 0.49
Lasso Regression R2 train: 0.50 test: 0.49
Linear Regression features sorted by their coefficients:
s5: 28.62
bmi: 23.95
s1: -23.18
bp: 17.64
s2: 14.38
sex: -12.17
s4: 4.23
s3: -4.03
s6: 2.06
age: 0.86
Ridge Regression features sorted by their coefficients:
s5: 26.16
bmi: 23.91
bp: 17.60
s1: -16.67
sex: -12.13
s2: 9.11
s3: -6.64
s4: 3.73
s6: 2.12
age: 0.87
Lasso Regression features sorted by their coefficients:
s5: 28.48
bmi: 23.95
s1: -22.80
bp: 17.64
s2: 14.07
sex: -12.17
s4: 4.19
s3: -4.18
s6: 2.06
age: 0.86
Linear Regression number of non-zero coefficients: 11
Ridge Regression number of non-zero coefficients: