ML_course/assignment 2/iml_assignment2a_solved.ipynb

250 lines
121 KiB
Plaintext
Raw Normal View History

2023-03-31 11:53:42 +00:00
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
2023-04-27 13:24:21 +00:00
{
"cell_type": "markdown",
"source": [
"### Solution for Assignment 2 of the course \"Introduction to Machine Learning\" at the University of Leoben.\n",
"##### Author: Fotios Lygerakis\n",
"##### Semester: SS 2022/2023"
],
"metadata": {
"collapsed": false
}
},
2023-03-31 11:53:42 +00:00
{
"cell_type": "markdown",
"source": [
"Importing the libraries"
],
"metadata": {
"id": "sCd8w2OwBTLm"
}
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"id": "kWsq3C9zxvlr"
},
"outputs": [],
"source": [
"import csv\n",
"import math\n",
"from matplotlib import pyplot as plt"
]
},
{
"cell_type": "markdown",
"source": [
"Create the BasicStatistics class"
],
"metadata": {
"id": "Bw9qskz1BXha"
}
},
{
"cell_type": "code",
"source": [
"class BasicStatistics:\n",
"    \"\"\"\n",
"    Basic descriptive statistics for a one-dimensional numeric dataset\n",
"\n",
"    Attributes:\n",
"        dataset: list of numbers\n",
"\n",
"    Methods:\n",
"        mean: calculates the mean of the dataset\n",
"        median: calculates the median of the dataset\n",
"        variance: calculates the (population) variance of the dataset\n",
"        standard_deviation: calculates the (population) standard deviation of the dataset\n",
"        normalize: standardizes the dataset to zero mean and unit standard deviation\n",
"        plot: plots the dataset and saves the figure as Basic_Statistics.png\n",
"    \"\"\"\n",
"    def __init__(self, dataset):\n",
"        \"\"\"\n",
"        Constructor for the BasicStatistics class\n",
"        :param dataset: list of numbers\n",
"        \"\"\"\n",
"        self.dataset = dataset\n",
"\n",
"    def mean(self):\n",
"        \"\"\"\n",
"        Calculates the arithmetic mean of the dataset\n",
"        :return: mean of the dataset\n",
"        :raises ZeroDivisionError: if the dataset is empty\n",
"        \"\"\"\n",
"        return sum(self.dataset) / len(self.dataset)\n",
"\n",
"    def median(self):\n",
"        \"\"\"\n",
"        Calculates the median of the dataset without modifying it\n",
"        :return: the middle value of the sorted data, or the average of the two\n",
"                 middle values when the number of samples is even\n",
"        :raises IndexError: if the dataset is empty\n",
"        \"\"\"\n",
"        # sorted() returns a new list, so self.dataset keeps its original order\n",
"        ordered = sorted(self.dataset)\n",
"        middle = len(ordered) // 2\n",
"        if len(ordered) % 2 == 0:\n",
"            return (ordered[middle - 1] + ordered[middle]) / 2\n",
"        return ordered[middle]\n",
"\n",
"    def variance(self):\n",
"        \"\"\"\n",
"        Calculates the population variance of the dataset\n",
"        (sum of squared deviations from the mean, divided by the number of samples)\n",
"        :return: variance of the dataset\n",
"        :raises ZeroDivisionError: if the dataset is empty\n",
"        \"\"\"\n",
"        # compute the mean once; calling self.mean() for every element is quadratic\n",
"        mean = self.mean()\n",
"        return sum((x - mean) ** 2 for x in self.dataset) / len(self.dataset)\n",
"\n",
"    def normalize(self):\n",
"        \"\"\"\n",
"        Standardizes the dataset (z-score): subtracts the mean from every value\n",
"        and divides by the standard deviation\n",
"        :return: normalized dataset as a new list (empty list for an empty dataset)\n",
"        :raises ZeroDivisionError: if the standard deviation is zero\n",
"        \"\"\"\n",
"        if len(self.dataset) == 0:\n",
"            return []\n",
"        # mean and standard deviation are loop invariants, so compute them once\n",
"        mean = self.mean()\n",
"        std = self.standard_deviation()\n",
"        return [(x - mean) / std for x in self.dataset]\n",
"\n",
"    def standard_deviation(self):\n",
"        \"\"\"\n",
"        Calculates the standard deviation of the dataset\n",
"        :return: standard deviation of the dataset (square root of the variance)\n",
"        :raises ZeroDivisionError: if the dataset is empty\n",
"        \"\"\"\n",
"        return math.sqrt(self.variance())\n",
"\n",
"    def plot(self):\n",
"        \"\"\"\n",
"        Plots the dataset as a histogram, a scatter plot of the raw values and a\n",
"        scatter plot of the standardized values, saves the figure as\n",
"        Basic_Statistics.png in the working directory and shows it\n",
"        :return:\n",
"        \"\"\"\n",
"        # get the data from the instance, not from a notebook-level variable\n",
"        data = self.dataset\n",
"        mean = self.mean()\n",
"        median = self.median()\n",
"        variance = self.variance()\n",
"        std = math.sqrt(variance)\n",
"        data_norm = self.normalize()\n",
"        # get the number of bins: one per 50 samples, but never fewer than one\n",
"        nrBins = max(1, len(data) // 50)\n",
"\n",
"        # set up figure and plot grid\n",
"        fig = plt.figure(figsize=(10, 8))\n",
"        grid = plt.GridSpec(2, 2)\n",
"        ax1 = plt.subplot(grid[:1, :])\n",
"        ax2 = plt.subplot(grid[1:, :1])\n",
"        ax3 = plt.subplot(grid[1:, 1:])\n",
"\n",
"        # set titles\n",
"        fig.suptitle('Basic Statistics')\n",
"        ax1.set_title('Data Distribution')\n",
"        ax2.set_title('Raw Data')\n",
"        ax3.set_title('Normalized Data')\n",
"\n",
"        # set axes\n",
"        ax1.set_xlabel('Values')\n",
"        ax1.set_ylabel('Frequency')\n",
"        ax2.set_xlabel('Sample')\n",
"        ax2.set_ylabel('Value')\n",
"        ax3.set_xlabel('Sample')\n",
"        ax3.set_ylabel('Standardized Value')\n",
"\n",
"        # data plotting\n",
"        ax1.hist(data, bins=nrBins, density=True, label='Histogram')\n",
"        # axvline spans the full axis height, so no data-specific ymax is needed\n",
"        ax1.axvline(x=mean, color='r', ls='--', label='Mean')\n",
"        ax1.axvline(x=median, color='y', ls='--', label='Median')\n",
"        ax1.axvline(x=mean + std, color='g', ls='--', label='Standard deviation')\n",
"        ax1.axvline(x=mean - std, color='g', ls='--')\n",
"        ax1.legend()\n",
"        # raw data plotting\n",
"        x = list(range(1, len(data) + 1))\n",
"        ax2.scatter(x, data, s=3, label='Data')\n",
"        ax2.hlines(y=mean, xmin=0, xmax=len(data), colors='r', ls='--', label='Mean')\n",
"        ax2.hlines(y=mean + std, xmin=0, xmax=len(data), colors='g', ls='--', label='Standard deviation')\n",
"        ax2.hlines(y=mean - std, xmin=0, xmax=len(data), colors='g', ls='--')\n",
"        ax2.legend()\n",
"        # normalized data plotting: standardized data has mean 0 and standard deviation 1\n",
"        ax3.scatter(x, data_norm, s=3, label='Data')\n",
"        ax3.hlines(y=0, xmin=0, xmax=len(data), colors='r', ls='--', label='Mean')\n",
"        ax3.hlines(y=0 + 1, xmin=0, xmax=len(data), colors='g', ls='--', label='Standard deviation')\n",
"        ax3.hlines(y=0 - 1, xmin=0, xmax=len(data), colors='g', ls='--')\n",
"        ax3.legend()\n",
"\n",
"        # save the figure before showing it, so the file is written even if the\n",
"        # figure window is closed by the user\n",
"        fig.savefig(\"Basic_Statistics.png\", format=\"png\")\n",
"        plt.show()\n"
],
"metadata": {
"id": "cqSJ_htS_04g"
},
"execution_count": 3,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"Demonstrate the BasicStatistics class on the data.csv file"
],
"metadata": {
"id": "xeMDhdMqBd2n"
}
},
{
"cell_type": "code",
"source": [
"# define the path to the data file\n",
"path = 'data.csv'\n",
"# open the file and read the data (the csv module expects files opened with newline='')\n",
"with open(path, 'r', newline='') as f:\n",
"    # create a csv reader\n",
"    reader = csv.reader(f)\n",
"    # convert the first column to a list of floats, skipping blank lines\n",
"    data = [float(row[0]) for row in reader if row]\n",
"# the file is closed at this point; the rest works on the in-memory list\n",
"# create a BasicStatistics object\n",
"bs = BasicStatistics(data)\n",
"# print the mean, median, and variance\n",
"print(f'Mean: {bs.mean():.2f}')\n",
"print(f'Median: {bs.median():.2f}')\n",
"print(f'Variance: {bs.variance():.2f}')\n",
"# plot the data\n",
"bs.plot()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 606
},
"id": "UQVfNd_j_29Z",
"outputId": "e3cab288-537c-4937-b8e0-a7473770400d"
},
2023-03-31 13:42:48 +00:00
"execution_count": 4,
2023-03-31 11:53:42 +00:00
"outputs": [
{
"name": "stdout",
2023-03-31 13:42:48 +00:00
"output_type": "stream",
2023-03-31 11:53:42 +00:00
"text": [
"Mean: 4.08\n",
"Median: 4.13\n",
"Variance: 4.15\n"
]
},
{
"data": {
2023-03-31 13:42:48 +00:00
"text/plain": "<Figure size 1000x800 with 3 Axes>",
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA1gAAAL3CAYAAACXouQnAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdeVwU5R8H8M/sAssNHpyKgooHqSCi5pWaGJ55lHmmgEdqloZYYt5WeIs3agmeaZqZZWpEUmbmfeR9gXgA3iAgLOzO7w9ifq6Acgwui5/367UvZ5955pnv7DjMfneeeUYQRVEEERERERERlZhC3wEQERERERGVF0ywiIiIiIiIZMIEi4iIiIiISCZMsIiIiIiIiGTCBIuIiIiIiEgmTLCIiIiIiIhkwgSLiIiIiIhIJkywiIiIiIiIZMIEi4iIiIiISCZMsIiI6KWJi4uDIAiIjIzUdyiymTZtGgRBkK09f39/uLq6ytYeERG9XEywiIheQZGRkRAEQedlb2+Pdu3aYffu3foOr9Du3r2LMWPGoG7dujAzM4O9vT2aNm2Kzz77DKmpqVK9TZs2ISwsrNjrSU9Px7Rp0xATE1PyoAHcvn0b06ZNw8mTJ2Vpj4iIyg5BFEVR30EQEdHLFRkZiYCAAMyYMQNubm4QRRFJSUmIjIzE2bNn8dNPP6Fr166yr1cURWRmZsLY2BhKpbJEbT148ACNGjVCSkoKAgMDUbduXdy/fx+nT5/Gzz//jNOnT0tXgrp27YozZ84gLi6uWOu6d+8e7OzsMHXqVEybNk1nXnZ2NrKzs2Fqalro9o4ePYomTZogIiIC/v7+OvOysrKg1WqhUqmKFSsREemXkb4DICIi/enUqRN8fHyk90OGDIGDgwO+/fbbUkmwBEEoUiLyPN988w3i4+Nx4MABtGjRQmdeSkoKTExMZFnPixgZGcHISL7TqbGxsWxtERHRy8cugkREJLG1tYWZmVmehGHevHlo0aIFKlWqBDMzMzRu3Bjbtm3Ls3xUVBRatWoFW1tbWFpaok6dOpg4caI0v6B7sC5cuID33nsPdnZ2MDMzQ506dfD5558/N9arV69CqVTi9ddfzzPP2tpaSuTatm2LXbt24fr161J3yNwrW2q1GlOmTEHjxo1hY2MDCwsLtG7dGvv27dOJ2c7ODgAwffp0qY3cK1n53YP1vM8hJiYGTZo0AQAEBARI7eV+Jvndg6XVarFo0SI0aNAApqamsLOzQ8eOHXH06NFCf/ZERPRy8AoWEdErLDk5Gffu3YMoirhz5w6WLFmC1NRUDBw4UKfeokWL8Pbbb2PAgAFQq9XYvHkzevfujZ9//hldunQBAJw9exZdu3ZFw4YNMWPGDKhUKly5cgUHDhx4bgynT59G69atYWxsjOHDh8PV1RVXr17FTz/9hC+//LLA5apXrw6NRoP169dj8ODBBdb7/PPPkZycjJs3b2LhwoUAAEtLSwA5V7q+/vpr9OvXD8OGDcPjx4/xzTffwM/PD4cPH4aXlxfs7OywYsUKjBw5Ej179kSvXr0AAA0bNsx3fS/6HOrVq4cZM2ZgypQpGD58OFq3bg0Aea7CPW3IkCGIjIxEp06dMHToUGRnZ2P//v34559/4OPjU+zPnoiISoFIRESvnIiICBFAnpdKpRIjIyPz1E9PT9d5r1arxfr164tvvvmmVLZw4UIRgHj37t0C1xsbGysCECMiIqSyN954Q7SyshKvX7+uU1er1T53GxITE0U7OzsRgFi3bl1xxIgR4qZNm8RHjx7lqdulSxexevXqecqzs7PFzMxMnbKHDx+KDg4OYmBgoFR29+5dEYA4derUPG1MnTpVfPp0WpjP4ciRI3k+h1yDBw/WifX3338XAYgff/xxnrq5n1Fh1klERC8HuwgSEb3Cli1bhqioKERFRWHDhg1o164dhg4diu3bt+vUMzMzk6YfPnyI5ORktG7dGsePH5fKbW1tAQA//v
gjtFptodZ/9+5d/PnnnwgMDES1atV05r1o6HMHBwecOnUKI0aMwMOHDxEeHo7+/fvD3t4eM2fOhFiIMZyUSqV0r5ZWq8WDBw+QnZ0NHx8fnW0riuJ8Ds/z/fffQxAETJ06Nc+83M9I7nUSEVHxMcEiInqFNW3aFL6+vvD19cWAAQOwa9cueHh4YPTo0VCr1VK9n3/+Ga+//jpMTU1RsWJFqdtccnKyVKdPnz5o2bIlhg4dCgcHB/Tt2xfffffdc7/wX7t2DQBQv379YsXv5OSEFStWICEhARcvXsTixYthZ2eHKVOm4JtvvilUG2vXrkXDhg1hamqKSpUqwc7ODrt27dLZtqIozufwPFevXoWzszMqVqz40tZJRETFxwSLiIgkCoUC7dq1Q0JCAi5fvgwA2L9/P95++22Ymppi+fLl+OWXXxAVFYX+/fvrXCUyMzPDn3/+id9++w3vv/8+Tp8+jT59+qBDhw7QaDSlGrcgCKhduzY++ugj/Pnnn1AoFNi4ceMLl9uwYQP8/f1Rs2ZNfPPNN9izZw+ioqLw5ptvFjs50cfnoM/PnoiIdDHBIiIiHdnZ2QAgPaj3+++/h6mpKfbu3YvAwEB06tQJvr6++S6rUCjQvn17LFiwAOfOncOXX36J33//XWdUvqfVqFEDAHDmzBnZ4q9RowYqVKiAhIQEqayg7obbtm1DjRo1sH37drz//vvw8/ODr68vMjIydOq9qLvis170ORSlvZo1a+L27dt48OBBidZJREQvBxMsIiKSZGVl4ddff4WJiQnq1asHIOc+JUEQdK6ExMXFYceOHTrL5pcAeHl5AQAyMzPzXZ+dnR3eeOMNrFmzBvHx8TrzXnQP1aFDh5CWlpan/PDhw7h//z7q1KkjlVlYWOTb5S/3YcdPr+vQoUM4ePCgTj1zc3MAwKNHj54bE1C4z8HCwqLQ7b3zzjsQRRHTp0/PMy837uJ89kREVDo4TDsR0Sts9+7duHDhAgDgzp072LRpEy5fvowJEybA2toaANClSxcsWLAAHTt2RP/+/XHnzh0sW7YMtWrVwunTp6W2ZsyYgT///BNdunRB9erVcefOHSxfvhxVq1ZFq1atCoxh8eLFaNWqFby9vTF8+HC4ubkhLi4Ou3btwsmTJwtcbv369di4cSN69uyJxo0bw8TEBOfPn8eaNWtgamqq8wyoxo0bY8uWLQgKCkKTJk1gaWmJbt26oWvXrti+fTt69uyJLl26IDY2FuHh4fDw8JCu4AE5XfA8PDywZcsW1K5dGxUrVkT9+vXzvXesMJ9DzZo1YWtri/DwcFhZWcHCwgLNmjWDm5tbnvbatWuH999/H4sXL8bly5fRsWNHaLVa7N+/H+3atcPo0aOL/dkTEVEp0OcQhkREpB/5DdNuamoqenl5iStWrMgzRPo333wjuru7iyqVSqxbt64YERGRZ3jy6OhosXv37qKzs7NoYmIiOjs7i/369RMvXbok1clvmHZRFMUzZ86IPXv2FG1tbUVTU1OxTp064uTJk5+7DadPnxbHjx8vent7ixUrVhSNjIxEJycnsXfv3uLx48d16qampor9+/cXbW1tRQDSMOharVb86quvxOrVq4sqlUps1KiR+PPPP+cZKl0URfHvv/8WGzduLJqYmOgM2V6cz0EURfHHH38UPTw8RCMjI53PJL91Z2dni3PnzhXr1q0rmpiYiHZ2dmKnTp3EY8eOFWmdRERU+gRRLMQ4tkRERERERPRCvAeLiIiIiIhIJkywiIiIiIiIZMIEi4iIiIiISCZMsIiIiIiIiGTCBIuIiIiIiEgmTLCIiIiIiIhkwgSLiIiIiIhIJkywiIiIiIiIZMIEi4iIiIiISCZMsIiIiIiIiGTCBIuIiIiIiEgmTLCIiIiIiIhkwgSLiIiIiIhIJkywiIiIiIiIZMIEi4iIiIiISCZMsIiIiIiIiGTCBIuIiIiIiEgmTLCIiIiIiIhkwgSLiIiIiIhIJkywiIiIiI
iIZMIEi4iIiIiISCZMsIiIiIiIiGTCBIuIiIiIiEgmTLCIiIiIiIhkwgSLiIiIiIhIJkywiIiIiIiIZMIEi4iIiIi
2023-03-31 11:53:42 +00:00
},
2023-03-31 13:42:48 +00:00
"metadata": {},
"output_type": "display_data"
2023-03-31 11:53:42 +00:00
}
]
}
]
}