diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4940046 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +venv + diff --git a/regression/simple_linear_regression/.gitignore b/regression/simple_linear_regression/.gitignore new file mode 100644 index 0000000..37eca35 --- /dev/null +++ b/regression/simple_linear_regression/.gitignore @@ -0,0 +1,2 @@ +*.csv +.ipynb_checkpoints diff --git a/regression/simple_linear_regression/dataset/archive.zip b/regression/simple_linear_regression/dataset/archive.zip new file mode 100644 index 0000000..be99965 Binary files /dev/null and b/regression/simple_linear_regression/dataset/archive.zip differ diff --git a/regression/simple_linear_regression/simple_regression.ipynb b/regression/simple_linear_regression/simple_regression.ipynb new file mode 100644 index 0000000..1200e79 --- /dev/null +++ b/regression/simple_linear_regression/simple_regression.ipynb @@ -0,0 +1,472 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "29b5381a-72b4-4f81-9208-2075f7acad85", + "metadata": {}, + "source": [ + "# Simple Regression" + ] + }, + { + "cell_type": "markdown", + "id": "cac03d32", + "metadata": {}, + "source": [ + "Configure the project. Indeed you create a dataset in csv format." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "6f480cda-8380-4355-998a-5c59d6203b05", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Archive: ./dataset/archive.zip\n", + " inflating: score.csv \n", + " inflating: score_updated.csv \n" + ] + } + ], + "source": [ + "! rm -rf *.csv\n", + "! unzip ./dataset/archive.zip\n", + "! 
head -n 1 *.csv | head -n 2 | tail -n 1 > data.csv && for file in *.csv; do (tail -n +2 \"$file\"; echo) >> data.csv; done && sed -i '/^$/d' data.csv" + ] + }, + { + "cell_type": "markdown", + "id": "52ec2f48", + "metadata": {}, + "source": [ + "Import needed libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "dd17f780", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn import linear_model\n", + "from sklearn.metrics import r2_score\n", + "\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "id": "57b33a77", + "metadata": {}, + "source": [ + "Read data from data.csv using pandas and store in data frame structure. Also shuffle data to have uniform distribution. " + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "a102a751", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
HoursScores
07.469
13.835
23.530
31.619
45.147
\n", + "
" + ], + "text/plain": [ + " Hours Scores\n", + "0 7.4 69\n", + "1 3.8 35\n", + "2 3.5 30\n", + "3 1.6 19\n", + "4 5.1 47" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv(\"data.csv\")\n", + "df.head()\n", + "df = df.sample(frac=1.0, random_state=42).reset_index(drop=True)\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "93002df5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
HoursScores
count121.000000121.000000
mean5.21487653.495868
std2.49918924.988705
min1.00000012.000000
25%3.00000030.000000
50%5.10000054.000000
75%7.40000075.000000
max9.80000099.000000
\n", + "
" + ], + "text/plain": [ + " Hours Scores\n", + "count 121.000000 121.000000\n", + "mean 5.214876 53.495868\n", + "std 2.499189 24.988705\n", + "min 1.000000 12.000000\n", + "25% 3.000000 30.000000\n", + "50% 5.100000 54.000000\n", + "75% 7.400000 75.000000\n", + "max 9.800000 99.000000" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# summarize data\n", + "df.describe() " + ] + }, + { + "cell_type": "markdown", + "id": "7ebceb4d", + "metadata": {}, + "source": [ + "Print the histogram chart of data" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "39faae37", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "viz = df[[\"Hours\", \"Scores\"]]\n", + "viz.hist()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "55722cea", + "metadata": {}, + "source": [ + "Print scatter chart of data to recognize the patterns of data. Based on the below chart we must answer to this question \"Is Linear Simple Regression good or not?\"" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "d647bbf0", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.scatter(df.Hours, df.Scores, color=\"blue\")\n", + "plt.ylabel(\"Scores\")\n", + "plt.xlabel(\"Hours of studying\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "0068030a", + "metadata": {}, + "outputs": [], + "source": [ + "# print(df)\n", + "train, test = train_test_split(df, test_size=0.20, random_state=42)\n", + "# test, evaluate = train_test_split(test, test_size=0.5, random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "bf8122b1", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig = plt.figure()\n", + "ax1 = fig.add_subplot()\n", + "ax1.scatter(train.Hours, train.Scores, color=\"blue\")\n", + "ax1.scatter(test.Hours, test.Scores, color=\"red\")\n", + "plt.ylabel(\"Scores\")\n", + "plt.xlabel(\"Hours of studying\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "70a01055", + "metadata": {}, + "source": [ + "Find the best fitted line based on distribution of data. " + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "92f95186", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Coefficients:\t [[9.86089296]]\n", + "Intercept:\t [1.93973905]\n" + ] + } + ], + "source": [ + "reg = linear_model.LinearRegression()\n", + "train_x = np.asanyarray(train[['Hours']])\n", + "train_y = np.asanyarray(train[['Scores']])\n", + "reg.fit(train_x, train_y)\n", + "\n", + "print(\"Coefficients:\\t\", reg.coef_)\n", + "print(\"Intercept:\\t\", reg.intercept_)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "a3bb7499", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "plt.scatter(train.Hours, train.Scores, color=\"blue\")\n",
+    "plt.plot(train_x, reg.coef_[0][0]*train_x + reg.intercept_[0], \"-r\")\n",
+    "# y = theta1 x + theta0\n",
+    "plt.ylabel(\"Scores\")\n",
+    "plt.xlabel(\"Hours of studying\")\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "42079719",
+   "metadata": {},
+   "source": [
+    "Testing model based on Test data. Measure the R2 and MSE."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "e109f595",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Mean absolute error: 3.17\n",
+      "Residual sum of squares (MSE): 0.41\n",
+      "R2-score: 0.97\n"
+     ]
+    }
+   ],
+   "source": [
+    "test_x = np.asanyarray(test[['Hours']])\n",
+    "test_y = np.asanyarray(test[['Scores']])\n",
+    "# Predict scores for the test split with the fitted model.\n",
+    "test_y_ = reg.predict(test_x)\n",
+    "# NOTE: the stored cell output predates the MSE / R2 fix below; re-run this cell to refresh it.\n",
+    "print(\"Mean absolute error: %.2f\" % np.mean(np.absolute(test_y_ - test_y)))\n",
+    "print(\"Mean squared error (MSE): %.2f\" % np.mean((test_y_ - test_y) ** 2))  # square each residual, then average\n",
+    "print(\"R2-score: %.2f\" % r2_score(test_y, test_y_))  # r2_score expects (y_true, y_pred)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/regression/simple_linear_regression/simple_regression.md b/regression/simple_linear_regression/simple_regression.md
new file mode 100644
index 0000000..030c201
--- /dev/null
+++ b/regression/simple_linear_regression/simple_regression.md
@@ -0,0 +1,289 @@
+# Simple Regression
+
+Configure the project. Indeed you create a dataset in csv format.
+
+
+```python
+! rm -rf *.csv
+! 
unzip ./dataset/archive.zip +! head -n 1 *.csv | head -n 2 | tail -n 1 > data.csv && for file in *.csv; do (tail -n +2 "$file"; echo) >> data.csv; done && sed -i '/^$/d' data.csv +``` + + Archive: ./dataset/archive.zip + inflating: score.csv + inflating: score_updated.csv + + +Import needed libraries + + +```python +import matplotlib.pyplot as plt +import pandas as pd +import numpy as np +import pylab as pl +from sklearn.model_selection import train_test_split +from sklearn import linear_model +from sklearn.metrics import r2_score + +%matplotlib inline +``` + +Read data from data.csv using pandas and store in data frame structure. Also shuffle data to have uniform distribution. + + +```python +df = pd.read_csv("data.csv") +df.head() +df = df.sample(frac=1.0, random_state=42).reset_index(drop=True) +df.head() +``` + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
HoursScores
07.469
13.835
23.530
31.619
45.147
+
+ + + + +```python +# summarize data +df.describe() +``` + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
HoursScores
count121.000000121.000000
mean5.21487653.495868
std2.49918924.988705
min1.00000012.000000
25%3.00000030.000000
50%5.10000054.000000
75%7.40000075.000000
max9.80000099.000000
+
+ + + +Print the histogram chart of data + + +```python +viz = df[["Hours", "Scores"]] +viz.hist() +plt.show() +``` + + + +![png](simple_regression_files/simple_regression_9_0.png) + + + +Print scatter chart of data to recognize the patterns of data. Based on the below chart we must answer to this question "Is Linear Simple Regression good or not?" + + +```python +plt.scatter(df.Hours, df.Scores, color="blue") +plt.ylabel("Scores") +plt.xlabel("Hours of studying") +plt.show() +``` + + + +![png](simple_regression_files/simple_regression_11_0.png) + + + + +```python +# print(df) +train, temp = train_test_split(df, test_size=0.25, random_state=42) +test, evaluate = train_test_split(temp, test_size=0.5, random_state=42) +``` + + +```python +fig = plt.figure() +ax1 = fig.add_subplot() +ax1.scatter(train.Hours, train.Scores, color="blue") +ax1.scatter(test.Hours, test.Scores, color="red") +ax1.scatter(evaluate.Hours, evaluate.Scores, color="green") +plt.ylabel("Scores") +plt.xlabel("Hours of studying") +plt.show() +``` + + + +![png](simple_regression_files/simple_regression_13_0.png) + + + +Find the best fitted line based on distribution of data. + + +```python +reg = linear_model.LinearRegression() +train_x = np.asanyarray(train[['Hours']]) +train_y = np.asanyarray(train[['Scores']]) +reg.fit(train_x, train_y) + +print("Coefficients:\t", reg.coef_) +print("Intercept:\t", reg.intercept_) +``` + + Coefficients: [[9.86801899]] + Intercept: [1.90944816] + + + +```python +plt.scatter(train.Hours, train.Scores, color="blue") +plt.plot(train_x, reg.coef_[0][0]*train_x + reg.intercept_[0], "-r") +# y = theta1 x + theta0 +plt.ylabel("Scores") +plt.xlabel("Hours of studying") +``` + + + + + Text(0.5, 0, 'Hours of studying') + + + + + +![png](simple_regression_files/simple_regression_16_1.png) + + + +Testing model based on Test data. Measure the R2 and MSE. 
+
+
+```python
+test_x = np.asanyarray(test[['Hours']])
+test_y = np.asanyarray(test[['Scores']])
+# Predict scores for the test split with the fitted model.
+test_y_ = reg.predict(test_x)
+# NOTE: the printed output below predates the MSE / R2 fix; re-run the notebook to refresh it.
+print("Mean absolute error: %.2f" % np.mean(np.absolute(test_y_ - test_y)))
+print("Mean squared error (MSE): %.2f" % np.mean((test_y_ - test_y) ** 2))  # square each residual, then average
+print("R2-score: %.2f" % r2_score(test_y, test_y_))  # r2_score expects (y_true, y_pred)
+```
+
+    Mean absolute error: 2.95
+    Residual sum of squares (MSE): 1.62
+    R2-score: 0.97
+
diff --git a/regression/simple_linear_regression/simple_regression_files/simple_regression_11_0.png b/regression/simple_linear_regression/simple_regression_files/simple_regression_11_0.png
new file mode 100644
index 0000000..1c0b5cf
Binary files /dev/null and b/regression/simple_linear_regression/simple_regression_files/simple_regression_11_0.png differ
diff --git a/regression/simple_linear_regression/simple_regression_files/simple_regression_13_0.png b/regression/simple_linear_regression/simple_regression_files/simple_regression_13_0.png
new file mode 100644
index 0000000..80ff701
Binary files /dev/null and b/regression/simple_linear_regression/simple_regression_files/simple_regression_13_0.png differ
diff --git a/regression/simple_linear_regression/simple_regression_files/simple_regression_16_1.png b/regression/simple_linear_regression/simple_regression_files/simple_regression_16_1.png
new file mode 100644
index 0000000..776262d
Binary files /dev/null and b/regression/simple_linear_regression/simple_regression_files/simple_regression_16_1.png differ
diff --git a/regression/simple_linear_regression/simple_regression_files/simple_regression_9_0.png b/regression/simple_linear_regression/simple_regression_files/simple_regression_9_0.png
new file mode 100644
index 0000000..26d9c4b
Binary files /dev/null and b/regression/simple_linear_regression/simple_regression_files/simple_regression_9_0.png differ