Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create ghada_project1.py #28

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
183 changes: 183 additions & 0 deletions ghada_project1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
# -*- coding: utf-8 -*-
"""Ghada_project1.ipynb

Automatically generated by Colaboratory.

Original file is located at
https://colab.research.google.com/drive/13zEpdpk8jRUaenJN_X6Arya6t02ObCqK
"""

# NOTE: the lines below were IPython/Colab shell magics ("! cmd"). Bare
# "!" commands are a SyntaxError in a plain .py module, so they are kept
# here as comments. Run them in a terminal (or a notebook cell) if the
# dataset repository needs to be cloned:
#   git init
#   git clone https://github.com/Technocolabs100/Product-Sales-Outlet-at-BigMart-Datawarehouse.git
#   pwd



"""# Loading Packages and Data"""

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn

df = pd.read_csv("Train.csv")

"""# Data Structure and Content"""

df.head()

df.tail()

df.shape

df.describe()

df.info()

df.isnull().sum()

df['Item_Weight']=df['Item_Weight'].fillna(df['Item_Weight'].mean())

df['Outlet_Size']=df['Outlet_Size'].fillna('Medium')

"""#UNiVariate Analysis """

sns.distplot(df['Item_Outlet_Sales'])

sns.distplot(df['Item_Outlet_Sales'])

sns.countplot(x='Outlet_Size',data=df)

sns.countplot(df['Outlet_Location_Type'])

sns.countplot(df['Outlet_Type'])

sns.boxplot(df['Item_Weight'])

"""#BIvariate Analysis """

sns.regplot(x='Item_Weight',y='Item_Outlet_Sales',data=df)

sns.boxplot(x='Outlet_Identifier',y='Item_Outlet_Sales',data=df)

sns.regplot(x='Item_Visibility',y='Item_Outlet_Sales',data=df)

plt.figure(figsize=(10,10))
sns.boxplot(x='Outlet_Size',y='Item_Outlet_Sales',data=df)

df.corr()

sns.heatmap(df.corr())

sns.regplot(x='Item_MRP',y='Item_Outlet_Sales',data=df)

sns.factorplot(x='Outlet_Type',y='Item_Outlet_Sales',hue='Outlet_Size',data=df)

sns.pairplot(df)

"""#Feature Engineering """

#Encoding Categorical Variables
from sklearn.preprocessing import LabelEncoder
labelencoder=LabelEncoder()
for col in df.columns:
df[col] = labelencoder.fit_transform(df[col])

#Now one hot encoding
df_train=pd.get_dummies(df, columns=['Item_Fat_Content',
'Item_Type',
'Outlet_Size',
'Outlet_Location_Type',
'Outlet_Type'],drop_first=False)

print(df_train.shape)

df_train.columns

df_train.head()

df_train.info()

# Build the feature matrix and target from the label-encoded frame,
# dropping the identifier, a dropped categorical, the year, and the target.
# NOTE(review): the one-hot df_train created above is never used for
# modelling — confirm whether x should come from df_train instead.
x=df.drop(['Item_Identifier','Item_Type','Outlet_Establishment_Year','Item_Outlet_Sales'],axis=1)
y=df['Item_Outlet_Sales']

x.head()

x.info()

y.head()

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,mean_absolute_error,accuracy_score,classification_report,confusion_matrix

# Fix the seed so the 80/20 split — and therefore the model comparison
# in the summary — is reproducible across runs.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

"""#Linear Regression model """

from sklearn.linear_model import LinearRegression

lrm=LinearRegression()

lrm.fit(x_train,y_train)

predicted=lrm.predict(x_test)

print("MEAN SQUARED ERROR(MSE)",mean_squared_error(y_test,predicted))
print("MEAN ABSOLUTE ERROR(MAE)",mean_absolute_error(y_test,predicted))
print("ROOT MEAN SQUARED ERROR(RMSE)",np.sqrt(mean_squared_error(y_test,predicted)))
print("SCORE",lrm.score(x_test,y_test))

"""#Random Forest"""

from sklearn.ensemble import RandomForestRegressor

rfg=RandomForestRegressor()

rfg.fit(x_train,y_train)

predicted=rfg.predict(x_test)

print("MEAN SQUARED ERROR(MSE)",mean_squared_error(y_test,predicted))
print("MEAN ABSOLUTE ERROR(MAE)",mean_absolute_error(y_test,predicted))
print("ROOT MEAN SQUARED ERROR(RMSE)",np.sqrt(mean_squared_error(y_test,predicted)))
print("SCORE",rfg.score(x_test,y_test))

"""#ADAboost"""

from sklearn.ensemble import AdaBoostRegressor

abr=AdaBoostRegressor(n_estimators=70)

abr.fit(x_train,y_train)

predicted=abr.predict(x_test)

print("MEAN SQUARED ERROR(MSE)",mean_squared_error(y_test,predicted))
print("MEAN ABSOLUTE ERROR(MAE)",mean_absolute_error(y_test,predicted))
print("ROOT MEAN SQUARED ERROR(RMSE)",np.sqrt(mean_squared_error(y_test,predicted)))
print("SCORE",abr.score(x_test,y_test))

"""#BAgging Regressor"""

from sklearn.ensemble import BaggingRegressor

br=BaggingRegressor(n_estimators=30)

br.fit(x_train,y_train)

predicted=br.predict(x_test)

print("MEAN SQUARED ERROR(MSE)",mean_squared_error(y_test,predicted))
print("MEAN ABSOLUTE ERROR(MAE)",mean_absolute_error(y_test,predicted))
print("ROOT MEAN SQUARED ERROR(RMSE)",np.sqrt(mean_squared_error(y_test,predicted)))
print("SCORE",br.score(x_test,y_test))

"""#Summary

From all the models that I used, I found out that Adaboost model gave us the best score. We could conclude Adaboost as the best predictive model.
"""