# -*- coding: utf-8 -*-
"""Ghada_project1.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/13zEpdpk8jRUaenJN_X6Arya6t02ObCqK

Exploratory analysis of ``Train.csv`` followed by four regressors (linear
regression, random forest, AdaBoost, bagging) that predict
``Item_Outlet_Sales``.
"""

# The notebook ran the following shell commands through IPython "!" magics.
# They are not valid Python syntax in a plain .py file, so they are kept as
# comments; run them in a shell or a notebook cell if they are still needed.
#   ! git
#   ! git init
#   ! git clone https://github.com/Technocolabs100/Product-Sales-Outlet-at-BigMart-Datawarehouse.git
#   ! pwd

# --- Loading Packages and Data ----------------------------------------------

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,mean_absolute_error,accuracy_score,classification_report,confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


def _report_metrics(model, x_test, y_test):
    """Print MSE, MAE, RMSE and ``model.score`` on the test split.

    Args:
        model: A fitted estimator exposing ``predict`` and ``score``.
        x_test: Feature rows held out for evaluation.
        y_test: True target values matching ``x_test``.

    Returns:
        The predictions produced by ``model.predict(x_test)``.
    """
    predicted = model.predict(x_test)
    mse = mean_squared_error(y_test, predicted)
    print("MEAN SQUARED ERROR(MSE)", mse)
    print("MEAN ABSOLUTE ERROR(MAE)", mean_absolute_error(y_test, predicted))
    print("ROOT MEAN SQUARED ERROR(RMSE)", np.sqrt(mse))
    print("SCORE", model.score(x_test, y_test))
    return predicted


df = pd.read_csv("Train.csv")

# --- Data Structure and Content ---------------------------------------------

# Bare expressions only display inside a notebook; print them so the script
# produces the same overview when run directly.
print(df.head())
print(df.tail())
print(df.shape)
print(df.describe())
df.info()
print(df.isnull().sum())

# Fill missing weights with the column mean.
df['Item_Weight'] = df['Item_Weight'].fillna(df['Item_Weight'].mean())

# NOTE(review): 'Medium' is presumably the most frequent outlet size - confirm.
df['Outlet_Size'] = df['Outlet_Size'].fillna('Medium')

# --- Univariate Analysis ----------------------------------------------------
# plt.show() after each plot keeps consecutive plots from being drawn on top
# of each other when this runs as a script instead of notebook cells.

# histplot(kde=True) is the replacement for the deprecated distplot.
sns.histplot(df['Item_Outlet_Sales'], kde=True)
plt.show()

sns.countplot(x='Outlet_Size', data=df)
plt.show()

sns.countplot(x='Outlet_Location_Type', data=df)
plt.show()

sns.countplot(x='Outlet_Type', data=df)
plt.show()

sns.boxplot(x=df['Item_Weight'])
plt.show()

# --- Bivariate Analysis -----------------------------------------------------

sns.regplot(x='Item_Weight', y='Item_Outlet_Sales', data=df)
plt.show()

sns.boxplot(x='Outlet_Identifier', y='Item_Outlet_Sales', data=df)
plt.show()

sns.regplot(x='Item_Visibility', y='Item_Outlet_Sales', data=df)
plt.show()

plt.figure(figsize=(10, 10))
sns.boxplot(x='Outlet_Size', y='Item_Outlet_Sales', data=df)
plt.show()

# Correlation is only defined for numeric columns; selecting them explicitly
# avoids the error newer pandas raises on string columns.
numeric_corr = df.select_dtypes(include="number").corr()
print(numeric_corr)

sns.heatmap(numeric_corr)
plt.show()

sns.regplot(x='Item_MRP', y='Item_Outlet_Sales', data=df)
plt.show()

# catplot(kind="point") is the replacement for the removed factorplot.
sns.catplot(x='Outlet_Type', y='Item_Outlet_Sales', hue='Outlet_Size',
            data=df, kind="point")
plt.show()

sns.pairplot(df)
plt.show()

# --- Feature Engineering ----------------------------------------------------

# Encode categorical variables only. Numeric columns - including the target
# Item_Outlet_Sales - must keep their real values: label-encoding them would
# replace each value with its rank and make the regression metrics below
# meaningless. A fresh encoder is fitted per column.
categorical_cols = df.select_dtypes(exclude="number").columns
for col in categorical_cols:
    df[col] = LabelEncoder().fit_transform(df[col])

# Now one hot encoding
# NOTE(review): df_train is built and inspected here, but the models below
# are trained on the label-encoded `df`, not on df_train.
df_train = pd.get_dummies(
    df,
    columns=[
        'Item_Fat_Content',
        'Item_Type',
        'Outlet_Size',
        'Outlet_Location_Type',
        'Outlet_Type',
    ],
    drop_first=False,
)

print(df_train.shape)
print(df_train.columns)
print(df_train.head())
df_train.info()

x = df.drop(
    ['Item_Identifier', 'Item_Type', 'Outlet_Establishment_Year', 'Item_Outlet_Sales'],
    axis=1,
)
y = df['Item_Outlet_Sales']

print(x.head())
x.info()
print(y.head())

# Fixed seed so every model is compared on the same, repeatable split.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# --- Linear Regression model ------------------------------------------------

lrm = LinearRegression()
lrm.fit(x_train, y_train)
predicted = _report_metrics(lrm, x_test, y_test)

# --- Random Forest ----------------------------------------------------------

rfg = RandomForestRegressor()
rfg.fit(x_train, y_train)
predicted = _report_metrics(rfg, x_test, y_test)

# --- AdaBoost ---------------------------------------------------------------

abr = AdaBoostRegressor(n_estimators=70)
abr.fit(x_train, y_train)
predicted = _report_metrics(abr, x_test, y_test)

# --- Bagging Regressor ------------------------------------------------------

br = BaggingRegressor(n_estimators=30)
br.fit(x_train, y_train)
predicted = _report_metrics(br, x_test, y_test)

# --- Summary ----------------------------------------------------------------
# From all the models that I used, I found that the AdaBoost model gave the
# best score, so AdaBoost was concluded to be the best predictive model.
# NOTE(review): that conclusion comes from the original notebook run; re-check
# it against the scores printed above before relying on it.