-
Notifications
You must be signed in to change notification settings - Fork 16
/
Housing.py
237 lines (173 loc) · 7.21 KB
/
Housing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
#!/usr/local/bin/python
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from scipy.stats import skew
from sklearn import decomposition
class App(object):
    """Train several regressors on a housing data set and write a submission.

    The training CSV must contain a ``SalePrice`` column; both CSVs must
    contain ``LotArea`` and ``LotFrontage``.  Models are fitted on the
    logarithm of ``SalePrice``, so predictions are mapped back with
    ``np.exp`` before they are written out.
    """

    # Regularisation strengths tried by both LassoCV and RidgeCV.
    _ALPHAS = [0.1, 0.001, 0.0001, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

    def __init__(self, file_train, file_test):
        """Load the training and test CSV files, using column 0 as the index."""
        self._train_df = pd.read_csv(file_train, index_col=0)
        self._test_df = pd.read_csv(file_test, index_col=0)

    def _normalizeData(self, Numeric_columns):
        """Log-transform the lot features, standardise, and build the matrices.

        Args:
            Numeric_columns: labels of the columns of ``self._df`` to
                standardise to zero mean and unit standard deviation.

        Sets ``self._xTrain`` and ``self._xTest`` to the rows of ``self._df``
        whose index labels come from the training and test frames.
        """
        # The log transform must run before standardisation and before the
        # matrices are extracted: applied afterwards it never reaches the
        # models and would take the log of negative z-scores (NaN).
        for column in ('LotArea', 'LotFrontage'):
            self._df[column] = np.log(self._df[column])

        columns = list(Numeric_columns)
        numeric = self._df[columns]
        # Replace whole columns rather than assigning through .loc so that
        # integer columns can become float without dtype conflicts.
        self._df[columns] = (numeric - numeric.mean()) / numeric.std()

        # Cast to float: recent pandas versions may emit boolean dummy
        # columns, which would otherwise give an object-dtype matrix.
        self._xTrain = self._df.loc[self._train_df.index].values.astype(float)
        self._xTest = self._df.loc[self._test_df.index].values.astype(float)

    def _removeSkewness(self):
        """Detach ``SalePrice`` from the training frame and log-transform it.

        Shows a histogram of the target before and after the transform and
        stores the transformed target in ``self._yTrain``.
        """
        target = self._train_df['SalePrice']
        plt.hist(target)
        plt.show()
        del self._train_df['SalePrice']
        self._yTrain = np.log(target)
        plt.hist(self._yTrain)
        plt.xlabel('SalePrice')
        plt.show()

    def _dummyCreate(self):
        """One-hot encode ``self._alldf`` and fill gaps with column means.

        The result is stored in ``self._df``.
        """
        self._df = pd.get_dummies(self._alldf)
        self._df = self._df.fillna(self._df.mean())

    def _pcaLassoRegr(self):
        """Fit LassoCV on PCA-projected features.

        Plots the explained variance per component, prints the 5-fold
        cross-validated RMSE, and returns log-scale test predictions.
        """
        pca = decomposition.PCA()
        pca.fit(self._xTrain)
        plt.figure(1, figsize=(4, 3))
        plt.clf()
        plt.axes([.2, .2, .7, .7])
        plt.plot(pca.explained_variance_, linewidth=2)
        plt.axis('tight')
        plt.xlabel('n_components')
        plt.ylabel('explained_variance_')
        plt.show()
        train_pca = pca.transform(self._xTrain)
        test_pca = pca.transform(self._xTest)
        lassoregr = LassoCV(alphas=self._ALPHAS).fit(train_pca, self._yTrain)
        rmse = np.sqrt(-cross_val_score(lassoregr, train_pca, self._yTrain,
                                        scoring="neg_mean_squared_error",
                                        cv=5)).mean()
        print(rmse)
        # The model was fitted in PCA space, so it must predict there too.
        y_lasso = lassoregr.predict(test_pca)
        return y_lasso

    def _lassoRegr(self):
        """Fit LassoCV, print its 5-fold RMSE, return log-scale predictions."""
        lassoregr = LassoCV(alphas=self._ALPHAS).fit(self._xTrain, self._yTrain)
        y_lasso = lassoregr.predict(self._xTest)
        rmse = np.sqrt(-cross_val_score(lassoregr, self._xTrain, self._yTrain,
                                        scoring="neg_mean_squared_error",
                                        cv=5)).mean()
        # A single formatted argument prints the same text under Python 2
        # and Python 3.
        print("Root mean square of Lasso regression %s" % rmse)
        return y_lasso

    def _ridgeRegr(self):
        """Fit RidgeCV, print its 5-fold RMSE, return log-scale predictions."""
        ridgeregr = RidgeCV(alphas=self._ALPHAS).fit(self._xTrain, self._yTrain)
        y_ridge = ridgeregr.predict(self._xTest)
        ridgermse = np.sqrt(-cross_val_score(ridgeregr, self._xTrain,
                                             self._yTrain,
                                             scoring="neg_mean_squared_error",
                                             cv=5)).mean()
        print("Root mean square of rigde: %s" % ridgermse)
        return y_ridge

    def _xboost(self):
        """Fit an XGBRegressor and return log-scale test predictions."""
        regr = xgb.XGBRegressor(colsample_bytree=0.4,
                                gamma=0.045,
                                learning_rate=0.07,
                                max_depth=20,
                                min_child_weight=1.5,
                                n_estimators=300,
                                reg_alpha=0.65,
                                reg_lambda=0.45,
                                subsample=0.95)
        regr.fit(self._xTrain, self._yTrain)
        y_pred_xgb = regr.predict(self._xTest)
        return y_pred_xgb

    def _randomForest(self):
        """Fit a 10-tree random forest and return log-scale test predictions."""
        rf = RandomForestRegressor(10, max_features='sqrt')
        rf.fit(self._xTrain, self._yTrain)
        y_rf = rf.predict(self._xTest)
        return y_rf

    def _xboostLasso(self):
        """Average the xgboost and lasso predictions; return price-scale values."""
        y_pred_xgb = self._xboost()
        y_lasso = self._lassoRegr()
        # Average in log space, then map back to prices.
        y_final = (y_pred_xgb + y_lasso) / 2
        y_final = np.exp(y_final)
        return y_final

    def _xboostLassoRidge(self):
        """Average xgboost, lasso and ridge predictions; return price-scale values."""
        y_pred_xgb = self._xboost()
        y_lasso = self._lassoRegr()
        y_ridge = self._ridgeRegr()
        # Average in log space, then map back to prices.
        y_final = (y_pred_xgb + y_lasso + y_ridge) / 3
        y_final = np.exp(y_final)
        return y_final

    def _submission(self, y_final, path='~/Desktop/submisison.csv'):
        """Write the predictions to a CSV file with ``Id`` and ``SalePrice``.

        Args:
            y_final: one predicted price per row of the test frame.
            path: destination file; a leading ``~`` is expanded to the
                user's home directory.
        """
        import os

        submission_df = pd.DataFrame(
            data={'Id': self._test_df.index, 'SalePrice': y_final},
            columns=['Id', 'SalePrice'])
        submission_df.to_csv(os.path.expanduser(path), index=False)

    def _predictors(self):
        """Return ``(name, predictor, returns_log_scale)`` in run order."""
        return [
            ('pcaLasso', self._pcaLassoRegr, True),
            ('lasso', self._lassoRegr, True),
            ('ridge', self._ridgeRegr, True),
            ('xboost', self._xboost, True),
            ('randomForest', self._randomForest, True),
            ('xboostLasso', self._xboostLasso, False),
            ('xboostLassoRidge', self._xboostLassoRidge, False),
        ]

    def main(self, models=None):
        """Preprocess the data, run the chosen models and write submissions.

        Args:
            models: a model name or an iterable of model names taken from
                ``pcaLasso``, ``lasso``, ``ridge``, ``xboost``,
                ``randomForest``, ``xboostLasso`` and ``xboostLassoRidge``.
                ``None`` runs all of them in that order.

        Raises:
            ValueError: if an unknown model name is requested.

        Every model overwrites the same submission file, so the file that
        remains holds the predictions of the last model that ran.
        """
        registry = self._predictors()
        lookup = {name: (predict, is_log) for name, predict, is_log in registry}
        if models is None:
            selected = [name for name, _, _ in registry]
        elif isinstance(models, str):
            selected = [models]
        else:
            selected = list(models)
        # Validate up front so a typo fails before any fitting or plotting.
        unknown = [name for name in selected if name not in lookup]
        if unknown:
            raise ValueError('unknown model(s): %s' % ', '.join(map(str, unknown)))

        # Detach and log-transform the target.
        self._removeSkewness()
        # Stack train and test so both receive identical encoding and scaling.
        self._alldf = pd.concat((self._train_df, self._test_df), axis=0)
        self._dummyCreate()
        numeric_columns = self._alldf.select_dtypes(include=[np.number]).columns
        self._normalizeData(numeric_columns)

        for name in selected:
            predict, is_log = lookup[name]
            y_final = predict()
            if is_log:
                # Single-model predictors work on log(SalePrice); the
                # submission needs actual prices.
                y_final = np.exp(y_final)
            self._submission(y_final)
if __name__ == '__main__':
    # Expect the training CSV path followed by the test CSV path; exit with
    # a usage message instead of an IndexError when either is missing.
    if len(sys.argv) < 3:
        sys.exit('usage: %s TRAIN_CSV TEST_CSV' % sys.argv[0])
    app = App(sys.argv[1], sys.argv[2])
    sys.exit(app.main())