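'''wrangle.py

Helper functions for preparing a DataFrame for EDA and regression modeling:
encoding object columns, train/validate/test splitting, scaling, and
feature selection (SelectKBest and RFE).
'''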
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest, f_regression, RFE
def wrangle_data(df, target_name, modeling=False):
    '''
    Signature: wrangle_data(df, target_name, modeling=False)
    Docstring:
    This function accepts any dataframe and splits it into train, validate,
    and test sets for EDA or modeling.
    Parameters
    ----------
    df : pandas.core.frame.DataFrame
    target_name : str
        Column name of the target variable.
    modeling : bool, False by default
        If True, the numeric columns of the X sets are scaled for use in
        machine learning models; if False, the X sets are returned unscaled.
    Returns
    -------
    X_train, y_train, X_validate, y_validate, X_test, y_test
    '''
    # Create dummy variables for object dtypes;
    # the original object dtype columns are dropped
    df = add_encoded_columns(df, drop_encoders=True)
    # Split the cleaned and encoded dataframe into train, validate, and test sets
    train, validate, test = train_validate_test(df)
    # Split each set into attributes (X) and target (y)
    X_train, y_train = attributes_target_split(train, target_name)
    X_validate, y_validate = attributes_target_split(validate, target_name)
    X_test, y_test = attributes_target_split(test, target_name)
    # Scale the X sets (not the full sets, which still contain the target)
    if modeling:
        X_train, X_validate, X_test = add_scaled_columns(X_train, X_validate, X_test)
    return X_train, y_train, X_validate, y_validate, X_test, y_test
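# Example usage (a minimal sketch; `df` and the 'tax_value' target are
# hypothetical stand-ins for whatever dataframe this module is handed):
#
#     X_train, y_train, X_validate, y_validate, X_test, y_test = \
#         wrangle_data(df, target_name='tax_value', modeling=True)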
def add_encoded_columns(df, drop_encoders=True):
    '''
    Signature: add_encoded_columns(df, drop_encoders=True)
    Docstring:
    This function accepts a DataFrame, creates encoded (dummy) columns for
    object dtype columns, and returns the DataFrame with or without the
    original object dtype columns.
    Parameters
    ----------
    df : pandas.core.frame.DataFrame
    drop_encoders : bool, True by default
        If True, the original object dtype columns are dropped and only the
        encoded DataFrame is returned; if False, the DataFrame and the
        encoded columns are returned separately.
    Returns
    -------
    df or (df, encoded_columns)
    '''
    columns_to_encode = df.select_dtypes('O').columns.to_list()
    # Nothing to encode: return the dataframe unchanged
    if not columns_to_encode:
        return df
    encoded_columns = pd.get_dummies(df[columns_to_encode], drop_first=True, dummy_na=False)
    df = pd.concat([df, encoded_columns], axis=1)
    if drop_encoders:
        df = df.drop(columns=columns_to_encode)
        return df
    else:
        return df, encoded_columns
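# Example (hypothetical column): a 'county' column with values 'LA',
# 'Orange', and 'Ventura' becomes dummy columns 'county_Orange' and
# 'county_Ventura'; drop_first=True drops the first level ('LA').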
def train_validate_test(df):
    '''
    Signature: train_validate_test(df)
    Docstring:
    Splits a DataFrame into train, validate, and test sets
    (60% / 20% / 20% of the rows, respectively).
    Parameters
    ----------
    df : pandas.core.frame.DataFrame
    Returns
    -------
    train, validate, test
    '''
    # 80/20 split, then 75/25 of the remainder: .80 * .75 = .60 train
    train_validate, test = train_test_split(df, test_size=.20, random_state=123)
    train, validate = train_test_split(train_validate, test_size=.25, random_state=123)
    return train, validate, test
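# Example: a 1,000-row dataframe yields 600 train / 200 validate / 200 test
# rows; random_state=123 makes the split reproducible across calls.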
def attributes_target_split(data_set, target_name):
    '''
    Signature: attributes_target_split(data_set, target_name)
    Docstring:
    Splits a DataFrame into attributes (X) and target (y).
    Parameters
    ----------
    data_set : pandas.core.frame.DataFrame
    target_name : str
    Returns
    -------
    x, y
    '''
    x = data_set.drop(columns=target_name)
    y = data_set[target_name]
    return x, y
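# Example (sketch; 'tax_value' is a hypothetical target column):
#
#     X_train, y_train = attributes_target_split(train, 'tax_value')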
def add_scaled_columns(train, validate, test, scaler=RobustScaler()):
    '''
    Signature: add_scaled_columns(train, validate, test, scaler)
    Docstring:
    Scales the numeric (non-dummy) columns of the train, validate, and test
    sets. The scaler is fit on train only, then applied to all three sets.
    Parameters
    ----------
    train, validate, test : pandas.core.frame.DataFrame
    scaler : an sklearn scaler, RobustScaler() by default
    Returns
    -------
    train, validate, test
    '''
    # Dummy columns are uint8 (bool in pandas >= 2.0), so excluding those
    # dtypes leaves only the original numeric columns to scale
    columns_to_scale = train.select_dtypes(exclude=['uint8', 'bool']).columns.to_list()
    new_column_names = [c + '_scaled' for c in columns_to_scale]
    # Fit on train only so validate/test information never leaks in
    scaler.fit(train[columns_to_scale])
    # Scale the columns in the train, validate, and test sets
    train_scaled = scaler.transform(train[columns_to_scale])
    validate_scaled = scaler.transform(validate[columns_to_scale])
    test_scaled = scaler.transform(test[columns_to_scale])
    # Drop the columns that are now scaled (reassigning rather than using
    # inplace=True, so the callers' dataframes are left untouched)
    train = train.drop(columns=columns_to_scale)
    validate = validate.drop(columns=columns_to_scale)
    test = test.drop(columns=columns_to_scale)
    # Concatenate the scaled columns back onto the train/validate/test sets
    train = pd.concat([train,
                       pd.DataFrame(train_scaled,
                                    columns=new_column_names,
                                    index=train.index)],
                      axis=1)
    validate = pd.concat([validate,
                          pd.DataFrame(validate_scaled,
                                       columns=new_column_names,
                                       index=validate.index)],
                         axis=1)
    test = pd.concat([test,
                      pd.DataFrame(test_scaled,
                                   columns=new_column_names,
                                   index=test.index)],
                     axis=1)
    return train, validate, test
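# Note: with modeling=True, wrangle_data calls this on the X sets, so a
# numeric column such as 'sqft' (hypothetical) is replaced by 'sqft_scaled'
# while dummy columns pass through unscaled.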
def features_for_modeling(predictors, target, k_features):
    '''
    Signature: features_for_modeling(predictors, target, k_features)
    Docstring:
    Runs SelectKBest and RFE and returns the feature names selected by each
    method side by side for comparison.
    Parameters
    ----------
    predictors : pandas.core.frame.DataFrame
    target : pandas.core.series.Series
    k_features : int
    Returns
    -------
    df_features : pandas.core.frame.DataFrame
    '''
    # select_kbest and select_rfe each return a list of feature names
    df_best = pd.DataFrame({'select_kbest': select_kbest(predictors, target, k_features)})
    df_rfe = pd.DataFrame({'select_rfe': select_rfe(predictors, target, k_features)})
    df_features = pd.concat([df_best, df_rfe], axis=1)
    return df_features
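# Example usage (sketch, assuming X_train/y_train from wrangle_data):
#
#     df_features = features_for_modeling(X_train, y_train, k_features=3)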
def select_kbest(predictors, target, k_features=3):
    '''
    Signature: select_kbest(predictors, target, k_features=3)
    Docstring:
    Selects the k best features using the f_regression score.
    Parameters
    ----------
    predictors : pandas.core.frame.DataFrame
    target : pandas.core.series.Series
    k_features : int, 3 by default
    Returns
    -------
    f_features : list of selected column names
    '''
    f_selector = SelectKBest(f_regression, k=k_features)
    f_selector.fit(predictors, target)
    # Boolean mask over the predictor columns
    f_mask = f_selector.get_support()
    f_features = predictors.iloc[:, f_mask].columns.to_list()
    print(f"Select K Best: {len(f_features)} features")
    print(f_features)
    return f_features
def select_rfe(X, y, k_features=3):
    '''
    Signature: select_rfe(X, y, k_features=3)
    Docstring:
    Selects k features by recursive feature elimination with a
    LinearRegression estimator.
    Parameters
    ----------
    X : pandas.core.frame.DataFrame
    y : pandas.core.series.Series
    k_features : int, 3 by default
    Returns
    -------
    rfe_features : list of selected column names
    '''
    lm = LinearRegression()
    # n_features_to_select is keyword-only in recent sklearn versions
    rfe_init = RFE(lm, n_features_to_select=k_features)
    rfe_init.fit(X, y)
    rfe_mask = rfe_init.support_
    rfe_features = X.iloc[:, rfe_mask].columns.to_list()
    print(f"Recursive Feature Elimination: {len(rfe_features)} features")
    print(rfe_features)
    return rfe_features
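# Minimal smoke test (a sketch on synthetic data; the column names below
# are made up for illustration and are not part of the original module):
if __name__ == '__main__':
    rng = np.random.default_rng(123)
    demo = pd.DataFrame({
        'a': rng.normal(size=100),
        'b': rng.normal(size=100),
        'cat': rng.choice(['x', 'y', 'z'], size=100),
    })
    demo['target'] = demo.a * 2 + demo.b + rng.normal(size=100)
    X_train, y_train, X_validate, y_validate, X_test, y_test = \
        wrangle_data(demo, target_name='target', modeling=True)
    print(X_train.head())
    print(features_for_modeling(X_train, y_train, k_features=2))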