-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added MostFrequentImputer and CategoryEncoder (0.3.0)
- Loading branch information
1 parent
270a482
commit 46f0606
Showing
7 changed files
with
175 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
from sklearn.base import BaseEstimator, TransformerMixin | ||
from sklearn.preprocessing import LabelEncoder | ||
from sklearn.preprocessing import OneHotEncoder | ||
|
||
import numpy as np | ||
import pandas as pd | ||
|
||
class CategoryEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode every column of a table of categorical values.

    For each column a ``LabelEncoder`` turns the values into integer codes
    and a ``OneHotEncoder`` expands those codes into indicator columns. The
    per-column results are concatenated side by side, in the order the
    columns are iterated.

    Parameters
    ----------
    labels : dict, optional
        Maps a column name to the complete collection of category values
        for that column. When a column has an entry here, the encoders are
        fitted on that collection instead of on the observed data, so
        categories that never occur in the training data still get an
        output column. Columns without an entry are fitted on the data.
    """

    def __init__(self, labels=None):
        # ``None`` replaces the former ``{}`` default: a mutable default is
        # a single dict object shared by every instance created without an
        # explicit argument. The parameter is stored untouched and
        # resolved in ``fit``.
        self.label_encoders = {}
        self.one_hot_encoders = {}
        self.labels = labels

    @staticmethod
    def _make_one_hot_encoder():
        """Return a dense-output ``OneHotEncoder`` on old and new scikit-learn."""
        try:
            # scikit-learn >= 1.2 spells the option ``sparse_output``.
            return OneHotEncoder(sparse_output=False)
        except TypeError:
            # Older releases only know the original ``sparse`` keyword.
            return OneHotEncoder(sparse=False)

    def fit(self, X, y=None):
        """Fit one label encoder and one one-hot encoder per column of ``X``.

        Parameters
        ----------
        X : table-like
            Iterating ``X`` must yield column names and ``X[c]`` must yield
            the values of column ``c`` (e.g. a ``pandas.DataFrame``).
        y : ignored
            Present for compatibility with the scikit-learn ``fit`` API.

        Returns
        -------
        self
        """
        labels = {} if self.labels is None else self.labels

        # Start from a clean slate so that re-fitting on a different set of
        # columns does not keep encoders from an earlier fit around.
        self.label_encoders = {}
        self.one_hot_encoders = {}

        for c in X:
            label_encoder = LabelEncoder()
            one_hot_encoder = self._make_one_hot_encoder()

            # Prefer the caller-supplied category list over the observed data.
            known_values = labels[c] if c in labels else X[c]
            codes = label_encoder.fit_transform(known_values)
            one_hot_encoder.fit(codes.reshape(-1, 1))

            self.label_encoders[c] = label_encoder
            self.one_hot_encoders[c] = one_hot_encoder

        return self

    def transform(self, X):
        """Return the one-hot encoding of every column of ``X``.

        Every column of ``X`` must have been present when ``fit`` was
        called; an unknown column name raises ``KeyError``.

        Returns
        -------
        numpy.ndarray
            The per-column one-hot blocks concatenated along axis 1.
        """
        one_hots = []
        for c in X:
            codes = self.label_encoders[c].transform(X[c])
            one_hots.append(
                self.one_hot_encoders[c].transform(codes.reshape(-1, 1))
            )

        return np.concatenate(one_hots, axis=1)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
from sklearn.base import BaseEstimator, TransformerMixin | ||
import pandas as pd | ||
|
||
# A class to impute missing values given a collection of seen values | ||
# From: https://github.com/ageron/handson-ml/blob/master/02_end_to_end_machine_learning_project.ipynb | ||
# Inspired by stackoverflow.com/questions/25239958 | ||
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    """Fill missing entries of each column with a value learned in ``fit``."""

    def fit(self, X, y=None):
        """Remember, per column, the first entry of ``value_counts()``.

        The values are stored in ``most_frequent_`` as a ``pandas.Series``
        indexed by the columns of ``X``. ``y`` is accepted but not used.
        """
        top_values = []
        for column in X:
            counts = X[column].value_counts()
            # The first index label is taken as the most frequent value.
            top_values.append(counts.index[0])

        self.most_frequent_ = pd.Series(top_values, index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return ``X`` with missing entries filled from ``most_frequent_``."""
        fill_values = self.most_frequent_
        return X.fillna(fill_values)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,3 @@ | ||
from .DataFrameSelector import DataFrameSelector | ||
from .DataFrameSelector import DataFrameSelector | ||
from .MostFrequentImputer import MostFrequentImputer | ||
from .CategoryEncoder import CategoryEncoder |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
from unittest import TestCase | ||
import numpy as np | ||
|
||
class NumPyTestCase(TestCase):
    """``TestCase`` base class with helpers for comparing NumPy arrays."""

    def assertArrayEqual(self, expected: np.ndarray, actual: np.ndarray):
        """Fail the test unless ``np.array_equal(expected, actual)`` holds.

        On failure the message contains both arrays so the mismatch can be
        diagnosed from the test report alone.
        """
        if not np.array_equal(expected, actual):
            self.fail(
                "The arrays do not match\n"
                f"expected:\n{expected!r}\n"
                f"actual:\n{actual!r}"
            )

    def array(self, array):
        """Return ``array`` converted to a ``float32`` NumPy array."""
        return np.array(array, dtype=np.float32)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
from .NumPyTestCase import NumPyTestCase | ||
import pandas as pd | ||
import numpy as np | ||
|
||
from m16_mlutils.pipeline import CategoryEncoder | ||
|
||
class test_CategoryEncoder(NumPyTestCase):
    """Checks the array produced by ``CategoryEncoder.fit_transform``."""

    def test_two_categories(self):
        """One column with two distinct values gives two output columns."""
        frame = pd.DataFrame({'v': ['s', 'n', 's', 's']})
        expected = self.array([
            [0, 1],
            [1, 0],
            [0, 1],
            [0, 1],
        ])

        actual = CategoryEncoder().fit_transform(frame)

        self.assertArrayEqual(expected, actual)

    def test_three_categories_custom_unobserved_labels(self):
        """A label passed via ``labels`` but absent from the data still gets a column."""
        frame = pd.DataFrame({'v': ['s', 'n', 's', 's']})
        expected = self.array([
            [0, 0, 1],
            [0, 1, 0],
            [0, 0, 1],
            [0, 0, 1],
        ])

        encoder = CategoryEncoder(labels={'v': ['s', 'n', 'a']})
        actual = encoder.fit_transform(frame)

        self.assertArrayEqual(expected, actual)

    def test_three_categories(self):
        """One column with three distinct values gives three output columns."""
        frame = pd.DataFrame({'v': ['a', 'b', 'c', 'b', 'a', 'c']})
        expected = self.array([
            [1, 0, 0],
            [0, 1, 0],
            [0, 0, 1],
            [0, 1, 0],
            [1, 0, 0],
            [0, 0, 1],
        ])

        actual = CategoryEncoder().fit_transform(frame)

        self.assertArrayEqual(expected, actual)

    def test_two_categorical_columns(self):
        """Encodings of two columns are placed side by side in column order."""
        frame = pd.DataFrame({
            'v1': ['a', 'b', 'c', 'b', 'a', 'c'],
            'v2': ['x', 'y', 'z', 'y', 'x', 'z'],
        })
        expected = self.array([
            [1, 0, 0, 1, 0, 0],
            [0, 1, 0, 0, 1, 0],
            [0, 0, 1, 0, 0, 1],
            [0, 1, 0, 0, 1, 0],
            [1, 0, 0, 1, 0, 0],
            [0, 0, 1, 0, 0, 1],
        ])

        actual = CategoryEncoder().fit_transform(frame)

        self.assertArrayEqual(expected, actual)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
from unittest import TestCase | ||
import pandas as pd | ||
|
||
from m16_mlutils.pipeline import MostFrequentImputer | ||
|
||
class test_MostFrequentImputer(TestCase):
    """Checks the frame produced by ``MostFrequentImputer.fit_transform``."""

    def test_imputes_correct_value(self):
        """The ``None`` entry of each column is replaced by that column's most common value."""
        # Frame with one missing entry per column.
        missing_data = pd.DataFrame({
            'values1': ['A', 'A', None, 'B'],
            'values2': ['1', '2', None, '1'],
        })
        # The same frame with the gaps filled in.
        expected = pd.DataFrame({
            'values1': ['A', 'A', 'A', 'B'],
            'values2': ['1', '2', '1', '1'],
        })

        actual = MostFrequentImputer().fit_transform(missing_data)

        frames_match = expected.equals(actual)
        self.assertTrue(frames_match)
|