Skip to content

Commit

Permalink
Added MostFrequentImputer and CategoryEncoder (0.3.0)
Browse files Browse the repository at this point in the history
  • Loading branch information
fferegrino committed Jul 4, 2018
1 parent 270a482 commit 46f0606
Show file tree
Hide file tree
Showing 7 changed files with 175 additions and 2 deletions.
35 changes: 35 additions & 0 deletions m16_mlutils/pipeline/CategoryEncoder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

import numpy as np
import pandas as pd

class CategoryEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode every column of a DataFrame of categorical values.

    Each column is first mapped to integer codes with a ``LabelEncoder``;
    the codes are then expanded with a ``OneHotEncoder``. The per-column
    results are concatenated side by side, in the order the columns are
    iterated.

    Parameters
    ----------
    labels : dict, optional
        Maps a column name to the full collection of categories for that
        column. A column listed here is fitted on these categories instead
        of the values seen in ``fit``, so categories missing from the
        training data still receive an output column.
    """

    def __init__(self, labels=None):
        # ``None`` replaces the former ``{}`` default so instances never
        # share (and accidentally mutate) a single dict object.
        self.labels = labels
        self.label_encoders = {}
        self.one_hot_encoders = {}

    @staticmethod
    def _new_one_hot_encoder():
        """Return a OneHotEncoder that produces dense arrays."""
        try:
            # scikit-learn >= 1.2 names the option ``sparse_output``;
            # the old ``sparse`` keyword was removed in 1.4.
            return OneHotEncoder(sparse_output=False)
        except TypeError:
            # Older releases only accept the original keyword.
            return OneHotEncoder(sparse=False)

    def fit(self, X, y=None):
        """Fit one label encoder and one one-hot encoder per column of X.

        Parameters
        ----------
        X : DataFrame-like
            Iterating over it must yield column names, and ``X[name]`` must
            return that column's values.
        y : ignored
            Present for compatibility with the scikit-learn estimator API.

        Returns
        -------
        self
        """
        labels = self.labels if self.labels is not None else {}
        # Start clean so that refitting never keeps encoders for columns
        # that are not part of this X.
        self.label_encoders = {}
        self.one_hot_encoders = {}

        for column in X:
            # Prefer the declared categories; fall back to the observed ones.
            categories = labels[column] if column in labels else X[column]

            label_encoder = LabelEncoder()
            codes = label_encoder.fit_transform(categories)

            one_hot_encoder = self._new_one_hot_encoder()
            # The one-hot encoder expects a 2-D (n_samples, 1) input.
            one_hot_encoder.fit(codes.reshape(-1, 1))

            self.label_encoders[column] = label_encoder
            self.one_hot_encoders[column] = one_hot_encoder

        return self

    def transform(self, X):
        """Return the one-hot encoding of every column of X as one array.

        Parameters
        ----------
        X : DataFrame-like
            Must only contain columns that were present during ``fit``;
            an unknown column raises ``KeyError``.

        Returns
        -------
        numpy.ndarray
            The per-column one-hot blocks concatenated along axis 1.
        """
        one_hots = []
        for column in X:
            codes = self.label_encoders[column].transform(X[column])
            encoded = self.one_hot_encoders[column].transform(codes.reshape(-1, 1))
            one_hots.append(encoded)

        return np.concatenate(one_hots, axis=1)
14 changes: 14 additions & 0 deletions m16_mlutils/pipeline/MostFrequentImputer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd

# Imputes missing values in each column with that column's most frequent observed value.
# From: https://github.com/ageron/handson-ml/blob/master/02_end_to_end_machine_learning_project.ipynb
# Inspired by stackoverflow.com/questions/25239958
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in each column with that column's most frequent value."""

    def fit(self, X, y=None):
        """Record the most frequent value of every column of X.

        Parameters
        ----------
        X : pandas.DataFrame
            Data whose per-column most frequent values are learned.
        y : ignored
            Present for compatibility with the scikit-learn estimator API.

        Returns
        -------
        self
            With ``most_frequent_`` set to a Series indexed by column name.
            Columns without any non-missing value are left out of it, so
            ``transform`` leaves them untouched.
        """
        columns = []
        modes = []
        for column in X:
            counts = X[column].value_counts()
            # value_counts() ignores missing values, so a column made only of
            # missing values has no entry; skip it instead of failing on
            # index[0].
            if counts.empty:
                continue
            columns.append(column)
            # Counts are sorted in descending order: the first label wins.
            modes.append(counts.index[0])

        # An explicit dtype is only needed when nothing was collected, to
        # avoid pandas guessing a dtype for an empty Series.
        self.most_frequent_ = pd.Series(
            modes, index=columns, dtype=None if modes else object
        )
        return self

    def transform(self, X, y=None):
        """Return a copy of X with missing values replaced per column.

        Parameters
        ----------
        X : pandas.DataFrame
            Data to impute.
        y : ignored
            Present for API compatibility.

        Returns
        -------
        pandas.DataFrame
            The result of ``X.fillna`` using the values learned in ``fit``.
        """
        return X.fillna(self.most_frequent_)
4 changes: 3 additions & 1 deletion m16_mlutils/pipeline/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
from .DataFrameSelector import DataFrameSelector
from .DataFrameSelector import DataFrameSelector
from .MostFrequentImputer import MostFrequentImputer
from .CategoryEncoder import CategoryEncoder
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from setuptools.command.install import install

# Package version
VERSION = "0.2.0"
VERSION = "0.3.0"

class VerifyVersionCommand(install):
"""Custom command to verify that the git tag matches our version"""
Expand Down
10 changes: 10 additions & 0 deletions tests/NumPyTestCase.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from unittest import TestCase
import numpy as np

class NumPyTestCase(TestCase):
    """TestCase base class with small helpers for working with NumPy arrays."""

    def assertArrayEqual(self, expected: np.ndarray, actual: np.ndarray):
        """Fail unless ``np.array_equal(expected, actual)`` holds.

        Parameters
        ----------
        expected : np.ndarray
            The reference array.
        actual : np.ndarray
            The array produced by the code under test.

        Raises
        ------
        AssertionError
            When the arrays differ; the message shows both arrays so the
            mismatch can be diagnosed from the test output alone.
        """
        message = "The arrays do not match\nexpected:\n{!r}\nactual:\n{!r}".format(
            expected, actual
        )
        self.assertTrue(np.array_equal(expected, actual), message)

    def array(self, array):
        """Build a ``float32`` NumPy array from *array* (e.g. nested lists)."""
        return np.array(array, dtype=np.float32)
82 changes: 82 additions & 0 deletions tests/test_CategoryEncoder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
from .NumPyTestCase import NumPyTestCase
import pandas as pd
import numpy as np

from m16_mlutils.pipeline import CategoryEncoder

class test_CategoryEncoder(NumPyTestCase):
    """Checks the array produced by CategoryEncoder on small DataFrames."""

    def _encode(self, frame_data, **encoder_kwargs):
        """Fit-transform a DataFrame built from *frame_data* with a new encoder."""
        frame = pd.DataFrame(frame_data)
        return CategoryEncoder(**encoder_kwargs).fit_transform(frame)

    def test_two_categories(self):
        """A column with two distinct values yields two output columns."""
        result = self._encode({'v': ['s', 'n', 's', 's']})

        wanted = self.array([
            [0, 1],
            [1, 0],
            [0, 1],
            [0, 1],
        ])
        self.assertArrayEqual(wanted, result)

    def test_three_categories_custom_unobserved_labels(self):
        """A label declared up front but never observed still gets a column."""
        result = self._encode(
            {'v': ['s', 'n', 's', 's']},
            labels={'v': ['s', 'n', 'a']},
        )

        wanted = self.array([
            [0, 0, 1],
            [0, 1, 0],
            [0, 0, 1],
            [0, 0, 1],
        ])
        self.assertArrayEqual(wanted, result)

    def test_three_categories(self):
        """A column with three distinct values yields three output columns."""
        result = self._encode({'v': ['a', 'b', 'c', 'b', 'a', 'c']})

        wanted = self.array([
            [1, 0, 0],
            [0, 1, 0],
            [0, 0, 1],
            [0, 1, 0],
            [1, 0, 0],
            [0, 0, 1],
        ])
        self.assertArrayEqual(wanted, result)

    def test_two_categorical_columns(self):
        """Two columns are encoded separately and placed side by side."""
        result = self._encode({
            'v1': ['a', 'b', 'c', 'b', 'a', 'c'],
            'v2': ['x', 'y', 'z', 'y', 'x', 'z'],
        })

        wanted = self.array([
            [1, 0, 0, 1, 0, 0],
            [0, 1, 0, 0, 1, 0],
            [0, 0, 1, 0, 0, 1],
            [0, 1, 0, 0, 1, 0],
            [1, 0, 0, 1, 0, 0],
            [0, 0, 1, 0, 0, 1],
        ])
        self.assertArrayEqual(wanted, result)

30 changes: 30 additions & 0 deletions tests/test_MostFrequentImputer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from unittest import TestCase
import pandas as pd

from m16_mlutils.pipeline import MostFrequentImputer

class test_MostFrequentImputer(TestCase):
    """Exercises MostFrequentImputer on a small two-column DataFrame."""

    def test_imputes_correct_value(self):
        """The ``None`` in each column must become that column's most common value."""
        frame_with_gaps = pd.DataFrame({
            'values1': ['A', 'A', None, 'B'],
            'values2': ['1', '2', None, '1'],
        })
        filled_frame = pd.DataFrame({
            'values1': ['A', 'A', 'A', 'B'],
            'values2': ['1', '2', '1', '1'],
        })

        result = MostFrequentImputer().fit_transform(frame_with_gaps)

        self.assertTrue(filled_frame.equals(result))

0 comments on commit 46f0606

Please sign in to comment.