-
Notifications
You must be signed in to change notification settings - Fork 106
/
sframe_to_scipy_sparse.py
102 lines (86 loc) · 2.98 KB
/
sframe_to_scipy_sparse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import graphlab as gl
def sframe_to_scipy(x, column_name):
    """Convert a dict-typed SFrame column into a SciPy CSR sparse matrix.

    Each entry of ``x[column_name]`` is a dict mapping a feature name to a
    numeric value. Row ``r`` of the result holds the values of the ``r``-th
    dict, with one matrix column per distinct feature name.

    Parameters
    ----------
    x : SFrame
        Input data. Only ``column_name`` is read; other columns are ignored.
    column_name : str
        Name of the dict-typed column to convert.

    Returns
    -------
    mat : scipy.sparse.csr_matrix
        Matrix with one row per row of ``x`` and one column per encoded
        feature. Rows whose dict is empty or missing are all-zero rows.
    mapping : SFrame
        The encoder's ``'feature_encoding'`` table, which relates each
        category (feature name) to its integer column index.

    Raises
    ------
    AssertionError
        If the chosen column is not of dict type.
    """
    assert x[column_name].dtype() == dict, \
        "The chosen column must be dict type, representing sparse data."

    import numpy as np
    from scipy.sparse import csr_matrix

    # Take the row count from the input itself. After stacking, rows whose
    # dict is empty no longer appear, so the largest surviving row id would
    # under-count the matrix height.
    n_rows = x.num_rows()

    # Work on the dict column only: unrelated columns would otherwise be
    # duplicated for every key by stack(), and could clash with the helper
    # column names ('id', 'feature', 'value', ...) introduced below.
    x = x[[column_name]]

    # Create triples of (row_id, feature_id, value).
    # 1. Add a row number (column 'id').
    x = x.add_row_number()

    # 2. Stack will transform x to have a row for each (row, key) pair.
    #    drop_na=True keeps empty/missing dicts from producing placeholder
    #    rows with a None feature and a None value.
    x = x.stack(column_name, ['feature', 'value'], drop_na=True)

    # x now looks like the following:
    # +----+---------+-------+
    # | id | feature | value |
    # +----+---------+-------+
    # | 0  | bob     | 5.0   |
    # | 0  | hello   | 1.0   |
    # | 1  | john    | 10.0  |
    # | 1  | hello   | 3.0   |
    # +----+---------+-------+

    # Map feature names to integers using a OneHotEncoder transformation.
    # The encoder is fitted on the stacked data and then applied to it.
    encoder = gl.feature_engineering.OneHotEncoder(features=['feature'])
    encoder.fit(x)
    x = encoder.transform(x)

    # Get the feature mapping.
    mapping = encoder['feature_encoding']

    # Each 'encoded_features' entry is a one-key dict such as {1: 1}; that
    # single key is the integer id of the feature.
    x['feature_id'] = x['encoded_features'].dict_keys().apply(
        lambda keys: keys[0])

    # x now has additional columns:
    # +----+-------+------------------+------------+
    # | id | value | encoded_features | feature_id |
    # +----+-------+------------------+------------+
    # | 0  | 5.0   | {0: 1}           | 0          |
    # | 0  | 1.0   | {1: 1}           | 1          |
    # | 1  | 10.0  | {2: 1}           | 2          |
    # | 1  | 3.0   | {1: 1}           | 1          |
    # +----+-------+------------------+------------+

    # Create numpy arrays that contain the data for the sparse matrix.
    row_idx = np.array(x['id'])
    col_idx = np.array(x['feature_id'])
    values = np.array(x['value'])
    n_cols = x['feature_id'].max() + 1

    # Create a sparse matrix.
    mat = csr_matrix((values, (row_idx, col_idx)), shape=(n_rows, n_cols))
    return mat, mapping
# Example input: one dict of feature -> value per row.
x = gl.SFrame({
    'features': [
        {'hello': 1.0, 'bob': 5},
        {'hello': 3.0, 'john': 10},
    ],
})

# Convert the 'features' column; m is the sparse matrix, f the feature mapping.
m, f = sframe_to_scipy(x, column_name='features')
# The m object is now a sparse matrix representing x.
# >>> m
# <2x3 sparse matrix of type '<type 'numpy.float64'>'
# with 4 stored elements in Compressed Sparse Row format>
# >>> m.todense()
# matrix([[ 5., 1., 0.],
# [ 0., 3., 10.]])
# The f object provides an SFrame with the following format:
# >>> f
# Columns:
# feature str
# category str
# index int
#
# Rows: 3
#
# Data:
# +---------+----------+-------+
# | feature | category | index |
# +---------+----------+-------+
# | feature | bob | 0 |
# | feature | hello | 1 |
# | feature | john | 2 |
# +---------+----------+-------+