# data_processing.py
import numpy as np
import pandas as pd
import argparse
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cross_validation import train_test_split
import pandautils as pup
import warnings
import logging
from collections import OrderedDict
from itertools import izip
import tqdm
logger = logging.getLogger('data_processing')
def _pairwise(iterable):
'''s -> (s0, s1), (s2, s3), (s4, s5), ...'''
a = iter(iterable)
return izip(a, a)
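
# Illustrative note: _pairwise is used further down to walk the flat tuple
# returned by train_test_split two elements at a time, e.g. (toy values):
#   list(_pairwise(['jet_train', 'jet_test', 'ph_train', 'ph_test']))
#   --> [('jet_train', 'jet_test'), ('ph_train', 'ph_test')]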
def read_in(class_files_dict, tree_name, particles, mode):
'''
    Takes in a dict mapping class names to lists of ROOT files, loads them, and slices them into ML format.
Args:
class_files_dict: dictionary that links the names of the different classes
in the classification problem to the paths of the ROOT files
associated with each class; for example:
{
"ttbar" :
[
"/path/to/file1.root",
"/path/to/file2.root",
],
"qcd" :
[
"/path/to/file3.root",
"/path/to/file4*",
],
...
}
tree_name: string, name of the tree to open in the ntuples
        particles: dictionary that provides information about the different particle streams in the events,
for example:
{
"jet" :
{
"branches" :
[
"jet_pt",
"jet_eta"
],
"max_length" : 5
},
"photon" :
{
"branches" :
[
"photon_pt",
"photon_eta"
],
"max_length" : 3
}
}
Returns:
X: an OrderedDict containing the feature matrices for the different particle types, e.g.:
X = {
"jet" : X_jet,
"photon" : X_photon,
"muon" : X_muon
}
        where each X_<particle> is an ndarray of dimensions [n_ev, n_<particle>_features]
y: ndarray [n_ev, 1] containing the truth labels
w: ndarray [n_ev, 1] containing the event weights
le: LabelEncoder to transform numerical y back to its string values
'''
branches = []
for particle_name, particle_info in particles.iteritems():
branches += particle_info["branches"]
    # convert files to pandas DataFrames, assign key or mass to y, concatenate all files
def _make_df(val, key, branches):
        df = pup.root2panda(val, tree_name, branches=branches + ['HGamEventInfoAuxDyn.yybb_weight'])
if mode == 'classification':
df['y'] = key
elif mode == 'regression':
if key == 'bkg':
df['y'] = 0
else:
df['y'] = int(key[1:])
return df
all_events = pd.concat([_make_df(val, key, branches) for key, val in class_files_dict.iteritems()], ignore_index=True)
X = OrderedDict()
for particle_name, particle_info in particles.iteritems():
logger.info('Building X_{}'.format(particle_name))
X[particle_name] = all_events[particle_info["branches"]].values
    # transform string labels to integer classes for classification, or set y for regression
if mode == 'classification':
le = LabelEncoder()
y = le.fit_transform(all_events['y'].values)
elif mode == 'regression':
le = None
y = all_events['y'].values
#w = all_events['HGamEventInfoAuxDyn.yybb_weight'].values
    w = np.ones(len(y))  # placeholder: unit weights instead of HGamEventInfoAuxDyn.yybb_weight
return X, y, w, le
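
# Minimal usage sketch for read_in; the tree name and file paths below are
# hypothetical placeholders, not values taken from this module:
#
#   class_files_dict = {
#       'ttbar': ['/path/to/file1.root'],
#       'bkg':   ['/path/to/file3.root']
#   }
#   particles = {
#       'jet': {'branches': ['jet_pt', 'jet_eta'], 'max_length': 5}
#   }
#   X, y, w, le = read_in(class_files_dict, 'CollectionTree', particles,
#                         mode='classification')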
def _scale(matrix_train, matrix_test):
'''
    Use scikit-learn to scale features to zero mean and unit standard deviation.
Because of event-level structure, we need to flatten X, scale, and then reshape back into event format.
Args:
matrix_train: X_train [n_ev_train, n_particle_features], numpy ndarray of unscaled features of events allocated for training
matrix_test: X_test [n_ev_test, n_particle_features], numpy ndarray of unscaled features of events allocated for testing
Returns:
the same matrices after scaling
'''
with warnings.catch_warnings():
warnings.simplefilter("ignore")
        # keep one reference column per set to restore the jagged event structure after flattening
        ref_test = matrix_test[:, 0]
        ref_train = matrix_train[:, 0]
for col in xrange(matrix_train.shape[1]):
scaler = StandardScaler()
matrix_train[:, col] = pup.match_shape(
scaler.fit_transform(pup.flatten(matrix_train[:, col]).reshape(-1, 1)).ravel(), ref_train)
matrix_test[:, col] = pup.match_shape(
scaler.transform(pup.flatten(matrix_test[:, col]).reshape(-1, 1)).ravel(), ref_test)
return matrix_train, matrix_test
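
# Note on _scale: each column of the event-level matrix holds one jagged
# array per event, so it is flattened to particle level, standardised with a
# StandardScaler fit on the training column only (the test column just reuses
# that fit, avoiding leakage), and reshaped back to the original event
# structure with pup.match_shape.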
def shuffle_split_scale(X, y, w):
'''
Shuffle data, split it into test (40%) and training (60%) sets, scale X
Args:
X: an OrderedDict containing the feature matrices for the different particle types, e.g.:
X = {
"jet" : X_jet,
"photon" : X_photon,
"muon" : X_muon
}
        where each X_<particle> is an ndarray of dimensions [n_ev, n_<particle>_features]
y: ndarray [n_ev, 1] containing the truth labels
w: ndarray [n_ev, 1] containing the event weights
Returns:
data: an OrderedDict containing all X, y, w ndarrays for all particles (both train and test), e.g.:
data = {
"X_jet_train" : X_jet_train,
"X_jet_test" : X_jet_test,
"X_photon_train" : X_photon_train,
"X_photon_test" : X_photon_test,
"y_train" : y_train,
"y_test" : y_test,
"w_train" : w_train,
"w_test" : w_test
}
'''
logger.info('Shuffling, splitting and scaling')
data_tuple = train_test_split(*(X.values() + [y, w]), test_size=0.4)
data = OrderedDict()
for particle, (train, test) in zip(X.keys(), _pairwise(data_tuple[:(2 * len(X))])):
        logger.info('Scaling {}'.format(particle))
        data['X_' + particle + '_train'], data['X_' + particle + '_test'] = _scale(train, test)
data['y_train'], data['y_test'], data['w_train'], data['w_test'] = data_tuple[-4:]
return data
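
# Hedged usage sketch, assuming X, y, w come from read_in above and that a
# 'jet' stream was requested (names are illustrative):
#
#   data = shuffle_split_scale(X, y, w)
#   X_jet_train, X_jet_test = data['X_jet_train'], data['X_jet_test']
#   y_train, w_train = data['y_train'], data['w_train']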
def padding(X, max_length, value=-999):
'''
Transforms X to a 3D array where the dimensions correspond to [n_ev, n_particles, n_features].
n_particles is now fixed and equal to max_length.
    If the number of particles in an event was < max_length, the missing particles will be filled with default values.
    If the number of particles in an event was > max_length, the excess particles will be removed.
Args:
X: ndarray [n_ev, n_features] with an arbitrary number of particles per event
max_length: int, the number of particles to keep per event
        value (optional): the value used to pad events with fewer particles than max_length, default = -999
Returns:
X_pad: ndarray [n_ev, n_particles, n_features], padded version of X with fixed number of particles
Note:
        Use a Masking layer downstream to ignore the particles whose entries are artificially set to value (default -999)
'''
X_pad = value * np.ones((X.shape[0], max_length, X.shape[1]), dtype='float32')
    # each entry of X is an array of per-branch jagged arrays; transpose to
    # [n_particles, n_features] and keep at most max_length particles per event
    for i, row in enumerate(X):
X_pad[i, :min(len(row[0]), max_length), :] = np.array(row.tolist()).T[:min(len(row[0]), max_length), :]
return X_pad
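
# Hedged usage sketch for padding, continuing the example above (names are
# illustrative): every stream is padded to the fixed max_length declared in
# the particles dict, and the padded slots can then be ignored downstream,
# e.g. with a Keras Masking(mask_value=-999) layer:
#
#   X_jet_train_pad = padding(data['X_jet_train'], particles['jet']['max_length'])
#   # --> shape [n_ev_train, 5, n_jet_features], missing jets filled with -999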