-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess.py
95 lines (67 loc) · 2.34 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import numpy as np
import sys
import os
# Name of the input CSV to preprocess. The commented-out assignment is an
# alternative input file; swap the two lines to switch.
#fileName = "1. cd3cd28.csv"
fileName = "2. cd3cd28icam2.csv"

# Locate the "data" directory next to this script (symlinks resolved), so the
# input path does not depend on the current working directory.
fileDir = os.path.dirname(os.path.realpath(__file__))
dataDir = os.path.join(fileDir, "data")
filePath = os.path.join(dataDir, fileName)
def readDataFile(path=None):
    """Load a comma-separated data file and return it with one row per column of the file.

    The first row of the file is treated as a header and discarded. The
    remaining rows are transposed, so the returned array has one row per
    file column and one column per file row.

    Args:
        path: Path of the CSV file to read. When omitted, the module-level
            ``filePath`` is used, which matches the original no-argument call.

    Returns:
        A 2-D numpy array of shape (file columns, file data rows). The shape
        is also printed as a diagnostic.
    """
    if path is None:
        path = filePath
    raw = np.genfromtxt(path, delimiter=',')
    my_data = np.transpose(raw[1:])  # removing header row
    # Parenthesised form prints the same text on Python 2 and is valid on Python 3.
    print(my_data.shape)
    return my_data
#Removes sets of observations in which the value of at least one variable is an outlier
#Outliers are defined as values lying further than 3*stddev from the mean
def remove_outliers(my_data):
    """Drop every observation in which at least one variable is an outlier.

    Each row of ``my_data`` is treated as one variable and each column as one
    observation. A value is an outlier when it lies more than three standard
    deviations from the mean of its own row. A column that contains an outlier
    in any row is removed from all rows.

    Args:
        my_data: 2-D array-like, rows are variables and columns are observations.

    Returns:
        A new numpy array holding the same rows with only the retained
        columns. The resulting shape is also printed as a diagnostic.
    """
    data = np.asarray(my_data)
    if data.size == 0:
        # Nothing to filter; skip the statistics so no NaN warnings are raised.
        print(data.shape)
        return data

    # Per-row statistics, kept as column vectors so they broadcast across columns.
    row_mean = data.mean(axis=1, keepdims=True)
    three_sigma = 3 * data.std(axis=1, keepdims=True)

    # A column is discarded as soon as one of its entries is an outlier.
    is_outlier = np.abs(data - row_mean) > three_sigma
    keep = ~is_outlier.any(axis=0)

    new_data = data[:, keep]
    print(new_data.shape)
    return new_data
def get_value(item, first_boundary, second_boundary):
    """Map a value to a level 0, 1 or 2 using two inclusive upper boundaries.

    Returns 0 when ``item <= first_boundary``, otherwise 1 when
    ``item <= second_boundary``, otherwise 2.
    """
    if item <= first_boundary:
        return 0
    return 1 if item <= second_boundary else 2
#Discretize the protein level values
#Modes supported - interval and quantile discretization
def discretize(my_data, mode = 'quantile'):
    """Discretize each row of ``my_data`` into the three levels 0, 1 and 2.

    Two boundaries are computed independently for every row:

    * ``'quantile'``: the 33.33th and 66.66th percentiles of the row
      (the values used here approximate the exact one-third and two-third
      points).
    * ``'interval'``: the points one third and two thirds of the way from the
      row minimum to the row maximum.

    A value maps to 0 when it is <= the first boundary, to 1 when it is <= the
    second boundary, and to 2 otherwise.

    Args:
        my_data: Iterable of rows (e.g. a 2-D numpy array), one row per variable.
        mode: ``'quantile'`` (default) or ``'interval'``.

    Returns:
        A numpy array of integer levels with one row per input row.

    Any other ``mode`` prints an error message and terminates via
    ``sys.exit(1)`` when the first row is processed.
    """
    new_data = []
    for row in my_data:
        row = np.asarray(row)
        if mode == 'quantile':
            first_boundary = np.percentile(row, 33.33)
            second_boundary = np.percentile(row, 66.66)
        elif mode == 'interval':
            row_min, row_max = np.min(row), np.max(row)
            interval_length = (row_max - row_min) / 3.0
            first_boundary = row_min + interval_length
            second_boundary = row_min + 2 * interval_length
        else:
            # Parenthesised form prints the same text on Python 2 and 3.
            print('Invalid mode for discretize function')
            sys.exit(1)

        # Elementwise: <= first -> 0, <= second -> 1, anything else -> 2.
        levels = np.where(row <= first_boundary, 0,
                          np.where(row <= second_boundary, 1, 2))
        new_data.append(levels)
    return np.array(new_data)
if __name__ == '__main__':
    # Script entry point: load the configured data file, discretize every
    # variable into three equal-width intervals and save the result as CSV.
    my_data = readDataFile()
    # Optional outlier filtering step, currently disabled.
    #my_data = remove_outliers(my_data)
    my_data = discretize(my_data, 'interval')

    # Column names written as the header line, in the order of the data rows.
    nodes = ['praf','pmek','plcg','PIP2','PIP3','p44/42','pakts473','PKA','PKC','P38','pjnk']
    header_string = ",".join(nodes)

    # Write next to the input file (via dataDir) instead of relative to the
    # current working directory, so the script works from any directory.
    output_path = os.path.join(dataDir, "2_interval_full.csv")
    # Transposed back so each output line is one observation.
    np.savetxt(output_path, my_data.T, delimiter = ",", fmt = '%d', header = header_string)