-
Notifications
You must be signed in to change notification settings - Fork 0
/
correlation_matrix.py
104 lines (87 loc) · 3.34 KB
/
correlation_matrix.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import seaborn as sn
import matplotlib.pyplot as plt
import os
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
import seaborn as sns
data_dir = "./dataset_100/separated_text_data/"


def _read_text(path):
    """Return the full contents of the text file at *path*.

    The file is opened in text mode with the platform default encoding and
    is closed even if reading raises.
    """
    with open(path) as handle:
        return handle.read()


# Per-feature input files, all located under data_dir.
fScalar = os.path.join(data_dir, 'scalars.txt')
fCategories = os.path.join(data_dir, 'categories.txt')
fIngredients = os.path.join(data_dir, 'ingredients.txt')
fPreparation = os.path.join(data_dir, 'preparations.txt')
fNames = os.path.join(data_dir, 'names.txt')
# These two files live under ./Ordinamenti rather than data_dir.
fCouple = './Ordinamenti/dataset_coppie.txt'
fLabels = './Ordinamenti/output-file.txt'

# Raw text of every input file; parsing happens further down the script.
dataS = _read_text(fScalar)
dataC = _read_text(fCategories)
dataI = _read_text(fIngredients)
dataP = _read_text(fPreparation)
food_data_names = _read_text(fNames)
dataCC = _read_text(fCouple)
dataL = _read_text(fLabels)
def _rows_to_array(lines, n_columns):
    """Build a float32 matrix from space-separated text rows.

    The first token of every line is skipped (presumably a row label --
    confirm against the data files); the remaining tokens are written into
    the matching row and converted to float32 by numpy on assignment.

    Args:
        lines: list of text lines, one per output row.
        n_columns: number of value columns allocated per row.

    Returns:
        Array of shape (len(lines), n_columns) with dtype float32.

    Raises:
        ValueError: if a line's values cannot be broadcast into n_columns
            entries (this includes an empty line, e.g. one produced by a
            trailing newline) or cannot be parsed as numbers.
    """
    table = np.zeros((len(lines), n_columns), dtype='float32')
    for row, line in enumerate(lines):
        table[row, :] = line.split(' ')[1:]
    return table


# One row per line of each input file; column counts are fixed per feature group.
linesOfS = dataS.split('\n')
food_data_scalars = _rows_to_array(linesOfS, 1)

linesOfC = dataC.split('\n')
food_data_categories = _rows_to_array(linesOfC, 3)

linesOfI = dataI.split('\n')
food_data_ingredients = _rows_to_array(linesOfI, 36)

linesOfP = dataP.split('\n')
food_data_preparation = _rows_to_array(linesOfP, 8)
# manipulate categorical data
# Column 0 of the category matrix is one-hot encoded; the remaining category
# columns are kept as plain numeric features. reshape(-1, 1) turns the column
# into the 2-D (n_samples, 1) layout the encoder expects without hard-coding
# the number of rows.
category1HE = food_data_categories[:, 0].reshape(-1, 1)
food_data_categories = np.delete(food_data_categories, 0, axis=1)
enc = OneHotEncoder(handle_unknown='ignore')
# The encoder returns a sparse matrix; densify it so it can be concatenated.
finalCategory = enc.fit_transform(category1HE).toarray()

# remove ingredients never used
food_data_ingredients = np.delete(food_data_ingredients, [1, 16, 23, 26, 32], axis=1)

# concatenate all data in a numpy tensor
# Column order: one-hot category, remaining category columns, scalars,
# ingredients, preparations.
all_data = np.concatenate(
    [finalCategory, food_data_categories, food_data_scalars, food_data_ingredients, food_data_preparation],
    axis=1,
)
# preparation of labels for pandas dataframe
foodsLabel = ['F' + str(i) for i in range(1, 101)]
categoryLabel = ['CATEGORY' + str(i) for i in range(1, 6)]
ingredientsLabel = ['INGREDIENTS' + str(i) for i in range(1, 32)]
preparationsLabel = ['PREPARATIONS' + str(i) for i in range(1, 9)]
# Column labels, in the same order as the columns of all_data.
featureLabels = [*categoryLabel, 'COST', 'DIFFICULTY', 'PREPARATION', *ingredientsLabel, *preparationsLabel]

# creating pandas dataframe directly from the concatenated feature matrix
# Only the first len(foodsLabel) rows of all_data are used; any extra
# trailing rows are ignored.
# NOTE(review): confirm that silently dropping extra rows is intended.
final_data = pd.DataFrame(all_data[:len(foodsLabel)], columns=featureLabels, index=foodsLabel)
final_data = final_data.astype(float)

# Pairwise correlation between feature columns, rendered as a heatmap.
corr = final_data.corr()
plt.figure(figsize=(32, 32))
heatmap = sns.heatmap(corr)
plt.savefig("./dataset_100/correlationMatrix.jpg", dpi=300)