-
Notifications
You must be signed in to change notification settings - Fork 1
/
card.py
147 lines (123 loc) · 4.18 KB
/
card.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
from collections import Counter
import numpy as np
def key(item):
    """Return the element at index 1 of ``item``.

    Presumably intended as a ``key=`` function for sorting pairs by their
    second component; no caller is visible in this file.
    """
    return item[1]
def cardinality(df):
    """Compute multi-label statistics for dataset ``df`` and save them.

    Reads ``./datasets/<df>/<df>.complete``, writes a text report to
    ``./datasets/<df>/<df>.dsetm`` and a bar chart of the number of
    instances per active-label count to ``./datasets/<df>/<df>freclbs.png``.

    The report contains the instance/feature/label counts taken from the
    file header, the number of instances per label-count, the per-label
    frequency (most frequent first), the frequency of each label
    combination, the cardinality (total active labels / header instance
    count), the density (cardinality / number of labels) and the number of
    distinct label combinations.

    Args:
        df: Dataset name; used both as the directory name and the file stem.

    Returns:
        None. Results are written to disk.

    Raises:
        IOError/OSError: if the dataset file cannot be read or the outputs
            cannot be written.
        IndexError, ValueError: if the header or a data row does not have
            the expected layout.
    """
    print("Computing measures for the chosen dataset...")
    base = "./datasets/" + df + "/" + df

    avg = 0    # total number of active labels summed over all instances
    dist = []  # one string of concatenated label tokens per instance

    with open(base + ".complete") as datafile:
        # Header: the second token of the first line (minus its last
        # character) flags sparse mode; the second token of each of the
        # next three lines holds a count.
        sparse = datafile.readline().split()[1][:-1] == 'SPARSE'
        instances = int(datafile.readline().split()[1])
        features = int(datafile.readline().split()[1])
        labels = int(datafile.readline().split()[1])

        # An instance can have anywhere from 0 to `labels` active labels,
        # so labels + 1 bins are required.
        insts = np.zeros(labels + 1, dtype=int)

        for line in datafile:
            tokens = line.split()
            if not tokens:
                # Blank or whitespace-only line: carries no instance.
                continue
            label_tokens = _label_tokens(tokens, sparse, features, labels)
            # Sum once and reuse: summing a map() object twice yields 0 the
            # second time on Python 3.
            active = sum(int(tok) for tok in label_tokens)
            dist.append(''.join(label_tokens))
            insts[active] += 1
            avg += active

    # The report is only opened once parsing succeeded, so a malformed
    # dataset does not leave a truncated report behind.
    with open(base + ".dsetm", 'w') as fp:
        fp.write("Instances: " + str(instances) + '\n')
        fp.write("Features: " + str(features) + '\n')
        fp.write("Labels: " + str(labels) + '\n')

        fp.write("Num of instances per label-count (0, 1, 2, ... nlabel)\n")
        for label_count, n_instances in enumerate(insts):
            fp.write(str(label_count) + ' ' + str(n_instances) + '\n')

        fp.write("Labels frequency: \n")
        # Column 0: how many instances have the label active;
        # column 1: 1-based label number.
        aux = np.zeros(shape=(labels, 2))
        for i in range(labels):
            aux[i] = (sum(int(row[i]) for row in dist), i + 1)
        aux = aux[(-aux[:, 0]).argsort()]  # most frequent label first
        for freq_row in aux:
            fp.write(str(int(freq_row[1])) + ' ' + str(int(freq_row[0])) + '\n')

        fp.write("Label combinations frequency: \n")
        for value, count in Counter(dist).most_common():
            # Each combination is reported as the integer value of its
            # label string read as a base-2 number.
            fp.write(str(int(value, 2)) + ' ' + str(count) + '\n')

        # float() keeps this a true division on Python 2.7 as well.
        card = avg / float(instances)
        fp.write("Cardinality: " + str(card) + '\n')
        fp.write("Density: " + str(card / float(labels)) + '\n')
        fp.write("Distinct: " + str(len(set(dist))) + '\n')

    _plot_label_counts(df, insts)


def _label_tokens(tokens, sparse, features, labels):
    """Return the label tokens of one already-split data row."""
    if sparse:
        # Sparse rows: the labels are whatever sits between '[' and ']'.
        return tokens[tokens.index('[') + 1:tokens.index(']')]
    # Dense rows: labels follow the feature columns; the +1 skips the single
    # token in between (the '[' delimiter, per the original comment).
    return tokens[features + 1:features + 1 + labels]


def _plot_label_counts(df, insts):
    """Save a bar chart of instances per active-label count as a PNG."""
    # Imported here so the 'Agg' backend is selected before pyplot loads.
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt

    flbs = np.trim_zeros(insts, 'b')  # drop trailing empty bins
    y_pos = np.arange(flbs.shape[0])
    plt.figure(figsize=(15, 9))
    plt.bar(y_pos, flbs, align='center', alpha=0.5)
    plt.xticks(y_pos, list(range(flbs.shape[0])))
    plt.ylabel('Instances')
    plt.xlabel('Num of active labels')
    plt.title(df + ': ' + 'Label frecuency')
    # Place each bar's value slightly (1% of the tallest bar) above it.
    offset = np.max(flbs) * 0.01 if flbs.size else 0
    for pos, height in enumerate(flbs):
        plt.annotate(str(height), xy=(pos, height + offset),
                     horizontalalignment='center')
    plt.savefig('./datasets/' + df + '/' + df + 'freclbs.png')
    plt.close()
# Note: "/" between two ints is integer division on Python 2.7 but float
# division on Python 3, so float division has to be forced explicitly.
def main():
    """Run ``cardinality`` on every dataset in the hard-coded list below."""
    # A tuple (not a set) so the datasets are processed in a fixed order.
    datasets = (
        'Delicious',
        'bookmarks',
        'mediamill',
        'tmc2007',
        'bibtex',
        'Corel5k',
        'emotions',
        'Enron',
        'genbase',
        'medical',
        'scene',
        'yeast',
    )
    for ds in datasets:
        # Function-call form prints identically on Python 2 and Python 3.
        print("dataset:" + ds)
        cardinality(ds)
# Run the full dataset sweep only when executed as a script, not on import.
if __name__ == "__main__":
    main()