-
Notifications
You must be signed in to change notification settings - Fork 4
/
hor-vis-graph.py
86 lines (73 loc) · 2.12 KB
/
hor-vis-graph.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import utils
import text as text_parser
import csv
import os
from slugify import slugify
import math
horizon = 20  # do not search in HVG behind the horizon
rank_threshold = 0.01  # filter less relevant words, ranked in range 0..1
output_dir = 'hor-vis-graph'  # one CSV adjacency matrix per text is written here


def normalized_ranks(words, ranks, max_rank):
    """Return the rank of every word position in *words*, divided by *max_rank*.

    words    -- sequence of words in reading order
    ranks    -- mapping word -> raw rank
    max_rank -- value the raw ranks are divided by

    A falsy *max_rank* (0 or None) would make the division fail, so such a
    text is reported as all-zero ranks, i.e. "no relevant words".
    """
    if not max_rank:
        return [0.0 for _ in words]
    return [ranks[word] / max_rank for word in words]


def significant_words(words, series, min_rank):
    """Return the set of words that never rank below *min_rank*."""
    commons = {word for word, height in zip(words, series) if height < min_rank}
    return set(words) - commons


def add_edge(hvg, w1, w2):
    """Count one more undirected link between *w1* and *w2* in *hvg*.

    The pair is stored under its smaller word first, so (a, b) and (b, a)
    share one counter. Counting starts at 1: a stored 0 would be
    indistinguishable from "no edge" once the matrix is written.
    """
    if w1 > w2:
        w1, w2 = w2, w1
    neighbours = hvg.setdefault(w1, {})
    neighbours[w2] = neighbours.get(w2, 0) + 1


def build_hvg(words, series, max_distance, min_rank):
    """Build the weighted word graph of one text.

    For every position whose rank is at least *min_rank*, the nearest
    strictly higher-ranked position on the left and on the right (each at
    most *max_distance* positions away) is linked to it.

    words        -- sequence of words in reading order
    series       -- rank of each position, same length as *words*
    max_distance -- how many positions to scan on each side
    min_rank     -- positions ranked below this never start a scan

    Returns a nested dict ``{smaller_word: {larger_word: link_count}}``.
    """
    hvg = {}
    limit = len(series)
    for current, height in enumerate(series):
        if height < min_rank:
            continue
        # Both windows start at the direct neighbour and span exactly
        # max_distance positions; the left one walks outwards (descending).
        left_window = range(current - 1, max(-1, current - max_distance - 1), -1)
        right_window = range(current + 1, min(limit, current + max_distance + 1))
        for window in (left_window, right_window):
            for other in window:
                if series[other] > height:
                    add_edge(hvg, words[other], words[current])
                    break
    return hvg


def adjacency_matrix(hvg, vocabulary):
    """Return *hvg* as a list of rows with a header row and a label column.

    Words are sorted so that the output is identical from run to run; the
    top-left cell is an empty string, missing links are 0.
    """
    columns = sorted(vocabulary)
    matrix = [[""] + columns]
    for row_word in columns:
        row = [row_word]
        for col_word in columns:
            # Edges are stored under the smaller word of the pair.
            if row_word > col_word:
                first, second = col_word, row_word
            else:
                first, second = row_word, col_word
            row.append(hvg.get(first, {}).get(second, 0))
        matrix.append(row)
    return matrix


def write_matrix_csv(path, matrix):
    """Write *matrix* to *path* as ';'-separated CSV with '\\n' line endings."""
    # newline='' stops the text layer from translating the writer's own line
    # terminator; UTF-8 keeps non-ASCII words writable on every platform.
    with open(path, "w", newline='', encoding='utf-8') as f:
        writer = csv.writer(
            f, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL, lineterminator='\n'
        )
        writer.writerows(matrix)


def main():
    """Read the corpus, rank its words and write one graph CSV per text."""
    print('Reading texts...')
    all_texts = text_parser.get_text_corpus(9999, 'texts/news')
    print('Done! Computing TF-IDF ranks...')
    all_ranks = utils.tf_idf(all_texts)
    print('\nDone! Computing horizontal visibility graph...')

    os.makedirs(output_dir, exist_ok=True)

    total = len(all_texts)
    # Index-based on purpose: all_ranks is looked up with the same index.
    for i in range(total):
        title = all_texts[i]['title']
        print(
            'Writing HVG #{}/{} ({}...)'.format(i + 1, total, title[:10]),
            end='\r'
        )
        words = all_texts[i]['text']
        series = normalized_ranks(
            words, all_ranks[i]['stats'], all_ranks[i]['max_rank']
        )
        hvg = build_hvg(words, series, horizon, rank_threshold)
        vocabulary = significant_words(words, series, rank_threshold)
        write_matrix_csv(
            os.path.join(output_dir, slugify(title) + ".csv"),
            adjacency_matrix(hvg, vocabulary),
        )
    print('\nDone!')


if __name__ == '__main__':
    main()