import json
import math
import copy

from PositionalPosting import load_positional_postings_list, \
    POSITIONAL_POSTINGS_LIST_FILE_WITHOUT_STOP_WORDS, \
    POSITIONAL_POSTINGS_LIST_FILE_WITH_STOP_WORDS, DF_WITH_STOP_WORDS
from Preprocess import read_data_from_file, preprocess_query

VECTORS_WITHOUT_STOP_WORDS_FILE = "./vectors_without_stop_words.json"

# positional_postings_lists_with_stop_words = load_positional_postings_list(POSITIONAL_POSTINGS_LIST_FILE_WITH_STOP_WORDS)
positional_postings_list_without_stop_words = load_positional_postings_list(
    POSITIONAL_POSTINGS_LIST_FILE_WITHOUT_STOP_WORDS)
positional_postings_lists = positional_postings_list_without_stop_words

# total number of documents in the collection (N)
number_of_all_documents = len(read_data_from_file(DF_WITH_STOP_WORDS))

def calculate_term_freq_weight(term_freq_dict):
    """
    Calculates the log-weighted term frequency tf-wt(t, d) = 1 + log10(f(t, d)),
    turning raw frequencies into weighted frequencies.
    :param term_freq_dict: mapping of term -> raw frequency (tf-raw)
    :return: term_freq_dict_weight
    """
    term_freq_dict_weight = {}
    for term, raw_freq in term_freq_dict.items():
        term_freq_dict_weight[term] = 1 + math.log10(raw_freq)
    return term_freq_dict_weight

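# A minimal sanity check for the formula above (example values assumed, not taken
# from the corpus): calculate_term_freq_weight({"apple": 100, "pie": 1}) returns
# {"apple": 3.0, "pie": 1.0}, since 1 + log10(100) = 3.0 and 1 + log10(1) = 1.0.
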
def calculate_inverse_document_frequency(term):
    """
    Calculates idf(t) = log10(N / df(t)), where N is the total number of documents
    in the collection and df(t) is the number of documents containing the term.
    :param term:
    :return: idf
    """
    # df(t)
    document_freq_of_term = positional_postings_lists[term]["unique_documents_frequency"]
    # N: the total number of documents, not the vocabulary size
    inverse_document_frequency = math.log10(number_of_all_documents / document_freq_of_term)
    return inverse_document_frequency

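# A minimal sanity check (assumed numbers): with a collection of N = 200 documents
# and a term that occurs in 2 of them, idf = log10(200 / 2) = log10(100) = 2.0;
# rarer terms therefore receive higher idf weights.
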
def calculate_doc_length(term_freq_list):
    """
    Calculates the Euclidean (L2) length of a vector: the square root of the sum
    of the squared term weights.
    :param term_freq_list: list of term weights
    :return: doc length
    """
    squared_sum = 0
    for number in term_freq_list:
        squared_sum += number ** 2
    return squared_sum ** 0.5

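# A minimal sanity check: a vector with weights [3, 4] has length
# (3**2 + 4**2) ** 0.5 = 5.0, so calculate_doc_length([3, 4]) returns 5.0.
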
def calculate_term_freq_normalize(term_freq_dict):
    """
    Cosine-normalizes the vector: each term frequency weight is divided by the
    document length.
    :param term_freq_dict:
    :return: term_freq_dict_normalized
    """
    term_freq_list = list(term_freq_dict.values())
    doc_length = calculate_doc_length(term_freq_list)
    term_freq_dict_normalized = {k: v / doc_length for k, v in term_freq_dict.items()}
    return term_freq_dict_normalized

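# A minimal sanity check: {"a": 3.0, "b": 4.0} has length 5.0, so the normalized
# vector is {"a": 0.6, "b": 0.8}, which has unit Euclidean length.
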
def calculate_final_weight(vector_type, term_freq_raw):
    """
    Takes the raw term frequencies (tf-raw) and calculates the term frequency
    weights (tf-wt). For queries the weights are additionally multiplied by the
    inverse document frequency (idf). The result is then cosine-normalized.
    :param vector_type: either "document" or "query"
    :param term_freq_raw:
    :return: final_weight
    """
    # term frequency - weight (tf-wt)
    term_freq_weight_dict = calculate_term_freq_weight(term_freq_raw)
    # final weight: tf-wt * idf (queries only)
    final_term_freq_weight = copy.deepcopy(term_freq_weight_dict)
    if vector_type == "query":
        for term, freq_weight in term_freq_weight_dict.items():
            final_term_freq_weight[term] = freq_weight * calculate_inverse_document_frequency(term)
    # term frequency normalized
    return calculate_term_freq_normalize(final_term_freq_weight)

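# This is effectively the SMART "lnc.ltc" weighting: documents use logarithmic tf
# with cosine normalization and no idf, while queries additionally multiply by idf.
# A small worked example with assumed idf values: for the query counts
# {"a": 1, "b": 1} with idf(a) = 2.0 and idf(b) = 1.0, the pre-normalization
# weights are {"a": 2.0, "b": 1.0}; dividing by (2**2 + 1**2) ** 0.5 ≈ 2.236
# gives roughly {"a": 0.894, "b": 0.447}.
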
def create_vectors_list_from_postings_lists():
    """
    Builds a weighted, cosine-normalized vector for every document in the
    collection from the positional postings lists.
    :return: vectors_list, a dict mapping document id -> {term: normalized weight}
    """
    vectors_list = {}
    for i in range(number_of_all_documents):  # i is the document id
        term_freq_dict = {}
        for term, termInfo in positional_postings_lists.items():
            if str(i) in termInfo["document_frequency_dict"]:
                # term frequency - raw (tf-raw)
                term_freq_dict[term] = termInfo["document_frequency_dict"][str(i)]
        vectors_list[i] = calculate_final_weight("document", term_freq_dict)
    return vectors_list

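# The returned structure maps each document id to its sparse weight vector, e.g.
# (with made-up terms and weights) {0: {"apple": 0.71, "pie": 0.71}, 1: {...}, ...};
# only terms that actually occur in a document get an entry.
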
def save_vectors_list(vectors_list):
    with open(VECTORS_WITHOUT_STOP_WORDS_FILE, 'w', encoding='utf-8') as fp:
        json.dump(vectors_list, fp, sort_keys=True, indent=4, ensure_ascii=False)

def load_vectors_list(file_name):
    with open(file_name, 'r', encoding='utf-8') as fp:
        vectors_list = json.load(fp)
    return vectors_list

def create_query_vector(query_string, remove_stop_words_flag, stem_flag):
    """
    Preprocesses the query string and builds its weighted, cosine-normalized
    query vector (tf-wt * idf, then normalized).
    :param query_string:
    :param remove_stop_words_flag:
    :param stem_flag:
    :return: query_final_term_frequency
    """
    query_tokens_dict = preprocess_query(query_string, "positional", remove_stop_words_flag, stem_flag)
    # term frequency - raw (tf-raw)
    query_term_freq_dict = {}
    for term, positional_index in query_tokens_dict.items():
        query_term_freq_dict[term] = len(positional_index)
    # final term frequency
    query_final_term_frequency = calculate_final_weight("query", query_term_freq_dict)
    return query_final_term_frequency

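# Example usage (hypothetical query; the flags mirror preprocess_query's parameters
# used above): create_query_vector("apple pie recipe", True, True) returns a dict
# mapping each preprocessed query term to its normalized tf-idf weight, ready to be
# compared against the document vectors.
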
# loaded at import time; expects the vectors file to already exist (it is created
# by running this module as a script, see below)
vectors_without_stop_words = load_vectors_list(VECTORS_WITHOUT_STOP_WORDS_FILE)

if __name__ == "__main__":
    v = create_vectors_list_from_postings_lists()
    save_vectors_list(v)