-
Notifications
You must be signed in to change notification settings - Fork 0
/
QueryEnrichment.py
147 lines (103 loc) · 4.88 KB
/
QueryEnrichment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import math
import operator
import os
import shutil
import sys
from Retrieval import RetrievalModels
# Output directory paths used by the retrieval runs.
# NOTE(review): main() reads TYPE_OF_OUTPUTS[3], but only one entry is
# defined here, which would raise IndexError -- the list appears to have
# been truncated; confirm the missing paths against the original repo.
TYPE_OF_OUTPUTS=['Retrieval/OutputFiles',
]
# Folder containing the corpus with every document tokenized and processed.
TOKENIZED_CORPUS_PATH="CorpusGeneration/TokenizedCorpus"
# Output file that records the enriched version of each query.
ENRICHED_QUERY_FILE_NAME='QueryEnhancement/EnrichedCasmQueries.txt'
def performQueryEnrichment(docScore, current_query, inverted_index, query_id,
                           corpus_dir="CorpusGeneration/TokenizedCorpus"):
    """Expand a query via Rocchio-style pseudo-relevance feedback.

    The 10 highest-scoring documents are treated as relevant and the rest as
    non-relevant.  Every index term is scored as

        0.2 * tf_in_query + (0.75 / |R|) * tf_in_R - (0.05 / |NR|) * tf_in_NR

    and up to 20 top-scoring terms not already present are appended to the
    original query.

    Args:
        docScore: dict mapping doc_id -> retrieval score for this query.
        current_query: the original query string.
        inverted_index: dict whose keys form the corpus vocabulary.
        query_id: identifier of the query (unused here; kept for callers).
        corpus_dir: directory holding the tokenized documents as
            "<doc_id>.txt"; default mirrors the module-level
            TOKENIZED_CORPUS_PATH constant.

    Returns:
        The enriched query string.
    """
    new_query = current_query
    relevance_set = {term: 0 for term in inverted_index}
    non_relevance_set = {term: 0 for term in inverted_index}
    initial_term_weights = {term: 0 for term in inverted_index}

    ranked_docs = sorted(docScore.items(), key=operator.itemgetter(1), reverse=True)

    # Term frequencies over the (up to) 10 highest-scoring documents.
    # BUGFIX: files are now closed via context managers (original leaked
    # every handle), and .get() guards against corpus terms that are
    # missing from the inverted index (original raised KeyError).
    # NOTE(review): only the FIRST line of each relevant document is read,
    # while non-relevant documents below are read in full -- preserved
    # as-is; confirm whether this asymmetry is intentional.
    for doc_id, _score in ranked_docs[:10]:
        with open(os.path.join(corpus_dir, doc_id + ".txt"), "r") as handle:
            first_line_terms = handle.readline().split()
        for term in first_line_terms:
            relevance_set[term] = relevance_set.get(term, 0) + 1

    # L2 norm of the relevance frequency vector.
    relevance_normalizer = math.sqrt(sum(float(v) ** 2 for v in relevance_set.values()))

    # Term frequencies over the remaining (non-relevant) documents.
    for doc_id, _score in ranked_docs[10:]:
        with open(os.path.join(corpus_dir, doc_id + ".txt"), "r") as handle:
            for line in handle:
                for term in line.split():
                    non_relevance_set[term] = non_relevance_set.get(term, 0) + 1

    # L2 norm of the non-relevance frequency vector.
    non_relevance_normalizer = math.sqrt(sum(float(v) ** 2 for v in non_relevance_set.values()))

    # Raw term frequencies of the original query.
    for term in current_query.split():
        initial_term_weights[term] = initial_term_weights.get(term, 0) + 1

    # Rocchio combination.  BUGFIX: guard each normalizer against zero --
    # with 10 or fewer ranked documents the non-relevance set is all
    # zeros and the original code raised ZeroDivisionError.
    expanded_query = {}
    for term in inverted_index:
        weight = 0.2 * initial_term_weights[term]
        if relevance_normalizer:
            weight += (0.75 / relevance_normalizer) * relevance_set[term]
        if non_relevance_normalizer:
            weight -= (0.05 / non_relevance_normalizer) * non_relevance_set[term]
        expanded_query[term] = weight

    ranked_terms = sorted(expanded_query.items(), key=operator.itemgetter(1), reverse=True)

    # Append the best terms not already present.  BUGFIX: capped at the
    # vocabulary size (original indexed [0..19] and crashed on small
    # indexes).  NOTE(review): membership is a SUBSTRING check, preserved
    # from the original -- e.g. "cat" inside "concatenate" suppresses it.
    for term, _weight in ranked_terms[:20]:
        if term not in new_query:
            new_query += " " + term
    return new_query
def writeNewQueryTofile(ENRICHED_QUERY_FILE_NAME, newQuery, query_id):
    """Append one enriched query, with a header, to the output file.

    Args:
        ENRICHED_QUERY_FILE_NAME: path of the file to append to.
            NOTE(review): this parameter shadows the module-level constant
            of the same name; name kept for interface compatibility.
        newQuery: the enriched query text to record.
        query_id: identifier written in the section header.
    """
    # BUGFIX: the original leaked the file handle; the context manager
    # guarantees the handle is flushed and closed.
    with open(ENRICHED_QUERY_FILE_NAME, 'a') as out:
        out.write("Enriched query for QueryID:" + str(query_id) + "\n")
        out.write("------------------------------------------\n\n")
        out.write(newQuery + "\n\n\n")
def main(docScorePerQuery):
    """Run one round of pseudo-relevance feedback and re-retrieve.

    Enriches every query using its initial document scores, logs the
    enriched queries to ENRICHED_QUERY_FILE_NAME, then reruns retrieval
    with the enriched query map.

    Args:
        docScorePerQuery: dict mapping query_id -> {doc_id: score} from an
            initial retrieval run.

    Returns:
        Per-query document scores from the second retrieval pass.
    """
    # NOTE(review): TYPE_OF_OUTPUTS currently holds a single element, so
    # index 3 raises IndexError at runtime -- confirm the intended output
    # path against the full constant list.
    output_dir = TYPE_OF_OUTPUTS[3]
    # Recreate the output directory from scratch.  (The original's second
    # `if not os.path.exists` check was redundant after rmtree.)
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)
    qMap = RetrievalModels.fetchQueryMap()
    invertedIndex = RetrievalModels.fetchInvertedIndex(RetrievalModels.INVERTED_INDEX[0])
    newQMap = {}
    for queryID in docScorePerQuery:
        newQuery = performQueryEnrichment(docScorePerQuery[queryID], qMap[queryID],
                                          invertedIndex, queryID)
        writeNewQueryTofile(ENRICHED_QUERY_FILE_NAME, newQuery, queryID)
        newQMap[queryID] = newQuery
    return RetrievalModels.selectRetrievalModel(RetrievalModels.INVERTED_INDEX[0],
                                                RetrievalModels.NUM_OF_TOKEN_PER_DOC[0],
                                                1, output_dir, newQMap)
if __name__=='__main__':
    # Initial retrieval pass: score every document against each raw query,
    # then hand those scores to main() for feedback-based enrichment.
    initial_queries = RetrievalModels.fetchQueryMap()
    initial_scores = RetrievalModels.selectRetrievalModel(
        RetrievalModels.INVERTED_INDEX[0],
        RetrievalModels.NUM_OF_TOKEN_PER_DOC[0],
        1,
        TYPE_OF_OUTPUTS[0],
        initial_queries,
    )
    main(initial_scores)