-
Notifications
You must be signed in to change notification settings - Fork 0
/
utilities.py
69 lines (56 loc) · 2.58 KB
/
utilities.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
# Suffix table used by hi_stem.
# Each key is a suffix length in characters; the value lists the Devanagari
# suffix strings of that length. hi_stem looks a group up by its key and,
# on a match, removes exactly that many trailing characters from the word.
suffixes = {
1: ["ो", "े", "ू", "ु", "ी", "ि", "ा"],
2: ["कर", "ाओ", "िए", "ाई", "ाए", "ने", "नी", "ना", "ते", "ीं", "ती", "ता", "ाँ", "ां", "ों", "ें"],
3: ["ाकर", "ाइए", "ाईं", "ाया", "ेगी", "ेगा", "ोगी", "ोगे", "ाने", "ाना", "ाते", "ाती", "ाता", "तीं", "ाओं", "ाएं", "ुओं", "ुएं", "ुआं"],
4: ["ाएगी", "ाएगा", "ाओगी", "ाओगे", "एंगी", "ेंगी", "एंगे", "ेंगे", "ूंगी", "ूंगा", "ातीं", "नाओं", "नाएं", "ताओं", "ताएं", "ियाँ", "ियों", "ियां"],
5: ["ाएंगी", "ाएंगे", "ाऊंगी", "ाऊंगा", "ाइयाँ", "ाइयों", "ाइयां"],
}
def hi_stem(word):
    """Strip the longest matching suffix from ``word`` and return the rest.

    Suffix groups from the module-level ``suffixes`` table are tried from
    length 5 down to length 1, so a longer suffix always wins over a shorter
    one. A group is skipped unless the word has more than ``length + 1``
    characters, which means at least two characters remain after stripping.

    Args:
        word: The token to stem.

    Returns:
        ``word`` without its matched suffix, or ``word`` unchanged when no
        suffix group applies.
    """
    for length in range(5, 0, -1):
        # Too short for this group: stripping would leave under two chars.
        if len(word) <= length + 1:
            continue
        # str.endswith accepts a tuple and tests every candidate in one call.
        if word.endswith(tuple(suffixes[length])):
            return word[:-length]
    return word
def get_text(filename, encoding="utf-8"):
    """Return the entire contents of ``filename`` as a single string.

    The file is opened inside a ``with`` block so the handle is closed even
    when reading raises. The encoding is explicit rather than the platform
    default, so Devanagari text stored as UTF-8 decodes the same way on every
    operating system.

    Args:
        filename: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to
            ``"utf-8"``; pass another codec name for files saved differently.

    Returns:
        The decoded file contents.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the bytes are not valid in ``encoding``.
    """
    with open(filename, "r", encoding=encoding) as handle:
        return handle.read()
def remove_stopwords(text):
    """Return ``text`` with a fixed set of punctuation marks blanked out.

    Despite the name, nothing is removed and no word list is consulted: each
    occurrence of one of the characters below is replaced by a single space,
    so the length of the string is preserved.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` with the listed characters turned into spaces.
    """
    marks = ("|", "/", "\\", ".", ",", ")", "(", "-", "!", "~", "@")
    # One translation pass maps every listed character to a space.
    to_space = str.maketrans(dict.fromkeys(marks, " "))
    return text.translate(to_space)
def stem_file(text):
    """Tokenize ``text``, stem every token, and compute relative frequencies.

    The text is split into sentences with ``sent_tokenize`` and each sentence
    into words with ``word_tokenize``. Every word is passed through
    ``hi_stem`` and the result through the Porter stemmer.

    Args:
        text: The raw text to process.

    Returns:
        A 3-tuple ``(tokens, word_frequency, stem_frequency)``:

        * ``tokens`` is a set of ``(original_word, stemmed_word,
          line_number)`` tuples, where ``line_number`` is the zero-based
          index of the sentence the word came from.
        * ``word_frequency`` maps each distinct word to its count divided
          by the total number of words.
        * ``stem_frequency`` maps each distinct stem to its count divided
          by the total number of words.

        All three are empty when ``text`` yields no words.
    """
    porter = PorterStemmer()
    tokens = set()
    word_counts = {}
    stem_counts = {}
    num_words = 0

    # The sentence index is the documented "line_number". The earlier code
    # stored the word's position inside the sentence instead.
    for line_number, sentence in enumerate(sent_tokenize(text)):
        words = word_tokenize(sentence)
        num_words += len(words)
        # Suffix stripping via hi_stem runs first, then the Porter stemmer.
        stems = [porter.stem(hi_stem(word)) for word in words]
        for word, stem in zip(words, stems):
            tokens.add((word, stem, line_number))
            word_counts[word] = word_counts.get(word, 0) + 1
            stem_counts[stem] = stem_counts.get(stem, 0) + 1

    # With no words both count dicts are empty, so no division by zero occurs.
    word_frequency = {
        word: count / num_words for word, count in word_counts.items()
    }
    stem_frequency = {
        stem: count / num_words for stem, count in stem_counts.items()
    }
    return tokens, word_frequency, stem_frequency