functions.py
import re

import nltk


def lemmatize(tokens):
    """
    Lemmatize the tokens, i.e. reduce each token to its canonical
    base form (lemma).
    Lemmatization examples:
        running -> run, walked -> walk
    :return: a single string of all the lemmatized words, which can be
        used to write to the corpus.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    tagged_tokens = nltk.pos_tag(tokens)
    lemmatized_words = []
    for token, tag in tagged_tokens:
        # Map the Penn Treebank tag to the WordNet part-of-speech code.
        if tag.startswith("V"):  # Verb
            pos_val = "v"
        elif tag.startswith("J"):  # Adjective
            pos_val = "a"
        elif tag.startswith("R"):  # Adverb
            pos_val = "r"
        else:  # Default to noun
            pos_val = "n"
        lemmatized_words.append(lemmatizer.lemmatize(token, pos_val))
    return " ".join(lemmatized_words)


def tokenize(text):
    """
    Lowercase a text (str), strip all characters other than a-z and
    spaces, and return a list of tokens (in our case, just words).
    Stop words and overly long words (> 25 chars) are dropped.
    """
    text = str(text).lower()
    text = re.sub(r"[^ a-z]", "", text)
    stop_words = nltk.corpus.stopwords.words("english")
    return [word for word in text.split() if word not in stop_words and len(word) <= 25]


def tokenize_and_lemmatize(text):
    """Tokenize a text, then lemmatize the resulting tokens."""
    return lemmatize(tokenize(text))
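

# --- Minimal usage sketch (illustration only, not part of the original file) ---
# The functions above assume the NLTK "stopwords", "wordnet", and
# "averaged_perceptron_tagger" resources are installed; the download() calls
# below fetch them if missing (newer NLTK releases name the tagger resource
# "averaged_perceptron_tagger_eng"). The sample sentence and the outputs in
# the comments are assumptions for demonstration, not from the original repo.
if __name__ == "__main__":
    for resource in ("stopwords", "wordnet", "averaged_perceptron_tagger"):
        nltk.download(resource, quiet=True)

    sample = "The cats were running quickly through the gardens!"
    print(tokenize(sample))
    # e.g. ['cats', 'running', 'quickly', 'gardens']
    print(tokenize_and_lemmatize(sample))
    # e.g. 'cat run quickly garden' (exact lemmas depend on the tagger's output)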