-
Notifications
You must be signed in to change notification settings - Fork 0
/
sentence.py
47 lines (37 loc) · 1.6 KB
/
sentence.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
from tagged_word import TaggedWord
class Sentence:
def __init__(self, sentence: str, insert_sec_labels: bool, index=None):
self.tokens = []
splitted_sec = sentence.split("\t")
if insert_sec_labels:
self.label = int(splitted_sec[0])
else:
self.label = None
words = "\t".join(splitted_sec[1:])
self.tokens = [TaggedWord(word_tag=token) for token in Sentence.split_cleaned_line(words)]
self.index = index
# TODO: doc!
self.feature_attributes_idxs = None
self.features = None
def clone(self, copy_labels: bool = True):
new_sentence = Sentence('-1\ttv_NN', False, self.index)
new_sentence.label = self.label
new_sentence.tokens = [tagged_word.clone(copy_labels) for tagged_word in self.tokens]
new_sentence.feature_attributes_idxs = self.feature_attributes_idxs
new_sentence.features = self.features
return new_sentence
@staticmethod
def split_cleaned_line(line): # TODO check if fix with data?
return line.strip("\n ").split(" ")
def count_tokens(self):
return len(self.tokens)
def xgram(self, index, x):
beginning = ["*" for _ in range(max(x - index - 1, 0))]
end = [token.tag for token in self.tokens[max(index - x + 1, 0):index + 1]]
return tuple(beginning + end)
def trigram(self, index):
return self.xgram(index, 2) # only two tags before current
def bigram(self, index):
return self.xgram(index, 1)
def __str__(self):
return " ".join([str(token) for token in self.tokens])