-
Notifications
You must be signed in to change notification settings - Fork 1
/
spraakbanken_inflector.py
40 lines (34 loc) · 1.4 KB
/
spraakbanken_inflector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import csv
from inflector import Inflector
# Maps an entry's part-of-speech marker (the value of `entry.pos`) to the
# tag name used as the second element of the inflection lookup keys.
POS2TAG = {
    '(noun)': 'subst',
    '(verb)': 'verb',
    '(adj)': 'adj',
}
class SpraakbankenInflector(Inflector):
    """Inflector backed by the tab-separated files in ``data/spraakbanken``.

    On construction the lemma list and the full-form list are read once and
    combined into ``lemma2inflections``, a dict that maps
    ``(lemma, tag)`` to the set of word forms listed for that lemma, where
    ``tag`` is the first whitespace-separated token of the ``TAG`` column.
    """

    __slots__ = 'lemma2inflections'

    def __init__(self):
        """Load both data files and build the ``(lemma, tag)`` lookup table."""
        # lemma.txt: one row per lemma, keyed by LEMMA_ID, with the base
        # form in the GRUNNFORM column.
        with open('data/spraakbanken/lemma.txt', encoding='utf8') as f:
            id2lemma = {
                row['LEMMA_ID'].strip(): row['GRUNNFORM'].strip()
                for row in csv.DictReader(f, delimiter='\t')
            }

        self.lemma2inflections = {}
        with open('data/spraakbanken/fullformsliste.txt', encoding='utf8') as f:
            for row in csv.DictReader(f, delimiter='\t'):
                try:
                    # Only the first token of TAG is used in the key.
                    key = (id2lemma[row['LEMMA_ID'].strip()],
                           row['TAG'].split()[0])
                except (KeyError, IndexError):
                    # KeyError: the row refers to a lemma id that is not in
                    # lemma.txt. IndexError: the TAG column is empty, so
                    # there is no tag to key on. Either way, skip the row.
                    continue
                form = row['OPPSLAG'].strip()
                self.lemma2inflections.setdefault(key, set()).add(form)

    # TODO genitive

    def inflect(self, entry):
        """Return the set of known inflected forms for ``entry``.

        ``entry`` must provide ``nb_word`` (the word to look up) and ``pos``
        (a key of ``POS2TAG``). For verbs, a leading infinitive marker
        ``'å '`` is removed before the lookup. An empty set is returned when
        the part of speech is not in ``POS2TAG`` or the word is unknown.
        """
        infinitive_marker = 'å '
        nb_word = entry.nb_word
        if entry.pos == '(verb)' and nb_word.startswith(infinitive_marker):
            # Strip only the leading marker. str.replace would also delete
            # every later 'å ' inside the phrase (e.g. 'å gå ut' -> 'gut').
            nb_word = nb_word[len(infinitive_marker):]
        try:
            return self.lemma2inflections[(nb_word, POS2TAG[entry.pos])]
        except KeyError:
            return set()