lang.py
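"""Vocabulary classes for sequence models.

Lang maps tokens to integer indices (with reserved <PAD> and <EOS> entries) and
can be pickled to / restored from the vocab directory. NaturalLang tokenizes
natural-language text with NLTK and can fetch fastText embeddings from a local
service; CUILang handles comma- or whitespace-separated CUI codes and fetches
CUI embeddings the same way.
"""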
import json
import urllib.request
import urllib.error
import numpy as np
import unicodedata
import os
import re
import random
from nltk.tokenize import sent_tokenize, word_tokenize
try:
    import cPickle
except ImportError:
    import pickle as cPickle
from os import listdir, makedirs
from os.path import isfile, join, exists
class Lang(object):
    """Base vocabulary: maps words to integer indices and back, keeping counts,
    with reserved <PAD> and <EOS> tokens."""

    def __init__(self, name, base_dir, vocab_dir):
        self.name = name
        self.base_dir = base_dir
        self.vocab_dir = vocab_dir
        self.word2index = {}
        self.word2count = {}
        self.index2word = {}
        self.n_words = 0
        self.PAD_token = 0
        self.EOS_token = 1
        self.add_word('<PAD>')
        self.add_word('<EOS>')
    def save(self):
        output_dir = self.vocab_dir
        if not exists(output_dir):
            makedirs(output_dir)
        with open(output_dir + '/' + self.__class__.__name__ + '.pkl', 'wb') as f:
            saved_dict = self.__dict__.copy()
            # Drop attributes that are not builtin types (e.g. a loaded spaCy
            # model) so the remaining dict stays picklable.
            for k in saved_dict:
                if saved_dict[k].__class__.__module__ != 'builtins':
                    saved_dict[k] = None
            cPickle.dump(saved_dict, f)
    def load(self):
        print('self.vocab_dir:', self.vocab_dir)
        with open(self.vocab_dir + '/' + self.__class__.__name__ + '.pkl', 'rb') as f:
            recovered = cPickle.load(f)
            for name in recovered:
                if recovered[name] is not None:
                    setattr(self, name, recovered[name])

    def indexes_from_sentence(self, sentence):
        return [self.get_index(word) for word in self.tokenize(sentence)] + [self.EOS_token]
    def tokenize(self, sentence):
        #return sentence.split()
        words = word_tokenize(sentence)
        # Debug output: compare a plain whitespace split with the NLTK tokenization
        print(sentence.split())
        print(words)
        print("-------")
        return words
    def pad_seq(self, seq, max_length):
        # Pad a sequence up to max_length with the PAD symbol
        seq += [self.PAD_token] * (max_length - len(seq))
        return seq

    def get_index(self, word):
        if word in self.word2index:
            return self.word2index[word]
        # If it's an unseen word, fall back to the PAD index
        return self.PAD_token
    def get_word(self, index):
        if index in self.index2word:
            return self.index2word[index]
        return "<PAD>"

    def get_vocab_size(self):
        return self.n_words

    def add_sentence(self, sentence, normalize=False):
        for word in self.tokenize(sentence):
            if normalize:
                word = self.normalize_string(word)
            if len(word.strip()) == 0:
                continue
            self.add_word(word)

    def add_word(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1
    def unicode_to_ascii(self, s):
        return ''.join(
            c for c in unicodedata.normalize('NFD', s)
            if unicodedata.category(c) != 'Mn'
        )

    def normalize_string(self, s):
        """
        Lowercase, trim, and remove non-letter characters
        """
        s = self.unicode_to_ascii(s.lower().strip())
        s = re.sub(r"([.!?])", r" \1", s)
        s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
        return s

class NaturalLang(Lang):
    """Vocabulary for natural-language text, tokenized with NLTK."""

    def __init__(self, name, base_dir, vocab_dir):
        super(NaturalLang, self).__init__(name, base_dir, vocab_dir)
        import spacy
        self.nlp = spacy.load('en')

    def tokenize(self, sentence):
        # document = self.nlp(sentence)
        # return [token.text for token in document]
        words = word_tokenize(sentence)
        #print(sentence.split())
        #print(words)
        #print("-------")
        return words
    def indexes_from_sentence(self, sentence, normalize=True):
        # print('sentence:', sentence)
        results = []
        for word in self.tokenize(sentence):
            if normalize:
                word = self.normalize_string(word)
            if len(word.strip()) == 0:
                continue
            results.append(self.get_index(word))
        results.append(self.EOS_token)
        return results
    def get_vocab_embeddings(self, output_dir):
        embeddings = {}
        with open(output_dir + 'embedding_oov.txt', 'w') as output_file:
            for word in self.word2index:
                # Query the local embedding service; the response is expected
                # to be JSON of the form {"vector": [...]}.
                url = "http://localhost:5000/api/vector/fasttext/" + word
                try:
                    response = urllib.request.urlopen(url)
                    data = json.loads(response.read())
                    embeddings[word] = data['vector']
                except Exception as err:
                    print('Cannot get embedding for', word, err)
                    output_file.write(word + '\t' + str(err) + '\n')
        return embeddings

class CUILang(Lang):
    """Vocabulary for CUI codes, split on commas when present, else on whitespace."""

    def __init__(self, name, base_dir, vocab_dir):
        super(CUILang, self).__init__(name, base_dir, vocab_dir)

    def tokenize(self, sentence):
        if ',' in sentence:
            return [token.strip() for token in sentence.split(',') if len(token.strip()) > 0]
        else:
            return [token.strip() for token in sentence.split() if len(token.strip()) > 0]
    def get_vocab_embeddings(self, output_dir):
        embeddings = {}
        with open(output_dir + 'embedding_oov.txt', 'w') as output_file:
            for word in self.word2index:
                url = "http://localhost:5000/api/vector/cui/" + word
                try:
                    response = urllib.request.urlopen(url)
                    data = json.loads(response.read())
                    embeddings[word] = data['vector']
                except Exception as err:
                    print('Cannot get embedding for', word, err)
                    output_file.write(word + '\t' + str(err) + '\n')
        return embeddings
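
# Minimal usage sketch (illustrative only, not part of the original module): it
# builds a small CUILang vocabulary, indexes and pads a sentence, and saves the
# vocab. The directory names and CUI strings below are hypothetical placeholders.
if __name__ == '__main__':
    lang = CUILang('cui', base_dir='data', vocab_dir='vocab')
    lang.add_sentence('C0011849, C0020538')

    # Unseen CUIs fall back to the PAD index; every sequence ends with EOS.
    indexes = lang.indexes_from_sentence('C0011849, C0027051')
    padded = lang.pad_seq(indexes, max_length=6)

    print('vocab size:', lang.get_vocab_size())
    print('padded indexes:', padded)

    lang.save()  # writes <vocab_dir>/CUILang.pkl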