embedding.py
import numpy as np
from scipy import spatial
import pickle
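
# --- Example helper (not part of the original file): a minimal sketch that
# writes a tiny file in the word2vec C binary format which
# Word2Vec._load_base_file below expects. The function name is made up for
# illustration; it is only meant for quick local testing of the loader.
def write_tiny_word2vec_bin(path, vectors):
    """Write a {word: 1-D float32 array} dict to `path` in word2vec binary format."""
    dim = len(next(iter(vectors.values())))
    with open(path, 'wb') as f:
        # Header line: "<vocab_size> <dim>\n"
        f.write(('%d %d\n' % (len(vectors), dim)).encode('utf-8'))
        for word, vec in vectors.items():
            # Each record: the word, a space, then dim raw float32 values.
            f.write(word.encode('utf-8') + b' ')
            f.write(np.asarray(vec, dtype='float32').tobytes())
            f.write(b'\n')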

class Word2Vec:
    """Loads word2vec-format binary embeddings and exposes simple lookup,
    similarity, and pickle-based save/load helpers for a given vocabulary."""

    def __init__(self, word2vec_filename, vocabulary):
        # Allow constructing an empty instance (e.g. before load_model()).
        if word2vec_filename is None:
            return
        self.filename = word2vec_filename
        self.vocabulary = vocabulary
        self.word_vectors = None
        self._load_base_file()
        self._add_unknown_words()
        # self._modify_entity_vector()

    def _load_base_file(self):
        """Read vectors from a word2vec C-format binary file: a header line
        with 'vocab_size dim', then one '<word> <dim float32 values>' record
        per word."""
        word_vectors = dict()
        print("opening file: %s" % self.filename)
        with open(self.filename, "rb") as f:
            header = f.readline()
            vocab_size, layer1_size = map(int, header.split())
            binary_len = np.dtype('float32').itemsize * layer1_size
            print("vocab size: %d, layer1_size: %d" % (vocab_size, layer1_size))
            for _ in range(vocab_size):
                # Read one word byte-by-byte until the separating space,
                # skipping any newline left over from the previous record.
                word = []
                while True:
                    ch = f.read(1)
                    if ch == b' ':
                        word = b''.join(word)
                        break
                    if ch != b'\n':
                        word.append(ch)
                # Every word is lower-cased; keys are decoded to str so they
                # match the string vocabulary used elsewhere in this class.
                word = word.decode('utf-8', errors='ignore').lower()
                word_vectors[word] = np.frombuffer(f.read(binary_len), dtype='float32')
        print("num words already in word2vec: %d" % len(word_vectors))
        print("vocabulary size: %d" % len(self.vocabulary))
        self.word_vectors = word_vectors

    def get_word_vector(self, word):
        """Return the vector for `word`, or None if the word is unknown."""
        try:
            return self.word_vectors[word]
        except KeyError:
            return None

    def _add_unknown_words(self):
        # Vocabulary words missing from word2vec get a random vector drawn
        # uniformly from [-0.25, 0.25] (the hard-coded 300-dim size used here).
        not_present = 0
        for word in self.vocabulary:
            if word not in self.word_vectors:
                self.word_vectors[word] = np.random.uniform(-0.25, 0.25, 300)
                not_present += 1
        print('randomized words: %d out of %d' % (not_present, len(self.vocabulary)))

    def _modify_entity_vector(self):
        # Multi-token entities (words joined with '_') get the sum of their
        # token vectors; all tokens are assumed to be present already.
        complex_entity = 0
        for word in self.vocabulary:
            tokens = word.split('_')
            if len(tokens) > 1:
                complex_entity += 1
                self.word_vectors[word] = np.zeros(300)
                for token in tokens:
                    self.word_vectors[word] += self.word_vectors[token]
        print('complex entities: %d out of %d' % (complex_entity, len(self.vocabulary)))

    def get_similar_words(self, word, n=5):
        """Return the n words with the highest cosine similarity to `word`,
        which may be given as a string or as a vector."""
        similarity_list = []
        if not isinstance(word, np.ndarray):
            word = self.word_vectors[word]
        for w in self.word_vectors:
            similarity_list.append((w, 1 - spatial.distance.cosine(self.word_vectors[w], word)))
        return sorted(similarity_list, key=lambda x: x[1], reverse=True)[:n]

    def get_dissimilar_words(self, word, n=5):
        """Return the n words with the lowest cosine similarity to `word`."""
        similarity_list = []
        if not isinstance(word, np.ndarray):
            word = self.word_vectors[word]
        for w in self.word_vectors:
            similarity_list.append((w, 1 - spatial.distance.cosine(self.word_vectors[w], word)))
        return sorted(similarity_list, key=lambda x: x[1])[:n]

    def load_model(self, file_path):
        # Pickle files must be opened in binary mode.
        with open(file_path, 'rb') as f:
            pickle_obj = pickle.load(f)
        print('loaded %d vectors from %s' % (len(pickle_obj[1]), file_path))
        self.vocabulary = pickle_obj[0]
        self.word_vectors = pickle_obj[1]

    def save_model(self, file_path):
        print('dumping %d vectors into %s' % (len(self.word_vectors), file_path))
        with open(file_path, 'wb') as f:
            pickle.dump([self.vocabulary, self.word_vectors], f, protocol=2)
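

# --- Usage sketch (not part of the original file). The file paths and the
# sample vocabulary below are assumptions for illustration only; real use
# would point at an actual word2vec binary file instead of the toy one.
if __name__ == '__main__':
    rng = np.random.RandomState(0)
    toy_vectors = {w: rng.uniform(-1, 1, 300).astype('float32')
                   for w in ['apple', 'banana', 'car']}
    write_tiny_word2vec_bin('toy_vectors.bin', toy_vectors)

    vocabulary = ['apple', 'banana', 'car', 'spaceship']  # 'spaceship' gets a random vector
    w2v = Word2Vec('toy_vectors.bin', vocabulary)
    print(w2v.get_similar_words('apple', n=3))
    print(w2v.get_word_vector('spaceship') is not None)

    # Round-trip through the pickle-based save/load helpers.
    w2v.save_model('toy_vectors.pkl')
    restored = Word2Vec(None, None)
    restored.load_model('toy_vectors.pkl')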