-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
151 lines (134 loc) · 5.44 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import os
import re
import numpy as np
import torch
import torch.autograd as autograd
import torch.nn as nn
import codecs
import numpy as np
import matplotlib.pyplot as plt
import cPickle
import json
def create_dico(item_list):
    """
    Create a dictionary of items from a list of list of items.

    Args:
        item_list: a list whose elements are iterables of hashable items.

    Returns:
        A dict mapping every distinct item to the number of times it occurs
        across all inner iterables.

    Raises:
        AssertionError: if `item_list` is not a list (or a list subclass).
    """
    # isinstance, unlike an exact type comparison, also admits list subclasses.
    assert isinstance(item_list, list)
    dico = {}
    for items in item_list:
        for item in items:
            # First occurrence starts from 0, later ones increment the count.
            dico[item] = dico.get(item, 0) + 1
    return dico
def create_mapping(dico, vocabulary_size=2000):
    """
    Build the two lookup tables (item -> ID and ID -> item) for a frequency
    dictionary.

    Items are ranked by decreasing count, ties being broken by the item's own
    ordering, and only the first `vocabulary_size` ranked items receive an ID.
    IDs are assigned consecutively starting at 0.

    Returns:
        A tuple `(item_to_id, id_to_item)`.
    """
    ranked = list(dico.items())
    ranked.sort(key=lambda pair: (-pair[1], pair[0]))

    item_to_id = {}
    id_to_item = {}
    for idx, (item, _count) in enumerate(ranked[:vocabulary_size]):
        id_to_item[idx] = item
        item_to_id[item] = idx
    return item_to_id, id_to_item
def read_pre_training(emb_path):
    """
    Read pre-trained word embeddings.
    The detail of this dataset can be found in the following link
    https://nlp.stanford.edu/projects/glove/

    Each non-blank line of the UTF-8 file is split on whitespace: the first
    field is the key and the remaining fields are converted to a float16
    vector.

    Args:
        emb_path: path of the embedding text file.

    Returns:
        A dict mapping each key to a 1-D `numpy.float16` array.
    """
    print('Preparing pre-train dictionary')
    emb_dictionary = {}
    # The context manager guarantees the file handle is closed, even if a
    # malformed line makes the conversion below raise.
    with codecs.open(emb_path, 'r', 'utf-8') as emb_file:
        for line in emb_file:
            temp = line.split()
            if not temp:
                # Blank or whitespace-only line: there is no key to index.
                continue
            emb_dictionary[temp[0]] = np.asarray(temp[1:], dtype=np.float16)
    return emb_dictionary
def zero_digits(s):
    """
    Replace every digit in a string by a zero.

    Args:
        s: the input string.

    Returns:
        A copy of `s` in which each character matched by the regex class
        `\\d` has been replaced by '0'.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence and
    # triggers a warning on recent Python versions.
    return re.sub(r'\d', '0', s)
def evaluate(model, sentences, dictionaries, lower):
    """
    Evaluate current model using CoNLL script.

    Every sentence is tagged with `model.get_tags`; the word, POS tag, true
    tag and predicted tag of each token are written to 'tmp/evaluate.txt'
    (one token per line, a blank line after each sentence). The external
    './tmp/conlleval' script is then run through the shell with that file on
    stdin and 'tmp/score.txt' as stdout, and the second line of its report
    is parsed.

    Args:
        model: object exposing `get_tags(...)`, which returns an iterable of
            tag ids.
        sentences: sequence of dicts with the keys 'words', 'tags',
            'str_words' and 'pos', plus 'caps', 'letter_digits',
            'apostrophe_ends' and 'punctuations' when `lower == 1`.
        dictionaries: dict holding an 'id_to_tag' mapping.
        lower: when equal to 1, the four extra feature tensors are passed to
            the model alongside the words.

    Returns:
        A dict with the float entries 'accuracy', 'precision', 'recall' and
        'FB1' read from the report.

    Raises:
        IndexError: if the report has fewer than two lines or the second
            line has fewer fields than expected (for instance when the
            script is missing or failed).
    """
    output_path = 'tmp/evaluate.txt'
    scores_path = 'tmp/score.txt'
    eval_script = './tmp/conlleval'

    with codecs.open(output_path, 'w', 'utf8') as f:
        for sentence in sentences:
            # input sentence
            input_words = autograd.Variable(torch.LongTensor(sentence['words']))
            # calculate the tag score
            if lower == 1:
                input_caps = torch.LongTensor(sentence['caps'])
                input_letter_digits = torch.LongTensor(sentence['letter_digits'])
                input_apostrophe_ends = torch.LongTensor(sentence['apostrophe_ends'])
                input_punctuations = torch.LongTensor(sentence['punctuations'])
                tags = model.get_tags(input_words=input_words,
                                      input_caps=input_caps,
                                      input_letter_digits=input_letter_digits,
                                      input_apostrophe_ends=input_apostrophe_ends,
                                      input_punctuations=input_punctuations)
            else:
                tags = model.get_tags(input_words=input_words)

            id_to_tag = dictionaries['id_to_tag']
            # Predicted ids missing from the mapping are reported as
            # 'START_STOP'.
            predict_tags = [id_to_tag.get(tag, 'START_STOP') for tag in tags]
            # get true tags
            true_tags = [id_to_tag[tag] for tag in sentence['tags']]

            # write words pos true_tag predict_tag into a file
            for word, pos, true_tag, predict_tag in zip(sentence['str_words'],
                                                        sentence['pos'],
                                                        true_tags, predict_tags):
                f.write('%s %s %s %s\n' % (word, pos, true_tag, predict_tag))
            # Blank line marks the end of the sentence.
            f.write('\n')

    # All three paths are fixed literals, so the shell string carries no
    # caller-controlled input.
    os.system("%s < %s > %s" % (eval_script, output_path, scores_path))

    # Read the report inside a context manager so the handle is closed.
    with codecs.open(scores_path, 'r', 'utf8') as score_file:
        eval_lines = [l.rstrip() for l in score_file]

    # The second report line alternates labels and values; the first three
    # values have their last two characters removed before conversion.
    fields = eval_lines[1].strip().split()
    result = {
        'accuracy': float(fields[1][:-2]),
        'precision': float(fields[3][:-2]),
        'recall': float(fields[5][:-2]),
        'FB1': float(fields[7]),
    }
    print(eval_lines[1])
    return result
def plot_result(accuracys, precisions, recalls, FB1s):
    """
    Draw the four score series on a single new figure and display it.

    Each series gets its own line style and legend label; the figure has a
    grid, a legend, the title "CONLL2000 dataset", and axes labelled
    "epoches" and "%".
    """
    # (values, matplotlib format string, legend label), in drawing order.
    curves = (
        (accuracys, "g-", "accuracy"),
        (precisions, "r-.", "precision"),
        (recalls, "m-.", "recalls"),
        (FB1s, "k-.", "FB1s"),
    )

    plt.figure()
    for values, line_style, legend_name in curves:
        plt.plot(values, line_style, label=legend_name)
    plt.xlabel("epoches")
    plt.ylabel("%")
    plt.title("CONLL2000 dataset")
    plt.grid(True)
    plt.legend()
    plt.show()
def save_model_dictionaries(path, model, dictionaries, opts):
    """
    We need to save the mappings if we want to use the model later.

    Three files are written inside the directory `path`:
      * 'dictionaries.dic' - `dictionaries`, pickled;
      * 'model.mdl'        - `model.state_dict()`, saved with `torch.save`;
      * 'parameters.json'  - `vars(opts)` as key-sorted, indented JSON.

    Args:
        path: directory receiving the files; it is not created here.
        model: object exposing `state_dict()`.
        dictionaries: picklable object holding the mappings.
        opts: object whose attribute dict (`vars(opts)`) is JSON-serialisable.
    """
    print("Model is saved in:"+path)
    # os.path.join keeps the file names relative to `path` (an empty `path`
    # no longer resolves to the filesystem root) and matches how
    # load_parameters builds its path.
    with open(os.path.join(path, 'dictionaries.dic'), 'wb') as f:
        cPickle.dump(dictionaries, f)
    torch.save(model.state_dict(), os.path.join(path, 'model.mdl'))
    with open(os.path.join(path, 'parameters.json'), 'w') as outfile:
        json.dump(vars(opts), outfile, sort_keys=True, indent=4)
def load_parameters(path, opts):
    """
    Restore previously saved network settings onto `opts`.

    Reads 'parameters.json' from the directory `path` and copies a fixed set
    of entries onto `opts` as attributes; any other entry in the file is
    ignored. A missing entry raises KeyError.

    Returns:
        The same `opts` object, updated in place.
    """
    # Network architecture parameters taken from the saved parameter file.
    restored_keys = (
        'clip',
        'decode_method',
        'embedding_dim',
        'freeze',
        'hidden_dim',
        'loss_function',
        'lower',
        'vocab_size',
        'zeros',
    )

    with open(os.path.join(path, 'parameters.json'), 'r') as handle:
        saved = json.load(handle)

    for key in restored_keys:
        setattr(opts, key, saved[key])
    return opts