# train.py
import sys
import math
from decimal import *
import codecs
# Module-level training statistics. They start empty and are filled in by
# transition_count(); the smoothing and emission steps read them afterwards.
word_set = set()   # lower-cased word forms seen in the training data
tag_count = {}     # tag -> number of tokens carrying that tag
tag_list = set()   # distinct tags (a set, despite the name)
def parse_traindata(fin="train_data/train_data.txt",
                    output_file="model/hmmmodel.txt"):
    """Read the tagged training file into a list of token lists.

    Every line of *fin* is treated as one sequence of space-separated
    tokens (``word/TAG`` in the rest of this file).

    Args:
        fin: Path of the UTF-8 encoded training file. Defaults to the path
            this script has always read.
        output_file: Path that receives the error message when *fin* cannot
            be read. Defaults to the model file path.

    Returns:
        A list with one entry per non-blank line; each entry is the list of
        non-empty tokens found on that line, in order.

    Raises:
        SystemExit: If *fin* cannot be opened or read. The message
            ``File not found: <fin>`` is written to *output_file* first and
            the process exits with the default status, as before.
    """
    try:
        # The context manager closes the file even when reading fails.
        with codecs.open(fin, mode='r', encoding="utf-8") as input_file:
            lines = input_file.readlines()
    except IOError:
        with codecs.open(output_file, mode='w', encoding="utf-8") as fo:
            fo.write("File not found: {}".format(fin))
        sys.exit()

    wordtag_list = []
    for line in lines:
        # Strip "\r" as well as "\n": codecs.open does no newline
        # translation, so CRLF files would otherwise leave "\r" glued to
        # the last tag of every line.
        # Empty strings produced by doubled or trailing spaces are dropped
        # so they cannot turn into a token with an empty word and tag.
        tokens = [token for token in line.strip('\r\n').split(" ") if token]
        if tokens:
            wordtag_list.append(tokens)
    return wordtag_list
def transition_count(train_data):
    """Count tag-to-tag transitions and record corpus statistics.

    Args:
        train_data: Iterable of token lists; each token is a string whose
            tag is the text after its last ``/``.

    Returns:
        A dict mapping ``"<previous>~tag~<tag>"`` to the number of times
        ``tag`` directly followed ``previous``. The first token of every
        token list is counted as following the pseudo-tag ``"start"``.

    Side effects:
        Adds to the module-level ``word_set`` (lower-cased words) and
        ``tag_list`` (tags), and increments ``tag_count`` per tag. The
        counts accumulate if this function is called more than once.
    """
    transition_dict = {}
    for sentence in train_data:
        previous = "start"
        for token in sentence:
            # Split at the LAST "/" so words that themselves contain a
            # slash stay intact. A token without any slash yields an empty
            # word and the whole token as its tag.
            word, _, tag = token.rpartition("/")
            word_set.add(word.lower())
            tag_list.add(tag)
            # Frequency of each tag.
            tag_count[tag] = tag_count.get(tag, 0) + 1
            # Frequency of each (previous tag, tag) combination.
            key = previous + "~tag~" + tag
            transition_dict[key] = transition_dict.get(key, 0) + 1
            previous = tag
    return transition_dict
def transition_probability(train_data):
    """Turn transition counts into relative frequencies.

    Args:
        train_data: Passed unchanged to ``transition_count``.

    Returns:
        A dict with the same keys as the counts returned by
        ``transition_count``. Each value is a ``Decimal``: the count of
        that transition divided by the total count of all transitions
        that share the same preceding tag.
    """
    count_dict = transition_count(train_data)

    # One pass to total the outgoing transitions of every preceding tag,
    # instead of rescanning the whole dictionary for each key.
    totals = {}
    for key, count in count_dict.items():
        previous = key.split("~tag~")[0]
        totals[previous] = totals.get(previous, 0) + count

    prob_dict = {}
    for key, count in count_dict.items():
        # Probability of tagA being followed by tagB out of all tags that
        # were seen following tagA.
        prob_dict[key] = Decimal(count) / totals[key.split("~tag~")[0]]
    return prob_dict
def transition_smoothing(train_data):
    """Return transition probabilities with unseen transitions filled in.

    Every ``start -> tag`` and ``tag1 -> tag2`` combination over the tags
    in ``tag_list`` that is missing from the probabilities computed by
    ``transition_probability`` is given the value
    ``1 / (len(word_set) + tag_count[<tag>])``, where ``<tag>`` is the
    target tag for start transitions and ``tag1`` otherwise. Probabilities
    that were actually observed are left untouched.

    Args:
        train_data: Passed unchanged to ``transition_probability``.

    Returns:
        A dict mapping ``"<previous>~tag~<tag>"`` to a ``Decimal``.
    """
    transition_prob = transition_probability(train_data)
    vocabulary_size = len(word_set)

    for tag in tag_list:
        # The lookup key must include the "~tag~" separator. Testing
        # "start" + tag never matched a stored key, so every observed start
        # probability was being overwritten with the minimum value.
        start_key = "start" + "~tag~" + tag
        if start_key not in transition_prob:
            transition_prob[start_key] = (
                Decimal(1) / Decimal(vocabulary_size + tag_count[tag])
            )

    for tag1 in tag_list:
        # The fallback value depends only on the preceding tag.
        minimum = Decimal(1) / Decimal(vocabulary_size + tag_count[tag1])
        for tag2 in tag_list:
            pair_key = tag1 + "~tag~" + tag2
            # A tag combination that never occurred gets the minimum.
            if pair_key not in transition_prob:
                transition_prob[pair_key] = minimum
    return transition_prob
def emission_count(train_data):
    """Count how often each lower-cased word occurs with each tag.

    Args:
        train_data: Iterable of token lists; each token is a string whose
            tag is the text after its last ``/``.

    Returns:
        A dict mapping ``"<lower-cased word>/<tag>"`` to its number of
        occurrences. A token without any ``/`` is counted under
        ``"/<token>"`` (empty word, whole token as tag).
    """
    count_word = {}
    for sentence in train_data:
        for token in sentence:
            # Split at the LAST "/" so words containing slashes stay whole.
            word, _, tag = token.rpartition("/")
            # Map the words in the training set to their tagged POS.
            key = word.lower() + "/" + tag
            count_word[key] = count_word.get(key, 0) + 1
    return count_word
def emission_probability(train_data):
    """Compute, for each word/tag pair, its share of that tag's tokens.

    Args:
        train_data: Passed unchanged to ``emission_count``.

    Returns:
        A dict mapping ``"<lower-cased word>/<tag>"`` to a ``Decimal``: the
        pair's count divided by ``tag_count[<tag>]``.

    Raises:
        KeyError: If a tag is missing from the module-level ``tag_count``.
            That dictionary is only read here, so it must have been
            populated beforehand.
    """
    emission_prob_dict = {}
    for pair, occurrences in emission_count(train_data).items():
        # The tag is whatever follows the last "/" of the key.
        tag = pair.split("/")[-1]
        emission_prob_dict[pair] = Decimal(occurrences) / tag_count[tag]
    return emission_prob_dict
# Build both models from the training data. transition_smoothing() must run
# before emission_probability(), because the latter reads the tag_count
# totals that are gathered while the transitions are counted.
train_data = parse_traindata()
transition_model = transition_smoothing(train_data)
emission_model = emission_probability(train_data)

# Write the model file: one "key:value" line per transition, a separator
# line, then one "key:value" line per emission. The context manager makes
# sure the file is flushed and closed even if a write fails.
with codecs.open("model/hmmmodel.txt", mode='w', encoding="utf-8") as fout:
    for key, value in transition_model.items():
        fout.write('%s:%s\n' % (key, value))
    fout.write(u'Emission Model\n')
    for key, value in emission_model.items():
        fout.write('%s:%s\n' % (key, value))