-
Notifications
You must be signed in to change notification settings - Fork 1
/
gen.py
71 lines (61 loc) · 2.09 KB
/
gen.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import glob
import os
import random

import nltk
def get_trainTokens(genre):
    """Tokenize every ``*.txt`` training book of *genre* into one flat list.

    Side effect: changes the process working directory to
    ``directory + 'train_books/' + str(genre)`` and leaves it there.

    NOTE(review): ``directory`` is a module-level name that is not defined in
    the visible part of this file. It is presumably an absolute path ending in
    a separator; a relative value would break on a second call because the
    working directory has already been changed. Confirm.

    Args:
        genre: Name of the sub-directory of ``train_books/`` to read.

    Returns:
        A list with the tokens of all books, in the order ``glob`` yields the
        files.
    """
    genre_name = str(genre)
    os.chdir(directory + 'train_books/' + genre_name)
    all_tokens = []
    print("tokenizing " + genre_name + " books....")
    for path in glob.glob("*.txt"):
        # Read raw bytes and decode explicitly: this behaves the same on
        # Python 2 and 3, and ``with`` closes the handle even if reading or
        # decoding raises.
        with open(path, "rb") as book:
            text = book.read().decode("latin1")
        all_tokens.extend(nltk.word_tokenize(text))
    return all_tokens
def get_uniCounts(all_tokens):
    """Count how often each token occurs.

    Args:
        all_tokens: Any iterable of hashable tokens. It is consumed exactly
            once, so generators and iterators are accepted as well as lists.

    Returns:
        A ``(unigram_table, total)`` tuple where ``unigram_table`` maps each
        distinct token to its number of occurrences and ``total`` is the
        number of tokens seen.
    """
    unigram_table = {}
    total = 0
    for token in all_tokens:
        unigram_table[token] = unigram_table.get(token, 0) + 1
        # Count while iterating instead of calling len(), so inputs without
        # a length (iterators, generators) work too.
        total += 1
    return unigram_table, total
def get_biCounts(all_tokens):
    """Count adjacent token pairs (bigrams).

    Args:
        all_tokens: A sequence of hashable tokens (it must support slicing).

    Returns:
        A ``(bigram_table, num_bigrams)`` tuple. ``bigram_table`` is a nested
        dict: ``bigram_table[first][second]`` is the number of times
        ``second`` directly follows ``first``. ``num_bigrams`` is the number
        of *distinct* pairs, not the total number of occurrences. Sequences
        with fewer than two tokens give ``({}, 0)``.
    """
    bigram_table = {}
    num_bigrams = 0
    # Pair every token with its successor; the final token has none, so it
    # never becomes a key unless it also occurs earlier in the sequence.
    for first, second in zip(all_tokens, all_tokens[1:]):
        followers = bigram_table.setdefault(first, {})
        if second in followers:
            followers[second] += 1
        else:
            followers[second] = 1
            num_bigrams += 1
    return bigram_table, num_bigrams
def get_biSentence(min, max, genre, sentence=''):
    """Generate a random sentence by walking the bigram table of *genre*.

    Starting from the last word of *sentence*, a follower is drawn repeatedly
    and appended until a '.' is drawn or *max* words have been added.

    NOTE(review): ``get_biTable`` and ``random_next`` are not defined in the
    visible part of this file. The table is assumed to map each word to a
    structure from which ``random_next`` draws one follower; confirm against
    their definitions.

    Args:
        min: Minimum number of words the sentence should contain before a
            '.' is accepted. This is best effort: if only '.' keeps being
            drawn, the sentence ends early rather than looping forever.
        max: Maximum number of words to append to the seed.
        genre: Passed to ``get_biTable`` to obtain the bigram table.
        sentence: Optional seed text. When empty or whitespace-only, the seed
            is drawn from the followers of '.'.

    Returns:
        The seed followed by the generated words, separated by single spaces.
        It ends with a drawn ' .' token, or with '.' appended directly when
        *max* words were added without drawing one.
    """
    print("computing bigrams and generating random sentence:")
    table = get_biTable(genre)

    # A whitespace-only seed tokenizes to nothing and would make the
    # ``[-1]`` lookup below fail, so treat it like an empty seed.
    if not sentence.strip():
        sentence = random_next(table['.'])
    sentence_tokens = nltk.word_tokenize(sentence)
    last_word = sentence_tokens[-1]
    # Track the length in words: *min* is compared against it and it grows by
    # one per appended word, so a character count would be the wrong unit.
    length = len(sentence_tokens)

    # Built once; list() keeps random.choice working with Python 3 dict views.
    vocabulary = list(table)
    # Upper bound on redraws when '.' arrives too early. Without it, a word
    # whose only follower is '.' would spin forever.
    max_redraws = 100

    pieces = [sentence]
    for _ in range(max):
        redraws = 0
        while True:
            if last_word in table:
                word = random_next(table[last_word])
            else:
                # Unknown word: restart from a random known word.
                word = random.choice(vocabulary)
            too_short = word == '.' and length < min
            if not too_short or redraws >= max_redraws:
                break
            redraws += 1
        pieces.append(word)
        if word == '.':
            return ' '.join(pieces)
        length += 1
        last_word = word
    return ' '.join(pieces) + '.'
# Runs at import time: generate one bigram sentence seeded with 'I must'.
# NOTE(review): `genre` is not defined in the visible part of this file;
# confirm where it is assigned before this line executes.
s2 = get_biSentence(min=5, max=50, genre=genre, sentence='I must')