-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.py
140 lines (104 loc) · 4.01 KB
/
index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
from flask import Flask
import random
import time
import pprint
class MarkovChain:
    """Word-level Markov chain built from a sequence of tokens.

    ``markov_structure`` maps every token seen in the corpus to a histogram:
    a list of ``(following_token, count)`` tuples recording how many times
    each token directly followed it. Despite the "second order" naming, each
    state is a single token (the table records digrams). A token that is
    never followed by anything maps to an empty list.
    """

    def __init__(self, corpus):
        """Build the transition table from ``corpus``.

        TODO: Create Nth Order Chain
        TODO: Create 2nd Order chain first

        Args:
            corpus: Sized iterable of tokens (``len()`` and iteration are
                used).
        """
        self.markov_structure = self.generate_second_order_markov_structure(corpus)

    def walk(self, steps):
        """Return a random walk through the chain as a space-joined string.

        The walk starts at a uniformly random token. Each further step
        samples a follower of the current token, weighted by count; when the
        current token has no followers the walk restarts from a uniformly
        random token. At least one token is always produced, even when
        ``steps`` is less than 1.

        Args:
            steps: Number of tokens wanted in the result.

        Returns:
            The visited tokens joined by single spaces.

        Raises:
            ValueError: If the chain holds no transitions, e.g. because the
                corpus was too small and the builder returned its error
                sentinel.
        """
        structure = self.markov_structure
        if not isinstance(structure, dict) or not structure:
            raise ValueError("Markov structure is empty; corpus was too small")

        markov_keys = list(structure)
        word = random.choice(markov_keys)
        words = [word]
        for _ in range(1, steps):
            word = self.stochastic_sample(structure[word])
            if not word:
                # Dead end (no known follower): jump to a random token.
                word = random.choice(markov_keys)
            words.append(word)
        return " ".join(words)

    def stochastic_sample(self, histogram):
        """Return a random word from ``histogram``, weighted by its count.

        Args:
            histogram: List of ``(word, count)`` tuples.

        Returns:
            A word chosen with probability ``count / total_count``, or
            ``False`` when ``histogram`` is empty.
        """
        if not histogram:
            return False

        # Normalise by the total number of observations, not by the number
        # of distinct words, so the weights form a proper distribution.
        total = sum(count for _, count in histogram)
        threshold = random.random() * total
        cumulative = 0
        for word, count in histogram:
            cumulative += count
            if threshold < cumulative:
                return word
        # Floating-point rounding can push the threshold up to ``total``;
        # fall back to the last entry instead of returning nothing.
        return histogram[-1][0]

    def generate_second_order_markov_structure(self, corpus):
        """Build the token -> follower-histogram table for ``corpus``.

        Args:
            corpus: Sized iterable of tokens.

        Returns:
            A dict mapping each token to a list of ``(following_token,
            count)`` tuples, or the integer ``1`` as an error sentinel when
            the corpus has fewer than three tokens (kept for backward
            compatibility with existing callers).
        """
        markov_structure = {}
        # Corpus too small for 2nd order chain -- return error
        if len(corpus) < 3:
            return 1

        tokens = iter(corpus)
        previous = next(tokens)
        for current in tokens:
            # Record the digram (previous, current).
            self.add_to_histogram(
                current, markov_structure.setdefault(previous, [])
            )
            # Make sure every token is a key, even one with no followers.
            markov_structure.setdefault(current, [])
            previous = current
        return markov_structure

    def add_to_histogram(self, word, histogram):
        """Increment the count of ``word`` in ``histogram`` in place.

        Args:
            word: Token to record.
            histogram: List of ``(word, count)`` tuples; mutated in place.

        Returns:
            The same ``histogram`` list, with ``word`` counted once more
            (appended with count 1 when it was not present).
        """
        for index, (existing, count) in enumerate(histogram):
            if existing == word:
                histogram[index] = (word, count + 1)
                break
        else:
            histogram.append((word, 1))
        return histogram
# Flask needs the import name of the application module to locate its
# resources; calling Flask() without it raises TypeError.
app = Flask(__name__)
@app.route("/")
def help():
    """Serve a freshly generated 7-token sentence at the site root.

    Reads the module-level ``markov_chain`` and prints how long the
    generation took, in milliseconds, before returning the sentence.
    """
    def now_ms():
        # Wall-clock time rounded to whole milliseconds.
        return int(round(time.time() * 1000))

    started_ms = now_ms()
    generated = markov_chain.walk(7)
    finished_ms = now_ms()
    print("\nGenerated sentence in {}ms.".format(finished_ms - started_ms))
    return generated
if __name__ == '__main__':
    import sys
    import filewrangler as fw

    # Step 1: read the corpus from disk and report the load time in ms.
    # presumably create_corpus returns the token sequence -- verify in
    # filewrangler.
    print("Loading Corpus ...")
    load_started_ms = int(round(time.time() * 1000))
    corpus = fw.create_corpus("corpus.txt")
    load_finished_ms = int(round(time.time() * 1000))
    print("\nFinished in {}ms.".format(load_finished_ms - load_started_ms))

    # Step 2: build the chain once and keep it in the module-level
    # ``markov_chain`` name, which the route handler reads. This timing is
    # reported in whole seconds.
    print("Creating Markov Chain...")
    build_started_s = int(round(time.time()))
    markov_chain = MarkovChain(corpus)
    build_finished_s = int(round(time.time()))
    print("\nMarkov structure generated in {}s.".format(build_finished_s - build_started_s))

    # Step 3: start the Flask development server.
    app.run()