# classifyDocs.py
import numpy as np
import pandas as pd
import gensim
import sys
import os
from gensim.test.utils import datapath
from gensim import corpora
from sqlalchemy import create_engine
from tokenizer import Tokenizer
import warnings
import json
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(rc={'figure.figsize': (16, 9)})
# the 'talk' context enlarges the default line widths and label sizes; change it for other defaults
sns.set_context('talk')
# parse command-line arguments
import argparse
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument("--type", help="Specify whether to use model trained with posts or comments. Default is comments.", choices=['posts','comments'], default='comments')
arg_parser.add_argument("--pages", help="Group of pages to use. Default is guy.", choices=['all','nh','guy'],default = 'guy')
arg_parser.add_argument("--num_topics", help="Number of topics in model. Default is 10.", type=int, default=10)
arg_parser.add_argument("--start_date", help="Start date for documents and trained model in format YYYY-MM-DD. Default is 2019-04-01.", default="2019-04-01")
arg_parser.add_argument("--end_date", help="End date for documents and trained model in format YYYY-MM-DD. Default is 2019-06-22.", default="2019-06-22")
arg_parser.add_argument("--model", help="Name of trained model to use (if different than date range). If not included, finds model with date range")
arg_parser.add_argument("--ignore", help="Ignore warnings", action="store_true")
args = arg_parser.parse_args()
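# example invocation (the defaults shown explicitly):
#   python classifyDocs.py --type comments --pages guy --num_topics 10 \
#       --start_date 2019-04-01 --end_date 2019-06-22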
# specify the path of the model to load
if args.model is None:
    file_path = datapath("ldamodel_{0}_{1}_{2}topics_{3}_to_{4}".format(args.type, args.pages, args.num_topics, args.start_date, args.end_date))
else:
    file_path = datapath(args.model)
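# NOTE: gensim's datapath() resolves file names relative to gensim's bundled
# test-data directory, so trained models are expected to be stored there.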
# create the name for the common dictionary
if args.model is None:
    dictionary_name = "./tmp/dict_fb_{0}_{1}_{2}_to_{3}.dict".format(args.type, args.pages, args.start_date, args.end_date)
else:
    # model names follow ldamodel_{type}_{pages}_{N}topics_{start}_to_{end}, so
    # splitting on '_' recovers type (1), pages (2), start (4) and end (6)
    dictionary_args = args.model.split('_')
    dictionary_name = "./tmp/dict_fb_{0}_{1}_{2}_to_{3}.dict".format(dictionary_args[1], dictionary_args[2], dictionary_args[4], dictionary_args[6])
print('dictionary name: ', dictionary_name)
# report the input values
if args.ignore:
    print('Ignoring deprecation warnings...')
    warnings.filterwarnings("ignore", category=DeprecationWarning)
print("Using %s for analysis" % args.type)
print("Using start_date of: ", args.start_date)
# load stop words and stop lemmas from the dictionary meta file
if args.model is None:
    tokenize_meta_path = "./tmp/meta_fb_{0}_{1}_{2}_to_{3}.txt".format(args.type, args.pages, args.start_date, args.end_date)
else:
    meta_args = args.model.split('_')
    tokenize_meta_path = "./tmp/meta_fb_{0}_{1}_{2}_to_{3}.txt".format(meta_args[1], meta_args[2], meta_args[4], meta_args[6])
if os.path.exists(tokenize_meta_path):
    with open(tokenize_meta_path, "r") as tokens_meta_file:
        tokenize_args = json.load(tokens_meta_file)
    stop_words = tokenize_args['stop_words']
    stop_lemmas = tokenize_args['stop_lemmas']
    allowed_pos = tokenize_args['allowed_pos']
    print('stop_words: ', stop_words)
    print('stop_lemmas: ', stop_lemmas)
    print('allowed_pos: ', allowed_pos)
else:
    # fall back to the default stop words
    stop_words = ["lol", "READ", "MORE", "NEWS"]
    if args.type == 'comments':
        # stop lemmas for comments
        stop_lemmas = ["say", "man", "people", "know", "time", "need", "want", "go", "get", "year", "word", "guyana", "like", "good", "thing", "come", "let", "think", "look", "right", "day"]
    else:
        # stop lemmas for posts
        stop_lemmas = ["say", "man", "people", "know", "time", "need", "want", "go", "get", "year", "word", "guyana", "like", "good", "thing", "come", "let", "think", "look", "right", "day", "national", "guyanese"]
    # parts of speech to keep
    allowed_pos = ['NOUN', 'VERB', 'PROPN']
# instantiate the tokenizer with the chosen filters
tokenizer_inst = Tokenizer(stop_words=stop_words, stop_lemmas=stop_lemmas, remove_unicode=True, allowed_pos=allowed_pos, lower_token=True, bigrams=True)
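# Tokenizer comes from the project's local tokenizer module; judging by the
# keyword arguments, it strips unicode, lowercases tokens, keeps only the
# allowed POS tags, forms bigrams, and applies the stop word/lemma filters.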
# check whether the dictionary was already saved
if os.path.exists(dictionary_name):
    # load the dictionary
    dictionary = corpora.Dictionary.load(dictionary_name)
else:
    sys.exit('Error: dictionary for {0} ({1} pages) from {2} to {3} does not exist.'.format(args.type, args.pages, args.start_date, args.end_date))
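# NOTE: LdaModel.save() writes companion files (e.g. .state, .id2word) next to
# the main model file, and LdaModel.load() expects to find them alongside it.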
ldamodel = gensim.models.ldamodel.LdaModel.load(file_path)
all_topics = ldamodel.print_topics(num_topics=-1)
for topic in all_topics:
    print(topic)
# create the correct input/output engines for the chosen page group
if args.pages == 'guy':
    engine = create_engine('sqlite:///./guy_sentiment.db')
    out_engine = create_engine('sqlite:///./guy_classified.db')
elif args.pages == 'nh':
    engine = create_engine('sqlite:///./nh19_sentiment.db')
    out_engine = create_engine('sqlite:///./nh19_classified.db')
elif args.pages == 'all':
    engine = create_engine('sqlite:///./raw_sentiment.db')
    out_engine = create_engine('sqlite:///./raw_classified.db')
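# NOTE: created_time is compared as a plain string in the query below, which
# only sorts chronologically because the timestamps are stored in ISO
# YYYY-MM-DD form (as the --start_date/--end_date defaults assume).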
# select all comments/posts in the chosen date range
df = pd.read_sql("""select * from {0} where created_time >= '{1}' and created_time <= '{2}'""".format(args.type, args.start_date, args.end_date), engine)
# log the size of the dataframe
print('Df shape: ', df.shape)
# read the topic meta file to get the names of the topics
if args.model is None:
    meta_file_path = "./models/ldamodel_topics_{0}_{1}_{2}topics_{3}_to_{4}.txt".format(args.type, args.pages, args.num_topics, args.start_date, args.end_date)
else:
    # the meta file shares the model's name, with 'topics' inserted after 'ldamodel'
    model_name_list = args.model.split('_')
    model_name_list.insert(1, 'topics')
    meta_file_path = "./models/{0}.txt".format("_".join(model_name_list))
with open(meta_file_path, "r") as meta_file:
    json_topics = json.load(meta_file)
# JSON object keys are strings, so convert the topic ids back to ints
topics = {int(topic): json_topics[topic] for topic in json_topics}
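# each topic entry is expected to provide at least a 'name' key, which is used
# below to label the classified rows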
df['topic'] = ""
count = 0
for index,row in df.iterrows():
#comment table and post table both have row called message (no longer using column
#'post_message' on comments table)
message = row['message']
#tokenize one message (both post and comment messages are filtered in their respective tables
#when generating sentiment scores -- no need to catch null message now)
tokens = tokenizer_inst.tokenize([message], return_docs=False)
#convert message to bag of words
doc = dictionary.doc2bow(tokens)
#classify as a topic (topic_dist is a list of tuples, where each tuple stores (topic_id, probability))
topic_dist = ldamodel[doc]
#take max over second element of tuple (probability)
most_likely_topic = max(topic_dist, key=lambda item:item[1])
most_likely_topic_name = topics[most_likely_topic[0]]['name']
df.at[index, 'topic'] = most_likely_topic_name
#log status
if index % 1000 == 0:
print('%d %s classified' % (count*1000,args.type))
count += 1
# TODO: try grouping by day
# average sentiment per week for each topic
time_df = df.copy()  # copy so the weekly aggregation does not mutate df
time_df['created_time'] = pd.to_datetime(time_df.created_time)
# 'W-MON' buckets the rows into weekly bins anchored on Mondays
time_df = time_df.groupby([pd.Grouper(key='created_time', freq='W-MON'), 'topic'])['compound'].mean().reset_index()
time_df = time_df.fillna(0)
time_df['created_time'] = time_df['created_time'].dt.strftime("%Y-%m-%d")
print('time dataframe')
print(time_df.head())
unique_topics = time_df['topic'].unique()
ax = sns.pointplot(x='created_time',y='compound', hue='topic', markers=["."]*len(unique_topics), ci=None, data=time_df, palette=sns.color_palette("muted"))
ax.grid(True)
ax.tick_params(axis='both', which='major', labelsize=14)
# optionally thin out the x tick marks:
#ax.set_xticks(ax.get_xticks()[::4])
plt.title('Sentiment by Topic Over Time for {0}'.format(args.type.capitalize()), fontsize=24)
plt.xticks(rotation=45, ha='right', rotation_mode='anchor')
plt.subplots_adjust(bottom=0.2)
plt.xlabel("Created Date",fontsize=18)
plt.ylabel("Average Sentiment Score", fontsize = 18)
plt.savefig("./topics/topic_over_time_{0}_{1}_{2}topics_{3}_to_{4}.png".format(args.type,args.pages,args.num_topics, args.start_date, args.end_date))
# sentiment standard deviation per week for each topic
time_std_df = df.copy()  # copy again so df stays unmodified
time_std_df['created_time'] = pd.to_datetime(time_std_df.created_time)
time_std_df = time_std_df.groupby([pd.Grouper(key='created_time', freq='W-MON'), 'topic'])['compound'].std().reset_index()
time_std_df = time_std_df.fillna(0)
time_std_df['created_time'] = time_std_df['created_time'].dt.strftime("%Y-%m-%d")
print('standard deviation over time dataframe')
print(time_std_df.head())
plt.figure()
unique_topics = time_std_df['topic'].unique()
ax = sns.pointplot(x='created_time',y='compound', hue='topic', markers=["."]*len(unique_topics), ci=None, data=time_std_df, palette=sns.color_palette("muted"))
ax.grid(True)
ax.tick_params(axis='both', which='major', labelsize=14)
plt.title('Sentiment Std. Dev. by Topic Over Time for {0}'.format(args.type.capitalize()), fontsize=24)
plt.xticks(rotation=45, ha='right', rotation_mode='anchor')
plt.subplots_adjust(bottom=0.2)
plt.xlabel("Created Date", fontsize = 18)
plt.ylabel("Std. Dev. of Sentiment Score", fontsize = 18)
plt.savefig("./topics/topic_std_over_time_{0}_{1}_{2}topics_{3}_to_{4}.png".format(args.type,args.pages,args.num_topics, args.start_date, args.end_date))
# number of messages per week for each topic
time_count_df = df.copy()
time_count_df['created_time'] = pd.to_datetime(time_count_df.created_time)
time_count_df = time_count_df.groupby([pd.Grouper(key='created_time', freq='W-MON'), 'topic']).size().reset_index(name='counts')
time_count_df = time_count_df.fillna(0)
time_count_df['created_time'] = time_count_df['created_time'].dt.strftime("%Y-%m-%d")
print('count of topic over time dataframe')
print(time_count_df.head())
plt.figure()
unique_topics = time_count_df['topic'].unique()
ax = sns.pointplot(x='created_time',y='counts', hue='topic', markers=["."]*len(unique_topics), ci=None, data=time_count_df, palette=sns.color_palette("muted"))
ax.grid(True)
ax.tick_params(axis='both', which='major', labelsize=14)
plt.title('Number of {0} by Topic Over Time'.format(args.type.capitalize()), fontsize=24)
plt.xticks(rotation=45, ha='right', rotation_mode='anchor')
plt.subplots_adjust(bottom=0.2)
plt.xlabel("Created Date", fontsize = 18)
plt.ylabel("Count", fontsize = 18)
plt.savefig("./topics/topic_count_over_time_{0}_{1}_{2}topics_{3}_to_{4}.png".format(args.type,args.pages,args.num_topics, args.start_date, args.end_date))
# average sentiment per topic
topic_df = df.groupby('topic')['compound'].mean().sort_values(ascending=False)
print('topic dataframe')
print(topic_df)
plt.figure(figsize=(12,8))
topic_df.plot.bar()
plt.xticks(rotation=40, ha='right', rotation_mode='anchor')
plt.subplots_adjust(bottom=0.35)
plt.xlabel("Topic", fontsize = 20)
plt.ylabel("Average Sentiment", fontsize = 20)
ax = plt.gca()
ax.tick_params(axis = 'both', which = 'major', labelsize = 16)
plt.title("Average Sentiment for {0} Per Topic, {1} to {2}".format(args.type.capitalize(), args.start_date, args.end_date), fontsize = 24)
plt.savefig("./topics/topic_avg_sentiment_{0}_{1}_{2}topics_{3}_to_{4}.png".format(args.type,args.pages,args.num_topics, args.start_date, args.end_date))
# sentiment standard deviation per topic
topic_std_df = df.groupby('topic')['compound'].std().sort_values(ascending=False)
print('topic sentiment std. dev. dataframe')
print(topic_std_df.head())
plt.figure(figsize=(12,8))
topic_std_df.plot.bar()
plt.xticks(rotation=40, ha='right', rotation_mode='anchor')
plt.subplots_adjust(bottom=0.35)
plt.xlabel("Topic", fontsize = 20)
plt.ylabel("Sentiment Std. Dev.", fontsize = 20)
ax = plt.gca()
ax.tick_params(axis = 'both', which = 'major', labelsize = 16)
plt.title("Sentiment Std. Dev. for {0} Per Topic, {1} to {2}".format(args.type.capitalize(), args.start_date, args.end_date), fontsize = 24)
plt.savefig("./topics/topic_std_sentiment_{0}_{1}_{2}topics_{3}_to_{4}.png".format(args.type,args.pages,args.num_topics, args.start_date, args.end_date))
# number of messages (posts or comments) per topic
number_df = df.groupby('topic').size().sort_values(ascending=False)
print('number dataframe')
print(number_df)
plt.figure(figsize=(12,8))
number_df.plot.bar()
plt.xticks(rotation=40, ha='right', rotation_mode='anchor')
plt.subplots_adjust(bottom=0.35)
plt.xlabel("Topic", fontsize=20)
plt.ylabel("Count", fontsize=20)
ax = plt.gca()
ax.tick_params(axis='both', which='major', labelsize=16)
plt.title("Number of {0} Per Topic, {1} to {2}".format(args.type.capitalize(), args.start_date, args.end_date), fontsize=24)
plt.savefig("./topics/topic_count_{0}_{1}_{2}topics_{3}_to_{4}.png".format(args.type,args.pages,args.num_topics, args.start_date, args.end_date))
# write the classified rows back to the output database
df.to_sql(args.type, con=out_engine, if_exists="replace", index=False)
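# NOTE: if_exists="replace" drops and recreates the table, so re-running the
# script overwrites any previous classification results for this table.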
plt.show()