-
Notifications
You must be signed in to change notification settings - Fork 0
/
gamer_words.py
536 lines (459 loc) · 22.6 KB
/
gamer_words.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
"""
Contains the functions to find the "gamer words" list, compute
statistics for user similarity to gamer and normal dictionaries,
compute z-scores for gamer and normal distributions, and determine
whether individuals are gamers or not.
"""
from operator import itemgetter
import math
import os
import numpy as np
from scrape_data import csv_to_dict
def find_most_frequent(word_dictionary, number_items):
    """
    Return the entries of a dictionary that carry the largest values.

    Args:
        word_dictionary: A dictionary with strings as keys and positive
            integers as values.
        number_items: An integer giving how many of the highest-valued
            entries to keep.

    Returns:
        A dictionary ordered from the highest value to the lowest that
        holds the first number_items entries of that ordering. Entries
        with equal values keep the relative order they had in
        word_dictionary.
    """
    # sorted() is stable even with reverse=True, so ties stay in their
    # original insertion order.
    ranked_entries = sorted(
        word_dictionary.items(), key=lambda entry: entry[1], reverse=True)
    top_entries = ranked_entries[:number_items]
    return dict(top_entries)
def instances_to_decimal(dictionary):
    """
    Turn raw word counts into each word's share of the total word count.

    Args:
        dictionary: A dictionary with strings as keys and positive integers
            as values (the number of times each word was used).

    Returns:
        A tuple of two items:
            A dictionary with the same keys whose values are floats equal
            to the word's count divided by the sum of all counts. It is
            empty when the sum of counts is zero or negative.
            The sum of all counts in the input dictionary.
    """
    total_words = sum(dictionary.values())

    # Nothing to normalise against; bail out before dividing by zero.
    if total_words <= 0:
        return {}, total_words

    ratios = {word: count / total_words
              for word, count in dictionary.items()}
    return ratios, total_words
def remove_most_common(normal_dictionary, gamer_dictionary, normal_total_words,
                       gamer_total_words):
    """
    Remove words that are used at a similar rate in the "normal" dataset and
    the "gamer" dataset, then re-normalise what is left.

    A word is dropped when it is present in both dictionaries and its gamer
    ratio lies strictly between 75% and 125% of its normal ratio.

    Args:
        normal_dictionary: A dictionary with strings as keys representing
            words and floats as values representing what ratio of the time a
            word gets used in the normal dataset.
        gamer_dictionary: A dictionary with strings as keys representing
            words and floats as values representing what ratio of the time a
            word gets used in the gamer dataset.
        normal_total_words: An integer representing the total instances of
            word uses in the normal dataset.
        gamer_total_words: An integer representing the total instances of
            word uses in the gamer dataset.

    Returns:
        normal_return_dict: A dictionary of the remaining normal words
            mapped to their ratio of the remaining normal word uses.
        gamer_return_dict: A dictionary of the remaining gamer words mapped
            to their ratio of the remaining gamer word uses.
        ignore_list: A list of strings containing the words judged
            similarly frequent in both datasets.

        When normal_total_words is 0 the normal result is empty and the
        gamer dictionary is returned as an unfiltered copy. When
        gamer_total_words is 0 the normal dictionary is returned as an
        unfiltered copy and every gamer value is multiplied by
        gamer_total_words (that is, by zero).
    """
    # Create list of words to ignore
    ignore_list = [
        word for word, normal_ratio in normal_dictionary.items()
        if word in gamer_dictionary
        and normal_ratio * 1.25 > gamer_dictionary[word] > normal_ratio * .75
    ]

    # What to do if nothing is in the normal dictionary
    if normal_total_words == 0:
        return {}, dict(gamer_dictionary), ignore_list

    # What to do if nothing is in the gamer dictionary
    if gamer_total_words == 0:
        gamer_return_dict = {word: value * gamer_total_words
                             for word, value in gamer_dictionary.items()}
        return dict(normal_dictionary), gamer_return_dict, ignore_list

    # Membership is tested once per word in each dictionary, so use a set
    # here instead of scanning the list every time.
    ignored_words = set(ignore_list)

    # Turn the ratio dictionaries back into count dictionaries, leaving out
    # the ignored words.
    normal_counts = {word: value * normal_total_words
                     for word, value in normal_dictionary.items()
                     if word not in ignored_words}
    gamer_counts = {word: value * gamer_total_words
                    for word, value in gamer_dictionary.items()
                    if word not in ignored_words}

    # Re-normalise so the surviving words' ratios are relative to the
    # surviving total rather than the original total.
    new_normal_total_words = sum(normal_counts.values())
    new_gamer_total_words = sum(gamer_counts.values())
    normal_return_dict = {word: count / new_normal_total_words
                          for word, count in normal_counts.items()}
    gamer_return_dict = {word: count / new_gamer_total_words
                         for word, count in gamer_counts.items()}
    return normal_return_dict, gamer_return_dict, ignore_list
def remove_too_uncommon(word_dictionary, threshold=20):
    """
    Remove words from a dictionary that show up less than a specified
    number of times. Also remove entries that are strings longer than length
    20 or words that look like a doubled-word typo of the form "wordcword".

    Args:
        word_dictionary: A dictionary with strings as keys representing words
            and integers (or values accepted by int()) as values representing
            how many times that word is used in a dataset.
        threshold: An integer determining the minimum number of usages for a
            word to be kept in the dictionary.

    Returns:
        word_return_dictionary: A new dictionary holding the surviving
            entries of word_dictionary, in their original order and with
            their original values. The input dictionary is not modified.
    """
    word_return_dictionary = {}
    # A single pass decides each word's fate, which avoids repeatedly
    # scanning a growing list of rejected words.
    for word, count in word_dictionary.items():
        if int(count) < threshold or len(word) > 20:
            continue

        # Remove typo words that repeat themselves of the form "wordcword":
        # odd length, and the text before the middle character matches the
        # text after it.
        # NOTE(review): this only requires a "c" somewhere in the word, not
        # specifically at the midpoint, and it also drops the one-letter
        # word "c" (both halves are empty). Kept as-is to preserve existing
        # results.
        if "c" in word and len(word) % 2 == 1:
            half_length = (len(word) - 1) // 2
            if word[:half_length] in word[half_length + 1:]:
                continue

        word_return_dictionary[word] = count
    return word_return_dictionary
def determine_gamer_words(normal_dictionary, gamer_dictionary,
                          frequency_multiplier=8,
                          unseen_word_minimum=.000079):
    """
    Determine words specific to a language set by comparing and finding words
    that are used significantly more frequently by a community than the
    general populace.

    Args:
        normal_dictionary: A dictionary with strings as keys and floats as the
            values representing what ratio of the time that string gets
            used in the normal dataset.
        gamer_dictionary: A dictionary with strings as keys and floats as the
            values representing what ratio of the time that string gets
            used in the gamer dataset.
        frequency_multiplier: A number; a word present in both dictionaries
            counts as a gamer word when its gamer ratio is more than this
            many times its normal ratio. Defaults to 8.
        unseen_word_minimum: A float; a word absent from the normal
            dictionary counts as a gamer word when its gamer ratio is above
            this value. Defaults to .000079.

    Returns:
        gamer_words: A list of strings representing words specific to the
            gamer vocabulary, in the order they appear in gamer_dictionary.
    """
    gamer_words = []
    for word, gamer_ratio in gamer_dictionary.items():
        if word in normal_dictionary:
            # Shared word: keep it only if gamers use it more than
            # frequency_multiplier times as often as the normal dataset.
            if normal_dictionary[word] < gamer_ratio / frequency_multiplier:
                gamer_words.append(word)
        # No normal ratio to compare against, so fall back to an absolute
        # floor on how often gamers use the word.
        elif gamer_ratio > unseen_word_minimum:
            gamer_words.append(word)
    return gamer_words
def parse_words(normal_dictionary, gamer_dictionary, threshold):
    """
    Run the full curation pipeline over the raw "normal" and "gamer" word
    counts and pick out the words that distinguish the gamer vocabulary.

    Args:
        normal_dictionary: A dictionary with strings as keys representing
            words and integers as values representing the number of times a
            word gets used in the normal dataset.
        gamer_dictionary: A dictionary with strings as keys representing
            words and integers as values representing the number of times a
            word gets used in the gamer dataset.
        threshold: An integer passed to remove_too_uncommon() as the minimum
            number of usages for a word to be kept.

    Returns:
        A tuple of four items, in this order:
            The curated normal dictionary (strings to float usage ratios).
            The curated gamer dictionary (strings to float usage ratios).
            A list of strings judged distinct to the gamer vocabulary.
            A list of strings that remove_most_common() dropped from both
            dictionaries.
    """
    # Step 1: drop rare, overlong and typo entries from the raw counts.
    gamer_counts = remove_too_uncommon(gamer_dictionary, threshold)
    normal_counts = remove_too_uncommon(normal_dictionary, threshold)

    # Step 2: convert the counts to usage ratios, remembering the totals.
    normal_ratios, normal_total_words = instances_to_decimal(normal_counts)
    gamer_ratios, gamer_total_words = instances_to_decimal(gamer_counts)

    # Step 3: discard words both datasets use at a similar rate.
    normal_ratios, gamer_ratios, ignore_list = remove_most_common(
        normal_ratios, gamer_ratios, normal_total_words, gamer_total_words)

    # Step 4: pick the gamer-specific words from what remains.
    gamer_words = determine_gamer_words(normal_ratios, gamer_ratios)

    return normal_ratios, gamer_ratios, gamer_words, ignore_list
def determine_language_similarity(word_dictionary, user_dictionary):
    """
    Measure the distance between a user's vocabulary and a curated language
    set; smaller results mean the two are more alike.

    Each word the user typed is treated as one axis of a vector space. The
    user's vector and the language set's vector hold, on each axis, the
    ratio of the time that word is used. The result is the length of the
    difference between those two vectors.

    Args:
        word_dictionary: A dictionary with strings as keys and floats
            as the values representing what ratio of the time that string
            gets used in a language dataset.
        user_dictionary: A dictionary with strings as keys and floats
            as the values representing what ratio of the time that string
            gets used in a user's personal language dataset.

    Returns:
        A float: the square root of the summed squared differences, taken
        over the user's words only.
    """
    # Only the user's own words contribute. A single user will always use
    # fewer distinct words than a whole community, so words the user never
    # typed are not held against them. A user word missing from the
    # language set is compared against an implicit ratio of zero.
    squared_gaps = (
        (user_ratio - word_dictionary[word]) ** 2
        if word in word_dictionary else user_ratio ** 2
        for word, user_ratio in user_dictionary.items()
    )
    return math.sqrt(sum(squared_gaps))
def analyze_users_language(normal_dictionary, gamer_dictionary, gamer_words,
                           ignore_list, folder_path):
    """
    Analyze a set of several users' language usage data, stored in a folder
    as csv's, for similarity to a normal and a gamer language set and output
    the result.

    Args:
        normal_dictionary: A curated dictionary with strings as keys and
            floats as the values representing what ratio of the time that
            string gets used in the normal dataset.
        gamer_dictionary: A curated dictionary with strings as keys and
            floats as the values representing what ratio of the time that
            string gets used in the gamer dataset.
        gamer_words: A list of strings representing words used much more
            commonly by gamers than non gamers.
        ignore_list: A list of strings representing words to remove from a
            user's language set.
        folder_path: A string representing the relative path of the folder
            that all of the user data csv's are in.

    Returns:
        user_value_dict: A dictionary with a string representing the
            file path of a user's language usage data as a key and a list
            of floats as the value. The list value is of the form:
            [normal_closeness, gamer_closeness, ratio_gamer_words_used]
                normal_closeness: How close a user's language usage is to
                    the normal dataset (lower is closer).
                gamer_closeness: How close a user's language usage is to
                    the gamer dataset (lower is closer).
                ratio_gamer_words_used: The ratio of gamer words used by a
                    user out of all of the words they use. It is 0.0 when
                    the user has no words left after filtering.
    """
    # Every word of every user is tested against the ignore list, so build
    # a set once instead of scanning the list each time.
    ignored_words = set(ignore_list)
    user_value_dict = {}

    # Iterate through the user csv file paths
    for user in get_file_list(folder_path):
        # Create a dictionary from the user csv
        user_dictionary = csv_to_dict(user)
        # Remove non useful user data
        user_dictionary = remove_too_uncommon(user_dictionary, 1)
        user_dictionary = {word: value
                           for word, value in user_dictionary.items()
                           if word not in ignored_words}
        # Make the values of the user dictionary a ratio
        user_dictionary, _ = instances_to_decimal(user_dictionary)

        normal_closeness = determine_language_similarity(
            normal_dictionary, user_dictionary)
        gamer_closeness = determine_language_similarity(
            gamer_dictionary, user_dictionary)

        # Compute what ratio of words that a user uses are gamer words
        gamer_ratio_total = 0
        for word in gamer_words:
            if word in user_dictionary:
                gamer_ratio_total += user_dictionary[word]
        all_ratio_total = sum(user_dictionary.values())
        # A user whose csv is empty, or whose words were all filtered out,
        # has nothing to divide by; report no gamer-word usage for them.
        if all_ratio_total:
            ratio_gamer_words_used = gamer_ratio_total / all_ratio_total
        else:
            ratio_gamer_words_used = 0.0

        user_value_dict[user] = [normal_closeness, gamer_closeness,
                                 ratio_gamer_words_used]
    return user_value_dict
def stats_and_z_info(stats_dict, folder_path):
    """
    Given a dictionary containing users and their respective analysis
    statistics, generates the overall statistics lists, and the z-score
    values for each user.

    Args:
        stats_dict: A dictionary representing all of the testing users'
            statistics (the output of analyze_users_language). The keys are
            strings representing the user CSV file paths and the values are
            a list containing the two closeness stats and the gamer:all
            words ratio.
        folder_path: A string representing the path to the folder containing
            all of the test user CSVs.

    Returns:
        stats: A list of three lists containing floats; each float list
            contains the data for a singular statistic from the user
            analysis for all of the users.
        z_dict: A dictionary representing all of the testing users' z-scores
            for closeness to the normal and gamer words. The keys are
            strings representing the user CSV file paths and the values are
            lists of the form [z_normal, z_gamer].
        z_lists: A list containing two lists of floats, one per z-value
            (normal first, then gamer), for all of the testing users.

        With no users in the folder all returned containers are empty. When
        a statistic has zero standard deviation (for example a single user)
        the corresponding z-scores are reported as 0.0.
    """
    file_list = get_file_list(folder_path)

    # Calculate stats lists: one list per statistic, one entry per user.
    stats = [[], [], []]
    for user in file_list:
        user_stats = stats_dict[user]
        stats[0].append(user_stats[0])
        stats[1].append(user_stats[1])
        stats[2].append(user_stats[2])

    z_dict = {}
    z_lists = [[], []]
    # No users means no mean to compute; avoid dividing by a zero length.
    if not file_list:
        return stats, z_dict, z_lists

    # Calculate means & standard deviations
    mean_normal = sum(stats[0]) / len(stats[0])
    std_normal = np.std(stats[0])
    mean_gamer = sum(stats[1]) / len(stats[1])
    std_gamer = np.std(stats[1])

    # Form z dict
    for user in file_list:
        # A zero spread means every user sits exactly on the mean, so the
        # z-score is taken as 0.0 rather than dividing by zero.
        if std_normal:
            z_normal = (stats_dict[user][0] - mean_normal) / std_normal
        else:
            z_normal = 0.0
        if std_gamer:
            z_gamer = (stats_dict[user][1] - mean_gamer) / std_gamer
        else:
            z_gamer = 0.0
        z_dict[user] = [z_normal, z_gamer]
        z_lists[0].append(z_normal)
        z_lists[1].append(z_gamer)
    return stats, z_dict, z_lists
def is_gamer(gamer_z, normal_z):
    """
    Decide whether a single user counts as a gamer from their z-scores.

    Args:
        gamer_z: The individual user's z-score for closeness to the gamer
            words.
        normal_z: The individual user's z-score for closeness to the normal
            words.

    Returns:
        True when gamer_z minus normal_z is negative, otherwise False.
    """
    z_gap = gamer_z - normal_z
    # Spelled out so the result is always a plain bool, whatever numeric
    # type the z-scores arrive as.
    return True if z_gap < 0 else False
def get_file_list(folder_path):
    """
    List every entry of a folder as a path prefixed with that folder.

    Args:
        folder_path: A string representing a path to the folder containing
            the test users' CSVs.

    Returns:
        A list of strings, one per entry reported by os.listdir, each of the
        form "<folder_path>/<entry name>".
    """
    return ["/".join((folder_path, entry_name))
            for entry_name in os.listdir(folder_path)]
def find_most_frequent_gamer_words(user_dict, gamer_words, num_items):
    """
    Given a user's frequency dictionary and a number, determines the most
    frequent gamer words up to that number.

    Args:
        user_dict: A dictionary representing an individual user's word
            frequencies. The keys are strings representing the words that
            the user has said and the values are integers representing how
            many times the corresponding key has been sent as a message by
            the user.
        gamer_words: A list of strings representing the gamer words.
        num_items: The number of gamer words to output.

    Returns:
        A dictionary representing the most frequent gamer words present in
        the user's messages. The keys are strings representing the gamer
        words and the values are numbers representing their corresponding
        frequencies.
    """
    # Each of the user's words is looked up once, so use a set rather than
    # scanning the gamer word list for every word.
    gamer_word_set = set(gamer_words)
    user_gamer_words = {word: count for word, count in user_dict.items()
                        if word in gamer_word_set}
    return find_most_frequent(user_gamer_words, num_items)
def determine_gamer_words_frequency(gamer_words_list, gamer_dictionary):
    """
    Build a frequency dictionary restricted to the gamer words.

    Args:
        gamer_words_list: A list containing strings representing the current
            gamer words.
        gamer_dictionary: A dictionary representing the gamer words prior to
            removal of overly common terms. The keys are strings
            representing the words and the values are integers representing
            their corresponding frequencies.

    Returns:
        A dictionary whose keys are the entries of gamer_words_list that
        also appear in gamer_dictionary, in list order, mapped to the
        values they have in gamer_dictionary.
    """
    return {word: gamer_dictionary[word]
            for word in gamer_words_list
            if word in gamer_dictionary}
def generate_user_id_dict(z_dict, user_stats_dict, gamer_words, folder_path):
    """
    Generates the info necessary to create the individual user ID cards.

    Args:
        z_dict: A dictionary representing the z-scores for each individual
            user. The keys are strings representing the path to the users'
            data and the values are a list containing two z-scores, normal
            first and gamer second.
        user_stats_dict: A dictionary representing the stats for each
            individual user. The keys are strings representing the path to
            the users' data and the values are a list containing the three
            primary stats calculated for each user.
        gamer_words: A list of strings representing the gamer words.
        folder_path: A string representing the path to the folder containing
            all the individual users' data.

    Returns:
        user_id_dict: A dictionary representing the information that goes on
            each individual user's ID card. The keys are strings
            representing the path to the users' data and the values are
            lists of the form:
            [username, gamer_status, gamer_z, normal_z, gamer_all_freq, top]
            where top is a list of up to five of the user's most frequent
            gamer words.
    """
    user_id_dict = {}
    for user in get_file_list(folder_path):
        # The username is the file name without its folder or extension.
        # Splitting on the real extension keeps names intact for files that
        # do not end in a four-character suffix such as ".csv".
        username = os.path.splitext(os.path.basename(str(user)))[0]

        normal_z = z_dict[user][0]
        gamer_z = z_dict[user][1]
        gamer_status = is_gamer(gamer_z, normal_z)
        # Third stat is the ratio of gamer words to all words used.
        gamer_all_freq = user_stats_dict[user][2]

        top_gamer_words = find_most_frequent_gamer_words(
            csv_to_dict(user), gamer_words, 5)
        top = list(top_gamer_words.keys())

        user_id_dict[user] = [username, gamer_status, gamer_z, normal_z,
                              gamer_all_freq, top]
    return user_id_dict