-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse.py
103 lines (71 loc) · 2.86 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing, tree
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from scipy import stats
from getURLFreq import getURLFreq
import csv, sys, cgi
from twitterHandler import getUserAttributes
import pydot
#Function block
def encode(array, data):
    """Replace column ``data`` of ``array`` with integer label codes, in place.

    A fresh ``LabelEncoder`` is fitted on the column's current values and the
    column is then overwritten with the transformed codes, so it can be used
    by the Decision Tree Classifier. Nothing is returned.
    """
    encoder = preprocessing.LabelEncoder()
    array[data] = encoder.fit(array[data]).transform(array[data])
def crossValScore(array, features, type):
    """Fit the shared classifier and report its 10-fold cross-validation accuracy.

    Args:
        array: Table indexable by column name(s), e.g. a pandas DataFrame.
        features: List of column names used as model inputs. The label
            column must not be among them, otherwise the model "cheats".
        type: Name of the label column. (Shadows the builtin; the name is
            kept so existing callers are unaffected.)

    Returns:
        The array of per-fold scores produced by ``cross_val_score``.
    """
    print("Performing cross validation and obtaining scores...")
    x = array[features]
    y = array[type]
    # Fit the module-level model on the full data; predictUser relies on this
    # fitted state when it calls dt.predict_proba afterwards.
    dt.fit(x, y)
    scores = cross_val_score(dt, x, y, cv=10)
    # Standard error must be taken over the per-fold scores; taking it over
    # the single mean value always yields 0.
    std_err = stats.sem(scores, axis=None, ddof=0)
    print("Mean Accuracy: {:.3f} (std: {:.3f}) (std err: {:.3f})".format(
        scores.mean(), scores.std(), std_err))
    return scores
def assignType(data, type):
    """Write the numeric class label for ``type`` into ``data['type']``.

    'bot' is stored as 1 and 'human' as 0. Any other value leaves ``data``
    untouched. Nothing is returned.
    """
    label_table = (('bot', 1), ('human', 0))
    for kind, label in label_table:
        if type == kind:
            data['type'] = label
#def assignURLFreqs(humanFreqs ,botFreqs):
# botData['URLfreq'] = 0
# humanData['URLfreq'] = 0
# for i in xrange(0, len(humanFreqs)):
# humanData['URLfreq'][i] = humanFreqs[i]
# for i in xrange(0, len(botFreqs)):
# botData['URLfreq'][i] = botFreqs[i]
# Shared decision-tree model: fitted in crossValScore, then queried and
# exported to tree.dot in predictUser.
dt = DecisionTreeClassifier(
    random_state=99,
    min_samples_split=20,
)
def predictUser(username):
    """Train the bot/human classifier and score a single user.

    Loads the labelled bot and human tables, attaches the URL frequencies
    from ``URLFreqs.tsv``, fits the module-level decision tree through
    ``crossValScore``, and asks it for the class probabilities of the
    attributes returned by ``getUserAttributes(username)``. The fitted tree
    is also written to ``tree.dot``.

    Args:
        username: Passed to ``getUserAttributes`` to obtain the model input.

    Returns:
        A dict with keys ``'username'`` and ``'predict'``, where ``'predict'``
        is the first-row probability of class 0 (the label assigned to
        'human' rows).

    Raises:
        ValueError: If ``URLFreqs.tsv`` contains no rows.
    """
    print("Importing CSV files into Pandas Dataframes...\n")
    botData = pd.read_csv('content_polluters.txt', sep='\t', header=0)
    humanData = pd.read_csv('legitimate_users.txt', sep='\t', header=0)

    print("Assigning types to data for Decision Tree Model...\n")
    assignType(botData, 'bot')
    assignType(humanData, 'human')

    print("Combining human and bot data into single dataframe...\n")
    userData = pd.concat([botData, humanData])

    print("Adding URL Frequencies to data frame...")
    # Only the last row of the file is used, as before.
    freqs = None
    with open("URLFreqs.tsv") as freq_file:
        for row in csv.reader(freq_file, delimiter='\t'):
            freqs = row
    if freqs is None:
        raise ValueError("URLFreqs.tsv contains no rows")
    # NOTE(review): pd.concat keeps each frame's own row labels, so labels
    # presumably repeat across bot and human rows, and this label-aligned
    # assignment gives rows sharing a label the same frequency. If the file is
    # meant to be positional over the combined table, concat with
    # ignore_index=True instead -- confirm against the data file.
    userData['URLFreq'] = pd.Series(freqs)

    print("Encoding dates into int form via label encoder...\n")
    encode(userData, 'createdAt')
    encode(userData, 'collectedAt')

    users_features = ['descLen', 'nameLen', 'numFollowers', 'numFollowings','numTweets', 'URLFreq']
    # Called for its side effect of fitting dt (and printing the CV scores).
    crossValScore(userData, users_features, "type")

    userAttributes = getUserAttributes(username)
    # predict_proba takes only the samples; the feature list used to be passed
    # as a second positional argument, where it was read as the check_input flag.
    predict = dt.predict_proba(userAttributes)
    print("raw results: ", predict)
    print("chance of being human: ", predict[0][0])
    tree.export_graphviz(dt, out_file='tree.dot')
    return {'username': username, 'predict': predict[0][0]}
def convToPng():
    """Render ``tree.dot`` to ``tree.png`` using pydot.

    Raises:
        ValueError: If no graph could be loaded from ``tree.dot``.
    """
    loaded = pydot.graph_from_dot_file('tree.dot')
    # Current pydot releases return a list of graphs, older ones a single
    # graph object; accept both.
    if isinstance(loaded, (list, tuple)):
        graph = loaded[0] if loaded else None
    else:
        graph = loaded
    if graph is None:
        raise ValueError("no graph could be loaded from tree.dot")
    graph.write_png('tree.png')