# -*- coding: utf-8 -*-
"""i181655.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1ldTiTeaEIDPknQotEotbbma6PK0TbH2F
"""
import os
import math

import spacy
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from google.colab import drive

nlp = spacy.blank('ur')  # blank Urdu pipeline: tokenizer only, no trained components

drive.mount('/content/drive')
"""Using the os library path to the Train folder is being given and a list of files in the real and fake is obtained. Using that list of file names the files are being read"""
path = "/content/drive/My Drive/NLP/Train/Real"
os.chdir(path)
real_files = os.listdir(path)
real_corpus = list()
for filename in real_files:
with open(os.path.join(path, filename), 'r') as f:
text = f.read()
real_corpus.append(text)
path = "/content/drive/My Drive/NLP/Train/Fake"
os.chdir(path)
fake_files = os.listdir(path)
fake_corpus = list()
for filename in fake_files:
with open(os.path.join(path, filename), 'r') as f:
text = f.read()
fake_corpus.append(text)
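"""The two reading loops above are identical apart from the folder; a minimal optional sketch of a reusable helper (read_folder is hypothetical and not part of the original pipeline):"""
def read_folder(folder):
    texts = list()
    for name in os.listdir(folder):  # read every file in the folder into one string each
        with open(os.path.join(folder, name), 'r') as fh:
            texts.append(fh.read())
    return texts
# e.g. real_corpus = read_folder("/content/drive/My Drive/NLP/Train/Real")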
"""In the function below the real and fake corpus is being tokenised into a list of words using spacy and vocabulary is being extracted
Making Vocabulary
"""
real_copy = real_corpus.copy()  # making copies so the actual corpus does not get modified
fake_copy = fake_corpus.copy()
def extract_vocab():  # extracting vocabulary from the training corpus
    vocab = list()  # list to contain all the words of the vocabulary
    for text in real_copy:  # traversing through the real news text files
        doc = nlp(text)  # using the spaCy tokenizer
        for word in doc:
            vocab.append(str(word))  # appending the word to the vocab list
    for text in fake_copy:  # traversing through the fake news text files
        doc = nlp(text)
        for word in doc:
            vocab.append(str(word))
    return vocab
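"""Membership tests against a Python list (`word in v`) scan the whole list; a minimal optional sketch, not part of the original pipeline, that converts the vocabulary to a set for constant-time lookups (identical results, only faster):"""
def to_lookup_set(vocab_list):
    # hypothetical helper: same membership semantics as the list, O(1) per lookup
    return set(vocab_list)
# e.g. pass to_lookup_set(v) instead of v into unigram()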
"""Making a copy of corpus just to make sure the original corpuses are not modified in any way"""
real_copy = list()
fake_copy = list()
real_copy = real_corpus.copy()
fake_copy = fake_corpus.copy()
"""To count the freq of words in both the real and fake files corpus"""
real_dict = dict() #dictionary to store unigram of words from the real corpus
fake_dict = dict() #dictionary to store unigram of words from the fake corpus
def unigram(v):
    for text in real_copy:  # traversing through the real news text files
        doc = nlp(text)
        for word in doc:
            if str(word) in v:  # if the word exists in the vocabulary
                real_dict[str(word)] = real_dict.get(str(word), 0) + 1  # initialise to 1 or increment the count
    for text in fake_copy:  # traversing through the fake news text files
        doc = nlp(text)
        for word in doc:
            if str(word) in v:
                fake_dict[str(word)] = fake_dict.get(str(word), 0) + 1
"""Function to count all the texts in the training corpus i.e all the files"""
def count_texts(r, f): #counting all the texts in the training corpus
n = len(r) + len(f) #both real and fake texts
return n
"""Function to count all the words in the class by using spacy to tokenise them and making a list of words"""
def count_all_words(class_text):
words_list = list() #to store all the word in the class
text = list()
text = class_text.copy()
i = 0
while i < len(class_text): #traversing through the class texts
doc = nlp(text[i])
for word in doc:
words_list.append(str(word)) #making a list of words by appending each word in the word list
i += 1
return len(words_list)
"""function to count how many times a word has appeared in a particular class"""
def count_token_of_words(word, Doc):
word_count = 0
word_count = Doc[word] #Doc is a dict that contains unigrams of all the words of the class
return word_count #returning the total number of times has has appeared in the said class
"""Making dict() for conditional probability, prior and score"""
cond_prob = dict() #dictionary to store all the conditional probabilities
cond_prob['Real'] = dict()
cond_prob['Fake'] = dict()
prior = dict() #dictionary to store prior of real and fake corpus
score = dict() #dictionary to store score of real and fake corpus
"""Training through the multinomial Naive Bayes theorem"""
def TrainMultinomialNB(v, r, f, real_cor, fake_cor, real_dic, fake_dic):
Vocabulary = v #extracting vocab of the whole corpus
N = count_texts(r, f) #counting all the texts in the corpus
Nc = len(r) #count texts in each class
Nw = count_all_words(real_cor) #count words in all the texts of the said class
prior['Real'] = Nc/N #calculate prior
Doc_c = real_dic #dict() containing all teh counts of the words that appeared
for w in Vocabulary: #if word in vocab
if w in Doc_c: #and word in doc
Ni = count_token_of_words(w, Doc_c) #get count of that word
cond_prob['Real'][w] = (Ni + 1) / (Nw + len(Vocabulary)) #calculate conditional probability
Nc = 0
Nw = 0
Doc_c = dict()
Nc = len(f) #count texts in each class
Nw = count_all_words(fake_cor) #count words in all the texts of the said class
prior['Fake'] = Nc/N #calculate prior
Doc_c = fake_dic #dict() containing all teh counts of the words that appeared
for w1 in Vocabulary: #if word in vocab
if w1 in Doc_c: #and word in doc
Ni = count_token_of_words(w1, Doc_c) #get count of that word
cond_prob['Fake'][w1] = (Ni + 1) / (Nw + len(Vocabulary)) #calculate conditional probability
return Vocabulary, prior, cond_prob #return vocab, prior and conditional prob
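"""A small illustration of the add-one (Laplace) smoothing formula used above, with toy numbers (hypothetical counts, not from the corpus):"""
def laplace(count, class_tokens, vocab_size):
    # (N_i + 1) / (N_w + |V|), as in TrainMultinomialNB
    return (count + 1) / (class_tokens + vocab_size)
print(laplace(3, 100, 50))  # word seen 3 times: 4/150 ≈ 0.0267
print(laplace(0, 100, 50))  # an unseen word would get 1/150 ≈ 0.0067, but the training loop above skips words absent from the class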
v = extract_vocab() #extracting vocab
unigram(v) #making unigrams of both classes
Vocab, Prior, Cond_prob = TrainMultinomialNB(v, real_files, fake_files, real_corpus, fake_corpus, real_dict, fake_dict) #training through Naive Bayes
"""Function to extract all the words from text by using spacy """
def extractWordsFromText(text):
w_list = list() #list to append the words in
doc = nlp(text)
for word in doc: #for each word in the doc
w_list.append(str(word)) #append word into the word list
return w_list #return list of words
"""Applying the Multinomial Theorem"""
def ApplyMultinomialNB(Cond_prob, Prior, test):
Word_list = list()
Word_list = extractWordsFromText(test) #getting all the words from the text
score['Real'] = math.log(Prior['Real']) #get prior of the class
for w in Word_list: #for each word in the list
if str(w) in Cond_prob['Real']:
score['Real'] += math.log(Cond_prob['Real'][w]) #get its conditional prob and add it to the prior and so on
score['Fake'] = math.log(Prior['Fake']) #get prior of the class
for w in Word_list: #for each word in the list
if str(w) in Cond_prob['Fake']:
score['Fake'] += math.log(Cond_prob['Fake'][w]) #get its conditional prob and add it to the prior and so on
return score #return score
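"""A minimal usage sketch: score one text and pick the higher-scoring class (a training text is reused here purely as a smoke test, not as an evaluation):"""
sample = real_corpus[0]
scores = ApplyMultinomialNB(Cond_prob, Prior, sample)
print(max(scores, key=scores.get))  # prints 'Real' or 'Fake', whichever log score is higher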
def Detection():
    path = "/content/drive/My Drive/NLP/Test/Real"
    r_files = os.listdir(path)  # list of all the txt file names in the real test folder
    path2 = "/content/drive/My Drive/NLP/Test/Fake"
    f_files = os.listdir(path2)  # list of all the txt file names in the fake test folder
    y_true = list()  # true labels of the test texts
    y_pred = list()  # labels predicted by the model
    for filename in r_files:  # traversing through the real test files
        with open(os.path.join(path, filename), 'r') as f:  # reading each text from the real class
            test = f.read()
        final = ApplyMultinomialNB(Cond_prob, Prior, test)  # applying the multinomial algorithm
        argmax = max(final, key=final.get)  # class with the max score
        y_true.append('Real')
        y_pred.append(argmax)
    for filename in f_files:  # traversing through the fake test files
        with open(os.path.join(path2, filename), 'r') as f:  # reading each text from the fake class
            test = f.read()
        final = ApplyMultinomialNB(Cond_prob, Prior, test)  # applying the multinomial algorithm
        argmax = max(final, key=final.get)  # class with the max score
        y_true.append('Fake')
        y_pred.append(argmax)
    accuracy = accuracy_score(y_true, y_pred)  # getting accuracy
    precision = precision_score(y_true, y_pred, average='macro')  # getting precision
    recall = recall_score(y_true, y_pred, average='macro')  # getting recall
    f1 = f1_score(y_true, y_pred, average='macro')  # getting the F1 measure
    print("Accuracy = ", accuracy)
    print("Precision = ", precision)
    print("Recall = ", recall)
    print("F1 = ", f1)
"""Without Removing stopwords and duplicates from each text"""
Detection() #Detecting real vs fake news
"""After removing Stopwords
Reading the stop words file
"""
with open('/content/drive/My Drive/NLP/stopwords-ur.txt', mode = 'r') as f:
stopwords = f.read()
f.close()
"""Making a copy of corpus just to make sure the original corpuses are not modified in any way.
Doing this again for without the stopwords and text duplicates
"""
real_copy = list()
fake_copy = list()
real_copy = real_corpus.copy()
fake_copy = fake_corpus.copy()
"""Extracting vocabulary and removing stop words"""
def extract_vocab_no_stopwords(): #extracting vocabulary from the training corpus
vocab = list() #list to contain all the words of the vocabulary
i = 0
while i < len(real_corpus): #traversing through the real news text files
doc = nlp(real_copy[i])
for word in doc:
if str(word) not in stopwords: #if word is not in the stopwords list only then add to the vocab
vocab.append(str(word))
i += 1
i = 0
while i < len(fake_corpus): #traversing through the fake news text files
doc = nlp(fake_copy[i])
for word in doc:
if str(word) not in stopwords: #if word is not in the stopwords list only then add to the vocab
vocab.append(str(word))
i += 1
return vocab
"""Making unigram of the corpuses without stopwords"""
real_dict = dict() #dictionary to store unigram of words from the real corpus
fake_dict = dict() #dictionary to store unigram of words from the fake corpus
real_copy = list()
fake_copy = list()
real_copy = real_corpus.copy()
fake_copy = fake_corpus.copy()
def unigram_without_sw(v):
    for text in real_copy:  # traversing through the real news text files
        for word in nlp(text):
            if str(word) not in stopwords and str(word) in v:  # skip stopwords; count only vocabulary words
                real_dict[str(word)] = real_dict.get(str(word), 0) + 1  # initialise to 1 or increment the count
    for text in fake_copy:  # traversing through the fake news text files
        for word in nlp(text):
            if str(word) not in stopwords and str(word) in v:
                fake_dict[str(word)] = fake_dict.get(str(word), 0) + 1
"""With only stopwords words removed"""
cond_prob = dict() #dictionary to store all the conditional probabilities
cond_prob['Real'] = dict()
cond_prob['Fake'] = dict()
prior = dict() #dictionary to store prior of real and fake corpus
score = dict() #dictionary to store score of real and fake corpus
v = extract_vocab_no_stopwords() #extracting vocab
unigram_without_sw(v) #making unigrams of both classes
Vocab, Prior, Cond_prob = TrainMultinomialNB(v, real_files, fake_files, real_corpus, fake_corpus, real_dict, fake_dict) #training through Naive Bayes
Detection() #Detection of real vs fake news using Boolean Naive Bayes
"""Removing duplicates from each text
"""
real_copy = list()
fake_copy = list()
real_copy = real_corpus.copy()
fake_copy = fake_corpus.copy()
noDup_real = dict() #to store count of each word appearing in a real texts
noDup_fake = dict() #to store count of each word appearing in a fake texts
real_corpus_noDup = list() #list to store real text after removal of duplicates
fake_corpus_noDup = list() #list to store fake text after removal of duplicates
def remove_duplicates():
    for text in real_copy:  # traversing through the real corpus
        done = ''
        seen = set()  # words already visited in this text (a set avoids substring false positives)
        for word in text.split():  # splitting the text into words
            if word not in seen:  # if the word has not been visited yet, add it
                noDup_real[word] = noDup_real.get(word, 0) + 1  # count one occurrence per text it appears in
                done = done + word + ' '  # concatenate words to form the text without any duplicates
                seen.add(word)
        real_corpus_noDup.append(done)  # append to the real corpus texts list
    for text in fake_copy:  # traversing through the fake corpus
        done = ''
        seen = set()
        for word in text.split():
            if word not in seen:
                noDup_fake[word] = noDup_fake.get(word, 0) + 1
                done = done + word + ' '
                seen.add(word)
        fake_corpus_noDup.append(done)  # append to the fake corpus texts list
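"""An equivalent, more idiomatic per-text dedup sketch (dict.fromkeys keeps first-occurrence order; dedup_text is a hypothetical helper, not used by the pipeline):"""
def dedup_text(text):
    return ' '.join(dict.fromkeys(text.split()))
# e.g. dedup_text('a b a c b') returns 'a b c'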
"""BOOLEAN NAIVE BAYES
(With stopwords but duplicates removed from the coupus)
"""
cond_prob = dict() #dictionary to store all the conditional probabilities
cond_prob['Real'] = dict()
cond_prob['Fake'] = dict()
real_copy = real_corpus.copy()
fake_copy = fake_corpus.copy()
prior = dict() #dictionary to store prior of real and fake corpus
score = dict() #dictionary to store score of real and fake corpus
v = extract_vocab()
remove_duplicates() #remove duplicates from each text
Vocab, Prior, Cond_prob = TrainMultinomialNB(v, real_files, fake_files, real_corpus_noDup, fake_corpus_noDup, noDup_real, noDup_fake) #Training through multinomial Naive Bayes
Detection() #Detection of real vs fake news using Boolean Naive Bayes
"""Removing Stopwords and duplicates from each text (Basically removing tsopwords from Boolean Naive Bayes)"""
def remove_duplicates_and_stopwords():
    for text in real_copy:  # traversing through the real corpus
        done = ''
        seen = set()  # words already visited in this text
        for word in text.split():  # splitting the text into words
            if word not in stopwords and word not in seen:  # skip stopwords and already-visited words
                noDup_real[word] = noDup_real.get(word, 0) + 1  # count one occurrence per text it appears in
                done = done + word + ' '  # concatenate words to form the text without any duplicates
                seen.add(word)
        real_corpus_noDup.append(done)  # append to the real corpus texts list
    for text in fake_copy:  # traversing through the fake corpus
        done = ''
        seen = set()
        for word in text.split():
            if word not in stopwords and word not in seen:
                noDup_fake[word] = noDup_fake.get(word, 0) + 1
                done = done + word + ' '
                seen.add(word)
        fake_corpus_noDup.append(done)  # append to the fake corpus texts list
cond_prob = dict()  # dictionary to store all the conditional probabilities
cond_prob['Real'] = dict()
cond_prob['Fake'] = dict()
prior = dict()  # dictionary to store the prior of the real and fake corpora
score = dict()  # dictionary to store the score of the real and fake corpora
noDup_real = dict()  # reset the per-class counts so the previous run does not leak in
noDup_fake = dict()
real_corpus_noDup = list()  # reset the deduplicated corpora as well
fake_corpus_noDup = list()
v = extract_vocab_no_stopwords()  # extracting vocab without any stopwords in it
remove_duplicates_and_stopwords()  # remove duplicates and stopwords from each text
Vocab, Prior, Cond_prob = TrainMultinomialNB(v, real_files, fake_files, real_corpus_noDup, fake_corpus_noDup, noDup_real, noDup_fake)  # training through multinomial Naive Bayes
Detection()  # detection of real vs fake news after removing stopwords and duplicates
"""
**CASES:**
1. Without Removing stopwords and duplicates
2. Removing stopwords only
1. Removing Duplicates only
2. Removing Both Stopwords and Duplicates
"""