# -*- coding: utf-8 -*-
"""i181655.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1ldTiTeaEIDPknQotEotbbma6PK0TbH2F
"""
import os
import math

import spacy
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from google.colab import drive

nlp = spacy.blank('ur')  # blank Urdu pipeline: tokenizer only, no trained components

drive.mount('/content/drive')
"""Using the os library path to the Train folder is being given and a list of files in the real and fake is obtained. Using that list of file names the files are being read"""
path = "/content/drive/My Drive/NLP/Train/Real"
os.chdir(path)
real_files = os.listdir(path)
real_corpus = list()
for filename in real_files:
with open(os.path.join(path, filename), 'r') as f:
text = f.read()
real_corpus.append(text)
path = "/content/drive/My Drive/NLP/Train/Fake"
os.chdir(path)
fake_files = os.listdir(path)
fake_corpus = list()
for filename in fake_files:
with open(os.path.join(path, filename), 'r') as f:
text = f.read()
fake_corpus.append(text)
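"""The two reading loops above are identical apart from the folder; a minimal optional sketch of a reusable helper (read_folder is hypothetical and not part of the original pipeline):"""
def read_folder(folder):
    texts = list()
    for name in os.listdir(folder):  # read every file in the folder into one string each
        with open(os.path.join(folder, name), 'r') as fh:
            texts.append(fh.read())
    return texts
# e.g. real_corpus = read_folder("/content/drive/My Drive/NLP/Train/Real")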
"""In the function below the real and fake corpus is being tokenised into a list of words using spacy and vocabulary is being extracted
Making Vocabulary
"""
real_copy = real_corpus.copy()  # making copies so the actual corpus does not get modified
fake_copy = fake_corpus.copy()
def extract_vocab():  # extracting vocabulary from the training corpus
    vocab = list()  # list to contain all the words of the vocabulary
    for text in real_copy:  # traversing through the real news text files
        doc = nlp(text)  # using the spaCy tokenizer
        for word in doc:
            vocab.append(str(word))  # appending the word to the vocab list
    for text in fake_copy:  # traversing through the fake news text files
        doc = nlp(text)
        for word in doc:
            vocab.append(str(word))
    return vocab
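"""Membership tests against a Python list (`word in v`) scan the whole list; a minimal optional sketch, not part of the original pipeline, that converts the vocabulary to a set for constant-time lookups (identical results, only faster):"""
def to_lookup_set(vocab_list):
    # hypothetical helper: same membership semantics as the list, O(1) per lookup
    return set(vocab_list)
# e.g. pass to_lookup_set(v) instead of v into unigram()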
"""Making a copy of corpus just to make sure the original corpuses are not modified in any way"""
real_copy = list()
fake_copy = list()
real_copy = real_corpus.copy()
fake_copy = fake_corpus.copy()
"""To count the freq of words in both the real and fake files corpus"""
real_dict = dict() #dictionary to store unigram of words from the real corpus
fake_dict = dict() #dictionary to store unigram of words from the fake corpus
def unigram(v):
    for text in real_copy:  # traversing through the real news text files
        doc = nlp(text)
        for word in doc:
            if str(word) in v:  # if the word exists in the vocabulary
                real_dict[str(word)] = real_dict.get(str(word), 0) + 1  # initialise to 1 or increment the count
    for text in fake_copy:  # traversing through the fake news text files
        doc = nlp(text)
        for word in doc:
            if str(word) in v:
                fake_dict[str(word)] = fake_dict.get(str(word), 0) + 1
"""Function to count all the texts in the training corpus i.e all the files"""
def count_texts(r, f): #counting all the texts in the training corpus
n = len(r) + len(f) #both real and fake texts
return n
"""Function to count all the words in the class by using spacy to tokenise them and making a list of words"""
def count_all_words(class_text):
words_list = list() #to store all the word in the class
text = list()
text = class_text.copy()
i = 0
while i < len(class_text): #traversing through the class texts
doc = nlp(text[i])
for word in doc:
words_list.append(str(word)) #making a list of words by appending each word in the word list
i += 1
return len(words_list)
"""function to count how many times a word has appeared in a particular class"""
def count_token_of_words(word, Doc):
word_count = 0
word_count = Doc[word] #Doc is a dict that contains unigrams of all the words of the class
return word_count #returning the total number of times has has appeared in the said class
"""Making dict() for conditional probability, prior and score"""
cond_prob = dict() #dictionary to store all the conditional probabilities
cond_prob['Real'] = dict()
cond_prob['Fake'] = dict()
prior = dict() #dictionary to store prior of real and fake corpus
score = dict() #dictionary to store score of real and fake corpus
"""Training through the multinomial Naive Bayes theorem"""
def TrainMultinomialNB(v, r, f, real_cor, fake_cor, real_dic, fake_dic):
Vocabulary = v #extracting vocab of the whole corpus
N = count_texts(r, f) #counting all the texts in the corpus
Nc = len(r) #count texts in each class
Nw = count_all_words(real_cor) #count words in all the texts of the said class
prior['Real'] = Nc/N #calculate prior
Doc_c = real_dic #dict() containing all teh counts of the words that appeared
for w in Vocabulary: #if word in vocab
if w in Doc_c: #and word in doc
Ni = count_token_of_words(w, Doc_c) #get count of that word
cond_prob['Real'][w] = (Ni + 1) / (Nw + len(Vocabulary)) #calculate conditional probability
Nc = 0
Nw = 0
Doc_c = dict()
Nc = len(f) #count texts in each class
Nw = count_all_words(fake_cor) #count words in all the texts of the said class
prior['Fake'] = Nc/N #calculate prior
Doc_c = fake_dic #dict() containing all teh counts of the words that appeared
for w1 in Vocabulary: #if word in vocab
if w1 in Doc_c: #and word in doc
Ni = count_token_of_words(w1, Doc_c) #get count of that word
cond_prob['Fake'][w1] = (Ni + 1) / (Nw + len(Vocabulary)) #calculate conditional probability
return Vocabulary, prior, cond_prob #return vocab, prior and conditional prob
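"""A small illustration of the add-one (Laplace) smoothing formula used above, with toy numbers (hypothetical counts, not from the corpus):"""
def laplace(count, class_tokens, vocab_size):
    # (N_i + 1) / (N_w + |V|), as in TrainMultinomialNB
    return (count + 1) / (class_tokens + vocab_size)
print(laplace(3, 100, 50))  # word seen 3 times: 4/150 ≈ 0.0267
print(laplace(0, 100, 50))  # an unseen word would get 1/150 ≈ 0.0067, but the training loop above skips words absent from the class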
v = extract_vocab() #extracting vocab
unigram(v) #making unigrams of both classes
Vocab, Prior, Cond_prob = TrainMultinomialNB(v, real_files, fake_files, real_corpus, fake_corpus, real_dict, fake_dict) #training through Naive Bayes
"""Function to extract all the words from text by using spacy """
def extractWordsFromText(text):
w_list = list() #list to append the words in
doc = nlp(text)
for word in doc: #for each word in the doc
w_list.append(str(word)) #append word into the word list
return w_list #return list of words
"""Applying the Multinomial Theorem"""
def ApplyMultinomialNB(Cond_prob, Prior, test):
Word_list = list()
Word_list = extractWordsFromText(test) #getting all the words from the text
score['Real'] = math.log(Prior['Real']) #get prior of the class
for w in Word_list: #for each word in the list
if str(w) in Cond_prob['Real']:
score['Real'] += math.log(Cond_prob['Real'][w]) #get its conditional prob and add it to the prior and so on
score['Fake'] = math.log(Prior['Fake']) #get prior of the class
for w in Word_list: #for each word in the list
if str(w) in Cond_prob['Fake']:
score['Fake'] += math.log(Cond_prob['Fake'][w]) #get its conditional prob and add it to the prior and so on
return score #return score
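"""A minimal usage sketch: score one text and pick the higher-scoring class (a training text is reused here purely as a smoke test, not as an evaluation):"""
sample = real_corpus[0]
scores = ApplyMultinomialNB(Cond_prob, Prior, sample)
print(max(scores, key=scores.get))  # prints 'Real' or 'Fake', whichever log score is higher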
def Detection():
    path = "/content/drive/My Drive/NLP/Test/Real"
    r_files = os.listdir(path)  # list of all the txt file names in the real test folder
    path2 = "/content/drive/My Drive/NLP/Test/Fake"
    f_files = os.listdir(path2)  # list of all the txt file names in the fake test folder
    y_true = list()  # true labels of the test texts
    y_pred = list()  # labels predicted by the model
    for filename in r_files:  # traversing through the real test files
        with open(os.path.join(path, filename), 'r') as f:  # reading each text from the real class
            test = f.read()
        final = ApplyMultinomialNB(Cond_prob, Prior, test)  # applying the multinomial algorithm
        argmax = max(final, key=final.get)  # class with the max score
        y_true.append('Real')
        y_pred.append(argmax)
    for filename in f_files:  # traversing through the fake test files
        with open(os.path.join(path2, filename), 'r') as f:  # reading each text from the fake class
            test = f.read()
        final = ApplyMultinomialNB(Cond_prob, Prior, test)  # applying the multinomial algorithm
        argmax = max(final, key=final.get)  # class with the max score
        y_true.append('Fake')
        y_pred.append(argmax)
    accuracy = accuracy_score(y_true, y_pred)  # getting accuracy
    precision = precision_score(y_true, y_pred, average='macro')  # getting precision
    recall = recall_score(y_true, y_pred, average='macro')  # getting recall
    f1 = f1_score(y_true, y_pred, average='macro')  # getting the F1 measure
    print("Accuracy = ", accuracy)
    print("Precision = ", precision)
    print("Recall = ", recall)
    print("F1 = ", f1)
"""Without Removing stopwords and duplicates from each text"""
Detection() #Detecting real vs fake news
"""After removing Stopwords
Reading the stop words file
"""
with open('/content/drive/My Drive/NLP/stopwords-ur.txt', mode = 'r') as f:
stopwords = f.read()
f.close()
"""Making a copy of corpus just to make sure the original corpuses are not modified in any way.
Doing this again for without the stopwords and text duplicates
"""
real_copy = list()
fake_copy = list()
real_copy = real_corpus.copy()
fake_copy = fake_corpus.copy()
"""Extracting vocabulary and removing stop words"""
def extract_vocab_no_stopwords(): #extracting vocabulary from the training corpus
vocab = list() #list to contain all the words of the vocabulary
i = 0
while i < len(real_corpus): #traversing through the real news text files
doc = nlp(real_copy[i])
for word in doc:
if str(word) not in stopwords: #if word is not in the stopwords list only then add to the vocab
vocab.append(str(word))
i += 1
i = 0
while i < len(fake_corpus): #traversing through the fake news text files
doc = nlp(fake_copy[i])
for word in doc:
if str(word) not in stopwords: #if word is not in the stopwords list only then add to the vocab
vocab.append(str(word))
i += 1
return vocab
"""Making unigram of the corpuses without stopwords"""
real_dict = dict() #dictionary to store unigram of words from the real corpus
fake_dict = dict() #dictionary to store unigram of words from the fake corpus
real_copy = list()
fake_copy = list()
real_copy = real_corpus.copy()
fake_copy = fake_corpus.copy()
def unigram_without_sw(v):
    for text in real_copy:  # traversing through the real news text files
        for word in nlp(text):
            if str(word) not in stopwords and str(word) in v:  # skip stopwords; count only vocabulary words
                real_dict[str(word)] = real_dict.get(str(word), 0) + 1  # initialise to 1 or increment the count
    for text in fake_copy:  # traversing through the fake news text files
        for word in nlp(text):
            if str(word) not in stopwords and str(word) in v:
                fake_dict[str(word)] = fake_dict.get(str(word), 0) + 1
"""With only stopwords words removed"""
cond_prob = dict() #dictionary to store all the conditional probabilities
cond_prob['Real'] = dict()
cond_prob['Fake'] = dict()
prior = dict() #dictionary to store prior of real and fake corpus
score = dict() #dictionary to store score of real and fake corpus
v = extract_vocab_no_stopwords() #extracting vocab
unigram_without_sw(v) #making unigrams of both classes
Vocab, Prior, Cond_prob = TrainMultinomialNB(v, real_files, fake_files, real_corpus, fake_corpus, real_dict, fake_dict) #training through Naive Bayes
Detection() #Detection of real vs fake news using Boolean Naive Bayes
"""Removing duplicates from each text
"""
real_copy = list()
fake_copy = list()
real_copy = real_corpus.copy()
fake_copy = fake_corpus.copy()
noDup_real = dict() #to store count of each word appearing in a real texts
noDup_fake = dict() #to store count of each word appearing in a fake texts
real_corpus_noDup = list() #list to store real text after removal of duplicates
fake_corpus_noDup = list() #list to store fake text after removal of duplicates
def remove_duplicates():
    for text in real_copy:  # traversing through the real corpus
        done = ''
        seen = set()  # words already visited in this text (a set avoids substring false positives)
        for word in text.split():  # splitting the text into words
            if word not in seen:  # if the word has not been visited yet, add it
                noDup_real[word] = noDup_real.get(word, 0) + 1  # count one occurrence per text it appears in
                done = done + word + ' '  # concatenate words to form the text without any duplicates
                seen.add(word)
        real_corpus_noDup.append(done)  # append to the real corpus texts list
    for text in fake_copy:  # traversing through the fake corpus
        done = ''
        seen = set()
        for word in text.split():
            if word not in seen:
                noDup_fake[word] = noDup_fake.get(word, 0) + 1
                done = done + word + ' '
                seen.add(word)
        fake_corpus_noDup.append(done)  # append to the fake corpus texts list
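"""An equivalent, more idiomatic per-text dedup sketch (dict.fromkeys keeps first-occurrence order; dedup_text is a hypothetical helper, not used by the pipeline):"""
def dedup_text(text):
    return ' '.join(dict.fromkeys(text.split()))
# e.g. dedup_text('a b a c b') returns 'a b c'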
"""BOOLEAN NAIVE BAYES
(With stopwords but duplicates removed from the coupus)
"""
cond_prob = dict() #dictionary to store all the conditional probabilities
cond_prob['Real'] = dict()
cond_prob['Fake'] = dict()
real_copy = real_corpus.copy()
fake_copy = fake_corpus.copy()
prior = dict() #dictionary to store prior of real and fake corpus
score = dict() #dictionary to store score of real and fake corpus
v = extract_vocab()
remove_duplicates() #remove duplicates from each text
Vocab, Prior, Cond_prob = TrainMultinomialNB(v, real_files, fake_files, real_corpus_noDup, fake_corpus_noDup, noDup_real, noDup_fake) #Training through multinomial Naive Bayes
Detection() #Detection of real vs fake news using Boolean Naive Bayes
"""Removing Stopwords and duplicates from each text (Basically removing tsopwords from Boolean Naive Bayes)"""
def remove_duplicates_and_stopwords():
    for text in real_copy:  # traversing through the real corpus
        done = ''
        seen = set()  # words already visited in this text
        for word in text.split():  # splitting the text into words
            if word not in stopwords and word not in seen:  # skip stopwords and already-visited words
                noDup_real[word] = noDup_real.get(word, 0) + 1  # count one occurrence per text it appears in
                done = done + word + ' '  # concatenate words to form the text without any duplicates
                seen.add(word)
        real_corpus_noDup.append(done)  # append to the real corpus texts list
    for text in fake_copy:  # traversing through the fake corpus
        done = ''
        seen = set()
        for word in text.split():
            if word not in stopwords and word not in seen:
                noDup_fake[word] = noDup_fake.get(word, 0) + 1
                done = done + word + ' '
                seen.add(word)
        fake_corpus_noDup.append(done)  # append to the fake corpus texts list
cond_prob = dict()  # dictionary to store all the conditional probabilities
cond_prob['Real'] = dict()
cond_prob['Fake'] = dict()
prior = dict()  # dictionary to store the prior of the real and fake corpora
score = dict()  # dictionary to store the score of the real and fake corpora
noDup_real = dict()  # reset the per-class counts so the previous run does not leak in
noDup_fake = dict()
real_corpus_noDup = list()  # reset the deduplicated corpora as well
fake_corpus_noDup = list()
v = extract_vocab_no_stopwords()  # extracting vocab without any stopwords in it
remove_duplicates_and_stopwords()  # remove duplicates and stopwords from each text
Vocab, Prior, Cond_prob = TrainMultinomialNB(v, real_files, fake_files, real_corpus_noDup, fake_corpus_noDup, noDup_real, noDup_fake)  # training through multinomial Naive Bayes
Detection()  # detection of real vs fake news after removing stopwords and duplicates
"""
**CASES:**
1. Without Removing stopwords and duplicates
2. Removing stopwords only
1. Removing Duplicates only
2. Removing Both Stopwords and Duplicates
"""