-
Notifications
You must be signed in to change notification settings - Fork 1
/
PlagiarismChecker.py
107 lines (89 loc) · 3.26 KB
/
PlagiarismChecker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
"""
----------------------------------------
Plagiarism Checker
----------------------------------------
With content creation and blogging now among the popular
businesses in the market, many people want to try their
hand at it, but some lack sufficient funds to give their
articles a plagiarism check, since most plagiarism
checkers are not free. A plagiarism checker can be built
in Python using a natural-language-processing library,
together with a search API to scan the first few pages
of Google and detect plagiarism, if any.
----------------------------------------
"""
import re
import nltk; nltk.download('punkt')
from nltk.util import ngrams, pad_sequence, everygrams
from nltk.tokenize import word_tokenize
from nltk.lm import MLE, WittenBellInterpolated
import numpy as np
import plotly.graph_objects as go
from scipy.ndimage import gaussian_filter
# --------------------------------------------------------------------------
# Training: fit an n-gram language model on the reference (original) text.
# --------------------------------------------------------------------------
# Path of the reference document the test text will be compared against.
train_data_file = ""  # TODO: set to the training-corpus path before running

# Read the training corpus; lower-case for case-insensitive comparison.
# Explicit encoding avoids platform-dependent decoding (e.g. cp1252 on Windows).
with open(train_data_file, encoding="utf-8") as f:
    train_text = f.read().lower()

# Preprocessing: remove text inside square/curly brackets, then strip
# punctuation so only word characters and whitespace remain.
train_text = re.sub(r"\[.*\]|\{.*\}", "", train_text)
train_text = re.sub(r"[^\w\s]", "", train_text)

# n-gram order of the language model.
n = 4

# Tokenize and left-pad with (n-1) "<s>" symbols so every token position
# has a full history for scoring.
training_data = list(pad_sequence(word_tokenize(train_text), n,
                                  pad_left=True,
                                  left_pad_symbol="<s>"))

# Generate all 1..n-grams for training.
# NOTE: named `train_ngrams` (not `ngrams`) so it does not shadow the
# `ngrams` function imported from nltk.util above.
train_ngrams = list(everygrams(training_data, max_len=n))
print("Number of ngrams:", len(train_ngrams))

# Build and fit a Witten-Bell interpolated n-gram language model.
model = WittenBellInterpolated(n)
model.fit([train_ngrams], vocabulary_text=training_data)
print(model.vocab)
# --------------------------------------------------------------------------
# Testing: score the document under test against the trained model.
# --------------------------------------------------------------------------
# Path of the document being checked for plagiarism.
test_data_file = ""  # TODO: set to the document-under-test path before running

# Read the test document; lower-cased to match training preprocessing.
# Explicit encoding avoids platform-dependent decoding.
with open(test_data_file, encoding="utf-8") as f:
    test_text = f.read().lower()
# Strip punctuation.
# NOTE(review): unlike the training text, bracketed spans ([...] / {...})
# are NOT removed here — confirm this asymmetry is intentional.
test_text = re.sub(r"[^\w\s]", "", test_text)

# Tokenize and left-pad with (n-1) "<s>" symbols, mirroring the training data.
testing_data = list(pad_sequence(word_tokenize(test_text), n,
                                 pad_left=True,
                                 left_pad_symbol="<s>"))
print("Length of test data:", len(testing_data))

# Score each token given its (n-1)-token history. A high probability means
# the model trained on the reference text predicts this wording well —
# i.e. the passage is likely copied.
scores = [model.score(token, testing_data[i:i + n - 1])
          for i, token in enumerate(testing_data[n - 1:])]
scores_np = np.array(scores)
# --------------------------------------------------------------------------
# Visualization: lay the per-token scores out on a rectangular grid and
# render them as a Plotly heatmap (one row of `width` tokens per grid row).
# --------------------------------------------------------------------------
# set width (tokens per row) and height (number of rows)
width = 8
# NOTE(review): height is derived from len(testing_data), but `scores_np`
# holds only len(testing_data) - (n-1) entries — confirm the intended grid
# size; the surplus cells are zero-filled below.
height = np.ceil(len(testing_data)/width).astype("int32")
print("Width, Height:", width, ",", height)
# copy scores into a rectangular zero-filled array; trailing cells beyond
# the last score stay 0
a = np.zeros(width*height)
a[:len(scores_np)] = scores_np
# number of zero-padded cells at the end of the grid
diff = len(a) - len(scores_np)
# apply gaussian smoothing for aesthetics (visual only — blurs scores
# across neighbouring cells)
a = gaussian_filter(a, sigma=1.0)
# reshape the flat array into (height, width) rows for the heatmap
a = a.reshape(-1, width)
# format labels: one text row per grid row, starting at token n-1 so the
# labels line up with the first scored token
# NOTE(review): this produces ceil((len(testing_data)-(n-1))/width) label
# rows, which can be one fewer than `height`, and `diff` (grid padding) is
# reused to pad the last label row — verify both stay aligned with `a`.
labels = [" ".join(testing_data[i:i+width]) for i in range(n-1, len(testing_data), width)]
labels_individual = [x.split() for x in labels]
labels_individual[-1] += [""]*diff
# pad/truncate every label to exactly 60 characters for a tidy y-axis
labels = [f"{x:60.60}" for x in labels]
# create heatmap: z = smoothed scores, y = token-row labels, hover shows
# the individual token plus its score
fig = go.Figure(data=go.Heatmap(
    z=a, x0=0, dx=1,
    y=labels, zmin=0, zmax=1,
    customdata=labels_individual,
    hovertemplate='%{customdata} <br><b>Score:%{z:.3f}<extra></extra>',
    colorscale="burg"))
fig.update_layout({"height":height*28, "width":1000, "font":{"family":"Courier New"}})
# reverse the y-axis so the document reads top-to-bottom
fig['layout']['yaxis']['autorange'] = "reversed"
fig.show()