-
Notifications
You must be signed in to change notification settings - Fork 0
/
Count_occurences.py
63 lines (58 loc) · 1.48 KB
/
Count_occurences.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# Count how often a target word occurs in each cleaned text file listed in
# ./To_download_links.txt, print per-file statistics, and plot the counts.

# Word to search for; it is compared against lower-cased, symbol-stripped words.
guess = "pakistan"
# Whitespace-separated symbols that are removed from every word before comparing.
replace = ", . - ' ; : / ™ [ ] { } ( ) * - + & ! @ # $ % ^ _ = ` ~"
# Change the value of this variable to the number of files that have been parsed;
# only the first `max1` links of the links file are processed.
max1 = 43


def strip_symbols(word, symbols):
    """Return *word* with every string in *symbols* removed."""
    for symbol in symbols:
        word = word.replace(symbol, "")
    return word


def count_occurrences(lines, target, symbols):
    """Count matches of *target* in an iterable of byte lines.

    Args:
        lines: iterable of ``bytes`` lines (e.g. a file opened in ``'rb'`` mode).
        target: lower-case word to look for.
        symbols: strings stripped from each word before it is compared.

    Returns:
        A ``(matches, total_words)`` tuple, both accumulated over all lines.
        Words that are not valid UTF-8 are skipped and not counted at all.
    """
    matches = 0
    total_words = 0
    for line in lines:
        for raw_word in line.lower().split():
            try:
                word = raw_word.decode('utf-8')
            except UnicodeDecodeError:
                # Best effort: an undecodable word is simply ignored.
                continue
            total_words += 1
            if strip_symbols(word, symbols).lower() == target:
                matches += 1
    return matches, total_words


def plot_occurrences(day_numbers, occurrences):
    """Show a line plot of *occurrences* against *day_numbers*."""
    # Imported here so the counting helpers can be used without the
    # plotting stack being loaded.
    from pandas import DataFrame
    import matplotlib.pyplot as plt

    data = {'Day_number': day_numbers, 'Occurences': occurrences}
    frame = DataFrame(data, columns=['Day_number', 'Occurences'])
    frame.plot(x='Day_number', y='Occurences', kind='line')
    plt.show()


def main():
    """Process the first `max1` links, print statistics and plot the counts."""
    with open("./To_download_links.txt", 'r') as links_file:
        links = links_file.read().split()

    symbols = replace.split()
    ls = []
    for link in links[:max1]:
        # The cleaned text file is named after the last path component of the link.
        name = link.split("/")[-1]
        print(name)
        try:
            with open("./Better_cleaned/" + name + '.txt', 'rb') as f:
                counter, total_words = count_occurrences(f, guess, symbols)
        except OSError as err:
            # A missing or unreadable file is skipped, but no longer silently.
            print("Skipping", name, "-", err)
            continue

        # A file without any countable word has a percentage of zero instead
        # of raising ZeroDivisionError.
        percentage = counter / total_words * 100 if total_words else 0.0
        print(counter)
        print(total_words)
        print("Percentage = ", percentage)
        print()
        ls.append(counter)

    # One running number per successfully processed file, so both plot
    # columns always have the same length.
    rs = list(range(1, len(ls) + 1))
    # The counts are plotted in the reverse of the links-file order
    # (presumably the links are listed newest first - TODO confirm).
    ls.reverse()
    print(len(ls), len(rs))

    # this is the graph plotting section of the code
    plot_occurrences(rs, ls)


if __name__ == "__main__":
    main()
# convert the file to Excel, then to CSV, for use in R