-
Notifications
You must be signed in to change notification settings - Fork 1
/
sentence_score_summarizer.py
61 lines (50 loc) · 1.73 KB
/
sentence_score_summarizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import bs4 as bs
import urllib.request
import re
import nltk
nltk.download('stopwords')
import heapq
def main_func(text):
    """Produce a short extractive summary of *text*.

    Each sentence is scored by summing the normalized frequencies of its
    non-stopword words; the 5 best-scoring sentences shorter than 25 words
    are concatenated (each followed by a space) and returned.

    Parameters:
        text (str): raw article text.

    Returns:
        str: the summary, or "" when the text contains no scorable words
        (empty input, or stopwords/punctuation only).
    """
    # Drop citation markers like [12] and collapse whitespace.
    text = re.sub(r'\[[0-9]*\]', ' ', text)
    text = re.sub(r'\s+', ' ', text)

    # Stripped-down copy used only for word counting:
    # lowercase, punctuation and digits removed.
    article = text.lower()
    article = re.sub(r'\W', ' ', article)
    article = re.sub(r'\d', ' ', article)
    article = re.sub(r'\s+', ' ', article)

    # NOTE(review): sent_tokenize requires the 'punkt' NLTK model; only
    # 'stopwords' is downloaded at import time — confirm it is available.
    token_sen = nltk.sent_tokenize(text)

    # Set for O(1) membership tests (the corpus call returns a list).
    stop_words = set(nltk.corpus.stopwords.words('english'))

    # Word frequencies over non-stopword tokens.
    count = {}
    for word in nltk.word_tokenize(article):
        if word not in stop_words:
            count[word] = count.get(word, 0) + 1

    # Guard: with no countable words, max() below would raise ValueError.
    if not count:
        return ""

    # Normalize counts to [0, 1] weights.
    max_count = max(count.values())
    for key in count:
        count[key] = count[key] / max_count

    # Sentence score = sum of its words' weights. Only sentences shorter
    # than 25 words are eligible; the filter is per-sentence, so test it
    # once instead of once per word.
    score = {}
    for sentence in token_sen:
        if len(sentence.split(' ')) >= 25:
            continue
        for word in nltk.word_tokenize(sentence.lower()):
            if word in count:
                score[sentence] = score.get(sentence, 0) + count[word]

    # Best 5 sentences by score (heapq.nlargest is O(n log k)).
    best_sentences = heapq.nlargest(5, score, key=score.get)

    # join() instead of quadratic `+=`; each sentence keeps its trailing
    # space to match the original output shape exactly.
    return ''.join(sentence + ' ' for sentence in best_sentences)