import feedparser as fp
import json
import newspaper
from newspaper import Article
from time import mktime
from datetime import datetime
from PIL import Image
import requests
from io import BytesIO
import urllib.request
from bs4 import BeautifulSoup # BeautifulSoup is in bs4 package
# Set the limit for the number of articles to download
LIMIT = 1000

data = {}
data['newspapers'] = {}

# Load the JSON file that lists the news sites to scrape
with open('NewsPapers.json') as data_file:
    companies = json.load(data_file)
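# Illustrative example of the expected NewsPapers.json structure (hypothetical
# site names and URLs; only the 'link' key and the optional 'rss' key are read
# by the code below):
#
# {
#     "exampletimes": {
#         "link": "http://www.exampletimes.com/",
#         "rss": "http://www.exampletimes.com/rss"
#     },
#     "examplepost": {
#         "link": "http://www.examplepost.com/"
#     }
# }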
count = 1

# Iterate through each news company
for company, value in companies.items():
    # If an RSS link is provided in the JSON file, it is used first, because
    # RSS feeds tend to give more consistent and correct data.
    # If you do not want to scrape from the RSS feed, leave the 'rss' attribute
    # out of the JSON file.
    if 'rss' in value:
        d = fp.parse(value['rss'])
        print("Downloading articles from ", company)
        newsPaper = {
            "rss": value['rss'],
            "link": value['link'],
            "articles": []
        }
        for entry in d.entries:
            # Check whether a publish date is provided; if not, the article is skipped.
            # This keeps the data consistent and prevents the script from crashing.
            if hasattr(entry, 'published'):
                if count > LIMIT:
                    break
                article = {}
                article['link'] = entry.link
                date = entry.published_parsed
                article['published'] = datetime.fromtimestamp(mktime(date)).isoformat()
                try:
                    content = Article(entry.link, language='tr')
                    content.download()
                    content.parse()
                except Exception as e:
                    # If the download fails for some reason (e.g. a 404), the script
                    # continues with the next article.
                    print(e)
                    print("continuing...")
                    continue
                article['title'] = content.title
                article['text'] = content.text
                newsPaper['articles'].append(article)
                print(count, "articles downloaded from", company, ", url: ", entry.link)
                count = count + 1
    else:
        # Fallback method used when no RSS feed link is provided.
        # It uses the Python newspaper library to discover and extract articles.
        print("Building site for ", company)
        paper = newspaper.build(value['link'], memoize_articles=False)
        newsPaper = {
            "link": value['link'],
            "articles": []
        }
        noneTypeCount = 0
        for content in paper.articles:
            try:
                content.download()
                content.parse()
            except Exception as e:
                print(e)
                print("continuing...")
                continue
            # Again, for consistency, articles without a publish date are skipped.
            # After 10 articles in a row from the same newspaper without a publish
            # date, the company is skipped.
            if content.publish_date is None:
                print(count, " Article has date of type None...")
                noneTypeCount = noneTypeCount + 1
                if noneTypeCount > 10:
                    print("Too many noneType dates, aborting...")
                    noneTypeCount = 0
                    break
                count = count + 1
                continue
            article = {}
            article['title'] = content.title
            article['text'] = content.text
            article['link'] = content.url
            article['published'] = content.publish_date.isoformat()
            newsPaper['articles'].append(article)
            print(count, "articles downloaded from", company, " using newspaper, url: ", content.url)
            count = count + 1
            noneTypeCount = 0
    count = 1
    data['newspapers'][company] = newsPaper
# Finally, save the scraped articles as a JSON file.
try:
    with open('scraped_articles.json', 'w', encoding='utf8') as outfile:
        json.dump(data, outfile, ensure_ascii=False)
except Exception as e:
    print(e)
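# For reference, scraped_articles.json ends up with roughly this shape
# (illustrative values; the keys mirror the dictionaries built above):
#
# {
#     "newspapers": {
#         "exampletimes": {
#             "rss": "http://www.exampletimes.com/rss",
#             "link": "http://www.exampletimes.com/",
#             "articles": [
#                 {
#                     "link": "http://www.exampletimes.com/some-article",
#                     "published": "2020-01-01T12:00:00",
#                     "title": "...",
#                     "text": "..."
#                 }
#             ]
#         }
#     }
# }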