-
Notifications
You must be signed in to change notification settings - Fork 8
/
basicCrawler.py
38 lines (35 loc) · 1.32 KB
/
basicCrawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
##################
# This crawler gets the most recent "Business and Finance" articles
# from the Brookings Institution and prints each article's title and lede
# (or its first paragraph when no lede is marked up)
#################
def getArticle(url):
    """Fetch one article page and print its title and lede.

    Output, in order: the URL, the text of the first <h1> whose class
    matches the regex ".*title", the text of the <div class="lede"> found
    inside the element marked itemprop="articleBody" (or that element's
    first <p> when there is no lede div), and a separator line.

    If the title, the article body, or both lede candidates are missing,
    a "SKIPPED" line is printed instead of the title/lede pair, so one
    unexpected page layout does not abort the caller's crawl.

    Errors raised by urlopen itself are not caught here and propagate.
    """
    print("URL: "+url)
    # Read the markup inside a context manager so the HTTP response is
    # closed immediately instead of whenever it gets garbage collected.
    with urlopen(url) as response:
        markup = response.read()
    # Name the parser explicitly: the result no longer depends on which
    # optional parsers are installed, and bs4 does not warn about guessing.
    articleObj = BeautifulSoup(markup, "html.parser")

    #Get article title. This should have a class name ending in "title"
    titleTag = articleObj.find("h1", {"class": re.compile(".*title")})

    #Get the main body of the article text
    body = articleObj.find("div", {"itemprop": "articleBody"})
    lede = None
    if body is not None:
        lede = body.find("div", {"class": "lede"})
        if lede is None:
            #If an official lede does not exist, get the first paragraph
            lede = body.find("p")

    # find() returns None for anything absent; check before get_text().
    if titleTag is None or lede is None:
        print("SKIPPED: could not find title and lede")
    else:
        print("TITLE: "+titleTag.get_text())
        print("LEDE: "+lede.get_text())
    print("-----------------------------")
# Walk ten listing pages. The "start" query parameter takes the values
# 1, 26, 51, ... (i*25+1), and every site-relative article link found in
# an <h3 class="title"> heading is handed to getArticle.
for i in range(0, 10):
    start = str(i*25+1)
    print("Scraping page: "+start+" of articles")
    url = "http://www.brookings.edu/research/commentary?topic=Business%20and%20Finance&start="+start+"&sort=ContentDate"
    # Close the response as soon as the markup is parsed, and name the
    # parser so the result does not depend on which parsers are installed.
    with urlopen(url) as html:
        listingObj = BeautifulSoup(html.read(), "html.parser")
    urls = listingObj.find_all("h3", {"class":"title"})
    # Use a separate loop name so the listing URL string in `url` is not
    # rebound to a Tag object.
    for heading in urls:
        link = heading.find("a")
        if link is None:
            # Heading without an anchor: nothing to follow.
            continue
        # A missing href yields "" and is filtered out by the check below.
        newPage = link.attrs.get("href", "")
        #Ignore external URLs
        if newPage.startswith("/"):
            getArticle("http://brookings.edu"+newPage)