1_Web_Scraping.py

##################################
### Author: Paul Soto          ###
### paul.soto@upf.edu          ###
#                                #
# This file shows the basics of  #
# using BeautifulSoup to scrape  #
# an HTML-based website and save #
# the data as a CSV.             #
##################################
from bs4 import BeautifulSoup
import urllib2
import pandas as pd
"""
NOTE IN PYTHON 3:
1) "import urllib" instead of "import urllib2"
2) To open a webpage: use
url = 'http://example.com/'
req = urllib.request.urlopen(url)
html = req.read()
"""
def get_HTML(url):
    """
    This function creates an HTML soup from a given url
    """
    req = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    html = urllib2.urlopen(req).read()
    soup = BeautifulSoup(html, 'html.parser')
    return soup

# Get HTML Soup
afi_soup = get_HTML("http://www.afi.com/100Years/quotes.aspx")
# Isolate the table with the quotes
tables = afi_soup.find_all("table")
quote_table = tables[1]
# Loop through each row, retrieving the
# quote ID number, quote, movie and year
first = True
for row in quote_table.find_all("tr"):
    # Get row elements
    row_elements = row.find_all('td')
    # Strip unnecessary text
    row_txt = map(lambda x: x.text.strip(), row_elements)
    if first:
        final_df = pd.DataFrame(columns=row_txt)
        first = False
        continue
    # Add to dataset
    final_df.loc[len(final_df)+1] = row_txt

# Follow each movie's link to get the director
# (only the first few rows, to keep the example quick)
final_df['Director'] = ""
max_iter = 10
index = 0
for row in quote_table.find_all("tr"):
    if index > max_iter:
        break
    if row.find("a"):
        movie_soup = get_HTML(row.find("a")['href'])
        director = movie_soup.find(text="Director:").find_next().text
        final_df.loc[index, "Director"] = director
    index += 1

# Export to CSV
final_df.to_csv("movie_quotes.csv",index=False, encoding='utf-8')
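
# Optional sanity check (not part of the original script): reload the CSV and
# preview the first few rows.
# print pd.read_csv("movie_quotes.csv").head()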