-
Notifications
You must be signed in to change notification settings - Fork 0
/
py4dsais_w5_webscraping.py
138 lines (115 loc) · 5.84 KB
/
py4dsais_w5_webscraping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
from bs4 import BeautifulSoup # this module helps in web scrapping.
import requests # this module helps us to download a web page
# --- PARSING AN INLINE HTML TABLE ----
# Sample markup: one header row followed by three data rows. Two of the
# "Florida" links are closed with <a> instead of </a>; the expected output
# below shows how the parser repairs them (an extra empty <a></a> appears).
table = (
    "<table>"
    "<tr><td id='flight'>Flight No</td><td>Launch site</td> <td>Payload mass</td></tr>"
    "<tr> <td>1</td><td><a href='https://en.wikipedia.org/wiki/Florida'>Florida<a></td><td>300 kg</td></tr>"
    "<tr><td>2</td><td><a href='https://en.wikipedia.org/wiki/Texas'>Texas</a></td><td>94 kg</td></tr>"
    "<tr><td>3</td><td><a href='https://en.wikipedia.org/wiki/Florida'>Florida<a> </td><td>80 kg</td></tr>"
    "</table>"
)

table_bs = BeautifulSoup(table, "html5lib")
table_rows = table_bs.find_all("tr")  # every <tr> element, in document order
first_row = table_rows[0]  # the header row

# Pass 1: print each row as raw HTML.
for row_number, table_row in enumerate(table_rows):
    print(f"row {row_number} is {table_row!s}")
# Expected output:
#   row 0 is <tr><td id="flight">Flight No</td><td>Launch site</td> <td>Payload mass</td></tr>
#   row 1 is <tr> <td>1</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a></a></td><td>300 kg</td></tr>
#   row 2 is <tr><td>2</td><td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td><td>94 kg</td></tr>
#   row 3 is <tr><td>3</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a> </a></td><td>80 kg</td></tr>

# Pass 2: print each row number, then every <td> cell of that row with its
# column position.
for row_number, table_row in enumerate(table_rows):
    print(f"row {row_number}")
    for column_number, cell in enumerate(table_row.find_all("td")):
        print(f"colunm {column_number} cell {cell!s}")
# Expected output:
#   row 0
#   colunm 0 cell <td id="flight">Flight No</td>
#   colunm 1 cell <td>Launch site</td>
#   colunm 2 cell <td>Payload mass</td>
#   row 1
#   colunm 0 cell <td>1</td>
#   colunm 1 cell <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a></a></td>
#   colunm 2 cell <td>300 kg</td>
#   row 2
#   colunm 0 cell <td>2</td>
#   colunm 1 cell <td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td>
#   colunm 2 cell <td>94 kg</td>
#   row 3
#   colunm 0 cell <td>3</td>
#   colunm 1 cell <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a> </a></td>
#   colunm 2 cell <td>80 kg</td>
# --- SCRAPING A WEBPAGE ----
# Download the IBM home page, then print every hyperlink target and every
# image (the tag itself plus its source URL) found on it.
url = "http://www.ibm.com"

# A timeout keeps the script from hanging forever on an unresponsive server,
# and raise_for_status() stops us from silently scraping an HTTP error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.text

soup = BeautifulSoup(data, "html5lib")  # create a soup object using the variable 'data'

# In HTML an anchor/link is represented by the tag <a>; href=True restricts
# the search to anchors that actually carry an href attribute.
for link in soup.find_all("a", href=True):
    print(link.get("href"))

# In HTML an image is represented by the tag <img>.
for image in soup.find_all("img"):
    print(image)
    print(image.get("src"))  # None when the tag has no src attribute
# --- HTML TABLE ----
# The below url contains an html table with data about colors and color codes.
url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DA0321EN-SkillsNetwork/labs/datasets/HTMLColorCodes.html"

# Get the contents of the webpage in text format and store it in 'data'.
# The timeout prevents an indefinite hang; raise_for_status() fails loudly on
# an HTTP error instead of parsing the error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.text
soup = BeautifulSoup(data, "html5lib")

# Find the first html table in the web page (<table> tag).
table = soup.find("table")
if table is None:
    # soup.find returns None when nothing matches; report that explicitly
    # rather than failing later with an opaque AttributeError.
    raise LookupError(f"no <table> element found at {url}")

# Walk every table row (<tr> tag) and print "name--->code" for each one.
for row in table.find_all("tr"):
    cols = row.find_all("td")  # in html a column is represented by the tag <td>
    if len(cols) < 4:
        # Rows without a 3rd and 4th <td> cell cannot supply both values
        # (cols[2] / cols[3] would raise IndexError), so skip them.
        continue
    color_name = cols[2].string  # store the value in column 3 as color_name
    color_code = cols[3].string  # store the value in column 4 as color_code
    print(f"{color_name}--->{color_code}")
# ---HTML TO PANDAS ---
# Scrape one table from the Wikipedia "World population" page by hand and
# load its rows into a pandas DataFrame.
import pandas as pd

# The below url contains html tables with data about world population.
url = "https://en.wikipedia.org/wiki/World_population"

# Get the contents of the webpage in text format and store it in 'data'.
# The timeout prevents an indefinite hang; raise_for_status() fails loudly on
# an HTTP error instead of parsing the error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.text
soup = BeautifulSoup(data, "html5lib")

# Find all html tables in the web page (<table> tag).
tables = soup.find_all("table")

# We are looking for the table of the 10 most densely populated countries,
# identified by that phrase appearing somewhere in the table's markup.
wanted_caption = "10 most densely populated countries"
table_index = None
for index, table in enumerate(tables):
    if wanted_caption in str(table):
        table_index = index  # no break: the last matching table wins, as before
if table_index is None:
    # Without this check a page that lacks the phrase would surface as a
    # confusing NameError on the first use of table_index.
    raise LookupError(f"no table containing {wanted_caption!r} found at {url}")
print(table_index)
print(tables[table_index].prettify())

# Collect one dict per data row and build the DataFrame once at the end.
# DataFrame.append was removed in pandas 2.0, and calling it per row copied
# the whole frame on every iteration.
column_names = ["Rank", "Country", "Population", "Area", "Density"]
records = []
for row in tables[table_index].tbody.find_all("tr"):
    col = row.find_all("td")
    if len(col) < len(column_names):
        # Rows with no <td> cells, or too few of them, cannot fill every
        # column, so skip them.
        continue
    # Strip every cell so Rank and Country get the same whitespace cleanup
    # as the numeric columns.
    values = [cell.text.strip() for cell in col[:len(column_names)]]
    records.append(dict(zip(column_names, values)))

population_data = pd.DataFrame(records, columns=column_names)
print(population_data)
"""
Scrape data from HTML tables into a DataFrame using BeautifulSoup and read_html
Using the same url, data, soup, and tables object as in the last section we can use
the read_html function to create a DataFrame.
Remember the table we need is located in tables[table_index]
We can now use the pandas function read_html and give it the string version of the table
as well as the flavor which is the parsing engine bs4.
"""
# Parse the table located earlier (tables[table_index]) with pandas.read_html,
# using bs4 as the parsing engine, instead of walking its rows by hand.
from io import StringIO  # only this section needs it

# Use table_index rather than a hard-coded position so the lookup keeps
# working when the page's table order changes. The markup is wrapped in
# StringIO because pandas 2.1+ deprecates passing literal HTML strings to
# read_html.
density_table_html = StringIO(str(tables[table_index]))
parsed_frames = pd.read_html(density_table_html, flavor="bs4")
# Expected content of the returned list (a single DataFrame):
# [   Rank      Country  Population  Area(km2)  Density(pop/km2)
#  0     1    Singapore     5704000        710              8033
#  1     2   Bangladesh   170320000     143998              1183
#  2     3      Lebanon     6856000      10452               656
#  3     4       Taiwan    23604000      36193               652
#  4     5  South Korea    51781000      99538               520
#  5     6       Rwanda    12374000      26338               470
#  6     7        Haiti    11578000      27065               428
#  7     8  Netherlands    17570000      41526               423
#  8     9       Israel     9320000      22072               422
#  9    10        India  1374470000    3287240               418]

# The function read_html always returns a list of DataFrames, so we must pick
# the one we want out of the list. That holds even here, where only a single
# table was passed in: the result is a one-element list, hence the [0].
population_data_read_html = parsed_frames[0]