-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape_mars.py
197 lines (135 loc) · 5.31 KB
/
scrape_mars.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
# Dependencies
import time
from splinter import Browser
from bs4 import BeautifulSoup
import requests
import pandas as pd
from selenium import webdriver
import string
#create a function to call all parts of the scraping code
def scrape():
    """Run every Mars scraper and merge the results into one dictionary.

    The dict returned by ``mars_news()`` is used as the base and extended
    in place, so the scrapers run in this order: news, featured image,
    weather, facts, hemispheres.

    Returns:
        dict: the news entries plus whatever keys ``featured_image()`` and
        ``mars_weather()`` return, plus ``'facts'`` (result of
        ``mars_facts()``) and ``'hemisphere_image_urls'`` (result of
        ``mars_hemi()``).
    """
    combined = mars_news()
    combined.update(featured_image())
    combined.update(mars_weather())
    combined['facts'] = mars_facts()
    combined['hemisphere_image_urls'] = mars_hemi()
    return combined
#NASA Mars News
def mars_news():
    """Scrape headlines and teaser paragraphs from the NASA Mars news page.

    Returns:
        dict: ``{"news_title": [...], "news_p": [...]}``. ``news_title``
        holds the anchor text of every ``div.content_title`` and ``news_p``
        the text of every ``div.rollover_description_inner``, each with
        leading and trailing newlines stripped.
    """
    page = requests.get("https://mars.nasa.gov/news/")
    soup = BeautifulSoup(page.text, 'html.parser')

    # The headline text sits in the anchor nested inside each title div.
    headlines = [
        block.a.text.strip('\n')
        for block in soup.find_all('div', class_="content_title")
    ]

    # The teaser text is the div's own text content.
    teasers = [
        block.text.strip('\n')
        for block in soup.find_all('div', class_="rollover_description_inner")
    ]

    return {"news_title": headlines, "news_p": teasers}
#Initialize browser function; used within functions that need splinter
def init_browser():
    """Create the splinter Chrome browser used by the browser-based scrapers.

    The driver is located through the relative path ``chromedriver.exe``.

    Returns:
        The object produced by ``Browser("chrome", executable_path=...)``.
    """
    return Browser("chrome", executable_path="chromedriver.exe")
#JPL Mars Space Images - Featured Image
#returns path to featured image
def featured_image():
    """Return the URL of the featured Mars image on the JPL space-images page.

    The search page is loaded in a splinter browser, the first
    ``article.carousel_item`` element is located, and the text enclosed in
    single quotes inside its inline ``style`` attribute is taken as the
    image path relative to ``https://www.jpl.nasa.gov``.

    Returns:
        dict: ``{'image_url': <base URL + quoted path from the style>}``.

    Raises:
        AttributeError: if the page has no ``article.carousel_item`` element
            or that element carries no ``style`` attribute.
    """
    base_url = "https://www.jpl.nasa.gov"
    search_url = base_url + "/spaceimages/?search=&category=Mars"

    browser = init_browser()
    try:
        browser.visit(search_url)
        # Brief pause before reading the page source (presumably to let the
        # page finish rendering).
        time.sleep(1)
        html = browser.html
    finally:
        # Always shut the browser down, even when the visit fails, so no
        # driver process is left behind.
        browser.quit()

    soup = BeautifulSoup(html, "html.parser")
    style = soup.find('article', class_="carousel_item").get('style')

    # Splitting on "'" puts the quoted segments at the odd indices; joining
    # them keeps exactly the characters that sit between quote pairs.
    rel_url = "".join(style.split("'")[1::2])

    return {'image_url': base_url + rel_url}
#Mars Weather
#Function to pull Mars weather information from a Twitter page
def mars_weather():
    """Return the first weather-report tweet found on the marswxreport page.

    The Twitter page is fetched with ``requests`` and its tweet paragraphs
    are scanned in page order; the first one whose text starts with
    ``"Sol"`` is used, with leading and trailing newlines stripped.

    Returns:
        dict: ``{'mars_weather': <tweet text>}``. The value is ``None`` when
        no paragraph on the page starts with ``"Sol"`` (for example when the
        page markup no longer matches the expected CSS classes), instead of
        failing with ``IndexError``.
    """
    url_weather = "https://twitter.com/marswxreport?lang=en"
    response_weather = requests.get(url_weather)
    soup_weather = BeautifulSoup(response_weather.text, 'html.parser')

    tweets = soup_weather.find_all(
        'p',
        class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text",
    )

    # Stop at the first matching tweet; fall back to None if there is none.
    first_report = next(
        (
            tweet.text.strip('\n')
            for tweet in tweets
            if tweet.text.startswith("Sol")
        ),
        None,
    )
    return {'mars_weather': first_report}
#Mars Facts
#Pull information from space facts page
def mars_facts():
    """Build an HTML table of Mars facts from space-facts.com.

    The second ``<p>`` inside the page's ``div.post-content`` becomes a
    ``Description:`` row, which is placed ahead of the rows of the first
    table pandas finds on the page.

    Returns:
        str: the combined frame rendered with ``DataFrame.to_html()``, using
        column 0 as an unnamed index and with column 1 renamed to ``value``.
    """
    url_facts = "http://space-facts.com/mars/"
    response_facts = requests.get(url_facts)
    soup_facts = BeautifulSoup(response_facts.text, 'html.parser')

    # Pull the Mars description out of the post-content div.
    div_contents = soup_facts.find('div', class_='post-content')
    para = div_contents.find_all('p')
    desc = {0: ['Description:'], 1: [para[1].text]}
    facts_df = pd.DataFrame(data=desc)

    profile_contents = pd.read_html("https://space-facts.com/mars/")[0]

    # Put the description row ahead of the profile table. DataFrame.append
    # was removed in pandas 2.0; pd.concat is the equivalent replacement.
    facts = pd.concat([facts_df, profile_contents], ignore_index=True)
    facts = facts.set_index(0)
    facts = facts.rename(index=str, columns={1: 'value'})
    facts.index.name = None

    return facts.to_html()
#Mars Hemispheres
def mars_hemi():
    """Collect a title and image URL for each Mars hemisphere listed by USGS.

    The search-results page is fetched with ``requests`` to read the
    ``<h3>`` titles and the ``a.itemLink.product-item`` links. Each linked
    page is then opened in a splinter browser, and the ``href`` of the first
    anchor inside its ``div.downloads`` is taken as the image URL.

    Returns:
        list[dict]: ``{"title": ..., "img_url": ...}`` entries, pairing
        titles and image URLs by position. If the two lists differ in
        length, pairing stops at the shorter one.

    Raises:
        AttributeError: if a visited page has no ``div.downloads`` element
            or that element contains no anchor.
    """
    # Fetch the search-results page that lists the hemispheres.
    url_hemi = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    response_hemi = requests.get(url_hemi)
    soup_hemi = BeautifulSoup(response_hemi.text, 'html.parser')

    # Title of each hemisphere.
    titles = [heading.text for heading in soup_hemi.find_all('h3')]

    # Absolute URL of each hemisphere's detail page.
    page_urls = [
        'https://astrogeology.usgs.gov' + link.get('href')
        for link in soup_hemi.find_all('a', class_="itemLink product-item")
    ]

    # Visit every detail page with splinter and read its download link.
    image_urls = []
    browser = init_browser()
    try:
        for page_url in page_urls:
            browser.visit(page_url)
            # Brief pause before reading the page source (presumably to let
            # the page finish rendering).
            time.sleep(1)
            downloads = BeautifulSoup(browser.html, "html.parser").find(
                'div', class_='downloads'
            )
            image_urls.append(downloads.a.get('href'))
    finally:
        # Always shut the browser down, even when a visit or parse fails.
        browser.quit()

    # zip avoids the IndexError the index-based pairing hit when the page
    # carried more <h3> elements than product links.
    return [
        {"title": title, "img_url": img_url}
        for title, img_url in zip(titles, image_urls)
    ]