Skip to content

Commit

Permalink
Merge pull request #671 from biglocalnews/md-670
Browse files Browse the repository at this point in the history
Rework MD #670
  • Loading branch information
stucka authored Nov 6, 2024
2 parents c5807ec + cf39cff commit 2199830
Showing 1 changed file with 37 additions and 9 deletions.
46 changes: 37 additions & 9 deletions warn/scrapers/md.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,13 @@ def scrape(
# Set the cache
cache = Cache(cache_dir)

# In November 2024 Maryland's site began returning many failed-connection errors. Two changes helped: sending a custom User-Agent header and passing verify=False on each request.
request_headers = {"User-Agent": "BigLocalNews.org"}
request_verify = False

# Get the page
url = "https://www.dllr.state.md.us/employment/warn.shtml"
r = utils.get_url(url)
r = utils.get_url(url, headers=request_headers, verify=request_verify)
r.encoding = "utf-8"
html = r.text

Expand All @@ -56,17 +60,41 @@ def scrape(
html_list = []
html_list.append(html) # Save the source HTML for parsing also

old_pages = [
"warn2023.shtml",
"warn2022.shtml",
"warn2021.shtml",
"warn2020.shtml",
"warn2019.shtml",
"warn2018.shtml",
"warn2017.shtml",
"warn2016.shtml",
"warn2015.shtml",
"warn2014.shtml",
"warn2013.shtml",
"warn2012.shtml",
"warn2011.shtml",
"warn2010.shtml",
]

for href in href_list:
# Request the HTML
url = f"https://www.dllr.state.md.us/employment/{href}"
r = utils.get_url(url)
r.encoding = "utf-8"
html = r.text

# Save it to the cache
cache.write(f"md/{href}.html", html)

sleep(naptime) # Try to stop blocked connections by being less aggressive
filename = cache_dir / f"md/{href}.html"

if href not in old_pages:
sleep(naptime) # Try to stop blocked connections by being less aggressive
r = utils.get_url(url, headers=request_headers, verify=request_verify)
r.encoding = "utf-8"
html = r.text

# Save it to the cache
cache.write(filename, html)
else:
r = utils.fetch_if_not_cached(
filename, url, headers=request_headers, verify=request_verify
)
html = cache.read(filename)

# Add it to the list
html_list.append(html)
Expand Down

0 comments on commit 2199830

Please sign in to comment.