Merge pull request #540 from stucka/main
Build in SSL verification workaround; patch pre-commit; patch AZ
stucka authored Aug 14, 2023
2 parents 9caad33 + dcd7d23 commit 72b11d6
Showing 4 changed files with 40 additions and 10 deletions.
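In short, the workaround threads a single verify flag from the state scraper down to requests.get(). The sketch below condenses that pattern with simplified stand-in names (DemoSite and demo_scrape are hypothetical, not the package's real classes or functions):

import requests


class DemoSite:
    """Stand-in for the Job Center Site class."""

    def __init__(self, url, verify=True):
        self.url = url
        self.verify = verify  # SSL certificate verification toggle

    def fetch(self, params=None):
        # verify=False skips certificate checks for hosts with broken SSL chains
        response = requests.get(self.url, params=params, verify=self.verify)
        return response.text


def demo_scrape(url, verify=True):
    """Stand-in for a state scraper that forwards the flag downward."""
    site = DemoSite(url, verify=verify)
    return site.fetch()


# A state with a broken certificate chain would call demo_scrape(url, verify=False).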
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -31,7 +31,7 @@ repos:
       - id: isort

   - repo: https://github.com/PyCQA/flake8
-    rev: 6.0.0
+    rev: 6.1.0
     hooks:
       - id: flake8
         additional_dependencies:
7 changes: 5 additions & 2 deletions warn/platforms/job_center/site.py
@@ -30,13 +30,16 @@ class Site:
         state (str): State postal code
         url (str): Search URL for the site (should end in '/warn_lookups')
         cache_dir (str): Cache directory
+        verify (boolean, default True): SSL certificate verification
     """

-    def __init__(self, state, url, cache_dir):
+    def __init__(self, state, url, cache_dir, verify=True):
         """Initialize a new instance."""
         self.state = state.upper()
         self.url = url
         self.cache = Cache(cache_dir)
+        self.verify = verify
+        print(f"Site init SSL verification status: {self.verify}")

     def scrape(self, start_date=None, end_date=None, detail_pages=True, use_cache=True):
         """
@@ -110,7 +113,7 @@ def _get_page(self, url, params=None, use_cache=True):
             return self.cache.fetch(url, params)
         else:
             logger.debug("Pulling from the web")
-            response = requests.get(url, params=params)
+            response = requests.get(url, params=params, verify=self.verify)
             logger.debug(f"Response code: {response.status_code}")
             html = response.text
             self.cache.save(url, params, html)
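A hypothetical way to exercise the new constructor argument, assuming the installed package exposes a module path matching the file layout; the URL comes from the AZ scraper below and the cache path is a placeholder:

from warn.platforms.job_center.site import Site

site = Site(
    "az",  # upper-cased by the constructor
    "https://www.azjobconnection.gov/search/warn_lookups",
    cache_dir="/tmp/warn-cache/az",
    verify=False,  # new flag: skip SSL certificate verification
)
# site.scrape(...) now reaches requests.get(..., verify=False) in _get_page().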
26 changes: 21 additions & 5 deletions warn/platforms/job_center/utils.py
@@ -10,7 +10,13 @@


 def scrape_state(
-    state_postal, search_url, output_csv, stop_year, cache_dir, use_cache=True
+    state_postal,
+    search_url,
+    output_csv,
+    stop_year,
+    cache_dir,
+    use_cache=True,
+    verify=True,
 ):
     """Date-based scraper for Job Center states.
@@ -29,6 +35,7 @@ def scrape_state(
         stop_year (int): First year that data is available for state (requires manual research)
         cache_dir (str): The root directory for WARN's cache files (e.g. ~/.warn-scraper/cache)
         use_cache (boolean, default True): Whether to use cached files for older years
+        verify (boolean, default True): Use SSL certificate verification

     Returns:
         Full path to exported csv (e.g. ~/.warn-scraper/exports/ks.csv)
@@ -41,7 +48,10 @@

     # Set up scraper instance
     state_cache_dir = cache_dir / state_postal.lower()
-    site = JobCenterSite(state_postal.upper(), search_url, cache_dir=state_cache_dir)
+    print(f"scrape_state verify: {verify}")
+    site = JobCenterSite(
+        state_postal.upper(), search_url, cache_dir=state_cache_dir, verify=verify
+    )

     # Date-based searches produce search result pages that appear to have certain
     # records duplicated over paged results. We'll initially write all data to a raw
@@ -66,16 +76,22 @@
     # Execute the scrape in two batches
     # 1. Current and prior year. Always scrape fresh (i.e. never use cached files)
     # in case records have been updated.
-    _scrape_years(site, raw_csv, headers, no_cache_years, use_cache=False)
+    _scrape_years(
+        site, raw_csv, headers, no_cache_years, use_cache=False, verify=verify
+    )
     # 2. Years before current & prior, going back to stop_year.
     # We should generally use cached files for these older years,
     # since data is less likely to be updated.
-    _scrape_years(site, raw_csv, headers, yearly_dates, use_cache=use_cache)
+    _scrape_years(
+        site, raw_csv, headers, yearly_dates, use_cache=use_cache, verify=verify
+    )
     _dedupe(raw_csv, output_csv)
     return output_csv


-def _scrape_years(site, output_csv, headers, start_end_dates, use_cache=True):
+def _scrape_years(
+    site, output_csv, headers, start_end_dates, use_cache=True, verify=True
+):
     """Loop through years of data and write out to CSV."""
     # NOTE: Scraping for Jan 1 - Dec 31 for current year works
     # throughout the year. Additionally, it allows us to avoid
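The two _scrape_years calls above correspond to the batching described in the comments. A simplified illustration of the date ranges involved; the tuples are built ad hoc here and the exact date-string format is an assumption, so the package's own helpers may differ:

from datetime import date

current_year = date.today().year
stop_year = 2010  # e.g. the AZ value below

# Batch 1: current and prior year, always scraped fresh (use_cache=False).
no_cache_years = [
    (f"{year}-01-01", f"{year}-12-31")
    for year in (current_year, current_year - 1)
]

# Batch 2: older years back to stop_year, eligible for cached files.
yearly_dates = [
    (f"{year}-01-01", f"{year}-12-31")
    for year in range(current_year - 2, stop_year - 1, -1)
]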
15 changes: 13 additions & 2 deletions warn/scrapers/az.py
@@ -4,7 +4,7 @@

 from .. import utils

-__authors__ = ["zstumgoren", "Dilcia19"]
+__authors__ = ["zstumgoren", "Dilcia19", "stucka"]
 __tags__ = [
     "jobcenter",
 ]
@@ -32,12 +32,23 @@ def scrape(
     output_csv = data_dir / "az.csv"
     search_url = "https://www.azjobconnection.gov/search/warn_lookups"

+    # Use SSL certificate? Broke August 2023
+    verify = False
+
     # Date chosen based on manual research
     stop_year = 2010

     # Use cache for years before current and prior year
+    print(f"AZ cache status: {use_cache}")
+    print(f"AZ SSL verification: {verify}")
     scrape_state(
-        "AZ", search_url, output_csv, stop_year, cache_dir, use_cache=use_cache
+        "AZ",
+        search_url,
+        output_csv,
+        stop_year,
+        cache_dir,
+        use_cache=use_cache,
+        verify=verify,
     )

     return output_csv
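One side effect of verify=False that this commit does not address: urllib3 emits an InsecureRequestWarning for every unverified request. If that noise is unwanted when running the AZ scraper, it can be silenced before scraping; this is a sketch of the standard urllib3 call, not something the scraper itself does:

import urllib3

# Suppress the warning that each verify=False request would otherwise print.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)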
