Merge pull request #540 from stucka/main
Build in SSL verification workaround; patch pre-commit; patch AZ
stucka authored Aug 14, 2023
2 parents 9caad33 + dcd7d23 commit 72b11d6
Showing 4 changed files with 40 additions and 10 deletions.
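In short, the workaround threads a single verify flag from the state scraper down to requests.get(). The sketch below condenses that pattern with simplified stand-in names (DemoSite and demo_scrape are hypothetical, not the package's real classes or functions):

import requests


class DemoSite:
    """Stand-in for the Job Center Site class."""

    def __init__(self, url, verify=True):
        self.url = url
        self.verify = verify  # SSL certificate verification toggle

    def fetch(self, params=None):
        # verify=False skips certificate checks for hosts with broken SSL chains
        response = requests.get(self.url, params=params, verify=self.verify)
        return response.text


def demo_scrape(url, verify=True):
    """Stand-in for a state scraper that forwards the flag downward."""
    site = DemoSite(url, verify=verify)
    return site.fetch()


# A state with a broken certificate chain would call demo_scrape(url, verify=False).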
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -31,7 +31,7 @@ repos:
       - id: isort

   - repo: https://github.com/PyCQA/flake8
-    rev: 6.0.0
+    rev: 6.1.0
     hooks:
       - id: flake8
         additional_dependencies:
7 changes: 5 additions & 2 deletions warn/platforms/job_center/site.py
@@ -30,13 +30,16 @@ class Site:
         state (str): State postal code
         url (str): Search URL for the site (should end in '/warn_lookups')
         cache_dir (str): Cache directory
+        verify (boolean, default True): SSL certificate verification
     """

-    def __init__(self, state, url, cache_dir):
+    def __init__(self, state, url, cache_dir, verify=True):
         """Initialize a new instance."""
         self.state = state.upper()
         self.url = url
         self.cache = Cache(cache_dir)
+        self.verify = verify
+        print(f"Site init SSL verification status: {self.verify}")

     def scrape(self, start_date=None, end_date=None, detail_pages=True, use_cache=True):
         """
@@ -110,7 +113,7 @@ def _get_page(self, url, params=None, use_cache=True):
             return self.cache.fetch(url, params)
         else:
             logger.debug("Pulling from the web")
-            response = requests.get(url, params=params)
+            response = requests.get(url, params=params, verify=self.verify)
             logger.debug(f"Response code: {response.status_code}")
             html = response.text
             self.cache.save(url, params, html)
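A hypothetical way to exercise the new constructor argument, assuming the installed package exposes a module path matching the file layout; the URL comes from the AZ scraper below and the cache path is a placeholder:

from warn.platforms.job_center.site import Site

site = Site(
    "az",  # upper-cased by the constructor
    "https://www.azjobconnection.gov/search/warn_lookups",
    cache_dir="/tmp/warn-cache/az",
    verify=False,  # new flag: skip SSL certificate verification
)
# site.scrape(...) now reaches requests.get(..., verify=False) in _get_page().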
26 changes: 21 additions & 5 deletions warn/platforms/job_center/utils.py
@@ -10,7 +10,13 @@


 def scrape_state(
-    state_postal, search_url, output_csv, stop_year, cache_dir, use_cache=True
+    state_postal,
+    search_url,
+    output_csv,
+    stop_year,
+    cache_dir,
+    use_cache=True,
+    verify=True,
 ):
     """Date-based scraper for Job Center states.
@@ -29,6 +35,7 @@ def scrape_state(
         stop_year (int): First year that data is available for state (requires manual research)
         cache_dir (str): The root directory for WARN's cache files (e.g. ~/.warn-scraper/cache)
         use_cache (boolean, default True): Whether to use cached files for older years
+        verify (boolean, default True): Use SSL certificate verification

     Returns:
         Full path to exported csv (e.g. ~/.warn-scraper/exports/ks.csv)
@@ -41,7 +48,10 @@

     # Set up scraper instance
     state_cache_dir = cache_dir / state_postal.lower()
-    site = JobCenterSite(state_postal.upper(), search_url, cache_dir=state_cache_dir)
+    print(f"scrape_state verify: {verify}")
+    site = JobCenterSite(
+        state_postal.upper(), search_url, cache_dir=state_cache_dir, verify=verify
+    )

     # Date-based searches produce search result pages that appear to have certain
     # records duplicated over paged results. We'll initially write all data to a raw
@@ -66,16 +76,22 @@
     # Execute the scrape in two batches
     # 1. Current and prior year. Always scrape fresh (i.e. never use cached files)
     # in case records have been updated.
-    _scrape_years(site, raw_csv, headers, no_cache_years, use_cache=False)
+    _scrape_years(
+        site, raw_csv, headers, no_cache_years, use_cache=False, verify=verify
+    )
     # 2. Years before current & prior, going back to stop_year.
     # We should generally use cached files for these older years,
     # since data is less likely to be updated.
-    _scrape_years(site, raw_csv, headers, yearly_dates, use_cache=use_cache)
+    _scrape_years(
+        site, raw_csv, headers, yearly_dates, use_cache=use_cache, verify=verify
+    )
     _dedupe(raw_csv, output_csv)
     return output_csv


-def _scrape_years(site, output_csv, headers, start_end_dates, use_cache=True):
+def _scrape_years(
+    site, output_csv, headers, start_end_dates, use_cache=True, verify=True
+):
     """Loop through years of data and write out to CSV."""
     # NOTE: Scraping for Jan 1 - Dec 31 for current year works
     # throughout the year. Additionally, it allows us to avoid
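The two _scrape_years calls above correspond to the batching described in the comments. A simplified illustration of the date ranges involved; the tuples are built ad hoc here and the exact date-string format is an assumption, so the package's own helpers may differ:

from datetime import date

current_year = date.today().year
stop_year = 2010  # e.g. the AZ value below

# Batch 1: current and prior year, always scraped fresh (use_cache=False).
no_cache_years = [
    (f"{year}-01-01", f"{year}-12-31")
    for year in (current_year, current_year - 1)
]

# Batch 2: older years back to stop_year, eligible for cached files.
yearly_dates = [
    (f"{year}-01-01", f"{year}-12-31")
    for year in range(current_year - 2, stop_year - 1, -1)
]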
15 changes: 13 additions & 2 deletions warn/scrapers/az.py
@@ -4,7 +4,7 @@

 from .. import utils

-__authors__ = ["zstumgoren", "Dilcia19"]
+__authors__ = ["zstumgoren", "Dilcia19", "stucka"]
 __tags__ = [
     "jobcenter",
 ]
@@ -32,12 +32,23 @@ def scrape(
     output_csv = data_dir / "az.csv"
     search_url = "https://www.azjobconnection.gov/search/warn_lookups"

+    # Use SSL certificate? Broke August 2023
+    verify = False
+
     # Date chosen based on manual research
     stop_year = 2010

     # Use cache for years before current and prior year
+    print(f"AZ cache status: {use_cache}")
+    print(f"AZ SSL verification: {verify}")
     scrape_state(
-        "AZ", search_url, output_csv, stop_year, cache_dir, use_cache=use_cache
+        "AZ",
+        search_url,
+        output_csv,
+        stop_year,
+        cache_dir,
+        use_cache=use_cache,
+        verify=verify,
     )

     return output_csv
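One side effect of verify=False that this commit does not address: urllib3 emits an InsecureRequestWarning for every unverified request. If that noise is unwanted when running the AZ scraper, it can be silenced before scraping; this is a sketch of the standard urllib3 call, not something the scraper itself does:

import urllib3

# Suppress the warning that each verify=False request would otherwise print.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)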
