diff --git a/juriscraper/opinions/united_states/state/pa.py b/juriscraper/opinions/united_states/state/pa.py index de6f51af0..6f7d01917 100644 --- a/juriscraper/opinions/united_states/state/pa.py +++ b/juriscraper/opinions/united_states/state/pa.py @@ -17,10 +17,11 @@ class Site(OpinionSiteLinear): court = "Supreme" base_url = "https://www.pacourts.us/api/opinion?" document_url = "https://www.pacourts.us/assets/opinions/{}/out/{}" - days_interval = 20 + days_interval = 1 api_dt_format = "%Y-%m-%dT00:00:00-05:00" first_opinion_date = datetime(1998, 4, 27) judge_key = "AuthorCode" + regional_cite_regex = re.compile(r"\d{1,3} A\.3d \d+") def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -28,8 +29,8 @@ def __init__(self, *args, **kwargs): self.regex = re.compile(r"(.*)(?:[,-]?\s+Nos?\.)(.*)") self.status = "Published" - now = datetime.now() + timedelta(days=1) - start = now - timedelta(days=7) + now = datetime.now() + start = now - timedelta(days=1) self.params = { "startDate": start.strftime(self.api_dt_format), "endDate": now.strftime(self.api_dt_format), @@ -50,9 +51,13 @@ def _process_html(self) -> None: json_response = self.html for cluster in json_response["Items"]: - title = cluster["Caption"] disposition_date = cluster["DispositionDate"].split("T")[0] + title = cluster["Caption"] name, docket = self.parse_case_title(title) + # A.3d cites seem to exist only for pasuperct + cite = "" + if cite_match := self.regional_cite_regex.search(title): + cite = cite_match.group(0) for op in cluster["Postings"]: per_curiam = False @@ -75,9 +80,18 @@ def _process_html(self) -> None: "judge": author_str, "status": status, "per_curiam": per_curiam, + "citation": cite, } ) + if not self.test_mode_enabled() and json_response.get("HasNext"): + next_page = json_response["PageNumber"] + 1 + logger.info("Paginating to page %s", next_page) + self.params["pageNumber"] = next_page + self.url = f"{self.base_url}{urlencode(self.params)}" + self.html = 
self._download() + self._process_html() + def parse_case_title(self, title: str) -> Tuple[str, str]: """Separates case_name and docket_number from case string diff --git a/juriscraper/opinions/united_states/state/pacommwct.py b/juriscraper/opinions/united_states/state/pacommwct.py index e87a3b2e6..937aec386 100644 --- a/juriscraper/opinions/united_states/state/pacommwct.py +++ b/juriscraper/opinions/united_states/state/pacommwct.py @@ -19,12 +19,16 @@ from urllib.parse import urlencode from juriscraper.opinions.united_states.state import pasuperct +from juriscraper.OpinionSite import OpinionSite class Site(pasuperct.Site): court = "Commonwealth" - days_interval = 30 first_opinion_date = datetime(1998, 8, 17) + # Deactivate extract_from_text from parent class + # and avoid triggering the example requirement from + # tests.local.test_ScraperExtractFromTextTest + extract_from_text = OpinionSite.extract_from_text def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) diff --git a/juriscraper/opinions/united_states/state/pasuperct.py b/juriscraper/opinions/united_states/state/pasuperct.py index c23fe35f9..bb75406f4 100644 --- a/juriscraper/opinions/united_states/state/pasuperct.py +++ b/juriscraper/opinions/united_states/state/pasuperct.py @@ -9,7 +9,7 @@ import re from datetime import datetime -from typing import Dict +from typing import Dict, Optional from urllib.parse import urlencode from juriscraper.opinions.united_states.state import pa @@ -17,7 +17,6 @@ class Site(pa.Site): court = "Superior" - days_interval = 20 first_opinion_date = datetime(1998, 2, 15) judge_key = "AuthorName" @@ -61,3 +60,16 @@ def clean_judge(self, author_str: str) -> str: " by ", " " ) return author_str + + def extract_from_text(self, scraped_text: str) -> Optional[Dict]: + """Get neutral citation from the first lines in the first page + + Not all scraped opinions have them + """ + neutral_citation_regex = ( + r"(?P<volume>\d{4}) (?P<reporter>PA Super) (?P<page>\d+)" + ) + if cite_match := 
re.search(neutral_citation_regex, scraped_text[:200]): + cite_data = cite_match.groupdict() + cite_data["type"] = 8 # Neutral citation + return {"Citation": cite_data} diff --git a/tests/examples/opinions/united_states/pa_example.compare.json b/tests/examples/opinions/united_states/pa_example.compare.json index abfeda15a..8569928ff 100644 --- a/tests/examples/opinions/united_states/pa_example.compare.json +++ b/tests/examples/opinions/united_states/pa_example.compare.json @@ -8,6 +8,7 @@ "date_filed_is_approximate": false, "docket_numbers": "108 MAP 2023", "judges": "Dougherty, Kevin M.", + "citations": "", "case_name_shorts": "", "per_curiam": false }, @@ -20,6 +21,7 @@ "date_filed_is_approximate": false, "docket_numbers": "108 MAP 2023", "judges": "Brobson, P. Kevin", + "citations": "", "case_name_shorts": "", "per_curiam": false }, @@ -32,6 +34,7 @@ "date_filed_is_approximate": false, "docket_numbers": "108 MAP 2023", "judges": "Wecht, David N.", + "citations": "", "case_name_shorts": "", "per_curiam": false }, @@ -44,6 +47,7 @@ "date_filed_is_approximate": false, "docket_numbers": "212 WAL 2024", "judges": "", + "citations": "", "case_name_shorts": "", "per_curiam": false } diff --git a/tests/examples/opinions/united_states/pacommwct_example.compare.json b/tests/examples/opinions/united_states/pacommwct_example.compare.json index ef780abe1..d53789343 100644 --- a/tests/examples/opinions/united_states/pacommwct_example.compare.json +++ b/tests/examples/opinions/united_states/pacommwct_example.compare.json @@ -8,6 +8,7 @@ "date_filed_is_approximate": false, "docket_numbers": "526 C.D. 2023", "judges": "Wallace", + "citations": "", "case_name_shorts": "PPB", "per_curiam": false }, @@ -20,6 +21,7 @@ "date_filed_is_approximate": false, "docket_numbers": "168 M.D. 2023", "judges": "Wallace", + "citations": "", "case_name_shorts": "", "per_curiam": false }, @@ -32,6 +34,7 @@ "date_filed_is_approximate": false, "docket_numbers": "337 M.D. 
2023", "judges": "Covey", + "citations": "", "case_name_shorts": "", "per_curiam": false }, @@ -44,6 +47,7 @@ "date_filed_is_approximate": false, "docket_numbers": "379 M.D. 2024", "judges": "Leadbetter", + "citations": "", "case_name_shorts": "", "per_curiam": false }, @@ -56,6 +60,7 @@ "date_filed_is_approximate": false, "docket_numbers": "374 C.D. 2023", "judges": "Leavitt", + "citations": "", "case_name_shorts": "", "per_curiam": false }, @@ -68,6 +73,7 @@ "date_filed_is_approximate": false, "docket_numbers": "716 C.D. 2023", "judges": "Covey", + "citations": "", "case_name_shorts": "UCBR", "per_curiam": false }, @@ -80,6 +86,7 @@ "date_filed_is_approximate": false, "docket_numbers": "651 C.D. 2023", "judges": "Wallace", + "citations": "", "case_name_shorts": "S.E.N.", "per_curiam": false }, @@ -92,6 +99,7 @@ "date_filed_is_approximate": false, "docket_numbers": "469 C.D. 2023", "judges": "Dumas", + "citations": "", "case_name_shorts": "", "per_curiam": false }, @@ -104,6 +112,7 @@ "date_filed_is_approximate": false, "docket_numbers": "804 C.D. 2023", "judges": "Wojcik", + "citations": "", "case_name_shorts": "PPB", "per_curiam": false }, @@ -116,6 +125,7 @@ "date_filed_is_approximate": false, "docket_numbers": "229 C.D. 2022", "judges": "Ceisler", + "citations": "", "case_name_shorts": "", "per_curiam": false }, @@ -128,6 +138,7 @@ "date_filed_is_approximate": false, "docket_numbers": "824 C.D. 2023", "judges": "Leavitt", + "citations": "", "case_name_shorts": "", "per_curiam": false }, @@ -140,6 +151,7 @@ "date_filed_is_approximate": false, "docket_numbers": "264 & 1012 C.D. 2022", "judges": "Leadbetter", + "citations": "", "case_name_shorts": "", "per_curiam": false }, @@ -152,6 +164,7 @@ "date_filed_is_approximate": false, "docket_numbers": "390 C.D. 2023", "judges": "McCullough", + "citations": "", "case_name_shorts": "DHS", "per_curiam": false }, @@ -164,6 +177,7 @@ "date_filed_is_approximate": false, "docket_numbers": "506 C.D. 
2023", "judges": "Covey", + "citations": "", "case_name_shorts": "", "per_curiam": false }, @@ -176,6 +190,7 @@ "date_filed_is_approximate": false, "docket_numbers": "791 C.D. 2023", "judges": "Covey", + "citations": "", "case_name_shorts": "PPB", "per_curiam": false }, @@ -188,6 +203,7 @@ "date_filed_is_approximate": false, "docket_numbers": "164 M.D. 2023", "judges": "Covey", + "citations": "", "case_name_shorts": "", "per_curiam": false }, @@ -200,6 +216,7 @@ "date_filed_is_approximate": false, "docket_numbers": "44 C.D. 2023", "judges": "McCullough. Dumas", + "citations": "", "case_name_shorts": "", "per_curiam": false }, @@ -212,6 +229,7 @@ "date_filed_is_approximate": false, "docket_numbers": "35, 371 & 388 C.D. 2023", "judges": "Wallace", + "citations": "", "case_name_shorts": "", "per_curiam": false }, @@ -224,6 +242,7 @@ "date_filed_is_approximate": false, "docket_numbers": "369 M.D. 2023", "judges": "Ceisler", + "citations": "", "case_name_shorts": "", "per_curiam": false }, @@ -236,6 +255,7 @@ "date_filed_is_approximate": false, "docket_numbers": "33 M.D. 2024", "judges": "Cohn Jubelirer. 
McCullough", + "citations": "", "case_name_shorts": "", "per_curiam": false } diff --git a/tests/examples/opinions/united_states/pasuperct_example.compare.json b/tests/examples/opinions/united_states/pasuperct_example.compare.json index dfb92d772..fa1786b9b 100644 --- a/tests/examples/opinions/united_states/pasuperct_example.compare.json +++ b/tests/examples/opinions/united_states/pasuperct_example.compare.json @@ -8,6 +8,7 @@ "date_filed_is_approximate": false, "docket_numbers": "796 EDA 2024", "judges": "Bowes", + "citations": "", "case_name_shorts": "", "per_curiam": false }, @@ -20,6 +21,7 @@ "date_filed_is_approximate": false, "docket_numbers": "254 EDA 2024", "judges": "Bowes", + "citations": "", "case_name_shorts": "Com.", "per_curiam": false }, @@ -32,6 +34,7 @@ "date_filed_is_approximate": false, "docket_numbers": "9 MDA 2023", "judges": "Lazarus", + "citations": "", "case_name_shorts": "Com.", "per_curiam": false }, @@ -44,6 +47,7 @@ "date_filed_is_approximate": false, "docket_numbers": "650 EDA 2023", "judges": "Beck", + "citations": "", "case_name_shorts": "Com.", "per_curiam": false }, @@ -56,6 +60,7 @@ "date_filed_is_approximate": false, "docket_numbers": "650 EDA 2023", "judges": "Panella", + "citations": "", "case_name_shorts": "Com.", "per_curiam": false }, @@ -68,6 +73,7 @@ "date_filed_is_approximate": false, "docket_numbers": "1285 MDA 2023", "judges": "Panella", + "citations": "", "case_name_shorts": "Com.", "per_curiam": false }, @@ -80,6 +86,7 @@ "date_filed_is_approximate": false, "docket_numbers": "2466 EDA 2022", "judges": "Colins", + "citations": "", "case_name_shorts": "Com.", "per_curiam": false }, @@ -92,6 +99,7 @@ "date_filed_is_approximate": false, "docket_numbers": "1980 EDA 2023", "judges": "Sullivan", + "citations": "", "case_name_shorts": "Com.", "per_curiam": false }, @@ -104,6 +112,7 @@ "date_filed_is_approximate": false, "docket_numbers": "1134 WDA 2023", "judges": "Panella", + "citations": "", "case_name_shorts": "Com.", 
"per_curiam": false }, @@ -116,6 +125,7 @@ "date_filed_is_approximate": false, "docket_numbers": "345 MDA 2024", "judges": "Panella", + "citations": "", "case_name_shorts": "", "per_curiam": false }, @@ -128,6 +138,7 @@ "date_filed_is_approximate": false, "docket_numbers": "612 EDA 2023", "judges": "Dubow", + "citations": "", "case_name_shorts": "", "per_curiam": false }, @@ -140,6 +151,7 @@ "date_filed_is_approximate": false, "docket_numbers": "1652 MDA 2023", "judges": "Dubow", + "citations": "", "case_name_shorts": "", "per_curiam": false }, @@ -152,6 +164,7 @@ "date_filed_is_approximate": false, "docket_numbers": "2581 EDA 2023", "judges": "Dubow", + "citations": "", "case_name_shorts": "", "per_curiam": false }, @@ -164,6 +177,7 @@ "date_filed_is_approximate": false, "docket_numbers": "1710 EDA 2023", "judges": "Murray", + "citations": "", "case_name_shorts": "Com.", "per_curiam": false }, @@ -176,6 +190,7 @@ "date_filed_is_approximate": false, "docket_numbers": "1244 WDA 2023", "judges": "Panella", + "citations": "", "case_name_shorts": "Com.", "per_curiam": false }, @@ -188,6 +203,7 @@ "date_filed_is_approximate": false, "docket_numbers": "1205 MDA 2023", "judges": "Kunselman", + "citations": "", "case_name_shorts": "Com.", "per_curiam": false }, @@ -200,6 +216,7 @@ "date_filed_is_approximate": false, "docket_numbers": "1205 WDA 2023", "judges": "Sullivan", + "citations": "", "case_name_shorts": "Com.", "per_curiam": false }, @@ -212,6 +229,7 @@ "date_filed_is_approximate": false, "docket_numbers": "1171 MDA 2023", "judges": "Dubow", + "citations": "", "case_name_shorts": "Com.", "per_curiam": false }, @@ -224,6 +242,7 @@ "date_filed_is_approximate": false, "docket_numbers": "60 EDA 2023", "judges": "Colins", + "citations": "", "case_name_shorts": "Com.", "per_curiam": false }, @@ -236,6 +255,7 @@ "date_filed_is_approximate": false, "docket_numbers": "18 EDA 2024", "judges": "Murray", + "citations": "", "case_name_shorts": "Com.", "per_curiam": false }, @@ 
-248,7 +268,21 @@ "date_filed_is_approximate": false, "docket_numbers": "270 WDA 2024", "judges": "King", + "citations": "", "case_name_shorts": "Com.", "per_curiam": false + }, + { + "case_dates": "2021-05-27", + "case_names": "B.S.G. v. D.M.C.", + "download_urls": "https://www.pacourts.us/assets/opinions/Superior/out/J-A07035-21oo.pdf", + "precedential_statuses": "Published", + "blocked_statuses": false, + "date_filed_is_approximate": false, + "docket_numbers": "2000 EDA 2020", + "judges": "Stevens", + "citations": "255 A.3d 528", + "case_name_shorts": "B.S.G.", + "per_curiam": false } ] \ No newline at end of file diff --git a/tests/examples/opinions/united_states/pasuperct_example.json b/tests/examples/opinions/united_states/pasuperct_example.json index d5129f08e..1f24fc250 100644 --- a/tests/examples/opinions/united_states/pasuperct_example.json +++ b/tests/examples/opinions/united_states/pasuperct_example.json @@ -1,5 +1,59 @@ { "Items": [ + { + "Author": null, + "BoardDocketNumber": null, + "Caption": "B.S.G. v. D.M.C.\nNo. 
2000 EDA 2020\n255 A.3d 528", + "CourtDocketNumber": null, + "CourtType": 2, + "DispositionDate": "2021-05-27T00:00:00", + "Keywords": null, + "UserIdentifier": "The Superior Court Reporter", + "UploadDate": "0001-01-01T00:00:00", + "PostedToday": false, + "Postings": [ + { + "Id": 59145, + "AuthorId": "27", + "OpinionId": 53314, + "FileName": "J-A07035-21oo.pdf", + "ProcessedDate": "2021-05-27T00:00:00", + "PostingTypeId": "1", + "PublicationTypeId": 2, + "RenderedDate": "0001-01-01T00:00:00", + "SortOrder": 0, + "FileVersion": 1, + "Author": { + "Id": 27, + "AuthorName": "Stevens, P.J.E.", + "AuthorCode": null, + "Selectable": true, + "SortOrder": 7160 + }, + "PostType": { + "Id": 1, + "PostingTypeCode": "Opinion", + "PostingTypeId": "1", + "SortOrder": 1 + }, + "PublicationType": { + "Id": 2, + "Description": "Precedential", + "SortOrder": null + } + } + ], + "CreatedById": null, + "DeletedById": null, + "UpdatedById": null, + "CreatedOn": null, + "DeletedOn": null, + "UpdatedOn": null, + "CreatedBy": null, + "DeletedBy": null, + "UpdatedBy": null, + "Id": 53314 + }, { "Author": null, "BoardDocketNumber": null, diff --git a/tests/local/test_ScraperExtractFromTextTest.py b/tests/local/test_ScraperExtractFromTextTest.py index 1dd9551bc..0eb0ee327 100644 --- a/tests/local/test_ScraperExtractFromTextTest.py +++ b/tests/local/test_ScraperExtractFromTextTest.py @@ -691,6 +691,19 @@ class ScraperExtractFromText(unittest.TestCase): {}, ), ], + "juriscraper.opinions.united_states.state.pasuperct": [ + ( + "J-A13044-21\n\n 2021 PA Super 113\n\n\n COMMONWEALTH OF PENNSYLVANIA : IN THE SUPERIOR COURT OF\n : PENNSYLVANIA\n :\n ", + { + "Citation": { + "volume": "2021", + "reporter": "PA Super", + "page": "113", + "type": 8, + } + }, + ) + ], } def test_extract_from_text(self):