Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(pa): collect neutral citations and regional citations; also paginate results #1255

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 18 additions & 4 deletions juriscraper/opinions/united_states/state/pa.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,20 @@ class Site(OpinionSiteLinear):
court = "Supreme"
base_url = "https://www.pacourts.us/api/opinion?"
document_url = "https://www.pacourts.us/assets/opinions/{}/out/{}"
days_interval = 20
days_interval = 1
api_dt_format = "%Y-%m-%dT00:00:00-05:00"
first_opinion_date = datetime(1998, 4, 27)
judge_key = "AuthorCode"
regional_cite_regex = re.compile(r"\d{1,3} A\.3d \d+")

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.court_id = self.__module__
self.regex = re.compile(r"(.*)(?:[,-]?\s+Nos?\.)(.*)")
self.status = "Published"

now = datetime.now() + timedelta(days=1)
start = now - timedelta(days=7)
now = datetime.now()
start = now - timedelta(days=1)
self.params = {
"startDate": start.strftime(self.api_dt_format),
"endDate": now.strftime(self.api_dt_format),
Expand All @@ -50,9 +51,13 @@ def _process_html(self) -> None:
json_response = self.html

for cluster in json_response["Items"]:
title = cluster["Caption"]
disposition_date = cluster["DispositionDate"].split("T")[0]
title = cluster["Caption"]
name, docket = self.parse_case_title(title)
# A.3d cites seem to exist only for pasuperct
cite = ""
if cite_match := self.regional_cite_regex.search(title):
cite = cite_match.group(0)

for op in cluster["Postings"]:
per_curiam = False
Expand All @@ -75,9 +80,18 @@ def _process_html(self) -> None:
"judge": author_str,
"status": status,
"per_curiam": per_curiam,
"citation": cite,
}
)

if not self.test_mode_enabled() and json_response.get("HasNext"):
next_page = json_response["PageNumber"] + 1
logger.info("Paginating to page %s", next_page)
self.params["pageNumber"] = next_page
self.url = f"{self.base_url}{urlencode(self.params)}"
self.html = self._download()
self._process_html()

def parse_case_title(self, title: str) -> Tuple[str, str]:
"""Separates case_name and docket_number from case string

Expand Down
6 changes: 5 additions & 1 deletion juriscraper/opinions/united_states/state/pacommwct.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,16 @@
from urllib.parse import urlencode

from juriscraper.opinions.united_states.state import pasuperct
from juriscraper.OpinionSite import OpinionSite


class Site(pasuperct.Site):
court = "Commonwealth"
days_interval = 30
first_opinion_date = datetime(1998, 8, 17)
# Deactivate extract_from_text from parent class
# and avoid triggering the example requirement from
# tests.local.test_ScraperExtractFromTextTest
extract_from_text = OpinionSite.extract_from_text

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
Expand Down
16 changes: 14 additions & 2 deletions juriscraper/opinions/united_states/state/pasuperct.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,14 @@

import re
from datetime import datetime
from typing import Dict
from typing import Dict, Optional
from urllib.parse import urlencode

from juriscraper.opinions.united_states.state import pa


class Site(pa.Site):
court = "Superior"
days_interval = 20
first_opinion_date = datetime(1998, 2, 15)
judge_key = "AuthorName"

Expand Down Expand Up @@ -61,3 +60,16 @@ def clean_judge(self, author_str: str) -> str:
" by ", " "
)
return author_str

def extract_from_text(self, scraped_text: str) -> Optional[Dict]:
    """Get the neutral citation from the first lines of the first page

    Only the first 200 characters of the text are searched, looking for
    a citation shaped like "2024 PA Super 123". Not all scraped opinions
    have one.

    :param scraped_text: plain text extracted from the opinion document
    :return: {"Citation": {...}} holding the "volume", "reporter" and
        "page" strings captured by the regex plus a numeric "type";
        None when the text is empty or no citation is found
    """
    # Nothing to search in; this also avoids slicing a None value
    if not scraped_text:
        return None

    neutral_citation_regex = (
        r"(?P<volume>\d{4}) (?P<reporter>PA Super) (?P<page>\d+)"
    )
    # Restrict the search to the document header so that citations to
    # other opinions further down in the body are not picked up
    cite_match = re.search(neutral_citation_regex, scraped_text[:200])
    if cite_match is None:
        return None

    cite_data = cite_match.groupdict()
    cite_data["type"] = 8  # Neutral citation
    return {"Citation": cite_data}
4 changes: 4 additions & 0 deletions tests/examples/opinions/united_states/pa_example.compare.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "108 MAP 2023",
"judges": "Dougherty, Kevin M.",
"citations": "",
"case_name_shorts": "",
"per_curiam": false
},
Expand All @@ -20,6 +21,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "108 MAP 2023",
"judges": "Brobson, P. Kevin",
"citations": "",
"case_name_shorts": "",
"per_curiam": false
},
Expand All @@ -32,6 +34,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "108 MAP 2023",
"judges": "Wecht, David N.",
"citations": "",
"case_name_shorts": "",
"per_curiam": false
},
Expand All @@ -44,6 +47,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "212 WAL 2024",
"judges": "",
"citations": "",
"case_name_shorts": "",
"per_curiam": false
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "526 C.D. 2023",
"judges": "Wallace",
"citations": "",
"case_name_shorts": "PPB",
"per_curiam": false
},
Expand All @@ -20,6 +21,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "168 M.D. 2023",
"judges": "Wallace",
"citations": "",
"case_name_shorts": "",
"per_curiam": false
},
Expand All @@ -32,6 +34,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "337 M.D. 2023",
"judges": "Covey",
"citations": "",
"case_name_shorts": "",
"per_curiam": false
},
Expand All @@ -44,6 +47,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "379 M.D. 2024",
"judges": "Leadbetter",
"citations": "",
"case_name_shorts": "",
"per_curiam": false
},
Expand All @@ -56,6 +60,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "374 C.D. 2023",
"judges": "Leavitt",
"citations": "",
"case_name_shorts": "",
"per_curiam": false
},
Expand All @@ -68,6 +73,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "716 C.D. 2023",
"judges": "Covey",
"citations": "",
"case_name_shorts": "UCBR",
"per_curiam": false
},
Expand All @@ -80,6 +86,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "651 C.D. 2023",
"judges": "Wallace",
"citations": "",
"case_name_shorts": "S.E.N.",
"per_curiam": false
},
Expand All @@ -92,6 +99,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "469 C.D. 2023",
"judges": "Dumas",
"citations": "",
"case_name_shorts": "",
"per_curiam": false
},
Expand All @@ -104,6 +112,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "804 C.D. 2023",
"judges": "Wojcik",
"citations": "",
"case_name_shorts": "PPB",
"per_curiam": false
},
Expand All @@ -116,6 +125,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "229 C.D. 2022",
"judges": "Ceisler",
"citations": "",
"case_name_shorts": "",
"per_curiam": false
},
Expand All @@ -128,6 +138,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "824 C.D. 2023",
"judges": "Leavitt",
"citations": "",
"case_name_shorts": "",
"per_curiam": false
},
Expand All @@ -140,6 +151,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "264 & 1012 C.D. 2022",
"judges": "Leadbetter",
"citations": "",
"case_name_shorts": "",
"per_curiam": false
},
Expand All @@ -152,6 +164,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "390 C.D. 2023",
"judges": "McCullough",
"citations": "",
"case_name_shorts": "DHS",
"per_curiam": false
},
Expand All @@ -164,6 +177,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "506 C.D. 2023",
"judges": "Covey",
"citations": "",
"case_name_shorts": "",
"per_curiam": false
},
Expand All @@ -176,6 +190,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "791 C.D. 2023",
"judges": "Covey",
"citations": "",
"case_name_shorts": "PPB",
"per_curiam": false
},
Expand All @@ -188,6 +203,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "164 M.D. 2023",
"judges": "Covey",
"citations": "",
"case_name_shorts": "",
"per_curiam": false
},
Expand All @@ -200,6 +216,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "44 C.D. 2023",
"judges": "McCullough. Dumas",
"citations": "",
"case_name_shorts": "",
"per_curiam": false
},
Expand All @@ -212,6 +229,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "35, 371 & 388 C.D. 2023",
"judges": "Wallace",
"citations": "",
"case_name_shorts": "",
"per_curiam": false
},
Expand All @@ -224,6 +242,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "369 M.D. 2023",
"judges": "Ceisler",
"citations": "",
"case_name_shorts": "",
"per_curiam": false
},
Expand All @@ -236,6 +255,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "33 M.D. 2024",
"judges": "Cohn Jubelirer. McCullough",
"citations": "",
"case_name_shorts": "",
"per_curiam": false
}
Expand Down
Loading
Loading