Skip to content

Commit

Permalink
parse_html.activity: about 30% speedup for html parsing (#66)
Browse files Browse the repository at this point in the history
A few optimizations
- using `.find` instead of `.select` is faster since it's not using CSS selectors
- using `SoupStrainer` is faster since it only does partial parsing and avoids materializing parts of soup we don't actually use

Measurements on a big `Chrome/MyActivity.html` file

- before
  - parsing (up to for loop over `outer_divs`): 17s
  - processing (everything in for loop): 16s

- after
  - parsing: 13s
  - processing: 11s
  • Loading branch information
karlicoss authored Sep 11, 2024
1 parent 9aea89f commit b79f5b3
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 6 deletions.
2 changes: 1 addition & 1 deletion google_takeout_parser/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def _serialize_default(obj: Any) -> Any:
if isinstance(obj, Exception):
return {"type": type(obj).__name__, "value": str(obj)}
elif dataclasses.is_dataclass(obj):
d = dataclasses.asdict(obj)
d = dataclasses.asdict(obj) # type: ignore[call-overload] # see https://github.com/python/mypy/issues/17550
assert "type" not in d
d["type"] = type(obj).__name__
return d
Expand Down
17 changes: 12 additions & 5 deletions google_takeout_parser/parse_html/activity.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from pathlib import Path
from datetime import datetime
from typing import List, Iterator, Optional, Tuple, Union, Dict, Iterable
from typing import Any, List, Iterator, Optional, Tuple, Union, Dict, Iterable
from urllib.parse import urlparse, parse_qs

import bs4
Expand Down Expand Up @@ -260,7 +260,7 @@ def _parse_activity_div(
*,
file_dt: Optional[datetime],
) -> Res[Activity]:
header_el = div.select_one("p.mdl-typography--title")
header_el = div.find("p", class_="mdl-typography--title")
if header_el is None:
return ValueError(f"Could not find header in {div}")
header = header_el.text.strip()
Expand All @@ -282,7 +282,7 @@ def _parse_activity_div(
# iterate over content-cells (contain all the info in this cell)
# and categorize the cells. Pretty sure there should only be one
# of each, but doing this to be safe
for d in div.select(".content-cell"):
for d in div.find_all(class_="content-cell"):
div_classes = d.attrs["class"]
# these are used for spacing on the right
if "mdl-typography--text-right" in div_classes:
Expand Down Expand Up @@ -334,8 +334,15 @@ def _parse_activity_div(

def _parse_html_activity(p: Path) -> Iterator[Res[Activity]]:
file_dt = datetime.fromtimestamp(p.stat().st_mtime)
soup = bs4.BeautifulSoup(p.read_text(), "lxml")
for outer_div in soup.select("div.outer-cell"):
data = p.read_text()

def soup_filter(tag: str, data: Dict[str, Any]) -> bool:
return tag == 'div' and 'outer-cell' in data.get('class', '')

soup = bs4.BeautifulSoup(data, "lxml", parse_only=bs4.SoupStrainer(soup_filter)) # type: ignore[arg-type] # this overload is missing from stubs

outer_divs: Iterable[bs4.element.Tag] = soup.children # type: ignore[assignment] # mypy can't guess they will actually be tags..
for outer_div in outer_divs:
try:
yield _parse_activity_div(outer_div, file_dt=file_dt)
except Exception as ae:
Expand Down

0 comments on commit b79f5b3

Please sign in to comment.