Skip to content

Commit

Permalink
parse_html.activity: about 30% speedup for html parsing (#66)
Browse files Browse the repository at this point in the history
A few optimizations
- using `.find` instead of `.select` is faster since it's not using CSS selectors
- using `SoupStrainer` is faster since it only does partial parsing and avoids materializing parts of soup we don't actually use

Measurements on a big `Chrome/MyActivity.html` file

- before
  - parsing (up to for loop over `outer_divs`): 17s
  - processing (everything in for loop): 16s

- after
  - parsing: 13s
  - processing: 11s
  • Loading branch information
karlicoss authored Sep 11, 2024
1 parent 9aea89f commit b79f5b3
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 6 deletions.
2 changes: 1 addition & 1 deletion google_takeout_parser/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def _serialize_default(obj: Any) -> Any:
if isinstance(obj, Exception):
return {"type": type(obj).__name__, "value": str(obj)}
elif dataclasses.is_dataclass(obj):
d = dataclasses.asdict(obj)
d = dataclasses.asdict(obj) # type: ignore[call-overload] # see https://github.com/python/mypy/issues/17550
assert "type" not in d
d["type"] = type(obj).__name__
return d
Expand Down
17 changes: 12 additions & 5 deletions google_takeout_parser/parse_html/activity.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from pathlib import Path
from datetime import datetime
from typing import List, Iterator, Optional, Tuple, Union, Dict, Iterable
from typing import Any, List, Iterator, Optional, Tuple, Union, Dict, Iterable
from urllib.parse import urlparse, parse_qs

import bs4
Expand Down Expand Up @@ -260,7 +260,7 @@ def _parse_activity_div(
*,
file_dt: Optional[datetime],
) -> Res[Activity]:
header_el = div.select_one("p.mdl-typography--title")
header_el = div.find("p", class_="mdl-typography--title")
if header_el is None:
return ValueError(f"Could not find header in {div}")
header = header_el.text.strip()
Expand All @@ -282,7 +282,7 @@ def _parse_activity_div(
# iterate over content-cells (contain all the info in this cell)
# and categorize the cells. Pretty sure there should only be one
# of each, but doing this to be safe
for d in div.select(".content-cell"):
for d in div.find_all(class_="content-cell"):
div_classes = d.attrs["class"]
# these are used for spacing on the right
if "mdl-typography--text-right" in div_classes:
Expand Down Expand Up @@ -334,8 +334,15 @@ def _parse_activity_div(

def _parse_html_activity(p: Path) -> Iterator[Res[Activity]]:
file_dt = datetime.fromtimestamp(p.stat().st_mtime)
soup = bs4.BeautifulSoup(p.read_text(), "lxml")
for outer_div in soup.select("div.outer-cell"):
data = p.read_text()

def soup_filter(tag: str, data: Dict[str, Any]) -> bool:
return tag == 'div' and 'outer-cell' in data.get('class', '')

soup = bs4.BeautifulSoup(data, "lxml", parse_only=bs4.SoupStrainer(soup_filter)) # type: ignore[arg-type] # this overload is missing from stubs

outer_divs: Iterable[bs4.element.Tag] = soup.children # type: ignore[assignment] # mypy can't guess they will actually be tags..
for outer_div in outer_divs:
try:
yield _parse_activity_div(outer_div, file_dt=file_dt)
except Exception as ae:
Expand Down

0 comments on commit b79f5b3

Please sign in to comment.