Skip to content

Commit

Permalink
parse_json._parse_location_history: speedup parsing about 50% (#67)
Browse files Browse the repository at this point in the history
* parse_json._parse_location_history: speedup parsing about 50%

- use orjson (added as an optional dependency) if it is available, with a fallback to the builtin json module
- use builtin fromisoformat from python 3.11, it's much faster (and simpler!)

- before:
  - parsing json 9.6s
  - processing data: 7.0s
- after:
  - parsing json: 5.7s
  - processing data: 3.1s

---------

Co-authored-by: Sean Breckenridge <seanbrecke@gmail.com>
  • Loading branch information
karlicoss and purarue authored Sep 11, 2024
1 parent b79f5b3 commit a3a402a
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 4 deletions.
13 changes: 12 additions & 1 deletion google_takeout_parser/parse_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from pathlib import Path
from datetime import datetime, timezone
from typing import Iterator, Any, Dict, Iterable, Optional, List
import warnings

from .http_allowlist import convert_to_https_opt
from .time_utils import parse_datetime_millis
Expand All @@ -25,6 +26,16 @@
from .time_utils import parse_json_utc_date


def _read_json_data(p: Path) -> Any:
try:
import orjson
except ModuleNotFoundError:
warnings.warn("orjson not found, it can significantly speed up json parsing. Consider installing via 'pip install orjson'. Falling back onto stdlib json")
return json.loads(p.read_text())
else:
return orjson.loads(p.read_bytes())


# "YouTube and YouTube Music/history/search-history.json"
# "YouTube and YouTube Music/history/watch-history.json"
# This is also the 'My Activity' JSON format
Expand Down Expand Up @@ -124,7 +135,7 @@ def _parse_timestamp_key(d: Dict[str, Any], key: str) -> datetime:
def _parse_location_history(p: Path) -> Iterator[Res[Location]]:
### HMMM, seems that all the locations are right after one another. broken? May just be all the location history that google has on me
### see numpy.diff(list(map(lambda yy: y.at, filter(lambda y: isinstance(Location), events()))))
json_data = json.loads(p.read_text())
json_data = _read_json_data(p)
if "locations" not in json_data:
yield RuntimeError(f"Locations: no 'locations' key in '{p}'")
for loc in json_data.get("locations", []):
Expand Down
13 changes: 10 additions & 3 deletions google_takeout_parser/time_utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import sys
from typing import Union
from datetime import datetime, timezone

Expand All @@ -10,11 +11,17 @@ def parse_datetime_millis(d: Union[str, float, int]) -> datetime:
return parse_datetime_sec(int(d) / 1000)


def parse_json_utc_date(ds: str) -> datetime:
utc_naive = datetime.fromisoformat(ds.rstrip("Z"))
return utc_naive.replace(tzinfo=timezone.utc)
# Pick the fastest available ISO-8601 parser at import time.
if sys.version_info[:2] >= (3, 11):
    # from 3.11, it supports parsing strings ending with Z
    # (datetime.fromisoformat accepts the trailing 'Z' natively and
    # returns a timezone-aware datetime, so no wrapper is needed)
    parse_json_utc_date = datetime.fromisoformat
else:
    def parse_json_utc_date(ds: str) -> datetime:
        # Older pythons reject the 'Z' suffix: strip it, parse the naive
        # timestamp, then attach UTC explicitly.
        # NOTE(review): assumes the input timestamp is always UTC ('Z'-suffixed),
        # as Google Takeout JSON exports are.
        utc_naive = datetime.fromisoformat(ds.rstrip("Z"))
        return utc_naive.replace(tzinfo=timezone.utc)


def test_parse_utc_date() -> None:
    """Check that Z-suffixed ISO timestamps parse to aware UTC datetimes."""
    cases = [
        ("2021-09-30T01:44:33.000Z", datetime(2021, 9, 30, 1, 44, 33, tzinfo=timezone.utc)),
        ("2023-01-27T22:46:47.389352Z", datetime(2023, 1, 27, 22, 46, 47, 389352, tzinfo=timezone.utc)),
    ]
    for raw, expected in cases:
        assert parse_json_utc_date(raw) == expected
2 changes: 2 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ console_scripts =
google_takeout_parser = google_takeout_parser.__main__:main

[options.extras_require]
optional =
orjson
testing =
flake8
mypy
Expand Down

0 comments on commit a3a402a

Please sign in to comment.