diff --git a/google_takeout_parser/parse_json.py b/google_takeout_parser/parse_json.py
index a99d550..cdea89f 100644
--- a/google_takeout_parser/parse_json.py
+++ b/google_takeout_parser/parse_json.py
@@ -6,6 +6,7 @@
 from pathlib import Path
 from datetime import datetime, timezone
 from typing import Iterator, Any, Dict, Iterable, Optional, List
+import warnings
 
 from .http_allowlist import convert_to_https_opt
 from .time_utils import parse_datetime_millis
@@ -25,6 +26,16 @@
 from .time_utils import parse_json_utc_date
 
 
+def _read_json_data(p: Path) -> Any:
+    try:
+        import orjson
+    except ModuleNotFoundError:
+        warnings.warn("orjson not found, it can significantly speed up json parsing. Consider installing via 'pip install orjson'. Falling back onto stdlib json")
+        return json.loads(p.read_text())
+    else:
+        return orjson.loads(p.read_bytes())
+
+
 # "YouTube and YouTube Music/history/search-history.json"
 # "YouTube and YouTube Music/history/watch-history.json"
 # This is also the 'My Activity' JSON format
@@ -124,7 +135,7 @@ def _parse_timestamp_key(d: Dict[str, Any], key: str) -> datetime:
 def _parse_location_history(p: Path) -> Iterator[Res[Location]]:
     ### HMMM, seems that all the locations are right after one another. broken? May just be all the location history that google has on me
     ### see numpy.diff(list(map(lambda yy: y.at, filter(lambda y: isinstance(Location), events()))))
-    json_data = json.loads(p.read_text())
+    json_data = _read_json_data(p)
     if "locations" not in json_data:
         yield RuntimeError(f"Locations: no 'locations' key in '{p}'")
     for loc in json_data.get("locations", []):
diff --git a/google_takeout_parser/time_utils.py b/google_takeout_parser/time_utils.py
index 729db0c..6843998 100644
--- a/google_takeout_parser/time_utils.py
+++ b/google_takeout_parser/time_utils.py
@@ -1,3 +1,4 @@
+import sys
 from typing import Union
 from datetime import datetime, timezone
 
@@ -10,11 +11,17 @@ def parse_datetime_millis(d: Union[str, float, int]) -> datetime:
     return parse_datetime_sec(int(d) / 1000)
 
 
-def parse_json_utc_date(ds: str) -> datetime:
-    utc_naive = datetime.fromisoformat(ds.rstrip("Z"))
-    return utc_naive.replace(tzinfo=timezone.utc)
+if sys.version_info[:2] >= (3, 11):
+    # from 3.11, it supports parsing strings ending with Z
+    parse_json_utc_date = datetime.fromisoformat
+else:
+    def parse_json_utc_date(ds: str) -> datetime:
+        utc_naive = datetime.fromisoformat(ds.rstrip("Z"))
+        return utc_naive.replace(tzinfo=timezone.utc)
 
 
 def test_parse_utc_date() -> None:
     expected = datetime(2021, 9, 30, 1, 44, 33, tzinfo=timezone.utc)
     assert parse_json_utc_date("2021-09-30T01:44:33.000Z") == expected
+
+    assert parse_json_utc_date("2023-01-27T22:46:47.389352Z") == datetime(2023, 1, 27, 22, 46, 47, 389352, tzinfo=timezone.utc)
diff --git a/setup.cfg b/setup.cfg
index 01f3900..d4736a1 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -49,6 +49,8 @@ console_scripts =
    google_takeout_parser = google_takeout_parser.__main__:main
 
 [options.extras_require]
+optional =
+    orjson
 testing =
     flake8
     mypy
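
Not part of the diff, but a minimal sketch of the reasoning behind the version gate in time_utils.py: from Python 3.11 onward, datetime.fromisoformat accepts the trailing "Z" used by the takeout JSON timestamps, while older interpreters raise ValueError on it, hence the fallback that strips the suffix and re-attaches timezone.utc (variable names below are illustrative):

import sys
from datetime import datetime, timezone

ds = "2023-01-27T22:46:47.389352Z"  # timestamp format seen in the takeout JSON

if sys.version_info[:2] >= (3, 11):
    # 3.11+ parses the trailing "Z" directly and returns an aware datetime
    parsed = datetime.fromisoformat(ds)
else:
    # older versions reject "Z", so strip it and mark the result as UTC
    parsed = datetime.fromisoformat(ds.rstrip("Z")).replace(tzinfo=timezone.utc)

assert parsed == datetime(2023, 1, 27, 22, 46, 47, 389352, tzinfo=timezone.utc)

The new optional extra in setup.cfg makes the faster orjson path an opt-in install, e.g. pip install google_takeout_parser[optional] (assuming the distribution name matches the repo); without it, _read_json_data warns and falls back to the stdlib json module.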