From b930293e1f414a089ac337f10baeb9020cc06b62 Mon Sep 17 00:00:00 2001 From: Miguel Habana Date: Thu, 26 Sep 2024 05:54:29 +0200 Subject: [PATCH] Add some additional fields to existing models (#76) * add mypy types for some packages * update semantic location history mock data * update playstoreinstall, location fields * update some mappings for browser history --- google_takeout_parser/locales/en.py | 1 + google_takeout_parser/models.py | 19 ++- google_takeout_parser/parse_json.py | 12 +- tests/test_json.py | 199 +++++++++++++++++++++++++++- 4 files changed, 220 insertions(+), 11 deletions(-) diff --git a/google_takeout_parser/locales/en.py b/google_takeout_parser/locales/en.py index 3d76139..3f90962 100644 --- a/google_takeout_parser/locales/en.py +++ b/google_takeout_parser/locales/en.py @@ -42,6 +42,7 @@ # HANDLER_MAP: HandlerMap = { r"Chrome/BrowserHistory.json": _parse_chrome_history, + r"Chrome/History.json": _parse_chrome_history, # Seems to have been renamed from BrowserHistory.json to History.json sometime between Oct 2023 and Sep 2024 r"Chrome": None, # Ignore rest of Chrome stuff r"Google Play Store/Installs.json": _parse_app_installs, r"Google Play Store/": None, # ignore anything else in Play Store diff --git a/google_takeout_parser/models.py b/google_takeout_parser/models.py index efafc6a..fd102f3 100644 --- a/google_takeout_parser/models.py +++ b/google_takeout_parser/models.py @@ -167,12 +167,22 @@ def key(self) -> int: @dataclass class PlayStoreAppInstall(BaseEvent): title: str - dt: datetime - device_name: Optional[str] + lastUpdateTime: datetime # timestamp for when the installation event occurred + firstInstallationTime: datetime # timestamp for when you first installed the app on the given device + deviceName: Optional[str] + deviceCarrier: Optional[str] + deviceManufacturer: Optional[str] + + # noticed that lastUpdateTime was more accurate timestamp for the dt field + # since different installation events of the same app had pretty close 
firstInstallation times + # but the lastUpdate time was always at a later timestamp so I assumed it was the installation event + @property + def dt(self) -> datetime: + return self.lastUpdateTime # previously returned the firstInstallationTime @property def key(self) -> int: - return int(self.dt.timestamp()) + return int(self.lastUpdateTime.timestamp()) @dataclass @@ -180,6 +190,8 @@ class Location(BaseEvent): lat: float lng: float accuracy: Optional[float] + deviceTag: Optional[int] + source: Optional[str] dt: datetime @property @@ -264,6 +276,7 @@ class ChromeHistory(BaseEvent): title: str url: Url dt: datetime + pageTransition: Optional[str] @property def key(self) -> Tuple[str, int]: diff --git a/google_takeout_parser/parse_json.py b/google_takeout_parser/parse_json.py index 2ebcced..3136abd 100644 --- a/google_takeout_parser/parse_json.py +++ b/google_takeout_parser/parse_json.py @@ -126,8 +126,11 @@ def _parse_app_installs(p: Path) -> Iterator[Res[PlayStoreAppInstall]]: try: yield PlayStoreAppInstall( title=japp["install"]["doc"]["title"], - device_name=japp["install"]["deviceAttribute"].get("deviceDisplayName"), - dt=parse_json_utc_date(japp["install"]["firstInstallationTime"]), + deviceName=japp.get("install", {}).get("deviceAttribute", {}).get("deviceDisplayName"), + deviceCarrier=japp.get("install", {}).get("deviceAttribute", {}).get("carrier"), + deviceManufacturer=japp.get("install", {}).get("deviceAttribute", {}).get("manufacturer"), + lastUpdateTime=parse_json_utc_date(japp["install"]["lastUpdateTime"]), + firstInstallationTime=parse_json_utc_date(japp['install']['firstInstallationTime']), ) except Exception as e: yield e @@ -149,12 +152,16 @@ def _parse_location_history(p: Path) -> Iterator[Res[Location]]: yield RuntimeError(f"Locations: no 'locations' key in '{p}'") for loc in json_data.get("locations", []): accuracy = loc.get("accuracy") + deviceTag = loc.get("deviceTag") + source = loc.get("source") try: yield Location( 
lng=float(loc["longitudeE7"]) / 1e7, lat=float(loc["latitudeE7"]) / 1e7, dt=_parse_timestamp_key(loc, "timestamp"), accuracy=None if accuracy is None else float(accuracy), + deviceTag=None if deviceTag is None else int(deviceTag), + source=None if source is None else str(source), ) except Exception as e: yield e @@ -259,6 +266,7 @@ def _parse_chrome_history(p: Path) -> Iterator[Res[ChromeHistory]]: # and there's likely lots of items that aren't https url=item["url"], dt=time_naive.replace(tzinfo=timezone.utc), + pageTransition=item.get("page_transition") ) except Exception as e: yield e diff --git a/tests/test_json.py b/tests/test_json.py index c432c59..9a3f6f0 100644 --- a/tests/test_json.py +++ b/tests/test_json.py @@ -81,18 +81,23 @@ def test_parse_likes_json(tmp_path_f: Path) -> None: def test_parse_app_installs(tmp_path_f: Path) -> None: - contents = """[{"install": {"doc": {"documentType": "Android Apps", "title": "Discord - Talk, Video Chat & Hang Out with Friends"}, "firstInstallationTime": "2020-05-25T03:11:53.055Z", "deviceAttribute": {"manufacturer": "motorola", "deviceDisplayName": "motorola moto g(7) play"}, "lastUpdateTime": "2020-08-27T02:55:33.259Z"}}]""" + contents = """[{"install":{"doc":{"documentType":"Android Apps","title":"ClickUp - Manage Teams & Tasks"},"firstInstallationTime":"2022-03-14T07:06:12.070725Z","deviceAttribute":{"model":"SM-S901E","carrier":"Vodafone","manufacturer":"samsung","deviceDisplayName":"samsung SM-S901E"},"lastUpdateTime":"2024-08-27T22:55:15.184610Z"}}]""" fp = tmp_path_f / "file" fp.write_text(contents) res = list(prj._parse_app_installs(fp)) assert res == [ models.PlayStoreAppInstall( - title="Discord - Talk, Video Chat & Hang Out with Friends", - dt=datetime.datetime( - 2020, 5, 25, 3, 11, 53, 55000, tzinfo=datetime.timezone.utc + title="ClickUp - Manage Teams \u0026 Tasks", + lastUpdateTime=datetime.datetime( + 2024, 8, 27, 22, 55, 15, 184610, tzinfo=datetime.timezone.utc + ), + 
firstInstallationTime=datetime.datetime( + 2022, 3, 14, 7, 6, 12, 70725, tzinfo=datetime.timezone.utc ), - device_name="motorola moto g(7) play", + deviceName="samsung SM-S901E", + deviceCarrier="Vodafone", + deviceManufacturer="samsung", ) ] @@ -110,12 +115,14 @@ def test_location_old(tmp_path_f: Path) -> None: 2017, 12, 10, 23, 14, 58, tzinfo=datetime.timezone.utc ), accuracy=10.0, + deviceTag=None, + source=None, ), ] def test_location_new(tmp_path_f: Path) -> None: - contents = '{"locations": [{"latitudeE7": 351324213, "longitudeE7": -1122434441, "accuracy": 10, "deviceTag": -80241446968629135069, "deviceDesignation": "PRIMARY", "timestamp": "2017-12-10T23:14:58.030Z"}]}' + contents = '{"locations": [{"latitudeE7": 351324213, "longitudeE7": -1122434441, "accuracy": 10, "deviceTag": -8024144696862913506, "deviceDesignation": "PRIMARY", "timestamp": "2017-12-10T23:14:58.030Z"}]}' fp = tmp_path_f / "file" fp.write_text(contents) res = list(prj._parse_location_history(fp)) @@ -127,6 +134,27 @@ def test_location_new(tmp_path_f: Path) -> None: 2017, 12, 10, 23, 14, 58, 30000, tzinfo=datetime.timezone.utc ), accuracy=10.0, + deviceTag=-8024144696862913506, + source=None, + ), + ] + + +def test_location_2024(tmp_path_f: Path) -> None: + contents = '{"locations":[{"latitudeE7":351324213,"longitudeE7":-1122434441,"accuracy":10,"activity":[{"activity":[{"type":"UNKNOWN","confidence":65},{"type":"IN_VEHICLE","confidence":27},{"type":"STILL","confidence":6},{"type":"ON_BICYCLE","confidence":2}],"timestamp":"2014-07-18T15:00:04.403Z"}],"source":"WIFI","deviceTag":1978796627,"timestamp":"2014-07-18T14:59:59.914Z"}]}' + fp = tmp_path_f / "file" + fp.write_text(contents) + res = list(prj._parse_location_history(fp)) + assert res == [ + models.Location( + lng=-112.2434441, + lat=35.1324213, + dt=datetime.datetime( + 2014, 7, 18, 14, 59, 59, 914000, tzinfo=datetime.timezone.utc + ), + accuracy=10.0, + deviceTag=1978796627, + source="WIFI", ), ] @@ -143,6 +171,7 @@ def 
test_chrome_history(tmp_path_f: Path) -> None: dt=datetime.datetime( 2021, 4, 2, 23, 4, 50, 134513, tzinfo=datetime.timezone.utc ), + pageTransition="LINK" ), ] @@ -242,3 +271,161 @@ def test_semantic_location_history(tmp_path_f: Path) -> None: ), ], ) + + +def test_semantic_location_history_2024(tmp_path_f: Path) -> None: + data = { + "timelineObjects": [ + { + "placeVisit": { + "location": { + "latitudeE7": 555555555, + "longitudeE7": -1066666666, + "placeId": "JK4E4P", + "address": "address", + "name": "name", + "sourceInfo": {"deviceTag": 987654321}, + "locationConfidence": 60.45, + }, + "duration": { + "startTimestamp": "2017-12-10T23:29:25.026Z", + "endTimestamp": "2017-12-11T01:20:06.106Z", + }, + "placeConfidence": "MEDIUM_CONFIDENCE", + "centerLatE7": 555555555, + "centerLngE7": -1666666666, + "visitConfidence": 65.45, + "otherCandidateLocations": [ + { + "latitudeE7": 423984239, + "longitudeE7": -1565656565, + "placeId": "XPRK4E4P", + "address": "address2", + "name": "name2", + "locationConfidence": 24.475897, + }, + { + "latitudeE7": 910000000, + "longitudeE7": -1000, + "semanticType": "TYPE_WORK", + }, + ], + "editConfirmationStatus": "NOT_CONFIRMED", + "locationConfidence": 55, + "placeVisitType": "SINGLE_PLACE", + "placeVisitImportance": "MAIN", + } + }, + { + "activitySegment": { + "startLocation": { + "latitudeE7": 555555555, + "longitudeE7": -1066666666 + }, + "endLocation": { + "latitudeE7": 555555567, + "longitudeE7": -1066666678 + }, + "duration": { + "startTimestamp": "2017-12-11T01:20:06.106Z", + "endTimestamp": "2017-12-11T01:40:06.106Z" + }, + "distance": 13071, + "activityType": "IN_PASSENGER_VEHICLE", + "confidence": "MEDIUM", + "activities": [{ + "activityType": "IN_PASSENGER_VEHICLE", + "probability": 85.514968640442 + }, { + "activityType": "MOTORCYCLING", + "probability": 8.858836042221917 + }, { + "activityType": "WALKING", + "probability": 4.7803567526550035 + }], + "waypointPath": { + "waypoints": [{ + "latE7": 123456789, + "lngE7": 
1210000000 + }, { + "latE7": 123456089, + "lngE7": 1210000200 + }, { + "latE7": 123456289, + "lngE7": 1210000500 + }], + "source": "INFERRED" + }, + "simplifiedRawPath": { + "points": [{ + "latE7": 123456489, + "lngE7": 1210000240, + "accuracyMeters": 10, + "timestamp": "2017-12-11T01:35:04Z" + }] + }, + "editConfirmationStatus": "NOT_CONFIRMED", + "parkingEvent": { + "location": { + "latitudeE7": 123456289, + "longitudeE7": 1210000500, + "accuracyMetres": 163 + }, + "method": "END_OF_ACTIVITY_SEGMENT", + "locationSource": "UNKNOWN", + "timestamp": "2017-12-11T01:40:06Z" + } + } + } + ] + } + fp = tmp_path_f / "file" + fp.write_text(json.dumps(data)) + res = list(prj._parse_semantic_location_history(fp)) + obj = res[0] + assert not isinstance(obj, Exception) + # remove JSON, compare manually below + assert obj == models.PlaceVisit( + lat=55.5555555, + lng=-106.6666666, + centerLat=55.5555555, + centerLng=-166.6666666, + name="name", + address="address", + locationConfidence=60.45, + placeId="JK4E4P", + startTime=datetime.datetime( + 2017, 12, 10, 23, 29, 25, 26000, tzinfo=datetime.timezone.utc + ), + endTime=datetime.datetime( + 2017, 12, 11, 1, 20, 6, 106000, tzinfo=datetime.timezone.utc + ), + sourceInfoDeviceTag=987654321, + placeConfidence="MEDIUM_CONFIDENCE", + placeVisitImportance="MAIN", + placeVisitType="SINGLE_PLACE", + visitConfidence=65.45, + editConfirmationStatus="NOT_CONFIRMED", + otherCandidateLocations=[ + models.CandidateLocation( + lat=42.3984239, + lng=-156.5656565, + name="name2", + address="address2", + locationConfidence=24.475897, + placeId="XPRK4E4P", + semanticType=None, + sourceInfoDeviceTag=None, + ), + models.CandidateLocation( + lat=91.0, + lng=-0.0001, + name=None, + address=None, + locationConfidence=None, + placeId=None, + semanticType='TYPE_WORK', + sourceInfoDeviceTag=None, + ), + ], + )