From d7eb628971b4fea0ede12eebfc00b2265f33af09 Mon Sep 17 00:00:00 2001 From: JD Rudie Date: Thu, 22 Feb 2024 19:19:25 -0500 Subject: [PATCH] feat: start converting pandas to polars --- poetry.lock | 87 +++++++++++++++++++++++++++++++++++++++++++++++++- pyproject.toml | 3 ++ src/gtfs.py | 32 +++++++++---------- 3 files changed, 105 insertions(+), 17 deletions(-) diff --git a/poetry.lock b/poetry.lock index 524b830..eb604b1 100644 --- a/poetry.lock +++ b/poetry.lock @@ -681,6 +681,43 @@ files = [ dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] +[[package]] +name = "polars" +version = "0.20.5" +description = "Blazingly fast DataFrame library" +optional = false +python-versions = ">=3.8" +files = [ + {file = "polars-0.20.5-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:19693d0815e7be757b2320a5ed988a209f9a505562ed937084b0c7d59109f6b7"}, + {file = "polars-0.20.5-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:6b8674a18b4915207ae46855e72b188391e341e519a72f24b9591ce5164b837d"}, + {file = "polars-0.20.5-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa5d8139020688b0a8f4cdf765df17fe9fa4c8defac6361412bd4bc80a12433c"}, + {file = "polars-0.20.5-cp38-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:d9d069bb4e0cad8efbd7e6211d68e65698d50e77e72490565e52ff035236c08e"}, + {file = "polars-0.20.5-cp38-abi3-win_amd64.whl", hash = "sha256:4d614503f963cd5a8cea3240e7fd9f56b6e574d00ef80091e8689bb6defaf880"}, + {file = "polars-0.20.5.tar.gz", hash = "sha256:fa4abc22cee024b5872961ddcd8a13a0a76150df345e21ce4308c2b1a36b47aa"}, +] + +[package.extras] +adbc = ["adbc_driver_sqlite"] +all = ["polars[adbc,cloudpickle,connectorx,deltalake,fsspec,gevent,numpy,pandas,plot,pyarrow,pydantic,pyiceberg,sqlalchemy,timezone,xlsx2csv,xlsxwriter]"] +cloudpickle = ["cloudpickle"] +connectorx = ["connectorx (>=0.3.2)"] +deltalake = ["deltalake (>=0.14.0)"] +fsspec = ["fsspec"] +gevent = ["gevent"] +matplotlib = ["matplotlib"] +numpy = ["numpy (>=1.16.0)"] +openpyxl = ["openpyxl (>=3.0.0)"] +pandas = ["pandas", "pyarrow (>=7.0.0)"] +plot = ["hvplot (>=0.9.1)"] +pyarrow = ["pyarrow (>=7.0.0)"] +pydantic = ["pydantic"] +pyiceberg = ["pyiceberg (>=0.5.0)"] +pyxlsb = ["pyxlsb (>=1.0)"] +sqlalchemy = ["pandas", "sqlalchemy"] +timezone = ["backports.zoneinfo", "tzdata"] +xlsx2csv = ["xlsx2csv (>=0.8.0)"] +xlsxwriter = ["xlsxwriter"] + [[package]] name = "protobuf" version = "4.25.2" @@ -701,6 +738,54 @@ files = [ {file = "protobuf-4.25.2.tar.gz", hash = "sha256:fe599e175cb347efc8ee524bcd4b902d11f7262c0e569ececcb89995c15f0a5e"}, ] +[[package]] +name = "pyarrow" +version = "14.0.2" +description = "Python library for Apache Arrow" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pyarrow-14.0.2-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:ba9fe808596c5dbd08b3aeffe901e5f81095baaa28e7d5118e01354c64f22807"}, + {file = "pyarrow-14.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:22a768987a16bb46220cef490c56c671993fbee8fd0475febac0b3e16b00a10e"}, + {file = "pyarrow-14.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2dbba05e98f247f17e64303eb876f4a80fcd32f73c7e9ad975a83834d81f3fda"}, + {file = "pyarrow-14.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a898d134d00b1eca04998e9d286e19653f9d0fcb99587310cd10270907452a6b"}, + {file = "pyarrow-14.0.2-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:87e879323f256cb04267bb365add7208f302df942eb943c93a9dfeb8f44840b1"}, + {file = "pyarrow-14.0.2-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:76fc257559404ea5f1306ea9a3ff0541bf996ff3f7b9209fc517b5e83811fa8e"}, + {file = "pyarrow-14.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:b0c4a18e00f3a32398a7f31da47fefcd7a927545b396e1f15d0c85c2f2c778cd"}, + {file = "pyarrow-14.0.2-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:87482af32e5a0c0cce2d12eb3c039dd1d853bd905b04f3f953f147c7a196915b"}, + {file = "pyarrow-14.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:059bd8f12a70519e46cd64e1ba40e97eae55e0cbe1695edd95384653d7626b23"}, + {file = "pyarrow-14.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3f16111f9ab27e60b391c5f6d197510e3ad6654e73857b4e394861fc79c37200"}, + {file = "pyarrow-14.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:06ff1264fe4448e8d02073f5ce45a9f934c0f3db0a04460d0b01ff28befc3696"}, + {file = "pyarrow-14.0.2-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:6dd4f4b472ccf4042f1eab77e6c8bce574543f54d2135c7e396f413046397d5a"}, + {file = "pyarrow-14.0.2-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:32356bfb58b36059773f49e4e214996888eeea3a08893e7dbde44753799b2a02"}, + {file = "pyarrow-14.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:52809ee69d4dbf2241c0e4366d949ba035cbcf48409bf404f071f624ed313a2b"}, + {file = "pyarrow-14.0.2-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:c87824a5ac52be210d32906c715f4ed7053d0180c1060ae3ff9b7e560f53f944"}, + {file = "pyarrow-14.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a25eb2421a58e861f6ca91f43339d215476f4fe159eca603c55950c14f378cc5"}, + {file = "pyarrow-14.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c1da70d668af5620b8ba0a23f229030a4cd6c5f24a616a146f30d2386fec422"}, + {file = "pyarrow-14.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2cc61593c8e66194c7cdfae594503e91b926a228fba40b5cf25cc593563bcd07"}, + {file = "pyarrow-14.0.2-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:78ea56f62fb7c0ae8ecb9afdd7893e3a7dbeb0b04106f5c08dbb23f9c0157591"}, + {file = "pyarrow-14.0.2-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:37c233ddbce0c67a76c0985612fef27c0c92aef9413cf5aa56952f359fcb7379"}, + {file = "pyarrow-14.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:e4b123ad0f6add92de898214d404e488167b87b5dd86e9a434126bc2b7a5578d"}, + {file = "pyarrow-14.0.2-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:e354fba8490de258be7687f341bc04aba181fc8aa1f71e4584f9890d9cb2dec2"}, + {file = "pyarrow-14.0.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:20e003a23a13da963f43e2b432483fdd8c38dc8882cd145f09f21792e1cf22a1"}, + {file = "pyarrow-14.0.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc0de7575e841f1595ac07e5bc631084fd06ca8b03c0f2ecece733d23cd5102a"}, + {file = "pyarrow-14.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:66e986dc859712acb0bd45601229021f3ffcdfc49044b64c6d071aaf4fa49e98"}, + {file = "pyarrow-14.0.2-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:f7d029f20ef56673a9730766023459ece397a05001f4e4d13805111d7c2108c0"}, + {file = "pyarrow-14.0.2-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:209bac546942b0d8edc8debda248364f7f668e4aad4741bae58e67d40e5fcf75"}, + {file = "pyarrow-14.0.2-cp38-cp38-win_amd64.whl", hash = "sha256:1e6987c5274fb87d66bb36816afb6f65707546b3c45c44c28e3c4133c010a881"}, + {file = "pyarrow-14.0.2-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:a01d0052d2a294a5f56cc1862933014e696aa08cc7b620e8c0cce5a5d362e976"}, + {file = "pyarrow-14.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a51fee3a7db4d37f8cda3ea96f32530620d43b0489d169b285d774da48ca9785"}, + {file = "pyarrow-14.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:64df2bf1ef2ef14cee531e2dfe03dd924017650ffaa6f9513d7a1bb291e59c15"}, + {file = "pyarrow-14.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3c0fa3bfdb0305ffe09810f9d3e2e50a2787e3a07063001dcd7adae0cee3601a"}, + {file = "pyarrow-14.0.2-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:c65bf4fd06584f058420238bc47a316e80dda01ec0dfb3044594128a6c2db794"}, + {file = "pyarrow-14.0.2-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:63ac901baec9369d6aae1cbe6cca11178fb018a8d45068aaf5bb54f94804a866"}, + {file = "pyarrow-14.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:75ee0efe7a87a687ae303d63037d08a48ef9ea0127064df18267252cfe2e9541"}, + {file = "pyarrow-14.0.2.tar.gz", hash = "sha256:36cef6ba12b499d864d1def3e990f97949e0b79400d08b7cf74504ffbd3eb025"}, +] + +[package.dependencies] +numpy = ">=1.16.6" + [[package]] name = "pycodestyle" version = "2.11.1" @@ -1002,4 +1087,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = ">=3.11" -content-hash = "4bf96fbbfc0945424938f3abd29d21ce9be9b5e92038d9465a019b9a75e0f0ab" +content-hash = "153b78b244a058720f2305259f286370deec58b73ff0273b349a6c54d8a27100" diff --git a/pyproject.toml b/pyproject.toml index 3b96ef1..5f1f06f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,9 @@ requests = "2.31.0" boto3 = "^1.34.20" ddtrace = "^2.4.0" python-json-logger = "^2.0.7" +polars = "^0.20.5" +numpy = "^1.26.3" +pyarrow = "^14.0.2" [tool.poetry.dev-dependencies] pip = ">=23.1.0" diff --git a/src/gtfs.py b/src/gtfs.py index caa2f73..e60178e 100644 --- a/src/gtfs.py +++ b/src/gtfs.py @@ -1,5 +1,6 @@ import datetime import pandas as pd +import polars as pl import pathlib import shutil import urllib.request @@ -169,40 +170,39 @@ def add_gtfs_headways(events_df: pd.DataFrame, all_trips: pd.DataFrame, all_stop trip_start_times = gtfs_stops.groupby("trip_id").arrival_time.transform("min") gtfs_stops["scheduled_tt"] = (gtfs_stops.arrival_time - trip_start_times).dt.seconds + gtfs_stops_pl = pl.from_pandas(gtfs_stops) + # assign each actual timepoint a scheduled headway # merge_asof 'backward' matches the previous scheduled value of 'arrival_time' days_events["arrival_time"] = days_events.event_time - pd.Timestamp(service_date).tz_localize("US/Eastern") - augmented_events = pd.merge_asof( - days_events.sort_values(by="arrival_time"), - gtfs_stops[RTE_DIR_STOP + ["arrival_time", "scheduled_headway"]], + days_events = pl.from_pandas(days_events) + augmented_events = days_events.sort(by="arrival_time").join_asof( + gtfs_stops_pl[RTE_DIR_STOP + ["arrival_time", "scheduled_headway"]], on="arrival_time", - direction="backward", + strategy="backward", by=RTE_DIR_STOP, ) # assign each actual trip a scheduled trip_id, based on when it started the route - route_starts = days_events.loc[days_events.groupby("trip_id").event_time.idxmin()] + route_starts = days_events.select(days_events.groupby("trip_id").agg("event_time")) route_starts = route_starts[RTE_DIR_STOP + ["trip_id", "arrival_time"]] - trip_id_map = pd.merge_asof( - route_starts.sort_values(by="arrival_time"), - gtfs_stops[RTE_DIR_STOP + ["arrival_time", "trip_id"]], + trip_id_map = route_starts.sort(by="arrival_time").join_asof( + gtfs_stops_pl[RTE_DIR_STOP + ["arrival_time", "trip_id"]], on="arrival_time", - direction="nearest", + strategy="nearest", by=RTE_DIR_STOP, - suffixes=["", "_scheduled"], + suffix="_scheduled" ) - trip_id_map = trip_id_map.set_index("trip_id").trip_id_scheduled + trip_id_map = trip_id_map.select("trip_id_scheduled") # use the scheduled trip matching to get the scheduled traveltime - augmented_events["scheduled_trip_id"] = augmented_events.trip_id.map(trip_id_map) - augmented_events = pd.merge( - augmented_events, - gtfs_stops[RTE_DIR_STOP + ["trip_id", "scheduled_tt"]], + augmented_events["scheduled_trip_id"] = augmented_events["trip_id"].apply(trip_id_map) + augmented_events = augmented_events.join(gtfs_stops_pl[RTE_DIR_STOP + ["trip_id", "scheduled_tt"]], how="left", left_on=RTE_DIR_STOP + ["scheduled_trip_id"], right_on=RTE_DIR_STOP + ["trip_id"], - suffixes=["", "_gtfs"], + suffix="_gtfs", ) # finally, put all the days together