Skip to content

Commit

Permalink
Merge pull request #2514 from moj-analytical-services/maint/deps
Browse files Browse the repository at this point in the history
Update lockfile + fixes for latest package versions
  • Loading branch information
ADBond authored Nov 21, 2024
2 parents 1c8ad3a + af18a92 commit 9161712
Show file tree
Hide file tree
Showing 7 changed files with 828 additions and 628 deletions.
1,417 changes: 799 additions & 618 deletions poetry.lock

Large diffs are not rendered by default.

7 changes: 7 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,13 @@ sqlalchemy = ">=1.4.0"
# temporarily use binary version, to avoid issues with pg_config path
psycopg2-binary = ">=2.8.0"
igraph = ">=0.11.2"
# 2.2.2 is first version that supports numpy >= 2.0.0
# if we don't constrain this, we can end up with numpy >= 2.0.0 and pandas < 2.2.2
# particularly in pythons 3.10 and 3.11
pandas = [
{version= ">1.3.5", python = ">=3.8"},
{version=">=2.2.2", python = ">=3.10"},
]

[tool.poetry.group.linting]
[tool.poetry.group.linting.dependencies]
Expand Down
2 changes: 1 addition & 1 deletion scripts/postgres_docker/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
# run from root

# add -d for detached mode (run in background)
docker-compose -f scripts/postgres/docker-compose.yaml up
docker-compose -f scripts/postgres_docker/docker-compose.yaml up
2 changes: 1 addition & 1 deletion splink/internals/dialects.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ def default_date_format(self):

@property
def default_timestamp_format(self):
return "%Y-%m-%dT%H:%M:%S%Z"
return "%Y-%m-%dT%H:%M:%SZ"

def _try_parse_date_raw(self, name: str, date_format: str = None) -> str:
if date_format is None:
Expand Down
3 changes: 1 addition & 2 deletions splink/internals/spark/spark_helpers/custom_spark_dialect.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from sqlglot import exp
from sqlglot.dialects import Dialect, Spark
from sqlglot.generator import Generator as GeneratorSqlglot


def cast_as_double_edit(self, expression):
Expand All @@ -25,7 +24,7 @@ class Parser(Spark.Parser):
**Spark.Parser.FUNCTIONS,
}

class Generator(GeneratorSqlglot):
class Generator(Spark.Generator):
TYPE_MAPPING = {
**Spark.Generator.TYPE_MAPPING,
}
Expand Down
19 changes: 16 additions & 3 deletions tests/test_caching.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,28 @@
from tests.basic_settings import get_settings_dict

df = pd.read_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
__splink__dummy_frame = pd.DataFrame(["id"])
_dummy_pd_frame = pd.DataFrame(["id"])


def make_mock_execute(db_api):
# creates a mock version of linker._sql_to_splink_dataframe,
# so we can count calls
dummy_splink_df = DuckDBDataFrame("template", "__splink__dummy_frame", db_api)
dummy_table_name = "__splink__dummy_frame"
dummy_splink_df = DuckDBDataFrame("template", dummy_table_name, db_api)

def register_and_return_dummy_frame(*args, **kwargs):
        # need to make sure that the dummy frame always exists in the context
        # in which we are running tests
        # not actually interested in the frame itself, but it needs to exist in
        # the connection in case a method tries to access it
db_api._con.sql(
f"CREATE TABLE IF NOT EXISTS {dummy_table_name} AS "
f"SELECT * FROM _dummy_pd_frame"
)
return dummy_splink_df

mock_execute = create_autospec(
db_api._sql_to_splink_dataframe, return_value=dummy_splink_df
db_api._sql_to_splink_dataframe, side_effect=register_and_return_dummy_frame
)
return mock_execute

Expand Down
6 changes: 3 additions & 3 deletions tests/test_sql_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@


def move_l_r_test(br, expected):
res = move_l_r_table_prefix_to_column_suffix(br)
res = move_l_r_table_prefix_to_column_suffix(br, sqlglot_dialect="duckdb")
assert res.lower() == expected.lower()


Expand All @@ -20,8 +20,8 @@ def test_move_l_r_table_prefix_to_column_suffix():
expected = "first_name_l = first_name_r"
move_l_r_test(br, expected)

br = "substr(l.last_name, 1, 2) = substr(r.last_name, 1, 2)"
expected = "substr(last_name_l, 1, 2) = substr(last_name_r, 1, 2)"
br = "substring(l.last_name, 1, 2) = substring(r.last_name, 1, 2)"
expected = "substring(last_name_l, 1, 2) = substring(last_name_r, 1, 2)"
move_l_r_test(br, expected)

br = "l.name['first'] = r.name['first'] and levenshtein(l.dob, r.dob) < 2"
Expand Down

0 comments on commit 9161712

Please sign in to comment.