Merge pull request #2510 from moj-analytical-services/bug/columns-empty-table

Bug - get columns of DuckDB frame even when table is empty
ADBond authored Nov 18, 2024
2 parents fff3433 + d903d64 commit 7b5055a
Showing 3 changed files with 54 additions and 3 deletions.
14 changes: 13 additions & 1 deletion CHANGELOG.md
@@ -7,8 +7,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html)

## Unreleased

### Fixed

- Various bugfixes for `debug_mode` ([#2481](https://github.com/moj-analytical-services/splink/pull/2481))
- Clustering still works in DuckDB even if no edges are available ([#2510](https://github.com/moj-analytical-services/splink/pull/2510))

## [4.0.5] - 2024-11-06

### Fixed

- Dataframes to be registered when using `compare_two_records`, to avoid problems with data typing (because the input data can have an explicit schema) ([#2493](https://github.com/moj-analytical-services/splink/pull/2493))

## [4.0.4] - 2024-10-13
@@ -162,7 +169,12 @@ Major release - see our [blog](https://moj-analytical-services.github.io/splink/)
- Corrected path for Spark `.jar` file containing UDFs to work correctly for Spark < 3.0 ([#1622](https://github.com/moj-analytical-services/splink/pull/1622))
- Spark UDF `damerau_levensthein` is now only registered for Spark >= 3.0, as it is not compatible with earlier versions ([#1622](https://github.com/moj-analytical-services/splink/pull/1622))

-[unreleased]: https://github.com/moj-analytical-services/splink/compare/4.0.0...HEAD
+[Unreleased]: https://github.com/moj-analytical-services/splink/compare/4.0.5...HEAD
+[4.0.5]: https://github.com/moj-analytical-services/splink/compare/4.0.4...4.0.5
[4.0.4]: https://github.com/moj-analytical-services/splink/compare/4.0.3...4.0.4
[4.0.3]: https://github.com/moj-analytical-services/splink/compare/4.0.2...4.0.3
[4.0.2]: https://github.com/moj-analytical-services/splink/compare/4.0.1...4.0.2
[4.0.1]: https://github.com/moj-analytical-services/splink/compare/4.0.0...4.0.1
[4.0.0]: https://github.com/moj-analytical-services/splink/compare/3.9.15...4.0.0
[3.9.15]: https://github.com/moj-analytical-services/splink/compare/3.9.14...3.9.15
[3.9.14]: https://github.com/moj-analytical-services/splink/compare/3.9.13...3.9.14
9 changes: 7 additions & 2 deletions splink/internals/duckdb/dataframe.py
@@ -20,9 +20,14 @@ class DuckDBDataFrame(SplinkDataFrame):

     @property
     def columns(self) -> list[InputColumn]:
-        d = self.as_record_dict(1)[0]
+        sql = (
+            f"SELECT column_name FROM information_schema.columns "
+            f"WHERE table_name = '{self.physical_name}'"
+        )
+        col_strings = (self.db_api._execute_sql_against_backend(sql).to_df().to_dict())[
+            "column_name"
+        ].values()

-        col_strings = list(d.keys())
         return [InputColumn(c, sqlglot_dialect_str="duckdb") for c in col_strings]

     def validate(self):
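The fix above swaps record-sampling for a schema-metadata query: the old code read the first record and took its keys, which has nothing to inspect when the table is empty. A minimal stand-alone sketch of the principle, using the stdlib `sqlite3` module as an analogue (SQLite's `PRAGMA table_info` standing in for DuckDB's `information_schema.columns`; the `edges` table and its columns are illustrative, not from the PR):

```python
import sqlite3

con = sqlite3.connect(":memory:")
# A table with a schema but zero rows -- the situation the bug hit
con.execute("CREATE TABLE edges (id_l INTEGER, id_r INTEGER, match_probability REAL)")

# Old approach: fetch the first record and take its keys.
# With no rows there is no first record, so nothing to inspect.
first_rows = con.execute("SELECT * FROM edges LIMIT 1").fetchall()
assert first_rows == []

# New approach: ask the schema metadata, which is populated regardless
# of row count (DuckDB exposes this via information_schema.columns).
cols = [row[1] for row in con.execute("PRAGMA table_info(edges)")]
print(cols)  # -> ['id_l', 'id_r', 'match_probability']
```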
34 changes: 34 additions & 0 deletions tests/test_clustering.py
@@ -63,3 +63,37 @@ def test_clustering(test_helpers, dialect, link_type, input_pd_tables):

     df_predict = linker.inference.predict()
     linker.clustering.cluster_pairwise_predictions_at_threshold(df_predict, 0.95)
+
+
+@mark_with_dialects_excluding()
+def test_clustering_no_edges(test_helpers, dialect):
+    helper = test_helpers[dialect]
+
+    df = pd.DataFrame(
+        [
+            {"id": 1, "first_name": "Andy", "surname": "Bandy", "city": "London"},
+            {"id": 2, "first_name": "Andi", "surname": "Bandi", "city": "London"},
+            {"id": 3, "first_name": "Terry", "surname": "Berry", "city": "Glasgow"},
+            {"id": 4, "first_name": "Terri", "surname": "Berri", "city": "Glasgow"},
+        ]
+    )
+
+    settings = SettingsCreator(
+        link_type="dedupe_only",
+        comparisons=[
+            cl.ExactMatch("first_name"),
+            cl.ExactMatch("surname"),
+            cl.ExactMatch("city"),
+        ],
+        blocking_rules_to_generate_predictions=[
+            block_on("surname"),
+            block_on("first_name"),
+        ],
+        unique_id_column_name="id",
+    )
+    linker_input = helper.convert_frame(df)
+    linker = Linker(linker_input, settings, **helper.extra_linker_args())
+
+    # due to blocking rules, df_predict will be empty
+    df_predict = linker.inference.predict()
+    linker.clustering.cluster_pairwise_predictions_at_threshold(df_predict, 0.95)

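The comment in the test is doing real work: `block_on("surname")` and `block_on("first_name")` only generate a candidate pair when two records share that exact value, and every value in the fixture is unique. A quick stand-alone check (plain Python, records copied from the test) that no pair qualifies under either rule, which is why `df_predict` comes out empty:

```python
import itertools

records = [
    {"id": 1, "first_name": "Andy", "surname": "Bandy", "city": "London"},
    {"id": 2, "first_name": "Andi", "surname": "Bandi", "city": "London"},
    {"id": 3, "first_name": "Terry", "surname": "Berry", "city": "Glasgow"},
    {"id": 4, "first_name": "Terri", "surname": "Berri", "city": "Glasgow"},
]

# A pair survives blocking only if it matches at least one blocking rule:
# exact equality on surname, or exact equality on first_name.
pairs = [
    (a["id"], b["id"])
    for a, b in itertools.combinations(records, 2)
    if a["surname"] == b["surname"] or a["first_name"] == b["first_name"]
]
print(pairs)  # -> [] : no candidate pairs, so clustering sees zero edges
```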