From c4f951a9ee7897bb70ad4ee4c443e7b0320e58da Mon Sep 17 00:00:00 2001 From: ADBond <48208438+ADBond@users.noreply.github.com> Date: Wed, 13 Nov 2024 16:52:39 +0000 Subject: [PATCH 1/4] test clustering works even if we have an empty edges table --- tests/test_clustering.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/tests/test_clustering.py b/tests/test_clustering.py index 2cd8bb8fd..ab85132fb 100644 --- a/tests/test_clustering.py +++ b/tests/test_clustering.py @@ -63,3 +63,37 @@ def test_clustering(test_helpers, dialect, link_type, input_pd_tables): df_predict = linker.inference.predict() linker.clustering.cluster_pairwise_predictions_at_threshold(df_predict, 0.95) + + +@mark_with_dialects_excluding() +def test_clustering_no_edges(test_helpers, dialect): + helper = test_helpers[dialect] + + df = pd.DataFrame( + [ + {"id": 1, "first_name": "Andy", "surname": "Bandy", "city": "London"}, + {"id": 2, "first_name": "Andi", "surname": "Bandi", "city": "London"}, + {"id": 3, "first_name": "Terry", "surname": "Berry", "city": "Glasgow"}, + {"id": 4, "first_name": "Terri", "surname": "Berri", "city": "Glasgow"}, + ] + ) + + settings = SettingsCreator( + link_type="dedupe_only", + comparisons=[ + cl.ExactMatch("first_name"), + cl.ExactMatch("surname"), + cl.ExactMatch("city"), + ], + blocking_rules_to_generate_predictions=[ + block_on("surname"), + block_on("first_name"), + ], + unique_id_column_name="id", + ) + linker_input = helper.convert_frame(df) + linker = Linker(linker_input, settings, **helper.extra_linker_args()) + + # due to blocking rules, df_predict will be empty + df_predict = linker.inference.predict() + linker.clustering.cluster_pairwise_predictions_at_threshold(df_predict, 0.95) From 69676d472adf7540656f25faccbdc71f0fa8fc9a Mon Sep 17 00:00:00 2001 From: ADBond <48208438+ADBond@users.noreply.github.com> Date: Wed, 13 Nov 2024 16:58:15 +0000 Subject: [PATCH 2/4] duckdb - columns from information_schema this handles the case where the table has no rows, and is also faster --- splink/internals/duckdb/dataframe.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/splink/internals/duckdb/dataframe.py b/splink/internals/duckdb/dataframe.py index 787946e2a..e51a8d7d3 100644 --- a/splink/internals/duckdb/dataframe.py +++ b/splink/internals/duckdb/dataframe.py @@ -20,9 +20,14 @@ class DuckDBDataFrame(SplinkDataFrame): @property def columns(self) -> list[InputColumn]: - d = self.as_record_dict(1)[0] + sql = ( + f"SELECT column_name FROM information_schema.columns " + f"WHERE table_name = '{self.physical_name}'" + ) + col_strings = (self.db_api._execute_sql_against_backend(sql).to_df().to_dict())[ + "column_name" + ].values() - col_strings = list(d.keys()) return [InputColumn(c, sqlglot_dialect_str="duckdb") for c in col_strings] def validate(self): From e7cc974ba05294ca67e6c917e94c1f2ada8e6339 Mon Sep 17 00:00:00 2001 From: ADBond <48208438+ADBond@users.noreply.github.com> Date: Mon, 18 Nov 2024 12:03:44 +0000 Subject: [PATCH 3/4] changelog entry --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 27454c8db..c6366fb75 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +### Fixed + +- Clustering still works in DuckDB even if no edges are available ([#2510](https://github.com/moj-analytical-services/splink/pull/2510)) + ## [4.0.5] - 2024-11-06 - Dataframes to be registered when using `compare_two_records`, to avoid problems with data typing (because the input data can have an explicit schema) ([#2493](https://github.com/moj-analytical-services/splink/pull/2493)) From d903d64bd39256235fc2a2263a2182502c711b8f Mon Sep 17 00:00:00 2001 From: ADBond <48208438+ADBond@users.noreply.github.com> Date: Mon, 18 Nov 2024 12:04:16 +0000 Subject: [PATCH 4/4] updating some changelog info --- CHANGELOG.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c6366fb75..cbcb03d00 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,10 +9,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed +- Various bugfixes for `debug_mode` ([#2481](https://github.com/moj-analytical-services/splink/pull/2481)) - Clustering still works in DuckDB even if no edges are available ([#2510](https://github.com/moj-analytical-services/splink/pull/2510)) ## [4.0.5] - 2024-11-06 +### Fixed + - Dataframes to be registered when using `compare_two_records`, to avoid problems with data typing (because the input data can have an explicit schema) ([#2493](https://github.com/moj-analytical-services/splink/pull/2493)) ## [4.0.4] - 2024-10-13 @@ -166,7 +169,12 @@ Major release - see our [blog](https://moj-analytical-services.github.io/splink/ - Corrected path for Spark `.jar` file containing UDFs to work correctly for Spark < 3.0 ([#1622](https://github.com/moj-analytical-services/splink/pull/1622)) - Spark UDF `damerau_levensthein` is now only registered for Spark >= 3.0, as it is not compatible with earlier versions ([#1622](https://github.com/moj-analytical-services/splink/pull/1622)) -[unreleased]: https://github.com/moj-analytical-services/splink/compare/4.0.0...HEAD +[Unreleased]: https://github.com/moj-analytical-services/splink/compare/4.0.5...HEAD +[4.0.5]: https://github.com/moj-analytical-services/splink/compare/4.0.4...4.0.5 +[4.0.4]: https://github.com/moj-analytical-services/splink/compare/4.0.3...4.0.4 +[4.0.3]: https://github.com/moj-analytical-services/splink/compare/4.0.2...4.0.3 +[4.0.2]: https://github.com/moj-analytical-services/splink/compare/4.0.1...4.0.2 +[4.0.1]: https://github.com/moj-analytical-services/splink/compare/4.0.0...4.0.1 [4.0.0]: https://github.com/moj-analytical-services/splink/compare/3.9.15...4.0.0 [3.9.15]: https://github.com/moj-analytical-services/splink/compare/3.9.14...3.9.15 [3.9.14]: https://github.com/moj-analytical-services/splink/compare/3.9.13...3.9.14