Add DenseClus Implementation notebook for jumpstart (#60)

* add jumpstart notebook * updating with latest changes * add jumpstart notebook * add updated nb * clean nb * update based on feedback * cleanup * notebook linter formatting updates * incorporate feedback * updated results and analysis * notebook cleanup * Updating metrics for clustering scoring; bumping versions * added CH score to evaluator, notebook cleanup * notebook linter updates * combining nbs; version bumps --------- Co-authored-by: Charles Frenzel <frenzcha@amazon.com> Co-authored-by: Bharat Venkat <bharven@amazon.com> Co-authored-by: Charles Frenzel <69225447+momonga-ml@users.noreply.github.com>
awslabs · Feb 29, 2024 · a9ed99d · a9ed99d
1 parent 4f9d48f
commit a9ed99d
Show file tree

Hide file tree

Showing 6 changed files with 1,243 additions and 20 deletions.
diff --git a/denseclus/DenseClus.py b/denseclus/DenseClus.py
@@ -18,7 +18,6 @@
     clusters = dense_clus.score()
 """
 
-
 import logging
 import warnings
 from importlib.util import find_spec
@@ -29,6 +28,7 @@
 import pandas as pd
 import umap.umap_ as umap
 from sklearn.base import BaseEstimator, ClassifierMixin
+from sklearn.metrics import calinski_harabasz_score
 
 from .categorical import extract_categorical
 from .numerical import extract_numerical
@@ -593,15 +593,15 @@ def predict(self, df_new: pd.DataFrame) -> np.ndarray:
         )
         return predictions
 
-    def evaluate(self) -> np.array:
+    def evaluate(self, log_dbcv=False) -> np.array:
         """Evaluates the cluster and returns the cluster assigned to each row.
 
          This is a wrapper function for HDBSCAN. It outputs the cluster labels
          that HDBSCAN converged on.
 
          Parameters
          ----------
-         None : None
+         log_dbcv (bool) : Whether to log DBCV scores. Defaults to False
 
         Returns
         -------
@@ -612,15 +612,33 @@ def evaluate(self) -> np.array:
         clustered = labels >= 0
 
         if isinstance(self.hdbscan_, dict) or self.umap_combine_method == "ensemble":
-            print(f"DBCV score {self.hdbscan_['hdb_numerical'].relative_validity_}")
-            print(f"DBCV score {self.hdbscan_['hdb_categorical'].relative_validity_}")
-            embedding_len = self.numerical_umap_.embedding_.shape[0]
+            if log_dbcv:
+                print(f"DBCV numerical score {self.hdbscan_['hdb_numerical'].relative_validity_}")
+                print(
+                    f"DBCV categorical score {self.hdbscan_['hdb_categorical'].relative_validity_}"
+                )
+
+            embeddings = self.numerical_umap_.embedding_
+            embedding_len = embeddings.shape[0]
+
             coverage = np.sum(clustered) / embedding_len
             print(f"Coverage {coverage}")
+
+            ch_score = calinski_harabasz_score(embeddings, labels)
+            print(f"Calinski-Harabasz Score: {ch_score}")
+
             return labels
 
-        print(f"DBCV score {self.hdbscan_.relative_validity_}")
-        embedding_len = self.mapper_.embedding_.shape[0]
+        if log_dbcv:
+            print(f"DBCV score {self.hdbscan_.relative_validity_}")
+
+        embeddings = self.mapper_.embedding_
+        embedding_len = embeddings.shape[0]
+
         coverage = np.sum(clustered) / embedding_len
         print(f"Coverage {coverage}")
+
+        ch_score = calinski_harabasz_score(embeddings, labels)
+        print(f"Calinski-Harabasz Score: {ch_score}")
+
         return labels
diff --git a/notebooks/02_TuningWithHDBSCAN.ipynb b/notebooks/02_TuningWithHDBSCAN.ipynb
@@ -365,6 +365,8 @@
    ],
    "source": [
     "# we will make our own scorer for DBCV\n",
+    "\n",
+    "\n",
     "def dbcv_score(X, labels):\n",
     "    return validity_index(X, labels)\n",
     "\n",