Skip to content

Commit

Permalink
Add DenseClus Implementation notebook for jumpstart (#60)
Browse files Browse the repository at this point in the history
* add jumpstart notebook

* updating with latest changes

* add jumpstart notebook

* add updated nb

* clean nb

* update based on feedback

* cleanup

* notebook linter formatting updates

* incorporate feedback

* updated results and analysis

* notebook cleanup

* Updating metrics for clustering scoring; bumping versions

* added CH score to evaluator, notebook cleanup

* notebook linter updates

* combining nbs; version bumps

---------

Co-authored-by: Charles Frenzel <frenzcha@amazon.com>
Co-authored-by: Bharat Venkat <bharven@amazon.com>
Co-authored-by: Charles Frenzel <69225447+momonga-ml@users.noreply.github.com>
  • Loading branch information
4 people authored Feb 29, 2024
1 parent 4f9d48f commit a9ed99d
Show file tree
Hide file tree
Showing 6 changed files with 1,243 additions and 20 deletions.
34 changes: 26 additions & 8 deletions denseclus/DenseClus.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
clusters = dense_clus.score()
"""


import logging
import warnings
from importlib.util import find_spec
Expand All @@ -29,6 +28,7 @@
import pandas as pd
import umap.umap_ as umap
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import calinski_harabasz_score

from .categorical import extract_categorical
from .numerical import extract_numerical
Expand Down Expand Up @@ -593,15 +593,15 @@ def predict(self, df_new: pd.DataFrame) -> np.ndarray:
)
return predictions

def evaluate(self) -> np.array:
def evaluate(self, log_dbcv=False) -> np.array:
"""Evaluates the cluster and returns the cluster assigned to each row.
This is a wrapper function for HDBSCAN. It outputs the cluster labels
that HDBSCAN converged on.
Parameters
----------
None : None
log_dbcv (bool) : Whether to log DBCV scores. Defaults to False
Returns
-------
Expand All @@ -612,15 +612,33 @@ def evaluate(self) -> np.array:
clustered = labels >= 0

if isinstance(self.hdbscan_, dict) or self.umap_combine_method == "ensemble":
print(f"DBCV score {self.hdbscan_['hdb_numerical'].relative_validity_}")
print(f"DBCV score {self.hdbscan_['hdb_categorical'].relative_validity_}")
embedding_len = self.numerical_umap_.embedding_.shape[0]
if log_dbcv:
print(f"DBCV numerical score {self.hdbscan_['hdb_numerical'].relative_validity_}")
print(
f"DBCV categorical score {self.hdbscan_['hdb_categorical'].relative_validity_}"
)

embeddings = self.numerical_umap_.embedding_
embedding_len = embeddings.shape[0]

coverage = np.sum(clustered) / embedding_len
print(f"Coverage {coverage}")

ch_score = calinski_harabasz_score(embeddings, labels)
print(f"Calinski-Harabasz Score: {ch_score}")

return labels

print(f"DBCV score {self.hdbscan_.relative_validity_}")
embedding_len = self.mapper_.embedding_.shape[0]
if log_dbcv:
print(f"DBCV score {self.hdbscan_.relative_validity_}")

embeddings = self.mapper_.embedding_
embedding_len = embeddings.shape[0]

coverage = np.sum(clustered) / embedding_len
print(f"Coverage {coverage}")

ch_score = calinski_harabasz_score(embeddings, labels)
print(f"Calinski-Harabasz Score: {ch_score}")

return labels
2 changes: 2 additions & 0 deletions notebooks/02_TuningWithHDBSCAN.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -365,6 +365,8 @@
],
"source": [
"# we will make our own scorer for DBCV\n",
"\n",
"\n",
"def dbcv_score(X, labels):\n",
" return validity_index(X, labels)\n",
"\n",
Expand Down
Loading

0 comments on commit a9ed99d

Please sign in to comment.