diff --git a/docs/conf.py b/docs/conf.py index d7b3763..89d402e 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -21,7 +21,7 @@ author = 'Panagiotis Anagnostou' # The full version, including alpha/beta/rc tags -release = '1.0.1' +release = '1.0.2' # -- General configuration --------------------------------------------------- diff --git a/setup.py b/setup.py index ee0ba9b..914f38e 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ with open("README.md", "r", encoding="utf-8") as fh: long_description = fh.read() -__version__ = "1.0.1" +__version__ = "1.0.2" setuptools.setup( name="HiPart", diff --git a/src/HiPart/__init__.py b/src/HiPart/__init__.py index c9d2804..880aebf 100644 --- a/src/HiPart/__init__.py +++ b/src/HiPart/__init__.py @@ -37,7 +37,7 @@ from KDEpy.TreeKDE import TreeKDE from KDEpy.FFTKDE import FFTKDE -__version__ = "1.0.1" +__version__ = "1.0.2" __author__ = "Panagiotis Anagnostou" TreeKDE = TreeKDE diff --git a/src/HiPart/__utility_functions.py b/src/HiPart/__utility_functions.py index 8e6955f..4833aee 100644 --- a/src/HiPart/__utility_functions.py +++ b/src/HiPart/__utility_functions.py @@ -714,12 +714,12 @@ def grid_position(current, rows, splits, with_marginal=True): col_from = curCol * num_of_subgrid_elements col_to = (curCol * num_of_subgrid_elements) + num_of_subgrid_elements - else: # splits % rows != 0: - position_corection = (rows - 1) / (splits % rows) * sub_grid_size + else: # splits % rows != 0: + position_correction = (rows - 1) / (splits % rows) * sub_grid_size col_from = int( - (curCol * num_of_subgrid_elements) + position_corection) + (curCol * num_of_subgrid_elements) + position_correction) col_to = int( - (curCol * num_of_subgrid_elements) + position_corection) + num_of_subgrid_elements + (curCol * num_of_subgrid_elements) + position_correction) + num_of_subgrid_elements return row_from, row_to, col_from, col_to @@ -751,7 +751,7 @@ def _get_node_depth(path_to_leaves, i): return depth -def create_linkage(tree_in): +def 
create_linkage(tree_in, color_keys=False, debug=False): """ Create the linkage matrix for the encoding of the divisive clustering tree created by the member algorithms of the HiPart package. @@ -781,6 +781,12 @@ def create_linkage(tree_in): # The indicator for the next free node of the linkage tree we are creating dendrogram_counts = samples_number + if color_keys: + # Initialize the dictionary of the color keys for the clusters + dict_keys = { + tree.get_node(i[-1]).data["color_key"]: tree.get_node(i[-1]).data[ + "indices"] for i in path_to_leaves} + # Initialize the linkage matrix Z = np.array([[0, 0, 0, 0]]) # Loop through the nodes of the algorithm`s execution tree and do the @@ -797,6 +803,7 @@ def create_linkage(tree_in): tree.get_node(i).data["unlinked_nodes"] = tree.get_node(i).data[ "indices" ] + start = dendrogram_counts # Create the dendrogram`s subtree and update the algorithm tree # node`s data and the index for the next free node ( @@ -806,10 +813,18 @@ def create_linkage(tree_in): ) = linkage_data_maestro( tree.get_node(i), dendrogram_counts, - 0.2 + 0.5 * (max_distance - _get_node_depth(path_to_leaves, i)) / max_distance, + 0.2, ) dendrogram_counts += 1 Z = np.vstack((Z, cluster_linkage)) + + if color_keys: + # Include under the key of the cluster, besides all the + # samples, also their connections until we reach the root of + # the cluster's subtree + key = tree.get_node(i).data["color_key"] + dict_keys[key] = np.hstack( + (dict_keys[key], np.arange(start, dendrogram_counts))) else: if not tree.get_node(i).data["dendrogram_check"]: # Connect the children of the algorithm tree internal node to @@ -830,8 +845,7 @@ def create_linkage(tree_in): tree.get_node(i).data[ "dendromgram_indicator"] = dendrogram_counts tree.get_node(i).data["counts"] = ( - children[-1].data["counts"] + children[-2].data[ - "counts"] + children[-1].data["counts"] + children[-2].data["counts"] ) tree.get_node(i).data["unlinked_nodes"] = [dendrogram_counts] dendrogram_counts += 1 
@@ -840,7 +854,23 @@ def create_linkage(tree_in): # initialization`s row of zeros Z = Z[1:, :] - return Z + if color_keys: + if debug: + list_of_connections = [] + for i in dict_keys: + list_of_connections = np.hstack( + (list_of_connections, dict_keys[i])) + multiples = np.sum(np.unique(list_of_connections, return_counts=True)[1] > 1) + print("Number of multiple appearances of one connection: {}".format(multiples)) + + sequence = np.arange(0, list_of_connections.shape[0]) + seq_check = np.sum([i in sequence for i in list_of_connections]) == list_of_connections.shape[0] + print("The list of connections is a sequence from 0 to the max " + "number of connections: {}".format(seq_check)) + return Z, dict_keys, multiples, seq_check + return Z, dict_keys + else: + return Z def linkage_data_maestro(node, dendrogram_counts, distance): @@ -1021,3 +1051,34 @@ def rgba_to_hex(rgba): # Format it into a hex string return '#{:02x}{:02x}{:02x}{:02x}'.format(r, g, b, int(a * 255)) + + +def search_dict(d, v, c, default="C0"): + """ + Search for the key of a value in a dictionary and return the color of + the key. If the value is not found in the dictionary the default color is + returned. + + Parameters + ---------- + d : dictionary + The dictionary to search. + v : int + The value to search for. + c : function + The function to use for the color creation. + default : str, optional + The default color to return if the value is not found. The default is + "C0". + + Returns + ------- + color : str + The color of the key under which the value was found, in hexadecimal form. + + """ + + for k in d: + if v in d[k]: + return rgba_to_hex(c(k)) + return default diff --git a/src/HiPart/interactive_visualization.py b/src/HiPart/interactive_visualization.py index 21e522f..376c5b8 100644 --- a/src/HiPart/interactive_visualization.py +++ b/src/HiPart/interactive_visualization.py @@ -22,6 +22,7 @@ package that utilise one decomposition method to one dimension to split the data. 
+@author: Panagiotis Anagnostou """ from dash import dcc diff --git a/src/HiPart/visualizations.py b/src/HiPart/visualizations.py index 430cd61..2ae5695 100644 --- a/src/HiPart/visualizations.py +++ b/src/HiPart/visualizations.py @@ -24,10 +24,8 @@ @author: Panagiotis Anagnostou @author: Nicos Pavlidis """ -from typing import List, Tuple import matplotlib - import HiPart.__utility_functions as util import math import matplotlib.gridspec as gridspec @@ -237,7 +235,7 @@ def split_visualization(hipart_object, color_map="viridis", mdh_split_plot=True) if i == 0: hist.title.set_text( "Original data with 1st split" - ) + ) else: hist.title.set_text("Split no. " + str(i + 1)) @@ -494,7 +492,7 @@ def mdh_visualization(mdh_obj, color_map="viridis"): return plt -def dendrogram_visualization(hipart_object, cmap="viridis", **dendrogram_parameters): +def dendrogram_visualization(hipart_object, cmap="viridis", default_coloring=True, **dendrogram_parameters): """ Create a dendrogram visualization of the divisive clustering based on the HiPart`s algorithm execution. The characteristic of this dendrogram is that @@ -516,7 +514,13 @@ def dendrogram_visualization(hipart_object, cmap="viridis", **dendrogram_paramet cmap : string The name of the matplotlib color map to be used for the data visualization. - **dendrogram_parameters : dict + default_coloring : bool, optional + If True, the dendrogram will be colored according to the default HiPart + tree coloring, based on the clustering implemented by the package. If + False, the dendrogram will be colored according to the default + methodology used by the `scipy.cluster.hierarchy.dendrogram` function. + Note that either way the "color_threshold" parameter can be changed. + **dendrogram_parameters : optional All the parameters the scipy.cluster.hierarchy.dendrogram function can take except the color_threshold parameter. Except for the "color_threshold" parameter. 
This parameter takes a default threshold @@ -556,16 +560,32 @@ def dendrogram_visualization(hipart_object, cmap="viridis", **dendrogram_paramet clusters but only their hierarchy.""" ) - if "count_sort" not in dendrogram_parameters: - dendrogram_parameters["count_sort"] = True + if "above_threshold_color" not in dendrogram_parameters: + default = "C0" + else: + default = dendrogram_parameters["above_threshold_color"] - Z = util.create_linkage(hipart_object.tree) + # Initialize the color map based on the number of clusters color_map = matplotlib.cm.get_cmap(cmap, hipart_object.max_clusters_number) - colors = np.array([util.rgba_to_hex(color_map(i)) for i in range(hipart_object.max_clusters_number)]) - keys = np.array([i.data["color_key"] for i in hipart_object.tree.leaves()]) - hierarchy.set_link_color_palette(list(colors[keys])) - dn = hierarchy.dendrogram(Z, color_threshold=1, **dendrogram_parameters) + # Create the linkage and the color keys for the dendrogram + Z, link_keys = util.create_linkage(hipart_object.tree, color_keys=True) + + # Create the dendrogram + if default_coloring: + # Create the default coloring link color function for the dendrogram + dn = hierarchy.dendrogram( + Z, + link_color_func=lambda x: util.search_dict(link_keys, x, color_map, default), + **dendrogram_parameters + ) + else: + # Create the default colors for the dendrogram function + colors = np.array([util.rgba_to_hex(color_map(i)) for i in range(hipart_object.max_clusters_number)]) + # keys = np.array([i.data["color_key"] for i in hipart_object.tree.leaves()]) + # list(colors[keys])) + hierarchy.set_link_color_palette(list(colors)) + dn = hierarchy.dendrogram(Z, color_threshold=0.3, **dendrogram_parameters) return dn diff --git a/tests/test_package.py b/tests/test_package.py index c859713..8e7544a 100644 --- a/tests/test_package.py +++ b/tests/test_package.py @@ -900,19 +900,43 @@ def test_split_visualization_valueerror_2(datadir): def test_dendrogram_visualization(datadir): with 
open(datadir.join('test_data.dump'), "rb") as inf: data_import = pickle.load(inf) + success_score = 0 clustering = DePDDP(max_clusters_number=3).fit(data_import["data"]) + new_plot = viz.dendrogram_visualization(clustering) + if isinstance(new_plot, dict): + success_score += 1 - assert isinstance(new_plot, dict) + new_plot = viz.dendrogram_visualization(clustering, above_threshold_color="C1") + if isinstance(new_plot, dict): + success_score += 1 + new_plot = viz.dendrogram_visualization(clustering, default_coloring=False) + if isinstance(new_plot, dict): + success_score += 1 + + assert success_score == 3 def test_dendrogram_visualization_typeerror(datadir): + with open(datadir.join('test_data.dump'), "rb") as inf: + data_import = pickle.load(inf) + + clustering = IPDDP(max_clusters_number=3).fit(data_import["data"]) + + success_score = 0 + try: viz.dendrogram_visualization(np.array([1, 2, 3])) - assert False except Exception: - assert True + success_score += 1 + + try: + viz.dendrogram_visualization(clustering, color_threshold=0.5) + except Exception: + success_score += 1 + + assert success_score == 2 def test_linkage(datadir): @@ -925,6 +949,22 @@ def test_linkage(datadir): assert isinstance(links, np.ndarray) +def test_create_linkage(datadir): + with open(datadir.join('test_data.dump'), "rb") as inf: + data_import = pickle.load(inf) + success_score = 0 + + clustering = DePDDP(max_clusters_number=3).fit(data_import["data"]) + _, _, multiples, seq_check = uf.create_linkage(tree_in=clustering.tree, color_keys=True, debug=True) + if multiples == 0: + success_score += 1 + + if seq_check: + success_score += 1 + + assert success_score == 2 + + def test_linkage_typeerror(datadir): try: viz.linkage(np.array([1, 2, 3]))