Skip to content

Commit

Permalink
Merge pull request #42 from panagiotisanagnostou/new_features
Browse files Browse the repository at this point in the history
New features

- Updates on the dendrogram visualization

Matching of the cluster colors across all the static visualizations. This helps the interpretation of the results by the package users.

- Version update

Transition to version v1.0.2

- pytest update and minor fixes

Test creation for the new features added
  • Loading branch information
panagiotisanagnostou authored Feb 1, 2024
2 parents 5345a2a + 1b37869 commit 0e4eec2
Show file tree
Hide file tree
Showing 7 changed files with 149 additions and 27 deletions.
2 changes: 1 addition & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
author = 'Panagiotis Anagnostou'

# The full version, including alpha/beta/rc tags
release = '1.0.1'
release = '1.0.2'

# -- General configuration ---------------------------------------------------

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
with open("README.md", "r", encoding="utf-8") as fh:
long_description = fh.read()

__version__ = "1.0.1"
__version__ = "1.0.2"

setuptools.setup(
name="HiPart",
Expand Down
2 changes: 1 addition & 1 deletion src/HiPart/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
from KDEpy.TreeKDE import TreeKDE
from KDEpy.FFTKDE import FFTKDE

__version__ = "1.0.1"
__version__ = "1.0.2"
__author__ = "Panagiotis Anagnostou"

TreeKDE = TreeKDE
Expand Down
79 changes: 70 additions & 9 deletions src/HiPart/__utility_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -714,12 +714,12 @@ def grid_position(current, rows, splits, with_marginal=True):
col_from = curCol * num_of_subgrid_elements
col_to = (curCol * num_of_subgrid_elements) + num_of_subgrid_elements

else: # splits % rows != 0:
position_corection = (rows - 1) / (splits % rows) * sub_grid_size
else: # splits % rows != 0:
position_correction = (rows - 1) / (splits % rows) * sub_grid_size
col_from = int(
(curCol * num_of_subgrid_elements) + position_corection)
(curCol * num_of_subgrid_elements) + position_correction)
col_to = int(
(curCol * num_of_subgrid_elements) + position_corection) + num_of_subgrid_elements
(curCol * num_of_subgrid_elements) + position_correction) + num_of_subgrid_elements

return row_from, row_to, col_from, col_to

Expand Down Expand Up @@ -751,7 +751,7 @@ def _get_node_depth(path_to_leaves, i):
return depth


def create_linkage(tree_in):
def create_linkage(tree_in, color_keys=False, debug=False):
"""
Create the linkage matrix for the encoding of the divisive clustering tree
created by the member algorithms of the HiPart package.
Expand Down Expand Up @@ -781,6 +781,12 @@ def create_linkage(tree_in):
# The indicator for the next free node of the linkage tree we are creating
dendrogram_counts = samples_number

if color_keys:
# Initialize the dictionary of the color keys for the clusters
dict_keys = {
tree.get_node(i[-1]).data["color_key"]: tree.get_node(i[-1]).data[
"indices"] for i in path_to_leaves}

# Initialize the linkage matrix
Z = np.array([[0, 0, 0, 0]])
# Loop through the nodes of the algorithm`s execution tree and do the
Expand All @@ -797,6 +803,7 @@ def create_linkage(tree_in):
tree.get_node(i).data["unlinked_nodes"] = tree.get_node(i).data[
"indices"
]
start = dendrogram_counts
# Create the dendrogram`s subtree and update the algorithm tree
# node`s data and the index for the next free node
(
Expand All @@ -806,10 +813,18 @@ def create_linkage(tree_in):
) = linkage_data_maestro(
tree.get_node(i),
dendrogram_counts,
0.2 + 0.5 * (max_distance - _get_node_depth(path_to_leaves, i)) / max_distance,
0.2,
)
dendrogram_counts += 1
Z = np.vstack((Z, cluster_linkage))

if color_keys:
# Under the key of the cluster store, in addition to all of its
# samples, their connections up to the root of the cluster's
# subtree
key = tree.get_node(i).data["color_key"]
dict_keys[key] = np.hstack(
(dict_keys[key], np.arange(start, dendrogram_counts)))
else:
if not tree.get_node(i).data["dendrogram_check"]:
# Connect the children of the algorithm tree internal node to
Expand All @@ -830,8 +845,7 @@ def create_linkage(tree_in):
tree.get_node(i).data[
"dendromgram_indicator"] = dendrogram_counts
tree.get_node(i).data["counts"] = (
children[-1].data["counts"] + children[-2].data[
"counts"]
children[-1].data["counts"] + children[-2].data["counts"]
)
tree.get_node(i).data["unlinked_nodes"] = [dendrogram_counts]
dendrogram_counts += 1
Expand All @@ -840,7 +854,23 @@ def create_linkage(tree_in):
# initialization`s row of zeros
Z = Z[1:, :]

return Z
if color_keys:
if debug:
list_of_connections = []
for i in dict_keys:
list_of_connections = np.hstack(
(list_of_connections, dict_keys[i]))
multiples = np.sum(np.unique(list_of_connections, return_counts=True)[1] > 1)
print("Number of multiple appearances of one connection: {}".format(multiples))

sequence = np.arange(0, list_of_connections.shape[0])
seq_check = np.sum([i in sequence for i in list_of_connections]) == list_of_connections.shape[0]
print("The list of connections is a sequence from 0 to the max "
"number of connections: {}".format(seq_check))
return Z, dict_keys, multiples, seq_check
return Z, dict_keys
else:
return Z


def linkage_data_maestro(node, dendrogram_counts, distance):
Expand Down Expand Up @@ -1021,3 +1051,34 @@ def rgba_to_hex(rgba):

# Format it into a hex string
return '#{:02x}{:02x}{:02x}{:02x}'.format(r, g, b, int(a * 255))


def search_dict(d, v, c, default="C0"):
    """
    Find the key whose stored collection contains a value and return the
    color of that key. If no entry of the dictionary contains the value, the
    default color is returned unchanged.

    Parameters
    ----------
    d : dictionary
        The dictionary to search. Each value must support the ``in``
        membership test.
    v : int
        The value to search for.
    c : function
        The function used for the color creation. It is called with the
        matching key and its result is converted with ``rgba_to_hex``.
    default : str, optional
        The color to return if the value is not found. The default is "C0".

    Returns
    -------
    color : str
        The color of the first key (in the dictionary's iteration order)
        whose entry contains the value, in hexadecimal form, or ``default``
        when there is no such key.

    """
    for key, members in d.items():
        # Stop at the first entry that holds the value; later entries are
        # never inspected.
        if v in members:
            return rgba_to_hex(c(key))

    return default
1 change: 1 addition & 0 deletions src/HiPart/interactive_visualization.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
package that utilise one decomposition method to one dimension to split the
data.
@author: Panagiotis Anagnostou
"""

from dash import dcc
Expand Down
44 changes: 32 additions & 12 deletions src/HiPart/visualizations.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,8 @@
@author: Panagiotis Anagnostou
@author: Nicos Pavlidis
"""
from typing import List, Tuple

import matplotlib

import HiPart.__utility_functions as util
import math
import matplotlib.gridspec as gridspec
Expand Down Expand Up @@ -237,7 +235,7 @@ def split_visualization(hipart_object, color_map="viridis", mdh_split_plot=True)
if i == 0:
hist.title.set_text(
"Original data with 1st split"
)
)
else:
hist.title.set_text("Split no. " + str(i + 1))

Expand Down Expand Up @@ -494,7 +492,7 @@ def mdh_visualization(mdh_obj, color_map="viridis"):
return plt


def dendrogram_visualization(hipart_object, cmap="viridis", **dendrogram_parameters):
def dendrogram_visualization(hipart_object, cmap="viridis", default_coloring=True, **dendrogram_parameters):
"""
Create a dendrogram visualization of the divisive clustering based on the
HiPart`s algorithm execution. The characteristic of this dendrogram is that
Expand All @@ -516,7 +514,13 @@ def dendrogram_visualization(hipart_object, cmap="viridis", **dendrogram_paramet
cmap : string
The name of the matplotlib color map to be used for the data
visualization.
**dendrogram_parameters : dict
default_coloring : bool, optional
If True, the dendrogram will be colored according to the default HiPart
tree coloring, based on the clustering implemented by the package. If
False, the dendrogram will be colored according to the default
methodology used by the `scipy.cluster.hierarchy.dendrogram` function.
Note that either way the "color_threshold" parameter can be changed.
**dendrogram_parameters : optional
All the parameters the scipy.cluster.hierarchy.dendrogram function can
take, except for the "color_threshold" parameter. This parameter takes
a default threshold
Expand Down Expand Up @@ -556,16 +560,32 @@ def dendrogram_visualization(hipart_object, cmap="viridis", **dendrogram_paramet
clusters but only their hierarchy."""
)

if "count_sort" not in dendrogram_parameters:
dendrogram_parameters["count_sort"] = True
if "above_threshold_color" not in dendrogram_parameters:
default = "C0"
else:
default = dendrogram_parameters["above_threshold_color"]

Z = util.create_linkage(hipart_object.tree)
# Initialize the color map based on the number of clusters
color_map = matplotlib.cm.get_cmap(cmap, hipart_object.max_clusters_number)
colors = np.array([util.rgba_to_hex(color_map(i)) for i in range(hipart_object.max_clusters_number)])

keys = np.array([i.data["color_key"] for i in hipart_object.tree.leaves()])
hierarchy.set_link_color_palette(list(colors[keys]))
dn = hierarchy.dendrogram(Z, color_threshold=1, **dendrogram_parameters)
# Create the linkage and the color keys for the dendrogram
Z, link_keys = util.create_linkage(hipart_object.tree, color_keys=True)

# Create the dendrogram
if default_coloring:
# Create the default coloring link color function for the dendrogram
dn = hierarchy.dendrogram(
Z,
link_color_func=lambda x: util.search_dict(link_keys, x, color_map, default),
**dendrogram_parameters
)
else:
# Create the default colors for the dendrogram function
colors = np.array([util.rgba_to_hex(color_map(i)) for i in range(hipart_object.max_clusters_number)])
# keys = np.array([i.data["color_key"] for i in hipart_object.tree.leaves()])
# list(colors[keys]))
hierarchy.set_link_color_palette(list(colors))
dn = hierarchy.dendrogram(Z, color_threshold=0.3, **dendrogram_parameters)

return dn

Expand Down
46 changes: 43 additions & 3 deletions tests/test_package.py
Original file line number Diff line number Diff line change
Expand Up @@ -900,19 +900,43 @@ def test_split_visualization_valueerror_2(datadir):
def test_dendrogram_visualization(datadir):
with open(datadir.join('test_data.dump'), "rb") as inf:
data_import = pickle.load(inf)
success_score = 0

clustering = DePDDP(max_clusters_number=3).fit(data_import["data"])

new_plot = viz.dendrogram_visualization(clustering)
if isinstance(new_plot, dict):
success_score += 1

assert isinstance(new_plot, dict)
new_plot = viz.dendrogram_visualization(clustering, above_threshold_color="C1")
if isinstance(new_plot, dict):
success_score += 1

new_plot = viz.dendrogram_visualization(clustering, default_coloring=False)
if isinstance(new_plot, dict):
success_score += 1

assert success_score == 3

def test_dendrogram_visualization_typeerror(datadir):
with open(datadir.join('test_data.dump'), "rb") as inf:
data_import = pickle.load(inf)

clustering = IPDDP(max_clusters_number=3).fit(data_import["data"])

success_score = 0

try:
viz.dendrogram_visualization(np.array([1, 2, 3]))
assert False
except Exception:
assert True
success_score += 1

try:
viz.dendrogram_visualization(clustering, color_threshold=0.5)
except Exception:
success_score += 1

assert success_score == 2


def test_linkage(datadir):
Expand All @@ -925,6 +949,22 @@ def test_linkage(datadir):
assert isinstance(links, np.ndarray)


def test_create_linkage(datadir):
    """
    Check the debug output of ``create_linkage`` when color keys are
    requested: no connection may appear more than once and the connections
    must pass the sequence check reported by the function.
    """
    with open(datadir.join('test_data.dump'), "rb") as inf:
        data_import = pickle.load(inf)

    clustering = DePDDP(max_clusters_number=3).fit(data_import["data"])
    linkage_output = uf.create_linkage(
        tree_in=clustering.tree,
        color_keys=True,
        debug=True,
    )
    # Only the two debug indicators are inspected; the linkage matrix and
    # the color-key dictionary are ignored here.
    _, _, multiples, seq_check = linkage_output

    passed_checks = int(bool(multiples == 0)) + int(bool(seq_check))

    assert passed_checks == 2


def test_linkage_typeerror(datadir):
try:
viz.linkage(np.array([1, 2, 3]))
Expand Down

0 comments on commit 0e4eec2

Please sign in to comment.