diff --git a/.DS_Store b/.DS_Store index 831d759..e62f57d 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/chronoclust/hddstream.py b/chronoclust/hddstream.py index 9436253..8536989 100644 --- a/chronoclust/hddstream.py +++ b/chronoclust/hddstream.py @@ -93,14 +93,17 @@ def _set_dataset_dependent_parameters(self, input_dataset): None """ - self.dataset_dimensionality = input_dataset.shape[1] + dataset_dim = input_dataset.shape[1] + config = self.config + + self.dataset_dimensionality = dataset_dim # Set projected_dimensionality_threshold. This will be set only once. - config_pi = int(float(self.config.find("pi").text)) + config_pi = float(config.find("pi").text) if config_pi <= 0: # if config for projected_dimensionality_threshold is set to absurd size i.e. 0 or negative, # the dimensionality of the dataset is going to be used instead. - self.pi = self.dataset_dimensionality + self.pi = dataset_dim else: # Doesn't make sense for this to not be whole number self.pi = round(config_pi) @@ -108,7 +111,7 @@ def _set_dataset_dependent_parameters(self, input_dataset): # Setting outlier deletion point. It's given as proportion of number of data_autoencoder points. # So need to set whole number. # It'll be based on proportion given * number of data_autoencoder points in previous day. - self.omicron = float(self.config.find("omicron").text) * self.dataset_size + self.omicron = float(config.find("omicron").text) * self.dataset_size # make sure this is done only after we set outlier deletion point! This is because we want the deletion point # to be based on "previous day dataset size"! @@ -173,7 +176,9 @@ def online_microcluster_maintenance(self, input_dataset, input_dataset_daystamp, if reset_param: self._set_dataset_dependent_parameters(input_dataset) - self.logger.info(f"Setting up online phase for timepoint {input_dataset_daystamp} with following params:\n" + logger = self.logger + + logger.info(f"Setting up online phase for timepoint {input_dataset_daystamp} with following params:\n" f"Pcore density threshold factor(beta) = {self.beta}\n" f"Decay rate(lambda) = {self.lambbda}\n" f"Radius threshold(epsilon) = {self.epsilon}\n" @@ -187,7 +192,7 @@ def online_microcluster_maintenance(self, input_dataset, input_dataset_daystamp, # Check whether we need to decay the cluster. We only decay it if this dataset is not for the same day as the # previous dataset processed by the online microcluster maintenance. if (self.last_data_timestamp - input_dataset_daystamp) != 0: - self.logger.info("Decaying and downgrading microclusters") + logger.info("Decaying and downgrading microclusters") # The time difference is converted to days because we only decay as each day has passed between datasets. interval = input_dataset_daystamp - self.last_data_timestamp self._decay_clusters_weight(interval) @@ -204,9 +209,9 @@ def online_microcluster_maintenance(self, input_dataset, input_dataset_daystamp, num_datapoints = input_dataset.shape[0] - self.logger.info("Starting online microcluster maintenance for timepoint {}".format(input_dataset_daystamp)) + logger.info("Starting online microcluster maintenance for timepoint {}".format(input_dataset_daystamp)) # progress bar widget - progress_bar = TqdmToLogger(self.logger, level=logging.INFO) + progress_bar = TqdmToLogger(logger, level=logging.INFO) for row in tqdm(range(num_datapoints), file=progress_bar, mininterval=1): # You may find sometimes the progress line doesn't work well. In that case uncomment below. 
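
# The hunk above now parses "pi" as a float (instead of truncating with int())
# and keeps the fallback to the dataset dimensionality, while "omicron" is a
# proportion of the previous day's dataset size. A minimal standalone sketch of
# that derivation, with made-up config values standing in for the XML config
# and self.dataset_size:

def derive_pi(config_pi: float, dataset_dim: int) -> int:
    # A non-positive pi in the config falls back to the dataset dimensionality.
    if config_pi <= 0:
        return dataset_dim
    # pi must be a whole number of dimensions, hence the rounding.
    return round(config_pi)

def derive_omicron(omicron_proportion: float, previous_day_dataset_size: int) -> float:
    # Outlier deletion point as a proportion of the previous day's point count.
    return omicron_proportion * previous_day_dataset_size

print(derive_pi(0, 14))             # falls back to 14 dimensions
print(derive_pi(3.4, 14))           # rounds to 3
print(derive_omicron(0.01, 5000))   # 50.0
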
datapoint = input_dataset[row] @@ -226,8 +231,8 @@ def online_microcluster_maintenance(self, input_dataset, input_dataset_daystamp, # We create a new outlier cluster for the datapoint. self._create_new_outlier_cluster(datapoint, input_dataset_daystamp) - self.logger.info("Finish online microcluster maintenance for timepoint {}".format(input_dataset_daystamp)) - self.logger.info("Online maintenance yield {} pcores and {} outlier".format( + logger.info("Finish online microcluster maintenance for timepoint {}".format(input_dataset_daystamp)) + logger.info("Online maintenance yield {} pcores and {} outlier".format( len(self.pcore_MC), len(self.outlier_MC))) self.last_data_timestamp = input_dataset_daystamp @@ -246,15 +251,21 @@ def _decay_clusters_weight(self, interval): Returns: None. """ - for pcore in self.pcore_MC: + pcore_MCs = self.pcore_MC + outlier_MCs = self.outlier_MC + + for pcore in pcore_MCs: self.decay_a_cluster_weight(interval, pcore) - for outlier_mc in self.outlier_MC: + for outlier_mc in outlier_MCs: self.decay_a_cluster_weight(interval, outlier_mc) def decay_a_cluster_weight(self, interval, microcluster): """ Method to decay a microcluster's weight. + There is no need to update the preferred dimension as its variance would have remained the same. + Remember, decay is affecting CF1, CF2, and weight, and variance is calculated purely from them. + So if all of them change, then the variance will remain the same. Args: interval (float): Time difference between last dataset processed by online cluster maintenance @@ -282,18 +293,25 @@ def _add_to_pcore(self, datapoint, datapoint_timestamp): closest_distance = None closest_cluster_index = None + # To save time rather than keep on referring to the self attributes. + pcore_MCs = self.pcore_MC + delta_squared = self.delta_squared + k = self.k + pi = self.pi + epsilon_squared = self.epsilon_squared + # calculate distances between datapoint and all pcore MCs. - for index, pmc in enumerate(self.pcore_MC): + for index, pmc in enumerate(pcore_MCs): # In the Figure 2 paper[1] line 3-4, # we want to just temporarily add data_autoencoder point to each microcluster to # see if the datapoint can fit in it by checking the microcluster's pdim. We don't want to interfere the # original microcluster, so we clone it and pretend to add a point it. - temp_pmc = pmc.get_copy_with_new_point(datapoint, self.delta_squared, self.k) + temp_pmc = pmc.get_copy_with_new_point(datapoint, delta_squared, k) pdim_temp_pmc = (np.array(temp_pmc.preferred_dimension_vector) != 1).sum() - if pdim_temp_pmc <= self.pi: + if pdim_temp_pmc <= pi: # Rather than keeping array of potential microclusters, we just calculate distance to it, and compare # to see if there has been one that was closer before. If there hasn't # then store this as closest. Otherwise leave it. @@ -306,16 +324,14 @@ def _add_to_pcore(self, datapoint, datapoint_timestamp): # We got here when there exists a potential microcluster that can accomodate the point. We then check to # see if the potential microcluster can actually accomodate the point i.e. its radius will not blow out # beyond the radius threshold. See line 14-15 in Figure 2 paper[1]. 
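
# The docstring added to decay_a_cluster_weight above says the preferred
# dimensions need no update because variance is computed purely from CF1, CF2
# and weight, all of which decay by the same factor. A small numerical check
# of that claim with made-up CF values and a decay factor of 2**(-lambda):

import numpy as np

cf1 = np.array([7.0, 12.0])
cf2 = np.array([55.0, 150.0])
weight = 3.0

def variance(cf1, cf2, w):
    # per-dimension variance as used throughout the codebase
    return cf2 / w - (cf1 / w) ** 2

decay = 2.0 ** (-1.0)   # e.g. lambda = 1 over a one-day interval
before = variance(cf1, cf2, weight)
after = variance(cf1 * decay, cf2 * decay, weight * decay)

print(np.allclose(before, after))   # True: the decay factor cancels out
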
- tmp_closest_cluster = self.pcore_MC[closest_cluster_index].get_copy_with_new_point(datapoint, - self.delta_squared, - self.k) + tmp_closest_cluster = pcore_MCs[closest_cluster_index].get_copy_with_new_point(datapoint, delta_squared, k) projected_radius_squared = tmp_closest_cluster.calculate_projected_radius_squared() - if projected_radius_squared <= self.epsilon_squared: + if projected_radius_squared <= epsilon_squared: - self.pcore_MC[closest_cluster_index].add_new_point(datapoint, datapoint_timestamp) - self.pcore_MC[closest_cluster_index].update_preferred_dimensions(self.delta_squared, - self.k) + # TODO: check if referring to local variable affect the actual self as well. If not then revert to self. + pcore_MCs[closest_cluster_index].add_new_point(datapoint, datapoint_timestamp) + pcore_MCs[closest_cluster_index].update_preferred_dimensions(delta_squared, k) return True return False @@ -337,30 +353,34 @@ def _add_to_outlier(self, datapoint, datapoint_timestamp): closest_distance = None closest_cluster_index = None + outlier_MCs = self.outlier_MC + delta_squared = self.delta_squared + k = self.k + epsilon_squared = self.epsilon_squared + # Find closest outlier microcluster. - for index, omc in enumerate(self.outlier_MC): + for index, omc in enumerate(outlier_MCs): distance = omc.get_projected_dist_to_point(datapoint) if closest_distance is None or distance < closest_distance: closest_distance = distance closest_cluster_index = index + # TODO merge this with pcore one as it is very similar if closest_distance is not None: # We got here when there exists an outlier microcluster that can accomodate the point. We then check to # see if the outlier microcluster can actually accomodate the point i.e. its radius will not blow out # beyond the radius threshold. - tmp_outlier_mc = self.outlier_MC[closest_cluster_index].get_copy_with_new_point(datapoint, - self.delta_squared, - self.k) + tmp_outlier_mc = outlier_MCs[closest_cluster_index].get_copy_with_new_point(datapoint, delta_squared, k) projected_radius_squared = tmp_outlier_mc.calculate_projected_radius_squared() - if projected_radius_squared <= self.epsilon_squared: - self.outlier_MC[closest_cluster_index].add_new_point(datapoint, datapoint_timestamp) - self.outlier_MC[closest_cluster_index].update_preferred_dimensions(self.delta_squared, - self.k) + if projected_radius_squared <= epsilon_squared: + # TODO: check if referring to local variable affect the actual self as well. If not then revert to self. + outlier_MCs[closest_cluster_index].add_new_point(datapoint, datapoint_timestamp) + outlier_MCs[closest_cluster_index].update_preferred_dimensions(delta_squared, k) # From here on, we then check whether the outlier microcluster can be upgraded to pcore microcluster. - self._upgrade_outlier_microcluster(self.outlier_MC[closest_cluster_index]) + self._upgrade_outlier_microcluster(outlier_MCs[closest_cluster_index]) return True return False @@ -379,15 +399,23 @@ def _upgrade_outlier_microcluster(self, outlier_mc): Returns: None. 
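
# The TODO in this hunk asks whether going through a local alias such as
# pcore_MCs still updates self. For mutating operations (append, remove, or
# method calls on the elements) it does, because the local name and the
# attribute refer to the same list object; only rebinding the local name to a
# new list would diverge. A tiny stand-in class to illustrate:

class Holder:
    def __init__(self):
        self.pcore_MC = []

h = Holder()
pcore_MCs = h.pcore_MC            # an alias, not a copy
pcore_MCs.append("new microcluster")
print(h.pcore_MC)                 # ['new microcluster']: the mutation is visible on self
print(pcore_MCs is h.pcore_MC)    # True

pcore_MCs = []                    # rebinding the local name does NOT touch self
print(h.pcore_MC)                 # still ['new microcluster']
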
""" - weight_threshold_obeyed = outlier_mc.cumulative_weight >= self.beta * \ - self.mu - pdim_threshold_obeyed = np.array(outlier_mc.preferred_dimension_vector > 1).sum() <= \ - self.pi + beta = self.beta + mu = self.mu + pi = self.pi + pcore_MCs = self.pcore_MC + + weight_threshold_obeyed = outlier_mc.cumulative_weight >= beta * mu + pdim_threshold_obeyed = np.array(outlier_mc.preferred_dimension_vector > 1).sum() <= pi if weight_threshold_obeyed and pdim_threshold_obeyed: - outlier_mc.id = list(range(len(self.pcore_MC), len(self.pcore_MC) + 1)) + # TODO: remove if it doesn't break the code + # num_pcore_MCs = len(pcore_MCs) + # outlier_mc.id = list(range(num_pcore_MCs, num_pcore_MCs + 1)) + + outlier_mc.id = [len(pcore_MCs)] self.outlier_MC.remove(outlier_mc) - self.pcore_MC.append(outlier_mc) + # TODO: check if referring to local variable affect the actual self as well. If not then revert to self. + pcore_MCs.append(outlier_mc) def _create_new_outlier_cluster(self, datapoint, creation_time): """ @@ -401,12 +429,21 @@ def _create_new_outlier_cluster(self, datapoint, creation_time): Returns: None. """ - outlier_mc_id = set(range(len(self.outlier_MC), len(self.outlier_MC) + 1)) - outlier_mc = Microcluster(cf1=np.zeros(len(datapoint)), cf2=np.zeros(len(datapoint)), id=outlier_mc_id, + outlier_MCs = self.outlier_MC + # TODO: remove if new code doesn't break + # num_outlier_MCs = len(self.outlier_MC) + # outlier_mc_id = set(range(num_outlier_MCs, num_outlier_MCs + 1)) + outlier_mc_id = set() + outlier_mc_id.add(len(outlier_MCs)) + + num_datapoints = len(datapoint) + outlier_mc = Microcluster(cf1=np.zeros(num_datapoints), cf2=np.zeros(num_datapoints), id=outlier_mc_id, creation_time_in_hrs=creation_time) outlier_mc.add_new_point(datapoint, creation_time) outlier_mc.update_preferred_dimensions(self.delta_squared, self.k) - self.outlier_MC.append(outlier_mc) + + # TODO: check if referring to local variable affect the actual self as well. If not then revert to self. + outlier_MCs.append(outlier_mc) def offline_clustering(self, dataset_daystamp): """ @@ -421,14 +458,19 @@ def offline_clustering(self, dataset_daystamp): datapoints = {} num_core = 0 - for cluster in self.pcore_MC: + pcore_MCs = self.pcore_MC + epsilon_squared = self.epsilon_squared + mu = self.mu + pi = self.pi + logger = self.logger + + for cluster in pcore_MCs: # For offline clustering, the core status of each cluster is determined by the cluster itself rather than # by PreDeCon. 
cluster_id = next(iter(cluster.id)) - cluster_is_core = cluster.is_core(self.epsilon_squared, self.mu, - self.pi) + cluster_is_core = cluster.is_core(epsilon_squared, mu, pi) if cluster_is_core: num_core += 1 @@ -436,20 +478,20 @@ def offline_clustering(self, dataset_daystamp): datapoint_id=cluster_id, is_core_cluster=cluster_is_core, cluster_CF1=cluster.CF1, cluster_CF2=cluster.CF2, cluster_cumulative_weight=cluster.cumulative_weight) - num_pcore = len(self.pcore_MC) - num_core - self.logger.info(f'Starting offline clustering with {num_core} core clusters and {num_pcore} pcore clusters.') + num_pcore = len(pcore_MCs) - num_core + logger.info(f'Starting offline clustering with {num_core} core clusters and {num_pcore} pcore clusters.') predecon_offline = PreDeCon(datapoints=datapoints, dataset_dimensionality=self.dataset_dimensionality, epsilon=self.upsilon, delta=self.delta, - lambbda=self.pi, - mu=self.mu, + lambbda=pi, + mu=mu, k=self.k) predecon_offline.run() self.final_clusters = predecon_offline.clusters - self.logger.info('Finish offline clustering for dataset with timepoint: {}'.format(dataset_daystamp)) - self.logger.info("Offline clustering yield {} cores.".format(len(self.final_clusters))) + logger.info('Finish offline clustering for dataset with timepoint: {}'.format(dataset_daystamp)) + logger.info("Offline clustering yield {} clusters.".format(len(self.final_clusters))) def downgrade_microclusters(self): self._downgrade_potential_microclusters() @@ -461,26 +503,33 @@ def _downgrade_potential_microclusters(self): no longer obeyed as in Definition 6 in paper[1]. """ - for potential_cluster in self.pcore_MC: - weight_threshold_obeyed = potential_cluster.cumulative_weight < self.beta * \ - self.mu - pdim_threshold_obeyed = np.array(potential_cluster.preferred_dimension_vector > 1).sum() > \ - self.pi + pcore_MCs = self.pcore_MC + outlier_MCs = self.outlier_MC + beta = self.beta + mu = self.mu + pi = self.pi + + for potential_cluster in pcore_MCs: + weight_threshold_obeyed = potential_cluster.cumulative_weight < beta * mu + pdim_threshold_obeyed = np.array(potential_cluster.preferred_dimension_vector > 1).sum() > pi if weight_threshold_obeyed or pdim_threshold_obeyed: - potential_cluster.id = list(range(len(self.outlier_MC), len(self.outlier_MC) + 1)) - self.pcore_MC.remove(potential_cluster) - self.outlier_MC.append(potential_cluster) + # potential_cluster.id = list(range(len(self.outlier_MC), len(self.outlier_MC) + 1)) + potential_cluster.id = [len(outlier_MCs)] + pcore_MCs.remove(potential_cluster) + outlier_MCs.append(potential_cluster) def _downgrade_outlier_microclusters(self): """ Delete outlier microcluster. See section 4.4 in paper [1]. """ - for outlier_cluster in self.outlier_MC: + outlier_MCs = self.outlier_MC + omicron = self.omicron + for outlier_cluster in outlier_MCs: - if outlier_cluster.cumulative_weight <= self.omicron: - self.outlier_MC.remove(outlier_cluster) + if outlier_cluster.cumulative_weight <= omicron: + outlier_MCs.remove(outlier_cluster) del outlier_cluster diff --git a/chronoclust/helper_objects.py b/chronoclust/helper_objects.py index 48d0d95..e5dfaa5 100644 --- a/chronoclust/helper_objects.py +++ b/chronoclust/helper_objects.py @@ -125,6 +125,11 @@ def __init__(self, cf1=None, cf2=None, id=set(), cumulative_weight=0, :param cf1: (Cluster Feature 1) weighted linear sum of all points in this Microcluster for each dimension. 
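
# Both downgrade loops above remove elements from the list they are iterating
# over (a pattern that predates this change). In CPython that silently skips
# the element that slides into the freed slot; iterating over a shallow copy
# visits every element. A small demo with made-up weights, where 0.5 stands in
# for the deletion threshold omicron:

weights = [0.1, 0.2, 0.9, 0.3]

survivors = list(weights)
for w in survivors:                 # removing while iterating over the same list
    if w <= 0.5:
        survivors.remove(w)
print(survivors)                    # [0.2, 0.9]: 0.2 slipped through

survivors = list(weights)
for w in list(survivors):           # iterate over a copy instead
    if w <= 0.5:
        survivors.remove(w)
print(survivors)                    # [0.9]
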
+ Don't get confused with the paper definition of f(t) * pij where f(t) is weight of the datapoint at + time t when adding new data point. The way we do it is fine because rather than compunding the decay + rate based on the arrival time, we just apply it again to the new weight at every time interval. + So for instance, CF1 at t0 is 7, at t1 it's decayed to 3.5 by multiplying 7 with 2^-1 assuming lambda is 1. + Then at t2, rather than multiplying 7 with 2^-2, we multiply 3.5 by 2^-1 again, yielding same value 1.75. :param cf2: (Cluster Feature 2) weighted linear sum of square of all points in this Microcluster for each dimension. :param id: id of the Microcluster. Array containing single int for potential and outlier, @@ -161,18 +166,28 @@ def update_preferred_dimensions(self, variance_threshold_squared, k_constant): Returns: None. """ - num_dimensions = self.CF1.shape[0] + # we use local variable to speed up the method. Referring to self all the time will add processing time. + cf1 = self.CF1 + cf2 = self.CF2 + cum_weight = self.cumulative_weight + + num_dimensions = cf1.shape[0] # initialise the dimension preference vector to 1. Initially assume that each dimension has variance greater # than the threshold. - self.preferred_dimension_vector = np.ones(num_dimensions) + # self.preferred_dimension_vector = np.ones(num_dimensions) + # this is the updated preferred dimension vector + updated_pref_dim_vector = [] for index in range(num_dimensions): - squared_variance = (self.CF2[index] / self.cumulative_weight) - ( - (self.CF1[index] / self.cumulative_weight) ** 2) + squared_variance = (cf2[index] / cum_weight) - ((cf1[index] / cum_weight) ** 2) # only need to change the dimension preference to p_k as the list was initially initialised with 1. if squared_variance <= variance_threshold_squared: - self.preferred_dimension_vector[index] = k_constant + updated_pref_dim_vector.append(k_constant) + else: + updated_pref_dim_vector.append(1.0) + + self.preferred_dimension_vector = np.array(updated_pref_dim_vector) def add_new_point(self, new_point_values, new_point_timestamp, new_point_weight=1): """ @@ -214,8 +229,10 @@ def get_projected_dist_to_point(self, other_point): Float: Projected distance between this point and point given as argument. """ + centroids = self.cluster_centroids + pref_dim_vector = self.preferred_dimension_vector dist = 0.0 - for c_i, p_i, d_i in zip(self.cluster_centroids, other_point, self.preferred_dimension_vector): + for c_i, p_i, d_i in zip(centroids, other_point, pref_dim_vector): dist += ((p_i - c_i) ** 2) / d_i return dist @@ -226,11 +243,16 @@ def calculate_projected_radius_squared(self): Returns: Float: Squared Projected radius. """ + cf1 = self.CF1 + cf2 = self.CF2 + pref_dim_vector = self.preferred_dimension_vector + cum_weight = self.cumulative_weight + radius_squared = 0.0 - for i in range(self.CF1.shape[0]): - dimension = 1.0 / self.preferred_dimension_vector[i] - value = (self.CF2[i] / self.cumulative_weight) - \ - ((self.CF1[i] / self.cumulative_weight) ** 2) + + for c1, c2, p_dim in zip(cf1, cf2, pref_dim_vector): + dimension = 1.0 / p_dim + value = (c2 / cum_weight) - ((c1 / cum_weight) ** 2) radius_squared += dimension * value return radius_squared @@ -242,8 +264,11 @@ def get_copy(self): Returns: Microcluster: A clone of itself. 
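
# A numeric check of the CF1 docstring added above: applying the per-interval
# decay 2**(-lambda) to the already-decayed value gives the same result as
# decaying the original value by 2**(-lambda * t). lambda = 1 and the starting
# value 7 follow the docstring's own example.

lambbda = 1.0           # spelled as in the codebase
cf1_t0 = 7.0

stepwise = cf1_t0
for _ in range(2):                       # decay applied one interval at a time
    stepwise *= 2 ** (-lambbda)

one_shot = cf1_t0 * 2 ** (-lambbda * 2)  # decay applied in one shot from t0

print(stepwise, one_shot)                # 1.75 1.75
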
""" - new_cf1 = np.zeros(len(self.CF1)) + self.CF1 - new_cf2 = np.zeros(len(self.CF2)) + self.CF2 + cf1 = self.CF1 + cf2 = self.CF2 + + new_cf1 = np.zeros(len(cf1)) + cf1 + new_cf2 = np.zeros(len(cf2)) + cf2 return Microcluster(cf1=new_cf1, cf2=new_cf2, cumulative_weight=self.cumulative_weight) def get_copy_with_new_point(self, datapoint, variance_threshold_squared, k_constant): @@ -412,7 +437,8 @@ def add_parent(self, id): self.parents.update([id]) def set_parents(self, parent_pcores_to_id): - for pcore in self.pcore_ids: + pcore_ids = self.pcore_ids + for pcore in pcore_ids: if pcore in parent_pcores_to_id: self.add_parent(parent_pcores_to_id[pcore]) @@ -420,7 +446,9 @@ def get_parents(self): return self.parents def add_pcore_objects(self, pcore_id_to_object): - for pcore_id in self.pcore_ids: + pcore_ids = self.pcore_ids + + for pcore_id in pcore_ids: pcore = pcore_id_to_object[pcore_id] pcore_copy = pcore.get_copy() pcore_copy.preferred_dimension_vector = np.zeros( @@ -442,19 +470,22 @@ def add_historical_associate_pcore(self, pcore_id): self.historical_associates_pcores.update(pcore_id) def get_historical_associates_as_str(self): - return '&'.join(str(s) for s in self.historical_associates) + hist_assoc = self.historical_associates + return '&'.join(str(s) for s in hist_assoc) def get_historical_associates_pcore_as_str(self): - return '&'.join(str(s) for s in self.historical_associates_pcores) + hist_assoc_pcores = self.historical_associates_pcores + return '&'.join(str(s) for s in hist_assoc_pcores) def get_projected_dist_to_point(self, other_point): """ This is exact copy of the microcluster version. TODO Merge this with microcluster. """ - + centroid = self.centroid + pref_dim = self.preferred_dimensions dist = 0.0 - for c_i, p_i, d_i in zip(self.centroid, other_point, self.preferred_dimensions): + for c_i, p_i, d_i in zip(centroid, other_point, pref_dim): dist += ((float(p_i) - float(c_i)) ** 2) / float(d_i) return dist @@ -463,7 +494,8 @@ def get_dist_to_point(self, other_point): This is exact copy of the microcluster version. TODO Merge this with microcluster. """ + centroid = self.centroid dist = 0.0 - for i, cluster_centroid in enumerate(self.centroid): + for i, cluster_centroid in enumerate(centroid): dist += ((float(other_point[i]) - float(cluster_centroid)) ** 2) return dist diff --git a/chronoclust/predecon.py b/chronoclust/predecon.py index 3949fd5..173722b 100644 --- a/chronoclust/predecon.py +++ b/chronoclust/predecon.py @@ -54,26 +54,36 @@ def run(self): """ self._find_weighted_neighbours() - # For each datapoint, check and update its core_point status. This is only true if we don't use PreDeCon for - # offline clustering where datapoint is actually a cluster. In this case, the core status of each datapoint ( - # cluster) would have been set! + # For each datapoint, check and update its core_point status. + # This method will only change the core status if used for initialisation, + # i.e. if data points are actually data points. + # In the case of offline clustering where data points are MCs, the method does nothing. + # This is because the core status is already set by online phase. In other words, the core MCs are the seed. 
self._set_is_core_pt() - for datapt_id, datapoint in self.datapoints.items(): + datapoints_items = self.datapoints.items() + delta_squared = self.delta_squared + k = self.k + + for datapt_id, datapoint in datapoints_items: if datapoint.is_unclassified(): if datapoint.is_core_point: - last_cluster = set(range(len(self.clusters), len(self.clusters) + 1)) + # last_cluster = set(range(len(self.clusters), len(self.clusters) + 1)) + last_cluster = set() + last_cluster.add(len(self.clusters)) # Need to do it this way as for offline clustering, the cluster id must be an empty set. new_cluster_id = datapoint.get_new_cluster_id(last_cluster) - new_cluster = Microcluster(cf1=np.zeros(len(datapoint.dimension_values)), - cf2=np.zeros(len(datapoint.dimension_values)), id=new_cluster_id, - preferred_dimension_vector=np.ones(len(datapoint.dimension_values))) + datapoint_values = len(datapoint.dimension_values) + + new_cluster = Microcluster(cf1=np.zeros(datapoint_values), + cf2=np.zeros(datapoint_values), id=new_cluster_id, + preferred_dimension_vector=np.ones(datapoint_values)) # Try to expand the new_cluster with the datapoint self._expand(new_cluster, datapoint) - new_cluster.update_preferred_dimensions(self.delta_squared, self.k) + new_cluster.update_preferred_dimensions(delta_squared, k) # Sanity check to make sure we do not include empty cluster. if new_cluster.cumulative_weight > 0: @@ -97,16 +107,16 @@ def _expand(self, cluster, datapoint): # a copy of potential_cluster_members because it will be modified. We don't want to modify parameters. This # will make a copy as the weighted_neighbour_pts contains ids as int. queue = list(datapoint.weighted_neighbour_pts) - + datapoints = self.datapoints while len(queue) > 0: # q in paper[2] - first_datapt = self.datapoints[queue.pop(0)] + first_datapt = datapoints[queue.pop(0)] # R in paper[2] - directly_reachable_pts = self._find_directly_reachable_points(first_datapt, list(self.datapoints.keys())) + directly_reachable_pts = self._find_directly_reachable_points(first_datapt, list(datapoints.keys())) for dir_reachable_pt_id in directly_reachable_pts: # x in paper[2] figure 4. - x = self.datapoints[dir_reachable_pt_id] + x = datapoints[dir_reachable_pt_id] # line 10-14 of figure 4 in paper[2] if x.is_unclassified(): @@ -122,9 +132,12 @@ def _set_is_core_pt(self): Returns: None. """ - for datapt_id, datapt in self.datapoints.items(): - datapt.set_is_core_point(self.lambbda, - self.mu) + datapoints_items = self.datapoints.items() + lambbda = self.lambbda + mu = self.mu + + for datapt_id, datapt in datapoints_items: + datapt.set_is_core_point(lambbda, mu) def _find_weighted_neighbours(self): """ @@ -135,16 +148,20 @@ def _find_weighted_neighbours(self): Returns: None. """ - for datapt_id, datapt in self.datapoints.items(): + datapoints = self.datapoints + datapoints_items = datapoints.items() + epsilon_squared = self.epsilon_squared + + for datapt_id, datapt in datapoints_items: # Find all the neighbour points and calculate the subspace preference vector datapt.neighbour_pts = self._find_neighbour_points(datapt) datapt.subspace_preference_vector = self._calculate_subspace_preference_vector(datapt) # This MUST be done after the preference vectors for all points must have been calculated. 
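
# Side note on the _expand loop above: queue.pop(0) on a plain list is linear
# in the queue length. Since this change is about shaving per-call overhead, a
# collections.deque with popleft() is the usual constant-time spelling. This is
# only a side-by-side sketch with dummy neighbour ids, not a change made by the
# hunk itself.

from collections import deque

weighted_neighbour_pts = [3, 7, 11]

queue = list(weighted_neighbour_pts)    # list-based FIFO, as in the hunk
while len(queue) > 0:
    first = queue.pop(0)                # O(n) per pop
    print("visit", first)

queue = deque(weighted_neighbour_pts)   # deque-based FIFO
while queue:
    first = queue.popleft()             # O(1) per pop
    print("visit", first)
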
- for datapt_id, datapt in self.datapoints.items(): + for datapt_id, datapt in datapoints_items: for neighbour_pt_id in datapt.neighbour_pts: - dist = self._calculate_general_weighted_dist_squared(datapt, self.datapoints[neighbour_pt_id]) - if dist <= self.epsilon_squared: + dist = self._calculate_general_weighted_dist_squared(datapt, datapoints[neighbour_pt_id]) + if dist <= epsilon_squared: datapt.weighted_neighbour_pts.append(neighbour_pt_id) def _find_neighbour_points(self, point): @@ -159,11 +176,14 @@ def _find_neighbour_points(self, point): """ neighbour_points_id = [] - for datapt_id, datapt in self.datapoints.items(): + datapoints_items = self.datapoints.items() + epsilon = self.epsilon + + for datapt_id, datapt in datapoints_items: euclidean_dist = self._calculate_euclidean_dist(datapt.dimension_values, point.dimension_values) # See beginning of chapter 3 of paper[2] for neighbourhood points criteria - if euclidean_dist <= self.epsilon: + if euclidean_dist <= epsilon: neighbour_points_id.append(datapt_id) return neighbour_points_id @@ -192,11 +212,13 @@ def _calculate_subspace_preference_vector(self, point): Returns: Array: Subspace preference vector represented by an array. """ + delta = self.delta + k = self.k subspace_preference_vector = [] for dimension in range(self.dataset_dimensionality): variance = self._calculate_variance_along_dimension(point, dimension) - if variance <= self.delta: - subspace_preference_vector.append(self.k) + if variance <= delta: + subspace_preference_vector.append(k) else: subspace_preference_vector.append(1) return subspace_preference_vector @@ -213,9 +235,12 @@ def _calculate_variance_along_dimension(self, point, dimension): Returns: Float: Variance of neighbourhood of a point along a dimension """ + datapoints = self.datapoints + result = 0.0 + for i in point.neighbour_pts: - result += (point.dimension_values[dimension] - self.datapoints[i].dimension_values[dimension]) ** 2 + result += (point.dimension_values[dimension] - datapoints[i].dimension_values[dimension]) ** 2 return result / len(point.neighbour_pts) def _calculate_general_weighted_dist_squared(self, p, q): @@ -263,10 +288,13 @@ def _find_directly_reachable_points(self, point, potential_directly_reachable_po Returns: List: List of directly reachable points """ + datapoints = self.datapoints + lambbda = self.lambbda + dir_reachable_pts = [] for datapt_id in potential_directly_reachable_points: point_is_core = point.is_core_point - pdim_datapt_less_than_threshold = self.datapoints[datapt_id].get_pdim() <= self.lambbda + pdim_datapt_less_than_threshold = datapoints[datapt_id].get_pdim() <= lambbda datapt_is_neighbour_of_point = datapt_id in point.weighted_neighbour_pts if point_is_core and pdim_datapt_less_than_threshold and datapt_is_neighbour_of_point:
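
# A compact sketch of the subspace preference vector logic in the hunks above:
# the variance of a point's neighbourhood along each dimension is compared
# against delta, and low-variance dimensions receive the constant k. The
# points, delta and k below are made up for illustration.

import numpy as np

def subspace_preference_vector(point, neighbours, delta, k):
    vector = []
    for dim in range(len(point)):
        # mean squared deviation of the neighbours from the point, per dimension
        variance = np.mean([(point[dim] - n[dim]) ** 2 for n in neighbours])
        vector.append(k if variance <= delta else 1)
    return vector

point = np.array([0.5, 0.5])
neighbours = [np.array([0.5, 0.1]), np.array([0.5, 0.9]), np.array([0.6, 0.5])]

print(subspace_preference_vector(point, neighbours, delta=0.01, k=100))   # [100, 1]
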