Skip to content

Commit

Permalink
[ADD] filtering based on pvalues
Browse files Browse the repository at this point in the history
  • Loading branch information
BenCretois committed Mar 7, 2024
1 parent 89347a7 commit 9aa6475
Show file tree
Hide file tree
Showing 3 changed files with 85 additions and 35 deletions.
10 changes: 7 additions & 3 deletions CONFIG_PREDICT.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ data:
normalize: True # used in preprocessing
frame_length: 25.0 # used in preprocessing
tensor_length: 128 # used in preprocessing
n_shot: 5
n_query: 10
n_shot: 3
n_query: 2
overlap: 0.5 # used in preprocessing
n_subsample: 1
num_mel_bins: 128 # used in preprocessing
Expand Down Expand Up @@ -54,7 +54,11 @@ model:
# PARAMETERS FOR MODEL PREDICTION #
###################################
predict:
wav_save: False
wav_save: True
overwrite: True
n_self_detected_supports: 0
tolerance: 0
filter_by_p_values: True

plot:
tsne: False
18 changes: 4 additions & 14 deletions evaluate/_utils_compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,21 +216,11 @@ def predict_labels_query(

return pred_labels, labels, begins, ends, d_to_pos, q_embeddings

def update_labels_for_outliers(X, Y, target_class=1, upper_threshold=0.95):
# Filter X and Y for the target class
X_filtered = X[Y == target_class]
indices_filtered = np.arange(len(X))[Y == target_class] # Indices of Y == target_class in the original array
def filter_outliers_by_p_values(Y, p_values, target_class=1, upper_threshold=0.05):
# Identify indices where the p-value is less than the threshold and the corresponding Y value equals the target_class
outlier_indices = np.where((p_values < upper_threshold) & (Y == target_class))[0]

# Calculate p-values for the filtered subset of X
p_values_filtered = calculate_p_values(X_filtered)

# Identify outliers within the filtered subset based on p-values
outlier_flags = (p_values_filtered > upper_threshold)

# Map back the indices of identified outliers to the original array
outlier_indices = indices_filtered[outlier_flags]

# Update labels in the original Y array for identified outliers
# Reset the label of every identified outlier to 0 (this modifies Y in place)
Y[outlier_indices] = 0

return Y
92 changes: 74 additions & 18 deletions evaluate/evaluateDCASE.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#!/usr/bin/env python3

import argparse
import numpy as np
import pandas as pd
import glob
Expand All @@ -20,7 +19,6 @@

from prototypicalbeats.prototraining import ProtoBEATsModel
from datamodules.TestDCASEDataModule import DCASEDataModule, AudioDatasetDCASE
from data_utils.audiolist import AudioList

import pytorch_lightning as pl

Expand All @@ -31,7 +29,7 @@
from evaluate._utils_writing import write_wav, write_results
from evaluate._utils_compute import (to_dataframe, get_proto_coordinates, calculate_distance,
compute_scores, merge_preds, reshape_support, training,
predict_labels_query)
predict_labels_query, filter_outliers_by_p_values)

import hydra
from omegaconf import DictConfig, OmegaConf
Expand Down Expand Up @@ -93,9 +91,8 @@ def compute(
support_samples_pos = reshape_support(support_samples_pos, tensor_length=cfg["data"]["tensor_length"])
z_pos_supports, _ = model.get_embeddings(support_samples_pos, padding_mask=None)

_, d_supports_to_POS_prototypes = calculate_distance(
model_type, z_pos_supports, prototypes[pos_index]
)
_, d_supports_to_POS_prototypes = calculate_distance(model_type, z_pos_supports, prototypes[pos_index])

print(f"DISTANCE TO POS = {d_supports_to_POS_prototypes}")
ecdf = ECDF(d_supports_to_POS_prototypes.detach().numpy())

Expand Down Expand Up @@ -201,23 +198,82 @@ def compute(
################################################
# PLOT PROTOTYPES AND EMBEDDINGS IN A 2D SPACE #
################################################
#prototypes=prototypes.to_numpy()
#z_pos_supports = z_pos_supports.to_numpy()
#z_neg_supports = z_neg_supports.to_numpy()
#q_embeddings = q_embeddings.to_numpy()
#gt_labels = labels
#other_labels = np.concatenate(([0,1], np.repeat(1, z_pos_supports.shape(0)), np.repeat(0, z_neg_supports.shape(0))), axis=None)

#f = np.concatenate([q_embeddings, prototypes, z_pos_supports, z_neg_supports])

#representation = np.concatenate(np.array(list(features)), axis=0)
#tsne = TSNE(n_components=2, perplexity=perplexity)

if cfg["plot"]["tsne"]:

from sklearn.manifold import TSNE
import seaborn as sns

prototypes=prototypes.detach().numpy()
z_pos_supports = z_pos_supports.detach().numpy()
z_neg_supports = z_neg_supports.detach().numpy()
q_embeddings = q_embeddings.detach().numpy()
gt_labels = labels
other_labels = np.concatenate(([0,1], np.repeat(1, z_pos_supports.shape(0)), np.repeat(0, z_neg_supports.shape(0))), axis=None)

feat = np.concatenate([q_embeddings, prototypes, z_pos_supports, z_neg_supports])
tsne = TSNE(n_components=2, perplexity=5)
features_2d = tsne.fit_transform(feat)

# Draw the scatter plot of the 2D t-SNE features
fig = sns.scatterplot(x=features_2d[:, 0], y=features_2d[:, 1], hue=labels)
sns.move_legend(fig, "upper left", bbox_to_anchor=(1, 1))

fig_name = os.path.basename(support_spectrograms).split("data_")[1].split(".")[0] + ".png"
output = os.path.join(target_path, fig_name)
fig.get_figure().savefig(output, bbox_inches="tight")

import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Assuming `prototypes`, `z_pos_supports`, `z_neg_supports`, `q_embeddings`, and `labels` are already defined
# Convert tensors to numpy arrays if they are in tensor format
# e.g., z_pos_supports = z_pos_supports.detach().numpy()

# Create a labels array for all points
# Label for prototypes, positive supports, negative supports, and query embeddings respectively
prototypes_labels = np.array([2] * prototypes.shape[0]) # Assuming 2 is not used in `gt_labels`
pos_supports_labels = np.array([3] * z_pos_supports.shape[0]) # Assuming 3 is not used in `gt_labels`
neg_supports_labels = np.array([4] * z_neg_supports.shape[0]) # Assuming 4 is not used in `gt_labels`

# Concatenate everything into one dataset
feat = np.concatenate([prototypes, z_pos_supports, z_neg_supports, q_embeddings])
all_labels = np.concatenate([prototypes_labels, pos_supports_labels, neg_supports_labels, gt_labels])

# Run t-SNE
tsne = TSNE(n_components=2, perplexity=30)
features_2d = tsne.fit_transform(feat)

# Plot
plt.figure(figsize=(10, 8))
# Define marker for each type of point
markers = {2: "P", 3: "o", 4: "X"} # P for prototypes, o for positive supports, X for negative supports

for label in np.unique(all_labels):
# Plot each class with its own color and marker
idx = np.where(all_labels == label)
if label in markers: # Prototypes or supports
plt.scatter(features_2d[idx, 0], features_2d[idx, 1], label=label, alpha=1.0, marker=markers[label], s=100) # Larger size
else: # Query embeddings
plt.scatter(features_2d[idx, 0], features_2d[idx, 1], label=label, alpha=0.5, s=50) # Smaller size, more transparent

plt.legend()
plt.title('t-SNE visualization of embeddings, prototypes, and supports')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.grid(True)

# Save the figure
plt.savefig(output, bbox_inches="tight")
plt.show()


# GET THE PVALUES
p_values_pos = 1 - ecdf(distances_to_pos)

if cfg["predict"]["filter_by_p_values"]:
predicted_labels = filter_outliers_by_p_values(predicted_labels, p_values_pos, target_class=1, upper_threshold=0.05)

# Compute the scores for the analysed file (for information only)
acc, recall, precision, f1score = compute_scores(
predicted_labels=predicted_labels, #updated_labels,
Expand Down

0 comments on commit 9aa6475

Please sign in to comment.