From bdf162e0a9c86bad660ca457042a20c56580f03a Mon Sep 17 00:00:00 2001
From: Benjamin Cretois
Date: Mon, 4 Mar 2024 08:44:32 +0100
Subject: [PATCH] [ADD] stratify on the train / val split

---
 dcase_fine_tune/CONFIG.yaml     |   2 +
 dcase_fine_tune/FTBeats.py      |   6 +-
 dcase_fine_tune/FTDataModule.py | 120 +++++++++++++++-----------------
 dcase_fine_tune/FTtrain.py      |   8 +--
 4 files changed, 65 insertions(+), 71 deletions(-)

diff --git a/dcase_fine_tune/CONFIG.yaml b/dcase_fine_tune/CONFIG.yaml
index 636b72d..38cf43b 100644
--- a/dcase_fine_tune/CONFIG.yaml
+++ b/dcase_fine_tune/CONFIG.yaml
@@ -36,9 +36,11 @@ trainer:
   num_workers: 4
   patience: 20
   min_sample_per_category: 10
+  test_size: 0.2
 
 model:
   lr: 1.0e-05
+  ft_entire_network: True
   num_target_classes: 2
   model_path: "/data/models/BEATs/BEATs_iter3_plus_AS2M.pt"
   specaugment_params: null
diff --git a/dcase_fine_tune/FTBeats.py b/dcase_fine_tune/FTBeats.py
index 31f8551..e7f70e1 100644
--- a/dcase_fine_tune/FTBeats.py
+++ b/dcase_fine_tune/FTBeats.py
@@ -60,7 +60,8 @@ def _build_model(self):
         self.beats.load_state_dict(self.checkpoint["model"])
 
         # 2. Classifier
-        self.fc = nn.Linear(self.cfg.encoder_embed_dim, self.cfg.predictor_class)
+        print(f"Classifier has {self.num_target_classes} output neurons")
+        self.fc = nn.Linear(self.cfg.encoder_embed_dim, self.num_target_classes)
 
     def extract_features(self, x, padding_mask=None):
         if padding_mask != None:
@@ -81,7 +82,7 @@ def forward(self, x, padding_mask=None):
         # Get the logits
         x = self.fc(x)
 
-        # Mean pool the second layer
+        # Mean pool the second dimension (these are the tokens)
         x = x.mean(dim=1)
 
         return x
@@ -99,6 +100,7 @@ def training_step(self, batch, batch_idx):
         train_loss = self.loss(y_probs, y_true)
 
         # 3. Compute accuracy:
+        self.log("train_loss", train_loss, prog_bar=True)
         self.log("train_acc", self.train_acc(y_probs, y_true), prog_bar=True)
 
         return train_loss
diff --git a/dcase_fine_tune/FTDataModule.py b/dcase_fine_tune/FTDataModule.py
index c25bc3c..13ba164 100644
--- a/dcase_fine_tune/FTDataModule.py
+++ b/dcase_fine_tune/FTDataModule.py
@@ -1,6 +1,6 @@
 from torch.utils.data import Dataset, DataLoader
 from pytorch_lightning import LightningDataModule
-from sklearn.preprocessing import LabelEncoder
+from sklearn.preprocessing import LabelEncoder, OneHotEncoder
 from sklearn.model_selection import train_test_split
 import torch
 import pandas as pd
@@ -8,53 +8,7 @@
 
 from torch.utils.data import WeightedRandomSampler
 
-
-class AudioDatasetDCASE(Dataset):
-    def __init__(
-        self,
-        data_frame,
-        label_dict=None,
-    ):
-        self.data_frame = data_frame
-        self.label_encoder = LabelEncoder()
-        if label_dict is not None:
-            self.label_encoder.fit(list(label_dict.keys()))
-            self.label_dict = label_dict
-        else:
-            self.label_encoder.fit(self.data_frame["category"])
-            self.label_dict = dict(
-                zip(
-                    self.label_encoder.classes_,
-                    self.label_encoder.transform(self.label_encoder.classes_),
-                )
-            )
-
-    def __len__(self):
-        return len(self.data_frame)
-
-    def get_labels(self):
-        labels = []
-
-        for i in range(0, len(self.data_frame)):
-            label = self.data_frame.iloc[i]["category"]
-            label = self.label_encoder.transform([label])[0]
-            labels.append(label)
-
-        return labels
-
-    def __getitem__(self, idx):
-        input_feature = torch.Tensor(self.data_frame.iloc[idx]["feature"])
-        label = self.data_frame.iloc[idx]["category"]
-
-        # Encode label as integer
-        label = self.label_encoder.transform([label])[0]
-
-        return input_feature, label
-
-    def get_label_dict(self):
-        return self.label_dict
-
-class AudioDatasetDCASEV2(Dataset):
+class TrainAudioDatasetDCASE(Dataset):
     def __init__(
         self,
         data_frame,
@@ -98,28 +52,16 @@ def __init__(
         self.test_size = test_size
         self.min_sample_per_category = min_sample_per_category
 
-        self.label_encoder = LabelEncoder()
-        self.label_encoder.fit(self.data_frame["category"])
-        self.label_dict = dict(
-            zip(
-                self.label_encoder.classes_,
-                self.label_encoder.transform(self.label_encoder.classes_),
-            )
-        )
-
         self.setup()
         self.divide_train_val()
 
     def setup(self, stage=None):
         # load data
-        self.data_frame["category"] = self.label_encoder.fit_transform(self.data_frame["category"])
-        self.complete_dataset = AudioDatasetDCASEV2(data_frame=self.data_frame)
+        self.data_frame["category"] = LabelEncoder().fit_transform(self.data_frame["category"])
+        self.complete_dataset = TrainAudioDatasetDCASE(data_frame=self.data_frame)
 
     def divide_train_val(self):
-        value_counts = self.data_frame["category"].value_counts()
-        self.num_target_classes = len(self.data_frame["category"].unique())
-
         # Separate into training and validation set
         train_indices, validation_indices, _, _ = train_test_split(
             range(len(self.complete_dataset)),
@@ -132,7 +74,7 @@ def divide_train_val(self):
         data_frame_train = self.data_frame.loc[train_indices]
         data_frame_train.reset_index(drop=True, inplace=True)
 
-        # deal with class imbalance
+        # deal with class imbalance in the training set
        value_counts = data_frame_train["category"].value_counts()
        weight = 1. / value_counts
        samples_weight = np.array([weight[t] for t in data_frame_train["category"]])
@@ -140,14 +82,15 @@ def divide_train_val(self):
        samples_weight = samples_weight.double()
        self.sampler = WeightedRandomSampler(samples_weight, len(samples_weight))
 
+        # Make the validation set
         data_frame_validation = self.data_frame.loc[validation_indices]
         data_frame_validation.reset_index(drop=True, inplace=True)
 
         # generate subset based on indices
-        self.train_set = AudioDatasetDCASEV2(
+        self.train_set = TrainAudioDatasetDCASE(
             data_frame=data_frame_train,
         )
-        self.val_set = AudioDatasetDCASEV2(
+        self.val_set = TrainAudioDatasetDCASE(
             data_frame=data_frame_validation,
         )
 
@@ -195,6 +138,53 @@ def collate_fn(
 
     return (all_images, all_labels)
 
+
+
+class AudioDatasetDCASE(Dataset):
+    def __init__(
+        self,
+        data_frame,
+        label_dict=None,
+    ):
+        self.data_frame = data_frame
+        self.label_encoder = LabelEncoder()
+        if label_dict is not None:
+            self.label_encoder.fit(list(label_dict.keys()))
+            self.label_dict = label_dict
+        else:
+            self.label_encoder.fit(self.data_frame["category"])
+            self.label_dict = dict(
+                zip(
+                    self.label_encoder.classes_,
+                    self.label_encoder.transform(self.label_encoder.classes_),
+                )
+            )
+
+    def __len__(self):
+        return len(self.data_frame)
+
+    def get_labels(self):
+        labels = []
+
+        for i in range(0, len(self.data_frame)):
+            label = self.data_frame.iloc[i]["category"]
+            label = self.label_encoder.transform([label])[0]
+            labels.append(label)
+
+        return labels
+
+    def __getitem__(self, idx):
+        input_feature = torch.Tensor(self.data_frame.iloc[idx]["feature"])
+        label = self.data_frame.iloc[idx]["category"]
+
+        # Encode label as integer
+        label = self.label_encoder.transform([label])[0]
+
+        return input_feature, label
+
+    def get_label_dict(self):
+        return self.label_dict
+
 class predictLoader():
     def __init__(
         self,
diff --git a/dcase_fine_tune/FTtrain.py b/dcase_fine_tune/FTtrain.py
index b5a5fb3..8c6c256 100644
--- a/dcase_fine_tune/FTtrain.py
+++ b/dcase_fine_tune/FTtrain.py
@@ -30,7 +30,7 @@ def train_model(
         auto_select_gpus=True,
         callbacks=[
             pl.callbacks.LearningRateMonitor(logging_interval="step"),
-            pl.callbacks.EarlyStopping(monitor="val_loss", mode="min", patience=patience),
+            pl.callbacks.EarlyStopping(monitor="train_loss", mode="min", patience=patience),
         ],
         default_root_dir=root_dir,
         enable_checkpointing=True
@@ -81,16 +81,16 @@ def main(cfg: DictConfig):
                          batch_size=cfg["trainer"]["batch_size"],
                          num_workers=cfg["trainer"]["num_workers"],
                          tensor_length=cfg["data"]["tensor_length"],
-                         test_size=0.2,
+                         test_size=cfg["trainer"]["test_size"],
                          min_sample_per_category=cfg["trainer"]["min_sample_per_category"])
 
     # create the model object
     num_target_classes = len(df["category"].unique())
-    print(num_target_classes)
     model = BEATsTransferLearningModel(model_path=cfg["model"]["model_path"],
                                        num_target_classes=num_target_classes,
-                                       lr=cfg["model"]["lr"])
+                                       lr=cfg["model"]["lr"],
+                                       ft_entire_network=cfg["model"]["ft_entire_network"])
 
     train_model(model,
                 Loader,
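
The core of this patch is a stratified train/val split combined with a WeightedRandomSampler over the training set. Below is a minimal, self-contained sketch of that pattern. The toy DataFrame and the explicit stratify= argument are assumptions: the diff truncates the train_test_split call, but the commit subject states the split is stratified, and the column names ("feature", "category") match the DataModule.

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import WeightedRandomSampler

# Toy, imbalanced dataset with the columns the DataModule expects.
df = pd.DataFrame({
    "feature": [np.random.rand(16) for _ in range(100)],
    "category": ["dog"] * 80 + ["cat"] * 20,
})
df["category"] = LabelEncoder().fit_transform(df["category"])

# Stratified split: train and val keep the same class proportions.
train_idx, val_idx = train_test_split(
    np.arange(len(df)),
    test_size=0.2,
    stratify=df["category"],
)
train_df = df.loc[train_idx].reset_index(drop=True)

# Inverse-frequency weights so rare classes are drawn more often.
value_counts = train_df["category"].value_counts()
weight = 1.0 / value_counts
samples_weight = torch.from_numpy(
    np.array([weight[t] for t in train_df["category"]])
).double()
sampler = WeightedRandomSampler(samples_weight, len(samples_weight))
# `sampler` is then passed to the training DataLoader (sampler=sampler).

Note that the sampler only rebalances training batches; the validation set keeps the natural class distribution, which the stratified split preserves.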
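
The new ft_entire_network flag is added to CONFIG.yaml and forwarded to BEATsTransferLearningModel, but the model-side handling is outside this diff. One plausible way the flag could gate the optimizer's parameter set in configure_optimizers is sketched below; the class and attribute wiring are assumptions, not the patch's actual code.

import pytorch_lightning as pl
import torch.optim as optim

class FTSketch(pl.LightningModule):
    # Hypothetical wrapper: `beats` is the pretrained encoder, `fc` the head.
    def __init__(self, beats, fc, lr=1e-5, ft_entire_network=True):
        super().__init__()
        self.beats = beats
        self.fc = fc
        self.lr = lr
        self.ft_entire_network = ft_entire_network

    def configure_optimizers(self):
        if self.ft_entire_network:
            # Fine-tune the encoder together with the classifier head.
            params = list(self.beats.parameters()) + list(self.fc.parameters())
        else:
            # Freeze the encoder: only the linear head receives updates.
            params = self.fc.parameters()
        return optim.AdamW(params, lr=self.lr)

Head-only training is the cheaper option for small fine-tuning sets; the shipped config sets ft_entire_network: True, which backpropagates through the whole encoder and is correspondingly more expensive.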