# config.yml

---

# Model variant to use. One of: "quartznet5x5", "quartznet10x5", "quartznet15x5"
model: quartznet5x5

# Datasets
# Training split
train:
  # list file for this split
  data_list: assets/train_data.txt
  # dataset directories for this split
  data_dirs:
    - "LibriTTS/train-clean-360"
  # augmentation switches
  # NOTE(review): presumably these enable the top-level `speed_perturbation` and `masking` settings - confirm
  apply_speed_perturbation: true
  apply_masking: true

# Validation split
val:
  # list file for this split
  data_list: assets/val_data.txt
  # dataset directories for this split
  data_dirs:
    - "LibriTTS/dev-clean"

# Test split
test:
  # model checkpoint file
  # NOTE(review): presumably the weights loaded for evaluation - confirm
  weights: checkpoints/model_best.pt
  # list file for this split
  data_list: assets/test_data.txt
  # dataset directories for this split
  data_dirs:
    - "LibriTTS/test-clean"

# Training
# maximum file length in seconds; can be used to avoid OOM (out-of-memory) errors
max_length: 12
epochs: 100
batch_size: 32
# constant learning rate value; not used when use_onecyclelr is true.
# Keep the decimal point: YAML 1.1 loaders (e.g. PyYAML) read "5e-4" as a string, but "5.0e-4" as a float.
learning_rate: 5.0e-4
weight_decay: 0.0001
checkpoint_dir: checkpoints/
log_dir: logs/
use_onecyclelr: true
normalize: true
# number of workers for the dataloader
num_workers: 2

# OneCycleLR parameters
# https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.OneCycleLR.html#torch.optim.lr_scheduler.OneCycleLR
# Upper learning rate boundary of the cycle.
# Keep the decimal point: YAML 1.1 loaders (e.g. PyYAML) read "1e-3" as a string, but "1.0e-3" as a float.
max_lr: 1.0e-3
# per the OneCycleLR docs: initial_lr = max_lr / div_factor
div_factor: 25.0
# per the OneCycleLR docs: fraction of the cycle spent increasing the learning rate
pct_start: 0.3

# Augmentation
# NOTE(review): presumably the relative range of the random speed change (0.1 ~ +/-10%) - confirm against the loader
speed_perturbation: 0.1
masking:
  # Size (along the time axis) of each spectrogram segment that the augmentation is applied to.
  # Set to -1 to apply the original PyTorch masking instead.
  chunk_size: 60
  # NOTE(review): presumably the maximum mask width along the frequency / time axis - confirm
  freq_masking: 10
  time_masking: 10

# Spectrogram parameters.
# NOTE(review): meanings and units below are inferred from the key names - confirm against the feature extractor.
spec_params:
  sr: 22050  # sample rate (presumably Hz)
  n_mels: 64  # number of mel bins
  n_fft: 320  # FFT size (presumably in samples)
  win_length: 320  # window length (presumably in samples); equal to n_fft
  hop_length: 160  # hop between frames (presumably in samples); half of win_length

# File with the mean and std of the data.
# NOTE(review): presumably read when `normalize` is true; the "64" in the name likely matches n_mels - confirm
stats: assets/stats_64.npy