From 10316e209730b6ab4693ab3d9099dedd638a7abc Mon Sep 17 00:00:00 2001
From: Jose Giraldo
Date: Sat, 17 Feb 2024 14:23:41 +0100
Subject: [PATCH 1/3] Update torchaudio mel spectrogram parameters

---
 vocos/feature_extractors.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/vocos/feature_extractors.py b/vocos/feature_extractors.py
index 799f1b4..ba9f34b 100644
--- a/vocos/feature_extractors.py
+++ b/vocos/feature_extractors.py
@@ -26,7 +26,16 @@ def forward(self, audio: torch.Tensor, **kwargs) -> torch.Tensor:
 
 
 class MelSpectrogramFeatures(FeatureExtractor):
-    def __init__(self, sample_rate=24000, n_fft=1024, hop_length=256, n_mels=100, padding="center"):
+    def __init__(self,
+                 sample_rate=24000,
+                 n_fft=1024,
+                 hop_length=256,
+                 n_mels=100,
+                 padding="center",
+                 f_min=0,  # to match matcha :X
+                 f_max=8000,
+                 norm="slaney",
+                 mel_scale="slaney"):
         super().__init__()
         if padding not in ["center", "same"]:
             raise ValueError("Padding must be 'center' or 'same'.")
@@ -38,6 +47,10 @@ def __init__(self, sample_rate=24000, n_fft=1024, hop_length=256, n_mels=100, pa
             n_mels=n_mels,
             center=padding == "center",
             power=1,
+            f_min=f_min,  # to match matcha :X
+            f_max=f_max,
+            norm=norm,
+            mel_scale=mel_scale
         )
 
     def forward(self, audio, **kwargs):

From 342276d8ee24e23cbe38cca6dea0b94e599f0b10 Mon Sep 17 00:00:00 2001
From: Jose Giraldo
Date: Sat, 17 Feb 2024 14:24:38 +0100
Subject: [PATCH 2/3] Update reconstruction loss with new mel features

---
 vocos/loss.py | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/vocos/loss.py b/vocos/loss.py
index 029f6ac..e69916f 100644
--- a/vocos/loss.py
+++ b/vocos/loss.py
@@ -12,12 +12,28 @@ class MelSpecReconstructionLoss(nn.Module):
     L1 distance between the mel-scaled magnitude spectrograms of the ground truth sample and the generated sample
     """
 
-    def __init__(
-        self, sample_rate: int = 24000, n_fft: int = 1024, hop_length: int = 256, n_mels: int = 100,
+    def __init__(self,
+                 sample_rate: int = 22050,
+                 n_fft: int = 1024,
+                 hop_length: int = 256,
+                 n_mels: int = 80,
+                 f_min: int = 0,
+                 f_max: int = 8000,
+                 norm: str = "slaney",
+                 mel_scale: str = "slaney",
     ):
         super().__init__()
         self.mel_spec = torchaudio.transforms.MelSpectrogram(
-            sample_rate=sample_rate, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels, center=True, power=1,
+            sample_rate=sample_rate,
+            n_fft=n_fft,
+            hop_length=hop_length,
+            n_mels=n_mels,
+            center=True,
+            power=1,
+            f_min=f_min,
+            f_max=f_max,
+            norm=norm,
+            mel_scale=mel_scale
         )
 
     def forward(self, y_hat, y) -> torch.Tensor:
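The two patches above expose the same mel parameters on both the feature extractor and the reconstruction loss, since the loss should measure distances in the same mel space the model is trained to invert. A minimal sanity-check sketch (not part of the patches; the shared mel_kwargs dict and the one-second dummy waveform are made up for illustration) of instantiating both classes with the Matcha-style settings:

# Illustrative only: build both modules with matching mel parameters and check shapes.
import torch

from vocos.feature_extractors import MelSpectrogramFeatures
from vocos.loss import MelSpecReconstructionLoss

mel_kwargs = dict(sample_rate=22050, n_fft=1024, hop_length=256, n_mels=80,
                  f_min=0, f_max=8000, norm="slaney", mel_scale="slaney")

features = MelSpectrogramFeatures(padding="same", **mel_kwargs)
loss_fn = MelSpecReconstructionLoss(**mel_kwargs)

audio = torch.randn(1, 22050)                # one second of dummy audio
mel = features(audio)                        # (1, 80, frames) log-mel features
print(mel.shape)

recon_loss = loss_fn(y_hat=audio, y=audio)   # ~0 for identical signals
print(recon_loss.item())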
From 734bc2f7fc87ce8c41d359a0a18811336aa192aa Mon Sep 17 00:00:00 2001
From: Jose Giraldo
Date: Sat, 17 Feb 2024 14:26:50 +0100
Subject: [PATCH 3/3] Create new config with matcha parameters

---
 configs/vocos-matcha.yaml | 93 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 93 insertions(+)
 create mode 100644 configs/vocos-matcha.yaml

diff --git a/configs/vocos-matcha.yaml b/configs/vocos-matcha.yaml
new file mode 100644
index 0000000..a5e6548
--- /dev/null
+++ b/configs/vocos-matcha.yaml
@@ -0,0 +1,93 @@
+# pytorch_lightning==1.8.6
+seed_everything: 4444
+
+data:
+  class_path: vocos.dataset.VocosDataModule
+  init_args:
+    train_params:
+      filelist_path: ???
+      sampling_rate: 22050
+      num_samples: 16384
+      batch_size: 16
+      num_workers: 8
+
+    val_params:
+      filelist_path: ???
+      sampling_rate: 22050
+      num_samples: 48384
+      batch_size: 16
+      num_workers: 8
+
+model:
+  class_path: vocos.experiment.VocosExp
+  init_args:
+    sample_rate: 22050
+    initial_learning_rate: 5e-4
+    mel_loss_coeff: 45
+    mrd_loss_coeff: 0.1
+    num_warmup_steps: 0 # Optimizers warmup steps
+    pretrain_mel_steps: 0 # 0 means GAN objective from the first iteration
+
+    # automatic evaluation
+    evaluate_utmos: true
+    evaluate_pesq: true
+    evaluate_periodicty: true
+
+    feature_extractor:
+      class_path: vocos.feature_extractors.MelSpectrogramFeatures
+      init_args:
+        sample_rate: 22050
+        n_fft: 1024
+        hop_length: 256
+        n_mels: 80
+        padding: same
+        f_min: 0
+        f_max: 8000
+        norm: "slaney"
+        mel_scale: "slaney"
+
+
+    backbone:
+      class_path: vocos.models.VocosBackbone
+      init_args:
+        input_channels: 80
+        dim: 512
+        intermediate_dim: 1536
+        num_layers: 8
+
+    head:
+      class_path: vocos.heads.ISTFTHead
+      init_args:
+        dim: 512
+        n_fft: 1024
+        hop_length: 256
+        padding: same
+
+
+trainer:
+  logger:
+    class_path: pytorch_lightning.loggers.TensorBoardLogger
+    init_args:
+      save_dir: /mnt/netapp1/Proxecto_NOS/bsc/speech/TTS/outputs/logs/vocos
+  callbacks:
+    - class_path: pytorch_lightning.callbacks.LearningRateMonitor
+    - class_path: pytorch_lightning.callbacks.ModelSummary
+      init_args:
+        max_depth: 2
+    - class_path: pytorch_lightning.callbacks.ModelCheckpoint
+      init_args:
+        monitor: val_loss
+        filename: vocos_checkpoint_{epoch}_{step}_{val_loss:.4f}
+        save_top_k: 3
+        save_last: true
+    - class_path: vocos.helpers.GradNormCallback
+
+  # Lightning calculates max_steps across all optimizer steps (rather than number of batches)
+  # This equals to 1M steps per generator and 1M per discriminator
+  max_steps: 2000000
+  # You might want to limit val batches when evaluating all the metrics, as they are time-consuming
+  limit_val_batches: 100
+  accelerator: gpu
+  strategy: ddp
+  devices: [0]
+  log_every_n_steps: 100
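Once the ??? filelist_path placeholders are filled in, the new config should be usable with the repository's usual training entry point. As a reference, a hypothetical stand-alone launcher is sketched below; the file name launch.py is made up, and it only illustrates how a LightningCLI-style config of this shape is consumed, under the assumption that the upstream train.py does essentially the same thing:

# Hypothetical launcher; the upstream repo's own train.py serves this purpose.
# Run as: python launch.py --config configs/vocos-matcha.yaml
from pytorch_lightning.cli import LightningCLI

if __name__ == "__main__":
    # No model/datamodule classes are passed, so LightningCLI instantiates them from
    # the class_path/init_args entries in the YAML (vocos.experiment.VocosExp,
    # vocos.dataset.VocosDataModule) and builds the trainer from the trainer: section.
    cli = LightningCLI(run=False)
    cli.trainer.fit(model=cli.model, datamodule=cli.datamodule)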