From 27ba9178115dc68b100e0b756bce92c72e157a55 Mon Sep 17 00:00:00 2001
From: styagi130 <styagi130@gmail.com>
Date: Thu, 24 Aug 2023 22:58:12 +0530
Subject: [PATCH] specify explicitly to set pretrained model paths (#7305)

Signed-off-by: Siddharth Tyagi <siddhartht@nvidia.com>
Co-authored-by: Siddharth Tyagi <siddhartht@nvidia.com>
---
 .../tts/FastPitch_Adapter_Finetuning.ipynb    | 24 +++++++++++++++----
 1 file changed, 19 insertions(+), 5 deletions(-)
diff --git a/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb b/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb
index 0499c12c90ec1..263d22b60599d 100644
--- a/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb
+++ b/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb
@@ -80,6 +80,16 @@
     "!wandb login #PASTE_WANDB_APIKEY_HERE"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "b73283fc",
+   "metadata": {},
+   "source": [
+    "## Set finetuning params\n",
+    "\n",
+    "This notebook expects a pretrained model to finetune. If you already have a pretrained multispeaker checkpoint, set `pretrained_fastpitch_checkpoint` in the next cell to the path of that checkpoint. Otherwise, you can pretrain a multispeaker adapter checkpoint by following the [FastPitch_MultiSpeaker_Pretraining tutorial](https://github.com/NVIDIA/NeMo/blob/main/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb)."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -88,8 +98,9 @@
    "outputs": [],
    "source": [
     "# .nemo files for your pre-trained FastPitch and HiFiGAN\n",
-    "pretrained_fastpitch_checkpoint = \"\"\n",
-    "finetuned_hifigan_on_multispeaker_checkpoint = \"\""
+    "pretrained_fastpitch_checkpoint = \"<Multispeaker pretrained checkpoint path.>\"\n",
+    "finetuned_hifigan_on_multispeaker_checkpoint = \"<Pretrained hifiGan checkpoint path.>\"\n",
+    "use_ipa = True #Set to False while using Arpabet."
    ]
   },
   {
@@ -430,12 +441,15 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "phoneme_dict_path = os.path.abspath(os.path.join(code_dir, \"scripts\", \"tts_dataset_files\", \"ipa_cmudict-0.7b_nv23.01.txt\"))\n",
+    "phone_dict_name = \"ipa_cmudict-0.7b_nv23.01.txt\" if use_ipa else \"cmudict-0.7b_nv22.10\"\n",
+    "phoneme_dict_path = os.path.abspath(os.path.join(code_dir, \"scripts\", \"tts_dataset_files\", phone_dict_name))\n",
     "heteronyms_path = os.path.abspath(os.path.join(code_dir, \"scripts\", \"tts_dataset_files\", \"heteronyms-052722\"))\n",
     "\n",
     "# Copy and Paste the PITCH_MEAN and PITCH_STD from previous steps (train_manifest) to override pitch_mean and pitch_std configs below.\n",
     "PITCH_MEAN=175.48513793945312\n",
-    "PITCH_STD=42.3786735534668"
+    "PITCH_STD=42.3786735534668\n",
+    "\n",
+    "config_filename = \"fastpitch_align_ipa_adapter.yaml\" if use_ipa else \"fastpitch_align_44100_adapter.yaml\""
    ]
   },
   {
@@ -468,7 +482,7 @@
    "source": [
     "# Normally 200 epochs\n",
     "!cd {code_dir} && python examples/tts/fastpitch_finetune_adapters.py \\\n",
-    "--config-name=fastpitch_align_ipa_adapter.yaml \\\n",
+    "--config-name={config_filename} \\\n",
     "+init_from_nemo_model={pretrained_fastpitch_checkpoint} \\\n",
     "train_dataset={train_manifest} \\\n",
     "validation_datasets={valid_manifest} \\\n",