Merge pull request #108 from jbaczek/modify_ub_tp_config_resolution
Move config name resolution to hydra
yaoyu-33 authored Aug 11, 2023
2 parents 45d6d33 + 6dcc162 commit 9e54708
Showing 13 changed files with 45 additions and 52 deletions.
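
In short: before this change, the launcher resolved the userbuffer (UB) tensor-parallel communication-overlap config by spawning conditional_cfgs.py at submission time and substituting the printed file path into the training command. After it, the launcher only constructs a config name and resolution moves to Hydra: each gpt3 training config gains a hydra.searchpath entry pointing at the NeMo configs shipped in the container, and stages.py emits a '+tp_overlap@model.ub_tp_comm_overlap_cfg=<name>' config-group override in place of the old \$(...) command substitution.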
4 changes: 4 additions & 0 deletions launcher_scripts/conf/training/gpt3/126m.yaml
@@ -1,3 +1,7 @@
+hydra:
+  searchpath:
+    - file:///opt/NeMo/examples/nlp/language_modeling/conf
+
 run:
   name: gpt3_126m
   results_dir: ${base_results_dir}/${.name}
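
Each gpt3 training config in this commit gains the same four lines. hydra.searchpath only takes effect in a primary config, presumably because the merged training YAML becomes the primary config of the NeMo training script inside the container; the file:// entry then makes NeMo's bundled config groups discoverable. A self-contained sketch (not from either repo) of the two Hydra features involved — a searchpath entry adding an extra config directory, and a '+group@package=name' override composing a group entry into a package — with all names invented for illustration:

    import os
    import tempfile

    from hydra import compose, initialize_config_dir

    # Build a throwaway primary config dir and an "external" config dir that
    # plays the role of /opt/NeMo/examples/nlp/language_modeling/conf.
    root, extra = tempfile.mkdtemp(), tempfile.mkdtemp()
    os.makedirs(os.path.join(extra, "tp_overlap"))
    with open(os.path.join(root, "primary.yaml"), "w") as f:
        f.write(f"hydra:\n  searchpath:\n    - file://{extra}\nmodel: {{}}\n")
    with open(os.path.join(extra, "tp_overlap", "ub_cfg_demo.yaml"), "w") as f:
        f.write("bulk_overlap: true\n")

    with initialize_config_dir(version_base=None, config_dir=root):
        cfg = compose(
            config_name="primary",
            # Select tp_overlap/ub_cfg_demo.yaml and graft it onto the package.
            overrides=["+tp_overlap@model.ub_tp_comm_overlap_cfg=ub_cfg_demo"],
        )
    print(cfg.model.ub_tp_comm_overlap_cfg)  # -> {'bulk_overlap': True}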
4 changes: 4 additions & 0 deletions launcher_scripts/conf/training/gpt3/175b.yaml
@@ -1,3 +1,7 @@
+hydra:
+  searchpath:
+    - file:///opt/NeMo/examples/nlp/language_modeling/conf
+
 run:
   name: gpt3_175b
   results_dir: ${base_results_dir}/${.name}
4 changes: 4 additions & 0 deletions launcher_scripts/conf/training/gpt3/175b_performance.yaml
@@ -1,6 +1,10 @@
 # The configurations below provide the best 175B training performance with the NeMo SW stack.
 # We have confirmed the model convergence only with a limited number of tokens and the full model
 # convergence (e.g., 300B tokens) is not guaranteed.
+hydra:
+  searchpath:
+    - file:///opt/NeMo/examples/nlp/language_modeling/conf
+
 run:
   name: gpt3_175b
   results_dir: ${base_results_dir}/${.name}
4 changes: 4 additions & 0 deletions launcher_scripts/conf/training/gpt3/1b_improved.yaml
@@ -1,3 +1,7 @@
+hydra:
+  searchpath:
+    - file:///opt/NeMo/examples/nlp/language_modeling/conf
+
 run:
   name: gpt_1b_improved
   results_dir: ${base_results_dir}/${.name}
4 changes: 4 additions & 0 deletions launcher_scripts/conf/training/gpt3/20b.yaml
@@ -1,3 +1,7 @@
+hydra:
+  searchpath:
+    - file:///opt/NeMo/examples/nlp/language_modeling/conf
+
 run:
   name: gpt3_20b
   results_dir: ${base_results_dir}/${.name}
4 changes: 4 additions & 0 deletions launcher_scripts/conf/training/gpt3/400m_improved.yaml
@@ -1,3 +1,7 @@
+hydra:
+  searchpath:
+    - file:///opt/NeMo/examples/nlp/language_modeling/conf
+
 run:
   name: gpt_400m_improved
   results_dir: ${base_results_dir}/${.name}
4 changes: 4 additions & 0 deletions launcher_scripts/conf/training/gpt3/40b.yaml
@@ -1,3 +1,7 @@
+hydra:
+  searchpath:
+    - file:///opt/NeMo/examples/nlp/language_modeling/conf
+
 run:
   name: gpt3_40b
   results_dir: ${base_results_dir}/${.name}
4 changes: 4 additions & 0 deletions launcher_scripts/conf/training/gpt3/40b_improved.yaml
@@ -1,3 +1,7 @@
+hydra:
+  searchpath:
+    - file:///opt/NeMo/examples/nlp/language_modeling/conf
+
 run:
   name: gpt_40b_improved
   results_dir: ${base_results_dir}/${.name}
4 changes: 4 additions & 0 deletions launcher_scripts/conf/training/gpt3/5b.yaml
@@ -1,3 +1,7 @@
+hydra:
+  searchpath:
+    - file:///opt/NeMo/examples/nlp/language_modeling/conf
+
 run:
   name: gpt3_5b
   results_dir: ${base_results_dir}/${.name}
4 changes: 4 additions & 0 deletions launcher_scripts/conf/training/gpt3/7b_improved.yaml
@@ -1,3 +1,7 @@
+hydra:
+  searchpath:
+    - file:///opt/NeMo/examples/nlp/language_modeling/conf
+
 run:
   name: gpt_7b_improved
   results_dir: ${base_results_dir}/${.name}
31 changes: 0 additions & 31 deletions launcher_scripts/nemo_launcher/collections/conditional_cfgs.py
@@ -12,12 +8,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
-import math
 import pynvml
-import os
 import sys
-from collections import defaultdict
 
 import hydra
 
@@ -28,31 +24,6 @@
     pynvml.nvmlShutdown()
 
 
-@hydra.main(version_base=None, config_path="conf", config_name="get_ub_cfg_file")
-def get_ub_cfg_file(cfg):
-    """
-    Find and return the userbuffer config file. If it doesn't exist return `null`.
-    """
-    global cuda_capability
-    device_name = None
-    if cuda_capability == 8:
-        device_name = "a100"
-    elif cuda_capability == 9:
-        device_name = "h100"
-    ub_cfg_path = cfg.get("ub_cfg_path")
-    tp_size = cfg.get("tp_size")
-    hidden_size = cfg.get("hidden_size")
-    mb_size = cfg.get("mb_size")
-    seqlen = cfg.get("seqlen")
-    cfg_file_name = f"ub_cfg_{device_name}_h{hidden_size}_tp{tp_size}_mbs{mb_size}_seqlen{seqlen}.yaml"
-    cfg_file = os.path.join(ub_cfg_path, cfg_file_name)
-
-    if os.path.isfile(cfg_file):
-        print(f"{cfg_file}")
-    else:
-        print(f"null")
-
-
 @hydra.main(version_base=None, config_path="conf", config_name="get_ln_sm_margin")
 def get_ln_sm_margin(cfg):
     """
@@ -83,8 +54,6 @@ def get_ag_overlap(cfg):
 
 
 if __name__ == "__main__":
-    if sys.argv[1] == "name=get_ub_cfg_file":
-        get_ub_cfg_file()
     elif sys.argv[1] == "name=get_ln_sm_margin":
         get_ln_sm_margin()
     elif sys.argv[1] == "name=get_ag_overlap":
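
The deleted get_ub_cfg_file mapped CUDA compute capability to a device name (8 → a100, 9 → h100) and searched the ub-confs directory on disk, printing either the file path or null. Under the new scheme that device-name logic has to sit behind the ${gpu_name:} interpolation emitted by stages.py below; the resolver's registration is not part of this diff, so the following is only a hedged sketch of what an equivalent custom OmegaConf resolver could look like, reusing the deleted mapping:

    import pynvml
    from omegaconf import OmegaConf

    def _gpu_name() -> str:
        # Mirror of the deleted mapping: compute capability 8 -> a100, 9 -> h100.
        pynvml.nvmlInit()
        handle = pynvml.nvmlDeviceGetHandleByIndex(0)
        major, _minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle)
        pynvml.nvmlShutdown()
        return {8: "a100", 9: "h100"}.get(major, "unknown")

    # Register under the name used by the ${gpu_name:} interpolation, so it
    # runs when Hydra/OmegaConf resolves the config on the compute node.
    OmegaConf.register_new_resolver("gpu_name", _gpu_name)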

This file was deleted.

20 changes: 5 additions & 15 deletions launcher_scripts/nemo_launcher/core/stages.py
@@ -600,8 +600,8 @@ def _make_hydra_override(self) -> List:
             )
             hydra_override += [f"model.data.data_prefix=\$({auto_blend_command})"]
         if self.stage_cfg.model.get("ub_tp_comm_overlap", False):
-            get_ub_cfg_file_command = self._get_ub_cfg_file()
-            hydra_override += [f"+model.ub_tp_comm_overlap_cfg=\$({get_ub_cfg_file_command})"]
+            ub_cfg_name = self._get_ub_cfg_override()
+            hydra_override += [f"'+tp_overlap@model.ub_tp_comm_overlap_cfg={ub_cfg_name}'"]
         if self.stage_cfg.model.get("gc_interval", 0) > 1:
             gc_interval = min(self.stage_cfg.model.get("gc_interval"), self.cfg.training.trainer.get("val_check_interval"))
             hydra_override += [f"model.gc_interval={gc_interval}"]
@@ -624,26 +624,16 @@ def _get_nemo_code_path(self, model_type: str) -> Path:
         }
         return model_type_to_code_path[model_type]
 
-    def _get_ub_cfg_file(self) -> str:
+    def _get_ub_cfg_override(self) -> str:
         """
         Spawn the script to search UB configuration file
         """
         tp_size = self.stage_cfg.model.get("tensor_model_parallel_size")
         hidden_size = self.stage_cfg.model.get("hidden_size")
         mb_size = self.stage_cfg.model.get("micro_batch_size")
         seqlen = self.stage_cfg.model.get("encoder_seq_length")
-        ub_cfg_path = os.path.join(self._launcher_scripts_path, "launcher_scripts/conf/training/gpt3/ub-confs")
-
-        get_ub_cfg_file_command = (
-            f"python3 {self._launcher_scripts_path / 'nemo_launcher/collections/conditional_cfgs.py'} "
-            f"name=get_ub_cfg_file "
-            f"ub_cfg_path={ub_cfg_path} "
-            f"tp_size={tp_size} "
-            f"hidden_size={hidden_size} "
-            f"mb_size={mb_size} "
-            f"seqlen={seqlen}"
-        )
-        return get_ub_cfg_file_command
+        cfg_name = f"ub_cfg_\\${{gpu_name:}}_h{hidden_size}_tp{tp_size}_mbs{mb_size}_seqlen{seqlen}"
+        return cfg_name
 
 
 class FineTuning(NeMoStage):
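
The new override swaps shell-level command substitution for Hydra-native composition: '+tp_overlap@model.ub_tp_comm_overlap_cfg=<name>' selects the YAML named <name> from the tp_overlap config group (found via the searchpath added above) and grafts it onto model.ub_tp_comm_overlap_cfg. The doubled escaping in the f-string keeps the $ away from the shell so that ${gpu_name:} survives until Hydra resolves it on the compute node. A sketch of how the name renders, with hypothetical 175b-like values:

    # Illustrative only: the values below are hypothetical, chosen to resemble a 175b run.
    hidden_size, tp_size, mb_size, seqlen = 12288, 8, 1, 2048
    cfg_name = f"ub_cfg_\\${{gpu_name:}}_h{hidden_size}_tp{tp_size}_mbs{mb_size}_seqlen{seqlen}"
    print(cfg_name)  # -> ub_cfg_\${gpu_name:}_h12288_tp8_mbs1_seqlen2048
    # After the shell strips the backslash and a gpu_name resolver runs on an
    # H100 node, Hydra would look up tp_overlap/ub_cfg_h100_h12288_tp8_mbs1_seqlen2048.yaml.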
