Merge pull request #108 from jbaczek/modify_ub_tp_config_resolution
Move config name resolution to hydra
yaoyu-33 authored Aug 11, 2023
2 parents 45d6d33 + 6dcc162 commit 9e54708
Showing 13 changed files with 45 additions and 52 deletions.
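
In short: before this change, the launcher resolved the userbuffer (UB) tensor-parallel communication-overlap config by spawning conditional_cfgs.py at submission time and substituting the printed file path into the training command. After it, the launcher only constructs a config name and resolution moves to Hydra: each gpt3 training config gains a hydra.searchpath entry pointing at the NeMo configs shipped in the container, and stages.py emits a '+tp_overlap@model.ub_tp_comm_overlap_cfg=<name>' config-group override in place of the old \$(...) command substitution.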
4 changes: 4 additions & 0 deletions launcher_scripts/conf/training/gpt3/126m.yaml
@@ -1,3 +1,7 @@
+hydra:
+  searchpath:
+    - file:///opt/NeMo/examples/nlp/language_modeling/conf
+
 run:
   name: gpt3_126m
   results_dir: ${base_results_dir}/${.name}
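
Each gpt3 training config in this commit gains the same four lines. hydra.searchpath only takes effect in a primary config, presumably because the merged training YAML becomes the primary config of the NeMo training script inside the container; the file:// entry then makes NeMo's bundled config groups discoverable. A self-contained sketch (not from either repo) of the two Hydra features involved — a searchpath entry adding an extra config directory, and a '+group@package=name' override composing a group entry into a package — with all names invented for illustration:

    import os
    import tempfile

    from hydra import compose, initialize_config_dir

    # Build a throwaway primary config dir and an "external" config dir that
    # plays the role of /opt/NeMo/examples/nlp/language_modeling/conf.
    root, extra = tempfile.mkdtemp(), tempfile.mkdtemp()
    os.makedirs(os.path.join(extra, "tp_overlap"))
    with open(os.path.join(root, "primary.yaml"), "w") as f:
        f.write(f"hydra:\n  searchpath:\n    - file://{extra}\nmodel: {{}}\n")
    with open(os.path.join(extra, "tp_overlap", "ub_cfg_demo.yaml"), "w") as f:
        f.write("bulk_overlap: true\n")

    with initialize_config_dir(version_base=None, config_dir=root):
        cfg = compose(
            config_name="primary",
            # Select tp_overlap/ub_cfg_demo.yaml and graft it onto the package.
            overrides=["+tp_overlap@model.ub_tp_comm_overlap_cfg=ub_cfg_demo"],
        )
    print(cfg.model.ub_tp_comm_overlap_cfg)  # -> {'bulk_overlap': True}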
4 changes: 4 additions & 0 deletions launcher_scripts/conf/training/gpt3/175b.yaml
@@ -1,3 +1,7 @@
+hydra:
+  searchpath:
+    - file:///opt/NeMo/examples/nlp/language_modeling/conf
+
 run:
   name: gpt3_175b
   results_dir: ${base_results_dir}/${.name}
4 changes: 4 additions & 0 deletions launcher_scripts/conf/training/gpt3/175b_performance.yaml
@@ -1,6 +1,10 @@
 # The configurations below provide the best 175B training performance with the NeMo SW stack.
 # We have confirmed the model convergence only with a limited number of tokens and the full model
 # convergence (e.g., 300B tokens) is not guaranteed.
+hydra:
+  searchpath:
+    - file:///opt/NeMo/examples/nlp/language_modeling/conf
+
 run:
   name: gpt3_175b
   results_dir: ${base_results_dir}/${.name}
4 changes: 4 additions & 0 deletions launcher_scripts/conf/training/gpt3/1b_improved.yaml
@@ -1,3 +1,7 @@
+hydra:
+  searchpath:
+    - file:///opt/NeMo/examples/nlp/language_modeling/conf
+
 run:
   name: gpt_1b_improved
   results_dir: ${base_results_dir}/${.name}
4 changes: 4 additions & 0 deletions launcher_scripts/conf/training/gpt3/20b.yaml
@@ -1,3 +1,7 @@
+hydra:
+  searchpath:
+    - file:///opt/NeMo/examples/nlp/language_modeling/conf
+
 run:
   name: gpt3_20b
   results_dir: ${base_results_dir}/${.name}
4 changes: 4 additions & 0 deletions launcher_scripts/conf/training/gpt3/400m_improved.yaml
@@ -1,3 +1,7 @@
+hydra:
+  searchpath:
+    - file:///opt/NeMo/examples/nlp/language_modeling/conf
+
 run:
   name: gpt_400m_improved
   results_dir: ${base_results_dir}/${.name}
4 changes: 4 additions & 0 deletions launcher_scripts/conf/training/gpt3/40b.yaml
@@ -1,3 +1,7 @@
+hydra:
+  searchpath:
+    - file:///opt/NeMo/examples/nlp/language_modeling/conf
+
 run:
   name: gpt3_40b
   results_dir: ${base_results_dir}/${.name}
4 changes: 4 additions & 0 deletions launcher_scripts/conf/training/gpt3/40b_improved.yaml
@@ -1,3 +1,7 @@
+hydra:
+  searchpath:
+    - file:///opt/NeMo/examples/nlp/language_modeling/conf
+
 run:
   name: gpt_40b_improved
   results_dir: ${base_results_dir}/${.name}
4 changes: 4 additions & 0 deletions launcher_scripts/conf/training/gpt3/5b.yaml
@@ -1,3 +1,7 @@
+hydra:
+  searchpath:
+    - file:///opt/NeMo/examples/nlp/language_modeling/conf
+
 run:
   name: gpt3_5b
   results_dir: ${base_results_dir}/${.name}
4 changes: 4 additions & 0 deletions launcher_scripts/conf/training/gpt3/7b_improved.yaml
@@ -1,3 +1,7 @@
+hydra:
+  searchpath:
+    - file:///opt/NeMo/examples/nlp/language_modeling/conf
+
 run:
   name: gpt_7b_improved
   results_dir: ${base_results_dir}/${.name}
31 changes: 0 additions & 31 deletions launcher_scripts/nemo_launcher/collections/conditional_cfgs.py
@@ -12,12 +8,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
-import math
 import pynvml
-import os
 import sys
-from collections import defaultdict
 
 import hydra
 
@@ -28,31 +24,6 @@
     pynvml.nvmlShutdown()
 
 
-@hydra.main(version_base=None, config_path="conf", config_name="get_ub_cfg_file")
-def get_ub_cfg_file(cfg):
-    """
-    Find and return the userbuffer config file. If it doesn't exist return `null`.
-    """
-    global cuda_capability
-    device_name = None
-    if cuda_capability == 8:
-        device_name = "a100"
-    elif cuda_capability == 9:
-        device_name = "h100"
-    ub_cfg_path = cfg.get("ub_cfg_path")
-    tp_size = cfg.get("tp_size")
-    hidden_size = cfg.get("hidden_size")
-    mb_size = cfg.get("mb_size")
-    seqlen = cfg.get("seqlen")
-    cfg_file_name = f"ub_cfg_{device_name}_h{hidden_size}_tp{tp_size}_mbs{mb_size}_seqlen{seqlen}.yaml"
-    cfg_file = os.path.join(ub_cfg_path, cfg_file_name)
-
-    if os.path.isfile(cfg_file):
-        print(f"{cfg_file}")
-    else:
-        print(f"null")
-
-
 @hydra.main(version_base=None, config_path="conf", config_name="get_ln_sm_margin")
 def get_ln_sm_margin(cfg):
     """
@@ -83,8 +54,6 @@ def get_ag_overlap(cfg):
 
 
 if __name__ == "__main__":
-    if sys.argv[1] == "name=get_ub_cfg_file":
-        get_ub_cfg_file()
     elif sys.argv[1] == "name=get_ln_sm_margin":
         get_ln_sm_margin()
     elif sys.argv[1] == "name=get_ag_overlap":
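
The deleted get_ub_cfg_file mapped CUDA compute capability to a device name (8 → a100, 9 → h100) and searched the ub-confs directory on disk, printing either the file path or null. Under the new scheme that device-name logic has to sit behind the ${gpu_name:} interpolation emitted by stages.py below; the resolver's registration is not part of this diff, so the following is only a hedged sketch of what an equivalent custom OmegaConf resolver could look like, reusing the deleted mapping:

    import pynvml
    from omegaconf import OmegaConf

    def _gpu_name() -> str:
        # Mirror of the deleted mapping: compute capability 8 -> a100, 9 -> h100.
        pynvml.nvmlInit()
        handle = pynvml.nvmlDeviceGetHandleByIndex(0)
        major, _minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle)
        pynvml.nvmlShutdown()
        return {8: "a100", 9: "h100"}.get(major, "unknown")

    # Register under the name used by the ${gpu_name:} interpolation, so it
    # runs when Hydra/OmegaConf resolves the config on the compute node.
    OmegaConf.register_new_resolver("gpu_name", _gpu_name)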

This file was deleted.

20 changes: 5 additions & 15 deletions launcher_scripts/nemo_launcher/core/stages.py
@@ -600,8 +600,8 @@ def _make_hydra_override(self) -> List:
             )
             hydra_override += [f"model.data.data_prefix=\$({auto_blend_command})"]
         if self.stage_cfg.model.get("ub_tp_comm_overlap", False):
-            get_ub_cfg_file_command = self._get_ub_cfg_file()
-            hydra_override += [f"+model.ub_tp_comm_overlap_cfg=\$({get_ub_cfg_file_command})"]
+            ub_cfg_name = self._get_ub_cfg_override()
+            hydra_override += [f"'+tp_overlap@model.ub_tp_comm_overlap_cfg={ub_cfg_name}'"]
         if self.stage_cfg.model.get("gc_interval", 0) > 1:
             gc_interval = min(self.stage_cfg.model.get("gc_interval"), self.cfg.training.trainer.get("val_check_interval"))
             hydra_override += [f"model.gc_interval={gc_interval}"]
@@ -624,26 +624,16 @@ def _get_nemo_code_path(self, model_type: str) -> Path:
         }
         return model_type_to_code_path[model_type]
 
-    def _get_ub_cfg_file(self) -> str:
+    def _get_ub_cfg_override(self) -> str:
         """
         Spawn the script to search UB configuration file
         """
         tp_size = self.stage_cfg.model.get("tensor_model_parallel_size")
         hidden_size = self.stage_cfg.model.get("hidden_size")
         mb_size = self.stage_cfg.model.get("micro_batch_size")
         seqlen = self.stage_cfg.model.get("encoder_seq_length")
-        ub_cfg_path = os.path.join(self._launcher_scripts_path, "launcher_scripts/conf/training/gpt3/ub-confs")
-
-        get_ub_cfg_file_command = (
-            f"python3 {self._launcher_scripts_path / 'nemo_launcher/collections/conditional_cfgs.py'} "
-            f"name=get_ub_cfg_file "
-            f"ub_cfg_path={ub_cfg_path} "
-            f"tp_size={tp_size} "
-            f"hidden_size={hidden_size} "
-            f"mb_size={mb_size} "
-            f"seqlen={seqlen}"
-        )
-        return get_ub_cfg_file_command
+        cfg_name = f"ub_cfg_\\${{gpu_name:}}_h{hidden_size}_tp{tp_size}_mbs{mb_size}_seqlen{seqlen}"
+        return cfg_name
 
 
 class FineTuning(NeMoStage):
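
The new override swaps shell-level command substitution for Hydra-native composition: '+tp_overlap@model.ub_tp_comm_overlap_cfg=<name>' selects the YAML named <name> from the tp_overlap config group (found via the searchpath added above) and grafts it onto model.ub_tp_comm_overlap_cfg. The doubled escaping in the f-string keeps the $ away from the shell so that ${gpu_name:} survives until Hydra resolves it on the compute node. A sketch of how the name renders, with hypothetical 175b-like values:

    # Illustrative only: the values below are hypothetical, chosen to resemble a 175b run.
    hidden_size, tp_size, mb_size, seqlen = 12288, 8, 1, 2048
    cfg_name = f"ub_cfg_\\${{gpu_name:}}_h{hidden_size}_tp{tp_size}_mbs{mb_size}_seqlen{seqlen}"
    print(cfg_name)  # -> ub_cfg_\${gpu_name:}_h12288_tp8_mbs1_seqlen2048
    # After the shell strips the backslash and a gpu_name resolver runs on an
    # H100 node, Hydra would look up tp_overlap/ub_cfg_h100_h12288_tp8_mbs1_seqlen2048.yaml.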
