From d4ec7c8b3a80499e6233255b7d9462d6e1d64a4f Mon Sep 17 00:00:00 2001 From: Selvaraj Anandaraj Date: Mon, 1 Apr 2024 13:41:54 -0700 Subject: [PATCH] Added gc_interval for manual garbage collection (#278) Signed-off-by: Selvaraj Anandaraj Co-authored-by: Selvaraj Anandaraj --- launcher_scripts/conf/training/gpt3/175b.yaml | 1 + launcher_scripts/conf/training/llama/llama2_70b.yaml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/launcher_scripts/conf/training/gpt3/175b.yaml b/launcher_scripts/conf/training/gpt3/175b.yaml index b1acc80ca3..cdf74931d9 100755 --- a/launcher_scripts/conf/training/gpt3/175b.yaml +++ b/launcher_scripts/conf/training/gpt3/175b.yaml @@ -156,6 +156,7 @@ model: use_cpu_initialization: False # Init weights on the CPU (slow for large models) onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this + gc_interval: 10 # Interval for manual garbage collection # Nsys profiling options nsys_profile: diff --git a/launcher_scripts/conf/training/llama/llama2_70b.yaml b/launcher_scripts/conf/training/llama/llama2_70b.yaml index 4a79bc886e..891ed31cde 100644 --- a/launcher_scripts/conf/training/llama/llama2_70b.yaml +++ b/launcher_scripts/conf/training/llama/llama2_70b.yaml @@ -139,7 +139,7 @@ model: use_flash_attention: true overlap_p2p_comm: true batch_p2p_comm: false - gc_interval: 100 + gc_interval: 10 optim: name: distributed_fused_adam lr: 0.00015