From d4ec7c8b3a80499e6233255b7d9462d6e1d64a4f Mon Sep 17 00:00:00 2001
From: Selvaraj Anandaraj <anandaraj@wisc.edu>
Date: Mon, 1 Apr 2024 13:41:54 -0700
Subject: [PATCH] Added gc_interval for manual garbage collection (#278)

Signed-off-by: Selvaraj Anandaraj <selvaraja@login-eos02.eos.clusters.nvidia.com>
Co-authored-by: Selvaraj Anandaraj <selvaraja@login-eos02.eos.clusters.nvidia.com>
---
 launcher_scripts/conf/training/gpt3/175b.yaml        | 1 +
 launcher_scripts/conf/training/llama/llama2_70b.yaml | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/launcher_scripts/conf/training/gpt3/175b.yaml b/launcher_scripts/conf/training/gpt3/175b.yaml
index b1acc80ca3..cdf74931d9 100755
--- a/launcher_scripts/conf/training/gpt3/175b.yaml
+++ b/launcher_scripts/conf/training/gpt3/175b.yaml
@@ -156,6 +156,7 @@ model:
   use_cpu_initialization: False # Init weights on the CPU (slow for large models)
   onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
   apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this
+  gc_interval: 10 # Interval for manual garbage collection
 
   # Nsys profiling options
   nsys_profile:
diff --git a/launcher_scripts/conf/training/llama/llama2_70b.yaml b/launcher_scripts/conf/training/llama/llama2_70b.yaml
index 4a79bc886e..891ed31cde 100644
--- a/launcher_scripts/conf/training/llama/llama2_70b.yaml
+++ b/launcher_scripts/conf/training/llama/llama2_70b.yaml
@@ -139,7 +139,7 @@ model:
   use_flash_attention: true
   overlap_p2p_comm: true
   batch_p2p_comm: false
-  gc_interval: 100
+  gc_interval: 10
   optim:
     name: distributed_fused_adam
     lr: 0.00015