From 3dcb2974da465b3d3a061215694464787737108b Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Sun, 13 Oct 2024 22:36:10 -0500 Subject: [PATCH 1/2] fixed bugs and added commandline option --- megatron/arguments.py | 2 ++ megatron/data/gpt_dataset.py | 10 ++++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index 9a5e4b8da7..82e000923f 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -1252,6 +1252,8 @@ def _add_data_args(parser): group.add_argument('--data-file-list', type=str, default=None, help='The file with the list of dataset and weights') + group.add_argument('--shuffle-sample', action='stored_true', help="Whether to shuffle the samples within in the dataset files") + group.add_argument('--split', type=str, default='969, 30, 1', help='Comma-separated list of proportions for training,' ' validation, and test split. For example the split ' diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py index c801a6a5ae..8c32be7d8e 100755 --- a/megatron/data/gpt_dataset.py +++ b/megatron/data/gpt_dataset.py @@ -94,7 +94,7 @@ def Build(self): class BuildConcatDataset(torch.utils.data.Dataset): @dlp.log - def __init__(self, dataset_builders): + def __init__(self, dataset_builders, shuffle=False): self.dataset_builders = dataset_builders self.num_datasets = len(dataset_builders) self.num_samples = np.sum([d.num_samples for d in dataset_builders]) @@ -117,7 +117,9 @@ def _build_indices(): self.dataset_index, self.dataset_sample_index = _build_indices() np_rng = np.random.RandomState(seed=dataset_builders[0].seed) - self.shuffle_index=np_rng.shuffle(range(self.num_samples)) + self.shuffle_index = np.arange(self.num_samples) + if shuffle: + np_rng.shuffle(self.shuffle_index) for i in range(self.num_datasets): self.desc += dataset_builders[i].prefix + "," @@ -146,7 +148,7 @@ def __getitem__(self, idx): valid_datasets = [] test_datasets = [] # Build individual datasets. 
- + args = get_args() @dlp.log def build_corpus_datasets(dataset_type='train'): start_time = time.time() @@ -172,7 +174,7 @@ def build_corpus_datasets(dataset_type='train'): print_rank_0(" > number of samples for each corpus ") corpus_weights_achieved={} for c in corpus_list: - datasets.append(BuildConcatDataset(corpus_builders[c])) + datasets.append(BuildConcatDataset(corpus_builders[c], args.shuffle_sample)) total += datasets[-1].num_samples corpus_weights_achieved[c] = float(datasets[-1].num_samples)/train_num_samples print_rank_0(f" {c}: {datasets[-1].num_samples} w={corpus_weights_achieved[c]} (expected: {corpus_weights[c]})") From 43fc2feaace3781bb399e32b2f6e827920622c61 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Sun, 13 Oct 2024 23:00:21 -0500 Subject: [PATCH 2/2] fixed typo --- megatron/arguments.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index 82e000923f..307e725e51 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -1252,7 +1252,7 @@ def _add_data_args(parser): group.add_argument('--data-file-list', type=str, default=None, help='The file with the list of dataset and weights') - group.add_argument('--shuffle-sample', action='stored_true', help="Whether to shuffle the samples within in the dataset files") + group.add_argument('--shuffle-sample', action='store_true', help="Whether to shuffle the samples within the dataset files") group.add_argument('--split', type=str, default='969, 30, 1', help='Comma-separated list of proportions for training,'