From 3dcb2974da465b3d3a061215694464787737108b Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Sun, 13 Oct 2024 22:36:10 -0500 Subject: [PATCH 1/2] fixed bugs and added commandline option --- megatron/arguments.py | 2 ++ megatron/data/gpt_dataset.py | 10 ++++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index 9a5e4b8da7..82e000923f 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -1252,6 +1252,8 @@ def _add_data_args(parser): group.add_argument('--data-file-list', type=str, default=None, help='The file with the list of dataset and weights') + group.add_argument('--shuffle-sample', action='stored_true', help="Whether to shuffle the samples within in the dataset files") + group.add_argument('--split', type=str, default='969, 30, 1', help='Comma-separated list of proportions for training,' ' validation, and test split. For example the split ' diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py index c801a6a5ae..8c32be7d8e 100755 --- a/megatron/data/gpt_dataset.py +++ b/megatron/data/gpt_dataset.py @@ -94,7 +94,7 @@ def Build(self): class BuildConcatDataset(torch.utils.data.Dataset): @dlp.log - def __init__(self, dataset_builders): + def __init__(self, dataset_builders, shuffle=False): self.dataset_builders = dataset_builders self.num_datasets = len(dataset_builders) self.num_samples = np.sum([d.num_samples for d in dataset_builders]) @@ -117,7 +117,9 @@ def _build_indices(): self.dataset_index, self.dataset_sample_index = _build_indices() np_rng = np.random.RandomState(seed=dataset_builders[0].seed) - self.shuffle_index=np_rng.shuffle(range(self.num_samples)) + self.shuffle_index = np.arange(self.num_samples) + if shuffle: + np_rng.shuffle(self.shuffle_index) for i in range(self.num_datasets): self.desc += dataset_builders[i].prefix + "," @@ -146,7 +148,7 @@ def __getitem__(self, idx): valid_datasets = [] test_datasets = [] # Build individual datasets. 
- + args = get_args() @dlp.log def build_corpus_datasets(dataset_type='train'): start_time = time.time() @@ -172,7 +174,7 @@ def build_corpus_datasets(dataset_type='train'): print_rank_0(" > number of samples for each corpus ") corpus_weights_achieved={} for c in corpus_list: - datasets.append(BuildConcatDataset(corpus_builders[c])) + datasets.append(BuildConcatDataset(corpus_builders[c], args.shuffle_sample)) total += datasets[-1].num_samples corpus_weights_achieved[c] = float(datasets[-1].num_samples)/train_num_samples print_rank_0(f" {c}: {datasets[-1].num_samples} w={corpus_weights_achieved[c]} (expected: {corpus_weights[c]})") From 43fc2feaace3781bb399e32b2f6e827920622c61 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Sun, 13 Oct 2024 23:00:21 -0500 Subject: [PATCH 2/2] fixed typo --- megatron/arguments.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index 82e000923f..307e725e51 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -1252,7 +1252,7 @@ def _add_data_args(parser): group.add_argument('--data-file-list', type=str, default=None, help='The file with the list of dataset and weights') - group.add_argument('--shuffle-sample', action='stored_true', help="Whether to shuffle the samples within in the dataset files") + group.add_argument('--shuffle-sample', action='store_true', help="Whether to shuffle the samples within the dataset files") group.add_argument('--split', type=str, default='969, 30, 1', help='Comma-separated list of proportions for training,'