From 63b1901b6127cd71f5b54877fad211714499120f Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 16 Oct 2024 11:24:09 -0500 Subject: [PATCH] Update `megatron/data/gpt_dataset.py` --- megatron/data/gpt_dataset.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py index c412d02b31..0a3d898d63 100644 --- a/megatron/data/gpt_dataset.py +++ b/megatron/data/gpt_dataset.py @@ -168,6 +168,10 @@ def _build_indices(): for i in range(self.num_datasets): self.desc += dataset_builders[i].prefix + "," + log.info( + f"[BuildConcatDataset] Caught {shuffle=} across" + f" {self.num_samples} samples" + ) self.desc += ( f"-{self.num_samples}" + f"-{dataset_builders[0].seq_length}"