test: fix failing sft e2e test
davidkaczer committed Nov 4, 2024
1 parent b08f02f commit eb658c9
Showing 2 changed files with 55 additions and 44 deletions.
tests/config/test_configs/config_sft.yaml (95 changes: 53 additions & 42 deletions)
@@ -1,24 +1,54 @@
settings:
experiment_id: ${modalities_env:experiment_id}
config_file_path: ${modalities_env:config_file_path}
referencing_keys:
sample_key: input_ids
target_key: target_ids
training:
training_log_interval_in_steps: 2
checkpointing_interval_in_steps: 2
evaluation_interval_in_steps: 2
global_num_seen_tokens: 0
activation_checkpointing_modules: [GPT2Block]
gradient_acc_steps: 2
local_train_micro_batch_size: 1
sequence_length: 2048
prediction_key: logits
cuda_env:
local_rank: ${cuda_env:LOCAL_RANK}
global_rank: ${cuda_env:RANK}
world_size: ${cuda_env:WORLD_SIZE}
paths:
checkpointing_path: data/checkpoints
checkpoint_saving_path: data/checkpoints
train_dataset_path: ./data/lorem_ipsum.pbin
intervals:
training_log_interval_in_steps: 2
checkpointing_interval_in_steps: 2
evaluation_interval_in_steps: 2
consistency_enforcement:
enforce_tokens_per_step_consistency: true
enforce_last_step_logged: false
enforce_last_step_evaluated: false
enforce_last_step_checkpointed: false
step_profile:
gradient_accumulation_steps: 2
local_train_micro_batch_size: 1
sequence_length: 2048
training_target:
num_target_tokens:
component_key: number_conversion
variant_key: num_tokens_from_packed_mem_map_dataset_continuous
config:
dataset_path: ${settings.paths.train_dataset_path}
sequence_length: ${settings.step_profile.sequence_length}
num_ranks: ${settings.cuda_env.world_size}
local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size}
gradient_accumulation_steps: ${settings.step_profile.gradient_accumulation_steps}
num_target_steps: # for the batch progress subscriber
component_key: number_conversion
variant_key: num_steps_from_num_tokens
config:
num_ranks: ${settings.cuda_env.world_size}
local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size}
global_num_tokens: ${settings.training_target.num_target_tokens}
sequence_length: ${settings.step_profile.sequence_length}
gradient_accumulation_steps: ${settings.step_profile.gradient_accumulation_steps}
training_progress:
global_num_seen_tokens: 0
num_seen_steps: 0
local_num_seen_batches: 0
last_step: -1

collate_fn:
component_key: collate_fn
@@ -44,8 +74,8 @@ train_dataset:
component_key: dataset
variant_key: packed_mem_map_dataset_continuous
config:
raw_data_path: ./tests/test_data/lorem_ipsum_sft_7e71e5e/lorem_ipsum_sft_converted_test.7e71e5e.pbin
sequence_length: ${settings.training.sequence_length}
raw_data_path: ./tests/test_data/lorem_ipsum_sft_7e71e5e/lorem_ipsum_sft_converted_train.7e71e5e.pbin
sequence_length: ${settings.step_profile.sequence_length}
sample_key: ${settings.referencing_keys.sample_key}
reuse_last_target: false

@@ -55,7 +85,6 @@ train_dataloader:
config:
num_workers: 2
pin_memory: true
shuffle: false
dataloader_tag: train
dataset:
instance_key: train_dataset
@@ -64,7 +93,7 @@ train_dataloader:
component_key: batch_sampler
variant_key: default
config:
batch_size: ${settings.training.local_train_micro_batch_size}
batch_size: ${settings.step_profile.local_train_micro_batch_size}
drop_last: true
sampler:
component_key: sampler
@@ -86,7 +115,6 @@ val_dataloader:
config:
num_workers: 2
pin_memory: true
shuffle: false
dataloader_tag: "val"
dataset:
instance_key: train_dataset
@@ -117,7 +145,6 @@ test_dataloader:
config:
num_workers: 2
pin_memory: true
shuffle: false
dataloader_tag: "test"
dataset:
instance_key: train_dataset
@@ -161,16 +188,9 @@ checkpoint_saving:
component_key: checkpoint_saving_execution
variant_key: fsdp
config:
checkpoint_path: ${settings.paths.checkpointing_path}
checkpoint_path: ${settings.paths.checkpoint_saving_path}
global_rank: ${settings.cuda_env.global_rank}
experiment_id: ${settings.experiment_id}
get_num_tokens_from_num_steps_callable:
component_key: number_conversion
variant_key: num_tokens_from_num_steps_callable
config:
num_ranks: ${settings.cuda_env.world_size}
local_micro_batch_size: ${settings.training.local_train_micro_batch_size}
sequence_length: ${settings.training.sequence_length}

# resolving class types via different enums sucks...
loss_fn:
@@ -215,7 +235,7 @@ model_raw:
config:
sample_key: ${settings.referencing_keys.sample_key}
poe_type: NOPE
sequence_length: ${settings.training.sequence_length}
sequence_length: ${settings.step_profile.sequence_length}
prediction_key: ${loss_fn.config.prediction_key}
vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
n_layer: 2
@@ -306,26 +326,17 @@ gradient_clipper:
norm_type: P2_NORM
max_norm: 1.0

batch_progress_subscriber:
progress_subscriber:
component_key: progress_subscriber
variant_key: rich
config:
global_rank: ${settings.cuda_env.global_rank}
global_num_seen_steps:
component_key: number_conversion
variant_key: num_steps_from_num_tokens
config:
num_ranks: ${settings.cuda_env.world_size}
local_micro_batch_size: ${settings.training.local_train_micro_batch_size}
global_num_tokens: ${settings.training.global_num_seen_tokens}
sequence_length: ${settings.training.sequence_length}
gradient_acc_steps: ${settings.training.gradient_acc_steps}
train_dataloader:
instance_key: train_dataloader
pass_type: BY_REFERENCE
num_seen_steps: ${settings.training_progress.num_seen_steps}
num_target_steps: ${settings.training_target.num_target_steps}
train_dataloader_tag: ${train_dataloader.config.dataloader_tag}
eval_dataloaders:
instance_key: eval_dataloaders
pass_type: BY_REFERENCE
instance_key: eval_dataloaders
pass_type: BY_REFERENCE

evaluation_subscriber:
component_key: results_subscriber
@@ -335,5 +346,5 @@ evaluation_subscriber:
project: modalities_lorem_ipsum
mode: OFFLINE
experiment_id: ${settings.experiment_id}
directory: "."
directory: wandb_storage
config_file_path: ${settings.config_file_path}
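
Note on the reworked settings block: the interval, batch-geometry, and progress values move out of the flat training keys and into intervals, step_profile, training_target, and training_progress, and the training target is now derived through number_conversion components instead of being hard-coded. A minimal sketch of the arithmetic those parameters imply, assuming num_steps_from_num_tokens reduces to plain integer division over the values the config passes in (the actual modalities implementation may differ):

def num_steps_from_num_tokens(
    global_num_tokens: int,
    num_ranks: int,
    local_micro_batch_size: int,
    sequence_length: int,
    gradient_accumulation_steps: int,
) -> int:
    # Tokens consumed by one optimizer step across all ranks: every rank runs
    # gradient_accumulation_steps micro-batches, each holding
    # local_micro_batch_size * sequence_length tokens.
    tokens_per_step = (
        num_ranks
        * local_micro_batch_size
        * sequence_length
        * gradient_accumulation_steps
    )
    return global_num_tokens // tokens_per_step

With the values in this config on a single rank (world_size = 1), one step consumes 1 * 1 * 2048 * 2 = 4096 tokens, so the intervals of 2 steps correspond to 8192 tokens between log, checkpoint, and evaluation events. num_target_tokens is resolved analogously, presumably by counting the tokens available in the packed .pbin dataset at settings.paths.train_dataset_path.
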
tests/instruction_tuning/test_e2e_instruction_tuning.py (4 changes: 2 additions & 2 deletions)
@@ -23,7 +23,7 @@ def test_e2e_instruction_tuning(monkeypatch, tmp_path):

# Adapt config for test
checkpointing_path = tmp_path / "sft_checkpoints/"
config_dict["settings"]["paths"]["checkpointing_path"] = checkpointing_path.__str__()
config_dict["settings"]["paths"]["checkpoint_saving_path"] = checkpointing_path.__str__()
config_dict["checkpoint_saving"]["config"]["checkpoint_saving_execution"]["config"][
"checkpoint_path"
] = checkpointing_path.__str__()
@@ -45,4 +45,4 @@ def test_e2e_instruction_tuning(monkeypatch, tmp_path):
"model" in path.name or "optimizer" in path.name or path.suffix == ".yaml"
for path in list(checkpointing_path.glob("*"))[0].glob("*")
]
assert sum(checkpoint_files) == 3, "Output of the test i.e. a model checkpoint was not created!"
assert sum(checkpoint_files) == 1, "Output of the test i.e. a model checkpoint was not created!"
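
The test's path override now has to follow the renamed settings key (checkpointing_path became checkpoint_saving_path). A minimal sketch of the adapted overrides, assuming the config is loaded into a plain dict; the helper name and the YAML loading are illustrative, the real test mutates the dict produced by its own config-loading code:

from pathlib import Path

import yaml


def adapt_config_for_test(config_path: Path, checkpointing_path: Path) -> dict:
    # Illustrative helper: load the SFT test config and point every
    # checkpoint-related path at the pytest tmp_path directory.
    config_dict = yaml.safe_load(config_path.read_text())
    config_dict["settings"]["paths"]["checkpoint_saving_path"] = str(checkpointing_path)
    config_dict["checkpoint_saving"]["config"]["checkpoint_saving_execution"]["config"][
        "checkpoint_path"
    ] = str(checkpointing_path)
    return config_dict

The final assertion now expects exactly one matching entry in the checkpoint directory instead of three.
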
