Skip to content

Commit

Permalink
Switch to FSDP training
Browse files (browse the repository at this point in the history)
Signed-off-by: Michael Clifford <[email protected]>
  • Loading branch information
MichaelClifford committed Oct 9, 2024
1 parent 79ef52e commit de79bc6
Show file tree
Hide file tree
Showing 2 changed files with 83 additions and 83 deletions.
160 changes: 80 additions & 80 deletions pipeline.yaml
Original file line number | Diff line number | Diff line change
Expand Up @@ -779,7 +779,7 @@ deploymentSpec:
\ = f\"train-{phase_name}-{name_suffix.rstrip('-sdg')}\"\n if phase_name\
\ == \"first\":\n path_to_model = \"/input_model/model\"\n elif\
\ phase_name == \"second\":\n path_to_model = list_phase1_final_model()\n\
\ image = \"registry.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.1-1724960989\"\
\ image = \"registry.stage.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.2\"\
\n\n manifest = inspect.cleandoc(\n f\"\"\"\n apiVersion:\
\ kubeflow.org/v1\n kind: PyTorchJob\n metadata:\n \
\ name: {name}\n spec:\n nprocPerNode: \\\\\"{nproc_per_node}\\\
Expand All @@ -798,31 +798,31 @@ deploymentSpec:
\ --data_path=/input_data/processed_data/data.jsonl --output_dir=/output/model\
\ --num_epochs=2 --effective_batch_size=3840 --learning_rate=1e-4 --num_warmup_steps=800\
\ --save_samples=0 --log_level=INFO --max_batch_len=20000 --seed=42 --cpu_offload_optimizer\
\ --sharding_strategy=FULL_SHARD --is_granite --checkpoint_at_epoch\n \
\ command:\n - /bin/bash\n \
\ - '-c'\n - '--'\n \
\ image: {image}\n name: pytorch\n \
\ volumeMounts:\n - mountPath: /input_data\n\
\ name: input-data\n readOnly:\
\ true\n - mountPath: /input_model\n \
\ name: model\n readOnly: true\n \
\ - mountPath: /output\n \
\ name: output\n env:\n - name:\
\ NNODES\n value: \\\\\"{nnodes}\\\\\"\n \
\ - name: NPROC_PER_NODE\n value:\
\ \\\\\"{nproc_per_node}\\\\\"\n resources:\n \
\ requests:\n cpu: 2\n \
\ \"nvidia.com/gpu\": {nproc_per_node}\n \
\ limits:\n cpu: 2\n \
\ \"nvidia.com/gpu\": {nproc_per_node}\n volumes:\n\
\ - name: input-data\n persistentVolumeClaim:\n\
\ claimName: {input_pvc_name}\n \
\ - name: model\n persistentVolumeClaim:\n \
\ claimName: {model_pvc_name}\n - name:\
\ output\n persistentVolumeClaim:\n \
\ claimName: {output_pvc_name}\n Worker:\n \
\ replicas: {nnodes-1}\n restartPolicy: OnFailure\n \
\ template:\n metadata:\n annotations:\n\
\ --distributed_training_framework fsdp --is_granite --checkpoint_at_epoch\n\
\ command:\n - /bin/bash\n \
\ - '-c'\n - '--'\n \
\ image: {image}\n name: pytorch\n \
\ volumeMounts:\n - mountPath:\
\ /input_data\n name: input-data\n \
\ readOnly: true\n - mountPath: /input_model\n\
\ name: model\n readOnly:\
\ true\n - mountPath: /output\n \
\ name: output\n env:\n \
\ - name: NNODES\n value: \\\\\"{nnodes}\\\\\
\"\n - name: NPROC_PER_NODE\n \
\ value: \\\\\"{nproc_per_node}\\\\\"\n resources:\n\
\ requests:\n cpu: 2\n \
\ \"nvidia.com/gpu\": {nproc_per_node}\n \
\ limits:\n cpu: 2\n \
\ \"nvidia.com/gpu\": {nproc_per_node}\n \
\ volumes:\n - name: input-data\n \
\ persistentVolumeClaim:\n claimName: {input_pvc_name}\n\
\ - name: model\n persistentVolumeClaim:\n\
\ claimName: {model_pvc_name}\n \
\ - name: output\n persistentVolumeClaim:\n \
\ claimName: {output_pvc_name}\n Worker:\n\
\ replicas: {nnodes-1}\n restartPolicy: OnFailure\n\
\ template:\n metadata:\n annotations:\n\
\ sidecar.istio.io/inject: 'false'\n spec:\n\
\ containers:\n - args:\n \
\ - |\n mkdir -p /tmp/model;\n \
Expand All @@ -833,21 +833,21 @@ deploymentSpec:
\ --node_rank \\$(RANK) --rdzv_endpoint \\$(MASTER_ADDR):\\$(MASTER_PORT)\
\ -m instructlab.training.main_ds --model_name_or_path={path_to_model} \
\ --data_path=/input_data/processed_data/data.jsonl --output_dir=/tmp/model\
\ --num_epochs=2 --effective_batch_size=3840 --learning_rate=2e-6 --num_warmup_steps=800\
\ --num_epochs=2 --effective_batch_size=3840 --learning_rate=1e-4 --num_warmup_steps=800\
\ --save_samples=0 --log_level=INFO --max_batch_len=20000 --seed=42 --cpu_offload_optimizer\
\ --sharding_strategy=FULL_SHARD --is_granite --checkpoint_at_epoch\n \
\ command:\n - /bin/bash\n \
\ - '-c'\n - '--'\n \
\ image: {image}\n name: pytorch\n \
\ volumeMounts:\n - mountPath: /input_data\n\
\ name: input-data\n readOnly:\
\ true\n - mountPath: /input_model\n \
\ name: model\n readOnly: true\n \
\ - mountPath: /output\n \
\ name: output\n readOnly: true\n \
\ env:\n - name: NNODES\n \
\ value: \\\\\"{nnodes}\\\\\"\n - name:\
\ NPROC_PER_NODE\n value: \\\\\"{nproc_per_node}\\\
\ --distributed_training_framework fsdp --is_granite --checkpoint_at_epoch\n\
\ command:\n - /bin/bash\n \
\ - '-c'\n - '--'\n \
\ image: {image}\n name: pytorch\n \
\ volumeMounts:\n - mountPath:\
\ /input_data\n name: input-data\n \
\ readOnly: true\n - mountPath: /input_model\n\
\ name: model\n readOnly:\
\ true\n - mountPath: /output\n \
\ name: output\n readOnly: true\n \
\ env:\n - name: NNODES\n \
\ value: \\\\\"{nnodes}\\\\\"\n \
\ - name: NPROC_PER_NODE\n value: \\\\\"{nproc_per_node}\\\
\\\"\n resources:\n requests:\n\
\ cpu: 2\n \"nvidia.com/gpu\"\
: {nproc_per_node}\n limits:\n \
Expand Down Expand Up @@ -899,7 +899,7 @@ deploymentSpec:
\ = f\"train-{phase_name}-{name_suffix.rstrip('-sdg')}\"\n if phase_name\
\ == \"first\":\n path_to_model = \"/input_model/model\"\n elif\
\ phase_name == \"second\":\n path_to_model = list_phase1_final_model()\n\
\ image = \"registry.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.1-1724960989\"\
\ image = \"registry.stage.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.2\"\
\n\n manifest = inspect.cleandoc(\n f\"\"\"\n apiVersion:\
\ kubeflow.org/v1\n kind: PyTorchJob\n metadata:\n \
\ name: {name}\n spec:\n nprocPerNode: \\\\\"{nproc_per_node}\\\
Expand All @@ -918,31 +918,31 @@ deploymentSpec:
\ --data_path=/input_data/processed_data/data.jsonl --output_dir=/output/model\
\ --num_epochs=2 --effective_batch_size=3840 --learning_rate=1e-4 --num_warmup_steps=800\
\ --save_samples=0 --log_level=INFO --max_batch_len=20000 --seed=42 --cpu_offload_optimizer\
\ --sharding_strategy=FULL_SHARD --is_granite --checkpoint_at_epoch\n \
\ command:\n - /bin/bash\n \
\ - '-c'\n - '--'\n \
\ image: {image}\n name: pytorch\n \
\ volumeMounts:\n - mountPath: /input_data\n\
\ name: input-data\n readOnly:\
\ true\n - mountPath: /input_model\n \
\ name: model\n readOnly: true\n \
\ - mountPath: /output\n \
\ name: output\n env:\n - name:\
\ NNODES\n value: \\\\\"{nnodes}\\\\\"\n \
\ - name: NPROC_PER_NODE\n value:\
\ \\\\\"{nproc_per_node}\\\\\"\n resources:\n \
\ requests:\n cpu: 2\n \
\ \"nvidia.com/gpu\": {nproc_per_node}\n \
\ limits:\n cpu: 2\n \
\ \"nvidia.com/gpu\": {nproc_per_node}\n volumes:\n\
\ - name: input-data\n persistentVolumeClaim:\n\
\ claimName: {input_pvc_name}\n \
\ - name: model\n persistentVolumeClaim:\n \
\ claimName: {model_pvc_name}\n - name:\
\ output\n persistentVolumeClaim:\n \
\ claimName: {output_pvc_name}\n Worker:\n \
\ replicas: {nnodes-1}\n restartPolicy: OnFailure\n \
\ template:\n metadata:\n annotations:\n\
\ --distributed_training_framework fsdp --is_granite --checkpoint_at_epoch\n\
\ command:\n - /bin/bash\n \
\ - '-c'\n - '--'\n \
\ image: {image}\n name: pytorch\n \
\ volumeMounts:\n - mountPath:\
\ /input_data\n name: input-data\n \
\ readOnly: true\n - mountPath: /input_model\n\
\ name: model\n readOnly:\
\ true\n - mountPath: /output\n \
\ name: output\n env:\n \
\ - name: NNODES\n value: \\\\\"{nnodes}\\\\\
\"\n - name: NPROC_PER_NODE\n \
\ value: \\\\\"{nproc_per_node}\\\\\"\n resources:\n\
\ requests:\n cpu: 2\n \
\ \"nvidia.com/gpu\": {nproc_per_node}\n \
\ limits:\n cpu: 2\n \
\ \"nvidia.com/gpu\": {nproc_per_node}\n \
\ volumes:\n - name: input-data\n \
\ persistentVolumeClaim:\n claimName: {input_pvc_name}\n\
\ - name: model\n persistentVolumeClaim:\n\
\ claimName: {model_pvc_name}\n \
\ - name: output\n persistentVolumeClaim:\n \
\ claimName: {output_pvc_name}\n Worker:\n\
\ replicas: {nnodes-1}\n restartPolicy: OnFailure\n\
\ template:\n metadata:\n annotations:\n\
\ sidecar.istio.io/inject: 'false'\n spec:\n\
\ containers:\n - args:\n \
\ - |\n mkdir -p /tmp/model;\n \
Expand All @@ -953,21 +953,21 @@ deploymentSpec:
\ --node_rank \\$(RANK) --rdzv_endpoint \\$(MASTER_ADDR):\\$(MASTER_PORT)\
\ -m instructlab.training.main_ds --model_name_or_path={path_to_model} \
\ --data_path=/input_data/processed_data/data.jsonl --output_dir=/tmp/model\
\ --num_epochs=2 --effective_batch_size=3840 --learning_rate=2e-6 --num_warmup_steps=800\
\ --num_epochs=2 --effective_batch_size=3840 --learning_rate=1e-4 --num_warmup_steps=800\
\ --save_samples=0 --log_level=INFO --max_batch_len=20000 --seed=42 --cpu_offload_optimizer\
\ --sharding_strategy=FULL_SHARD --is_granite --checkpoint_at_epoch\n \
\ command:\n - /bin/bash\n \
\ - '-c'\n - '--'\n \
\ image: {image}\n name: pytorch\n \
\ volumeMounts:\n - mountPath: /input_data\n\
\ name: input-data\n readOnly:\
\ true\n - mountPath: /input_model\n \
\ name: model\n readOnly: true\n \
\ - mountPath: /output\n \
\ name: output\n readOnly: true\n \
\ env:\n - name: NNODES\n \
\ value: \\\\\"{nnodes}\\\\\"\n - name:\
\ NPROC_PER_NODE\n value: \\\\\"{nproc_per_node}\\\
\ --distributed_training_framework fsdp --is_granite --checkpoint_at_epoch\n\
\ command:\n - /bin/bash\n \
\ - '-c'\n - '--'\n \
\ image: {image}\n name: pytorch\n \
\ volumeMounts:\n - mountPath:\
\ /input_data\n name: input-data\n \
\ readOnly: true\n - mountPath: /input_model\n\
\ name: model\n readOnly:\
\ true\n - mountPath: /output\n \
\ name: output\n readOnly: true\n \
\ env:\n - name: NNODES\n \
\ value: \\\\\"{nnodes}\\\\\"\n \
\ - name: NPROC_PER_NODE\n value: \\\\\"{nproc_per_node}\\\
\\\"\n resources:\n requests:\n\
\ cpu: 2\n \"nvidia.com/gpu\"\
: {nproc_per_node}\n limits:\n \
Expand Down
Loading

0 comments on commit de79bc6

Please sign in to comment.