Switch to FSDP training

Signed-off-by: Michael Clifford <[email protected]>
redhat-et · Oct 9, 2024 · de79bc6 · de79bc6
1 parent 79ef52e
commit de79bc6
Show file tree

Hide file tree

Showing 2 changed files with 83 additions and 83 deletions.
diff --git a/pipeline.yaml b/pipeline.yaml
@@ -779,7 +779,7 @@ deploymentSpec:
  \ = f\"train-{phase_name}-{name_suffix.rstrip('-sdg')}\"\n if phase_name\
  \ == \"first\":\n path_to_model = \"/input_model/model\"\n elif\
  \ phase_name == \"second\":\n path_to_model = list_phase1_final_model()\n\
- \  image = \"registry.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.1-1724960989\"\
+ \  image = \"registry.stage.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.2\"\
  \n\n manifest = inspect.cleandoc(\n f\"\"\"\n apiVersion:\
  \ kubeflow.org/v1\n kind: PyTorchJob\n metadata:\n \
  \  name: {name}\n spec:\n nprocPerNode: \\\\\"{nproc_per_node}\\\
@@ -798,31 +798,31 @@ deploymentSpec:
  \ --data_path=/input_data/processed_data/data.jsonl --output_dir=/output/model\
  \ --num_epochs=2 --effective_batch_size=3840 --learning_rate=1e-4 --num_warmup_steps=800\
  \ --save_samples=0 --log_level=INFO --max_batch_len=20000 --seed=42 --cpu_offload_optimizer\
- \ --sharding_strategy=FULL_SHARD --is_granite --checkpoint_at_epoch\n \
- \  command:\n - /bin/bash\n  \
- \  - '-c'\n - '--'\n  \
- \  image: {image}\n name: pytorch\n  \
- \  volumeMounts:\n - mountPath: /input_data\n\
- \  name: input-data\n  readOnly:\
- \ true\n   - mountPath: /input_model\n \
- \  name: model\n readOnly: true\n \
- \  - mountPath: /output\n  \
- \ name: output\n env:\n  - name:\
- \ NNODES\n value: \\\\\"{nnodes}\\\\\"\n \
- \  - name: NPROC_PER_NODE\n  value:\
- \ \\\\\"{nproc_per_node}\\\\\"\n resources:\n \
- \  requests:\n cpu: 2\n  \
- \  \"nvidia.com/gpu\": {nproc_per_node}\n  \
- \  limits:\n cpu: 2\n  \
- \  \"nvidia.com/gpu\": {nproc_per_node}\n  volumes:\n\
- \  - name: input-data\n  persistentVolumeClaim:\n\
- \  claimName: {input_pvc_name}\n \
- \  - name: model\n persistentVolumeClaim:\n \
- \  claimName: {model_pvc_name}\n  - name:\
- \ output\n persistentVolumeClaim:\n  \
- \  claimName: {output_pvc_name}\n Worker:\n \
- \  replicas: {nnodes-1}\n restartPolicy: OnFailure\n \
- \  template:\n metadata:\n annotations:\n\
+ \ --distributed_training_framework fsdp --is_granite --checkpoint_at_epoch\n\
+ \   command:\n - /bin/bash\n \
+ \   - '-c'\n - '--'\n \
+ \   image: {image}\n name: pytorch\n \
+ \   volumeMounts:\n - mountPath:\
+ \ /input_data\n  name: input-data\n \
+ \  readOnly: true\n   - mountPath: /input_model\n\
+ \   name: model\n readOnly:\
+ \ true\n  - mountPath: /output\n \
+ \  name: output\n env:\n \
+ \  - name: NNODES\n value: \\\\\"{nnodes}\\\\\
+ \"\n  - name: NPROC_PER_NODE\n \
+ \  value: \\\\\"{nproc_per_node}\\\\\"\n resources:\n\
+ \   requests:\n cpu: 2\n \
+ \   \"nvidia.com/gpu\": {nproc_per_node}\n \
+ \   limits:\n cpu: 2\n \
+ \   \"nvidia.com/gpu\": {nproc_per_node}\n \
+ \  volumes:\n  - name: input-data\n \
+ \  persistentVolumeClaim:\n  claimName: {input_pvc_name}\n\
+ \   - name: model\n persistentVolumeClaim:\n\
+ \   claimName: {model_pvc_name}\n \
+ \  - name: output\n persistentVolumeClaim:\n \
+ \   claimName: {output_pvc_name}\n Worker:\n\
+ \   replicas: {nnodes-1}\n restartPolicy: OnFailure\n\
+ \   template:\n metadata:\n annotations:\n\
  \  sidecar.istio.io/inject: 'false'\n spec:\n\
  \  containers:\n - args:\n \
  \  - |\n mkdir -p /tmp/model;\n \
@@ -833,21 +833,21 @@ deploymentSpec:
  \ --node_rank \\$(RANK) --rdzv_endpoint \\$(MASTER_ADDR):\\$(MASTER_PORT)\
  \ -m instructlab.training.main_ds --model_name_or_path={path_to_model} \
  \ --data_path=/input_data/processed_data/data.jsonl --output_dir=/tmp/model\
- \ --num_epochs=2 --effective_batch_size=3840 --learning_rate=2e-6 --num_warmup_steps=800\
+ \ --num_epochs=2 --effective_batch_size=3840 --learning_rate=1e-4 --num_warmup_steps=800\
  \ --save_samples=0 --log_level=INFO --max_batch_len=20000 --seed=42 --cpu_offload_optimizer\
- \ --sharding_strategy=FULL_SHARD --is_granite --checkpoint_at_epoch\n \
- \  command:\n - /bin/bash\n  \
- \  - '-c'\n - '--'\n  \
- \  image: {image}\n name: pytorch\n  \
- \  volumeMounts:\n - mountPath: /input_data\n\
- \  name: input-data\n  readOnly:\
- \ true\n   - mountPath: /input_model\n \
- \  name: model\n readOnly: true\n \
- \  - mountPath: /output\n  \
- \ name: output\n readOnly: true\n  \
- \  env:\n - name: NNODES\n  \
- \  value: \\\\\"{nnodes}\\\\\"\n  - name:\
- \ NPROC_PER_NODE\n value: \\\\\"{nproc_per_node}\\\
+ \ --distributed_training_framework fsdp --is_granite --checkpoint_at_epoch\n\
+ \   command:\n - /bin/bash\n \
+ \   - '-c'\n - '--'\n \
+ \   image: {image}\n name: pytorch\n \
+ \   volumeMounts:\n - mountPath:\
+ \ /input_data\n  name: input-data\n \
+ \  readOnly: true\n   - mountPath: /input_model\n\
+ \   name: model\n readOnly:\
+ \ true\n  - mountPath: /output\n \
+ \  name: output\n readOnly: true\n \
+ \   env:\n - name: NNODES\n \
+ \   value: \\\\\"{nnodes}\\\\\"\n \
+ \  - name: NPROC_PER_NODE\n value: \\\\\"{nproc_per_node}\\\
  \\\"\n resources:\n requests:\n\
  \  cpu: 2\n \"nvidia.com/gpu\"\
  : {nproc_per_node}\n limits:\n \
@@ -899,7 +899,7 @@ deploymentSpec:
  \ = f\"train-{phase_name}-{name_suffix.rstrip('-sdg')}\"\n if phase_name\
  \ == \"first\":\n path_to_model = \"/input_model/model\"\n elif\
  \ phase_name == \"second\":\n path_to_model = list_phase1_final_model()\n\
- \  image = \"registry.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.1-1724960989\"\
+ \  image = \"registry.stage.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.2\"\
  \n\n manifest = inspect.cleandoc(\n f\"\"\"\n apiVersion:\
  \ kubeflow.org/v1\n kind: PyTorchJob\n metadata:\n \
  \  name: {name}\n spec:\n nprocPerNode: \\\\\"{nproc_per_node}\\\
@@ -918,31 +918,31 @@ deploymentSpec:
  \ --data_path=/input_data/processed_data/data.jsonl --output_dir=/output/model\
  \ --num_epochs=2 --effective_batch_size=3840 --learning_rate=1e-4 --num_warmup_steps=800\
  \ --save_samples=0 --log_level=INFO --max_batch_len=20000 --seed=42 --cpu_offload_optimizer\
- \ --sharding_strategy=FULL_SHARD --is_granite --checkpoint_at_epoch\n \
- \  command:\n - /bin/bash\n  \
- \  - '-c'\n - '--'\n  \
- \  image: {image}\n name: pytorch\n  \
- \  volumeMounts:\n - mountPath: /input_data\n\
- \  name: input-data\n  readOnly:\
- \ true\n   - mountPath: /input_model\n \
- \  name: model\n readOnly: true\n \
- \  - mountPath: /output\n  \
- \ name: output\n env:\n  - name:\
- \ NNODES\n value: \\\\\"{nnodes}\\\\\"\n \
- \  - name: NPROC_PER_NODE\n  value:\
- \ \\\\\"{nproc_per_node}\\\\\"\n resources:\n \
- \  requests:\n cpu: 2\n  \
- \  \"nvidia.com/gpu\": {nproc_per_node}\n  \
- \  limits:\n cpu: 2\n  \
- \  \"nvidia.com/gpu\": {nproc_per_node}\n  volumes:\n\
- \  - name: input-data\n  persistentVolumeClaim:\n\
- \  claimName: {input_pvc_name}\n \
- \  - name: model\n persistentVolumeClaim:\n \
- \  claimName: {model_pvc_name}\n  - name:\
- \ output\n persistentVolumeClaim:\n  \
- \  claimName: {output_pvc_name}\n Worker:\n \
- \  replicas: {nnodes-1}\n restartPolicy: OnFailure\n \
- \  template:\n metadata:\n annotations:\n\
+ \ --distributed_training_framework fsdp --is_granite --checkpoint_at_epoch\n\
+ \   command:\n - /bin/bash\n \
+ \   - '-c'\n - '--'\n \
+ \   image: {image}\n name: pytorch\n \
+ \   volumeMounts:\n - mountPath:\
+ \ /input_data\n  name: input-data\n \
+ \  readOnly: true\n   - mountPath: /input_model\n\
+ \   name: model\n readOnly:\
+ \ true\n  - mountPath: /output\n \
+ \  name: output\n env:\n \
+ \  - name: NNODES\n value: \\\\\"{nnodes}\\\\\
+ \"\n  - name: NPROC_PER_NODE\n \
+ \  value: \\\\\"{nproc_per_node}\\\\\"\n resources:\n\
+ \   requests:\n cpu: 2\n \
+ \   \"nvidia.com/gpu\": {nproc_per_node}\n \
+ \   limits:\n cpu: 2\n \
+ \   \"nvidia.com/gpu\": {nproc_per_node}\n \
+ \  volumes:\n  - name: input-data\n \
+ \  persistentVolumeClaim:\n  claimName: {input_pvc_name}\n\
+ \   - name: model\n persistentVolumeClaim:\n\
+ \   claimName: {model_pvc_name}\n \
+ \  - name: output\n persistentVolumeClaim:\n \
+ \   claimName: {output_pvc_name}\n Worker:\n\
+ \   replicas: {nnodes-1}\n restartPolicy: OnFailure\n\
+ \   template:\n metadata:\n annotations:\n\
  \  sidecar.istio.io/inject: 'false'\n spec:\n\
  \  containers:\n - args:\n \
  \  - |\n mkdir -p /tmp/model;\n \
@@ -953,21 +953,21 @@ deploymentSpec:
  \ --node_rank \\$(RANK) --rdzv_endpoint \\$(MASTER_ADDR):\\$(MASTER_PORT)\
  \ -m instructlab.training.main_ds --model_name_or_path={path_to_model} \
  \ --data_path=/input_data/processed_data/data.jsonl --output_dir=/tmp/model\
- \ --num_epochs=2 --effective_batch_size=3840 --learning_rate=2e-6 --num_warmup_steps=800\
+ \ --num_epochs=2 --effective_batch_size=3840 --learning_rate=1e-4 --num_warmup_steps=800\
  \ --save_samples=0 --log_level=INFO --max_batch_len=20000 --seed=42 --cpu_offload_optimizer\
- \ --sharding_strategy=FULL_SHARD --is_granite --checkpoint_at_epoch\n \
- \  command:\n - /bin/bash\n  \
- \  - '-c'\n - '--'\n  \
- \  image: {image}\n name: pytorch\n  \
- \  volumeMounts:\n - mountPath: /input_data\n\
- \  name: input-data\n  readOnly:\
- \ true\n   - mountPath: /input_model\n \
- \  name: model\n readOnly: true\n \
- \  - mountPath: /output\n  \
- \ name: output\n readOnly: true\n  \
- \  env:\n - name: NNODES\n  \
- \  value: \\\\\"{nnodes}\\\\\"\n  - name:\
- \ NPROC_PER_NODE\n value: \\\\\"{nproc_per_node}\\\
+ \ --distributed_training_framework fsdp --is_granite --checkpoint_at_epoch\n\
+ \   command:\n - /bin/bash\n \
+ \   - '-c'\n - '--'\n \
+ \   image: {image}\n name: pytorch\n \
+ \   volumeMounts:\n - mountPath:\
+ \ /input_data\n  name: input-data\n \
+ \  readOnly: true\n   - mountPath: /input_model\n\
+ \   name: model\n readOnly:\
+ \ true\n  - mountPath: /output\n \
+ \  name: output\n readOnly: true\n \
+ \   env:\n - name: NNODES\n \
+ \   value: \\\\\"{nnodes}\\\\\"\n \
+ \  - name: NPROC_PER_NODE\n value: \\\\\"{nproc_per_node}\\\
  \\\"\n resources:\n requests:\n\
  \  cpu: 2\n \"nvidia.com/gpu\"\
  : {nproc_per_node}\n limits:\n \