
[ShardFormer] Add Ulysses Sequence Parallelism support for Command-R, Qwen2 and ChatGLM #5854

Closed
wants to merge 38 commits into from

Commits (38)
2793854
Add Ulysses SP support for Qwen2
GuangyaoZhang Jun 24, 2024
dd8b5ec
Add Ulysses SP support for ChatGLM
GuangyaoZhang Jun 25, 2024
4b1ce24
Add Ulysses SP support for Command-R
GuangyaoZhang Jun 25, 2024
5c5fd30
Fix pytest typo
GuangyaoZhang Jun 25, 2024
2a25a2a
[Feature] optimize PP overlap (#5735)
Edenzzzz Jun 26, 2024
8e718a1
[gemini] fixes for benchmarking (#5847)
botbw Jun 26, 2024
5dfbcd7
[zero] use bucket during allgather (#5860)
ver217 Jun 27, 2024
d9d5e7e
[shardformer] Support the T5ForTokenClassification model (#5816)
GuangyaoZhang Jun 27, 2024
3c7cda0
[Inference]Lazy Init Support (#5785)
LRY89757 Jun 27, 2024
eaea88c
[release] update version (#5864)
ver217 Jun 28, 2024
773d9f9
[shardformer]delete xformers (#5859)
flybird11111 Jun 28, 2024
416580b
[MoE/ZeRO] Moe refactor with zero refactor (#5821)
Hz188 Jun 28, 2024
3dc0d1d
ChatGLM, Qwen2, Command-R Support SP+PP together
GuangyaoZhang Jun 28, 2024
f9d544b
remove unnecessary pytest
GuangyaoZhang Jun 30, 2024
8ab46b4
[Shardformer] change qwen2 modeling into gradient checkpointing styl…
CjhHa1 Jul 1, 2024
936d0b0
[doc] Update llama + sp compatibility; fix dist optim table
Edenzzzz Jul 1, 2024
7c2f79f
[pre-commit.ci] pre-commit autoupdate (#5572)
pre-commit-ci[bot] Jul 1, 2024
ea94c07
[hotfix] fix the bug that large tensor exceed the maximum capacity of…
Hz188 Jul 2, 2024
133bbd5
revert some exchange to avoid misunderstanding caused by git diff
GuangyaoZhang Jul 3, 2024
eb24fcd
[Hotfix] Fix OPT gradient checkpointing forward
Edenzzzz Jul 3, 2024
6cd4c32
[shardformer] fix the moe (#5883)
wangbluo Jul 3, 2024
7afbc81
[quant] fix bitsandbytes version check (#5882)
ver217 Jul 4, 2024
7997683
[pre-commit.ci] pre-commit autoupdate (#5878)
pre-commit-ci[bot] Jul 4, 2024
3420921
[shardformer] DeepseekMoE support (#5871)
Hz188 Jul 5, 2024
8ec24b6
[Hoxfix] Fix CUDA_DEVICE_MAX_CONNECTIONS for comm overlap
Edenzzzz Jul 5, 2024
cba2052
[Feat] Diffusion Model(PixArtAlpha/StableDiffusion3) Support (#5838)
LRY89757 Jul 8, 2024
392933a
ChatGLM sp with pp redundance removal
GuangyaoZhang Jul 8, 2024
66abf1c
[HotFix] CI,import,requirements-test for #5838 (#5892)
LRY89757 Jul 8, 2024
b554515
Add Ulysses SP support for Qwen2
GuangyaoZhang Jun 24, 2024
f5aa99b
Add Ulysses SP support for ChatGLM
GuangyaoZhang Jun 25, 2024
8cbb469
Add Ulysses SP support for Command-R
GuangyaoZhang Jun 25, 2024
6b5cf33
Fix pytest typo
GuangyaoZhang Jun 25, 2024
9861cd2
ChatGLM, Qwen2, Command-R Support SP+PP together
GuangyaoZhang Jun 28, 2024
2218792
remove unnecessary pytest
GuangyaoZhang Jun 30, 2024
7334a5b
revert some exchange to avoid misunderstanding caused by git diff
GuangyaoZhang Jul 3, 2024
0ae41f5
ChatGLM sp with pp redundance removal
GuangyaoZhang Jul 8, 2024
82164a7
Merge branch 'sp' of github.com:GuangyaoZhang/ColossalAI into sp
GuangyaoZhang Jul 9, 2024
64359a6
Empty Commit to trigger build on PR
GuangyaoZhang Jul 9, 2024
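The core technique these commits port to Qwen2, ChatGLM and Command-R is Ulysses sequence parallelism: each SP rank holds a shard of the sequence with all attention heads, and an all-to-all before attention re-shards activations so each rank holds the full sequence for a slice of heads (a second all-to-all restores the layout afterwards). A dependency-free sketch of that reshard (illustrative only; the PR's actual implementation uses torch.distributed all-to-all over process groups):

```python
# Conceptual model of the Ulysses SP all-to-all, simulated in pure Python.
# Before attention: rank p holds sequence positions [p*S/P, (p+1)*S/P) for all H heads.
# After the all-to-all: rank p holds all S positions for heads [p*H/P, (p+1)*H/P).

def ulysses_all_to_all(rank_shards):
    """rank_shards[p][s][h]: activation at local position s, head h, on rank p."""
    P = len(rank_shards)                      # sequence-parallel degree
    H = len(rank_shards[0][0])                # total number of attention heads
    assert H % P == 0, "head count must be divisible by the SP degree"
    h_local = H // P
    out = []
    for dst in range(P):                      # dst rank owns heads [dst*h_local, (dst+1)*h_local)
        full_seq = []
        for src in range(P):                  # collect that head slice from every rank's chunk
            for row in rank_shards[src]:
                full_seq.append(row[dst * h_local:(dst + 1) * h_local])
        out.append(full_seq)
    return out

# 2 ranks, sequence length 4 (2 positions per rank), 4 heads.
shards = [
    [["x0h0", "x0h1", "x0h2", "x0h3"], ["x1h0", "x1h1", "x1h2", "x1h3"]],  # rank 0
    [["x2h0", "x2h1", "x2h2", "x2h3"], ["x3h0", "x3h1", "x3h2", "x3h3"]],  # rank 1
]
resharded = ulysses_all_to_all(shards)
# rank 0 now holds every position for heads 0-1, rank 1 for heads 2-3
```

Attention then runs on each rank over the full-length sequence with H/P heads, which is why this scheme requires the head count to divide evenly by the SP group size.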
3 changes: 2 additions & 1 deletion .github/workflows/build_on_pr.yml
Original file line number Diff line number Diff line change
@@ -90,7 +90,7 @@ jobs:
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
options: --gpus all --rm -v /dev/shm -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
options: --gpus all --rm -v /dev/shm -v /data/scratch:/data/scratch
timeout-minutes: 90
defaults:
run:
@@ -165,6 +165,7 @@ jobs:
env:
LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
LLAMA_PATH: /data/scratch/llama-tiny
MOE_TENSOR_PATH: /data/scratch/moe_tensors

- name: Collate artifact
env:
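The added `MOE_TENSOR_PATH` variable follows the existing `LLAMA_PATH` pattern: the workflow now mounts all of `/data/scratch`, and tests locate their fixtures through the environment. A hypothetical sketch of the consuming side (the helper name is ours, not the repo's):

```python
import os

def resolve_fixture_path(env_var: str, fallback: str) -> str:
    """Hypothetical helper: read a CI fixture directory from the environment,
    falling back to a local path for developer machines."""
    return os.environ.get(env_var, fallback)

os.environ["MOE_TENSOR_PATH"] = "/data/scratch/moe_tensors"  # as exported by the workflow
moe_path = resolve_fixture_path("MOE_TENSOR_PATH", "./moe_tensors")
print(moe_path)  # → /data/scratch/moe_tensors
```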
3 changes: 2 additions & 1 deletion .github/workflows/build_on_schedule.yml
@@ -13,7 +13,7 @@ jobs:
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
options: --gpus all --rm -v /dev/shm -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
options: --gpus all --rm -v /dev/shm -v /data/scratch/:/data/scratch/
timeout-minutes: 90
steps:
- name: Check GPU Availability # ensure all GPUs have enough memory
@@ -69,6 +69,7 @@ jobs:
env:
LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
LLAMA_PATH: /data/scratch/llama-tiny
MOE_TENSOR_PATH: /data/scratch/moe_tensors

- name: Notify Lark
id: message-preparation
3 changes: 2 additions & 1 deletion .github/workflows/compatiblity_test_on_dispatch.yml
@@ -50,7 +50,7 @@ jobs:
matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
container:
image: ${{ matrix.container }}
options: --gpus all --rm -v /dev/shm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
options: --gpus all --rm -v /dev/shm -v /data/scratch/:/data/scratch/
timeout-minutes: 200
steps:
- name: Install dependencies
@@ -92,3 +92,4 @@ jobs:
DATA: /data/scratch/cifar-10
LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
LLAMA_PATH: /data/scratch/llama-tiny
MOE_TENSOR_PATH: /data/scratch/moe_tensors
3 changes: 2 additions & 1 deletion .github/workflows/compatiblity_test_on_pr.yml
@@ -41,7 +41,7 @@ jobs:
matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
container:
image: ${{ matrix.container }}
options: --gpus all --rm -v /dev/shm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
options: --gpus all --rm -v /dev/shm -v /data/scratch/:/data/scratch/
timeout-minutes: 200
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-run-test-${{ matrix.container }}
@@ -87,3 +87,4 @@ jobs:
DATA: /data/scratch/cifar-10
LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
LLAMA_PATH: /data/scratch/llama-tiny
MOE_TENSOR_PATH: /data/scratch/moe_tensors
3 changes: 2 additions & 1 deletion .github/workflows/compatiblity_test_on_schedule.yml
@@ -38,7 +38,7 @@ jobs:
matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
container:
image: ${{ matrix.container }}
options: --gpus all --rm -v /dev/shm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
options: --gpus all --rm -v /dev/shm -v /data/scratch/:/data/scratch/
timeout-minutes: 200
steps:
- name: Install dependencies
@@ -85,6 +85,7 @@ jobs:
DATA: /data/scratch/cifar-10
LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
LLAMA_PATH: /data/scratch/llama-tiny
MOE_TENSOR_PATH: /data/scratch/moe_tensors

- name: Notify Lark
id: message-preparation
10 changes: 5 additions & 5 deletions .pre-commit-config.yaml
@@ -1,34 +1,34 @@
repos:

- repo: https://github.com/PyCQA/autoflake
rev: v2.2.1
rev: v2.3.1
hooks:
- id: autoflake
name: autoflake (python)
args: ['--in-place', '--remove-unused-variables', '--remove-all-unused-imports', '--ignore-init-module-imports']

- repo: https://github.com/pycqa/isort
rev: 5.12.0
rev: 5.13.2
hooks:
- id: isort
name: sort all imports (python)

- repo: https://github.com/psf/black-pre-commit-mirror
rev: 23.9.1
rev: 24.4.2
hooks:
- id: black
name: black formatter
args: ['--line-length=120', '--target-version=py37', '--target-version=py38', '--target-version=py39','--target-version=py310']

- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v13.0.1
rev: v18.1.8
hooks:
- id: clang-format
name: clang formatter
types_or: [c++, c]

- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.3.0
rev: v4.6.0
hooks:
- id: check-yaml
- id: check-merge-conflict
16 changes: 10 additions & 6 deletions applications/ColossalChat/coati/dataset/loader.py
@@ -83,15 +83,19 @@ def __call__(self, instances: Sequence[Dict[str, List[int]]]) -> Dict[str, torch

# `List[torch.Tensor]`
batch_input_ids = [
torch.LongTensor(instance["input_ids"][: self.max_length])
if len(instance["input_ids"]) > self.max_length
else torch.LongTensor(instance["input_ids"])
(
torch.LongTensor(instance["input_ids"][: self.max_length])
if len(instance["input_ids"]) > self.max_length
else torch.LongTensor(instance["input_ids"])
)
for instance in instances
]
batch_labels = [
torch.LongTensor(instance["labels"][: self.max_length])
if len(instance["labels"]) > self.max_length
else torch.LongTensor(instance["labels"])
(
torch.LongTensor(instance["labels"][: self.max_length])
if len(instance["labels"]) > self.max_length
else torch.LongTensor(instance["labels"])
)
for instance in instances
]
if self.tokenizer.padding_side == "right":
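The loader.py hunk above is behavior-preserving: black 24.x, bumped by this PR's pre-commit update, wraps multi-line conditional expressions inside comprehensions in explicit parentheses. A standalone illustration with made-up data, using plain lists instead of the file's `torch.LongTensor` to stay dependency-free:

```python
max_length = 3
instances = [{"input_ids": [1, 2, 3, 4, 5]}, {"input_ids": [9]}]

# black >= 24 adds the parentheses around the multi-line conditional;
# the value is identical to what the unparenthesized black 23 layout produced.
batch_input_ids = [
    (
        instance["input_ids"][:max_length]
        if len(instance["input_ids"]) > max_length
        else instance["input_ids"]
    )
    for instance in instances
]
print(batch_input_ids)  # → [[1, 2, 3], [9]]
```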
1 change: 1 addition & 0 deletions applications/ColossalChat/coati/models/loss.py
@@ -1,6 +1,7 @@
"""
loss functions
"""

from typing import Optional, Tuple

import torch
1 change: 1 addition & 0 deletions applications/ColossalChat/coati/models/reward_model.py
@@ -1,6 +1,7 @@
"""
reward model
"""

from typing import Optional

import torch
1 change: 1 addition & 0 deletions applications/ColossalChat/coati/trainer/utils.py
@@ -1,6 +1,7 @@
"""
Training utilities for Coati.
"""

from typing import Any

import torch
14 changes: 12 additions & 2 deletions applications/ColossalEval/colossal_eval/dataset/agieval.py
@@ -78,7 +78,9 @@ def get_prompt(line: Dict, dataset_name: str, logger: DistributedLogger) -> Dict
option_string = "ABCDEFG"
count = len(line["options"])

input = "问题:" + line["question"] + " " + "从以下选项中选择:" + " ".join(line["options"]) + "\n" + "答案:"
input = (
"问题:" + line["question"] + " " + "从以下选项中选择:" + " ".join(line["options"]) + "\n" + "答案:"
)

all_classes = list(option_string[0:count])

@@ -150,7 +152,15 @@ def combine_prompt(prompt_path, dataset_name, load_explanation=True, chat_mode=F
)
elif dataset_name in chinese_qa_datasets:
question_input = (
"问题:" + passage + " " + question + "\n" + "从以下选项中选择:" + " ".join(options) + "\n" + "答案:{}".format(label)
"问题:"
+ passage
+ " "
+ question
+ "\n"
+ "从以下选项中选择:"
+ " ".join(options)
+ "\n"
+ "答案:{}".format(label)
)
elif dataset_name in english_cloze_datasets:
question_input = "Question: ".format(idx + 1) + question + "\n" + "Answer: {}".format(answer)
6 changes: 5 additions & 1 deletion applications/ColossalEval/colossal_eval/dataset/ceval.py
@@ -57,7 +57,11 @@
"urban_and_rural_planner": ["Urban and Rural Planner", "注册城乡规划师", "Other"],
"accountant": ["Accountant", "注册会计师", "Other"],
"fire_engineer": ["Fire Engineer", "注册消防工程师", "Other"],
"environmental_impact_assessment_engineer": ["Environmental Impact Assessment Engineer", "环境影响评价工程师", "Other"],
"environmental_impact_assessment_engineer": [
"Environmental Impact Assessment Engineer",
"环境影响评价工程师",
"Other",
],
"tax_accountant": ["Tax Accountant", "税务师", "Other"],
"physician": ["Physician", "医师资格", "Other"],
}
8 changes: 5 additions & 3 deletions applications/ColossalEval/colossal_eval/dataset/mtbench.py
@@ -56,9 +56,11 @@ def load(path: str, logger: DistributedLogger, few_shot: bool) -> List[Dict]:
"instruction": question["turns"],
"input": "",
"output": [],
"target": [""] * turn_number
if question["question_id"] not in reference
else reference[question["question_id"]],
"target": (
[""] * turn_number
if question["question_id"] not in reference
else reference[question["question_id"]]
),
}

if category in dataset["test"]:
@@ -77,7 +77,9 @@ def _get_choices_indices(self, language: str):
self.indices_for_choices[0].append(
self.tokenizer(f"Answer: {choice}", add_special_tokens=False).input_ids[-1]
)
self.indices_for_choices[1].append(self.tokenizer(f"答案:{choice}", add_special_tokens=False).input_ids[-1])
self.indices_for_choices[1].append(
self.tokenizer(f"答案:{choice}", add_special_tokens=False).input_ids[-1]
)

def _load_tokenizer(self, path: str, tokenizer_path: Optional[str], tokenizer_kwargs: dict):
"""
Empty file.
Empty file.
92 changes: 0 additions & 92 deletions applications/ColossalMoE/colossal_moe/models/mixtral_layer.py

This file was deleted.

4 changes: 0 additions & 4 deletions applications/ColossalMoE/infer.py
@@ -2,8 +2,6 @@

import torch
import torch.distributed as dist
from colossal_moe.models.mixtral_checkpoint import MixtralMoEHybridParallelCheckpointIO
from colossal_moe.models.mixtral_policy import MixtralForCausalLMPolicy
from transformers import AutoTokenizer
from transformers.models.mixtral import MixtralConfig, MixtralForCausalLM

@@ -70,8 +68,6 @@ def main():
ep_size=ep_size,
zero_stage=1,
precision=args.precision,
custom_policy=MixtralForCausalLMPolicy(),
checkpoint_io=MixtralMoEHybridParallelCheckpointIO,
enable_fused_normalization=args.use_layernorm_kernel,
enable_jit_fused=args.use_kernel,
)
3 changes: 2 additions & 1 deletion applications/ColossalMoE/infer.sh
@@ -1,5 +1,6 @@
NUM_GPU=2
MODEL="mistralai/Mixtral-8x7B-v0.1"
# MODEL="mistralai/Mixtral-8x7B-v0.1"
MODEL="mistralai/Mixtral-8x7B-Instruct-v0.1"

# ep
torchrun --standalone --nproc_per_node $NUM_GPU infer.py \