Merge branch 'main' into sync/main
FrankLeeeee committed Mar 4, 2024
2 parents 0aa27f1 + 4b8312c commit 0310b76
Showing 80 changed files with 3,479 additions and 1,249 deletions.
12 changes: 8 additions & 4 deletions .github/workflows/release_nightly_on_schedule.yml
@@ -6,19 +6,23 @@ on:
     - cron: '0 0 * * 6' # release every Saturday at 00:00 UTC

 jobs:
-  build-n-publish:
+  publish:
     if: github.repository == 'hpcaitech/ColossalAI'
     name: Build and publish Python 🐍 distributions 📦 to PyPI
     runs-on: ubuntu-latest
     timeout-minutes: 20
+    outputs:
+      status: ${{ steps.publish.outcome }}
     steps:
       - uses: actions/checkout@v2

       - uses: actions/setup-python@v2
         with:
           python-version: '3.8.14'

-      - run: NIGHTLY=1 python setup.py sdist build
+      - run: |
+          python .github/workflows/scripts/update_setup_for_nightly.py
+          python setup.py sdist build

       # publish to PyPI if executed on the main branch
       - name: Publish package to PyPI
@@ -31,7 +35,7 @@ jobs:

   notify:
     name: Notify Lark via webhook
-    needs: build-n-publish
+    needs: publish
     runs-on: ubuntu-latest
     if: ${{ always() }} && github.repository == 'hpcaitech/ColossalAI'
     steps:
@@ -62,4 +66,4 @@ jobs:
           REPO: ${{ github.repository }}
           RUN_ID: ${{ github.run_id }}
           WEBHOOK_URL: ${{ secrets.LARK_NOTIFICATION_WEBHOOK_URL }}
-          STATUS: ${{ steps.publish.outcome }}
+          STATUS: ${{ needs.publish.outputs.status }}
2 changes: 1 addition & 1 deletion .github/workflows/release_test_pypi_before_merge.yml
@@ -49,6 +49,6 @@ jobs:
           # we need to install the requirements.txt first
           # as test-pypi may not contain the distributions for libs listed in the txt file
           pip install -r requirements/requirements.txt
-          pip install --index-url https://test.pypi.org/simple/ colossalai==$VERSION
+          pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.python.org/pypi colossalai==$VERSION
         env:
           VERSION: ${{ steps.prep-version.outputs.version }}
34 changes: 34 additions & 0 deletions .github/workflows/scripts/update_setup_for_nightly.py
@@ -0,0 +1,34 @@
+from datetime import datetime
+
+
+def open_setup_file():
+    with open("setup.py", "r") as f:
+        file_lines = f.readlines()
+    return file_lines
+
+
+def replace_nightly_package_info(file_lines):
+    version = datetime.today().strftime("%Y.%m.%d")
+    package_name = "colossalai-nightly"
+
+    for idx, line in enumerate(file_lines):
+        if "version = get_version()" in line:
+            file_lines[idx] = f'version = "{version}"\n'
+        if 'package_name = "colossalai"' in line:
+            file_lines[idx] = f'package_name = "{package_name}"\n'
+    return file_lines
+
+
+def write_setup_file(file_lines):
+    with open("setup.py", "w") as f:
+        f.writelines(file_lines)
+
+
+def main():
+    file_lines = open_setup_file()
+    file_lines = replace_nightly_package_info(file_lines)
+    write_setup_file(file_lines)
+
+
+if __name__ == "__main__":
+    main()
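
For reference, a minimal, self-contained sketch of the rewrite this script performs. The `sample_lines` below are illustrative stand-ins for the real `setup.py`, not its actual contents:

```python
from datetime import datetime

# Hypothetical stand-ins for the two setup.py lines the script matches.
sample_lines = [
    'package_name = "colossalai"\n',
    'version = get_version()\n',
    'setup(name=package_name, version=version)\n',  # left untouched
]

version = datetime.today().strftime("%Y.%m.%d")
for idx, line in enumerate(sample_lines):
    if "version = get_version()" in line:
        sample_lines[idx] = f'version = "{version}"\n'
    if 'package_name = "colossalai"' in line:
        sample_lines[idx] = 'package_name = "colossalai-nightly"\n'

print("".join(sample_lines), end="")
# package_name = "colossalai-nightly"
# version = "2024.03.04"   (date-stamped at run time)
# setup(name=package_name, version=version)
```

Each scheduled run therefore publishes a uniquely date-versioned `colossalai-nightly` package, while the committed `setup.py` in the repository stays unchanged.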
10 changes: 5 additions & 5 deletions README.md
@@ -9,7 +9,7 @@
 <a href="https://www.colossalai.org/"> Documentation </a> |
 <a href="https://github.com/hpcaitech/ColossalAI/tree/main/examples"> Examples </a> |
 <a href="https://github.com/hpcaitech/ColossalAI/discussions"> Forum </a> |
-<a href="https://medium.com/@hpcaitech"> Blog </a></h3>
+<a href="https://hpc-ai.com/blog"> Blog </a></h3>

 [![GitHub Repo stars](https://img.shields.io/github/stars/hpcaitech/ColossalAI?style=social)](https://github.com/hpcaitech/ColossalAI/stargazers)
 [![Build](https://github.com/hpcaitech/ColossalAI/actions/workflows/build_on_schedule.yml/badge.svg)](https://github.com/hpcaitech/ColossalAI/actions/workflows/build_on_schedule.yml)
@@ -398,10 +398,10 @@ pip install colossalai

 **Note: only Linux is supported for now.**

-However, if you want to build the PyTorch extensions during installation, you can set `CUDA_EXT=1`.
+However, if you want to build the PyTorch extensions during installation, you can set `BUILD_EXT=1`.

 ```bash
-CUDA_EXT=1 pip install colossalai
+BUILD_EXT=1 pip install colossalai
 ```

 **Otherwise, CUDA kernels will be built during runtime when you actually need them.**
@@ -429,7 +429,7 @@ By default, we do not compile CUDA/C++ kernels. ColossalAI will build them during runtime.
 If you want to install and enable CUDA kernel fusion (compulsory installation when using fused optimizer):

 ```shell
-CUDA_EXT=1 pip install .
+BUILD_EXT=1 pip install .
 ```

 For Users with CUDA 10.2, you can still build ColossalAI from source. However, you need to manually download the cub library and copy it to the corresponding directory.
@@ -445,7 +445,7 @@ unzip 1.8.0.zip
 cp -r cub-1.8.0/cub/ colossalai/kernel/cuda_native/csrc/kernels/include/

 # install
-CUDA_EXT=1 pip install .
+BUILD_EXT=1 pip install .
 ```

 <p align="right">(<a href="#top">back to top</a>)</p>
10 changes: 6 additions & 4 deletions applications/Chat/coati/dataset/sft_dataset.py
@@ -49,12 +49,13 @@ def _preprocess(
     max_length: int,
 ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     """Preprocess the data by tokenizing."""
-    sequences = [s + t for s, t in zip(sources, targets)]
+    sequences = [s + t + tokenizer.eos_token for s, t in zip(sources, targets)]
     sequences_token = tokenizer(
-        sequences, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
+        sequences, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt", add_special_tokens=False
     )

     sources_token = tokenizer(
-        sources, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
+        sources, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt", add_special_tokens=False
     )

     assert sequences_token["attention_mask"].dim() == 2, "seq2seq model should be preprocessed differently"
@@ -65,7 +66,8 @@ def _preprocess(
         if tokenizer.padding_side == "right":
             # |prompt|completion|eos|pad|
             labels[i][:source_len] = IGNORE_INDEX
-            labels[i][-pad_len:] = IGNORE_INDEX
+            if pad_len > 0:
+                labels[i][-pad_len:] = IGNORE_INDEX
         elif tokenizer.padding_side == "left":
             # |pad|prompt|completion|eos|
             labels[i][: pad_len + source_len] = IGNORE_INDEX
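
The `pad_len > 0` guard added here fixes a negative-zero slicing pitfall: when a sequence exactly fills `max_length`, `pad_len` is `0`, and since `-0 == 0`, `labels[i][-pad_len:]` means `labels[i][0:]`, which masks the entire row instead of nothing. A minimal sketch of the failure mode:

```python
import torch

IGNORE_INDEX = -100
labels = torch.arange(6)  # stand-in for one row of the labels tensor
pad_len = 0               # the sequence exactly fills max_length

# Unguarded: labels[-0:] is labels[0:], i.e. every position, so the whole
# row would be set to IGNORE_INDEX and contribute nothing to the loss.
print(labels[-pad_len:])  # tensor([0, 1, 2, 3, 4, 5])

# Guarded, as in the patch: an unpadded row keeps all of its labels.
if pad_len > 0:
    labels[-pad_len:] = IGNORE_INDEX
print(labels)             # tensor([0, 1, 2, 3, 4, 5])
```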
2 changes: 1 addition & 1 deletion applications/Chat/examples/train_sft.sh
@@ -25,4 +25,4 @@ torchrun --standalone --nproc_per_node=4 train_sft.py \
     --accumulation_steps 8 \
     --lr 2e-5 \
     --max_datasets_size 512 \
-    --max_epochs 1
\ No newline at end of file
+    --max_epochs 1
68 changes: 10 additions & 58 deletions applications/Colossal-LLaMA-2/colossal_llama2/dataset/loader.py
@@ -1,20 +1,16 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-

-import numpy as np
 import os
-import random
 from dataclasses import dataclass
-from typing import Dict, List, Union, Sequence, Optional, Iterator, Callable
+from typing import Dict, Iterator, List, Optional, Sequence, Union

 import torch
-from datasets import dataset_dict, load_from_disk
+import torch.nn.functional as F
 from datasets import Dataset as HFDataset
-from torch.distributed import ProcessGroup
-from torch.distributed.distributed_c10d import _get_default_group
-from torch.utils.data import ConcatDataset, Dataset, DataLoader, DistributedSampler
+from datasets import dataset_dict, load_from_disk
+from torch.utils.data import ConcatDataset, Dataset, DistributedSampler
 from transformers.tokenization_utils import PreTrainedTokenizer
-import torch.nn.functional as F

 DatasetType = Union[Dataset, ConcatDataset, dataset_dict.Dataset]
 PathType = Union[str, os.PathLike]
@@ -62,6 +58,7 @@ class DataCollatorForSupervisedDataset(object):
     tokenizer: PreTrainedTokenizer
     max_length: int = 4096
     ignore_index: int = -100
+    padding: str = "max_length"

     def __call__(self, instances: Sequence[Dict[str, List[int]]]) -> Dict[str, torch.Tensor]:
         """
@@ -106,10 +103,11 @@ def __call__(self, instances: Sequence[Dict[str, List[int]]]) -> Dict[str, torch.Tensor]:
                 batch_first=True,
                 padding_value=self.ignore_index,
             )  # (bsz, max_len)
-            # pad to max
-            to_pad = self.max_length - input_ids.size(1)
-            input_ids = F.pad(input_ids, (0, to_pad), value=self.tokenizer.pad_token_id)
-            labels = F.pad(labels, (0, to_pad), value=self.ignore_index)
+            if self.padding == "max_length":
+                # pad to max
+                to_pad = self.max_length - input_ids.size(1)
+                input_ids = F.pad(input_ids, (0, to_pad), value=self.tokenizer.pad_token_id)
+                labels = F.pad(labels, (0, to_pad), value=self.ignore_index)
         elif self.tokenizer.padding_side == "left":
             reversed_input_ids = [seq.flip(dims=(0,)) for seq in batch_input_ids]
             reversed_input_ids = torch.nn.utils.rnn.pad_sequence(
@@ -171,49 +169,3 @@ def __len__(self) -> int:

     def set_start_index(self, start_index: int) -> None:
         self.start_index = start_index
-
-
-def setup_distributed_dataloader(
-    dataset: DatasetType,
-    batch_size: int = 1,
-    shuffle: bool = False,
-    seed: int = 1024,
-    drop_last: bool = False,
-    pin_memory: bool = False,
-    num_workers: int = 0,
-    collate_fn: Callable[[Sequence[Dict[str, Union[str, List[int]]]]], Dict[str, torch.Tensor]] = None,
-    process_group: Optional[ProcessGroup] = None,
-    **kwargs,
-) -> DataLoader:
-    """
-    Setup dataloader for distributed training.
-    """
-    _kwargs = kwargs.copy()
-    process_group = process_group or _get_default_group()
-    sampler = StatefulDistributedSampler(
-        dataset=dataset,
-        num_replicas=process_group.size(),
-        rank=process_group.rank(),
-        shuffle=shuffle,
-        seed=seed,
-        drop_last=drop_last,
-    )
-
-    # Deterministic dataloader
-    def seed_worker(worker_id: int) -> None:
-        worker_seed = seed
-        np.random.seed(worker_seed)
-        torch.manual_seed(worker_seed)
-        random.seed(worker_seed)
-
-    return DataLoader(
-        dataset=dataset,
-        batch_size=batch_size,
-        sampler=sampler,
-        num_workers=num_workers,
-        collate_fn=collate_fn,
-        pin_memory=pin_memory,
-        drop_last=drop_last,
-        worker_init_fn=seed_worker,
-        **_kwargs,
-    )
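
The new `padding` field defaults to `"max_length"`, so existing callers see no behavior change; passing any other value skips the fixed-width pad. A rough sketch (illustrative values, not the repo's API) of what that branch adds on top of ordinary batch padding:

```python
import torch
import torch.nn.functional as F

pad_token_id, max_length = 0, 8

# Two right-padded sequences, as produced by pad_sequence: the batch is
# only padded out to its own longest sequence.
batch = [torch.tensor([11, 12, 13]), torch.tensor([21, 22])]
input_ids = torch.nn.utils.rnn.pad_sequence(batch, batch_first=True, padding_value=pad_token_id)
print(input_ids.shape)  # torch.Size([2, 3])

# padding="max_length": additionally pad every batch to a fixed width, so
# tensor shapes stay constant across steps (at the cost of wasted compute
# on padding tokens).
to_pad = max_length - input_ids.size(1)
fixed = F.pad(input_ids, (0, to_pad), value=pad_token_id)
print(fixed.shape)      # torch.Size([2, 8])
```

Dynamic (batch-max) padding is usually faster for variable-length SFT data, which is presumably why the collator now exposes the choice.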