diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml index f82ce5d242a4..8a1bc8e113de 100644 --- a/.github/workflows/build_on_pr.yml +++ b/.github/workflows/build_on_pr.yml @@ -208,7 +208,7 @@ jobs: - name: Execute Unit Testing run: | - CURL_CA_BUNDLE="" PYTHONPATH=$PWD pytest --testmon --testmon-cov=. tests/ + CURL_CA_BUNDLE="" PYTHONPATH=$PWD pytest --testmon --testmon-cov=. --durations=10 tests/ env: DATA: /data/scratch/cifar-10 NCCL_SHM_DISABLE: 1 diff --git a/.github/workflows/build_on_schedule.yml b/.github/workflows/build_on_schedule.yml index 0589cd617b80..03b47e6cb5b6 100644 --- a/.github/workflows/build_on_schedule.yml +++ b/.github/workflows/build_on_schedule.yml @@ -3,7 +3,7 @@ name: Build on Schedule on: schedule: # run at 00:00 of every Sunday - - cron: '0 0 * * *' + - cron: "0 0 * * *" workflow_dispatch: jobs: @@ -60,7 +60,7 @@ jobs: - name: Unit Testing if: steps.check-avai.outputs.avai == 'true' run: | - PYTHONPATH=$PWD pytest tests + PYTHONPATH=$PWD pytest --durations=0 tests env: DATA: /data/scratch/cifar-10 LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64 diff --git a/.github/workflows/compatiblity_test_on_dispatch.yml b/.github/workflows/compatiblity_test_on_dispatch.yml index 3dcc4dfd182a..1778d64ee287 100644 --- a/.github/workflows/compatiblity_test_on_dispatch.yml +++ b/.github/workflows/compatiblity_test_on_dispatch.yml @@ -72,7 +72,7 @@ jobs: ssh-key: ${{ secrets.SSH_KEY_FOR_CI }} - name: Download cub for CUDA 10.2 run: | - CUDA_VERSION=$(cat $CUDA_HOME/version.txt | grep "CUDA Version" | awk '{print $NF}' | cut -d. -f1,2) + CUDA_VERSION=$(nvcc -V | awk -F ',| ' '/release/{print $6}') # check if it is CUDA 10.2 # download cub diff --git a/.github/workflows/compatiblity_test_on_pr.yml b/.github/workflows/compatiblity_test_on_pr.yml index 5098b8e364d0..c0f45c65a7fc 100644 --- a/.github/workflows/compatiblity_test_on_pr.yml +++ b/.github/workflows/compatiblity_test_on_pr.yml @@ -66,7 +66,7 @@ jobs: ssh-key: ${{ secrets.SSH_KEY_FOR_CI }} - name: Download cub for CUDA 10.2 run: | - CUDA_VERSION=$(cat $CUDA_HOME/version.txt | grep "CUDA Version" | awk '{print $NF}' | cut -d. -f1,2) + CUDA_VERSION=$(nvcc -V | awk -F ',| ' '/release/{print $6}') # check if it is CUDA 10.2 # download cub diff --git a/.github/workflows/compatiblity_test_on_schedule.yml b/.github/workflows/compatiblity_test_on_schedule.yml index 9802795fad24..15ac4f1a92bb 100644 --- a/.github/workflows/compatiblity_test_on_schedule.yml +++ b/.github/workflows/compatiblity_test_on_schedule.yml @@ -61,6 +61,18 @@ jobs: with: ssh-key: ${{ secrets.SSH_KEY_FOR_CI }} + - name: Download cub for CUDA 10.2 + run: | + CUDA_VERSION=$(nvcc -V | awk -F ',| ' '/release/{print $6}') + + # check if it is CUDA 10.2 + # download cub + if [ "$CUDA_VERSION" = "10.2" ]; then + wget https://github.com/NVIDIA/cub/archive/refs/tags/1.8.0.zip + unzip 1.8.0.zip + cp -r cub-1.8.0/cub/ colossalai/kernel/cuda_native/csrc/kernels/include/ + fi + - name: Install Colossal-AI run: | pip install -v --no-cache-dir . diff --git a/.github/workflows/cuda_ext_check_before_merge.yml b/.github/workflows/cuda_ext_check_before_merge.yml index eba5bb98ec07..686f0f395c73 100644 --- a/.github/workflows/cuda_ext_check_before_merge.yml +++ b/.github/workflows/cuda_ext_check_before_merge.yml @@ -37,6 +37,18 @@ jobs: - name: Install PyTorch run: eval ${{ matrix.build.torch_command }} + - name: Download cub for CUDA 10.2 + run: | + CUDA_VERSION=$(nvcc -V | awk -F ',| ' '/release/{print $6}') + + # check if it is CUDA 10.2 + # download cub + if [ "$CUDA_VERSION" = "10.2" ]; then + wget https://github.com/NVIDIA/cub/archive/refs/tags/1.8.0.zip + unzip 1.8.0.zip + cp -r cub-1.8.0/cub/ colossalai/kernel/cuda_native/csrc/kernels/include/ + fi + - name: Build run: | CUDA_EXT=1 pip install -v . diff --git a/.github/workflows/run_chatgpt_examples.yml b/.github/workflows/run_chatgpt_examples.yml index 510f6b6f0985..650689498fda 100644 --- a/.github/workflows/run_chatgpt_examples.yml +++ b/.github/workflows/run_chatgpt_examples.yml @@ -43,7 +43,9 @@ jobs: run: | cd applications/Chat rm -rf ~/.cache/colossalai - ./examples/test_ci.sh + ./tests/test_inference.sh + ./tests/test_benchmarks.sh + ./tests/test_train.sh env: NCCL_SHM_DISABLE: 1 MAX_JOBS: 8 diff --git a/README.md b/README.md index 34c8a6b730a3..44e4f97f1f4e 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,8 @@ ## Latest News +* [2023/07] [HPC-AI Tech Raises 22 Million USD in Series A Funding](https://www.hpc-ai.tech/blog/hpc-ai-tech-raises-22-million-usd-in-series-a-funding-to-fuel-team-expansion-and-business-growth) +* [2023/07] [65B Model Pretraining Accelerated by 38%, Best Practices for Building LLaMA-Like Base Models Open-Source](https://www.hpc-ai.tech/blog/large-model-pretraining) * [2023/03] [ColossalChat: An Open-Source Solution for Cloning ChatGPT With a Complete RLHF Pipeline](https://medium.com/@yangyou_berkeley/colossalchat-an-open-source-solution-for-cloning-chatgpt-with-a-complete-rlhf-pipeline-5edf08fb538b) * [2023/03] [Intel and Colossal-AI Partner to Deliver Cost-Efficient Open-Source Solution for Protein Folding Structure Prediction](https://www.hpc-ai.tech/blog/intel-habana) * [2023/03] [AWS and Google Fund Colossal-AI with Startup Cloud Programs](https://www.hpc-ai.tech/blog/aws-and-google-fund-colossal-ai-with-startup-cloud-programs) @@ -32,7 +34,6 @@ * [2023/01] [Hardware Savings Up to 46 Times for AIGC and Automatic Parallelism](https://medium.com/pytorch/latest-colossal-ai-boasts-novel-automatic-parallelism-and-offers-savings-up-to-46x-for-stable-1453b48f3f02) * [2022/11] [Diffusion Pretraining and Hardware Fine-Tuning Can Be Almost 7X Cheaper](https://www.hpc-ai.tech/blog/diffusion-pretraining-and-hardware-fine-tuning-can-be-almost-7x-cheaper) * [2022/10] [Use a Laptop to Analyze 90% of Proteins, With a Single-GPU Inference Sequence Exceeding 10,000](https://www.hpc-ai.tech/blog/use-a-laptop-to-analyze-90-of-proteins-with-a-single-gpu-inference-sequence-exceeding) -* [2022/09] [HPC-AI Tech Completes $6 Million Seed and Angel Round Fundraising](https://www.hpc-ai.tech/blog/hpc-ai-tech-completes-6-million-seed-and-angel-round-fundraising-led-by-bluerun-ventures-in-the) ## Table of Contents