# .github/workflows/llm_inf2_integration.yml

# Integration tests that run on self-hosted Inferentia2 (inf2) runners.
name: Inferentia2 integration tests

on:
  # Manual trigger with an optional released DJL version (empty by default).
  workflow_dispatch:
    inputs:
      djl-version:
        description: "The released version of DJL"
        required: false
        default: ""
  # Scheduled run once a day at 15:00 (GitHub evaluates cron in UTC).
  schedule:
    - cron: "0 15 * * *"


jobs:
  # Starts two instances via start_instance.sh on the scheduler host. Each step
  # requests a fresh runner registration token from the GitHub API and hands it
  # to the script. The instance ids are exposed as job outputs so that the
  # stop-runners job can shut the instances down afterwards.
  create-runners:
    runs-on: [self-hosted, scheduler]
    steps:
      - name: Create new Inf2.24xl instance (1 of 2)
        id: create_inf2
        # Explicit bash enables `-eo pipefail`, so a failing curl aborts the
        # step instead of being masked by the pipeline into jq.
        shell: bash
        run: |
          cd /home/ubuntu/djl_benchmark_script/scripts
          # jq -e exits non-zero when .token is missing/null; -r prints it unquoted.
          token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
          https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
          --fail \
          | jq -er '.token' )
          ./start_instance.sh action_inf2 "$token" djl-serving
      - name: Create new Inf2.24xl instance (2 of 2)
        id: create_inf2_2
        # Explicit bash enables `-eo pipefail`, so a failing curl aborts the
        # step instead of being masked by the pipeline into jq.
        shell: bash
        run: |
          cd /home/ubuntu/djl_benchmark_script/scripts
          # jq -e exits non-zero when .token is missing/null; -r prints it unquoted.
          token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
          https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
          --fail \
          | jq -er '.token' )
          ./start_instance.sh action_inf2 "$token" djl-serving
    # NOTE(review): the action_inf2_instance_id step output is presumably set by
    # start_instance.sh; confirm against that script.
    outputs:
      inf2_instance_id_1: ${{ steps.create_inf2.outputs.action_inf2_instance_id }}
      inf2_instance_id_2: ${{ steps.create_inf2_2.outputs.action_inf2_instance_id }}

  # First group of Inferentia2 tests: resnet18 in PyTorch and Python engine
  # mode, transformers-neuronx handler tests (gpt2, gpt2-quantize, opt-1.3b,
  # gpt-j-6b, pythia-2.8b) and the gpt2 partition tests.
  transformers-neuronx-test-1:
    runs-on: [ self-hosted, inf2 ]
    timeout-minutes: 90
    needs: create-runners
    steps:
      - uses: actions/checkout@v3
      # Prune docker data, drop the cached Corretto JDK tool directory and wait
      # until no process holds the dpkg/apt lock files.
      - name: Clean env
        run: |
          yes | docker system prune -a --volumes
          sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
          echo "wait dpkg lock..."
          while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
      - name: Set up Python3
        uses: actions/setup-python@v4
        with:
          python-version: '3.10.x'
      - name: Install pip dependencies
        run: pip3 install requests numpy pillow
      # NOTE(review): presumably exports DJLSERVING_DOCKER_TAG for the steps
      # below; confirm against docker_name_builder.sh.
      - name: Build container name
        run: ./serving/docker/scripts/docker_name_builder.sh pytorch-inf2 ${{ github.event.inputs.djl-version }}
      - name: Download models and dockers
        working-directory: tests/integration
        run: |
          docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG
          # -p keeps the step from failing when a previous run left logs/ behind.
          mkdir -p logs
          ./download_models.sh pytorch-inf2
      - name: Test Pytorch model
        working-directory: tests/integration
        run: |
          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-inf2-1 \
          serve -m test::PyTorch:nc0=file:/opt/ml/model/resnet18_inf2_2_4.tar.gz
          ./test_client.sh image/jpg models/kitten.jpg
          docker rm -f $(docker ps -aq)
      - name: Test Python mode
        working-directory: tests/integration
        run: |
          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-inf2-1 \
          serve -m test::Python:nc0=file:/opt/ml/model/resnet18_inf2_2_4.tar.gz
          ./test_client.sh image/jpg models/kitten.jpg
          docker rm -f $(docker ps -aq)
      # Each handler test below follows the same pattern: prepare the model
      # directory, launch the serving container, list the loaded models, run the
      # client checks, then remove all containers and the models directory.
      - name: Test transformers-neuronx gpt2 with handler
        working-directory: tests/integration
        run: |
          rm -rf models
          python3 llm/prepare.py transformers_neuronx gpt2
          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-inf2-1 \
          serve
          curl http://127.0.0.1:8080/models
          python3 llm/client.py transformers_neuronx gpt2
          docker rm -f $(docker ps -aq)
          sudo rm -rf models
      - name: Test transformers-neuronx gpt2 quantization with handler
        working-directory: tests/integration
        run: |
          rm -rf models
          python3 llm/prepare.py transformers_neuronx gpt2-quantize
          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-inf2-1 \
          serve
          curl http://127.0.0.1:8080/models
          python3 llm/client.py transformers_neuronx gpt2-quantize
          docker rm -f $(docker ps -aq)
          sudo rm -rf models
      - name: Test transformers-neuronx opt-1.3b with handler
        working-directory: tests/integration
        run: |
          rm -rf models
          python3 llm/prepare.py transformers_neuronx opt-1.3b
          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-inf2-6 \
          serve
          curl http://127.0.0.1:8080/models
          python3 llm/client.py transformers_neuronx opt-1.3b
          docker rm -f $(docker ps -aq)
          sudo rm -rf models
      - name: Test transformers-neuronx gpt-j-6b with handler
        working-directory: tests/integration
        run: |
          rm -rf models
          python3 llm/prepare.py transformers_neuronx gpt-j-6b
          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-inf2-6 \
          serve
          curl http://127.0.0.1:8080/models
          python3 llm/client.py transformers_neuronx gpt-j-6b
          docker rm -f $(docker ps -aq)
          sudo rm -rf models
      - name: Test transformers-neuronx pythia-2.8b with handler
        working-directory: tests/integration
        run: |
          rm -rf models
          python3 llm/prepare.py transformers_neuronx pythia-2.8b
          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-inf2-2 \
          serve
          curl http://127.0.0.1:8080/models
          python3 llm/client.py transformers_neuronx pythia-2.8b
          docker rm -f $(docker ps -aq)
          sudo rm -rf models
      # Partition tests: run the container in partition mode, then verify that
      # *.pt checkpoint files were produced and that the partition output
      # reports a successful requirements.txt install.
      - name: Test gpt2 partition
        working-directory: tests/integration
        run: |
          sudo rm -rf models
          python3 llm/prepare.py transformers_neuronx_aot gpt2
          # To test the requirements.txt download.
          echo "dummy_test" >> $PWD/models/test/requirements.txt

          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-inf2-1 \
          partition --model-dir /opt/ml/input/data/training/ --skip-copy | tee partition_output.log

          # checking if pt files are generated.
          sudo mv $PWD/models/test/partition-test $PWD/models/
          if ls $PWD/models/partition-test/*.pt &>/dev/null ; then echo "checkpoint files generated"; else exit 1;  fi

          # checking whether requirements.txt download is successful
          if grep -F "pip install requirements succeed!" partition_output.log &>/dev/null; \
          then echo "requirements.txt install was successful"; else exit 1; fi
      - name: Test gpt2-quantize partition
        working-directory: tests/integration
        run: |
          sudo rm -rf models
          python3 llm/prepare.py transformers_neuronx_aot gpt2-quantize
          # To test the requirements.txt download.
          echo "dummy_test" >> $PWD/models/test/requirements.txt

          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-inf2-1 \
          partition --model-dir /opt/ml/input/data/training/ --skip-copy | tee partition_output.log

          # checking if pt files are generated.
          sudo mv $PWD/models/test/partition-test $PWD/models/
          if ls $PWD/models/partition-test/*.pt &>/dev/null ; then echo "checkpoint files generated"; else exit 1;  fi

          # checking whether requirements.txt download is successful
          if grep -F "pip install requirements succeed!" partition_output.log &>/dev/null; \
          then echo "requirements.txt install was successful"; else exit 1; fi
      # Print the serving log into the job output when any earlier step failed.
      - name: On fail step
        if: ${{ failure() }}
        working-directory: tests/integration
        run: |
          cat logs/serving.log
      # always(): without it this step is skipped after a failure, which is
      # exactly when the logs are needed. The artifact name is fixed per job
      # because this job has no matrix, so `matrix.arch` rendered as empty and
      # both test jobs uploaded under the same artifact name.
      # NOTE(review): GitHub has deprecated actions/upload-artifact@v3; v4
      # requires artifact names to be unique within a run.
      - name: Upload test logs
        if: always()
        uses: actions/upload-artifact@v3
        with:
          name: transformers-neuronx-test-1-logs
          path: tests/integration/logs/

  # Second group of Inferentia2 tests: transformers-neuronx handler tests for
  # bloom-7b1, open-llama-7b, streaming opt-1.3b, stable diffusion (fp32 and
  # bf16), split llama-7b and llama2-7b.
  transformers-neuronx-test-2:
    runs-on: [ self-hosted, inf2 ]
    timeout-minutes: 90
    needs: create-runners
    steps:
      - uses: actions/checkout@v3
      # Prune docker data, drop the cached Corretto JDK tool directory and wait
      # until no process holds the dpkg/apt lock files.
      - name: Clean env
        run: |
          yes | docker system prune -a --volumes
          sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
          echo "wait dpkg lock..."
          while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
      - name: Set up Python3
        uses: actions/setup-python@v4
        with:
          python-version: '3.10.x'
      - name: Install pip dependencies
        run: pip3 install requests numpy pillow
      # NOTE(review): presumably exports DJLSERVING_DOCKER_TAG for the steps
      # below; confirm against docker_name_builder.sh.
      - name: Build container name
        run: ./serving/docker/scripts/docker_name_builder.sh pytorch-inf2 ${{ github.event.inputs.djl-version }}
      - name: Download models and dockers
        working-directory: tests/integration
        run: |
          docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG
          # -p keeps the step from failing when a previous run left logs/ behind.
          mkdir -p logs
          ./download_models.sh pytorch-inf2
      # Each handler test below follows the same pattern: prepare the model
      # directory, launch the serving container, list the loaded models, run the
      # client checks, then remove all containers and the models directory.
      - name: Test transformers-neuronx bloom-7b1 with handler
        working-directory: tests/integration
        run: |
          rm -rf models
          python3 llm/prepare.py transformers_neuronx bloom-7b1
          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-inf2-2 \
          serve
          curl http://127.0.0.1:8080/models
          python3 llm/client.py transformers_neuronx bloom-7b1
          docker rm -f $(docker ps -aq)
          sudo rm -rf models
      - name: Test transformers-neuronx open-llama-7b with handler
        working-directory: tests/integration
        run: |
          rm -rf models
          python3 llm/prepare.py transformers_neuronx open-llama-7b
          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-inf2-2 \
          serve
          curl http://127.0.0.1:8080/models
          python3 llm/client.py transformers_neuronx open-llama-7b
          docker rm -f $(docker ps -aq)
          sudo rm -rf models
      - name: Test streaming transformers-neuronx opt-1.3b with handler
        working-directory: tests/integration
        run: |
          rm -rf models
          python3 llm/prepare.py transformers_neuronx opt-1.3b-streaming
          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-inf2-6 \
          serve
          curl http://127.0.0.1:8080/models
          python3 llm/client.py transformers_neuronx opt-1.3b-streaming
          docker rm -f $(docker ps -aq)
          sudo rm -rf models
      - name: Test stable diffusion with handler
        working-directory: tests/integration
        run: |
          rm -rf models
          python3 llm/prepare.py transformers_neuronx stable-diffusion-2.1-base-neuron
          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-inf2-2 \
          serve
          curl http://127.0.0.1:8080/models
          python3 llm/client.py stable-diffusion stable-diffusion-2.1-base-neuron
          docker rm -f $(docker ps -aq)
          sudo rm -rf models
      # NOTE(review): the model is prepared as ...-neuron-bf16 but the client is
      # invoked with the non-bf16 name; confirm this is intentional.
      - name: Test stable diffusion bf16 with handler
        working-directory: tests/integration
        run: |
          rm -rf models
          python3 llm/prepare.py transformers_neuronx stable-diffusion-2.1-base-neuron-bf16
          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-inf2-2 \
          serve
          curl http://127.0.0.1:8080/models
          python3 llm/client.py stable-diffusion stable-diffusion-2.1-base-neuron
          docker rm -f $(docker ps -aq)
          sudo rm -rf models
      - name: Test load split model with llama2 7B in handler
        working-directory: tests/integration
        run: |
          rm -rf models
          python3 llm/prepare.py transformers_neuronx llama-7b-split
          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-inf2-2 \
          serve
          curl http://127.0.0.1:8080/models
          python3 llm/client.py transformers_neuronx llama-7b-split
          docker rm -f $(docker ps -aq)
          sudo rm -rf models
      - name: Test load optimum llama2 7B in handler
        working-directory: tests/integration
        run: |
          rm -rf models
          python3 llm/prepare.py transformers_neuronx llama2-7b
          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-inf2-2 \
          serve
          curl http://127.0.0.1:8080/models
          python3 llm/client.py transformers_neuronx llama2-7b
          docker rm -f $(docker ps -aq)
          sudo rm -rf models
      # Print the serving log into the job output when any earlier step failed.
      - name: On fail step
        if: ${{ failure() }}
        working-directory: tests/integration
        run: |
          cat logs/serving.log
      # always(): without it this step is skipped after a failure, which is
      # exactly when the logs are needed. The artifact name is fixed per job
      # because this job has no matrix, so `matrix.arch` rendered as empty and
      # both test jobs uploaded under the same artifact name.
      # NOTE(review): GitHub has deprecated actions/upload-artifact@v3; v4
      # requires artifact names to be unique within a run.
      - name: Upload test logs
        if: always()
        uses: actions/upload-artifact@v3
        with:
          name: transformers-neuronx-test-2-logs
          path: tests/integration/logs/

  # Stops both instances created by create-runners. Runs even when earlier jobs
  # failed or were cancelled (if: always()) so instances are not left running.
  stop-runners:
    if: always()
    runs-on: [ self-hosted, scheduler ]
    needs: [ create-runners, transformers-neuronx-test-1, transformers-neuronx-test-2 ]
    steps:
      - name: Stop all instances
        run: |
          cd /home/ubuntu/djl_benchmark_script/scripts
          # Attempt to stop every instance even if one stop fails; the step
          # still reports failure at the end if any stop did not succeed.
          status=0
          for instance_id in \
            "${{ needs.create-runners.outputs.inf2_instance_id_1 }}" \
            "${{ needs.create-runners.outputs.inf2_instance_id_2 }}"; do
            # The id is empty when create-runners failed before setting it.
            if [ -z "$instance_id" ]; then
              echo "::warning::No instance id recorded by create-runners; nothing to stop."
              continue
            fi
            ./stop_instance.sh "$instance_id" || status=1
          done
          exit "$status"