# Run Benchmarks — run #28
# Workflow file for this run

# Manually triggered workflow: runs the benchmark test suite once per
# (suite, modelName, lang) matrix combination, then post-processes the
# uploaded per-run result files in a follow-up job.
name: Run Benchmarks

on:
  workflow_dispatch:

env:
  # Disable the Gradle daemon for every Gradle invocation in this workflow.
  GRADLE_OPTS: "-Dorg.gradle.daemon=false"

jobs:
  # One job per matrix combination. Each job exports the provider/key for its
  # model, runs the benchmark tests, and uploads a single JSON result file.
  langTest:
    name: Benchmark
    runs-on: ubuntu-latest
    timeout-minutes: 20
    strategy:
      # Do not cancel the remaining combinations when one of them fails.
      fail-fast: false
      matrix:
        suite:
          - "edit_mode"
          - "idle_mode"
        modelName:
          # - "gpt-4o-2024-08-06"
          # - "gpt-4o-mini"
          # - "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"
          # - "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo"
          # - "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo"
          - "gemini-1.5-flash-latest"
          - "gemini-1.5-pro-latest"
          - "gemini-1.5-pro-002"
          - "gemini-1.5-flash-002"
          # - "gemini-1.5-pro-exp-0801"
          # - "claude-3-5-sonnet-20240620"
          # - "claude-3-sonnet-20240229"
          # - "codestral-latest"
          # - "open-mistral-nemo"
        lang:
          - "JAVA"
          - "kotlin"
          - "Python"
          - "go"
          - "JavaScript"
        # markdownTools:
        #   - "true"
        #   - "false"
    steps:
      # Free GitHub Actions Environment Disk Space
      # NOTE(review): this third-party action is referenced by a mutable
      # branch (@main); consider pinning it to a release tag or commit SHA.
      - name: Maximize Build Space
        uses: jlumbroso/free-disk-space@main
        with:
          tool-cache: false
          large-packages: false
      - name: Fetch Sources
        uses: actions/checkout@v4
      - name: Setup Java
        uses: actions/setup-java@v4
        with:
          distribution: zulu
          java-version: "17"
      - name: Setup Gradle
        uses: gradle/actions/setup-gradle@v4
        with:
          gradle-home-cache-cleanup: true
      # Pick the provider and API key from the model-name prefix and export
      # them to the following steps through $GITHUB_ENV. Matrix and secret
      # values are handed to the script as environment variables instead of
      # being expanded into the script text, so their contents can never be
      # interpreted as shell syntax.
      - name: Setup Environment
        env:
          MODEL_NAME: ${{ matrix.modelName }}
          MISTRAL_API_KEY: ${{ secrets.MISTRAL_API_KEY }}
          GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
          TOGETHERAI_API_KEY: ${{ secrets.TOGETHERAI_API_KEY }}
          GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: |
          case "$MODEL_NAME" in
            mistral* | open-mistral* | codestral*)
              provider=mistralai
              key="$MISTRAL_API_KEY"
              ;;
            gemini*)
              provider=google_api
              key="$GOOGLE_API_KEY"
              ;;
            meta*)
              provider=togetherai
              key="$TOGETHERAI_API_KEY"
              ;;
            llama*)
              provider=groq
              key="$GROQ_API_KEY"
              ;;
            claude*)
              provider=anthropic
              key="$ANTHROPIC_API_KEY"
              ;;
            *)
              # Any other model name falls back to OpenAI.
              provider=openai
              key="$OPENAI_API_KEY"
              ;;
          esac
          {
            echo "VQL_MODEL_PROVIDER=$provider"
            echo "VQL_MODEL_KEY=$key"
          } >> "$GITHUB_ENV"
      - name: Run Benchmark (${{ matrix.suite }} - ${{ matrix.modelName }} - ${{ matrix.lang }})
        run: ./gradlew --scan :test -PrunSpecificTests='benchmark/**'
        env:
          VQL_BENCHMARK_SUITE: ${{ matrix.suite }}
          VQL_LANG: ${{ matrix.lang }}
          VQL_BENCHMARK_VERSION: ${{ github.sha }}
          VQL_MODEL_NAME: ${{ matrix.modelName }}
          VQL_BENCHMARK_MODE: "true"
          VQL_OBSERVABILITY_PROVIDER: Helicone
          VQL_OBSERVABILITY_KEY: ${{ secrets.HELICONE_API_KEY }}
          VQL_OBSERVABILITY_USER_ID: benchmark
          VQL_EDIT_FORMAT: FULL_TEXT
          # VQL_MARKDOWN_TOOLS: ${{ matrix.markdownTools }}
      # Print the test log even when an earlier step failed.
      - name: Output log
        if: always()
        run: cat /tmp/voqal-test.log
      # Replace every "/" in the model name with "-" and export the result as
      # `modelName`, which the artifact name in the next step reads.
      - name: Set sanitized modelName
        env:
          MODEL_NAME: ${{ matrix.modelName }}
        run: |
          echo "modelName=${MODEL_NAME//\//-}" >> "$GITHUB_ENV"
      - name: Upload results
        uses: actions/upload-artifact@v4
        with:
          name: ${{ matrix.suite }}-${{ matrix.lang }}-${{ env.modelName }}
          path: ./benchmark/${{ matrix.suite }}-${{ matrix.lang }}.json

  # Runs after the langTest matrix: downloads all artifacts into ./benchmark/,
  # runs the calculator and combiner Gradle tasks, uploads their output files,
  # and finally deletes the per-run artifacts.
  benchCalc:
    name: Benchmark Calculator
    needs: [langTest]
    runs-on: ubuntu-latest
    timeout-minutes: 20
    steps:
      # Free GitHub Actions Environment Disk Space
      # NOTE(review): this third-party action is referenced by a mutable
      # branch (@main); consider pinning it to a release tag or commit SHA.
      - name: Maximize Build Space
        uses: jlumbroso/free-disk-space@main
        with:
          tool-cache: false
          large-packages: false
      - name: Fetch Sources
        uses: actions/checkout@v4
      - name: Setup Java
        uses: actions/setup-java@v4
        with:
          distribution: zulu
          java-version: "17"
      - name: Setup Gradle
        uses: gradle/actions/setup-gradle@v4
        with:
          gradle-home-cache-cleanup: true
      # No artifact name is given, so every artifact of this run is fetched.
      - name: Download all artifacts
        uses: actions/download-artifact@v4
        with:
          path: ./benchmark/
      - name: Benchmark Calculator
        run: ./gradlew --scan :runBenchmarkCalculator
      - name: Upload edit_mode results
        uses: actions/upload-artifact@v4
        with:
          name: edit_mode
          path: ./benchmark/edit_mode.js
      - name: Upload idle_mode results
        uses: actions/upload-artifact@v4
        with:
          name: idle_mode
          path: ./benchmark/idle_mode.js
      - name: Upload combined (all) results
        uses: actions/upload-artifact@v4
        with:
          name: all
          path: ./benchmark/all.js
      - name: Results Combiner
        run: ./gradlew --scan :runResultsCombiner
      - name: Upload results.jsonl
        uses: actions/upload-artifact@v4
        with:
          name: results
          path: ./benchmark/results.jsonl
      # Remove the per-combination artifacts uploaded by langTest; the
      # artifacts uploaded above do not match these patterns and are kept.
      - name: Delete artifacts
        uses: geekyeggo/delete-artifact@v5
        with:
          name: |
            edit_mode-*
            idle_mode-*