Add knn benchmark for CI (#426)
* add benchmark test for ci

* add 8 threads

* separate debug and release

* skip 2 warmup tests

* set name for profiler

* fix split id
yangzq50 authored Jan 5, 2024
1 parent 3af3ed9 commit fdd1fbf
Showing 4 changed files with 163 additions and 46 deletions.
128 changes: 95 additions & 33 deletions .github/workflows/tests.yml
@@ -13,12 +13,77 @@ on:
- '*.md'

jobs:
tests:
name: tests
debug_tests:
name: debug_tests
# https://docs.github.com/en/actions/using-jobs/using-conditions-to-control-job-execution
# https://github.com/orgs/community/discussions/26261
if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'ci') }}
runs-on: [ "self-hosted" ]
steps:
# https://github.com/hmarr/debug-action
#- uses: hmarr/debug-action@v2

- name: Show PR labels
if: ${{ !cancelled() && !failure() }}
run: |
echo "Workflow triggered by ${{ github.event_name }}"
if [[ ${{ github.event_name }} == 'pull_request' ]]; then
echo "PR labels: ${{ join(github.event.pull_request.labels.*.name, ', ') }}"
fi
- name: Ensure workspace ownership
if: ${{ !cancelled() && !failure() }}
run: echo "chown -R $USER $GITHUB_WORKSPACE" && sudo chown -R $USER $GITHUB_WORKSPACE

- name: Check out code
if: ${{ !cancelled() && !failure() }}
uses: actions/checkout@v3
with:
ssh-key: ${{ secrets.MY_DEPLOY_KEY }}

- name: Start builder container
if: ${{ !cancelled() && !failure() }}
run: |
TZ=$(readlink -f /etc/localtime | awk -F '/zoneinfo/' '{print $2}')
sudo docker rm -f infinity_build && sudo docker run -d --privileged --name infinity_build --network=host -e TZ=$TZ -v $PWD:/infinity infiniflow/infinity_builder:ubuntu2310
- name: Build debug version
if: ${{ !cancelled() && !failure() }}
run: sudo docker exec infinity_build bash -c "cd /infinity && rm -fr cmake-build-debug && mkdir -p cmake-build-debug && cmake -G Ninja -DCMAKE_BUILD_TYPE=Debug -DCMAKE_JOB_POOL_LINK:STRING=link_pool -DCMAKE_JOB_POOLS:STRING=link_pool=1 -S /infinity -B /infinity/cmake-build-debug && cmake --build /infinity/cmake-build-debug"

- name: Unit test debug version
if: ${{ !cancelled() && !failure() }}
run: sudo docker exec infinity_build bash -c "cd /infinity/ && cmake-build-debug/src/test_main"

- name: Install pysdk
if: ${{ !cancelled() && !failure() }}
run: sudo docker exec infinity_build bash -c "cd /infinity/python && python3 setup.py install"

- name: Start infinity debug version
if: ${{ !cancelled() && !failure() }}
run: |
# Run a command in the background
sudo docker exec infinity_build bash -c "cd /infinity/ && rm -fr /tmp/infinity && cmake-build-debug/src/infinity > debug.log 2>&1" &
- name: pysdk & sqllogictest debug version
if: ${{ !cancelled() && !failure() }}
run: sudo docker exec infinity_build bash -c "cd /infinity/ && python3 tools/sqllogictest.py"

- name: Stop infinity debug
if: ${{ !cancelled() }} # always run this step even if previous steps failed
run: sudo kill $(pidof cmake-build-debug/src/infinity)

- name: Collect infinity debug output
# GitHub Actions interprets output lines starting with "Error" as error messages, and it automatically sets the step status to failed when such lines are detected.
if: ${{ !cancelled() }} # always run this step even if previous steps failed
run: cat debug.log 2>/dev/null || true

release_tests:
name: release_tests and benchmark
# https://docs.github.com/en/actions/using-jobs/using-conditions-to-control-job-execution
# https://github.com/orgs/community/discussions/26261
if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'ci') }}
runs-on: [ "self-hosted", "benchmark" ]
steps:
# https://github.com/hmarr/debug-action
#- uses: hmarr/debug-action@v2
@@ -47,37 +112,6 @@ jobs:
TZ=$(readlink -f /etc/localtime | awk -F '/zoneinfo/' '{print $2}')
sudo docker rm -f infinity_build && sudo docker run -d --privileged --name infinity_build --network=host -e TZ=$TZ -v $PWD:/infinity infiniflow/infinity_builder:ubuntu2310
- name: Build debug version
if: ${{ !cancelled() && !failure() }}
run: sudo docker exec infinity_build bash -c "cd /infinity && rm -fr cmake-build-debug && mkdir -p cmake-build-debug && cmake -G Ninja -DCMAKE_BUILD_TYPE=Debug -DCMAKE_JOB_POOL_LINK:STRING=link_pool -DCMAKE_JOB_POOLS:STRING=link_pool=1 -S /infinity -B /infinity/cmake-build-debug && cmake --build /infinity/cmake-build-debug"

- name: Unit test debug version
if: ${{ !cancelled() && !failure() }}
run: sudo docker exec infinity_build bash -c "cd /infinity/ && cmake-build-debug/src/test_main"

- name: Install pysdk
if: ${{ !cancelled() && !failure() }}
run: sudo docker exec infinity_build bash -c "cd /infinity/python && python3 setup.py install"

- name: Start infinity debug version
if: ${{ !cancelled() && !failure() }}
run: |
# Run a command in the background
sudo docker exec infinity_build bash -c "cd /infinity/ && rm -fr /tmp/infinity && cmake-build-debug/src/infinity > debug.log 2>&1" &
- name: pysdk & sqllogictest debug version
if: ${{ !cancelled() && !failure() }}
run: sudo docker exec infinity_build bash -c "cd /infinity/ && python3 tools/sqllogictest.py"

- name: Stop infinity debug
if: ${{ !cancelled() }} # always run this step even if previous steps failed
run: sudo kill $(pidof cmake-build-debug/src/infinity)

- name: Collect infinity debug output
# GitHub Actions interprets output lines starting with "Error" as error messages, and it automatically sets the step status to failed when such lines are detected.
if: ${{ !cancelled() }} # always run this step even if previous steps failed
run: cat debug.log 2>/dev/null || true

- name: Build release version
if: ${{ !cancelled() && !failure() }}
run: sudo docker exec infinity_build bash -c "cd /infinity && rm -fr cmake-build-release && mkdir -p cmake-build-release && cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_JOB_POOL_LINK:STRING=link_pool -DCMAKE_JOB_POOLS:STRING=link_pool=1 -S /infinity -B /infinity/cmake-build-release && cmake --build /infinity/cmake-build-release"
@@ -103,3 +137,31 @@ jobs:
# GitHub Actions interprets output lines starting with "Error" as error messages, and it automatically sets the step status to failed when such lines are detected.
if: ${{ !cancelled() }} # always run this step even if previous steps failed
run: cat release.log 2>/dev/null || true

- name: Prepare sift dataset
if: ${{ !cancelled() && !failure() }}
run: sudo chmod +x ./tools/ci_tools/check_benchmark_result.py && sudo mkdir -p test/data/benchmark && sudo ln -s $HOME/benchmark/sift1M test/data/benchmark/sift_1m

- name: Generate config file
if: ${{ !cancelled() && !failure() }}
run: mkdir -p $PWD/db_tmp && cat conf/infinity_conf.toml | sed -e "s|/var/infinity|$PWD/db_tmp|g" > $PWD/db_tmp/infinity_conf.toml

- name: Import sift dataset
if: ${{ !cancelled() && !failure() }}
run: ./cmake-build-release/benchmark/local_infinity/knn_import_benchmark sift $PWD/test/data $PWD/db_tmp

- name: Benchmark sift search 1 thread repeat 50 times
if: ${{ !cancelled() && !failure() }}
run: echo $(date --rfc-3339=s) "Benchmark sift search 1 thread average time:" $(echo "1 50" | ./cmake-build-release/benchmark/local_infinity/knn_query_benchmark sift 200 $PWD/test/data $PWD/db_tmp | awk '/Total cost/ {total+=$4; count+=1} END {printf ("%f s", total/count)}') | tee -a $HOME/benchmark/benchmark_sift_1_thread.log

- name: Benchmark sift search 8 threads repeat 50 times
if: ${{ !cancelled() && !failure() }}
run: echo $(date --rfc-3339=s) "Benchmark sift search 8 threads average time:" $(echo "8 50" | ./cmake-build-release/benchmark/local_infinity/knn_query_benchmark sift 200 $PWD/test/data $PWD/db_tmp | awk '/Total cost/ {total+=$4; count+=1} END {printf ("%f s", total/count)}') | tee -a $HOME/benchmark/benchmark_sift_8_threads.log

- name: Benchmark sift check 1 thread result
if: ${{ !cancelled() && !failure() }}
run: ./tools/ci_tools/check_benchmark_result.py sift_1 $HOME/benchmark/benchmark_sift_1_thread.log

- name: Benchmark sift check 8 threads result
if: ${{ !cancelled() }}
run: ./tools/ci_tools/check_benchmark_result.py sift_8 $HOME/benchmark/benchmark_sift_8_threads.log
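Note on the two "Benchmark sift search" steps above: the awk filter averages the per-repetition timings by summing the fourth field of every "Total cost" line and dividing by the count. A rough Python equivalent, shown here only as an illustration (it assumes each recorded repetition prints a line of the form "Total cost : <seconds> s", as the query benchmark emits after its warm-up passes):

# Rough Python equivalent of the awk filter in the two benchmark steps:
# average the 4th whitespace-separated field of every "Total cost" line.
import sys

total, count = 0.0, 0
for line in sys.stdin:
    if "Total cost" in line:
        total += float(line.split()[3])  # same field as awk's $4
        count += 1
print(f"{total / count:f} s")  # e.g. "0.501234 s", which tee appends to the log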
12 changes: 10 additions & 2 deletions benchmark/local_infinity/knn/knn_import_benchmark.cpp
@@ -38,8 +38,10 @@ import query_result;
using namespace infinity;

int main(int argc, char *argv[]) {
if (argc != 2) {
std::cout << "import sift or gist" << std::endl;
if (argc < 2) {
std::cout << "import sift or gist, with optional test_data_path (default to /infinity/test/data in docker) and optional infinity path "
"(default to /tmp/infinity)"
<< std::endl;
return 1;
}
bool sift = true;
@@ -49,6 +51,9 @@ int main(int argc, char *argv[]) {
sift = strcmp(argv[1], "sift") == 0;

std::string data_path = "/tmp/infinity";
if (argc >= 4) {
data_path = std::string(argv[3]);
}

LocalFileSystem fs;
if (fs.Exists(data_path)) {
@@ -71,6 +76,9 @@ int main(int argc, char *argv[]) {
// init column defs
std::shared_ptr<DataType> col1_type = nullptr;
std::string base_path = std::string(test_data_path());
if (argc >= 3) {
base_path = std::string(argv[2]);
}
std::string table_name;
if (sift) {
col1_type = std::make_shared<DataType>(LogicalType::kEmbedding, std::make_shared<EmbeddingInfo>(EmbeddingDataType::kElemFloat, 128));
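With this change the import benchmark takes two optional positional arguments after the dataset name: a test-data directory (argv[2], otherwise the compiled-in test_data_path()) and an infinity working directory (argv[3], otherwise /tmp/infinity), matching the "Import sift dataset" step in the workflow above. A minimal sketch of that argument handling, written in Python purely as an illustration of the convention (the real binary is the C++ program in this diff):

# Illustrative sketch only: mirrors the optional positional arguments of
# knn_import_benchmark (argv[1] = dataset, argv[2] = test data dir, argv[3] = infinity dir).
def parse_import_args(argv):
    if len(argv) < 2:
        raise SystemExit("usage: knn_import_benchmark sift|gist [test_data_path] [infinity_path]")
    dataset = argv[1]                                      # "sift" or "gist"
    base_path = argv[2] if len(argv) >= 3 else None        # None -> use compiled-in test_data_path()
    data_path = argv[3] if len(argv) >= 4 else "/tmp/infinity"
    return dataset, base_path, data_path

print(parse_import_args(["knn_import_benchmark", "sift", "./test/data", "./db_tmp"]))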
35 changes: 24 additions & 11 deletions benchmark/local_infinity/knn/knn_query_benchmark.cpp
@@ -57,7 +57,7 @@ std::unique_ptr<T[]> load_data(const std::string &filename, size_t &num, int &dim...
return data;
}

template <class Function>
template <typename Function>
inline void LoopFor(size_t id_begin, size_t id_end, size_t threadId, Function fn, const std::string &table_name) {
std::cout << "threadId = " << threadId << " [" << id_begin << ", " << id_end << ")" << std::endl;
std::shared_ptr<Infinity> infinity = Infinity::LocalConnect();
@@ -68,7 +68,7 @@ inline void LoopFor(size_t id_begin, size_t id_end, size_t threadId, Function fn
}
}

template <class Function>
template <typename Function>
inline void ParallelFor(size_t start, size_t end, size_t numThreads, Function fn, const std::string &table_name) {
if (numThreads <= 0) {
numThreads = std::thread::hardware_concurrency();
@@ -85,8 +85,10 @@ inline void ParallelFor(size_t start, size_t end, size_t numThreads, Function fn
}

int main(int argc, char *argv[]) {
if (argc != 3) {
std::cout << "query gist/sift ef=?" << std::endl;
if (argc < 3) {
std::cout << "query gist/sift ef=? , with optional test_data_path (default to /infinity/test/data in docker) and optional infinity path "
"(default to /tmp/infinity)"
<< std::endl;
return 1;
}
bool sift = true;
@@ -104,6 +106,9 @@ int main(int argc, char *argv[]) {
std::cin >> total_times;

std::string path = "/tmp/infinity";
if (argc >= 5) {
path = std::string(argv[4]);
}
LocalFileSystem fs;

Infinity::LocalInit(path);
@@ -113,8 +118,12 @@

std::vector<std::string> results;

std::string query_path = std::string(test_data_path());
std::string groundtruth_path = std::string(test_data_path());
std::string base_path = std::string(test_data_path());
if (argc >= 4) {
base_path = std::string(argv[3]);
}
std::string query_path = base_path;
std::string groundtruth_path = base_path;
size_t dimension = 0;
int64_t topk = 100;

@@ -177,7 +186,7 @@ int main(int argc, char *argv[]) {
}
}
}
do {
for (size_t times = 0; times < total_times + 2; ++times) {
std::cout << "--- Start to run search benchmark: " << std::endl;
std::vector<std::vector<uint64_t>> query_results(query_count);
for (auto &v : query_results) {
@@ -218,13 +227,17 @@
query_results[query_idx].emplace_back(data[i].ToUint64());
}
}
// delete[] embedding_data_ptr;
};
BaseProfiler profiler;
BaseProfiler profiler("ParallelFor");
profiler.Begin();
ParallelFor(0, query_count, thread_num, query_function, table_name);
profiler.End();
results.push_back(Format("Total cost : {}", profiler.ElapsedToString(1000)));
// skip 2 warm up loops
if (times >= 2) {
auto elapsed_ns = profiler.Elapsed();
auto elapsed_s = elapsed_ns / (1'000'000'000.0);
results.push_back(Format("Total cost : {} s", elapsed_s));
}
{
size_t correct_1 = 0, correct_10 = 0, correct_100 = 0;
for (size_t query_idx = 0; query_idx < query_count; ++query_idx) {
@@ -248,7 +261,7 @@
results.push_back(Format("R@10: {:.3f}", float(correct_10) / float(query_count * 10)));
results.push_back(Format("R@100: {:.3f}", float(correct_100) / float(query_count * 100)));
}
} while (--total_times);
}

std::cout << ">>> Query Benchmark End <<<" << std::endl;
for (const auto &item : results) {
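The loop rewritten above implements the "skip 2 warmup tests" item from the commit message: the CI pipes "1 50" or "8 50" into the binary (thread count and repeat count), the benchmark runs total_times + 2 iterations, and a "Total cost : {} s" line is recorded only once the first two warm-up passes are done. A minimal sketch of that convention, again in Python and only for illustration:

# Sketch of the warm-up convention used by the query benchmark:
# run total_times + 2 iterations, record timings only after the first two.
import time

def timed_repeats(workload, total_times=50, warmup=2):
    recorded = []
    for i in range(total_times + warmup):
        start = time.perf_counter()
        workload()
        elapsed_s = time.perf_counter() - start
        if i >= warmup:                      # skip warm-up iterations
            recorded.append(elapsed_s)
    return sum(recorded) / len(recorded)     # the average the CI's awk step reports

print(timed_repeats(lambda: sum(range(100_000)), total_times=5))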
34 changes: 34 additions & 0 deletions tools/ci_tools/check_benchmark_result.py
@@ -0,0 +1,34 @@
#!/usr/bin/env python3
# script to check benchmark result in GitHub actions
# set input parameter 1 as test type, 2 as test result file path
# if the benchmark result is better than the standard, return 0, else return 1

import sys


def main():
benchmark_bars = {"sift_1": 2.256, "sift_4": 0.869, "sift_8": 0.501}
benchmark_id = sys.argv[1]
standard = benchmark_bars[benchmark_id]
file_path = sys.argv[2]
with open(file_path, 'r') as f:
last_line = f.readlines()[-1]
print()
print("last line from log:", last_line)
result = float(last_line.split(' ')[-2])
print("average time:", result, 's')
print("required time:", standard, 's')
print()
difference_percentage = 100 * (result - standard) / standard
print("difference percentage: {:.2f}%".format(difference_percentage))
print()
if difference_percentage < 3:
print("benchmark result is acceptable")
sys.exit(0)
else:
print("benchmark result is unacceptable")
sys.exit(1)


if __name__ == '__main__':
main()
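The script reads the second-to-last whitespace-separated field of the log's last line as the measured average time, looks up the baseline for the given test id in benchmark_bars, and fails the step when the result is 3% or more slower than that baseline. A worked example of the arithmetic with a made-up measurement (the 2.256 s baseline for sift_1 comes from the table above):

# Worked example of the 3% tolerance check; the measured value is hypothetical.
standard = 2.256                                    # baseline for "sift_1"
result = 2.300                                      # parsed from the log's last line
difference_percentage = 100 * (result - standard) / standard
print(f"difference percentage: {difference_percentage:.2f}%")  # ~1.95%, below 3% -> exit code 0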
