Add knn benchmark for CI (#426)
* add benchmark test for ci

* add 8 threads

* separate debug and release

* skip 2 warmup tests

* set name for profiler

* fix split id
yangzq50 authored Jan 5, 2024
1 parent 3af3ed9 commit fdd1fbf
Showing 4 changed files with 163 additions and 46 deletions.
128 changes: 95 additions & 33 deletions .github/workflows/tests.yml
@@ -13,12 +13,77 @@ on:
- '*.md'

jobs:
tests:
name: tests
debug_tests:
name: debug_tests
# https://docs.github.com/en/actions/using-jobs/using-conditions-to-control-job-execution
# https://github.com/orgs/community/discussions/26261
if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'ci') }}
runs-on: [ "self-hosted" ]
steps:
# https://github.com/hmarr/debug-action
#- uses: hmarr/debug-action@v2

- name: Show PR labels
if: ${{ !cancelled() && !failure() }}
run: |
echo "Workflow triggered by ${{ github.event_name }}"
if [[ ${{ github.event_name }} == 'pull_request' ]]; then
echo "PR labels: ${{ join(github.event.pull_request.labels.*.name, ', ') }}"
fi
- name: Ensure workspace ownership
if: ${{ !cancelled() && !failure() }}
run: echo "chown -R $USER $GITHUB_WORKSPACE" && sudo chown -R $USER $GITHUB_WORKSPACE

- name: Check out code
if: ${{ !cancelled() && !failure() }}
uses: actions/checkout@v3
with:
ssh-key: ${{ secrets.MY_DEPLOY_KEY }}

- name: Start builder container
if: ${{ !cancelled() && !failure() }}
run: |
TZ=$(readlink -f /etc/localtime | awk -F '/zoneinfo/' '{print $2}')
sudo docker rm -f infinity_build && sudo docker run -d --privileged --name infinity_build --network=host -e TZ=$TZ -v $PWD:/infinity infiniflow/infinity_builder:ubuntu2310
- name: Build debug version
if: ${{ !cancelled() && !failure() }}
run: sudo docker exec infinity_build bash -c "cd /infinity && rm -fr cmake-build-debug && mkdir -p cmake-build-debug && cmake -G Ninja -DCMAKE_BUILD_TYPE=Debug -DCMAKE_JOB_POOL_LINK:STRING=link_pool -DCMAKE_JOB_POOLS:STRING=link_pool=1 -S /infinity -B /infinity/cmake-build-debug && cmake --build /infinity/cmake-build-debug"

- name: Unit test debug version
if: ${{ !cancelled() && !failure() }}
run: sudo docker exec infinity_build bash -c "cd /infinity/ && cmake-build-debug/src/test_main"

- name: Install pysdk
if: ${{ !cancelled() && !failure() }}
run: sudo docker exec infinity_build bash -c "cd /infinity/python && python3 setup.py install"

- name: Start infinity debug version
if: ${{ !cancelled() && !failure() }}
run: |
# Run a command in the background
sudo docker exec infinity_build bash -c "cd /infinity/ && rm -fr /tmp/infinity && cmake-build-debug/src/infinity > debug.log 2>&1" &
- name: pysdk & sqllogictest debug version
if: ${{ !cancelled() && !failure() }}
run: sudo docker exec infinity_build bash -c "cd /infinity/ && python3 tools/sqllogictest.py"

- name: Stop infinity debug
if: ${{ !cancelled() }} # always run this step even if previous steps failed
run: sudo kill $(pidof cmake-build-debug/src/infinity)

- name: Collect infinity debug output
# GitHub Actions interprets output lines starting with "Error" as error messages, and it automatically sets the step status to failed when such lines are detected.
if: ${{ !cancelled() }} # always run this step even if previous steps failed
run: cat debug.log 2>/dev/null || true

release_tests:
name: release_tests and benchmark
# https://docs.github.com/en/actions/using-jobs/using-conditions-to-control-job-execution
# https://github.com/orgs/community/discussions/26261
if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'ci') }}
runs-on: [ "self-hosted", "benchmark" ]
steps:
# https://github.com/hmarr/debug-action
#- uses: hmarr/debug-action@v2
@@ -47,37 +112,6 @@ jobs:
TZ=$(readlink -f /etc/localtime | awk -F '/zoneinfo/' '{print $2}')
sudo docker rm -f infinity_build && sudo docker run -d --privileged --name infinity_build --network=host -e TZ=$TZ -v $PWD:/infinity infiniflow/infinity_builder:ubuntu2310
- name: Build debug version
if: ${{ !cancelled() && !failure() }}
run: sudo docker exec infinity_build bash -c "cd /infinity && rm -fr cmake-build-debug && mkdir -p cmake-build-debug && cmake -G Ninja -DCMAKE_BUILD_TYPE=Debug -DCMAKE_JOB_POOL_LINK:STRING=link_pool -DCMAKE_JOB_POOLS:STRING=link_pool=1 -S /infinity -B /infinity/cmake-build-debug && cmake --build /infinity/cmake-build-debug"

- name: Unit test debug version
if: ${{ !cancelled() && !failure() }}
run: sudo docker exec infinity_build bash -c "cd /infinity/ && cmake-build-debug/src/test_main"

- name: Install pysdk
if: ${{ !cancelled() && !failure() }}
run: sudo docker exec infinity_build bash -c "cd /infinity/python && python3 setup.py install"

- name: Start infinity debug version
if: ${{ !cancelled() && !failure() }}
run: |
# Run a command in the background
sudo docker exec infinity_build bash -c "cd /infinity/ && rm -fr /tmp/infinity && cmake-build-debug/src/infinity > debug.log 2>&1" &
- name: pysdk & sqllogictest debug version
if: ${{ !cancelled() && !failure() }}
run: sudo docker exec infinity_build bash -c "cd /infinity/ && python3 tools/sqllogictest.py"

- name: Stop infinity debug
if: ${{ !cancelled() }} # always run this step even if previous steps failed
run: sudo kill $(pidof cmake-build-debug/src/infinity)

- name: Collect infinity debug output
# GitHub Actions interprets output lines starting with "Error" as error messages, and it automatically sets the step status to failed when such lines are detected.
if: ${{ !cancelled() }} # always run this step even if previous steps failed
run: cat debug.log 2>/dev/null || true

- name: Build release version
if: ${{ !cancelled() && !failure() }}
run: sudo docker exec infinity_build bash -c "cd /infinity && rm -fr cmake-build-release && mkdir -p cmake-build-release && cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_JOB_POOL_LINK:STRING=link_pool -DCMAKE_JOB_POOLS:STRING=link_pool=1 -S /infinity -B /infinity/cmake-build-release && cmake --build /infinity/cmake-build-release"
@@ -103,3 +137,31 @@ jobs:
# GitHub Actions interprets output lines starting with "Error" as error messages, and it automatically sets the step status to failed when such lines are detected.
if: ${{ !cancelled() }} # always run this step even if previous steps failed
run: cat release.log 2>/dev/null || true

- name: Prepare sift dataset
if: ${{ !cancelled() && !failure() }}
run: sudo chmod +x ./tools/ci_tools/check_benchmark_result.py && sudo mkdir -p test/data/benchmark && sudo ln -s $HOME/benchmark/sift1M test/data/benchmark/sift_1m

- name: Generate config file
if: ${{ !cancelled() && !failure() }}
run: mkdir -p $PWD/db_tmp && cat conf/infinity_conf.toml | sed -e "s|/var/infinity|$PWD/db_tmp|g" > $PWD/db_tmp/infinity_conf.toml

- name: Import sift dataset
if: ${{ !cancelled() && !failure() }}
run: ./cmake-build-release/benchmark/local_infinity/knn_import_benchmark sift $PWD/test/data $PWD/db_tmp

- name: Benchmark sift search 1 thread repeat 50 times
if: ${{ !cancelled() && !failure() }}
run: echo $(date --rfc-3339=s) "Benchmark sift search 1 thread average time:" $(echo "1 50" | ./cmake-build-release/benchmark/local_infinity/knn_query_benchmark sift 200 $PWD/test/data $PWD/db_tmp | awk '/Total cost/ {total+=$4; count+=1} END {printf ("%f s", total/count)}') | tee -a $HOME/benchmark/benchmark_sift_1_thread.log

- name: Benchmark sift search 8 threads repeat 50 times
if: ${{ !cancelled() && !failure() }}
run: echo $(date --rfc-3339=s) "Benchmark sift search 8 threads average time:" $(echo "8 50" | ./cmake-build-release/benchmark/local_infinity/knn_query_benchmark sift 200 $PWD/test/data $PWD/db_tmp | awk '/Total cost/ {total+=$4; count+=1} END {printf ("%f s", total/count)}') | tee -a $HOME/benchmark/benchmark_sift_8_threads.log

- name: Benchmark sift check 1 thread result
if: ${{ !cancelled() && !failure() }}
run: ./tools/ci_tools/check_benchmark_result.py sift_1 $HOME/benchmark/benchmark_sift_1_thread.log

- name: Benchmark sift check 8 threads result
if: ${{ !cancelled() }}
run: ./tools/ci_tools/check_benchmark_result.py sift_8 $HOME/benchmark/benchmark_sift_8_threads.log
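Note on the two "Benchmark sift search" steps above: the awk filter averages the per-repetition timings by summing the fourth field of every "Total cost" line and dividing by the count. A rough Python equivalent, shown here only as an illustration (it assumes each recorded repetition prints a line of the form "Total cost : <seconds> s", as the query benchmark emits after its warm-up passes):

# Rough Python equivalent of the awk filter in the two benchmark steps:
# average the 4th whitespace-separated field of every "Total cost" line.
import sys

total, count = 0.0, 0
for line in sys.stdin:
    if "Total cost" in line:
        total += float(line.split()[3])  # same field as awk's $4
        count += 1
print(f"{total / count:f} s")  # e.g. "0.501234 s", which tee appends to the log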
12 changes: 10 additions & 2 deletions benchmark/local_infinity/knn/knn_import_benchmark.cpp
@@ -38,8 +38,10 @@ import query_result;
using namespace infinity;

int main(int argc, char *argv[]) {
if (argc != 2) {
std::cout << "import sift or gist" << std::endl;
if (argc < 2) {
std::cout << "import sift or gist, with optional test_data_path (default to /infinity/test/data in docker) and optional infinity path "
"(default to /tmp/infinity)"
<< std::endl;
return 1;
}
bool sift = true;
@@ -49,6 +51,9 @@ int main(int argc, char *argv[]) {
sift = strcmp(argv[1], "sift") == 0;

std::string data_path = "/tmp/infinity";
if (argc >= 4) {
data_path = std::string(argv[3]);
}

LocalFileSystem fs;
if (fs.Exists(data_path)) {
@@ -71,6 +76,9 @@ int main(int argc, char *argv[]) {
// init column defs
std::shared_ptr<DataType> col1_type = nullptr;
std::string base_path = std::string(test_data_path());
if (argc >= 3) {
base_path = std::string(argv[2]);
}
std::string table_name;
if (sift) {
col1_type = std::make_shared<DataType>(LogicalType::kEmbedding, std::make_shared<EmbeddingInfo>(EmbeddingDataType::kElemFloat, 128));
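With this change the import benchmark takes two optional positional arguments after the dataset name: a test-data directory (argv[2], otherwise the compiled-in test_data_path()) and an infinity working directory (argv[3], otherwise /tmp/infinity), matching the "Import sift dataset" step in the workflow above. A minimal sketch of that argument handling, written in Python purely as an illustration of the convention (the real binary is the C++ program in this diff):

# Illustrative sketch only: mirrors the optional positional arguments of
# knn_import_benchmark (argv[1] = dataset, argv[2] = test data dir, argv[3] = infinity dir).
def parse_import_args(argv):
    if len(argv) < 2:
        raise SystemExit("usage: knn_import_benchmark sift|gist [test_data_path] [infinity_path]")
    dataset = argv[1]                                      # "sift" or "gist"
    base_path = argv[2] if len(argv) >= 3 else None        # None -> use compiled-in test_data_path()
    data_path = argv[3] if len(argv) >= 4 else "/tmp/infinity"
    return dataset, base_path, data_path

print(parse_import_args(["knn_import_benchmark", "sift", "./test/data", "./db_tmp"]))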
35 changes: 24 additions & 11 deletions benchmark/local_infinity/knn/knn_query_benchmark.cpp
@@ -57,7 +57,7 @@ std::unique_ptr<T[]> load_data(const std::string &filename, size_t &num, int &dim...
return data;
}

template <class Function>
template <typename Function>
inline void LoopFor(size_t id_begin, size_t id_end, size_t threadId, Function fn, const std::string &table_name) {
std::cout << "threadId = " << threadId << " [" << id_begin << ", " << id_end << ")" << std::endl;
std::shared_ptr<Infinity> infinity = Infinity::LocalConnect();
@@ -68,7 +68,7 @@ inline void LoopFor(size_t id_begin, size_t id_end, size_t threadId, Function fn
}
}

template <class Function>
template <typename Function>
inline void ParallelFor(size_t start, size_t end, size_t numThreads, Function fn, const std::string &table_name) {
if (numThreads <= 0) {
numThreads = std::thread::hardware_concurrency();
@@ -85,8 +85,10 @@ inline void ParallelFor(size_t start, size_t end, size_t numThreads, Function fn
}

int main(int argc, char *argv[]) {
if (argc != 3) {
std::cout << "query gist/sift ef=?" << std::endl;
if (argc < 3) {
std::cout << "query gist/sift ef=? , with optional test_data_path (default to /infinity/test/data in docker) and optional infinity path "
"(default to /tmp/infinity)"
<< std::endl;
return 1;
}
bool sift = true;
@@ -104,6 +106,9 @@ int main(int argc, char *argv[]) {
std::cin >> total_times;

std::string path = "/tmp/infinity";
if (argc >= 5) {
path = std::string(argv[4]);
}
LocalFileSystem fs;

Infinity::LocalInit(path);
@@ -113,8 +118,12 @@

std::vector<std::string> results;

std::string query_path = std::string(test_data_path());
std::string groundtruth_path = std::string(test_data_path());
std::string base_path = std::string(test_data_path());
if (argc >= 4) {
base_path = std::string(argv[3]);
}
std::string query_path = base_path;
std::string groundtruth_path = base_path;
size_t dimension = 0;
int64_t topk = 100;

@@ -177,7 +186,7 @@ int main(int argc, char *argv[]) {
}
}
}
do {
for (size_t times = 0; times < total_times + 2; ++times) {
std::cout << "--- Start to run search benchmark: " << std::endl;
std::vector<std::vector<uint64_t>> query_results(query_count);
for (auto &v : query_results) {
@@ -218,13 +227,17 @@
query_results[query_idx].emplace_back(data[i].ToUint64());
}
}
// delete[] embedding_data_ptr;
};
BaseProfiler profiler;
BaseProfiler profiler("ParallelFor");
profiler.Begin();
ParallelFor(0, query_count, thread_num, query_function, table_name);
profiler.End();
results.push_back(Format("Total cost : {}", profiler.ElapsedToString(1000)));
// skip 2 warm up loops
if (times >= 2) {
auto elapsed_ns = profiler.Elapsed();
auto elapsed_s = elapsed_ns / (1'000'000'000.0);
results.push_back(Format("Total cost : {} s", elapsed_s));
}
{
size_t correct_1 = 0, correct_10 = 0, correct_100 = 0;
for (size_t query_idx = 0; query_idx < query_count; ++query_idx) {
@@ -248,7 +261,7 @@
results.push_back(Format("R@10: {:.3f}", float(correct_10) / float(query_count * 10)));
results.push_back(Format("R@100: {:.3f}", float(correct_100) / float(query_count * 100)));
}
} while (--total_times);
}

std::cout << ">>> Query Benchmark End <<<" << std::endl;
for (const auto &item : results) {
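The loop rewritten above implements the "skip 2 warmup tests" item from the commit message: the CI pipes "1 50" or "8 50" into the binary (thread count and repeat count), the benchmark runs total_times + 2 iterations, and a "Total cost : {} s" line is recorded only once the first two warm-up passes are done. A minimal sketch of that convention, again in Python and only for illustration:

# Sketch of the warm-up convention used by the query benchmark:
# run total_times + 2 iterations, record timings only after the first two.
import time

def timed_repeats(workload, total_times=50, warmup=2):
    recorded = []
    for i in range(total_times + warmup):
        start = time.perf_counter()
        workload()
        elapsed_s = time.perf_counter() - start
        if i >= warmup:                      # skip warm-up iterations
            recorded.append(elapsed_s)
    return sum(recorded) / len(recorded)     # the average the CI's awk step reports

print(timed_repeats(lambda: sum(range(100_000)), total_times=5))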
34 changes: 34 additions & 0 deletions tools/ci_tools/check_benchmark_result.py
@@ -0,0 +1,34 @@
#!/usr/bin/env python3
# script to check benchmark result in GitHub actions
# set input parameter 1 as test type, 2 as test result file path
# if the benchmark result is better than the standard, return 0, else return 1

import sys


def main():
benchmark_bars = {"sift_1": 2.256, "sift_4": 0.869, "sift_8": 0.501}
benchmark_id = sys.argv[1]
standard = benchmark_bars[benchmark_id]
file_path = sys.argv[2]
with open(file_path, 'r') as f:
last_line = f.readlines()[-1]
print()
print("last line from log:", last_line)
result = float(last_line.split(' ')[-2])
print("average time:", result, 's')
print("required time:", standard, 's')
print()
difference_percentage = 100 * (result - standard) / standard
print("difference percentage: {:.2f}%".format(difference_percentage))
print()
if difference_percentage < 3:
print("benchmark result is acceptable")
sys.exit(0)
else:
print("benchmark result is unacceptable")
sys.exit(1)


if __name__ == '__main__':
main()
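The script reads the second-to-last whitespace-separated field of the log's last line as the measured average time, looks up the baseline for the given test id in benchmark_bars, and fails the step when the result is 3% or more slower than that baseline. A worked example of the arithmetic with a made-up measurement (the 2.256 s baseline for sift_1 comes from the table above):

# Worked example of the 3% tolerance check; the measured value is hypothetical.
standard = 2.256                                    # baseline for "sift_1"
result = 2.300                                      # parsed from the log's last line
difference_percentage = 100 * (result - standard) / standard
print(f"difference percentage: {difference_percentage:.2f}%")  # ~1.95%, below 3% -> exit code 0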
