feat: add clickbench benchmark (#1304)
Adds a [ClickBench](https://github.com/ClickHouse/ClickBench) run to our
DataFusion benchmarks. ~Some queries fail, see #1326.~
Running this is pretty resource-intensive, as the dataset is ~100M
rows / ~15GB of Parquet.

---------

Co-authored-by: Robert Kruszewski <[email protected]>
Co-authored-by: Dan King <[email protected]>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Andrew Duffy <[email protected]>
Co-authored-by: Will Manning <[email protected]>
Co-authored-by: Nicholas Gates <[email protected]>
Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>
8 people authored Dec 2, 2024
1 parent 91b8001 commit 1b10f05
Showing 15 changed files with 806 additions and 208 deletions.
54 changes: 52 additions & 2 deletions .github/workflows/bench-pr.yml
@@ -10,6 +10,7 @@ permissions:
  actions: write
  contents: read
  pull-requests: write
  id-token: write

jobs:
  label_trigger:
@@ -58,9 +59,9 @@ jobs:
          RUSTFLAGS: '-C target-cpu=native'
        run: |
          cargo install cargo-criterion
          cargo criterion --bench ${{ matrix.benchmark.id }} --message-format=json 2>&1 | tee out.json
          cat out.json
          sudo apt-get update && sudo apt-get install -y jq
@@ -145,3 +146,52 @@ jobs:
          AWS_ENDPOINT: ${{ secrets.AWS_ENDPOINT }}
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
  clickbench:
    needs: label_trigger
    runs-on: self-hosted
    steps:
      - uses: actions/checkout@v4
      - uses: ./.github/actions/cleanup
      - uses: ./.github/actions/setup-rust
      - uses: spiraldb/actions/.github/actions/[email protected]

      # The ClickBench benchmark relies on DuckDB being installed to reprocess the upstream Parquet files
      - name: Install DuckDB
        uses: opt-nc/[email protected]
        if: runner.environment != 'self-hosted'
        with:
          version: v1.0.0

      - name: Set tempdir
        if: runner.environment == 'self-hosted'
        run: |
          echo "TMPDIR=/work" >> $GITHUB_ENV
      - name: Run ClickBench benchmark
        shell: bash
        env:
          BENCH_VORTEX_RATIOS: '.*'
          RUSTFLAGS: '-C target-cpu=native'
          HOME: /home/ci-runner
        run: |
          cargo run --bin clickbench --release -- -d gh-json | tee clickbench.json
      - name: Store benchmark result
        if: '!cancelled()'
        uses: benchmark-action/github-action-benchmark@v1
        with:
          name: 'Clickbench'
          tool: 'customSmallerIsBetter'
          gh-pages-branch: gh-pages-bench
          github-token: ${{ secrets.GITHUB_TOKEN }}
          output-file-path: clickbench.json
          summary-always: true
          comment-always: true
          auto-push: false
          save-data-file: false
          fail-on-alert: false
        env:
          # AWS Credentials for R2 storage tests
          AWS_BUCKET: vortex-test
          AWS_ENDPOINT: ${{ secrets.AWS_ENDPOINT }}
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
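For context on the `gh-json` output consumed above: github-action-benchmark's `customSmallerIsBetter` tool reads a JSON array of `{name, unit, value}` entries from `output-file-path`, treating lower values as better. A minimal sketch of producing that shape (the actual serialization inside the clickbench binary is not part of this diff; the entry names and units here are illustrative assumptions):

```rust
use serde_json::json;

// Sketch only: emit the array-of-results format that the
// `customSmallerIsBetter` tool of github-action-benchmark consumes.
// Entry names and units are assumed, not taken from bench-vortex output.
fn main() {
    let results = vec![
        json!({ "name": "clickbench/q-00", "unit": "ms", "value": 12.3 }),
        json!({ "name": "clickbench/q-01", "unit": "ms", "value": 45.6 }),
    ];
    println!("{}", serde_json::to_string_pretty(&results).unwrap());
}
```

With `auto-push: false` and `comment-always: true`, this PR job only comments the numbers on the pull request rather than publishing them to the gh-pages-bench branch.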
47 changes: 47 additions & 0 deletions .github/workflows/bench.yml
@@ -128,4 +128,51 @@ jobs:
          AWS_ENDPOINT: ${{ secrets.AWS_ENDPOINT }}
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
  clickbench:
    runs-on: self-hosted
    steps:
      - uses: actions/checkout@v4
      - uses: ./.github/actions/cleanup
      - uses: ./.github/actions/setup-rust
      - uses: spiraldb/actions/.github/actions/[email protected]

      # The ClickBench benchmark relies on DuckDB being installed to reprocess the upstream Parquet files
      - name: Install DuckDB
        uses: opt-nc/[email protected]
        if: runner.environment != 'self-hosted'
        with:
          version: v1.0.0

      - name: Set tempdir
        if: runner.environment == 'self-hosted'
        run: |
          echo "TMPDIR=/work" >> $GITHUB_ENV
      - name: Run ClickBench benchmark
        shell: bash
        env:
          BENCH_VORTEX_RATIOS: '.*'
          RUSTFLAGS: '-C target-cpu=native'
          HOME: /home/ci-runner
        run: |
          cargo run --bin clickbench --release -- -d gh-json | tee clickbench.json
      - name: Store benchmark result
        if: '!cancelled()'
        uses: benchmark-action/github-action-benchmark@v1
        with:
          name: 'Clickbench'
          tool: 'customSmallerIsBetter'
          gh-pages-branch: gh-pages-bench
          github-token: ${{ secrets.GITHUB_TOKEN }}
          output-file-path: clickbench.json
          summary-always: true
          auto-push: true
          fail-on-alert: false
        env:
          # AWS Credentials for R2 storage tests
          AWS_BUCKET: vortex-test
          AWS_ENDPOINT: ${{ secrets.AWS_ENDPOINT }}
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}


2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
@@ -174,7 +174,7 @@ jobs:
        with:
          version: v1.0.0
      - name: Rust Bench as test
-        run: cargo bench --bench '*[!noci]' --profile benchtest -- --test
+        run: cargo bench --bench '*[!noci|clickbench]' --profile benchtest -- --test

  generated-files:
    name: "Check generated proto/fbs files are up to date"
5 changes: 5 additions & 0 deletions bench-vortex/Cargo.toml
@@ -98,3 +98,8 @@ harness = false
name = "compressor_throughput"
test = false
harness = false

[[bench]]
name = "clickbench"
test = false
harness = false
69 changes: 69 additions & 0 deletions bench-vortex/benches/clickbench.rs
@@ -0,0 +1,69 @@
#![feature(exit_status_error)]

use std::path::PathBuf;
use std::process::Command;

use bench_vortex::clickbench::{clickbench_queries, HITS_SCHEMA};
use bench_vortex::{clickbench, execute_query, idempotent, IdempotentPath};
use criterion::{criterion_group, criterion_main, Criterion};
use datafusion::prelude::SessionContext;
use tokio::runtime::Builder;

fn benchmark(c: &mut Criterion) {
    let runtime = Builder::new_multi_thread().enable_all().build().unwrap();
    let basepath = "clickbench".to_data_path();

    // The clickbench-provided file is missing some higher-level type info, so we reprocess it
    // to add that info; see https://github.com/ClickHouse/ClickBench/issues/7.
    for idx in 0..100 {
        let output_path = basepath.join(format!("hits_{idx}.parquet"));
        idempotent(&output_path, |output_path| {
            eprintln!("Fixing parquet file {idx}");
            let command = format!(
                "
                SET home_directory='/home/ci-runner/';
                INSTALL HTTPFS;
                COPY (SELECT * REPLACE
                    (epoch_ms(EventTime * 1000) AS EventTime, \
                    epoch_ms(ClientEventTime * 1000) AS ClientEventTime, \
                    epoch_ms(LocalEventTime * 1000) AS LocalEventTime, \
                    DATE '1970-01-01' + INTERVAL (EventDate) DAYS AS EventDate) \
                FROM read_parquet('https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_{idx}.parquet', binary_as_string=True)) TO '{}' (FORMAT 'parquet');",
                output_path.to_str().unwrap()
            );
            Command::new("duckdb")
                .arg("-c")
                .arg(command)
                .status()?
                .exit_ok()?;

            anyhow::Ok(PathBuf::from(output_path))
        })
        .unwrap();
    }

    let session_context = SessionContext::new();
    let context = session_context.clone();
    runtime.block_on(async move {
        clickbench::register_vortex_file(&context, "hits", basepath.as_path(), &HITS_SCHEMA)
            .await
            .unwrap();
    });

    let mut group = c.benchmark_group("clickbench");

    for (idx, query) in clickbench_queries().into_iter() {
        let context = session_context.clone();
        group.bench_function(format!("q-{:02}", idx), |b| {
            b.to_async(&runtime)
                .iter(|| async { execute_query(&context, &query).await.unwrap() });
        });
    }
}

criterion_group!(
    name = benches;
    config = Criterion::default().sample_size(10);
    targets = benchmark
);
criterion_main!(benches);
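The `idempotent` helper above comes from the bench_vortex crate rather than this diff. The benchmark only relies on it running the closure when the output file is missing, so repeated runs reuse the already-fixed Parquet files instead of re-invoking DuckDB. A minimal sketch of that contract, assuming a plain existence check:

```rust
use std::path::{Path, PathBuf};

// Hypothetical sketch of the `idempotent` contract relied on above: run `f`
// only when `path` does not exist yet, then hand back the path. The real
// bench_vortex helper may differ (e.g. writing to a temp file and renaming
// for atomicity).
fn idempotent<E>(
    path: &Path,
    f: impl FnOnce(&Path) -> Result<PathBuf, E>,
) -> Result<PathBuf, E> {
    if path.exists() {
        return Ok(path.to_path_buf());
    }
    f(path)
}
```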
3 changes: 2 additions & 1 deletion bench-vortex/benches/tpch.rs
@@ -1,5 +1,6 @@
use bench_vortex::tpch::dbgen::{DBGen, DBGenOptions};
-use bench_vortex::tpch::{load_datasets, run_tpch_query, tpch_queries, Format};
+use bench_vortex::tpch::{load_datasets, run_tpch_query, tpch_queries};
+use bench_vortex::Format;
use criterion::{criterion_group, criterion_main, Criterion};
use tokio::runtime::Builder;

43 changes: 43 additions & 0 deletions bench-vortex/clickbench_queries.sql
@@ -0,0 +1,43 @@
SELECT COUNT(*) FROM hits;
SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0;
SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits;
SELECT AVG(UserID) FROM hits;
SELECT COUNT(DISTINCT UserID) FROM hits;
SELECT COUNT(DISTINCT SearchPhrase) FROM hits;
SELECT MIN(EventDate), MAX(EventDate) FROM hits;
SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC;
SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10;
SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10;
SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;
SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;
SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;
SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10;
SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10;
SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;
SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10;
SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;
SELECT UserID FROM hits WHERE UserID = 435090932899640449;
SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%';
SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10;
SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10;
SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10;
SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10;
SELECT CounterID, AVG(length(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;
SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;
SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits;
SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10;
SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;
SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;
SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10;
SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10;
SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10;
SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;
SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;
SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100;
SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;
SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000;
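These 43 statements map to the `q-00` through `q-42` benchmark IDs. How `clickbench_queries()` turns this file into the `(index, query)` pairs iterated in benches/clickbench.rs is not shown in the diff; a rough sketch under the assumption that it simply splits on semicolons:

```rust
use std::fs;

// Hypothetical loader for clickbench_queries.sql, yielding (index, query)
// pairs like those consumed by the Criterion benchmark above. The actual
// implementation in bench_vortex::clickbench may differ.
pub fn clickbench_queries() -> Vec<(usize, String)> {
    fs::read_to_string("bench-vortex/clickbench_queries.sql")
        .expect("failed to read queries file")
        .split(';')
        .map(str::trim)
        .filter(|stmt| !stmt.is_empty())
        .map(str::to_string)
        .enumerate()
        .collect()
}
```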
(Diff truncated: the remaining 8 changed files are not shown.)
