From 3f38c3e13e02a6a259a24883a4ebfd391e747549 Mon Sep 17 00:00:00 2001
From: Alan Liddell
Date: Thu, 5 Dec 2024 13:14:00 -0500
Subject: [PATCH] add benchmark to test different
 chunk/shard/compression/storage configurations

---
 CMakeLists.txt            |   9 +-
 benchmarks/CMakeLists.txt |  18 +++
 benchmarks/benchmark.cpp  | 233 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 259 insertions(+), 1 deletion(-)
 create mode 100644 benchmarks/CMakeLists.txt
 create mode 100644 benchmarks/benchmark.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5d877cb..3b2fd0c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -21,6 +21,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
 
 option(BUILD_PYTHON "Build Python bindings" OFF)
+option(BUILD_BENCHMARK "Build benchmarks" OFF)
 
 if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
     include(CTest)
@@ -33,7 +34,13 @@ else ()
     message(STATUS "Skipping test targets")
 endif ()
 
-if (${BUILD_PYTHON})
+if (BUILD_BENCHMARK)
+    add_subdirectory(benchmarks)
+else ()
+    message(STATUS "Skipping benchmarks")
+endif ()
+
+if (BUILD_PYTHON)
     add_subdirectory(python)
 else ()
     message(STATUS "Skipping Python bindings")
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
new file mode 100644
index 0000000..80fda26
--- /dev/null
+++ b/benchmarks/CMakeLists.txt
@@ -0,0 +1,18 @@
+set(project acquire-zarr)
+
+set(tgt acquire-zarr-benchmark)
+add_executable(${tgt} benchmark.cpp)
+target_compile_definitions(${tgt} PUBLIC "TEST=\"${tgt}\"")
+set_target_properties(${tgt} PROPERTIES
+    MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>"
+)
+target_include_directories(${tgt} PRIVATE
+    ${PROJECT_SOURCE_DIR}/include
+    ${PROJECT_SOURCE_DIR}/src/logger
+)
+target_link_libraries(${tgt} PRIVATE
+    acquire-logger
+    acquire-zarr
+    nlohmann_json::nlohmann_json
+    miniocpp::miniocpp
+)
diff --git a/benchmarks/benchmark.cpp b/benchmarks/benchmark.cpp
new file mode 100644
index 0000000..69bf060
--- /dev/null
+++ b/benchmarks/benchmark.cpp
@@ -0,0 +1,233 @@
+#include "acquire.zarr.h"
+#include <chrono>
+#include <cstdlib>
+#include <filesystem>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <vector>
+
+#define DIM(name_, type_, array_size, chunk_size, shard_size)                 \
+    { .name = (name_),                                                        \
+      .type = (type_),                                                        \
+      .array_size_px = (array_size),                                          \
+      .chunk_size_px = (chunk_size),                                          \
+      .shard_size_chunks = (shard_size) }
+
+namespace fs = std::filesystem;
+
+struct ChunkConfig
+{
+    unsigned int t, c, z, y, x;
+};
+
+const std::vector<ChunkConfig> CHUNK_CONFIGS = { { 1, 1, 64, 64, 64 },
+                                                 { 1, 1, 128, 128, 128 },
+                                                 { 1, 1, 256, 256, 256 } };
+
+const unsigned int ARRAY_WIDTH = 1920, ARRAY_HEIGHT = 1080, ARRAY_PLANES = 6,
+                   ARRAY_CHANNELS = 3, ARRAY_TIMEPOINTS = 10;
+
+const unsigned int NUM_RUNS = 5;
+
+struct BenchmarkConfig
+{
+    ChunkConfig chunk;
+    int zarr_version;
+    std::string compression;
+    std::string storage;
+    unsigned int chunks_per_shard_x;
+    unsigned int chunks_per_shard_y;
+    std::string s3_endpoint;
+    std::string s3_bucket;
+    std::string s3_access_key;
+    std::string s3_secret_key;
+};
+
+class Timer
+{
+    using Clock = std::chrono::high_resolution_clock;
+    Clock::time_point start;
+
+  public:
+    Timer()
+      : start(Clock::now())
+    {
+    }
+    double elapsed()
+    {
+        auto end = Clock::now();
+        return std::chrono::duration<double>(end - start).count();
+    }
+};
+
+ZarrStream*
+setup_stream(const BenchmarkConfig& config)
+{
+    ZarrStreamSettings settings = { .store_path = "benchmark.zarr",
+                                    .s3_settings = nullptr,
+                                    .compression_settings = nullptr,
+                                    .data_type = ZarrDataType_uint16,
+                                    .version = static_cast<ZarrVersion>(
+                                      config.zarr_version) };
+
+    ZarrCompressionSettings comp_settings = {};
+    if (config.compression != "none") {
+        comp_settings.compressor = ZarrCompressor_Blosc1;
+        comp_settings.codec = config.compression == "lz4"
+                                ? ZarrCompressionCodec_BloscLZ4
+                                : ZarrCompressionCodec_BloscZstd;
+        comp_settings.level = 1;
+        comp_settings.shuffle = 1;
+        settings.compression_settings = &comp_settings;
+    }
+
+    ZarrS3Settings s3_settings = {};
+    if (config.storage == "s3") {
+        s3_settings = {
+            .endpoint = config.s3_endpoint.c_str(),
+            .bucket_name = config.s3_bucket.c_str(),
+            .access_key_id = config.s3_access_key.c_str(),
+            .secret_access_key = config.s3_secret_key.c_str(),
+        };
+        settings.s3_settings = &s3_settings;
+    }
+
+    ZarrStreamSettings_create_dimension_array(&settings, 5);
+    auto* dims = settings.dimensions;
+
+    dims[0] =
+      DIM("t", ZarrDimensionType_Time, ARRAY_TIMEPOINTS, config.chunk.t, 1);
+    dims[1] =
+      DIM("c", ZarrDimensionType_Channel, ARRAY_CHANNELS, config.chunk.c, 1);
+    dims[2] =
+      DIM("z", ZarrDimensionType_Space, ARRAY_PLANES, config.chunk.z, 1);
+    dims[3] = DIM("y",
+                  ZarrDimensionType_Space,
+                  ARRAY_HEIGHT,
+                  config.chunk.y,
+                  config.chunks_per_shard_y);
+    dims[4] = DIM("x",
+                  ZarrDimensionType_Space,
+                  ARRAY_WIDTH,
+                  config.chunk.x,
+                  config.chunks_per_shard_x);
+
+    return ZarrStream_create(&settings);
+}
+
+double
+run_benchmark(const BenchmarkConfig& config)
+{
+    auto* stream = setup_stream(config);
+    if (!stream)
+        return -1.0;
+
+    const size_t frame_size = ARRAY_WIDTH * ARRAY_HEIGHT * sizeof(uint16_t);
+    std::vector<uint16_t> frame(ARRAY_WIDTH * ARRAY_HEIGHT, 0);
+    const auto num_frames = ARRAY_PLANES * ARRAY_CHANNELS * ARRAY_TIMEPOINTS;
+
+    Timer timer;
+    size_t bytes_out;
+    for (int i = 0; i < num_frames; ++i) {
+        if (ZarrStream_append(stream, frame.data(), frame_size, &bytes_out) !=
+            ZarrStatusCode_Success) {
+            ZarrStream_destroy(stream);
+            return -1.0;
+        }
+    }
+    double elapsed = timer.elapsed();
+
+    ZarrStream_destroy(stream);
+    if (config.storage == "filesystem") {
+        fs::remove_all("benchmark.zarr");
+    }
+    return elapsed;
+}
+
+int
+main()
+{
+    std::ofstream csv("zarr_benchmarks.csv");
+    csv << "chunk_size,zarr_version,compression,storage,chunks_per_shard_y,"
+           "chunks_per_shard_x,run,time_seconds\n";
+
+    std::vector<BenchmarkConfig> configs;
+    for (const auto& chunk : CHUNK_CONFIGS) {
+
+        // V2 configurations (no sharding)
+        for (const auto& compression : { "none", "lz4", "zstd" }) {
+            configs.push_back({ chunk, 2, compression, "filesystem", 1, 1 });
+
+            if (std::getenv("ZARR_S3_ENDPOINT")) {
+                configs.push_back({ chunk,
+                                    2,
+                                    compression,
+                                    "s3",
+                                    1,
+                                    1,
+                                    std::getenv("ZARR_S3_ENDPOINT"),
+                                    std::getenv("ZARR_S3_BUCKET_NAME"),
+                                    std::getenv("ZARR_S3_ACCESS_KEY_ID"),
+                                    std::getenv("ZARR_S3_SECRET_ACCESS_KEY") });
+            }
+        }
+
+        unsigned int max_cps_y = (ARRAY_HEIGHT + chunk.y - 1) / chunk.y;
+        unsigned int max_cps_x = (ARRAY_WIDTH + chunk.x - 1) / chunk.x;
+
+        // V3 configurations (with sharding)
+        for (unsigned int cps_y = 1; cps_y <= max_cps_y; cps_y *= 2) {
+            for (unsigned int cps_x = 1; cps_x <= max_cps_x; cps_x *= 2) {
+                for (const auto& compression : { "none", "lz4", "zstd" }) {
+                    configs.push_back(
+                      { chunk, 3, compression, "filesystem", cps_x, cps_y });
+
+                    if (std::getenv("ZARR_S3_ENDPOINT")) {
+                        configs.push_back(
+                          { chunk,
+                            3,
+                            compression,
+                            "s3",
+                            cps_x,
+                            cps_y,
+                            std::getenv("ZARR_S3_ENDPOINT"),
+                            std::getenv("ZARR_S3_BUCKET_NAME"),
+                            std::getenv("ZARR_S3_ACCESS_KEY_ID"),
+                            std::getenv("ZARR_S3_SECRET_ACCESS_KEY") });
+                    }
+                }
+            }
+        }
+    }
+
+    for (const auto& config : configs) {
+        std::string chunk_str =
+          std::to_string(config.chunk.t) + "x" +
+          std::to_string(config.chunk.c) + "x" +
+          std::to_string(config.chunk.z) + "x" +
+          std::to_string(config.chunk.y) + "x" +
+          std::to_string(config.chunk.x);
+
+        for (unsigned int run = 1; run <= NUM_RUNS; ++run) {
+            std::cout << "Benchmarking " << chunk_str << " Zarr V"
+                      << config.zarr_version
+                      << ", compression: " << config.compression
+                      << ", storage: " << config.storage
+                      << ", CPS (y): " << config.chunks_per_shard_y
+                      << ", CPS (x): " << config.chunks_per_shard_x
+                      << ", (run " << run << " / " << NUM_RUNS << ")...";
+            double time = run_benchmark(config);
+            std::cout << " " << time << "s\n";
+            if (time >= 0) {
+                csv << chunk_str << "," << config.zarr_version << ","
+                    << config.compression << "," << config.storage << ","
+                    << config.chunks_per_shard_y << ","
+                    << config.chunks_per_shard_x << "," << run << ","
+                    << std::fixed << std::setprecision(3) << time << "\n";
+            }
+            csv.flush();
+        }
+    }
+
+    return 0;
+}
\ No newline at end of file
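
Building and running the benchmark (a minimal sketch, assuming a standard
out-of-source CMake build; the exact binary location depends on the generator):

    # configure with the new option enabled, then build the benchmark target
    cmake -B build -DBUILD_BENCHMARK=ON
    cmake --build build --target acquire-zarr-benchmark

Running the executable writes zarr_benchmarks.csv to the working directory.
S3 configurations are exercised only when ZARR_S3_ENDPOINT, ZARR_S3_BUCKET_NAME,
ZARR_S3_ACCESS_KEY_ID, and ZARR_S3_SECRET_ACCESS_KEY are set in the environment;
otherwise only filesystem runs are benchmarked.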