diff --git a/bazel/tachyon_deps.bzl b/bazel/tachyon_deps.bzl index e3fca4747..dcd0e290b 100644 --- a/bazel/tachyon_deps.bzl +++ b/bazel/tachyon_deps.bzl @@ -15,6 +15,7 @@ load("//third_party/nasm:workspace.bzl", nasm = "repo") load("//third_party/node_addon_api:install_node_addon_api.bzl", "install_node_addon_api") load("//third_party/omp:omp_configure.bzl", "omp_configure") load("//third_party/pdqsort:workspace.bzl", pdqsort = "repo") +load("//third_party/powersort:workspace.bzl", powersort = "repo") load("//third_party/py:python_configure.bzl", "python_configure") load("//third_party/rapidsnark:workspace.bzl", rapidsnark = "repo") @@ -36,6 +37,7 @@ def tachyon_deps(): json() nasm() pdqsort() + powersort() rapidsnark() install_node_addon_api(name = "node_addon_api") diff --git a/tachyon/base/BUILD.bazel b/tachyon/base/BUILD.bazel index b50d47be8..b844f6d36 100644 --- a/tachyon/base/BUILD.bazel +++ b/tachyon/base/BUILD.bazel @@ -1,5 +1,5 @@ load("//bazel:tachyon.bzl", "if_has_openmp_on_macos", "if_posix") -load("//bazel:tachyon_cc.bzl", "tachyon_cc_library", "tachyon_cc_unittest") +load("//bazel:tachyon_cc.bzl", "tachyon_cc_benchmark", "tachyon_cc_library", "tachyon_cc_unittest") package(default_visibility = ["//visibility:public"]) @@ -146,6 +146,15 @@ tachyon_cc_library( deps = [":logging"], ) +tachyon_cc_library( + name = "sort", + hdrs = ["sort.h"], + deps = [ + "@pdqsort", + "@powersort", + ], +) + tachyon_cc_library( name = "static_storage", hdrs = ["static_storage.h"], @@ -162,6 +171,16 @@ tachyon_cc_library( hdrs = ["type_list.h"], ) +tachyon_cc_benchmark( + name = "sort_benchmark", + srcs = ["sort_benchmark.cc"], + deps = [ + "//tachyon/base:random", + "//tachyon/base:sort", + "//tachyon/base/containers:container_util", + ], +) + tachyon_cc_unittest( name = "base_unittests", srcs = [ diff --git a/tachyon/base/containers/container_util.h b/tachyon/base/containers/container_util.h index 0d4daebd5..a8558818b 100644 --- a/tachyon/base/containers/container_util.h +++ b/tachyon/base/containers/container_util.h @@ -59,7 +59,7 @@ template CreateVectorParallel(size_t size, Generator&& generator) { std::vector ret(size); - OPENMP_PARALLEL_FOR(size_t i = 0; i < size; ++i) { ret[i] = generator(); } + OMP_PARALLEL_FOR(size_t i = 0; i < size; ++i) { ret[i] = generator(); } return ret; } @@ -93,7 +93,7 @@ template CreateVectorParallel(size_t size, Generator&& generator) { std::vector ret(size); - OPENMP_PARALLEL_FOR(size_t i = 0; i < size; ++i) { ret[i] = generator(i); } + OMP_PARALLEL_FOR(size_t i = 0; i < size; ++i) { ret[i] = generator(i); } return ret; } diff --git a/tachyon/base/openmp_util.h b/tachyon/base/openmp_util.h index f1ef02214..19b111a55 100644 --- a/tachyon/base/openmp_util.h +++ b/tachyon/base/openmp_util.h @@ -14,21 +14,21 @@ #if defined(TACHYON_HAS_OPENMP) #define CONSTEXPR_IF_NOT_OPENMP -#define OMP_FOR _Pragma("omp for") -#define OMP_FOR_NOWAIT _Pragma("omp for nowait") +#define OMP_FOR(expr) _Pragma("omp for") for (expr) +#define OMP_FOR_NOWAIT(expr) _Pragma("omp for nowait") for (expr) +#define OMP_NESTED_FOR(expr) _Pragma("omp for collapse(2)") for (expr) #define OMP_PARALLEL _Pragma("omp parallel") -#define OPENMP_PARALLEL_FOR(expr) _Pragma("omp parallel for") for (expr) -#define OPENMP_PARALLEL_NESTED_FOR(expr) \ +#define OMP_PARALLEL_FOR(expr) _Pragma("omp parallel for") for (expr) +#define OMP_PARALLEL_NESTED_FOR(expr) \ _Pragma("omp parallel for collapse(2)") for (expr) -#define OPENMP_FOR(expr) _Pragma("omp for") for (expr) #else #define CONSTEXPR_IF_NOT_OPENMP 
constexpr -#define OMP_FOR -#define OMP_FOR_NOWAIT +#define OMP_FOR(expr) for (expr) +#define OMP_FOR_NOWAIT(expr) for (expr) +#define OMP_NESTED_FOR(expr) for (expr) #define OMP_PARALLEL -#define OPENMP_PARALLEL_FOR(expr) for (expr) -#define OPENMP_PARALLEL_NESTED_FOR(expr) for (expr) -#define OPENMP_FOR(expr) for (expr) +#define OMP_PARALLEL_FOR(expr) for (expr) +#define OMP_PARALLEL_NESTED_FOR(expr) for (expr) #endif // defined(TACHYON_HAS_OPENMP) namespace tachyon::base { diff --git a/tachyon/base/parallelize.h b/tachyon/base/parallelize.h index a72e0efe9..2183bc868 100644 --- a/tachyon/base/parallelize.h +++ b/tachyon/base/parallelize.h @@ -11,6 +11,92 @@ #include "tachyon/base/openmp_util.h" namespace tachyon::base { +namespace internal { + +template , + typename RunType = typename FunctorTraits::RunType, + typename ArgList = internal::ExtractArgs, + typename SpanTy = internal::GetType<0, ArgList>, + size_t ArgNum = internal::GetSize> +void InvokeParallelizeCallback(Container& container, size_t i, + size_t num_chunks, size_t chunk_size, + Callable callback) { + size_t len = + i == num_chunks - 1 ? std::size(container) - i * chunk_size : chunk_size; + SpanTy chunk(std::data(container) + i * chunk_size, len); + if constexpr (ArgNum == 1) { + callback(chunk); + } else if constexpr (ArgNum == 2) { + callback(chunk, i); + } else { + static_assert(ArgNum == 3); + callback(chunk, i, chunk_size); + } +} + +template , + typename RunType = typename FunctorTraits::RunType, + typename ArgList = internal::ExtractArgs, + size_t ArgNum = internal::GetSize> +void InvokeParallelizeCallback(size_t size, size_t i, size_t num_chunks, + size_t chunk_size, Callable callback) { + size_t len = i == num_chunks - 1 ? size - i * chunk_size : chunk_size; + if constexpr (ArgNum == 1) { + callback(len); + } else if constexpr (ArgNum == 2) { + callback(len, i); + } else { + static_assert(ArgNum == 3); + callback(len, i, chunk_size); + } +} + +template , + typename RunType = typename FunctorTraits::RunType, + typename ReturnType = typename FunctorTraits::ReturnType, + typename ArgList = internal::ExtractArgs, + typename SpanTy = internal::GetType<0, ArgList>, + size_t ArgNum = internal::GetSize> +void InvokeParallelizeCallback(Container& container, size_t i, + size_t num_chunks, size_t chunk_size, + Callable callback, + std::vector& values) { + size_t len = + i == num_chunks - 1 ? std::size(container) - i * chunk_size : chunk_size; + SpanTy chunk(std::data(container) + i * chunk_size, len); + if constexpr (ArgNum == 1) { + values[i] = callback(chunk); + } else if constexpr (ArgNum == 2) { + values[i] = callback(chunk, i); + } else { + static_assert(ArgNum == 3); + values[i] = callback(chunk, i, chunk_size); + } +} + +template , + typename RunType = typename FunctorTraits::RunType, + typename ReturnType = typename FunctorTraits::ReturnType, + typename ArgList = internal::ExtractArgs, + size_t ArgNum = internal::GetSize> +void InvokeParallelizeCallback(size_t size, size_t i, size_t num_chunks, + size_t chunk_size, Callable callback, + std::vector& values) { + size_t len = i == num_chunks - 1 ? 
size - i * chunk_size : chunk_size; + if constexpr (ArgNum == 1) { + values[i] = callback(len); + } else if constexpr (ArgNum == 2) { + values[i] = callback(len, i); + } else { + static_assert(ArgNum == 3); + values[i] = callback(len, i, chunk_size); + } +} +} // namespace internal template using ParallelizeCallback1 = std::function)>; @@ -21,51 +107,35 @@ using ParallelizeCallback3 = std::function, size_t, size_t)>; // Splits the |container| by |chunk_size| and executes |callback| in parallel. // See parallelize_unittest.cc for more details. -template , - typename RunType = typename FunctorTraits::RunType, - typename ArgList = internal::ExtractArgs, - typename SpanTy = internal::GetType<0, ArgList>, - typename T = typename SpanTy::value_type, - size_t ArgNum = internal::GetSize> +template void ParallelizeByChunkSize(Container& container, size_t chunk_size, Callable callback) { if (chunk_size == 0) return; size_t num_chunks = (std::size(container) + chunk_size - 1) / chunk_size; - OPENMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { - size_t len = i == num_chunks - 1 ? std::size(container) - i * chunk_size - : chunk_size; - SpanTy chunk(std::data(container) + i * chunk_size, len); - if constexpr (ArgNum == 1) { - callback(chunk); - } else if constexpr (ArgNum == 2) { - callback(chunk, i); - } else { - static_assert(ArgNum == 3); - callback(chunk, i, chunk_size); - } + if (num_chunks == 1) { + internal::InvokeParallelizeCallback(container, 0, num_chunks, chunk_size, + callback); + return; + } + OMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { + internal::InvokeParallelizeCallback(container, i, num_chunks, chunk_size, + callback); } } // Splits the |size| by |chunk_size| and executes |callback| in parallel. -template , - typename RunType = typename FunctorTraits::RunType, - typename ArgList = internal::ExtractArgs, - size_t ArgNum = internal::GetSize> +template void ParallelizeByChunkSize(size_t size, size_t chunk_size, Callable callback) { if (chunk_size == 0) return; size_t num_chunks = (size + chunk_size - 1) / chunk_size; - OPENMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { - size_t len = i == num_chunks - 1 ? size - i * chunk_size : chunk_size; - if constexpr (ArgNum == 1) { - callback(len); - } else if constexpr (ArgNum == 2) { - callback(len, i); - } else { - static_assert(ArgNum == 3); - callback(len, i, chunk_size); - } + if (num_chunks == 1) { + internal::InvokeParallelizeCallback(size, 0, num_chunks, chunk_size, + callback); + return; + } + OMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { + internal::InvokeParallelizeCallback(size, i, num_chunks, chunk_size, + callback); } } @@ -95,29 +165,21 @@ void Parallelize(size_t size, Callable callback, template , typename RunType = typename FunctorTraits::RunType, - typename ReturnType = typename FunctorTraits::ReturnType, - typename ArgList = internal::ExtractArgs, - typename SpanTy = internal::GetType<0, ArgList>, - typename T = typename SpanTy::value_type, - size_t ArgNum = internal::GetSize> + typename ReturnType = typename FunctorTraits::ReturnType> std::vector ParallelizeMapByChunkSize(Container& container, size_t chunk_size, Callable callback) { if (chunk_size == 0) return {}; size_t num_chunks = (std::size(container) + chunk_size - 1) / chunk_size; std::vector values(num_chunks); - OPENMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { - size_t len = i == num_chunks - 1 ? 
std::size(container) - i * chunk_size - : chunk_size; - SpanTy chunk(std::data(container) + i * chunk_size, len); - if constexpr (ArgNum == 1) { - values[i] = callback(chunk); - } else if constexpr (ArgNum == 2) { - values[i] = callback(chunk, i); - } else { - static_assert(ArgNum == 3); - values[i] = callback(chunk, i, chunk_size); - } + if (num_chunks == 1) { + internal::InvokeParallelizeCallback(container, 0, num_chunks, chunk_size, + callback, values); + return values; + } + OMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { + internal::InvokeParallelizeCallback(container, i, num_chunks, chunk_size, + callback, values); } return values; } @@ -128,25 +190,21 @@ std::vector ParallelizeMapByChunkSize(Container& container, template , typename RunType = typename FunctorTraits::RunType, - typename ReturnType = typename FunctorTraits::ReturnType, - typename ArgList = internal::ExtractArgs, - size_t ArgNum = internal::GetSize> + typename ReturnType = typename FunctorTraits::ReturnType> std::vector ParallelizeMapByChunkSize(size_t size, size_t chunk_size, Callable callback) { if (chunk_size == 0) return {}; size_t num_chunks = (size + chunk_size - 1) / chunk_size; std::vector values(num_chunks); - OPENMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { - size_t len = i == num_chunks - 1 ? size - i * chunk_size : chunk_size; - if constexpr (ArgNum == 1) { - values[i] = callback(len); - } else if constexpr (ArgNum == 2) { - values[i] = callback(len, i); - } else { - static_assert(ArgNum == 3); - values[i] = callback(len, i, chunk_size); - } + if (num_chunks == 1) { + internal::InvokeParallelizeCallback(size, 0, num_chunks, chunk_size, + callback, values); + return values; + } + OMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { + internal::InvokeParallelizeCallback(size, i, num_chunks, chunk_size, + callback, values); } return values; } diff --git a/tachyon/base/sort.h b/tachyon/base/sort.h new file mode 100644 index 000000000..9f2b42d23 --- /dev/null +++ b/tachyon/base/sort.h @@ -0,0 +1,28 @@ +#ifndef TACHYON_BASE_SORT_H_ +#define TACHYON_BASE_SORT_H_ + +#include "third_party/pdqsort/include/pdqsort.h" +#include "third_party/powersort/include/sorts/powersort.h" + +namespace tachyon::base { + +template +void UnstableSort(Iter begin, Iter end) { + return pdqsort(begin, end); +} + +template +void UnstableSort(Iter begin, Iter end, Compare compare) { + return pdqsort(begin, end, compare); +} + +// TODO(chokobole): Add StableSort() with compare version. 
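For context on the TODO above: the missing overload cannot simply forward to powersort, since the vendored powersort takes no comparator (hence the reversed-operator workarounds further down in this diff). A minimal sketch of one possible stopgap, not part of this change, would fall back to std::stable_sort:

```cpp
// Hypothetical sketch only, not part of this diff: a comparator-taking
// StableSort() that falls back to std::stable_sort, since the vendored
// powersort does not accept a custom comparison.
#include <algorithm>

namespace tachyon::base {

template <typename Iter, typename Compare>
void StableSort(Iter begin, Iter end, Compare compare) {
  std::stable_sort(begin, end, compare);
}

}  // namespace tachyon::base
```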
+template +void StableSort(Iter begin, Iter end) { + algorithms::powersort sort; + sort.sort(begin, end); +} + +} // namespace tachyon::base + +#endif // TACHYON_BASE_SORT_H_ diff --git a/tachyon/base/sort_benchmark.cc b/tachyon/base/sort_benchmark.cc new file mode 100644 index 000000000..cd0fb041a --- /dev/null +++ b/tachyon/base/sort_benchmark.cc @@ -0,0 +1,259 @@ +#include "benchmark/benchmark.h" + +#include "tachyon/base/containers/container_util.h" +#include "tachyon/base/random.h" +#include "tachyon/base/sort.h" + +namespace tachyon::math { + +enum class SortMethod { + kPdq, + kPowersort, + kStdStableSort, + kStdSort, +}; + +std::vector GetData(size_t size) { + static std::map>* s_data_map = nullptr; + if (s_data_map == nullptr) { + s_data_map = new std::map>(); + } + std::vector& data = (*s_data_map)[size]; + if (data.empty()) { + data = base::CreateVector(size, [](size_t i) { + return base::Uniform(base::Range::All()); + }); + } + return data; +} + +std::vector GetPartiallySortedData(size_t size) { + static std::map>* s_data_map = nullptr; + if (s_data_map == nullptr) { + s_data_map = new std::map>(); + } + std::vector& data = (*s_data_map)[size]; + if (data.empty()) { + data = base::CreateVector(size, [](size_t i) { return uint64_t{i}; }); + size_t shuffle_count = size / 8; + for (size_t i = 0; i < shuffle_count; ++i) { + size_t idx = base::Uniform(base::Range::Until(shuffle_count)); + size_t idx2 = base::Uniform(base::Range::Until(shuffle_count)); + std::swap(data[idx], data[idx2]); + } + } + return data; +} + +template +void BM_SortRandomData(benchmark::State& state) { + std::vector data = GetData(state.range(0)); + std::vector data2 = data; + for (auto _ : state) { + if constexpr (kSortMethod == SortMethod::kPdq) { + base::UnstableSort(data2.begin(), data2.end()); + } else if constexpr (kSortMethod == SortMethod::kPowersort) { + base::StableSort(data2.begin(), data2.end()); + } else if constexpr (kSortMethod == SortMethod::kStdStableSort) { + std::stable_sort(data2.begin(), data2.end()); + } else if constexpr (kSortMethod == SortMethod::kStdSort) { + std::sort(data2.begin(), data2.end()); + } + data2 = data; + } + benchmark::DoNotOptimize(data); + benchmark::DoNotOptimize(data2); +} + +template +void BM_SortPartiallySortedData(benchmark::State& state) { + std::vector data = GetPartiallySortedData(state.range(0)); + std::vector data2 = data; + for (auto _ : state) { + if constexpr (kSortMethod == SortMethod::kPdq) { + base::UnstableSort(data2.begin(), data2.end()); + } else if constexpr (kSortMethod == SortMethod::kPowersort) { + base::StableSort(data2.begin(), data2.end()); + } else if constexpr (kSortMethod == SortMethod::kStdStableSort) { + std::stable_sort(data2.begin(), data2.end()); + } else if constexpr (kSortMethod == SortMethod::kStdSort) { + std::sort(data2.begin(), data2.end()); + } + data2 = data; + } + benchmark::DoNotOptimize(data); + benchmark::DoNotOptimize(data2); +} + +BENCHMARK_TEMPLATE(BM_SortRandomData, SortMethod::kPdq) + ->RangeMultiplier(2) + ->Range(1 << 5, 1 << 20); +BENCHMARK_TEMPLATE(BM_SortRandomData, SortMethod::kStdSort) + ->RangeMultiplier(2) + ->Range(1 << 5, 1 << 20); +BENCHMARK_TEMPLATE(BM_SortRandomData, SortMethod::kPowersort) + ->RangeMultiplier(2) + ->Range(1 << 5, 1 << 20); +BENCHMARK_TEMPLATE(BM_SortRandomData, SortMethod::kStdStableSort) + ->RangeMultiplier(2) + ->Range(1 << 5, 1 << 20); + +BENCHMARK_TEMPLATE(BM_SortPartiallySortedData, SortMethod::kPdq) + ->RangeMultiplier(2) + ->Range(1 << 5, 1 << 20); 
+BENCHMARK_TEMPLATE(BM_SortPartiallySortedData, SortMethod::kStdSort) + ->RangeMultiplier(2) + ->Range(1 << 5, 1 << 20); +BENCHMARK_TEMPLATE(BM_SortPartiallySortedData, SortMethod::kPowersort) + ->RangeMultiplier(2) + ->Range(1 << 5, 1 << 20); +BENCHMARK_TEMPLATE(BM_SortPartiallySortedData, SortMethod::kStdStableSort) + ->RangeMultiplier(2) + ->Range(1 << 5, 1 << 20); + +} // namespace tachyon::math + +// clang-format off +// Executing tests from //tachyon/base:sort_benchmark +// ----------------------------------------------------------------------------- +// 2024-08-05T09:26:02+00:00 +// Running /home/chokobole/.cache/bazel/_bazel_chokobole/234690e3562329d13f7f07caac03dae4/execroot/kroma_network_tachyon/bazel-out/k8-opt/bin/tachyon/base/sort_benchmark.runfiles/kroma_network_tachyon/tachyon/base/sort_benchmark +// Run on (32 X 5499.96 MHz CPU s) +// CPU Caches: +// L1 Data 48 KiB (x16) +// L1 Instruction 32 KiB (x16) +// L2 Unified 2048 KiB (x16) +// L3 Unified 36864 KiB (x1) +// Load Average: 2.58, 2.64, 2.46 +// --------------------------------------------------------------------------------------------------------- +// Benchmark Time CPU Iterations +// --------------------------------------------------------------------------------------------------------- +// BM_SortRandomData/32 64.5 ns 64.5 ns 10927050 +// BM_SortRandomData/64 173 ns 173 ns 4082994 +// BM_SortRandomData/128 458 ns 458 ns 1526526 +// BM_SortRandomData/256 917 ns 917 ns 767867 +// BM_SortRandomData/512 1994 ns 1994 ns 349757 +// BM_SortRandomData/1024 4694 ns 4694 ns 148456 +// BM_SortRandomData/2048 9822 ns 9822 ns 71312 +// BM_SortRandomData/4096 29664 ns 29660 ns 23724 +// BM_SortRandomData/8192 87432 ns 87430 ns 8014 +// BM_SortRandomData/16384 206323 ns 206317 ns 3389 +// BM_SortRandomData/32768 449141 ns 449055 ns 1560 +// BM_SortRandomData/65536 951382 ns 951308 ns 736 +// BM_SortRandomData/131072 1971393 ns 1971368 ns 355 +// BM_SortRandomData/262144 4258559 ns 4258174 ns 166 +// BM_SortRandomData/524288 8783038 ns 8781649 ns 71 +// BM_SortRandomData/1048576 18092094 ns 18091328 ns 39 +// BM_SortRandomData/32 65.7 ns 65.7 ns 10562672 +// BM_SortRandomData/64 135 ns 135 ns 5318656 +// BM_SortRandomData/128 369 ns 369 ns 1917457 +// BM_SortRandomData/256 780 ns 780 ns 906629 +// BM_SortRandomData/512 1720 ns 1720 ns 402973 +// BM_SortRandomData/1024 3937 ns 3937 ns 178163 +// BM_SortRandomData/2048 22613 ns 22612 ns 30836 +// BM_SortRandomData/4096 84293 ns 84293 ns 8256 +// BM_SortRandomData/8192 212055 ns 212048 ns 3296 +// BM_SortRandomData/16384 463883 ns 463879 ns 1505 +// BM_SortRandomData/32768 1011653 ns 1011613 ns 692 +// BM_SortRandomData/65536 2219845 ns 2219834 ns 315 +// BM_SortRandomData/131072 4715038 ns 4714815 ns 149 +// BM_SortRandomData/262144 10003182 ns 10002959 ns 70 +// BM_SortRandomData/524288 20982786 ns 20982067 ns 33 +// BM_SortRandomData/1048576 43885082 ns 43884019 ns 16 +// BM_SortRandomData/32 84.1 ns 84.1 ns 8342074 +// BM_SortRandomData/64 190 ns 190 ns 3655272 +// BM_SortRandomData/128 446 ns 446 ns 1580095 +// BM_SortRandomData/256 984 ns 984 ns 711206 +// BM_SortRandomData/512 2340 ns 2340 ns 302783 +// BM_SortRandomData/1024 5165 ns 5164 ns 130697 +// BM_SortRandomData/2048 33551 ns 33551 ns 21216 +// BM_SortRandomData/4096 100123 ns 100120 ns 6908 +// BM_SortRandomData/8192 246248 ns 246236 ns 2893 +// BM_SortRandomData/16384 544985 ns 544968 ns 1285 +// BM_SortRandomData/32768 1185154 ns 1185143 ns 590 +// BM_SortRandomData/65536 2544378 ns 2544265 ns 275 +// 
BM_SortRandomData/131072 5465731 ns 5465444 ns 128 +// BM_SortRandomData/262144 11856806 ns 11856582 ns 59 +// BM_SortRandomData/524288 25558533 ns 25557610 ns 27 +// BM_SortRandomData/1048576 53946952 ns 53946318 ns 12 +// BM_SortRandomData/32 106 ns 106 ns 6572790 +// BM_SortRandomData/64 233 ns 233 ns 2997692 +// BM_SortRandomData/128 497 ns 497 ns 1416719 +// BM_SortRandomData/256 1133 ns 1133 ns 614965 +// BM_SortRandomData/512 2474 ns 2473 ns 281798 +// BM_SortRandomData/1024 5541 ns 5541 ns 126485 +// BM_SortRandomData/2048 39425 ns 39424 ns 17762 +// BM_SortRandomData/4096 99459 ns 99456 ns 7006 +// BM_SortRandomData/8192 238091 ns 238082 ns 2953 +// BM_SortRandomData/16384 539281 ns 539263 ns 1300 +// BM_SortRandomData/32768 1168490 ns 1168483 ns 597 +// BM_SortRandomData/65536 2514056 ns 2513922 ns 279 +// BM_SortRandomData/131072 5325962 ns 5325863 ns 131 +// BM_SortRandomData/262144 11590090 ns 11589633 ns 61 +// BM_SortRandomData/524288 24997070 ns 24993431 ns 29 +// BM_SortRandomData/1048576 52364991 ns 52364081 ns 13 +// BM_SortPartiallySortedData/32 26.1 ns 26.1 ns 27818789 +// BM_SortPartiallySortedData/64 67.9 ns 67.9 ns 10076970 +// BM_SortPartiallySortedData/128 159 ns 159 ns 4460125 +// BM_SortPartiallySortedData/256 326 ns 326 ns 2155077 +// BM_SortPartiallySortedData/512 612 ns 612 ns 1158772 +// BM_SortPartiallySortedData/1024 1205 ns 1205 ns 587540 +// BM_SortPartiallySortedData/2048 2252 ns 2252 ns 309958 +// BM_SortPartiallySortedData/4096 4450 ns 4450 ns 157626 +// BM_SortPartiallySortedData/8192 9237 ns 9237 ns 75456 +// BM_SortPartiallySortedData/16384 19177 ns 19176 ns 36507 +// BM_SortPartiallySortedData/32768 48304 ns 48302 ns 14462 +// BM_SortPartiallySortedData/65536 124313 ns 124310 ns 5590 +// BM_SortPartiallySortedData/131072 282868 ns 282862 ns 2471 +// BM_SortPartiallySortedData/262144 748262 ns 748224 ns 930 +// BM_SortPartiallySortedData/524288 1587168 ns 1587159 ns 441 +// BM_SortPartiallySortedData/1048576 3434923 ns 3434856 ns 204 +// BM_SortPartiallySortedData/32 37.8 ns 37.8 ns 17865125 +// BM_SortPartiallySortedData/64 81.7 ns 81.7 ns 8572684 +// BM_SortPartiallySortedData/128 201 ns 201 ns 3502366 +// BM_SortPartiallySortedData/256 459 ns 459 ns 1513415 +// BM_SortPartiallySortedData/512 1124 ns 1124 ns 622606 +// BM_SortPartiallySortedData/1024 2566 ns 2566 ns 274741 +// BM_SortPartiallySortedData/2048 5570 ns 5570 ns 125864 +// BM_SortPartiallySortedData/4096 12436 ns 12435 ns 56418 +// BM_SortPartiallySortedData/8192 26709 ns 26708 ns 26237 +// BM_SortPartiallySortedData/16384 69598 ns 69597 ns 10023 +// BM_SortPartiallySortedData/32768 187689 ns 187657 ns 3598 +// BM_SortPartiallySortedData/65536 422855 ns 422759 ns 1658 +// BM_SortPartiallySortedData/131072 926691 ns 926666 ns 755 +// BM_SortPartiallySortedData/262144 2141686 ns 2141672 ns 327 +// BM_SortPartiallySortedData/524288 4528703 ns 4528676 ns 153 +// BM_SortPartiallySortedData/1048576 9690864 ns 9690796 ns 72 +// BM_SortPartiallySortedData/32 47.6 ns 47.6 ns 14566847 +// BM_SortPartiallySortedData/64 69.7 ns 69.7 ns 10073432 +// BM_SortPartiallySortedData/128 119 ns 119 ns 5879687 +// BM_SortPartiallySortedData/256 234 ns 234 ns 2997032 +// BM_SortPartiallySortedData/512 472 ns 472 ns 1489784 +// BM_SortPartiallySortedData/1024 959 ns 958 ns 729569 +// BM_SortPartiallySortedData/2048 1963 ns 1963 ns 358399 +// BM_SortPartiallySortedData/4096 4909 ns 4908 ns 142478 +// BM_SortPartiallySortedData/8192 10384 ns 10381 ns 67528 +// BM_SortPartiallySortedData/16384 43864 ns 43863 ns 
15994 +// BM_SortPartiallySortedData/32768 118543 ns 118541 ns 5922 +// BM_SortPartiallySortedData/65536 282220 ns 282215 ns 2485 +// BM_SortPartiallySortedData/131072 639465 ns 639359 ns 1093 +// BM_SortPartiallySortedData/262144 1601955 ns 1601811 ns 435 +// BM_SortPartiallySortedData/524288 3426178 ns 3425631 ns 206 +// BM_SortPartiallySortedData/1048576 7438610 ns 7438426 ns 91 +// BM_SortPartiallySortedData/32 70.3 ns 70.3 ns 10067148 +// BM_SortPartiallySortedData/64 131 ns 131 ns 5360170 +// BM_SortPartiallySortedData/128 266 ns 266 ns 2622105 +// BM_SortPartiallySortedData/256 631 ns 631 ns 1074449 +// BM_SortPartiallySortedData/512 1243 ns 1243 ns 566539 +// BM_SortPartiallySortedData/1024 2805 ns 2805 ns 249929 +// BM_SortPartiallySortedData/2048 5767 ns 5767 ns 121282 +// BM_SortPartiallySortedData/4096 12814 ns 12813 ns 55051 +// BM_SortPartiallySortedData/8192 34371 ns 34370 ns 20305 +// BM_SortPartiallySortedData/16384 95992 ns 95990 ns 7274 +// BM_SortPartiallySortedData/32768 201706 ns 201701 ns 3472 +// BM_SortPartiallySortedData/65536 453366 ns 453346 ns 1543 +// BM_SortPartiallySortedData/131072 977696 ns 977652 ns 715 +// BM_SortPartiallySortedData/262144 2291897 ns 2291858 ns 305 +// BM_SortPartiallySortedData/524288 5089614 ns 5089385 ns 137 +// BM_SortPartiallySortedData/1048576 11416225 ns 11415938 ns 60 +// clang-format on diff --git a/tachyon/crypto/commitments/fri/two_adic_fri_config.h b/tachyon/crypto/commitments/fri/two_adic_fri_config.h index 22b029fb8..59a5b17f2 100644 --- a/tachyon/crypto/commitments/fri/two_adic_fri_config.h +++ b/tachyon/crypto/commitments/fri/two_adic_fri_config.h @@ -54,7 +54,7 @@ std::vector FoldMatrix(const ExtF& beta, ExtF::GetBitRevIndexSuccessivePowers(rows, w_inv, half_beta); std::vector ret(rows); - OPENMP_PARALLEL_FOR(size_t r = 0; r < rows; ++r) { + OMP_PARALLEL_FOR(size_t r = 0; r < rows; ++r) { const ExtF& lo = mat(r, 0); const ExtF& hi = mat(r, 1); ret[r] = (one_half + powers[r]) * lo + (one_half - powers[r]) * hi; diff --git a/tachyon/crypto/commitments/fri/two_adic_fri_prover.h b/tachyon/crypto/commitments/fri/two_adic_fri_prover.h index 5267f202c..158cf2082 100644 --- a/tachyon/crypto/commitments/fri/two_adic_fri_prover.h +++ b/tachyon/crypto/commitments/fri/two_adic_fri_prover.h @@ -70,7 +70,7 @@ CommitPhaseResult CommitPhase( // |folded| will never be the size of |inputs[0]|. for (size_t i = 1; i < inputs.size(); ++i) { if (inputs[i].size() == folded.size()) { - OPENMP_PARALLEL_FOR(size_t j = 0; j < inputs[i].size(); ++j) { + OMP_PARALLEL_FOR(size_t j = 0; j < inputs[i].size(); ++j) { folded[j] += inputs[i][j]; } } @@ -81,7 +81,7 @@ CommitPhaseResult CommitPhase( VLOG(2) << "FRI(final_eval): " << final_eval.ToHexString(true); #if DCHECK_IS_ON() - OPENMP_PARALLEL_FOR(size_t i = 0; i < folded.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < folded.size(); ++i) { DCHECK_EQ(folded[i], final_eval); } #endif diff --git a/tachyon/crypto/commitments/merkle_tree/binary_merkle_tree/binary_merkle_tree.h b/tachyon/crypto/commitments/merkle_tree/binary_merkle_tree/binary_merkle_tree.h index 2e5699e42..63afb0173 100644 --- a/tachyon/crypto/commitments/merkle_tree/binary_merkle_tree/binary_merkle_tree.h +++ b/tachyon/crypto/commitments/merkle_tree/binary_merkle_tree/binary_merkle_tree.h @@ -68,8 +68,8 @@ class BinaryMerkleTree final // Finally, the remaining tree should be constructed from leaves 1 and 2. 
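Most of the mechanical churn in this diff is the rename from OPENMP_PARALLEL_FOR to OMP_PARALLEL_FOR (plus the new parenthesized OMP_FOR / OMP_NESTED_FOR forms) defined in openmp_util.h above. A condensed sketch of how the macro behaves, assuming only the TACHYON_HAS_OPENMP define from that header:

```cpp
// Condensed from openmp_util.h above: the same loop body compiles with or
// without OpenMP; only the pragma in front of the for-statement changes.
#include <cstddef>
#include <vector>

#if defined(TACHYON_HAS_OPENMP)
#define OMP_PARALLEL_FOR(expr) _Pragma("omp parallel for") for (expr)
#else
#define OMP_PARALLEL_FOR(expr) for (expr)
#endif

void DoubleAll(std::vector<int>& values) {
  // Parallel when built with OpenMP and TACHYON_HAS_OPENMP, serial otherwise.
  OMP_PARALLEL_FOR(size_t i = 0; i < values.size(); ++i) { values[i] *= 2; }
}
```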
size_t leaves_size = std::size(leaves); if (leaves_size > leaves_size_for_parallelization_) { - OPENMP_PARALLEL_FOR(size_t i = 0; i < leaves_size; - i += leaves_size_for_parallelization_) { + OMP_PARALLEL_FOR(size_t i = 0; i < leaves_size; + i += leaves_size_for_parallelization_) { size_t from = leaves_size - 1 + i; size_t to = from + leaves_size_for_parallelization_; BuildTreeFromLeaves(base::Range(from, to)); @@ -135,7 +135,7 @@ class BinaryMerkleTree final } base::CheckedNumeric n = leaves_size; storage_->Allocate(((n << 1) - 1).ValueOrDie()); - OPENMP_PARALLEL_FOR(size_t i = 0; i < leaves_size; ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < leaves_size; ++i) { storage_->SetHash(leaves_size + i - 1, hasher_->ComputeLeafHash(leaves[i])); } diff --git a/tachyon/crypto/commitments/merkle_tree/field_merkle_tree/BUILD.bazel b/tachyon/crypto/commitments/merkle_tree/field_merkle_tree/BUILD.bazel index 07cb3a0a5..0d17b5532 100644 --- a/tachyon/crypto/commitments/merkle_tree/field_merkle_tree/BUILD.bazel +++ b/tachyon/crypto/commitments/merkle_tree/field_merkle_tree/BUILD.bazel @@ -14,6 +14,7 @@ tachyon_cc_library( deps = [ "//tachyon/base:logging", "//tachyon/base:parallelize", + "//tachyon/base:sort", "//tachyon/base/containers:container_util", "//tachyon/math/finite_fields:extension_field_traits_forward", "//tachyon/math/finite_fields:finite_field_traits", @@ -31,6 +32,7 @@ tachyon_cc_library( deps = [ ":field_merkle_tree", "//tachyon/base:bits", + "//tachyon/base:sort", "//tachyon/crypto/commitments:mixed_matrix_commitment_scheme", ], ) diff --git a/tachyon/crypto/commitments/merkle_tree/field_merkle_tree/field_merkle_tree.h b/tachyon/crypto/commitments/merkle_tree/field_merkle_tree/field_merkle_tree.h index 70a177fcc..dbc57afca 100644 --- a/tachyon/crypto/commitments/merkle_tree/field_merkle_tree/field_merkle_tree.h +++ b/tachyon/crypto/commitments/merkle_tree/field_merkle_tree/field_merkle_tree.h @@ -19,6 +19,7 @@ #include "tachyon/base/containers/container_util.h" #include "tachyon/base/logging.h" #include "tachyon/base/parallelize.h" +#include "tachyon/base/sort.h" #include "tachyon/math/finite_fields/extension_field_traits_forward.h" #include "tachyon/math/finite_fields/finite_field_traits.h" #include "tachyon/math/finite_fields/packed_field_traits_forward.h" @@ -50,15 +51,11 @@ class FieldMerkleTree { std::vector>&& leaves) { CHECK(!leaves.empty()); - std::vector*> sorted_leaves = base::Map( - leaves, [](const math::RowMajorMatrix& matrix) { return &matrix; }); - // TODO(chokobole): Use https://github.com/timsort/cpp-TimSort or - // https://github.com/sebawild/powersort for better performance. 
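The replacement that follows swaps the std::stable_sort-with-lambda call for base::StableSort over RowMajorMatrixView, a pointer wrapper whose comparison operators are deliberately reversed because powersort takes no comparator. The general shape of that workaround, as a sketch with a hypothetical Item type standing in for RowMajorMatrixView / IndexedDimensions:

```cpp
// Sketch of the reversed-comparison wrapper pattern used below; `Item` is a
// hypothetical stand-in for RowMajorMatrixView / IndexedDimensions.
#include <vector>

#include "tachyon/base/sort.h"

struct Item {
  int height;
  // Reversed on purpose so a plain StableSort() yields descending heights.
  bool operator<(const Item& other) const { return height > other.height; }
  bool operator<=(const Item& other) const { return height >= other.height; }
  bool operator>(const Item& other) const { return height < other.height; }
};

void SortByDescendingHeight(std::vector<Item>& items) {
  tachyon::base::StableSort(items.begin(), items.end());
}
```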
- std::stable_sort( - sorted_leaves.begin(), sorted_leaves.end(), - [](const math::RowMajorMatrix* a, const math::RowMajorMatrix* b) { - return a->rows() > b->rows(); + std::vector sorted_leaves = + base::Map(leaves, [](const math::RowMajorMatrix& matrix) { + return RowMajorMatrixView(&matrix); }); + base::StableSort(sorted_leaves.begin(), sorted_leaves.end()); #if DCHECK_IS_ON() { @@ -81,9 +78,9 @@ class FieldMerkleTree { break; } } - absl::Span*> tallest_matrices = + absl::Span tallest_matrices = absl::MakeSpan(sorted_leaves.data(), first_layer_size); - absl::Span*> remaining_leaves = + absl::Span remaining_leaves = absl::MakeSpan(sorted_leaves.data() + first_layer_size, sorted_leaves.size() - first_layer_size); @@ -103,7 +100,7 @@ class FieldMerkleTree { break; } } - absl::Span*> matrices_to_inject; + absl::Span matrices_to_inject; if (next_layer_size > 0) { matrices_to_inject = remaining_leaves.subspan(0, next_layer_size); remaining_leaves.remove_prefix(next_layer_size); @@ -124,6 +121,32 @@ class FieldMerkleTree { const Digest& GetRoot() const { return digest_layers_.back()[0]; } private: + class RowMajorMatrixView { + public: + RowMajorMatrixView() = default; + explicit RowMajorMatrixView(const math::RowMajorMatrix* ptr) + : ptr_(ptr) {} + + // TODO(chokobole): This comparison is intentionally reversed to sort in + // descending order, as powersort doesn't accept custom callbacks. + bool operator<(const RowMajorMatrixView& other) const { + return ptr_->rows() > other.ptr_->rows(); + } + bool operator<=(const RowMajorMatrixView& other) const { + return ptr_->rows() >= other.ptr_->rows(); + } + bool operator>(const RowMajorMatrixView& other) const { + return ptr_->rows() < other.ptr_->rows(); + } + + const math::RowMajorMatrix* operator->() const { return ptr_; } + + const math::RowMajorMatrix& operator*() const { return *ptr_; } + + private: + const math::RowMajorMatrix* ptr_ = nullptr; + }; + FieldMerkleTree(std::vector>&& leaves, std::vector>&& digest_layers) : leaves_(std::move(leaves)), digest_layers_(std::move(digest_layers)) {} @@ -131,7 +154,7 @@ class FieldMerkleTree { template static std::vector CreateFirstDigestLayer( const Hasher& hasher, const PackedHasher& packed_hasher, - absl::Span*> tallest_matrices) { + absl::Span tallest_matrices) { size_t max_rows = static_cast(tallest_matrices[0]->rows()); size_t max_rows_padded = absl::bit_ceil(max_rows); @@ -143,8 +166,8 @@ class FieldMerkleTree { absl::Span chunk, size_t chunk_offset, size_t chunk_size) { size_t start = chunk_offset * chunk_size; if (chunk.size() == chunk_size) { - std::vector packed_prime_fields = base::FlatMap( - tallest_matrices, [start](const math::RowMajorMatrix* m) { + std::vector packed_prime_fields = + base::FlatMap(tallest_matrices, [start](RowMajorMatrixView m) { return math::PackRowVertically(*m, start); }); PackedDigest packed_digest = @@ -170,7 +193,7 @@ class FieldMerkleTree { const Hasher& hasher, const PackedHasher& packed_hasher, const Compressor& compressor, const PackedCompressor& packed_compressor, const std::vector& prev_layer, - absl::Span*> matrices_to_inject) { + absl::Span matrices_to_inject) { if (matrices_to_inject.empty()) return Compress(compressor, packed_compressor, prev_layer); @@ -202,7 +225,7 @@ class FieldMerkleTree { }; inputs[0] = packed_compressor.Compress(inputs); std::vector packed_prime_fields = base::FlatMap( - matrices_to_inject, [start](const math::RowMajorMatrix* m) { + matrices_to_inject, [start](RowMajorMatrixView m) { return math::PackRowVertically(*m, start); }); 
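The chunked hashing callbacks in this file (taking a chunk span, a chunk offset, and the chunk size) are the three-argument form that the parallelize.h refactor above now routes through internal::InvokeParallelizeCallback, with a direct call when there is only a single chunk. A minimal usage sketch of base::ParallelizeByChunkSize in that form, with illustrative values only:

```cpp
// Minimal usage sketch of the refactored base::ParallelizeByChunkSize
// (three-argument callback form); values and chunk size are illustrative.
#include <cstddef>
#include <vector>

#include "absl/types/span.h"
#include "tachyon/base/parallelize.h"

void AddChunkOffsets(std::vector<int>& values) {
  tachyon::base::ParallelizeByChunkSize(
      values, /*chunk_size=*/64,
      [](absl::Span<int> chunk, size_t chunk_offset, size_t chunk_size) {
        // Each chunk runs on its own OpenMP iteration; with a single chunk
        // the new fast path invokes this callback directly, skipping OpenMP.
        size_t start = chunk_offset * chunk_size;
        for (size_t i = 0; i < chunk.size(); ++i) {
          chunk[i] += static_cast<int>(start + i);
        }
      });
}
```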
inputs[1] = packed_hasher.Hash(packed_prime_fields); @@ -290,8 +313,8 @@ class FieldMerkleTree { } static std::vector GetRowAsPrimeFieldVector( - absl::Span*> matrices, size_t row) { - return base::FlatMap(matrices, [row](const math::RowMajorMatrix* m) { + absl::Span matrices, size_t row) { + return base::FlatMap(matrices, [row](RowMajorMatrixView m) { if constexpr (math::FiniteFieldTraits::kIsExtensionField) { static_assert( math::ExtensionFieldTraits::kDegreeOverBasePrimeField == diff --git a/tachyon/crypto/commitments/merkle_tree/field_merkle_tree/field_merkle_tree_mmcs.h b/tachyon/crypto/commitments/merkle_tree/field_merkle_tree/field_merkle_tree_mmcs.h index 16855e1fe..a0a69c5d7 100644 --- a/tachyon/crypto/commitments/merkle_tree/field_merkle_tree/field_merkle_tree_mmcs.h +++ b/tachyon/crypto/commitments/merkle_tree/field_merkle_tree/field_merkle_tree_mmcs.h @@ -68,6 +68,18 @@ class FieldMerkleTreeMMCS final size_t index; math::Dimensions dimensions; + // TODO(chokobole): This comparison is intentionally reversed to sort in + // descending order, as powersort doesn't accept custom callbacks. + bool operator<(const IndexedDimensions& other) const { + return dimensions.height > other.dimensions.height; + } + bool operator<=(const IndexedDimensions& other) const { + return dimensions.height >= other.dimensions.height; + } + bool operator>(const IndexedDimensions& other) const { + return dimensions.height < other.dimensions.height; + } + std::string ToString() const { return absl::Substitute("($0, $1)", index, dimensions.ToString()); } @@ -131,13 +143,8 @@ class FieldMerkleTreeMMCS final return IndexedDimensions{index, dimensions}; }); - // TODO(chokobole): Use https://github.com/timsort/cpp-TimSort or - // https://github.com/sebawild/powersort for better performance. 
- std::stable_sort( - sorted_dimensions_list.begin(), sorted_dimensions_list.end(), - [](const IndexedDimensions& a, const IndexedDimensions& b) { - return a.dimensions.height > b.dimensions.height; - }); + base::StableSort(sorted_dimensions_list.begin(), + sorted_dimensions_list.end()); absl::Span remaining_dimensions_list = absl::MakeConstSpan(sorted_dimensions_list); diff --git a/tachyon/crypto/commitments/mixed_matrix_commitment_scheme.h b/tachyon/crypto/commitments/mixed_matrix_commitment_scheme.h index d890d4d99..381087d04 100644 --- a/tachyon/crypto/commitments/mixed_matrix_commitment_scheme.h +++ b/tachyon/crypto/commitments/mixed_matrix_commitment_scheme.h @@ -28,7 +28,7 @@ class MixedMatrixCommitmentScheme { [[nodiscard]] bool Commit(const std::vector& vector, Commitment* commitment, ProverData* prover_data) { math::RowMajorMatrix matrix(vector.size(), 1); - OPENMP_PARALLEL_FOR(size_t i = 0; i < vector.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < vector.size(); ++i) { matrix(i, 0) = vector[i]; } return Commit(std::move(matrix), commitment, prover_data); diff --git a/tachyon/crypto/commitments/polynomial_openings.h b/tachyon/crypto/commitments/polynomial_openings.h index 1b47fc38f..aa64eca79 100644 --- a/tachyon/crypto/commitments/polynomial_openings.h +++ b/tachyon/crypto/commitments/polynomial_openings.h @@ -155,7 +155,7 @@ struct GroupedPolynomialOpenings { const Field& r, const std::vector& low_degree_extensions) const { // numerators: [P₀(X) - R₀(X), P₁(X) - R₁(X), P₂(X) - R₂(X)] std::vector numerators(low_degree_extensions.size()); - OPENMP_PARALLEL_FOR(size_t i = 0; i < low_degree_extensions.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < low_degree_extensions.size(); ++i) { numerators[i] = *poly_openings_vec[i].poly_oracle - low_degree_extensions[i]; } diff --git a/tachyon/crypto/sumcheck/multilinear/sumcheck_prover.h b/tachyon/crypto/sumcheck/multilinear/sumcheck_prover.h index cfeaec3bf..598a4d231 100644 --- a/tachyon/crypto/sumcheck/multilinear/sumcheck_prover.h +++ b/tachyon/crypto/sumcheck/multilinear/sumcheck_prover.h @@ -99,7 +99,7 @@ class SumcheckProver { std::vector> finished_evaluations( num_chunks, std::vector(max_evaluations_ + 1, F::Zero())); - OPENMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { size_t begin = i * chunk_size; size_t len = (i == num_chunks - 1) ? size - begin : chunk_size; std::vector intermediate_evaluations(max_evaluations_ + 1, F::Zero()); diff --git a/tachyon/crypto/sumcheck/multilinear/sumcheck_verifier.h b/tachyon/crypto/sumcheck/multilinear/sumcheck_verifier.h index f42a9e506..cfb69b08b 100644 --- a/tachyon/crypto/sumcheck/multilinear/sumcheck_verifier.h +++ b/tachyon/crypto/sumcheck/multilinear/sumcheck_verifier.h @@ -190,7 +190,7 @@ F InterpolateUniPoly(const std::vector& poly, const F& evaluation_point) { std::vector products(num_chunks, F::One()); std::vector denom_ups(num_chunks, F::One()); std::vector> list_of_evals(num_chunks); - OPENMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { size_t begin = i * chunk_size; size_t len = (i == num_chunks - 1) ? 
poly_size - begin : chunk_size; list_of_evals[i].reserve(len); diff --git a/tachyon/math/base/batch_inverse_benchmark.cc b/tachyon/math/base/batch_inverse_benchmark.cc index 33e4e0726..5f656b097 100644 --- a/tachyon/math/base/batch_inverse_benchmark.cc +++ b/tachyon/math/base/batch_inverse_benchmark.cc @@ -36,7 +36,7 @@ void BM_InverseParallelFor(benchmark::State& state) { std::vector fields = base::CreateVectorParallel( state.range(0), [](size_t i) { return F::FromBigInt(BigInt(i + 1)); }); for (auto _ : state) { - OPENMP_PARALLEL_FOR(size_t i = 0; i < fields.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < fields.size(); ++i) { CHECK(fields[i].InverseInPlace()); } } diff --git a/tachyon/math/base/groups.h b/tachyon/math/base/groups.h index a6c61ff9d..d6f251b56 100644 --- a/tachyon/math/base/groups.h +++ b/tachyon/math/base/groups.h @@ -91,7 +91,7 @@ class MultiplicativeGroup : public MultiplicativeSemigroup { size_t chunk_size = base::GetNumElementsPerThread(groups); size_t num_chunks = (size + chunk_size - 1) / chunk_size; std::atomic check_valid(true); - OPENMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { size_t len = i == num_chunks - 1 ? size - i * chunk_size : chunk_size; absl::Span groups_chunk(std::data(groups) + i * chunk_size, len); diff --git a/tachyon/math/base/semigroups.h b/tachyon/math/base/semigroups.h index f3a68c9d6..4983de25c 100644 --- a/tachyon/math/base/semigroups.h +++ b/tachyon/math/base/semigroups.h @@ -556,7 +556,7 @@ class AdditiveSemigroup { LOG(ERROR) << "scalars and bases are empty"; return false; } - OPENMP_PARALLEL_FOR(size_t i = 0; i < size; ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < size; ++i) { (*outputs)[i] = bases[i].ScalarMul(scalars[i]); } return true; @@ -572,7 +572,7 @@ class AdditiveSemigroup { LOG(ERROR) << "scalars are empty"; return false; } - OPENMP_PARALLEL_FOR(size_t i = 0; i < size; ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < size; ++i) { (*outputs)[i] = base.ScalarMul(scalars[i]); } return true; @@ -589,7 +589,7 @@ class AdditiveSemigroup { LOG(ERROR) << "bases are empty"; return false; } - OPENMP_PARALLEL_FOR(size_t i = 0; i < size; ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < size; ++i) { (*outputs)[i] = bases[i].ScalarMul(scalar); } return true; diff --git a/tachyon/math/elliptic_curves/msm/algorithms/pippenger/pippenger.h b/tachyon/math/elliptic_curves/msm/algorithms/pippenger/pippenger.h index 4c23d64ce..8cc8962f5 100644 --- a/tachyon/math/elliptic_curves/msm/algorithms/pippenger/pippenger.h +++ b/tachyon/math/elliptic_curves/msm/algorithms/pippenger/pippenger.h @@ -144,7 +144,7 @@ class Pippenger : public PippengerBase { FillDigits(scalars[i], ctx_.window_bits, &scalar_digits[i]); } if (parallel_windows_) { - OPENMP_PARALLEL_FOR(size_t i = 0; i < ctx_.window_count; ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < ctx_.window_count; ++i) { AccumulateSingleWindowNAFSum(bases_first, scalar_digits, i, &(*window_sums)[i], i == ctx_.window_count - 1); @@ -203,7 +203,7 @@ class Pippenger : public PippengerBase { absl::Span> scalars, std::vector* window_sums) { if (parallel_windows_) { - OPENMP_PARALLEL_FOR(size_t i = 0; i < ctx_.window_count; ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < ctx_.window_count; ++i) { AccumulateSingleWindowSum(bases_first, scalars, ctx_.window_bits * i, &(*window_sums)[i]); } diff --git a/tachyon/math/elliptic_curves/msm/algorithms/pippenger/pippenger_adapter.h b/tachyon/math/elliptic_curves/msm/algorithms/pippenger/pippenger_adapter.h index 
c94d146a3..f79c9147e 100644 --- a/tachyon/math/elliptic_curves/msm/algorithms/pippenger/pippenger_adapter.h +++ b/tachyon/math/elliptic_curves/msm/algorithms/pippenger/pippenger_adapter.h @@ -82,7 +82,7 @@ class PippengerAdapter { size_t num_chunks = (scalars_size + chunk_size - 1) / chunk_size; std::vector results; results.resize(num_chunks); - OPENMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { size_t start = i * chunk_size; size_t len = i == num_chunks - 1 ? scalars_size - start : chunk_size; Pippenger pippenger; diff --git a/tachyon/math/elliptic_curves/msm/fixed_base_msm.h b/tachyon/math/elliptic_curves/msm/fixed_base_msm.h index 2c09ec096..0fbdab43b 100644 --- a/tachyon/math/elliptic_curves/msm/fixed_base_msm.h +++ b/tachyon/math/elliptic_curves/msm/fixed_base_msm.h @@ -127,7 +127,7 @@ class FixedBaseMSM { LOG(ERROR) << "the size of scalar and output iterators don't match "; return false; } - OPENMP_PARALLEL_FOR(difference_type i = 0; i < size; ++i) { + OMP_PARALLEL_FOR(difference_type i = 0; i < size; ++i) { *(outputs_first + i) = ScalarMul(*(scalars_first + i)); } return true; @@ -201,7 +201,7 @@ class FixedBaseMSM { base_multiples_ = std::vector>( window_count, std::vector(window_size)); - OPENMP_PARALLEL_FOR(size_t i = 0; i < window_count; ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < window_count; ++i) { size_t cur_window_size = i == window_count - 1 ? last_window_size : window_size; diff --git a/tachyon/math/elliptic_curves/short_weierstrass/jacobian_point.h b/tachyon/math/elliptic_curves/short_weierstrass/jacobian_point.h index 5e1099fff..1915a1d02 100644 --- a/tachyon/math/elliptic_curves/short_weierstrass/jacobian_point.h +++ b/tachyon/math/elliptic_curves/short_weierstrass/jacobian_point.h @@ -120,7 +120,7 @@ class JacobianPoint< ScalarField::kParallelBatchInverseDivisorThreshold)) { size_t chunk_size = base::GetNumElementsPerThread(jacobian_points); size_t num_chunks = (size + chunk_size - 1) / chunk_size; - OPENMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { size_t len = i == num_chunks - 1 ? size - i * chunk_size : chunk_size; absl::Span> affine_points_chunk( std::data(*affine_points) + i * chunk_size, len); diff --git a/tachyon/math/elliptic_curves/short_weierstrass/point_xyzz.h b/tachyon/math/elliptic_curves/short_weierstrass/point_xyzz.h index 0b9de58f4..590516d3d 100644 --- a/tachyon/math/elliptic_curves/short_weierstrass/point_xyzz.h +++ b/tachyon/math/elliptic_curves/short_weierstrass/point_xyzz.h @@ -123,7 +123,7 @@ class PointXYZZ<_Curve, ScalarField::kParallelBatchInverseDivisorThreshold)) { size_t chunk_size = base::GetNumElementsPerThread(point_xyzzs); size_t num_chunks = (size + chunk_size - 1) / chunk_size; - OPENMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { size_t len = i == num_chunks - 1 ? 
size - i * chunk_size : chunk_size; absl::Span> affine_points_chunk( std::data(*affine_points) + i * chunk_size, len); diff --git a/tachyon/math/elliptic_curves/short_weierstrass/projective_point.h b/tachyon/math/elliptic_curves/short_weierstrass/projective_point.h index dba9e37ea..6b1ef006d 100644 --- a/tachyon/math/elliptic_curves/short_weierstrass/projective_point.h +++ b/tachyon/math/elliptic_curves/short_weierstrass/projective_point.h @@ -119,7 +119,7 @@ class ProjectivePoint< ScalarField::kParallelBatchInverseDivisorThreshold)) { size_t chunk_size = base::GetNumElementsPerThread(projective_points); size_t num_chunks = (size + chunk_size - 1) / chunk_size; - OPENMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { size_t len = i == num_chunks - 1 ? size - i * chunk_size : chunk_size; absl::Span> affine_points_chunk( std::data(*affine_points) + i * chunk_size, len); diff --git a/tachyon/math/matrix/matrix_operations.h b/tachyon/math/matrix/matrix_operations.h index d2a4d74af..19fe5a6aa 100644 --- a/tachyon/math/matrix/matrix_operations.h +++ b/tachyon/math/matrix/matrix_operations.h @@ -43,7 +43,7 @@ math::Vector MulMatVec(const Eigen::MatrixBase& matrix, static_assert(std::is_same_v); math::Vector ret = math::Vector::Constant(vector.size(), F::Zero()); - OPENMP_PARALLEL_FOR(Eigen::Index i = 0; i < matrix.rows(); ++i) { + OMP_PARALLEL_FOR(Eigen::Index i = 0; i < matrix.rows(); ++i) { for (Eigen::Index j = 0; j < matrix.cols(); ++j) { ret[i] += matrix(i, j) * vector[j]; } @@ -75,13 +75,13 @@ math::Vector MulMatVec( math::Vector ret = math::Vector::Constant(vector.size(), F::Zero()); if (vector.rows() == 1) { - OPENMP_PARALLEL_FOR(Eigen::Index i = 0; i < matrix.rows(); ++i) { + OMP_PARALLEL_FOR(Eigen::Index i = 0; i < matrix.rows(); ++i) { for (Eigen::Index j = 0; j < matrix.cols(); ++j) { ret[i] += matrix(i, j) * vector(0, j); } } } else if (vector.cols() == 1) { - OPENMP_PARALLEL_FOR(Eigen::Index i = 0; i < matrix.rows(); ++i) { + OMP_PARALLEL_FOR(Eigen::Index i = 0; i < matrix.rows(); ++i) { for (Eigen::Index j = 0; j < matrix.cols(); ++j) { ret[i] += matrix(i, j) * vector(j, 0); } @@ -127,7 +127,7 @@ math::Matrix MulMatMat(const Eigen::MatrixBase& matrix, math::Matrix ret = math::Matrix::Constant(matrix.rows(), matrix2.cols(), F::Zero()); - OPENMP_PARALLEL_FOR(Eigen::Index i = 0; i < matrix.rows(); ++i) { + OMP_PARALLEL_FOR(Eigen::Index i = 0; i < matrix.rows(); ++i) { for (Eigen::Index j = 0; j < matrix.cols(); ++j) { for (Eigen::Index k = 0; k < matrix2.cols(); ++k) { ret(i, k) += matrix(i, j) * matrix2(j, k); diff --git a/tachyon/math/matrix/matrix_utils.h b/tachyon/math/matrix/matrix_utils.h index a2aa7330f..28843f6e7 100644 --- a/tachyon/math/matrix/matrix_utils.h +++ b/tachyon/math/matrix/matrix_utils.h @@ -154,7 +154,7 @@ void ExpandInPlaceWithZeroPad(Eigen::MatrixBase& mat, Derived padded = Derived::Zero(new_rows, cols); - OPENMP_PARALLEL_FOR(Eigen::Index row = 0; row < original_rows; ++row) { + OMP_PARALLEL_FOR(Eigen::Index row = 0; row < original_rows; ++row) { Eigen::Index padded_row_index = row << added_bits; // TODO(ashjeong): Check if moved properly padded.row(padded_row_index) = std::move(mat.row(row)); @@ -173,7 +173,7 @@ void ReverseMatrixIndexBits(Eigen::MatrixBase& mat) { } uint32_t log_n = base::bits::CheckedLog2(rows); - OPENMP_PARALLEL_FOR(size_t row = 1; row < rows; ++row) { + OMP_PARALLEL_FOR(size_t row = 1; row < rows; ++row) { size_t ridx = base::bits::ReverseBitsLen(row, log_n); if (row < ridx) { 
mat.row(row).swap(mat.row(ridx)); diff --git a/tachyon/math/polynomials/multivariate/linear_combination_term.h b/tachyon/math/polynomials/multivariate/linear_combination_term.h index ac49788b1..e92dc7ac7 100644 --- a/tachyon/math/polynomials/multivariate/linear_combination_term.h +++ b/tachyon/math/polynomials/multivariate/linear_combination_term.h @@ -51,7 +51,7 @@ struct LinearCombinationTerm { size_t num_chunks = (size + chunk_size - 1) / chunk_size; std::vector sums(num_chunks, F::Zero()); - OPENMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { size_t start = i * chunk_size; size_t len = (i == num_chunks - 1) ? size - start : chunk_size; for (size_t j = start; j < start + len; ++j) { diff --git a/tachyon/math/polynomials/multivariate/multilinear_extension_ops.h b/tachyon/math/polynomials/multivariate/multilinear_extension_ops.h index 94cb6231c..40390e097 100644 --- a/tachyon/math/polynomials/multivariate/multilinear_extension_ops.h +++ b/tachyon/math/polynomials/multivariate/multilinear_extension_ops.h @@ -32,7 +32,7 @@ class MultilinearExtensionOp> { } CHECK_EQ(l_evaluations.size(), r_evaluations.size()); std::vector o_evaluations(r_evaluations.size()); - OPENMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { o_evaluations[i] = l_evaluations[i] + r_evaluations[i]; } return MultilinearExtension(D(std::move(o_evaluations))); @@ -51,7 +51,7 @@ class MultilinearExtensionOp> { return self; } CHECK_EQ(l_evaluations.size(), r_evaluations.size()); - OPENMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { l_evaluations[i] += r_evaluations[i]; } return self; @@ -71,7 +71,7 @@ class MultilinearExtensionOp> { } CHECK_EQ(l_evaluations.size(), r_evaluations.size()); std::vector o_evaluations(r_evaluations.size()); - OPENMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { o_evaluations[i] = l_evaluations[i] - r_evaluations[i]; } return MultilinearExtension(D(std::move(o_evaluations))); @@ -90,7 +90,7 @@ class MultilinearExtensionOp> { return self; } CHECK_EQ(l_evaluations.size(), r_evaluations.size()); - OPENMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { l_evaluations[i] -= r_evaluations[i]; } return self; @@ -102,7 +102,7 @@ class MultilinearExtensionOp> { return self; } std::vector o_evaluations(i_evaluations.size()); - OPENMP_PARALLEL_FOR(size_t i = 0; i < o_evaluations.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < o_evaluations.size(); ++i) { o_evaluations[i] = -i_evaluations[i]; } return MultilinearExtension(D(std::move(o_evaluations))); @@ -114,7 +114,7 @@ class MultilinearExtensionOp> { return self; } // clang-format off - OPENMP_PARALLEL_FOR(F& evaluation : evaluations) { + OMP_PARALLEL_FOR(F& evaluation : evaluations) { // clang-format on evaluation.NegateInPlace(); } @@ -131,7 +131,7 @@ class MultilinearExtensionOp> { } CHECK_EQ(l_evaluations.size(), r_evaluations.size()); std::vector o_evaluations(r_evaluations.size()); - OPENMP_PARALLEL_FOR(size_t i = 0; i < l_evaluations.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < l_evaluations.size(); ++i) { o_evaluations[i] = l_evaluations[i] * r_evaluations[i]; } return MultilinearExtension(D(std::move(o_evaluations))); @@ -151,7 +151,7 @@ class MultilinearExtensionOp> { 
return self; } CHECK_EQ(l_evaluations.size(), r_evaluations.size()); - OPENMP_PARALLEL_FOR(size_t i = 0; i < l_evaluations.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < l_evaluations.size(); ++i) { l_evaluations[i] *= r_evaluations[i]; } return self; @@ -178,7 +178,7 @@ class MultilinearExtensionOp> { } std::vector o_evaluations(r_evaluations.size()); std::atomic check_valid(true); - OPENMP_PARALLEL_FOR(size_t i = 0; i < l_evaluations.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < l_evaluations.size(); ++i) { std::optional div = l_evaluations[i] / r_evaluations[i]; if (UNLIKELY(!div)) { check_valid.store(false, std::memory_order_relaxed); @@ -214,7 +214,7 @@ class MultilinearExtensionOp> { return std::nullopt; } std::atomic check_valid(true); - OPENMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { if (UNLIKELY(!(l_evaluations[i] /= r_evaluations[i]))) check_valid.store(false, std::memory_order_relaxed); } diff --git a/tachyon/math/polynomials/multivariate/multivariate_polynomial_ops.h b/tachyon/math/polynomials/multivariate/multivariate_polynomial_ops.h index 05229c11e..c8218691a 100644 --- a/tachyon/math/polynomials/multivariate/multivariate_polynomial_ops.h +++ b/tachyon/math/polynomials/multivariate/multivariate_polynomial_ops.h @@ -79,7 +79,7 @@ class MultivariatePolynomialOp> { } const Terms& i_terms = self.coefficients_.terms_; Terms o_terms(i_terms.size()); - OPENMP_PARALLEL_FOR(size_t i = 0; i < o_terms.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < o_terms.size(); ++i) { o_terms[i] = -i_terms[i]; } return MultivariatePolynomial( @@ -93,7 +93,7 @@ class MultivariatePolynomialOp> { } Terms& terms = self.coefficients_.terms_; // clang-format off - OPENMP_PARALLEL_FOR(Term& term : terms) { term.coefficient.NegateInPlace(); } + OMP_PARALLEL_FOR(Term& term : terms) { term.coefficient.NegateInPlace(); } // clang-format on return self; } diff --git a/tachyon/math/polynomials/univariate/BUILD.bazel b/tachyon/math/polynomials/univariate/BUILD.bazel index 8c7d40b9e..cbb2e13ca 100644 --- a/tachyon/math/polynomials/univariate/BUILD.bazel +++ b/tachyon/math/polynomials/univariate/BUILD.bazel @@ -154,6 +154,7 @@ tachyon_cc_library( "//tachyon/base:logging", "//tachyon/base:optional", "//tachyon/base:parallelize", + "//tachyon/base:sort", "//tachyon/base/buffer:copyable", "//tachyon/base/containers:adapters", "//tachyon/base/containers:container_util", @@ -166,7 +167,6 @@ tachyon_cc_library( "@com_google_absl//absl/hash", "@com_google_absl//absl/numeric:bits", "@com_google_absl//absl/types:span", - "@pdqsort", ], ) diff --git a/tachyon/math/polynomials/univariate/evaluations_utils.h b/tachyon/math/polynomials/univariate/evaluations_utils.h index c34fc3965..7587ddf71 100644 --- a/tachyon/math/polynomials/univariate/evaluations_utils.h +++ b/tachyon/math/polynomials/univariate/evaluations_utils.h @@ -30,7 +30,7 @@ template void SwapBitRevElementsInPlace(Container& container, size_t size, size_t log_len) { if (size <= 1) return; - OPENMP_PARALLEL_FOR(size_t idx = 1; idx < size; ++idx) { + OMP_PARALLEL_FOR(size_t idx = 1; idx < size; ++idx) { size_t ridx = base::bits::ReverseBitsLen(idx, log_len); if (idx < ridx) { std::swap(container.at(idx), container.at(ridx)); diff --git a/tachyon/math/polynomials/univariate/lagrange_interpolation_unittest.cc b/tachyon/math/polynomials/univariate/lagrange_interpolation_unittest.cc index 979f9959d..85d968fec 100644 --- 
a/tachyon/math/polynomials/univariate/lagrange_interpolation_unittest.cc +++ b/tachyon/math/polynomials/univariate/lagrange_interpolation_unittest.cc @@ -24,7 +24,7 @@ TEST(LagrangeInterpolationTest, LagrangeInterpolate) { UnivariateDensePolynomial poly; EXPECT_TRUE(LagrangeInterpolate(points, evals, &poly)); - OPENMP_PARALLEL_FOR(size_t i = 0; i < points.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < points.size(); ++i) { EXPECT_EQ(poly.Evaluate(points[i]), evals[i]); } } diff --git a/tachyon/math/polynomials/univariate/mixed_radix_evaluation_domain.h b/tachyon/math/polynomials/univariate/mixed_radix_evaluation_domain.h index 82664495d..af4521cb9 100644 --- a/tachyon/math/polynomials/univariate/mixed_radix_evaluation_domain.h +++ b/tachyon/math/polynomials/univariate/mixed_radix_evaluation_domain.h @@ -102,7 +102,7 @@ class MixedRadixEvaluationDomain BestFFT(poly, this->group_gen_inv_); if (this->offset_.IsOne()) { // clang-format off - OPENMP_PARALLEL_FOR(F& coeff : poly.coefficients_.coefficients_) { + OMP_PARALLEL_FOR(F& coeff : poly.coefficients_.coefficients_) { // clang-format on coeff *= this->size_inv_; } diff --git a/tachyon/math/polynomials/univariate/radix2_evaluation_domain.h b/tachyon/math/polynomials/univariate/radix2_evaluation_domain.h index d350a0eba..702d4a6d3 100644 --- a/tachyon/math/polynomials/univariate/radix2_evaluation_domain.h +++ b/tachyon/math/polynomials/univariate/radix2_evaluation_domain.h @@ -137,7 +137,7 @@ class Radix2EvaluationDomain : public UnivariateEvaluationDomain, // an explanation) std::vector weights = F::GetSuccessivePowers(this->size_, shift, this->size_inv_); - OPENMP_PARALLEL_FOR(size_t row = 0; row < weights.size(); ++row) { + OMP_PARALLEL_FOR(size_t row = 0; row < weights.size(); ++row) { // Reverse bits because |mat| is encoded in bit-reversed order mat.row(base::bits::ReverseBitsLen(row, this->log_size_of_group_)) *= weights[row]; @@ -227,7 +227,7 @@ class Radix2EvaluationDomain : public UnivariateEvaluationDomain, IFFTHelperInPlace(poly); if (this->offset_.IsOne()) { // clang-format off - OPENMP_PARALLEL_FOR(F& val : poly.coefficients_.coefficients_) { + OMP_PARALLEL_FOR(F& val : poly.coefficients_.coefficients_) { // clang-format on val *= this->size_inv_; } @@ -262,8 +262,8 @@ class Radix2EvaluationDomain : public UnivariateEvaluationDomain, // Each butterfly cluster uses 2 * |gap| positions. size_t chunk_size = 2 * gap; - OPENMP_PARALLEL_NESTED_FOR(size_t i = 0; i < poly_or_evals.NumElements(); - i += chunk_size) { + OMP_PARALLEL_NESTED_FOR(size_t i = 0; i < poly_or_evals.NumElements(); + i += chunk_size) { for (size_t j = 0; j < gap; ++j) { fn(poly_or_evals.at(i + j), poly_or_evals.at(i + j + gap), roots[j]); } @@ -322,8 +322,8 @@ class Radix2EvaluationDomain : public UnivariateEvaluationDomain, } } - roots_vec_[this->log_size_of_group_ - 1] = largest; - inv_roots_vec_[0] = largest_inv; + roots_vec_[this->log_size_of_group_ - 1] = std::move(largest); + inv_roots_vec_[0] = std::move(largest_inv); // Prepare space in each vector for the others. size_t size = this->size_ / 2; @@ -334,7 +334,7 @@ class Radix2EvaluationDomain : public UnivariateEvaluationDomain, } // Assign every element based on the biggest vector. 
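Just above, assigning largest and largest_inv into roots_vec_ / inv_roots_vec_ now uses std::move, so the precomputed twiddle vectors are transferred rather than copied. A generic illustration of the difference, with illustrative types only:

```cpp
// Generic illustration: moving a std::vector transfers its buffer instead of
// copying every element; the moved-from vector is left empty but valid.
#include <utility>
#include <vector>

void MoveVsCopy() {
  std::vector<int> src(1 << 20, 7);
  std::vector<int> copied = src;            // O(n) element copy.
  std::vector<int> moved = std::move(src);  // O(1) buffer transfer.
  (void)copied;
  (void)moved;
}
```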
- OPENMP_PARALLEL_FOR(size_t i = 1; i < this->log_size_of_group_; ++i) { + OMP_PARALLEL_FOR(size_t i = 1; i < this->log_size_of_group_; ++i) { for (size_t j = 0; j < this->size_ / std::pow(2, i + 1); ++j) { size_t k = std::pow(2, i) * j; roots_vec_[this->log_size_of_group_ - i - 1][j] = roots_vec_.back()[k]; @@ -374,8 +374,8 @@ class Radix2EvaluationDomain : public UnivariateEvaluationDomain, size_t chunk_rows = 1 << mid_; // max block size: 2^|mid_| - // TODO(ashjeong): benchmark between |OPENMP_PARALLEL_FOR| here vs - // |OPENMP_PARALLEL_NESTED_FOR| in |RunDitLayers| + // TODO(ashjeong): benchmark between |OMP_PARALLEL_FOR| here vs + // |OMP_PARALLEL_NESTED_FOR| in |RunDitLayers| for (size_t block_start = 0; block_start < this->size_; block_start += chunk_rows) { size_t cur_chunk_rows = std::min(chunk_rows, this->size_ - block_start); @@ -400,8 +400,8 @@ class Radix2EvaluationDomain : public UnivariateEvaluationDomain, size_t chunk_rows = 1 << (this->log_size_of_group_ - mid_); // max block size: 2^(|this->log_size_of_group_| - |mid_|) - // TODO(ashjeong): benchmark between |OPENMP_PARALLEL_FOR| here vs - // |OPENMP_PARALLEL_NESTED_FOR| in |RunDitLayers| + // TODO(ashjeong): benchmark between |OMP_PARALLEL_FOR| here vs + // |OMP_PARALLEL_NESTED_FOR| in |RunDitLayers| for (size_t block_start = 0; block_start < this->size_; block_start += chunk_rows) { size_t thread = block_start / chunk_rows; @@ -433,8 +433,8 @@ class Radix2EvaluationDomain : public UnivariateEvaluationDomain, size_t sub_rows = static_cast(submat.rows()); DCHECK_GE(sub_rows, block_size); - OPENMP_PARALLEL_NESTED_FOR(size_t block_start = 0; block_start < sub_rows; - block_start += block_size) { + OMP_PARALLEL_NESTED_FOR(size_t block_start = 0; block_start < sub_rows; + block_start += block_size) { for (size_t i = 0; i < half_block_size; ++i) { size_t lo = block_start + i; size_t hi = lo + half_block_size; @@ -464,7 +464,7 @@ class Radix2EvaluationDomain : public UnivariateEvaluationDomain, std::vector shorts_2 = PackRowHorizontally(row_2_block, suffix_2); - OPENMP_PARALLEL_FOR(size_t i = 0; i < shorts_1.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < shorts_1.size(); ++i) { UnivariateEvaluationDomain::template ButterflyFnOutIn< PackedPrimeField>(*shorts_1[i], *shorts_2[i], packed_twiddle); } diff --git a/tachyon/math/polynomials/univariate/two_adic_subgroup.h b/tachyon/math/polynomials/univariate/two_adic_subgroup.h index 427b41fb9..cb504b002 100644 --- a/tachyon/math/polynomials/univariate/two_adic_subgroup.h +++ b/tachyon/math/polynomials/univariate/two_adic_subgroup.h @@ -50,7 +50,7 @@ class TwoAdicSubgroup { Eigen::Index cols = mat.cols(); std::vector weights = F::GetSuccessivePowers(rows, shift); - OPENMP_PARALLEL_NESTED_FOR(Eigen::Index row = 0; row < rows; ++row) { + OMP_PARALLEL_NESTED_FOR(Eigen::Index row = 0; row < rows; ++row) { for (Eigen::Index col = 0; col < cols; ++col) { mat(row, col) *= weights[row]; } diff --git a/tachyon/math/polynomials/univariate/univariate_dense_coefficients.h b/tachyon/math/polynomials/univariate/univariate_dense_coefficients.h index 812e4732c..06a5c5d0a 100644 --- a/tachyon/math/polynomials/univariate/univariate_dense_coefficients.h +++ b/tachyon/math/polynomials/univariate/univariate_dense_coefficients.h @@ -179,7 +179,7 @@ class UnivariateDenseCoefficients { Fold(const Field& r) const { size_t size = coefficients_.size(); std::vector coefficients((size + 1) >> 1); - OPENMP_PARALLEL_FOR(size_t i = 0; i < size; i += 2) { + OMP_PARALLEL_FOR(size_t i = 0; i < size; i += 2) { 
coefficients[i >> 1] = coefficients_[i + 1] * r; coefficients[i >> 1] += coefficients_[i]; } diff --git a/tachyon/math/polynomials/univariate/univariate_evaluation_domain.h b/tachyon/math/polynomials/univariate/univariate_evaluation_domain.h index b3fb88a46..c694dcd02 100644 --- a/tachyon/math/polynomials/univariate/univariate_evaluation_domain.h +++ b/tachyon/math/polynomials/univariate/univariate_evaluation_domain.h @@ -454,7 +454,7 @@ class UnivariateEvaluationDomain : public EvaluationDomain { // Invariant: |pow| = |c|*|g|ⁱ at the i-th iteration of the loop size_t size = poly_or_evals.NumElements(); size_t num_elems_per_thread = std::max(size / thread_nums, size_t{1024}); - OPENMP_PARALLEL_FOR(size_t i = 0; i < size; i += num_elems_per_thread) { + OMP_PARALLEL_FOR(size_t i = 0; i < size; i += num_elems_per_thread) { F pow = c * g.Pow(i); for (size_t j = 0; j < num_elems_per_thread; ++j) { if (i + j >= size) break; diff --git a/tachyon/math/polynomials/univariate/univariate_evaluations_ops.h b/tachyon/math/polynomials/univariate/univariate_evaluations_ops.h index 237471ea2..ea1dbcfde 100644 --- a/tachyon/math/polynomials/univariate/univariate_evaluations_ops.h +++ b/tachyon/math/polynomials/univariate/univariate_evaluations_ops.h @@ -36,7 +36,7 @@ class UnivariateEvaluationsOp { } CHECK_EQ(l_evaluations.size(), r_evaluations.size()); std::vector o_evaluations(r_evaluations.size()); - OPENMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { o_evaluations[i] = l_evaluations[i] + r_evaluations[i]; } return Poly(std::move(o_evaluations)); @@ -54,7 +54,7 @@ class UnivariateEvaluationsOp { return self; } CHECK_EQ(l_evaluations.size(), r_evaluations.size()); - OPENMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { l_evaluations[i] += r_evaluations[i]; } return self; @@ -73,7 +73,7 @@ class UnivariateEvaluationsOp { } CHECK_EQ(l_evaluations.size(), r_evaluations.size()); std::vector o_evaluations(r_evaluations.size()); - OPENMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { o_evaluations[i] = l_evaluations[i] - r_evaluations[i]; } return Poly(std::move(o_evaluations)); @@ -91,7 +91,7 @@ class UnivariateEvaluationsOp { return self; } CHECK_EQ(l_evaluations.size(), r_evaluations.size()); - OPENMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { l_evaluations[i] -= r_evaluations[i]; } return self; @@ -103,7 +103,7 @@ class UnivariateEvaluationsOp { return self; } std::vector o_evaluations(i_evaluations.size()); - OPENMP_PARALLEL_FOR(size_t i = 0; i < i_evaluations.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < i_evaluations.size(); ++i) { o_evaluations[i] = -i_evaluations[i]; } return Poly(std::move(o_evaluations)); @@ -115,7 +115,7 @@ class UnivariateEvaluationsOp { return self; } // clang-format off - OPENMP_PARALLEL_FOR(F& evaluation : evaluations) { + OMP_PARALLEL_FOR(F& evaluation : evaluations) { // clang-format on evaluation.NegateInPlace(); } @@ -131,7 +131,7 @@ class UnivariateEvaluationsOp { } CHECK_EQ(l_evaluations.size(), r_evaluations.size()); std::vector o_evaluations(r_evaluations.size()); - OPENMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { o_evaluations[i] = l_evaluations[i] * 
r_evaluations[i]; } return Poly(std::move(o_evaluations)); @@ -150,7 +150,7 @@ class UnivariateEvaluationsOp { return self; } CHECK_EQ(l_evaluations.size(), r_evaluations.size()); - OPENMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { l_evaluations[i] *= r_evaluations[i]; } return self; @@ -167,7 +167,7 @@ class UnivariateEvaluationsOp { return self; } std::vector o_evaluations(l_evaluations.size()); - OPENMP_PARALLEL_FOR(size_t i = 0; i < l_evaluations.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < l_evaluations.size(); ++i) { o_evaluations[i] = l_evaluations[i] * scalar; } return Poly(std::move(o_evaluations)); @@ -179,7 +179,7 @@ class UnivariateEvaluationsOp { // 0 * s or f(x) * 1 return self; } - OPENMP_PARALLEL_FOR(size_t i = 0; i < l_evaluations.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < l_evaluations.size(); ++i) { l_evaluations[i] *= scalar; } return self; @@ -205,7 +205,7 @@ class UnivariateEvaluationsOp { } std::vector o_evaluations(r_evaluations.size()); std::atomic check_valid(true); - OPENMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { const std::optional div = l_evaluations[i] / r_evaluations[i]; if (UNLIKELY(!div)) { check_valid.store(false, std::memory_order_relaxed); @@ -239,7 +239,7 @@ class UnivariateEvaluationsOp { return std::nullopt; } std::atomic check_valid(true); - OPENMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { if (UNLIKELY(!(l_evaluations[i] /= r_evaluations[i]))) check_valid.store(false, std::memory_order_relaxed); } diff --git a/tachyon/math/polynomials/univariate/univariate_polynomial_ops.h b/tachyon/math/polynomials/univariate/univariate_polynomial_ops.h index f2a7534e2..478529438 100644 --- a/tachyon/math/polynomials/univariate/univariate_polynomial_ops.h +++ b/tachyon/math/polynomials/univariate/univariate_polynomial_ops.h @@ -12,10 +12,9 @@ #include #include -#include "third_party/pdqsort/include/pdqsort.h" - #include "tachyon/base/openmp_util.h" #include "tachyon/base/optional.h" +#include "tachyon/base/sort.h" #include "tachyon/math/base/arithmetics_results.h" #include "tachyon/math/polynomials/univariate/univariate_polynomial.h" @@ -43,7 +42,7 @@ class UnivariatePolynomialOp> { std::vector& o_coefficients = ret.coefficients_.coefficients_; o_coefficients.resize( std::max(l_coefficients.size(), r_coefficients.size())); - OPENMP_PARALLEL_FOR(size_t i = 0; i < o_coefficients.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < o_coefficients.size(); ++i) { o_coefficients[i] = self.coefficients_[i] + other.coefficients_[i]; } @@ -63,7 +62,7 @@ class UnivariatePolynomialOp> { const std::vector& r_coefficients = other.coefficients_.coefficients_; l_coefficients.resize( std::max(l_coefficients.size(), r_coefficients.size())); - OPENMP_PARALLEL_FOR(size_t i = 0; i < r_coefficients.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < r_coefficients.size(); ++i) { l_coefficients[i] += r_coefficients[i]; } @@ -84,7 +83,7 @@ class UnivariatePolynomialOp> { UnivariatePolynomial ret; std::vector& o_coefficients = ret.coefficients_.coefficients_; o_coefficients.resize(std::max(degree, other_degree) + 1); - OPENMP_PARALLEL_FOR(size_t i = 0; i < o_coefficients.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < o_coefficients.size(); ++i) { o_coefficients[i] = self.coefficients_[i] + other.coefficients()[i]; } @@ 
-109,7 +108,7 @@ class UnivariatePolynomialOp> { std::vector& l_coefficients = self.coefficients_.coefficients_; const std::vector& r_terms = other.coefficients().terms_; - OPENMP_PARALLEL_FOR(const Term& r_term : r_terms) { + OMP_PARALLEL_FOR(const Term& r_term : r_terms) { if (r_term.degree <= degree) { l_coefficients[r_term.degree] += r_term.coefficient; } else { @@ -138,7 +137,7 @@ class UnivariatePolynomialOp> { std::vector& o_coefficients = ret.coefficients_.coefficients_; o_coefficients.resize( std::max(l_coefficients.size(), r_coefficients.size())); - OPENMP_PARALLEL_FOR(size_t i = 0; i < o_coefficients.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < o_coefficients.size(); ++i) { o_coefficients[i] = self.coefficients_[i] - other.coefficients_[i]; } @@ -158,7 +157,7 @@ class UnivariatePolynomialOp> { const std::vector& r_coefficients = other.coefficients_.coefficients_; l_coefficients.resize( std::max(l_coefficients.size(), r_coefficients.size())); - OPENMP_PARALLEL_FOR(size_t i = 0; i < r_coefficients.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < r_coefficients.size(); ++i) { l_coefficients[i] -= r_coefficients[i]; } @@ -179,7 +178,7 @@ class UnivariatePolynomialOp> { UnivariatePolynomial ret; std::vector& o_coefficients = ret.coefficients_.coefficients_; o_coefficients.resize(std::max(degree, other_degree) + 1); - OPENMP_PARALLEL_FOR(size_t i = 0; i < o_coefficients.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < o_coefficients.size(); ++i) { o_coefficients[i] = self.coefficients_[i] - other.coefficients()[i]; } @@ -204,7 +203,7 @@ class UnivariatePolynomialOp> { std::vector& l_coefficients = self.coefficients_.coefficients_; const std::vector& r_terms = other.coefficients().terms_; - OPENMP_PARALLEL_FOR(const Term& r_term : r_terms) { + OMP_PARALLEL_FOR(const Term& r_term : r_terms) { if (r_term.degree <= degree) { l_coefficients[r_term.degree] -= r_term.coefficient; } else { @@ -225,7 +224,7 @@ class UnivariatePolynomialOp> { } const std::vector& i_coefficients = self.coefficients_.coefficients_; std::vector o_coefficients(i_coefficients.size()); - OPENMP_PARALLEL_FOR(size_t i = 0; i < o_coefficients.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < o_coefficients.size(); ++i) { o_coefficients[i] = -i_coefficients[i]; } return UnivariatePolynomial(D(std::move(o_coefficients))); @@ -237,7 +236,7 @@ class UnivariatePolynomialOp> { } std::vector& coefficients = self.coefficients_.coefficients_; // clang-format off - OPENMP_PARALLEL_FOR(F& coefficient : coefficients) { + OMP_PARALLEL_FOR(F& coefficient : coefficients) { // clang-format on coefficient.NegateInPlace(); } @@ -254,7 +253,7 @@ class UnivariatePolynomialOp> { } const std::vector& l_coefficients = self.coefficients_.coefficients_; std::vector o_coefficients(l_coefficients.size()); - OPENMP_PARALLEL_FOR(size_t i = 0; i < l_coefficients.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < l_coefficients.size(); ++i) { o_coefficients[i] = l_coefficients[i] * scalar; } return UnivariatePolynomial(D(std::move(o_coefficients))); @@ -267,7 +266,7 @@ class UnivariatePolynomialOp> { } std::vector& coefficients = self.coefficients_.coefficients_; // clang-format off - OPENMP_PARALLEL_FOR(F& coefficient : coefficients) { + OMP_PARALLEL_FOR(F& coefficient : coefficients) { // clang-format on coefficient *= scalar; } @@ -446,7 +445,7 @@ class UnivariatePolynomialOp> { l_coefficients = std::vector(other.Degree() + 1); const std::vector& r_terms = other.coefficients().terms_; - OPENMP_PARALLEL_FOR(const Term& r_term : 
r_terms) { + OMP_PARALLEL_FOR(const Term& r_term : r_terms) { if constexpr (NEGATION) { l_coefficients[r_term.degree] = -r_term.coefficient; } else { @@ -762,9 +761,7 @@ class UnivariatePolynomialOp> { } size_t size = self.Degree() + 1; std::vector coefficients(size); - OPENMP_PARALLEL_FOR(size_t i = 0; i < size; ++i) { - coefficients[i] = self[i]; - } + OMP_PARALLEL_FOR(size_t i = 0; i < size; ++i) { coefficients[i] = self[i]; } return UnivariatePolynomial(D(std::move(coefficients))); } @@ -852,7 +849,7 @@ class UnivariatePolynomialOp> { } } } - pdqsort(c_terms.begin(), c_terms.end()); + base::UnstableSort(c_terms.begin(), c_terms.end()); c.coefficients_ = S(std::move(c_terms)); } }; diff --git a/tachyon/zk/air/plonky3/challenger/challenger.h b/tachyon/zk/air/plonky3/challenger/challenger.h index 0223a149f..586c96309 100644 --- a/tachyon/zk/air/plonky3/challenger/challenger.h +++ b/tachyon/zk/air/plonky3/challenger/challenger.h @@ -97,7 +97,7 @@ class Challenger { uint32_t chunk_size = range.GetSize() / thread_nums; std::vector ret(thread_nums, std::numeric_limits::max()); - OPENMP_PARALLEL_FOR(uint32_t i = 0; i < thread_nums; ++i) { + OMP_PARALLEL_FOR(uint32_t i = 0; i < thread_nums; ++i) { uint32_t start = range.from + i * chunk_size; uint32_t end = start + std::min(range.to - start, chunk_size); for (uint32_t j = start; j < end; ++j) { diff --git a/tachyon/zk/base/nested_for_loop_openmp_benchmark.cc b/tachyon/zk/base/nested_for_loop_openmp_benchmark.cc index 4da05eed6..052228e08 100644 --- a/tachyon/zk/base/nested_for_loop_openmp_benchmark.cc +++ b/tachyon/zk/base/nested_for_loop_openmp_benchmark.cc @@ -14,7 +14,7 @@ void BM_NestedForLoopParallelCols(benchmark::State& state) { return base::CreateVector(rows, []() { return F::Random(); }); }); for (auto _ : state) { - OPENMP_PARALLEL_FOR(size_t i = 0; i < cols; ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < cols; ++i) { for (size_t j = 0; j < rows; ++j) { table[i][j].DoubleInPlace(); } @@ -33,7 +33,7 @@ void BM_NestedForLoopParallelRows(benchmark::State& state) { }); for (auto _ : state) { for (size_t i = 0; i < cols; ++i) { - OPENMP_PARALLEL_FOR(size_t j = 0; j < rows; ++j) { + OMP_PARALLEL_FOR(size_t j = 0; j < rows; ++j) { table[i][j].DoubleInPlace(); } } @@ -50,7 +50,7 @@ void BM_NestedForLoopParallelCollapse(benchmark::State& state) { return base::CreateVector(rows, []() { return F::Random(); }); }); for (auto _ : state) { - OPENMP_PARALLEL_NESTED_FOR(size_t i = 0; i < cols; ++i) { + OMP_PARALLEL_NESTED_FOR(size_t i = 0; i < cols; ++i) { for (size_t j = 0; j < rows; ++j) { table[i][j].DoubleInPlace(); } diff --git a/tachyon/zk/base/parallelize_benchmark.cc b/tachyon/zk/base/parallelize_benchmark.cc index 5abcb7831..d71a6aac2 100644 --- a/tachyon/zk/base/parallelize_benchmark.cc +++ b/tachyon/zk/base/parallelize_benchmark.cc @@ -26,7 +26,7 @@ void BM_ForLoop(benchmark::State& state) { std::vector vec = base::CreateVectorParallel(n, []() { return F::Random(); }); for (auto _ : state) { - OPENMP_PARALLEL_FOR(size_t i = 0; i < n; ++i) { vec[i].DoubleInPlace(); } + OMP_PARALLEL_FOR(size_t i = 0; i < n; ++i) { vec[i].DoubleInPlace(); } } benchmark::DoNotOptimize(vec); } diff --git a/tachyon/zk/lookup/halo2/BUILD.bazel b/tachyon/zk/lookup/halo2/BUILD.bazel index 660929a40..1fbbc6f27 100644 --- a/tachyon/zk/lookup/halo2/BUILD.bazel +++ b/tachyon/zk/lookup/halo2/BUILD.bazel @@ -22,6 +22,7 @@ tachyon_cc_library( name = "permute_expression_pair", hdrs = ["permute_expression_pair.h"], deps = [ + "//tachyon/base:sort", 
"//tachyon/zk/base/entities:prover_base", "//tachyon/zk/lookup:pair", "@com_google_absl//absl/container:btree", diff --git a/tachyon/zk/lookup/halo2/permute_expression_pair.h b/tachyon/zk/lookup/halo2/permute_expression_pair.h index 21708a4c4..15de2fe94 100644 --- a/tachyon/zk/lookup/halo2/permute_expression_pair.h +++ b/tachyon/zk/lookup/halo2/permute_expression_pair.h @@ -12,8 +12,8 @@ #include #include "absl/container/btree_map.h" -#include "third_party/pdqsort/include/pdqsort.h" +#include "tachyon/base/sort.h" #include "tachyon/zk/base/entities/prover_base.h" #include "tachyon/zk/lookup/pair.h" @@ -35,8 +35,8 @@ template std::vector permuted_input_expressions = in.input().evaluations(); // sort input lookup expression values - pdqsort(permuted_input_expressions.begin(), - permuted_input_expressions.begin() + usable_rows); + base::UnstableSort(permuted_input_expressions.begin(), + permuted_input_expressions.begin() + usable_rows); // a map of each unique element in the table expression and its count absl::btree_map leftover_table_map; diff --git a/tachyon/zk/lookup/log_derivative_halo2/BUILD.bazel b/tachyon/zk/lookup/log_derivative_halo2/BUILD.bazel index 245132416..2e0302c20 100644 --- a/tachyon/zk/lookup/log_derivative_halo2/BUILD.bazel +++ b/tachyon/zk/lookup/log_derivative_halo2/BUILD.bazel @@ -22,6 +22,7 @@ tachyon_cc_library( deps = [ "//tachyon/base:parallelize", "//tachyon/base:ref", + "//tachyon/base:sort", "//tachyon/base/containers:container_util", "//tachyon/crypto/commitments:polynomial_openings", "//tachyon/zk/base/entities:prover_base", diff --git a/tachyon/zk/lookup/log_derivative_halo2/evaluator.h b/tachyon/zk/lookup/log_derivative_halo2/evaluator.h index 0ba455ccc..6a4c706b5 100644 --- a/tachyon/zk/lookup/log_derivative_halo2/evaluator.h +++ b/tachyon/zk/lookup/log_derivative_halo2/evaluator.h @@ -110,19 +110,28 @@ class Evaluator { // = Σᵢ(τ(X) * Π_{j != i} φⱼ(X)) - m(X) * Π(φᵢ(X)) // // (1 - (l_last(X) + l_blind(X))) * (LHS - RHS) = 0 + std::vector inputs_value; size_t start = chunk_offset * chunk_size; for (size_t idx = 0; idx < chunk.size(); ++idx) { size_t cur_idx = start + idx; // φᵢ(X) = fᵢ(X) + β - std::vector inputs_value = base::Map( - inputs_eval_data, - [&inputs_evaluator, &cur_idx]( - size_t i, - plonk::EvaluationInput& input_eval_data) { - return inputs_evaluator[i].Evaluate(input_eval_data, cur_idx, - /*scale=*/1, F::Zero()); - }); + if (idx == 0) { + inputs_value = base::Map( + inputs_eval_data, + [&inputs_evaluator, &cur_idx]( + size_t i, plonk::EvaluationInput& + input_eval_data) { + return inputs_evaluator[i].Evaluate(input_eval_data, cur_idx, + /*scale=*/1, F::Zero()); + }); + } else { + for (size_t i = 0; i < inputs_value.size(); ++i) { + inputs_value[i] = + inputs_evaluator[i].Evaluate(inputs_eval_data[i], cur_idx, + /*scale=*/1, F::Zero()); + } + } // Π(φᵢ(X)) F inputs_prod = std::accumulate( diff --git a/tachyon/zk/lookup/log_derivative_halo2/prover.h b/tachyon/zk/lookup/log_derivative_halo2/prover.h index efb3094dc..8742e99d9 100644 --- a/tachyon/zk/lookup/log_derivative_halo2/prover.h +++ b/tachyon/zk/lookup/log_derivative_halo2/prover.h @@ -32,6 +32,12 @@ struct TableEvalWithIndex { bool operator<(const TableEvalWithIndex& other) const { return eval < other.eval; } + bool operator<=(const TableEvalWithIndex& other) const { + return eval <= other.eval; + } + bool operator>(const TableEvalWithIndex& other) const { + return eval > other.eval; + } }; template @@ -41,7 +47,7 @@ struct ComputeMPolysTempStorage { explicit 
ComputeMPolysTempStorage(size_t usable_rows) : sorted_table_with_indices(usable_rows), m_values_atomic(usable_rows) { - OPENMP_PARALLEL_FOR(RowIndex i = 0; i < usable_rows; ++i) { + OMP_PARALLEL_FOR(RowIndex i = 0; i < usable_rows; ++i) { m_values_atomic[i] = 0; } } diff --git a/tachyon/zk/lookup/log_derivative_halo2/prover_impl.h b/tachyon/zk/lookup/log_derivative_halo2/prover_impl.h index fa29a8cec..5451087da 100644 --- a/tachyon/zk/lookup/log_derivative_halo2/prover_impl.h +++ b/tachyon/zk/lookup/log_derivative_halo2/prover_impl.h @@ -14,6 +14,7 @@ #include "tachyon/base/containers/container_util.h" #include "tachyon/base/parallelize.h" #include "tachyon/base/ref.h" +#include "tachyon/base/sort.h" #include "tachyon/zk/lookup/log_derivative_halo2/prover.h" #include "tachyon/zk/plonk/expressions/compress_expression.h" @@ -97,33 +98,33 @@ BlindedPolynomial Prover::ComputeMPoly( const Evals& compressed_table, ComputeMPolysTempStorage& storage) { RowIndex usable_rows = prover->GetUsableRows(); - OPENMP_PARALLEL_FOR(RowIndex i = 0; i < usable_rows; ++i) { + OMP_PARALLEL_FOR(RowIndex i = 0; i < usable_rows; ++i) { storage.sorted_table_with_indices[i] = {i, compressed_table[i].ToBigInt()}; } - // TODO(chokobole): Use https://github.com/timsort/cpp-TimSort or - // https://github.com/sebawild/powersort for better performance. - std::stable_sort(storage.sorted_table_with_indices.begin(), + base::StableSort(storage.sorted_table_with_indices.begin(), storage.sorted_table_with_indices.end()); - OPENMP_PARALLEL_NESTED_FOR(size_t i = 0; i < compressed_inputs.size(); ++i) { - for (RowIndex j = 0; j < usable_rows; ++j) { - BigInt input = compressed_inputs[i][j].ToBigInt(); - auto it = base::BinarySearchByKey( - storage.sorted_table_with_indices.begin(), - storage.sorted_table_with_indices.end(), input, LessThan{}); - if (it != storage.sorted_table_with_indices.end()) { - storage.m_values_atomic[it->index].fetch_add(1, - std::memory_order_relaxed); + std::vector m_values(prover->pcs().N()); + OMP_PARALLEL { + OMP_NESTED_FOR(size_t i = 0; i < compressed_inputs.size(); ++i) { + for (RowIndex j = 0; j < usable_rows; ++j) { + BigInt input = compressed_inputs[i][j].ToBigInt(); + auto it = base::BinarySearchByKey( + storage.sorted_table_with_indices.begin(), + storage.sorted_table_with_indices.end(), input, LessThan{}); + if (it != storage.sorted_table_with_indices.end()) { + storage.m_values_atomic[it->index].fetch_add( + 1, std::memory_order_relaxed); + } } } - } - // Convert atomic |m_values| to |Evals|. - std::vector m_values(prover->pcs().N()); - OPENMP_PARALLEL_FOR(RowIndex i = 0; i < usable_rows; ++i) { - m_values[i] = - F(storage.m_values_atomic[i].exchange(0, std::memory_order_relaxed)); + // Convert atomic |m_values| to |Evals|. 
+ OMP_FOR(RowIndex i = 0; i < usable_rows; ++i) { + m_values[i] = + F(storage.m_values_atomic[i].exchange(0, std::memory_order_relaxed)); + } } BlindedPolynomial m_poly(Evals(std::move(m_values)), @@ -206,11 +207,11 @@ BlindedPolynomial Prover::CreateGrandSumPoly( ComputeLogDerivatives(compressed_inputs[i], beta, input_log_derivatives); if (i == 0) { - OPENMP_PARALLEL_FOR(size_t j = 0; j < usable_rows; ++j) { + OMP_PARALLEL_FOR(size_t j = 0; j < usable_rows; ++j) { storage.inputs_log_derivatives[j] = input_log_derivatives[j]; } } else { - OPENMP_PARALLEL_FOR(size_t j = 0; j < usable_rows; ++j) { + OMP_PARALLEL_FOR(size_t j = 0; j < usable_rows; ++j) { storage.inputs_log_derivatives[j] += input_log_derivatives[j]; } } @@ -227,7 +228,7 @@ BlindedPolynomial Prover::CreateGrandSumPoly( // |storage.inputs_log_derivatives| since the current values of // |storage.inputs_log_derivatives| are not needed anymore. std::vector& log_derivatives_diff = storage.inputs_log_derivatives; - OPENMP_PARALLEL_FOR(size_t i = 0; i < usable_rows; ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < usable_rows; ++i) { log_derivatives_diff[i] -= m_values[i] * storage.table_log_derivatives[i]; if (i != usable_rows - 1) { grand_sum[i + 1] = log_derivatives_diff[i]; diff --git a/tachyon/zk/plonk/expressions/compress_expression.h b/tachyon/zk/plonk/expressions/compress_expression.h index b5de054ef..2d1b80217 100644 --- a/tachyon/zk/plonk/expressions/compress_expression.h +++ b/tachyon/zk/plonk/expressions/compress_expression.h @@ -27,13 +27,13 @@ Evals CompressExpressions( for (size_t expr_idx = 0; expr_idx < expressions.size(); ++expr_idx) { if (UNLIKELY(expr_idx == 0)) { - OPENMP_PARALLEL_FOR(size_t i = 0; i < compressed_values.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < compressed_values.size(); ++i) { ProvingEvaluator evaluator = evaluator_tpl; evaluator.set_idx(i); compressed_values[i] = evaluator.Evaluate(expressions[expr_idx].get()); } } else { - OPENMP_PARALLEL_FOR(size_t i = 0; i < compressed_values.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < compressed_values.size(); ++i) { ProvingEvaluator evaluator = evaluator_tpl; evaluator.set_idx(i); compressed_values[i] *= theta; diff --git a/tachyon/zk/plonk/keys/proving_key.h b/tachyon/zk/plonk/keys/proving_key.h index bcfbd975d..c85fab4aa 100644 --- a/tachyon/zk/plonk/keys/proving_key.h +++ b/tachyon/zk/plonk/keys/proving_key.h @@ -164,7 +164,7 @@ class ProvingKey : public Key { // | 5 | 0 | // | 6 | 0 | // | 7 | 0 | - OPENMP_PARALLEL_FOR(size_t i = 0; i < usable_rows; ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < usable_rows; ++i) { // NOTE(chokobole): It's safe to access since we created |domain->size()| // |evals|, which is greater than |usable_rows|. 
evals.at(i) = F::One(); diff --git a/tachyon/zk/plonk/layout/floor_planner/v1/BUILD.bazel b/tachyon/zk/plonk/layout/floor_planner/v1/BUILD.bazel index ed935b9cf..f6ab49f06 100644 --- a/tachyon/zk/plonk/layout/floor_planner/v1/BUILD.bazel +++ b/tachyon/zk/plonk/layout/floor_planner/v1/BUILD.bazel @@ -67,12 +67,12 @@ tachyon_cc_library( deps = [ "//tachyon:export", "//tachyon/base:logging", + "//tachyon/base:sort", "//tachyon/base/containers:container_util", "//tachyon/zk/plonk/base:column_key", "//tachyon/zk/plonk/base:column_type", "//tachyon/zk/plonk/layout:region_shape", "//tachyon/zk/plonk/layout/floor_planner:allocations", "@com_google_absl//absl/container:flat_hash_map", - "@pdqsort", ], ) diff --git a/tachyon/zk/plonk/layout/floor_planner/v1/v1_strategy.h b/tachyon/zk/plonk/layout/floor_planner/v1/v1_strategy.h index 9e2391cbf..7412f6d84 100644 --- a/tachyon/zk/plonk/layout/floor_planner/v1/v1_strategy.h +++ b/tachyon/zk/plonk/layout/floor_planner/v1/v1_strategy.h @@ -15,10 +15,10 @@ #include #include "absl/container/flat_hash_map.h" -#include "third_party/pdqsort/include/pdqsort.h" #include "tachyon/base/containers/container_util.h" #include "tachyon/base/logging.h" +#include "tachyon/base/sort.h" #include "tachyon/export.h" #include "tachyon/zk/plonk/base/column_key.h" #include "tachyon/zk/plonk/base/column_type.h" @@ -72,10 +72,10 @@ SlotInResult SlotIn(std::vector>& region_shapes) { // - The sort order relies on Column's Ord implementation! std::vector region_columns(region.columns().begin(), region.columns().end()); - pdqsort(region_columns.begin(), region_columns.end(), - [](const RegionColumn& lhs, const RegionColumn& rhs) { - return lhs < rhs; - }); + base::UnstableSort(region_columns.begin(), region_columns.end(), + [](const RegionColumn& lhs, const RegionColumn& rhs) { + return lhs < rhs; + }); std::optional region_start = FirstFitRegion(&column_allocations, region_columns, region.row_count(), @@ -102,41 +102,43 @@ SlotInBiggestAdviceFirstResult SlotInBiggestAdviceFirst( // NOTE(TomTaehoonKim): Sorted result might be different from the original // See // https://github.com/kroma-network/halo2/blob/7d0a369/halo2_proofs/src/layout/floor_planner/v1/strategy.rs#L202-L215 - pdqsort(sorted_regions.begin(), sorted_regions.end(), - [](const RegionShape& lhs, const RegionShape& rhs) { - // Count the number of advice columns - size_t lhs_advice_cols = 0; - for (const RegionColumn& column : lhs.columns()) { - if (column.type() == RegionColumn::Type::kColumn) { - const AnyColumnKey& c = column.column(); - if (c.type() == ColumnType::kAdvice) { - ++lhs_advice_cols; - } - } - } - size_t rhs_advice_cols = 0; - for (const RegionColumn& column : rhs.columns()) { - if (column.type() == RegionColumn::Type::kColumn) { - const AnyColumnKey& c = column.column(); - if (c.type() == ColumnType::kAdvice) { - ++rhs_advice_cols; - } - } - } - // Sort by advice area (since this has the most contention). 
- return lhs_advice_cols * lhs.row_count() < - rhs_advice_cols * rhs.row_count(); - }); + base::UnstableSort(sorted_regions.begin(), sorted_regions.end(), + [](const RegionShape& lhs, const RegionShape& rhs) { + // Count the number of advice columns + size_t lhs_advice_cols = 0; + for (const RegionColumn& column : lhs.columns()) { + if (column.type() == RegionColumn::Type::kColumn) { + const AnyColumnKey& c = column.column(); + if (c.type() == ColumnType::kAdvice) { + ++lhs_advice_cols; + } + } + } + size_t rhs_advice_cols = 0; + for (const RegionColumn& column : rhs.columns()) { + if (column.type() == RegionColumn::Type::kColumn) { + const AnyColumnKey& c = column.column(); + if (c.type() == ColumnType::kAdvice) { + ++rhs_advice_cols; + } + } + } + // Sort by advice area (since this has the most + // contention). + return lhs_advice_cols * lhs.row_count() < + rhs_advice_cols * rhs.row_count(); + }); std::reverse(sorted_regions.begin(), sorted_regions.end()); // Lay out the sorted regions. SlotInResult result = SlotIn(sorted_regions); // Un-sort the regions so they match the original indexing. - pdqsort(result.regions.begin(), result.regions.end(), - [](const RegionInfo& lhs, const RegionInfo& rhs) { - return lhs.region.region_index() < rhs.region.region_index(); - }); + base::UnstableSort(result.regions.begin(), result.regions.end(), + [](const RegionInfo& lhs, const RegionInfo& rhs) { + return lhs.region.region_index() < + rhs.region.region_index(); + }); std::vector region_starts = base::Map( result.regions, [](const RegionInfo& region) { return region.region_start; }); diff --git a/tachyon/zk/plonk/permutation/grand_product_argument.h b/tachyon/zk/plonk/permutation/grand_product_argument.h index 8dd4727fa..435758e68 100644 --- a/tachyon/zk/plonk/permutation/grand_product_argument.h +++ b/tachyon/zk/plonk/permutation/grand_product_argument.h @@ -80,7 +80,7 @@ class GrandProductArgument { size_t chunk_size = base::GetNumElementsPerThread(grand_product); size_t num_chunks = (size + chunk_size - 1) / chunk_size; - OPENMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { RowIndex start = i * chunk_size; RowIndex end = i == num_chunks - 1 ? size : start + chunk_size; for (size_t j = 0; j < num_cols; ++j) { diff --git a/tachyon/zk/plonk/permutation/permutation_assembly.h b/tachyon/zk/plonk/permutation/permutation_assembly.h index 5ee8da3df..6fa67d042 100644 --- a/tachyon/zk/plonk/permutation/permutation_assembly.h +++ b/tachyon/zk/plonk/permutation/permutation_assembly.h @@ -131,7 +131,7 @@ class TACHYON_EXPORT PermutationAssembly { domain->template Zero()); // Assign |unpermuted_table| to |permutations|. - OPENMP_PARALLEL_NESTED_FOR(size_t i = 0; i < permutations.size(); ++i) { + OMP_PARALLEL_NESTED_FOR(size_t i = 0; i < permutations.size(); ++i) { for (size_t j = 0; j < rows_; ++j) { // NOTE(chokobole): It's safe to access since we created |kDegree| // |Zeros()|. diff --git a/tachyon/zk/plonk/permutation/unpermuted_table.h b/tachyon/zk/plonk/permutation/unpermuted_table.h index 3dc779c71..f786805ce 100644 --- a/tachyon/zk/plonk/permutation/unpermuted_table.h +++ b/tachyon/zk/plonk/permutation/unpermuted_table.h @@ -70,7 +70,7 @@ class UnpermutedTable { // Assign [δⁱω⁰, δⁱω¹, δⁱω², ..., δⁱωⁿ⁻¹] to each col. 
for (size_t i = 1; i < cols; ++i) { std::vector col(rows); - OPENMP_PARALLEL_FOR(RowIndex j = 0; j < rows; ++j) { + OMP_PARALLEL_FOR(RowIndex j = 0; j < rows; ++j) { col[j] = unpermuted_table[i - 1][j] * delta; } unpermuted_table.push_back(Evals(std::move(col))); diff --git a/tachyon/zk/plonk/vanishing/vanishing_prover_impl.h b/tachyon/zk/plonk/vanishing/vanishing_prover_impl.h index b2c34faf3..acd37e563 100644 --- a/tachyon/zk/plonk/vanishing/vanishing_prover_impl.h +++ b/tachyon/zk/plonk/vanishing/vanishing_prover_impl.h @@ -171,7 +171,7 @@ void VanishingProver::BatchEvaluate( [](absl::Span h_piece) { return h_piece; }); std::vector coeffs(n); for (size_t i = h_pieces.size() - 1; i != SIZE_MAX; --i) { - OPENMP_PARALLEL_FOR(size_t j = 0; j < n; ++j) { + OMP_PARALLEL_FOR(size_t j = 0; j < n; ++j) { coeffs[j] *= x_n; coeffs[j] += h_pieces[i][j]; } diff --git a/tachyon/zk/plonk/vanishing/vanishing_utils.h b/tachyon/zk/plonk/vanishing/vanishing_utils.h index 2d6ee9c96..626dbd7e6 100644 --- a/tachyon/zk/plonk/vanishing/vanishing_utils.h +++ b/tachyon/zk/plonk/vanishing/vanishing_utils.h @@ -103,7 +103,7 @@ ExtendedEvals& DivideByVanishingPolyInPlace( // Multiply the inverse to obtain the quotient polynomial in the coset // evaluation domain. std::vector& evaluations = evals.evaluations(); - OPENMP_PARALLEL_FOR(size_t i = 0; i < evaluations.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < evaluations.size(); ++i) { evaluations[i] *= t_evaluations[i % t_evaluations.size()]; } @@ -126,7 +126,7 @@ void DistributePowersZeta(Poly& poly, bool into_coset) { into_coset ? zeta_inv : zeta}; std::vector& coeffs = poly.coefficients().coefficients(); - OPENMP_PARALLEL_FOR(size_t i = 0; i < coeffs.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < coeffs.size(); ++i) { size_t j = i % 3; if (j == 0) continue; coeffs[i] *= coset_powers[j - 1]; @@ -186,7 +186,7 @@ std::vector BuildExtendedColumnWithColumns( size_t rows = columns[0].size(); std::vector flattened_transposed_columns(cols * rows); - OPENMP_PARALLEL_NESTED_FOR(size_t i = 0; i < columns.size(); ++i) { + OMP_PARALLEL_NESTED_FOR(size_t i = 0; i < columns.size(); ++i) { for (size_t j = 0; j < rows; ++j) { flattened_transposed_columns[j * cols + i] = columns[i][j]; } diff --git a/tachyon/zk/r1cs/constraint_system/BUILD.bazel b/tachyon/zk/r1cs/constraint_system/BUILD.bazel index ada522475..a3f711dba 100644 --- a/tachyon/zk/r1cs/constraint_system/BUILD.bazel +++ b/tachyon/zk/r1cs/constraint_system/BUILD.bazel @@ -34,10 +34,10 @@ tachyon_cc_library( hdrs = ["linear_combination.h"], deps = [ ":term", + "//tachyon/base:sort", "//tachyon/base/containers:container_util", "//tachyon/base/ranges:algorithm", "@com_google_googletest//:gtest_prod", - "@pdqsort", ], ) diff --git a/tachyon/zk/r1cs/constraint_system/linear_combination.h b/tachyon/zk/r1cs/constraint_system/linear_combination.h index 8c0a8fe3f..06ec4ac6e 100644 --- a/tachyon/zk/r1cs/constraint_system/linear_combination.h +++ b/tachyon/zk/r1cs/constraint_system/linear_combination.h @@ -16,10 +16,10 @@ #include "absl/strings/str_join.h" #include "gtest/gtest_prod.h" -#include "third_party/pdqsort/include/pdqsort.h" #include "tachyon/base/containers/container_util.h" #include "tachyon/base/ranges/algorithm.h" +#include "tachyon/base/sort.h" #include "tachyon/zk/r1cs/constraint_system/term.h" namespace tachyon::zk::r1cs { @@ -64,10 +64,10 @@ class LinearCombination { std::vector>&& TakeTerms() && { return std::move(terms_); } void Deduplicate() { - pdqsort(terms_.begin(), terms_.end(), - [](const Term& 
a, const Term& b) { - return a.variable < b.variable; - }); + base::UnstableSort(terms_.begin(), terms_.end(), + [](const Term& a, const Term& b) { + return a.variable < b.variable; + }); bool is_first = true; auto cur_var_first_it = terms_.begin(); auto it = terms_.begin(); diff --git a/tachyon/zk/r1cs/constraint_system/quadratic_arithmetic_program.h b/tachyon/zk/r1cs/constraint_system/quadratic_arithmetic_program.h index 2b37d7ba2..20edeefde 100644 --- a/tachyon/zk/r1cs/constraint_system/quadratic_arithmetic_program.h +++ b/tachyon/zk/r1cs/constraint_system/quadratic_arithmetic_program.h @@ -139,18 +139,15 @@ class QuadraticArithmeticProgram { // where x is |full_assignments|. // clang-format on OMP_PARALLEL { - OMP_FOR_NOWAIT - for (size_t i = 0; i < matrices.num_constraints; ++i) { + OMP_FOR_NOWAIT(size_t i = 0; i < matrices.num_constraints; ++i) { a[i] = EvaluateConstraint(matrices.a[i], full_assignments); } - OMP_FOR_NOWAIT - for (size_t i = 0; i < matrices.num_constraints; ++i) { + OMP_FOR_NOWAIT(size_t i = 0; i < matrices.num_constraints; ++i) { b[i] = EvaluateConstraint(matrices.b[i], full_assignments); } - OMP_FOR - for (size_t i = 0; i < matrices.num_constraints; ++i) { + OMP_FOR(size_t i = 0; i < matrices.num_constraints; ++i) { c[i] = EvaluateConstraint(matrices.c[i], full_assignments); } } @@ -181,7 +178,7 @@ class QuadraticArithmeticProgram { .Inverse()); // |h_evals[i]| = (|a[i]| * |b[i]| - |c[i]|)) / (g * ωⁿ⁺ˡ⁺¹ - 1) - OPENMP_PARALLEL_FOR(size_t i = 0; i < domain->size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < domain->size(); ++i) { F& h_evals_i = a_evals.at(i); h_evals_i *= b_evals[i]; h_evals_i -= c_evals[i]; diff --git a/tachyon/zk/r1cs/groth16/proving_key.h b/tachyon/zk/r1cs/groth16/proving_key.h index 3eb06b6a2..212d24b7c 100644 --- a/tachyon/zk/r1cs/groth16/proving_key.h +++ b/tachyon/zk/r1cs/groth16/proving_key.h @@ -115,7 +115,7 @@ class ProvingKey : public Key { std::vector& a = qap_instance_map_result.a; std::vector& b = qap_instance_map_result.b; std::vector& c = qap_instance_map_result.c; - OPENMP_PARALLEL_FOR(size_t i = 0; i < l.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < l.size(); ++i) { l[i] = ComputeABC( a[num_instance_variables + i], b[num_instance_variables + i], c[num_instance_variables + i], toxic_waste, delta_inverse); diff --git a/tachyon/zk/r1cs/groth16/verifying_key.h b/tachyon/zk/r1cs/groth16/verifying_key.h index 3f1ab1c73..216e49721 100644 --- a/tachyon/zk/r1cs/groth16/verifying_key.h +++ b/tachyon/zk/r1cs/groth16/verifying_key.h @@ -91,7 +91,7 @@ class VerifyingKey : public Key { const std::vector& a = qap_instance_map_result.a; const std::vector& b = qap_instance_map_result.b; const std::vector& c = qap_instance_map_result.c; - OPENMP_PARALLEL_FOR(size_t i = 0; i < l.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < l.size(); ++i) { l[i] = ComputeABC(a[i], b[i], c[i], toxic_waste, gamma_inverse); } diff --git a/third_party/pdqsort/workspace.bzl b/third_party/pdqsort/workspace.bzl index b5271935b..cc7b013fb 100644 --- a/third_party/pdqsort/workspace.bzl +++ b/third_party/pdqsort/workspace.bzl @@ -1,4 +1,4 @@ -"""loads the hwloc library, used by Tachyon.""" +"""loads the pdqsort library, used by Tachyon.""" load("//third_party:repo.bzl", "tachyon_http_archive", "tf_mirror_urls") diff --git a/third_party/powersort/BUILD.bazel b/third_party/powersort/BUILD.bazel new file mode 100644 index 000000000..e69de29bb diff --git a/third_party/powersort/fix_multiple_definitions.patch b/third_party/powersort/fix_multiple_definitions.patch new 
file mode 100644 index 000000000..2725d3572 --- /dev/null +++ b/third_party/powersort/fix_multiple_definitions.patch @@ -0,0 +1,176 @@ +diff --git a/src/sorts/merging.h b/src/sorts/merging.h +index a2a3769..9afc08f 100644 +--- a/src/sorts/merging.h ++++ b/src/sorts/merging.h +@@ -7,14 +7,6 @@ + + namespace algorithms { + +-#ifdef COUNT_MERGECOST +- const bool COUNT_MERGE_COSTS = true; +-#else +- const bool COUNT_MERGE_COSTS = false; +-#endif +- long long volatile totalMergeCosts = 0; +- long long volatile totalBufferCosts = 0; +- + /** + * A sentinel value used by some merging method; + * this value must be strictly larger than any value in the input. +@@ -39,7 +31,7 @@ namespace algorithms { + COPY_BOTH_WITH_SENTINELS + }; + +- std::string to_string(merging_methods mergingMethod) { ++ inline std::string to_string(merging_methods mergingMethod) { + switch (mergingMethod) { + case UNSTABLE_BITONIC_MERGE: + return "UNSTABLE_BITONIC_MERGE"; +@@ -70,10 +62,8 @@ namespace algorithms { + */ + template + void merge_runs_bitonic(Iter l, Iter m, Iter r, Iter2 B) { +- if (COUNT_MERGE_COSTS) totalMergeCosts += (r-l); + std::copy_backward(l,m,B+(m-l)); + std::reverse_copy(m,r,B+(m-l)); +- if (COUNT_MERGE_COSTS) totalBufferCosts += (r-l); + auto i = B, j = B+(r-l-1); + for (auto k = l; k < r; ++k) + *k = *j < *i ? *j-- : *i++; +@@ -90,10 +80,8 @@ namespace algorithms { + template + void merge_runs_bitonic_manual_copy(Iter l, Iter m, Iter r, Iter2 B) { + Iter i1, j1; Iter2 b; +- if (COUNT_MERGE_COSTS) totalMergeCosts += (r-l); + for (i1 = m-1, b = B+(m-1-l); i1 >= l;) *b-- = *i1--; + for (j1 = r, b = B+(m-l); j1 > m;) *b++ = *--j1; +- if (COUNT_MERGE_COSTS) totalBufferCosts += (r-l); + auto i = B, j = B+(r-l-1); + for (auto k = l; k < r; ++k) + *k = *j < *i ? *j-- : *i++; +@@ -111,10 +99,8 @@ namespace algorithms { + */ + template + void merge_runs_bitonic_branchless(Iter l, Iter m, Iter r, Iter2 B) { +- if (COUNT_MERGE_COSTS) totalMergeCosts += (r-l); + std::copy_backward(l,m,B+(m-l)); + std::reverse_copy(m,r,B+(m-l)); +- if (COUNT_MERGE_COSTS) totalBufferCosts += (r-l); + Iter2 i = B, j = B+(r-l-1); + for (auto k = l; k < r; ++k) { + bool const cmp = *j < *i; +@@ -133,10 +119,8 @@ namespace algorithms { + template + void merge_runs_copy_half(Iter l, Iter m, Iter r, Iter2 B) { + auto n1 = m-l, n2 = r-m; +- if (COUNT_MERGE_COSTS) totalMergeCosts += (n1+n2); + if (n1 <= n2) { + std::copy(l,m,B); +- if (COUNT_MERGE_COSTS) totalBufferCosts += (m-l); + auto c1 = B, e1 = B + n1; + auto c2 = m, e2 = r, o = l; + while (c1 < e1 && c2 < e2) +@@ -144,7 +128,6 @@ namespace algorithms { + while (c1 < e1) *o++ = *c1++; + } else { + std::copy(m,r,B); +- if (COUNT_MERGE_COSTS) totalBufferCosts += (r-m); + auto c1 = m-1, s1 = l, o = r-1; + auto c2 = B+n2-1, s2 = B; + while (c1 >= s1 && c2 >= s2) +@@ -161,9 +144,7 @@ namespace algorithms { + template + void merge_runs_basic(Iter l, Iter m, Iter r, Iter2 B) { + auto n1 = m-l, n2 = r-m; +- if (COUNT_MERGE_COSTS) totalMergeCosts += (n1+n2); + std::copy(l,r,B); +- if (COUNT_MERGE_COSTS) totalBufferCosts += (n1+n2); + auto c1 = B, e1 = B + n1, c2 = e1, e2 = e1 + n2; + auto o = l; + while (c1 < e1 && c2 < e2) +@@ -182,12 +163,10 @@ namespace algorithms { + typedef typename std::iterator_traits::value_type T; + static_assert(std::numeric_limits::is_specialized, "Needs numeric type (for sentinels)"); + auto n1 = m-l, n2 = r-m; +- if (COUNT_MERGE_COSTS) totalMergeCosts += (n1+n2); + std::copy(l, m, B); + *(B + (m - l)) = plus_inf_sentinel(); + std::copy(m, r, B + (m - l + 1)); + 
*(B + (r - l) + 1) = plus_inf_sentinel(); +- if (COUNT_MERGE_COSTS) totalBufferCosts += (n1+n2+2); + auto c1 = B, c2 = B + (m - l + 1), o = l; + while (o < r) *o++ = *c1 <= *c2 ? *c1++ : *c2++; + } +diff --git a/src/sorts/powersort.h b/src/sorts/powersort.h +index 93d2ace..6a0b36b 100644 +--- a/src/sorts/powersort.h ++++ b/src/sorts/powersort.h +@@ -24,7 +24,7 @@ namespace algorithms { + BITWISE_LOOP, + MOST_SIGNIFICANT_SET_BIT, + }; +- std::string to_string(node_power_implementations implementation) { ++ inline std::string to_string(node_power_implementations implementation) { + switch (implementation) { + case TRIVIAL: return "TRIVIAL"; + case DIVISION_LOOP: return "DIVISION_LOOP"; +@@ -36,7 +36,7 @@ namespace algorithms { + }; + + +- power_t node_power_trivial(size_t begin, size_t end, ++ inline power_t node_power_trivial(size_t begin, size_t end, + size_t beginA, size_t beginB, size_t endB) { + size_t n = end - begin; + size_t n1 = beginB - beginA, n2 = endB - beginB; +@@ -51,7 +51,7 @@ namespace algorithms { + return k; + } + +- power_t node_power_div(size_t begin, size_t end, ++ inline power_t node_power_div(size_t begin, size_t end, + size_t beginA, size_t beginB, size_t endB) { + size_t twoN = 2*(end - begin); // 2*n + size_t n1 = beginB - beginA, n2 = endB - beginB; // lengths of runs +@@ -66,7 +66,7 @@ namespace algorithms { + return k; + } + +- power_t node_power_bitwise(size_t begin, size_t end, ++ inline power_t node_power_bitwise(size_t begin, size_t end, + size_t beginA, size_t beginB, size_t endB) { + size_t n = end - begin; + assert (n < (size_t{1} << 63)); +@@ -87,7 +87,7 @@ namespace algorithms { + return nCommonBits + 1; + } + +- power_t node_power_clz(size_t begin, size_t end, ++ inline power_t node_power_clz(size_t begin, size_t end, + size_t beginA, size_t beginB, size_t endB) { + size_t n = end - begin; + assert(n <= (size_t{1} << 31)); +@@ -99,7 +99,7 @@ namespace algorithms { + } + + // not precise enough for large powers ... +- power_t node_power_clz_unconstrained(ptrdiff_t begin, ptrdiff_t end, ++ inline power_t node_power_clz_unconstrained(ptrdiff_t begin, ptrdiff_t end, + ptrdiff_t beginA, ptrdiff_t beginB, ptrdiff_t endB) { + assert(begin <= beginA && beginA <= beginB && beginB <= endB && endB <= end); + auto n = static_cast(end - begin); +@@ -128,12 +128,12 @@ namespace algorithms { + } + } + +- unsigned floor_log2(unsigned int n) { ++ inline unsigned floor_log2(unsigned int n) { + if (n <= 0) return 0; + return 31 - __builtin_clz( n ); + } + +- unsigned floor_log2(unsigned long n) { ++ inline unsigned floor_log2(unsigned long n) { + if (n <= 0) return 0; + return 63 - __builtin_clzl( n ); + } diff --git a/third_party/powersort/fix_sign_compare_warning.patch b/third_party/powersort/fix_sign_compare_warning.patch new file mode 100644 index 000000000..25c7ef454 --- /dev/null +++ b/third_party/powersort/fix_sign_compare_warning.patch @@ -0,0 +1,31 @@ +diff --git a/src/sorts/powersort.h b/src/sorts/powersort.h +index 54ab704..93d2ace 100644 +--- a/src/sorts/powersort.h ++++ b/src/sorts/powersort.h +@@ -69,7 +69,7 @@ namespace algorithms { + power_t node_power_bitwise(size_t begin, size_t end, + size_t beginA, size_t beginB, size_t endB) { + size_t n = end - begin; +- assert (n < (1L << 63)); ++ assert (n < (size_t{1} << 63)); + size_t l = beginA - begin + beginB - begin; + size_t r = beginB - begin + endB - begin; + // a and b are given by l/(2*n) and r/(2*n), both are in [0,1). 
+@@ -90,7 +90,7 @@ namespace algorithms { + power_t node_power_clz(size_t begin, size_t end, + size_t beginA, size_t beginB, size_t endB) { + size_t n = end - begin; +- assert(n <= (1L << 31)); ++ assert(n <= (size_t{1} << 31)); + unsigned long l2 = beginA + beginB - 2*begin; // 2*l + unsigned long r2 = beginB + endB - 2*begin; // 2*r + auto a = static_cast((l2 << 30) / n); +@@ -103,7 +103,7 @@ namespace algorithms { + ptrdiff_t beginA, ptrdiff_t beginB, ptrdiff_t endB) { + assert(begin <= beginA && beginA <= beginB && beginB <= endB && endB <= end); + auto n = static_cast(end - begin); +- assert(n < (1L << 63)); ++ assert(n < (size_t{1} << 63)); + auto l2 = static_cast((beginA - begin) + (beginB - begin)); // 2*l + auto r2 = static_cast((beginB - begin) + (endB - begin)); // 2*r + static_assert(sizeof(size_t) == 8, "assume 64bit size_t"); // can compute with 64 bits diff --git a/third_party/powersort/fix_static_assertion.patch b/third_party/powersort/fix_static_assertion.patch new file mode 100644 index 000000000..b9e862908 --- /dev/null +++ b/third_party/powersort/fix_static_assertion.patch @@ -0,0 +1,49 @@ +diff --git a/src/sorts/merging.h b/src/sorts/merging.h +index 9afc08f..835b6d2 100644 +--- a/src/sorts/merging.h ++++ b/src/sorts/merging.h +@@ -254,27 +254,23 @@ namespace algorithms { + template + void merge_runs(Iter l, Iter m, Iter r, Iter2 B) { +- switch(mergingMethod) { +- case UNSTABLE_BITONIC_MERGE: +- return merge_runs_bitonic(l, m, r, B); +- case UNSTABLE_BITONIC_MERGE_MANUAL_COPY: +- return merge_runs_bitonic_manual_copy(l, m, r, B); +- case UNSTABLE_BITONIC_MERGE_BRANCHLESS: +- return merge_runs_bitonic_branchless(l, m, r, B); +- case COPY_SMALLER: +- return merge_runs_copy_half(l, m, r, B); +- case COPY_BOTH: +- return merge_runs_basic(l, m, r, B); +- case COPY_BOTH_WITH_SENTINELS: +- return merge_runs_basic_sentinels(l, m, r, B); +- default: +- assert(false); +- __builtin_unreachable(); +- } +- } +- +- +- ++ if constexpr (mergingMethod == UNSTABLE_BITONIC_MERGE) { ++ return merge_runs_bitonic(l, m, r, B); ++ } else if constexpr (mergingMethod == UNSTABLE_BITONIC_MERGE_MANUAL_COPY) { ++ return merge_runs_bitonic_manual_copy(l, m, r, B); ++ } else if constexpr (mergingMethod == UNSTABLE_BITONIC_MERGE_BRANCHLESS) { ++ return merge_runs_bitonic_branchless(l, m, r, B); ++ } else if constexpr (mergingMethod == COPY_SMALLER) { ++ return merge_runs_copy_half(l, m, r, B); ++ } else if constexpr (mergingMethod == COPY_BOTH) { ++ return merge_runs_basic(l, m, r, B); ++ } else if constexpr (mergingMethod == COPY_BOTH_WITH_SENTINELS) { ++ return merge_runs_basic_sentinels(l, m, r, B); ++ } else { ++ assert(false); ++ __builtin_unreachable(); ++ } ++ } + } + + #endif //MERGESORTS_MERGING_H diff --git a/third_party/powersort/powersort.BUILD b/third_party/powersort/powersort.BUILD new file mode 100644 index 000000000..8ff6ee1a7 --- /dev/null +++ b/third_party/powersort/powersort.BUILD @@ -0,0 +1,15 @@ +load("@rules_cc//cc:defs.bzl", "cc_library") + +package(default_visibility = ["//visibility:public"]) + +cc_library( + name = "powersort", + hdrs = [ + "src/algorithms.h", + "src/sorts/insertionsort.h", + "src/sorts/merging.h", + "src/sorts/powersort.h", + ], + include_prefix = "third_party/powersort/include", + strip_include_prefix = "src", +) diff --git a/third_party/powersort/remove_binary_function.patch b/third_party/powersort/remove_binary_function.patch new file mode 100644 index 000000000..724c7a4d1 --- /dev/null +++ b/third_party/powersort/remove_binary_function.patch @@ -0,0 
+1,13 @@ +diff --git a/src/algorithms.h b/src/algorithms.h +index 4b94d9b..a2a2576 100644 +--- a/src/algorithms.h ++++ b/src/algorithms.h +@@ -16,7 +16,7 @@ namespace algorithms { + + /** superclass for sorting methods */ + template +- class sorter : std::binary_function { ++ class sorter { + protected: + using elem_t = typename std::iterator_traits::value_type ; + using diff_t = typename std::iterator_traits::difference_type ; diff --git a/third_party/powersort/workspace.bzl b/third_party/powersort/workspace.bzl new file mode 100644 index 000000000..0322cfce7 --- /dev/null +++ b/third_party/powersort/workspace.bzl @@ -0,0 +1,20 @@ +"""loads the powersort library, used by Tachyon.""" + +load("//third_party:repo.bzl", "tachyon_http_archive", "tf_mirror_urls") + +def repo(): + tachyon_http_archive( + name = "powersort", + urls = tf_mirror_urls("https://github.com/sebawild/powersort/archive/48e31e909280ca43bb2c33dd3df9922b0a0f3f84.tar.gz"), + sha256 = "89122b7e7e2a0f0b41cc5411f9adde581769ff2f7d141335ce7e5011b932da06", + strip_prefix = "powersort-48e31e909280ca43bb2c33dd3df9922b0a0f3f84", + build_file = "//third_party/powersort:powersort.BUILD", + patch_file = [ + "@kroma_network_tachyon//third_party/powersort:fix_sign_compare_warning.patch", + "@kroma_network_tachyon//third_party/powersort:fix_multiple_definitions.patch", + "@kroma_network_tachyon//third_party/powersort:fix_static_assertion.patch", + # In c++ 17, std::binary_function is removed. + # See https://en.cppreference.com/w/cpp/utility/functional/binary_function. + "@kroma_network_tachyon//third_party/powersort:remove_binary_function.patch", + ], + ) diff --git a/vendors/circom/benchmark/rapidsnark_runner.h b/vendors/circom/benchmark/rapidsnark_runner.h index 0d9f8bf45..809ab48bb 100644 --- a/vendors/circom/benchmark/rapidsnark_runner.h +++ b/vendors/circom/benchmark/rapidsnark_runner.h @@ -82,7 +82,7 @@ class RapidsnarkRunner : public Runner { base::TimeTicks now = base::TimeTicks::Now(); std::vector full_assignments(full_assignments_in.size()); - OPENMP_PARALLEL_FOR(size_t i = 0; i < full_assignments_in.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < full_assignments_in.size(); ++i) { using BigInt = typename F::BigIntTy; BigInt bigint = full_assignments_in[i].ToBigInt(); memcpy(full_assignments[i].v, bigint.limbs, BigInt::kByteNums); diff --git a/vendors/circom/circomlib/circuit/quadratic_arithmetic_program.h b/vendors/circom/circomlib/circuit/quadratic_arithmetic_program.h index a5ee0db72..a6be43316 100644 --- a/vendors/circom/circomlib/circuit/quadratic_arithmetic_program.h +++ b/vendors/circom/circomlib/circuit/quadratic_arithmetic_program.h @@ -38,7 +38,7 @@ class QuadraticArithmeticProgram { omp_lock_t locks[kNumLocks]; for (size_t i = 0; i < kNumLocks; i++) omp_init_lock(&locks[i]); #endif - OPENMP_PARALLEL_FOR(size_t i = 0; i < coefficients.size(); i++) { + OMP_PARALLEL_FOR(size_t i = 0; i < coefficients.size(); i++) { const Coefficient& c = coefficients[i]; std::vector& ab = (c.matrix == 0) ? 
a : b; @@ -58,7 +58,7 @@ class QuadraticArithmeticProgram { for (size_t i = 0; i < kNumLocks; i++) omp_destroy_lock(&locks[i]); #endif - OPENMP_PARALLEL_FOR(size_t i = 0; i < domain->size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < domain->size(); ++i) { c[i] = a[i] * b[i]; } @@ -81,7 +81,7 @@ class QuadraticArithmeticProgram { c_evals = domain->FFT(std::move(c_poly)); // |h_evals[i]| = |a[i]| * |b[i]| - |c[i]| - OPENMP_PARALLEL_FOR(size_t i = 0; i < domain->size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < domain->size(); ++i) { F& h_evals_i = a_evals.at(i); h_evals_i *= b_evals[i]; h_evals_i -= c_evals[i]; diff --git a/vendors/circom/circomlib/wtns/wtns.h b/vendors/circom/circomlib/wtns/wtns.h index b62bc2bff..64a6b0be3 100644 --- a/vendors/circom/circomlib/wtns/wtns.h +++ b/vendors/circom/circomlib/wtns/wtns.h @@ -127,7 +127,7 @@ struct WtnsDataSection { if (!buffer.ReadPtr(&ptr, header.num_witness)) return false; witnesses = {ptr, header.num_witness}; - OPENMP_PARALLEL_FOR(uint32_t i = 0; i < header.num_witness; ++i) { + OMP_PARALLEL_FOR(uint32_t i = 0; i < header.num_witness; ++i) { witnesses[i] = F(witnesses[i].value()); } return true; diff --git a/vendors/circom/circomlib/zkey/zkey.h b/vendors/circom/circomlib/zkey/zkey.h index 09d542e64..fb9b10c69 100644 --- a/vendors/circom/circomlib/zkey/zkey.h +++ b/vendors/circom/circomlib/zkey/zkey.h @@ -215,7 +215,7 @@ struct CoefficientsSection { if (!buffer.ReadPtr(&ptr, num_coefficients)) return false; coefficients = {ptr, num_coefficients}; - OPENMP_PARALLEL_FOR(size_t i = 0; i < coefficients.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < coefficients.size(); ++i) { coefficients[i].value = F::FromMontgomery(coefficients[i].value.ToBigInt()); }
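For reference, here is a minimal sketch of the kind of wrapper the new `//tachyon/base:sort` target could expose, so that call sites such as `permute_expression_pair.h`, `v1_strategy.h`, `linear_combination.h`, and `prover_impl.h` can write `base::UnstableSort(...)` / `base::StableSort(...)` instead of reaching into the vendored libraries directly. The real `tachyon/base/sort.h` is not part of the hunks above, so treat this as an assumption: the unstable branch forwards to the free `pdqsort()` function from `third_party/pdqsort/include/pdqsort.h`, while the stable branch uses `std::stable_sort` as a stand-in because the vendored powersort class interface does not appear in this diff.

// sort.h (hypothetical sketch, not the file added by this change)
#include <algorithm>
#include <functional>

#include "third_party/pdqsort/include/pdqsort.h"

namespace tachyon::base {

// Unstable sort: forwards to pdqsort. Equal elements may be reordered, which
// is acceptable for the call sites above (e.g. sorting |c_terms| or
// |region_columns| by a strict key).
template <typename Iter, typename Compare = std::less<>>
void UnstableSort(Iter begin, Iter end, Compare comp = Compare()) {
  pdqsort(begin, end, comp);
}

// Stable sort: keeps the relative order of equal elements, which the
// log-derivative prover relies on for |sorted_table_with_indices|. The real
// implementation presumably dispatches to the vendored powersort; this sketch
// falls back to std::stable_sort so it stays self-contained.
template <typename Iter, typename Compare = std::less<>>
void StableSort(Iter begin, Iter end, Compare comp = Compare()) {
  std::stable_sort(begin, end, comp);
}

}  // namespace tachyon::base

With such a wrapper the mechanical part of this change is just swapping the call site, e.g. `pdqsort(c_terms.begin(), c_terms.end())` becomes `base::UnstableSort(c_terms.begin(), c_terms.end())`.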