Skip to content

Commit

Permalink
Merge pull request #506 from kroma-network/perf/optimize-proof-genera…
Browse files Browse the repository at this point in the history
…tion

perf: optimize proof generation
  • Loading branch information
chokobole authored Aug 7, 2024
2 parents a6a8723 + e46908e commit b3452b4
Show file tree
Hide file tree
Showing 76 changed files with 1,018 additions and 302 deletions.
2 changes: 2 additions & 0 deletions bazel/tachyon_deps.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ load("//third_party/nasm:workspace.bzl", nasm = "repo")
load("//third_party/node_addon_api:install_node_addon_api.bzl", "install_node_addon_api")
load("//third_party/omp:omp_configure.bzl", "omp_configure")
load("//third_party/pdqsort:workspace.bzl", pdqsort = "repo")
load("//third_party/powersort:workspace.bzl", powersort = "repo")
load("//third_party/py:python_configure.bzl", "python_configure")
load("//third_party/rapidsnark:workspace.bzl", rapidsnark = "repo")

Expand All @@ -36,6 +37,7 @@ def tachyon_deps():
json()
nasm()
pdqsort()
powersort()
rapidsnark()

install_node_addon_api(name = "node_addon_api")
Expand Down
21 changes: 20 additions & 1 deletion tachyon/base/BUILD.bazel
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
load("//bazel:tachyon.bzl", "if_has_openmp_on_macos", "if_posix")
load("//bazel:tachyon_cc.bzl", "tachyon_cc_library", "tachyon_cc_unittest")
load("//bazel:tachyon_cc.bzl", "tachyon_cc_benchmark", "tachyon_cc_library", "tachyon_cc_unittest")

package(default_visibility = ["//visibility:public"])

Expand Down Expand Up @@ -146,6 +146,15 @@ tachyon_cc_library(
deps = [":logging"],
)

tachyon_cc_library(
name = "sort",
hdrs = ["sort.h"],
deps = [
"@pdqsort",
"@powersort",
],
)

tachyon_cc_library(
name = "static_storage",
hdrs = ["static_storage.h"],
Expand All @@ -162,6 +171,16 @@ tachyon_cc_library(
hdrs = ["type_list.h"],
)

tachyon_cc_benchmark(
name = "sort_benchmark",
srcs = ["sort_benchmark.cc"],
deps = [
"//tachyon/base:random",
"//tachyon/base:sort",
"//tachyon/base/containers:container_util",
],
)

tachyon_cc_unittest(
name = "base_unittests",
srcs = [
Expand Down
4 changes: 2 additions & 2 deletions tachyon/base/containers/container_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ template <typename Generator,
std::vector<ReturnType> CreateVectorParallel(size_t size,
Generator&& generator) {
std::vector<ReturnType> ret(size);
OPENMP_PARALLEL_FOR(size_t i = 0; i < size; ++i) { ret[i] = generator(); }
OMP_PARALLEL_FOR(size_t i = 0; i < size; ++i) { ret[i] = generator(); }
return ret;
}

Expand Down Expand Up @@ -93,7 +93,7 @@ template <typename Generator,
std::vector<ReturnType> CreateVectorParallel(size_t size,
Generator&& generator) {
std::vector<ReturnType> ret(size);
OPENMP_PARALLEL_FOR(size_t i = 0; i < size; ++i) { ret[i] = generator(i); }
OMP_PARALLEL_FOR(size_t i = 0; i < size; ++i) { ret[i] = generator(i); }
return ret;
}

Expand Down
20 changes: 10 additions & 10 deletions tachyon/base/openmp_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,21 +14,21 @@

#if defined(TACHYON_HAS_OPENMP)
#define CONSTEXPR_IF_NOT_OPENMP
#define OMP_FOR _Pragma("omp for")
#define OMP_FOR_NOWAIT _Pragma("omp for nowait")
#define OMP_FOR(expr) _Pragma("omp for") for (expr)
#define OMP_FOR_NOWAIT(expr) _Pragma("omp for nowait") for (expr)
#define OMP_NESTED_FOR(expr) _Pragma("omp for collapse(2)") for (expr)
#define OMP_PARALLEL _Pragma("omp parallel")
#define OPENMP_PARALLEL_FOR(expr) _Pragma("omp parallel for") for (expr)
#define OPENMP_PARALLEL_NESTED_FOR(expr) \
#define OMP_PARALLEL_FOR(expr) _Pragma("omp parallel for") for (expr)
#define OMP_PARALLEL_NESTED_FOR(expr) \
_Pragma("omp parallel for collapse(2)") for (expr)
#define OPENMP_FOR(expr) _Pragma("omp for") for (expr)
#else
#define CONSTEXPR_IF_NOT_OPENMP constexpr
#define OMP_FOR
#define OMP_FOR_NOWAIT
#define OMP_FOR(expr) for (expr)
#define OMP_FOR_NOWAIT(expr) for (expr)
#define OMP_NESTED_FOR(expr) for (expr)
#define OMP_PARALLEL
#define OPENMP_PARALLEL_FOR(expr) for (expr)
#define OPENMP_PARALLEL_NESTED_FOR(expr) for (expr)
#define OPENMP_FOR(expr) for (expr)
#define OMP_PARALLEL_FOR(expr) for (expr)
#define OMP_PARALLEL_NESTED_FOR(expr) for (expr)
#endif // defined(TACHYON_HAS_OPENMP)

namespace tachyon::base {
Expand Down
186 changes: 122 additions & 64 deletions tachyon/base/parallelize.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,92 @@
#include "tachyon/base/openmp_util.h"

namespace tachyon::base {
namespace internal {

// Invokes |callback| on the |i|-th chunk of |container|.
//
// The chunk starts at element offset |i| * |chunk_size|. Every chunk spans
// |chunk_size| elements except the last one (|i| == |num_chunks| - 1), which
// spans whatever remains of |container|. Depending on how many arguments
// |callback| takes, it is called as callback(chunk), callback(chunk, i) or
// callback(chunk, i, chunk_size).
//
// |callback| is taken by reference: this helper is called once per chunk, and
// taking the callable by value would copy it (e.g. a std::function) on every
// iteration. Callers must pass an lvalue.
template <typename Container, typename Callable,
          typename FunctorTraits = internal::MakeFunctorTraits<Callable>,
          typename RunType = typename FunctorTraits::RunType,
          typename ArgList = internal::ExtractArgs<RunType>,
          typename SpanTy = internal::GetType<0, ArgList>,
          size_t ArgNum = internal::GetSize<ArgList>>
void InvokeParallelizeCallback(Container& container, size_t i,
                               size_t num_chunks, size_t chunk_size,
                               Callable& callback) {
  const size_t offset = i * chunk_size;
  // Only the final chunk may be shorter than |chunk_size|.
  const size_t len =
      i == num_chunks - 1 ? std::size(container) - offset : chunk_size;
  SpanTy chunk(std::data(container) + offset, len);
  if constexpr (ArgNum == 1) {
    callback(chunk);
  } else if constexpr (ArgNum == 2) {
    callback(chunk, i);
  } else {
    static_assert(ArgNum == 3);
    callback(chunk, i, chunk_size);
  }
}

// Invokes |callback| for the |i|-th chunk of an index range of length |size|.
//
// Every chunk has length |chunk_size| except the last one
// (|i| == |num_chunks| - 1), whose length is the remainder
// |size| - |i| * |chunk_size|. Depending on how many arguments |callback|
// takes, it is called as callback(len), callback(len, i) or
// callback(len, i, chunk_size).
//
// |callback| is taken by reference: this helper is called once per chunk, and
// taking the callable by value would copy it (e.g. a std::function) on every
// iteration. Callers must pass an lvalue.
template <typename Callable,
          typename FunctorTraits = internal::MakeFunctorTraits<Callable>,
          typename RunType = typename FunctorTraits::RunType,
          typename ArgList = internal::ExtractArgs<RunType>,
          size_t ArgNum = internal::GetSize<ArgList>>
void InvokeParallelizeCallback(size_t size, size_t i, size_t num_chunks,
                               size_t chunk_size, Callable& callback) {
  // Only the final chunk may be shorter than |chunk_size|.
  const size_t len = i == num_chunks - 1 ? size - i * chunk_size : chunk_size;
  if constexpr (ArgNum == 1) {
    callback(len);
  } else if constexpr (ArgNum == 2) {
    callback(len, i);
  } else {
    static_assert(ArgNum == 3);
    callback(len, i, chunk_size);
  }
}

// Invokes |callback| on the |i|-th chunk of |container| and stores the result
// in |values[i]|.
//
// The chunk starts at element offset |i| * |chunk_size|. Every chunk spans
// |chunk_size| elements except the last one (|i| == |num_chunks| - 1), which
// spans whatever remains of |container|. Depending on how many arguments
// |callback| takes, it is called as callback(chunk), callback(chunk, i) or
// callback(chunk, i, chunk_size). |values| must already hold at least
// |i| + 1 elements; it is indexed, not grown.
//
// |callback| is taken by reference: this helper is called once per chunk, and
// taking the callable by value would copy it (e.g. a std::function) on every
// iteration. Callers must pass an lvalue.
template <typename Container, typename Callable,
          typename FunctorTraits = internal::MakeFunctorTraits<Callable>,
          typename RunType = typename FunctorTraits::RunType,
          typename ReturnType = typename FunctorTraits::ReturnType,
          typename ArgList = internal::ExtractArgs<RunType>,
          typename SpanTy = internal::GetType<0, ArgList>,
          size_t ArgNum = internal::GetSize<ArgList>>
void InvokeParallelizeCallback(Container& container, size_t i,
                               size_t num_chunks, size_t chunk_size,
                               Callable& callback,
                               std::vector<ReturnType>& values) {
  const size_t offset = i * chunk_size;
  // Only the final chunk may be shorter than |chunk_size|.
  const size_t len =
      i == num_chunks - 1 ? std::size(container) - offset : chunk_size;
  SpanTy chunk(std::data(container) + offset, len);
  if constexpr (ArgNum == 1) {
    values[i] = callback(chunk);
  } else if constexpr (ArgNum == 2) {
    values[i] = callback(chunk, i);
  } else {
    static_assert(ArgNum == 3);
    values[i] = callback(chunk, i, chunk_size);
  }
}

// Invokes |callback| for the |i|-th chunk of an index range of length |size|
// and stores the result in |values[i]|.
//
// Every chunk has length |chunk_size| except the last one
// (|i| == |num_chunks| - 1), whose length is the remainder
// |size| - |i| * |chunk_size|. Depending on how many arguments |callback|
// takes, it is called as callback(len), callback(len, i) or
// callback(len, i, chunk_size). |values| must already hold at least
// |i| + 1 elements; it is indexed, not grown.
//
// |callback| is taken by reference: this helper is called once per chunk, and
// taking the callable by value would copy it (e.g. a std::function) on every
// iteration. Callers must pass an lvalue.
template <typename Callable,
          typename FunctorTraits = internal::MakeFunctorTraits<Callable>,
          typename RunType = typename FunctorTraits::RunType,
          typename ReturnType = typename FunctorTraits::ReturnType,
          typename ArgList = internal::ExtractArgs<RunType>,
          size_t ArgNum = internal::GetSize<ArgList>>
void InvokeParallelizeCallback(size_t size, size_t i, size_t num_chunks,
                               size_t chunk_size, Callable& callback,
                               std::vector<ReturnType>& values) {
  // Only the final chunk may be shorter than |chunk_size|.
  const size_t len = i == num_chunks - 1 ? size - i * chunk_size : chunk_size;
  if constexpr (ArgNum == 1) {
    values[i] = callback(len);
  } else if constexpr (ArgNum == 2) {
    values[i] = callback(len, i);
  } else {
    static_assert(ArgNum == 3);
    values[i] = callback(len, i, chunk_size);
  }
}
} // namespace internal

template <typename T>
using ParallelizeCallback1 = std::function<void(absl::Span<T>)>;
Expand All @@ -21,51 +107,35 @@ using ParallelizeCallback3 = std::function<void(absl::Span<T>, size_t, size_t)>;

// Splits the |container| by |chunk_size| and executes |callback| in parallel.
// See parallelize_unittest.cc for more details.
template <typename Container, typename Callable,
typename FunctorTraits = internal::MakeFunctorTraits<Callable>,
typename RunType = typename FunctorTraits::RunType,
typename ArgList = internal::ExtractArgs<RunType>,
typename SpanTy = internal::GetType<0, ArgList>,
typename T = typename SpanTy::value_type,
size_t ArgNum = internal::GetSize<ArgList>>
template <typename Container, typename Callable>
void ParallelizeByChunkSize(Container& container, size_t chunk_size,
Callable callback) {
if (chunk_size == 0) return;
size_t num_chunks = (std::size(container) + chunk_size - 1) / chunk_size;
OPENMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) {
size_t len = i == num_chunks - 1 ? std::size(container) - i * chunk_size
: chunk_size;
SpanTy chunk(std::data(container) + i * chunk_size, len);
if constexpr (ArgNum == 1) {
callback(chunk);
} else if constexpr (ArgNum == 2) {
callback(chunk, i);
} else {
static_assert(ArgNum == 3);
callback(chunk, i, chunk_size);
}
if (num_chunks == 1) {
internal::InvokeParallelizeCallback(container, 0, num_chunks, chunk_size,
callback);
return;
}
OMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) {
internal::InvokeParallelizeCallback(container, i, num_chunks, chunk_size,
callback);
}
}

// Splits the |size| by |chunk_size| and executes |callback| in parallel.
template <typename Callable,
typename FunctorTraits = internal::MakeFunctorTraits<Callable>,
typename RunType = typename FunctorTraits::RunType,
typename ArgList = internal::ExtractArgs<RunType>,
size_t ArgNum = internal::GetSize<ArgList>>
template <typename Callable>
void ParallelizeByChunkSize(size_t size, size_t chunk_size, Callable callback) {
if (chunk_size == 0) return;
size_t num_chunks = (size + chunk_size - 1) / chunk_size;
OPENMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) {
size_t len = i == num_chunks - 1 ? size - i * chunk_size : chunk_size;
if constexpr (ArgNum == 1) {
callback(len);
} else if constexpr (ArgNum == 2) {
callback(len, i);
} else {
static_assert(ArgNum == 3);
callback(len, i, chunk_size);
}
if (num_chunks == 1) {
internal::InvokeParallelizeCallback(size, 0, num_chunks, chunk_size,
callback);
return;
}
OMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) {
internal::InvokeParallelizeCallback(size, i, num_chunks, chunk_size,
callback);
}
}

Expand Down Expand Up @@ -95,29 +165,21 @@ void Parallelize(size_t size, Callable callback,
template <typename Container, typename Callable,
typename FunctorTraits = internal::MakeFunctorTraits<Callable>,
typename RunType = typename FunctorTraits::RunType,
typename ReturnType = typename FunctorTraits::ReturnType,
typename ArgList = internal::ExtractArgs<RunType>,
typename SpanTy = internal::GetType<0, ArgList>,
typename T = typename SpanTy::value_type,
size_t ArgNum = internal::GetSize<ArgList>>
typename ReturnType = typename FunctorTraits::ReturnType>
std::vector<ReturnType> ParallelizeMapByChunkSize(Container& container,
size_t chunk_size,
Callable callback) {
if (chunk_size == 0) return {};
size_t num_chunks = (std::size(container) + chunk_size - 1) / chunk_size;
std::vector<ReturnType> values(num_chunks);
OPENMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) {
size_t len = i == num_chunks - 1 ? std::size(container) - i * chunk_size
: chunk_size;
SpanTy chunk(std::data(container) + i * chunk_size, len);
if constexpr (ArgNum == 1) {
values[i] = callback(chunk);
} else if constexpr (ArgNum == 2) {
values[i] = callback(chunk, i);
} else {
static_assert(ArgNum == 3);
values[i] = callback(chunk, i, chunk_size);
}
if (num_chunks == 1) {
internal::InvokeParallelizeCallback(container, 0, num_chunks, chunk_size,
callback, values);
return values;
}
OMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) {
internal::InvokeParallelizeCallback(container, i, num_chunks, chunk_size,
callback, values);
}
return values;
}
Expand All @@ -128,25 +190,21 @@ std::vector<ReturnType> ParallelizeMapByChunkSize(Container& container,
template <typename Callable,
typename FunctorTraits = internal::MakeFunctorTraits<Callable>,
typename RunType = typename FunctorTraits::RunType,
typename ReturnType = typename FunctorTraits::ReturnType,
typename ArgList = internal::ExtractArgs<RunType>,
size_t ArgNum = internal::GetSize<ArgList>>
typename ReturnType = typename FunctorTraits::ReturnType>
std::vector<ReturnType> ParallelizeMapByChunkSize(size_t size,
size_t chunk_size,
Callable callback) {
if (chunk_size == 0) return {};
size_t num_chunks = (size + chunk_size - 1) / chunk_size;
std::vector<ReturnType> values(num_chunks);
OPENMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) {
size_t len = i == num_chunks - 1 ? size - i * chunk_size : chunk_size;
if constexpr (ArgNum == 1) {
values[i] = callback(len);
} else if constexpr (ArgNum == 2) {
values[i] = callback(len, i);
} else {
static_assert(ArgNum == 3);
values[i] = callback(len, i, chunk_size);
}
if (num_chunks == 1) {
internal::InvokeParallelizeCallback(size, 0, num_chunks, chunk_size,
callback, values);
return values;
}
OMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) {
internal::InvokeParallelizeCallback(size, i, num_chunks, chunk_size,
callback, values);
}
return values;
}
Expand Down
28 changes: 28 additions & 0 deletions tachyon/base/sort.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#ifndef TACHYON_BASE_SORT_H_
#define TACHYON_BASE_SORT_H_

#include "third_party/pdqsort/include/pdqsort.h"
#include "third_party/powersort/include/sorts/powersort.h"

namespace tachyon::base {

// Sorts the range [|begin|, |end|) by forwarding it to pdqsort() with its
// default ordering. As the name says, callers must not rely on the relative
// order of equivalent elements being preserved.
template <typename Iter>
void UnstableSort(Iter begin, Iter end) {
  pdqsort(begin, end);
}

// Sorts the range [|begin|, |end|) by forwarding it to pdqsort(), ordering
// elements with |compare|. As the name says, callers must not rely on the
// relative order of equivalent elements being preserved.
template <typename Iter, typename Compare>
void UnstableSort(Iter begin, Iter end, Compare compare) {
  pdqsort(begin, end, compare);
}

// TODO(chokobole): Add StableSort() with compare version.
template <typename Iter>
void StableSort(Iter begin, Iter end) {
algorithms::powersort<Iter> sort;
sort.sort(begin, end);
}

} // namespace tachyon::base

#endif // TACHYON_BASE_SORT_H_
Loading

0 comments on commit b3452b4

Please sign in to comment.