Merge pull request #548 from kroma-network/perf/reduce-redundant-zero-initializations

perf: reduce redundant zero initializations
kroma-network · Oct 8, 2024 · ab62362 · ab62362
2 parents ee3ba4d + 90452b4
commit ab62362
Show file tree

Hide file tree

Showing 55 changed files with 350 additions and 112 deletions.
diff --git a/tachyon/base/parallelize.h b/tachyon/base/parallelize.h
@@ -233,6 +233,73 @@ auto ParallelizeMap(size_t size, Callable callback,
  std::move(callback));
 }
 
+// Assigns |value| to every element of |container|. The work is handed to
+// |Parallelize|, which invokes the callback once per chunk of the container.
+// |threshold| is forwarded to |Parallelize| unchanged.
+template <typename Container>
+void ParallelizeFill(Container& container, typename Container::value_type value,
+                     std::optional<size_t> threshold = std::nullopt) {
+  using Element = typename Container::value_type;
+  Parallelize(
+      container,
+      // |value| outlives the call, so capturing it by reference is safe.
+      [&value](absl::Span<Element> chunk) {
+        std::fill(chunk.begin(), chunk.end(), value);
+      },
+      threshold);
+}
+
+// Resizes |container| to |size| elements.
+//
+// - If the current capacity already covers |size|, this is a plain
+//   |container.resize(size)|.
+// - Otherwise a new std::vector with |size| elements is constructed, the
+//   existing elements are moved into its front through |Parallelize|, and the
+//   new vector is move-assigned to |container|.
+//
+// |threshold| is forwarded to |Parallelize| unchanged.
+template <typename Container>
+void ParallelizeResize(Container& container, size_t size,
+                       std::optional<size_t> threshold = std::nullopt) {
+  using Element = typename Container::value_type;
+
+  // Use |>=| rather than |>|: when the capacity equals |size|, resize() does
+  // not reallocate, so there is no reason to build a second buffer and move
+  // every element into it.
+  if (container.capacity() >= size) {
+    container.resize(size);
+    return;
+  }
+
+  // Here |container.size() <= container.capacity() < size|, so taking the
+  // first |container.size()| elements of the new buffer is always in range.
+  std::vector<Element> new_container(size);
+  auto copy_span = absl::MakeSpan(new_container).first(container.size());
+  Parallelize(
+      copy_span,
+      [&container](absl::Span<Element> chunk, size_t chunk_offset,
+                   size_t chunk_size) {
+        // Index of this chunk's first element within |container|.
+        const size_t start = chunk_offset * chunk_size;
+        for (size_t i = 0; i < chunk.size(); ++i) {
+          chunk[i] = std::move(container[start + i]);
+        }
+      },
+      threshold);
+  container = std::move(new_container);
+}
+
+// Resizes |container| to |size| elements and sets every newly added element to
+// |value|.
+//
+// - If |size| does not exceed the current size, the container is simply
+//   resized (shrunk or left as is); there is nothing to fill.
+// - If the current capacity covers |size|, the container is resized in place
+//   and the new tail is filled with |value| through |Parallelize|.
+// - Otherwise a new std::vector with |size| elements is constructed; through
+//   |Parallelize| each slot receives either the moved old element or |value|,
+//   and the new vector is move-assigned to |container|.
+//
+// |threshold| is forwarded to |Parallelize| unchanged.
+template <typename Container>
+void ParallelizeResize(Container& container, size_t size,
+                       typename Container::value_type value,
+                       std::optional<size_t> threshold = std::nullopt) {
+  using Element = typename Container::value_type;
+  const size_t old_size = container.size();
+
+  // Shrinking or no-op. Without this guard |size - old_size| below would wrap
+  // around (size_t) and be passed to |absl::Span::last|, whose length argument
+  // must not exceed the span size.
+  if (size <= old_size) {
+    container.resize(size);
+    return;
+  }
+
+  // Use |>=| rather than |>|: resize() does not reallocate when the capacity
+  // equals |size|, so the in-place path applies there as well.
+  if (container.capacity() >= size) {
+    container.resize(size);
+    auto init_span = absl::MakeSpan(container).last(size - old_size);
+    Parallelize(
+        init_span,
+        [&value](absl::Span<Element> chunk) {
+          std::fill(chunk.begin(), chunk.end(), value);
+        },
+        threshold);
+    return;
+  }
+
+  std::vector<Element> new_container(size);
+  Parallelize(
+      new_container,
+      [&container, &value, old_size](absl::Span<Element> chunk,
+                                     size_t chunk_offset, size_t chunk_size) {
+        // Index of this chunk's first element within |new_container|.
+        const size_t start = chunk_offset * chunk_size;
+        for (size_t i = 0; i < chunk.size(); ++i) {
+          const size_t idx = start + i;
+          // Slots that existed before take the old element; the rest take
+          // |value|.
+          if (idx < old_size) {
+            chunk[i] = std::move(container[idx]);
+          } else {
+            chunk[i] = value;
+          }
+        }
+      },
+      threshold);
+  container = std::move(new_container);
+}
+
 } // namespace tachyon::base
 
 #endif // TACHYON_BASE_PARALLELIZE_H_
diff --git a/tachyon/crypto/challenger/multi_field32_conversions.h b/tachyon/crypto/challenger/multi_field32_conversions.h
@@ -26,7 +26,7 @@ BigF Reduce(absl::Span<const SmallF> values) {
  using BigInt = typename BigF::BigIntTy;
  CHECK_LT(values.size(), BigInt::kLimbNums * 2);
 
- BigInt ret;
+ BigInt ret(0);
  for (size_t i = 0; i < values.size(); i += 2) {
  uint32_t value = values[i].value();
  if constexpr (SmallF::Config::kUseMontgomery) {

diff --git a/tachyon/crypto/commitments/fri/two_adic_multiplicative_coset.h b/tachyon/crypto/commitments/fri/two_adic_multiplicative_coset.h
@@ -114,6 +114,8 @@ class TwoAdicMultiplicativeCoset {
  domain_->group_gen_inv();
 
  size_t sz = coset.domain()->size();
+ // NOTE(batzor): These vectors are initialized below in the parallel loop, so
+ // it is safe to leave them uninitialized here.
  std::vector<F> first_row(sz);
  std::vector<F> last_row(sz);
  std::vector<F> transition(sz);

diff --git a/tachyon/crypto/commitments/kzg/kzg.h b/tachyon/crypto/commitments/kzg/kzg.h
@@ -114,6 +114,8 @@ class KZG {
 #endif
 
  void ResizeBatchCommitments(size_t size) {
+ // WARN(batzor): When resizing to a larger size, the last values will be
+ // garbage and should be filled with commitment results.
 #if TACHYON_CUDA
  if (msm_gpu_) {
  gpu_batch_commitments_.resize(size);
@@ -124,6 +126,8 @@ class KZG {
  }
 
  std::vector<Commitment> GetBatchCommitments(BatchCommitmentState& state) {
+ // NOTE(batzor): Resizing this vector without initialization is safe since
+ // |BatchNormalize| will overwrite them.
  std::vector<Commitment> batch_commitments;
 #if TACHYON_CUDA
  if (msm_gpu_) {

diff --git a/tachyon/crypto/commitments/pedersen/pedersen.h b/tachyon/crypto/commitments/pedersen/pedersen.h
@@ -85,6 +85,8 @@ class Pedersen final
 #endif
 
  void ResizeBatchCommitments() {
+ // WARN(batzor): When resizing to a larger size, the last values will be
+ // garbage and should be filled with commitment results.
  size_t size = this->batch_commitment_state_.batch_count;
 #if TACHYON_CUDA
  if (msm_gpu_) {
@@ -96,6 +98,8 @@ class Pedersen final
  }
 
  std::vector<Commitment> GetBatchCommitments() {
+ // NOTE(batzor): Resizing this vector without initialization is safe since
+ // |BatchNormalize| will overwrite them.
  std::vector<Commitment> batch_commitments;
 #if TACHYON_CUDA
  if (msm_gpu_) {

diff --git a/tachyon/crypto/hashes/sponge/poseidon/poseidon_sponge_base.h b/tachyon/crypto/hashes/sponge/poseidon/poseidon_sponge_base.h
@@ -70,7 +70,7 @@ struct PoseidonSpongeBase : public FieldBasedCryptographicSponge<Derived> {
  bytes.insert(bytes.end(), elem_bytes.begin(), elem_bytes.end());
  }
 
- bytes.resize(num_bytes);
+ bytes.resize(num_bytes, F::Zero());
  return bytes;
  }
 
@@ -121,6 +121,8 @@ struct PoseidonSpongeBase : public FieldBasedCryptographicSponge<Derived> {
  size_t num_elements) const {
  const Derived& derived = static_cast<const Derived&>(*this);
 
+ // NOTE(batzor): |SqueezeInternal| overwrites every element of |ret|, so it
+ // is safe to leave |ret| uninitialized here.
  std::vector<F> ret(num_elements);
  switch (state.mode.type) {
  case DuplexSpongeMode::Type::kAbsorbing: {

diff --git a/tachyon/math/base/BUILD.bazel b/tachyon/math/base/BUILD.bazel
@@ -66,6 +66,11 @@ tachyon_cc_library(
  ],
 )
 
+tachyon_cc_library(
+ name = "const_init",
+ hdrs = ["const_init.h"],
+)
+
 tachyon_cc_library(
  name = "egcd",
  hdrs = ["egcd.h"],
@@ -88,6 +93,11 @@ tachyon_cc_library(
  ],
 )
 
+tachyon_cc_library(
+ name = "parallelize_threshold",
+ hdrs = ["parallelize_threshold.h"],
+)
+
 tachyon_cc_library(
  name = "rational_field",
  hdrs = ["rational_field.h"],

diff --git a/tachyon/math/base/arithmetics_results.h b/tachyon/math/base/arithmetics_results.h
@@ -9,8 +9,8 @@ namespace tachyon::math {
 
 template <typename T>
 struct AddResult {
- T result{};
- T carry{};
+ T result;
+ T carry{0};
 
  constexpr bool operator==(const AddResult& other) const {
  return result == other.result && carry == other.carry;
@@ -26,8 +26,8 @@ struct AddResult {
 
 template <typename T>
 struct SubResult {
- T result{};
- T borrow{};
+ T result;
+ T borrow{0};
 
  constexpr bool operator==(const SubResult& other) const {
  return result == other.result && borrow == other.borrow;
@@ -43,8 +43,8 @@ struct SubResult {
 
 template <typename T>
 struct MulResult {
- T hi{};
- T lo{};
+ T hi{0};
+ T lo{0};
 
  constexpr bool operator==(const MulResult& other) const {
  return hi == other.hi && lo == other.lo;
@@ -60,8 +60,8 @@ struct MulResult {
 
 template <typename T>
 struct DivResult {
- T quotient{};
- T remainder{};
+ T quotient;
+ T remainder = T::Zero();
 
  constexpr bool operator==(const DivResult& other) const {
  return quotient == other.quotient && remainder == other.remainder;

diff --git a/tachyon/math/base/big_int.h b/tachyon/math/base/big_int.h
@@ -48,9 +48,7 @@ TACHYON_EXPORT std::string LimbsToHexString(const uint64_t* limbs,
 // designed to support a wide range of big integer arithmetic operations.
 template <size_t N>
 struct BigInt {
- uint64_t limbs[N] = {
- 0,
- };
+ uint64_t limbs[N];
  constexpr static size_t kLimbNums = N;
  constexpr static size_t kSmallestLimbIdx = SMALLEST_INDEX(N);
  constexpr static size_t kBiggestLimbIdx = BIGGEST_INDEX(N);
@@ -61,16 +59,16 @@ struct BigInt {
 
  constexpr BigInt() = default;
  template <typename T, std::enable_if_t<std::is_signed_v<T>>* = nullptr>
- constexpr explicit BigInt(T value) {
+ constexpr explicit BigInt(T value) : limbs{0} {
  DCHECK_GE(value, 0);
  limbs[kSmallestLimbIdx] = value;
  }
  template <typename T, std::enable_if_t<std::is_unsigned_v<T>>* = nullptr>
- constexpr explicit BigInt(T value) {
+ constexpr explicit BigInt(T value) : limbs{0} {
  limbs[kSmallestLimbIdx] = value;
  }
  template <typename T, std::enable_if_t<std::is_signed_v<T>>* = nullptr>
- constexpr explicit BigInt(std::initializer_list<T> values) {
+ constexpr explicit BigInt(std::initializer_list<T> values) : limbs{0} {
  DCHECK_LE(values.size(), N);
  auto it = values.begin();
  for (size_t i = 0; i < values.size(); ++i, ++it) {
@@ -79,7 +77,7 @@ struct BigInt {
  }
  }
  template <typename T, std::enable_if_t<std::is_unsigned_v<T>>* = nullptr>
- constexpr explicit BigInt(std::initializer_list<T> values) {
+ constexpr explicit BigInt(std::initializer_list<T> values) : limbs{0} {
  DCHECK_LE(values.size(), N);
  auto it = values.begin();
  for (size_t i = 0; i < values.size(); ++i, ++it) {
@@ -117,14 +115,14 @@ struct BigInt {
 
  // Convert a decimal string to a BigInt.
  static std::optional<BigInt> FromDecString(std::string_view str) {
- BigInt ret;
+ BigInt ret(0);
  if (!internal::StringToLimbs(str, ret.limbs, N)) return std::nullopt;
  return ret;
  }
 
  // Convert a hexadecimal string to a BigInt.
  static std::optional<BigInt> FromHexString(std::string_view str) {
- BigInt ret;
+ BigInt ret(0);
  if (!(internal::HexStringToLimbs(str, ret.limbs, N))) return std::nullopt;
  return ret;
  }
@@ -255,8 +253,8 @@ struct BigInt {
  constexpr BigInt<N2> Extend() const {
  static_assert(N2 > N);
  BigInt<N2> ret;
- for (size_t i = 0; i < N; ++i) {
- ret[i] = limbs[i];
+ for (size_t i = 0; i < N2; ++i) {
+ ret[i] = i < N ? limbs[i] : 0;
  }
  return ret;
  }
@@ -707,8 +705,8 @@ struct BigInt {
  LOG_IF_NOT_GPU(ERROR) << "Division by zero attempted";
  return false;
  }
- BigInt quotient;
- BigInt remainder;
+ BigInt quotient(0);
+ BigInt remainder(0);
  size_t bits = BitTraits<BigInt>::GetNumBits(*this);
  uint64_t carry = 0;
  uint64_t& smallest_bit = remainder.limbs[kSmallestLimbIdx];

diff --git a/tachyon/math/base/const_init.h b/tachyon/math/base/const_init.h
@@ -0,0 +1,24 @@
+#ifndef TACHYON_MATH_BASE_CONST_INIT_H_
+#define TACHYON_MATH_BASE_CONST_INIT_H_
+
+namespace tachyon::math {
+
+enum ZeroInitType {  // Tag enum with the single enumerator |kZeroInit|.
+ kZeroInit,  // NOTE(review): presumably passed to select a "zero" initialization overload -- confirm against users.
+};
+
+enum OneInitType {  // Tag enum with the single enumerator |kOneInit|.
+ kOneInit,  // NOTE(review): presumably requests initialization to one -- confirm against users.
+};
+
+enum MinusOneInitType {  // Tag enum with the single enumerator |kMinusOneInit|.
+ kMinusOneInit,  // NOTE(review): presumably requests initialization to minus one -- confirm against users.
+};
+
+enum TwoInvInitType {  // Tag enum with the single enumerator |kTwoInvInit|.
+ kTwoInvInit,  // NOTE(review): presumably requests initialization to the inverse of two -- confirm against users.
+};
+
+} // namespace tachyon::math
+
+#endif // TACHYON_MATH_BASE_CONST_INIT_H_
diff --git a/tachyon/math/base/parallelize_threshold.h b/tachyon/math/base/parallelize_threshold.h
@@ -0,0 +1,15 @@
+#ifndef TACHYON_MATH_BASE_PARALLELIZE_THRESHOLD_H_
+#define TACHYON_MATH_BASE_PARALLELIZE_THRESHOLD_H_
+
+namespace tachyon::math {
+
+struct ParallelizeThreshold {  // Named loop-size thresholds, grouped as static constants.
+ // Thresholds for parallelizing a loop: a loop whose size is below the
+ // applicable threshold is meant to be executed sequentially.
+ static constexpr int kFieldInit = 1e6;  // 1,000,000. NOTE(review): by its name, for field initialization loops -- confirm at call sites.
+ static constexpr int kFieldSimpleOp = 1e5;  // 100,000. NOTE(review): by its name, for simple per-element field operations -- confirm at call sites.
+};
+
+} // namespace tachyon::math
+
+#endif // TACHYON_MATH_BASE_PARALLELIZE_THRESHOLD_H_
diff --git a/tachyon/math/base/rational_field.h b/tachyon/math/base/rational_field.h
@@ -30,7 +30,7 @@ class RationalField : public Field<RationalField<F>> {
  : numerator_(std::move(numerator)),
  denominator_(std::move(denominator)) {}
 
- constexpr static RationalField Zero() { return RationalField(); }
+ constexpr static RationalField Zero() { return RationalField(F::Zero()); }
 
  constexpr static RationalField One() { return RationalField(F::One()); }
 

diff --git a/tachyon/math/elliptic_curves/bn/generator/generator.cc b/tachyon/math/elliptic_curves/bn/generator/generator.cc
@@ -13,7 +13,7 @@ namespace tachyon {
 
 template <size_t N>
 std::vector<int8_t> ComputeAteLoopCount(const mpz_class& six_x_plus_2) {
- math::BigInt<N> x;
+ math::BigInt<N> x(0);
  math::gmp::CopyLimbs(six_x_plus_2, x.limbs);
  return x.ToNAF();
 }

diff --git a/tachyon/math/finite_fields/fp12.h b/tachyon/math/finite_fields/fp12.h
@@ -341,7 +341,7 @@ class Fp12 final : public QuadraticExtensionField<Fp12<Config>> {
  // |kFrobeniusCoeffs[0]| = q^((P⁰ - 1) / 6)
  Config::kFrobeniusCoeffs[0] = FrobeniusCoefficient::One();
 #define SET_FROBENIUS_COEFF(d) \
- BigInt<d * N> exp##d;  \
+ BigInt<d * N> exp##d(0); \
  gmp::CopyLimbs(exp##d##_gmp, exp##d.limbs); \
  Config::kFrobeniusCoeffs[d] = BaseFieldConfig::kNonResidue.Pow(exp##d)
  // |kFrobeniusCoeffs[1]| = q^(exp₁) = q^((P¹ - 1) / 6)

diff --git a/tachyon/math/finite_fields/fp3.h b/tachyon/math/finite_fields/fp3.h
@@ -81,7 +81,7 @@ class Fp3 final : public CubicExtensionField<Fp3<Config>> {
  // |kFrobeniusCoeffs[0]| = q^((P⁰ - 1) / 3) = 1
  Config::kFrobeniusCoeffs[0] = FrobeniusCoefficient::One();
 #define SET_FROBENIUS_COEFF(d) \
- BigInt<d * N> exp##d;  \
+ BigInt<d * N> exp##d(0); \
  gmp::CopyLimbs(exp##d##_gmp, exp##d.limbs); \
  Config::kFrobeniusCoeffs[d] = Config::kNonResidue.Pow(exp##d)
 

diff --git a/tachyon/math/finite_fields/fp4.h b/tachyon/math/finite_fields/fp4.h
@@ -89,7 +89,7 @@ class Fp4<Config, std::enable_if_t<Config::kDegreeOverBaseField == 2>> final
  // |kFrobeniusCoeffs[0]| = q^((P⁰ - 1) / 4) = 1
  Config::kFrobeniusCoeffs[0] = FrobeniusCoefficient::One();
 #define SET_FROBENIUS_COEFF(d) \
- BigInt<d * N> exp##d;  \
+ BigInt<d * N> exp##d(0); \
  gmp::CopyLimbs(exp##d##_gmp, exp##d.limbs); \
  Config::kFrobeniusCoeffs[d] = BaseFieldConfig::kNonResidue.Pow(exp##d)
 
@@ -176,7 +176,7 @@ class Fp4<Config, std::enable_if_t<Config::kDegreeOverBaseField == 4>> final
  // |kFrobeniusCoeffs[0]| = q^((P⁰ - 1) / 4) = 1
  Config::kFrobeniusCoeffs[0] = FrobeniusCoefficient::One();
 #define SET_FROBENIUS_COEFF(d) \
- BigInt<d * N> exp##d;  \
+ BigInt<d * N> exp##d(0); \
  gmp::CopyLimbs(exp##d##_gmp, exp##d.limbs); \
  Config::kFrobeniusCoeffs[d] = Config::kNonResidue.Pow(exp##d)