From ef89b133b7e3ace91e11826ee7a3993bb73c966a Mon Sep 17 00:00:00 2001 From: Ryan Kim Date: Fri, 9 Aug 2024 12:43:14 +0900 Subject: [PATCH 01/11] refac: use `uint32_t` to represent log size instead of `size_t` --- .../crypto/commitments/fri/two_adic_fri_config.h | 2 +- tachyon/crypto/commitments/fri/two_adic_fri_pcs.h | 2 +- .../field_merkle_tree/field_merkle_tree_mmcs.h | 6 +++--- tachyon/math/base/semigroups.h | 4 ++-- .../msm/algorithms/icicle/icicle_msm_utils.cc | 2 +- .../polynomials/univariate/evaluations_utils.h | 2 +- .../univariate/radix2_evaluation_domain.h | 14 +++++++------- .../radix2_evaluation_domain_unittest.cc | 4 ++-- .../univariate_evaluation_domain_unittest.cc | 8 ++++---- .../plonky3/base/two_adic_multiplicative_coset.h | 4 ++-- 10 files changed, 24 insertions(+), 24 deletions(-) diff --git a/tachyon/crypto/commitments/fri/two_adic_fri_config.h b/tachyon/crypto/commitments/fri/two_adic_fri_config.h index a4764eb910..45db1757d0 100644 --- a/tachyon/crypto/commitments/fri/two_adic_fri_config.h +++ b/tachyon/crypto/commitments/fri/two_adic_fri_config.h @@ -63,7 +63,7 @@ std::vector FoldMatrix(const ExtF& beta, // NOTE(ashjeong): |arity| is subject to change in the future template -ExtF FoldRow(size_t index, size_t log_num_rows, const ExtF& beta, +ExtF FoldRow(size_t index, uint32_t log_num_rows, const ExtF& beta, const std::vector& evals) { using F = typename math::ExtensionFieldTraits::BaseField; const size_t kArity = 2; diff --git a/tachyon/crypto/commitments/fri/two_adic_fri_pcs.h b/tachyon/crypto/commitments/fri/two_adic_fri_pcs.h index a20d2396db..cd5cde7114 100644 --- a/tachyon/crypto/commitments/fri/two_adic_fri_pcs.h +++ b/tachyon/crypto/commitments/fri/two_adic_fri_pcs.h @@ -204,7 +204,7 @@ class TwoAdicFriPCS { // Batch combination challenge const ExtF alpha = challenger.template SampleExtElement(); VLOG(2) << "FRI(alpha): " << alpha.ToHexString(true); - size_t log_global_max_num_rows = + uint32_t log_global_max_num_rows = proof.commit_phase_commits.size() + fri_.log_blowup; return TwoAdicFriPCSVerify( fri_, proof, challenger, diff --git a/tachyon/crypto/commitments/merkle_tree/field_merkle_tree/field_merkle_tree_mmcs.h b/tachyon/crypto/commitments/merkle_tree/field_merkle_tree/field_merkle_tree_mmcs.h index a0a69c5d78..6600c6c8c3 100644 --- a/tachyon/crypto/commitments/merkle_tree/field_merkle_tree/field_merkle_tree_mmcs.h +++ b/tachyon/crypto/commitments/merkle_tree/field_merkle_tree/field_merkle_tree_mmcs.h @@ -105,15 +105,15 @@ class FieldMerkleTreeMMCS final std::vector>* openings, Proof* proof) const { size_t max_row_size = this->GetMaxRowSize(prover_data); - size_t log_max_row_size = base::bits::Log2Ceiling(max_row_size); + uint32_t log_max_row_size = base::bits::Log2Ceiling(max_row_size); // TODO(chokobole): Is it able to be parallelized? *openings = base::Map( prover_data.leaves(), [log_max_row_size, index](const math::RowMajorMatrix& matrix) { - size_t log_row_size = + uint32_t log_row_size = base::bits::Log2Ceiling(static_cast(matrix.rows())); - size_t bits_reduced = log_max_row_size - log_row_size; + uint32_t bits_reduced = log_max_row_size - log_row_size; size_t reduced_index = index >> bits_reduced; return base::CreateVector(matrix.cols(), [reduced_index, &matrix](size_t col) { diff --git a/tachyon/math/base/semigroups.h b/tachyon/math/base/semigroups.h index a3c9a0906c..524a68976a 100644 --- a/tachyon/math/base/semigroups.h +++ b/tachyon/math/base/semigroups.h @@ -224,7 +224,7 @@ class MultiplicativeSemigroup { constexpr static std::vector GetBitRevIndexSuccessivePowers( size_t size, const G& generator, const G& c = G::One()) { std::vector ret(size); - size_t log_size = base::bits::CheckedLog2(size); + uint32_t log_size = base::bits::CheckedLog2(size); base::Parallelize( ret, [log_size, &generator, &c, &ret]( absl::Span chunk, size_t chunk_offset, size_t chunk_size) { @@ -248,7 +248,7 @@ class MultiplicativeSemigroup { constexpr static std::vector GetBitRevIndexSuccessivePowersSerial( size_t size, const G& generator, const G& c = G::One()) { std::vector ret(size); - size_t log_size = base::bits::CheckedLog2(size); + uint32_t log_size = base::bits::CheckedLog2(size); MulResult pow = c.IsOne() ? G::One() : c; for (size_t idx = 0; idx < size - 1; ++idx) { ret[base::bits::ReverseBitsLen(idx, log_size)] = pow; diff --git a/tachyon/math/elliptic_curves/msm/algorithms/icicle/icicle_msm_utils.cc b/tachyon/math/elliptic_curves/msm/algorithms/icicle/icicle_msm_utils.cc index f88c143943..0288de9451 100644 --- a/tachyon/math/elliptic_curves/msm/algorithms/icicle/icicle_msm_utils.cc +++ b/tachyon/math/elliptic_curves/msm/algorithms/icicle/icicle_msm_utils.cc @@ -16,7 +16,7 @@ size_t DetermineMsmDivisionsForMemory(size_t scalar_t_mem_size, size_t free_memory = device::gpu::GpuMemLimitInfo(device::gpu::MemoryUsage::kHigh); size_t shift = 0; - size_t log_msm_size = base::bits::Log2Ceiling(msm_size); + uint32_t log_msm_size = base::bits::Log2Ceiling(msm_size); for (size_t number_of_divisions = 0; number_of_divisions < log_msm_size; ++number_of_divisions) { diff --git a/tachyon/math/polynomials/univariate/evaluations_utils.h b/tachyon/math/polynomials/univariate/evaluations_utils.h index 2a186a1661..3c08074897 100644 --- a/tachyon/math/polynomials/univariate/evaluations_utils.h +++ b/tachyon/math/polynomials/univariate/evaluations_utils.h @@ -29,7 +29,7 @@ std::vector SwapBitRevElements(const std::vector& vals) { // element at index 4(100) are swapped. template void SwapBitRevElementsInPlace(Container& container, size_t size, - size_t log_len) { + uint32_t log_len) { TRACE_EVENT("Utils", "SwapBitRevElementsInPlace"); if (size <= 1) return; OMP_PARALLEL_FOR(size_t idx = 1; idx < size; ++idx) { diff --git a/tachyon/math/polynomials/univariate/radix2_evaluation_domain.h b/tachyon/math/polynomials/univariate/radix2_evaluation_domain.h index 8e6d4fc345..1c72ce7395 100644 --- a/tachyon/math/polynomials/univariate/radix2_evaluation_domain.h +++ b/tachyon/math/polynomials/univariate/radix2_evaluation_domain.h @@ -99,7 +99,7 @@ class Radix2EvaluationDomain : public UnivariateEvaluationDomain, NOTREACHED(); } CHECK_EQ(this->size_, static_cast(mat.rows())); - size_t log_n = this->log_size_of_group_; + uint32_t log_n = this->log_size_of_group_; mid_ = log_n / 2; // The first half looks like a normal DIT. @@ -120,7 +120,7 @@ class Radix2EvaluationDomain : public UnivariateEvaluationDomain, NOTREACHED(); } CHECK_EQ(this->size_, static_cast(mat.rows())); - size_t log_n = this->log_size_of_group_; + uint32_t log_n = this->log_size_of_group_; mid_ = log_n / 2; // The first half looks like a normal DIT. @@ -409,7 +409,7 @@ class Radix2EvaluationDomain : public UnivariateEvaluationDomain, size_t cur_chunk_rows = std::min(chunk_rows, this->size_ - block_start); Eigen::Block> submat = mat.block(block_start, 0, cur_chunk_rows, cols); - for (size_t layer = 0; layer < mid_; ++layer) { + for (uint32_t layer = 0; layer < mid_; ++layer) { RunDitLayers(submat, layer, absl::MakeSpan(twiddles), absl::MakeSpan(packed_twiddles_rev), false); } @@ -440,7 +440,7 @@ class Radix2EvaluationDomain : public UnivariateEvaluationDomain, size_t cur_chunk_rows = std::min(chunk_rows, this->size_ - block_start); Eigen::Block> submat = mat.block(block_start, 0, cur_chunk_rows, cols); - for (size_t layer = mid_; layer < this->log_size_of_group_; ++layer) { + for (uint32_t layer = mid_; layer < this->log_size_of_group_; ++layer) { size_t first_block = thread << (layer - mid_); RunDitLayers(submat, layer, absl::MakeSpan(twiddles_rev.data() + first_block, @@ -454,14 +454,14 @@ class Radix2EvaluationDomain : public UnivariateEvaluationDomain, } CONSTEXPR_IF_NOT_OPENMP void RunDitLayers( - Eigen::Block>& submat, size_t layer, + Eigen::Block>& submat, uint32_t layer, const absl::Span& twiddles, const absl::Span& packed_twiddles, bool rev) { TRACE_EVENT("EvaluationDomain", "RunDitLayers"); if constexpr (F::Config::kModulusBits > 32) { NOTREACHED(); } - size_t layer_rev = this->log_size_of_group_ - 1 - layer; + uint32_t layer_rev = this->log_size_of_group_ - 1 - layer; size_t half_block_size = size_t{1} << (rev ? layer_rev : layer); size_t block_size = half_block_size * 2; size_t sub_rows = static_cast(submat.rows()); @@ -515,7 +515,7 @@ class Radix2EvaluationDomain : public UnivariateEvaluationDomain, } } - size_t mid_ = 0; + uint32_t mid_ = 0; // For small prime fields std::vector rev_roots_vec_; std::vector rev_inv_roots_vec_; diff --git a/tachyon/math/polynomials/univariate/radix2_evaluation_domain_unittest.cc b/tachyon/math/polynomials/univariate/radix2_evaluation_domain_unittest.cc index 6fb752949d..df8dc76d15 100644 --- a/tachyon/math/polynomials/univariate/radix2_evaluation_domain_unittest.cc +++ b/tachyon/math/polynomials/univariate/radix2_evaluation_domain_unittest.cc @@ -25,7 +25,7 @@ TYPED_TEST_SUITE(Radix2EvaluationDomainTest, PrimeFieldTypes); TYPED_TEST(Radix2EvaluationDomainTest, FFTBatch) { using F = TypeParam; - for (size_t log_r = 0; log_r < 5; ++log_r) { + for (uint32_t log_r = 0; log_r < 5; ++log_r) { RowMajorMatrix expected = RowMajorMatrix::Random(size_t{1} << log_r, 3); RowMajorMatrix result = expected; @@ -40,7 +40,7 @@ TYPED_TEST(Radix2EvaluationDomainTest, FFTBatch) { TYPED_TEST(Radix2EvaluationDomainTest, CosetLDEBatch) { using F = TypeParam; - for (size_t log_r = 0; log_r < 5; ++log_r) { + for (uint32_t log_r = 0; log_r < 5; ++log_r) { RowMajorMatrix expected = RowMajorMatrix::Random(size_t{1} << log_r, 3); RowMajorMatrix result = expected; diff --git a/tachyon/math/polynomials/univariate/univariate_evaluation_domain_unittest.cc b/tachyon/math/polynomials/univariate/univariate_evaluation_domain_unittest.cc index 1e1044dcab..5d56e1c735 100644 --- a/tachyon/math/polynomials/univariate/univariate_evaluation_domain_unittest.cc +++ b/tachyon/math/polynomials/univariate/univariate_evaluation_domain_unittest.cc @@ -90,11 +90,11 @@ TYPED_TEST(UnivariateEvaluationDomainTest, FilterPolynomial) { using DensePoly = typename Domain::DensePoly; if constexpr (std::is_same_v) { - for (size_t log_domain_size = 1; log_domain_size < 4; ++log_domain_size) { + for (uint32_t log_domain_size = 1; log_domain_size < 4; ++log_domain_size) { size_t domain_size = size_t{1} << log_domain_size; std::unique_ptr domain = Domain::Create(domain_size); - for (size_t log_subdomain_size = 1; log_subdomain_size <= log_domain_size; - ++log_subdomain_size) { + for (uint32_t log_subdomain_size = 1; + log_subdomain_size <= log_domain_size; ++log_subdomain_size) { size_t subdomain_size = size_t{1} << log_subdomain_size; std::unique_ptr subdomain = Domain::Create(subdomain_size); @@ -263,7 +263,7 @@ TYPED_TEST(UnivariateEvaluationDomainTest, FFTCorrectness) { const size_t kLogDegree = 5; const size_t kDegree = (size_t{1} << kLogDegree) - 1; DensePoly rand_poly = DensePoly::Random(kDegree); - for (size_t log_domain_size = kLogDegree; log_domain_size < kLogDegree + 2; + for (uint32_t log_domain_size = kLogDegree; log_domain_size < kLogDegree + 2; ++log_domain_size) { size_t domain_size = size_t{1} << log_domain_size; this->TestDomains( diff --git a/tachyon/zk/air/plonky3/base/two_adic_multiplicative_coset.h b/tachyon/zk/air/plonky3/base/two_adic_multiplicative_coset.h index b468f221f3..19bf96d6f5 100644 --- a/tachyon/zk/air/plonky3/base/two_adic_multiplicative_coset.h +++ b/tachyon/zk/air/plonky3/base/two_adic_multiplicative_coset.h @@ -26,7 +26,7 @@ class TwoAdicMultiplicativeCoset { public: constexpr TwoAdicMultiplicativeCoset() = default; - TwoAdicMultiplicativeCoset(size_t log_n, const F& shift) { + TwoAdicMultiplicativeCoset(uint32_t log_n, const F& shift) { domain_.reset(static_cast*>( math::Radix2EvaluationDomain::Create(size_t{1} << log_n) ->GetCoset(shift) @@ -88,7 +88,7 @@ class TwoAdicMultiplicativeCoset { CHECK_EQ(domain_->offset(), F::One()); CHECK_NE(coset_shift, F::One()); CHECK_GE(domain_->log_size_of_group(), coset.domain()->log_size_of_group()); - size_t rate_bits = + uint32_t rate_bits = coset.domain()->log_size_of_group() - domain_->log_size_of_group(); F s_pow_n = coset_shift.ExpPowOfTwo(domain_->log_size_of_group()); From b2c7e3f40dbf26788804be1c6f22f61d78656559 Mon Sep 17 00:00:00 2001 From: Ryan Kim Date: Fri, 9 Aug 2024 14:20:25 +0900 Subject: [PATCH 02/11] perf(zk): use `ExpOfPowTwo()` instead of `Pow()` --- tachyon/zk/air/plonky3/base/two_adic_multiplicative_coset.h | 3 ++- tachyon/zk/plonk/halo2/prover.h | 2 +- tachyon/zk/plonk/halo2/verifier.h | 2 +- tachyon/zk/plonk/vanishing/vanishing_utils.h | 5 +++-- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/tachyon/zk/air/plonky3/base/two_adic_multiplicative_coset.h b/tachyon/zk/air/plonky3/base/two_adic_multiplicative_coset.h index 19bf96d6f5..3315a0c90e 100644 --- a/tachyon/zk/air/plonky3/base/two_adic_multiplicative_coset.h +++ b/tachyon/zk/air/plonky3/base/two_adic_multiplicative_coset.h @@ -110,7 +110,8 @@ class TwoAdicMultiplicativeCoset { CHECK(F::BatchInverseInPlaceSerial(inv_denoms_inv_zeroifier_chunk)); }); - F coset_i = domain_->group_gen().Pow(domain_->size() - 1); + F coset_i = domain_->group_gen().ExpPowOfTwo(domain_->log_size_of_group()) * + domain_->group_gen_inv(); size_t sz = coset.domain()->size(); std::vector first_row(sz); diff --git a/tachyon/zk/plonk/halo2/prover.h b/tachyon/zk/plonk/halo2/prover.h index 552a4f30ba..939ba91f46 100644 --- a/tachyon/zk/plonk/halo2/prover.h +++ b/tachyon/zk/plonk/halo2/prover.h @@ -339,7 +339,7 @@ class Prover : public ProverBase { proving_key.verifying_key().constraint_system(); const F& x = permutation_opening_point_set.x; - F x_n = x.Pow(this->pcs_.N()); + F x_n = x.ExpPowOfTwo(this->pcs_.K()); vanishing_prover.BatchEvaluate(this, constraint_system, poly_tables, x, x_n); PermutationProver::EvaluateProvingKey( diff --git a/tachyon/zk/plonk/halo2/verifier.h b/tachyon/zk/plonk/halo2/verifier.h index 4ab48d2cb5..15e9a7727e 100644 --- a/tachyon/zk/plonk/halo2/verifier.h +++ b/tachyon/zk/plonk/halo2/verifier.h @@ -142,7 +142,7 @@ class Verifier : public VerifierBase { proof.x_prev = Rotation::Prev().RotateOmega(this->domain(), proof.x); proof.x_last = Rotation(-(blinding_factors + 1)).RotateOmega(this->domain(), proof.x); - proof.x_n = proof.x.Pow(this->pcs_.N()); + proof.x_n = proof.x.ExpPowOfTwo(this->pcs_.K()); } bool ValidateInstanceColumnsVec( diff --git a/tachyon/zk/plonk/vanishing/vanishing_utils.h b/tachyon/zk/plonk/vanishing/vanishing_utils.h index 626dbd7e6d..d90ea417f6 100644 --- a/tachyon/zk/plonk/vanishing/vanishing_utils.h +++ b/tachyon/zk/plonk/vanishing/vanishing_utils.h @@ -80,8 +80,9 @@ ExtendedEvals& DivideByVanishingPolyInPlace( << (extended_domain->log_size_of_group() - domain->log_size_of_group()); // |coset_gen_pow_n| = w'ⁿ where w' is generator of extended domain. - const F coset_gen_pow_n = extended_domain->group_gen().Pow(domain->size()); - const F zeta_pow_n = zeta.Pow(domain->size()); + const F coset_gen_pow_n = + extended_domain->group_gen().ExpPowOfTwo(domain->log_size_of_group()); + const F zeta_pow_n = zeta.ExpPowOfTwo(domain->log_size_of_group()); std::vector t_evaluations(kTEvaluationsSize); // |t_evaluations| = [ζⁿ - 1, ζⁿ * w'ⁿ - 1, ζⁿ * w'²ⁿ - 1, ...] base::Parallelize(t_evaluations, [&coset_gen_pow_n, &zeta_pow_n]( From a03107bb713a2761721aa8a1b45748e190e9452c Mon Sep 17 00:00:00 2001 From: Ryan Kim Date: Fri, 9 Aug 2024 14:22:47 +0900 Subject: [PATCH 03/11] chore(math): fix the comment in `UnivariateEvaluationDomain` --- .../polynomials/univariate/univariate_evaluation_domain.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tachyon/math/polynomials/univariate/univariate_evaluation_domain.h b/tachyon/math/polynomials/univariate/univariate_evaluation_domain.h index 62fc108648..14bd1b0017 100644 --- a/tachyon/math/polynomials/univariate/univariate_evaluation_domain.h +++ b/tachyon/math/polynomials/univariate/univariate_evaluation_domain.h @@ -58,9 +58,9 @@ class UnivariateEvaluationDomain : public EvaluationDomain { size_inv_ = unwrap(size_as_field_element_.Inverse()); // Compute the generator for the multiplicative subgroup. - // It should be the 2^|log_size_of_group_| root of unity. + // It should be the |size_|-th root of unity. CHECK(F::GetRootOfUnity(size_, &group_gen_)); - // Check that it is indeed the 2^(log_size_of_group) root of unity. + // Check that it is indeed the |size_|-th root of unity. DCHECK_EQ(group_gen_.Pow(size_), F::One()); group_gen_inv_ = unwrap(group_gen_.Inverse()); #if TACHYON_CUDA From 2132a429e1d53b55f8ae9cbb8f03961e7454415b Mon Sep 17 00:00:00 2001 From: Wonyong Kim Date: Sat, 10 Aug 2024 20:07:16 +0900 Subject: [PATCH 04/11] refac(math): remove redundant template type --- .../math/polynomials/univariate/radix2_evaluation_domain.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tachyon/math/polynomials/univariate/radix2_evaluation_domain.h b/tachyon/math/polynomials/univariate/radix2_evaluation_domain.h index 1c72ce7395..75ac932feb 100644 --- a/tachyon/math/polynomials/univariate/radix2_evaluation_domain.h +++ b/tachyon/math/polynomials/univariate/radix2_evaluation_domain.h @@ -502,14 +502,14 @@ class Radix2EvaluationDomain : public UnivariateEvaluationDomain, { TRACE_EVENT("Subtask", "ButterflyOMPLoop"); OMP_PARALLEL_FOR(size_t i = 0; i < shorts_1.size(); ++i) { - UnivariateEvaluationDomain::template ButterflyFnOutIn< - PackedPrimeField>(*shorts_1[i], *shorts_2[i], packed_twiddle); + UnivariateEvaluationDomain::template ButterflyFnOutIn( + *shorts_1[i], *shorts_2[i], packed_twiddle); } } { TRACE_EVENT("Subtask", "ButterflyLoop"); for (size_t i = 0; i < suffix_1.size(); ++i) { - UnivariateEvaluationDomain::template ButterflyFnOutIn( + UnivariateEvaluationDomain::template ButterflyFnOutIn( *suffix_1[i], *suffix_2[i], twiddle); } } From f0f235abab78dda1cfcf09a08613baef0071b442 Mon Sep 17 00:00:00 2001 From: Wonyong Kim Date: Sat, 10 Aug 2024 20:16:53 +0900 Subject: [PATCH 05/11] refac(math): pass `absl::Span` by value See https://quuxplusone.github.io/blog/2021/11/09/pass-string-view-by-value/ --- .../math/polynomials/univariate/radix2_evaluation_domain.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tachyon/math/polynomials/univariate/radix2_evaluation_domain.h b/tachyon/math/polynomials/univariate/radix2_evaluation_domain.h index 75ac932feb..b83cefa962 100644 --- a/tachyon/math/polynomials/univariate/radix2_evaluation_domain.h +++ b/tachyon/math/polynomials/univariate/radix2_evaluation_domain.h @@ -455,8 +455,8 @@ class Radix2EvaluationDomain : public UnivariateEvaluationDomain, CONSTEXPR_IF_NOT_OPENMP void RunDitLayers( Eigen::Block>& submat, uint32_t layer, - const absl::Span& twiddles, - const absl::Span& packed_twiddles, bool rev) { + absl::Span twiddles, + absl::Span packed_twiddles, bool rev) { TRACE_EVENT("EvaluationDomain", "RunDitLayers"); if constexpr (F::Config::kModulusBits > 32) { NOTREACHED(); From a9a686a4035f3e71bce43f34b6a9a0b0bf3b193f Mon Sep 17 00:00:00 2001 From: Wonyong Kim Date: Sat, 10 Aug 2024 21:32:00 +0900 Subject: [PATCH 06/11] refac(math): remove unneeded scope --- .../univariate/radix2_evaluation_domain.h | 73 +++++++++---------- 1 file changed, 34 insertions(+), 39 deletions(-) diff --git a/tachyon/math/polynomials/univariate/radix2_evaluation_domain.h b/tachyon/math/polynomials/univariate/radix2_evaluation_domain.h index b83cefa962..645383bb69 100644 --- a/tachyon/math/polynomials/univariate/radix2_evaluation_domain.h +++ b/tachyon/math/polynomials/univariate/radix2_evaluation_domain.h @@ -342,28 +342,25 @@ class Radix2EvaluationDomain : public UnivariateEvaluationDomain, } } - { - TRACE_EVENT("Subtask", "PrepareRootsVec"); + TRACE_EVENT("Subtask", "PrepareRootsVec"); - roots_vec_[this->log_size_of_group_ - 1] = std::move(largest); - inv_roots_vec_[0] = std::move(largest_inv); + roots_vec_[this->log_size_of_group_ - 1] = std::move(largest); + inv_roots_vec_[0] = std::move(largest_inv); - // Prepare space in each vector for the others. - size_t size = this->size_ / 2; - for (size_t i = 1; i < this->log_size_of_group_; ++i) { - size /= 2; - roots_vec_[this->log_size_of_group_ - i - 1].resize(size); - inv_roots_vec_[i].resize(size); - } + // Prepare space in each vector for the others. + size_t size = this->size_ / 2; + for (size_t i = 1; i < this->log_size_of_group_; ++i) { + size /= 2; + roots_vec_[this->log_size_of_group_ - i - 1].resize(size); + inv_roots_vec_[i].resize(size); + } - // Assign every element based on the biggest vector. - OMP_PARALLEL_FOR(size_t i = 1; i < this->log_size_of_group_; ++i) { - for (size_t j = 0; j < this->size_ / std::pow(2, i + 1); ++j) { - size_t k = std::pow(2, i) * j; - roots_vec_[this->log_size_of_group_ - i - 1][j] = - roots_vec_.back()[k]; - inv_roots_vec_[i][j] = inv_roots_vec_.front()[k]; - } + // Assign every element based on the biggest vector. + OMP_PARALLEL_FOR(size_t i = 1; i < this->log_size_of_group_; ++i) { + for (size_t j = 0; j < this->size_ / std::pow(2, i + 1); ++j) { + size_t k = std::pow(2, i) * j; + roots_vec_[this->log_size_of_group_ - i - 1][j] = roots_vec_.back()[k]; + inv_roots_vec_[i][j] = inv_roots_vec_.front()[k]; } } } @@ -429,26 +426,24 @@ class Radix2EvaluationDomain : public UnivariateEvaluationDomain, size_t cols = static_cast(mat.cols()); size_t chunk_rows = size_t{1} << (this->log_size_of_group_ - mid_); - { - TRACE_EVENT("Subtask", "RunDitLayersLoop"); - // max block size: 2^(|this->log_size_of_group_| - |mid_|) - // TODO(ashjeong): benchmark between |OMP_PARALLEL_FOR| here vs - // |OMP_PARALLEL_NESTED_FOR| in |RunDitLayers| - for (size_t block_start = 0; block_start < this->size_; - block_start += chunk_rows) { - size_t thread = block_start / chunk_rows; - size_t cur_chunk_rows = std::min(chunk_rows, this->size_ - block_start); - Eigen::Block> submat = - mat.block(block_start, 0, cur_chunk_rows, cols); - for (uint32_t layer = mid_; layer < this->log_size_of_group_; ++layer) { - size_t first_block = thread << (layer - mid_); - RunDitLayers(submat, layer, - absl::MakeSpan(twiddles_rev.data() + first_block, - twiddles_rev.size() - first_block), - absl::MakeSpan(packed_twiddles_rev.data() + first_block, - packed_twiddles_rev.size() - first_block), - true); - } + TRACE_EVENT("Subtask", "RunDitLayersLoop"); + // max block size: 2^(|this->log_size_of_group_| - |mid_|) + // TODO(ashjeong): benchmark between |OMP_PARALLEL_FOR| here vs + // |OMP_PARALLEL_NESTED_FOR| in |RunDitLayers| + for (size_t block_start = 0; block_start < this->size_; + block_start += chunk_rows) { + size_t thread = block_start / chunk_rows; + size_t cur_chunk_rows = std::min(chunk_rows, this->size_ - block_start); + Eigen::Block> submat = + mat.block(block_start, 0, cur_chunk_rows, cols); + for (uint32_t layer = mid_; layer < this->log_size_of_group_; ++layer) { + size_t first_block = thread << (layer - mid_); + RunDitLayers(submat, layer, + absl::MakeSpan(twiddles_rev.data() + first_block, + twiddles_rev.size() - first_block), + absl::MakeSpan(packed_twiddles_rev.data() + first_block, + packed_twiddles_rev.size() - first_block), + true); } } } From 631b720518d6ce3aa3a0e13c0ccc79d93087116b Mon Sep 17 00:00:00 2001 From: Wonyong Kim Date: Sat, 10 Aug 2024 21:33:46 +0900 Subject: [PATCH 07/11] perf(math): parallelize loop that runs over domain size --- tachyon/math/polynomials/univariate/radix2_evaluation_domain.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tachyon/math/polynomials/univariate/radix2_evaluation_domain.h b/tachyon/math/polynomials/univariate/radix2_evaluation_domain.h index 645383bb69..fe1f1e9cff 100644 --- a/tachyon/math/polynomials/univariate/radix2_evaluation_domain.h +++ b/tachyon/math/polynomials/univariate/radix2_evaluation_domain.h @@ -331,7 +331,7 @@ class Radix2EvaluationDomain : public UnivariateEvaluationDomain, packed_inv_roots_vec_[1].resize(vec_largest_size); rev_roots_vec_ = SwapBitRevElements(largest); rev_inv_roots_vec_ = SwapBitRevElements(largest_inv); - for (size_t i = 0; i < vec_largest_size; ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < vec_largest_size; ++i) { packed_roots_vec_[0][i] = PackedPrimeField::Broadcast(largest[i]); packed_inv_roots_vec_[0][i] = PackedPrimeField::Broadcast(largest_inv[i]); From 5e2769a1361ea91735a4b17425a1db1a187cdc5e Mon Sep 17 00:00:00 2001 From: Wonyong Kim Date: Sat, 10 Aug 2024 21:57:32 +0900 Subject: [PATCH 08/11] perf: copy value if F is guaranteed to be small --- tachyon/crypto/commitments/fri/two_adic_fri_pcs.h | 6 +++--- .../math/polynomials/univariate/radix2_evaluation_domain.h | 4 ++-- tachyon/math/polynomials/univariate/two_adic_subgroup.h | 5 ++--- tachyon/zk/air/plonky3/base/two_adic_multiplicative_coset.h | 4 ++-- 4 files changed, 9 insertions(+), 10 deletions(-) diff --git a/tachyon/crypto/commitments/fri/two_adic_fri_pcs.h b/tachyon/crypto/commitments/fri/two_adic_fri_pcs.h index cd5cde7114..8609d8bc91 100644 --- a/tachyon/crypto/commitments/fri/two_adic_fri_pcs.h +++ b/tachyon/crypto/commitments/fri/two_adic_fri_pcs.h @@ -287,7 +287,7 @@ class TwoAdicFriPCS { absl::flat_hash_map> ComputeInverseDenominators( const std::vector>>& matrices_by_round, - const std::vector& points_by_round, const F& coset_shift) { + const std::vector& points_by_round, F coset_shift) { size_t num_rounds = matrices_by_round.size(); absl::flat_hash_map max_log_num_rows_for_point; @@ -327,7 +327,7 @@ class TwoAdicFriPCS { uint32_t log_num_rows = it->second; std::vector temp = base::Map( absl::MakeSpan(subgroup.data(), (size_t{1} << log_num_rows)), - [&point](const F& x) { return ExtF(x) - point; }); + [&point](F x) { return ExtF(x) - point; }); CHECK(ExtF::BatchInverseInPlace(temp)); ret[point] = std::move(temp); } @@ -338,7 +338,7 @@ class TwoAdicFriPCS { // https://hackmd.io/@vbuterin/barycentric_evaluation template static std::vector InterpolateCoset( - const Eigen::MatrixBase& coset_evals, const F& shift, + const Eigen::MatrixBase& coset_evals, F shift, const ExtF& point) { size_t num_rows = static_cast(coset_evals.rows()); size_t num_cols = static_cast(coset_evals.cols()); diff --git a/tachyon/math/polynomials/univariate/radix2_evaluation_domain.h b/tachyon/math/polynomials/univariate/radix2_evaluation_domain.h index fe1f1e9cff..011b3c9afa 100644 --- a/tachyon/math/polynomials/univariate/radix2_evaluation_domain.h +++ b/tachyon/math/polynomials/univariate/radix2_evaluation_domain.h @@ -467,7 +467,7 @@ class Radix2EvaluationDomain : public UnivariateEvaluationDomain, for (size_t i = 0; i < half_block_size; ++i) { size_t lo = block_start + i; size_t hi = lo + half_block_size; - const F& twiddle = + F twiddle = rev ? twiddles[block_start / block_size] : twiddles[i << layer_rev]; const PackedPrimeField& packed_twiddle = rev ? packed_twiddles[block_start / block_size] @@ -479,7 +479,7 @@ class Radix2EvaluationDomain : public UnivariateEvaluationDomain, CONSTEXPR_IF_NOT_OPENMP void ApplyButterflyToRows( Eigen::Block>& mat, size_t row_1, size_t row_2, - const F& twiddle, const PackedPrimeField& packed_twiddle) { + F twiddle, const PackedPrimeField& packed_twiddle) { TRACE_EVENT("EvaluationDomain", "ApplyButterflyToRows"); if constexpr (F::Config::kModulusBits > 32) { NOTREACHED(); diff --git a/tachyon/math/polynomials/univariate/two_adic_subgroup.h b/tachyon/math/polynomials/univariate/two_adic_subgroup.h index 924dd44f5d..5b3ad0a5f2 100644 --- a/tachyon/math/polynomials/univariate/two_adic_subgroup.h +++ b/tachyon/math/polynomials/univariate/two_adic_subgroup.h @@ -39,7 +39,7 @@ class TwoAdicSubgroup { // Compute the "coset DFT" of each column in |mat|. This can be viewed as // interpolation onto a coset of a multiplicative subgroup, rather than the // subgroup itself. - void CosetFFTBatch(RowMajorMatrix& mat, const F& shift) { + void CosetFFTBatch(RowMajorMatrix& mat, F shift) { static_assert(F::Config::kModulusBits <= 32); // Observe that // yᵢ = ∑ⱼ cⱼ (s gⁱ)ʲ @@ -68,8 +68,7 @@ class TwoAdicSubgroup { // Compute the low-degree extension of each column in |mat| onto a coset of // a larger subgroup. - void CosetLDEBatch(RowMajorMatrix& mat, size_t added_bits, - const F& shift) { + void CosetLDEBatch(RowMajorMatrix& mat, size_t added_bits, F shift) { static_assert(F::Config::kModulusBits <= 32); IFFTBatch(mat); Eigen::Index rows = mat.rows(); diff --git a/tachyon/zk/air/plonky3/base/two_adic_multiplicative_coset.h b/tachyon/zk/air/plonky3/base/two_adic_multiplicative_coset.h index 3315a0c90e..2b44005fc4 100644 --- a/tachyon/zk/air/plonky3/base/two_adic_multiplicative_coset.h +++ b/tachyon/zk/air/plonky3/base/two_adic_multiplicative_coset.h @@ -26,7 +26,7 @@ class TwoAdicMultiplicativeCoset { public: constexpr TwoAdicMultiplicativeCoset() = default; - TwoAdicMultiplicativeCoset(uint32_t log_n, const F& shift) { + TwoAdicMultiplicativeCoset(uint32_t log_n, F shift) { domain_.reset(static_cast*>( math::Radix2EvaluationDomain::Create(size_t{1} << log_n) ->GetCoset(shift) @@ -83,7 +83,7 @@ class TwoAdicMultiplicativeCoset { LagrangeSelectors> GetSelectorsOnCoset( const TwoAdicMultiplicativeCoset& coset) const { - const F& coset_shift = coset.domain()->offset(); + F coset_shift = coset.domain()->offset(); CHECK_EQ(domain_->offset(), F::One()); CHECK_NE(coset_shift, F::One()); From ee1a7527b6d38a5793e3f3ab9239e3e27afc046c Mon Sep 17 00:00:00 2001 From: Ryan Kim Date: Mon, 12 Aug 2024 20:10:50 +0900 Subject: [PATCH 09/11] perf(math): parallelize outer loop instead of inner loop --- .../univariate/radix2_evaluation_domain.h | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/tachyon/math/polynomials/univariate/radix2_evaluation_domain.h b/tachyon/math/polynomials/univariate/radix2_evaluation_domain.h index 011b3c9afa..560b275a60 100644 --- a/tachyon/math/polynomials/univariate/radix2_evaluation_domain.h +++ b/tachyon/math/polynomials/univariate/radix2_evaluation_domain.h @@ -399,10 +399,8 @@ class Radix2EvaluationDomain : public UnivariateEvaluationDomain, size_t chunk_rows = size_t{1} << mid_; // max block size: 2^|mid_| - // TODO(ashjeong): benchmark between |OMP_PARALLEL_FOR| here vs - // |OMP_PARALLEL_NESTED_FOR| in |RunDitLayers| - for (size_t block_start = 0; block_start < this->size_; - block_start += chunk_rows) { + OMP_PARALLEL_FOR(size_t block_start = 0; block_start < this->size_; + block_start += chunk_rows) { size_t cur_chunk_rows = std::min(chunk_rows, this->size_ - block_start); Eigen::Block> submat = mat.block(block_start, 0, cur_chunk_rows, cols); @@ -428,10 +426,8 @@ class Radix2EvaluationDomain : public UnivariateEvaluationDomain, TRACE_EVENT("Subtask", "RunDitLayersLoop"); // max block size: 2^(|this->log_size_of_group_| - |mid_|) - // TODO(ashjeong): benchmark between |OMP_PARALLEL_FOR| here vs - // |OMP_PARALLEL_NESTED_FOR| in |RunDitLayers| - for (size_t block_start = 0; block_start < this->size_; - block_start += chunk_rows) { + OMP_PARALLEL_FOR(size_t block_start = 0; block_start < this->size_; + block_start += chunk_rows) { size_t thread = block_start / chunk_rows; size_t cur_chunk_rows = std::min(chunk_rows, this->size_ - block_start); Eigen::Block> submat = @@ -462,8 +458,8 @@ class Radix2EvaluationDomain : public UnivariateEvaluationDomain, size_t sub_rows = static_cast(submat.rows()); DCHECK_GE(sub_rows, block_size); - OMP_PARALLEL_NESTED_FOR(size_t block_start = 0; block_start < sub_rows; - block_start += block_size) { + for (size_t block_start = 0; block_start < sub_rows; + block_start += block_size) { for (size_t i = 0; i < half_block_size; ++i) { size_t lo = block_start + i; size_t hi = lo + half_block_size; From 604ba11dcb68dff1aa5cea5010b53a7f6af204e3 Mon Sep 17 00:00:00 2001 From: Ryan Kim Date: Mon, 12 Aug 2024 17:15:38 +0900 Subject: [PATCH 10/11] perf(math): remove intermediate allocations from `ApplyButterflyToRows()` --- tachyon/math/matrix/matrix_utils.h | 25 ------------ tachyon/math/matrix/matrix_utils_unittest.cc | 39 ------------------- .../univariate/radix2_evaluation_domain.h | 35 +++++++---------- 3 files changed, 15 insertions(+), 84 deletions(-) diff --git a/tachyon/math/matrix/matrix_utils.h b/tachyon/math/matrix/matrix_utils.h index 28843f6e74..d9d3ac122e 100644 --- a/tachyon/math/matrix/matrix_utils.h +++ b/tachyon/math/matrix/matrix_utils.h @@ -52,31 +52,6 @@ MakeCirculant(const Eigen::MatrixBase& arg) { CirculantFunctor(arg.derived())); } -// Packs a given row of a matrix. Results in a vector of packed fields and a -// vector of remaining values if the number of cols is not a factor of the -// packed field size. -// -// NOTE(ashjeong): |PackRowHorizontally| currently only -// supports row-major matrices. -template -std::vector PackRowHorizontally( - Eigen::Block& matrix_row, - std::vector& remaining_values) { - size_t num_packed = matrix_row.cols() / PackedField::N; - size_t remaining_start_idx = num_packed * PackedField::N; - remaining_values = - base::CreateVector(matrix_row.cols() - remaining_start_idx, - [remaining_start_idx, &matrix_row](size_t col) { - return reinterpret_cast( - matrix_row.data() + remaining_start_idx + col); - }); - return base::CreateVector(num_packed, [&matrix_row](size_t col) { - return reinterpret_cast(matrix_row.data() + - PackedField::N * col); - }); -} - // Creates a vector of packed fields for a given matrix row. If the length // of the row is not a multiple of |PackedField::N|, the last |PackedField| // element populates leftover values with |F::Zero()|. diff --git a/tachyon/math/matrix/matrix_utils_unittest.cc b/tachyon/math/matrix/matrix_utils_unittest.cc index 1c1b574357..5d8408cf2a 100644 --- a/tachyon/math/matrix/matrix_utils_unittest.cc +++ b/tachyon/math/matrix/matrix_utils_unittest.cc @@ -21,45 +21,6 @@ TEST_F(MatrixUtilsTest, Circulant) { class MatrixPackingTest : public FiniteFieldTest {}; -TEST_F(MatrixPackingTest, PackRowHorizontally) { - constexpr size_t N = PackedBabyBear::N; - constexpr size_t R = 3; - - { - RowMajorMatrix matrix = - RowMajorMatrix::Random(2 * N, 2 * N); - auto mat_row = matrix.row(R); - std::vector remaining_values; - std::vector packed_values = - PackRowHorizontally(mat_row, remaining_values); - ASSERT_TRUE(remaining_values.empty()); - ASSERT_EQ(packed_values.size(), 2); - for (size_t i = 0; i < packed_values.size(); ++i) { - for (size_t j = 0; j < N; ++j) { - EXPECT_EQ((*packed_values[i])[j], matrix(R, i * N + j)); - } - } - } - { - RowMajorMatrix matrix = - RowMajorMatrix::Random(2 * N - 1, 2 * N - 1); - auto mat_row = matrix.row(R); - std::vector remaining_values; - std::vector packed_values = - PackRowHorizontally(mat_row, remaining_values); - ASSERT_EQ(remaining_values.size(), N - 1); - ASSERT_EQ(packed_values.size(), 1); - for (size_t i = 0; i < remaining_values.size(); ++i) { - EXPECT_EQ(*remaining_values[i], matrix(R, packed_values.size() * N + i)); - } - for (size_t i = 0; i < packed_values.size(); ++i) { - for (size_t j = 0; j < N; ++j) { - EXPECT_EQ((*packed_values[i])[j], matrix(R, i * N + j)); - } - } - } -} - TEST_F(MatrixPackingTest, PackRowVerticallyWithPrimeField) { constexpr size_t N = PackedBabyBear::N; constexpr size_t R = 3; diff --git a/tachyon/math/polynomials/univariate/radix2_evaluation_domain.h b/tachyon/math/polynomials/univariate/radix2_evaluation_domain.h index 560b275a60..9d4a00b20f 100644 --- a/tachyon/math/polynomials/univariate/radix2_evaluation_domain.h +++ b/tachyon/math/polynomials/univariate/radix2_evaluation_domain.h @@ -480,29 +480,24 @@ class Radix2EvaluationDomain : public UnivariateEvaluationDomain, if constexpr (F::Config::kModulusBits > 32) { NOTREACHED(); } - std::vector suffix_1; - std::vector suffix_2; - auto row_1_block = mat.row(row_1); - std::vector shorts_1 = - PackRowHorizontally(row_1_block, suffix_1); auto row_2_block = mat.row(row_2); - std::vector shorts_2 = - PackRowHorizontally(row_2_block, suffix_2); - - { - TRACE_EVENT("Subtask", "ButterflyOMPLoop"); - OMP_PARALLEL_FOR(size_t i = 0; i < shorts_1.size(); ++i) { - UnivariateEvaluationDomain::template ButterflyFnOutIn( - *shorts_1[i], *shorts_2[i], packed_twiddle); - } + + for (size_t i = 0; i < row_1_block.cols() / PackedPrimeField::N; ++i) { + UnivariateEvaluationDomain::template ButterflyFnOutIn( + *reinterpret_cast( + &row_1_block.data()[PackedPrimeField::N * i]), + *reinterpret_cast( + &row_2_block.data()[PackedPrimeField::N * i]), + packed_twiddle); } - { - TRACE_EVENT("Subtask", "ButterflyLoop"); - for (size_t i = 0; i < suffix_1.size(); ++i) { - UnivariateEvaluationDomain::template ButterflyFnOutIn( - *suffix_1[i], *suffix_2[i], twiddle); - } + size_t remaining_start_idx = + row_1_block.cols() / PackedPrimeField::N * PackedPrimeField::N; + for (size_t i = remaining_start_idx; + i < static_cast(row_1_block.cols()); ++i) { + UnivariateEvaluationDomain::template ButterflyFnOutIn( + *reinterpret_cast(&row_1_block.data()[i]), + *reinterpret_cast(&row_2_block.data()[i]), twiddle); } } From ff014d7cf1b14fd7ceaec937e5436ca7cbde965c Mon Sep 17 00:00:00 2001 From: Wonyong Kim Date: Mon, 12 Aug 2024 23:46:02 +0900 Subject: [PATCH 11/11] perf(math): avoid montgomery conversions in equality operators --- tachyon/math/finite_fields/prime_field_fallback.h | 4 ++-- tachyon/math/finite_fields/prime_field_gpu_debug.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tachyon/math/finite_fields/prime_field_fallback.h b/tachyon/math/finite_fields/prime_field_fallback.h index 8d01a27b9c..24084cd219 100644 --- a/tachyon/math/finite_fields/prime_field_fallback.h +++ b/tachyon/math/finite_fields/prime_field_fallback.h @@ -168,11 +168,11 @@ class PrimeField<_Config, std::enable_if_t