From 7f38f2113bff84fc425ce3a058891b7403c6aa2e Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 5 Sep 2024 14:58:22 -0400 Subject: [PATCH 01/25] Improve minhash performance by using more working memory --- cpp/benchmarks/text/minhash.cpp | 11 +++--- cpp/src/text/minhash.cu | 61 +++++++++++++++++++------------- cpp/tests/text/minhash_tests.cpp | 50 ++++++++++++++++++++------ 3 files changed, 81 insertions(+), 41 deletions(-) diff --git a/cpp/benchmarks/text/minhash.cpp b/cpp/benchmarks/text/minhash.cpp index 31ce60d8f9a..86760528eef 100644 --- a/cpp/benchmarks/text/minhash.cpp +++ b/cpp/benchmarks/text/minhash.cpp @@ -43,12 +43,11 @@ static void bench_minhash(nvbench::state& state) create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile); cudf::strings_column_view input(strings_table->view().column(0)); - data_profile const seeds_profile = data_profile_builder().null_probability(0).distribution( + data_profile const seeds_profile = data_profile_builder().no_validity().distribution( cudf::type_to_id(), distribution_id::NORMAL, 0, row_width); auto const seed_type = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32; auto const seeds_table = create_random_table({seed_type}, row_count{seed_count}, seeds_profile); auto seeds = seeds_table->get_column(0); - seeds.set_null_mask(rmm::device_buffer{}, 0); state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); @@ -64,8 +63,8 @@ static void bench_minhash(nvbench::state& state) NVBENCH_BENCH(bench_minhash) .set_name("minhash") - .add_int64_axis("num_rows", {1024, 8192, 16364, 131072}) - .add_int64_axis("row_width", {128, 512, 2048}) - .add_int64_axis("hash_width", {5, 10}) - .add_int64_axis("seed_count", {2, 26}) + .add_int64_axis("num_rows", {16364, 131072}) + .add_int64_axis("row_width", {256, 512, 1024}) + .add_int64_axis("hash_width", {5, 10, 20}) + .add_int64_axis("seed_count", {2, 26, 260}) .add_int64_axis("hash_type", {32, 64}); diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index 4318123627d..d2e1dfa58cb 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -56,6 +56,7 @@ namespace { * @param d_strings Strings column to process * @param seeds Seeds for hashing each string * @param width Substring window size in characters + * @param working_memory Memory used to hold intermediate hash values * @param d_hashes Minhash output values for each string */ template < @@ -65,29 +66,30 @@ template < CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, cudf::device_span seeds, cudf::size_type width, + hash_value_type* working_memory, hash_value_type* d_hashes) { - auto const idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - if (idx >= (static_cast(d_strings.size()) * - static_cast(cudf::detail::warp_size))) { - return; - } - - auto const str_idx = static_cast(idx / cudf::detail::warp_size); - auto const lane_idx = static_cast(idx % cudf::detail::warp_size); + auto const idx = cudf::detail::grid_1d::global_thread_id(); + auto const str_idx = idx / cudf::detail::warp_size; + if (str_idx >= d_strings.size()) { return; } if (d_strings.is_null(str_idx)) { return; } auto const d_str = d_strings.element(str_idx); - auto const d_output = d_hashes + (str_idx * seeds.size()); + auto const init = d_str.empty() ? 0 : std::numeric_limits::max(); + auto const lane_idx = idx % cudf::detail::warp_size; - // initialize hashes output for this string - if (lane_idx == 0) { - auto const init = d_str.empty() ? 0 : std::numeric_limits::max(); - thrust::fill(thrust::seq, d_output, d_output + seeds.size(), init); + auto warp_hashes = working_memory + (str_idx * cudf::detail::warp_size * seeds.size()); + + for (std::size_t seed_idx = lane_idx; seed_idx < seeds.size(); + seed_idx += cudf::detail::warp_size) { + auto begin = warp_hashes + (seed_idx * cudf::detail::warp_size); + thrust::uninitialized_fill(thrust::seq, begin, begin + cudf::detail::warp_size, init); } __syncwarp(); + auto const d_output = d_hashes + (str_idx * seeds.size()); + auto const begin = d_str.data() + lane_idx; auto const end = d_str.data() + d_str.size_bytes(); @@ -100,24 +102,28 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, if ((itr != d_str.data()) && (left > 0)) { continue; } // true if past the end of the string auto const hash_str = cudf::string_view(itr, bytes); - // hashing with each seed on the same section of the string is 10x faster than - // computing the substrings for each seed for (std::size_t seed_idx = 0; seed_idx < seeds.size(); ++seed_idx) { auto const hasher = HashFunction(seeds[seed_idx]); - // hash substring and store the min value + hash_value_type hv; if constexpr (std::is_same_v) { - auto const hvalue = hasher(hash_str); - cuda::atomic_ref ref{*(d_output + seed_idx)}; - ref.fetch_min(hvalue, cuda::std::memory_order_relaxed); + hv = hasher(hash_str); } else { - // This code path assumes the use of MurmurHash3_x64_128 which produces 2 uint64 values - // but only uses the first uint64 value as requested by the LLM team. - auto const hvalue = thrust::get<0>(hasher(hash_str)); - cuda::atomic_ref ref{*(d_output + seed_idx)}; - ref.fetch_min(hvalue, cuda::std::memory_order_relaxed); + hv = thrust::get<0>(hasher(hash_str)); } + warp_hashes[(seed_idx * cudf::detail::warp_size) + lane_idx] = + min(hv, warp_hashes[(seed_idx * cudf::detail::warp_size) + lane_idx]); } } + __syncwarp(); + + // compute final result + for (std::size_t seed_idx = lane_idx; seed_idx < seeds.size(); + seed_idx += cudf::detail::warp_size) { + auto begin = warp_hashes + (seed_idx * cudf::detail::warp_size); + auto hv = + thrust::reduce(thrust::seq, begin, begin + cudf::detail::warp_size, init, thrust::minimum{}); + d_output[seed_idx] = hv; + } } template < @@ -152,9 +158,14 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, auto d_hashes = hashes->mutable_view().data(); constexpr int block_size = 256; + + auto const wm_size = cudf::util::round_up_safe( + seeds.size() * cudf::detail::warp_size * input.size(), static_cast(block_size)); + auto working_memory = rmm::device_uvector(wm_size, stream); + cudf::detail::grid_1d grid{input.size() * cudf::detail::warp_size, block_size}; minhash_kernel<<>>( - *d_strings, seeds, width, d_hashes); + *d_strings, seeds, width, working_memory.data(), d_hashes); return hashes; } diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp index 7575a3ba846..4abd45a62c9 100644 --- a/cpp/tests/text/minhash_tests.cpp +++ b/cpp/tests/text/minhash_tests.cpp @@ -42,15 +42,27 @@ TEST_F(MinHashTest, Basic) "", "doc 3", "d", - "The quick brown fox jumpéd over the lazy brown dog."}, + "The quick brown fox jumpéd over the lazy brown dog.", + "line eight", + "line nine", + "line ten"}, validity); auto view = cudf::strings_column_view(input); auto results = nvtext::minhash(view); - auto expected = cudf::test::fixed_width_column_wrapper( - {1207251914u, 0u, 21141582u, 0u, 1207251914u, 655955059u, 86520422u}, validity); + auto expected = cudf::test::fixed_width_column_wrapper({1207251914u, + 0u, + 21141582u, + 0u, + 1207251914u, + 655955059u, + 86520422u, + 304329233u, + 640477688u, + 640477688u}, + validity); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); auto results64 = nvtext::minhash64(view); @@ -60,7 +72,10 @@ TEST_F(MinHashTest, Basic) 0ul, 13145552576991307582ul, 14660046701545912182ul, - 398062025280761388ul}, + 398062025280761388ul, + 1273320923074904938ul, + 3456065052701055601ul, + 10664519708968191209ul}, validity); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); } @@ -82,7 +97,12 @@ TEST_F(MinHashTest, MultiSeed) "this is doc 2", "doc 3", "d", - "The quick brown fox jumpéd over the lazy brown dog."}); + "The quick brown fox jumpéd over the lazy brown dog.", + "line six", + "line seven", + "line eight", + "line nine", + "line ten"}); auto view = cudf::strings_column_view(input); @@ -95,7 +115,12 @@ TEST_F(MinHashTest, MultiSeed) LCW{ 21141582u, 580916568u, 1258052021u}, LCW{1207251914u, 943567174u, 1109272887u}, LCW{ 655955059u, 488346356u, 2394664816u}, - LCW{ 86520422u, 236622901u, 102546228u}}); + LCW{ 86520422u, 236622901u, 102546228u}, + LCW{ 640477688u, 198451716u, 136303992u}, + LCW{ 640477688u, 198451716u, 577802054u}, + LCW{ 304329233u, 198451716u, 714941560u}, + LCW{ 640477688u, 198451716u, 261342259u}, + LCW{ 640477688u, 198451716u, 139988887u}}); // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); @@ -104,11 +129,16 @@ TEST_F(MinHashTest, MultiSeed) using LCW64 = cudf::test::lists_column_wrapper; // clang-format off - LCW64 expected64({LCW64{ 774489391575805754ul, 10435654231793485448ul, 1188598072697676120ul}, - LCW64{ 3232308021562742685ul, 4445611509348165860ul, 1188598072697676120ul}, - LCW64{13145552576991307582ul, 6846192680998069919ul, 1188598072697676120ul}, + LCW64 expected64({LCW64{ 774489391575805754ul, 10435654231793485448ul, 1188598072697676120ul}, + LCW64{ 3232308021562742685ul, 4445611509348165860ul, 1188598072697676120ul}, + LCW64{13145552576991307582ul, 6846192680998069919ul, 1188598072697676120ul}, LCW64{14660046701545912182ul, 17106501326045553694ul, 17713478494106035784ul}, - LCW64{ 398062025280761388ul, 377720198157450084ul, 984941365662009329ul}}); + LCW64{ 398062025280761388ul, 377720198157450084ul, 984941365662009329ul}, + LCW64{ 2837259098848821044ul, 650799815433771163ul, 2428991957842356245ul}, + LCW64{ 2105419906076957667ul, 650799815433771163ul, 2428991957842356245ul}, + LCW64{ 1273320923074904938ul, 650799815433771163ul, 2428991957842356245ul}, + LCW64{ 3456065052701055601ul, 650799815433771163ul, 2428991957842356245ul}, + LCW64{10664519708968191209ul, 650799815433771163ul, 2428991957842356245ul}}); // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); } From f35c16d3fa32d0b95d2952da1b019f22997971db Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 19 Sep 2024 19:45:31 -0400 Subject: [PATCH 02/25] change to block per string --- cpp/src/text/minhash.cu | 43 ++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index ce884de5257..613834fb34e 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -47,6 +47,9 @@ namespace nvtext { namespace detail { namespace { +constexpr cudf::thread_index_type block_size = 256; +constexpr cudf::thread_index_type tile_size = block_size; // cudf::detail::warp_size; + /** * @brief Compute the minhash of each string for each seed * @@ -72,21 +75,20 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, hash_value_type* d_hashes) { auto const idx = cudf::detail::grid_1d::global_thread_id(); - auto const str_idx = idx / cudf::detail::warp_size; + auto const str_idx = idx / tile_size; if (str_idx >= d_strings.size()) { return; } if (d_strings.is_null(str_idx)) { return; } auto const d_str = d_strings.element(str_idx); auto const init = d_str.empty() ? 0 : std::numeric_limits::max(); - auto const lane_idx = idx % cudf::detail::warp_size; + auto const lane_idx = idx % tile_size; - auto warp_hashes = working_memory + (str_idx * cudf::detail::warp_size * seeds.size()); + auto warp_hashes = working_memory + (str_idx * tile_size * seeds.size()); - for (std::size_t seed_idx = lane_idx; seed_idx < seeds.size(); - seed_idx += cudf::detail::warp_size) { - auto begin = warp_hashes + (seed_idx * cudf::detail::warp_size); - thrust::uninitialized_fill(thrust::seq, begin, begin + cudf::detail::warp_size, init); + for (std::size_t seed_idx = lane_idx; seed_idx < seeds.size(); seed_idx += tile_size) { + auto begin = warp_hashes + (seed_idx * tile_size); + thrust::uninitialized_fill(thrust::seq, begin, begin + tile_size, init); } __syncwarp(); @@ -96,7 +98,7 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, auto const end = d_str.data() + d_str.size_bytes(); // each lane hashes 'width' substrings of d_str - for (auto itr = begin; itr < end; itr += cudf::detail::warp_size) { + for (auto itr = begin; itr < end; itr += tile_size) { if (cudf::strings::detail::is_utf8_continuation_char(*itr)) { continue; } auto const check_str = // used for counting 'width' characters cudf::string_view(itr, static_cast(thrust::distance(itr, end))); @@ -112,18 +114,17 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, } else { hv = thrust::get<0>(hasher(hash_str)); } - warp_hashes[(seed_idx * cudf::detail::warp_size) + lane_idx] = - min(hv, warp_hashes[(seed_idx * cudf::detail::warp_size) + lane_idx]); + warp_hashes[(seed_idx * tile_size) + lane_idx] = + cuda::std::min(hv, warp_hashes[(seed_idx * tile_size) + lane_idx]); } } - __syncwarp(); + //__syncwarp(); + __syncthreads(); // compute final result - for (std::size_t seed_idx = lane_idx; seed_idx < seeds.size(); - seed_idx += cudf::detail::warp_size) { - auto begin = warp_hashes + (seed_idx * cudf::detail::warp_size); - auto hv = - thrust::reduce(thrust::seq, begin, begin + cudf::detail::warp_size, init, thrust::minimum{}); + for (std::size_t seed_idx = lane_idx; seed_idx < seeds.size(); seed_idx += tile_size) { + auto begin = warp_hashes + (seed_idx * tile_size); + auto hv = thrust::reduce(thrust::seq, begin, begin + tile_size, init, thrust::minimum{}); d_output[seed_idx] = hv; } } @@ -159,14 +160,12 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, mr); auto d_hashes = hashes->mutable_view().data(); - constexpr cudf::thread_index_type block_size = 256; - - auto const wm_size = cudf::util::round_up_safe( - seeds.size() * cudf::detail::warp_size * input.size(), static_cast(block_size)); + auto const wm_size = cudf::util::round_up_safe(seeds.size() * tile_size * input.size(), + static_cast(block_size)); auto working_memory = rmm::device_uvector(wm_size, stream); - cudf::detail::grid_1d grid{ - static_cast(input.size()) * cudf::detail::warp_size, block_size}; + cudf::detail::grid_1d grid{static_cast(input.size()) * tile_size, + block_size}; minhash_kernel<<>>( *d_strings, seeds, width, working_memory.data(), d_hashes); From 01500dd55bfaf0b9687517cb7d8040cf7fd179cf Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 19 Sep 2024 20:05:11 -0400 Subject: [PATCH 03/25] fix sync call --- cpp/src/text/minhash.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index 613834fb34e..216ca511410 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -90,7 +90,8 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, auto begin = warp_hashes + (seed_idx * tile_size); thrust::uninitialized_fill(thrust::seq, begin, begin + tile_size, init); } - __syncwarp(); + //__syncwarp(); + __syncthreads(); auto const d_output = d_hashes + (str_idx * seeds.size()); From 70948b9c50ffa9b8cfa8ed53913c924882fc5d9a Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 1 Oct 2024 20:08:40 -0400 Subject: [PATCH 04/25] fix benchmark ranges --- cpp/benchmarks/text/minhash.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/benchmarks/text/minhash.cpp b/cpp/benchmarks/text/minhash.cpp index 86760528eef..1ba4d51b4af 100644 --- a/cpp/benchmarks/text/minhash.cpp +++ b/cpp/benchmarks/text/minhash.cpp @@ -32,9 +32,9 @@ static void bench_minhash(nvbench::state& state) auto const seed_count = static_cast(state.get_int64("seed_count")); auto const base64 = state.get_int64("hash_type") == 64; - if (static_cast(num_rows) * static_cast(row_width) >= + if ((num_rows * seed_count * (base64 ? sizeof(int64_t) : sizeof(int32_t)) * 32L) >= static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); + state.skip("Skip benchmarks requiring more than 2GB working memory"); } data_profile const strings_profile = data_profile_builder().distribution( @@ -63,7 +63,7 @@ static void bench_minhash(nvbench::state& state) NVBENCH_BENCH(bench_minhash) .set_name("minhash") - .add_int64_axis("num_rows", {16364, 131072}) + .add_int64_axis("num_rows", {16364, 65456}) .add_int64_axis("row_width", {256, 512, 1024}) .add_int64_axis("hash_width", {5, 10, 20}) .add_int64_axis("seed_count", {2, 26, 260}) From ef3b22867f3e63ea8bf41fa86a9010d8274d990f Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 2 Oct 2024 16:04:27 -0400 Subject: [PATCH 05/25] minor fixes --- cpp/src/text/minhash.cu | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index 216ca511410..54d92cb68d0 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -53,8 +53,8 @@ constexpr cudf::thread_index_type tile_size = block_size; // cudf::detail::war /** * @brief Compute the minhash of each string for each seed * - * This is a warp-per-string algorithm where parallel threads within a warp - * work on substrings of a single string row. + * This is a block-per-string algorithm where parallel threads within a block + * work on a single string row. * * @tparam HashFunction hash function to use on each substring * @@ -77,20 +77,18 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, auto const idx = cudf::detail::grid_1d::global_thread_id(); auto const str_idx = idx / tile_size; if (str_idx >= d_strings.size()) { return; } - if (d_strings.is_null(str_idx)) { return; } auto const d_str = d_strings.element(str_idx); auto const init = d_str.empty() ? 0 : std::numeric_limits::max(); auto const lane_idx = idx % tile_size; - auto warp_hashes = working_memory + (str_idx * tile_size * seeds.size()); + auto tile_hashes = working_memory + (str_idx * tile_size * seeds.size()); for (std::size_t seed_idx = lane_idx; seed_idx < seeds.size(); seed_idx += tile_size) { - auto begin = warp_hashes + (seed_idx * tile_size); + auto begin = tile_hashes + (seed_idx * tile_size); thrust::uninitialized_fill(thrust::seq, begin, begin + tile_size, init); } - //__syncwarp(); __syncthreads(); auto const d_output = d_hashes + (str_idx * seeds.size()); @@ -115,16 +113,15 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, } else { hv = thrust::get<0>(hasher(hash_str)); } - warp_hashes[(seed_idx * tile_size) + lane_idx] = - cuda::std::min(hv, warp_hashes[(seed_idx * tile_size) + lane_idx]); + tile_hashes[(seed_idx * tile_size) + lane_idx] = + cuda::std::min(hv, tile_hashes[(seed_idx * tile_size) + lane_idx]); } } - //__syncwarp(); __syncthreads(); // compute final result for (std::size_t seed_idx = lane_idx; seed_idx < seeds.size(); seed_idx += tile_size) { - auto begin = warp_hashes + (seed_idx * tile_size); + auto begin = tile_hashes + (seed_idx * tile_size); auto hv = thrust::reduce(thrust::seq, begin, begin + tile_size, init, thrust::minimum{}); d_output[seed_idx] = hv; } From 03d570de78abd400a4763eaa1753b391cca557ec Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 3 Oct 2024 11:11:40 -0400 Subject: [PATCH 06/25] minor cleanups --- cpp/src/text/minhash.cu | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index 54d92cb68d0..f89b0bb3a17 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -48,7 +48,8 @@ namespace detail { namespace { constexpr cudf::thread_index_type block_size = 256; -constexpr cudf::thread_index_type tile_size = block_size; // cudf::detail::warp_size; +// for tuning independently from block_size +constexpr cudf::thread_index_type tile_size = block_size; /** * @brief Compute the minhash of each string for each seed @@ -85,6 +86,7 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, auto tile_hashes = working_memory + (str_idx * tile_size * seeds.size()); + // initialize working memory for (std::size_t seed_idx = lane_idx; seed_idx < seeds.size(); seed_idx += tile_size) { auto begin = tile_hashes + (seed_idx * tile_size); thrust::uninitialized_fill(thrust::seq, begin, begin + tile_size, init); From f74716349d07efac55d4c973e65c93fb9f3bb7ee Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 9 Oct 2024 21:36:53 -0400 Subject: [PATCH 07/25] match benchmark to curator parameters --- cpp/benchmarks/text/minhash.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/benchmarks/text/minhash.cpp b/cpp/benchmarks/text/minhash.cpp index 1ba4d51b4af..a0f9196d05f 100644 --- a/cpp/benchmarks/text/minhash.cpp +++ b/cpp/benchmarks/text/minhash.cpp @@ -63,8 +63,8 @@ static void bench_minhash(nvbench::state& state) NVBENCH_BENCH(bench_minhash) .set_name("minhash") - .add_int64_axis("num_rows", {16364, 65456}) - .add_int64_axis("row_width", {256, 512, 1024}) - .add_int64_axis("hash_width", {5, 10, 20}) - .add_int64_axis("seed_count", {2, 26, 260}) + .add_int64_axis("num_rows", {15000, 30000, 60000}) + .add_int64_axis("row_width", {6000, 28000, 50000}) + .add_int64_axis("hash_width", {12, 24}) + .add_int64_axis("seed_count", {26, 260}) .add_int64_axis("hash_type", {32, 64}); From aa6f3e047e5641fd7ad08cf5628a5e1b29372187 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 10 Oct 2024 17:03:05 -0400 Subject: [PATCH 08/25] add minhash_permuted API --- cpp/benchmarks/text/minhash.cpp | 4 +- cpp/include/nvtext/minhash.hpp | 56 ++++++ cpp/src/text/minhash.cu | 169 ++++++++++++++++++ cpp/tests/text/minhash_tests.cpp | 58 ++++++ python/cudf/cudf/_lib/nvtext/minhash.pyx | 46 +++++ python/cudf/cudf/_lib/strings/__init__.py | 2 + python/cudf/cudf/core/column/string.py | 38 ++++ .../pylibcudf/libcudf/nvtext/minhash.pxd | 14 ++ 8 files changed, 385 insertions(+), 2 deletions(-) diff --git a/cpp/benchmarks/text/minhash.cpp b/cpp/benchmarks/text/minhash.cpp index a0f9196d05f..06aeff3e644 100644 --- a/cpp/benchmarks/text/minhash.cpp +++ b/cpp/benchmarks/text/minhash.cpp @@ -56,8 +56,8 @@ static void bench_minhash(nvbench::state& state) state.add_global_memory_writes(num_rows); // output are hashes state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = base64 ? nvtext::minhash64(input, seeds.view(), hash_width) - : nvtext::minhash(input, seeds.view(), hash_width); + auto result = base64 ? nvtext::minhash64_permuted(input, seeds.view(), seeds.view(), hash_width) + : nvtext::minhash_permuted(input, seeds.view(), seeds.view(), hash_width); }); } diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index 7c909f1a948..f06323e3f5d 100644 --- a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -90,6 +90,34 @@ std::unique_ptr minhash( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Returns the minhash values for each string + * + * This function uses MurmurHash3_x64_128 for the hash algorithm. + * + * Any null row entries result in corresponding null output rows. + * + * @throw std::invalid_argument if the width < 2 + * @throw std::invalid_argument if parmA is empty + * @throw std::invalid_argument if `parmB.size() != parmA.size()` + * @throw std::overflow_error if `parmA.size() * input.size()` exceeds the column size limit + * + * @param input Strings column to compute minhash + * @param parmA Values used for the hash algorithm + * @param parmB Values used for the hash algorithm + * @param width The character width used for apply substrings + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return List column of minhash values for each string per seed + */ +std::unique_ptr minhash_permuted( + cudf::strings_column_view const& input, + cudf::device_span parmA, + cudf::device_span parmB, + cudf::size_type width, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + /** * @brief Returns the minhash value for each string * @@ -151,6 +179,34 @@ std::unique_ptr minhash64( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Returns the minhash values for each string + * + * This function uses MurmurHash3_x64_128 for the hash algorithm. + * + * Any null row entries result in corresponding null output rows. + * + * @throw std::invalid_argument if the width < 2 + * @throw std::invalid_argument if parmA is empty + * @throw std::invalid_argument if `parmB.size() != parmA.size()` + * @throw std::overflow_error if `parmA.size() * input.size()` exceeds the column size limit + * + * @param input Strings column to compute minhash + * @param parmA Values used for the hash algorithm + * @param parmB Values used for the hash algorithm + * @param width The character width used for apply substrings + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return List column of minhash values for each string per seed + */ +std::unique_ptr minhash64_permuted( + cudf::strings_column_view const& input, + cudf::device_span parmA, + cudf::device_span parmB, + cudf::size_type width, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + /** * @brief Returns the minhash values for each row of strings per seed * diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index f89b0bb3a17..1aa6bbb5613 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -129,6 +129,82 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, } } +template < + typename HashFunction, + typename hash_value_type = std:: + conditional_t, uint32_t, uint64_t>> +CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_strings, + cudf::device_span parmA, + cudf::device_span parmB, + cudf::size_type width, + hash_value_type* d_hashes) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + auto const str_idx = idx / tile_size; + if (str_idx >= d_strings.size()) { return; } + if (d_strings.is_null(str_idx)) { return; } + + auto const d_str = d_strings.element(str_idx); + auto const init = d_str.empty() ? 0 : std::numeric_limits::max(); + auto const lane_idx = idx % tile_size; + + auto const d_output = d_hashes + (str_idx * parmA.size()); + + auto const begin = d_str.data() + (lane_idx); + auto const end = d_str.data() + d_str.size_bytes(); + + constexpr std::size_t seed_chunk = 16; // based on block-size==256 + constexpr uint64_t mersenne_prime = (1UL << 61) - 1; + constexpr hash_value_type hash_max = std::numeric_limits::max(); + + extern __shared__ char shmem[]; + auto const block_hashes = reinterpret_cast(shmem); + + for (std::size_t i = 0; i < parmA.size(); i += seed_chunk) { + // initialize working memory + auto const tile_hashes = block_hashes + (lane_idx * seed_chunk); + thrust::uninitialized_fill(thrust::seq, tile_hashes, tile_hashes + seed_chunk, init); + __syncthreads(); + + auto const seed_count = cuda::std::min(seed_chunk, parmA.size() - i); + + // each lane hashes 'width' substrings of d_str + for (auto itr = begin; itr < end; itr += tile_size) { + if (cudf::strings::detail::is_utf8_continuation_char(*itr)) { continue; } + auto const check_str = // used for counting 'width' characters + cudf::string_view(itr, static_cast(thrust::distance(itr, end))); + auto const [bytes, left] = + cudf::strings::detail::bytes_to_character_position(check_str, width); + if ((itr != d_str.data()) && (left > 0)) { continue; } // true if past the end of the string + + auto const hash_str = cudf::string_view(itr, bytes); + auto const hasher = HashFunction(parmA[0]); + hash_value_type hv1; + if constexpr (std::is_same_v) { + hv1 = hasher(hash_str); + } else { + hv1 = thrust::get<0>(hasher(hash_str)); + } + + for (std::size_t seed_idx = i; seed_idx < (i + seed_count); ++seed_idx) { + hash_value_type const hv = + seed_idx == 0 ? hv1 + : ((hv1 * parmA[seed_idx] + parmB[seed_idx]) % mersenne_prime) & hash_max; + auto const block_idx = ((seed_idx % seed_chunk) * tile_size) + lane_idx; + block_hashes[block_idx] = cuda::std::min(hv, block_hashes[block_idx]); + } + } + __syncthreads(); + + if (lane_idx < seed_count) { + auto const hvs = block_hashes + (lane_idx * tile_size); + auto const hv = thrust::reduce(thrust::seq, hvs, hvs + tile_size, init, thrust::minimum{}); + d_output[lane_idx + i] = hv; + } + __syncthreads(); + } +} + template < typename HashFunction, typename hash_value_type = std:: @@ -172,6 +248,53 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, return hashes; } +template < + typename HashFunction, + typename hash_value_type = std:: + conditional_t, uint32_t, uint64_t>> +std::unique_ptr minhash_fn(cudf::strings_column_view const& input, + cudf::device_span parmA, + cudf::device_span parmB, + cudf::size_type width, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_EXPECTS(!parmA.empty(), "Parameters A and B cannot be empty", std::invalid_argument); + CUDF_EXPECTS(width >= 2, + "Parameter width should be an integer value of 2 or greater", + std::invalid_argument); + CUDF_EXPECTS((static_cast(input.size()) * parmA.size()) < + static_cast(std::numeric_limits::max()), + "The number of seeds times the number of input rows exceeds the column size limit", + std::overflow_error); + CUDF_EXPECTS(parmA.size() == parmB.size(), + "Parameters A and B should have the same number of elements", + std::invalid_argument); + + auto const output_type = cudf::data_type{cudf::type_to_id()}; + if (input.is_empty()) { return cudf::make_empty_column(output_type); } + + auto const d_strings = cudf::column_device_view::create(input.parent(), stream); + + auto hashes = cudf::make_numeric_column(output_type, + input.size() * static_cast(parmA.size()), + cudf::mask_state::UNALLOCATED, + stream, + mr); + auto d_hashes = hashes->mutable_view().data(); + + // 16 seeds can be held in shared-memory: 32K/block_size(256)/sizeof(hash_value_type) = ~16 + auto const shmem_size = block_size * 16 * sizeof(hash_value_type); + + cudf::detail::grid_1d grid{static_cast(input.size()) * tile_size, + block_size}; + minhash_permuted_kernel + <<>>( + *d_strings, parmA, parmB, width, d_hashes); + + return hashes; +} + /** * @brief Compute the minhash of each list row of strings for each seed * @@ -319,6 +442,18 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); } +std::unique_ptr minhash(cudf::strings_column_view const& input, + cudf::device_span parmA, + cudf::device_span parmB, + cudf::size_type width, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32; + auto hashes = detail::minhash_fn(input, parmA, parmB, width, stream, mr); + return build_list_result(input.parent(), std::move(hashes), parmA.size(), stream, mr); +} + std::unique_ptr minhash64(cudf::strings_column_view const& input, cudf::numeric_scalar const& seed, cudf::size_type width, @@ -343,6 +478,18 @@ std::unique_ptr minhash64(cudf::strings_column_view const& input, return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); } +std::unique_ptr minhash64(cudf::strings_column_view const& input, + cudf::device_span parmA, + cudf::device_span parmB, + cudf::size_type width, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128; + auto hashes = detail::minhash_fn(input, parmA, parmB, width, stream, mr); + return build_list_result(input.parent(), std::move(hashes), parmA.size(), stream, mr); +} + std::unique_ptr word_minhash(cudf::lists_column_view const& input, cudf::device_span seeds, rmm::cuda_stream_view stream, @@ -384,6 +531,17 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, return detail::minhash(input, seeds, width, stream, mr); } +std::unique_ptr minhash_permuted(cudf::strings_column_view const& input, + cudf::device_span parmA, + cudf::device_span parmB, + cudf::size_type width, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::minhash(input, parmA, parmB, width, stream, mr); +} + std::unique_ptr minhash64(cudf::strings_column_view const& input, cudf::numeric_scalar seed, cudf::size_type width, @@ -404,6 +562,17 @@ std::unique_ptr minhash64(cudf::strings_column_view const& input, return detail::minhash64(input, seeds, width, stream, mr); } +std::unique_ptr minhash64_permuted(cudf::strings_column_view const& input, + cudf::device_span parmA, + cudf::device_span parmB, + cudf::size_type width, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::minhash64(input, parmA, parmB, width, stream, mr); +} + std::unique_ptr word_minhash(cudf::lists_column_view const& input, cudf::device_span seeds, rmm::cuda_stream_view stream, diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp index c568c162e81..3f6e6b6d666 100644 --- a/cpp/tests/text/minhash_tests.cpp +++ b/cpp/tests/text/minhash_tests.cpp @@ -169,6 +169,64 @@ TEST_F(MinHashTest, MultiSeedWithNullInputRow) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); } +TEST_F(MinHashTest, PermutedMultiSeed) +{ + auto input = + cudf::test::strings_column_wrapper({"doc 1", + "this is doc 2", + "doc 3", + "d", + "The quick brown fox jumpéd over the lazy brown dog.", + "line six", + "line seven", + "line eight", + "line nine", + "line ten"}); + + auto view = cudf::strings_column_view(input); + + auto zero = thrust::counting_iterator(0); + auto seeds = cudf::test::fixed_width_column_wrapper(zero, zero + 3); + auto results = + nvtext::minhash_permuted(view, cudf::column_view(seeds), cudf::column_view(seeds), 4); + + using LCW = cudf::test::lists_column_wrapper; + // clang-format off + LCW expected({LCW{1207251914u, 1207251915u, 2414503830u}, + LCW{ 21141582u, 21141583u, 42283166u}, + LCW{1207251914u, 1207251915u, 2414503830u}, + LCW{ 655955059u, 655955060u, 1311910120u}, + LCW{ 86520422u, 86520423u, 173040846u}, + LCW{ 640477688u, 640477689u, 1280955378u}, + LCW{ 640477688u, 640477689u, 1119757432u}, + LCW{ 304329233u, 304329234u, 608658468u}, + LCW{ 640477688u, 640477689u, 1253170324u}, + LCW{ 640477688u, 640477689u, 109416974u}}); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto seeds64 = cudf::test::fixed_width_column_wrapper({0, 1, 2}); + auto results64 = + nvtext::minhash64_permuted(view, cudf::column_view(seeds64), cudf::column_view(seeds64), 4); + + using LCW64 = cudf::test::lists_column_wrapper; + // clang-format off + LCW64 expected64({ + LCW64{ 774489391575805754ul, 774489391575805755ul, 1355379376230368507ul}, + LCW64{ 3232308021562742685ul, 536921680321302593ul, 3662418748204373ul}, + LCW64{13145552576991307582ul, 1616337530922837828ul, 926832052631981697ul}, + LCW64{14660046701545912182ul, 824988646263748477ul, 1649977292527496946ul}, + LCW64{ 398062025280761388ul, 44496840736841087ul, 88993681473682166ul}, + LCW64{ 2837259098848821044ul, 531416089635127094ul, 1062832179270254188ul}, + LCW64{ 2105419906076957667ul, 291144650667480443ul, 582289301334960878ul}, + LCW64{ 1273320923074904938ul, 584582941553763022ul, 77252561447143819ul}, + LCW64{ 3456065052701055601ul, 584582941553763022ul, 1169165883107526036ul}, + LCW64{10664519708968191209ul, 304692340605188099ul, 152114268187701233ul} + }); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); +} + TEST_F(MinHashTest, WordsMinHash) { using LCWS = cudf::test::lists_column_wrapper; diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx index 59cb8d51440..3775548ff42 100644 --- a/python/cudf/cudf/_lib/nvtext/minhash.pyx +++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx @@ -10,6 +10,8 @@ from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.nvtext.minhash cimport ( minhash as cpp_minhash, minhash64 as cpp_minhash64, + minhash64_permuted as cpp_minhash64_permuted, + minhash_permuted as cpp_minhash_permuted, word_minhash as cpp_word_minhash, word_minhash64 as cpp_word_minhash64, ) @@ -38,6 +40,28 @@ def minhash(Column strings, Column seeds, int width): return Column.from_unique_ptr(move(c_result)) +@acquire_spill_lock() +def minhash_permuted(Column strings, Column a, Column b, int width): + + cdef column_view c_strings = strings.view() + cdef size_type c_width = width + cdef column_view c_a = a.view() + cdef column_view c_b = b.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_minhash_permuted( + c_strings, + c_a, + c_b, + c_width + ) + ) + + return Column.from_unique_ptr(move(c_result)) + + @acquire_spill_lock() def minhash64(Column strings, Column seeds, int width): @@ -58,6 +82,28 @@ def minhash64(Column strings, Column seeds, int width): return Column.from_unique_ptr(move(c_result)) +@acquire_spill_lock() +def minhash64_permuted(Column strings, Column a, Column b, int width): + + cdef column_view c_strings = strings.view() + cdef size_type c_width = width + cdef column_view c_a = a.view() + cdef column_view c_b = b.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_minhash64_permuted( + c_strings, + c_a, + c_b, + c_width + ) + ) + + return Column.from_unique_ptr(move(c_result)) + + @acquire_spill_lock() def word_minhash(Column input, Column seeds): diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index e712937f816..1aafc7df066 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -9,6 +9,8 @@ from cudf._lib.nvtext.minhash import ( minhash, minhash64, + minhash64_permuted, + minhash_permuted, word_minhash, word_minhash64, ) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index b50e23bd52e..616ff19cc53 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5349,6 +5349,25 @@ def minhash( libstrings.minhash(self._column, seeds_column, width) ) + def minhash_permuted( + self, a: ColumnLike, b: ColumnLike, width: int = 4 + ) -> SeriesOrIndex: + a_column = column.as_column(a) + if a_column.dtype != np.uint32: + raise ValueError( + f"Expecting a Series with dtype uint32, got {type(a)}" + ) + b_column = column.as_column(b) + if b_column.dtype != np.uint32: + raise ValueError( + f"Expecting a Series with dtype uint32, got {type(b)}" + ) + return self._return_or_inplace( + libstrings.minhash_permuted( + self._column, a_column, b_column, width + ) + ) + def minhash64( self, seeds: ColumnLike | None = None, width: int = 4 ) -> SeriesOrIndex: @@ -5389,6 +5408,25 @@ def minhash64( libstrings.minhash64(self._column, seeds_column, width) ) + def minhash64_permuted( + self, a: ColumnLike, b: ColumnLike, width: int = 4 + ) -> SeriesOrIndex: + a_column = column.as_column(a) + if a_column.dtype != np.uint64: + raise ValueError( + f"Expecting a Series with dtype uint64, got {type(a)}" + ) + b_column = column.as_column(b) + if b_column.dtype != np.uint64: + raise ValueError( + f"Expecting a Series with dtype uint64, got {type(b)}" + ) + return self._return_or_inplace( + libstrings.minhash64_permuted( + self._column, a_column, b_column, width + ) + ) + def word_minhash(self, seeds: ColumnLike | None = None) -> SeriesOrIndex: """ Compute the minhash of a list column of strings. diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd index f2dd22f43aa..54d97e0559f 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd @@ -14,12 +14,26 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: const size_type width, ) except + + cdef unique_ptr[column] minhash_permuted( + const column_view &strings, + const column_view &a, + const column_view &b, + const size_type width, + ) except + + cdef unique_ptr[column] minhash64( const column_view &strings, const column_view &seeds, const size_type width, ) except + + cdef unique_ptr[column] minhash64_permuted( + const column_view &strings, + const column_view &a, + const column_view &b, + const size_type width, + ) except + + cdef unique_ptr[column] word_minhash( const column_view &input, const column_view &seeds From 186477ec6a2640bcc3ec15f2053ca19063441e07 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 11 Oct 2024 16:10:01 -0400 Subject: [PATCH 09/25] revert benchmark API call --- cpp/benchmarks/text/minhash.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/benchmarks/text/minhash.cpp b/cpp/benchmarks/text/minhash.cpp index 06aeff3e644..a0f9196d05f 100644 --- a/cpp/benchmarks/text/minhash.cpp +++ b/cpp/benchmarks/text/minhash.cpp @@ -56,8 +56,8 @@ static void bench_minhash(nvbench::state& state) state.add_global_memory_writes(num_rows); // output are hashes state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = base64 ? nvtext::minhash64_permuted(input, seeds.view(), seeds.view(), hash_width) - : nvtext::minhash_permuted(input, seeds.view(), seeds.view(), hash_width); + auto result = base64 ? nvtext::minhash64(input, seeds.view(), hash_width) + : nvtext::minhash(input, seeds.view(), hash_width); }); } From a7583d0088b79f03d81b532b49bb45443b6026ea Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 14 Oct 2024 15:20:33 -0400 Subject: [PATCH 10/25] experimental single-hash permutation --- cpp/include/nvtext/minhash.hpp | 4 + cpp/src/text/minhash.cu | 234 ++++++++++++++---- cpp/tests/text/minhash_tests.cpp | 52 ++-- python/cudf/cudf/_lib/nvtext/minhash.pyx | 8 +- python/cudf/cudf/core/column/string.py | 8 +- .../pylibcudf/libcudf/nvtext/minhash.pxd | 2 + python/pylibcudf/pylibcudf/nvtext/minhash.pxd | 16 +- python/pylibcudf/pylibcudf/nvtext/minhash.pyx | 24 +- 8 files changed, 258 insertions(+), 90 deletions(-) diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index f06323e3f5d..7ebd945c66d 100644 --- a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -103,6 +103,7 @@ std::unique_ptr minhash( * @throw std::overflow_error if `parmA.size() * input.size()` exceeds the column size limit * * @param input Strings column to compute minhash + * @param seed Seed value used for the hash algorithm * @param parmA Values used for the hash algorithm * @param parmB Values used for the hash algorithm * @param width The character width used for apply substrings @@ -112,6 +113,7 @@ std::unique_ptr minhash( */ std::unique_ptr minhash_permuted( cudf::strings_column_view const& input, + uint32_t seed, cudf::device_span parmA, cudf::device_span parmB, cudf::size_type width, @@ -192,6 +194,7 @@ std::unique_ptr minhash64( * @throw std::overflow_error if `parmA.size() * input.size()` exceeds the column size limit * * @param input Strings column to compute minhash + * @param seed Seed value used for the hash algorithm * @param parmA Values used for the hash algorithm * @param parmB Values used for the hash algorithm * @param width The character width used for apply substrings @@ -201,6 +204,7 @@ std::unique_ptr minhash64( */ std::unique_ptr minhash64_permuted( cudf::strings_column_view const& input, + uint64_t seed, cudf::device_span parmA, cudf::device_span parmB, cudf::size_type width, diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index 1aa6bbb5613..e067758aed9 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -65,10 +65,7 @@ constexpr cudf::thread_index_type tile_size = block_size; * @param working_memory Memory used to hold intermediate hash values * @param d_hashes Minhash output values for each string */ -template < - typename HashFunction, - typename hash_value_type = std:: - conditional_t, uint32_t, uint64_t>> +template CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, cudf::device_span seeds, cudf::size_type width, @@ -129,11 +126,49 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, } } -template < - typename HashFunction, - typename hash_value_type = std:: - conditional_t, uint32_t, uint64_t>> +template +std::unique_ptr minhash_fn(cudf::strings_column_view const& input, + cudf::device_span seeds, + cudf::size_type width, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_EXPECTS(!seeds.empty(), "Parameter seeds cannot be empty", std::invalid_argument); + CUDF_EXPECTS(width >= 2, + "Parameter width should be an integer value of 2 or greater", + std::invalid_argument); + CUDF_EXPECTS((static_cast(input.size()) * seeds.size()) < + static_cast(std::numeric_limits::max()), + "The number of seeds times the number of input rows exceeds the column size limit", + std::overflow_error); + + auto const output_type = cudf::data_type{cudf::type_to_id()}; + if (input.is_empty()) { return cudf::make_empty_column(output_type); } + + auto const d_strings = cudf::column_device_view::create(input.parent(), stream); + + auto hashes = cudf::make_numeric_column(output_type, + input.size() * static_cast(seeds.size()), + cudf::mask_state::UNALLOCATED, + stream, + mr); + auto d_hashes = hashes->mutable_view().data(); + + auto const wm_size = cudf::util::round_up_safe(seeds.size() * tile_size * input.size(), + static_cast(block_size)); + auto working_memory = rmm::device_uvector(wm_size, stream); + + cudf::detail::grid_1d grid{static_cast(input.size()) * tile_size, + block_size}; + minhash_kernel<<>>( + *d_strings, seeds, width, working_memory.data(), d_hashes); + + return hashes; +} + +template CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_strings, + hash_value_type seed, cudf::device_span parmA, cudf::device_span parmB, cudf::size_type width, @@ -160,6 +195,8 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string extern __shared__ char shmem[]; auto const block_hashes = reinterpret_cast(shmem); + auto const hasher = HashFunction(seed); + for (std::size_t i = 0; i < parmA.size(); i += seed_chunk) { // initialize working memory auto const tile_hashes = block_hashes + (lane_idx * seed_chunk); @@ -178,7 +215,6 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string if ((itr != d_str.data()) && (left > 0)) { continue; } // true if past the end of the string auto const hash_str = cudf::string_view(itr, bytes); - auto const hasher = HashFunction(parmA[0]); hash_value_type hv1; if constexpr (std::is_same_v) { hv1 = hasher(hash_str); @@ -187,9 +223,8 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string } for (std::size_t seed_idx = i; seed_idx < (i + seed_count); ++seed_idx) { - hash_value_type const hv = - seed_idx == 0 ? hv1 - : ((hv1 * parmA[seed_idx] + parmB[seed_idx]) % mersenne_prime) & hash_max; + hash_value_type const hv = // seed_idx == 0 ? hv1 : + ((hv1 * parmA[seed_idx] + parmB[seed_idx]) % mersenne_prime) & hash_max; auto const block_idx = ((seed_idx % seed_chunk) * tile_size) + lane_idx; block_hashes[block_idx] = cuda::std::min(hv, block_hashes[block_idx]); } @@ -205,54 +240,127 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string } } -template < - typename HashFunction, - typename hash_value_type = std:: - conditional_t, uint32_t, uint64_t>> -std::unique_ptr minhash_fn(cudf::strings_column_view const& input, - cudf::device_span seeds, +template +CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings, + hash_value_type seed, + cudf::size_type width, + hash_value_type* working_memory) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + auto const str_idx = idx / tile_size; + if (str_idx >= d_strings.size()) { return; } + if (d_strings.is_null(str_idx)) { return; } + + auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index); + auto const offsets_itr = + cudf::detail::input_offsetalator(offsets.head(), offsets.type(), d_strings.offset()); + auto const offset0 = offsets_itr[0]; // pass this in + auto const offset = offsets_itr[str_idx]; + auto const size_bytes = static_cast(offsets_itr[str_idx + 1] - offset); + if (size_bytes == 0) { return; } + auto const seed_hashes = working_memory + offset - offset0; // hashes_offset + + auto const d_str = cudf::string_view(d_strings.head() + offset, size_bytes); + auto const init = d_str.empty() ? 0 : std::numeric_limits::max(); + auto const lane_idx = idx % tile_size; + + auto const begin = d_str.data() + (lane_idx); + auto const end = d_str.data() + d_str.size_bytes(); + auto const hasher = HashFunction(seed); + + auto d_output = seed_hashes + lane_idx; + for (auto itr = begin; itr < end; itr += tile_size, d_output += tile_size) { + if (cudf::strings::detail::is_utf8_continuation_char(*itr)) { + *d_output = 0; + continue; + } + auto const check_str = // used for counting 'width' characters + cudf::string_view(itr, static_cast(thrust::distance(itr, end))); + auto const [bytes, left] = cudf::strings::detail::bytes_to_character_position(check_str, width); + if ((itr != d_str.data()) && (left > 0)) { continue; } // true if past the end of the string + + auto const hash_str = cudf::string_view(itr, bytes); + hash_value_type hv; + if constexpr (std::is_same_v) { + hv = hasher(hash_str); + } else { + hv = thrust::get<0>(hasher(hash_str)); + } + *d_output = hv; + } +} + +template +CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_strings, + cudf::device_span parmA, + cudf::device_span parmB, cudf::size_type width, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) + hash_value_type* working_memory, + hash_value_type* d_hashes) { - CUDF_EXPECTS(!seeds.empty(), "Parameter seeds cannot be empty", std::invalid_argument); - CUDF_EXPECTS(width >= 2, - "Parameter width should be an integer value of 2 or greater", - std::invalid_argument); - CUDF_EXPECTS((static_cast(input.size()) * seeds.size()) < - static_cast(std::numeric_limits::max()), - "The number of seeds times the number of input rows exceeds the column size limit", - std::overflow_error); + auto const idx = cudf::detail::grid_1d::global_thread_id(); + auto const str_idx = idx / tile_size; + if (str_idx >= d_strings.size()) { return; } + if (d_strings.is_null(str_idx)) { return; } - auto const output_type = cudf::data_type{cudf::type_to_id()}; - if (input.is_empty()) { return cudf::make_empty_column(output_type); } + auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index); + auto const offsets_itr = + cudf::detail::input_offsetalator(offsets.head(), offsets.type(), d_strings.offset()); + auto const offset0 = offsets_itr[0]; // pass this in + auto const offset = offsets_itr[str_idx]; + auto const size_bytes = static_cast(offsets_itr[str_idx + 1] - offset); + auto const seed_hashes = working_memory + offset - offset0; // hashes_offset; + auto const hashes_size = + cuda::std::max(static_cast(size_bytes > 0), size_bytes - width + 1); + + auto const init = size_bytes == 0 ? 0 : std::numeric_limits::max(); + auto const lane_idx = idx % tile_size; + auto const d_output = d_hashes + (str_idx * parmA.size()); - auto const d_strings = cudf::column_device_view::create(input.parent(), stream); + auto const begin = seed_hashes + (lane_idx); + auto const end = seed_hashes + hashes_size; - auto hashes = cudf::make_numeric_column(output_type, - input.size() * static_cast(seeds.size()), - cudf::mask_state::UNALLOCATED, - stream, - mr); - auto d_hashes = hashes->mutable_view().data(); + constexpr std::size_t seed_chunk = 16; // based on block-size==256 + constexpr uint64_t mersenne_prime = (1UL << 61) - 1; + constexpr hash_value_type hash_max = std::numeric_limits::max(); - auto const wm_size = cudf::util::round_up_safe(seeds.size() * tile_size * input.size(), - static_cast(block_size)); - auto working_memory = rmm::device_uvector(wm_size, stream); + extern __shared__ char shmem[]; + auto const block_hashes = reinterpret_cast(shmem); - cudf::detail::grid_1d grid{static_cast(input.size()) * tile_size, - block_size}; - minhash_kernel<<>>( - *d_strings, seeds, width, working_memory.data(), d_hashes); + for (std::size_t i = 0; i < parmA.size(); i += seed_chunk) { + // initialize working memory + auto const tile_hashes = block_hashes + (lane_idx * seed_chunk); + thrust::uninitialized_fill(thrust::seq, tile_hashes, tile_hashes + seed_chunk, init); + __syncthreads(); - return hashes; + auto const seed_count = cuda::std::min(seed_chunk, parmA.size() - i); + + // each lane hashes 'width' substrings of d_str + for (auto itr = begin; itr < end; itr += tile_size) { + auto const hv1 = *itr; + if (hv1 == 0) { continue; } // skips intermediate UTF-8 bytes + + for (std::size_t seed_idx = i; seed_idx < (i + seed_count); ++seed_idx) { + hash_value_type const hv = // seed_idx == 0 ? hv1 : + ((hv1 * parmA[seed_idx] + parmB[seed_idx]) % mersenne_prime) & hash_max; + auto const block_idx = ((seed_idx % seed_chunk) * tile_size) + lane_idx; + block_hashes[block_idx] = cuda::std::min(hv, block_hashes[block_idx]); + } + } + __syncthreads(); + + if (lane_idx < seed_count) { + auto const hvs = block_hashes + (lane_idx * tile_size); + auto const hv = thrust::reduce(thrust::seq, hvs, hvs + tile_size, init, thrust::minimum{}); + d_output[lane_idx + i] = hv; + } + __syncthreads(); + } } -template < - typename HashFunction, - typename hash_value_type = std:: - conditional_t, uint32_t, uint64_t>> +template std::unique_ptr minhash_fn(cudf::strings_column_view const& input, + hash_value_type seed, cudf::device_span parmA, cudf::device_span parmB, cudf::size_type width, @@ -288,9 +396,23 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, cudf::detail::grid_1d grid{static_cast(input.size()) * tile_size, block_size}; +#if 1 minhash_permuted_kernel <<>>( - *d_strings, parmA, parmB, width, d_hashes); + *d_strings, seed, parmA, parmB, width, d_hashes); + +#else + auto const wm_size = input.chars_size(stream); + auto working_memory = rmm::device_uvector(wm_size, stream); + + minhash_seed_kernel + <<>>( + *d_strings, seed, width, working_memory.data()); + + minhash_permuted_kernel + <<>>( + *d_strings, parmA, parmB, width, working_memory.data(), d_hashes); +#endif return hashes; } @@ -443,6 +565,7 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, } std::unique_ptr minhash(cudf::strings_column_view const& input, + uint32_t seed, cudf::device_span parmA, cudf::device_span parmB, cudf::size_type width, @@ -450,7 +573,7 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, rmm::device_async_resource_ref mr) { using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32; - auto hashes = detail::minhash_fn(input, parmA, parmB, width, stream, mr); + auto hashes = detail::minhash_fn(input, seed, parmA, parmB, width, stream, mr); return build_list_result(input.parent(), std::move(hashes), parmA.size(), stream, mr); } @@ -479,6 +602,7 @@ std::unique_ptr minhash64(cudf::strings_column_view const& input, } std::unique_ptr minhash64(cudf::strings_column_view const& input, + uint64_t seed, cudf::device_span parmA, cudf::device_span parmB, cudf::size_type width, @@ -486,7 +610,7 @@ std::unique_ptr minhash64(cudf::strings_column_view const& input, rmm::device_async_resource_ref mr) { using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128; - auto hashes = detail::minhash_fn(input, parmA, parmB, width, stream, mr); + auto hashes = detail::minhash_fn(input, seed, parmA, parmB, width, stream, mr); return build_list_result(input.parent(), std::move(hashes), parmA.size(), stream, mr); } @@ -532,6 +656,7 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, } std::unique_ptr minhash_permuted(cudf::strings_column_view const& input, + uint32_t seed, cudf::device_span parmA, cudf::device_span parmB, cudf::size_type width, @@ -539,7 +664,7 @@ std::unique_ptr minhash_permuted(cudf::strings_column_view const& rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::minhash(input, parmA, parmB, width, stream, mr); + return detail::minhash(input, seed, parmA, parmB, width, stream, mr); } std::unique_ptr minhash64(cudf::strings_column_view const& input, @@ -563,6 +688,7 @@ std::unique_ptr minhash64(cudf::strings_column_view const& input, } std::unique_ptr minhash64_permuted(cudf::strings_column_view const& input, + uint64_t seed, cudf::device_span parmA, cudf::device_span parmB, cudf::size_type width, @@ -570,7 +696,7 @@ std::unique_ptr minhash64_permuted(cudf::strings_column_view const rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::minhash64(input, parmA, parmB, width, stream, mr); + return detail::minhash64(input, seed, parmA, parmB, width, stream, mr); } std::unique_ptr word_minhash(cudf::lists_column_view const& input, diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp index 3f6e6b6d666..4ca0607ede8 100644 --- a/cpp/tests/text/minhash_tests.cpp +++ b/cpp/tests/text/minhash_tests.cpp @@ -185,43 +185,45 @@ TEST_F(MinHashTest, PermutedMultiSeed) auto view = cudf::strings_column_view(input); - auto zero = thrust::counting_iterator(0); - auto seeds = cudf::test::fixed_width_column_wrapper(zero, zero + 3); + auto first = thrust::counting_iterator(10); + auto seeds = cudf::test::fixed_width_column_wrapper(first, first + 3); auto results = - nvtext::minhash_permuted(view, cudf::column_view(seeds), cudf::column_view(seeds), 4); + nvtext::minhash_permuted(view, 0, cudf::column_view(seeds), cudf::column_view(seeds), 4); using LCW = cudf::test::lists_column_wrapper; // clang-format off - LCW expected({LCW{1207251914u, 1207251915u, 2414503830u}, - LCW{ 21141582u, 21141583u, 42283166u}, - LCW{1207251914u, 1207251915u, 2414503830u}, - LCW{ 655955059u, 655955060u, 1311910120u}, - LCW{ 86520422u, 86520423u, 173040846u}, - LCW{ 640477688u, 640477689u, 1280955378u}, - LCW{ 640477688u, 640477689u, 1119757432u}, - LCW{ 304329233u, 304329234u, 608658468u}, - LCW{ 640477688u, 640477689u, 1253170324u}, - LCW{ 640477688u, 640477689u, 109416974u}}); + LCW expected({ + LCW{1392101586u, 394869177u, 811528444u}, + LCW{ 211415830u, 187088503u, 130291444u}, + LCW{2098117052u, 394869177u, 799753544u}, + LCW{2264583304u, 2920538364u, 3576493424u}, + LCW{ 253327882u, 41747273u, 302030804u}, + LCW{2109809594u, 1017470651u, 326988172u}, + LCW{1303819864u, 850676747u, 147107852u}, + LCW{ 736021564u, 720812292u, 1405158760u}, + LCW{ 902780242u, 134064807u, 1613944636u}, + LCW{ 547084870u, 1748895564u, 656501844u} + }); // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto seeds64 = cudf::test::fixed_width_column_wrapper({0, 1, 2}); + auto seeds64 = cudf::test::fixed_width_column_wrapper(first, first + 3); auto results64 = - nvtext::minhash64_permuted(view, cudf::column_view(seeds64), cudf::column_view(seeds64), 4); + nvtext::minhash64_permuted(view, 0, cudf::column_view(seeds64), cudf::column_view(seeds64), 4); using LCW64 = cudf::test::lists_column_wrapper; // clang-format off LCW64 expected64({ - LCW64{ 774489391575805754ul, 774489391575805755ul, 1355379376230368507ul}, - LCW64{ 3232308021562742685ul, 536921680321302593ul, 3662418748204373ul}, - LCW64{13145552576991307582ul, 1616337530922837828ul, 926832052631981697ul}, - LCW64{14660046701545912182ul, 824988646263748477ul, 1649977292527496946ul}, - LCW64{ 398062025280761388ul, 44496840736841087ul, 88993681473682166ul}, - LCW64{ 2837259098848821044ul, 531416089635127094ul, 1062832179270254188ul}, - LCW64{ 2105419906076957667ul, 291144650667480443ul, 582289301334960878ul}, - LCW64{ 1273320923074904938ul, 584582941553763022ul, 77252561447143819ul}, - LCW64{ 3456065052701055601ul, 584582941553763022ul, 1169165883107526036ul}, - LCW64{10664519708968191209ul, 304692340605188099ul, 152114268187701233ul} + LCW64{ 827364888116975697ul, 1601854279692781452ul, 70500662054893256ul}, + LCW64{ 18312093741021833ul, 133793446674258329ul, 21974512489226198ul}, + LCW64{ 22474244732520567ul, 1638811775655358395ul, 949306297364502264ul}, + LCW64{1332357434996402861ul, 2157346081260151330ul, 676491718310205848ul}, + LCW64{ 65816830624808020ul, 43323600380520789ul, 63511816333816345ul}, + LCW64{ 629657184954525200ul, 49741036507643002ul, 97466271004074331ul}, + LCW64{ 301611977846331113ul, 101188874709594830ul, 97466271004074331ul}, + LCW64{ 121498891461700668ul, 171065800427907402ul, 97466271004074331ul}, + LCW64{ 54617739511834072ul, 231454301607238929ul, 97466271004074331ul}, + LCW64{ 576418665851990314ul, 231454301607238929ul, 97466271004074331ul} }); // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx index a0ab7747d73..a95f8edb3c8 100644 --- a/python/cudf/cudf/_lib/nvtext/minhash.pyx +++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx @@ -1,5 +1,7 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. +from libc.stdint cimport uint32_t, uint64_t + from cudf.core.buffer import acquire_spill_lock from cudf._lib.column cimport Column @@ -18,9 +20,10 @@ def minhash(Column input, Column seeds, int width=4): @acquire_spill_lock() -def minhash_permuted(Column input, Column a, Column b, int width): +def minhash_permuted(Column input, uint32_t seed, Column a, Column b, int width): result = nvtext.minhash.minhash_permuted( input.to_pylibcudf(mode="read"), + seed, a.to_pylibcudf(mode="read"), b.to_pylibcudf(mode="read"), width, @@ -39,9 +42,10 @@ def minhash64(Column input, Column seeds, int width=4): @acquire_spill_lock() -def minhash64_permuted(Column input, Column a, Column b, int width): +def minhash64_permuted(Column input, uint64_t seed, Column a, Column b, int width): result = nvtext.minhash.minhash64_permuted( input.to_pylibcudf(mode="read"), + seed, a.to_pylibcudf(mode="read"), b.to_pylibcudf(mode="read"), width, diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index fe50c228248..565806dd271 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5349,7 +5349,7 @@ def minhash( ) def minhash_permuted( - self, a: ColumnLike, b: ColumnLike, width: int = 4 + self, seed: np.uint32, a: ColumnLike, b: ColumnLike, width: int = 4 ) -> SeriesOrIndex: a_column = column.as_column(a) if a_column.dtype != np.uint32: @@ -5363,7 +5363,7 @@ def minhash_permuted( ) return self._return_or_inplace( libstrings.minhash_permuted( - self._column, a_column, b_column, width + self._column, seed, a_column, b_column, width ) ) @@ -5408,7 +5408,7 @@ def minhash64( ) def minhash64_permuted( - self, a: ColumnLike, b: ColumnLike, width: int = 4 + self, seed: np.uint64, a: ColumnLike, b: ColumnLike, width: int = 4 ) -> SeriesOrIndex: a_column = column.as_column(a) if a_column.dtype != np.uint64: @@ -5422,7 +5422,7 @@ def minhash64_permuted( ) return self._return_or_inplace( libstrings.minhash64_permuted( - self._column, a_column, b_column, width + self._column, seed, a_column, b_column, width ) ) diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd index 00c65948a3e..ebf8eda1ce3 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd @@ -24,6 +24,7 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: cdef unique_ptr[column] minhash_permuted( const column_view &strings, + const uint32_t seed, const column_view &a, const column_view &b, const size_type width, @@ -43,6 +44,7 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: cdef unique_ptr[column] minhash64_permuted( const column_view &strings, + const uint64_t seed, const column_view &a, const column_view &b, const size_type width, diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd index 1c639934273..6b544282f44 100644 --- a/python/pylibcudf/pylibcudf/nvtext/minhash.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd @@ -11,11 +11,23 @@ ctypedef fused ColumnOrScalar: cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=*) -cpdef Column minhash_permuted(Column input, Column a, Column b, size_type width) +cpdef Column minhash_permuted( + Column input, + uint32_t seed, + Column a, + Column b, + size_type width +) cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=*) -cpdef Column minhash64_permuted(Column input, Column a, Column b, size_type width) +cpdef Column minhash64_permuted( + Column input, + uint64_t seed, + Column a, + Column b, + size_type width +) cpdef Column word_minhash(Column input, Column seeds) diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx index bc7374644fa..c81abce6649 100644 --- a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx @@ -59,7 +59,13 @@ cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4): return Column.from_libcudf(move(c_result)) -cpdef Column minhash_permuted(Column input, Column a, Column b, size_type width): +cpdef Column minhash_permuted( + Column input, + uint32_t seed, + Column a, + Column b, + size_type width +): """ Returns the minhash values for each string. This function uses MurmurHash3_x86_32 for the hash algorithm. @@ -70,6 +76,8 @@ cpdef Column minhash_permuted(Column input, Column a, Column b, size_type width) ---------- input : Column Strings column to compute minhash + seed : uint32_t + Seed used for the hash function a : Column Seed value(s) used for the hash algorithm. b : Column @@ -88,6 +96,7 @@ cpdef Column minhash_permuted(Column input, Column a, Column b, size_type width) c_result = move( cpp_minhash_permuted( input.view(), + seed, a.view(), b.view(), width @@ -135,10 +144,16 @@ cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4): return Column.from_libcudf(move(c_result)) -cpdef Column minhash64_permuted(Column input, Column a, Column b, size_type width): +cpdef Column minhash64_permuted( + Column input, + uint64_t seed, + Column a, + Column b, + size_type width +): """ Returns the minhash values for each string. - This function uses MurmurHash3_x128_65 for the hash algorithm. + This function uses MurmurHash3_x64_128 for the hash algorithm. For details, see :cpp:func:`minhash`. @@ -146,6 +161,8 @@ cpdef Column minhash64_permuted(Column input, Column a, Column b, size_type widt ---------- input : Column Strings column to compute minhash + seed : uint64_t + Seed used for the hash function a : Column Seed value(s) used for the hash algorithm. b : Column @@ -164,6 +181,7 @@ cpdef Column minhash64_permuted(Column input, Column a, Column b, size_type widt c_result = move( cpp_minhash64_permuted( input.view(), + seed, a.view(), b.view(), width From 42e7d7bcce484ead47b56f8ee580514e8f668446 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 15 Oct 2024 17:35:39 -0400 Subject: [PATCH 11/25] enable seed-hash temporary memory --- cpp/src/text/minhash.cu | 41 +++++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index e067758aed9..7f77125bf68 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -166,6 +166,12 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, return hashes; } +// Number of seeds to process per thread. +// The intermediate values are stored in shared-memory and therefore limits the count. +// Regardless, this value was found to be the most efficient size for both uint32 and uint64 +// hash types based on benchmarks +CUDF_HOST_DEVICE constexpr std::size_t seeds_chunk_size = 16; + template CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_strings, hash_value_type seed, @@ -188,7 +194,7 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string auto const begin = d_str.data() + (lane_idx); auto const end = d_str.data() + d_str.size_bytes(); - constexpr std::size_t seed_chunk = 16; // based on block-size==256 + // constants used for the permutation calculations constexpr uint64_t mersenne_prime = (1UL << 61) - 1; constexpr hash_value_type hash_max = std::numeric_limits::max(); @@ -197,13 +203,13 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string auto const hasher = HashFunction(seed); - for (std::size_t i = 0; i < parmA.size(); i += seed_chunk) { + for (std::size_t i = 0; i < parmA.size(); i += seeds_chunk_size) { // initialize working memory - auto const tile_hashes = block_hashes + (lane_idx * seed_chunk); - thrust::uninitialized_fill(thrust::seq, tile_hashes, tile_hashes + seed_chunk, init); + auto const tile_hashes = block_hashes + (lane_idx * seeds_chunk_size); + thrust::uninitialized_fill(thrust::seq, tile_hashes, tile_hashes + seeds_chunk_size, init); __syncthreads(); - auto const seed_count = cuda::std::min(seed_chunk, parmA.size() - i); + auto const seed_count = cuda::std::min(seeds_chunk_size, parmA.size() - i); // each lane hashes 'width' substrings of d_str for (auto itr = begin; itr < end; itr += tile_size) { @@ -223,9 +229,9 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string } for (std::size_t seed_idx = i; seed_idx < (i + seed_count); ++seed_idx) { - hash_value_type const hv = // seed_idx == 0 ? hv1 : + hash_value_type const hv = ((hv1 * parmA[seed_idx] + parmB[seed_idx]) % mersenne_prime) & hash_max; - auto const block_idx = ((seed_idx % seed_chunk) * tile_size) + lane_idx; + auto const block_idx = ((seed_idx % seeds_chunk_size) * tile_size) + lane_idx; block_hashes[block_idx] = cuda::std::min(hv, block_hashes[block_idx]); } } @@ -261,7 +267,6 @@ CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings, auto const seed_hashes = working_memory + offset - offset0; // hashes_offset auto const d_str = cudf::string_view(d_strings.head() + offset, size_bytes); - auto const init = d_str.empty() ? 0 : std::numeric_limits::max(); auto const lane_idx = idx % tile_size; auto const begin = d_str.data() + (lane_idx); @@ -320,20 +325,20 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string auto const begin = seed_hashes + (lane_idx); auto const end = seed_hashes + hashes_size; - constexpr std::size_t seed_chunk = 16; // based on block-size==256 + // constants used in the permutation calculations constexpr uint64_t mersenne_prime = (1UL << 61) - 1; constexpr hash_value_type hash_max = std::numeric_limits::max(); extern __shared__ char shmem[]; auto const block_hashes = reinterpret_cast(shmem); - for (std::size_t i = 0; i < parmA.size(); i += seed_chunk) { + for (std::size_t i = 0; i < parmA.size(); i += seeds_chunk_size) { // initialize working memory - auto const tile_hashes = block_hashes + (lane_idx * seed_chunk); - thrust::uninitialized_fill(thrust::seq, tile_hashes, tile_hashes + seed_chunk, init); + auto const tile_hashes = block_hashes + (lane_idx * seeds_chunk_size); + thrust::uninitialized_fill(thrust::seq, tile_hashes, tile_hashes + seeds_chunk_size, init); __syncthreads(); - auto const seed_count = cuda::std::min(seed_chunk, parmA.size() - i); + auto const seed_count = cuda::std::min(seeds_chunk_size, parmA.size() - i); // each lane hashes 'width' substrings of d_str for (auto itr = begin; itr < end; itr += tile_size) { @@ -341,9 +346,9 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string if (hv1 == 0) { continue; } // skips intermediate UTF-8 bytes for (std::size_t seed_idx = i; seed_idx < (i + seed_count); ++seed_idx) { - hash_value_type const hv = // seed_idx == 0 ? hv1 : + hash_value_type const hv = ((hv1 * parmA[seed_idx] + parmB[seed_idx]) % mersenne_prime) & hash_max; - auto const block_idx = ((seed_idx % seed_chunk) * tile_size) + lane_idx; + auto const block_idx = ((seed_idx % seeds_chunk_size) * tile_size) + lane_idx; block_hashes[block_idx] = cuda::std::min(hv, block_hashes[block_idx]); } } @@ -391,12 +396,12 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, mr); auto d_hashes = hashes->mutable_view().data(); - // 16 seeds can be held in shared-memory: 32K/block_size(256)/sizeof(hash_value_type) = ~16 - auto const shmem_size = block_size * 16 * sizeof(hash_value_type); + // Found to be the most efficient shared memory size for both hash types + constexpr auto shmem_size = block_size * seeds_chunk_size * sizeof(hash_value_type); cudf::detail::grid_1d grid{static_cast(input.size()) * tile_size, block_size}; -#if 1 +#if 0 minhash_permuted_kernel <<>>( *d_strings, seed, parmA, parmB, width, d_hashes); From f79631a79b6449d8abbed4288a51b7183eec6c75 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 16 Oct 2024 16:02:50 -0400 Subject: [PATCH 12/25] dynamic shared memory to static --- cpp/src/text/minhash.cu | 59 ++++++++++++++++++++++------------------- 1 file changed, 32 insertions(+), 27 deletions(-) diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index 7f77125bf68..2c4e08e002c 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -170,7 +170,7 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, // The intermediate values are stored in shared-memory and therefore limits the count. // Regardless, this value was found to be the most efficient size for both uint32 and uint64 // hash types based on benchmarks -CUDF_HOST_DEVICE constexpr std::size_t seeds_chunk_size = 16; +constexpr cuda::std::size_t seeds_chunk_size = 16; template CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_strings, @@ -181,24 +181,25 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string hash_value_type* d_hashes) { auto const idx = cudf::detail::grid_1d::global_thread_id(); - auto const str_idx = idx / tile_size; + auto const str_idx = idx / block_size; if (str_idx >= d_strings.size()) { return; } if (d_strings.is_null(str_idx)) { return; } auto const d_str = d_strings.element(str_idx); auto const init = d_str.empty() ? 0 : std::numeric_limits::max(); - auto const lane_idx = idx % tile_size; + auto const lane_idx = idx % block_size; auto const d_output = d_hashes + (str_idx * parmA.size()); - auto const begin = d_str.data() + (lane_idx); + auto const begin = d_str.data() + lane_idx; auto const end = d_str.data() + d_str.size_bytes(); // constants used for the permutation calculations constexpr uint64_t mersenne_prime = (1UL << 61) - 1; constexpr hash_value_type hash_max = std::numeric_limits::max(); - extern __shared__ char shmem[]; + // Found to be the most efficient shared memory size for both hash types + __shared__ char shmem[block_size * seeds_chunk_size * sizeof(hash_value_type)]; auto const block_hashes = reinterpret_cast(shmem); auto const hasher = HashFunction(seed); @@ -209,10 +210,11 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string thrust::uninitialized_fill(thrust::seq, tile_hashes, tile_hashes + seeds_chunk_size, init); __syncthreads(); - auto const seed_count = cuda::std::min(seeds_chunk_size, parmA.size() - i); + auto const seed_count = + cuda::std::min(static_cast(seeds_chunk_size), parmA.size() - i); // each lane hashes 'width' substrings of d_str - for (auto itr = begin; itr < end; itr += tile_size) { + for (auto itr = begin; itr < end; itr += block_size) { if (cudf::strings::detail::is_utf8_continuation_char(*itr)) { continue; } auto const check_str = // used for counting 'width' characters cudf::string_view(itr, static_cast(thrust::distance(itr, end))); @@ -229,17 +231,19 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string } for (std::size_t seed_idx = i; seed_idx < (i + seed_count); ++seed_idx) { + // permutation formula used by datatrove hash_value_type const hv = ((hv1 * parmA[seed_idx] + parmB[seed_idx]) % mersenne_prime) & hash_max; - auto const block_idx = ((seed_idx % seeds_chunk_size) * tile_size) + lane_idx; + auto const block_idx = ((seed_idx % seeds_chunk_size) * block_size) + lane_idx; block_hashes[block_idx] = cuda::std::min(hv, block_hashes[block_idx]); } } __syncthreads(); + // reduce each seed to a single min value if (lane_idx < seed_count) { - auto const hvs = block_hashes + (lane_idx * tile_size); - auto const hv = thrust::reduce(thrust::seq, hvs, hvs + tile_size, init, thrust::minimum{}); + auto const hvs = block_hashes + (lane_idx * block_size); + auto const hv = thrust::reduce(thrust::seq, hvs, hvs + block_size, init, thrust::minimum{}); d_output[lane_idx + i] = hv; } __syncthreads(); @@ -304,7 +308,7 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string hash_value_type* d_hashes) { auto const idx = cudf::detail::grid_1d::global_thread_id(); - auto const str_idx = idx / tile_size; + auto const str_idx = idx / block_size; if (str_idx >= d_strings.size()) { return; } if (d_strings.is_null(str_idx)) { return; } @@ -319,17 +323,18 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string cuda::std::max(static_cast(size_bytes > 0), size_bytes - width + 1); auto const init = size_bytes == 0 ? 0 : std::numeric_limits::max(); - auto const lane_idx = idx % tile_size; + auto const lane_idx = idx % block_size; auto const d_output = d_hashes + (str_idx * parmA.size()); - auto const begin = seed_hashes + (lane_idx); + auto const begin = seed_hashes + lane_idx; auto const end = seed_hashes + hashes_size; // constants used in the permutation calculations constexpr uint64_t mersenne_prime = (1UL << 61) - 1; constexpr hash_value_type hash_max = std::numeric_limits::max(); - extern __shared__ char shmem[]; + // Found to be the most efficient shared memory size for both hash types + __shared__ char shmem[block_size * seeds_chunk_size * sizeof(hash_value_type)]; auto const block_hashes = reinterpret_cast(shmem); for (std::size_t i = 0; i < parmA.size(); i += seeds_chunk_size) { @@ -338,25 +343,28 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string thrust::uninitialized_fill(thrust::seq, tile_hashes, tile_hashes + seeds_chunk_size, init); __syncthreads(); - auto const seed_count = cuda::std::min(seeds_chunk_size, parmA.size() - i); + auto const seed_count = + cuda::std::min(static_cast(seeds_chunk_size), parmA.size() - i); - // each lane hashes 'width' substrings of d_str - for (auto itr = begin; itr < end; itr += tile_size) { + // each lane accumulates min hashes in its spot in shared memory + for (auto itr = begin; itr < end; itr += block_size) { auto const hv1 = *itr; - if (hv1 == 0) { continue; } // skips intermediate UTF-8 bytes + if (hv1 == 0) { continue; } // skip intermediate UTF-8 bytes for (std::size_t seed_idx = i; seed_idx < (i + seed_count); ++seed_idx) { + // permutation formula used by datatrove hash_value_type const hv = ((hv1 * parmA[seed_idx] + parmB[seed_idx]) % mersenne_prime) & hash_max; - auto const block_idx = ((seed_idx % seeds_chunk_size) * tile_size) + lane_idx; + auto const block_idx = ((seed_idx % seeds_chunk_size) * block_size) + lane_idx; block_hashes[block_idx] = cuda::std::min(hv, block_hashes[block_idx]); } } __syncthreads(); + // reduce each seed to a single min value if (lane_idx < seed_count) { - auto const hvs = block_hashes + (lane_idx * tile_size); - auto const hv = thrust::reduce(thrust::seq, hvs, hvs + tile_size, init, thrust::minimum{}); + auto const hvs = block_hashes + (lane_idx * block_size); + auto const hv = thrust::reduce(thrust::seq, hvs, hvs + block_size, init, thrust::minimum{}); d_output[lane_idx + i] = hv; } __syncthreads(); @@ -396,14 +404,11 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, mr); auto d_hashes = hashes->mutable_view().data(); - // Found to be the most efficient shared memory size for both hash types - constexpr auto shmem_size = block_size * seeds_chunk_size * sizeof(hash_value_type); - - cudf::detail::grid_1d grid{static_cast(input.size()) * tile_size, + cudf::detail::grid_1d grid{static_cast(input.size()) * block_size, block_size}; #if 0 minhash_permuted_kernel - <<>>( + <<>>( *d_strings, seed, parmA, parmB, width, d_hashes); #else @@ -415,7 +420,7 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, *d_strings, seed, width, working_memory.data()); minhash_permuted_kernel - <<>>( + <<>>( *d_strings, parmA, parmB, width, working_memory.data(), d_hashes); #endif From fda43cc721176aaee3ee6881d8c10bac1f97e66b Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 17 Oct 2024 19:23:45 -0400 Subject: [PATCH 13/25] cleanup variable names, doxygen --- cpp/include/nvtext/minhash.hpp | 68 +++++++++--- cpp/src/text/minhash.cu | 185 +++++++++++++++++---------------- 2 files changed, 146 insertions(+), 107 deletions(-) diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index 7ebd945c66d..0e352cf9b74 100644 --- a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -93,20 +93,37 @@ std::unique_ptr minhash( /** * @brief Returns the minhash values for each string * - * This function uses MurmurHash3_x64_128 for the hash algorithm. + * This function uses MurmurHash3_x86_32 for the hash algorithm. + * + * The input strings are first hashed using the given `seed` over substrings + * of `width` characters. These hash values are then combined with the `a` + * and `b` values using the following formula: + * ``` + * max_hash = max of uint32 + * mp = (1 << 61) - 1 + * hv = hash value of a substring + * pv[i] = ((hv * a[i] + b[i]) % mp) & max_hash + * ``` + * + * This calculation is performed on each substring and the minimum value is computed + * as follows: + * ``` + * mh[j,i] = min(pv[i]) for all substrings in row j + * and where i=[0,a.size()) + * ``` * * Any null row entries result in corresponding null output rows. * * @throw std::invalid_argument if the width < 2 - * @throw std::invalid_argument if parmA is empty - * @throw std::invalid_argument if `parmB.size() != parmA.size()` - * @throw std::overflow_error if `parmA.size() * input.size()` exceeds the column size limit + * @throw std::invalid_argument if parameter_a is empty + * @throw std::invalid_argument if `parameter_b.size() != parameter_a.size()` + * @throw std::overflow_error if `parameter_a.size() * input.size()` exceeds the column size limit * * @param input Strings column to compute minhash * @param seed Seed value used for the hash algorithm - * @param parmA Values used for the hash algorithm - * @param parmB Values used for the hash algorithm - * @param width The character width used for apply substrings + * @param parameter_a Values used for the permuted calculation + * @param parameter_b Values used for the permuted calculation + * @param width The character width of substrings to hash for each row * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return List column of minhash values for each string per seed @@ -114,8 +131,8 @@ std::unique_ptr minhash( std::unique_ptr minhash_permuted( cudf::strings_column_view const& input, uint32_t seed, - cudf::device_span parmA, - cudf::device_span parmB, + cudf::device_span parameter_a, + cudf::device_span parameter_b, cudf::size_type width, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); @@ -186,18 +203,35 @@ std::unique_ptr minhash64( * * This function uses MurmurHash3_x64_128 for the hash algorithm. * + * The input strings are first hashed using the given `seed` over substrings + * of `width` characters. These hash values are then combined with the `a` + * and `b` values using the following formula: + * ``` + * max_hash = max of uint64 + * mp = (1 << 61) - 1 + * hv = hash value of a substring + * pv[i] = ((hv * a[i] + b[i]) % mp) & max_hash + * ``` + * + * This calculation is performed on each substring and the minimum value is computed + * as follows: + * ``` + * mh[j,i] = min(pv[i]) for all substrings in row j + * and where i=[0,a.size()) + * ``` + * * Any null row entries result in corresponding null output rows. * * @throw std::invalid_argument if the width < 2 - * @throw std::invalid_argument if parmA is empty - * @throw std::invalid_argument if `parmB.size() != parmA.size()` - * @throw std::overflow_error if `parmA.size() * input.size()` exceeds the column size limit + * @throw std::invalid_argument if parameter_a is empty + * @throw std::invalid_argument if `parameter_b.size() != parameter_a.size()` + * @throw std::overflow_error if `parameter_a.size() * input.size()` exceeds the column size limit * * @param input Strings column to compute minhash * @param seed Seed value used for the hash algorithm - * @param parmA Values used for the hash algorithm - * @param parmB Values used for the hash algorithm - * @param width The character width used for apply substrings + * @param parameter_a Values used for the permuted calculation + * @param parameter_b Values used for the permuted calculation + * @param width The character width of substrings to hash for each row * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return List column of minhash values for each string per seed @@ -205,8 +239,8 @@ std::unique_ptr minhash64( std::unique_ptr minhash64_permuted( cudf::strings_column_view const& input, uint64_t seed, - cudf::device_span parmA, - cudf::device_span parmB, + cudf::device_span parameter_a, + cudf::device_span parameter_b, cudf::size_type width, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index 2c4e08e002c..3b08088ffc2 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -73,20 +73,20 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, hash_value_type* d_hashes) { auto const idx = cudf::detail::grid_1d::global_thread_id(); - auto const str_idx = idx / tile_size; + auto const str_idx = idx / block_size; if (str_idx >= d_strings.size()) { return; } if (d_strings.is_null(str_idx)) { return; } auto const d_str = d_strings.element(str_idx); auto const init = d_str.empty() ? 0 : std::numeric_limits::max(); - auto const lane_idx = idx % tile_size; + auto const lane_idx = idx % block_size; - auto tile_hashes = working_memory + (str_idx * tile_size * seeds.size()); + auto tile_hashes = working_memory + (str_idx * block_size * seeds.size()); // initialize working memory - for (std::size_t seed_idx = lane_idx; seed_idx < seeds.size(); seed_idx += tile_size) { - auto begin = tile_hashes + (seed_idx * tile_size); - thrust::uninitialized_fill(thrust::seq, begin, begin + tile_size, init); + for (std::size_t seed_idx = lane_idx; seed_idx < seeds.size(); seed_idx += block_size) { + auto begin = tile_hashes + (seed_idx * block_size); + thrust::uninitialized_fill(thrust::seq, begin, begin + block_size, init); } __syncthreads(); @@ -96,7 +96,7 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, auto const end = d_str.data() + d_str.size_bytes(); // each lane hashes 'width' substrings of d_str - for (auto itr = begin; itr < end; itr += tile_size) { + for (auto itr = begin; itr < end; itr += block_size) { if (cudf::strings::detail::is_utf8_continuation_char(*itr)) { continue; } auto const check_str = // used for counting 'width' characters cudf::string_view(itr, static_cast(thrust::distance(itr, end))); @@ -112,16 +112,16 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, } else { hv = thrust::get<0>(hasher(hash_str)); } - tile_hashes[(seed_idx * tile_size) + lane_idx] = - cuda::std::min(hv, tile_hashes[(seed_idx * tile_size) + lane_idx]); + tile_hashes[(seed_idx * block_size) + lane_idx] = + cuda::std::min(hv, tile_hashes[(seed_idx * block_size) + lane_idx]); } } __syncthreads(); // compute final result - for (std::size_t seed_idx = lane_idx; seed_idx < seeds.size(); seed_idx += tile_size) { - auto begin = tile_hashes + (seed_idx * tile_size); - auto hv = thrust::reduce(thrust::seq, begin, begin + tile_size, init, thrust::minimum{}); + for (std::size_t seed_idx = lane_idx; seed_idx < seeds.size(); seed_idx += block_size) { + auto begin = tile_hashes + (seed_idx * block_size); + auto hv = thrust::reduce(thrust::seq, begin, begin + block_size, init, thrust::minimum{}); d_output[seed_idx] = hv; } } @@ -166,17 +166,17 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, return hashes; } -// Number of seeds to process per thread. +// Number of parameter a/b values to process per thread. // The intermediate values are stored in shared-memory and therefore limits the count. // Regardless, this value was found to be the most efficient size for both uint32 and uint64 // hash types based on benchmarks -constexpr cuda::std::size_t seeds_chunk_size = 16; +constexpr cuda::std::size_t calc_chunk_size = 16; template CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_strings, hash_value_type seed, - cudf::device_span parmA, - cudf::device_span parmB, + cudf::device_span parameter_a, + cudf::device_span parameter_b, cudf::size_type width, hash_value_type* d_hashes) { @@ -189,7 +189,7 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string auto const init = d_str.empty() ? 0 : std::numeric_limits::max(); auto const lane_idx = idx % block_size; - auto const d_output = d_hashes + (str_idx * parmA.size()); + auto const d_output = d_hashes + (str_idx * parameter_a.size()); auto const begin = d_str.data() + lane_idx; auto const end = d_str.data() + d_str.size_bytes(); @@ -199,19 +199,19 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string constexpr hash_value_type hash_max = std::numeric_limits::max(); // Found to be the most efficient shared memory size for both hash types - __shared__ char shmem[block_size * seeds_chunk_size * sizeof(hash_value_type)]; + __shared__ char shmem[block_size * calc_chunk_size * sizeof(hash_value_type)]; auto const block_hashes = reinterpret_cast(shmem); auto const hasher = HashFunction(seed); - for (std::size_t i = 0; i < parmA.size(); i += seeds_chunk_size) { + for (std::size_t i = 0; i < parameter_a.size(); i += calc_chunk_size) { // initialize working memory - auto const tile_hashes = block_hashes + (lane_idx * seeds_chunk_size); - thrust::uninitialized_fill(thrust::seq, tile_hashes, tile_hashes + seeds_chunk_size, init); + auto const tile_hashes = block_hashes + (lane_idx * calc_chunk_size); + thrust::uninitialized_fill(thrust::seq, tile_hashes, tile_hashes + calc_chunk_size, init); __syncthreads(); - auto const seed_count = - cuda::std::min(static_cast(seeds_chunk_size), parmA.size() - i); + auto const param_count = + cuda::std::min(static_cast(calc_chunk_size), parameter_a.size() - i); // each lane hashes 'width' substrings of d_str for (auto itr = begin; itr < end; itr += block_size) { @@ -223,28 +223,29 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string if ((itr != d_str.data()) && (left > 0)) { continue; } // true if past the end of the string auto const hash_str = cudf::string_view(itr, bytes); - hash_value_type hv1; + hash_value_type hv; if constexpr (std::is_same_v) { - hv1 = hasher(hash_str); + hv = hasher(hash_str); } else { - hv1 = thrust::get<0>(hasher(hash_str)); + hv = thrust::get<0>(hasher(hash_str)); } - for (std::size_t seed_idx = i; seed_idx < (i + seed_count); ++seed_idx) { + for (std::size_t param_idx = i; param_idx < (i + param_count); ++param_idx) { // permutation formula used by datatrove - hash_value_type const hv = - ((hv1 * parmA[seed_idx] + parmB[seed_idx]) % mersenne_prime) & hash_max; - auto const block_idx = ((seed_idx % seeds_chunk_size) * block_size) + lane_idx; - block_hashes[block_idx] = cuda::std::min(hv, block_hashes[block_idx]); + hash_value_type const v = + ((hv * parameter_a[param_idx] + parameter_b[param_idx]) % mersenne_prime) & hash_max; + auto const block_idx = ((param_idx % calc_chunk_size) * block_size) + lane_idx; + block_hashes[block_idx] = cuda::std::min(v, block_hashes[block_idx]); } } __syncthreads(); - // reduce each seed to a single min value - if (lane_idx < seed_count) { - auto const hvs = block_hashes + (lane_idx * block_size); - auto const hv = thrust::reduce(thrust::seq, hvs, hvs + block_size, init, thrust::minimum{}); - d_output[lane_idx + i] = hv; + // reduce each parameter values vector to a single min value + if (lane_idx < param_count) { + auto const values = block_hashes + (lane_idx * block_size); + auto const minv = + thrust::reduce(thrust::seq, values, values + block_size, init, thrust::minimum{}); + d_output[lane_idx + i] = minv; } __syncthreads(); } @@ -264,16 +265,16 @@ CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings, auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index); auto const offsets_itr = cudf::detail::input_offsetalator(offsets.head(), offsets.type(), d_strings.offset()); - auto const offset0 = offsets_itr[0]; // pass this in + auto const offset0 = offsets_itr[0]; auto const offset = offsets_itr[str_idx]; auto const size_bytes = static_cast(offsets_itr[str_idx + 1] - offset); if (size_bytes == 0) { return; } - auto const seed_hashes = working_memory + offset - offset0; // hashes_offset + auto const seed_hashes = working_memory + offset - offset0; auto const d_str = cudf::string_view(d_strings.head() + offset, size_bytes); auto const lane_idx = idx % tile_size; - auto const begin = d_str.data() + (lane_idx); + auto const begin = d_str.data() + lane_idx; auto const end = d_str.data() + d_str.size_bytes(); auto const hasher = HashFunction(seed); @@ -301,8 +302,8 @@ CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings, template CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_strings, - cudf::device_span parmA, - cudf::device_span parmB, + cudf::device_span parameter_a, + cudf::device_span parameter_b, cudf::size_type width, hash_value_type* working_memory, hash_value_type* d_hashes) @@ -315,16 +316,16 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index); auto const offsets_itr = cudf::detail::input_offsetalator(offsets.head(), offsets.type(), d_strings.offset()); - auto const offset0 = offsets_itr[0]; // pass this in + auto const offset0 = offsets_itr[0]; auto const offset = offsets_itr[str_idx]; auto const size_bytes = static_cast(offsets_itr[str_idx + 1] - offset); - auto const seed_hashes = working_memory + offset - offset0; // hashes_offset; + auto const seed_hashes = working_memory + offset - offset0; auto const hashes_size = cuda::std::max(static_cast(size_bytes > 0), size_bytes - width + 1); auto const init = size_bytes == 0 ? 0 : std::numeric_limits::max(); auto const lane_idx = idx % block_size; - auto const d_output = d_hashes + (str_idx * parmA.size()); + auto const d_output = d_hashes + (str_idx * parameter_a.size()); auto const begin = seed_hashes + lane_idx; auto const end = seed_hashes + hashes_size; @@ -334,38 +335,39 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string constexpr hash_value_type hash_max = std::numeric_limits::max(); // Found to be the most efficient shared memory size for both hash types - __shared__ char shmem[block_size * seeds_chunk_size * sizeof(hash_value_type)]; + __shared__ char shmem[block_size * calc_chunk_size * sizeof(hash_value_type)]; auto const block_hashes = reinterpret_cast(shmem); - for (std::size_t i = 0; i < parmA.size(); i += seeds_chunk_size) { + for (std::size_t i = 0; i < parameter_a.size(); i += calc_chunk_size) { // initialize working memory - auto const tile_hashes = block_hashes + (lane_idx * seeds_chunk_size); - thrust::uninitialized_fill(thrust::seq, tile_hashes, tile_hashes + seeds_chunk_size, init); + auto const tiles = block_hashes + (lane_idx * calc_chunk_size); + thrust::uninitialized_fill(thrust::seq, tiles, tiles + calc_chunk_size, init); __syncthreads(); - auto const seed_count = - cuda::std::min(static_cast(seeds_chunk_size), parmA.size() - i); + auto const param_count = + cuda::std::min(static_cast(calc_chunk_size), parameter_a.size() - i); // each lane accumulates min hashes in its spot in shared memory for (auto itr = begin; itr < end; itr += block_size) { - auto const hv1 = *itr; - if (hv1 == 0) { continue; } // skip intermediate UTF-8 bytes + auto const hv = *itr; + if (hv == 0) { continue; } // skip intermediate UTF-8 bytes - for (std::size_t seed_idx = i; seed_idx < (i + seed_count); ++seed_idx) { + for (std::size_t param_idx = i; param_idx < (i + param_count); ++param_idx) { // permutation formula used by datatrove - hash_value_type const hv = - ((hv1 * parmA[seed_idx] + parmB[seed_idx]) % mersenne_prime) & hash_max; - auto const block_idx = ((seed_idx % seeds_chunk_size) * block_size) + lane_idx; - block_hashes[block_idx] = cuda::std::min(hv, block_hashes[block_idx]); + hash_value_type const v = + ((hv * parameter_a[param_idx] + parameter_b[param_idx]) % mersenne_prime) & hash_max; + auto const block_idx = ((param_idx % calc_chunk_size) * block_size) + lane_idx; + block_hashes[block_idx] = cuda::std::min(v, block_hashes[block_idx]); } } __syncthreads(); - // reduce each seed to a single min value - if (lane_idx < seed_count) { - auto const hvs = block_hashes + (lane_idx * block_size); - auto const hv = thrust::reduce(thrust::seq, hvs, hvs + block_size, init, thrust::minimum{}); - d_output[lane_idx + i] = hv; + // reduce each parameter values vector to a single min value + if (lane_idx < param_count) { + auto const values = block_hashes + (lane_idx * block_size); + auto const minv = + thrust::reduce(thrust::seq, values, values + block_size, init, thrust::minimum{}); + d_output[lane_idx + i] = minv; } __syncthreads(); } @@ -374,34 +376,35 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string template std::unique_ptr minhash_fn(cudf::strings_column_view const& input, hash_value_type seed, - cudf::device_span parmA, - cudf::device_span parmB, + cudf::device_span parameter_a, + cudf::device_span parameter_b, cudf::size_type width, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_EXPECTS(!parmA.empty(), "Parameters A and B cannot be empty", std::invalid_argument); + CUDF_EXPECTS(!parameter_a.empty(), "Parameters A and B cannot be empty", std::invalid_argument); CUDF_EXPECTS(width >= 2, "Parameter width should be an integer value of 2 or greater", std::invalid_argument); - CUDF_EXPECTS((static_cast(input.size()) * parmA.size()) < + CUDF_EXPECTS(parameter_a.size() == parameter_b.size(), + "Parameters A and B should have the same number of elements", + std::invalid_argument); + CUDF_EXPECTS((static_cast(input.size()) * parameter_a.size()) < static_cast(std::numeric_limits::max()), "The number of seeds times the number of input rows exceeds the column size limit", std::overflow_error); - CUDF_EXPECTS(parmA.size() == parmB.size(), - "Parameters A and B should have the same number of elements", - std::invalid_argument); auto const output_type = cudf::data_type{cudf::type_to_id()}; if (input.is_empty()) { return cudf::make_empty_column(output_type); } auto const d_strings = cudf::column_device_view::create(input.parent(), stream); - auto hashes = cudf::make_numeric_column(output_type, - input.size() * static_cast(parmA.size()), - cudf::mask_state::UNALLOCATED, - stream, - mr); + auto hashes = + cudf::make_numeric_column(output_type, + input.size() * static_cast(parameter_a.size()), + cudf::mask_state::UNALLOCATED, + stream, + mr); auto d_hashes = hashes->mutable_view().data(); cudf::detail::grid_1d grid{static_cast(input.size()) * block_size, @@ -409,7 +412,7 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, #if 0 minhash_permuted_kernel <<>>( - *d_strings, seed, parmA, parmB, width, d_hashes); + *d_strings, seed, parameter_a, parameter_b, width, d_hashes); #else auto const wm_size = input.chars_size(stream); @@ -421,7 +424,7 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, minhash_permuted_kernel <<>>( - *d_strings, parmA, parmB, width, working_memory.data(), d_hashes); + *d_strings, parameter_a, parameter_b, width, working_memory.data(), d_hashes); #endif return hashes; @@ -576,15 +579,16 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, std::unique_ptr minhash(cudf::strings_column_view const& input, uint32_t seed, - cudf::device_span parmA, - cudf::device_span parmB, + cudf::device_span parameter_a, + cudf::device_span parameter_b, cudf::size_type width, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32; - auto hashes = detail::minhash_fn(input, seed, parmA, parmB, width, stream, mr); - return build_list_result(input.parent(), std::move(hashes), parmA.size(), stream, mr); + auto hashes = + detail::minhash_fn(input, seed, parameter_a, parameter_b, width, stream, mr); + return build_list_result(input.parent(), std::move(hashes), parameter_a.size(), stream, mr); } std::unique_ptr minhash64(cudf::strings_column_view const& input, @@ -613,15 +617,16 @@ std::unique_ptr minhash64(cudf::strings_column_view const& input, std::unique_ptr minhash64(cudf::strings_column_view const& input, uint64_t seed, - cudf::device_span parmA, - cudf::device_span parmB, + cudf::device_span parameter_a, + cudf::device_span parameter_b, cudf::size_type width, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128; - auto hashes = detail::minhash_fn(input, seed, parmA, parmB, width, stream, mr); - return build_list_result(input.parent(), std::move(hashes), parmA.size(), stream, mr); + auto hashes = + detail::minhash_fn(input, seed, parameter_a, parameter_b, width, stream, mr); + return build_list_result(input.parent(), std::move(hashes), parameter_a.size(), stream, mr); } std::unique_ptr word_minhash(cudf::lists_column_view const& input, @@ -667,14 +672,14 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, std::unique_ptr minhash_permuted(cudf::strings_column_view const& input, uint32_t seed, - cudf::device_span parmA, - cudf::device_span parmB, + cudf::device_span parameter_a, + cudf::device_span parameter_b, cudf::size_type width, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::minhash(input, seed, parmA, parmB, width, stream, mr); + return detail::minhash(input, seed, parameter_a, parameter_b, width, stream, mr); } std::unique_ptr minhash64(cudf::strings_column_view const& input, @@ -699,14 +704,14 @@ std::unique_ptr minhash64(cudf::strings_column_view const& input, std::unique_ptr minhash64_permuted(cudf::strings_column_view const& input, uint64_t seed, - cudf::device_span parmA, - cudf::device_span parmB, + cudf::device_span parameter_a, + cudf::device_span parameter_b, cudf::size_type width, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::minhash64(input, seed, parmA, parmB, width, stream, mr); + return detail::minhash64(input, seed, parameter_a, parameter_b, width, stream, mr); } std::unique_ptr word_minhash(cudf::lists_column_view const& input, From 1f4d441f73399cf211c0ee22c09bbcc142683ef9 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 22 Oct 2024 18:48:05 -0400 Subject: [PATCH 14/25] support for super-wide strings --- cpp/src/text/minhash.cu | 153 ++++++++++++++++++++++++++++++---------- 1 file changed, 117 insertions(+), 36 deletions(-) diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index 3b08088ffc2..549b9faab32 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -38,8 +38,11 @@ #include #include +#include #include #include +#include +#include #include @@ -229,6 +232,7 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string } else { hv = thrust::get<0>(hasher(hash_str)); } + hv = cuda::std::max(hv, hash_value_type{1}); for (std::size_t param_idx = i; param_idx < (i + param_count); ++param_idx) { // permutation formula used by datatrove @@ -251,43 +255,53 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string } } +// values determined using redpajama and books_sample datasets +constexpr cudf::size_type wide_string_threshold = 1 << 18; // 256K +constexpr cudf::size_type blocks_per_string = 64; + template CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings, hash_value_type seed, cudf::size_type width, - hash_value_type* working_memory) + hash_value_type* working_memory, + cudf::size_type* count, + cudf::size_type param_count, + hash_value_type* d_results) { - auto const idx = cudf::detail::grid_1d::global_thread_id(); - auto const str_idx = idx / tile_size; + auto const tid = cudf::detail::grid_1d::global_thread_id(); + auto const str_idx = tid / tile_size; if (str_idx >= d_strings.size()) { return; } if (d_strings.is_null(str_idx)) { return; } auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index); auto const offsets_itr = cudf::detail::input_offsetalator(offsets.head(), offsets.type(), d_strings.offset()); - auto const offset0 = offsets_itr[0]; auto const offset = offsets_itr[str_idx]; auto const size_bytes = static_cast(offsets_itr[str_idx + 1] - offset); if (size_bytes == 0) { return; } - auto const seed_hashes = working_memory + offset - offset0; + + auto const seed_hashes = working_memory + offset - offsets_itr[0]; auto const d_str = cudf::string_view(d_strings.head() + offset, size_bytes); - auto const lane_idx = idx % tile_size; + auto const lane_idx = tid % tile_size; auto const begin = d_str.data() + lane_idx; auto const end = d_str.data() + d_str.size_bytes(); auto const hasher = HashFunction(seed); - auto d_output = seed_hashes + lane_idx; - for (auto itr = begin; itr < end; itr += tile_size, d_output += tile_size) { + auto d_hashes = seed_hashes + lane_idx; + for (auto itr = begin; itr < end; itr += tile_size, d_hashes += tile_size) { if (cudf::strings::detail::is_utf8_continuation_char(*itr)) { - *d_output = 0; + *d_hashes = 0; continue; } auto const check_str = // used for counting 'width' characters cudf::string_view(itr, static_cast(thrust::distance(itr, end))); auto const [bytes, left] = cudf::strings::detail::bytes_to_character_position(check_str, width); - if ((itr != d_str.data()) && (left > 0)) { continue; } // true if past the end of the string + if ((itr != d_str.data()) && (left > 0)) { // true if past the end of the string + *d_hashes = 0; + continue; + } auto const hash_str = cudf::string_view(itr, bytes); hash_value_type hv; @@ -296,35 +310,59 @@ CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings, } else { hv = thrust::get<0>(hasher(hash_str)); } - *d_output = hv; + *d_hashes = cuda::std::max(hv, hash_value_type{1}); + } + + // logic appended here so an extra kernel is not required + if (size_bytes >= wide_string_threshold) { + if (lane_idx == 0) { + // count the number of wider strings + cuda::atomic_ref ref{*count}; + ref.fetch_add(1, cuda::std::memory_order_relaxed); + } + // initialize the output for wider strings + auto d_output = d_results + (str_idx * param_count); + for (auto i = lane_idx; i < param_count; i += tile_size) { + d_output[i] = std::numeric_limits::max(); + } } } -template +template CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_strings, + cudf::device_span indices, cudf::device_span parameter_a, cudf::device_span parameter_b, cudf::size_type width, hash_value_type* working_memory, hash_value_type* d_hashes) { - auto const idx = cudf::detail::grid_1d::global_thread_id(); - auto const str_idx = idx / block_size; - if (str_idx >= d_strings.size()) { return; } + auto const tid = cudf::detail::grid_1d::global_thread_id(); + auto const idx = (tid / blocks_per_string) / block_size; + if (idx >= indices.size()) { return; } + auto const str_idx = indices[idx]; if (d_strings.is_null(str_idx)) { return; } + int const section_idx = blockIdx.x % blocks_per_string; auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index); auto const offsets_itr = cudf::detail::input_offsetalator(offsets.head(), offsets.type(), d_strings.offset()); - auto const offset0 = offsets_itr[0]; - auto const offset = offsets_itr[str_idx]; - auto const size_bytes = static_cast(offsets_itr[str_idx + 1] - offset); - auto const seed_hashes = working_memory + offset - offset0; + auto const offset = offsets_itr[str_idx]; + auto const size_bytes = static_cast(offsets_itr[str_idx + 1] - offset); + + auto const section_size = + (size_bytes / blocks_per_string) + + (section_idx < (blocks_per_string - 1) ? 0 : size_bytes % blocks_per_string); + auto const section_offset = section_idx * (size_bytes / blocks_per_string); + + auto const seed_hashes = working_memory + offset - offsets_itr[0] + section_offset; auto const hashes_size = - cuda::std::max(static_cast(size_bytes > 0), size_bytes - width + 1); + section_idx < (blocks_per_string - 1) + ? section_size + : cuda::std::max(static_cast(size_bytes > 0), section_size - width + 1); auto const init = size_bytes == 0 ? 0 : std::numeric_limits::max(); - auto const lane_idx = idx % block_size; + auto const lane_idx = tid % block_size; auto const d_output = d_hashes + (str_idx * parameter_a.size()); auto const begin = seed_hashes + lane_idx; @@ -367,7 +405,12 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string auto const values = block_hashes + (lane_idx * block_size); auto const minv = thrust::reduce(thrust::seq, values, values + block_size, init, thrust::minimum{}); - d_output[lane_idx + i] = minv; + if constexpr (blocks_per_string > 1) { + cuda::atomic_ref ref{d_output[lane_idx + i]}; + ref.fetch_min(minv, cuda::std::memory_order_relaxed); + } else { + d_output[lane_idx + i] = minv; + } } __syncthreads(); } @@ -409,23 +452,61 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, cudf::detail::grid_1d grid{static_cast(input.size()) * block_size, block_size}; -#if 0 - minhash_permuted_kernel - <<>>( - *d_strings, seed, parameter_a, parameter_b, width, d_hashes); - -#else - auto const wm_size = input.chars_size(stream); - auto working_memory = rmm::device_uvector(wm_size, stream); + auto const wm_size = input.chars_size(stream); + auto working_memory = rmm::device_uvector(wm_size, stream); + auto d_threshold_count = rmm::device_scalar(0, stream); minhash_seed_kernel - <<>>( - *d_strings, seed, width, working_memory.data()); + <<>>(*d_strings, + seed, + width, + working_memory.data(), + d_threshold_count.data(), + parameter_a.size(), + d_hashes); + auto const threshold_count = d_threshold_count.value(stream); + + auto indices = rmm::device_uvector(input.size(), stream); + thrust::sequence(rmm::exec_policy(stream), indices.begin(), indices.end()); + cudf::size_type threshold_index = input.size(); + if (threshold_count < input.size()) { + auto sizes = rmm::device_uvector(input.size(), stream); + thrust::transform(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(input.size()), + sizes.data(), + cuda::proclaim_return_type( + [d_strings = *d_strings] __device__(auto idx) -> cudf::size_type { + if (d_strings.is_null(idx)) { return 0; } + return d_strings.element(idx).size_bytes(); + })); + thrust::sort_by_key( + rmm::exec_policy_nosync(stream), sizes.begin(), sizes.end(), indices.begin()); + auto bp = thrust::lower_bound( + rmm::exec_policy_nosync(stream), sizes.begin(), sizes.end(), wide_string_threshold); + threshold_index = static_cast(thrust::distance(sizes.begin(), bp)); + } - minhash_permuted_kernel - <<>>( - *d_strings, parameter_a, parameter_b, width, working_memory.data(), d_hashes); -#endif + // handle the strings below the threshold width + if (threshold_index > 0) { + auto d_indices = cudf::device_span(indices.data(), threshold_index); + cudf::detail::grid_1d grid{static_cast(d_indices.size()) * block_size, + block_size}; + minhash_permuted_kernel + <<>>( + *d_strings, d_indices, parameter_a, parameter_b, width, working_memory.data(), d_hashes); + } + + // handle the strings above the threshold width + if (threshold_index < input.size()) { + auto count = static_cast(input.size() - threshold_index); + auto d_indices = + cudf::device_span(indices.data() + threshold_index, count); + cudf::detail::grid_1d grid{count * block_size * blocks_per_string, block_size}; + minhash_permuted_kernel + <<>>( + *d_strings, d_indices, parameter_a, parameter_b, width, working_memory.data(), d_hashes); + } return hashes; } From 58f9206dc8903ea8aac76631073a4cd577752b5b Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 23 Oct 2024 13:24:32 -0400 Subject: [PATCH 15/25] fix threshold-index init logic --- cpp/src/text/minhash.cu | 186 +++++++++++++++++++++++++++------------- 1 file changed, 127 insertions(+), 59 deletions(-) diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index 549b9faab32..ddfd81de310 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -37,6 +37,7 @@ #include #include +#include #include #include #include @@ -170,11 +171,13 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, } // Number of parameter a/b values to process per thread. -// The intermediate values are stored in shared-memory and therefore limits the count. -// Regardless, this value was found to be the most efficient size for both uint32 and uint64 -// hash types based on benchmarks +// The intermediate values are stored in shared-memory and therefore limits this count. +// This value was found to be the most efficient size for both uint32 and uint64 +// hash types based on benchmarks. constexpr cuda::std::size_t calc_chunk_size = 16; +// this is deprecated and to be removed in the future; +// keeping it for now for verifying results from the faster kernels below template CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_strings, hash_value_type seed, @@ -188,9 +191,11 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string if (str_idx >= d_strings.size()) { return; } if (d_strings.is_null(str_idx)) { return; } + auto const block = cooperative_groups::this_thread_block(); + auto const d_str = d_strings.element(str_idx); auto const init = d_str.empty() ? 0 : std::numeric_limits::max(); - auto const lane_idx = idx % block_size; + auto const lane_idx = block.thread_rank(); // idx % block_size; auto const d_output = d_hashes + (str_idx * parameter_a.size()); @@ -203,15 +208,15 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string // Found to be the most efficient shared memory size for both hash types __shared__ char shmem[block_size * calc_chunk_size * sizeof(hash_value_type)]; - auto const block_hashes = reinterpret_cast(shmem); + auto const block_values = reinterpret_cast(shmem); auto const hasher = HashFunction(seed); for (std::size_t i = 0; i < parameter_a.size(); i += calc_chunk_size) { // initialize working memory - auto const tile_hashes = block_hashes + (lane_idx * calc_chunk_size); + auto const tile_hashes = block_values + (lane_idx * calc_chunk_size); thrust::uninitialized_fill(thrust::seq, tile_hashes, tile_hashes + calc_chunk_size, init); - __syncthreads(); + block.sync(); auto const param_count = cuda::std::min(static_cast(calc_chunk_size), parameter_a.size() - i); @@ -239,32 +244,55 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string hash_value_type const v = ((hv * parameter_a[param_idx] + parameter_b[param_idx]) % mersenne_prime) & hash_max; auto const block_idx = ((param_idx % calc_chunk_size) * block_size) + lane_idx; - block_hashes[block_idx] = cuda::std::min(v, block_hashes[block_idx]); + block_values[block_idx] = cuda::std::min(v, block_values[block_idx]); } } - __syncthreads(); + block.sync(); // reduce each parameter values vector to a single min value if (lane_idx < param_count) { - auto const values = block_hashes + (lane_idx * block_size); + auto const values = block_values + (lane_idx * block_size); auto const minv = thrust::reduce(thrust::seq, values, values + block_size, init, thrust::minimum{}); d_output[lane_idx + i] = minv; } - __syncthreads(); + block.sync(); } } -// values determined using redpajama and books_sample datasets +// Separate kernels are used to process strings above and below this value (in bytes). constexpr cudf::size_type wide_string_threshold = 1 << 18; // 256K -constexpr cudf::size_type blocks_per_string = 64; +// The number of blocks per string for the above-threshold kernel processing. +constexpr cudf::size_type blocks_per_string = 64; +// These values determined using redpajama and books_sample datasets +/** + * @brief Hashing kernel launched as a thread per tile-size (block or warp) + * + * This kernel computes the hashes for each string using the seed and the specified + * hash function. The width is used to compute rolling substrings to hash over. + * The hashes are stored in d_hashes to be used in the minhash_permuted_kernel. + * + * This kernel also counts the number of strings above the wide_string_threshold + * and proactively initializes the output values for those strings. + * + * @tparam HashFunction The hash function to use for this kernel + * @tparam hash_value_type Derived from HashFunction result_type + * + * @param d_strings The input strings to hash + * @param seed The seed used for the hash function + * @param width Width in characters used for determining substrings to hash + * @param d_hashes The resulting hash values are stored here + * @param count Stores the number of strings above wide_string_threshold + * @param param_count Number of parameters (used for proactive initialize) + * @param d_results Final results vector (used for proactive initialize) + */ template CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings, hash_value_type seed, cudf::size_type width, - hash_value_type* working_memory, - cudf::size_type* count, + hash_value_type* d_hashes, + cudf::size_type* threshold_count, cudf::size_type param_count, hash_value_type* d_results) { @@ -273,6 +301,7 @@ CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings, if (str_idx >= d_strings.size()) { return; } if (d_strings.is_null(str_idx)) { return; } + // retrieve this strings offset to locate the output position in d_hashes auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index); auto const offsets_itr = cudf::detail::input_offsetalator(offsets.head(), offsets.type(), d_strings.offset()); @@ -280,26 +309,26 @@ CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings, auto const size_bytes = static_cast(offsets_itr[str_idx + 1] - offset); if (size_bytes == 0) { return; } - auto const seed_hashes = working_memory + offset - offsets_itr[0]; - auto const d_str = cudf::string_view(d_strings.head() + offset, size_bytes); auto const lane_idx = tid % tile_size; + // hashes for this string/thread stored here + auto seed_hashes = d_hashes + offset - offsets_itr[0] + lane_idx; + auto const begin = d_str.data() + lane_idx; auto const end = d_str.data() + d_str.size_bytes(); auto const hasher = HashFunction(seed); - auto d_hashes = seed_hashes + lane_idx; - for (auto itr = begin; itr < end; itr += tile_size, d_hashes += tile_size) { + for (auto itr = begin; itr < end; itr += tile_size, seed_hashes += tile_size) { if (cudf::strings::detail::is_utf8_continuation_char(*itr)) { - *d_hashes = 0; + *seed_hashes = 0; continue; } auto const check_str = // used for counting 'width' characters cudf::string_view(itr, static_cast(thrust::distance(itr, end))); auto const [bytes, left] = cudf::strings::detail::bytes_to_character_position(check_str, width); if ((itr != d_str.data()) && (left > 0)) { // true if past the end of the string - *d_hashes = 0; + *seed_hashes = 0; continue; } @@ -310,14 +339,15 @@ CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings, } else { hv = thrust::get<0>(hasher(hash_str)); } - *d_hashes = cuda::std::max(hv, hash_value_type{1}); + // disallowing hash to zero case + *seed_hashes = cuda::std::max(hv, hash_value_type{1}); } // logic appended here so an extra kernel is not required if (size_bytes >= wide_string_threshold) { if (lane_idx == 0) { - // count the number of wider strings - cuda::atomic_ref ref{*count}; + // count the number of wide strings + cuda::atomic_ref ref{*threshold_count}; ref.fetch_add(1, cuda::std::memory_order_relaxed); } // initialize the output for wider strings @@ -328,21 +358,46 @@ CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings, } } +/** + * @brief Permutation calculation kernel + * + * This kernel computes uses the hashes from the minhash_seed_kernel to + * compute the results using the parameter_a and parameter_b values. + * The output is the number of input rows (N) by the number of parameter values (M). + * Each output[i] is the calculated result for parameter_a/b[0:M]. + * + * This kernel is launched with either blocks per strings of 1 for strings + * below the wide_strings_threshold or blocks per string = blocks_per_strings + * for strings above wide_strings_threshold. + * + * @tparam hash_value_type Derived from HashFunction result_type + * @tparam blocks_per_string Number of blocks used to process each string + * + * @param d_strings The input strings to hash + * @param indices The indices of the strings in d_strings to process here + * @param parameter_a 1st set of parameters for the calculation result + * @param parameter_b 2nd set of parameters for the calculation result + * @param width Width used for calculating the number of available hashes in each string + * @param d_hashes The hash values computed in minhash_seed_kernel + * @param d_results Final results vector of calculate values + */ template CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_strings, cudf::device_span indices, cudf::device_span parameter_a, cudf::device_span parameter_b, cudf::size_type width, - hash_value_type* working_memory, - hash_value_type* d_hashes) + hash_value_type const* d_hashes, + hash_value_type* d_results) { auto const tid = cudf::detail::grid_1d::global_thread_id(); auto const idx = (tid / blocks_per_string) / block_size; if (idx >= indices.size()) { return; } auto const str_idx = indices[idx]; if (d_strings.is_null(str_idx)) { return; } - int const section_idx = blockIdx.x % blocks_per_string; + + auto const block = cooperative_groups::this_thread_block(); + int const section_idx = block.group_index().x % blocks_per_string; auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index); auto const offsets_itr = @@ -350,20 +405,24 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string auto const offset = offsets_itr[str_idx]; auto const size_bytes = static_cast(offsets_itr[str_idx + 1] - offset); + // number of items to process in this block; + // last block includes any remainder values from the size_bytes/blocks_per_string truncation auto const section_size = (size_bytes / blocks_per_string) + (section_idx < (blocks_per_string - 1) ? 0 : size_bytes % blocks_per_string); auto const section_offset = section_idx * (size_bytes / blocks_per_string); - auto const seed_hashes = working_memory + offset - offsets_itr[0] + section_offset; + // hash values for this block + auto const seed_hashes = d_hashes + offset - offsets_itr[0] + section_offset; + // width used here as a max value since a string's char-count <= byte-count auto const hashes_size = section_idx < (blocks_per_string - 1) ? section_size : cuda::std::max(static_cast(size_bytes > 0), section_size - width + 1); auto const init = size_bytes == 0 ? 0 : std::numeric_limits::max(); - auto const lane_idx = tid % block_size; - auto const d_output = d_hashes + (str_idx * parameter_a.size()); + auto const lane_idx = block.thread_rank(); + auto const d_output = d_results + (str_idx * parameter_a.size()); auto const begin = seed_hashes + lane_idx; auto const end = seed_hashes + hashes_size; @@ -372,15 +431,16 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string constexpr uint64_t mersenne_prime = (1UL << 61) - 1; constexpr hash_value_type hash_max = std::numeric_limits::max(); - // Found to be the most efficient shared memory size for both hash types + // found to be the most efficient shared memory size for both hash types __shared__ char shmem[block_size * calc_chunk_size * sizeof(hash_value_type)]; - auto const block_hashes = reinterpret_cast(shmem); + auto const block_values = reinterpret_cast(shmem); for (std::size_t i = 0; i < parameter_a.size(); i += calc_chunk_size) { - // initialize working memory - auto const tiles = block_hashes + (lane_idx * calc_chunk_size); - thrust::uninitialized_fill(thrust::seq, tiles, tiles + calc_chunk_size, init); - __syncthreads(); + // initialize our chunk of shared memory + // each thread handles calc_chunk_size of values + auto const chunk_values = block_values + (lane_idx * calc_chunk_size); + thrust::uninitialized_fill(thrust::seq, chunk_values, chunk_values + calc_chunk_size, init); + block.sync(); auto const param_count = cuda::std::min(static_cast(calc_chunk_size), parameter_a.size() - i); @@ -388,31 +448,35 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string // each lane accumulates min hashes in its spot in shared memory for (auto itr = begin; itr < end; itr += block_size) { auto const hv = *itr; - if (hv == 0) { continue; } // skip intermediate UTF-8 bytes + if (hv == 0) { continue; } // 0 is used as a skip sentinel for (std::size_t param_idx = i; param_idx < (i + param_count); ++param_idx) { // permutation formula used by datatrove hash_value_type const v = ((hv * parameter_a[param_idx] + parameter_b[param_idx]) % mersenne_prime) & hash_max; auto const block_idx = ((param_idx % calc_chunk_size) * block_size) + lane_idx; - block_hashes[block_idx] = cuda::std::min(v, block_hashes[block_idx]); + block_values[block_idx] = cuda::std::min(v, block_values[block_idx]); } } - __syncthreads(); + block.sync(); // reduce each parameter values vector to a single min value + // assumes that the block_size > calc_chunk_size + // each thread reduces a block_size of parameter values (thread per parameter) if (lane_idx < param_count) { - auto const values = block_hashes + (lane_idx * block_size); + auto const values = block_values + (lane_idx * block_size); + // cooperative groups does not have a min function and cub::BlockReduce was slower auto const minv = thrust::reduce(thrust::seq, values, values + block_size, init, thrust::minimum{}); if constexpr (blocks_per_string > 1) { + // accumulates mins for each block into d_output cuda::atomic_ref ref{d_output[lane_idx + i]}; ref.fetch_min(minv, cuda::std::memory_order_relaxed); } else { d_output[lane_idx + i] = minv; } } - __syncthreads(); + block.sync(); } } @@ -425,51 +489,55 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_EXPECTS(!parameter_a.empty(), "Parameters A and B cannot be empty", std::invalid_argument); CUDF_EXPECTS(width >= 2, "Parameter width should be an integer value of 2 or greater", std::invalid_argument); + CUDF_EXPECTS(!parameter_a.empty(), "Parameters A and B cannot be empty", std::invalid_argument); CUDF_EXPECTS(parameter_a.size() == parameter_b.size(), "Parameters A and B should have the same number of elements", std::invalid_argument); - CUDF_EXPECTS((static_cast(input.size()) * parameter_a.size()) < - static_cast(std::numeric_limits::max()), - "The number of seeds times the number of input rows exceeds the column size limit", - std::overflow_error); + CUDF_EXPECTS( + (static_cast(input.size()) * parameter_a.size()) < + static_cast(std::numeric_limits::max()), + "The number of parameters times the number of input rows exceeds the column size limit", + std::overflow_error); auto const output_type = cudf::data_type{cudf::type_to_id()}; if (input.is_empty()) { return cudf::make_empty_column(output_type); } auto const d_strings = cudf::column_device_view::create(input.parent(), stream); - auto hashes = + auto results = cudf::make_numeric_column(output_type, input.size() * static_cast(parameter_a.size()), cudf::mask_state::UNALLOCATED, stream, mr); - auto d_hashes = hashes->mutable_view().data(); + auto d_results = results->mutable_view().data(); cudf::detail::grid_1d grid{static_cast(input.size()) * block_size, block_size}; - auto const wm_size = input.chars_size(stream); - auto working_memory = rmm::device_uvector(wm_size, stream); + auto const hashes_size = input.chars_size(stream); + auto d_hashes = rmm::device_uvector(hashes_size, stream); auto d_threshold_count = rmm::device_scalar(0, stream); minhash_seed_kernel <<>>(*d_strings, seed, width, - working_memory.data(), + d_hashes.data(), d_threshold_count.data(), parameter_a.size(), - d_hashes); + d_results); auto const threshold_count = d_threshold_count.value(stream); auto indices = rmm::device_uvector(input.size(), stream); thrust::sequence(rmm::exec_policy(stream), indices.begin(), indices.end()); - cudf::size_type threshold_index = input.size(); - if (threshold_count < input.size()) { + cudf::size_type threshold_index = threshold_count < input.size() ? input.size() : 0; + + // if we counted a split of above/below threshold then + // compute partitions based on the size of each string + if ((threshold_count > 0) && (threshold_count < input.size())) { auto sizes = rmm::device_uvector(input.size(), stream); thrust::transform(rmm::exec_policy_nosync(stream), thrust::counting_iterator(0), @@ -482,9 +550,9 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, })); thrust::sort_by_key( rmm::exec_policy_nosync(stream), sizes.begin(), sizes.end(), indices.begin()); - auto bp = thrust::lower_bound( + auto const lb = thrust::lower_bound( rmm::exec_policy_nosync(stream), sizes.begin(), sizes.end(), wide_string_threshold); - threshold_index = static_cast(thrust::distance(sizes.begin(), bp)); + threshold_index = static_cast(thrust::distance(sizes.begin(), lb)); } // handle the strings below the threshold width @@ -494,7 +562,7 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, block_size}; minhash_permuted_kernel <<>>( - *d_strings, d_indices, parameter_a, parameter_b, width, working_memory.data(), d_hashes); + *d_strings, d_indices, parameter_a, parameter_b, width, d_hashes.data(), d_results); } // handle the strings above the threshold width @@ -505,10 +573,10 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, cudf::detail::grid_1d grid{count * block_size * blocks_per_string, block_size}; minhash_permuted_kernel <<>>( - *d_strings, d_indices, parameter_a, parameter_b, width, working_memory.data(), d_hashes); + *d_strings, d_indices, parameter_a, parameter_b, width, d_hashes.data(), d_results); } - return hashes; + return results; } /** From fef6e0e6bb38b66fbbf458f5e9f5874f21ef86b6 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 23 Oct 2024 15:56:02 -0400 Subject: [PATCH 16/25] use cudf::detail::device_scalar --- cpp/src/text/minhash.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index ddfd81de310..fa6e7a30a62 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -519,7 +520,7 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, block_size}; auto const hashes_size = input.chars_size(stream); auto d_hashes = rmm::device_uvector(hashes_size, stream); - auto d_threshold_count = rmm::device_scalar(0, stream); + auto d_threshold_count = cudf::detail::device_scalar(0, stream); minhash_seed_kernel <<>>(*d_strings, From e1067b354922cfcdc6a98a122ed2d410816094ae Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 24 Oct 2024 12:58:25 -0400 Subject: [PATCH 17/25] fix benchmarks; add gtest --- cpp/benchmarks/text/minhash.cpp | 31 +++++++------- cpp/include/nvtext/minhash.hpp | 12 +++--- cpp/src/text/minhash.cu | 44 +++++++++---------- cpp/tests/text/minhash_tests.cpp | 73 +++++++++++++++++++++++--------- 4 files changed, 97 insertions(+), 63 deletions(-) diff --git a/cpp/benchmarks/text/minhash.cpp b/cpp/benchmarks/text/minhash.cpp index a0f9196d05f..a80d0dcbdb8 100644 --- a/cpp/benchmarks/text/minhash.cpp +++ b/cpp/benchmarks/text/minhash.cpp @@ -20,8 +20,6 @@ #include -#include - #include static void bench_minhash(nvbench::state& state) @@ -29,25 +27,25 @@ static void bench_minhash(nvbench::state& state) auto const num_rows = static_cast(state.get_int64("num_rows")); auto const row_width = static_cast(state.get_int64("row_width")); auto const hash_width = static_cast(state.get_int64("hash_width")); - auto const seed_count = static_cast(state.get_int64("seed_count")); + auto const parameters = static_cast(state.get_int64("parameters")); auto const base64 = state.get_int64("hash_type") == 64; - if ((num_rows * seed_count * (base64 ? sizeof(int64_t) : sizeof(int32_t)) * 32L) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks requiring more than 2GB working memory"); - } - data_profile const strings_profile = data_profile_builder().distribution( cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); auto const strings_table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile); cudf::strings_column_view input(strings_table->view().column(0)); - data_profile const seeds_profile = data_profile_builder().no_validity().distribution( - cudf::type_to_id(), distribution_id::NORMAL, 0, row_width); - auto const seed_type = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32; - auto const seeds_table = create_random_table({seed_type}, row_count{seed_count}, seeds_profile); - auto seeds = seeds_table->get_column(0); + data_profile const param_profile = data_profile_builder().no_validity().distribution( + cudf::type_to_id(), + distribution_id::NORMAL, + 0u, + std::numeric_limits::max()); + auto const param_type = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32; + auto const param_table = + create_random_table({param_type, param_type}, row_count{parameters}, param_profile); + auto const parameters_a = param_table->view().column(0); + auto const parameters_b = param_table->view().column(1); state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); @@ -56,8 +54,9 @@ static void bench_minhash(nvbench::state& state) state.add_global_memory_writes(num_rows); // output are hashes state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = base64 ? nvtext::minhash64(input, seeds.view(), hash_width) - : nvtext::minhash(input, seeds.view(), hash_width); + auto result = base64 + ? nvtext::minhash64_permuted(input, 0, parameters_a, parameters_b, hash_width) + : nvtext::minhash_permuted(input, 0, parameters_a, parameters_b, hash_width); }); } @@ -66,5 +65,5 @@ NVBENCH_BENCH(bench_minhash) .add_int64_axis("num_rows", {15000, 30000, 60000}) .add_int64_axis("row_width", {6000, 28000, 50000}) .add_int64_axis("hash_width", {12, 24}) - .add_int64_axis("seed_count", {26, 260}) + .add_int64_axis("parameters", {26, 260}) .add_int64_axis("hash_type", {32, 64}); diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index 0e352cf9b74..95d98b9aef4 100644 --- a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -97,12 +97,12 @@ std::unique_ptr minhash( * * The input strings are first hashed using the given `seed` over substrings * of `width` characters. These hash values are then combined with the `a` - * and `b` values using the following formula: + * and `b` parameter values using the following formula: * ``` * max_hash = max of uint32 * mp = (1 << 61) - 1 - * hv = hash value of a substring - * pv[i] = ((hv * a[i] + b[i]) % mp) & max_hash + * hv[i] = hash value of a substring at i + * pv[i] = ((hv[i] * a[i] + b[i]) % mp) & max_hash * ``` * * This calculation is performed on each substring and the minimum value is computed @@ -205,12 +205,12 @@ std::unique_ptr minhash64( * * The input strings are first hashed using the given `seed` over substrings * of `width` characters. These hash values are then combined with the `a` - * and `b` values using the following formula: + * and `b` parameter values using the following formula: * ``` * max_hash = max of uint64 * mp = (1 << 61) - 1 - * hv = hash value of a substring - * pv[i] = ((hv * a[i] + b[i]) % mp) & max_hash + * hv[i] = hash value of a substring at i + * pv[i] = ((hv[i] * a[i] + b[i]) % mp) & max_hash * ``` * * This calculation is performed on each substring and the minimum value is computed diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index fa6e7a30a62..7e45e3ad3fd 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -175,7 +175,7 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, // The intermediate values are stored in shared-memory and therefore limits this count. // This value was found to be the most efficient size for both uint32 and uint64 // hash types based on benchmarks. -constexpr cuda::std::size_t calc_chunk_size = 16; +constexpr cuda::std::size_t params_per_thread = 16; // this is deprecated and to be removed in the future; // keeping it for now for verifying results from the faster kernels below @@ -208,19 +208,19 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string constexpr hash_value_type hash_max = std::numeric_limits::max(); // Found to be the most efficient shared memory size for both hash types - __shared__ char shmem[block_size * calc_chunk_size * sizeof(hash_value_type)]; + __shared__ char shmem[block_size * params_per_thread * sizeof(hash_value_type)]; auto const block_values = reinterpret_cast(shmem); auto const hasher = HashFunction(seed); - for (std::size_t i = 0; i < parameter_a.size(); i += calc_chunk_size) { + for (std::size_t i = 0; i < parameter_a.size(); i += params_per_thread) { // initialize working memory - auto const tile_hashes = block_values + (lane_idx * calc_chunk_size); - thrust::uninitialized_fill(thrust::seq, tile_hashes, tile_hashes + calc_chunk_size, init); + auto const tile_hashes = block_values + (lane_idx * params_per_thread); + thrust::uninitialized_fill(thrust::seq, tile_hashes, tile_hashes + params_per_thread, init); block.sync(); auto const param_count = - cuda::std::min(static_cast(calc_chunk_size), parameter_a.size() - i); + cuda::std::min(static_cast(params_per_thread), parameter_a.size() - i); // each lane hashes 'width' substrings of d_str for (auto itr = begin; itr < end; itr += block_size) { @@ -244,7 +244,7 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string // permutation formula used by datatrove hash_value_type const v = ((hv * parameter_a[param_idx] + parameter_b[param_idx]) % mersenne_prime) & hash_max; - auto const block_idx = ((param_idx % calc_chunk_size) * block_size) + lane_idx; + auto const block_idx = ((param_idx % params_per_thread) * block_size) + lane_idx; block_values[block_idx] = cuda::std::min(v, block_values[block_idx]); } } @@ -284,7 +284,7 @@ constexpr cudf::size_type blocks_per_string = 64; * @param seed The seed used for the hash function * @param width Width in characters used for determining substrings to hash * @param d_hashes The resulting hash values are stored here - * @param count Stores the number of strings above wide_string_threshold + * @param threshold_count Stores the number of strings above wide_string_threshold * @param param_count Number of parameters (used for proactive initialize) * @param d_results Final results vector (used for proactive initialize) */ @@ -351,7 +351,7 @@ CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings, cuda::atomic_ref ref{*threshold_count}; ref.fetch_add(1, cuda::std::memory_order_relaxed); } - // initialize the output for wider strings + // initialize the output -- only needed for wider strings auto d_output = d_results + (str_idx * param_count); for (auto i = lane_idx; i < param_count; i += tile_size) { d_output[i] = std::numeric_limits::max(); @@ -362,8 +362,8 @@ CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings, /** * @brief Permutation calculation kernel * - * This kernel computes uses the hashes from the minhash_seed_kernel to - * compute the results using the parameter_a and parameter_b values. + * This kernel uses the hashes from the minhash_seed_kernel and the parameter_a and + * parameter_b values to compute the final output results. * The output is the number of input rows (N) by the number of parameter values (M). * Each output[i] is the calculated result for parameter_a/b[0:M]. * @@ -433,20 +433,20 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string constexpr hash_value_type hash_max = std::numeric_limits::max(); // found to be the most efficient shared memory size for both hash types - __shared__ char shmem[block_size * calc_chunk_size * sizeof(hash_value_type)]; + __shared__ char shmem[block_size * params_per_thread * sizeof(hash_value_type)]; auto const block_values = reinterpret_cast(shmem); - for (std::size_t i = 0; i < parameter_a.size(); i += calc_chunk_size) { - // initialize our chunk of shared memory - // each thread handles calc_chunk_size of values - auto const chunk_values = block_values + (lane_idx * calc_chunk_size); - thrust::uninitialized_fill(thrust::seq, chunk_values, chunk_values + calc_chunk_size, init); + for (std::size_t i = 0; i < parameter_a.size(); i += params_per_thread) { + // initialize this block's chunk of shared memory + // each thread handles params_per_thread of values + auto const chunk_values = block_values + (lane_idx * params_per_thread); + thrust::uninitialized_fill(thrust::seq, chunk_values, chunk_values + params_per_thread, init); block.sync(); auto const param_count = - cuda::std::min(static_cast(calc_chunk_size), parameter_a.size() - i); + cuda::std::min(static_cast(params_per_thread), parameter_a.size() - i); - // each lane accumulates min hashes in its spot in shared memory + // each lane accumulates min hashes in its shared memory for (auto itr = begin; itr < end; itr += block_size) { auto const hv = *itr; if (hv == 0) { continue; } // 0 is used as a skip sentinel @@ -455,14 +455,14 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string // permutation formula used by datatrove hash_value_type const v = ((hv * parameter_a[param_idx] + parameter_b[param_idx]) % mersenne_prime) & hash_max; - auto const block_idx = ((param_idx % calc_chunk_size) * block_size) + lane_idx; + auto const block_idx = ((param_idx % params_per_thread) * block_size) + lane_idx; block_values[block_idx] = cuda::std::min(v, block_values[block_idx]); } } block.sync(); // reduce each parameter values vector to a single min value - // assumes that the block_size > calc_chunk_size + // assumes that the block_size > params_per_thread // each thread reduces a block_size of parameter values (thread per parameter) if (lane_idx < param_count) { auto const values = block_values + (lane_idx * block_size); @@ -568,7 +568,7 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, // handle the strings above the threshold width if (threshold_index < input.size()) { - auto count = static_cast(input.size() - threshold_index); + auto const count = static_cast(input.size() - threshold_index); auto d_indices = cudf::device_span(indices.data() + threshold_index, count); cudf::detail::grid_1d grid{count * block_size * blocks_per_string, block_size}; diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp index 4ca0607ede8..0f4d0ce0dc2 100644 --- a/cpp/tests/text/minhash_tests.cpp +++ b/cpp/tests/text/minhash_tests.cpp @@ -169,7 +169,7 @@ TEST_F(MinHashTest, MultiSeedWithNullInputRow) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); } -TEST_F(MinHashTest, PermutedMultiSeed) +TEST_F(MinHashTest, Permuted) { auto input = cudf::test::strings_column_wrapper({"doc 1", @@ -185,31 +185,31 @@ TEST_F(MinHashTest, PermutedMultiSeed) auto view = cudf::strings_column_view(input); - auto first = thrust::counting_iterator(10); - auto seeds = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto first = thrust::counting_iterator(10); + auto params = cudf::test::fixed_width_column_wrapper(first, first + 3); auto results = - nvtext::minhash_permuted(view, 0, cudf::column_view(seeds), cudf::column_view(seeds), 4); + nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4); - using LCW = cudf::test::lists_column_wrapper; + using LCW32 = cudf::test::lists_column_wrapper; // clang-format off - LCW expected({ - LCW{1392101586u, 394869177u, 811528444u}, - LCW{ 211415830u, 187088503u, 130291444u}, - LCW{2098117052u, 394869177u, 799753544u}, - LCW{2264583304u, 2920538364u, 3576493424u}, - LCW{ 253327882u, 41747273u, 302030804u}, - LCW{2109809594u, 1017470651u, 326988172u}, - LCW{1303819864u, 850676747u, 147107852u}, - LCW{ 736021564u, 720812292u, 1405158760u}, - LCW{ 902780242u, 134064807u, 1613944636u}, - LCW{ 547084870u, 1748895564u, 656501844u} + LCW32 expected({ + LCW32{1392101586u, 394869177u, 811528444u}, + LCW32{ 211415830u, 187088503u, 130291444u}, + LCW32{2098117052u, 394869177u, 799753544u}, + LCW32{2264583304u, 2920538364u, 3576493424u}, + LCW32{ 253327882u, 41747273u, 302030804u}, + LCW32{2109809594u, 1017470651u, 326988172u}, + LCW32{1303819864u, 850676747u, 147107852u}, + LCW32{ 736021564u, 720812292u, 1405158760u}, + LCW32{ 902780242u, 134064807u, 1613944636u}, + LCW32{ 547084870u, 1748895564u, 656501844u} }); // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto seeds64 = cudf::test::fixed_width_column_wrapper(first, first + 3); - auto results64 = - nvtext::minhash64_permuted(view, 0, cudf::column_view(seeds64), cudf::column_view(seeds64), 4); + auto params64 = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results64 = nvtext::minhash64_permuted( + view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); using LCW64 = cudf::test::lists_column_wrapper; // clang-format off @@ -229,6 +229,41 @@ TEST_F(MinHashTest, PermutedMultiSeed) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); } +TEST_F(MinHashTest, PermutedWide) +{ + std::string const small(2 << 10, 'x'); // below wide_string_threshold + std::string const wide(2 << 19, 'y'); // above wide_string_threshold + auto input = cudf::test::strings_column_wrapper({small, wide}); + auto view = cudf::strings_column_view(input); + + auto first = thrust::counting_iterator(20); + auto params = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results = + nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4); + + using LCW32 = cudf::test::lists_column_wrapper; + // clang-format off + LCW32 expected({ + LCW32{1731998032u, 315359380u, 3193688024u}, + LCW32{1293098788u, 2860992281u, 133918478u} + }); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto params64 = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results64 = nvtext::minhash64_permuted( + view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); + + using LCW64 = cudf::test::lists_column_wrapper; + // clang-format off + LCW64 expected64({ + LCW64{1818322427062143853ul, 641024893347719371ul, 1769570368846988848ul}, + LCW64{1389920339306667795ul, 421787002125838902ul, 1759496674158703968ul} + }); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); +} + TEST_F(MinHashTest, WordsMinHash) { using LCWS = cudf::test::lists_column_wrapper; From 081546f960d2efa1ca4ce01ef3ec89ab676c4e5c Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 25 Oct 2024 16:04:47 -0400 Subject: [PATCH 18/25] reinstate original non-permuted code --- cpp/benchmarks/CMakeLists.txt | 4 +- cpp/src/text/minhash.cu | 176 +++++++----------------- cpp/tests/CMakeLists.txt | 1 + cpp/tests/text/minhash_tests.cpp | 223 +++++-------------------------- 4 files changed, 86 insertions(+), 318 deletions(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 2a4ac789046..11819dfe281 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -348,8 +348,8 @@ ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binary ConfigureBench(TEXT_BENCH text/subword.cpp) ConfigureNVBench( - TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/ngrams.cpp - text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp + TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp + text/ngrams.cpp text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp ) # ################################################################################################## diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index 7e45e3ad3fd..88b61fb6c8c 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -52,56 +52,54 @@ namespace nvtext { namespace detail { namespace { -constexpr cudf::thread_index_type block_size = 256; -// for tuning independently from block_size -constexpr cudf::thread_index_type tile_size = block_size; - /** * @brief Compute the minhash of each string for each seed * - * This is a block-per-string algorithm where parallel threads within a block - * work on a single string row. + * This is a warp-per-string algorithm where parallel threads within a warp + * work on substrings of a single string row. * * @tparam HashFunction hash function to use on each substring * * @param d_strings Strings column to process * @param seeds Seeds for hashing each string * @param width Substring window size in characters - * @param working_memory Memory used to hold intermediate hash values * @param d_hashes Minhash output values for each string */ -template +template < + typename HashFunction, + typename hash_value_type = std:: + conditional_t, uint32_t, uint64_t>> CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, cudf::device_span seeds, cudf::size_type width, - hash_value_type* working_memory, hash_value_type* d_hashes) { - auto const idx = cudf::detail::grid_1d::global_thread_id(); - auto const str_idx = idx / block_size; - if (str_idx >= d_strings.size()) { return; } + auto const idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + if (idx >= (static_cast(d_strings.size()) * + static_cast(cudf::detail::warp_size))) { + return; + } + + auto const str_idx = static_cast(idx / cudf::detail::warp_size); + auto const lane_idx = static_cast(idx % cudf::detail::warp_size); + if (d_strings.is_null(str_idx)) { return; } auto const d_str = d_strings.element(str_idx); - auto const init = d_str.empty() ? 0 : std::numeric_limits::max(); - auto const lane_idx = idx % block_size; - - auto tile_hashes = working_memory + (str_idx * block_size * seeds.size()); + auto const d_output = d_hashes + (str_idx * seeds.size()); - // initialize working memory - for (std::size_t seed_idx = lane_idx; seed_idx < seeds.size(); seed_idx += block_size) { - auto begin = tile_hashes + (seed_idx * block_size); - thrust::uninitialized_fill(thrust::seq, begin, begin + block_size, init); + // initialize hashes output for this string + if (lane_idx == 0) { + auto const init = d_str.empty() ? 0 : std::numeric_limits::max(); + thrust::fill(thrust::seq, d_output, d_output + seeds.size(), init); } - __syncthreads(); - - auto const d_output = d_hashes + (str_idx * seeds.size()); + __syncwarp(); auto const begin = d_str.data() + lane_idx; auto const end = d_str.data() + d_str.size_bytes(); // each lane hashes 'width' substrings of d_str - for (auto itr = begin; itr < end; itr += block_size) { + for (auto itr = begin; itr < end; itr += cudf::detail::warp_size) { if (cudf::strings::detail::is_utf8_continuation_char(*itr)) { continue; } auto const check_str = // used for counting 'width' characters cudf::string_view(itr, static_cast(thrust::distance(itr, end))); @@ -109,29 +107,30 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, if ((itr != d_str.data()) && (left > 0)) { continue; } // true if past the end of the string auto const hash_str = cudf::string_view(itr, bytes); + // hashing with each seed on the same section of the string is 10x faster than + // computing the substrings for each seed for (std::size_t seed_idx = 0; seed_idx < seeds.size(); ++seed_idx) { auto const hasher = HashFunction(seeds[seed_idx]); - hash_value_type hv; + // hash substring and store the min value if constexpr (std::is_same_v) { - hv = hasher(hash_str); + auto const hvalue = hasher(hash_str); + cuda::atomic_ref ref{*(d_output + seed_idx)}; + ref.fetch_min(hvalue, cuda::std::memory_order_relaxed); } else { - hv = thrust::get<0>(hasher(hash_str)); + // This code path assumes the use of MurmurHash3_x64_128 which produces 2 uint64 values + // but only uses the first uint64 value as requested by the LLM team. + auto const hvalue = thrust::get<0>(hasher(hash_str)); + cuda::atomic_ref ref{*(d_output + seed_idx)}; + ref.fetch_min(hvalue, cuda::std::memory_order_relaxed); } - tile_hashes[(seed_idx * block_size) + lane_idx] = - cuda::std::min(hv, tile_hashes[(seed_idx * block_size) + lane_idx]); } } - __syncthreads(); - - // compute final result - for (std::size_t seed_idx = lane_idx; seed_idx < seeds.size(); seed_idx += block_size) { - auto begin = tile_hashes + (seed_idx * block_size); - auto hv = thrust::reduce(thrust::seq, begin, begin + block_size, init, thrust::minimum{}); - d_output[seed_idx] = hv; - } } -template +template < + typename HashFunction, + typename hash_value_type = std:: + conditional_t, uint32_t, uint64_t>> std::unique_ptr minhash_fn(cudf::strings_column_view const& input, cudf::device_span seeds, cudf::size_type width, @@ -159,108 +158,25 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, mr); auto d_hashes = hashes->mutable_view().data(); - auto const wm_size = cudf::util::round_up_safe(seeds.size() * tile_size * input.size(), - static_cast(block_size)); - auto working_memory = rmm::device_uvector(wm_size, stream); - - cudf::detail::grid_1d grid{static_cast(input.size()) * tile_size, - block_size}; + constexpr cudf::thread_index_type block_size = 256; + cudf::detail::grid_1d grid{ + static_cast(input.size()) * cudf::detail::warp_size, block_size}; minhash_kernel<<>>( - *d_strings, seeds, width, working_memory.data(), d_hashes); + *d_strings, seeds, width, d_hashes); return hashes; } -// Number of parameter a/b values to process per thread. +constexpr cudf::thread_index_type block_size = 256; +// for tuning independently from block_size +constexpr cudf::thread_index_type tile_size = block_size; + +// Number of a/b parameter values to process per thread. // The intermediate values are stored in shared-memory and therefore limits this count. // This value was found to be the most efficient size for both uint32 and uint64 // hash types based on benchmarks. constexpr cuda::std::size_t params_per_thread = 16; -// this is deprecated and to be removed in the future; -// keeping it for now for verifying results from the faster kernels below -template -CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_strings, - hash_value_type seed, - cudf::device_span parameter_a, - cudf::device_span parameter_b, - cudf::size_type width, - hash_value_type* d_hashes) -{ - auto const idx = cudf::detail::grid_1d::global_thread_id(); - auto const str_idx = idx / block_size; - if (str_idx >= d_strings.size()) { return; } - if (d_strings.is_null(str_idx)) { return; } - - auto const block = cooperative_groups::this_thread_block(); - - auto const d_str = d_strings.element(str_idx); - auto const init = d_str.empty() ? 0 : std::numeric_limits::max(); - auto const lane_idx = block.thread_rank(); // idx % block_size; - - auto const d_output = d_hashes + (str_idx * parameter_a.size()); - - auto const begin = d_str.data() + lane_idx; - auto const end = d_str.data() + d_str.size_bytes(); - - // constants used for the permutation calculations - constexpr uint64_t mersenne_prime = (1UL << 61) - 1; - constexpr hash_value_type hash_max = std::numeric_limits::max(); - - // Found to be the most efficient shared memory size for both hash types - __shared__ char shmem[block_size * params_per_thread * sizeof(hash_value_type)]; - auto const block_values = reinterpret_cast(shmem); - - auto const hasher = HashFunction(seed); - - for (std::size_t i = 0; i < parameter_a.size(); i += params_per_thread) { - // initialize working memory - auto const tile_hashes = block_values + (lane_idx * params_per_thread); - thrust::uninitialized_fill(thrust::seq, tile_hashes, tile_hashes + params_per_thread, init); - block.sync(); - - auto const param_count = - cuda::std::min(static_cast(params_per_thread), parameter_a.size() - i); - - // each lane hashes 'width' substrings of d_str - for (auto itr = begin; itr < end; itr += block_size) { - if (cudf::strings::detail::is_utf8_continuation_char(*itr)) { continue; } - auto const check_str = // used for counting 'width' characters - cudf::string_view(itr, static_cast(thrust::distance(itr, end))); - auto const [bytes, left] = - cudf::strings::detail::bytes_to_character_position(check_str, width); - if ((itr != d_str.data()) && (left > 0)) { continue; } // true if past the end of the string - - auto const hash_str = cudf::string_view(itr, bytes); - hash_value_type hv; - if constexpr (std::is_same_v) { - hv = hasher(hash_str); - } else { - hv = thrust::get<0>(hasher(hash_str)); - } - hv = cuda::std::max(hv, hash_value_type{1}); - - for (std::size_t param_idx = i; param_idx < (i + param_count); ++param_idx) { - // permutation formula used by datatrove - hash_value_type const v = - ((hv * parameter_a[param_idx] + parameter_b[param_idx]) % mersenne_prime) & hash_max; - auto const block_idx = ((param_idx % params_per_thread) * block_size) + lane_idx; - block_values[block_idx] = cuda::std::min(v, block_values[block_idx]); - } - } - block.sync(); - - // reduce each parameter values vector to a single min value - if (lane_idx < param_count) { - auto const values = block_values + (lane_idx * block_size); - auto const minv = - thrust::reduce(thrust::seq, values, values + block_size, init, thrust::minimum{}); - d_output[lane_idx + i] = minv; - } - block.sync(); - } -} - // Separate kernels are used to process strings above and below this value (in bytes). constexpr cudf::size_type wide_string_threshold = 1 << 18; // 256K // The number of blocks per string for the above-threshold kernel processing. diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index b78a64d0e55..a4213dcbe94 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -611,6 +611,7 @@ ConfigureTest( text/bpe_tests.cpp text/edit_distance_tests.cpp text/jaccard_tests.cpp + text/minhash_tests.cpp text/ngrams_tests.cpp text/ngrams_tokenize_tests.cpp text/normalize_tests.cpp diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp index 0f4d0ce0dc2..3c0c0068a49 100644 --- a/cpp/tests/text/minhash_tests.cpp +++ b/cpp/tests/text/minhash_tests.cpp @@ -32,143 +32,6 @@ struct MinHashTest : public cudf::test::BaseFixture {}; -TEST_F(MinHashTest, Basic) -{ - auto validity = cudf::test::iterators::null_at(1); - auto input = - cudf::test::strings_column_wrapper({"doc 1", - "", - "this is doc 2", - "", - "doc 3", - "d", - "The quick brown fox jumpéd over the lazy brown dog.", - "line eight", - "line nine", - "line ten"}, - validity); - - auto view = cudf::strings_column_view(input); - - auto results = nvtext::minhash(view); - - auto expected = cudf::test::fixed_width_column_wrapper({1207251914u, - 0u, - 21141582u, - 0u, - 1207251914u, - 655955059u, - 86520422u, - 304329233u, - 640477688u, - 640477688u}, - validity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - - auto results64 = nvtext::minhash64(view); - auto expected64 = cudf::test::fixed_width_column_wrapper({774489391575805754ul, - 0ul, - 3232308021562742685ul, - 0ul, - 13145552576991307582ul, - 14660046701545912182ul, - 398062025280761388ul, - 1273320923074904938ul, - 3456065052701055601ul, - 10664519708968191209ul}, - validity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); -} - -TEST_F(MinHashTest, LengthEqualsWidth) -{ - auto input = cudf::test::strings_column_wrapper({"abcdé", "fghjk", "lmnop", "qrstu", "vwxyz"}); - auto view = cudf::strings_column_view(input); - auto results = nvtext::minhash(view, 0, 5); - auto expected = cudf::test::fixed_width_column_wrapper( - {3825281041u, 2728681928u, 1984332911u, 3965004915u, 192452857u}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); -} - -TEST_F(MinHashTest, MultiSeed) -{ - auto input = - cudf::test::strings_column_wrapper({"doc 1", - "this is doc 2", - "doc 3", - "d", - "The quick brown fox jumpéd over the lazy brown dog.", - "line six", - "line seven", - "line eight", - "line nine", - "line ten"}); - - auto view = cudf::strings_column_view(input); - - auto seeds = cudf::test::fixed_width_column_wrapper({0, 1, 2}); - auto results = nvtext::minhash(view, cudf::column_view(seeds)); - - using LCW = cudf::test::lists_column_wrapper; - // clang-format off - LCW expected({LCW{1207251914u, 1677652962u, 1061355987u}, - LCW{ 21141582u, 580916568u, 1258052021u}, - LCW{1207251914u, 943567174u, 1109272887u}, - LCW{ 655955059u, 488346356u, 2394664816u}, - LCW{ 86520422u, 236622901u, 102546228u}, - LCW{ 640477688u, 198451716u, 136303992u}, - LCW{ 640477688u, 198451716u, 577802054u}, - LCW{ 304329233u, 198451716u, 714941560u}, - LCW{ 640477688u, 198451716u, 261342259u}, - LCW{ 640477688u, 198451716u, 139988887u}}); - // clang-format on - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - - auto seeds64 = cudf::test::fixed_width_column_wrapper({0, 1, 2}); - auto results64 = nvtext::minhash64(view, cudf::column_view(seeds64)); - - using LCW64 = cudf::test::lists_column_wrapper; - // clang-format off - LCW64 expected64({LCW64{ 774489391575805754ul, 10435654231793485448ul, 1188598072697676120ul}, - LCW64{ 3232308021562742685ul, 4445611509348165860ul, 1188598072697676120ul}, - LCW64{13145552576991307582ul, 6846192680998069919ul, 1188598072697676120ul}, - LCW64{14660046701545912182ul, 17106501326045553694ul, 17713478494106035784ul}, - LCW64{ 398062025280761388ul, 377720198157450084ul, 984941365662009329ul}, - LCW64{ 2837259098848821044ul, 650799815433771163ul, 2428991957842356245ul}, - LCW64{ 2105419906076957667ul, 650799815433771163ul, 2428991957842356245ul}, - LCW64{ 1273320923074904938ul, 650799815433771163ul, 2428991957842356245ul}, - LCW64{ 3456065052701055601ul, 650799815433771163ul, 2428991957842356245ul}, - LCW64{10664519708968191209ul, 650799815433771163ul, 2428991957842356245ul}}); - // clang-format on - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); -} - -TEST_F(MinHashTest, MultiSeedWithNullInputRow) -{ - auto validity = cudf::test::iterators::null_at(1); - auto input = cudf::test::strings_column_wrapper({"abcdéfgh", "", "", "stuvwxyz"}, validity); - auto view = cudf::strings_column_view(input); - - auto seeds = cudf::test::fixed_width_column_wrapper({1, 2}); - auto results = nvtext::minhash(view, cudf::column_view(seeds)); - - using LCW = cudf::test::lists_column_wrapper; - LCW expected({LCW{484984072u, 1074168784u}, LCW{}, LCW{0u, 0u}, LCW{571652169u, 173528385u}}, - validity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - - auto seeds64 = cudf::test::fixed_width_column_wrapper({11, 22}); - auto results64 = nvtext::minhash64(view, cudf::column_view(seeds64)); - - using LCW64 = cudf::test::lists_column_wrapper; - LCW64 expected64({LCW64{2597399324547032480ul, 4461410998582111052ul}, - LCW64{}, - LCW64{0ul, 0ul}, - LCW64{2717781266371273264ul, 6977325820868387259ul}}, - validity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); -} - TEST_F(MinHashTest, Permuted) { auto input = @@ -264,48 +127,17 @@ TEST_F(MinHashTest, PermutedWide) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); } -TEST_F(MinHashTest, WordsMinHash) -{ - using LCWS = cudf::test::lists_column_wrapper; - auto validity = cudf::test::iterators::null_at(1); - - LCWS input( - {LCWS({"hello", "abcdéfgh"}), - LCWS{}, - LCWS({"rapids", "moré", "test", "text"}), - LCWS({"The", "quick", "brown", "fox", "jumpéd", "over", "the", "lazy", "brown", "dog"})}, - validity); - - auto view = cudf::lists_column_view(input); - - auto seeds = cudf::test::fixed_width_column_wrapper({1, 2}); - auto results = nvtext::word_minhash(view, cudf::column_view(seeds)); - using LCW32 = cudf::test::lists_column_wrapper; - LCW32 expected({LCW32{2069617641u, 1975382903u}, - LCW32{}, - LCW32{657297235u, 1010955999u}, - LCW32{644643885u, 310002789u}}, - validity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - - auto seeds64 = cudf::test::fixed_width_column_wrapper({11, 22}); - auto results64 = nvtext::word_minhash64(view, cudf::column_view(seeds64)); - using LCW64 = cudf::test::lists_column_wrapper; - LCW64 expected64({LCW64{1940333969930105370ul, 272615362982418219ul}, - LCW64{}, - LCW64{5331949571924938590ul, 2088583894581919741ul}, - LCW64{3400468157617183341ul, 2398577492366130055ul}}, - validity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); -} - TEST_F(MinHashTest, EmptyTest) { - auto input = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); - auto view = cudf::strings_column_view(input->view()); - auto results = nvtext::minhash(view); + auto input = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); + auto view = cudf::strings_column_view(input->view()); + auto params = cudf::test::fixed_width_column_wrapper({1, 2, 3}); + auto results = + nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4); EXPECT_EQ(results->size(), 0); - results = nvtext::minhash64(view); + auto params64 = cudf::test::fixed_width_column_wrapper({1, 2, 3}); + results = nvtext::minhash64_permuted( + view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); EXPECT_EQ(results->size(), 0); } @@ -313,20 +145,39 @@ TEST_F(MinHashTest, ErrorsTest) { auto input = cudf::test::strings_column_wrapper({"this string intentionally left blank"}); auto view = cudf::strings_column_view(input); - EXPECT_THROW(nvtext::minhash(view, 0, 0), std::invalid_argument); - EXPECT_THROW(nvtext::minhash64(view, 0, 0), std::invalid_argument); - auto seeds = cudf::test::fixed_width_column_wrapper(); - EXPECT_THROW(nvtext::minhash(view, cudf::column_view(seeds)), std::invalid_argument); - auto seeds64 = cudf::test::fixed_width_column_wrapper(); - EXPECT_THROW(nvtext::minhash64(view, cudf::column_view(seeds64)), std::invalid_argument); + auto empty = cudf::test::fixed_width_column_wrapper(); + EXPECT_THROW( + nvtext::minhash_permuted(view, 0, cudf::column_view(empty), cudf::column_view(empty), 0), + std::invalid_argument); + auto empty64 = cudf::test::fixed_width_column_wrapper(); + EXPECT_THROW( + nvtext::minhash64_permuted(view, 0, cudf::column_view(empty64), cudf::column_view(empty64), 0), + std::invalid_argument); + EXPECT_THROW( + nvtext::minhash_permuted(view, 0, cudf::column_view(empty), cudf::column_view(empty), 4), + std::invalid_argument); + EXPECT_THROW( + nvtext::minhash64_permuted(view, 0, cudf::column_view(empty64), cudf::column_view(empty64), 4), + std::invalid_argument); std::vector h_input(50000, ""); input = cudf::test::strings_column_wrapper(h_input.begin(), h_input.end()); view = cudf::strings_column_view(input); auto const zeroes = thrust::constant_iterator(0); - seeds = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); - EXPECT_THROW(nvtext::minhash(view, cudf::column_view(seeds)), std::overflow_error); - seeds64 = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); - EXPECT_THROW(nvtext::minhash64(view, cudf::column_view(seeds64)), std::overflow_error); + auto params = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); + EXPECT_THROW( + nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4), + std::overflow_error); + auto params64 = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); + EXPECT_THROW(nvtext::minhash64_permuted( + view, 0, cudf::column_view(params64), cudf::column_view(params64), 4), + std::overflow_error); + + EXPECT_THROW( + nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(empty), 4), + std::invalid_argument); + EXPECT_THROW( + nvtext::minhash64_permuted(view, 0, cudf::column_view(params64), cudf::column_view(empty64), 4), + std::invalid_argument); } From ef7bb46cc72061a1cfb7e252e65a714f3e62994b Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 28 Oct 2024 11:14:19 -0400 Subject: [PATCH 19/25] add gtest for parameter chunking --- cpp/src/text/minhash.cu | 37 ++++++++++++--------- cpp/tests/text/minhash_tests.cpp | 57 ++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+), 16 deletions(-) diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index 88b61fb6c8c..362672e74bb 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -168,12 +168,12 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, } constexpr cudf::thread_index_type block_size = 256; -// for tuning independently from block_size +// for potentially tuning minhash_seed_kernel independently from block_size constexpr cudf::thread_index_type tile_size = block_size; // Number of a/b parameter values to process per thread. // The intermediate values are stored in shared-memory and therefore limits this count. -// This value was found to be the most efficient size for both uint32 and uint64 +// This value was found to be an efficient size for both uint32 and uint64 // hash types based on benchmarks. constexpr cuda::std::size_t params_per_thread = 16; @@ -181,7 +181,7 @@ constexpr cuda::std::size_t params_per_thread = 16; constexpr cudf::size_type wide_string_threshold = 1 << 18; // 256K // The number of blocks per string for the above-threshold kernel processing. constexpr cudf::size_type blocks_per_string = 64; -// These values determined using redpajama and books_sample datasets +// The above values were determined using the redpajama and books_sample datasets /** * @brief Hashing kernel launched as a thread per tile-size (block or warp) @@ -201,8 +201,8 @@ constexpr cudf::size_type blocks_per_string = 64; * @param width Width in characters used for determining substrings to hash * @param d_hashes The resulting hash values are stored here * @param threshold_count Stores the number of strings above wide_string_threshold - * @param param_count Number of parameters (used for proactive initialize) - * @param d_results Final results vector (used for proactive initialize) + * @param param_count Number of parameters (used for the proactive initialize) + * @param d_results Final results vector (used for the proactive initialize) */ template CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings, @@ -218,7 +218,7 @@ CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings, if (str_idx >= d_strings.size()) { return; } if (d_strings.is_null(str_idx)) { return; } - // retrieve this strings offset to locate the output position in d_hashes + // retrieve this string's offset to locate the output position in d_hashes auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index); auto const offsets_itr = cudf::detail::input_offsetalator(offsets.head(), offsets.type(), d_strings.offset()); @@ -229,7 +229,7 @@ CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings, auto const d_str = cudf::string_view(d_strings.head() + offset, size_bytes); auto const lane_idx = tid % tile_size; - // hashes for this string/thread stored here + // hashes for this string/thread are stored here auto seed_hashes = d_hashes + offset - offsets_itr[0] + lane_idx; auto const begin = d_str.data() + lane_idx; @@ -244,7 +244,8 @@ CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings, auto const check_str = // used for counting 'width' characters cudf::string_view(itr, static_cast(thrust::distance(itr, end))); auto const [bytes, left] = cudf::strings::detail::bytes_to_character_position(check_str, width); - if ((itr != d_str.data()) && (left > 0)) { // true if past the end of the string + if ((itr != d_str.data()) && (left > 0)) { + // true itr+width is past the end of the string *seed_hashes = 0; continue; } @@ -291,10 +292,10 @@ CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings, * @tparam blocks_per_string Number of blocks used to process each string * * @param d_strings The input strings to hash - * @param indices The indices of the strings in d_strings to process here + * @param indices The indices of the strings in d_strings to process * @param parameter_a 1st set of parameters for the calculation result * @param parameter_b 2nd set of parameters for the calculation result - * @param width Width used for calculating the number of available hashes in each string + * @param width Used for calculating the number of available hashes in each string * @param d_hashes The hash values computed in minhash_seed_kernel * @param d_results Final results vector of calculate values */ @@ -323,13 +324,16 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string auto const size_bytes = static_cast(offsets_itr[str_idx + 1] - offset); // number of items to process in this block; - // last block includes any remainder values from the size_bytes/blocks_per_string truncation + // last block also includes any remainder values from the size_bytes/blocks_per_string truncation + // example: + // each section_size for string with size 588090 and blocks_per_string=64 is 9188 + // except the last section which is 9188 + (588090 % 64) = 9246 auto const section_size = (size_bytes / blocks_per_string) + (section_idx < (blocks_per_string - 1) ? 0 : size_bytes % blocks_per_string); auto const section_offset = section_idx * (size_bytes / blocks_per_string); - // hash values for this block + // hash values for this block/section auto const seed_hashes = d_hashes + offset - offsets_itr[0] + section_offset; // width used here as a max value since a string's char-count <= byte-count auto const hashes_size = @@ -348,7 +352,7 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string constexpr uint64_t mersenne_prime = (1UL << 61) - 1; constexpr hash_value_type hash_max = std::numeric_limits::max(); - // found to be the most efficient shared memory size for both hash types + // found to be an efficient shared memory size for both hash types __shared__ char shmem[block_size * params_per_thread * sizeof(hash_value_type)]; auto const block_values = reinterpret_cast(shmem); @@ -365,7 +369,8 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string // each lane accumulates min hashes in its shared memory for (auto itr = begin; itr < end; itr += block_size) { auto const hv = *itr; - if (hv == 0) { continue; } // 0 is used as a skip sentinel + // 0 is used as a skip sentinel for UTF-8 and trailing bytes + if (hv == 0) { continue; } for (std::size_t param_idx = i; param_idx < (i + param_count); ++param_idx) { // permutation formula used by datatrove @@ -377,8 +382,8 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string } block.sync(); - // reduce each parameter values vector to a single min value - // assumes that the block_size > params_per_thread + // reduce each parameter values vector to a single min value; + // assumes that the block_size > params_per_thread; // each thread reduces a block_size of parameter values (thread per parameter) if (lane_idx < param_count) { auto const values = block_values + (lane_idx * block_size); diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp index 3c0c0068a49..1332edb4675 100644 --- a/cpp/tests/text/minhash_tests.cpp +++ b/cpp/tests/text/minhash_tests.cpp @@ -127,6 +127,63 @@ TEST_F(MinHashTest, PermutedWide) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); } +TEST_F(MinHashTest, PermutedManyParameters) +{ + std::string const small(2 << 10, 'x'); + std::string const wide(2 << 19, 'y'); + auto input = cudf::test::strings_column_wrapper({small, wide}); + auto view = cudf::strings_column_view(input); + + auto first = thrust::counting_iterator(20); + // more than params_per_thread + auto params = cudf::test::fixed_width_column_wrapper(first, first + 31); + auto results = + nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4); + + using LCW32 = cudf::test::lists_column_wrapper; + // clang-format off + LCW32 expected({ + LCW32{1731998032u, 315359380u, 3193688024u, 1777049372u, 360410720u, 3238739364u, 1822100712u, 405462060u, + 3283790704u, 1867152052u, 450513400u, 3328842044u, 1912203392u, 495564740u, 3373893384u, 1957254732u, + 540616080u, 3418944724u, 2002306072u, 585667420u, 3463996064u, 2047357412u, 630718760u, 3509047404u, + 2092408752u, 675770100u, 3554098744u, 2137460092u, 720821440u, 3599150084u, 2182511432u}, + LCW32{1293098788u, 2860992281u, 133918478u, 1701811971u, 3269705464u, 542631661u, 2110525154u, 3678418647u, + 951344844u, 2519238337u, 4087131830u, 1360058027u, 2927951520u, 200877717u, 1768771210u, 3336664703u, + 609590900u, 2177484393u, 3745377886u, 1018304083u, 2586197576u, 4154091069u, 1427017266u, 2994910759u, + 267836956u, 1835730449u, 3403623942u, 676550139u, 2244443632u, 3812337125u, 1085263322u} + }); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + // more than params_per_thread + auto params64 = cudf::test::fixed_width_column_wrapper(first, first + 31); + auto results64 = nvtext::minhash64_permuted( + view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); + + using LCW64 = cudf::test::lists_column_wrapper; + // clang-format off + LCW64 expected64({ + LCW64{1818322427062143853, 641024893347719371, 1769570368846988848, 592272835132564366, + 1720818310631833835, 543520776917409353, 1672066252416678822, 494768718702254348, + 1623314194201523817, 446016660487099335, 1574562135986368804, 397264602271944322, + 1525810077771213799, 348512544056789317, 1477058019556058786, 299760485841634304, + 1428305961340903773, 251008427626479291, 1379553903125748768, 202256369411324286, + 1330801844910593755, 153504311196169273, 1282049786695438742, 104752252981014268, + 1233297728480283737, 56000194765859255, 1184545670265128724, 7248136550704242, + 1135793612049973719, 2264339087549243188, 1087041553834818706}, + LCW64{1389920339306667795, 421787002125838902, 1759496674158703968, 791363336977875075, + 2129073009010740141, 1160939671829911248, 192806334649082363, 1530516006681947421, + 562382669501118536, 1900092341533983602, 931959004353154709, 2269668676386019775, + 1301535339205190882, 333402002024361997, 1671111674057227055, 702978336876398170, + 2040688008909263228, 1072554671728434343, 104421334547605450, 1442131006580470516, + 473997669399641631, 1811707341432506689, 843574004251677804, 2181283676284542862, + 1213150339103713977, 245017001922885084, 1582726673955750150, 614593336774921257, + 1952303008807786323, 984169671626957438, 16036334446128545} + }); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); +} + TEST_F(MinHashTest, EmptyTest) { auto input = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); From 2a9928a09c104d68e0ed99f8409eb9ecdc8b6402 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 28 Oct 2024 11:15:34 -0400 Subject: [PATCH 20/25] move pytests to use permuted api --- .../cudf/cudf/tests/text/test_text_methods.py | 46 ++++++------------- python/pylibcudf/pylibcudf/nvtext/minhash.pyx | 12 ++--- .../pylibcudf/tests/test_nvtext_minhash.py | 10 ++-- 3 files changed, 26 insertions(+), 42 deletions(-) diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index 997ca357986..6ec9348dc8d 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -885,65 +885,45 @@ def test_is_vowel_consonant(): def test_minhash(): strings = cudf.Series(["this is my", "favorite book", None, ""]) + params = cudf.Series([1, 2, 3], dtype=np.uint32) expected = cudf.Series( [ - cudf.Series([21141582], dtype=np.uint32), - cudf.Series([962346254], dtype=np.uint32), - None, - cudf.Series([0], dtype=np.uint32), - ] - ) - actual = strings.str.minhash() - assert_eq(expected, actual) - seeds = cudf.Series([0, 1, 2], dtype=np.uint32) - expected = cudf.Series( - [ - cudf.Series([1305480167, 668155704, 34311509], dtype=np.uint32), - cudf.Series([32665384, 3470118, 363147162], dtype=np.uint32), + cudf.Series([1305480168, 462824406, 74608229], dtype=np.uint32), + cudf.Series([32665385, 65330770, 97996155], dtype=np.uint32), None, cudf.Series([0, 0, 0], dtype=np.uint32), ] ) - actual = strings.str.minhash(seeds=seeds, width=5) + actual = strings.str.minhash_permuted(0, a=params, b=params, width=5) assert_eq(expected, actual) - expected = cudf.Series( - [ - cudf.Series([3232308021562742685], dtype=np.uint64), - cudf.Series([23008204270530356], dtype=np.uint64), - None, - cudf.Series([0], dtype=np.uint64), - ] - ) - actual = strings.str.minhash64() - assert_eq(expected, actual) - seeds = cudf.Series([0, 1, 2], dtype=np.uint64) + params = cudf.Series([1, 2, 3], dtype=np.uint64) expected = cudf.Series( [ cudf.Series( - [7082801294247314046, 185949556058924788, 167570629329462454], + [105531920695060180, 172452388517576009, 316595762085180524], dtype=np.uint64, ), cudf.Series( - [382665377781028452, 86243762733551437, 7688750597953083512], + [35713768479063122, 71427536958126236, 58787297728258212], dtype=np.uint64, ), None, cudf.Series([0, 0, 0], dtype=np.uint64), ] ) - actual = strings.str.minhash64(seeds=seeds, width=5) + actual = strings.str.minhash64_permuted(0, a=params, b=params, width=5) assert_eq(expected, actual) # test wrong seed types with pytest.raises(ValueError): - strings.str.minhash(seeds="a") + strings.str.minhash_permuted(1, a="a", b="b", width=7) with pytest.raises(ValueError): - seeds = cudf.Series([0, 1, 2], dtype=np.int32) - strings.str.minhash(seeds=seeds) + params = cudf.Series([0, 1, 2], dtype=np.int32) + strings.str.minhash_permuted(1, a=params, b=params, width=6) with pytest.raises(ValueError): - seeds = cudf.Series([0, 1, 2], dtype=np.uint32) - strings.str.minhash64(seeds=seeds) + params = cudf.Series([0, 1, 2], dtype=np.uint32) + strings.str.minhash64_permuted(1, a=params, b=params, width=8) def test_word_minhash(): diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx index 8c2c6cc7f4d..574d586b9da 100644 --- a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx @@ -68,7 +68,7 @@ cpdef Column minhash_permuted( Returns the minhash values for each string. This function uses MurmurHash3_x86_32 for the hash algorithm. - For details, see :cpp:func:`minhash`. + For details, see :cpp:func:`minhash_permuted`. Parameters ---------- @@ -77,9 +77,9 @@ cpdef Column minhash_permuted( seed : uint32_t Seed used for the hash function a : Column - Seed value(s) used for the hash algorithm. + 1st parameter value used for the minhash algorithm. b : Column - 2nd seed value(s) used for the hash algorithm. + 2nd parameter value used for the minhash algorithm. width : size_type Character width used for apply substrings; @@ -151,7 +151,7 @@ cpdef Column minhash64_permuted( Returns the minhash values for each string. This function uses MurmurHash3_x64_128 for the hash algorithm. - For details, see :cpp:func:`minhash`. + For details, see :cpp:func:`minhash64_permuted`. Parameters ---------- @@ -160,9 +160,9 @@ cpdef Column minhash64_permuted( seed : uint64_t Seed used for the hash function a : Column - Seed value(s) used for the hash algorithm. + 1st parameter value used for the minhash algorithm. b : Column - 2nd seed value(s) used for the hash algorithm. + 2nd parameter value used for the minhash algorithm. width : size_type Character width used for apply substrings; diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py index ead9ee094af..918d73c2c30 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py @@ -24,12 +24,16 @@ def word_minhash_input_data(request): def test_minhash(minhash_input_data, width): input_arr, seeds, seed_type = minhash_input_data minhash_func = ( - plc.nvtext.minhash.minhash + plc.nvtext.minhash.minhash_permuted if seed_type == pa.uint32() - else plc.nvtext.minhash.minhash64 + else plc.nvtext.minhash.minhash64_permuted ) result = minhash_func( - plc.interop.from_arrow(input_arr), plc.interop.from_arrow(seeds), width + plc.interop.from_arrow(input_arr), + 0, + plc.interop.from_arrow(seeds), + plc.interop.from_arrow(seeds), + width, ) pa_result = plc.interop.to_arrow(result) assert all(len(got) == len(seeds) for got, s in zip(pa_result, input_arr)) From a39b7388428f0d09fc805310efbd7d82ead18479 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 29 Oct 2024 11:54:19 -0400 Subject: [PATCH 21/25] add docstring for new APIs --- python/cudf/cudf/_lib/nvtext/minhash.pyx | 30 ++++---- python/cudf/cudf/core/column/string.py | 71 ++++++++++++++++++- python/pylibcudf/pylibcudf/nvtext/minhash.pyx | 28 ++++---- .../pylibcudf/tests/test_nvtext_minhash.py | 2 +- 4 files changed, 98 insertions(+), 33 deletions(-) diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx index a95f8edb3c8..25cfcf99ca6 100644 --- a/python/cudf/cudf/_lib/nvtext/minhash.pyx +++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx @@ -21,14 +21,15 @@ def minhash(Column input, Column seeds, int width=4): @acquire_spill_lock() def minhash_permuted(Column input, uint32_t seed, Column a, Column b, int width): - result = nvtext.minhash.minhash_permuted( - input.to_pylibcudf(mode="read"), - seed, - a.to_pylibcudf(mode="read"), - b.to_pylibcudf(mode="read"), - width, + return Column.from_pylibcudf( + nvtext.minhash.minhash_permuted( + input.to_pylibcudf(mode="read"), + seed, + a.to_pylibcudf(mode="read"), + b.to_pylibcudf(mode="read"), + width, + ) ) - return Column.from_pylibcudf(result) @acquire_spill_lock() @@ -43,14 +44,15 @@ def minhash64(Column input, Column seeds, int width=4): @acquire_spill_lock() def minhash64_permuted(Column input, uint64_t seed, Column a, Column b, int width): - result = nvtext.minhash.minhash64_permuted( - input.to_pylibcudf(mode="read"), - seed, - a.to_pylibcudf(mode="read"), - b.to_pylibcudf(mode="read"), - width, + return Column.from_pylibcudf( + nvtext.minhash.minhash64_permuted( + input.to_pylibcudf(mode="read"), + seed, + a.to_pylibcudf(mode="read"), + b.to_pylibcudf(mode="read"), + width, + ) ) - return Column.from_pylibcudf(result) @acquire_spill_lock() diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index f52a564c052..f8d20f9d21c 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5351,8 +5351,41 @@ def minhash( ) def minhash_permuted( - self, seed: np.uint32, a: ColumnLike, b: ColumnLike, width: int = 4 + self, seed: np.uint32, a: ColumnLike, b: ColumnLike, width: int ) -> SeriesOrIndex: + """ + Compute the minhash of a strings column. + This uses the MurmurHash3_x86_32 algorithm for the hash function. + + Calculation uses the formula (hv * a + b) % mersenne_prime + where hv is the hash of a substring of width characters, + a and b are provided values and mersenne_prime is 2^61-1. + + Parameters + ---------- + seed : uint32 + The seed used for the hash algorithm. + a : ColumnLike + Values for minhash calculation. + Must be of type uint32. + b : ColumnLike + Values for minhash calculation. + Must be of type uint32. + width : int + The width of the substring to hash. + + Examples + -------- + >>> import cudf + >>> import numpy as np + >>> s = cudf.Series(['this is my', 'favorite book']) + >>> a = cudf.Series([1, 2, 3], dtype=np.uint32) + >>> b = cudf.Series([4, 5, 6], dtype=np.uint32) + >>> s.str.minhash_permuted(0, a=a, b=b, width=5) + 0 [1305480171, 462824409, 74608232] + 1 [32665388, 65330773, 97996158] + dtype: list + """ a_column = column.as_column(a) if a_column.dtype != np.uint32: raise ValueError( @@ -5410,8 +5443,42 @@ def minhash64( ) def minhash64_permuted( - self, seed: np.uint64, a: ColumnLike, b: ColumnLike, width: int = 4 + self, seed: np.uint64, a: ColumnLike, b: ColumnLike, width: int ) -> SeriesOrIndex: + """ + Compute the minhash of a strings column. + This uses the MurmurHash3_x64_128 algorithm for the hash function. + + Calculation uses the formula (hv * a + b) % mersenne_prime + where hv is the hash of a substring of width characters, + a and b are provided values and mersenne_prime is 2^61-1. + + Parameters + ---------- + seed : uint64 + The seed used for the hash algorithm. + a : ColumnLike + Values for minhash calculation. + Must be of type uint64. + b : ColumnLike + Values for minhash calculation. + Must be of type uint64. + width : int + The width of the substring to hash. + + Examples + -------- + >>> import cudf + >>> import numpy as np + >>> s = cudf.Series(['this is my', 'favorite book', 'to read']) + >>> a = cudf.Series([2, 3], dtype=np.uint64) + >>> b = cudf.Series([5, 6], dtype=np.uint64) + >>> s.str.minhash64_permuted(0, a=a, b=b, width=5) + 0 [172452388517576012, 316595762085180527] + 1 [71427536958126239, 58787297728258215] + 2 [423885828176437114, 1140588505926961370] + dtype: list + """ a_column = column.as_column(a) if a_column.dtype != np.uint64: raise ValueError( diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx index 574d586b9da..ef5b53ddea0 100644 --- a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx @@ -91,14 +91,12 @@ cpdef Column minhash_permuted( cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_minhash_permuted( - input.view(), - seed, - a.view(), - b.view(), - width - ) + c_result = cpp_minhash_permuted( + input.view(), + seed, + a.view(), + b.view(), + width ) return Column.from_libcudf(move(c_result)) @@ -174,14 +172,12 @@ cpdef Column minhash64_permuted( cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_minhash64_permuted( - input.view(), - seed, - a.view(), - b.view(), - width - ) + c_result = cpp_minhash64_permuted( + input.view(), + seed, + a.view(), + b.view(), + width ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py index 918d73c2c30..ec533e64307 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py @@ -21,7 +21,7 @@ def word_minhash_input_data(request): @pytest.mark.parametrize("width", [5, 12]) -def test_minhash(minhash_input_data, width): +def test_minhash_permuted(minhash_input_data, width): input_arr, seeds, seed_type = minhash_input_data minhash_func = ( plc.nvtext.minhash.minhash_permuted From 1989c62a03d4809bc79ee3df7cae6e71f9e3b34a Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 6 Nov 2024 10:15:30 -0500 Subject: [PATCH 22/25] change test_minhash to test_minhash_permuted --- python/cudf/cudf/tests/text/test_text_methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index 6ec9348dc8d..47e541fdcef 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -882,7 +882,7 @@ def test_is_vowel_consonant(): assert_eq(expected, actual) -def test_minhash(): +def test_minhash_permuted(): strings = cudf.Series(["this is my", "favorite book", None, ""]) params = cudf.Series([1, 2, 3], dtype=np.uint32) From 2804015acdcded842dbd9035111f0922d37f7728 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 6 Nov 2024 15:26:16 -0500 Subject: [PATCH 23/25] add deprecation warnings --- python/pylibcudf/pylibcudf/nvtext/minhash.pyx | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx index ef5b53ddea0..0751db5e423 100644 --- a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx @@ -18,6 +18,7 @@ from pylibcudf.libcudf.types cimport size_type from pylibcudf.scalar cimport Scalar from cython.operator import dereference +import warnings cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4): @@ -42,6 +43,12 @@ cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4): Column List column of minhash values for each string per seed """ + warnings.warn( + "Starting in version 25.02, the signature of this function will " + "be changed to match pylibcudf.nvtext.minhash_permuted.", + DeprecationWarning + ) + cdef unique_ptr[column] c_result if not isinstance(seeds, (Column, Scalar)): @@ -123,6 +130,12 @@ cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4): Column List column of minhash values for each string per seed """ + warnings.warn( + "Starting in version 25.02, the signature of this function will " + "be changed to match pylibcudf.nvtext.minhash64_permuted.", + DeprecationWarning + ) + cdef unique_ptr[column] c_result if not isinstance(seeds, (Column, Scalar)): From 1ed78a4f1f62fafa22dd5a3e806913bde8b7cc95 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 7 Nov 2024 09:16:07 -0500 Subject: [PATCH 24/25] change DeprecationWarning to FutureWarning --- python/cudf/cudf/core/column/string.py | 2 ++ python/pylibcudf/pylibcudf/nvtext/minhash.pyx | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index f8d20f9d21c..3d70b01b7e4 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5355,6 +5355,7 @@ def minhash_permuted( ) -> SeriesOrIndex: """ Compute the minhash of a strings column. + This uses the MurmurHash3_x86_32 algorithm for the hash function. Calculation uses the formula (hv * a + b) % mersenne_prime @@ -5407,6 +5408,7 @@ def minhash64( ) -> SeriesOrIndex: """ Compute the minhash of a strings column. + This uses the MurmurHash3_x64_128 algorithm for the hash function. This function generates 2 uint64 values but only the first uint64 value is used. diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx index 0751db5e423..5a51e32b287 100644 --- a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx @@ -46,7 +46,7 @@ cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4): warnings.warn( "Starting in version 25.02, the signature of this function will " "be changed to match pylibcudf.nvtext.minhash_permuted.", - DeprecationWarning + FutureWarning ) cdef unique_ptr[column] c_result @@ -133,7 +133,7 @@ cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4): warnings.warn( "Starting in version 25.02, the signature of this function will " "be changed to match pylibcudf.nvtext.minhash64_permuted.", - DeprecationWarning + FutureWarning ) cdef unique_ptr[column] c_result From 056eb79bb7d5dc893394d6730c2e2e2eb974e728 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 8 Nov 2024 08:43:03 -0500 Subject: [PATCH 25/25] fix shared-memory variable type --- cpp/include/nvtext/minhash.hpp | 4 ++-- cpp/src/text/minhash.cu | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index 4ff863842a2..b2c1a23f57e 100644 --- a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -124,7 +124,7 @@ namespace CUDF_EXPORT nvtext { * @throw std::overflow_error if `parameter_a.size() * input.size()` exceeds the column size limit * * @param input Strings column to compute minhash - * @param seed Seed value used for the hash algorithm + * @param seed Seed value used for the hash algorithm * @param parameter_a Values used for the permuted calculation * @param parameter_b Values used for the permuted calculation * @param width The character width of substrings to hash for each row @@ -236,7 +236,7 @@ std::unique_ptr minhash_permuted( * @throw std::overflow_error if `parameter_a.size() * input.size()` exceeds the column size limit * * @param input Strings column to compute minhash - * @param seed Seed value used for the hash algorithm + * @param seed Seed value used for the hash algorithm * @param parameter_a Values used for the permuted calculation * @param parameter_b Values used for the permuted calculation * @param width The character width of substrings to hash for each row diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index 362672e74bb..aee83ab35ed 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -353,8 +353,7 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string constexpr hash_value_type hash_max = std::numeric_limits::max(); // found to be an efficient shared memory size for both hash types - __shared__ char shmem[block_size * params_per_thread * sizeof(hash_value_type)]; - auto const block_values = reinterpret_cast(shmem); + __shared__ hash_value_type block_values[block_size * params_per_thread]; for (std::size_t i = 0; i < parameter_a.size(); i += params_per_thread) { // initialize this block's chunk of shared memory