From 11a8dd061e671585f49c2c6eeaabe14a3ae146e0 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 2 Aug 2023 17:01:39 -0400 Subject: [PATCH 01/13] Fix Byte-Pair-Encoding usage of cuco static-map for storing merge-pairs --- cpp/include/nvtext/bpe_tokenize.hpp | 8 +--- cpp/src/text/subword/bpe_tokenizer.cu | 38 ++++++----------- cpp/src/text/subword/bpe_tokenizer.cuh | 52 +++++++++++++++++++++--- cpp/src/text/subword/load_merges_file.cu | 24 ++++------- 4 files changed, 66 insertions(+), 56 deletions(-) diff --git a/cpp/include/nvtext/bpe_tokenize.hpp b/cpp/include/nvtext/bpe_tokenize.hpp index b93d93b07c6..4438f262fae 100644 --- a/cpp/include/nvtext/bpe_tokenize.hpp +++ b/cpp/include/nvtext/bpe_tokenize.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -68,12 +68,6 @@ struct bpe_merge_pairs { * @return The number of merge pairs in the table */ cudf::size_type get_size(); - /** - * @brief Returns the number of unique merge pairs in the table. - * - * @return The number of unique merge pairs in the table - */ - std::size_t get_map_size(); }; /** diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu index 413fb2497c0..d1c443a63a6 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cu +++ b/cpp/src/text/subword/bpe_tokenizer.cu @@ -80,10 +80,11 @@ __device__ cudf::string_view get_first_token(cudf::string_view const& d_str) * * @see The byte_pair_encoding_fn::operator() function below for details. */ +template struct byte_pair_encoding_fn { cudf::column_device_view const d_merges; cudf::column_device_view const d_strings; - merge_pairs_map_type::device_view const d_map; + MapRefType const d_map; cudf::size_type* d_sizes; // output size of encoded string string_hasher_type const hasher; cudf::size_type* d_byte_indices; @@ -135,18 +136,7 @@ struct byte_pair_encoding_fn { return cudf::string_view(d_str.data() + *begin, size); } - /** - * @brief Compute the hash over the input strings. - * - * The input strings are combined with a space to produce hash for matching - * a merge pair within the `d_map`. - * - * @param lhs First string. - * @param rhs Second string. - * @return The hash value to match with `d_map`. - */ - __device__ cudf::hash_value_type compute_hash(cudf::string_view const& lhs, - cudf::string_view const& rhs) + __device__ auto get_merge_pair(cudf::string_view const& lhs, cudf::string_view const& rhs) { __shared__ char shmem[48 * 1024]; // max for Pascal auto const total_size = lhs.size_bytes() + rhs.size_bytes() + 1; @@ -155,7 +145,7 @@ struct byte_pair_encoding_fn { // Edge case check. // Empirically found only two merge pair strings that were greater than 70 bytes // and they both looked like ignorable errors. Double check this analysis with Vibhu. - if (thread_memory_size < total_size) { return 0; } + if (thread_memory_size < total_size) { return d_map.end(); } // build the target string in shared memory char* ptr = &shmem[threadIdx.x * thread_memory_size]; @@ -165,8 +155,8 @@ struct byte_pair_encoding_fn { memcpy(ptr + lhs.size_bytes(), " ", 1); memcpy(ptr + lhs.size_bytes() + 1, rhs.data(), rhs.size_bytes()); - auto const d_hash_str = cudf::string_view(ptr, total_size); - return hasher(d_hash_str); // return the hash for the temp string + auto const d_str = cudf::string_view(ptr, total_size); + return d_map.find(d_str); } /** @@ -233,11 +223,10 @@ struct byte_pair_encoding_fn { auto const rhs = next_substr(itr, end, d_str); if (rhs.empty()) break; // no more adjacent pairs - auto const hash = compute_hash(lhs, rhs); - auto const map_itr = d_map.find(hash, thrust::identity{}); - if (map_itr != d_map.end()) { + auto const map_itr = get_merge_pair(lhs, rhs); + if (!(map_itr == d_map.end())) { // found a match; record the rank (and other min_ vars) - auto const rank = static_cast(map_itr->second); + auto const rank = static_cast((*map_itr).second); if (rank < min_rank) { min_rank = rank; min_itr = itr; @@ -369,12 +358,9 @@ std::unique_ptr byte_pair_encoding( rmm::mr::get_current_device_resource()); auto d_offsets = offsets->mutable_view().data(); - byte_pair_encoding_fn fn{*d_merges, - *d_strings, - merge_pairs.get_merge_pairs_map(), - d_offsets, - string_hasher_type{}, - d_byte_indices.data()}; + auto map_ref = merge_pairs.get_merge_pairs_map(); + byte_pair_encoding_fn fn{ + *d_merges, *d_strings, map_ref, d_offsets, string_hasher_type{}, d_byte_indices.data()}; thrust::for_each_n( rmm::exec_policy(stream), thrust::make_counting_iterator(0), input.size(), fn); diff --git a/cpp/src/text/subword/bpe_tokenizer.cuh b/cpp/src/text/subword/bpe_tokenizer.cuh index 0697a9961c7..836166f5b40 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cuh +++ b/cpp/src/text/subword/bpe_tokenizer.cuh @@ -21,12 +21,15 @@ #include #include +#include #include +#include #include #include #include +#include #include #include @@ -34,14 +37,51 @@ namespace nvtext { namespace detail { +using hash_value_type = uint32_t; +using string_hasher_type = cudf::hashing::detail::MurmurHash3_x86_32; + +struct bpe_hasher { + cudf::column_device_view const d_strings; + string_hasher_type hasher{}; + // used by insert + __device__ hash_value_type operator()(cudf::size_type index) const + { + return hasher(d_strings.element(index)); + } + // used by find + __device__ hash_value_type operator()(cudf::string_view const& s) const { return hasher(s); } +}; + +struct bpe_equal { + cudf::column_device_view const d_strings; + // used by insert + __device__ bool operator()(cudf::size_type lhs, cudf::size_type rhs) const noexcept + { + return d_strings.element(lhs) == d_strings.element(rhs); + } + // used by find + __device__ bool operator()(cudf::size_type lhs, cudf::string_view const& rhs) const noexcept + { + return d_strings.element(lhs) == rhs; + } +}; + using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor>; -using merge_pairs_map_type = cuco::static_map; +using probe_scheme = cuco::experimental::double_hashing<1, bpe_hasher>; -using string_hasher_type = cudf::hashing::detail::MurmurHash3_x86_32; +using merge_pairs_map_type = cuco::experimental::static_map, + cuda::thread_scope_device, + bpe_equal, + probe_scheme, + hash_table_allocator_type>; + +// using merge_pairs_map_type = cuco::static_map; } // namespace detail @@ -53,7 +93,7 @@ struct bpe_merge_pairs::bpe_merge_pairs_impl { std::unique_ptr&& merge_pairs_map); auto get_merge_pairs() const { return merge_pairs->view(); } - auto get_merge_pairs_map() const { return merge_pairs_map->get_device_view(); } + auto get_merge_pairs_map() const { return merge_pairs_map->ref(cuco::experimental::op::find); } }; } // namespace nvtext diff --git a/cpp/src/text/subword/load_merges_file.cu b/cpp/src/text/subword/load_merges_file.cu index dffe035ad35..d83263118a1 100644 --- a/cpp/src/text/subword/load_merges_file.cu +++ b/cpp/src/text/subword/load_merges_file.cu @@ -36,20 +36,13 @@ namespace nvtext { namespace detail { - namespace { struct make_pair_function { - /** - * @brief Hash the merge pair entry - */ - __device__ cuco::pair operator()(cudf::size_type idx) + __device__ cuco::pair operator()(cudf::size_type idx) { - auto const result = _hasher(d_strings.element(idx)); - return cuco::make_pair(result, idx); + return cuco::make_pair(idx, idx); } - - string_hasher_type const _hasher; cudf::column_device_view const d_strings; }; @@ -103,24 +96,22 @@ std::unique_ptr load_file_to_column(std::string const& filename_me std::unique_ptr initialize_merge_pairs_map( cudf::strings_column_view const& input, rmm::cuda_stream_view stream) { + auto d_strings = cudf::column_device_view::create(input.parent(), stream); // Ensure capacity is at least (size/0.7) as documented here: // https://github.com/NVIDIA/cuCollections/blob/6ec8b6dcdeceea07ab4456d32461a05c18864411/include/cuco/static_map.cuh#L179-L182 auto merge_pairs_map = std::make_unique( static_cast(input.size() * 2), // capacity is 2x; cuco::empty_key{std::numeric_limits::max()}, cuco::empty_value{-1}, // empty value is not used + bpe_equal{*d_strings}, + probe_scheme{bpe_hasher{*d_strings}}, hash_table_allocator_type{default_allocator{}, stream}, stream.value()); - auto d_strings = cudf::column_device_view::create(input.parent(), stream); - make_pair_function pair_func{string_hasher_type{}, *d_strings}; + make_pair_function pair_func{*d_strings}; auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func); - merge_pairs_map->insert(iter, - iter + input.size(), - thrust::identity{}, - thrust::equal_to{}, - stream.value()); + merge_pairs_map->insert(iter, iter + input.size(), stream.value()); return merge_pairs_map; } @@ -185,6 +176,5 @@ bpe_merge_pairs::bpe_merge_pairs(cudf::strings_column_view const& input, bpe_merge_pairs::~bpe_merge_pairs() = default; cudf::size_type bpe_merge_pairs::get_size() { return impl->merge_pairs->size(); } -std::size_t bpe_merge_pairs::get_map_size() { return impl->merge_pairs_map->get_size(); } } // namespace nvtext From b3e01f06abe13c208973f8d7c714525299e25231 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 2 Aug 2023 15:37:55 -0700 Subject: [PATCH 02/13] Use linear probing + use proper key sentinel --- cpp/cmake/thirdparty/get_cucollections.cmake | 16 +++++++++------- cpp/src/text/subword/bpe_tokenizer.cuh | 2 +- cpp/src/text/subword/load_merges_file.cu | 2 +- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/cpp/cmake/thirdparty/get_cucollections.cmake b/cpp/cmake/thirdparty/get_cucollections.cmake index 9758958b44f..b1aaad4d573 100644 --- a/cpp/cmake/thirdparty/get_cucollections.cmake +++ b/cpp/cmake/thirdparty/get_cucollections.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -14,12 +14,14 @@ # This function finds cuCollections and performs any additional configuration. function(find_and_configure_cucollections) - include(${rapids-cmake-dir}/cpm/cuco.cmake) - if(BUILD_SHARED_LIBS) - rapids_cpm_cuco(BUILD_EXPORT_SET cudf-exports) - else() - rapids_cpm_cuco(BUILD_EXPORT_SET cudf-exports INSTALL_EXPORT_SET cudf-exports) - endif() + rapids_cpm_find( + cuco 0.0.1 + GLOBAL_TARGETS cuco::cuco + CPM_ARGS + GIT_REPOSITORY https://github.com/PointKernel/cuCollections.git + GIT_TAG fix-ref-issues + OPTIONS "BUILD_TESTS OFF"  "BUILD_BENCHMARKS OFF"  "BUILD_EXAMPLES OFF" + ) endfunction() find_and_configure_cucollections() diff --git a/cpp/src/text/subword/bpe_tokenizer.cuh b/cpp/src/text/subword/bpe_tokenizer.cuh index 836166f5b40..0926839c4ac 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cuh +++ b/cpp/src/text/subword/bpe_tokenizer.cuh @@ -68,7 +68,7 @@ struct bpe_equal { using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor>; -using probe_scheme = cuco::experimental::double_hashing<1, bpe_hasher>; +using probe_scheme = cuco::experimental::linear_probing<1, bpe_hasher>; using merge_pairs_map_type = cuco::experimental::static_map initialize_merge_pairs_map( // https://github.com/NVIDIA/cuCollections/blob/6ec8b6dcdeceea07ab4456d32461a05c18864411/include/cuco/static_map.cuh#L179-L182 auto merge_pairs_map = std::make_unique( static_cast(input.size() * 2), // capacity is 2x; - cuco::empty_key{std::numeric_limits::max()}, + cuco::empty_key{-1}, cuco::empty_value{-1}, // empty value is not used bpe_equal{*d_strings}, probe_scheme{bpe_hasher{*d_strings}}, From ac73b10d79baf8c583178b64f8c30d70cc5f4523 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 2 Aug 2023 15:57:23 -0700 Subject: [PATCH 03/13] Use != operator for map iterators --- cpp/src/text/subword/bpe_tokenizer.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu index d1c443a63a6..fc0784d757f 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cu +++ b/cpp/src/text/subword/bpe_tokenizer.cu @@ -224,7 +224,7 @@ struct byte_pair_encoding_fn { if (rhs.empty()) break; // no more adjacent pairs auto const map_itr = get_merge_pair(lhs, rhs); - if (!(map_itr == d_map.end())) { + if (map_itr != d_map.end()) { // found a match; record the rank (and other min_ vars) auto const rank = static_cast((*map_itr).second); if (rank < min_rank) { From 326c2fac390d9515fcc39d45f6e13ce2d3e8feba Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 2 Aug 2023 16:13:16 -0700 Subject: [PATCH 04/13] Use access operator --- cpp/src/text/subword/bpe_tokenizer.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu index fc0784d757f..f3e9dff43eb 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cu +++ b/cpp/src/text/subword/bpe_tokenizer.cu @@ -226,7 +226,7 @@ struct byte_pair_encoding_fn { auto const map_itr = get_merge_pair(lhs, rhs); if (map_itr != d_map.end()) { // found a match; record the rank (and other min_ vars) - auto const rank = static_cast((*map_itr).second); + auto const rank = map_itr->second; if (rank < min_rank) { min_rank = rank; min_itr = itr; From fceefc443d18d10317b8e473d103eef0373e1454 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 3 Aug 2023 10:15:18 -0400 Subject: [PATCH 05/13] use linear-probe scheme --- cpp/src/text/subword/bpe_tokenizer.cuh | 2 +- cpp/src/text/subword/load_merges_file.cu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/text/subword/bpe_tokenizer.cuh b/cpp/src/text/subword/bpe_tokenizer.cuh index 836166f5b40..0926839c4ac 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cuh +++ b/cpp/src/text/subword/bpe_tokenizer.cuh @@ -68,7 +68,7 @@ struct bpe_equal { using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor>; -using probe_scheme = cuco::experimental::double_hashing<1, bpe_hasher>; +using probe_scheme = cuco::experimental::linear_probing<1, bpe_hasher>; using merge_pairs_map_type = cuco::experimental::static_map initialize_merge_pairs_map( // https://github.com/NVIDIA/cuCollections/blob/6ec8b6dcdeceea07ab4456d32461a05c18864411/include/cuco/static_map.cuh#L179-L182 auto merge_pairs_map = std::make_unique( static_cast(input.size() * 2), // capacity is 2x; - cuco::empty_key{std::numeric_limits::max()}, + cuco::empty_key{-1}, cuco::empty_value{-1}, // empty value is not used bpe_equal{*d_strings}, probe_scheme{bpe_hasher{*d_strings}}, From 91ab4fb33cd079fa1309426c3e51f6f01d3793d7 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Sun, 6 Aug 2023 11:21:36 -0400 Subject: [PATCH 06/13] keep merge-pairs column-device-view alive --- cpp/src/text/subword/bpe_tokenizer.cuh | 9 +++++++-- cpp/src/text/subword/load_merges_file.cu | 22 +++++++++++++--------- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/cpp/src/text/subword/bpe_tokenizer.cuh b/cpp/src/text/subword/bpe_tokenizer.cuh index 0926839c4ac..7a8395e3bfb 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cuh +++ b/cpp/src/text/subword/bpe_tokenizer.cuh @@ -87,10 +87,15 @@ using merge_pairs_map_type = cuco::experimental::static_map const merge_pairs; + std::unique_ptr> const + d_merge_pairs; std::unique_ptr merge_pairs_map; - bpe_merge_pairs_impl(std::unique_ptr&& merge_pairs, - std::unique_ptr&& merge_pairs_map); + bpe_merge_pairs_impl( + std::unique_ptr&& merge_pairs, + std::unique_ptr>&& + d_merge_pairs, + std::unique_ptr&& merge_pairs_map); auto get_merge_pairs() const { return merge_pairs->view(); } auto get_merge_pairs_map() const { return merge_pairs_map->ref(cuco::experimental::op::find); } diff --git a/cpp/src/text/subword/load_merges_file.cu b/cpp/src/text/subword/load_merges_file.cu index ab640531c8d..d881b386a57 100644 --- a/cpp/src/text/subword/load_merges_file.cu +++ b/cpp/src/text/subword/load_merges_file.cu @@ -94,21 +94,20 @@ std::unique_ptr load_file_to_column(std::string const& filename_me } std::unique_ptr initialize_merge_pairs_map( - cudf::strings_column_view const& input, rmm::cuda_stream_view stream) + cudf::column_device_view const& input, rmm::cuda_stream_view stream) { - auto d_strings = cudf::column_device_view::create(input.parent(), stream); // Ensure capacity is at least (size/0.7) as documented here: // https://github.com/NVIDIA/cuCollections/blob/6ec8b6dcdeceea07ab4456d32461a05c18864411/include/cuco/static_map.cuh#L179-L182 auto merge_pairs_map = std::make_unique( static_cast(input.size() * 2), // capacity is 2x; cuco::empty_key{-1}, cuco::empty_value{-1}, // empty value is not used - bpe_equal{*d_strings}, - probe_scheme{bpe_hasher{*d_strings}}, + bpe_equal{input}, + probe_scheme{bpe_hasher{input}}, hash_table_allocator_type{default_allocator{}, stream}, stream.value()); - make_pair_function pair_func{*d_strings}; + make_pair_function pair_func{input}; auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func); merge_pairs_map->insert(iter, iter + input.size(), stream.value()); @@ -119,9 +118,10 @@ std::unique_ptr initialize_merge_pairs_map( std::unique_ptr create_bpe_merge_pairs_impl( std::unique_ptr&& input, rmm::cuda_stream_view stream) { - auto merge_pairs = initialize_merge_pairs_map(cudf::strings_column_view(input->view()), stream); - return std::make_unique(std::move(input), - std::move(merge_pairs)); + auto d_strings = cudf::column_device_view::create(input->view(), stream); + auto merge_pairs = initialize_merge_pairs_map(*d_strings, stream); + return std::make_unique( + std::move(input), std::move(d_strings), std::move(merge_pairs)); } std::unique_ptr create_bpe_merge_pairs_impl( @@ -154,8 +154,12 @@ std::unique_ptr load_merge_pairs_file(std::string const& filena bpe_merge_pairs::bpe_merge_pairs_impl::bpe_merge_pairs_impl( std::unique_ptr&& merge_pairs, + std::unique_ptr>&& + d_merge_pairs, std::unique_ptr&& merge_pairs_map) - : merge_pairs(std::move(merge_pairs)), merge_pairs_map(std::move(merge_pairs_map)) + : merge_pairs(std::move(merge_pairs)), + d_merge_pairs(std::move(d_merge_pairs)), + merge_pairs_map(std::move(merge_pairs_map)) { } From 6a4fcf607ff54d8382a8154a7b7a224c46d35ae2 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Sun, 6 Aug 2023 16:37:24 -0400 Subject: [PATCH 07/13] cleanup col-device-view declaration in impl class --- cpp/src/text/subword/bpe_tokenizer.cuh | 24 ++++++++++++------------ cpp/src/text/subword/load_merges_file.cu | 23 +++++++++++------------ 2 files changed, 23 insertions(+), 24 deletions(-) diff --git a/cpp/src/text/subword/bpe_tokenizer.cuh b/cpp/src/text/subword/bpe_tokenizer.cuh index 7a8395e3bfb..583ff876072 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cuh +++ b/cpp/src/text/subword/bpe_tokenizer.cuh @@ -33,6 +33,7 @@ #include #include +#include namespace nvtext { namespace detail { @@ -78,24 +79,23 @@ using merge_pairs_map_type = cuco::experimental::static_map; -// using merge_pairs_map_type = cuco::static_map; - } // namespace detail +// since column_device_view::create returns is a little more than +// std::unique_ptr this helper simplifies the return type in a more maintainable +// way +using col_device_view = std::invoke_result_t; + struct bpe_merge_pairs::bpe_merge_pairs_impl { std::unique_ptr const merge_pairs; - std::unique_ptr> const - d_merge_pairs; + col_device_view const d_merge_pairs; std::unique_ptr merge_pairs_map; - bpe_merge_pairs_impl( - std::unique_ptr&& merge_pairs, - std::unique_ptr>&& - d_merge_pairs, - std::unique_ptr&& merge_pairs_map); + bpe_merge_pairs_impl(std::unique_ptr&& merge_pairs, + col_device_view&& d_merge_pairs, + std::unique_ptr&& merge_pairs_map); auto get_merge_pairs() const { return merge_pairs->view(); } auto get_merge_pairs_map() const { return merge_pairs_map->ref(cuco::experimental::op::find); } diff --git a/cpp/src/text/subword/load_merges_file.cu b/cpp/src/text/subword/load_merges_file.cu index d881b386a57..28454da8d3b 100644 --- a/cpp/src/text/subword/load_merges_file.cu +++ b/cpp/src/text/subword/load_merges_file.cu @@ -38,13 +38,12 @@ namespace nvtext { namespace detail { namespace { -struct make_pair_function { - __device__ cuco::pair operator()(cudf::size_type idx) - { - return cuco::make_pair(idx, idx); - } - cudf::column_device_view const d_strings; -}; +// struct make_pair_function { +// __device__ cuco::pair operator()(cudf::size_type idx) +// { +// return cuco::make_pair(idx, idx); +// } +// }; /** * @brief Loads a text file of merge-pairs into a strings column. @@ -107,8 +106,8 @@ std::unique_ptr initialize_merge_pairs_map( hash_table_allocator_type{default_allocator{}, stream}, stream.value()); - make_pair_function pair_func{input}; - auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func); + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [] __device__(cudf::size_type idx) { return cuco::make_pair(idx, idx); }); merge_pairs_map->insert(iter, iter + input.size(), stream.value()); @@ -118,10 +117,10 @@ std::unique_ptr initialize_merge_pairs_map( std::unique_ptr create_bpe_merge_pairs_impl( std::unique_ptr&& input, rmm::cuda_stream_view stream) { - auto d_strings = cudf::column_device_view::create(input->view(), stream); - auto merge_pairs = initialize_merge_pairs_map(*d_strings, stream); + auto d_input = cudf::column_device_view::create(input->view(), stream); + auto merge_pairs = initialize_merge_pairs_map(*d_input, stream); return std::make_unique( - std::move(input), std::move(d_strings), std::move(merge_pairs)); + std::move(input), std::move(d_input), std::move(merge_pairs)); } std::unique_ptr create_bpe_merge_pairs_impl( From 0008bd21674a1e337107ee6a1ced5c01d40aee86 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Sun, 6 Aug 2023 20:05:42 -0400 Subject: [PATCH 08/13] cleanup code; add comments --- cpp/include/nvtext/bpe_tokenize.hpp | 7 ------- cpp/src/text/subword/bpe_tokenizer.cuh | 12 ++++++++++++ cpp/src/text/subword/load_merges_file.cu | 11 +---------- 3 files changed, 13 insertions(+), 17 deletions(-) diff --git a/cpp/include/nvtext/bpe_tokenize.hpp b/cpp/include/nvtext/bpe_tokenize.hpp index 4438f262fae..c67f4bd8b1c 100644 --- a/cpp/include/nvtext/bpe_tokenize.hpp +++ b/cpp/include/nvtext/bpe_tokenize.hpp @@ -61,13 +61,6 @@ struct bpe_merge_pairs { rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); ~bpe_merge_pairs(); - - /** - * @brief Returns the number of merge pairs in the table. - * - * @return The number of merge pairs in the table - */ - cudf::size_type get_size(); }; /** diff --git a/cpp/src/text/subword/bpe_tokenizer.cuh b/cpp/src/text/subword/bpe_tokenizer.cuh index 583ff876072..916ea20d262 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cuh +++ b/cpp/src/text/subword/bpe_tokenizer.cuh @@ -41,6 +41,12 @@ namespace detail { using hash_value_type = uint32_t; using string_hasher_type = cudf::hashing::detail::MurmurHash3_x86_32; +/** + * @brief Hasher function used for building and using the cuco static-map + * + * This takes advantage of hetergenouse lookup feature in cuco static-map which + * allows inserting with one type (index) and looking up with a different type (string). + */ struct bpe_hasher { cudf::column_device_view const d_strings; string_hasher_type hasher{}; @@ -53,6 +59,12 @@ struct bpe_hasher { __device__ hash_value_type operator()(cudf::string_view const& s) const { return hasher(s); } }; +/** + * @brief Equal function used for building and using the cuco static-map + * + * This takes advantage of hetergenouse lookup feature in cuco static-map which + * allows inserting with one type (index) and looking up with a different type (string). + */ struct bpe_equal { cudf::column_device_view const d_strings; // used by insert diff --git a/cpp/src/text/subword/load_merges_file.cu b/cpp/src/text/subword/load_merges_file.cu index 28454da8d3b..1f1b90b3f49 100644 --- a/cpp/src/text/subword/load_merges_file.cu +++ b/cpp/src/text/subword/load_merges_file.cu @@ -38,13 +38,6 @@ namespace nvtext { namespace detail { namespace { -// struct make_pair_function { -// __device__ cuco::pair operator()(cudf::size_type idx) -// { -// return cuco::make_pair(idx, idx); -// } -// }; - /** * @brief Loads a text file of merge-pairs into a strings column. * @@ -109,7 +102,7 @@ std::unique_ptr initialize_merge_pairs_map( auto iter = cudf::detail::make_counting_transform_iterator( 0, [] __device__(cudf::size_type idx) { return cuco::make_pair(idx, idx); }); - merge_pairs_map->insert(iter, iter + input.size(), stream.value()); + merge_pairs_map->insert_async(iter, iter + input.size(), stream.value()); return merge_pairs_map; } @@ -178,6 +171,4 @@ bpe_merge_pairs::bpe_merge_pairs(cudf::strings_column_view const& input, bpe_merge_pairs::~bpe_merge_pairs() = default; -cudf::size_type bpe_merge_pairs::get_size() { return impl->merge_pairs->size(); } - } // namespace nvtext From 40755df11fb7932228b18a76fafcbed3e4b7d94c Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 7 Aug 2023 11:58:46 -0700 Subject: [PATCH 09/13] Update cuco git tag --- cpp/cmake/thirdparty/get_cucollections.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/cmake/thirdparty/get_cucollections.cmake b/cpp/cmake/thirdparty/get_cucollections.cmake index b1aaad4d573..d81e5eb1970 100644 --- a/cpp/cmake/thirdparty/get_cucollections.cmake +++ b/cpp/cmake/thirdparty/get_cucollections.cmake @@ -18,8 +18,8 @@ function(find_and_configure_cucollections) cuco 0.0.1 GLOBAL_TARGETS cuco::cuco CPM_ARGS - GIT_REPOSITORY https://github.com/PointKernel/cuCollections.git - GIT_TAG fix-ref-issues + GIT_REPOSITORY https://github.com/NVIDIA/cuCollections.git + GIT_TAG 5186b39522e13a3681c0eb591db4eaacbf969485 OPTIONS "BUILD_TESTS OFF"  "BUILD_BENCHMARKS OFF"  "BUILD_EXAMPLES OFF" ) endfunction() From 440b8b5114778c8ed9a60c53611221cd13fe172d Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 11 Aug 2023 12:22:18 -0400 Subject: [PATCH 10/13] reuse already created col-dev-view --- cpp/src/text/subword/bpe_tokenizer.cu | 15 +++++++++++---- cpp/src/text/subword/bpe_tokenizer.cuh | 12 ++++++------ 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu index b65a3fef341..30862563c49 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cu +++ b/cpp/src/text/subword/bpe_tokenizer.cu @@ -136,6 +136,13 @@ struct byte_pair_encoding_fn { return cudf::string_view(d_str.data() + *begin, size); } + /** + * @brief Lookup the pair of strings in the d_map/d_merges + * + * @param lhs Left half of the string + * @param rhs Right half of the string + * @return Position of merge pair within d_map + */ __device__ auto get_merge_pair(cudf::string_view const& lhs, cudf::string_view const& rhs) { __shared__ char shmem[48 * 1024]; // max for Pascal @@ -343,12 +350,12 @@ std::unique_ptr byte_pair_encoding( bpe_merge_pairs::bpe_merge_pairs_impl const& merge_pairs, rmm::cuda_stream_view stream) { - CUDF_EXPECTS(!merge_pairs.get_merge_pairs().is_empty(), "Merge pairs table must not be empty"); + auto const d_merges = merge_pairs.get_merge_pairs(); + CUDF_EXPECTS(d_merges.size() > 0, "Merge pairs table must not be empty"); // build working vector to hold index values per byte rmm::device_uvector d_byte_indices(input.chars().size(), stream); - auto const d_merges = cudf::column_device_view::create(merge_pairs.get_merge_pairs(), stream); auto const d_strings = cudf::column_device_view::create(input.parent(), stream); auto offsets = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, @@ -358,9 +365,9 @@ std::unique_ptr byte_pair_encoding( rmm::mr::get_current_device_resource()); auto d_offsets = offsets->mutable_view().data(); - auto map_ref = merge_pairs.get_merge_pairs_map(); + auto map_ref = merge_pairs.get_merge_pairs_ref(); byte_pair_encoding_fn fn{ - *d_merges, *d_strings, map_ref, d_offsets, string_hasher_type{}, d_byte_indices.data()}; + d_merges, *d_strings, map_ref, d_offsets, string_hasher_type{}, d_byte_indices.data()}; thrust::for_each_n( rmm::exec_policy(stream), thrust::make_counting_iterator(0), input.size(), fn); diff --git a/cpp/src/text/subword/bpe_tokenizer.cuh b/cpp/src/text/subword/bpe_tokenizer.cuh index 916ea20d262..6ee3b236b90 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cuh +++ b/cpp/src/text/subword/bpe_tokenizer.cuh @@ -44,8 +44,8 @@ using string_hasher_type = cudf::hashing::detail::MurmurHash3_x86_32&& merge_pairs_map); - auto get_merge_pairs() const { return merge_pairs->view(); } - auto get_merge_pairs_map() const { return merge_pairs_map->ref(cuco::experimental::op::find); } + auto const get_merge_pairs() const { return *d_merge_pairs; } + auto get_merge_pairs_ref() const { return merge_pairs_map->ref(cuco::experimental::op::find); } }; } // namespace nvtext From d6234c6aaeed429ec08d71dfeb880709bc54fa7d Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 11 Aug 2023 11:52:29 -0700 Subject: [PATCH 11/13] Revert temporary CMake changes --- cpp/cmake/thirdparty/get_cucollections.cmake | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/cpp/cmake/thirdparty/get_cucollections.cmake b/cpp/cmake/thirdparty/get_cucollections.cmake index d81e5eb1970..9758958b44f 100644 --- a/cpp/cmake/thirdparty/get_cucollections.cmake +++ b/cpp/cmake/thirdparty/get_cucollections.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -14,14 +14,12 @@ # This function finds cuCollections and performs any additional configuration. function(find_and_configure_cucollections) - rapids_cpm_find( - cuco 0.0.1 - GLOBAL_TARGETS cuco::cuco - CPM_ARGS - GIT_REPOSITORY https://github.com/NVIDIA/cuCollections.git - GIT_TAG 5186b39522e13a3681c0eb591db4eaacbf969485 - OPTIONS "BUILD_TESTS OFF"  "BUILD_BENCHMARKS OFF"  "BUILD_EXAMPLES OFF" - ) + include(${rapids-cmake-dir}/cpm/cuco.cmake) + if(BUILD_SHARED_LIBS) + rapids_cpm_cuco(BUILD_EXPORT_SET cudf-exports) + else() + rapids_cpm_cuco(BUILD_EXPORT_SET cudf-exports INSTALL_EXPORT_SET cudf-exports) + endif() endfunction() find_and_configure_cucollections() From caf5e68e5ad8eb268eb289ceca27e273cfa7ea05 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 11 Aug 2023 16:49:22 -0400 Subject: [PATCH 12/13] remove unneeded include --- cpp/src/text/subword/bpe_tokenizer.cuh | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/text/subword/bpe_tokenizer.cuh b/cpp/src/text/subword/bpe_tokenizer.cuh index 6ee3b236b90..83aa22aaae9 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cuh +++ b/cpp/src/text/subword/bpe_tokenizer.cuh @@ -29,7 +29,6 @@ #include #include -#include #include #include From 708d8fda380e5bb0510e0f1de45cf92e3ace55fb Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 15 Aug 2023 20:19:20 -0400 Subject: [PATCH 13/13] fix some comments --- cpp/src/text/subword/bpe_tokenizer.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu index 30862563c49..4c4f5b3a4b1 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cu +++ b/cpp/src/text/subword/bpe_tokenizer.cu @@ -137,7 +137,7 @@ struct byte_pair_encoding_fn { } /** - * @brief Lookup the pair of strings in the d_map/d_merges + * @brief Look up the pair of strings in the d_map/d_merges * * @param lhs Left half of the string * @param rhs Right half of the string @@ -151,7 +151,7 @@ struct byte_pair_encoding_fn { // Edge case check. // Empirically found only two merge pair strings that were greater than 70 bytes - // and they both looked like ignorable errors. Double check this analysis with Vibhu. + // and they both looked like ignorable errors. if (thread_memory_size < total_size) { return d_map.end(); } // build the target string in shared memory