Skip to content

Commit

Permalink
Deprecate current libcudf nvtext minhash functions (#17152)
Browse files Browse the repository at this point in the history
Deprecates the current nvtext minhash functions, some of which will be replaced in #16756 with a different signature. The others will no longer be used and will be removed in a future release. The existing gtests and benchmarks will be retained for rework in a future release as well.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Bradley Dice (https://github.com/bdice)

URL: #17152
  • Loading branch information
davidwendt authored Oct 25, 2024
1 parent 03777f6 commit e98e6b9
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 9 deletions.
4 changes: 2 additions & 2 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -348,8 +348,8 @@ ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binary
ConfigureBench(TEXT_BENCH text/ngrams.cpp text/subword.cpp)

ConfigureNVBench(
TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp
text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp text/word_minhash.cpp
TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/normalize.cpp
text/replace.cpp text/tokenize.cpp text/vocab.cpp
)

# ##################################################################################################
Expand Down
24 changes: 18 additions & 6 deletions cpp/include/nvtext/minhash.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ namespace CUDF_EXPORT nvtext {
*
* This function uses MurmurHash3_x86_32 for the hash algorithm.
*
* @deprecated Deprecated in 24.12
*
* @throw std::invalid_argument if the width < 2
*
* @param input Strings column to compute minhash
Expand All @@ -51,7 +53,7 @@ namespace CUDF_EXPORT nvtext {
* @param mr Device memory resource used to allocate the returned column's device memory
* @return Minhash values for each string in input
*/
std::unique_ptr<cudf::column> minhash(
[[deprecated]] std::unique_ptr<cudf::column> minhash(
cudf::strings_column_view const& input,
cudf::numeric_scalar<uint32_t> seed = 0,
cudf::size_type width = 4,
Expand All @@ -71,6 +73,8 @@ std::unique_ptr<cudf::column> minhash(
*
* Any null row entries result in corresponding null output rows.
*
* @deprecated Deprecated in 24.12 - to be replaced in a future release
*
* @throw std::invalid_argument if the width < 2
* @throw std::invalid_argument if seeds is empty
* @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
Expand All @@ -83,7 +87,7 @@ std::unique_ptr<cudf::column> minhash(
* @param mr Device memory resource used to allocate the returned column's device memory
* @return List column of minhash values for each string per seed
*/
std::unique_ptr<cudf::column> minhash(
[[deprecated]] std::unique_ptr<cudf::column> minhash(
cudf::strings_column_view const& input,
cudf::device_span<uint32_t const> seeds,
cudf::size_type width = 4,
Expand All @@ -102,6 +106,8 @@ std::unique_ptr<cudf::column> minhash(
* The hash function returns 2 uint64 values but only the first value
* is used with the minhash calculation.
*
* @deprecated Deprecated in 24.12
*
* @throw std::invalid_argument if the width < 2
*
* @param input Strings column to compute minhash
Expand All @@ -112,7 +118,7 @@ std::unique_ptr<cudf::column> minhash(
* @param mr Device memory resource used to allocate the returned column's device memory
* @return Minhash values as UINT64 for each string in input
*/
std::unique_ptr<cudf::column> minhash64(
[[deprecated]] std::unique_ptr<cudf::column> minhash64(
cudf::strings_column_view const& input,
cudf::numeric_scalar<uint64_t> seed = 0,
cudf::size_type width = 4,
Expand All @@ -132,6 +138,8 @@ std::unique_ptr<cudf::column> minhash64(
*
* Any null row entries result in corresponding null output rows.
*
* @deprecated Deprecated in 24.12 - to be replaced in a future release
*
* @throw std::invalid_argument if the width < 2
* @throw std::invalid_argument if seeds is empty
* @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
Expand All @@ -144,7 +152,7 @@ std::unique_ptr<cudf::column> minhash64(
* @param mr Device memory resource used to allocate the returned column's device memory
* @return List column of minhash values for each string per seed
*/
std::unique_ptr<cudf::column> minhash64(
[[deprecated]] std::unique_ptr<cudf::column> minhash64(
cudf::strings_column_view const& input,
cudf::device_span<uint64_t const> seeds,
cudf::size_type width = 4,
Expand All @@ -164,6 +172,8 @@ std::unique_ptr<cudf::column> minhash64(
*
* Any null row entries result in corresponding null output rows.
*
* @deprecated Deprecated in 24.12
*
* @throw std::invalid_argument if seeds is empty
* @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
*
Expand All @@ -173,7 +183,7 @@ std::unique_ptr<cudf::column> minhash64(
* @param mr Device memory resource used to allocate the returned column's device memory
* @return List column of minhash values for each string per seed
*/
std::unique_ptr<cudf::column> word_minhash(
[[deprecated]] std::unique_ptr<cudf::column> word_minhash(
cudf::lists_column_view const& input,
cudf::device_span<uint32_t const> seeds,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
Expand All @@ -193,6 +203,8 @@ std::unique_ptr<cudf::column> word_minhash(
*
* Any null row entries result in corresponding null output rows.
*
* @deprecated Deprecated in 24.12
*
* @throw std::invalid_argument if seeds is empty
* @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
*
Expand All @@ -202,7 +214,7 @@ std::unique_ptr<cudf::column> word_minhash(
* @param mr Device memory resource used to allocate the returned column's device memory
* @return List column of minhash values for each string per seed
*/
std::unique_ptr<cudf::column> word_minhash64(
[[deprecated]] std::unique_ptr<cudf::column> word_minhash64(
cudf::lists_column_view const& input,
cudf::device_span<uint64_t const> seeds,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
Expand Down
1 change: 0 additions & 1 deletion cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -611,7 +611,6 @@ ConfigureTest(
text/bpe_tests.cpp
text/edit_distance_tests.cpp
text/jaccard_tests.cpp
text/minhash_tests.cpp
text/ngrams_tests.cpp
text/ngrams_tokenize_tests.cpp
text/normalize_tests.cpp
Expand Down

0 comments on commit e98e6b9

Please sign in to comment.