Skip to content

Commit

Permalink
Don't test strings shorter than the requested ngram size (#13758)
Browse files Browse the repository at this point in the history
The random string generation was using a minimum length of 1 rather than the ngram size. While both the reference implementation and the libcudf code were robust to that case, they return different extreme values (0 vs 1) for those. Since testing that edge case isn't really in scope for this particular test, I've modified the generator to only generate strings whose length is at least the requested ngram size.

Resolves #13748

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Bradley Dice (https://github.com/bdice)

URL: #13758
  • Loading branch information
vyasr authored Jul 26, 2023
1 parent 67e81ae commit d26172f
Showing 1 changed file with 19 additions and 8 deletions.
27 changes: 19 additions & 8 deletions python/cudf/cudf/tests/text/test_text_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -878,27 +878,38 @@ def test_jaccard_index():
str1.str.jaccard_index(str3, 5)


def _make_list_of_strings_of_random_length(num_strings, max_length):
def _make_list_of_strings_of_random_length(
num_strings, min_length, max_length
):
return [
"".join(
random.choice(string.ascii_lowercase)
for _ in range(random.randint(1, max_length))
for _ in range(random.randint(min_length, max_length))
)
for _ in range(num_strings)
]


def test_jaccard_index_random_strings():
# Seed the rng before random string generation.
random.seed(42)
num_strings = 100
common_strings = _make_list_of_strings_of_random_length(num_strings, 50)
uncommon_strings1 = _make_list_of_strings_of_random_length(num_strings, 10)
uncommon_strings2 = _make_list_of_strings_of_random_length(num_strings, 20)
jaccard_width = 5
common_strings = _make_list_of_strings_of_random_length(
num_strings, jaccard_width, 50
)
uncommon_strings1 = _make_list_of_strings_of_random_length(
num_strings, jaccard_width, 10
)
uncommon_strings2 = _make_list_of_strings_of_random_length(
num_strings, jaccard_width, 20
)
str1 = cudf.Series(uncommon_strings1).str.cat(cudf.Series(common_strings))
str2 = cudf.Series(uncommon_strings2).str.cat(cudf.Series(common_strings))

# adopted from https://github.com/rapidsai/rapids-deduplication/issues/36
da = str1.str.character_ngrams(5, True)
db = str2.str.character_ngrams(5, True)
da = str1.str.character_ngrams(jaccard_width, True)
db = str2.str.character_ngrams(jaccard_width, True)
da = da.list.unique()
db = db.list.unique()
da = da.explode()
Expand All @@ -920,5 +931,5 @@ def test_jaccard_index_random_strings():
res = res.values.astype("float32")
expected = cudf.Series(res)

actual = str1.str.jaccard_index(str2, 5)
actual = str1.str.jaccard_index(str2, jaccard_width)
assert_eq(expected, actual)

0 comments on commit d26172f

Please sign in to comment.