Skip to content

Commit

Permalink
Don't test strings shorter than the requested ngram size
Browse files Browse the repository at this point in the history
  • Loading branch information
vyasr committed Jul 25, 2023
1 parent 120fc29 commit 1cf3c3b
Showing 1 changed file with 17 additions and 8 deletions.
25 changes: 17 additions & 8 deletions python/cudf/cudf/tests/text/test_text_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -878,27 +878,36 @@ def test_jaccard_index():
str1.str.jaccard_index(str3, 5)


def _make_list_of_strings_of_random_length(num_strings, max_length):
def _make_list_of_strings_of_random_length(
num_strings, min_length, max_length
):
return [
"".join(
random.choice(string.ascii_lowercase)
for _ in range(random.randint(1, max_length))
for _ in range(random.randint(min_length, max_length))
)
for _ in range(num_strings)
]


def test_jaccard_index_random_strings():
num_strings = 100
common_strings = _make_list_of_strings_of_random_length(num_strings, 50)
uncommon_strings1 = _make_list_of_strings_of_random_length(num_strings, 10)
uncommon_strings2 = _make_list_of_strings_of_random_length(num_strings, 20)
jaccard_width = 5
common_strings = _make_list_of_strings_of_random_length(
num_strings, jaccard_width, 50
)
uncommon_strings1 = _make_list_of_strings_of_random_length(
num_strings, jaccard_width, 10
)
uncommon_strings2 = _make_list_of_strings_of_random_length(
num_strings, jaccard_width, 20
)
str1 = cudf.Series(uncommon_strings1).str.cat(cudf.Series(common_strings))
str2 = cudf.Series(uncommon_strings2).str.cat(cudf.Series(common_strings))

# adopted from https://github.com/rapidsai/rapids-deduplication/issues/36
da = str1.str.character_ngrams(5, True)
db = str2.str.character_ngrams(5, True)
da = str1.str.character_ngrams(jaccard_width, True)
db = str2.str.character_ngrams(jaccard_width, True)
da = da.list.unique()
db = db.list.unique()
da = da.explode()
Expand All @@ -920,5 +929,5 @@ def test_jaccard_index_random_strings():
res = res.values.astype("float32")
expected = cudf.Series(res)

actual = str1.str.jaccard_index(str2, 5)
actual = str1.str.jaccard_index(str2, jaccard_width)
assert_eq(expected, actual)

0 comments on commit 1cf3c3b

Please sign in to comment.