From d26172f8d74e16103d3fdad30b5b73dd2388bb19 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Tue, 25 Jul 2023 21:41:57 -0700
Subject: [PATCH] Don't test strings shorter than the requested ngram size
 (#13758)

The random string generation was using a minimum length of 1 rather than the ngram size. While both the reference implementation and the libcudf code were robust to that case, they return different extreme values (0 vs 1) for those. Since testing that edge case isn't really in scope for this particular test, I've modified the generator to only generate strings whose length is at least the requested ngram size.

Resolves #13748

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/13758
---
 .../cudf/cudf/tests/text/test_text_methods.py | 27 +++++++++++++------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py
index 18043e54739..e1dda1ae5d1 100644
--- a/python/cudf/cudf/tests/text/test_text_methods.py
+++ b/python/cudf/cudf/tests/text/test_text_methods.py
@@ -878,27 +878,38 @@ def test_jaccard_index():
         str1.str.jaccard_index(str3, 5)
 
 
-def _make_list_of_strings_of_random_length(num_strings, max_length):
+def _make_list_of_strings_of_random_length(
+    num_strings, min_length, max_length
+):
     return [
         "".join(
             random.choice(string.ascii_lowercase)
-            for _ in range(random.randint(1, max_length))
+            for _ in range(random.randint(min_length, max_length))
         )
         for _ in range(num_strings)
     ]
 
 
 def test_jaccard_index_random_strings():
+    # Seed the rng before random string generation.
+    random.seed(42)
     num_strings = 100
-    common_strings = _make_list_of_strings_of_random_length(num_strings, 50)
-    uncommon_strings1 = _make_list_of_strings_of_random_length(num_strings, 10)
-    uncommon_strings2 = _make_list_of_strings_of_random_length(num_strings, 20)
+    jaccard_width = 5
+    common_strings = _make_list_of_strings_of_random_length(
+        num_strings, jaccard_width, 50
+    )
+    uncommon_strings1 = _make_list_of_strings_of_random_length(
+        num_strings, jaccard_width, 10
+    )
+    uncommon_strings2 = _make_list_of_strings_of_random_length(
+        num_strings, jaccard_width, 20
+    )
     str1 = cudf.Series(uncommon_strings1).str.cat(cudf.Series(common_strings))
     str2 = cudf.Series(uncommon_strings2).str.cat(cudf.Series(common_strings))
 
     # adopted from https://github.com/rapidsai/rapids-deduplication/issues/36
-    da = str1.str.character_ngrams(5, True)
-    db = str2.str.character_ngrams(5, True)
+    da = str1.str.character_ngrams(jaccard_width, True)
+    db = str2.str.character_ngrams(jaccard_width, True)
     da = da.list.unique()
     db = db.list.unique()
     da = da.explode()
@@ -920,5 +931,5 @@ def test_jaccard_index_random_strings():
     res = res.values.astype("float32")
     expected = cudf.Series(res)
 
-    actual = str1.str.jaccard_index(str2, 5)
+    actual = str1.str.jaccard_index(str2, jaccard_width)
     assert_eq(expected, actual)