Use uint64 in hash_vocab and clean up warnings

Using uint8 fails because, on NumPy 2, mixing it with Python integers tries to stick with uint8. Using Python integers fails on some NumPy 1.x paths, because mixing them with uint64 scalars fails on NumPy 1. uint64 should work on both versions. Also moved a warnings ignore from the tests to the function.
rapidsai · May 29, 2024 · 4c3cb2b · 4c3cb2b
1 parent b112c15
commit 4c3cb2b
Show file tree

Hide file tree

Showing 2 changed files with 14 additions and 15 deletions.
diff --git a/python/cudf/cudf/tests/test_hash_vocab.py b/python/cudf/cudf/tests/test_hash_vocab.py
@@ -21,9 +21,7 @@ def test_correct_bert_base_vocab_hash(datadir, tmpdir):
 
     groundtruth_path = os.path.join(datadir, "vocab-hash.txt")
     output_path = tmpdir.join("cudf-vocab-hash.txt")
-    with warnings.catch_warnings():
-        # See https://github.com/rapidsai/cudf/issues/12403
-        warnings.simplefilter(action="ignore", category=RuntimeWarning)
-        hash_vocab(vocab_path, output_path)
+    warnings.simplefilter(action="ignore", category=RuntimeWarning)
+    hash_vocab(vocab_path, output_path)
 
     assert filecmp.cmp(output_path, groundtruth_path, shallow=False)
diff --git a/python/cudf/cudf/utils/hash_vocab_utils.py b/python/cudf/cudf/utils/hash_vocab_utils.py
@@ -7,8 +7,8 @@
 
 # Coefficients ranges for inner hash - This are important to set to be
 # large so that we have randomness in the bottom bits when modding
-A_SECOND_LEVEL_POW = 48
-B_SECOND_LEVEL_POW = 7
+A_SECOND_LEVEL_POW = np.uint64(48)
+B_SECOND_LEVEL_POW = np.uint64(7)
 
 A_LBOUND_SECOND_LEVEL_HASH = 2**16
 A_HBOUND_SECOND_LEVEL_HASH = 2**A_SECOND_LEVEL_POW
@@ -23,10 +23,11 @@
 
 
 # Shifts for bit packing
-A_SECOND_LEVEL_SHIFT_AMT = 64 - A_SECOND_LEVEL_POW
-B_SECOND_LEVEL_SHIFT_AMT = 64 - A_SECOND_LEVEL_POW - B_SECOND_LEVEL_POW
-assert B_SECOND_LEVEL_SHIFT_AMT > 0  # was a uint8 so assume this must be true
-BITS_FOR_INNER_TABLE_SIZE = 8
+A_SECOND_LEVEL_SHIFT_AMT = np.uint64(64 - A_SECOND_LEVEL_POW)
+B_SECOND_LEVEL_SHIFT_AMT = np.uint64(
+    64 - A_SECOND_LEVEL_POW - B_SECOND_LEVEL_POW
+)
+BITS_FOR_INNER_TABLE_SIZE = np.uint64(8)
 
 NOT_FOUND = -1
 
@@ -93,7 +94,7 @@ def _find_hash_for_internal(hash_bin):
 
     while True:
         a = np.random.randint(
-            A_LBOUND_SECOND_LEVEL_HASH, A_HBOUND_SECOND_LEVEL_HASH
+            A_LBOUND_SECOND_LEVEL_HASH, A_HBOUND_SECOND_LEVEL_HASH,
         )
         b = np.random.randint(
             B_LBOUND_SECOND_LEVEL_HASH, B_HBOUND_SECOND_LEVEL_HASH
@@ -129,12 +130,12 @@ def _perfect_hash(integers, max_constant):
         bin_length = len(internal_table)
         max_bin_length = max(bin_length, max_bin_length)
         internal_table_coeffs[i] = (
-            coeff_a << A_SECOND_LEVEL_SHIFT_AMT
-            | coeff_b << B_SECOND_LEVEL_SHIFT_AMT
-            | bin_length
+            np.uint64(coeff_a) << A_SECOND_LEVEL_SHIFT_AMT
+            | np.uint64(coeff_b) << B_SECOND_LEVEL_SHIFT_AMT
+            | np.uint64(bin_length)
         )
         offset_into_flattened_table[i + 1] = (
-            offset_into_flattened_table[i] + bin_length
+            offset_into_flattened_table[i] + np.uint64(bin_length)
         )
         flattened_bins.extend(internal_table)