From 4c3cb2b536bd7f6145b259dd7fbde5d44635ef32 Mon Sep 17 00:00:00 2001
From: Sebastian Berg <sebastianb@nvidia.com>
Date: Wed, 29 May 2024 14:10:10 +0200
Subject: [PATCH] Use uint64 in hash_vocab and clean up warnings

Using uint8 fails because mixing it with Python integers tries to
stick with uint8 on NumPy 2.
Using Python integers fails on some NumPy 1.x paths, because mixing
them with uint64 scalars fails on NumPy 1.

uint64 should work on both versions.

Also dropped the warnings.catch_warnings() wrapper around the
RuntimeWarning ignore in the test; the filter is now set directly
before calling hash_vocab.
---
 python/cudf/cudf/tests/test_hash_vocab.py  |  6 ++----
 python/cudf/cudf/utils/hash_vocab_utils.py | 23 +++++++++++-----------
 2 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/python/cudf/cudf/tests/test_hash_vocab.py b/python/cudf/cudf/tests/test_hash_vocab.py
index e081119ff89..001b41d4d17 100644
--- a/python/cudf/cudf/tests/test_hash_vocab.py
+++ b/python/cudf/cudf/tests/test_hash_vocab.py
@@ -21,9 +21,7 @@ def test_correct_bert_base_vocab_hash(datadir, tmpdir):
 
     groundtruth_path = os.path.join(datadir, "vocab-hash.txt")
     output_path = tmpdir.join("cudf-vocab-hash.txt")
-    with warnings.catch_warnings():
-        # See https://github.com/rapidsai/cudf/issues/12403
-        warnings.simplefilter(action="ignore", category=RuntimeWarning)
-        hash_vocab(vocab_path, output_path)
+    warnings.simplefilter(action="ignore", category=RuntimeWarning)
+    hash_vocab(vocab_path, output_path)
 
     assert filecmp.cmp(output_path, groundtruth_path, shallow=False)
diff --git a/python/cudf/cudf/utils/hash_vocab_utils.py b/python/cudf/cudf/utils/hash_vocab_utils.py
index 71ced5fd8ec..88d1f68209b 100644
--- a/python/cudf/cudf/utils/hash_vocab_utils.py
+++ b/python/cudf/cudf/utils/hash_vocab_utils.py
@@ -7,8 +7,8 @@
 
 # Coefficients ranges for inner hash - This are important to set to be
 # large so that we have randomness in the bottom bits when modding
-A_SECOND_LEVEL_POW = 48
-B_SECOND_LEVEL_POW = 7
+A_SECOND_LEVEL_POW = np.uint64(48)
+B_SECOND_LEVEL_POW = np.uint64(7)
 
 A_LBOUND_SECOND_LEVEL_HASH = 2**16
 A_HBOUND_SECOND_LEVEL_HASH = 2**A_SECOND_LEVEL_POW
@@ -23,10 +23,11 @@
 
 
 # Shifts for bit packing
-A_SECOND_LEVEL_SHIFT_AMT = 64 - A_SECOND_LEVEL_POW
-B_SECOND_LEVEL_SHIFT_AMT = 64 - A_SECOND_LEVEL_POW - B_SECOND_LEVEL_POW
-assert B_SECOND_LEVEL_SHIFT_AMT > 0  # was a uint8 so assume this must be true
-BITS_FOR_INNER_TABLE_SIZE = 8
+A_SECOND_LEVEL_SHIFT_AMT = np.uint64(64 - A_SECOND_LEVEL_POW)
+B_SECOND_LEVEL_SHIFT_AMT = np.uint64(
+    64 - A_SECOND_LEVEL_POW - B_SECOND_LEVEL_POW
+)
+BITS_FOR_INNER_TABLE_SIZE = np.uint64(8)
 
 NOT_FOUND = -1
 
@@ -93,7 +94,7 @@ def _find_hash_for_internal(hash_bin):
 
     while True:
         a = np.random.randint(
-            A_LBOUND_SECOND_LEVEL_HASH, A_HBOUND_SECOND_LEVEL_HASH
+            A_LBOUND_SECOND_LEVEL_HASH, A_HBOUND_SECOND_LEVEL_HASH,
         )
         b = np.random.randint(
             B_LBOUND_SECOND_LEVEL_HASH, B_HBOUND_SECOND_LEVEL_HASH
@@ -129,12 +130,12 @@ def _perfect_hash(integers, max_constant):
         bin_length = len(internal_table)
         max_bin_length = max(bin_length, max_bin_length)
         internal_table_coeffs[i] = (
-            coeff_a << A_SECOND_LEVEL_SHIFT_AMT
-            | coeff_b << B_SECOND_LEVEL_SHIFT_AMT
-            | bin_length
+            np.uint64(coeff_a) << A_SECOND_LEVEL_SHIFT_AMT
+            | np.uint64(coeff_b) << B_SECOND_LEVEL_SHIFT_AMT
+            | np.uint64(bin_length)
         )
         offset_into_flattened_table[i + 1] = (
-            offset_into_flattened_table[i] + bin_length
+            offset_into_flattened_table[i] + np.uint64(bin_length)
         )
         flattened_bins.extend(internal_table)