Skip to content

Commit

Permalink
Use uint64 in hash_vocab and clean up warnings
Browse files Browse the repository at this point in the history
Using uint8 fails because, on NumPy 2, mixing it with Python integers
tries to keep the result as uint8.
Using Python integers fails some NumPy 1.x paths, because mixing
them with uint64 scalars fails on NumPy 1.

uint64 should work on both versions.

Also moved a warnings ignore from the tests to the function
  • Loading branch information
seberg committed May 29, 2024
1 parent b112c15 commit 4c3cb2b
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 15 deletions.
6 changes: 2 additions & 4 deletions python/cudf/cudf/tests/test_hash_vocab.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,7 @@ def test_correct_bert_base_vocab_hash(datadir, tmpdir):

groundtruth_path = os.path.join(datadir, "vocab-hash.txt")
output_path = tmpdir.join("cudf-vocab-hash.txt")
with warnings.catch_warnings():
# See https://github.com/rapidsai/cudf/issues/12403
warnings.simplefilter(action="ignore", category=RuntimeWarning)
hash_vocab(vocab_path, output_path)
warnings.simplefilter(action="ignore", category=RuntimeWarning)
hash_vocab(vocab_path, output_path)

assert filecmp.cmp(output_path, groundtruth_path, shallow=False)
23 changes: 12 additions & 11 deletions python/cudf/cudf/utils/hash_vocab_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@

# Coefficient ranges for inner hash - These are important to set to be
# large so that we have randomness in the bottom bits when modding
A_SECOND_LEVEL_POW = 48
B_SECOND_LEVEL_POW = 7
A_SECOND_LEVEL_POW = np.uint64(48)
B_SECOND_LEVEL_POW = np.uint64(7)

A_LBOUND_SECOND_LEVEL_HASH = 2**16
A_HBOUND_SECOND_LEVEL_HASH = 2**A_SECOND_LEVEL_POW
Expand All @@ -23,10 +23,11 @@


# Shifts for bit packing
A_SECOND_LEVEL_SHIFT_AMT = 64 - A_SECOND_LEVEL_POW
B_SECOND_LEVEL_SHIFT_AMT = 64 - A_SECOND_LEVEL_POW - B_SECOND_LEVEL_POW
assert B_SECOND_LEVEL_SHIFT_AMT > 0 # was a uint8 so assume this must be true
BITS_FOR_INNER_TABLE_SIZE = 8
A_SECOND_LEVEL_SHIFT_AMT = np.uint64(64 - A_SECOND_LEVEL_POW)
B_SECOND_LEVEL_SHIFT_AMT = np.uint64(
64 - A_SECOND_LEVEL_POW - B_SECOND_LEVEL_POW
)
BITS_FOR_INNER_TABLE_SIZE = np.uint64(8)

NOT_FOUND = -1

Expand Down Expand Up @@ -93,7 +94,7 @@ def _find_hash_for_internal(hash_bin):

while True:
a = np.random.randint(
A_LBOUND_SECOND_LEVEL_HASH, A_HBOUND_SECOND_LEVEL_HASH
A_LBOUND_SECOND_LEVEL_HASH, A_HBOUND_SECOND_LEVEL_HASH,
)
b = np.random.randint(
B_LBOUND_SECOND_LEVEL_HASH, B_HBOUND_SECOND_LEVEL_HASH
Expand Down Expand Up @@ -129,12 +130,12 @@ def _perfect_hash(integers, max_constant):
bin_length = len(internal_table)
max_bin_length = max(bin_length, max_bin_length)
internal_table_coeffs[i] = (
coeff_a << A_SECOND_LEVEL_SHIFT_AMT
| coeff_b << B_SECOND_LEVEL_SHIFT_AMT
| bin_length
np.uint64(coeff_a) << A_SECOND_LEVEL_SHIFT_AMT
| np.uint64(coeff_b) << B_SECOND_LEVEL_SHIFT_AMT
| np.uint64(bin_length)
)
offset_into_flattened_table[i + 1] = (
offset_into_flattened_table[i] + bin_length
offset_into_flattened_table[i] + np.uint64(bin_length)
)
flattened_bins.extend(internal_table)

Expand Down

0 comments on commit 4c3cb2b

Please sign in to comment.