From e436ed777b06f3cd144a3460d48bfd172ff480c1 Mon Sep 17 00:00:00 2001 From: Alissa Rao Date: Tue, 16 Apr 2024 01:00:19 -0700 Subject: [PATCH] Fix issues with using `WyHash` with `std::hash::Hash` (#9) * Ignore IntelliJ directory. * Simplify test vector code to not duplicate the actual test. * Remove impossible chunk condition in hasher (usize is 64-bit at max, and u64::MAX will never exceed it) * Improve code style in consume_bytes to match a more typical Rust style. * Allow multiple write commands to function. * Add special cases for hashing integers. --- .gitignore | 1 + benches/rand_bench.rs | 6 +- src/hasher.rs | 258 +++++++++++++++++++++++++----------------- src/lib.rs | 2 +- 4 files changed, 158 insertions(+), 109 deletions(-) diff --git a/.gitignore b/.gitignore index 5b70a89..975b01a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ /target /Cargo.lock .vscode +.idea diff --git a/benches/rand_bench.rs b/benches/rand_bench.rs index 45aada7..a290746 100644 --- a/benches/rand_bench.rs +++ b/benches/rand_bench.rs @@ -77,12 +77,10 @@ fn wyhash_benchmark(c: &mut Criterion) { #[cfg(feature = "randomised_wyhash")] c.bench_function("Random Hash new", |b| { - use wyrand::RandomWyHashState; use std::hash::BuildHasher; + use wyrand::RandomWyHashState; - b.iter(|| { - RandomWyHashState::new().build_hasher() - }); + b.iter(|| RandomWyHashState::new().build_hasher()); }); } diff --git a/src/hasher.rs b/src/hasher.rs index 3798837..0bd723d 100644 --- a/src/hasher.rs +++ b/src/hasher.rs @@ -38,6 +38,15 @@ use self::{ /// /// assert_ne!(hasher.finish(), 5); // Should not be represented by the same value any more /// ``` +/// +/// # Stability +/// +/// The result is only guaranteed to match the result `wyhash` would naturally produce if `write` +/// is called a single time, followed by a call to `finish`. +/// +/// Any other sequence of events (including calls to `write_u32` or similar functions) are +/// guaranteed to have consistent results between platforms and versions of this crate, but may not +/// map well to the reference implementation. #[cfg_attr(docsrs, doc(cfg(feature = "wyhash")))] #[derive(Clone)] pub struct WyHash { @@ -75,81 +84,115 @@ impl WyHash { #[inline] fn consume_bytes(&self, bytes: &[u8]) -> (u64, u64, u64) { - let (lo, hi): (u64, u64); let length = bytes.len(); - let mut seed = self.seed; - - match length { - 4..=16 => { - lo = (read_4_bytes(bytes) << 32) | read_4_bytes(&bytes[(length >> 3) << 2..]); - hi = (read_4_bytes(&bytes[length - 4..]) << 32) - | read_4_bytes(&bytes[length - 4 - ((length >> 3) << 2)..]); - } - 1..=3 => { - lo = read_upto_3_bytes(bytes); - hi = 0; - } - 0 => { - lo = 0; - hi = 0; - } - _ => { - let mut index = length; - let mut start = 0; - - if is_over_48_bytes(length) { - let mut seed1 = seed; - let mut seed2 = seed; - - while is_over_48_bytes(index) { - seed = wymix( - read_8_bytes(&bytes[start..]) ^ self.secret[1], - read_8_bytes(&bytes[start + 8..]) ^ seed, - ); - seed1 = wymix( - read_8_bytes(&bytes[start + 16..]) ^ self.secret[2], - read_8_bytes(&bytes[start + 24..]) ^ seed1, - ); - seed2 = wymix( - read_8_bytes(&bytes[start + 32..]) ^ self.secret[3], - read_8_bytes(&bytes[start + 40..]) ^ seed2, - ); - index -= 48; - start += 48; - } - - seed ^= seed1 ^ seed2; - } - - while index > 16 { + if length <= 0 { + (0, 0, self.seed) + } else if length <= 3 { + (read_upto_3_bytes(bytes), 0, self.seed) + } else if length <= 16 { + let lo = (read_4_bytes(bytes) << 32) | read_4_bytes(&bytes[(length >> 3) << 2..]); + let hi = (read_4_bytes(&bytes[length - 4..]) << 32) + | read_4_bytes(&bytes[length - 4 - ((length >> 3) << 2)..]); + (lo, hi, self.seed) + } else { + let mut index = length; + let mut start = 0; + let mut seed = self.seed; + + if is_over_48_bytes(length) { + let mut seed1 = seed; + let mut seed2 = seed; + + while is_over_48_bytes(index) { seed = wymix( read_8_bytes(&bytes[start..]) ^ self.secret[1], read_8_bytes(&bytes[start + 8..]) ^ seed, ); - index -= 16; - start += 16 + seed1 = wymix( + read_8_bytes(&bytes[start + 16..]) ^ self.secret[2], + read_8_bytes(&bytes[start + 24..]) ^ seed1, + ); + seed2 = wymix( + read_8_bytes(&bytes[start + 32..]) ^ self.secret[3], + read_8_bytes(&bytes[start + 40..]) ^ seed2, + ); + index -= 48; + start += 48; } - lo = read_8_bytes(&bytes[length - 16..]); - hi = read_8_bytes(&bytes[length - 8..]); + seed ^= seed1 ^ seed2; } + + while index > 16 { + seed = wymix( + read_8_bytes(&bytes[start..]) ^ self.secret[1], + read_8_bytes(&bytes[start + 8..]) ^ seed, + ); + index -= 16; + start += 16 + } + + let lo = read_8_bytes(&bytes[length - 16..]); + let hi = read_8_bytes(&bytes[length - 8..]); + (lo, hi, seed) } + } - (lo, hi, seed) + #[inline] + fn mix_current_seed(&mut self) { + if self.size != 0 { + self.seed = wymix(self.lo, self.hi ^ self.seed); + } } } impl Hasher for WyHash { #[inline] fn write(&mut self, bytes: &[u8]) { - for chunk in bytes.chunks(u64::MAX as usize) { - let (lo, hi, seed) = self.consume_bytes(chunk); + self.mix_current_seed(); - self.lo = lo; - self.hi = hi; - self.seed = seed; - self.size += chunk.len() as u64; - } + let (lo, hi, seed) = self.consume_bytes(bytes); + + self.lo = lo; + self.hi = hi; + self.seed = seed; + self.size += bytes.len() as u64; + } + + #[inline] + fn write_u8(&mut self, i: u8) { + self.write_u64(i as u64) + } + + #[inline] + fn write_u16(&mut self, i: u16) { + self.write_u64(i as u64) + } + + #[inline] + fn write_u32(&mut self, i: u32) { + self.write_u64(i as u64) + } + + #[inline] + fn write_u64(&mut self, i: u64) { + self.mix_current_seed(); + self.lo = i; + self.hi = 0; + self.size += 8; + } + + #[inline] + fn write_u128(&mut self, i: u128) { + self.mix_current_seed(); + self.lo = i as u64; + self.hi = (i >> 64) as u64; + self.size += 16; + } + + #[inline] + fn write_usize(&mut self, i: usize) { + self.write_u64(i as u64); } #[inline] @@ -180,6 +223,8 @@ mod tests { use super::*; + use core::hash::Hash; + #[cfg(feature = "debug")] #[test] fn no_leaking_debug() { @@ -195,26 +240,38 @@ mod tests { } #[cfg(not(feature = "v4_2"))] + #[rustfmt::skip] + const TEST_VECTORS: [(u64, &str); 8] = [ + (0x0409_638e_e2bd_e459, ""), + (0xa841_2d09_1b5f_e0a9, "a"), + (0x32dd_92e4_b291_5153, "abc"), + (0x8619_1240_89a3_a16b, "message digest"), + (0x7a43_afb6_1d7f_5f40, "abcdefghijklmnopqrstuvwxyz"), + (0xff42_329b_90e5_0d58, "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"), + (0xc39c_ab13_b115_aad3, "12345678901234567890123456789012345678901234567890123456789012345678901234567890"), + (0xe44a_846b_fc65_00cd, "123456789012345678901234567890123456789012345678"), + ]; + + #[cfg(feature = "v4_2")] + #[rustfmt::skip] + const TEST_VECTORS: [(u64, &str); 8] = [ + (0x9322_8a4d_e0ee_c5a2, ""), + (0xc5ba_c3db_1787_13c4, "a"), + (0xa97f_2f7b_1d9b_3314, "abc"), + (0x786d_1f1d_f380_1df4, "message digest"), + (0xdca5_a813_8ad3_7c87, "abcdefghijklmnopqrstuvwxyz"), + (0xb9e7_34f1_17cf_af70, "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"), + (0x6cc5_eab4_9a92_d617, "12345678901234567890123456789012345678901234567890123456789012345678901234567890"), + (0xe1d4_c58d_97ba_df5e, "123456789012345678901234567890123456789012345678"), + ]; + #[test] - fn expected_final_v4_hasher_output() { - // Test cases generated from the C reference's test_vectors - #[rustfmt::skip] - let test_cases: [(u64, &str); 8] = [ - (0x0409_638e_e2bd_e459, ""), - (0xa841_2d09_1b5f_e0a9, "a"), - (0x32dd_92e4_b291_5153, "abc"), - (0x8619_1240_89a3_a16b, "message digest"), - (0x7a43_afb6_1d7f_5f40, "abcdefghijklmnopqrstuvwxyz"), - (0xff42_329b_90e5_0d58, "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"), - (0xc39c_ab13_b115_aad3, "12345678901234567890123456789012345678901234567890123456789012345678901234567890"), - (0xe44a_846b_fc65_00cd, "123456789012345678901234567890123456789012345678") - ]; - - test_cases + fn expected_hasher_output() { + TEST_VECTORS .into_iter() .enumerate() .map(|(seed, (expected, input))| { - let mut hasher = WyHash::new_with_secret(seed as u64, [WY0, WY1, WY2, WY3]); + let mut hasher = WyHash::new_with_default_secret(seed as u64); hasher.write(input.as_bytes()); @@ -229,38 +286,31 @@ mod tests { }); } - #[cfg(feature = "v4_2")] #[test] - fn expected_final_v42_hasher_output() { - // Test cases generated from the C reference's test_vectors - #[rustfmt::skip] - let test_cases: [(u64, &str); 8] = [ - (0x9322_8a4d_e0ee_c5a2, ""), - (0xc5ba_c3db_1787_13c4, "a"), - (0xa97f_2f7b_1d9b_3314, "abc"), - (0x786d_1f1d_f380_1df4, "message digest"), - (0xdca5_a813_8ad3_7c87, "abcdefghijklmnopqrstuvwxyz"), - (0xb9e7_34f1_17cf_af70, "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"), - (0x6cc5_eab4_9a92_d617, "12345678901234567890123456789012345678901234567890123456789012345678901234567890"), - (0xe1d4_c58d_97ba_df5e, "123456789012345678901234567890123456789012345678") - ]; - - test_cases - .into_iter() - .enumerate() - .map(|(seed, (expected, input))| { - let mut hasher = WyHash::new_with_secret(seed as u64, [WY0, WY1, WY2, WY3]); + fn multiple_writes_no_collision() { + let mut hasher = WyHash::new_with_default_secret(0); + hasher.write(b"abcdef"); + hasher.write(b"abcdef"); + let hash_a = hasher.finish(); + + let mut hasher = WyHash::new_with_default_secret(0); + hasher.write(b"abcdeF"); + hasher.write(b"abcdef"); + let hash_b = hasher.finish(); + + assert_ne!(hash_a, hash_b); + } - hasher.write(input.as_bytes()); + #[test] + fn tuples_no_collision() { + let mut hasher = WyHash::new_with_default_secret(0); + (1000, 2000).hash(&mut hasher); + let hash_a = hasher.finish(); - (input, expected, hasher.finish()) - }) - .for_each(|(input, expected_hash, computed_hash)| { - assert_eq!( - expected_hash, computed_hash, - "Hashed output didn't match expected for \"{}\"", - input - ); - }); + let mut hasher = WyHash::new_with_default_secret(0); + (1500, 2000).hash(&mut hasher); + let hash_b = hasher.finish(); + + assert_ne!(hash_a, hash_b); } } diff --git a/src/lib.rs b/src/lib.rs index 43ac0d8..c5d48bf 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -8,8 +8,8 @@ mod constants; #[cfg(feature = "wyhash")] mod hasher; -mod wyrand; mod utils; +mod wyrand; #[cfg(feature = "wyhash")] pub use hasher::*;