From e436ed777b06f3cd144a3460d48bfd172ff480c1 Mon Sep 17 00:00:00 2001
From: Alissa Rao <lymia@lymia.moe>
Date: Tue, 16 Apr 2024 01:00:19 -0700
Subject: [PATCH] Fix issues with using `WyHash` with `std::hash::Hash` (#9)

* Ignore IntelliJ directory.

* Simplify test vector code to not duplicate the actual test.

* Remove impossible chunk condition in hasher (usize is 64-bit at max, and u64::MAX will never exceed it)

* Improve code style in consume_bytes to match a more typical Rust style.

* Allow multiple write commands to function.

* Add special cases for hashing integers.
---
 .gitignore            |   1 +
 benches/rand_bench.rs |   6 +-
 src/hasher.rs         | 258 +++++++++++++++++++++++++-----------------
 src/lib.rs            |   2 +-
 4 files changed, 158 insertions(+), 109 deletions(-)

diff --git a/.gitignore b/.gitignore
index 5b70a89..975b01a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 /target
 /Cargo.lock
 .vscode
+.idea
diff --git a/benches/rand_bench.rs b/benches/rand_bench.rs
index 45aada7..a290746 100644
--- a/benches/rand_bench.rs
+++ b/benches/rand_bench.rs
@@ -77,12 +77,10 @@ fn wyhash_benchmark(c: &mut Criterion) {
 
     #[cfg(feature = "randomised_wyhash")]
     c.bench_function("Random Hash new", |b| {
-        use wyrand::RandomWyHashState;
         use std::hash::BuildHasher;
+        use wyrand::RandomWyHashState;
 
-        b.iter(|| {
-            RandomWyHashState::new().build_hasher()
-        });
+        b.iter(|| RandomWyHashState::new().build_hasher());
     });
 }
 
diff --git a/src/hasher.rs b/src/hasher.rs
index 3798837..0bd723d 100644
--- a/src/hasher.rs
+++ b/src/hasher.rs
@@ -38,6 +38,15 @@ use self::{
 ///
 /// assert_ne!(hasher.finish(), 5); // Should not be represented by the same value any more
 /// ```
+///
+/// # Stability
+///
+/// The result is only guaranteed to match the result `wyhash` would naturally produce if `write`
+/// is called a single time, followed by a call to `finish`.
+///
+/// Any other sequence of events (including calls to `write_u32` or similar functions) are
+/// guaranteed to have consistent results between platforms and versions of this crate, but may not
+/// map well to the reference implementation.
 #[cfg_attr(docsrs, doc(cfg(feature = "wyhash")))]
 #[derive(Clone)]
 pub struct WyHash {
@@ -75,81 +84,115 @@ impl WyHash {
 
     #[inline]
     fn consume_bytes(&self, bytes: &[u8]) -> (u64, u64, u64) {
-        let (lo, hi): (u64, u64);
         let length = bytes.len();
-        let mut seed = self.seed;
-
-        match length {
-            4..=16 => {
-                lo = (read_4_bytes(bytes) << 32) | read_4_bytes(&bytes[(length >> 3) << 2..]);
-                hi = (read_4_bytes(&bytes[length - 4..]) << 32)
-                    | read_4_bytes(&bytes[length - 4 - ((length >> 3) << 2)..]);
-            }
-            1..=3 => {
-                lo = read_upto_3_bytes(bytes);
-                hi = 0;
-            }
-            0 => {
-                lo = 0;
-                hi = 0;
-            }
-            _ => {
-                let mut index = length;
-                let mut start = 0;
-
-                if is_over_48_bytes(length) {
-                    let mut seed1 = seed;
-                    let mut seed2 = seed;
-
-                    while is_over_48_bytes(index) {
-                        seed = wymix(
-                            read_8_bytes(&bytes[start..]) ^ self.secret[1],
-                            read_8_bytes(&bytes[start + 8..]) ^ seed,
-                        );
-                        seed1 = wymix(
-                            read_8_bytes(&bytes[start + 16..]) ^ self.secret[2],
-                            read_8_bytes(&bytes[start + 24..]) ^ seed1,
-                        );
-                        seed2 = wymix(
-                            read_8_bytes(&bytes[start + 32..]) ^ self.secret[3],
-                            read_8_bytes(&bytes[start + 40..]) ^ seed2,
-                        );
-                        index -= 48;
-                        start += 48;
-                    }
-
-                    seed ^= seed1 ^ seed2;
-                }
-
-                while index > 16 {
+        if length <= 0 {
+            (0, 0, self.seed)
+        } else if length <= 3 {
+            (read_upto_3_bytes(bytes), 0, self.seed)
+        } else if length <= 16 {
+            let lo = (read_4_bytes(bytes) << 32) | read_4_bytes(&bytes[(length >> 3) << 2..]);
+            let hi = (read_4_bytes(&bytes[length - 4..]) << 32)
+                | read_4_bytes(&bytes[length - 4 - ((length >> 3) << 2)..]);
+            (lo, hi, self.seed)
+        } else {
+            let mut index = length;
+            let mut start = 0;
+            let mut seed = self.seed;
+
+            if is_over_48_bytes(length) {
+                let mut seed1 = seed;
+                let mut seed2 = seed;
+
+                while is_over_48_bytes(index) {
                     seed = wymix(
                         read_8_bytes(&bytes[start..]) ^ self.secret[1],
                         read_8_bytes(&bytes[start + 8..]) ^ seed,
                     );
-                    index -= 16;
-                    start += 16
+                    seed1 = wymix(
+                        read_8_bytes(&bytes[start + 16..]) ^ self.secret[2],
+                        read_8_bytes(&bytes[start + 24..]) ^ seed1,
+                    );
+                    seed2 = wymix(
+                        read_8_bytes(&bytes[start + 32..]) ^ self.secret[3],
+                        read_8_bytes(&bytes[start + 40..]) ^ seed2,
+                    );
+                    index -= 48;
+                    start += 48;
                 }
 
-                lo = read_8_bytes(&bytes[length - 16..]);
-                hi = read_8_bytes(&bytes[length - 8..]);
+                seed ^= seed1 ^ seed2;
             }
+
+            while index > 16 {
+                seed = wymix(
+                    read_8_bytes(&bytes[start..]) ^ self.secret[1],
+                    read_8_bytes(&bytes[start + 8..]) ^ seed,
+                );
+                index -= 16;
+                start += 16
+            }
+
+            let lo = read_8_bytes(&bytes[length - 16..]);
+            let hi = read_8_bytes(&bytes[length - 8..]);
+            (lo, hi, seed)
         }
+    }
 
-        (lo, hi, seed)
+    #[inline]
+    fn mix_current_seed(&mut self) {
+        if self.size != 0 {
+            self.seed = wymix(self.lo, self.hi ^ self.seed);
+        }
     }
 }
 
 impl Hasher for WyHash {
     #[inline]
     fn write(&mut self, bytes: &[u8]) {
-        for chunk in bytes.chunks(u64::MAX as usize) {
-            let (lo, hi, seed) = self.consume_bytes(chunk);
+        self.mix_current_seed();
 
-            self.lo = lo;
-            self.hi = hi;
-            self.seed = seed;
-            self.size += chunk.len() as u64;
-        }
+        let (lo, hi, seed) = self.consume_bytes(bytes);
+
+        self.lo = lo;
+        self.hi = hi;
+        self.seed = seed;
+        self.size += bytes.len() as u64;
+    }
+
+    #[inline]
+    fn write_u8(&mut self, i: u8) {
+        self.write_u64(i as u64)
+    }
+
+    #[inline]
+    fn write_u16(&mut self, i: u16) {
+        self.write_u64(i as u64)
+    }
+
+    #[inline]
+    fn write_u32(&mut self, i: u32) {
+        self.write_u64(i as u64)
+    }
+
+    #[inline]
+    fn write_u64(&mut self, i: u64) {
+        self.mix_current_seed();
+        self.lo = i;
+        self.hi = 0;
+        self.size += 8;
+    }
+
+    #[inline]
+    fn write_u128(&mut self, i: u128) {
+        self.mix_current_seed();
+        self.lo = i as u64;
+        self.hi = (i >> 64) as u64;
+        self.size += 16;
+    }
+
+    #[inline]
+    fn write_usize(&mut self, i: usize) {
+        self.write_u64(i as u64);
     }
 
     #[inline]
@@ -180,6 +223,8 @@ mod tests {
 
     use super::*;
 
+    use core::hash::Hash;
+
     #[cfg(feature = "debug")]
     #[test]
     fn no_leaking_debug() {
@@ -195,26 +240,38 @@ mod tests {
     }
 
     #[cfg(not(feature = "v4_2"))]
+    #[rustfmt::skip]
+    const TEST_VECTORS: [(u64, &str); 8] = [
+        (0x0409_638e_e2bd_e459, ""),
+        (0xa841_2d09_1b5f_e0a9, "a"),
+        (0x32dd_92e4_b291_5153, "abc"),
+        (0x8619_1240_89a3_a16b, "message digest"),
+        (0x7a43_afb6_1d7f_5f40, "abcdefghijklmnopqrstuvwxyz"),
+        (0xff42_329b_90e5_0d58, "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"),
+        (0xc39c_ab13_b115_aad3, "12345678901234567890123456789012345678901234567890123456789012345678901234567890"),
+        (0xe44a_846b_fc65_00cd, "123456789012345678901234567890123456789012345678"),
+    ];
+
+    #[cfg(feature = "v4_2")]
+    #[rustfmt::skip]
+    const TEST_VECTORS: [(u64, &str); 8] = [
+        (0x9322_8a4d_e0ee_c5a2, ""),
+        (0xc5ba_c3db_1787_13c4, "a"),
+        (0xa97f_2f7b_1d9b_3314, "abc"),
+        (0x786d_1f1d_f380_1df4, "message digest"),
+        (0xdca5_a813_8ad3_7c87, "abcdefghijklmnopqrstuvwxyz"),
+        (0xb9e7_34f1_17cf_af70, "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"),
+        (0x6cc5_eab4_9a92_d617, "12345678901234567890123456789012345678901234567890123456789012345678901234567890"),
+        (0xe1d4_c58d_97ba_df5e, "123456789012345678901234567890123456789012345678"),
+    ];
+
     #[test]
-    fn expected_final_v4_hasher_output() {
-        // Test cases generated from the C reference's test_vectors
-        #[rustfmt::skip]
-        let test_cases: [(u64, &str); 8] = [
-            (0x0409_638e_e2bd_e459, ""),
-            (0xa841_2d09_1b5f_e0a9, "a"),
-            (0x32dd_92e4_b291_5153, "abc"),
-            (0x8619_1240_89a3_a16b, "message digest"),
-            (0x7a43_afb6_1d7f_5f40, "abcdefghijklmnopqrstuvwxyz"),
-            (0xff42_329b_90e5_0d58, "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"),
-            (0xc39c_ab13_b115_aad3, "12345678901234567890123456789012345678901234567890123456789012345678901234567890"),
-            (0xe44a_846b_fc65_00cd, "123456789012345678901234567890123456789012345678")
-        ];
-
-        test_cases
+    fn expected_hasher_output() {
+        TEST_VECTORS
             .into_iter()
             .enumerate()
             .map(|(seed, (expected, input))| {
-                let mut hasher = WyHash::new_with_secret(seed as u64, [WY0, WY1, WY2, WY3]);
+                let mut hasher = WyHash::new_with_default_secret(seed as u64);
 
                 hasher.write(input.as_bytes());
 
@@ -229,38 +286,31 @@ mod tests {
             });
     }
 
-    #[cfg(feature = "v4_2")]
     #[test]
-    fn expected_final_v42_hasher_output() {
-        // Test cases generated from the C reference's test_vectors
-        #[rustfmt::skip]
-        let test_cases: [(u64, &str); 8] = [
-            (0x9322_8a4d_e0ee_c5a2, ""),
-            (0xc5ba_c3db_1787_13c4, "a"),
-            (0xa97f_2f7b_1d9b_3314, "abc"),
-            (0x786d_1f1d_f380_1df4, "message digest"),
-            (0xdca5_a813_8ad3_7c87, "abcdefghijklmnopqrstuvwxyz"),
-            (0xb9e7_34f1_17cf_af70, "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"),
-            (0x6cc5_eab4_9a92_d617, "12345678901234567890123456789012345678901234567890123456789012345678901234567890"),
-            (0xe1d4_c58d_97ba_df5e, "123456789012345678901234567890123456789012345678")
-        ];
-
-        test_cases
-            .into_iter()
-            .enumerate()
-            .map(|(seed, (expected, input))| {
-                let mut hasher = WyHash::new_with_secret(seed as u64, [WY0, WY1, WY2, WY3]);
+    fn multiple_writes_no_collision() {
+        let mut hasher = WyHash::new_with_default_secret(0);
+        hasher.write(b"abcdef");
+        hasher.write(b"abcdef");
+        let hash_a = hasher.finish();
+
+        let mut hasher = WyHash::new_with_default_secret(0);
+        hasher.write(b"abcdeF");
+        hasher.write(b"abcdef");
+        let hash_b = hasher.finish();
+
+        assert_ne!(hash_a, hash_b);
+    }
 
-                hasher.write(input.as_bytes());
+    #[test]
+    fn tuples_no_collision() {
+        let mut hasher = WyHash::new_with_default_secret(0);
+        (1000, 2000).hash(&mut hasher);
+        let hash_a = hasher.finish();
 
-                (input, expected, hasher.finish())
-            })
-            .for_each(|(input, expected_hash, computed_hash)| {
-                assert_eq!(
-                    expected_hash, computed_hash,
-                    "Hashed output didn't match expected for \"{}\"",
-                    input
-                );
-            });
+        let mut hasher = WyHash::new_with_default_secret(0);
+        (1500, 2000).hash(&mut hasher);
+        let hash_b = hasher.finish();
+
+        assert_ne!(hash_a, hash_b);
     }
 }
diff --git a/src/lib.rs b/src/lib.rs
index 43ac0d8..c5d48bf 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -8,8 +8,8 @@
 mod constants;
 #[cfg(feature = "wyhash")]
 mod hasher;
-mod wyrand;
 mod utils;
+mod wyrand;
 
 #[cfg(feature = "wyhash")]
 pub use hasher::*;