diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5a41a1a..2157cff 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -20,6 +20,7 @@ jobs: - uses: dtolnay/rust-toolchain@stable - run: cargo test - run: cargo test --features diff + - run: cargo test --features fast clippy: name: Clippy @@ -31,6 +32,7 @@ jobs: components: clippy - run: cargo clippy --tests - run: cargo clippy --tests --features diff + - run: cargo clippy --tests --features fast rustfmt: name: Rustfmt @@ -54,3 +56,4 @@ jobs: target: s390x-unknown-linux-gnu - run: cargo test - run: cargo test --features diff + - run: cargo test --features fast diff --git a/Cargo.toml b/Cargo.toml index 286f418..48af237 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,6 +13,9 @@ categories = ["algorithms"] # Enable ability to compute diff score between two TLSH. # This is behind a feature flag as it adds a 64k static array to the binary. diff = [] +# Enable joint lookup for faster Pearson hashing. +# This is behind a feature flag as it adds a 64k static array to the binary. +fast = [] [dev-dependencies] glob = "0.3.0" diff --git a/README.md b/README.md index 36a5094..73a3aed 100644 --- a/README.md +++ b/README.md @@ -43,5 +43,7 @@ Those configurations are available: - 256 buckets and 3-byte checksum. - 48 buckets and 1-byte checksum. +The `fast` feature speeds up TLSH generation but adds a 64kB lookup table. + The `threaded` and `private` options that exists in the original TLSH version are not yet implemented. \ No newline at end of file diff --git a/src/pearson.rs b/src/pearson.rs index 979f381..f671bad 100644 --- a/src/pearson.rs +++ b/src/pearson.rs @@ -29,19 +29,50 @@ const V_TABLE48: [u8; 256] = [ 16, 43, 23, 13, 40, 17, ]; +// Two-byte lookup for Pearson's sample random table +#[cfg(feature = "fast")] +const JOINT_V_TABLE: [[u8; 256]; 256] = { + let mut table = [[0; 256]; 256]; + let mut i = 0; + while i < 256 { + let mut j = 0; + while j < 256 { + table[i][j] = V_TABLE[V_TABLE[j] as usize ^ i]; + j += 1; + } + i += 1; + } + table +}; + // Pearson's algorithm pub fn b_mapping(salt: u8, i: u8, j: u8, k: u8) -> u8 { let mut h = 0; h = V_TABLE[usize::from(h ^ salt)]; - h = V_TABLE[usize::from(h ^ i)]; - h = V_TABLE[usize::from(h ^ j)]; + #[cfg(feature = "fast")] + { + h = JOINT_V_TABLE[usize::from(j)][usize::from(h ^ i)]; + } + #[cfg(not(feature = "fast"))] + { + h = V_TABLE[usize::from(h ^ i)]; + h = V_TABLE[usize::from(h ^ j)]; + } h = V_TABLE[usize::from(h ^ k)]; h } pub fn fast_b_mapping(salt: u8, i: u8, j: u8, k: u8) -> u8 { - let mut h = V_TABLE[usize::from(salt ^ i)]; - h = V_TABLE[usize::from(h ^ j)]; + let mut h = salt; + #[cfg(feature = "fast")] + { + h = JOINT_V_TABLE[usize::from(j)][usize::from(h ^ i)]; + } + #[cfg(not(feature = "fast"))] + { + h = V_TABLE[usize::from(h ^ i)]; + h = V_TABLE[usize::from(h ^ j)]; + } if EFF_BUCKETS == 48 { V_TABLE48[usize::from(h ^ k)] } else { diff --git a/src/tlsh.rs b/src/tlsh.rs index a207aa2..b571fce 100644 --- a/src/tlsh.rs +++ b/src/tlsh.rs @@ -106,12 +106,12 @@ impl< self.a_bucket[usize::from(r)] += 1; let r = fast_b_mapping::(12, b_0, b_1, b_3); self.a_bucket[usize::from(r)] += 1; + let r = fast_b_mapping::(84, b_0, b_1, b_4); + self.a_bucket[usize::from(r)] += 1; let r = fast_b_mapping::(178, b_0, b_2, b_3); self.a_bucket[usize::from(r)] += 1; let r = fast_b_mapping::(166, b_0, b_2, b_4); self.a_bucket[usize::from(r)] += 1; - let r = fast_b_mapping::(84, b_0, b_1, b_4); - self.a_bucket[usize::from(r)] += 1; let r = fast_b_mapping::(230, b_0, b_3, b_4); self.a_bucket[usize::from(r)] += 1; }