From bce4ed3aa08c90a24fe79efce501cbd6556383d3 Mon Sep 17 00:00:00 2001
From: Anh Pham
Date: Sun, 17 Nov 2024 13:23:29 -0800
Subject: [PATCH 1/3] feat: add implementation for Dot Product: simd and scalar

---
 rs/utils/src/dot_product_similarity.rs | 176 +++++++++++++++++++++++++
 rs/utils/src/lib.rs                    |   1 +
 2 files changed, 177 insertions(+)
 create mode 100644 rs/utils/src/dot_product_similarity.rs

diff --git a/rs/utils/src/dot_product_similarity.rs b/rs/utils/src/dot_product_similarity.rs
new file mode 100644
index 0000000..36abc06
--- /dev/null
+++ b/rs/utils/src/dot_product_similarity.rs
@@ -0,0 +1,176 @@
+use std::simd::{f32x4, f32x8};
+use std::simd::num::SimdFloat;
+
+use crate::DistanceCalculator;
+
+/// Inputs with fewer elements than this take the scalar path in
+/// `DistanceCalculator::calculate`.
+const SIMD_MIN_ELEMENTS: usize = 32;
+
+/// Computes the dot product of two `f32` slices.
+///
+/// The fields hold the partial sums of the SIMD path: an 8-lane accumulator,
+/// a 4-lane accumulator and a scalar accumulator for the leftover elements.
+pub struct DotProductSimilarityCalculator {
+    dist_simd_8: f32x8,
+    dist_simd_4: f32x4,
+    dist_simd_1: f32,
+}
+
+/// Implementation variants. Not referenced elsewhere in this file.
+#[derive(Debug, PartialEq, Clone)]
+pub enum DotProductSimilarityCalculatorImpl {
+    Scalar,
+    SIMD,
+    StreamingWithSIMD,
+}
+
+impl Default for DotProductSimilarityCalculator {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl DotProductSimilarityCalculator {
+    /// Creates a calculator whose accumulators are all zero.
+    pub fn new() -> Self {
+        Self {
+            dist_simd_8: f32x8::splat(0.0),
+            dist_simd_4: f32x4::splat(0.0),
+            dist_simd_1: 0.0,
+        }
+    }
+
+    /// Multiplies corresponding elements of `a` and `b` and sums the products.
+    ///
+    /// If the slices differ in length, only their common prefix is used.
+    pub fn calculate_scalar(&self, a: &[f32], b: &[f32]) -> f32 {
+        a.iter().zip(b.iter()).map(|(&x, &y)| x * y).sum::<f32>()
+    }
+
+    /// Zeroes every accumulator so that a new computation starts from scratch.
+    fn reset(&mut self) {
+        self.dist_simd_8 = f32x8::splat(0.0);
+        self.dist_simd_4 = f32x4::splat(0.0);
+        self.dist_simd_1 = 0.0;
+    }
+
+    /// Adds the element-wise products of `a` and `b` to the accumulators.
+    ///
+    /// Only the common prefix of the two slices is read, which matches
+    /// `calculate_scalar` and avoids indexing past the shorter slice.
+    fn multiply(&mut self, a: &[f32], b: &[f32]) {
+        let len = a.len().min(b.len());
+        let mut i: usize = 0;
+
+        // Blocks of eight elements.
+        let mut step: usize = self.dist_simd_8.len();
+        while i + step <= len {
+            let a_slice = f32x8::from_slice(&a[i..]);
+            let b_slice = f32x8::from_slice(&b[i..]);
+            self.dist_simd_8 += a_slice * b_slice;
+            i += step;
+        }
+
+        // Then blocks of four elements from what is left.
+        step = self.dist_simd_4.len();
+        while i + step <= len {
+            let a_slice = f32x4::from_slice(&a[i..]);
+            let b_slice = f32x4::from_slice(&b[i..]);
+            self.dist_simd_4 += a_slice * b_slice;
+            i += step;
+        }
+
+        // Fewer than four elements remain; add them one at a time.
+        for (x, y) in a[i..len].iter().zip(&b[i..len]) {
+            self.dist_simd_1 += x * y;
+        }
+    }
+
+    /// Adds up every lane of every accumulator.
+    fn accumulate(&self) -> f32 {
+        self.dist_simd_8.reduce_sum() + self.dist_simd_4.reduce_sum() + self.dist_simd_1
+    }
+
+    /// Returns the dot product of `a` and `b` computed with SIMD accumulators.
+    ///
+    /// The accumulators are cleared first, so the result does not depend on
+    /// earlier calls made on the same calculator.
+    pub fn calculate_simd(&mut self, a: &[f32], b: &[f32]) -> f32 {
+        self.reset();
+        self.multiply(a, b);
+        self.accumulate()
+    }
+}
+
+impl DistanceCalculator for DotProductSimilarityCalculator {
+    /// Uses the scalar path for short inputs and the SIMD path otherwise.
+    fn calculate(&mut self, a: &[f32], b: &[f32]) -> f32 {
+        if a.len() < SIMD_MIN_ELEMENTS {
+            self.calculate_scalar(a, b)
+        } else {
+            self.calculate_simd(a, b)
+        }
+    }
+}
+
+// Tests
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::test_utils::generate_random_vector;
+
+    #[test]
+    fn test_basic_dot_product_calculation() {
+        // arrange
+        let a = [0.0, 3.0, -5.0];
+        let b = [-3.0, 9.0, 8.0];
+
+        let mut distance_calculator = DotProductSimilarityCalculator::new();
+
+        let known_product = 0.0 * -3.0 + 3.0 * 9.0 + -5.0 * 8.0;
+
+        // act
+        let distance_simd = distance_calculator.calculate_simd(&a, &b);
+        let distance_scalar = distance_calculator.calculate_scalar(&a, &b);
+
+        // assert
+        assert_eq!(known_product, distance_simd);
+        assert_eq!(known_product, distance_scalar);
+    }
+
+    #[test]
+    fn test_simd_calls_are_independent() {
+        // arrange: 13 elements exercise the 8-lane, 4-lane and scalar paths
+        let a = [1.0_f32; 13];
+        let b = [2.0_f32; 13];
+
+        let mut distance_calculator = DotProductSimilarityCalculator::new();
+
+        // act
+        let first = distance_calculator.calculate_simd(&a, &b);
+        let second = distance_calculator.calculate_simd(&a, &b);
+
+        // assert
+        assert_eq!(26.0, first);
+        assert_eq!(first, second);
+    }
+
+    #[test]
+    fn test_dot_product_similarity_consistency() {
+        // arrange
+        let a = generate_random_vector(128);
+        let b = generate_random_vector(128);
+
+        let mut distance_calculator = DotProductSimilarityCalculator::new();
+        let epsilon = 1e-5;
+
+        // act
+        let distance_simd = distance_calculator.calculate_simd(&a, &b);
+        let distance_scalar = distance_calculator.calculate_scalar(&a, &b);
+
+        // assert
+        assert!((distance_simd - distance_scalar).abs() < epsilon);
+    }
+}
diff --git a/rs/utils/src/lib.rs b/rs/utils/src/lib.rs
index 70c0491..56991c2 100644
--- a/rs/utils/src/lib.rs
+++ b/rs/utils/src/lib.rs
@@ -4,6 +4,7 @@ pub mod kmeans_builder;
 pub mod distance;
 pub mod mem;
 pub mod test_utils;
+pub mod dot_product_similarity;
 
 pub trait DistanceCalculator {
     /// Compute distance between two vectors.
From 1909ba848041cf44f2941f0ec3779c97a8c620cb Mon Sep 17 00:00:00 2001
From: Anh Pham
Date: Sun, 17 Nov 2024 14:33:28 -0800
Subject: [PATCH 2/3] merge latest changes and add benchmark test for Dot Product Similarity

---
 rs/utils/benches/dot_product_similarity.rs    | 41 +++++++++++++++++++
 .../{ => distance}/dot_product_similarity.rs  |  0
 2 files changed, 41 insertions(+)
 create mode 100644 rs/utils/benches/dot_product_similarity.rs
 rename rs/utils/src/{ => distance}/dot_product_similarity.rs (100%)

diff --git a/rs/utils/benches/dot_product_similarity.rs b/rs/utils/benches/dot_product_similarity.rs
new file mode 100644
index 0000000..4c214f4
--- /dev/null
+++ b/rs/utils/benches/dot_product_similarity.rs
@@ -0,0 +1,41 @@
+use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
+use utils::distance::dot_product_similarity::DotProductSimilarityCalculator;
+use utils::test_utils::generate_random_vector;
+use utils::DistanceCalculator;
+
+/// Benchmarks `calculate_scalar`, `calculate_simd` and `calculate` for a range
+/// of vector lengths.
+fn bench_dot_product_similarity(c: &mut Criterion) {
+    let mut calculator = DotProductSimilarityCalculator::new();
+    let mut group = c.benchmark_group("Dot Product Similarity");
+
+    let sizes = [
+        8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096,
+        384,  // VECTOR_DIM_SENTENCE_TRANSFORMERS_MINI_LM
+        768,  // VECTOR_DIM_SENTENCE_TRANSFORMERS_MPNET
+        1536, // VECTOR_DIM_OPENAI_SMALL
+        3072, // VECTOR_DIM_OPENAI_LARGE
+    ];
+
+    for size in sizes {
+        let lhs = generate_random_vector(size);
+        let rhs = generate_random_vector(size);
+
+        group.bench_with_input(BenchmarkId::new("Scalar", size), &size, |bench, _| {
+            bench.iter(|| calculator.calculate_scalar(black_box(&lhs), black_box(&rhs)))
+        });
+
+        group.bench_with_input(BenchmarkId::new("SIMD", size), &size, |bench, _| {
+            bench.iter(|| calculator.calculate_simd(black_box(&lhs), black_box(&rhs)))
+        });
+
+        group.bench_with_input(BenchmarkId::new("Calculate", size), &size, |bench, _| {
+            bench.iter(|| calculator.calculate(black_box(&lhs), black_box(&rhs)))
+        });
+    }
+
+    group.finish();
+}
+
+criterion_group!(benches, bench_dot_product_similarity);
+criterion_main!(benches);
\ No newline at end of file
diff --git a/rs/utils/src/dot_product_similarity.rs b/rs/utils/src/distance/dot_product_similarity.rs
similarity index 100%
rename from rs/utils/src/dot_product_similarity.rs
rename to rs/utils/src/distance/dot_product_similarity.rs

From 601edf469f08c32bff153d563586d83095d0efc6 Mon Sep 17 00:00:00 2001
From: Anh Pham
Date: Sun, 17 Nov 2024 14:35:10 -0800
Subject: [PATCH 3/3] chore: add rebased changes

---
 rs/utils/Cargo.toml          | 4 ++++
 rs/utils/src/distance/mod.rs | 1 +
 rs/utils/src/lib.rs          | 1 -
 3 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/rs/utils/Cargo.toml b/rs/utils/Cargo.toml
index 0e60911..40b377d 100644
--- a/rs/utils/Cargo.toml
+++ b/rs/utils/Cargo.toml
@@ -20,6 +20,10 @@ rayon.workspace = true
 name = "l2"
 harness = false
 
+[[bench]]
+name = "dot_product_similarity"
+harness = false
+
 [[bench]]
 name = "kmeans"
 harness = false
diff --git a/rs/utils/src/distance/mod.rs b/rs/utils/src/distance/mod.rs
index a9f3202..0b1eedd 100644
--- a/rs/utils/src/distance/mod.rs
+++ b/rs/utils/src/distance/mod.rs
@@ -1 +1,2 @@
 pub mod l2;
+pub mod dot_product_similarity;
diff --git a/rs/utils/src/lib.rs b/rs/utils/src/lib.rs
index 56991c2..70c0491 100644
--- a/rs/utils/src/lib.rs
+++ b/rs/utils/src/lib.rs
@@ -4,7 +4,6 @@ pub mod kmeans_builder;
 pub mod distance;
 pub mod mem;
 pub mod test_utils;
-pub mod dot_product_similarity;
 
 pub trait DistanceCalculator {
     /// Compute distance between two vectors.