Skip to content

Commit

Permalink
implement DistanceCalculator trait for DotProduct (#218)
Browse files Browse the repository at this point in the history
* simd dot_product with benchmark

* inline dot_product functions
  • Loading branch information
thinh2 authored Jan 3, 2025
1 parent 4ab2737 commit 0e33ac3
Show file tree
Hide file tree
Showing 5 changed files with 139 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,6 @@
flamegraph.svg
perf.data*
.idea
.DS_store
.venv
*/__pycache__/
4 changes: 4 additions & 0 deletions rs/utils/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ harness = false
name = "kmeans"
harness = false

[[bench]]
name = "dot_product"
harness = false

[[bin]]
name = "run_kmeans"
path = "src/scripts/run_kmeans.rs"
37 changes: 37 additions & 0 deletions rs/utils/benches/dot_product.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@

use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
use utils::test_utils::generate_random_vector;
use utils::distance::dot_product::DotProductDistanceCalculator;
use utils::DistanceCalculator;

/// Benchmarks the SIMD (`calculate`) and scalar (`calculate_scalar`) dot-product
/// implementations against each other for a range of vector lengths.
///
/// Every length gets its own pair of random input vectors, and both variants
/// are measured on that same pair, under the benchmark ids `simd/<size>` and
/// `scalar/<size>` inside the `dot_product` group.
fn benches_dot_product(c: &mut Criterion) {
    let mut group = c.benchmark_group("dot_product");

    for &size in [
        8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096,
        384,  // VECTOR_DIM_SENTENCE_TRANSFORMERS_MINI_LM
        768,  // VECTOR_DIM_SENTENCE_TRANSFORMERS_MPNET
        1536, // VECTOR_DIM_OPENAI_SMALL
        3072, // VECTOR_DIM_OPENAI_LARGE
    ]
    .iter()
    {
        let a = generate_random_vector(size);
        let b = generate_random_vector(size);

        group.bench_with_input(BenchmarkId::new("simd", size), &size, |bench, _| {
            // The closure returns the distance (no trailing `;`) so that criterion
            // black-boxes it; a discarded result would let the optimizer remove
            // the very computation being measured.
            bench.iter(|| DotProductDistanceCalculator::calculate(black_box(&a), black_box(&b)));
        });

        group.bench_with_input(BenchmarkId::new("scalar", size), &size, |bench, _| {
            // Same reasoning as above: keep the result alive by returning it.
            bench.iter(|| {
                DotProductDistanceCalculator::calculate_scalar(black_box(&a), black_box(&b))
            });
        });
    }

    group.finish();
}

criterion_group!(benches, benches_dot_product);
criterion_main!(benches);
96 changes: 96 additions & 0 deletions rs/utils/src/distance/dot_product.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
use crate::DistanceCalculator;
use std::{ops::AddAssign, simd::{num::SimdFloat, LaneCount, Simd, SupportedLaneCount}};

/// Field-less type that groups the dot-product distance routines as
/// associated functions.
pub struct DotProductDistanceCalculator {}

impl DotProductDistanceCalculator {
    /// Scalar reference implementation: the negated dot product of `a` and `b`.
    ///
    /// Products are accumulated one element at a time, in index order, over the
    /// first `a.len()` positions; any extra trailing elements of `b` are ignored.
    ///
    /// # Panics
    ///
    /// Panics if `b` is shorter than `a`.
    pub fn calculate_scalar(a: &[f32], b: &[f32]) -> f32 {
        // Restrict `b` to the positions that `a` covers. This is also where a
        // too-short `b` is rejected.
        let b_head = &b[..a.len()];
        let dot = a
            .iter()
            .zip(b_head)
            .fold(0.0_f32, |sum, (x, y)| sum + x * y);
        Self::neg_score(dot)
    }

    /// Turns a raw dot product into a distance by flipping its sign.
    ///
    /// Throughout this code a lower distance means a greater similarity between
    /// two vectors. A dot product behaves the other way round: two vectors
    /// pointing in the same direction yield the largest value. Negating the dot
    /// product makes it follow the "lower is more similar" convention.
    #[inline(always)]
    pub fn neg_score(x: f32) -> f32 {
        -x
    }
}

impl DistanceCalculator for DotProductDistanceCalculator {
    /// Computes the negated dot product of `a` and `b`, using SIMD where possible.
    ///
    /// The slices are consumed in up to three passes of 16, 8 and 4 lanes. Each
    /// pass processes every full chunk of its width and hands the leftover tail
    /// to the next, narrower pass. The final tail (fewer than 4 elements) is
    /// summed with a plain scalar loop. The total is then negated via
    /// `neg_score`, so that a lower value means a greater similarity.
    ///
    /// NOTE(review): presumably `a` and `b` always have the same length; that is
    /// not validated here — confirm against callers.
    #[inline(always)]
    fn calculate(a: &[f32], b: &[f32]) -> f32 {
        let mut res = 0.0;
        let mut a_vec = a;
        let mut b_vec = b;

        // The guards use `>=`, not `>`: a slice of exactly LANES elements still
        // contains one full chunk and should take the widest pass available
        // instead of falling through to a narrower one (or to the scalar loop).
        if a_vec.len() >= 16 {
            let mut accumulator = Simd::<f32, 16>::splat(0.0);
            Self::accumulate_lanes::<16>(a_vec, b_vec, &mut accumulator);
            res += accumulator.reduce_sum();
            // Continue with whatever did not fill a whole 16-element chunk.
            a_vec = a_vec.chunks_exact(16).remainder();
            b_vec = b_vec.chunks_exact(16).remainder();
        }

        if a_vec.len() >= 8 {
            let mut accumulator = Simd::<f32, 8>::splat(0.0);
            Self::accumulate_lanes::<8>(a_vec, b_vec, &mut accumulator);
            res += accumulator.reduce_sum();
            a_vec = a_vec.chunks_exact(8).remainder();
            b_vec = b_vec.chunks_exact(8).remainder();
        }

        if a_vec.len() >= 4 {
            let mut accumulator = Simd::<f32, 4>::splat(0.0);
            Self::accumulate_lanes::<4>(a_vec, b_vec, &mut accumulator);
            res += accumulator.reduce_sum();
            a_vec = a_vec.chunks_exact(4).remainder();
            b_vec = b_vec.chunks_exact(4).remainder();
        }

        // Scalar tail: fewer than 4 elements remain at this point.
        for i in 0..a_vec.len() {
            res += a_vec[i] * b_vec[i];
        }
        Self::neg_score(res)
    }

    /// Adds the lane-wise products of `a` and `b` into `accumulator`.
    ///
    /// Both slices are walked in lock-step in chunks of exactly `LANES`
    /// elements. Elements that do not fill a complete chunk are skipped, so the
    /// caller is responsible for processing the remainder.
    #[inline(always)]
    fn accumulate_lanes<const LANES: usize>(
        a: &[f32],
        b: &[f32],
        accumulator: &mut Simd<f32, LANES>,
    ) where
        LaneCount<LANES>: SupportedLaneCount,
    {
        a.chunks_exact(LANES)
            .zip(b.chunks_exact(LANES))
            .for_each(|(a_chunk, b_chunk)| {
                let a_simd = Simd::<f32, LANES>::from_slice(a_chunk);
                let b_simd = Simd::<f32, LANES>::from_slice(b_chunk);
                accumulator.add_assign(a_simd * b_simd);
            });
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::test_utils::generate_random_vector;

    /// The SIMD implementation must agree with the scalar reference
    /// implementation (within a small absolute tolerance) on random
    /// 128-element vectors.
    #[test]
    fn test_dot_product_distance_calculator() {
        const EPSILON: f32 = 1e-5;

        let lhs = generate_random_vector(128);
        let rhs = generate_random_vector(128);

        let simd = DotProductDistanceCalculator::calculate(&lhs, &rhs);
        let scalar = DotProductDistanceCalculator::calculate_scalar(&lhs, &rhs);

        let difference = (simd - scalar).abs();
        assert!(difference < EPSILON);
    }
}
1 change: 1 addition & 0 deletions rs/utils/src/distance/mod.rs
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
pub mod l2;
pub mod dot_product;

0 comments on commit 0e33ac3

Please sign in to comment.