Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DRAFT] Implement Dot Product Similarity Distance algorithm #120

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions rs/utils/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@ rayon.workspace = true
name = "l2"
harness = false

[[bench]]
name = "dot_product_similarity"
harness = false

[[bench]]
name = "kmeans"
harness = false
Expand Down
38 changes: 38 additions & 0 deletions rs/utils/benches/dot_product_similarity.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
use utils::distance::dot_product_similarity::DotProductSimilarityCalculator;
use utils::test_utils::generate_random_vector;
use utils::DistanceCalculator;

/// Benchmarks the three dot-product entry points (`calculate_scalar`,
/// `calculate_simd`, and the dispatching `calculate`) for each vector size in
/// the list below, using one pair of random vectors per size.
fn bench_dot_product_similarity(c: &mut Criterion) {
    let mut group = c.benchmark_group("Dot Product Similarity");
    let mut calculator = DotProductSimilarityCalculator::new();

    // Powers of two first, followed by the embedding dimensions named inline.
    let dimensions = [
        8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096,
        384,  // VECTOR_DIM_SENTENCE_TRANSFORMERS_MINI_LM
        768,  // VECTOR_DIM_SENTENCE_TRANSFORMERS_MPNET
        1536, // VECTOR_DIM_OPENAI_SMALL
        3072, // VECTOR_DIM_OPENAI_LARGE
    ];

    for &dim in dimensions.iter() {
        // The same two vectors are shared by all three benchmarks of this size.
        let lhs = generate_random_vector(dim);
        let rhs = generate_random_vector(dim);

        group.bench_with_input(BenchmarkId::new("Scalar", dim), &dim, |bencher, _| {
            bencher.iter(|| calculator.calculate_scalar(black_box(&lhs), black_box(&rhs)))
        });

        group.bench_with_input(BenchmarkId::new("SIMD", dim), &dim, |bencher, _| {
            bencher.iter(|| calculator.calculate_simd(black_box(&lhs), black_box(&rhs)))
        });

        group.bench_with_input(BenchmarkId::new("Calculate", dim), &dim, |bencher, _| {
            bencher.iter(|| calculator.calculate(black_box(&lhs), black_box(&rhs)))
        });
    }

    group.finish();
}

// Collect the benchmark function into the `benches` group and hand that group
// to Criterion's entry-point macro (the bench target is declared with
// `harness = false`, so the default test harness is not used).
criterion_group!(benches, bench_dot_product_similarity);
criterion_main!(benches);
128 changes: 128 additions & 0 deletions rs/utils/src/distance/dot_product_similarity.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
use std::simd::{f32x4, f32x8};
use std::simd::num::SimdFloat;

use crate::{DistanceCalculator};

/// Dot-product calculator that keeps its partial sums as fields.
///
/// * `dist_simd_8` – per-lane sums produced by the 8-wide pass.
/// * `dist_simd_4` – per-lane sums produced by the 4-wide pass.
/// * `dist_simd_1` – sum of the elements handled one at a time.
///
/// `Default` yields all-zero accumulators, the same state `new()` builds.
#[derive(Debug, Clone, Default)]
pub struct DotProductSimilarityCalculator {
    dist_simd_8: f32x8,
    dist_simd_4: f32x4,
    dist_simd_1: f32,
}

/// Labels for the ways a dot product can be computed.
///
/// NOTE(review): nothing in this file reads this enum; presumably it is meant
/// for selecting an implementation elsewhere — confirm against callers.
#[derive(Clone, Debug, PartialEq)]
pub enum DotProductSimilarityCalculatorImpl {
    /// Element-by-element computation.
    Scalar,
    /// Computation using SIMD vectors.
    SIMD,
    /// Streaming variant that uses SIMD vectors.
    StreamingWithSIMD,
}

impl DotProductSimilarityCalculator {
/// Creates a calculator whose three accumulators all start at zero.
pub fn new() -> Self {
    let dist_simd_8 = f32x8::splat(0.0);
    let dist_simd_4 = f32x4::splat(0.0);
    let dist_simd_1 = 0.0;

    Self {
        dist_simd_8,
        dist_simd_4,
        dist_simd_1,
    }
}

/// Returns the dot product of `a` and `b` computed one element at a time.
///
/// Elements are paired by position; pairing stops at the end of the shorter
/// slice, so any surplus elements in the longer one are ignored. The
/// calculator's accumulators are not touched.
pub fn calculate_scalar(&self, a: &[f32], b: &[f32]) -> f32 {
    // Lazily form each pairwise product, then let `sum` add them up in order.
    let products = a.iter().zip(b).map(|(lhs, rhs)| lhs * rhs);
    products.sum()
}

/// Adds the element-wise products of `a` and `b` into the calculator's
/// accumulators: full 8-element chunks go into `dist_simd_8`, remaining
/// 4-element chunks into `dist_simd_4`, and the leftover elements into
/// `dist_simd_1`. The accumulators are added to, never cleared, in here.
///
/// Every loop bound is derived from `a.len()` only.
/// NOTE(review): if `b` is shorter than `a`, the slicing / `from_slice` /
/// indexing on `b` looks like it will panic — confirm callers always pass
/// equal-length slices.
fn multiply(&mut self, a: &[f32], b: &[f32]) {
// `i` is the index of the first element not yet consumed.
let mut i: usize = 0;
// Chunk width of the current pass; starts at the lane count of `dist_simd_8`.
let mut step: usize = self.dist_simd_8.len();

// 8-wide pass: runs while a full chunk of `a` remains.
while i + step <= a.len() {
let a_slice = f32x8::from_slice(&a[i..]);
let b_slice = f32x8::from_slice(&b[i..]);
self.dist_simd_8 += a_slice * b_slice;
i += step;
}
Comment on lines +40 to +45
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
while i + step <= a.len() {
let a_slice = f32x8::from_slice(&a[i..]);
let b_slice = f32x8::from_slice(&b[i..]);
self.dist_simd_8 += a_slice * b_slice;
i += step;
}
while i + 16 <= a.len() && i + 16 < b.len(){
let a_slice = f32x16::from_slice(&a[i..i + 16]);
let b_slice = f32x16::from_slice(&b[i..i + 16]);
self.dist_simd_16 += a_slice * b_slice;
i += step;
}


// 4-wide pass over whatever the 8-wide pass left behind.
step = self.dist_simd_4.len();
while i + step <= a.len() {
let a_slice = f32x4::from_slice(&a[i..]);
let b_slice = f32x4::from_slice(&b[i..]);
self.dist_simd_4 += a_slice * b_slice;
i += step;
}

// Scalar tail: the remaining elements that do not fill a 4-wide chunk.
for j in i..a.len() {
self.dist_simd_1 += a[j] * b[j];
}
}

/// Collapses the accumulators into one value: the lane sum of `dist_simd_8`,
/// plus the lane sum of `dist_simd_4`, plus `dist_simd_1`.
fn accumulate(&self) -> f32 {
    let eight_wide_total = self.dist_simd_8.reduce_sum();
    let four_wide_total = self.dist_simd_4.reduce_sum();

    eight_wide_total + four_wide_total + self.dist_simd_1
}

pub fn calculate_simd(&mut self, a: &[f32], b: &[f32]) -> f32 {
self.multiply(a, b);
let res = self.accumulate();
res
}


}

impl DistanceCalculator for DotProductSimilarityCalculator {
    /// Computes the dot product of `a` and `b`, choosing the code path from
    /// the length of `a`: fewer than 32 elements use `calculate_scalar`,
    /// anything longer uses `calculate_simd`.
    fn calculate(&mut self, a: &[f32], b: &[f32]) -> f32 {
        // Shortest input (in elements) that is routed to the SIMD path.
        const SIMD_MIN_LEN: usize = 32;

        if a.len() >= SIMD_MIN_LEN {
            return self.calculate_simd(a, b);
        }
        self.calculate_scalar(a, b)
    }
}


// Tests

#[cfg(test)]
mod tests {
    use super::*;
    use crate::test_utils::generate_random_vector;

    /// Three-element vectors with a hand-written expected value: both the SIMD
    /// and the scalar path must reproduce it exactly.
    #[test]
    fn test_basic_dot_product_calculation() {
        // Inputs and the expected result spelled out term by term.
        let lhs = [0.0, 3.0, -5.0];
        let rhs = [-3.0, 9.0, 8.0];
        let expected = 0.0 * -3.0 + 3.0 * 9.0 + -5.0 * 8.0;

        // Run both implementations on a fresh calculator.
        let mut calculator = DotProductSimilarityCalculator::new();
        let via_simd = calculator.calculate_simd(&lhs, &rhs);
        let via_scalar = calculator.calculate_scalar(&lhs, &rhs);

        assert_eq!(expected, via_simd);
        assert_eq!(expected, via_scalar);
    }

    /// On random 128-element vectors the SIMD and scalar results must agree to
    /// within a small tolerance.
    #[test]
    fn test_dot_product_similarity_consistency() {
        let tolerance = 1e-5;
        let lhs = generate_random_vector(128);
        let rhs = generate_random_vector(128);

        let mut calculator = DotProductSimilarityCalculator::new();
        let via_simd = calculator.calculate_simd(&lhs, &rhs);
        let via_scalar = calculator.calculate_scalar(&lhs, &rhs);

        let gap = (via_simd - via_scalar).abs();
        assert!(gap < tolerance);
    }
}

1 change: 1 addition & 0 deletions rs/utils/src/distance/mod.rs
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
pub mod l2;
pub mod dot_product_similarity;