Skip to content

Commit

Permalink
Change ids type to u128 to support UUID
Browse files Browse the repository at this point in the history
  • Loading branch information
hicder committed Jan 17, 2025
1 parent 863539e commit 3dfbb1e
Show file tree
Hide file tree
Showing 29 changed files with 242 additions and 188 deletions.
4 changes: 2 additions & 2 deletions rs/index/src/collection/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -149,11 +149,11 @@ impl Collection {
})
}

pub fn insert(&self, doc_id: u64, data: &[f32]) -> Result<()> {
pub fn insert(&self, doc_id: u128, data: &[f32]) -> Result<()> {
self.mutable_segment.write().unwrap().insert(doc_id, data)
}

pub fn insert_for_users(&self, user_ids: &[u64], doc_id: u64, data: &[f32]) -> Result<()> {
pub fn insert_for_users(&self, user_ids: &[u128], doc_id: u128, data: &[f32]) -> Result<()> {
for user_id in user_ids {
self.mutable_segment
.write()
Expand Down
4 changes: 2 additions & 2 deletions rs/index/src/collection/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,8 @@ mod tests {
for i in 0..num_vectors {
builder
.insert(
(i % 5) as u64,
i as u64,
(i % 5) as u128,
i as u128,
&generate_random_vector(num_features),
)
.unwrap();
Expand Down
6 changes: 3 additions & 3 deletions rs/index/src/collection/snapshot.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ impl Snapshot {

pub fn search_for_ids(
&self,
ids: &[u64],
ids: &[u128],
query: &[f32],
k: usize,
ef_construction: u32,
Expand All @@ -57,7 +57,7 @@ impl Snapshot {
impl Searchable for Snapshot {
fn search_with_id(
&self,
id: u64,
id: u128,
query: &[f32],
k: usize,
ef_construction: u32,
Expand Down Expand Up @@ -86,7 +86,7 @@ impl Searchable for Snapshot {
ef_construction: u32,
context: &mut SearchContext,
) -> Option<Vec<IdWithScore>> {
self.search_with_id(0 as u64, query, k, ef_construction, context)
self.search_with_id(0u128, query, k, ef_construction, context)
}
}

Expand Down
14 changes: 7 additions & 7 deletions rs/index/src/hnsw/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ use quantization::typing::VectorOps;
use rand::Rng;

use super::index::Hnsw;
use super::utils::{BuilderContext, GraphTraversal, PointAndDistance};
use crate::utils::SearchContext;
use super::utils::{BuilderContext, GraphTraversal};
use crate::utils::{PointAndDistance, SearchContext};
use crate::vector::file::FileBackedAppendableVectorStorage;
use crate::vector::{VectorStorage, VectorStorageConfig};

Expand Down Expand Up @@ -56,7 +56,7 @@ pub struct HnswBuilder<Q: Quantizer> {
ef_contruction: u32,
pub entry_point: Vec<u32>,
max_layer: u8,
pub doc_id_mapping: Vec<u64>,
pub doc_id_mapping: Vec<u128>,
}

// TODO(hicder): support bare vector in addition to quantized one.
Expand Down Expand Up @@ -171,7 +171,7 @@ impl<Q: Quantizer> HnswBuilder<Q> {
}
}

fn generate_id(&mut self, doc_id: u64) -> u32 {
fn generate_id(&mut self, doc_id: u128) -> u32 {
let generated_id = self.doc_id_mapping.len() as u32;
self.doc_id_mapping.push(doc_id);
return generated_id;
Expand Down Expand Up @@ -215,7 +215,7 @@ impl<Q: Quantizer> HnswBuilder<Q> {
{
continue;
}
queue.push_back(edge.point_id);
queue.push_back(edge.point_id as u32);
if assigned_ids[edge.point_id as usize] < 0 {
assigned_ids[edge.point_id as usize] = *current_id;
*current_id += 1;
Expand Down Expand Up @@ -300,7 +300,7 @@ impl<Q: Quantizer> HnswBuilder<Q> {
}

/// Insert a vector into the index
pub fn insert(&mut self, doc_id: u64, vector: &[f32]) -> Result<()> {
pub fn insert(&mut self, doc_id: u128, vector: &[f32]) -> Result<()> {
let quantized_query = Q::QuantizedT::process_vector(vector, &self.quantizer);
let point_id = self.generate_id(doc_id);
let mut context = BuilderContext::new(point_id + 1);
Expand All @@ -326,7 +326,7 @@ impl<Q: Quantizer> HnswBuilder<Q> {
for l in ((layer + 1)..=self.current_top_layer).rev() {
let nearest_elements =
self.search_layer(&mut context, &quantized_query, entry_point, 1, l);
entry_point = nearest_elements[0].point_id;
entry_point = nearest_elements[0].point_id as u32;
}
} else if layer > self.current_top_layer {
// Initialize the layers
Expand Down
6 changes: 3 additions & 3 deletions rs/index/src/hnsw/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ impl<Q: Quantizer> Hnsw<Q> {
}
}

fn map_point_id_to_doc_id(&self, point_ids: &[u32]) -> Vec<u64> {
fn map_point_id_to_doc_id(&self, point_ids: &[u32]) -> Vec<u128> {
let doc_id_mapping = self.get_doc_id_mapping_slice();
point_ids
.iter()
Expand Down Expand Up @@ -157,7 +157,7 @@ impl<Q: Quantizer> Hnsw<Q> {
}

/// Returns the doc_id_mapping slice
pub fn get_doc_id_mapping_slice(&self) -> &[u64] {
pub fn get_doc_id_mapping_slice(&self) -> &[u128] {
let start = self.doc_id_mapping_offset;
let slice = &self.mmap[start..start + self.header.doc_id_mapping_len as usize];
return utils::mem::transmute_u8_to_slice(slice);
Expand Down Expand Up @@ -215,7 +215,7 @@ impl<Q: Quantizer> Hnsw<Q> {
}

#[cfg(test)]
pub fn get_doc_id_test(&self, point_ids: &[u32]) -> Vec<u64> {
pub fn get_doc_id_test(&self, point_ids: &[u32]) -> Vec<u128> {
let doc_id_mapping = self.get_doc_id_mapping_slice();
point_ids
.iter()
Expand Down
11 changes: 8 additions & 3 deletions rs/index/src/hnsw/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,12 @@ impl HnswReader {
let edge_offsets_offset =
points_offset + header.points_len as usize + edge_offsets_padding as usize;
let level_offsets_offset = edge_offsets_offset + header.edge_offsets_len as usize;
let doc_id_mapping_offset = level_offsets_offset + header.level_offsets_len as usize;

let doc_id_mapping_padding =
(16 - ((level_offsets_offset + header.level_offsets_len as usize) % 16)) % 16;
let doc_id_mapping_offset = level_offsets_offset
+ header.level_offsets_len as usize
+ doc_id_mapping_padding as usize;

Ok(Hnsw::new(
backing_file,
Expand Down Expand Up @@ -167,7 +172,7 @@ mod tests {
10, 128, 20, 1024, 4096, 16, pq, vector_dir,
);
for i in 0..datapoints.len() {
hnsw_builder.insert(i as u64, &datapoints[i]).unwrap();
hnsw_builder.insert(i as u128, &datapoints[i]).unwrap();
}

let hnsw_dir = format!("{}/hnsw", base_directory);
Expand Down Expand Up @@ -201,7 +206,7 @@ mod tests {
let mut hnsw_builder =
HnswBuilder::new(10, 128, 20, 1024, 4096, 128, quantizer, vector_dir);
for i in 0..datapoints.len() {
hnsw_builder.insert(i as u64, &datapoints[i]).unwrap();
hnsw_builder.insert(i as u128, &datapoints[i]).unwrap();
}

let hnsw_dir = format!("{}/hnsw", base_directory);
Expand Down
11 changes: 2 additions & 9 deletions rs/index/src/hnsw/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use bit_vec::BitVec;
use ordered_float::NotNan;
use quantization::quantization::Quantizer;

use crate::utils::TraversalContext;
use crate::utils::{PointAndDistance, TraversalContext};
/// Traversal context used on the HNSW build path (constructed by the builder's
/// `insert` and passed to `search_layer`).
pub struct BuilderContext {
// Bit vector of visited flags.
// NOTE(review): presumably indexed by point id, one bit per point — confirm against `new`.
visited: BitVec,
}
Expand Down Expand Up @@ -33,13 +33,6 @@ impl TraversalContext for BuilderContext {
/// No-op: the builder context ignores `_page_id` (the body is empty).
fn record_pages(&mut self, _page_id: String) {}
}

/// A point and its distance to the query.
#[derive(PartialEq, Eq, Ord, PartialOrd, Clone, Debug)]
pub struct PointAndDistance {
pub point_id: u32,
pub distance: NotNan<f32>,
}

/// Graph traversal logic, factored out because it is used in both the indexing and query paths.
pub trait GraphTraversal<Q: Quantizer> {
type ContextT: TraversalContext;
Expand Down Expand Up @@ -78,7 +71,7 @@ pub trait GraphTraversal<Q: Quantizer> {

while !candidates.is_empty() {
let point_and_distance = candidates.pop().unwrap();
let point_id = point_and_distance.point_id;
let point_id = point_and_distance.point_id as u32;
let distance: f32 = -*point_and_distance.distance;

let mut furthest_element_from_working_list = working_list.peek().unwrap();
Expand Down
9 changes: 8 additions & 1 deletion rs/index/src/hnsw/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,12 @@ impl<Q: Quantizer> HnswWriter<Q> {
}
written += append_file_to_writer(&edge_offsets_path, &mut combined_buffer_writer)?;
written += append_file_to_writer(&level_offsets_path, &mut combined_buffer_writer)?;

padding = 16 - (written % 16);
if padding != 16 {
let padding_buffer = vec![0; padding];
written += wrap_write(&mut combined_buffer_writer, &padding_buffer)?;
}
written += append_file_to_writer(&doc_id_mapping_path, &mut combined_buffer_writer)?;

combined_buffer_writer
Expand All @@ -275,7 +281,8 @@ mod tests {
use super::*;
use crate::hnsw::builder::Layer;
use crate::hnsw::reader::HnswReader;
use crate::hnsw::utils::{GraphTraversal, PointAndDistance};
use crate::hnsw::utils::GraphTraversal;
use crate::utils::PointAndDistance;

fn construct_layers(hnsw_builder: &mut HnswBuilder<ProductQuantizer<L2DistanceCalculator>>) {
// Prepare all layers
Expand Down
2 changes: 1 addition & 1 deletion rs/index/src/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ pub trait Searchable {
#[allow(unused_variables)]
fn search_with_id(
&self,
id: u64,
id: u128,
query: &[f32],
k: usize,
ef_construction: u32,
Expand Down
38 changes: 15 additions & 23 deletions rs/index/src/ivf/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ use utils::{ceil_div, CalculateSquared, DistanceCalculator};

use crate::posting_list::file::FileBackedAppendablePostingListStorage;
use crate::posting_list::PostingListStorage;
use crate::utils::PointAndDistance;
use crate::vector::file::FileBackedAppendableVectorStorage;
use crate::vector::VectorStorage;

Expand Down Expand Up @@ -44,16 +45,10 @@ pub struct IvfBuilder<D: DistanceCalculator + CalculateSquared + Send + Sync> {
vectors: AtomicRefCell<Box<dyn VectorStorage<f32> + Send + Sync>>,
centroids: AtomicRefCell<Box<dyn VectorStorage<f32> + Send + Sync>>,
posting_lists: Box<dyn for<'a> PostingListStorage<'a>>,
doc_id_mapping: Vec<u64>,
doc_id_mapping: Vec<u128>,
_marker: PhantomData<D>,
}

// TODO(tyb): maybe merge with HNSW's one
pub struct PointAndDistance {
pub point_id: usize,
pub distance: f32,
}

#[derive(Debug)]
struct PostingListInfo {
centroid: Vec<f32>,
Expand Down Expand Up @@ -198,7 +193,7 @@ impl<D: DistanceCalculator + CalculateSquared + Send + Sync> IvfBuilder<D> {
&self.vectors
}

pub fn doc_id_mapping(&self) -> &[u64] {
pub fn doc_id_mapping(&self) -> &[u128] {
&*self.doc_id_mapping
}

Expand All @@ -215,7 +210,7 @@ impl<D: DistanceCalculator + CalculateSquared + Send + Sync> IvfBuilder<D> {
}

/// Add a new vector to the dataset for training
pub fn add_vector(&mut self, doc_id: u64, data: &[f32]) -> Result<()> {
pub fn add_vector(&mut self, doc_id: u128, data: &[f32]) -> Result<()> {
self.vectors.borrow_mut().append(&data)?;
self.generate_id(doc_id)?;
Ok(())
Expand All @@ -233,7 +228,7 @@ impl<D: DistanceCalculator + CalculateSquared + Send + Sync> IvfBuilder<D> {
Ok(())
}

fn generate_id(&mut self, doc_id: u64) -> Result<u32> {
fn generate_id(&mut self, doc_id: u128) -> Result<u32> {
let generated_id = self.doc_id_mapping.len() as u32;
self.doc_id_mapping.push(doc_id);
Ok(generated_id)
Expand Down Expand Up @@ -270,10 +265,7 @@ impl<D: DistanceCalculator + CalculateSquared + Send + Sync> IvfBuilder<D> {
if dist.is_nan() {
println!("NAN found");
}
distances.push(PointAndDistance {
point_id: i,
distance: dist,
});
distances.push(PointAndDistance::new(dist, i as u32));
}
distances.select_nth_unstable_by(num_probes - 1, |a, b| a.distance.total_cmp(&b.distance));
distances.truncate(num_probes);
Expand Down Expand Up @@ -304,7 +296,7 @@ impl<D: DistanceCalculator + CalculateSquared + Send + Sync> IvfBuilder<D> {
// other value
let nearest_distance = nearest_centroids
.iter()
.map(|pad| pad.distance)
.map(|pad| pad.distance.into_inner())
.min_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Greater))
.expect("nearest_distance should not be None");
let mut accepted_centroid_ids = vec![];
Expand Down Expand Up @@ -810,7 +802,7 @@ mod tests {
// Add `num_vectors` one-dimensional f32 vectors; vector `i` is `[(i + 1) as f32]`
for i in 0..num_vectors {
builder
.add_vector(i as u64, &[(i + 1) as f32])
.add_vector(i as u128, &[(i + 1) as f32])
.expect("Vector should be added");
}

Expand Down Expand Up @@ -1035,7 +1027,7 @@ mod tests {

for i in 0..num_vectors {
builder
.add_vector(i as u64, &generate_random_vector(num_features))
.add_vector(i as u128, &generate_random_vector(num_features))
.expect("Vector should be added");
}

Expand Down Expand Up @@ -1108,7 +1100,7 @@ mod tests {

for i in 0..num_vectors {
builder
.add_vector(i as u64, &generate_random_vector(num_features))
.add_vector(i as u128, &generate_random_vector(num_features))
.expect("Vector should be added");
}

Expand Down Expand Up @@ -1181,7 +1173,7 @@ mod tests {

for i in 0..num_vectors {
builder
.add_vector(i as u64, &generate_random_vector(num_features))
.add_vector(i as u128, &generate_random_vector(num_features))
.expect("Vector should be added");
}

Expand Down Expand Up @@ -1276,7 +1268,7 @@ mod tests {

for i in 0..num_vectors {
builder
.add_vector(i as u64, &generate_random_vector(num_features))
.add_vector(i as u128, &generate_random_vector(num_features))
.expect("Vector should be added");
}

Expand Down Expand Up @@ -1354,7 +1346,7 @@ mod tests {

for i in 0..NUM_VECTORS {
builder
.add_vector(i as u64 + 100, &[i as f32])
.add_vector(i as u128 + 100, &[i as f32])
.expect("Vector should be added");
}

Expand Down Expand Up @@ -1384,7 +1376,7 @@ mod tests {
);
}

let expected_doc_ids: [u64; NUM_VECTORS] = [
let expected_doc_ids: [u128; NUM_VECTORS] = [
10, 14, 15, 1, 3, 5, 7, 9, 16, 18, 0, 2, 4, 6, 8, 20, 11, 12, 13, 21, 17, 19,
];

Expand Down Expand Up @@ -1429,7 +1421,7 @@ mod tests {
// Add `num_vectors` random f32 vectors, each generated with `num_features` as the size argument
for i in 0..num_vectors {
builder
.add_vector(i as u64, &generate_random_vector(num_features))
.add_vector(i as u128, &generate_random_vector(num_features))
.expect("Vector should be added");
}

Expand Down
Loading

0 comments on commit 3dfbb1e

Please sign in to comment.