Skip to content

Commit

Permalink
[BUG] Fix HNSW resize conditions to exclude deleted items (#3424)
Browse files Browse the repository at this point in the history
## Description of changes

We were seeing errors in compaction that looked like:

```
WriteSegments(ApplyMaterializatedLogsError(HnswIndex(FFIException(\"The number of elements exceeds the specified limit\"))))
```
This is because the resize logic uses `len` which removes deleted
elements which then causes the conditional for resizing to be skipped.
This adds a new method `len_with_deleted` to return the length of the
index including the elements that have been deleted so that the resize
is not skipped. See:
https://github.com/chroma-core/chroma/blob/27564afa0e82d184a0fa853a7384d00861d8e9dc/rust/worker/src/segment/distributed_hnsw_segment.rs#L212-L222

## Test plan
A test was added. It will fail when using the existing `len` method, but
passes when using the newly added `len_with_deleted` method.

- [x] Tests pass locally with `pytest` for python, `yarn test` for js,
`cargo test` for rust

## Documentation Changes
N/A

Thanks to @sanketkedia for helping root cause this. 

Fixes chroma-core/hosted-chroma#855
  • Loading branch information
eculver authored and github-actions[bot] committed Jan 7, 2025
1 parent 6145277 commit 514bda0
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 1 deletion.
13 changes: 13 additions & 0 deletions rust/index/bindings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -430,6 +430,7 @@ extern "C"
}

// Can not throw std::exception
// Note: includes deleted items
int len(Index<float> *index)
{
if (!index->index_inited)
Expand All @@ -440,6 +441,18 @@ extern "C"
return index->appr_alg->getCurrentElementCount() - index->appr_alg->getDeletedCount();
}

// Can not throw std::exception
// Note: does not include deleted items
int len_with_deleted(Index<float> *index)
{
if (!index->index_inited)
{
return 0;
}

return index->appr_alg->getCurrentElementCount();
}

// Can not throw std::exception
size_t capacity(Index<float> *index)
{
Expand Down
56 changes: 55 additions & 1 deletion rust/index/src/hnsw.rs
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,11 @@ impl HnswIndex {
// Does not return an error
}

pub fn len_with_deleted(&self) -> usize {
unsafe { len_with_deleted(self.ffi_ptr) as usize }
// Does not return an error
}

pub fn is_empty(&self) -> bool {
self.len() == 0
}
Expand Down Expand Up @@ -397,9 +402,9 @@ extern "C" {

#[cfg(test)]
fn get_ef(index: *const IndexPtrFFI) -> c_int;

fn set_ef(index: *const IndexPtrFFI, ef: c_int);
fn len(index: *const IndexPtrFFI) -> c_int;
fn len_with_deleted(index: *const IndexPtrFFI) -> c_int;
fn capacity(index: *const IndexPtrFFI) -> c_int;
fn resize_index(index: *const IndexPtrFFI, new_size: usize);
fn get_last_error(index: *const IndexPtrFFI) -> *const c_char;
Expand Down Expand Up @@ -925,4 +930,53 @@ pub mod test {
.to_string()
.contains("HNSW Integrity failure"))
}

#[test]
fn it_can_resize_correctly() {
let n: usize = 10;
let d: usize = 960;
let distance_function = DistanceFunction::Euclidean;
let tmp_dir = tempdir().unwrap();
let persist_path = tmp_dir.path().to_str().unwrap().to_string();
let id = Uuid::new_v4();
let index = HnswIndex::init(
&IndexConfig {
dimensionality: d as i32,
distance_function: distance_function.clone(),
},
Some(&HnswIndexConfig {
max_elements: n,
m: 32,
ef_construction: 100,
ef_search: 100,
random_seed: 0,
persist_path: persist_path.clone(),
}),
IndexUuid(id),
);

let mut index = match index {
Err(e) => panic!("Error initializing index: {}", e),
Ok(index) => index,
};

let data: Vec<f32> = utils::generate_random_data(n, d);
let ids: Vec<usize> = (0..n).collect();

(0..n).for_each(|i| {
let data = &data[i * d..(i + 1) * d];
index.add(ids[i], data).expect("Should not error");
});

index.delete(0).unwrap();
let data = &data[d..2 * d];

let index_len = index.len_with_deleted();
let index_capacity = index.capacity();
if index_len + 1 > index_capacity {
index.resize(index_capacity * 2).unwrap();
}
// this will fail if the index is not resized correctly
index.add(100, data).unwrap();
}
}

0 comments on commit 514bda0

Please sign in to comment.