From 0590639ce51958c7357ceb6f6e2c0541b5f9d1d3 Mon Sep 17 00:00:00 2001 From: tyb0807 Date: Fri, 3 Jan 2025 16:03:05 +0100 Subject: [PATCH] Add encode_value to encoder API (#264) --- rs/compression/src/compression.rs | 5 +- rs/compression/src/elias_fano/ef.rs | 115 ++++++++++++++++------------ rs/compression/src/noc/noc.rs | 9 ++- rs/index/src/ivf/reader.rs | 4 +- rs/index/src/ivf/writer.rs | 6 +- 5 files changed, 82 insertions(+), 57 deletions(-) diff --git a/rs/compression/src/compression.rs b/rs/compression/src/compression.rs index bd5fe1b..87df62b 100644 --- a/rs/compression/src/compression.rs +++ b/rs/compression/src/compression.rs @@ -10,7 +10,10 @@ pub trait IntSeqEncoder { Self: Sized; /// Compresses a sorted slice of integers - fn encode(&mut self, values: &[u64]) -> Result<()>; + fn encode_batch(&mut self, slice: &[u64]) -> Result<()>; + + /// Compresses a u64 integer + fn encode_value(&mut self, value: &u64) -> Result<()>; /// Returns the size of the encoded data (that would be written to disk) fn len(&self) -> usize; diff --git a/rs/compression/src/elias_fano/ef.rs b/rs/compression/src/elias_fano/ef.rs index 886161c..e74a132 100644 --- a/rs/compression/src/elias_fano/ef.rs +++ b/rs/compression/src/elias_fano/ef.rs @@ -10,11 +10,14 @@ use crate::compression::IntSeqEncoder; pub struct EliasFano { #[cfg(any(debug_assertions, test))] universe: usize, - size: usize, + num_elem: usize, lower_bits: BitVec, upper_bits: BitVec, lower_bit_mask: u64, lower_bit_length: usize, + // Needed for multiple calls to `encode_value()` + cur_high: u64, + cur_index: usize, } // TODO(tyb): consider moving this to utils @@ -29,18 +32,18 @@ fn msb(n: u64) -> u64 { impl EliasFano { /// Creates a new EliasFano structure - pub fn new(universe: usize, size: usize) -> Self { - // lower_bit_length = floor(log(universe / size)) + pub fn new(universe: usize, num_elem: usize) -> Self { + // lower_bit_length = floor(log(universe / num_elem)) // More efficient way to do it is with bit 
manipulation - let lower_bit_length = if universe > size { - msb((universe / size) as u64) + let lower_bit_length = if universe > num_elem { + msb((universe / num_elem) as u64) } else { 0 } as usize; let lower_bit_mask = (1 << lower_bit_length) - 1; - let mut lower_bits = BitVec::with_capacity(size * lower_bit_length); + let mut lower_bits = BitVec::with_capacity(num_elem * lower_bit_length); // Ensure lower_bits is filled with false initially - lower_bits.resize(size * lower_bit_length, false); + lower_bits.resize(num_elem * lower_bit_length, false); // The upper bits are encoded using unary coding for the gaps between consecutive values. // This part uses at most 2n bits: @@ -51,18 +54,20 @@ impl EliasFano { Self { #[cfg(any(debug_assertions, test))] universe, - size, + num_elem, lower_bits, - upper_bits: BitVec::with_capacity(2 * size), + upper_bits: BitVec::with_capacity(2 * num_elem), lower_bit_mask, lower_bit_length, + cur_high: 0, + cur_index: 0, } } /// Returns the value at the given index #[allow(dead_code)] fn get(&self, index: usize) -> Result { - if index >= self.size { + if index >= self.num_elem { return Err(anyhow!("Index {} out of bound", index)); } @@ -94,50 +99,58 @@ impl EliasFano { } impl IntSeqEncoder for EliasFano { - fn new_encoder(universe: usize, size: usize) -> Self { - Self::new(universe, size) + fn new_encoder(universe: usize, num_elem: usize) -> Self { + Self::new(universe, num_elem) } // Algorithm described in https://vigna.di.unimi.it/ftp/papers/QuasiSuccinctIndices.pdf - fn encode(&mut self, values: &[u64]) -> Result<()> { - let mut prev_high = 0; - for (i, &val) in values.iter().enumerate() { - // Sanity check only in debug or test builds - #[cfg(any(debug_assertions, test))] - if val > self.universe as u64 { - return Err(anyhow!( - "Element {}th ({}) is greater than universe", - i, - val - )); - } - // Encode lower bits efficiently - if self.lower_bit_length > 0 { - let low = val & self.lower_bit_mask; - let start = i * 
self.lower_bit_length; - self.lower_bits[start..start + self.lower_bit_length].store(low as u64); - } - - // Encode upper bits using unary coding - let high = val >> self.lower_bit_length; - // Sanity check only in debug or test builds - #[cfg(any(debug_assertions, test))] - if high < prev_high { - return Err(anyhow!("Sequence is not sorted")); - } + fn encode_batch(&mut self, slice: &[u64]) -> Result<()> { + for &val in slice.iter() { + self.encode_value(&val)?; + } + Ok(()) + } - let gap = high - prev_high; - self.upper_bits - .extend_from_bitslice(&BitVec::::repeat(false, gap as usize)); - self.upper_bits.push(true); + fn encode_value(&mut self, value: &u64) -> Result<()> { + let val = *value; + // Sanity check only in debug or test builds + #[cfg(any(debug_assertions, test))] + if val > self.universe as u64 { + return Err(anyhow!( + "Element {}th ({}) is greater than universe", + self.cur_index, + val + )); + } + // Encode lower bits efficiently + if self.lower_bit_length > 0 { + let low = val & self.lower_bit_mask; + let start = self.cur_index * self.lower_bit_length; + self.lower_bits[start..start + self.lower_bit_length].store(low as u64); + } - prev_high = high; + // Encode upper bits using unary coding + let high = val >> self.lower_bit_length; + // Sanity check only in debug or test builds + #[cfg(any(debug_assertions, test))] + if high < self.cur_high { + return Err(anyhow!("Sequence is not sorted")); } + + let gap = high - self.cur_high; + self.upper_bits + .extend_from_bitslice(&BitVec::::repeat(false, gap as usize)); + self.upper_bits.push(true); + + self.cur_high = high; + self.cur_index += 1; Ok(()) } fn len(&self) -> usize { - self.size + let lower_vec: &[u64] = self.lower_bits.as_raw_slice(); + let upper_vec: &[u64] = self.upper_bits.as_raw_slice(); + (1 /* lower_bit_length */ + lower_vec.len() + upper_vec.len()) * std::mem::size_of::() } fn write(&self, writer: &mut BufWriter<&mut File>) -> Result { @@ -174,7 +187,7 @@ mod tests { let values = 
vec![5, 8, 8, 15, 32]; let upper_bound = 36; let mut ef = EliasFano::new_encoder(upper_bound, values.len()); - assert!(ef.encode(&values).is_ok()); + assert!(ef.encode_batch(&values).is_ok()); // Calculate expected lower bits // L = floor(log2(36/5)) = 2 @@ -197,13 +210,13 @@ mod tests { let values = vec![5, 8, 7, 15, 32]; let upper_bound = 36; let mut ef = EliasFano::new_encoder(upper_bound, values.len()); - assert!(ef.encode(&values).is_err()); + assert!(ef.encode_batch(&values).is_err()); // Test sequence with element exceeding upper bound let values = vec![5, 8, 8, 15, 32]; let upper_bound = 31; let mut ef = EliasFano::new_encoder(upper_bound, values.len()); - assert!(ef.encode(&values).is_err()); + assert!(ef.encode_batch(&values).is_err()); } #[test] @@ -218,7 +231,7 @@ mod tests { for (values, upper_bound) in test_cases { let mut ef = EliasFano::new_encoder(upper_bound, values.len()); - assert!(ef.encode(&values).is_ok()); + assert!(ef.encode_batch(&values).is_ok()); for i in 0..values.len() { let decoded_value = ef.get(i).expect("Failed to decode value"); @@ -231,7 +244,7 @@ mod tests { let upper_bound = 9999; let mut ef = EliasFano::new_encoder(upper_bound, values.len()); - assert!(ef.encode(&values).is_ok()); + assert!(ef.encode_batch(&values).is_ok()); // Check random accesses assert_eq!(ef.get(0).expect("Failed to decode value"), 1); @@ -247,11 +260,13 @@ mod tests { // Create a mock EliasFano instance let ef = EliasFano { universe: 100, - size: 5, + num_elem: 5, lower_bits: BitVec::from_slice(&[0b10101010_01010101]), upper_bits: BitVec::from_slice(&[0b11001100_00110011]), lower_bit_mask: 0b1111, lower_bit_length: 4, + cur_high: 0, + cur_index: 0, }; let temp_dir = diff --git a/rs/compression/src/noc/noc.rs b/rs/compression/src/noc/noc.rs index efc2df4..96e4553 100644 --- a/rs/compression/src/noc/noc.rs +++ b/rs/compression/src/noc/noc.rs @@ -26,8 +26,13 @@ impl IntSeqEncoder for PlainEncoder { Self::new(num_elem) } - fn encode(&mut self, values: 
&[u64]) -> Result<()> { - self.sequence = values.to_vec(); + fn encode_value(&mut self, value: &u64) -> Result<()> { + self.sequence.push(*value); + Ok(()) + } + + fn encode_batch(&mut self, slice: &[u64]) -> Result<()> { + self.sequence.extend(slice); Ok(()) } diff --git a/rs/index/src/ivf/reader.rs b/rs/index/src/ivf/reader.rs index b0b352e..6bf9075 100644 --- a/rs/index/src/ivf/reader.rs +++ b/rs/index/src/ivf/reader.rs @@ -253,8 +253,8 @@ mod tests { let posting_list = transmute_u8_to_slice::(posting_list_byte_arr.unwrap()); // It's possible that the posting list size is more than max_posting_list_size, - // but it should be less than 2x. - assert!(posting_list.len() <= 20); + // but it should be less than 2.5x. + assert!(posting_list.len() <= 25); } } } diff --git a/rs/index/src/ivf/writer.rs b/rs/index/src/ivf/writer.rs index bcaa751..df808db 100644 --- a/rs/index/src/ivf/writer.rs +++ b/rs/index/src/ivf/writer.rs @@ -195,7 +195,7 @@ impl IvfWriter { posting_list.len(), ); // Encode to get the length of the encoded data - encoder.encode(&posting_list)?; + encoder.encode_batch(&posting_list)?; // Write the length of the encoded posting list metadata_bytes_written += wrap_write(&mut metadata_writer, &encoder.len().to_le_bytes())?; @@ -603,7 +603,9 @@ mod tests { // Check metadata file let expected_metadata = vec![ 1, 0, 0, 0, 0, 0, 0, 0, // num_posting_lists - 5, 0, 0, 0, 0, 0, 0, 0, // posting_list0_len + 24, 0, 0, 0, 0, 0, 0, + 0, // posting_list0_len: 3 * u64: lower_bit_length + 1 lower_bit + // + 1 upper_bit 0, 0, 0, 0, 0, 0, 0, 0, // posting_list0_offset ]; assert_eq!(metadata_content, expected_metadata);