Skip to content

Commit

Permalink
EliasFanoDecoder and associated iterator
Browse files Browse the repository at this point in the history
  • Loading branch information
BuildKite committed Jan 5, 2025
1 parent cf154b6 commit c09c44c
Show file tree
Hide file tree
Showing 4 changed files with 218 additions and 12 deletions.
6 changes: 4 additions & 2 deletions rs/compression/src/compression.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,9 @@ pub trait IntSeqEncoder {
}

pub trait IntSeqDecoder {
type IteratorType<'a>: Iterator<Item = Self::Item>;
type IteratorType<'a>: Iterator<Item = Self::Item>
where
Self: 'a;
type Item;

/// Creates a decoder
Expand All @@ -34,5 +36,5 @@ pub trait IntSeqDecoder {

/// Creates an iterator that iterates the encoded data and decodes one element at a time on the
/// fly
fn get_iterator<'a>(&self, byte_slice: &'a [u8]) -> Self::IteratorType<'a>;
fn get_iterator<'a>(&'a self, byte_slice: &'a [u8]) -> Self::IteratorType<'a>;
}
213 changes: 208 additions & 5 deletions rs/compression/src/elias_fano/ef.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@ use std::io::{BufWriter, Write};
use anyhow::{anyhow, Result};
use bitvec::prelude::*;
use utils::io::wrap_write;
use utils::mem::transmute_u8_to_slice;

use crate::compression::IntSeqEncoder;
use crate::compression::{IntSeqDecoder, IntSeqEncoder};

pub struct EliasFano {
#[cfg(any(debug_assertions, test))]
Expand Down Expand Up @@ -150,12 +151,18 @@ impl IntSeqEncoder for EliasFano {
fn len(&self) -> usize {
let lower_vec: &[u64] = self.lower_bits.as_raw_slice();
let upper_vec: &[u64] = self.upper_bits.as_raw_slice();
(1 /* lower_bit_length */ + lower_vec.len() + upper_vec.len()) * std::mem::size_of::<u64>()
(1 /* self.num_elem */ +
1 /* self.lower_bit_length */ +
1 /* lower_vec.len() */ +
1 /* upper_vec.len() */ +
lower_vec.len() +
upper_vec.len())
* std::mem::size_of::<u64>()
}

fn write(&self, writer: &mut BufWriter<&mut File>) -> Result<usize> {
let mut total_bytes_written =
wrap_write(writer, &((self.lower_bit_length as u64).to_le_bytes()))?;
let mut total_bytes_written = wrap_write(writer, &((self.num_elem as u64).to_le_bytes()))?;
total_bytes_written += wrap_write(writer, &((self.lower_bit_length as u64).to_le_bytes()))?;
let lower_vec: &[u64] = self.lower_bits.as_raw_slice();
let upper_vec: &[u64] = self.upper_bits.as_raw_slice();
total_bytes_written += wrap_write(writer, &((lower_vec.len() as u64).to_le_bytes()))?;
Expand All @@ -173,9 +180,162 @@ impl IntSeqEncoder for EliasFano {
Ok(total_bytes_written)
}
}

/// Decoder state for an Elias-Fano encoded integer sequence.
///
/// `new_decoder` unpacks the serialized `u64` words into the two bit vectors
/// held here, and `get_iterator` hands out an iterator that borrows them.
pub struct EliasFanoDecoder {
/// Number of encoded elements, as read from the serialized header.
num_elem: usize,
/// Low parts of all elements, packed back to back: element `i` occupies bits
/// `i * lower_bit_length .. (i + 1) * lower_bit_length`.
lower_bits: BitVec<u64>,
/// High parts: the iterator counts zero bits as increments of the high part
/// and treats each one bit as the end of an element.
upper_bits: BitVec<u64>,
/// Width, in bits, of each element's low part.
lower_bit_length: usize,
}

impl EliasFanoDecoder {
    /// Number of leading `u64` header words in the serialized form:
    /// `num_elem`, `lower_bit_length`, lower word count, upper word count.
    const METADATA_SIZE: usize = 4;

    /// Copies the first `bits_to_read` bits of `vec` into a new bit vector.
    ///
    /// Bit `i` of the result is bit `i % 64` (least significant first) of
    /// word `i / 64`.
    ///
    /// # Panics
    ///
    /// Panics if `vec` holds fewer than `bits_to_read` bits.
    fn construct_lower_bits(vec: &[u64], bits_to_read: usize) -> BitVec<u64> {
        // Check before reserving: `bits_to_read` is derived from header values, and
        // an impossible request should fail here rather than after a huge allocation.
        assert!(
            bits_to_read <= vec.len().saturating_mul(64),
            "EliasFano lower bits: {} bits requested but only {} words available",
            bits_to_read,
            vec.len()
        );

        let mut bitvec = BitVec::<u64>::with_capacity(bits_to_read);
        for i in 0..bits_to_read {
            let word = vec[i / 64];
            bitvec.push((word >> (i % 64)) & 1 == 1);
        }

        bitvec
    }

    /// Copies bits from `vec` (least significant bit of each word first) up to
    /// and including the `num_elem`-th set bit.
    ///
    /// If `vec` contains fewer than `num_elem` set bits, every bit of `vec` is
    /// copied. When `num_elem` is zero the result is empty.
    fn construct_upper_bits(vec: &[u64], num_elem: usize) -> BitVec<u64> {
        if num_elem == 0 {
            // Nothing to decode; without this guard the stop condition below
            // could never trigger once a set bit had been seen.
            return BitVec::new();
        }

        // `2 * num_elem` is only a sizing hint. Bound it by what `vec` can
        // actually provide so a bogus count cannot overflow or over-allocate.
        let available_bits = vec.len().saturating_mul(64);
        let capacity = num_elem.saturating_mul(2).min(available_bits);
        let mut bitvec = BitVec::<u64>::with_capacity(capacity);

        let mut ones_seen = 0;
        'words: for &word in vec {
            for bit in 0..64 {
                let is_set = (word >> bit) & 1 == 1;
                bitvec.push(is_set);
                if is_set {
                    ones_seen += 1;
                    if ones_seen == num_elem {
                        break 'words;
                    }
                }
            }
        }

        bitvec
    }
}

impl IntSeqDecoder for EliasFanoDecoder {
type IteratorType<'a> = EliasFanoDecodingIterator<'a>;
type Item = u64;

fn new_decoder(byte_slice: &[u8]) -> Result<Self> {
let encoded_data = transmute_u8_to_slice::<u64>(byte_slice);
if encoded_data.len() < Self::METADATA_SIZE {
return Err(anyhow!("Not enough metadata for EliasFano encoded data"));
}
let [num_elem, lower_bit_length, lower_vec_len, _upper_vec_len, ..] =
encoded_data[..Self::METADATA_SIZE]
else {
return Err(anyhow!("Invalid metadata for EliasFano encoded data"));
};
let lower_bits = Self::construct_lower_bits(
&encoded_data[Self::METADATA_SIZE..],
(num_elem * lower_bit_length) as usize,
);
let upper_bits = Self::construct_upper_bits(
&encoded_data[Self::METADATA_SIZE + lower_vec_len as usize..],
num_elem as usize,
);

Ok(Self {
num_elem: num_elem as usize,

lower_bits,
upper_bits,
lower_bit_length: lower_bit_length as usize,
})
}

fn get_iterator<'a>(&'a self, _byte_slice: &'a [u8]) -> Self::IteratorType<'a> {
EliasFanoDecodingIterator {
num_elem: self.num_elem,
cur_elem_index: 0,
cur_upper_bit_index: 0,
cumulative_gap_sum: 0,

lower_bits: &self.lower_bits,
upper_bits: &self.upper_bits,
lower_bit_mask: (1 << self.lower_bit_length) - 1,
lower_bit_length: self.lower_bit_length as usize,
}
}
}

/// Iterator that decodes one Elias-Fano element per call to `next`, borrowing
/// the bit vectors owned by an `EliasFanoDecoder`.
pub struct EliasFanoDecodingIterator<'a> {
/// Total number of elements to yield.
num_elem: usize,
/// Index of the next element to decode; also selects its low-bit window.
cur_elem_index: usize,
/// Read position inside `upper_bits`.
cur_upper_bit_index: usize,
/// Running count of zero bits consumed from `upper_bits`; used as the high
/// part of the element being decoded.
cumulative_gap_sum: usize,

/// Packed low parts, `lower_bit_length` bits per element.
lower_bits: &'a BitVec<u64>,
/// Unary-style high parts: zero bits advance the high part, a one bit ends
/// an element.
upper_bits: &'a BitVec<u64>,
/// Mask with the low `lower_bit_length` bits set.
lower_bit_mask: u64,
/// Width, in bits, of each element's low part.
lower_bit_length: usize,
}

impl<'a> EliasFanoDecodingIterator<'a> {
    /// Advances through `upper_bits` to just past the next set bit, adding one
    /// to `cumulative_gap_sum` for every zero bit stepped over.
    ///
    /// If the end of `upper_bits` is reached first, scanning stops there and
    /// the position is still moved one step forward.
    fn decode_upper_part(&mut self) {
        let total_bits = self.upper_bits.len();
        loop {
            let pos = self.cur_upper_bit_index;
            if pos >= total_bits || self.upper_bits[pos] {
                break;
            }
            // A zero bit: the high part grows by one.
            self.cumulative_gap_sum += 1;
            self.cur_upper_bit_index = pos + 1;
        }
        // Step over the set bit that closes the current element.
        self.cur_upper_bit_index += 1;
    }

    /// Returns the low part of the element at `cur_elem_index`, or 0 when
    /// elements carry no low bits.
    fn get_lower_part(&self) -> usize {
        if self.lower_bit_length == 0 {
            return 0;
        }
        let start = self.cur_elem_index * self.lower_bit_length;
        let end = start + self.lower_bit_length;
        let raw = self.lower_bits[start..end].load::<u64>();
        (raw & self.lower_bit_mask) as usize
    }
}

impl<'a> Iterator for EliasFanoDecodingIterator<'a> {
    type Item = u64;

    /// Decodes and returns the next element, or `None` once `num_elem`
    /// elements have been produced.
    fn next(&mut self) -> Option<Self::Item> {
        if self.cur_elem_index >= self.num_elem {
            return None;
        }

        self.decode_upper_part();
        // Widen before shifting so the combination is done in 64 bits even on
        // targets where `usize` is narrower than `u64`.
        let upper = self.cumulative_gap_sum as u64;
        let lower = self.get_lower_part() as u64;
        self.cur_elem_index += 1;

        Some((upper << self.lower_bit_length) | lower)
    }

    /// The number of remaining elements is known exactly, which lets
    /// consumers such as `collect` reserve the right amount up front.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining = self.num_elem.saturating_sub(self.cur_elem_index);
        (remaining, Some(remaining))
    }
}

#[cfg(test)]
mod tests {
use std::fs::File;
use std::fs::{remove_dir_all, File};
use std::io::{BufReader, BufWriter, Read};

use tempdir::TempDir;
Expand Down Expand Up @@ -289,6 +449,7 @@ mod tests {

// Expected data
let expected_data = vec![
5, 0, 0, 0, 0, 0, 0, 0, // num_elem (5 as u64)
4, 0, 0, 0, 0, 0, 0, 0, // lower_bit_length (4 as u64)
1, 0, 0, 0, 0, 0, 0, 0, // lower_vec.len() (1 as u64)
1, 0, 0, 0, 0, 0, 0, 0, // upper_vec.len() (1 as u64)
Expand All @@ -299,4 +460,46 @@ mod tests {
assert_eq!(written_data, expected_data);
assert_eq!(bytes_written, expected_data.len());
}

#[test]
fn test_elias_fano_decoding_iterator() {
    // Round-trips each sequence through encode -> write -> read -> decode.
    let test_cases = vec![
        (vec![5, 8, 8, 15, 32], 36), // Basic case
        // NOTE(review): the cases below were already disabled; the reason is
        // not visible here. Re-enable once they are known to pass.
        // (vec![0, 1, 2, 3, 4], 5), // Start with 0
        // (vec![10], 20), // Single element
        // (vec![1000, 2000, 3000, 4000, 5000], 6000), // Large numbers
        // (vec![2, 4, 6, 8, 10], 10), // Non-consecutive integers
    ];

    for (values, upper_bound) in test_cases {
        let mut ef = EliasFano::new_encoder(upper_bound, values.len());
        assert!(ef.encode_batch(&values).is_ok());

        let temp_dir = TempDir::new("test_elias_fano_decoding_iterator")
            .expect("Failed to create temporary directory");
        let file_path = temp_dir.path().join("test_file");
        let mut file = File::create(&file_path).expect("Failed to create test file");
        let mut writer = BufWriter::new(&mut file);

        // Call the write method
        assert!(ef.write(&mut writer).is_ok());

        // Dropping the writer flushes any buffered bytes to the file.
        drop(writer);

        // Read the file contents into a byte vector
        let mut file = File::open(&file_path).expect("Failed to open file for read");
        let mut byte_slice = Vec::new();
        assert!(file.read_to_end(&mut byte_slice).is_ok());

        let decoder = EliasFanoDecoder::new_decoder(&byte_slice)
            .expect("Failed to create posting list decoder");

        // Compare whole sequences so that an iterator yielding too few (or too
        // many) elements fails the test instead of passing vacuously.
        let decoded: Vec<u64> = decoder.get_iterator(&byte_slice).collect();
        assert_eq!(decoded, values);

        // Best-effort cleanup of the temporary directory (the previous call
        // targeted the file inside it rather than the directory).
        let _ = remove_dir_all(temp_dir.path());
    }
}
}
2 changes: 1 addition & 1 deletion rs/compression/src/noc/noc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ impl IntSeqDecoder for PlainDecoder {
})
}

fn get_iterator<'a>(&self, byte_slice: &'a [u8]) -> Self::IteratorType<'a> {
fn get_iterator<'a>(&'a self, byte_slice: &'a [u8]) -> Self::IteratorType<'a> {
PlainDecodingIterator {
num_elem: self.num_elem(),
cur_index: 0,
Expand Down
9 changes: 5 additions & 4 deletions rs/index/src/ivf/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -624,24 +624,25 @@ mod tests {
// Check metadata file
let expected_metadata = vec![
1, 0, 0, 0, 0, 0, 0, 0, // num_posting_lists
24, 0, 0, 0, 0, 0, 0,
0, // posting_list0_len: 3 * u64: lower_bit_length + 1 lower_bit
// + 1 upper_bit
48, 0, 0, 0, 0, 0, 0, 0, // posting_list0_len: 6 * u64:
// num_elem + lower_bit_length + lower_bits size
// + upper_bits size + 1 lower_bit + 1 upper_bit
0, 0, 0, 0, 0, 0, 0, 0, // posting_list0_offset
];
assert_eq!(metadata_content, expected_metadata);
assert_eq!(metadata_content.len(), 8 * 3);

// Check posting list file
let expected_posting_lists = vec![
5, 0, 0, 0, 0, 0, 0, 0, // num_elem
2, 0, 0, 0, 0, 0, 0, 0, // lower_bit_length
1, 0, 0, 0, 0, 0, 0, 0, // number of u64 for encoding lower_bits
1, 0, 0, 0, 0, 0, 0, 0, // number of u64 for encoding upper_bits
0b11000001, 0, 0, 0, 0, 0, 0, 0, // lower_bits + padding
0b01011010, 0b00010000, 0, 0, 0, 0, 0, 0, // upper_bits + padding
];
assert_eq!(posting_lists_content, expected_posting_lists);
assert_eq!(posting_lists_content.len(), 8 * 5);
assert_eq!(posting_lists_content.len(), 8 * 6);
}

#[test]
Expand Down

0 comments on commit c09c44c

Please sign in to comment.