From f9a467828a3925f2ca50bf58843a04f67c92c252 Mon Sep 17 00:00:00 2001
From: tyb0807
Date: Mon, 30 Dec 2024 20:22:27 +0100
Subject: [PATCH] IntSeqEncoder trait and PlainEncoder implementation (#239)

This trait is for implementing different encoding algorithms to compress
posting lists (or any sorted integer sequences)
---
 Cargo.lock                        |  1 +
 rs/compression/Cargo.toml         |  1 +
 rs/compression/src/compression.rs | 20 +++++++++++++
 rs/compression/src/lib.rs         |  2 ++
 rs/compression/src/noc/mod.rs     |  1 +
 rs/compression/src/noc/noc.rs     | 48 +++++++++++++++++++++++++++++++
 6 files changed, 73 insertions(+)
 create mode 100644 rs/compression/src/compression.rs
 create mode 100644 rs/compression/src/noc/mod.rs
 create mode 100644 rs/compression/src/noc/noc.rs

diff --git a/Cargo.lock b/Cargo.lock
index d185c41..c0ee294 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -477,6 +477,7 @@ dependencies = [
  "env_logger",
  "log",
  "tempdir",
+ "utils",
 ]
 
 [[package]]
diff --git a/rs/compression/Cargo.toml b/rs/compression/Cargo.toml
index b125eff..bef3812 100644
--- a/rs/compression/Cargo.toml
+++ b/rs/compression/Cargo.toml
@@ -9,3 +9,4 @@ bitvec = "1"
 env_logger.workspace = true
 log.workspace = true
 tempdir.workspace = true
+utils.workspace = true
diff --git a/rs/compression/src/compression.rs b/rs/compression/src/compression.rs
new file mode 100644
index 0000000..56425fb
--- /dev/null
+++ b/rs/compression/src/compression.rs
@@ -0,0 +1,20 @@
+use std::fs::File;
+use std::io::BufWriter;
+
+use anyhow::Result;
+
+pub trait IntSeqEncoder {
+    /// Creates an encoder
+    fn new_encoder(universe: Option<usize>, size: usize) -> Self
+    where
+        Self: Sized;
+
+    /// Compresses a sorted slice of integers
+    fn encode(&mut self, values: &[u64]) -> Result<()>;
+
+    /// Returns the number of elements in the sequence
+    fn len(&self) -> usize;
+
+    /// Writes to disk and return number of bytes written.
+    fn write(&self, writer: &mut BufWriter<&mut File>) -> Result<usize>;
+}
diff --git a/rs/compression/src/lib.rs b/rs/compression/src/lib.rs
index e75354e..ee5e21e 100644
--- a/rs/compression/src/lib.rs
+++ b/rs/compression/src/lib.rs
@@ -1 +1,3 @@
+pub mod compression;
 pub mod elias_fano;
+pub mod noc;
diff --git a/rs/compression/src/noc/mod.rs b/rs/compression/src/noc/mod.rs
new file mode 100644
index 0000000..f7e789f
--- /dev/null
+++ b/rs/compression/src/noc/mod.rs
@@ -0,0 +1 @@
+pub mod noc;
diff --git a/rs/compression/src/noc/noc.rs b/rs/compression/src/noc/noc.rs
new file mode 100644
index 0000000..0c5cb5e
--- /dev/null
+++ b/rs/compression/src/noc/noc.rs
@@ -0,0 +1,48 @@
+use std::fs::File;
+use std::io::{BufWriter, Write};
+
+use anyhow::Result;
+use utils::io::wrap_write;
+
+use crate::compression::IntSeqEncoder;
+
+pub struct PlainEncoder {
+    size: usize,
+    sequence: Vec<u64>,
+}
+
+impl PlainEncoder {
+    pub fn new(size: usize) -> Self {
+        Self {
+            size,
+            sequence: Vec::new(),
+        }
+    }
+}
+
+impl IntSeqEncoder for PlainEncoder {
+    fn new_encoder(_universe: Option<usize>, size: usize) -> Self {
+        Self::new(size)
+    }
+
+    fn encode(&mut self, values: &[u64]) -> Result<()> {
+        self.sequence = values.to_vec();
+        Ok(())
+    }
+
+    fn len(&self) -> usize {
+        self.size
+    }
+
+    fn write(&self, writer: &mut BufWriter<&mut File>) -> Result<usize> {
+        let mut total_bytes_written = 0;
+
+        for &val in self.sequence.iter() {
+            total_bytes_written += wrap_write(writer, &val.to_le_bytes())?;
+        }
+
+        writer.flush()?;
+
+        Ok(total_bytes_written)
+    }
+}