Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rust FastLanes #345

Closed
wants to merge 15 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
members = [
"bench-vortex",
"build-vortex",
"fastlanes",
"fastlanez",
"fastlanez-sys",
"pyvortex",
Expand Down
27 changes: 27 additions & 0 deletions fastlanes/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Pure-Rust port of the FastLanes bit-packing kernels.
# All package metadata is inherited from the workspace root to keep the
# crates in this repository consistent.
[package]
name = "fastlanes"
version.workspace = true
homepage.workspace = true
repository.workspace = true
authors.workspace = true
license.workspace = true
keywords.workspace = true
include.workspace = true
edition.workspace = true
rust-version.workspace = true

[dependencies]
arrayref = { workspace = true }
num-traits = { workspace = true }
paste = { workspace = true }
seq-macro = { workspace = true }

[dev-dependencies]
criterion = { workspace = true }

[lints]
workspace = true

# Criterion benchmark; `harness = false` hands `main` over to criterion
# instead of the default libtest bench harness.
[[bench]]
name = "fastlanes_bitpacking"
harness = false
65 changes: 65 additions & 0 deletions fastlanes/benches/fastlanes_bitpacking.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#![allow(incomplete_features)]
#![feature(generic_const_exprs)]

use std::mem::size_of;

use arrayref::{array_mut_ref, array_ref};
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use fastlanes::BitPacking;

/// Criterion benchmarks for the `BitPacking` kernels: full pack (heap- and
/// stack-allocated buffers), full unpack, and per-element
/// `bitunpack_single`, all at a fixed width of 3 bits over `u16`.
///
/// Inputs are routed through `black_box` so the optimizer cannot const-fold
/// the kernel away: the source arrays are compile-time constants, and
/// without the barrier LLVM is free to precompute the packed result.
fn bitpacking(c: &mut Criterion) {
    {
        let mut group = c.benchmark_group("bit-packing");
        group.bench_function("pack 16 -> 3 heap", |b| {
            const WIDTH: usize = 3;
            let values = vec![3u16; 1024];
            // 1024 values * WIDTH bits = 128 * WIDTH bytes of packed output.
            let mut packed = vec![0; 128 * WIDTH / size_of::<u16>()];

            b.iter(|| {
                BitPacking::bitpack::<WIDTH>(
                    black_box(array_ref![values, 0, 1024]),
                    array_mut_ref![packed, 0, 192],
                );
            });
        });

        group.bench_function("pack 16 -> 3 stack", |b| {
            const WIDTH: usize = 3;
            let values = [3u16; 1024];
            let mut packed = [0; 128 * WIDTH / size_of::<u16>()];
            b.iter(|| BitPacking::bitpack::<WIDTH>(black_box(&values), &mut packed));
        });
    }

    {
        let mut group = c.benchmark_group("bit-unpacking");
        group.bench_function("unpack 16 <- 3 stack", |b| {
            const WIDTH: usize = 3;
            let values = [3u16; 1024];
            let mut packed = [0; 128 * WIDTH / size_of::<u16>()];
            BitPacking::bitpack::<WIDTH>(&values, &mut packed);

            let mut unpacked = [0u16; 1024];
            b.iter(|| BitPacking::bitunpack::<WIDTH>(black_box(&packed), &mut unpacked));
        });
    }

    {
        let mut group = c.benchmark_group("unpack-single");
        group.bench_function("unpack single 16 <- 3", |b| {
            const WIDTH: usize = 3;
            let values = [3u16; 1024];
            let mut packed = [0; 128 * WIDTH / size_of::<u16>()];
            BitPacking::bitpack::<WIDTH>(&values, &mut packed);

            b.iter(|| {
                // black_box the output of every call so the loop is not
                // dead-code-eliminated.
                for i in 0..1024 {
                    black_box::<u16>(BitPacking::bitunpack_single::<WIDTH>(
                        black_box(&packed),
                        i,
                    ));
                }
            });
        });
    }
}

criterion_group!(benches, bitpacking);
criterion_main!(benches);
228 changes: 228 additions & 0 deletions fastlanes/src/bitpacking.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,228 @@
use std::mem::size_of;

use num_traits::{One, PrimInt, Unsigned};
use paste::paste;
use seq_macro::seq;

use crate::FastLanes;
use crate::{Pred, Satisfied};

/// Marker type carrying a candidate packed bit-width `W` as a const generic.
pub struct BitPackWidth<const W: usize>;
/// Implemented only for bit-widths that are valid when packing element type `T`.
pub trait SupportedBitPackWidth<T> {}
// A width is supported iff 0 < W < bit-width of T: zero-width packing is
// meaningless and full-width "packing" would be the identity.
impl<const W: usize, T> SupportedBitPackWidth<T> for BitPackWidth<W>
where
    Pred<{ W > 0 }>: Satisfied,
    Pred<{ W < 8 * size_of::<T>() }>: Satisfied,
{
}

/// BitPack into a compile-time known bit-width.
pub trait BitPacking: FastLanes {
    /// Packs 1024 elements into W bits each.
    /// The output is given as Self to ensure correct alignment.
    ///
    /// Output length is `128 * W / size_of::<Self>()` elements: 1024 values
    /// of W bits each occupy 128 * W bytes.
    fn bitpack<const W: usize>(
        input: &[Self; 1024],
        output: &mut [Self; 128 * W / size_of::<Self>()],
    ) where
        BitPackWidth<W>: SupportedBitPackWidth<Self>;

    /// Unpacks W-bit elements into 1024 elements.
    fn bitunpack<const W: usize>(
        input: &[Self; 128 * W / size_of::<Self>()],
        output: &mut [Self; 1024],
    ) where
        BitPackWidth<W>: SupportedBitPackWidth<Self>;

    /// Unpacks only the element at `index` (0..1024), avoiding a full
    /// 1024-element decode.
    fn bitunpack_single<const W: usize>(
        input: &[Self; 128 * W / size_of::<Self>()],
        index: usize,
    ) -> Self
    where
        BitPackWidth<W>: SupportedBitPackWidth<Self>;
}

// Macro for repeating a code block bit_size_of::<T> times.
// `$ident` is bound to a compile-time constant 0..bits via `seq!`, which is
// what lets the packing loops fully unroll with all branches resolved at
// compile time.
macro_rules! seq_type_width {
    ($ident:ident in u8 $body:tt) => {seq!($ident in 0..8 $body);};
    ($ident:ident in u16 $body:tt) => {seq!($ident in 0..16 $body);};
    ($ident:ident in u32 $body:tt) => {seq!($ident in 0..32 $body);};
    ($ident:ident in u64 $body:tt) => {seq!($ident in 0..64 $body);};
}

#[inline]
pub(crate) fn mask<T: PrimInt + Unsigned + One>(width: usize) -> T {
(T::one() << width) - T::one()
}

// We need to use a macro instead of generic impl since we have to know the bit-width of T ahead
// of time.
macro_rules! impl_bitpacking {
    ($T:ty) => {
        paste! {
            impl FastLanes for $T {}

            impl BitPacking for $T {
                /// Packs 1024 values (each truncated to its low W bits) into
                /// the interleaved FastLanes layout.
                #[inline(never)] // Makes it easier to disassemble and validate ASM.
                #[allow(unused_assignments)] // Inlined loop gives unused assignment on final iteration
                fn bitpack<const W: usize>(
                    input: &[Self; 1024],
                    output: &mut [Self; 128 * W / size_of::<Self>()],
                ) where BitPackWidth<W>: SupportedBitPackWidth<Self> {
                    // Mask selecting the low W bits; cannot overflow since W < Self::T.
                    let mask = (1 << W) - 1;

                    // First we loop over each lane in the virtual 1024 bit word.
                    for i in 0..Self::LANES {
                        let mut tmp: Self = 0;

                        // Loop over each of the rows of the lane.
                        // Inlining this loop means all branches are known at compile time and
                        // the code is auto-vectorized for SIMD execution.
                        seq_type_width!(row in $T {{
                            let src = input[Self::LANES * row + i] & mask;

                            // Shift the src bits into their position in the tmp output variable.
                            // (Parens are explicit: `%` binds tighter than `<<`.)
                            if row == 0 {
                                tmp = src;
                            } else {
                                tmp |= src << ((row * W) % Self::T);
                            }

                            // If the next input value overlaps with the next output, then we
                            // write out the tmp variable and bring forward the remaining bits.
                            let curr_pos: usize = (row * W) / Self::T;
                            let next_pos: usize = ((row + 1) * W) / Self::T;
                            if next_pos > curr_pos {
                                output[Self::LANES * curr_pos + i] = tmp;

                                // Bits of src that spill into the next output word. When
                                // remaining_bits == 0 the value ends exactly on the word
                                // boundary and `src >> W` is zero (src holds only W bits).
                                let remaining_bits: usize = ((row + 1) * W) % Self::T;
                                tmp = src >> (W - remaining_bits);
                            }
                        }});
                    }
                }

                /// Inverse of `bitpack`: expands 1024 W-bit values back into
                /// full-width elements.
                #[inline(never)]
                fn bitunpack<const W: usize>(
                    input: &[Self; 128 * W / size_of::<Self>()],
                    output: &mut [Self; 1024],
                ) where BitPackWidth<W>: SupportedBitPackWidth<Self> {
                    for i in 0..Self::LANES {
                        let mut src = input[i];
                        let mut tmp: Self;

                        seq_type_width!(row in $T {{
                            // Word indices (within this lane) of the first and last
                            // bit of this row's value; each lane spans W words.
                            let curr_pos: usize = (row * W) / Self::T;
                            let next_pos = ((row + 1) * W) / Self::T;

                            let shift = (row * W) % Self::T;

                            if next_pos > curr_pos {
                                // Consume some bits from the curr input, the remainder are in the next input
                                let remaining_bits = ((row + 1) * W) % Self::T;
                                let current_bits = W - remaining_bits;
                                tmp = (src >> shift) & mask::<Self>(current_bits);

                                if next_pos < W {
                                    // Load the next input value
                                    src = input[Self::LANES * next_pos + i];
                                    // Consume the remaining bits from the next input value.
                                    tmp |= (src & mask::<Self>(remaining_bits)) << current_bits;
                                }
                            } else {
                                // Otherwise, just grab W bits from the src value
                                tmp = (src >> shift) & mask::<Self>(W);
                            }

                            // Write out the unpacked value
                            output[(Self::LANES * row) + i] = tmp;
                        }});
                    }
                }

                /// Decodes the single element at `index` without touching the
                /// rest of the packed block.
                #[inline(never)]
                fn bitunpack_single<const W: usize>(
                    input: &[Self; 128 * W / size_of::<Self>()],
                    index: usize,
                ) -> Self where BitPackWidth<W>: SupportedBitPackWidth<Self> {
                    // Which lane the element lives in, and its bit offset within that lane.
                    let lane_index = index % Self::LANES;
                    let lane_start_bit = (index / Self::LANES) * W;

                    let (lsb, msb) = {
                        // the value may be split across two words
                        let lane_start_word = lane_start_bit / Self::T;
                        let lane_end_word = (lane_start_bit + W - 1) / Self::T;

                        (
                            input[lane_start_word * Self::LANES + lane_index],
                            input[lane_end_word * Self::LANES + lane_index], // this may be a duplicate
                        )
                    };

                    let shift = lane_start_bit % Self::T;
                    if shift == 0 {
                        (lsb >> shift) & mask::<Self>(W)
                    } else {
                        // If shift == 0, then this shift overflows, instead of shifting to zero.
                        // This forces us to introduce a branch. Any way to avoid?
                        let hi = msb << (Self::T - shift);
                        let lo = lsb >> shift;
                        (lo | hi) & mask::<Self>(W)
                    }
                }
            }
        }
    };
}

// Generate concrete FastLanes + BitPacking impls for every unsigned element
// width; a generic impl is impossible since the macro needs the bit-width
// of the type at expansion time.
impl_bitpacking!(u8);
impl_bitpacking!(u16);
impl_bitpacking!(u32);
impl_bitpacking!(u64);

#[cfg(test)]
// #[cfg(not(debug_assertions))] // Only run in release mode
mod test {
    use super::*;

    // Generates a full pack/unpack round-trip test and an unpack_single test
    // for one (element type, width) pair.
    macro_rules! test_round_trip {
        ($T:ty, $W:literal) => {
            paste! {
                #[test]
                fn [<try_round_trip_ $T _ $W>]() {
                    // Values cycle through the full W-bit range so every bit
                    // pattern representable at this width is exercised.
                    let mut values: [$T; 1024] = [0; 1024];
                    for i in 0..1024 {
                        values[i] = (i % (1 << $W)) as $T;
                    }

                    let mut packed = [0; 128 * $W / size_of::<$T>()];
                    BitPacking::bitpack::<$W>(&values, &mut packed);

                    let mut unpacked = [0; 1024];
                    BitPacking::bitunpack::<$W>(&packed, &mut unpacked);

                    assert_eq!(&unpacked, &values);
                }

                #[test]
                fn [<try_unpack_single_ $T _ $W>]() {
                    let mut values: [$T; 1024] = [0; 1024];
                    for i in 0..1024 {
                        values[i] = (i % (1 << $W)) as $T;
                    }

                    let mut packed = [0; 128 * $W / size_of::<$T>()];
                    BitPacking::bitpack::<$W>(&values, &mut packed);

                    // Every index must decode independently to its original value.
                    for (idx, value) in values.into_iter().enumerate() {
                        assert_eq!(BitPacking::bitunpack_single::<$W>(&packed, idx), value);
                    }
                }
            }
        };
    }

    // Upper bounds are exclusive: supported widths are 1..=bits-1, since
    // SupportedBitPackWidth requires W strictly less than the element width.
    seq!(W in 1..8 { test_round_trip!(u8, W); });
    seq!(W in 1..16 { test_round_trip!(u16, W); });
    seq!(W in 1..32 { test_round_trip!(u32, W); });
    seq!(W in 1..64 { test_round_trip!(u64, W); });
}
22 changes: 22 additions & 0 deletions fastlanes/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
//! Pure-Rust FastLanes kernels. Relies on the nightly `generic_const_exprs`
//! feature so array sizes like `128 * W / size_of::<Self>()` can appear in
//! signatures.
#![allow(incomplete_features)]
#![feature(generic_const_exprs)]

use std::mem::size_of;

use num_traits::{PrimInt, Unsigned};

mod bitpacking;
pub use bitpacking::*;

// NOTE(review): presumably the FastLanes transposed tile ordering of the
// eight sub-words — confirm against the FastLanes paper; nothing in this
// file consumes it.
pub const ORDER: [u8; 8] = [0, 4, 2, 6, 1, 5, 3, 7];

/// Common compile-time constants for FastLanes element types.
pub trait FastLanes: Sized + Unsigned + PrimInt {
    // Bit-width of Self.
    const T: usize = size_of::<Self>() * 8;
    // Number of lanes of Self in the 1024-bit virtual word.
    const LANES: usize = 1024 / Self::T;
}

/// Const-generic boolean predicate: `Pred<B>` implements `Satisfied` only
/// when `B == true`, emulating boolean where-clause conditions.
pub struct Pred<const B: bool>;

pub trait Satisfied {}

impl Satisfied for Pred<true> {}
15 changes: 14 additions & 1 deletion fastlanez/src/bitpack.rs
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ macro_rules! bitpack_impl {
}
}

#[inline]
#[inline(never)]
fn unpack_single(
input: &[u8; 128 * W],
index: usize
Expand Down Expand Up @@ -236,4 +236,17 @@ mod test {
assert_eq!(decoded, *v);
});
}

#[test]
fn test_unpack_single2() {
    // Pack 1024 distinct u16 values at 11 bits and verify that every element
    // decodes independently via try_unpack_single.
    let input = (0u16..1024).collect::<Vec<_>>();
    let mut output = Vec::new();
    TryBitPack::try_pack_into(array_ref![input, 0, 1024], 11, &mut output).unwrap();
    // 1024 values * 11 bits = 11264 bits = 1408 bytes.
    assert_eq!(output.len(), 1408);

    input.iter().enumerate().for_each(|(i, v)| {
        let decoded = u16::try_unpack_single(&output, 11, i).unwrap();
        assert_eq!(decoded, *v);
    });
}
}