Unpack single

spiraldb · Jun 10, 2024 · 220a784
1 parent 98f03f2
commit 220a784
Show file tree

Hide file tree

Showing 3 changed files with 91 additions and 5 deletions.
diff --git a/fastlanez/benches/fastlanes_bitpacking.rs b/fastlanez/benches/fastlanes_bitpacking.rs
@@ -33,17 +33,17 @@ fn bitpacking(c: &mut Criterion) {
         group.bench_function("old pack 16 -> 3", |b| {
             const WIDTH: usize = 3;
             let values = [3u16; 1024];
+            let mut packed = [MaybeUninit::new(0u8); 128 * WIDTH];
 
             b.iter(|| {
-                let mut packed = [MaybeUninit::new(0u8); 128 * WIDTH];
                 black_box(BitPack::<WIDTH>::pack(&values, &mut packed));
             });
         });
     }
 
     {
         let mut group = c.benchmark_group("bit-unpacking");
-        group.bench_function("unpack 16 <- 3 heap", |b| {
+        group.bench_function("unpack 16 <- 3 stack", |b| {
             const WIDTH: usize = 3;
             let values = [3u16; 1024];
             let mut packed = [0; 128 * WIDTH / size_of::<u16>()];
@@ -53,6 +53,34 @@ fn bitpacking(c: &mut Criterion) {
             b.iter(|| BitPack2::<WIDTH>::bitunpack(&packed, &mut unpacked));
         });
     }
+
+    { // Benchmarks random access: decoding one element per call instead of a whole 1024-element block.
+        let mut group = c.benchmark_group("unpack-single");
+        group.bench_function("unpack single 16 <- 3", |b| { // BitPack2 implementation.
+            const WIDTH: usize = 3; // Bits per packed value.
+            let values = [3u16; 1024];
+            let mut packed = [0; 128 * WIDTH / size_of::<u16>()]; // 128 * WIDTH bytes, expressed as a count of u16 words.
+            BitPack2::<WIDTH>::bitpack(&values, &mut packed); // Pack once, outside the timed closure.
+
+            b.iter(|| {
+                for i in 0..1024 { // Every timed iteration decodes all 1024 positions individually.
+                    black_box::<u16>(BitPack2::<WIDTH>::bitunpack_single(&packed, i)); // black_box keeps the result from being optimized away.
+                }
+            });
+        });
+        group.bench_function("unpack single old 16 <- 3", |b| { // BitPack implementation (labelled "old"), for comparison.
+            const WIDTH: usize = 3; // Same width as the benchmark above.
+            let values = [3u16; 1024];
+            let mut packed = [MaybeUninit::new(0u8); 128 * WIDTH]; // Byte-oriented output buffer of 128 * WIDTH bytes.
+            let packed = BitPack::<WIDTH>::pack(&values, &mut packed); // Shadows the buffer with whatever `pack` returns; packed once, outside the timed closure.
+
+            b.iter(|| {
+                for i in 0..1024 { // Same access pattern as above: all 1024 positions, one call each.
+                    black_box::<u16>(BitPack::<WIDTH>::unpack_single(&packed, i));
+                }
+            });
+        });
+    }
 }
 
 criterion_group!(benches, bitpacking);

diff --git a/fastlanez/src/bitpack.rs b/fastlanez/src/bitpack.rs
@@ -118,7 +118,7 @@ macro_rules! bitpack_impl {
                         }
                     }
 
-                    #[inline]
+                    #[inline(never)]
                     fn unpack_single(
                         input: &[u8; 128 * W],
                         index: usize
@@ -236,4 +236,17 @@ mod test {
             assert_eq!(decoded, *v);
         });
     }
+
+    #[test] // Round trip: pack 1024 values, then check that every position decodes individually to its original value.
+    fn test_unpack_single2() {
+        let input = (0u16..1024).collect::<Vec<_>>(); // Values 0..=1023, each equal to its own index.
+        let mut output = Vec::new();
+        TryBitPack::try_pack_into(array_ref![input, 0, 1024], 11, &mut output).unwrap(); // 11 is presumably the bit width (consistent with the length check below) — confirm against TryBitPack.
+        assert_eq!(output.len(), 1408); // 1408 == 128 * 11.
+
+        input.iter().enumerate().for_each(|(i, v)| {
+            let decoded = u16::try_unpack_single(&output, 11, i).unwrap(); // Decode only position `i`.
+            assert_eq!(decoded, *v);
+        });
+    }
 }
diff --git a/fastlanez/src/fl.rs b/fastlanez/src/fl.rs
@@ -29,6 +29,8 @@ where
 
     /// Unpacks W-bit elements into 1024 elements.
     fn bitunpack(input: &[Self; 128 * W / size_of::<Self>()], output: &mut [Self; 1024]);
+
+    fn bitunpack_single(input: &[Self; 128 * W / size_of::<Self>()], index: usize) -> Self; // Unpacks only the element at position `index` of the packed block.
 }
 
 // Macro for repeating a code block bit_size_of::<T> times.
@@ -60,7 +62,7 @@ macro_rules! impl_bitpacking {
                 #[inline(never)] // Makes it easier to disassemble and validate ASM.
                 #[allow(unused_assignments)] // Inlined loop gives unused assignment on final iteration
                 fn bitpack(input: &[Self; 1024], output: &mut [Self; 128 * W / size_of::<Self>()]) {
-                    let mask = ((1 << W) - 1);
+                    let mask = (1 << W) - 1;
 
                     // First we loop over each lane in the virtual 1024 bit word.
                     for i in 0..Self::LANES {
@@ -93,7 +95,7 @@ macro_rules! impl_bitpacking {
                     }
                 }
 
-                #[inline(never)] // Makes it easier to disassemble and validate ASM.
+                #[inline(never)]
                 fn bitunpack(input: &[Self; 128 * W / size_of::<Self>()], output: &mut [Self; 1024]) {
                     for i in 0..Self::LANES {
                         let mut src = input[i];
@@ -127,6 +129,34 @@ macro_rules! impl_bitpacking {
                         }});
                     }
                 }
+
+                #[inline(never)] // Decodes only the element at `index`, without unpacking the whole block.
+                fn bitunpack_single(input: &[Self; 128 * W / size_of::<Self>()], index: usize) -> Self {
+                    let lane_index = index % Self::LANES; // Lane that holds this element.
+                    let lane_start_bit = (index / Self::LANES) * Self::WIDTH; // Bit offset of the element within its lane: row * WIDTH.
+
+                    let (lsb, msb) = {
+                        // The value may straddle a word boundary, so fetch the words holding its first and last bit.
+                        let lane_start_word = lane_start_bit / Self::T; // presumably Self::T is the bit width of Self — TODO confirm
+                        let lane_end_word = (lane_start_bit + Self::WIDTH - 1) / Self::T; // Word containing the value's last bit.
+
+                        (
+                            input[lane_start_word * Self::LANES + lane_index],
+                            input[lane_end_word * Self::LANES + lane_index], // same word as above when the value does not straddle
+                        )
+                    };
+
+                    let shift = lane_start_bit % Self::T; // Bit offset of the value inside its first word.
+                    if shift == 0 {
+                        (lsb >> shift) & mask::<Self>(Self::WIDTH) // Word-aligned: the shift by 0 is a no-op, only the mask matters.
+                    } else {
+                        // This branch exists because with shift == 0 the left shift below would be by Self::T bits,
+                        // which overflows instead of shifting to zero. Any way to avoid the branch?
+                        let hi = msb << (Self::T - shift); // Upper part from the end word; lands entirely above the mask when the value does not straddle.
+                        let lo = (lsb >> shift); // Lower part from the start word. NOTE(review): the outer parentheses are redundant.
+                        (lo | hi) & mask::<Self>(Self::WIDTH) // presumably mask() yields the low WIDTH bits set — confirm against its definition
+                    }
+                }
             }
         }
     };
@@ -189,6 +219,21 @@ mod test {
 
                     assert_eq!(&unpacked, &values);
                 }
+
+                #[test] // Generated per ($T, $W): packs a block and checks that every position decodes individually to its original value.
+                fn [<try_unpack_single_ $T _ $W>]() {
+                    let mut values: [$T; 1024] = [0; 1024];
+                    for i in 0..1024 {
+                        values[i] = (i % (1 << $W)) as $T; // Wrap the index so each value fits in $W bits.
+                    }
+
+                    let mut packed = [0; 128 * $W / size_of::<$T>()]; // 128 * $W bytes, expressed as a count of $T words.
+                    BitPack2::<$W>::bitpack(&values, &mut packed);
+
+                    for (idx, value) in values.into_iter().enumerate() {
+                        assert_eq!(BitPack2::<$W>::bitunpack_single(&packed, idx), value); // Single-element decode must match the original input.
+                    }
+                }
             }
         };
     }