Skip to content

Commit

Permalink
Unpack single
Browse files Browse the repository at this point in the history
  • Loading branch information
gatesn committed Jun 10, 2024
1 parent 98f03f2 commit 220a784
Show file tree
Hide file tree
Showing 3 changed files with 91 additions and 5 deletions.
32 changes: 30 additions & 2 deletions fastlanez/benches/fastlanes_bitpacking.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,17 +33,17 @@ fn bitpacking(c: &mut Criterion) {
group.bench_function("old pack 16 -> 3", |b| {
const WIDTH: usize = 3;
let values = [3u16; 1024];
let mut packed = [MaybeUninit::new(0u8); 128 * WIDTH];

b.iter(|| {
let mut packed = [MaybeUninit::new(0u8); 128 * WIDTH];
black_box(BitPack::<WIDTH>::pack(&values, &mut packed));
});
});
}

{
let mut group = c.benchmark_group("bit-unpacking");
group.bench_function("unpack 16 <- 3 heap", |b| {
group.bench_function("unpack 16 <- 3 stack", |b| {
const WIDTH: usize = 3;
let values = [3u16; 1024];
let mut packed = [0; 128 * WIDTH / size_of::<u16>()];
Expand All @@ -53,6 +53,34 @@ fn bitpacking(c: &mut Criterion) {
b.iter(|| BitPack2::<WIDTH>::bitunpack(&packed, &mut unpacked));
});
}

{
// Benchmarks for random-access decoding: each timed iteration extracts all
// 1024 elements of one packed block, one call per element.
let mut group = c.benchmark_group("unpack-single");
// New implementation (`BitPack2`): input is packed into `u16` words.
group.bench_function("unpack single 16 <- 3", |b| {
const WIDTH: usize = 3;
let values = [3u16; 1024];
let mut packed = [0; 128 * WIDTH / size_of::<u16>()];
// Packing happens once, outside the timed closure.
BitPack2::<WIDTH>::bitpack(&values, &mut packed);

b.iter(|| {
for i in 0..1024 {
// `black_box` keeps each decoded value observable so the call is not
// optimized away.
black_box::<u16>(BitPack2::<WIDTH>::bitunpack_single(&packed, i));
}
});
});
// Previous implementation (`BitPack`): input is packed into a byte buffer.
group.bench_function("unpack single old 16 <- 3", |b| {
const WIDTH: usize = 3;
let values = [3u16; 1024];
let mut packed = [MaybeUninit::new(0u8); 128 * WIDTH];
// Shadow the scratch buffer with whatever `pack` returns; that value is
// what `unpack_single` reads from below.
let packed = BitPack::<WIDTH>::pack(&values, &mut packed);

b.iter(|| {
for i in 0..1024 {
black_box::<u16>(BitPack::<WIDTH>::unpack_single(&packed, i));
}
});
});
}
}

criterion_group!(benches, bitpacking);
Expand Down
15 changes: 14 additions & 1 deletion fastlanez/src/bitpack.rs
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ macro_rules! bitpack_impl {
}
}

#[inline]
#[inline(never)]
fn unpack_single(
input: &[u8; 128 * W],
index: usize
Expand Down Expand Up @@ -236,4 +236,17 @@ mod test {
assert_eq!(decoded, *v);
});
}

#[test]
fn test_unpack_single2() {
    // Pack the ramp 0..1024 at a width of 11, then read every element back
    // individually and compare it with the value that was packed.
    let values: Vec<u16> = (0u16..1024).collect();
    let mut packed = Vec::new();
    TryBitPack::try_pack_into(array_ref![values, 0, 1024], 11, &mut packed).unwrap();
    // 1024 * 11 / 8 == 1408
    assert_eq!(packed.len(), 1408);

    for (position, expected) in values.iter().enumerate() {
        let decoded = u16::try_unpack_single(&packed, 11, position).unwrap();
        assert_eq!(decoded, *expected);
    }
}
}
49 changes: 47 additions & 2 deletions fastlanez/src/fl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ where

/// Unpacks W-bit elements into 1024 elements.
fn bitunpack(input: &[Self; 128 * W / size_of::<Self>()], output: &mut [Self; 1024]);

/// Unpacks the single W-bit element at position `index` of the packed input,
/// without unpacking the other elements.
///
/// NOTE(review): `index` is presumably expected to be in `0..1024` — confirm.
fn bitunpack_single(input: &[Self; 128 * W / size_of::<Self>()], index: usize) -> Self;
}

// Macro for repeating a code block bit_size_of::<T> times.
Expand Down Expand Up @@ -60,7 +62,7 @@ macro_rules! impl_bitpacking {
#[inline(never)] // Makes it easier to disassemble and validate ASM.
#[allow(unused_assignments)] // Inlined loop gives unused assignment on final iteration
fn bitpack(input: &[Self; 1024], output: &mut [Self; 128 * W / size_of::<Self>()]) {
let mask = ((1 << W) - 1);
let mask = (1 << W) - 1;

// First we loop over each lane in the virtual 1024 bit word.
for i in 0..Self::LANES {
Expand Down Expand Up @@ -93,7 +95,7 @@ macro_rules! impl_bitpacking {
}
}

#[inline(never)] // Makes it easier to disassemble and validate ASM.
#[inline(never)]
fn bitunpack(input: &[Self; 128 * W / size_of::<Self>()], output: &mut [Self; 1024]) {
for i in 0..Self::LANES {
let mut src = input[i];
Expand Down Expand Up @@ -127,6 +129,34 @@ macro_rules! impl_bitpacking {
}});
}
}

#[inline(never)]
fn bitunpack_single(input: &[Self; 128 * W / size_of::<Self>()], index: usize) -> Self {
let lane_index = index % Self::LANES;
let lane_start_bit = (index / Self::LANES) * Self::WIDTH;

let (lsb, msb) = {
// the value may be split across two words
let lane_start_word = lane_start_bit / Self::T;
let lane_end_word = (lane_start_bit + Self::WIDTH - 1) / Self::T;

(
input[lane_start_word * Self::LANES + lane_index],
input[lane_end_word * Self::LANES + lane_index], // this may be a duplicate
)
};

let shift = lane_start_bit % Self::T;
if shift == 0 {
(lsb >> shift) & mask::<Self>(Self::WIDTH)
} else {
// If shift == 0, then this shift overflows, instead of shifting to zero.
// This forces us to introduce a branch. Any way to avoid?
let hi = msb << (Self::T - shift);
let lo = (lsb >> shift);
(lo | hi) & mask::<Self>(Self::WIDTH)
}
}
}
}
};
Expand Down Expand Up @@ -189,6 +219,21 @@ mod test {

assert_eq!(&unpacked, &values);
}

#[test]
fn [<try_unpack_single_ $T _ $W>]() {
    // Fill the block with a repeating ramp so that every value fits in $W bits.
    let mut values: [$T; 1024] = [0; 1024];
    values
        .iter_mut()
        .enumerate()
        .for_each(|(i, slot)| *slot = (i % (1 << $W)) as $T);

    let mut packed = [0; 128 * $W / size_of::<$T>()];
    BitPack2::<$W>::bitpack(&values, &mut packed);

    // Single-element access must return the original value at every position.
    for (idx, expected) in values.iter().copied().enumerate() {
        assert_eq!(BitPack2::<$W>::bitunpack_single(&packed, idx), expected);
    }
}
}
};
}
Expand Down

0 comments on commit 220a784

Please sign in to comment.