Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rust FastLanes #345

Closed
wants to merge 15 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
members = [
"bench-vortex",
"build-vortex",
"fastlanes",
"fastlanez",
"fastlanez-sys",
"pyvortex",
Expand Down
27 changes: 27 additions & 0 deletions fastlanes/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Pure-Rust port of the FastLanes bit-packing kernels.
# All package metadata is inherited from the workspace root to keep the
# crates in this repository consistent.
[package]
name = "fastlanes"
version.workspace = true
homepage.workspace = true
repository.workspace = true
authors.workspace = true
license.workspace = true
keywords.workspace = true
include.workspace = true
edition.workspace = true
rust-version.workspace = true

[dependencies]
arrayref = { workspace = true }
num-traits = { workspace = true }
paste = { workspace = true }
seq-macro = { workspace = true }

[dev-dependencies]
criterion = { workspace = true }

[lints]
workspace = true

# Criterion benchmark; `harness = false` hands `main` over to criterion
# instead of the default libtest bench harness.
[[bench]]
name = "fastlanes_bitpacking"
harness = false
65 changes: 65 additions & 0 deletions fastlanes/benches/fastlanes_bitpacking.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#![allow(incomplete_features)]
#![feature(generic_const_exprs)]

use std::mem::size_of;

use arrayref::{array_mut_ref, array_ref};
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use fastlanes::BitPacking;

/// Criterion benchmarks for the `BitPacking` kernels: full pack (heap- and
/// stack-allocated buffers), full unpack, and per-element
/// `bitunpack_single`, all at a fixed width of 3 bits over `u16`.
///
/// Inputs are routed through `black_box` so the optimizer cannot const-fold
/// the kernel away: the source arrays are compile-time constants, and
/// without the barrier LLVM is free to precompute the packed result.
fn bitpacking(c: &mut Criterion) {
    {
        let mut group = c.benchmark_group("bit-packing");
        group.bench_function("pack 16 -> 3 heap", |b| {
            const WIDTH: usize = 3;
            let values = vec![3u16; 1024];
            // 1024 values * WIDTH bits = 128 * WIDTH bytes of packed output.
            let mut packed = vec![0; 128 * WIDTH / size_of::<u16>()];

            b.iter(|| {
                BitPacking::bitpack::<WIDTH>(
                    black_box(array_ref![values, 0, 1024]),
                    array_mut_ref![packed, 0, 192],
                );
            });
        });

        group.bench_function("pack 16 -> 3 stack", |b| {
            const WIDTH: usize = 3;
            let values = [3u16; 1024];
            let mut packed = [0; 128 * WIDTH / size_of::<u16>()];
            b.iter(|| BitPacking::bitpack::<WIDTH>(black_box(&values), &mut packed));
        });
    }

    {
        let mut group = c.benchmark_group("bit-unpacking");
        group.bench_function("unpack 16 <- 3 stack", |b| {
            const WIDTH: usize = 3;
            let values = [3u16; 1024];
            let mut packed = [0; 128 * WIDTH / size_of::<u16>()];
            BitPacking::bitpack::<WIDTH>(&values, &mut packed);

            let mut unpacked = [0u16; 1024];
            b.iter(|| BitPacking::bitunpack::<WIDTH>(black_box(&packed), &mut unpacked));
        });
    }

    {
        let mut group = c.benchmark_group("unpack-single");
        group.bench_function("unpack single 16 <- 3", |b| {
            const WIDTH: usize = 3;
            let values = [3u16; 1024];
            let mut packed = [0; 128 * WIDTH / size_of::<u16>()];
            BitPacking::bitpack::<WIDTH>(&values, &mut packed);

            b.iter(|| {
                // black_box the output of every call so the loop is not
                // dead-code-eliminated.
                for i in 0..1024 {
                    black_box::<u16>(BitPacking::bitunpack_single::<WIDTH>(
                        black_box(&packed),
                        i,
                    ));
                }
            });
        });
    }
}

criterion_group!(benches, bitpacking);
criterion_main!(benches);
228 changes: 228 additions & 0 deletions fastlanes/src/bitpacking.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,228 @@
use std::mem::size_of;

use num_traits::{One, PrimInt, Unsigned};
use paste::paste;
use seq_macro::seq;

use crate::FastLanes;
use crate::{Pred, Satisfied};

/// Marker type carrying a candidate packed bit-width `W` as a const generic.
pub struct BitPackWidth<const W: usize>;
/// Implemented only for bit-widths that are valid when packing element type `T`.
pub trait SupportedBitPackWidth<T> {}
// A width is supported iff 0 < W < bit-width of T: zero-width packing is
// meaningless and full-width "packing" would be the identity.
impl<const W: usize, T> SupportedBitPackWidth<T> for BitPackWidth<W>
where
    Pred<{ W > 0 }>: Satisfied,
    Pred<{ W < 8 * size_of::<T>() }>: Satisfied,
{
}

/// BitPack into a compile-time known bit-width.
pub trait BitPacking: FastLanes {
    /// Packs 1024 elements into W bits each.
    /// The output is given as Self to ensure correct alignment.
    ///
    /// Output length is `128 * W / size_of::<Self>()` elements: 1024 values
    /// of W bits each occupy 128 * W bytes.
    fn bitpack<const W: usize>(
        input: &[Self; 1024],
        output: &mut [Self; 128 * W / size_of::<Self>()],
    ) where
        BitPackWidth<W>: SupportedBitPackWidth<Self>;

    /// Unpacks W-bit elements into 1024 elements.
    fn bitunpack<const W: usize>(
        input: &[Self; 128 * W / size_of::<Self>()],
        output: &mut [Self; 1024],
    ) where
        BitPackWidth<W>: SupportedBitPackWidth<Self>;

    /// Unpacks only the element at `index` (0..1024), avoiding a full
    /// 1024-element decode.
    fn bitunpack_single<const W: usize>(
        input: &[Self; 128 * W / size_of::<Self>()],
        index: usize,
    ) -> Self
    where
        BitPackWidth<W>: SupportedBitPackWidth<Self>;
}

// Macro for repeating a code block bit_size_of::<T> times.
// `$ident` is bound to a compile-time constant 0..bits via `seq!`, which is
// what lets the packing loops fully unroll with all branches resolved at
// compile time.
macro_rules! seq_type_width {
    ($ident:ident in u8 $body:tt) => {seq!($ident in 0..8 $body);};
    ($ident:ident in u16 $body:tt) => {seq!($ident in 0..16 $body);};
    ($ident:ident in u32 $body:tt) => {seq!($ident in 0..32 $body);};
    ($ident:ident in u64 $body:tt) => {seq!($ident in 0..64 $body);};
}

#[inline]
pub(crate) fn mask<T: PrimInt + Unsigned + One>(width: usize) -> T {
(T::one() << width) - T::one()
}

// We need to use a macro instead of generic impl since we have to know the bit-width of T ahead
// of time.
macro_rules! impl_bitpacking {
    ($T:ty) => {
        paste! {
            impl FastLanes for $T {}

            impl BitPacking for $T {
                /// Packs 1024 values (each truncated to its low W bits) into
                /// the interleaved FastLanes layout.
                #[inline(never)] // Makes it easier to disassemble and validate ASM.
                #[allow(unused_assignments)] // Inlined loop gives unused assignment on final iteration
                fn bitpack<const W: usize>(
                    input: &[Self; 1024],
                    output: &mut [Self; 128 * W / size_of::<Self>()],
                ) where BitPackWidth<W>: SupportedBitPackWidth<Self> {
                    // Mask selecting the low W bits; cannot overflow since W < Self::T.
                    let mask = (1 << W) - 1;

                    // First we loop over each lane in the virtual 1024 bit word.
                    for i in 0..Self::LANES {
                        let mut tmp: Self = 0;

                        // Loop over each of the rows of the lane.
                        // Inlining this loop means all branches are known at compile time and
                        // the code is auto-vectorized for SIMD execution.
                        seq_type_width!(row in $T {{
                            let src = input[Self::LANES * row + i] & mask;

                            // Shift the src bits into their position in the tmp output variable.
                            // (Parens are explicit: `%` binds tighter than `<<`.)
                            if row == 0 {
                                tmp = src;
                            } else {
                                tmp |= src << ((row * W) % Self::T);
                            }

                            // If the next input value overlaps with the next output, then we
                            // write out the tmp variable and bring forward the remaining bits.
                            let curr_pos: usize = (row * W) / Self::T;
                            let next_pos: usize = ((row + 1) * W) / Self::T;
                            if next_pos > curr_pos {
                                output[Self::LANES * curr_pos + i] = tmp;

                                // Bits of src that spill into the next output word. When
                                // remaining_bits == 0 the value ends exactly on the word
                                // boundary and `src >> W` is zero (src holds only W bits).
                                let remaining_bits: usize = ((row + 1) * W) % Self::T;
                                tmp = src >> (W - remaining_bits);
                            }
                        }});
                    }
                }

                /// Inverse of `bitpack`: expands 1024 W-bit values back into
                /// full-width elements.
                #[inline(never)]
                fn bitunpack<const W: usize>(
                    input: &[Self; 128 * W / size_of::<Self>()],
                    output: &mut [Self; 1024],
                ) where BitPackWidth<W>: SupportedBitPackWidth<Self> {
                    for i in 0..Self::LANES {
                        let mut src = input[i];
                        let mut tmp: Self;

                        seq_type_width!(row in $T {{
                            // Word indices (within this lane) of the first and last
                            // bit of this row's value; each lane spans W words.
                            let curr_pos: usize = (row * W) / Self::T;
                            let next_pos = ((row + 1) * W) / Self::T;

                            let shift = (row * W) % Self::T;

                            if next_pos > curr_pos {
                                // Consume some bits from the curr input, the remainder are in the next input
                                let remaining_bits = ((row + 1) * W) % Self::T;
                                let current_bits = W - remaining_bits;
                                tmp = (src >> shift) & mask::<Self>(current_bits);

                                if next_pos < W {
                                    // Load the next input value
                                    src = input[Self::LANES * next_pos + i];
                                    // Consume the remaining bits from the next input value.
                                    tmp |= (src & mask::<Self>(remaining_bits)) << current_bits;
                                }
                            } else {
                                // Otherwise, just grab W bits from the src value
                                tmp = (src >> shift) & mask::<Self>(W);
                            }

                            // Write out the unpacked value
                            output[(Self::LANES * row) + i] = tmp;
                        }});
                    }
                }

                /// Decodes the single element at `index` without touching the
                /// rest of the packed block.
                #[inline(never)]
                fn bitunpack_single<const W: usize>(
                    input: &[Self; 128 * W / size_of::<Self>()],
                    index: usize,
                ) -> Self where BitPackWidth<W>: SupportedBitPackWidth<Self> {
                    // Which lane the element lives in, and its bit offset within that lane.
                    let lane_index = index % Self::LANES;
                    let lane_start_bit = (index / Self::LANES) * W;

                    let (lsb, msb) = {
                        // the value may be split across two words
                        let lane_start_word = lane_start_bit / Self::T;
                        let lane_end_word = (lane_start_bit + W - 1) / Self::T;

                        (
                            input[lane_start_word * Self::LANES + lane_index],
                            input[lane_end_word * Self::LANES + lane_index], // this may be a duplicate
                        )
                    };

                    let shift = lane_start_bit % Self::T;
                    if shift == 0 {
                        (lsb >> shift) & mask::<Self>(W)
                    } else {
                        // If shift == 0, then this shift overflows, instead of shifting to zero.
                        // This forces us to introduce a branch. Any way to avoid?
                        let hi = msb << (Self::T - shift);
                        let lo = lsb >> shift;
                        (lo | hi) & mask::<Self>(W)
                    }
                }
            }
        }
    };
}

// Generate concrete FastLanes + BitPacking impls for every unsigned element
// width; a generic impl is impossible since the macro needs the bit-width
// of the type at expansion time.
impl_bitpacking!(u8);
impl_bitpacking!(u16);
impl_bitpacking!(u32);
impl_bitpacking!(u64);

#[cfg(test)]
// #[cfg(not(debug_assertions))] // Only run in release mode
mod test {
    use super::*;

    // Generates a full pack/unpack round-trip test and an unpack_single test
    // for one (element type, width) pair.
    macro_rules! test_round_trip {
        ($T:ty, $W:literal) => {
            paste! {
                #[test]
                fn [<try_round_trip_ $T _ $W>]() {
                    // Values cycle through the full W-bit range so every bit
                    // pattern representable at this width is exercised.
                    let mut values: [$T; 1024] = [0; 1024];
                    for i in 0..1024 {
                        values[i] = (i % (1 << $W)) as $T;
                    }

                    let mut packed = [0; 128 * $W / size_of::<$T>()];
                    BitPacking::bitpack::<$W>(&values, &mut packed);

                    let mut unpacked = [0; 1024];
                    BitPacking::bitunpack::<$W>(&packed, &mut unpacked);

                    assert_eq!(&unpacked, &values);
                }

                #[test]
                fn [<try_unpack_single_ $T _ $W>]() {
                    let mut values: [$T; 1024] = [0; 1024];
                    for i in 0..1024 {
                        values[i] = (i % (1 << $W)) as $T;
                    }

                    let mut packed = [0; 128 * $W / size_of::<$T>()];
                    BitPacking::bitpack::<$W>(&values, &mut packed);

                    // Every index must decode independently to its original value.
                    for (idx, value) in values.into_iter().enumerate() {
                        assert_eq!(BitPacking::bitunpack_single::<$W>(&packed, idx), value);
                    }
                }
            }
        };
    }

    // Upper bounds are exclusive: supported widths are 1..=bits-1, since
    // SupportedBitPackWidth requires W strictly less than the element width.
    seq!(W in 1..8 { test_round_trip!(u8, W); });
    seq!(W in 1..16 { test_round_trip!(u16, W); });
    seq!(W in 1..32 { test_round_trip!(u32, W); });
    seq!(W in 1..64 { test_round_trip!(u64, W); });
}
22 changes: 22 additions & 0 deletions fastlanes/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
//! Pure-Rust FastLanes kernels. Relies on the nightly `generic_const_exprs`
//! feature so array sizes like `128 * W / size_of::<Self>()` can appear in
//! signatures.
#![allow(incomplete_features)]
#![feature(generic_const_exprs)]

use std::mem::size_of;

use num_traits::{PrimInt, Unsigned};

mod bitpacking;
pub use bitpacking::*;

// NOTE(review): presumably the FastLanes transposed tile ordering of the
// eight sub-words — confirm against the FastLanes paper; nothing in this
// file consumes it.
pub const ORDER: [u8; 8] = [0, 4, 2, 6, 1, 5, 3, 7];

/// Common compile-time constants for FastLanes element types.
pub trait FastLanes: Sized + Unsigned + PrimInt {
    // Bit-width of Self.
    const T: usize = size_of::<Self>() * 8;
    // Number of lanes of Self in the 1024-bit virtual word.
    const LANES: usize = 1024 / Self::T;
}

/// Const-generic boolean predicate: `Pred<B>` implements `Satisfied` only
/// when `B == true`, emulating boolean where-clause conditions.
pub struct Pred<const B: bool>;

pub trait Satisfied {}

impl Satisfied for Pred<true> {}
15 changes: 14 additions & 1 deletion fastlanez/src/bitpack.rs
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ macro_rules! bitpack_impl {
}
}

#[inline]
#[inline(never)]
fn unpack_single(
input: &[u8; 128 * W],
index: usize
Expand Down Expand Up @@ -236,4 +236,17 @@ mod test {
assert_eq!(decoded, *v);
});
}

#[test]
fn test_unpack_single2() {
    // Pack 1024 distinct u16 values at 11 bits and verify that every element
    // decodes independently via try_unpack_single.
    let input = (0u16..1024).collect::<Vec<_>>();
    let mut output = Vec::new();
    TryBitPack::try_pack_into(array_ref![input, 0, 1024], 11, &mut output).unwrap();
    // 1024 values * 11 bits = 11264 bits = 1408 bytes.
    assert_eq!(output.len(), 1408);

    input.iter().enumerate().for_each(|(i, v)| {
        let decoded = u16::try_unpack_single(&output, 11, i).unwrap();
        assert_eq!(decoded, *v);
    });
}
}