Skip to content

Commit

Permalink
Switch to per-block ownership (#1107)
Browse files Browse the repository at this point in the history
At last, we're ready to switch to per-block (instead of per-byte) ownership!
This means that buffers take up 50% less RAM, and do significantly less work
during reads.

Before (from #1094)
4K READ: bw=52.1MiB/s (54.6MB/s), 52.1MiB/s-52.1MiB/s (54.6MB/s-54.6MB/s), io=3126MiB (3278MB), run=60002-60002msec
1M READ: bw=355MiB/s (372MB/s), 355MiB/s-355MiB/s (372MB/s-372MB/s), io=20.8GiB (22.4GB), run=60080-60080msec
4M READ: bw=352MiB/s (369MB/s), 352MiB/s-352MiB/s (369MB/s-369MB/s), io=20.7GiB (22.2GB), run=60273-60273msec

After
4K READ: bw=50.8MiB/s (53.2MB/s), 50.8MiB/s-50.8MiB/s (53.2MB/s-53.2MB/s), io=3046MiB (3194MB), run=60002-60002msec
1M READ: bw=644MiB/s (675MB/s), 644MiB/s-644MiB/s (675MB/s-675MB/s), io=37.8GiB (40.6GB), run=60038-60038msec
4M READ: bw=576MiB/s (604MB/s), 576MiB/s-576MiB/s (604MB/s-604MB/s), io=33.9GiB (36.4GB), run=60176-60176msec
  • Loading branch information
mkeeter authored Jan 24, 2024
1 parent da60a6d commit eb96d24
Show file tree
Hide file tree
Showing 3 changed files with 91 additions and 49 deletions.
41 changes: 23 additions & 18 deletions upstairs/src/in_memory.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ use super::*;

struct Inner {
bytes: Vec<u8>,

/// Ownership is tracked per block
owned: Vec<bool>,
}

Expand All @@ -16,12 +18,15 @@ pub struct InMemoryBlockIO {

impl InMemoryBlockIO {
pub fn new(id: Uuid, block_size: u64, total_size: usize) -> Self {
// TODO: make this take (block_count, block_size) so that it matches
// Buffer; this requires changing a bunch of call sites.
assert_eq!(total_size % block_size as usize, 0);
Self {
uuid: id,
block_size,
inner: Mutex::new(Inner {
bytes: vec![0; total_size],
owned: vec![false; total_size],
owned: vec![false; total_size / block_size as usize],
}),
}
}
Expand Down Expand Up @@ -60,15 +65,15 @@ impl BlockIO for InMemoryBlockIO {
offset: Block,
data: &mut Buffer,
) -> Result<(), CrucibleError> {
self.check_data_size(data.len()).await?;
let bs = self.check_data_size(data.len()).await? as usize;
let inner = self.inner.lock().await;

let start = offset.value as usize * self.block_size as usize;
let start = offset.value as usize * bs;

data.write_with_ownership(
data.write_if_owned(
0,
&inner.bytes[start..][..data.len()],
&inner.owned[start..][..data.len()],
&inner.owned[offset.value as usize..][..data.len() / bs],
);

Ok(())
Expand All @@ -80,14 +85,14 @@ impl BlockIO for InMemoryBlockIO {
offset: Block,
data: Bytes,
) -> Result<(), CrucibleError> {
self.check_data_size(data.len()).await?;
let bs = self.check_data_size(data.len()).await? as usize;
let mut inner = self.inner.lock().await;

let start = offset.value as usize * self.block_size as usize;

for i in 0..data.len() {
inner.bytes[start + i] = data[i];
inner.owned[start + i] = true;
let start_block = offset.value as usize;
for (b, chunk) in data.chunks(bs).enumerate() {
let block = start_block + b;
inner.owned[block] = true;
inner.bytes[block * bs..][..bs].copy_from_slice(chunk);
}

Ok(())
Expand All @@ -98,15 +103,15 @@ impl BlockIO for InMemoryBlockIO {
offset: Block,
data: Bytes,
) -> Result<(), CrucibleError> {
self.check_data_size(data.len()).await?;
let bs = self.check_data_size(data.len()).await? as usize;
let mut inner = self.inner.lock().await;

let start = offset.value as usize * self.block_size as usize;

for i in 0..data.len() {
if !inner.owned[start + i] {
inner.bytes[start + i] = data[i];
inner.owned[start + i] = true;
let start_block = offset.value as usize;
for (b, chunk) in data.chunks(bs).enumerate() {
let block = start_block + b;
if !inner.owned[block] {
inner.owned[block] = true;
inner.bytes[block * bs..][..bs].copy_from_slice(chunk);
}
}

Expand Down
92 changes: 66 additions & 26 deletions upstairs/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1507,6 +1507,10 @@ impl fmt::Display for AckStatus {
pub struct Buffer {
block_size: usize,
data: Vec<u8>,

/// Per-block ownership data
///
/// `owned.len() == data.len() / block_size`
owned: Vec<bool>,
}

Expand All @@ -1516,7 +1520,7 @@ impl Buffer {
Buffer {
block_size,
data: vec![0; len],
owned: vec![false; len],
owned: vec![false; block_count],
}
}

Expand All @@ -1526,14 +1530,14 @@ impl Buffer {
Buffer {
block_size,
data: vec![v; len],
owned: vec![false; len],
owned: vec![false; block_count],
}
}

pub fn with_capacity(block_count: usize, block_size: usize) -> Buffer {
let len = block_count * block_size;
let data = Vec::with_capacity(len);
let owned = Vec::with_capacity(len);
let owned = Vec::with_capacity(block_count);

Buffer {
block_size,
Expand Down Expand Up @@ -1570,10 +1574,13 @@ impl Buffer {
assert_eq!(data.len() % self.block_size, 0);

self.data[offset..][..data.len()].copy_from_slice(data);
self.owned[offset..][..data.len()].fill(true);
self.owned[offset / self.block_size..][..data.len() / self.block_size]
.fill(true);
}

/// Writes both data and ownership to the buffer
/// Writes data to the buffer where `owned` is true
///
/// If `owned[i]` is `false`, then that block is not written
///
/// # Panics
/// - The offset must be block-aligned
Expand All @@ -1582,29 +1589,35 @@ impl Buffer {
/// - `data` and `owned` must be the same size
///
/// If any of these conditions are not met, the function will panic.
pub(crate) fn write_with_ownership(
pub(crate) fn write_if_owned(
&mut self,
offset: usize,
data: &[u8],
owned: &[bool],
) {
assert!(offset + data.len() <= self.data.len());
assert_eq!(data.len(), owned.len());
assert_eq!(data.len() % self.block_size, 0);
assert_eq!(offset % self.block_size, 0);
for i in 0..data.len() {
if owned[i] {
self.data[offset + i] = data[i];
self.owned[offset + i] = true;
assert_eq!(data.len() / self.block_size, owned.len());

let start_block = offset / self.block_size;
for (b, chunk) in data.chunks(self.block_size).enumerate() {
debug_assert_eq!(chunk.len(), self.block_size);
if owned[b] {
let block = start_block + b;
self.owned[block] = true;
self.block_mut(block).copy_from_slice(chunk);
}
}
}

/// Writes a `ReadResponse` into the buffer, setting `owned` to true
///
/// The `ReadResponse` must contain a single block's worth of data.
///
/// # Panics
/// - The offset must be block-aligned
/// - The response data length must be divisible by block size
/// - The response data length must be block size
/// - Data cannot exceed the buffer's length
///
/// If any of these conditions are not met, the function will panic.
Expand All @@ -1617,15 +1630,16 @@ impl Buffer {
assert_eq!(offset % self.block_size, 0);
assert_eq!(response.data.len(), self.block_size);
if !response.block_contexts.is_empty() {
self.data[offset..][..response.data.len()]
.copy_from_slice(&response.data);
self.owned[offset..][..response.data.len()].fill(true);
let block = offset / self.block_size;
self.owned[block] = true;
self.block_mut(block).copy_from_slice(&response.data);
}
}

/// Reads buffer data into the given array
///
/// Values without `self.owned` are left unmodified
/// Only blocks with `self.owned` are changed; other blocks are left
/// unmodified.
///
/// # Panics
/// - The offset must be block-aligned
Expand All @@ -1637,9 +1651,13 @@ impl Buffer {
assert!(offset + data.len() <= self.data.len());
assert_eq!(offset % self.block_size, 0);
assert_eq!(data.len() % self.block_size, 0);
for (i, d) in data.iter_mut().enumerate() {
if self.owned[offset + i] {
*d = self.data[offset + i];

let start_block = offset / self.block_size;
for (b, chunk) in data.chunks_mut(self.block_size).enumerate() {
debug_assert_eq!(chunk.len(), self.block_size);
let block = start_block + b;
if self.owned[block] {
chunk.copy_from_slice(self.block(block));
}
}
}
Expand All @@ -1653,6 +1671,9 @@ impl Buffer {

/// Consume and layer buffer contents on top of this one
///
/// The `buffer` argument will be reset to zero size, but preserves its
/// allocations for reuse.
///
/// # Panics
/// - The offset must be block-aligned
/// - Both buffers must have the same block size
Expand All @@ -1661,12 +1682,13 @@ impl Buffer {
pub(crate) fn eat(&mut self, offset: usize, buffer: &mut Buffer) {
assert_eq!(offset % self.block_size, 0);
assert_eq!(self.block_size, buffer.block_size);
for (i, (data, owned)) in
std::iter::zip(&buffer.data, &buffer.owned).enumerate()
{
if *owned {
self.data[offset + i] = *data;
self.owned[offset + i] = true;

let start_block = offset / self.block_size;
for (b, (owned, chunk)) in buffer.blocks().enumerate() {
if owned {
let block = start_block + b;
self.owned[block] = true;
self.block_mut(block).copy_from_slice(chunk);
}
}

Expand All @@ -1683,9 +1705,27 @@ impl Buffer {

let len = block_count * block_size;
self.data.resize(len, 0u8);
self.owned.resize(len, false);
self.owned.resize(block_count, false);
self.block_size = block_size;
}

/// Returns a shared (read-only) view of block `b`
///
/// The returned slice is exactly `self.block_size` bytes long; indexing
/// panics if `b` is out of range for the buffer.
pub fn block(&self, b: usize) -> &[u8] {
    let start = b * self.block_size;
    &self.data[start..start + self.block_size]
}

/// Returns an exclusive (mutable) view of block `b`
///
/// The returned slice is exactly `self.block_size` bytes long; indexing
/// panics if `b` is out of range for the buffer.
pub fn block_mut(&mut self, b: usize) -> &mut [u8] {
    let start = b * self.block_size;
    &mut self.data[start..start + self.block_size]
}

/// Returns an iterator over `(owned, block)` tuples
///
/// Each item pairs a block's ownership flag with the corresponding
/// `block_size`-byte slice of the data, in block order.
pub fn blocks(&self) -> impl Iterator<Item = (bool, &[u8])> {
    std::iter::zip(
        self.owned.iter().copied(),
        self.data.chunks(self.block_size),
    )
}
}

impl std::ops::Deref for Buffer {
Expand Down
7 changes: 2 additions & 5 deletions upstairs/src/volume.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2083,7 +2083,7 @@ mod test {
.read(Block::new(0, BLOCK_SIZE.trailing_zeros()), &mut buffer)
.await?;

assert_eq!(buffer.owned_ref(), &[false; 1024]);
assert_eq!(buffer.owned_ref(), &[false, false]);

// Ownership is set by writing to blocks

Expand All @@ -2105,10 +2105,7 @@ mod test {
expected.extend(vec![0u8; 512]);

assert_eq!(&*buffer, &expected);

let mut expected_ownership = vec![true; 512];
expected_ownership.extend(vec![false; 512]);

let expected_ownership = [true, false];
assert_eq!(buffer.owned_ref(), &expected_ownership);

// Ownership through a volume should be the same!!
Expand Down

0 comments on commit eb96d24

Please sign in to comment.