From eb96d243c1c711df15cd108c6c2ad9b239565e5f Mon Sep 17 00:00:00 2001 From: Matt Keeter Date: Wed, 24 Jan 2024 15:42:16 -0500 Subject: [PATCH] Switch to per-block ownership (#1107) At last, we're ready to switch to per-block (instead of per-byte) ownership! This means that buffers take up 50% less RAM, and do significantly less work during reads. Before (from #1094) 4K READ: bw=52.1MiB/s (54.6MB/s), 52.1MiB/s-52.1MiB/s (54.6MB/s-54.6MB/s), io=3126MiB (3278MB), run=60002-60002msec 1M READ: bw=355MiB/s (372MB/s), 355MiB/s-355MiB/s (372MB/s-372MB/s), io=20.8GiB (22.4GB), run=60080-60080msec 4M READ: bw=352MiB/s (369MB/s), 352MiB/s-352MiB/s (369MB/s-369MB/s), io=20.7GiB (22.2GB), run=60273-60273msec After 4K READ: bw=50.8MiB/s (53.2MB/s), 50.8MiB/s-50.8MiB/s (53.2MB/s-53.2MB/s), io=3046MiB (3194MB), run=60002-60002msec 1M READ: bw=644MiB/s (675MB/s), 644MiB/s-644MiB/s (675MB/s-675MB/s), io=37.8GiB (40.6GB), run=60038-60038msec 4M READ: bw=576MiB/s (604MB/s), 576MiB/s-576MiB/s (604MB/s-604MB/s), io=33.9GiB (36.4GB), run=60176-60176msec --- upstairs/src/in_memory.rs | 41 +++++++++-------- upstairs/src/lib.rs | 92 ++++++++++++++++++++++++++++----------- upstairs/src/volume.rs | 7 +-- 3 files changed, 91 insertions(+), 49 deletions(-) diff --git a/upstairs/src/in_memory.rs b/upstairs/src/in_memory.rs index 45b96520b..011a1c019 100644 --- a/upstairs/src/in_memory.rs +++ b/upstairs/src/in_memory.rs @@ -4,6 +4,8 @@ use super::*; struct Inner { bytes: Vec, + + /// Ownership is tracked per block owned: Vec, } @@ -16,12 +18,15 @@ pub struct InMemoryBlockIO { impl InMemoryBlockIO { pub fn new(id: Uuid, block_size: u64, total_size: usize) -> Self { + // TODO: make this take (block_count, block_size) so that it matches + // Buffer; this requires changing a bunch of call sites. + assert_eq!(total_size % block_size as usize, 0); Self { uuid: id, block_size, inner: Mutex::new(Inner { bytes: vec![0; total_size], - owned: vec![false; total_size], + owned: vec![false; total_size / block_size as usize], }), } } @@ -60,15 +65,15 @@ impl BlockIO for InMemoryBlockIO { offset: Block, data: &mut Buffer, ) -> Result<(), CrucibleError> { - self.check_data_size(data.len()).await?; + let bs = self.check_data_size(data.len()).await? as usize; let inner = self.inner.lock().await; - let start = offset.value as usize * self.block_size as usize; + let start = offset.value as usize * bs; - data.write_with_ownership( + data.write_if_owned( 0, &inner.bytes[start..][..data.len()], - &inner.owned[start..][..data.len()], + &inner.owned[offset.value as usize..][..data.len() / bs], ); Ok(()) @@ -80,14 +85,14 @@ impl BlockIO for InMemoryBlockIO { offset: Block, data: Bytes, ) -> Result<(), CrucibleError> { - self.check_data_size(data.len()).await?; + let bs = self.check_data_size(data.len()).await? as usize; let mut inner = self.inner.lock().await; - let start = offset.value as usize * self.block_size as usize; - - for i in 0..data.len() { - inner.bytes[start + i] = data[i]; - inner.owned[start + i] = true; + let start_block = offset.value as usize; + for (b, chunk) in data.chunks(bs).enumerate() { + let block = start_block + b; + inner.owned[block] = true; + inner.bytes[block * bs..][..bs].copy_from_slice(chunk); } Ok(()) @@ -98,15 +103,15 @@ impl BlockIO for InMemoryBlockIO { offset: Block, data: Bytes, ) -> Result<(), CrucibleError> { - self.check_data_size(data.len()).await?; + let bs = self.check_data_size(data.len()).await? as usize; let mut inner = self.inner.lock().await; - let start = offset.value as usize * self.block_size as usize; - - for i in 0..data.len() { - if !inner.owned[start + i] { - inner.bytes[start + i] = data[i]; - inner.owned[start + i] = true; + let start_block = offset.value as usize; + for (b, chunk) in data.chunks(bs).enumerate() { + let block = start_block + b; + if !inner.owned[block] { + inner.owned[block] = true; + inner.bytes[block * bs..][..bs].copy_from_slice(chunk); } } diff --git a/upstairs/src/lib.rs b/upstairs/src/lib.rs index db4256402..6af7aaadd 100644 --- a/upstairs/src/lib.rs +++ b/upstairs/src/lib.rs @@ -1507,6 +1507,10 @@ impl fmt::Display for AckStatus { pub struct Buffer { block_size: usize, data: Vec, + + /// Per-block ownership data + /// + /// `owned.len() == data.len() / block_size` owned: Vec, } @@ -1516,7 +1520,7 @@ impl Buffer { Buffer { block_size, data: vec![0; len], - owned: vec![false; len], + owned: vec![false; block_count], } } @@ -1526,14 +1530,14 @@ impl Buffer { Buffer { block_size, data: vec![v; len], - owned: vec![false; len], + owned: vec![false; block_count], } } pub fn with_capacity(block_count: usize, block_size: usize) -> Buffer { let len = block_count * block_size; let data = Vec::with_capacity(len); - let owned = Vec::with_capacity(len); + let owned = Vec::with_capacity(block_count); Buffer { block_size, @@ -1570,10 +1574,13 @@ impl Buffer { assert_eq!(data.len() % self.block_size, 0); self.data[offset..][..data.len()].copy_from_slice(data); - self.owned[offset..][..data.len()].fill(true); + self.owned[offset / self.block_size..][..data.len() / self.block_size] + .fill(true); } - /// Writes both data and ownership to the buffer + /// Writes data to the buffer where `owned` is true + /// + /// If `owned[i]` is `false`, then that block is not written /// /// # Panics /// - The offset must be block-aligned @@ -1582,29 +1589,35 @@ impl Buffer { /// - `data` and `owned` must be the same size /// /// If any of these conditions are not met, the function will panic. - pub(crate) fn write_with_ownership( + pub(crate) fn write_if_owned( &mut self, offset: usize, data: &[u8], owned: &[bool], ) { assert!(offset + data.len() <= self.data.len()); - assert_eq!(data.len(), owned.len()); assert_eq!(data.len() % self.block_size, 0); assert_eq!(offset % self.block_size, 0); - for i in 0..data.len() { - if owned[i] { - self.data[offset + i] = data[i]; - self.owned[offset + i] = true; + assert_eq!(data.len() / self.block_size, owned.len()); + + let start_block = offset / self.block_size; + for (b, chunk) in data.chunks(self.block_size).enumerate() { + debug_assert_eq!(chunk.len(), self.block_size); + if owned[b] { + let block = start_block + b; + self.owned[block] = true; + self.block_mut(block).copy_from_slice(chunk); } } } /// Writes a `ReadResponse` into the buffer, setting `owned` to true /// + /// The `ReadResponse` must contain a single block's worth of data. + /// /// # Panics /// - The offset must be block-aligned - /// - The response data length must be divisible by block size + /// - The response data length must be block size /// - Data cannot exceed the buffer's length /// /// If any of these conditions are not met, the function will panic. @@ -1617,15 +1630,16 @@ impl Buffer { assert_eq!(offset % self.block_size, 0); assert_eq!(response.data.len(), self.block_size); if !response.block_contexts.is_empty() { - self.data[offset..][..response.data.len()] - .copy_from_slice(&response.data); - self.owned[offset..][..response.data.len()].fill(true); + let block = offset / self.block_size; + self.owned[block] = true; + self.block_mut(block).copy_from_slice(&response.data); } } /// Reads buffer data into the given array /// - /// Values without `self.owned` are left unmodified + /// Only blocks with `self.owned` are changed; other blocks are left + /// unmodified. /// /// # Panics /// - The offset must be block-aligned @@ -1637,9 +1651,13 @@ impl Buffer { assert!(offset + data.len() <= self.data.len()); assert_eq!(offset % self.block_size, 0); assert_eq!(data.len() % self.block_size, 0); - for (i, d) in data.iter_mut().enumerate() { - if self.owned[offset + i] { - *d = self.data[offset + i]; + + let start_block = offset / self.block_size; + for (b, chunk) in data.chunks_mut(self.block_size).enumerate() { + debug_assert_eq!(chunk.len(), self.block_size); + let block = start_block + b; + if self.owned[block] { + chunk.copy_from_slice(self.block(block)); } } } @@ -1653,6 +1671,9 @@ impl Buffer { /// Consume and layer buffer contents on top of this one /// + /// The `buffer` argument will be reset to zero size, but preserves its + /// allocations for reuse. + /// /// # Panics /// - The offset must be block-aligned /// - Both buffers must have the same block size @@ -1661,12 +1682,13 @@ impl Buffer { pub(crate) fn eat(&mut self, offset: usize, buffer: &mut Buffer) { assert_eq!(offset % self.block_size, 0); assert_eq!(self.block_size, buffer.block_size); - for (i, (data, owned)) in - std::iter::zip(&buffer.data, &buffer.owned).enumerate() - { - if *owned { - self.data[offset + i] = *data; - self.owned[offset + i] = true; + + let start_block = offset / self.block_size; + for (b, (owned, chunk)) in buffer.blocks().enumerate() { + if owned { + let block = start_block + b; + self.owned[block] = true; + self.block_mut(block).copy_from_slice(chunk); } } @@ -1683,9 +1705,27 @@ impl Buffer { let len = block_count * block_size; self.data.resize(len, 0u8); - self.owned.resize(len, false); + self.owned.resize(block_count, false); self.block_size = block_size; } + + /// Returns a reference to a particular block + pub fn block(&self, b: usize) -> &[u8] { + &self.data[b * self.block_size..][..self.block_size] + } + + /// Returns a mutable reference to a particular block + pub fn block_mut(&mut self, b: usize) -> &mut [u8] { + &mut self.data[b * self.block_size..][..self.block_size] + } + + /// Returns an iterator over `(owned, block)` tuples + pub fn blocks(&self) -> impl Iterator { + self.owned + .iter() + .cloned() + .zip(self.data.chunks(self.block_size)) + } } impl std::ops::Deref for Buffer { diff --git a/upstairs/src/volume.rs b/upstairs/src/volume.rs index aa1e4f5af..f047f2209 100644 --- a/upstairs/src/volume.rs +++ b/upstairs/src/volume.rs @@ -2083,7 +2083,7 @@ mod test { .read(Block::new(0, BLOCK_SIZE.trailing_zeros()), &mut buffer) .await?; - assert_eq!(buffer.owned_ref(), &[false; 1024]); + assert_eq!(buffer.owned_ref(), &[false, false]); // Ownership is set by writing to blocks @@ -2105,10 +2105,7 @@ mod test { expected.extend(vec![0u8; 512]); assert_eq!(&*buffer, &expected); - - let mut expected_ownership = vec![true; 512]; - expected_ownership.extend(vec![false; 512]); - + let expected_ownership = [true, false]; assert_eq!(buffer.owned_ref(), &expected_ownership); // Ownership through a volume should be the same!!