From a318fcd8b15aa1bf1dd4cf4e7d20605e468fe656 Mon Sep 17 00:00:00 2001 From: Abner Zheng Date: Wed, 15 May 2024 19:33:11 +0800 Subject: [PATCH] start day4 --- mini-lsm-starter/README.md | 27 +++++ mini-lsm-starter/src/block/builder.rs | 4 +- mini-lsm-starter/src/tests.rs | 1 + mini-lsm-starter/src/tests/harness.rs | 13 --- mini-lsm-starter/src/tests/week1_day4.rs | 141 +++++++++++++++++++++++ 5 files changed, 171 insertions(+), 15 deletions(-) create mode 100644 mini-lsm-starter/src/tests/week1_day4.rs diff --git a/mini-lsm-starter/README.md b/mini-lsm-starter/README.md index 82f1c37..83704a0 100644 --- a/mini-lsm-starter/README.md +++ b/mini-lsm-starter/README.md @@ -22,3 +22,30 @@ Starter code for Mini-LSM. - No, freezing the memtable acquire the write lock, which required that there is no thread holding read lock, so readers can not hold old LSM state. * There are several places that you might first acquire a read lock on state, then drop it and acquire a write lock (these two operations might be in different functions but they happened sequentially due to one function calls the other). How does it differ from directly upgrading the read lock to a write lock? Is it necessary to upgrade instead of acquiring and dropping and what is the cost of doing the upgrade? - The critical section is different, if upgrading directly, the concurrency of system diminished a lot. + +### day 2 +#### Test Your Understanding +* What is the time/space complexity of using your merge iterator? +* Why do we need a self-referential structure for memtable iterator? +* If a key is removed (there is a delete tombstone), do you need to return it to the user? Where did you handle this logic? +* If a key has multiple versions, will the user see all of them? Where did you handle this logic? 
+* If we want to get rid of self-referential structure and have a lifetime on the memtable iterator (i.e., MemtableIterator<'a>, where 'a = memtable or LsmStorageInner lifetime), is it still possible to implement the scan functionality? +* What happens if (1) we create an iterator on the skiplist memtable (2) someone inserts new keys into the memtable (3) will the iterator see the new key? +* What happens if your key comparator cannot give the binary heap implementation a stable order? +* Why do we need to ensure the merge iterator returns data in the iterator construction order? +* Is it possible to implement a Rust-style iterator (i.e., next(&self) -> (Key, Value)) for LSM iterators? What are the pros/cons? +* The scan interface is like fn scan(&self, lower: Bound<&[u8]>, upper: Bound<&[u8]>). How to make this API compatible with Rust-style range (i.e., key_a..key_b)? If you implement this, try to pass a full range .. to the interface and see what will happen. +* The starter code provides the merge iterator interface to store Box<I> instead of I. What might be the reason behind that? + + +### day 3 +#### Test Your Understanding +* What is the time complexity of seeking a key in the block? +* Where does the cursor stop when you seek a non-existent key in your implementation? +* So Block is simply a vector of raw data and a vector of offsets. Can we change them to Bytes and Arc<[u16]>, and change all the iterator interfaces to return Bytes instead of &[u8]? (Assume that we use Bytes::slice to return a slice of the block without copying.) What are the pros/cons? +* What is the endianness of the numbers written into the blocks in your implementation? +* Is your implementation prone to a maliciously-built block? Will there be invalid memory access, or OOMs, if a user deliberately constructs an invalid block? +* Can a block contain duplicated keys? +* What happens if the user adds a key larger than the target block size? 
+* Consider the case that the LSM engine is built on object store services (S3). How would you optimize/change the block format and parameters to make it suitable for such services? +* Do you love bubble tea? Why or why not? \ No newline at end of file diff --git a/mini-lsm-starter/src/block/builder.rs b/mini-lsm-starter/src/block/builder.rs index 3a6651b..a69ecaf 100644 --- a/mini-lsm-starter/src/block/builder.rs +++ b/mini-lsm-starter/src/block/builder.rs @@ -1,6 +1,6 @@ -use bytes::BufMut; -use crate::key::{KeySlice, KeyVec}; use super::Block; +use crate::key::{KeySlice, KeyVec}; +use bytes::BufMut; /// Builds a block. pub struct BlockBuilder { diff --git a/mini-lsm-starter/src/tests.rs b/mini-lsm-starter/src/tests.rs index 69007ca..590f76e 100644 --- a/mini-lsm-starter/src/tests.rs +++ b/mini-lsm-starter/src/tests.rs @@ -5,3 +5,4 @@ mod harness; mod week1_day1; mod week1_day2; mod week1_day3; +mod week1_day4; diff --git a/mini-lsm-starter/src/tests/harness.rs b/mini-lsm-starter/src/tests/harness.rs index a6cdfd4..b41745b 100644 --- a/mini-lsm-starter/src/tests/harness.rs +++ b/mini-lsm-starter/src/tests/harness.rs @@ -94,11 +94,6 @@ where I: for<'a> StorageIterator = KeySlice<'a>>, { for (k, v) in expected { - println!( - "expected key: {:?}, actual key: {:?}", - k, - as_bytes(iter.key().for_testing_key_ref()) - ); assert!(iter.is_valid()); assert_eq!( k, @@ -155,13 +150,6 @@ where I: for<'a> StorageIterator = &'a [u8]>, { for (k, v) in expected { - println!( - "expected: {:?}/{:?}, actual: {:?}/{:?}", - k, - v, - as_bytes(iter.key()), - as_bytes(iter.value()), - ); assert!(iter.is_valid()); assert_eq!( k, @@ -184,7 +172,6 @@ where pub fn expect_iter_error(mut iter: impl StorageIterator) { loop { - println!("{:?}:{:?}", iter.key(), iter.value()); match iter.next() { Ok(_) if iter.is_valid() => continue, Ok(_) => panic!("expect an error"), diff --git a/mini-lsm-starter/src/tests/week1_day4.rs b/mini-lsm-starter/src/tests/week1_day4.rs new file mode 100644 index 
0000000..7f0cfd3 --- /dev/null +++ b/mini-lsm-starter/src/tests/week1_day4.rs @@ -0,0 +1,141 @@ +use std::sync::Arc; + +use bytes::Bytes; +use tempfile::{tempdir, TempDir}; + +use crate::iterators::StorageIterator; +use crate::key::{KeySlice, KeyVec}; +use crate::table::{SsTable, SsTableBuilder, SsTableIterator}; + +#[test] +fn test_sst_build_single_key() { + let mut builder = SsTableBuilder::new(16); + builder.add(KeySlice::for_testing_from_slice_no_ts(b"233"), b"233333"); + let dir = tempdir().unwrap(); + builder.build_for_test(dir.path().join("1.sst")).unwrap(); +} + +#[test] +fn test_sst_build_two_blocks() { + let mut builder = SsTableBuilder::new(16); + builder.add(KeySlice::for_testing_from_slice_no_ts(b"11"), b"11"); + builder.add(KeySlice::for_testing_from_slice_no_ts(b"22"), b"22"); + builder.add(KeySlice::for_testing_from_slice_no_ts(b"33"), b"11"); + builder.add(KeySlice::for_testing_from_slice_no_ts(b"44"), b"22"); + builder.add(KeySlice::for_testing_from_slice_no_ts(b"55"), b"11"); + builder.add(KeySlice::for_testing_from_slice_no_ts(b"66"), b"22"); + assert!(builder.meta.len() >= 2); + let dir = tempdir().unwrap(); + builder.build_for_test(dir.path().join("1.sst")).unwrap(); +} + +fn key_of(idx: usize) -> KeyVec { + KeyVec::for_testing_from_vec_no_ts(format!("key_{:03}", idx * 5).into_bytes()) +} + +fn value_of(idx: usize) -> Vec { + format!("value_{:010}", idx).into_bytes() +} + +fn num_of_keys() -> usize { + 100 +} + +fn generate_sst() -> (TempDir, SsTable) { + let mut builder = SsTableBuilder::new(128); + for idx in 0..num_of_keys() { + let key = key_of(idx); + let value = value_of(idx); + builder.add(key.as_key_slice(), &value[..]); + } + let dir = tempdir().unwrap(); + let path = dir.path().join("1.sst"); + (dir, builder.build_for_test(path).unwrap()) +} + +#[test] +fn test_sst_build_all() { + generate_sst(); +} + +#[test] +fn test_sst_decode() { + let (_dir, sst) = generate_sst(); + let meta = sst.block_meta.clone(); + let new_sst = 
SsTable::open_for_test(sst.file).unwrap(); + assert_eq!(new_sst.block_meta, meta); + assert_eq!( + new_sst.first_key().for_testing_key_ref(), + key_of(0).for_testing_key_ref() + ); + assert_eq!( + new_sst.last_key().for_testing_key_ref(), + key_of(num_of_keys() - 1).for_testing_key_ref() + ); +} + +fn as_bytes(x: &[u8]) -> Bytes { + Bytes::copy_from_slice(x) +} + +#[test] +fn test_sst_iterator() { + let (_dir, sst) = generate_sst(); + let sst = Arc::new(sst); + let mut iter = SsTableIterator::create_and_seek_to_first(sst).unwrap(); + for _ in 0..5 { + for i in 0..num_of_keys() { + let key = iter.key(); + let value = iter.value(); + assert_eq!( + key.for_testing_key_ref(), + key_of(i).for_testing_key_ref(), + "expected key: {:?}, actual key: {:?}", + as_bytes(key_of(i).for_testing_key_ref()), + as_bytes(key.for_testing_key_ref()) + ); + assert_eq!( + value, + value_of(i), + "expected value: {:?}, actual value: {:?}", + as_bytes(&value_of(i)), + as_bytes(value) + ); + iter.next().unwrap(); + } + iter.seek_to_first().unwrap(); + } +} + +#[test] +fn test_sst_seek_key() { + let (_dir, sst) = generate_sst(); + let sst = Arc::new(sst); + let mut iter = SsTableIterator::create_and_seek_to_key(sst, key_of(0).as_key_slice()).unwrap(); + for offset in 1..=5 { + for i in 0..num_of_keys() { + let key = iter.key(); + let value = iter.value(); + assert_eq!( + key.for_testing_key_ref(), + key_of(i).for_testing_key_ref(), + "expected key: {:?}, actual key: {:?}", + as_bytes(key_of(i).for_testing_key_ref()), + as_bytes(key.for_testing_key_ref()) + ); + assert_eq!( + value, + value_of(i), + "expected value: {:?}, actual value: {:?}", + as_bytes(&value_of(i)), + as_bytes(value) + ); + iter.seek_to_key(KeySlice::for_testing_from_slice_no_ts( + &format!("key_{:03}", i * 5 + offset).into_bytes(), + )) + .unwrap(); + } + iter.seek_to_key(KeySlice::for_testing_from_slice_no_ts(b"k")) + .unwrap(); + } +}