From d9fa5921aee148b29729fb1cd1476de2683c9a77 Mon Sep 17 00:00:00 2001 From: Bai Chuan Date: Tue, 11 Jul 2023 23:20:45 +0800 Subject: [PATCH] provide raw store component base on RocksDB (#454) --- Cargo.toml | 14 +- moveos/moveos-store/Cargo.toml | 3 +- moveos/raw-store/Cargo.toml | 77 + moveos/raw-store/src/lib.rs | 22 + moveos/raw-store/src/macros.rs | 138 ++ moveos/raw-store/src/metrics.rs | 885 +++++++++ moveos/raw-store/src/rocks/errors.rs | 126 ++ moveos/raw-store/src/rocks/iter.rs | 171 ++ moveos/raw-store/src/rocks/keys.rs | 71 + moveos/raw-store/src/rocks/mod.rs | 2406 +++++++++++++++++++++++ moveos/raw-store/src/rocks/safe_iter.rs | 161 ++ moveos/raw-store/src/rocks/tests.rs | 1154 +++++++++++ moveos/raw-store/src/rocks/util.rs | 81 + moveos/raw-store/src/rocks/values.rs | 47 + moveos/raw-store/src/test_db.rs | 807 ++++++++ moveos/raw-store/src/traits.rs | 208 ++ 16 files changed, 6369 insertions(+), 2 deletions(-) create mode 100644 moveos/raw-store/Cargo.toml create mode 100644 moveos/raw-store/src/lib.rs create mode 100644 moveos/raw-store/src/macros.rs create mode 100644 moveos/raw-store/src/metrics.rs create mode 100644 moveos/raw-store/src/rocks/errors.rs create mode 100644 moveos/raw-store/src/rocks/iter.rs create mode 100644 moveos/raw-store/src/rocks/keys.rs create mode 100644 moveos/raw-store/src/rocks/mod.rs create mode 100644 moveos/raw-store/src/rocks/safe_iter.rs create mode 100644 moveos/raw-store/src/rocks/tests.rs create mode 100644 moveos/raw-store/src/rocks/util.rs create mode 100644 moveos/raw-store/src/rocks/values.rs create mode 100644 moveos/raw-store/src/test_db.rs create mode 100644 moveos/raw-store/src/traits.rs diff --git a/Cargo.toml b/Cargo.toml index 4d33e0f50..79d9574c5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,6 +9,7 @@ members = [ "moveos/moveos-stdlib-builder", "moveos/moveos", "moveos/moveos-common", + "moveos/raw-store", "crates/rooch-key", "crates/rooch-types", "crates/rooch-framework", @@ -61,6 +62,7 @@ moveos = 
{ path = "moveos/moveos" } moveos-cli = { path = "moveos/moveos-cli" } moveos-common = { path = "moveos/moveos-common" } moveos-verifier = { path = "moveos/moveos-verifier" } +raw-store = { path = "moveos/raw-store" } # crates for Rooch rooch = { path = "crates/rooch" } @@ -160,13 +162,23 @@ versions = "4.1.0" pretty_assertions = "1.2.0" syn = { version = "1.0.104", features = ["full", "extra-traits"] } quote = "1.0" -proc-macro2 = "1.0" +proc-macro2 = "1.0.47" derive-syn-parse = "0.1.5" unescape = "0.1.0" tempfile = "3.2.0" regex = "1.8.4" walkdir = "2.3.3" +rocksdb = { version = "0.21.0", features = ["snappy", "lz4", "zstd", "zlib", "multi-threaded-cf"], default-features = false } +bincode = "1.3.3" +collectable = "0.0.2" +fdlimit = "0.2.1" +tap = "1.0.1" +num_cpus = "1.14.0" +prometheus = "0.13.3" +hdrhistogram = "7.5.1" +ouroboros = "0.15.5" +rstest = "0.16.0" # Note: the BEGIN and END comments below are required for external tooling. Do not remove. # BEGIN MOVE DEPENDENCIES diff --git a/moveos/moveos-store/Cargo.toml b/moveos/moveos-store/Cargo.toml index 16ae771d4..261df2b83 100644 --- a/moveos/moveos-store/Cargo.toml +++ b/moveos/moveos-store/Cargo.toml @@ -26,4 +26,5 @@ move-core-types = { workspace = true } move-resource-viewer = { workspace = true } moveos-types = { workspace = true } -moveos-stdlib = { workspace = true } \ No newline at end of file +moveos-stdlib = { workspace = true } +raw-store = { workspace = true } \ No newline at end of file diff --git a/moveos/raw-store/Cargo.toml b/moveos/raw-store/Cargo.toml new file mode 100644 index 000000000..df434be27 --- /dev/null +++ b/moveos/raw-store/Cargo.toml @@ -0,0 +1,77 @@ +[package] +name = "raw-store" +version = "0.1.0" + +# Workspace inherited keys +authors = { workspace = true } +edition = { workspace = true } +homepage = { workspace = true } +license = { workspace = true } +publish = { workspace = true } +repository = { workspace = true } +rust-version = { workspace = true } + +# See more keys 
and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +#[dependencies] +#bcs = "0.1.4" +#bincode = "1.3.3" +#collectable = "0.0.2" +#eyre = "0.6.8" +#fdlimit = "0.2.1" +#once_cell = "1.15.0" +#tap = "1.0.1" +#num_cpus = "1.14.0" +#prometheus = "0.13.3" +#hdrhistogram = "7.5.1" +## deactivation of bzip2 due to https://github.com/rust-rocksdb/rust-rocksdb/issues/609 +#rocksdb = { version = "0.20.1", features = ["snappy", "lz4", "zstd", "zlib", "multi-threaded-cf"], default-features = false } +#serde = { version = "1.0.140", features = ["derive"] } +#thiserror = "1.0.37" +#tokio = { workspace = true, features = ["full", "test-util"] } +#tracing = "0.1.37" +#ouroboros = "0.15.5" +#rand = "0.8.5" +#async-trait = "0.1.57" +#itertools = "0.10.5" + +#sui-macros = { path = "../sui-macros" } +#workspace-hack = { version = "0.1", path = "../workspace-hack" } + +[dependencies] +anyhow = { workspace = true } +bcs = { workspace = true } +#smt = { workspace = true } +serde = { workspace = true } +serde_bytes = { workspace = true } +hex = { workspace = true } +parking_lot = { workspace = true } + +rocksdb = { workspace = true } +prometheus = { workspace = true } +tokio = { workspace = true } +bincode = { workspace = true } +collectable = { workspace = true } +once_cell = { workspace = true } +eyre = { workspace = true } +fdlimit = { workspace = true } +tap = { workspace = true } +num_cpus = { workspace = true } +hdrhistogram = { workspace = true } +ouroboros = { workspace = true } +rand = { workspace = true } +async-trait = { workspace = true } +itertools = { workspace = true } +thiserror = { workspace = true } +tracing = { workspace = true } +futures = { workspace = true } +rstest = { workspace = true } +tempfile = { workspace = true } + +move-core-types = { workspace = true } +move-resource-viewer = { workspace = true } + +# Most packages should depend on sui-simulator instead of directly on msim, but for typed-store +# that creates a circular 
dependency. +#[target.'cfg(msim)'.dependencies] +#msim = { git = "https://github.com/MystenLabs/mysten-sim.git", rev = "e9011f96b84615b63cd8b5835e606a2fc218a1bd", package = "msim" } diff --git a/moveos/raw-store/src/lib.rs b/moveos/raw-store/src/lib.rs new file mode 100644 index 000000000..be501ebdd --- /dev/null +++ b/moveos/raw-store/src/lib.rs @@ -0,0 +1,22 @@ +// Copyright (c) RoochNetwork +// SPDX-License-Identifier: Apache-2.0 + +// Copyright (c) 2021, Facebook, Inc. and its affiliates +// Copyright (c) Mysten Labs, Inc. +// SPDX-License-Identifier: Apache-2.0 +#![warn( + future_incompatible, + nonstandard_style, + rust_2018_idioms, + rust_2021_compatibility +)] + +pub mod traits; +pub use traits::Map; +pub mod rocks; +pub use rocks::RawStoreError; +pub mod macros; +pub mod metrics; +pub mod test_db; + +pub type StoreError = rocks::RawStoreError; diff --git a/moveos/raw-store/src/macros.rs b/moveos/raw-store/src/macros.rs new file mode 100644 index 000000000..be9ac0c5f --- /dev/null +++ b/moveos/raw-store/src/macros.rs @@ -0,0 +1,138 @@ +// Copyright (c) RoochNetwork +// SPDX-License-Identifier: Apache-2.0 + +// Copyright (c) Mysten Labs, Inc. +// SPDX-License-Identifier: Apache-2.0 + +use futures::future::BoxFuture; +use std::collections::HashMap; +use std::future::Future; +use std::sync::Arc; + +/// Simply evaluates expr. +#[macro_export] +macro_rules! 
nondeterministic { + ($expr: expr) => { + $expr + }; +} + +type FpCallback = dyn Fn() -> Option> + Send + Sync + 'static; +type FpMap = HashMap<&'static str, Arc>; + +fn with_fp_map(func: impl FnOnce(&mut FpMap) -> T) -> T { + use once_cell::sync::Lazy; + use std::sync::Mutex; + + static MAP: Lazy> = Lazy::new(Default::default); + let mut map = MAP.lock().unwrap(); + func(&mut map) +} + +fn get_callback(identifier: &'static str) -> Option> { + with_fp_map(|map| map.get(identifier).cloned()) +} + +pub fn handle_fail_point(identifier: &'static str) { + if let Some(callback) = get_callback(identifier) { + tracing::error!("hit failpoint {}", identifier); + assert!( + callback().is_none(), + "sync failpoint must not return future" + ); + } +} + +pub async fn handle_fail_point_async(identifier: &'static str) { + if let Some(callback) = get_callback(identifier) { + tracing::error!("hit async failpoint {}", identifier); + let fut = callback().expect("async callback must return future"); + fut.await; + } +} + +fn register_fail_point_impl( + identifier: &'static str, + callback: Arc Option> + Sync + Send + 'static>, +) { + with_fp_map(move |map| { + assert!( + map.insert(identifier, callback).is_none(), + "duplicate fail point registration" + ); + }) +} + +pub fn register_fail_point(identifier: &'static str, callback: impl Fn() + Sync + Send + 'static) { + register_fail_point_impl( + identifier, + Arc::new(move || { + callback(); + None + }), + ); +} + +pub fn register_fail_point_async( + identifier: &'static str, + callback: impl Fn() -> F + Sync + Send + 'static, +) where + F: Future + Sync + Send + 'static, +{ + register_fail_point_impl(identifier, Arc::new(move || Some(Box::pin(callback())))); +} + +pub fn register_fail_points( + identifiers: &[&'static str], + callback: impl Fn() + Sync + Send + 'static, +) { + let cb = Arc::new(move || { + callback(); + None + }); + for id in identifiers { + register_fail_point_impl(id, cb.clone()); + } +} + 
+#[cfg(not(any(fail_points)))] +#[macro_export] +macro_rules! fail_point { + ($tag: expr) => {}; +} + +#[cfg(not(any(fail_points)))] +#[macro_export] +macro_rules! fail_point_async { + ($tag: expr) => {}; +} + +// These tests need to be run in release mode, since debug mode does overflow checks by default! +#[cfg(test)] +mod test { + // use super::*; + + // Uncomment to test error messages + // #[with_checked_arithmetic] + // struct TestStruct; + + macro_rules! pass_through { + ($($tt:tt)*) => { + $($tt)* + } + } + + #[test] + fn test_skip_checked_arithmetic() { + // comment out this attr to test the error message + pass_through! { + fn unchecked_add(a: i32, b: i32) -> i32 { + a + b + } + } + + // this will not panic even if we pass in (i32::MAX, 1), because we skipped processing + // the item macro, so we also need to make sure it doesn't panic in debug mode. + unchecked_add(1, 2); + } +} diff --git a/moveos/raw-store/src/metrics.rs b/moveos/raw-store/src/metrics.rs new file mode 100644 index 000000000..a86ecde5a --- /dev/null +++ b/moveos/raw-store/src/metrics.rs @@ -0,0 +1,885 @@ +// Copyright (c) RoochNetwork +// SPDX-License-Identifier: Apache-2.0 + +// Copyright (c) Mysten Labs, Inc. +// SPDX-License-Identifier: Apache-2.0 +use once_cell::sync::OnceCell; +use prometheus::{ + register_histogram_vec_with_registry, register_int_counter_vec_with_registry, + register_int_gauge_vec_with_registry, HistogramVec, IntCounterVec, IntGaugeVec, Registry, +}; +use rocksdb::perf::set_perf_stats; +use rocksdb::{PerfContext, PerfMetric, PerfStatsLevel}; +use std::cell::RefCell; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::Arc; +use std::time::Duration; +use tap::TapFallible; +use tracing::warn; + +thread_local! 
{ + static PER_THREAD_ROCKS_PERF_CONTEXT: std::cell::RefCell = RefCell::new(PerfContext::default()); +} + +const LATENCY_SEC_BUCKETS: &[f64] = &[ + 0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1., 2.5, 5., 10., 20., 30., 60., 90., +]; + +#[derive(Debug, Clone)] +// A struct for sampling based on number of operations or duration. +// Sampling happens if the duration expires and after number of operations +pub struct SamplingInterval { + // Sample once every time duration + pub once_every_duration: Duration, + // Sample once every number of operations + pub after_num_ops: u64, + // Counter for keeping track of previous sample + pub counter: Arc, +} + +impl Default for SamplingInterval { + fn default() -> Self { + // Enabled with 60 second interval + SamplingInterval::new(Duration::from_secs(60), 0) + } +} + +impl SamplingInterval { + pub fn new(once_every_duration: Duration, after_num_ops: u64) -> Self { + let counter = Arc::new(AtomicU64::new(1)); + if !once_every_duration.is_zero() { + let counter = counter.clone(); + tokio::task::spawn(async move { + loop { + if counter.load(Ordering::SeqCst) > after_num_ops { + counter.store(0, Ordering::SeqCst); + } + tokio::time::sleep(once_every_duration).await; + } + }); + } + SamplingInterval { + once_every_duration, + after_num_ops, + counter, + } + } + pub fn new_from_self(&self) -> SamplingInterval { + SamplingInterval::new(self.once_every_duration, self.after_num_ops) + } + pub fn sample(&self) -> bool { + if self.once_every_duration.is_zero() { + self.counter.fetch_add(1, Ordering::Relaxed) % (self.after_num_ops + 1) == 0 + } else { + self.counter.fetch_add(1, Ordering::Relaxed) == 0 + } + } +} + +#[derive(Debug)] +pub struct ColumnFamilyMetrics { + pub rocksdb_total_sst_files_size: IntGaugeVec, + pub rocksdb_total_blob_files_size: IntGaugeVec, + pub rocksdb_size_all_mem_tables: IntGaugeVec, + pub rocksdb_num_snapshots: IntGaugeVec, + pub rocksdb_oldest_snapshot_time: IntGaugeVec, + pub 
rocksdb_actual_delayed_write_rate: IntGaugeVec, + pub rocksdb_is_write_stopped: IntGaugeVec, + pub rocksdb_block_cache_capacity: IntGaugeVec, + pub rocksdb_block_cache_usage: IntGaugeVec, + pub rocksdb_block_cache_pinned_usage: IntGaugeVec, + pub rocskdb_estimate_table_readers_mem: IntGaugeVec, + pub rocksdb_mem_table_flush_pending: IntGaugeVec, + pub rocskdb_compaction_pending: IntGaugeVec, + pub rocskdb_num_running_compactions: IntGaugeVec, + pub rocksdb_num_running_flushes: IntGaugeVec, + pub rocksdb_estimate_oldest_key_time: IntGaugeVec, + pub rocskdb_background_errors: IntGaugeVec, + pub rocksdb_estimated_num_keys: IntGaugeVec, +} + +impl ColumnFamilyMetrics { + pub(crate) fn new(registry: &Registry) -> Self { + ColumnFamilyMetrics { + rocksdb_total_sst_files_size: register_int_gauge_vec_with_registry!( + "rocksdb_total_sst_files_size", + "The storage size occupied by the sst files in the column family", + &["cf_name"], + registry, + ) + .unwrap(), + rocksdb_total_blob_files_size: register_int_gauge_vec_with_registry!( + "rocksdb_total_blob_files_size", + "The storage size occupied by the blob files in the column family", + &["cf_name"], + registry, + ) + .unwrap(), + rocksdb_size_all_mem_tables: register_int_gauge_vec_with_registry!( + "rocksdb_size_all_mem_tables", + "The memory size occupied by the column family's in-memory buffer", + &["cf_name"], + registry, + ) + .unwrap(), + rocksdb_num_snapshots: register_int_gauge_vec_with_registry!( + "rocksdb_num_snapshots", + "Number of snapshots held for the column family", + &["cf_name"], + registry, + ) + .unwrap(), + rocksdb_oldest_snapshot_time: register_int_gauge_vec_with_registry!( + "rocksdb_oldest_snapshot_time", + "Unit timestamp of the oldest unreleased snapshot", + &["cf_name"], + registry, + ) + .unwrap(), + rocksdb_actual_delayed_write_rate: register_int_gauge_vec_with_registry!( + "rocksdb_actual_delayed_write_rate", + "The current actual delayed write rate. 
0 means no delay", + &["cf_name"], + registry, + ) + .unwrap(), + rocksdb_is_write_stopped: register_int_gauge_vec_with_registry!( + "rocksdb_is_write_stopped", + "A flag indicating whether writes are stopped on this column family. 1 indicates writes have been stopped.", + &["cf_name"], + registry, + ) + .unwrap(), + rocksdb_block_cache_capacity: register_int_gauge_vec_with_registry!( + "rocksdb_block_cache_capacity", + "The block cache capacity of the column family.", + &["cf_name"], + registry, + ) + .unwrap(), + rocksdb_block_cache_usage: register_int_gauge_vec_with_registry!( + "rocksdb_block_cache_usage", + "The memory size used by the column family in the block cache.", + &["cf_name"], + registry, + ) + .unwrap(), + rocksdb_block_cache_pinned_usage: register_int_gauge_vec_with_registry!( + "rocksdb_block_cache_pinned_usage", + "The memory size used by the column family in the block cache where entries are pinned", + &["cf_name"], + registry, + ) + .unwrap(), + rocskdb_estimate_table_readers_mem: register_int_gauge_vec_with_registry!( + "rocskdb_estimate_table_readers_mem", + "The estimated memory size used for reading SST tables in this column + family such as filters and index blocks. Note that this number does not + include the memory used in block cache.", + &["cf_name"], + registry, + ) + .unwrap(), + rocksdb_mem_table_flush_pending: register_int_gauge_vec_with_registry!( + "rocksdb_mem_table_flush_pending", + "A 1 or 0 flag indicating whether a memtable flush is pending. + If this number is 1, it means a memtable is waiting for being flushed, + but there might be too many L0 files that prevents it from being flushed.", + &["cf_name"], + registry, + ) + .unwrap(), + rocskdb_compaction_pending: register_int_gauge_vec_with_registry!( + "rocskdb_compaction_pending", + "A 1 or 0 flag indicating whether a compaction job is pending. 
+ If this number is 1, it means some part of the column family requires + compaction in order to maintain shape of LSM tree, but the compaction + is pending because the desired compaction job is either waiting for + other dependent compactions to be finished or waiting for an available + compaction thread.", + &["cf_name"], + registry, + ) + .unwrap(), + rocskdb_num_running_compactions: register_int_gauge_vec_with_registry!( + "rocskdb_num_running_compactions", + "The number of compactions that are currently running for the column family.", + &["cf_name"], + registry, + ) + .unwrap(), + rocksdb_num_running_flushes: register_int_gauge_vec_with_registry!( + "rocksdb_num_running_flushes", + "The number of flushes that are currently running for the column family.", + &["cf_name"], + registry, + ) + .unwrap(), + rocksdb_estimate_oldest_key_time: register_int_gauge_vec_with_registry!( + "rocksdb_estimate_oldest_key_time", + "Estimation of the oldest key timestamp in the DB. Only available + for FIFO compaction with compaction_options_fifo.allow_compaction = false.", + &["cf_name"], + registry, + ) + .unwrap(), + rocksdb_estimated_num_keys: register_int_gauge_vec_with_registry!( + "rocksdb_estimated_num_keys", + "The estimated number of keys in the table", + &["cf_name"], + registry, + ) + .unwrap(), + rocskdb_background_errors: register_int_gauge_vec_with_registry!( + "rocskdb_background_errors", + "The accumulated number of RocksDB background errors.", + &["cf_name"], + registry, + ) + .unwrap(), + + } + } +} + +#[derive(Debug)] +pub struct OperationMetrics { + pub rocksdb_iter_latency_seconds: HistogramVec, + pub rocksdb_iter_bytes: HistogramVec, + pub rocksdb_iter_keys: HistogramVec, + pub rocksdb_get_latency_seconds: HistogramVec, + pub rocksdb_get_bytes: HistogramVec, + pub rocksdb_multiget_latency_seconds: HistogramVec, + pub rocksdb_multiget_bytes: HistogramVec, + pub rocksdb_put_latency_seconds: HistogramVec, + pub rocksdb_put_bytes: HistogramVec, + pub 
rocksdb_delete_latency_seconds: HistogramVec, + pub rocksdb_deletes: IntCounterVec, + pub rocksdb_batch_commit_latency_seconds: HistogramVec, + pub rocksdb_batch_commit_bytes: HistogramVec, +} + +impl OperationMetrics { + pub(crate) fn new(registry: &Registry) -> Self { + OperationMetrics { + rocksdb_iter_latency_seconds: register_histogram_vec_with_registry!( + "rocksdb_iter_latency_seconds", + "Rocksdb iter latency in seconds", + &["cf_name"], + LATENCY_SEC_BUCKETS.to_vec(), + registry, + ) + .unwrap(), + rocksdb_iter_bytes: register_histogram_vec_with_registry!( + "rocksdb_iter_bytes", + "Rocksdb iter size in bytes", + &["cf_name"], + registry, + ) + .unwrap(), + rocksdb_iter_keys: register_histogram_vec_with_registry!( + "rocksdb_iter_keys", + "Rocksdb iter num keys", + &["cf_name"], + registry, + ) + .unwrap(), + rocksdb_get_latency_seconds: register_histogram_vec_with_registry!( + "rocksdb_get_latency_seconds", + "Rocksdb get latency in seconds", + &["cf_name"], + LATENCY_SEC_BUCKETS.to_vec(), + registry, + ) + .unwrap(), + rocksdb_get_bytes: register_histogram_vec_with_registry!( + "rocksdb_get_bytes", + "Rocksdb get call returned data size in bytes", + &["cf_name"], + registry + ) + .unwrap(), + rocksdb_multiget_latency_seconds: register_histogram_vec_with_registry!( + "rocksdb_multiget_latency_seconds", + "Rocksdb multiget latency in seconds", + &["cf_name"], + LATENCY_SEC_BUCKETS.to_vec(), + registry, + ) + .unwrap(), + rocksdb_multiget_bytes: register_histogram_vec_with_registry!( + "rocksdb_multiget_bytes", + "Rocksdb multiget call returned data size in bytes", + &["cf_name"], + registry, + ) + .unwrap(), + rocksdb_put_latency_seconds: register_histogram_vec_with_registry!( + "rocksdb_put_latency_seconds", + "Rocksdb put latency in seconds", + &["cf_name"], + LATENCY_SEC_BUCKETS.to_vec(), + registry, + ) + .unwrap(), + rocksdb_put_bytes: register_histogram_vec_with_registry!( + "rocksdb_put_bytes", + "Rocksdb put call puts data size in bytes", + 
&["cf_name"], + registry, + ) + .unwrap(), + rocksdb_delete_latency_seconds: register_histogram_vec_with_registry!( + "rocksdb_delete_latency_seconds", + "Rocksdb delete latency in seconds", + &["cf_name"], + LATENCY_SEC_BUCKETS.to_vec(), + registry, + ) + .unwrap(), + rocksdb_deletes: register_int_counter_vec_with_registry!( + "rocksdb_deletes", + "Rocksdb delete calls", + &["cf_name"], + registry + ) + .unwrap(), + rocksdb_batch_commit_latency_seconds: register_histogram_vec_with_registry!( + "rocksdb_write_batch_commit_latency_seconds", + "Rocksdb schema batch commit latency in seconds", + &["db_name"], + LATENCY_SEC_BUCKETS.to_vec(), + registry, + ) + .unwrap(), + rocksdb_batch_commit_bytes: register_histogram_vec_with_registry!( + "rocksdb_batch_commit_bytes", + "Rocksdb schema batch commit size in bytes", + &["db_name"], + registry, + ) + .unwrap(), + } + } +} + +pub struct RocksDBPerfContext; + +impl Default for RocksDBPerfContext { + fn default() -> Self { + set_perf_stats(PerfStatsLevel::EnableTime); + PER_THREAD_ROCKS_PERF_CONTEXT.with(|perf_context| { + perf_context.borrow_mut().reset(); + }); + RocksDBPerfContext {} + } +} + +impl Drop for RocksDBPerfContext { + fn drop(&mut self) { + set_perf_stats(PerfStatsLevel::Disable); + } +} + +#[derive(Debug)] +pub struct ReadPerfContextMetrics { + pub user_key_comparison_count: IntCounterVec, + pub block_cache_hit_count: IntCounterVec, + pub block_read_count: IntCounterVec, + pub block_read_byte: IntCounterVec, + pub block_read_nanos: IntCounterVec, + pub block_checksum_nanos: IntCounterVec, + pub block_decompress_nanos: IntCounterVec, + pub get_read_bytes: IntCounterVec, + pub multiget_read_bytes: IntCounterVec, + pub get_snapshot_nanos: IntCounterVec, + pub get_from_memtable_nanos: IntCounterVec, + pub get_from_memtable_count: IntCounterVec, + pub get_post_process_nanos: IntCounterVec, + pub get_from_output_files_nanos: IntCounterVec, + pub db_mutex_lock_nanos: IntCounterVec, + pub db_condition_wait_nanos: 
IntCounterVec, + pub merge_operator_nanos: IntCounterVec, + pub read_index_block_nanos: IntCounterVec, + pub read_filter_block_nanos: IntCounterVec, + pub new_table_block_iter_nanos: IntCounterVec, + pub block_seek_nanos: IntCounterVec, + pub find_table_nanos: IntCounterVec, + pub bloom_memtable_hit_count: IntCounterVec, + pub bloom_memtable_miss_count: IntCounterVec, + pub bloom_sst_hit_count: IntCounterVec, + pub bloom_sst_miss_count: IntCounterVec, + pub key_lock_wait_time: IntCounterVec, + pub key_lock_wait_count: IntCounterVec, + pub internal_delete_skipped_count: IntCounterVec, + pub internal_skipped_count: IntCounterVec, +} + +impl ReadPerfContextMetrics { + pub(crate) fn new(registry: &Registry) -> Self { + ReadPerfContextMetrics { + user_key_comparison_count: register_int_counter_vec_with_registry!( + "user_key_comparison_count", + "Helps us figure out whether too many comparisons in binary search can be a problem, + especially when a more expensive comparator is used. Moreover, since number of comparisons + is usually uniform based on the memtable size, the SST file size for Level 0 and size of other + levels, an significant increase of the counter can indicate unexpected LSM-tree shape. + You may want to check whether flush/compaction can keep up with the write speed", + &["cf_name"], + registry, + ) + .unwrap(), + block_cache_hit_count: register_int_counter_vec_with_registry!( + "block_cache_hit_count", + "Tells us how many times we read data blocks from block cache, and block_read_count tells us how many + times we have to read blocks from the file system (either block cache is disabled or it is a cache miss). 
+ We can evaluate the block cache efficiency by looking at the two counters over time.", + &["cf_name"], + registry, + ) + .unwrap(), + block_read_count: register_int_counter_vec_with_registry!( + "block_read_count", + "Tells us how many times we have to read blocks from the file system (either block cache is disabled or it is a cache miss)", + &["cf_name"], + registry, + ) + .unwrap(), + block_read_byte: register_int_counter_vec_with_registry!( + "block_read_byte", + "Tells us how many total bytes we read from the file system. It can tell us whether a slow query can be caused by reading + large blocks from the file system. Index and bloom filter blocks are usually large blocks. A large block can also be the result + of a very large key or value", + &["cf_name"], + registry, + ) + .unwrap(), + block_read_nanos: register_int_counter_vec_with_registry!( + "block_read_nanos", + "Total nanos spent on block reads", + &["cf_name"], + registry, + ) + .unwrap(), + block_checksum_nanos: register_int_counter_vec_with_registry!( + "block_checksum_nanos", + "Total nanos spent on verifying block checksum", + &["cf_name"], + registry, + ) + .unwrap(), + block_decompress_nanos: register_int_counter_vec_with_registry!( + "block_decompress_nanos", + "Total nanos spent on decompressing a block", + &["cf_name"], + registry, + ) + .unwrap(), + get_read_bytes: register_int_counter_vec_with_registry!( + "get_read_bytes", + "Total bytes for values returned by Get", + &["cf_name"], + registry, + ) + .unwrap(), + multiget_read_bytes: register_int_counter_vec_with_registry!( + "multiget_read_bytes", + "Total bytes for values returned by MultiGet.", + &["cf_name"], + registry, + ) + .unwrap(), + get_snapshot_nanos: register_int_counter_vec_with_registry!( + "get_snapshot_nanos", + "Time spent in getting snapshot.", + &["cf_name"], + registry, + ) + .unwrap(), + get_from_memtable_nanos: register_int_counter_vec_with_registry!( + "get_from_memtable_nanos", + "Time spent on reading data from 
memtable.", + &["cf_name"], + registry, + ) + .unwrap(), + get_from_memtable_count: register_int_counter_vec_with_registry!( + "get_from_memtable_count", + "Number of memtables queried", + &["cf_name"], + registry, + ) + .unwrap(), + get_post_process_nanos: register_int_counter_vec_with_registry!( + "get_post_process_nanos", + "Total nanos spent after Get() finds a key", + &["cf_name"], + registry, + ) + .unwrap(), + get_from_output_files_nanos: register_int_counter_vec_with_registry!( + "get_from_output_files_nanos", + "Total nanos reading from output files", + &["cf_name"], + registry, + ) + .unwrap(), + db_mutex_lock_nanos: register_int_counter_vec_with_registry!( + "db_mutex_lock_nanos", + "Time spent on acquiring db mutex", + &["cf_name"], + registry, + ) + .unwrap(), + db_condition_wait_nanos: register_int_counter_vec_with_registry!( + "db_condition_wait_nanos", + "Time spent waiting with a condition variable created with DB Mutex.", + &["cf_name"], + registry, + ) + .unwrap(), + merge_operator_nanos: register_int_counter_vec_with_registry!( + "merge_operator_nanos", + "Time spent on merge operator.", + &["cf_name"], + registry, + ) + .unwrap(), + read_index_block_nanos: register_int_counter_vec_with_registry!( + "read_index_block_nanos", + "Time spent on reading index block from block cache or SST file", + &["cf_name"], + registry, + ) + .unwrap(), + read_filter_block_nanos: register_int_counter_vec_with_registry!( + "read_filter_block_nanos", + "Time spent on reading filter block from block cache or SST file", + &["cf_name"], + registry, + ) + .unwrap(), + new_table_block_iter_nanos: register_int_counter_vec_with_registry!( + "new_table_block_iter_nanos", + "Time spent on creating data block iterator", + &["cf_name"], + registry, + ) + .unwrap(), + block_seek_nanos: register_int_counter_vec_with_registry!( + "block_seek_nanos", + "Time spent on seeking a key in data/index blocks", + &["cf_name"], + registry, + ) + .unwrap(), + find_table_nanos: 
register_int_counter_vec_with_registry!( + "find_table_nanos", + "Time spent on finding or creating a table reader", + &["cf_name"], + registry, + ) + .unwrap(), + bloom_memtable_hit_count: register_int_counter_vec_with_registry!( + "bloom_memtable_hit_count", + "Total number of mem table bloom hits", + &["cf_name"], + registry, + ) + .unwrap(), + bloom_memtable_miss_count: register_int_counter_vec_with_registry!( + "bloom_memtable_miss_count", + "Total number of mem table bloom misses", + &["cf_name"], + registry, + ) + .unwrap(), + bloom_sst_hit_count: register_int_counter_vec_with_registry!( + "bloom_sst_hit_count", + "Total number of SST table bloom hits", + &["cf_name"], + registry, + ) + .unwrap(), + bloom_sst_miss_count: register_int_counter_vec_with_registry!( + "bloom_sst_miss_count", + "Total number of SST table bloom misses", + &["cf_name"], + registry, + ) + .unwrap(), + key_lock_wait_time: register_int_counter_vec_with_registry!( + "key_lock_wait_time", + "Time spent waiting on key locks in transaction lock manager", + &["cf_name"], + registry, + ) + .unwrap(), + key_lock_wait_count: register_int_counter_vec_with_registry!( + "key_lock_wait_count", + "Number of times acquiring a lock was blocked by another transaction", + &["cf_name"], + registry, + ) + .unwrap(), + internal_delete_skipped_count: register_int_counter_vec_with_registry!( + "internal_delete_skipped_count", + "Total number of deleted keys skipped during iteration", + &["cf_name"], + registry, + ) + .unwrap(), + internal_skipped_count: register_int_counter_vec_with_registry!( + "internal_skipped_count", + "Totall number of internal keys skipped during iteration", + &["cf_name"], + registry, + ) + .unwrap(), + } + } + + pub fn report_metrics(&self, cf_name: &str) { + PER_THREAD_ROCKS_PERF_CONTEXT.with(|perf_context_cell| { + set_perf_stats(PerfStatsLevel::Disable); + let perf_context = perf_context_cell.borrow(); + self.user_key_comparison_count + .with_label_values(&[cf_name]) + 
.inc_by(perf_context.metric(PerfMetric::UserKeyComparisonCount)); + self.block_cache_hit_count + .with_label_values(&[cf_name]) + .inc_by(perf_context.metric(PerfMetric::BlockCacheHitCount)); + self.block_read_count + .with_label_values(&[cf_name]) + .inc_by(perf_context.metric(PerfMetric::BlockReadCount)); + self.block_read_byte + .with_label_values(&[cf_name]) + .inc_by(perf_context.metric(PerfMetric::BlockReadByte)); + self.block_read_nanos + .with_label_values(&[cf_name]) + .inc_by(perf_context.metric(PerfMetric::BlockReadTime)); + self.block_read_count + .with_label_values(&[cf_name]) + .inc_by(perf_context.metric(PerfMetric::BlockReadCount)); + self.block_checksum_nanos + .with_label_values(&[cf_name]) + .inc_by(perf_context.metric(PerfMetric::BlockChecksumTime)); + self.block_decompress_nanos + .with_label_values(&[cf_name]) + .inc_by(perf_context.metric(PerfMetric::BlockDecompressTime)); + self.get_read_bytes + .with_label_values(&[cf_name]) + .inc_by(perf_context.metric(PerfMetric::GetReadBytes)); + self.multiget_read_bytes + .with_label_values(&[cf_name]) + .inc_by(perf_context.metric(PerfMetric::MultigetReadBytes)); + self.get_snapshot_nanos + .with_label_values(&[cf_name]) + .inc_by(perf_context.metric(PerfMetric::GetSnapshotTime)); + self.get_from_memtable_nanos + .with_label_values(&[cf_name]) + .inc_by(perf_context.metric(PerfMetric::GetFromMemtableTime)); + self.get_from_memtable_count + .with_label_values(&[cf_name]) + .inc_by(perf_context.metric(PerfMetric::GetFromMemtableCount)); + self.get_post_process_nanos + .with_label_values(&[cf_name]) + .inc_by(perf_context.metric(PerfMetric::GetPostProcessTime)); + self.get_from_output_files_nanos + .with_label_values(&[cf_name]) + .inc_by(perf_context.metric(PerfMetric::GetFromOutputFilesTime)); + self.db_mutex_lock_nanos + .with_label_values(&[cf_name]) + .inc_by(perf_context.metric(PerfMetric::DbMutexLockNanos)); + self.db_condition_wait_nanos + .with_label_values(&[cf_name]) + 
.inc_by(perf_context.metric(PerfMetric::DbConditionWaitNanos)); + self.merge_operator_nanos + .with_label_values(&[cf_name]) + .inc_by(perf_context.metric(PerfMetric::MergeOperatorTimeNanos)); + self.read_index_block_nanos + .with_label_values(&[cf_name]) + .inc_by(perf_context.metric(PerfMetric::ReadIndexBlockNanos)); + self.read_filter_block_nanos + .with_label_values(&[cf_name]) + .inc_by(perf_context.metric(PerfMetric::ReadFilterBlockNanos)); + self.new_table_block_iter_nanos + .with_label_values(&[cf_name]) + .inc_by(perf_context.metric(PerfMetric::NewTableBlockIterNanos)); + self.block_seek_nanos + .with_label_values(&[cf_name]) + .inc_by(perf_context.metric(PerfMetric::BlockSeekNanos)); + self.find_table_nanos + .with_label_values(&[cf_name]) + .inc_by(perf_context.metric(PerfMetric::FindTableNanos)); + self.bloom_memtable_hit_count + .with_label_values(&[cf_name]) + .inc_by(perf_context.metric(PerfMetric::BloomMemtableHitCount)); + self.bloom_memtable_miss_count + .with_label_values(&[cf_name]) + .inc_by(perf_context.metric(PerfMetric::BloomMemtableMissCount)); + self.bloom_sst_hit_count + .with_label_values(&[cf_name]) + .inc_by(perf_context.metric(PerfMetric::BloomSstHitCount)); + self.bloom_sst_miss_count + .with_label_values(&[cf_name]) + .inc_by(perf_context.metric(PerfMetric::BloomSstMissCount)); + self.key_lock_wait_time + .with_label_values(&[cf_name]) + .inc_by(perf_context.metric(PerfMetric::KeyLockWaitTime)); + self.key_lock_wait_count + .with_label_values(&[cf_name]) + .inc_by(perf_context.metric(PerfMetric::KeyLockWaitCount)); + self.internal_delete_skipped_count + .with_label_values(&[cf_name]) + .inc_by(perf_context.metric(PerfMetric::InternalDeleteSkippedCount)); + self.internal_skipped_count + .with_label_values(&[cf_name]) + .inc_by(perf_context.metric(PerfMetric::InternalKeySkippedCount)); + }); + } +} + +#[derive(Debug)] +pub struct WritePerfContextMetrics { + pub write_wal_nanos: IntCounterVec, + pub write_memtable_nanos: IntCounterVec, 
+ pub write_delay_nanos: IntCounterVec, + pub write_pre_and_post_process_nanos: IntCounterVec, + pub write_db_mutex_lock_nanos: IntCounterVec, + pub write_db_condition_wait_nanos: IntCounterVec, + pub write_key_lock_wait_nanos: IntCounterVec, + pub write_key_lock_wait_count: IntCounterVec, +} + +impl WritePerfContextMetrics { + pub(crate) fn new(registry: &Registry) -> Self { + WritePerfContextMetrics { + write_wal_nanos: register_int_counter_vec_with_registry!( + "write_wal_nanos", + "Total nanos spent on writing to WAL", + &["cf_name"], + registry, + ) + .unwrap(), + write_memtable_nanos: register_int_counter_vec_with_registry!( + "write_memtable_nanos", + "Total nanos spent on writing to memtable", + &["cf_name"], + registry, + ) + .unwrap(), + write_delay_nanos: register_int_counter_vec_with_registry!( + "write_delay_nanos", + "Total nanos spent on delaying or throttling write", + &["cf_name"], + registry, + ) + .unwrap(), + write_pre_and_post_process_nanos: register_int_counter_vec_with_registry!( + "write_pre_and_post_process_nanos", + "Total nanos spent on writing a record, excluding the above four things", + &["cf_name"], + registry, + ) + .unwrap(), + write_db_mutex_lock_nanos: register_int_counter_vec_with_registry!( + "write_db_mutex_lock_nanos", + "Time spent on acquiring db mutex", + &["cf_name"], + registry, + ) + .unwrap(), + write_db_condition_wait_nanos: register_int_counter_vec_with_registry!( + "write_db_condition_wait_nanos", + "Time spent waiting with a condition variable created with DB Mutex.", + &["cf_name"], + registry, + ) + .unwrap(), + write_key_lock_wait_nanos: register_int_counter_vec_with_registry!( + "write_key_lock_wait_time", + "Time spent waiting on key locks in transaction lock manager", + &["cf_name"], + registry, + ) + .unwrap(), + write_key_lock_wait_count: register_int_counter_vec_with_registry!( + "write_key_lock_wait_count", + "Number of times acquiring a lock was blocked by another transaction", + &["cf_name"], + registry, 
+ ) + .unwrap(), + } + } + pub fn report_metrics(&self, db_name: &str) { + PER_THREAD_ROCKS_PERF_CONTEXT.with(|perf_context_cell| { + set_perf_stats(PerfStatsLevel::Disable); + let perf_context = perf_context_cell.borrow(); + self.write_wal_nanos + .with_label_values(&[db_name]) + .inc_by(perf_context.metric(PerfMetric::WriteWalTime)); + self.write_memtable_nanos + .with_label_values(&[db_name]) + .inc_by(perf_context.metric(PerfMetric::WriteMemtableTime)); + self.write_delay_nanos + .with_label_values(&[db_name]) + .inc_by(perf_context.metric(PerfMetric::WriteDelayTime)); + self.write_pre_and_post_process_nanos + .with_label_values(&[db_name]) + .inc_by(perf_context.metric(PerfMetric::WritePreAndPostProcessTime)); + self.write_db_mutex_lock_nanos + .with_label_values(&[db_name]) + .inc_by(perf_context.metric(PerfMetric::DbMutexLockNanos)); + self.write_db_condition_wait_nanos + .with_label_values(&[db_name]) + .inc_by(perf_context.metric(PerfMetric::DbConditionWaitNanos)); + self.write_key_lock_wait_nanos + .with_label_values(&[db_name]) + .inc_by(perf_context.metric(PerfMetric::KeyLockWaitTime)); + self.write_key_lock_wait_count + .with_label_values(&[db_name]) + .inc_by(perf_context.metric(PerfMetric::KeyLockWaitCount)); + }); + } +} + +#[derive(Debug)] +pub struct DBMetrics { + pub op_metrics: OperationMetrics, + pub cf_metrics: ColumnFamilyMetrics, + pub read_perf_ctx_metrics: ReadPerfContextMetrics, + pub write_perf_ctx_metrics: WritePerfContextMetrics, +} + +static ONCE: OnceCell> = OnceCell::new(); + +impl DBMetrics { + fn new(registry: &Registry) -> Self { + DBMetrics { + op_metrics: OperationMetrics::new(registry), + cf_metrics: ColumnFamilyMetrics::new(registry), + read_perf_ctx_metrics: ReadPerfContextMetrics::new(registry), + write_perf_ctx_metrics: WritePerfContextMetrics::new(registry), + } + } + pub fn init(registry: &Registry) -> &'static Arc { + // Initialize this before creating any instance of DBMap + // TODO: Remove static initialization 
because this basically means we can + // only ever initialize db metrics once with a registry whereas + // in the code we might want to initialize it with different + // registries. The problem is underlying metrics cannot be re-initialized + // or prometheus complains. We essentially need to pass in DBMetrics + // everywhere we create DBMap as the right fix + let _ = ONCE + .set(Arc::new(DBMetrics::new(registry))) + // this happens many times during tests + .tap_err(|_| warn!("DBMetrics registry overwritten")); + ONCE.get().unwrap() + } + pub fn get() -> &'static Arc { + ONCE.get() + .unwrap_or_else(|| DBMetrics::init(prometheus::default_registry())) + } +} diff --git a/moveos/raw-store/src/rocks/errors.rs b/moveos/raw-store/src/rocks/errors.rs new file mode 100644 index 000000000..393715338 --- /dev/null +++ b/moveos/raw-store/src/rocks/errors.rs @@ -0,0 +1,126 @@ +// Copyright (c) RoochNetwork +// SPDX-License-Identifier: Apache-2.0 + +// Copyright (c) Mysten Labs, Inc. +// SPDX-License-Identifier: Apache-2.0 + +use bincode::ErrorKind as BincodeErrorKind; + +use rocksdb::Error as RocksError; +use serde::{Deserialize, Serialize}; +use std::{fmt, fmt::Display}; +use thiserror::Error; + +#[non_exhaustive] +#[derive(Error, Debug, Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Ord, PartialOrd)] +pub enum RawStoreError { + #[error("rocksdb error: {0}")] + RocksDBError(String), + #[error("(de)serialization error: {0}")] + SerializationError(String), + #[error("the column family {0} was not registered with the database")] + UnregisteredColumn(String), + #[error("a batch operation can't operate across databases")] + CrossDBBatch, + #[error("Metric reporting thread failed with error")] + MetricsReporting, + #[error("Transaction should be retried")] + RetryableTransactionError, +} + +#[derive(Serialize, Deserialize, Clone, Eq, PartialEq, Hash, Debug, Error)] +pub(crate) struct RocksErrorDef { + message: String, +} + +impl From for RocksErrorDef { + fn from(err: 
RocksError) -> Self { + RocksErrorDef { + message: err.as_ref().to_string(), + } + } +} + +impl From for RawStoreError { + fn from(err: RocksError) -> Self { + RawStoreError::RocksDBError(format!("{err}")) + } +} + +impl Display for RocksErrorDef { + fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { + self.message.fmt(formatter) + } +} + +#[derive(Serialize, Deserialize, Clone, Hash, Eq, PartialEq, Debug, Error)] +pub(crate) enum BincodeErrorDef { + Io(String), + InvalidUtf8Encoding(String), + InvalidBoolEncoding(u8), + InvalidCharEncoding, + InvalidTagEncoding(usize), + DeserializeAnyNotSupported, + SizeLimit, + SequenceMustHaveLength, + Custom(String), +} + +impl fmt::Display for BincodeErrorDef { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + match *self { + BincodeErrorDef::Io(ref ioerr) => write!(fmt, "io error: {ioerr}"), + BincodeErrorDef::InvalidUtf8Encoding(ref e) => { + write!(fmt, "{e}") + } + BincodeErrorDef::InvalidBoolEncoding(b) => { + write!(fmt, "expected 0 or 1, found {b}") + } + BincodeErrorDef::InvalidCharEncoding => write!(fmt, "{self:?}"), + BincodeErrorDef::InvalidTagEncoding(tag) => { + write!(fmt, "found {tag}") + } + BincodeErrorDef::SequenceMustHaveLength => write!(fmt, "{self:?}"), + BincodeErrorDef::SizeLimit => write!(fmt, "{self:?}"), + BincodeErrorDef::DeserializeAnyNotSupported => write!( + fmt, + "Bincode does not support the serde::Deserializer::deserialize_any method" + ), + BincodeErrorDef::Custom(ref s) => s.fmt(fmt), + } + } +} + +impl From for BincodeErrorDef { + fn from(err: bincode::Error) -> Self { + match err.as_ref() { + BincodeErrorKind::Io(ioerr) => BincodeErrorDef::Io(ioerr.to_string()), + BincodeErrorKind::InvalidUtf8Encoding(utf8err) => { + BincodeErrorDef::InvalidUtf8Encoding(utf8err.to_string()) + } + BincodeErrorKind::InvalidBoolEncoding(byte) => { + BincodeErrorDef::InvalidBoolEncoding(*byte) + } + BincodeErrorKind::InvalidCharEncoding => 
BincodeErrorDef::InvalidCharEncoding, + BincodeErrorKind::InvalidTagEncoding(tag) => BincodeErrorDef::InvalidTagEncoding(*tag), + BincodeErrorKind::DeserializeAnyNotSupported => { + BincodeErrorDef::DeserializeAnyNotSupported + } + BincodeErrorKind::SizeLimit => BincodeErrorDef::SizeLimit, + BincodeErrorKind::SequenceMustHaveLength => BincodeErrorDef::SequenceMustHaveLength, + BincodeErrorKind::Custom(str) => BincodeErrorDef::Custom(str.to_owned()), + } + } +} + +impl From for RawStoreError { + fn from(err: bcs::Error) -> Self { + RawStoreError::SerializationError(format!("{err}")) + } +} + +impl From for RawStoreError { + fn from(err: bincode::Error) -> Self { + RawStoreError::SerializationError(format!("{err}")) + } +} diff --git a/moveos/raw-store/src/rocks/iter.rs b/moveos/raw-store/src/rocks/iter.rs new file mode 100644 index 000000000..25114e67c --- /dev/null +++ b/moveos/raw-store/src/rocks/iter.rs @@ -0,0 +1,171 @@ +// Copyright (c) RoochNetwork +// SPDX-License-Identifier: Apache-2.0 + +// Copyright (c) Mysten Labs, Inc. +// SPDX-License-Identifier: Apache-2.0 +use std::marker::PhantomData; +use std::sync::Arc; + +use bincode::Options; +use prometheus::{Histogram, HistogramTimer}; +use rocksdb::Direction; + +use super::{be_fix_int_ser, errors::RawStoreError, RocksDBRawIter}; +use crate::metrics::DBMetrics; +use crate::metrics::RocksDBPerfContext; +use serde::{de::DeserializeOwned, Serialize}; + +/// An iterator over all key-value pairs in a data map. 
+pub struct Iter<'a, K, V> { + cf_name: String, + db_iter: RocksDBRawIter<'a>, + // *const here is an equivalent to `impl !Send for Iter` (which is not a stable feature at the moment) + _phantom: PhantomData<*const (K, V)>, + direction: Direction, + is_initialized: bool, + _timer: Option, + _perf_ctx: Option, + bytes_scanned: Option, + keys_scanned: Option, + db_metrics: Option>, + bytes_scanned_counter: usize, + keys_returned_counter: usize, +} + +impl<'a, K: DeserializeOwned, V: DeserializeOwned> Iter<'a, K, V> { + pub(super) fn new( + cf_name: String, + db_iter: RocksDBRawIter<'a>, + _timer: Option, + _perf_ctx: Option, + bytes_scanned: Option, + keys_scanned: Option, + db_metrics: Option>, + ) -> Self { + Self { + cf_name, + db_iter, + _phantom: PhantomData, + direction: Direction::Forward, + is_initialized: false, + _timer, + _perf_ctx, + bytes_scanned, + keys_scanned, + db_metrics, + bytes_scanned_counter: 0, + keys_returned_counter: 0, + } + } +} + +impl<'a, K: DeserializeOwned, V: DeserializeOwned> Iterator for Iter<'a, K, V> { + type Item = (K, V); + + fn next(&mut self) -> Option { + // implicitly set iterator to the first entry in the column family if it hasn't been initialized + // used for backward compatibility + if !self.is_initialized { + self.db_iter.seek_to_first(); + self.is_initialized = true; + } + if self.db_iter.valid() { + let config = bincode::DefaultOptions::new() + .with_big_endian() + .with_fixint_encoding(); + let raw_key = self + .db_iter + .key() + .expect("Valid iterator failed to get key"); + let raw_value = self + .db_iter + .value() + .expect("Valid iterator failed to get value"); + self.bytes_scanned_counter += raw_key.len() + raw_value.len(); + self.keys_returned_counter += 1; + let key = config.deserialize(raw_key).ok(); + let value = bcs::from_bytes(raw_value).ok(); + match self.direction { + Direction::Forward => self.db_iter.next(), + Direction::Reverse => self.db_iter.prev(), + } + key.and_then(|k| value.map(|v| (k, v))) + 
} else { + None + } + } +} + +impl<'a, K, V> Drop for Iter<'a, K, V> { + fn drop(&mut self) { + if let Some(bytes_scanned) = self.bytes_scanned.take() { + bytes_scanned.observe(self.bytes_scanned_counter as f64); + } + if let Some(keys_scanned) = self.keys_scanned.take() { + keys_scanned.observe(self.keys_returned_counter as f64); + } + if let Some(db_metrics) = self.db_metrics.take() { + db_metrics + .read_perf_ctx_metrics + .report_metrics(&self.cf_name); + } + } +} + +impl<'a, K: Serialize, V> Iter<'a, K, V> { + /// Skips all the elements that are smaller than the given key, + /// and either lands on the key or the first one greater than + /// the key. + pub fn skip_to(mut self, key: &K) -> Result { + self.is_initialized = true; + self.db_iter.seek(be_fix_int_ser(key)?); + Ok(self) + } + + /// Moves the iterator the element given or + /// the one prior to it if it does not exist. If there is + /// no element prior to it, it returns an empty iterator. + pub fn skip_prior_to(mut self, key: &K) -> Result { + self.is_initialized = true; + self.db_iter.seek_for_prev(be_fix_int_ser(key)?); + Ok(self) + } + + /// Seeks to the last key in the database (at this column family). + pub fn skip_to_last(mut self) -> Self { + self.is_initialized = true; + self.db_iter.seek_to_last(); + self + } + + /// Will make the direction of the iteration reverse and will + /// create a new `RevIter` to consume. Every call to `next` method + /// will give the next element from the end. + pub fn reverse(mut self) -> RevIter<'a, K, V> { + self.direction = Direction::Reverse; + RevIter::new(self) + } +} + +/// An iterator with a reverted direction to the original. The `RevIter` +/// is hosting an iteration which is consuming in the opposing direction. +/// It's not possible to do further manipulation (ex re-reverse) to the +/// iterator. 
+pub struct RevIter<'a, K, V> { + iter: Iter<'a, K, V>, +} + +impl<'a, K, V> RevIter<'a, K, V> { + fn new(iter: Iter<'a, K, V>) -> Self { + Self { iter } + } +} + +impl<'a, K: DeserializeOwned, V: DeserializeOwned> Iterator for RevIter<'a, K, V> { + type Item = (K, V); + + /// Will give the next item backwards + fn next(&mut self) -> Option { + self.iter.next() + } +} diff --git a/moveos/raw-store/src/rocks/keys.rs b/moveos/raw-store/src/rocks/keys.rs new file mode 100644 index 000000000..6469fb2f7 --- /dev/null +++ b/moveos/raw-store/src/rocks/keys.rs @@ -0,0 +1,71 @@ +// Copyright (c) RoochNetwork +// SPDX-License-Identifier: Apache-2.0 + +// Copyright (c) Mysten Labs, Inc. +// SPDX-License-Identifier: Apache-2.0 +use bincode::Options; + +use serde::{de::DeserializeOwned, Serialize}; +use std::marker::PhantomData; + +use super::{be_fix_int_ser, RawStoreError, RocksDBRawIter}; + +/// An iterator over the keys of a prefix. +pub struct Keys<'a, K> { + db_iter: RocksDBRawIter<'a>, + _phantom: PhantomData, +} + +impl<'a, K: DeserializeOwned> Keys<'a, K> { + pub(crate) fn new(db_iter: RocksDBRawIter<'a>) -> Self { + Self { + db_iter, + _phantom: PhantomData, + } + } +} + +impl<'a, K: DeserializeOwned> Iterator for Keys<'a, K> { + type Item = Result; + + fn next(&mut self) -> Option { + if self.db_iter.valid() { + let config = bincode::DefaultOptions::new() + .with_big_endian() + .with_fixint_encoding(); + let key = self.db_iter.key().and_then(|k| config.deserialize(k).ok()); + self.db_iter.next(); + key.map(Ok) + } else { + match self.db_iter.status() { + Ok(_) => None, + Err(err) => Some(Err(RawStoreError::RocksDBError(format!("{err}")))), + } + } + } +} + +impl<'a, K: Serialize> Keys<'a, K> { + /// Skips all the elements that are smaller than the given key, + /// and either lands on the key or the first one greater than + /// the key. 
+ pub fn skip_to(mut self, key: &K) -> Result { + self.db_iter.seek(be_fix_int_ser(key)?); + Ok(self) + } + + /// Moves the iterator the element given or + /// the one prior to it if it does not exist. If there is + /// no element prior to it, it returns an empty iterator. + pub fn skip_prior_to(mut self, key: &K) -> Result { + self.db_iter.seek_for_prev(be_fix_int_ser(key)?); + Ok(self) + } + + /// Seeks to the last key in the database (at this column family). + /// + pub fn skip_to_last(mut self) -> Self { + self.db_iter.seek_to_last(); + self + } +} diff --git a/moveos/raw-store/src/rocks/mod.rs b/moveos/raw-store/src/rocks/mod.rs new file mode 100644 index 000000000..154fa4c12 --- /dev/null +++ b/moveos/raw-store/src/rocks/mod.rs @@ -0,0 +1,2406 @@ +// Copyright (c) RoochNetwork +// SPDX-License-Identifier: Apache-2.0 + +// Copyright (c) Mysten Labs, Inc. +// SPDX-License-Identifier: Apache-2.0 +pub mod errors; +pub(crate) mod iter; +pub(crate) mod keys; +pub(crate) mod safe_iter; +pub mod util; +pub(crate) mod values; + +use crate::{ + metrics::{DBMetrics, RocksDBPerfContext, SamplingInterval}, + traits::{Map, TableSummary}, +}; +use bincode::Options; +use collectable::TryExtend; +use itertools::Itertools; +use rocksdb::{ + checkpoint::Checkpoint, BlockBasedOptions, BottommostLevelCompaction, Cache, CompactOptions, + LiveFile, OptimisticTransactionDB, SnapshotWithThreadMode, +}; +use rocksdb::{ + properties, AsColumnFamilyRef, CStrLike, ColumnFamilyDescriptor, DBWithThreadMode, Error, + ErrorKind, IteratorMode, MultiThreaded, OptimisticTransactionOptions, ReadOptions, Transaction, + WriteBatch, WriteBatchWithTransaction, WriteOptions, +}; +use serde::{de::DeserializeOwned, Serialize}; +use std::{ + borrow::Borrow, + collections::BTreeMap, + env, + marker::PhantomData, + path::{Path, PathBuf}, + sync::Arc, + time::Duration, +}; +use std::{collections::HashSet, ffi::CStr}; +use tap::TapFallible; +use tokio::sync::oneshot; +use tracing::{error, info, instrument, 
// Write buffer size per RocksDB instance can be set via the env var below.
// If the env var is not set, use the default value in MiB.
const ENV_VAR_DB_WRITE_BUFFER_SIZE: &str = "DB_WRITE_BUFFER_SIZE_MB";
const DEFAULT_DB_WRITE_BUFFER_SIZE: usize = 1024;

// Write ahead log size per RocksDB instance can be set via the env var below.
// If the env var is not set, use the default value in MiB.
const ENV_VAR_DB_WAL_SIZE: &str = "DB_WAL_SIZE_MB";
const DEFAULT_DB_WAL_SIZE: usize = 1024;

// Environment variables (and their fallback values) that control the behavior
// of write-throughput-optimized tables.
const ENV_VAR_L0_NUM_FILES_COMPACTION_TRIGGER: &str = "L0_NUM_FILES_COMPACTION_TRIGGER";
const DEFAULT_L0_NUM_FILES_COMPACTION_TRIGGER: usize = 6;
const ENV_VAR_MAX_WRITE_BUFFER_SIZE_MB: &str = "MAX_WRITE_BUFFER_SIZE_MB";
const DEFAULT_MAX_WRITE_BUFFER_SIZE_MB: usize = 256;
const ENV_VAR_MAX_WRITE_BUFFER_NUMBER: &str = "MAX_WRITE_BUFFER_NUMBER";
const DEFAULT_MAX_WRITE_BUFFER_NUMBER: usize = 6;
const ENV_VAR_TARGET_FILE_SIZE_BASE_MB: &str = "TARGET_FILE_SIZE_BASE_MB";
const DEFAULT_TARGET_FILE_SIZE_BASE_MB: usize = 128;

// Set to 1 to disable blob storage for transactions and effects.
const ENV_VAR_DISABLE_BLOB_STORAGE: &str = "DISABLE_BLOB_STORAGE";

// Name of the env var for the background-job limit.
// NOTE(review): no default is declared next to it — presumably chosen at the use site; confirm.
const ENV_VAR_MAX_BACKGROUND_JOBS: &str = "MAX_BACKGROUND_JOBS";

// TODO: remove this after Rust rocksdb has the TOTAL_BLOB_FILES_SIZE property built-in.
// From https://github.com/facebook/rocksdb/blob/bd80433c73691031ba7baa65c16c63a83aef201a/include/rocksdb/db.h#L1169
//
// SAFETY: `from_bytes_with_nul_unchecked` requires a byte slice that ends with
// a NUL and has no interior NUL bytes. The literal below has exactly one `\0`,
// as its final byte.
const ROCKSDB_PROPERTY_TOTAL_BLOB_FILES_SIZE: &CStr =
    unsafe { CStr::from_bytes_with_nul_unchecked("rocksdb.total-blob-file-size\0".as_bytes()) };
/// Re-runs an optimistic transaction while it reports
/// `RawStoreError::RetryableTransactionError`.
///
/// * `retry_transaction!(tx)` gives up once the number of retryable failures
///   exceeds 20 and evaluates to the last (failed) result.
/// * `retry_transaction!(tx, max_retries)` takes an `Option` of an integer
///   type; `None` retries without limit (see `retry_transaction_forever!`).
///
/// Any result other than the retryable error ends the loop immediately and
/// becomes the value of the macro. Between attempts the task sleeps for a
/// random delay in `[0, 50)` ms so that racing transactions get out of each
/// other's way.
///
/// The expansion contains `.await`, so it must be used inside an async
/// context. It names `rand`, `tokio` and `tracing` by plain paths, which are
/// resolved in the calling crate. Crate-local items are reached through
/// `$crate`, so the macro also works when invoked by path
/// (`raw_store::retry_transaction!(..)`) without importing anything.
#[macro_export]
macro_rules! retry_transaction {
    ($transaction:expr) => {
        $crate::retry_transaction!($transaction, Some(20))
    };

    (
        $transaction:expr,
        $max_retries:expr // should be an Option of an integer type, None for unlimited
        $(,)?

    ) => {{
        use rand::{
            distributions::{Distribution, Uniform},
            rngs::ThreadRng,
        };
        use tokio::time::{sleep, Duration};
        use tracing::{error, info};

        let mut retries = 0;
        let max_retries = $max_retries;
        loop {
            let status = $transaction;
            match status {
                Err($crate::rocks::RawStoreError::RetryableTransactionError) => {
                    retries += 1;
                    // Randomized delay to help racing transactions get out of each other's way.
                    let delay = {
                        let mut rng = ThreadRng::default();
                        Duration::from_millis(Uniform::new(0, 50).sample(&mut rng))
                    };
                    if let Some(max_retries) = max_retries {
                        if retries > max_retries {
                            error!(?max_retries, "max retries exceeded");
                            break status;
                        }
                    }
                    if retries > 10 {
                        // TODO: monitoring needed?
                        error!(?delay, ?retries, "excessive transaction retries...");
                    } else {
                        info!(
                            ?delay,
                            ?retries,
                            "transaction write conflict detected, sleeping"
                        );
                    }
                    sleep(delay).await;
                }
                _ => break status,
            }
        }
    }};
}
delegate_call { + ($self:ident.$method:ident($($args:ident),*)) => { + match $self { + Self::DBWithThreadMode(d) => d.underlying.$method($($args),*), + Self::OptimisticTransactionDB(d) => d.underlying.$method($($args),*), + } + } +} + +impl Drop for RocksDB { + fn drop(&mut self) { + delegate_call!(self.cancel_all_background_work(/* wait */ true)) + } +} + +impl RocksDB { + pub fn get>(&self, key: K) -> Result>, rocksdb::Error> { + delegate_call!(self.get(key)) + } + + pub fn multi_get_cf<'a, 'b: 'a, K, I, W>( + &'a self, + keys: I, + readopts: &ReadOptions, + ) -> Vec>, rocksdb::Error>> + where + K: AsRef<[u8]>, + I: IntoIterator, + W: 'b + AsColumnFamilyRef, + { + delegate_call!(self.multi_get_cf_opt(keys, readopts)) + } + + pub fn property_int_value_cf( + &self, + cf: &impl AsColumnFamilyRef, + name: impl CStrLike, + ) -> Result, rocksdb::Error> { + delegate_call!(self.property_int_value_cf(cf, name)) + } + + pub fn get_pinned_cf>( + &self, + cf: &impl AsColumnFamilyRef, + key: K, + readopts: &ReadOptions, + ) -> Result>, rocksdb::Error> { + delegate_call!(self.get_pinned_cf_opt(cf, key, readopts)) + } + + pub fn cf_handle(&self, name: &str) -> Option>> { + delegate_call!(self.cf_handle(name)) + } + + pub fn create_cf>( + &self, + name: N, + opts: &rocksdb::Options, + ) -> Result<(), rocksdb::Error> { + delegate_call!(self.create_cf(name, opts)) + } + + pub fn drop_cf(&self, name: &str) -> Result<(), rocksdb::Error> { + delegate_call!(self.drop_cf(name)) + } + + pub fn delete_cf>( + &self, + cf: &impl AsColumnFamilyRef, + key: K, + writeopts: &WriteOptions, + ) -> Result<(), rocksdb::Error> { + fail_point!("delete-cf-before"); + let ret = delegate_call!(self.delete_cf_opt(cf, key, writeopts)); + fail_point!("delete-cf-after"); + #[allow(clippy::let_and_return)] + ret + } + + pub fn path(&self) -> &Path { + delegate_call!(self.path()) + } + + pub fn put_cf( + &self, + cf: &impl AsColumnFamilyRef, + key: K, + value: V, + writeopts: &WriteOptions, + ) -> Result<(), 
rocksdb::Error> + where + K: AsRef<[u8]>, + V: AsRef<[u8]>, + { + fail_point!("put-cf-before"); + let ret = delegate_call!(self.put_cf_opt(cf, key, value, writeopts)); + fail_point!("put-cf-after"); + #[allow(clippy::let_and_return)] + ret + } + + pub fn key_may_exist_cf>( + &self, + cf: &impl AsColumnFamilyRef, + key: K, + readopts: &ReadOptions, + ) -> bool { + delegate_call!(self.key_may_exist_cf_opt(cf, key, readopts)) + } + + pub fn try_catch_up_with_primary(&self) -> Result<(), rocksdb::Error> { + delegate_call!(self.try_catch_up_with_primary()) + } + + pub fn write(&self, batch: RocksDBBatch) -> Result<(), RawStoreError> { + fail_point!("batch-write-before"); + let ret = match (self, batch) { + (RocksDB::DBWithThreadMode(db), RocksDBBatch::Regular(batch)) => { + db.underlying.write(batch)?; + Ok(()) + } + (RocksDB::OptimisticTransactionDB(db), RocksDBBatch::Transactional(batch)) => { + db.underlying.write(batch)?; + Ok(()) + } + _ => Err(RawStoreError::RocksDBError( + "using invalid batch type for the database".to_string(), + )), + }; + fail_point!("batch-write-after"); + #[allow(clippy::let_and_return)] + ret + } + + pub fn transaction_without_snapshot( + &self, + ) -> Result, RawStoreError> { + match self { + Self::OptimisticTransactionDB(db) => Ok(db.underlying.transaction()), + Self::DBWithThreadMode(_) => Err(RawStoreError::RocksDBError( + "operation not supported".to_string(), + )), + } + } + + pub fn transaction( + &self, + ) -> Result, RawStoreError> { + match self { + Self::OptimisticTransactionDB(db) => { + let mut tx_opts = OptimisticTransactionOptions::new(); + tx_opts.set_snapshot(true); + + Ok(db + .underlying + .transaction_opt(&WriteOptions::default(), &tx_opts)) + } + Self::DBWithThreadMode(_) => Err(RawStoreError::RocksDBError( + "operation not supported".to_string(), + )), + } + } + + pub fn raw_iterator_cf<'a: 'b, 'b>( + &'a self, + cf_handle: &impl AsColumnFamilyRef, + readopts: ReadOptions, + ) -> RocksDBRawIter<'b> { + match self { + 
Self::DBWithThreadMode(db) => { + RocksDBRawIter::DB(db.underlying.raw_iterator_cf_opt(cf_handle, readopts)) + } + Self::OptimisticTransactionDB(db) => RocksDBRawIter::OptimisticTransactionDB( + db.underlying.raw_iterator_cf_opt(cf_handle, readopts), + ), + } + } + + pub fn iterator_cf<'a: 'b, 'b>( + &'a self, + cf_handle: &impl AsColumnFamilyRef, + readopts: ReadOptions, + mode: IteratorMode<'_>, + ) -> RocksDBIter<'b> { + match self { + Self::DBWithThreadMode(db) => { + RocksDBIter::DB(db.underlying.iterator_cf_opt(cf_handle, readopts, mode)) + } + Self::OptimisticTransactionDB(db) => RocksDBIter::OptimisticTransactionDB( + db.underlying.iterator_cf_opt(cf_handle, readopts, mode), + ), + } + } + + pub fn compact_range_cf>( + &self, + cf: &impl AsColumnFamilyRef, + start: Option, + end: Option, + ) { + delegate_call!(self.compact_range_cf(cf, start, end)) + } + + pub fn compact_range_to_bottom>( + &self, + cf: &impl AsColumnFamilyRef, + start: Option, + end: Option, + ) { + let opt = &mut CompactOptions::default(); + opt.set_bottommost_level_compaction(BottommostLevelCompaction::ForceOptimized); + delegate_call!(self.compact_range_cf_opt(cf, start, end, opt)) + } + + pub fn flush(&self) -> Result<(), RawStoreError> { + delegate_call!(self.flush()).map_err(|e| RawStoreError::RocksDBError(e.into_string())) + } + + pub fn snapshot(&self) -> RocksDBSnapshot<'_> { + match self { + Self::DBWithThreadMode(d) => RocksDBSnapshot::DBWithThreadMode(d.underlying.snapshot()), + Self::OptimisticTransactionDB(d) => { + RocksDBSnapshot::OptimisticTransactionDB(d.underlying.snapshot()) + } + } + } + + pub fn checkpoint(&self, path: &Path) -> Result<(), RawStoreError> { + let checkpoint = match self { + Self::DBWithThreadMode(d) => Checkpoint::new(&d.underlying)?, + Self::OptimisticTransactionDB(d) => Checkpoint::new(&d.underlying)?, + }; + checkpoint + .create_checkpoint(path) + .map_err(|e| RawStoreError::RocksDBError(e.to_string()))?; + Ok(()) + } + + pub fn flush_cf(&self, cf: 
&impl AsColumnFamilyRef) -> Result<(), rocksdb::Error> { + delegate_call!(self.flush_cf(cf)) + } + + pub fn set_options_cf( + &self, + cf: &impl AsColumnFamilyRef, + opts: &[(&str, &str)], + ) -> Result<(), rocksdb::Error> { + delegate_call!(self.set_options_cf(cf, opts)) + } + + pub fn get_sampling_interval(&self) -> SamplingInterval { + match self { + Self::DBWithThreadMode(d) => d.metric_conf.read_sample_interval.new_from_self(), + Self::OptimisticTransactionDB(d) => d.metric_conf.read_sample_interval.new_from_self(), + } + } + + pub fn multiget_sampling_interval(&self) -> SamplingInterval { + match self { + Self::DBWithThreadMode(d) => d.metric_conf.read_sample_interval.new_from_self(), + Self::OptimisticTransactionDB(d) => d.metric_conf.read_sample_interval.new_from_self(), + } + } + + pub fn write_sampling_interval(&self) -> SamplingInterval { + match self { + Self::DBWithThreadMode(d) => d.metric_conf.write_sample_interval.new_from_self(), + Self::OptimisticTransactionDB(d) => d.metric_conf.write_sample_interval.new_from_self(), + } + } + + pub fn iter_sampling_interval(&self) -> SamplingInterval { + match self { + Self::DBWithThreadMode(d) => d.metric_conf.iter_sample_interval.new_from_self(), + Self::OptimisticTransactionDB(d) => d.metric_conf.iter_sample_interval.new_from_self(), + } + } + + pub fn db_name(&self) -> String { + match self { + Self::DBWithThreadMode(d) => d + .metric_conf + .db_name_override + .clone() + .unwrap_or_else(|| self.default_db_name()), + Self::OptimisticTransactionDB(d) => d + .metric_conf + .db_name_override + .clone() + .unwrap_or_else(|| self.default_db_name()), + } + } + + pub fn live_files(&self) -> Result, Error> { + delegate_call!(self.live_files()) + } + + fn default_db_name(&self) -> String { + self.path() + .file_name() + .and_then(|f| f.to_str()) + .unwrap_or("unknown") + .to_string() + } +} + +pub enum RocksDBSnapshot<'a> { + DBWithThreadMode(rocksdb::Snapshot<'a>), + 
OptimisticTransactionDB(SnapshotWithThreadMode<'a, OptimisticTransactionDB>), +} + +impl<'a> RocksDBSnapshot<'a> { + pub fn multi_get_cf_opt<'b: 'a, K, I, W>( + &'a self, + keys: I, + readopts: ReadOptions, + ) -> Vec>, rocksdb::Error>> + where + K: AsRef<[u8]>, + I: IntoIterator, + W: 'b + AsColumnFamilyRef, + { + match self { + Self::DBWithThreadMode(s) => s.multi_get_cf_opt(keys, readopts), + Self::OptimisticTransactionDB(s) => s.multi_get_cf_opt(keys, readopts), + } + } + pub fn multi_get_cf<'b: 'a, K, I, W>( + &'a self, + keys: I, + ) -> Vec>, rocksdb::Error>> + where + K: AsRef<[u8]>, + I: IntoIterator, + W: 'b + AsColumnFamilyRef, + { + match self { + Self::DBWithThreadMode(s) => s.multi_get_cf(keys), + Self::OptimisticTransactionDB(s) => s.multi_get_cf(keys), + } + } +} + +pub enum RocksDBBatch { + Regular(rocksdb::WriteBatch), + Transactional(rocksdb::WriteBatchWithTransaction), +} + +macro_rules! delegate_batch_call { + ($self:ident.$method:ident($($args:ident),*)) => { + match $self { + Self::Regular(b) => b.$method($($args),*), + Self::Transactional(b) => b.$method($($args),*), + } + } +} + +impl RocksDBBatch { + fn size_in_bytes(&self) -> usize { + delegate_batch_call!(self.size_in_bytes()) + } + + pub fn delete_cf>(&mut self, cf: &impl AsColumnFamilyRef, key: K) { + delegate_batch_call!(self.delete_cf(cf, key)) + } + + pub fn put_cf(&mut self, cf: &impl AsColumnFamilyRef, key: K, value: V) + where + K: AsRef<[u8]>, + V: AsRef<[u8]>, + { + delegate_batch_call!(self.put_cf(cf, key, value)) + } + + pub fn merge_cf(&mut self, cf: &impl AsColumnFamilyRef, key: K, value: V) + where + K: AsRef<[u8]>, + V: AsRef<[u8]>, + { + delegate_batch_call!(self.merge_cf(cf, key, value)) + } + + pub fn delete_range_cf>( + &mut self, + cf: &impl AsColumnFamilyRef, + from: K, + to: K, + ) -> Result<(), RawStoreError> { + match self { + Self::Regular(batch) => { + batch.delete_range_cf(cf, from, to); + Ok(()) + } + Self::Transactional(_) => Err(RawStoreError::RocksDBError( 
+ "operation not supported".to_string(), + )), + } + } +} + +#[derive(Debug, Default)] +pub struct MetricConf { + pub db_name_override: Option, + pub read_sample_interval: SamplingInterval, + pub write_sample_interval: SamplingInterval, + pub iter_sample_interval: SamplingInterval, +} + +impl MetricConf { + pub fn with_db_name(db_name: &str) -> Self { + Self { + db_name_override: Some(db_name.to_string()), + read_sample_interval: SamplingInterval::default(), + write_sample_interval: SamplingInterval::default(), + iter_sample_interval: SamplingInterval::default(), + } + } + pub fn with_sampling(read_interval: SamplingInterval) -> Self { + Self { + db_name_override: None, + read_sample_interval: read_interval, + write_sample_interval: SamplingInterval::default(), + iter_sample_interval: SamplingInterval::default(), + } + } +} +const CF_METRICS_REPORT_PERIOD_MILLIS: u64 = 1000; +const METRICS_ERROR: i64 = -1; + +/// An interface to a rocksDB database, keyed by a columnfamily +#[derive(Clone, Debug)] +pub struct DBMap { + pub rocksdb: Arc, + _phantom: PhantomData V>, + // the rocksDB ColumnFamily under which the map is stored + cf: String, + pub opts: ReadWriteOptions, + db_metrics: Arc, + get_sample_interval: SamplingInterval, + multiget_sample_interval: SamplingInterval, + write_sample_interval: SamplingInterval, + iter_sample_interval: SamplingInterval, + _metrics_task_cancel_handle: Arc>, +} + +unsafe impl Send for DBMap {} + +impl DBMap { + pub(crate) fn new(db: Arc, opts: &ReadWriteOptions, opt_cf: &str) -> Self { + let db_cloned = db.clone(); + let db_metrics = DBMetrics::get(); + let db_metrics_cloned = db_metrics.clone(); + let cf = opt_cf.to_string(); + let (sender, mut recv) = tokio::sync::oneshot::channel(); + tokio::task::spawn(async move { + let mut interval = + tokio::time::interval(Duration::from_millis(CF_METRICS_REPORT_PERIOD_MILLIS)); + loop { + tokio::select! 
{ + _ = interval.tick() => { + let db = db_cloned.clone(); + let cf = cf.clone(); + let db_metrics = db_metrics.clone(); + if let Err(e) = tokio::task::spawn_blocking(move || { + Self::report_metrics(&db, &cf, &db_metrics); + }).await { + error!("Failed to log metrics with error: {}", e); + } + } + _ = &mut recv => break, + } + } + info!("Returning the cf metric logging task for DBMap: {}", &cf); + }); + DBMap { + rocksdb: db.clone(), + opts: opts.clone(), + _phantom: PhantomData, + cf: opt_cf.to_string(), + db_metrics: db_metrics_cloned, + _metrics_task_cancel_handle: Arc::new(sender), + get_sample_interval: db.get_sampling_interval(), + multiget_sample_interval: db.multiget_sampling_interval(), + write_sample_interval: db.write_sampling_interval(), + iter_sample_interval: db.iter_sampling_interval(), + } + } + + /// Opens a database from a path, with specific options and an optional column family. + /// + /// This database is used to perform operations on single column family, and parametrizes + /// all operations in `DBBatch` when writing across column families. + #[instrument(level="debug", skip_all, fields(path = ?path.as_ref(), cf = ?opt_cf), err)] + pub fn open>( + path: P, + metric_conf: MetricConf, + db_options: Option, + opt_cf: Option<&str>, + rw_options: &ReadWriteOptions, + ) -> Result { + let cf_key = opt_cf.unwrap_or(rocksdb::DEFAULT_COLUMN_FAMILY_NAME); + let cfs = vec![cf_key]; + let rocksdb = open_cf(path, db_options, metric_conf, &cfs)?; + Ok(DBMap::new(rocksdb, rw_options, cf_key)) + } + + /// Reopens an open database as a typed map operating under a specific column family. + /// if no column family is passed, the default column family is used. 
+ /// + /// ``` + /// use raw_store::rocks::*; + /// use raw_store::metrics::DBMetrics; + /// use tempfile::tempdir; + /// use prometheus::Registry; + /// use std::sync::Arc; + /// use core::fmt::Error; + /// #[tokio::main] + /// async fn main() -> Result<(), Error> { + /// /// Open the DB with all needed column families first. + /// let rocks = open_cf(tempdir().unwrap(), None, MetricConf::default(), &["First_CF", "Second_CF"]).unwrap(); + /// /// Attach the column families to specific maps. + /// let db_cf_1 = DBMap::::reopen(&rocks, Some("First_CF"), &ReadWriteOptions::default()).expect("Failed to open storage"); + /// let db_cf_2 = DBMap::::reopen(&rocks, Some("Second_CF"), &ReadWriteOptions::default()).expect("Failed to open storage"); + /// Ok(()) + /// } + /// ``` + #[instrument(level = "debug", skip(db), err)] + pub fn reopen( + db: &Arc, + opt_cf: Option<&str>, + rw_options: &ReadWriteOptions, + ) -> Result { + let cf_key = opt_cf + .unwrap_or(rocksdb::DEFAULT_COLUMN_FAMILY_NAME) + .to_owned(); + + db.cf_handle(&cf_key) + .ok_or_else(|| RawStoreError::UnregisteredColumn(cf_key.clone()))?; + + Ok(DBMap::new(db.clone(), rw_options, &cf_key)) + } + + pub fn batch(&self) -> DBBatch { + let batch = match *self.rocksdb { + RocksDB::DBWithThreadMode(_) => RocksDBBatch::Regular(WriteBatch::default()), + RocksDB::OptimisticTransactionDB(_) => { + RocksDBBatch::Transactional(WriteBatchWithTransaction::::default()) + } + }; + DBBatch::new( + &self.rocksdb, + batch, + &self.db_metrics, + &self.write_sample_interval, + ) + } + + pub fn compact_range(&self, start: &J, end: &J) -> Result<(), RawStoreError> { + let from_buf = be_fix_int_ser(start.borrow())?; + let to_buf = be_fix_int_ser(end.borrow())?; + self.rocksdb + .compact_range_cf(&self.cf(), Some(from_buf), Some(to_buf)); + Ok(()) + } + + pub fn compact_range_to_bottom( + &self, + start: &J, + end: &J, + ) -> Result<(), RawStoreError> { + let from_buf = be_fix_int_ser(start.borrow())?; + let to_buf = 
be_fix_int_ser(end.borrow())?; + self.rocksdb + .compact_range_to_bottom(&self.cf(), Some(from_buf), Some(to_buf)); + Ok(()) + } + + pub fn cf(&self) -> Arc> { + self.rocksdb + .cf_handle(&self.cf) + .expect("Map-keying column family should have been checked at DB creation") + } + + pub fn iterator_cf(&self) -> RocksDBIter<'_> { + self.rocksdb + .iterator_cf(&self.cf(), self.opts.readopts(), IteratorMode::Start) + } + + pub fn flush(&self) -> Result<(), RawStoreError> { + self.rocksdb + .flush_cf(&self.cf()) + .map_err(|e| RawStoreError::RocksDBError(e.into_string())) + } + + pub fn set_options(&self, opts: &[(&str, &str)]) -> Result<(), rocksdb::Error> { + self.rocksdb.set_options_cf(&self.cf(), opts) + } + + fn get_int_property( + rocksdb: &RocksDB, + cf: &impl AsColumnFamilyRef, + property_name: &'static std::ffi::CStr, + ) -> Result { + match rocksdb.property_int_value_cf(cf, property_name) { + Ok(Some(value)) => Ok(value.try_into().unwrap()), + Ok(None) => Ok(0), + Err(e) => Err(RawStoreError::RocksDBError(e.into_string())), + } + } + + fn report_metrics(rocksdb: &Arc, cf_name: &str, db_metrics: &Arc) { + let cf = rocksdb.cf_handle(cf_name).expect("Failed to get cf"); + db_metrics + .cf_metrics + .rocksdb_total_sst_files_size + .with_label_values(&[cf_name]) + .set( + Self::get_int_property(rocksdb, &cf, properties::TOTAL_SST_FILES_SIZE) + .unwrap_or(METRICS_ERROR), + ); + db_metrics + .cf_metrics + .rocksdb_total_blob_files_size + .with_label_values(&[cf_name]) + .set( + Self::get_int_property(rocksdb, &cf, ROCKSDB_PROPERTY_TOTAL_BLOB_FILES_SIZE) + .unwrap_or(METRICS_ERROR), + ); + db_metrics + .cf_metrics + .rocksdb_size_all_mem_tables + .with_label_values(&[cf_name]) + .set( + Self::get_int_property(rocksdb, &cf, properties::SIZE_ALL_MEM_TABLES) + .unwrap_or(METRICS_ERROR), + ); + db_metrics + .cf_metrics + .rocksdb_num_snapshots + .with_label_values(&[cf_name]) + .set( + Self::get_int_property(rocksdb, &cf, properties::NUM_SNAPSHOTS) + 
.unwrap_or(METRICS_ERROR), + ); + db_metrics + .cf_metrics + .rocksdb_oldest_snapshot_time + .with_label_values(&[cf_name]) + .set( + Self::get_int_property(rocksdb, &cf, properties::OLDEST_SNAPSHOT_TIME) + .unwrap_or(METRICS_ERROR), + ); + db_metrics + .cf_metrics + .rocksdb_actual_delayed_write_rate + .with_label_values(&[cf_name]) + .set( + Self::get_int_property(rocksdb, &cf, properties::ACTUAL_DELAYED_WRITE_RATE) + .unwrap_or(METRICS_ERROR), + ); + db_metrics + .cf_metrics + .rocksdb_is_write_stopped + .with_label_values(&[cf_name]) + .set( + Self::get_int_property(rocksdb, &cf, properties::IS_WRITE_STOPPED) + .unwrap_or(METRICS_ERROR), + ); + db_metrics + .cf_metrics + .rocksdb_block_cache_capacity + .with_label_values(&[cf_name]) + .set( + Self::get_int_property(rocksdb, &cf, properties::BLOCK_CACHE_CAPACITY) + .unwrap_or(METRICS_ERROR), + ); + db_metrics + .cf_metrics + .rocksdb_block_cache_usage + .with_label_values(&[cf_name]) + .set( + Self::get_int_property(rocksdb, &cf, properties::BLOCK_CACHE_USAGE) + .unwrap_or(METRICS_ERROR), + ); + db_metrics + .cf_metrics + .rocksdb_block_cache_pinned_usage + .with_label_values(&[cf_name]) + .set( + Self::get_int_property(rocksdb, &cf, properties::BLOCK_CACHE_PINNED_USAGE) + .unwrap_or(METRICS_ERROR), + ); + db_metrics + .cf_metrics + .rocskdb_estimate_table_readers_mem + .with_label_values(&[cf_name]) + .set( + Self::get_int_property(rocksdb, &cf, properties::ESTIMATE_TABLE_READERS_MEM) + .unwrap_or(METRICS_ERROR), + ); + db_metrics + .cf_metrics + .rocksdb_estimated_num_keys + .with_label_values(&[cf_name]) + .set( + Self::get_int_property(rocksdb, &cf, properties::ESTIMATE_NUM_KEYS) + .unwrap_or(METRICS_ERROR), + ); + db_metrics + .cf_metrics + .rocksdb_mem_table_flush_pending + .with_label_values(&[cf_name]) + .set( + Self::get_int_property(rocksdb, &cf, properties::MEM_TABLE_FLUSH_PENDING) + .unwrap_or(METRICS_ERROR), + ); + db_metrics + .cf_metrics + .rocskdb_compaction_pending + 
.with_label_values(&[cf_name]) + .set( + Self::get_int_property(rocksdb, &cf, properties::COMPACTION_PENDING) + .unwrap_or(METRICS_ERROR), + ); + db_metrics + .cf_metrics + .rocskdb_num_running_compactions + .with_label_values(&[cf_name]) + .set( + Self::get_int_property(rocksdb, &cf, properties::NUM_RUNNING_COMPACTIONS) + .unwrap_or(METRICS_ERROR), + ); + db_metrics + .cf_metrics + .rocksdb_num_running_flushes + .with_label_values(&[cf_name]) + .set( + Self::get_int_property(rocksdb, &cf, properties::NUM_RUNNING_FLUSHES) + .unwrap_or(METRICS_ERROR), + ); + db_metrics + .cf_metrics + .rocksdb_estimate_oldest_key_time + .with_label_values(&[cf_name]) + .set( + Self::get_int_property(rocksdb, &cf, properties::ESTIMATE_OLDEST_KEY_TIME) + .unwrap_or(METRICS_ERROR), + ); + db_metrics + .cf_metrics + .rocskdb_background_errors + .with_label_values(&[cf_name]) + .set( + Self::get_int_property(rocksdb, &cf, properties::BACKGROUND_ERRORS) + .unwrap_or(METRICS_ERROR), + ); + } + + pub fn transaction(&self) -> Result, RawStoreError> { + DBTransaction::new(&self.rocksdb) + } + + pub fn transaction_without_snapshot(&self) -> Result, RawStoreError> { + DBTransaction::new_without_snapshot(&self.rocksdb) + } + + pub fn checkpoint_db(&self, path: &Path) -> Result<(), RawStoreError> { + self.rocksdb.checkpoint(path) + } + + pub fn snapshot(&self) -> Result, RawStoreError> { + Ok(self.rocksdb.snapshot()) + } + + pub fn table_summary(&self) -> eyre::Result { + let mut num_keys = 0; + let mut key_bytes_total = 0; + let mut value_bytes_total = 0; + let mut key_hist = hdrhistogram::Histogram::::new_with_max(100000, 2).unwrap(); + let mut value_hist = hdrhistogram::Histogram::::new_with_max(100000, 2).unwrap(); + let iter = self.iterator_cf().map(Result::unwrap); + for (key, value) in iter { + num_keys += 1; + key_bytes_total += key.len(); + value_bytes_total += value.len(); + key_hist.record(key.len() as u64)?; + value_hist.record(value.len() as u64)?; + } + Ok(TableSummary { + num_keys, 
+ key_bytes_total, + value_bytes_total, + key_hist, + value_hist, + }) + } +} + +/// Provides a mutable struct to form a collection of database write operations, and execute them. +/// +/// Batching write and delete operations is faster than performing them one by one and ensures their atomicity, +/// ie. they are all written or none is. +/// This is also true of operations across column families in the same database. +/// +/// Serializations / Deserialization, and naming of column families is performed by passing a DBMap +/// with each operation. +/// +/// ``` +/// use raw_store::rocks::*; +/// use tempfile::tempdir; +/// use raw_store::Map; +/// use raw_store::metrics::DBMetrics; +/// use prometheus::Registry; +/// use core::fmt::Error; +/// use std::sync::Arc; +/// +/// #[tokio::main] +/// async fn main() -> Result<(), Error> { +/// let rocks = open_cf(tempfile::tempdir().unwrap(), None, MetricConf::default(), &["First_CF", "Second_CF"]).unwrap(); +/// +/// let db_cf_1 = DBMap::reopen(&rocks, Some("First_CF"), &ReadWriteOptions::default()) +/// .expect("Failed to open storage"); +/// let keys_vals_1 = (1..100).map(|i| (i, i.to_string())); +/// +/// let db_cf_2 = DBMap::reopen(&rocks, Some("Second_CF"), &ReadWriteOptions::default()) +/// .expect("Failed to open storage"); +/// let keys_vals_2 = (1000..1100).map(|i| (i, i.to_string())); +/// +/// let mut batch = db_cf_1.batch(); +/// batch +/// .insert_batch(&db_cf_1, keys_vals_1.clone()) +/// .expect("Failed to batch insert") +/// .insert_batch(&db_cf_2, keys_vals_2.clone()) +/// .expect("Failed to batch insert"); +/// +/// let _ = batch.write().expect("Failed to execute batch"); +/// for (k, v) in keys_vals_1 { +/// let val = db_cf_1.get(&k).expect("Failed to get inserted key"); +/// assert_eq!(Some(v), val); +/// } +/// +/// for (k, v) in keys_vals_2 { +/// let val = db_cf_2.get(&k).expect("Failed to get inserted key"); +/// assert_eq!(Some(v), val); +/// } +/// Ok(()) +/// } +/// ``` +/// +pub struct DBBatch { 
+ rocksdb: Arc, + batch: RocksDBBatch, + db_metrics: Arc, + write_sample_interval: SamplingInterval, +} + +impl DBBatch { + /// Create a new batch associated with a DB reference. + /// + /// Use `open_cf` to get the DB reference or an existing open database. + pub fn new( + dbref: &Arc, + batch: RocksDBBatch, + db_metrics: &Arc, + write_sample_interval: &SamplingInterval, + ) -> Self { + DBBatch { + rocksdb: dbref.clone(), + batch, + db_metrics: db_metrics.clone(), + write_sample_interval: write_sample_interval.clone(), + } + } + + /// Consume the batch and write its operations to the database + #[instrument(level = "trace", skip_all, err)] + pub fn write(self) -> Result<(), RawStoreError> { + let report_metrics = if self.write_sample_interval.sample() { + let db_name = self.rocksdb.db_name(); + let timer = self + .db_metrics + .op_metrics + .rocksdb_batch_commit_latency_seconds + .with_label_values(&[&db_name]) + .start_timer(); + let size = self.batch.size_in_bytes(); + Some((db_name, size, timer, RocksDBPerfContext::default())) + } else { + None + }; + self.rocksdb.write(self.batch)?; + if let Some((db_name, batch_size, _timer, _perf_ctx)) = report_metrics { + self.db_metrics + .op_metrics + .rocksdb_batch_commit_bytes + .with_label_values(&[&db_name]) + .observe(batch_size as f64); + self.db_metrics + .write_perf_ctx_metrics + .report_metrics(&db_name); + } + Ok(()) + } +} + +// TODO: Remove this entire implementation once we switch to sally +impl DBBatch { + pub fn delete_batch, K: Serialize, V>( + &mut self, + db: &DBMap, + purged_vals: impl IntoIterator, + ) -> Result<(), RawStoreError> { + if !Arc::ptr_eq(&db.rocksdb, &self.rocksdb) { + return Err(RawStoreError::CrossDBBatch); + } + + purged_vals + .into_iter() + .try_for_each::<_, Result<_, RawStoreError>>(|k| { + let k_buf = be_fix_int_ser(k.borrow())?; + self.batch.delete_cf(&db.cf(), k_buf); + + Ok(()) + })?; + Ok(()) + } + + /// Deletes a range of keys between `from` (inclusive) and `to` 
(non-inclusive) + pub fn delete_range( + &mut self, + db: &DBMap, + from: &K, + to: &K, + ) -> Result<(), RawStoreError> { + if !Arc::ptr_eq(&db.rocksdb, &self.rocksdb) { + return Err(RawStoreError::CrossDBBatch); + } + + let from_buf = be_fix_int_ser(from)?; + let to_buf = be_fix_int_ser(to)?; + + self.batch.delete_range_cf(&db.cf(), from_buf, to_buf)?; + Ok(()) + } + + /// inserts a range of (key, value) pairs given as an iterator + pub fn insert_batch, K: Serialize, U: Borrow, V: Serialize>( + &mut self, + db: &DBMap, + new_vals: impl IntoIterator, + ) -> Result<&mut Self, RawStoreError> { + if !Arc::ptr_eq(&db.rocksdb, &self.rocksdb) { + return Err(RawStoreError::CrossDBBatch); + } + + new_vals + .into_iter() + .try_for_each::<_, Result<_, RawStoreError>>(|(k, v)| { + let k_buf = be_fix_int_ser(k.borrow())?; + let v_buf = bcs::to_bytes(v.borrow())?; + self.batch.put_cf(&db.cf(), k_buf, v_buf); + Ok(()) + })?; + Ok(self) + } + + /// merges a range of (key, value) pairs given as an iterator + pub fn merge_batch, K: Serialize, U: Borrow, V: Serialize>( + &mut self, + db: &DBMap, + new_vals: impl IntoIterator, + ) -> Result<&mut Self, RawStoreError> { + if !Arc::ptr_eq(&db.rocksdb, &self.rocksdb) { + return Err(RawStoreError::CrossDBBatch); + } + + new_vals + .into_iter() + .try_for_each::<_, Result<_, RawStoreError>>(|(k, v)| { + let k_buf = be_fix_int_ser(k.borrow())?; + let v_buf = bcs::to_bytes(v.borrow())?; + self.batch.merge_cf(&db.cf(), k_buf, v_buf); + Ok(()) + })?; + Ok(self) + } + + /// similar to `merge_batch` but allows merge with partial values + pub fn partial_merge_batch, K: Serialize, V: Serialize, B: AsRef<[u8]>>( + &mut self, + db: &DBMap, + new_vals: impl IntoIterator, + ) -> Result<&mut Self, RawStoreError> { + if !Arc::ptr_eq(&db.rocksdb, &self.rocksdb) { + return Err(RawStoreError::CrossDBBatch); + } + new_vals + .into_iter() + .try_for_each::<_, Result<_, RawStoreError>>(|(k, v)| { + let k_buf = be_fix_int_ser(k.borrow())?; + 
self.batch.merge_cf(&db.cf(), k_buf, v); + Ok(()) + })?; + Ok(self) + } +} + +pub struct DBTransaction<'a> { + rocksdb: Arc, + transaction: Transaction<'a, rocksdb::OptimisticTransactionDB>, +} + +impl<'a> DBTransaction<'a> { + pub fn new(db: &'a Arc) -> Result { + Ok(Self { + rocksdb: db.clone(), + transaction: db.transaction()?, + }) + } + + pub fn new_without_snapshot(db: &'a Arc) -> Result { + Ok(Self { + rocksdb: db.clone(), + transaction: db.transaction_without_snapshot()?, + }) + } + + pub fn insert_batch, K: Serialize, U: Borrow, V: Serialize>( + &mut self, + db: &DBMap, + new_vals: impl IntoIterator, + ) -> Result<&mut Self, RawStoreError> { + if !Arc::ptr_eq(&db.rocksdb, &self.rocksdb) { + return Err(RawStoreError::CrossDBBatch); + } + + new_vals + .into_iter() + .try_for_each::<_, Result<_, RawStoreError>>(|(k, v)| { + let k_buf = be_fix_int_ser(k.borrow())?; + let v_buf = bcs::to_bytes(v.borrow())?; + self.transaction.put_cf(&db.cf(), k_buf, v_buf)?; + Ok(()) + })?; + Ok(self) + } + + /// Deletes a set of keys given as an iterator + pub fn delete_batch, K: Serialize, V>( + &mut self, + db: &DBMap, + purged_vals: impl IntoIterator, + ) -> Result<&mut Self, RawStoreError> { + if !Arc::ptr_eq(&db.rocksdb, &self.rocksdb) { + return Err(RawStoreError::CrossDBBatch); + } + purged_vals + .into_iter() + .try_for_each::<_, Result<_, RawStoreError>>(|k| { + let k_buf = be_fix_int_ser(k.borrow())?; + self.transaction.delete_cf(&db.cf(), k_buf)?; + Ok(()) + })?; + Ok(self) + } + + pub fn snapshot( + &self, + ) -> rocksdb::SnapshotWithThreadMode<'_, Transaction<'a, rocksdb::OptimisticTransactionDB>> + { + self.transaction.snapshot() + } + + pub fn get_for_update( + &self, + db: &DBMap, + key: &K, + ) -> Result, RawStoreError> { + if !Arc::ptr_eq(&db.rocksdb, &self.rocksdb) { + return Err(RawStoreError::CrossDBBatch); + } + let k_buf = be_fix_int_ser(key.borrow())?; + match self + .transaction + .get_for_update_cf_opt(&db.cf(), k_buf, true, &db.opts.readopts())? 
+ { + Some(data) => Ok(Some(bcs::from_bytes(&data)?)), + None => Ok(None), + } + } + + pub fn get( + &self, + db: &DBMap, + key: &K, + ) -> Result, RawStoreError> { + let key_buf = be_fix_int_ser(key)?; + self.transaction + .get_cf_opt(&db.cf(), key_buf, &db.opts.readopts()) + .map_err(|e| RawStoreError::RocksDBError(e.to_string())) + .map(|res| res.and_then(|bytes| bcs::from_bytes::(&bytes).ok())) + } + + pub fn multi_get, K: Serialize + DeserializeOwned, V: DeserializeOwned>( + &self, + db: &DBMap, + keys: impl IntoIterator, + ) -> Result>, RawStoreError> { + let cf = db.cf(); + let keys_bytes: Result, RawStoreError> = keys + .into_iter() + .map(|k| Ok((&cf, be_fix_int_ser(k.borrow())?))) + .collect(); + + let results = self + .transaction + .multi_get_cf_opt(keys_bytes?, &db.opts.readopts()); + + let values_parsed: Result, RawStoreError> = results + .into_iter() + .map(|value_byte| match value_byte? { + Some(data) => Ok(Some(bcs::from_bytes(&data)?)), + None => Ok(None), + }) + .collect(); + + values_parsed + } + + pub fn iter( + &'a self, + db: &DBMap, + ) -> Iter<'a, K, V> { + let db_iter = self + .transaction + .raw_iterator_cf_opt(&db.cf(), db.opts.readopts()); + Iter::new( + db.cf.clone(), + RocksDBRawIter::OptimisticTransaction(db_iter), + None, + None, + None, + None, + None, + ) + } + + pub fn keys( + &'a self, + db: &DBMap, + ) -> Keys<'a, K> { + let mut db_iter = RocksDBRawIter::OptimisticTransaction( + self.transaction + .raw_iterator_cf_opt(&db.cf(), db.opts.readopts()), + ); + db_iter.seek_to_first(); + + Keys::new(db_iter) + } + + pub fn values( + &'a self, + db: &DBMap, + ) -> Values<'a, V> { + let mut db_iter = RocksDBRawIter::OptimisticTransaction( + self.transaction + .raw_iterator_cf_opt(&db.cf(), db.opts.readopts()), + ); + db_iter.seek_to_first(); + + Values::new(db_iter) + } + + pub fn commit(self) -> Result<(), RawStoreError> { + fail_point!("transaction-commit"); + self.transaction.commit().map_err(|e| match e.kind() { + // empirically, 
this is what you get when there is a write conflict. it is not + // documented whether this is the only time you can get this error. + ErrorKind::Busy | ErrorKind::TryAgain => RawStoreError::RetryableTransactionError, + _ => e.into(), + })?; + Ok(()) + } +} + +macro_rules! delegate_iter_call { + ($self:ident.$method:ident($($args:ident),*)) => { + match $self { + Self::DB(db) => db.$method($($args),*), + Self::OptimisticTransactionDB(db) => db.$method($($args),*), + Self::OptimisticTransaction(db) => db.$method($($args),*), + } + } +} + +pub enum RocksDBRawIter<'a> { + DB(rocksdb::DBRawIteratorWithThreadMode<'a, DBWithThreadMode>), + OptimisticTransactionDB( + rocksdb::DBRawIteratorWithThreadMode<'a, rocksdb::OptimisticTransactionDB>, + ), + OptimisticTransaction( + rocksdb::DBRawIteratorWithThreadMode< + 'a, + Transaction<'a, rocksdb::OptimisticTransactionDB>, + >, + ), +} + +impl<'a> RocksDBRawIter<'a> { + pub fn valid(&self) -> bool { + delegate_iter_call!(self.valid()) + } + pub fn key(&self) -> Option<&[u8]> { + delegate_iter_call!(self.key()) + } + pub fn value(&self) -> Option<&[u8]> { + delegate_iter_call!(self.value()) + } + pub fn next(&mut self) { + delegate_iter_call!(self.next()) + } + pub fn prev(&mut self) { + delegate_iter_call!(self.prev()) + } + pub fn seek>(&mut self, key: K) { + delegate_iter_call!(self.seek(key)) + } + pub fn seek_to_last(&mut self) { + delegate_iter_call!(self.seek_to_last()) + } + pub fn seek_to_first(&mut self) { + delegate_iter_call!(self.seek_to_first()) + } + pub fn seek_for_prev>(&mut self, key: K) { + delegate_iter_call!(self.seek_for_prev(key)) + } + pub fn status(&self) -> Result<(), rocksdb::Error> { + delegate_iter_call!(self.status()) + } +} + +pub enum RocksDBIter<'a> { + DB(rocksdb::DBIteratorWithThreadMode<'a, DBWithThreadMode>), + OptimisticTransactionDB( + rocksdb::DBIteratorWithThreadMode<'a, rocksdb::OptimisticTransactionDB>, + ), +} + +impl<'a> Iterator for RocksDBIter<'a> { + type Item = Result<(Box<[u8]>, 
Box<[u8]>), Error>; + fn next(&mut self) -> Option { + match self { + Self::DB(db) => db.next(), + Self::OptimisticTransactionDB(db) => db.next(), + } + } +} + +impl<'a, K, V> Map<'a, K, V> for DBMap +where + K: Serialize + DeserializeOwned, + V: Serialize + DeserializeOwned, +{ + type Error = RawStoreError; + type Iterator = Iter<'a, K, V>; + type SafeIterator = SafeIter<'a, K, V>; + type Keys = Keys<'a, K>; + type Values = Values<'a, V>; + + #[instrument(level = "trace", skip_all, err)] + fn contains_key(&self, key: &K) -> Result { + let key_buf = be_fix_int_ser(key)?; + // [`rocksdb::DBWithThreadMode::key_may_exist_cf`] can have false positives, + // but no false negatives. We use it to short-circuit the absent case + let readopts = self.opts.readopts(); + Ok(self + .rocksdb + .key_may_exist_cf(&self.cf(), &key_buf, &readopts) + && self + .rocksdb + .get_pinned_cf(&self.cf(), &key_buf, &readopts)? + .is_some()) + } + + #[instrument(level = "trace", skip_all, err)] + fn get(&self, key: &K) -> Result, RawStoreError> { + let _timer = self + .db_metrics + .op_metrics + .rocksdb_get_latency_seconds + .with_label_values(&[&self.cf]) + .start_timer(); + let perf_ctx = if self.get_sample_interval.sample() { + Some(RocksDBPerfContext::default()) + } else { + None + }; + let key_buf = be_fix_int_ser(key)?; + let res = self + .rocksdb + .get_pinned_cf(&self.cf(), &key_buf, &self.opts.readopts())?; + self.db_metrics + .op_metrics + .rocksdb_get_bytes + .with_label_values(&[&self.cf]) + .observe(res.as_ref().map_or(0.0, |v| v.len() as f64)); + if perf_ctx.is_some() { + self.db_metrics + .read_perf_ctx_metrics + .report_metrics(&self.cf); + } + match res { + Some(data) => Ok(Some(bcs::from_bytes(&data)?)), + None => Ok(None), + } + } + + #[instrument(level = "trace", skip_all, err)] + fn get_raw_bytes(&self, key: &K) -> Result>, RawStoreError> { + let _timer = self + .db_metrics + .op_metrics + .rocksdb_get_latency_seconds + .with_label_values(&[&self.cf]) + .start_timer(); + 
let perf_ctx = if self.get_sample_interval.sample() { + Some(RocksDBPerfContext::default()) + } else { + None + }; + let key_buf = be_fix_int_ser(key)?; + let res = self + .rocksdb + .get_pinned_cf(&self.cf(), &key_buf, &self.opts.readopts())?; + self.db_metrics + .op_metrics + .rocksdb_get_bytes + .with_label_values(&[&self.cf]) + .observe(res.as_ref().map_or(0.0, |v| v.len() as f64)); + if perf_ctx.is_some() { + self.db_metrics + .read_perf_ctx_metrics + .report_metrics(&self.cf); + } + match res { + Some(data) => Ok(Some(data.to_vec())), + None => Ok(None), + } + } + + #[instrument(level = "trace", skip_all, err)] + fn insert(&self, key: &K, value: &V) -> Result<(), RawStoreError> { + let _timer = self + .db_metrics + .op_metrics + .rocksdb_put_latency_seconds + .with_label_values(&[&self.cf]) + .start_timer(); + let perf_ctx = if self.write_sample_interval.sample() { + Some(RocksDBPerfContext::default()) + } else { + None + }; + let key_buf = be_fix_int_ser(key)?; + let value_buf = bcs::to_bytes(value)?; + self.db_metrics + .op_metrics + .rocksdb_put_bytes + .with_label_values(&[&self.cf]) + .observe((key_buf.len() + value_buf.len()) as f64); + if perf_ctx.is_some() { + self.db_metrics + .write_perf_ctx_metrics + .report_metrics(&self.cf); + } + self.rocksdb + .put_cf(&self.cf(), &key_buf, &value_buf, &self.opts.writeopts())?; + Ok(()) + } + + #[instrument(level = "trace", skip_all, err)] + fn remove(&self, key: &K) -> Result<(), RawStoreError> { + let _timer = self + .db_metrics + .op_metrics + .rocksdb_delete_latency_seconds + .with_label_values(&[&self.cf]) + .start_timer(); + let perf_ctx = if self.write_sample_interval.sample() { + Some(RocksDBPerfContext::default()) + } else { + None + }; + let key_buf = be_fix_int_ser(key)?; + self.rocksdb + .delete_cf(&self.cf(), key_buf, &self.opts.writeopts())?; + self.db_metrics + .op_metrics + .rocksdb_deletes + .with_label_values(&[&self.cf]) + .inc(); + if perf_ctx.is_some() { + self.db_metrics + 
.write_perf_ctx_metrics + .report_metrics(&self.cf); + } + Ok(()) + } + + #[instrument(level = "trace", skip_all, err)] + fn clear(&self) -> Result<(), RawStoreError> { + let _ = self.rocksdb.drop_cf(&self.cf); + self.rocksdb + .create_cf(self.cf.clone(), &default_db_options().options)?; + Ok(()) + } + + fn is_empty(&self) -> bool { + self.safe_iter().next().is_none() + } + + fn iter(&'a self) -> Self::Iterator { + let _timer = self + .db_metrics + .op_metrics + .rocksdb_iter_latency_seconds + .with_label_values(&[&self.cf]) + .start_timer(); + let bytes_scanned = self + .db_metrics + .op_metrics + .rocksdb_iter_bytes + .with_label_values(&[&self.cf]); + let keys_scanned = self + .db_metrics + .op_metrics + .rocksdb_iter_keys + .with_label_values(&[&self.cf]); + let _perf_ctx = if self.iter_sample_interval.sample() { + Some(RocksDBPerfContext::default()) + } else { + None + }; + let db_iter = self + .rocksdb + .raw_iterator_cf(&self.cf(), self.opts.readopts()); + Iter::new( + self.cf.clone(), + db_iter, + Some(_timer), + _perf_ctx, + Some(bytes_scanned), + Some(keys_scanned), + Some(self.db_metrics.clone()), + ) + } + + fn safe_iter(&'a self) -> Self::SafeIterator { + let _timer = self + .db_metrics + .op_metrics + .rocksdb_iter_latency_seconds + .with_label_values(&[&self.cf]) + .start_timer(); + let _perf_ctx = if self.iter_sample_interval.sample() { + Some(RocksDBPerfContext::default()) + } else { + None + }; + let bytes_scanned = self + .db_metrics + .op_metrics + .rocksdb_iter_bytes + .with_label_values(&[&self.cf]); + let keys_scanned = self + .db_metrics + .op_metrics + .rocksdb_iter_keys + .with_label_values(&[&self.cf]); + let mut db_iter = self + .rocksdb + .raw_iterator_cf(&self.cf(), self.opts.readopts()); + db_iter.seek_to_first(); + SafeIter::new( + self.cf.clone(), + db_iter, + Some(_timer), + _perf_ctx, + Some(bytes_scanned), + Some(keys_scanned), + Some(self.db_metrics.clone()), + ) + } + + /// Returns an iterator visiting each key-value pair in 
the map. By proving bounds of the + /// scan range, RocksDB scan avoid unnecessary scans + fn iter_with_bounds( + &'a self, + lower_bound: Option, + upper_bound: Option, + ) -> Self::Iterator { + let _timer = self + .db_metrics + .op_metrics + .rocksdb_iter_latency_seconds + .with_label_values(&[&self.cf]) + .start_timer(); + let bytes_scanned = self + .db_metrics + .op_metrics + .rocksdb_iter_bytes + .with_label_values(&[&self.cf]); + let keys_scanned = self + .db_metrics + .op_metrics + .rocksdb_iter_keys + .with_label_values(&[&self.cf]); + let _perf_ctx = if self.iter_sample_interval.sample() { + Some(RocksDBPerfContext::default()) + } else { + None + }; + let mut readopts = ReadOptions::default(); + if let Some(lower_bound) = lower_bound { + let key_buf = be_fix_int_ser(&lower_bound).unwrap(); + readopts.set_iterate_lower_bound(key_buf); + } + if let Some(upper_bound) = upper_bound { + let key_buf = be_fix_int_ser(&upper_bound).unwrap(); + readopts.set_iterate_upper_bound(key_buf); + } + let db_iter = self.rocksdb.raw_iterator_cf(&self.cf(), readopts); + Iter::new( + self.cf.clone(), + db_iter, + Some(_timer), + _perf_ctx, + Some(bytes_scanned), + Some(keys_scanned), + Some(self.db_metrics.clone()), + ) + } + + fn keys(&'a self) -> Self::Keys { + let mut db_iter = self + .rocksdb + .raw_iterator_cf(&self.cf(), self.opts.readopts()); + db_iter.seek_to_first(); + + Keys::new(db_iter) + } + + fn values(&'a self) -> Self::Values { + let mut db_iter = self + .rocksdb + .raw_iterator_cf(&self.cf(), self.opts.readopts()); + db_iter.seek_to_first(); + + Values::new(db_iter) + } + + /// Returns a vector of raw values corresponding to the keys provided. 
+    ///
+    /// Keys are serialized with [`be_fix_int_ser`] and fetched with one `multi_get_cf` call.
+    /// The result holds one entry per requested key, `None` marking a key with no value.
+    ///
+    /// # Errors
+    /// Fails if a key cannot be serialized or if RocksDB reports an error for any key.
+    #[instrument(level = "trace", skip_all, err)]
+    fn multi_get_raw_bytes<J>(
+        &self,
+        keys: impl IntoIterator<Item = J>,
+    ) -> Result<Vec<Option<Vec<u8>>>, RawStoreError>
+    where
+        J: Borrow<K>,
+    {
+        let _timer = self
+            .db_metrics
+            .op_metrics
+            .rocksdb_multiget_latency_seconds
+            .with_label_values(&[&self.cf])
+            .start_timer();
+        let perf_ctx = if self.multiget_sample_interval.sample() {
+            Some(RocksDBPerfContext::default())
+        } else {
+            None
+        };
+        let cf = self.cf();
+        // Serialize every key up front; the first failure aborts the whole lookup.
+        let keys_bytes: Result<Vec<_>, RawStoreError> = keys
+            .into_iter()
+            .map(|k| Ok((&cf, be_fix_int_ser(k.borrow())?)))
+            .collect();
+        let results = self
+            .rocksdb
+            .multi_get_cf(keys_bytes?, &self.opts.readopts());
+        // Missing values and per-key errors count as zero bytes read.
+        let entry_size = |entry: &Result<Option<Vec<u8>>, rocksdb::Error>| -> f64 {
+            entry
+                .as_ref()
+                .map_or(0.0, |e| e.as_ref().map_or(0.0, |v| v.len() as f64))
+        };
+        self.db_metrics
+            .op_metrics
+            .rocksdb_multiget_bytes
+            .with_label_values(&[&self.cf])
+            .observe(results.iter().map(entry_size).sum());
+        if perf_ctx.is_some() {
+            self.db_metrics
+                .read_perf_ctx_metrics
+                .report_metrics(&self.cf);
+        }
+        Ok(results.into_iter().collect::<Result<_, _>>()?)
+    }
+
+    /// Returns a vector of values corresponding to the keys provided.
+    ///
+    /// Raw bytes come from [`Self::multi_get_raw_bytes`] and each present value is decoded
+    /// with `bcs::from_bytes`.
+    ///
+    /// # Errors
+    /// Fails if the raw lookup fails or if a stored value cannot be deserialized.
+    #[instrument(level = "trace", skip_all, err)]
+    fn multi_get<J>(
+        &self,
+        keys: impl IntoIterator<Item = J>,
+    ) -> Result<Vec<Option<V>>, RawStoreError>
+    where
+        J: Borrow<K>,
+    {
+        let results = self.multi_get_raw_bytes(keys)?;
+        let values_parsed: Result<Vec<_>, RawStoreError> = results
+            .into_iter()
+            .map(|value_byte| match value_byte {
+                Some(data) => Ok(Some(bcs::from_bytes(&data)?)),
+                None => Ok(None),
+            })
+            .collect();
+
+        values_parsed
+    }
+
+    /// Returns a vector of values corresponding to the keys provided.
+ #[instrument(level = "trace", skip_all, err)] + fn chunked_multi_get( + &self, + keys: impl IntoIterator, + chunk_size: usize, + ) -> Result>, RawStoreError> + where + J: Borrow, + { + let cf = self.cf(); + let keys_bytes = keys + .into_iter() + .map(|k| (&cf, be_fix_int_ser(k.borrow()).unwrap())); + let chunked_keys = keys_bytes.into_iter().chunks(chunk_size); + let snapshot = self.snapshot()?; + let mut results = vec![]; + for chunk in chunked_keys.into_iter() { + let chunk_result = snapshot.multi_get_cf(chunk); + let values_parsed: Result, RawStoreError> = chunk_result + .into_iter() + .map(|value_byte| { + let value_byte = value_byte?; + match value_byte { + Some(data) => Ok(Some(bcs::from_bytes(&data)?)), + None => Ok(None), + } + }) + .collect(); + results.extend(values_parsed?); + } + Ok(results) + } + + /// Convenience method for batch insertion + #[instrument(level = "trace", skip_all, err)] + fn multi_insert( + &self, + key_val_pairs: impl IntoIterator, + ) -> Result<(), Self::Error> + where + J: Borrow, + U: Borrow, + { + let mut batch = self.batch(); + batch.insert_batch(self, key_val_pairs)?; + batch.write() + } + + /// Convenience method for batch removal + #[instrument(level = "trace", skip_all, err)] + fn multi_remove(&self, keys: impl IntoIterator) -> Result<(), Self::Error> + where + J: Borrow, + { + let mut batch = self.batch(); + batch.delete_batch(self, keys)?; + batch.write() + } + + /// Try to catch up with primary when running as secondary + #[instrument(level = "trace", skip_all, err)] + fn try_catch_up_with_primary(&self) -> Result<(), Self::Error> { + Ok(self.rocksdb.try_catch_up_with_primary()?) 
+ } +} + +impl TryExtend<(J, U)> for DBMap +where + J: Borrow, + U: Borrow, + K: Serialize, + V: Serialize, +{ + type Error = RawStoreError; + + fn try_extend(&mut self, iter: &mut T) -> Result<(), Self::Error> + where + T: Iterator, + { + let mut batch = self.batch(); + batch.insert_batch(self, iter)?; + batch.write() + } + + fn try_extend_from_slice(&mut self, slice: &[(J, U)]) -> Result<(), Self::Error> { + let slice_of_refs = slice.iter().map(|(k, v)| (k.borrow(), v.borrow())); + let mut batch = self.batch(); + batch.insert_batch(self, slice_of_refs)?; + batch.write() + } +} + +pub fn read_size_from_env(var_name: &str) -> Option { + env::var(var_name) + .ok()? + .parse::() + .tap_err(|e| { + warn!( + "Env var {} does not contain valid usize integer: {}", + var_name, e + ) + }) + .ok() +} + +#[derive(Default, Clone, Debug)] +pub struct ReadWriteOptions { + pub ignore_range_deletions: bool, +} + +impl ReadWriteOptions { + pub fn readopts(&self) -> ReadOptions { + let mut readopts = ReadOptions::default(); + readopts.set_ignore_range_deletions(self.ignore_range_deletions); + readopts + } + pub fn writeopts(&self) -> WriteOptions { + WriteOptions::default() + } +} + +// TODO: refactor this into a builder pattern, where rocksdb::Options are +// generated after a call to build(). +#[derive(Default, Clone)] +pub struct DBOptions { + pub options: rocksdb::Options, + pub rw_options: ReadWriteOptions, +} + +impl DBOptions { + // Optimize lookup perf for tables where no scans are performed. + // If non-trivial number of values can be > 512B in size, it is beneficial to also + // specify optimize_for_large_values_no_scan(). + pub fn optimize_for_point_lookup(mut self, block_cache_size_mb: usize) -> DBOptions { + // NOTE: this overwrites the block options. + self.options + .optimize_for_point_lookup(block_cache_size_mb as u64); + self + } + + // Optimize write and lookup perf for tables which are rarely scanned, and have large values. 
+ // https://rocksdb.org/blog/2021/05/26/integrated-blob-db.html + pub fn optimize_for_large_values_no_scan(mut self, min_blob_size: u64) -> DBOptions { + if env::var(ENV_VAR_DISABLE_BLOB_STORAGE).is_ok() { + info!("Large value blob storage optimization is disabled via env var."); + return self; + } + + // Blob settings. + self.options.set_enable_blob_files(true); + self.options + .set_blob_compression_type(rocksdb::DBCompressionType::Lz4); + self.options.set_enable_blob_gc(true); + // Since each blob can have non-trivial size overhead, and compression does not work across blobs, + // set a min blob size in bytes to so small transactions and effects are kept in sst files. + self.options.set_min_blob_size(min_blob_size); + + // Increase write buffer size to 256MiB. + let write_buffer_size = read_size_from_env(ENV_VAR_MAX_WRITE_BUFFER_SIZE_MB) + .unwrap_or(DEFAULT_MAX_WRITE_BUFFER_SIZE_MB) + * 1024 + * 1024; + self.options.set_write_buffer_size(write_buffer_size); + // Since large blobs are not in sst files, reduce the target file size and base level + // target size. + let target_file_size_base = 64 << 20; + self.options + .set_target_file_size_base(target_file_size_base); + // Level 1 default to 64MiB * 6 ~ 384MiB. + let max_level_zero_file_num = read_size_from_env(ENV_VAR_L0_NUM_FILES_COMPACTION_TRIGGER) + .unwrap_or(DEFAULT_L0_NUM_FILES_COMPACTION_TRIGGER); + self.options + .set_max_bytes_for_level_base(target_file_size_base * max_level_zero_file_num as u64); + + self + } + + // Optimize tables with a mix of lookup and scan workloads. + pub fn optimize_for_read(mut self, block_cache_size_mb: usize) -> DBOptions { + self.options + .set_block_based_table_factory(&get_block_options(block_cache_size_mb)); + self + } + + // Optimize DB receiving significant insertions. 
+    /// Sets both the DB-wide write buffer size and the maximum total WAL size to
+    /// `db_max_write_buffer_gb` GiB.
+    pub fn optimize_db_for_write_throughput(mut self, db_max_write_buffer_gb: u64) -> DBOptions {
+        self.options
+            .set_db_write_buffer_size(db_max_write_buffer_gb as usize * 1024 * 1024 * 1024);
+        self.options
+            .set_max_total_wal_size(db_max_write_buffer_gb * 1024 * 1024 * 1024);
+        self
+    }
+
+    /// Optimize tables receiving significant insertions.
+    ///
+    /// Each setting can be overridden through its environment variable.
+    ///
+    /// # Panics
+    /// Panics if an overridden value does not fit the integer type RocksDB expects.
+    pub fn optimize_for_write_throughput(mut self) -> DBOptions {
+        // Increase write buffer size to 256MiB.
+        let write_buffer_size = read_size_from_env(ENV_VAR_MAX_WRITE_BUFFER_SIZE_MB)
+            .unwrap_or(DEFAULT_MAX_WRITE_BUFFER_SIZE_MB)
+            * 1024
+            * 1024;
+        self.options.set_write_buffer_size(write_buffer_size);
+        // Increase write buffers to keep to 6 before slowing down writes.
+        let max_write_buffer_number = read_size_from_env(ENV_VAR_MAX_WRITE_BUFFER_NUMBER)
+            .unwrap_or(DEFAULT_MAX_WRITE_BUFFER_NUMBER);
+        self.options
+            .set_max_write_buffer_number(max_write_buffer_number.try_into().unwrap());
+        // Keep 1 write buffer so recent writes can be read from memory.
+        self.options
+            .set_max_write_buffer_size_to_maintain((write_buffer_size).try_into().unwrap());
+
+        // Increase compaction trigger for level 0 to 6.
+        let max_level_zero_file_num = read_size_from_env(ENV_VAR_L0_NUM_FILES_COMPACTION_TRIGGER)
+            .unwrap_or(DEFAULT_L0_NUM_FILES_COMPACTION_TRIGGER);
+        self.options.set_level_zero_file_num_compaction_trigger(
+            max_level_zero_file_num.try_into().unwrap(),
+        );
+        self.options.set_level_zero_slowdown_writes_trigger(
+            (max_level_zero_file_num * 4).try_into().unwrap(),
+        );
+        self.options
+            .set_level_zero_stop_writes_trigger((max_level_zero_file_num * 5).try_into().unwrap());
+
+        // Increase sst file size to 128MiB.
+        self.options.set_target_file_size_base(
+            read_size_from_env(ENV_VAR_TARGET_FILE_SIZE_BASE_MB)
+                .unwrap_or(DEFAULT_TARGET_FILE_SIZE_BASE_MB) as u64
+                * 1024
+                * 1024,
+        );
+
+        // Increase level 1 target size to 256MiB * 6 ~ 1.5GiB.
+        self.options
+            .set_max_bytes_for_level_base((write_buffer_size * max_level_zero_file_num) as u64);
+
+        self
+    }
+
+    /// Optimize tables receiving significant deletions.
+    // TODO: revisit when intra-epoch pruning is enabled.
+    pub fn optimize_for_pruning(mut self) -> DBOptions {
+        self.options.set_min_write_buffer_number_to_merge(2);
+        self
+    }
+}
+
+/// Creates a default RocksDB option, to be used when RocksDB option is unspecified.
+///
+/// # Panics
+/// Panics if the `ENV_VAR_MAX_BACKGROUND_JOBS` override does not fit the integer type
+/// RocksDB expects.
+pub fn default_db_options() -> DBOptions {
+    let mut opt = rocksdb::Options::default();
+
+    // One common issue when running tests on Mac is that the default ulimit is too low,
+    // leading to I/O errors such as "Too many open files". Raising fdlimit to bypass it.
+    if let Some(limit) = fdlimit::raise_fd_limit() {
+        // on windows raise_fd_limit return None
+        // Saturate rather than truncate when the raised limit exceeds `i32::MAX`.
+        opt.set_max_open_files(i32::try_from(limit / 8).unwrap_or(i32::MAX));
+    }
+
+    // The table cache is locked for updates and this determines the number
+    // of shards, ie 2^10. Increase in case of lock contentions.
+    opt.set_table_cache_num_shard_bits(10);
+
+    // LSM compression settings
+    opt.set_min_level_to_compress(2);
+    opt.set_compression_type(rocksdb::DBCompressionType::Lz4);
+    opt.set_bottommost_compression_type(rocksdb::DBCompressionType::Zstd);
+    opt.set_bottommost_zstd_max_train_bytes(1024 * 1024, true);
+
+    opt.set_max_background_jobs(
+        read_size_from_env(ENV_VAR_MAX_BACKGROUND_JOBS)
+            .unwrap_or(2)
+            .try_into()
+            .unwrap(),
+    );
+
+    // Sui uses multiple RocksDB in a node, so total sizes of write buffers and WAL can be higher
+    // than the limits below.
+    //
+    // RocksDB also exposes the option to configure total write buffer size across multiple instances
+    // via `write_buffer_manager`. But the write buffer flush policy (flushing the buffer receiving
+    // the next write) may not work well. So sticking to per-db write buffer size limit for now.
+    //
+    // The environment variables are only meant to be emergency overrides. They may go away in future.
+    // If you need to modify an option, either update the default value, or override the option in
+    // Sui / Narwhal.
+    opt.set_db_write_buffer_size(
+        read_size_from_env(ENV_VAR_DB_WRITE_BUFFER_SIZE).unwrap_or(DEFAULT_DB_WRITE_BUFFER_SIZE)
+            * 1024
+            * 1024,
+    );
+    opt.set_max_total_wal_size(
+        read_size_from_env(ENV_VAR_DB_WAL_SIZE).unwrap_or(DEFAULT_DB_WAL_SIZE) as u64 * 1024 * 1024,
+    );
+
+    opt.increase_parallelism(4);
+    opt.set_enable_pipelined_write(true);
+
+    opt.set_block_based_table_factory(&get_block_options(128));
+
+    // Set memtable bloomfilter.
+    opt.set_memtable_prefix_bloom_ratio(0.02);
+
+    DBOptions {
+        options: opt,
+        rw_options: ReadWriteOptions::default(),
+    }
+}
+
+/// Builds block-based table options with an LRU block cache of `block_cache_size_mb` MiB.
+fn get_block_options(block_cache_size_mb: usize) -> BlockBasedOptions {
+    // Set options mostly similar to those used in optimize_for_point_lookup(),
+    // except non-default binary and hash index, to hopefully reduce lookup latencies
+    // without causing any regression for scanning, with slightly more memory usages.
+    // https://github.com/facebook/rocksdb/blob/11cb6af6e5009c51794641905ca40ce5beec7fee/options/options.cc#L611-L621
+    let mut block_options = BlockBasedOptions::default();
+    // Increase block size to 16KiB.
+    // https://github.com/EighteenZi/rocksdb_wiki/blob/master/Memory-usage-in-RocksDB.md#indexes-and-filter-blocks
+    block_options.set_block_size(16 * 1024);
+    // Configure a block cache.
+    block_options.set_block_cache(&Cache::new_lru_cache(block_cache_size_mb << 20));
+    // Set a bloomfilter with 1% false positive rate.
+    block_options.set_bloom_filter(10.0, false);
+    // From https://github.com/EighteenZi/rocksdb_wiki/blob/master/Block-Cache.md#caching-index-and-filter-blocks
+    block_options.set_pin_l0_filter_and_index_blocks_in_cache(true);
+    block_options
+}
+
+/// Opens a database with options, and a number of column families that are created if they do not exist.
+#[instrument(level="debug", skip_all, fields(path = ?path.as_ref(), cf = ?opt_cfs), err)] +pub fn open_cf>( + path: P, + db_options: Option, + metric_conf: MetricConf, + opt_cfs: &[&str], +) -> Result, RawStoreError> { + let options = db_options.unwrap_or_else(|| default_db_options().options); + let column_descriptors: Vec<_> = opt_cfs + .iter() + .map(|name| (*name, options.clone())) + .collect(); + open_cf_opts( + path, + Some(options.clone()), + metric_conf, + &column_descriptors[..], + ) +} + +fn prepare_db_options(db_options: Option) -> rocksdb::Options { + // Customize database options + let mut options = db_options.unwrap_or_else(|| default_db_options().options); + options.create_if_missing(true); + options.create_missing_column_families(true); + options +} + +/// Opens a database with options, and a number of column families with individual options that are created if they do not exist. +#[instrument(level="debug", skip_all, fields(path = ?path.as_ref()), err)] +pub fn open_cf_opts>( + path: P, + db_options: Option, + metric_conf: MetricConf, + opt_cfs: &[(&str, rocksdb::Options)], +) -> Result, RawStoreError> { + let path = path.as_ref(); + // In the simulator, we intercept the wall clock in the test thread only. This causes problems + // because rocksdb uses the simulated clock when creating its background threads, but then + // those threads see the real wall clock (because they are not the test thread), which causes + // rocksdb to panic. The `nondeterministic` macro evaluates expressions in new threads, which + // resolves the issue. + // + // This is a no-op in non-simulator builds. + + let cfs = populate_missing_cfs(opt_cfs, path)?; + nondeterministic!({ + let options = prepare_db_options(db_options); + let rocksdb = { + rocksdb::DBWithThreadMode::::open_cf_descriptors( + &options, + path, + cfs.into_iter() + .map(|(name, opts)| ColumnFamilyDescriptor::new(name, opts)), + )? 
+ }; + Ok(Arc::new(RocksDB::DBWithThreadMode( + DBWithThreadModeWrapper { + underlying: rocksdb, + metric_conf, + db_path: PathBuf::from(path), + }, + ))) + }) +} + +/// Opens a database with options, and a number of column families with individual options that are created if they do not exist. +#[instrument(level="debug", skip_all, fields(path = ?path.as_ref()), err)] +pub fn open_cf_opts_transactional>( + path: P, + db_options: Option, + metric_conf: MetricConf, + opt_cfs: &[(&str, rocksdb::Options)], +) -> Result, RawStoreError> { + let path = path.as_ref(); + let cfs = populate_missing_cfs(opt_cfs, path)?; + // See comment above for explanation of why nondeterministic is necessary here. + nondeterministic!({ + let options = prepare_db_options(db_options); + let rocksdb = rocksdb::OptimisticTransactionDB::::open_cf_descriptors( + &options, + path, + cfs.into_iter() + .map(|(name, opts)| ColumnFamilyDescriptor::new(name, opts)), + )?; + Ok(Arc::new(RocksDB::OptimisticTransactionDB( + OptimisticTransactionDBWrapper { + underlying: rocksdb, + metric_conf, + db_path: PathBuf::from(path), + }, + ))) + }) +} + +/// Opens a database with options, and a number of column families with individual options that are created if they do not exist. +pub fn open_cf_opts_secondary>( + primary_path: P, + secondary_path: Option

, + db_options: Option, + metric_conf: MetricConf, + opt_cfs: &[(&str, rocksdb::Options)], +) -> Result, RawStoreError> { + let primary_path = primary_path.as_ref(); + let secondary_path = secondary_path.as_ref().map(|p| p.as_ref()); + // See comment above for explanation of why nondeterministic is necessary here. + nondeterministic!({ + // Customize database options + let mut options = db_options.unwrap_or_else(|| default_db_options().options); + + fdlimit::raise_fd_limit(); + // This is a requirement by RocksDB when opening as secondary + options.set_max_open_files(-1); + + let mut opt_cfs: std::collections::HashMap<_, _> = opt_cfs.iter().cloned().collect(); + let cfs = rocksdb::DBWithThreadMode::::list_cf(&options, primary_path) + .ok() + .unwrap_or_default(); + + let default_db_options = default_db_options(); + // Add CFs not explicitly listed + for cf_key in cfs.iter() { + if !opt_cfs.contains_key(&cf_key[..]) { + opt_cfs.insert(cf_key, default_db_options.options.clone()); + } + } + + let primary_path = primary_path.to_path_buf(); + let secondary_path = secondary_path.map(|q| q.to_path_buf()).unwrap_or_else(|| { + let mut s = primary_path.clone(); + s.pop(); + s.push("SECONDARY"); + s.as_path().to_path_buf() + }); + + let rocksdb = { + options.create_if_missing(true); + options.create_missing_column_families(true); + let db = rocksdb::DBWithThreadMode::::open_cf_descriptors_as_secondary( + &options, + &primary_path, + &secondary_path, + opt_cfs + .iter() + .map(|(name, opts)| ColumnFamilyDescriptor::new(*name, (*opts).clone())), + )?; + db.try_catch_up_with_primary()?; + db + }; + Ok(Arc::new(RocksDB::DBWithThreadMode( + DBWithThreadModeWrapper { + underlying: rocksdb, + metric_conf, + db_path: secondary_path, + }, + ))) + }) +} + +pub fn list_tables(path: std::path::PathBuf) -> eyre::Result> { + const DB_DEFAULT_CF_NAME: &str = "default"; + + let opts = rocksdb::Options::default(); + rocksdb::DBWithThreadMode::::list_cf(&opts, path) + .map_err(|e| e.into()) + 
.map(|q| { + q.iter() + .filter_map(|s| { + // The `default` table is not used + if s != DB_DEFAULT_CF_NAME { + Some(s.clone()) + } else { + None + } + }) + .collect() + }) +} + +/// TODO: Good description of why we're doing this : RocksDB stores keys in BE and has a seek operator on iterators, see `https://github.com/facebook/rocksdb/wiki/Iterator#introduction` +#[inline] +pub fn be_fix_int_ser(t: &S) -> Result, RawStoreError> +where + S: ?Sized + serde::Serialize, +{ + bincode::DefaultOptions::new() + .with_big_endian() + .with_fixint_encoding() + .serialize(t) + .map_err(|e| e.into()) +} + +#[derive(Clone)] +pub struct DBMapTableConfigMap(BTreeMap); +impl DBMapTableConfigMap { + pub fn new(map: BTreeMap) -> Self { + Self(map) + } + + pub fn to_map(&self) -> BTreeMap { + self.0.clone() + } +} + +pub enum RocksDBAccessType { + Primary, + Secondary(Option), +} + +pub fn safe_drop_db(path: PathBuf) -> Result<(), rocksdb::Error> { + rocksdb::DB::destroy(&rocksdb::Options::default(), path) +} + +fn populate_missing_cfs( + input_cfs: &[(&str, rocksdb::Options)], + path: &Path, +) -> Result, rocksdb::Error> { + let mut cfs = vec![]; + let input_cf_index: HashSet<_> = input_cfs.iter().map(|(name, _)| *name).collect(); + let existing_cfs = + rocksdb::DBWithThreadMode::::list_cf(&rocksdb::Options::default(), path) + .ok() + .unwrap_or_default(); + + for cf_name in existing_cfs { + if !input_cf_index.contains(&cf_name[..]) { + cfs.push((cf_name, rocksdb::Options::default())); + } + } + cfs.extend( + input_cfs + .iter() + .map(|(name, opts)| (name.to_string(), (*opts).clone())), + ); + Ok(cfs) +} diff --git a/moveos/raw-store/src/rocks/safe_iter.rs b/moveos/raw-store/src/rocks/safe_iter.rs new file mode 100644 index 000000000..d57360cf0 --- /dev/null +++ b/moveos/raw-store/src/rocks/safe_iter.rs @@ -0,0 +1,161 @@ +// Copyright (c) RoochNetwork +// SPDX-License-Identifier: Apache-2.0 + +// Copyright (c) Mysten Labs, Inc. 
+// SPDX-License-Identifier: Apache-2.0 +use std::{marker::PhantomData, sync::Arc}; + +use bincode::Options; +use prometheus::{Histogram, HistogramTimer}; +use rocksdb::Direction; + +use crate::metrics::{DBMetrics, RocksDBPerfContext}; + +use super::{be_fix_int_ser, errors::RawStoreError, RocksDBRawIter}; +use serde::{de::DeserializeOwned, Serialize}; + +/// An iterator over all key-value pairs in a data map. +pub struct SafeIter<'a, K, V> { + cf_name: String, + db_iter: RocksDBRawIter<'a>, + _phantom: PhantomData<(K, V)>, + direction: Direction, + _timer: Option, + _perf_ctx: Option, + bytes_scanned: Option, + keys_scanned: Option, + db_metrics: Option>, + bytes_scanned_counter: usize, + keys_returned_counter: usize, +} + +impl<'a, K: DeserializeOwned, V: DeserializeOwned> SafeIter<'a, K, V> { + pub(super) fn new( + cf_name: String, + db_iter: RocksDBRawIter<'a>, + _timer: Option, + _perf_ctx: Option, + bytes_scanned: Option, + keys_scanned: Option, + db_metrics: Option>, + ) -> Self { + Self { + cf_name, + db_iter, + _phantom: PhantomData, + direction: Direction::Forward, + _timer, + _perf_ctx, + bytes_scanned, + keys_scanned, + db_metrics, + bytes_scanned_counter: 0, + keys_returned_counter: 0, + } + } +} + +impl<'a, K: DeserializeOwned, V: DeserializeOwned> Iterator for SafeIter<'a, K, V> { + type Item = Result<(K, V), RawStoreError>; + + fn next(&mut self) -> Option { + if self.db_iter.valid() { + let config = bincode::DefaultOptions::new() + .with_big_endian() + .with_fixint_encoding(); + let raw_key = self + .db_iter + .key() + .expect("Valid iterator failed to get key"); + let raw_value = self + .db_iter + .value() + .expect("Valid iterator failed to get value"); + self.bytes_scanned_counter += raw_key.len() + raw_value.len(); + self.keys_returned_counter += 1; + let key = config.deserialize(raw_key).ok(); + let value = bcs::from_bytes(raw_value).ok(); + match self.direction { + Direction::Forward => self.db_iter.next(), + Direction::Reverse => 
self.db_iter.prev(), + } + key.and_then(|k| value.map(|v| Ok((k, v)))) + } else { + match self.db_iter.status() { + Ok(_) => None, + Err(err) => Some(Err(RawStoreError::RocksDBError(format!("{err}")))), + } + } + } +} + +impl<'a, K, V> Drop for SafeIter<'a, K, V> { + fn drop(&mut self) { + if let Some(bytes_scanned) = self.bytes_scanned.take() { + bytes_scanned.observe(self.bytes_scanned_counter as f64); + } + if let Some(keys_scanned) = self.keys_scanned.take() { + keys_scanned.observe(self.keys_returned_counter as f64); + } + if let Some(db_metrics) = self.db_metrics.take() { + db_metrics + .read_perf_ctx_metrics + .report_metrics(&self.cf_name); + } + } +} + +impl<'a, K: Serialize, V> SafeIter<'a, K, V> { + /// Skips all the elements that are smaller than the given key, + /// and either lands on the key or the first one greater than + /// the key. + pub fn skip_to(mut self, key: &K) -> Result { + self.db_iter.seek(be_fix_int_ser(key)?); + Ok(self) + } + + /// Moves the iterator the element given or + /// the one prior to it if it does not exist. If there is + /// no element prior to it, it returns an empty iterator. + pub fn skip_prior_to(mut self, key: &K) -> Result { + self.db_iter.seek_for_prev(be_fix_int_ser(key)?); + Ok(self) + } + + /// Seeks to the last key in the database (at this column family). + pub fn skip_to_last(mut self) -> Self { + self.db_iter.seek_to_last(); + self + } + + /// Will make the direction of the iteration reverse and will + /// create a new `RevIter` to consume. Every call to `next` method + /// will give the next element from the end. + pub fn reverse(mut self) -> SafeRevIter<'a, K, V> { + self.direction = Direction::Reverse; + SafeRevIter::new(self) + } +} + +/// An iterator with a reverted direction to the original. The `RevIter` +/// is hosting an iteration which is consuming in the opposing direction. +/// It's not possible to do further manipulation (ex re-reverse) to the +/// iterator. 
+pub struct SafeRevIter<'a, K, V> { + iter: SafeIter<'a, K, V>, +} + +impl<'a, K, V> SafeRevIter<'a, K, V> { + fn new(iter: SafeIter<'a, K, V>) -> Self { + Self { iter } + } +} + +impl<'a, K: DeserializeOwned, V: DeserializeOwned> Iterator for SafeRevIter<'a, K, V> { + type Item = Result<(K, V), RawStoreError>; + + /// Will give the next item backwards + fn next(&mut self) -> Option { + self.iter.next() + } +} diff --git a/moveos/raw-store/src/rocks/tests.rs b/moveos/raw-store/src/rocks/tests.rs new file mode 100644 index 000000000..ee6a8a738 --- /dev/null +++ b/moveos/raw-store/src/rocks/tests.rs @@ -0,0 +1,1154 @@ +// Copyright (c) RoochNetwork +// SPDX-License-Identifier: Apache-2.0 + +// Copyright (c) Mysten Labs, Inc. +// SPDX-License-Identifier: Apache-2.0 +use super::*; +// use crate::rocks::util::{is_ref_count_value, reference_count_merge_operator}; +use crate::{reopen, retry_transaction, retry_transaction_forever}; +use rstest::rstest; +use serde::Deserialize; + +fn temp_dir() -> std::path::PathBuf { + tempfile::tempdir() + .expect("Failed to open temporary directory") + .into_path() +} + +#[rstest] +#[tokio::test] +async fn test_open(#[values(true, false)] is_transactional: bool) { + let _db = open_map::<_, u32, String>(temp_dir(), None, is_transactional); +} + +#[rstest] +#[tokio::test] +async fn test_reopen(#[values(true, false)] is_transactional: bool) { + let arc = { + let db = open_map::<_, u32, String>(temp_dir(), None, is_transactional); + db.insert(&123456789, &"123456789".to_string()) + .expect("Failed to insert"); + db + }; + let db = DBMap::::reopen(&arc.rocksdb, None, &ReadWriteOptions::default()) + .expect("Failed to re-open storage"); + assert!(db + .contains_key(&123456789) + .expect("Failed to retrieve item in storage")); +} + +#[tokio::test] +async fn test_reopen_macro() { + const FIRST_CF: &str = "First_CF"; + const SECOND_CF: &str = "Second_CF"; + + let rocks = open_cf( + temp_dir(), + None, + MetricConf::default(), + &[FIRST_CF, 
SECOND_CF], + ) + .unwrap(); + + let (db_map_1, db_map_2) = reopen!(&rocks, FIRST_CF;, SECOND_CF;); + + let keys_vals_cf1 = (1..100).map(|i| (i, i.to_string())); + let keys_vals_cf2 = (1..100).map(|i| (i, i.to_string())); + + assert_eq!(db_map_1.cf, FIRST_CF); + assert_eq!(db_map_2.cf, SECOND_CF); + + assert!(db_map_1.multi_insert(keys_vals_cf1).is_ok()); + assert!(db_map_2.multi_insert(keys_vals_cf2).is_ok()); +} + +#[rstest] +#[tokio::test] +async fn test_wrong_reopen(#[values(true, false)] is_transactional: bool) { + let rocks = open_rocksdb(temp_dir(), &["foo", "bar", "baz"], is_transactional); + let db = DBMap::::reopen(&rocks, Some("quux"), &ReadWriteOptions::default()); + assert!(db.is_err()); +} + +// #[rstest] +// #[tokio::test] +// async fn test_contains_key(#[values(true, false)] is_transactional: bool) { +// let db = open_map(temp_dir(), None, is_transactional); +// +// db.insert(&123456789, &"123456789".to_string()) +// .expect("Failed to insert"); +// assert!(db +// .contains_key(&123456789) +// .expect("Failed to call contains key")); +// assert!(!db +// .contains_key(&000000000) +// .expect("Failed to call contains key")); +// } + +// #[rstest] +// #[tokio::test] +// async fn test_get(#[values(true, false)] is_transactional: bool) { +// let db = open_map(temp_dir(), None, is_transactional); +// +// db.insert(&123456789, &"123456789".to_string()) +// .expect("Failed to insert"); +// assert_eq!( +// Some("123456789".to_string()), +// db.get(&123456789).expect("Failed to get") +// ); +// assert_eq!(None, db.get(&000000000).expect("Failed to get")); +// } + +// #[rstest] +// #[tokio::test] +// async fn test_get_raw(#[values(true, false)] is_transactional: bool) { +// let db = open_map(temp_dir(), None, is_transactional); +// +// db.insert(&123456789, &"123456789".to_string()) +// .expect("Failed to insert"); +// +// let val_bytes = db +// .get_raw_bytes(&123456789) +// .expect("Failed to get_raw_bytes") +// .unwrap(); +// +// 
assert_eq!(bcs::to_bytes(&"123456789".to_string()).unwrap(), val_bytes); +// assert_eq!( +// None, +// db.get_raw_bytes(&000000000) +// .expect("Failed to get_raw_bytes") +// ); +// } + +#[rstest] +#[tokio::test] +async fn test_multi_get(#[values(true, false)] is_transactional: bool) { + let db = open_map(temp_dir(), None, is_transactional); + + db.insert(&123, &"123".to_string()) + .expect("Failed to insert"); + db.insert(&456, &"456".to_string()) + .expect("Failed to insert"); + + let result = db.multi_get([123, 456, 789]).expect("Failed to multi get"); + + assert_eq!(result.len(), 3); + assert_eq!(result[0], Some("123".to_string())); + assert_eq!(result[1], Some("456".to_string())); + assert_eq!(result[2], None); +} + +// #[rstest] +// #[tokio::test] +// async fn test_chunked_multi_get(#[values(true, false)] is_transactional: bool) { +// let db = open_map(temp_dir(), None, is_transactional); +// +// db.insert(&123, &"123".to_string()) +// .expect("Failed to insert"); +// db.insert(&456, &"456".to_string()) +// .expect("Failed to insert"); +// +// let result = db +// .chunked_multi_get([123, 456, 789], 1) +// .expect("Failed to chunk multi get"); +// +// assert_eq!(result.len(), 3); +// assert_eq!(result[0], Some("123".to_string())); +// assert_eq!(result[1], Some("456".to_string())); +// assert_eq!(result[2], None); +// } + +#[rstest] +#[tokio::test] +async fn test_skip(#[values(true, false)] is_transactional: bool) { + let db = open_map(temp_dir(), None, is_transactional); + + db.insert(&123, &"123".to_string()) + .expect("Failed to insert"); + db.insert(&456, &"456".to_string()) + .expect("Failed to insert"); + db.insert(&789, &"789".to_string()) + .expect("Failed to insert"); + + // Skip all smaller + let key_vals: Vec<_> = db.safe_iter().skip_to(&456).expect("Seek failed").collect(); + assert_eq!(key_vals.len(), 2); + assert_eq!(key_vals[0], Ok((456, "456".to_string()))); + assert_eq!(key_vals[1], Ok((789, "789".to_string()))); + + // Skip all smaller: same 
for the keys iterator + let keys: Vec<_> = db.keys().skip_to(&456).expect("Seek failed").collect(); + assert_eq!(keys.len(), 2); + assert_eq!(keys[0], Ok(456)); + assert_eq!(keys[1], Ok(789)); + + // Skip to the end + assert_eq!( + db.safe_iter().skip_to(&999).expect("Seek failed").count(), + 0 + ); + // same for the keys + assert_eq!(db.keys().skip_to(&999).expect("Seek failed").count(), 0); + + // Skip to last + assert_eq!( + db.safe_iter().skip_to_last().next(), + Some(Ok((789, "789".to_string()))) + ); + // same for the keys + assert_eq!(db.keys().skip_to_last().next(), Some(Ok(789))); + + // Skip to successor of first value + assert_eq!( + db.safe_iter().skip_to(&000).expect("Skip failed").count(), + 3 + ); + assert_eq!(db.keys().skip_to(&000).expect("Skip failed").count(), 3); +} + +#[rstest] +#[tokio::test] +async fn test_skip_to_previous_simple(#[values(true, false)] is_transactional: bool) { + let db = open_map(temp_dir(), None, is_transactional); + + db.insert(&123, &"123".to_string()) + .expect("Failed to insert"); + db.insert(&456, &"456".to_string()) + .expect("Failed to insert"); + db.insert(&789, &"789".to_string()) + .expect("Failed to insert"); + + // Skip to the one before the end + let key_vals: Vec<_> = db + .safe_iter() + .skip_prior_to(&999) + .expect("Seek failed") + .collect(); + assert_eq!(key_vals.len(), 1); + assert_eq!(key_vals[0], Ok((789, "789".to_string()))); + // Same for the keys iterator + let keys: Vec<_> = db + .keys() + .skip_prior_to(&999) + .expect("Seek failed") + .collect(); + assert_eq!(keys.len(), 1); + assert_eq!(keys[0], Ok(789)); + + // Skip to prior of first value + // Note: returns an empty iterator! 
+ assert_eq!( + db.safe_iter() + .skip_prior_to(&000) + .expect("Seek failed") + .count(), + 0 + ); + // Same for the keys iterator + assert_eq!( + db.keys().skip_prior_to(&000).expect("Seek failed").count(), + 0 + ); +} + +#[rstest] +#[tokio::test] +async fn test_iter_skip_to_previous_gap(#[values(true, false)] is_transactional: bool) { + let db = open_map(temp_dir(), None, is_transactional); + + for i in 1..100 { + if i != 50 { + db.insert(&i, &i.to_string()).unwrap(); + } + } + + // Skip prior to will return an iterator starting with an "unexpected" key if the sought one is not in the table + let db_iter = db.safe_iter().skip_prior_to(&50).unwrap(); + + assert_eq!( + (49..50) + .chain(51..100) + .map(|i| Ok((i, i.to_string()))) + .collect::>(), + db_iter.collect::>() + ); + // Same logic in the keys iterator + let db_iter = db.keys().skip_prior_to(&50).unwrap(); + + assert_eq!( + (49..50).chain(51..100).map(Ok).collect::>(), + db_iter.collect::>() + ); +} + +#[rstest] +#[tokio::test] +async fn test_remove(#[values(true, false)] is_transactional: bool) { + let db = open_map(temp_dir(), None, is_transactional); + + db.insert(&123456789, &"123456789".to_string()) + .expect("Failed to insert"); + assert!(db.get(&123456789).expect("Failed to get").is_some()); + + db.remove(&123456789).expect("Failed to remove"); + assert!(db.get(&123456789).expect("Failed to get").is_none()); +} + +#[rstest] +#[tokio::test] +async fn test_iter(#[values(true, false)] is_transactional: bool) { + let db = open_map(temp_dir(), None, is_transactional); + db.insert(&123456789, &"123456789".to_string()) + .expect("Failed to insert"); + + let mut iter = db.safe_iter(); + assert_eq!(Some(Ok((123456789, "123456789".to_string()))), iter.next()); + assert_eq!(None, iter.next()); +} + +#[rstest] +#[tokio::test] +async fn test_iter_reverse(#[values(true, false)] is_transactional: bool) { + let db = open_map(temp_dir(), None, is_transactional); + + db.insert(&1, &"1".to_string()).expect("Failed to 
insert"); + db.insert(&2, &"2".to_string()).expect("Failed to insert"); + db.insert(&3, &"3".to_string()).expect("Failed to insert"); + + let mut iter = db.safe_iter().skip_to_last().reverse(); + assert_eq!(Some(Ok((3, "3".to_string()))), iter.next()); + assert_eq!(Some(Ok((2, "2".to_string()))), iter.next()); + assert_eq!(Some(Ok((1, "1".to_string()))), iter.next()); + assert_eq!(None, iter.next()); + + let mut iter = db.safe_iter().skip_to(&2).unwrap().reverse(); + assert_eq!(Some(Ok((2, "2".to_string()))), iter.next()); + assert_eq!(Some(Ok((1, "1".to_string()))), iter.next()); + assert_eq!(None, iter.next()); +} + +#[rstest] +#[tokio::test] +async fn test_keys(#[values(true, false)] is_transactional: bool) { + let db = open_map(temp_dir(), None, is_transactional); + + db.insert(&123456789, &"123456789".to_string()) + .expect("Failed to insert"); + + let mut keys = db.keys(); + assert_eq!(Some(Ok(123456789)), keys.next()); + assert_eq!(None, keys.next()); +} + +#[rstest] +#[tokio::test] +async fn test_values(#[values(true, false)] is_transactional: bool) { + let db = open_map(temp_dir(), None, is_transactional); + + db.insert(&123456789, &"123456789".to_string()) + .expect("Failed to insert"); + + let mut values = db.values(); + assert_eq!(Some(Ok("123456789".to_string())), values.next()); + assert_eq!(None, values.next()); +} + +#[rstest] +#[tokio::test] +async fn test_try_extend(#[values(true, false)] is_transactional: bool) { + let mut db = open_map(temp_dir(), None, is_transactional); + let mut keys_vals = (1..100).map(|i| (i, i.to_string())); + + db.try_extend(&mut keys_vals) + .expect("Failed to extend the DB with (k, v) pairs"); + for (k, v) in keys_vals { + let val = db.get(&k).expect("Failed to get inserted key"); + assert_eq!(Some(v), val); + } +} + +#[rstest] +#[tokio::test] +async fn test_try_extend_from_slice(#[values(true, false)] is_transactional: bool) { + let mut db = open_map(temp_dir(), None, is_transactional); + let keys_vals = 
(1..100).map(|i| (i, i.to_string())); + + db.try_extend_from_slice(&keys_vals.clone().collect::>()[..]) + .expect("Failed to extend the DB with (k, v) pairs"); + for (k, v) in keys_vals { + let val = db.get(&k).expect("Failed to get inserted key"); + assert_eq!(Some(v), val); + } +} + +#[rstest] +#[tokio::test] +async fn test_insert_batch(#[values(true, false)] is_transactional: bool) { + let db = open_map(temp_dir(), None, is_transactional); + let keys_vals = (1..100).map(|i| (i, i.to_string())); + let mut insert_batch = db.batch(); + insert_batch + .insert_batch(&db, keys_vals.clone()) + .expect("Failed to batch insert"); + insert_batch.write().expect("Failed to execute batch"); + for (k, v) in keys_vals { + let val = db.get(&k).expect("Failed to get inserted key"); + assert_eq!(Some(v), val); + } +} + +#[rstest] +#[tokio::test] +async fn test_insert_batch_across_cf(#[values(true, false)] is_transactional: bool) { + let rocks = open_rocksdb(temp_dir(), &["First_CF", "Second_CF"], is_transactional); + + let db_cf_1 = DBMap::reopen(&rocks, Some("First_CF"), &ReadWriteOptions::default()) + .expect("Failed to open storage"); + let keys_vals_1 = (1..100).map(|i| (i, i.to_string())); + + let db_cf_2 = DBMap::reopen(&rocks, Some("Second_CF"), &ReadWriteOptions::default()) + .expect("Failed to open storage"); + let keys_vals_2 = (1000..1100).map(|i| (i, i.to_string())); + + let mut batch = db_cf_1.batch(); + batch + .insert_batch(&db_cf_1, keys_vals_1.clone()) + .expect("Failed to batch insert") + .insert_batch(&db_cf_2, keys_vals_2.clone()) + .expect("Failed to batch insert"); + + batch.write().expect("Failed to execute batch"); + for (k, v) in keys_vals_1 { + let val = db_cf_1.get(&k).expect("Failed to get inserted key"); + assert_eq!(Some(v), val); + } + + for (k, v) in keys_vals_2 { + let val = db_cf_2.get(&k).expect("Failed to get inserted key"); + assert_eq!(Some(v), val); + } +} + +#[rstest] +#[tokio::test] +async fn 
test_insert_batch_across_different_db(#[values(true, false)] is_transactional: bool) { + let rocks = open_rocksdb(temp_dir(), &["First_CF", "Second_CF"], is_transactional); + let rocks2 = open_rocksdb(temp_dir(), &["First_CF", "Second_CF"], is_transactional); + + let db_cf_1: DBMap = + DBMap::reopen(&rocks, Some("First_CF"), &ReadWriteOptions::default()) + .expect("Failed to open storage"); + let keys_vals_1 = (1..100).map(|i| (i, i.to_string())); + + let db_cf_2: DBMap = + DBMap::reopen(&rocks2, Some("Second_CF"), &ReadWriteOptions::default()) + .expect("Failed to open storage"); + let keys_vals_2 = (1000..1100).map(|i| (i, i.to_string())); + + assert!(db_cf_1 + .batch() + .insert_batch(&db_cf_1, keys_vals_1) + .expect("Failed to batch insert") + .insert_batch(&db_cf_2, keys_vals_2) + .is_err()); +} + +// #[tokio::test] +// async fn test_delete_batch() { +// let db = DBMap::::open( +// temp_dir(), +// MetricConf::default(), +// None, +// None, +// &ReadWriteOptions::default(), +// ) +// .expect("Failed to open storage"); +// +// let keys_vals = (1..100).map(|i| (i, i.to_string())); +// let mut batch = db.batch(); +// batch +// .insert_batch(&db, keys_vals) +// .expect("Failed to batch insert"); +// +// // delete the odd-index keys +// let deletion_keys = (1..100).step_by(2); +// batch +// .delete_batch(&db, deletion_keys) +// .expect("Failed to batch delete"); +// +// batch.write().expect("Failed to execute batch"); +// +// for k in db.keys() { +// assert_eq!(k.unwrap() % 2, 0); +// } +// } + +// #[tokio::test] +// async fn test_delete_range() { +// let db: DBMap = DBMap::open( +// temp_dir(), +// MetricConf::default(), +// None, +// None, +// &ReadWriteOptions::default(), +// ) +// .expect("Failed to open storage"); +// +// // Note that the last element is (100, "100".to_owned()) here +// let keys_vals = (0..101).map(|i| (i, i.to_string())); +// let mut batch = db.batch(); +// batch +// .insert_batch(&db, keys_vals) +// .expect("Failed to batch insert"); +// +// 
batch +// .delete_range(&db, &50, &100) +// .expect("Failed to delete range"); +// +// batch.write().expect("Failed to execute batch"); +// +// for k in 0..50 { +// assert!(db.contains_key(&k).expect("Failed to query legal key"),); +// } +// for k in 50..100 { +// assert!(!db.contains_key(&k).expect("Failed to query legal key")); +// } +// +// // range operator is not inclusive of to +// assert!(db.contains_key(&100).expect("Failed to query legal key")); +// } + +#[tokio::test] +async fn test_clear() { + let db = DBMap::::open( + temp_dir(), + MetricConf::default(), + None, + Some("table"), + &ReadWriteOptions::default(), + ) + .expect("Failed to open storage"); + // Test clear of empty map + let _ = db.clear(); + + let keys_vals = (0..101).map(|i| (i, i.to_string())); + let mut insert_batch = db.batch(); + insert_batch + .insert_batch(&db, keys_vals) + .expect("Failed to batch insert"); + + insert_batch.write().expect("Failed to execute batch"); + + // Check we have multiple entries + assert!(db.safe_iter().count() > 1); + let _ = db.clear(); + assert_eq!(db.safe_iter().count(), 0); + // Clear again to ensure safety when clearing empty map + let _ = db.clear(); + assert_eq!(db.safe_iter().count(), 0); + // Clear with one item + let _ = db.insert(&1, &"e".to_string()); + assert_eq!(db.safe_iter().count(), 1); + let _ = db.clear(); + assert_eq!(db.safe_iter().count(), 0); +} + +#[rstest] +#[tokio::test] +async fn test_iter_with_bounds(#[values(true, false)] is_transactional: bool) { + let db = open_map(temp_dir(), None, is_transactional); + + // Add [1, 50) and (50, 100) in the db + for i in 1..100 { + if i != 50 { + db.insert(&i, &i.to_string()).unwrap(); + } + } + + // Skip prior to will return an iterator starting with an "unexpected" key if the sought one is not in the table + let db_iter = db + .iter_with_bounds(Some(1), Some(100)) + .skip_prior_to(&50) + .unwrap(); + + assert_eq!( + (49..50) + .chain(51..100) + .map(|i| (i, i.to_string())) + .collect::>(), + 
db_iter.collect::>() + ); + + // Same logic in the keys iterator + let db_iter = db.keys().skip_prior_to(&50).unwrap(); + + assert_eq!( + (49..50).chain(51..100).map(Ok).collect::>(), + db_iter.collect::>() + ); + + // Skip to a key which is not within the bounds (bound is [1, 50)) + let db_iter = db.iter_with_bounds(Some(1), Some(50)).skip_to(&50).unwrap(); + assert_eq!(Vec::<(i32, String)>::new(), db_iter.collect::>()); + + // Skip to first key in the bound (bound is [1, 50)) + let db_iter = db.iter_with_bounds(Some(1), Some(50)).skip_to(&1).unwrap(); + assert_eq!( + (1..50).map(|i| (i, i.to_string())).collect::>(), + db_iter.collect::>() + ); + + // Skip to a key which is not within the bounds (bound is [1, 50)) + let db_iter = db + .iter_with_bounds(Some(1), Some(50)) + .skip_prior_to(&50) + .unwrap(); + assert_eq!(vec![(49, "49".to_string())], db_iter.collect::>()); +} + +#[tokio::test] +async fn test_is_empty() { + let db = DBMap::::open( + temp_dir(), + MetricConf::default(), + None, + Some("table"), + &ReadWriteOptions::default(), + ) + .expect("Failed to open storage"); + + // Test empty map is truly empty + assert!(db.is_empty()); + let _ = db.clear(); + assert!(db.is_empty()); + + let keys_vals = (0..101).map(|i| (i, i.to_string())); + let mut insert_batch = db.batch(); + insert_batch + .insert_batch(&db, keys_vals) + .expect("Failed to batch insert"); + + insert_batch.write().expect("Failed to execute batch"); + + // Check we have multiple entries and not empty + assert!(db.safe_iter().count() > 1); + assert!(!db.is_empty()); + + // Clear again to ensure empty works after clearing + let _ = db.clear(); + assert_eq!(db.safe_iter().count(), 0); + assert!(db.is_empty()); +} + +#[rstest] +#[tokio::test] +async fn test_multi_insert(#[values(true, false)] is_transactional: bool) { + // Init a DB + let db: DBMap = open_map(temp_dir(), Some("table"), is_transactional); + // Create kv pairs + let keys_vals = (0..101).map(|i| (i, i.to_string())); + + 
db.multi_insert(keys_vals.clone()) + .expect("Failed to multi-insert"); + + for (k, v) in keys_vals { + let val = db.get(&k).expect("Failed to get inserted key"); + assert_eq!(Some(v), val); + } +} + +#[rstest] +#[tokio::test] +async fn test_checkpoint(#[values(true, false)] is_transactional: bool) { + let path_prefix = temp_dir(); + let db_path = path_prefix.join("db"); + let db: DBMap = open_map(db_path, Some("table"), is_transactional); + // Create kv pairs + let keys_vals = (0..101).map(|i| (i, i.to_string())); + + db.multi_insert(keys_vals.clone()) + .expect("Failed to multi-insert"); + let checkpointed_path = path_prefix.join("checkpointed_db"); + db.rocksdb + .checkpoint(&checkpointed_path) + .expect("Failed to create db checkpoint"); + // Create more kv pairs + let new_keys_vals = (101..201).map(|i| (i, i.to_string())); + db.multi_insert(new_keys_vals.clone()) + .expect("Failed to multi-insert"); + // Verify checkpoint + let checkpointed_db: DBMap = + open_map(checkpointed_path, Some("table"), is_transactional); + // Ensure keys inserted before checkpoint are present in original and checkpointed db + for (k, v) in keys_vals { + let val = db.get(&k).expect("Failed to get inserted key"); + assert_eq!(Some(v.clone()), val); + let val = checkpointed_db.get(&k).expect("Failed to get inserted key"); + assert_eq!(Some(v), val); + } + // Ensure keys inserted after checkpoint are only present in original db but not in checkpointed db + for (k, v) in new_keys_vals { + let val = db.get(&k).expect("Failed to get inserted key"); + assert_eq!(Some(v.clone()), val); + let val = checkpointed_db.get(&k).expect("Failed to get inserted key"); + assert_eq!(None, val); + } +} + +#[rstest] +#[tokio::test] +async fn test_multi_remove(#[values(true, false)] is_transactional: bool) { + // Init a DB + let db: DBMap = open_map(temp_dir(), Some("table"), is_transactional); + + // Create kv pairs + let keys_vals = (0..101).map(|i| (i, i.to_string())); + + 
db.multi_insert(keys_vals.clone()) + .expect("Failed to multi-insert"); + + // Check insertion + for (k, v) in keys_vals.clone() { + let val = db.get(&k).expect("Failed to get inserted key"); + assert_eq!(Some(v), val); + } + + // Remove 50 items + db.multi_remove(keys_vals.clone().map(|kv| kv.0).take(50)) + .expect("Failed to multi-remove"); + assert_eq!(db.safe_iter().count(), 101 - 50); + + // Check that the remaining are present + for (k, v) in keys_vals.skip(50) { + let val = db.get(&k).expect("Failed to get inserted key"); + assert_eq!(Some(v), val); + } +} + +#[tokio::test] +async fn test_transactional() { + let key = "key"; + let path = temp_dir(); + let opt = rocksdb::Options::default(); + let rocksdb = + open_cf_opts_transactional(path, None, MetricConf::default(), &[("cf", opt)]).unwrap(); + let db = DBMap::::reopen(&rocksdb, None, &ReadWriteOptions::default()) + .expect("Failed to re-open storage"); + + // transaction is used instead + let mut tx1 = db.transaction().expect("failed to initiate transaction"); + let mut tx2 = db.transaction().expect("failed to initiate transaction"); + + tx1.insert_batch(&db, vec![(key.to_string(), "1".to_string())]) + .unwrap(); + tx2.insert_batch(&db, vec![(key.to_string(), "2".to_string())]) + .unwrap(); + + tx1.commit().expect("failed to commit first transaction"); + assert!(tx2.commit().is_err()); + assert_eq!(db.get(&key.to_string()).unwrap(), Some("1".to_string())); +} + +#[tokio::test] +async fn test_transaction_snapshot() { + let key = "key".to_string(); + let path = temp_dir(); + let opt = rocksdb::Options::default(); + let rocksdb = + open_cf_opts_transactional(path, None, MetricConf::default(), &[("cf", opt)]).unwrap(); + let db = DBMap::::reopen(&rocksdb, None, &ReadWriteOptions::default()) + .expect("Failed to re-open storage"); + + // transaction without set_snapshot succeeds when extraneous write occurs before transaction + // write. 
+ let mut tx1 = db + .transaction_without_snapshot() + .expect("failed to initiate transaction"); + // write occurs after transaction is created but before first write + db.insert(&key, &"1".to_string()).unwrap(); + tx1.insert_batch(&db, vec![(key.to_string(), "2".to_string())]) + .unwrap(); + tx1.commit().expect("failed to commit first transaction"); + assert_eq!(db.get(&key).unwrap().unwrap(), "2".to_string()); + + // transaction without set_snapshot fails when extraneous write occurs after transaction + // write. + let mut tx1 = db + .transaction_without_snapshot() + .expect("failed to initiate transaction"); + tx1.insert_batch(&db, vec![(key.to_string(), "2".to_string())]) + .unwrap(); + db.insert(&key, &"1".to_string()).unwrap(); + assert!(matches!( + tx1.commit(), + Err(RawStoreError::RetryableTransactionError) + )); + assert_eq!(db.get(&key).unwrap().unwrap(), "1".to_string()); + + // failed transaction with set_snapshot + let mut tx1 = db.transaction().expect("failed to initiate transaction"); + // write occurs after transaction is created, so the conflict is detected + db.insert(&key, &"1".to_string()).unwrap(); + tx1.insert_batch(&db, vec![(key.to_string(), "2".to_string())]) + .unwrap(); + assert!(matches!( + tx1.commit(), + Err(RawStoreError::RetryableTransactionError) + )); + + let mut tx1 = db.transaction().expect("failed to initiate transaction"); + tx1.insert_batch(&db, vec![(key.to_string(), "2".to_string())]) + .unwrap(); + // no conflicting writes, should succeed this time. + tx1.commit().unwrap(); + + // when two transactions race, one will fail provided that neither commits before the other + // writes. 
+ let mut tx1 = db + .transaction_without_snapshot() + .expect("failed to initiate transaction"); + let mut tx2 = db + .transaction_without_snapshot() + .expect("failed to initiate transaction"); + tx1.insert_batch(&db, vec![(key.to_string(), "1".to_string())]) + .unwrap(); + tx2.insert_batch(&db, vec![(key.to_string(), "2".to_string())]) + .unwrap(); + // whichever tx is committed first will succeed. + tx1.commit().expect("failed to commit"); + assert!(matches!( + tx2.commit(), + Err(RawStoreError::RetryableTransactionError) + )); + + // IMPORTANT: a race is still possible if one tx commits before the other writes. + let mut tx1 = db + .transaction_without_snapshot() + .expect("failed to initiate transaction"); + let mut tx2 = db + .transaction_without_snapshot() + .expect("failed to initiate transaction"); + tx1.insert_batch(&db, vec![(key.to_string(), "1".to_string())]) + .unwrap(); + tx1.commit().expect("failed to commit"); + + tx2.insert_batch(&db, vec![(key, "2".to_string())]).unwrap(); + tx2.commit().expect("failed to commit"); +} + +#[tokio::test] +async fn test_retry_transaction() { + let key = "key".to_string(); + let path = temp_dir(); + let opt = rocksdb::Options::default(); + let rocksdb = + open_cf_opts_transactional(path, None, MetricConf::default(), &[("cf", opt)]).unwrap(); + let db = DBMap::::reopen(&rocksdb, None, &ReadWriteOptions::default()) + .expect("Failed to re-open storage"); + + let mut conflicts = 0; + retry_transaction!({ + let mut tx1 = db + .transaction_without_snapshot() + .expect("failed to initiate transaction"); + tx1.insert_batch(&db, vec![(key.to_string(), "2".to_string())]) + .unwrap(); + if conflicts < 3 { + db.insert(&key, &"1".to_string()).unwrap(); + } + conflicts += 1; + tx1.commit() + }) + // succeeds after we stop causing conflicts + .unwrap(); + + retry_transaction!({ + let mut tx1 = db + .transaction_without_snapshot() + .expect("failed to initiate transaction"); + tx1.insert_batch(&db, vec![(key.to_string(), 
"2".to_string())]) + .unwrap(); + db.insert(&key, &"1".to_string()).unwrap(); + tx1.commit() + }) + // fails after hitting maximum number of retries + .unwrap_err(); + + // obviously we cannot verify that this never times out, this is more just a test to make sure + // the macro compiles as expected. + tokio::time::timeout(Duration::from_secs(1), async move { + retry_transaction_forever!({ + let mut tx1 = db + .transaction_without_snapshot() + .expect("failed to initiate transaction"); + tx1.insert_batch(&db, vec![(key.to_string(), "2".to_string())]) + .unwrap(); + db.insert(&key, &"1".to_string()).unwrap(); + tx1.commit() + }) + // fails after hitting maximum number of retries + .unwrap_err(); + panic!("should never finish"); + }) + .await + // must timeout + .unwrap_err(); +} + +#[tokio::test] +async fn test_transaction_read_your_write() { + let key1 = "key1"; + let key2 = "key2"; + let path = temp_dir(); + let opt = rocksdb::Options::default(); + let rocksdb = + open_cf_opts_transactional(path, None, MetricConf::default(), &[("cf", opt)]).unwrap(); + let db = DBMap::::reopen(&rocksdb, None, &ReadWriteOptions::default()) + .expect("Failed to re-open storage"); + db.insert(&key1.to_string(), &"1".to_string()).unwrap(); + let mut tx = db.transaction().expect("failed to initiate transaction"); + tx.insert_batch( + &db, + vec![ + (key1.to_string(), "11".to_string()), + (key2.to_string(), "2".to_string()), + ], + ) + .unwrap(); + assert_eq!(db.get(&key1.to_string()).unwrap(), Some("1".to_string())); + assert_eq!(db.get(&key2.to_string()).unwrap(), None); + + assert_eq!( + tx.get(&db, &key1.to_string()).unwrap(), + Some("11".to_string()) + ); + assert_eq!( + tx.get(&db, &key2.to_string()).unwrap(), + Some("2".to_string()) + ); + + tx.delete_batch(&db, vec![(key2.to_string())]).unwrap(); + + assert_eq!( + tx.multi_get(&db, vec![key1.to_string(), key2.to_string()]) + .unwrap(), + vec![Some("11".to_string()), None] + ); + let keys: Vec = tx.keys(&db).map(|x| 
x.unwrap()).collect(); + assert_eq!(keys, vec![key1.to_string()]); + let values: Vec<_> = tx.values(&db).collect(); + assert_eq!(values, vec![Ok("11".to_string())]); + assert!(tx.commit().is_ok()); +} + +#[tokio::test] +async fn open_as_secondary_test() { + let primary_path = temp_dir(); + + // Init a DB + let primary_db = DBMap::::open( + primary_path.clone(), + MetricConf::default(), + None, + Some("table"), + &ReadWriteOptions::default(), + ) + .expect("Failed to open storage"); + // Create kv pairs + let keys_vals = (0..101).map(|i| (i, i.to_string())); + + primary_db + .multi_insert(keys_vals.clone()) + .expect("Failed to multi-insert"); + + let opt = rocksdb::Options::default(); + let secondary_store = open_cf_opts_secondary( + primary_path, + None, + None, + MetricConf::default(), + &[("table", opt)], + ) + .unwrap(); + let secondary_db = DBMap::::reopen( + &secondary_store, + Some("table"), + &ReadWriteOptions::default(), + ) + .unwrap(); + + secondary_db.try_catch_up_with_primary().unwrap(); + // Check secondary + for (k, v) in keys_vals { + assert_eq!(secondary_db.get(&k).unwrap(), Some(v)); + } + + // Update the value from 0 to 10 + primary_db.insert(&0, &"10".to_string()).unwrap(); + + // This should still be stale since secondary is behind + assert_eq!(secondary_db.get(&0).unwrap(), Some("0".to_string())); + + // Try force catchup + secondary_db.try_catch_up_with_primary().unwrap(); + + // New value should be present + assert_eq!(secondary_db.get(&0).unwrap(), Some("10".to_string())); +} + +#[derive(Serialize, Deserialize, Copy, Clone)] +struct ObjectWithRefCount { + value: i64, + ref_count: i64, +} + +fn increment_counter(db: &DBMap, key: &str, value: i64) { + let mut batch = db.batch(); + batch + .partial_merge_batch(db, [(key.to_string(), value.to_le_bytes())]) + .unwrap(); + batch.write().unwrap(); +} + +// #[tokio::test] +// async fn refcount_test() { +// let key = "key".to_string(); +// let mut options = rocksdb::Options::default(); +// 
options.set_merge_operator( +// "refcount operator", +// reference_count_merge_operator, +// reference_count_merge_operator, +// ); +// let db = DBMap::::open( +// temp_dir(), +// MetricConf::default(), +// Some(options), +// None, +// &ReadWriteOptions::default(), +// ) +// .expect("failed to open rocksdb"); +// let object = ObjectWithRefCount { +// value: 3, +// ref_count: 1, +// }; +// // increment value 10 times +// let iterations = 10; +// for _ in 0..iterations { +// let mut batch = db.batch(); +// batch.merge_batch(&db, [(key.to_string(), object)]).unwrap(); +// batch.write().unwrap(); +// } +// let value = db +// .get(&key) +// .expect("failed to read value") +// .expect("value is empty"); +// assert_eq!(value.value, object.value); +// assert_eq!(value.ref_count, iterations); +// +// // decrement value +// increment_counter(&db, &key, -1); +// let value = db.get(&key).unwrap().unwrap(); +// assert_eq!(value.value, object.value); +// assert_eq!(value.ref_count, iterations - 1); +// } + +// #[tokio::test] +// async fn refcount_with_compaction_test() { +// let key = "key".to_string(); +// let mut options = rocksdb::Options::default(); +// options.set_merge_operator( +// "refcount operator", +// reference_count_merge_operator, +// reference_count_merge_operator, +// ); +// let db = DBMap::::open( +// temp_dir(), +// MetricConf::default(), +// Some(options), +// None, +// &ReadWriteOptions::default(), +// ) +// .expect("failed to open rocksdb"); +// +// let object = ObjectWithRefCount { +// value: 3, +// ref_count: 1, +// }; +// let mut batch = db.batch(); +// batch.merge_batch(&db, [(key.to_string(), object)]).unwrap(); +// batch.write().unwrap(); +// // increment value once +// increment_counter(&db, &key, 1); +// let value = db.get(&key).unwrap().unwrap(); +// assert_eq!(value.value, object.value); +// +// // decrement value to 0 +// increment_counter(&db, &key, -1); +// increment_counter(&db, &key, -1); +// // ref count went to zero. 
Reading value returns empty array +// assert!(db.get(&key).is_err()); +// let value = db.multi_get_raw_bytes([(&key)]).unwrap()[0] +// .clone() +// .unwrap(); +// assert!(value.is_empty()); +// +// // refcount increment makes value visible again +// increment_counter(&db, &key, 1); +// let value = db.get(&key).unwrap().unwrap(); +// assert_eq!(value.value, object.value); +// +// increment_counter(&db, &key, -1); +// db.compact_range( +// &object, +// &ObjectWithRefCount { +// value: 100, +// ref_count: 1, +// }, +// ) +// .unwrap(); +// +// increment_counter(&db, &key, 1); +// let value = db.get_raw_bytes(&key).unwrap().unwrap(); +// assert!(is_ref_count_value(&value)); +// } + +fn open_map, K, V>( + path: P, + opt_cf: Option<&str>, + is_transactional: bool, +) -> DBMap { + if is_transactional { + let cf = opt_cf.unwrap_or(rocksdb::DEFAULT_COLUMN_FAMILY_NAME); + open_cf_opts_transactional( + path, + None, + MetricConf::default(), + &[(cf, default_db_options().options)], + ) + .map(|db| DBMap::new(db, &ReadWriteOptions::default(), cf)) + .expect("failed to open rocksdb") + } else { + DBMap::::open( + path, + MetricConf::default(), + None, + opt_cf, + &ReadWriteOptions::default(), + ) + .expect("failed to open rocksdb") + } +} + +fn open_rocksdb>(path: P, opt_cfs: &[&str], is_transactional: bool) -> Arc { + if is_transactional { + let options = default_db_options().options; + let cfs: Vec<_> = opt_cfs + .iter() + .map(|name| (*name, options.clone())) + .collect(); + open_cf_opts_transactional(path, None, MetricConf::default(), &cfs) + .expect("failed to open rocksdb") + } else { + open_cf(path, None, MetricConf::default(), opt_cfs).expect("failed to open rocksdb") + } +} diff --git a/moveos/raw-store/src/rocks/util.rs b/moveos/raw-store/src/rocks/util.rs new file mode 100644 index 000000000..f9663343e --- /dev/null +++ b/moveos/raw-store/src/rocks/util.rs @@ -0,0 +1,81 @@ +// Copyright (c) RoochNetwork +// SPDX-License-Identifier: Apache-2.0 + +// Copyright (c) 
Mysten Labs, Inc. +// SPDX-License-Identifier: Apache-2.0 + +use rocksdb::{CompactionDecision, MergeOperands}; +use std::cmp::Ordering; + +/// custom rocksdb merge operator used for storing objects with reference counts +/// important: reference count field must be 64-bit integer and must be last in struct declaration +/// should be used with immutable objects only +pub fn reference_count_merge_operator( + _key: &[u8], + stored_value: Option<&[u8]>, + operands: &MergeOperands, +) -> Option> { + let (mut value, mut ref_count) = stored_value.map_or((None, 0), deserialize_ref_count_value); + + for operand in operands { + let (new_value, delta) = deserialize_ref_count_value(operand); + assert!(value.is_none() || new_value.is_none() || value == new_value); + if value.is_none() && new_value.is_some() { + value = new_value; + } + ref_count += delta; + } + match ref_count.cmp(&0) { + Ordering::Greater => Some([value.unwrap_or(b""), &ref_count.to_le_bytes()].concat()), + Ordering::Equal => Some(vec![]), + Ordering::Less => Some(ref_count.to_le_bytes().to_vec()), + } +} + +pub fn empty_compaction_filter(_level: u32, _key: &[u8], value: &[u8]) -> CompactionDecision { + if value.is_empty() { + CompactionDecision::Remove + } else { + CompactionDecision::Keep + } +} + +pub fn is_ref_count_value(value: &[u8]) -> bool { + value.is_empty() || value.len() == 8 +} + +fn deserialize_ref_count_value(bytes: &[u8]) -> (Option<&[u8]>, i64) { + if bytes.is_empty() { + return (None, 0); + } + assert!(bytes.len() >= 8); + let (value, rc_bytes) = bytes.split_at(bytes.len() - 8); + let ref_count = i64::from_le_bytes(rc_bytes.try_into().unwrap()); + (if value.is_empty() { None } else { Some(value) }, ref_count) +} + +#[cfg(test)] +mod tests { + use super::deserialize_ref_count_value; + + #[test] + fn deserialize_ref_count_value_test() { + assert_eq!(deserialize_ref_count_value(&[]), (None, 0)); + assert_eq!( + deserialize_ref_count_value(b"\x01\0\0\0\0\0\0\0"), + (None, 1) + ); + assert_eq!( + 
deserialize_ref_count_value(b"\xff\xff\xff\xff\xff\xff\xff\xff"), + (None, -1) + ); + assert_eq!( + deserialize_ref_count_value(b"\xfe\xff\xff\xff\xff\xff\xff\xff"), + (None, -2) + ); + assert_eq!( + deserialize_ref_count_value(b"test\x04\0\0\0\0\0\0\0"), + (Some(b"test".as_ref()), 4) + ); + } +} diff --git a/moveos/raw-store/src/rocks/values.rs b/moveos/raw-store/src/rocks/values.rs new file mode 100644 index 000000000..78e97c0e8 --- /dev/null +++ b/moveos/raw-store/src/rocks/values.rs @@ -0,0 +1,47 @@ +// Copyright (c) RoochNetwork +// SPDX-License-Identifier: Apache-2.0 + +// Copyright (c) Mysten Labs, Inc. +// SPDX-License-Identifier: Apache-2.0 +use std::marker::PhantomData; + +use crate::RawStoreError; +use serde::de::DeserializeOwned; + +use super::RocksDBRawIter; + +/// An iterator over the values of a prefix. +pub struct Values<'a, V> { + db_iter: RocksDBRawIter<'a>, + _phantom: PhantomData, +} + +impl<'a, V: DeserializeOwned> Values<'a, V> { + pub(crate) fn new(db_iter: RocksDBRawIter<'a>) -> Self { + Self { + db_iter, + _phantom: PhantomData, + } + } +} + +impl<'a, V: DeserializeOwned> Iterator for Values<'a, V> { + type Item = Result; + + fn next(&mut self) -> Option { + if self.db_iter.valid() { + let value = self + .db_iter + .key() + .and_then(|_| self.db_iter.value().and_then(|v| bcs::from_bytes(v).ok())); + + self.db_iter.next(); + value.map(Ok) + } else { + match self.db_iter.status() { + Ok(_) => None, + Err(err) => Some(Err(RawStoreError::RocksDBError(format!("{err}")))), + } + } + } +} diff --git a/moveos/raw-store/src/test_db.rs b/moveos/raw-store/src/test_db.rs new file mode 100644 index 000000000..30b875065 --- /dev/null +++ b/moveos/raw-store/src/test_db.rs @@ -0,0 +1,807 @@ +// Copyright (c) RoochNetwork +// SPDX-License-Identifier: Apache-2.0 + +// Copyright (c) Mysten Labs, Inc. 
+// SPDX-License-Identifier: Apache-2.0 +#![allow(clippy::await_holding_lock)] + +use std::{ + borrow::Borrow, + collections::{btree_map::Iter, BTreeMap, HashMap, VecDeque}, + marker::PhantomData, + sync::{Arc, RwLock}, +}; + +use crate::{ + rocks::{be_fix_int_ser, RawStoreError}, + Map, +}; +use bincode::Options; +use collectable::TryExtend; +use ouroboros::self_referencing; +use rand::distributions::{Alphanumeric, DistString}; +use rocksdb::Direction; +use serde::{de::DeserializeOwned, Serialize}; +use std::sync::{RwLockReadGuard, RwLockWriteGuard}; + +/// An interface to a btree map backed sally database. This is mainly intended +/// for tests and performing benchmark comparisons +#[derive(Clone, Debug)] +pub struct TestDB { + pub rows: Arc, Vec>>>, + pub name: String, + _phantom: PhantomData V>, +} + +impl TestDB { + pub fn open() -> Self { + TestDB { + rows: Arc::new(RwLock::new(BTreeMap::new())), + name: Alphanumeric.sample_string(&mut rand::thread_rng(), 16), + _phantom: PhantomData, + } + } + pub fn batch(&self) -> TestDBWriteBatch { + TestDBWriteBatch::default() + } +} + +#[self_referencing(pub_extras)] +pub struct TestDBIter<'a, K, V> { + pub rows: RwLockReadGuard<'a, BTreeMap, Vec>>, + #[borrows(mut rows)] + #[covariant] + pub iter: Iter<'this, Vec, Vec>, + phantom: PhantomData<(K, V)>, + pub direction: Direction, +} + +#[self_referencing(pub_extras)] +pub struct TestDBKeys<'a, K> { + rows: RwLockReadGuard<'a, BTreeMap, Vec>>, + #[borrows(mut rows)] + #[covariant] + pub iter: Iter<'this, Vec, Vec>, + phantom: PhantomData, +} + +#[self_referencing(pub_extras)] +pub struct TestDBValues<'a, V> { + rows: RwLockReadGuard<'a, BTreeMap, Vec>>, + #[borrows(mut rows)] + #[covariant] + pub iter: Iter<'this, Vec, Vec>, + phantom: PhantomData, +} + +impl<'a, K: DeserializeOwned, V: DeserializeOwned> Iterator for TestDBIter<'a, K, V> { + type Item = Result<(K, V), RawStoreError>; + + fn next(&mut self) -> Option { + let mut out: Option = None; + let config = 
bincode::DefaultOptions::new() + .with_big_endian() + .with_fixint_encoding(); + self.with_mut(|fields| { + let resp = match fields.direction { + Direction::Forward => fields.iter.next(), + Direction::Reverse => panic!("Reverse iteration not supported in test db"), + }; + if let Some((raw_key, raw_value)) = resp { + let key: K = config.deserialize(raw_key).ok().unwrap(); + let value: V = bcs::from_bytes(raw_value).ok().unwrap(); + out = Some(Ok((key, value))); + } + }); + out + } +} + +impl<'a, K: Serialize, V> TestDBIter<'a, K, V> { + /// Skips all the elements that are smaller than the given key, + /// and either lands on the key or the first one greater than + /// the key. + pub fn skip_to(mut self, key: &K) -> Result { + self.with_mut(|fields| { + let serialized_key = be_fix_int_ser(key).expect("serialization failed"); + let mut peekable = fields.iter.peekable(); + let mut peeked = peekable.peek(); + while peeked.is_some() { + let serialized = be_fix_int_ser(peeked.unwrap()).expect("serialization failed"); + if serialized >= serialized_key { + break; + } else { + peekable.next(); + peeked = peekable.peek(); + } + } + }); + Ok(self) + } + + /// Moves the iterator to the element given or + /// the one prior to it if it does not exist. If there is + /// no element prior to it, it returns an empty iterator. + pub fn skip_prior_to(mut self, key: &K) -> Result { + self.with_mut(|fields| { + let serialized_key = be_fix_int_ser(key).expect("serialization failed"); + let mut peekable = fields.iter.peekable(); + let mut peeked = peekable.peek(); + while peeked.is_some() { + let serialized = be_fix_int_ser(peeked.unwrap()).expect("serialization failed"); + if serialized > serialized_key { + break; + } else { + peekable.next(); + peeked = peekable.peek(); + } + } + }); + Ok(self) + } + + /// Seeks to the last key in the database (at this column family). 
+ pub fn skip_to_last(mut self) -> Self { + self.with_mut(|fields| { + fields.iter.last(); + }); + self + } + + /// Will make the direction of the iteration reverse and will + /// create a new `RevIter` to consume. Every call to `next` method + /// will give the next element from the end. + pub fn reverse(mut self) -> TestDBRevIter<'a, K, V> { + self.with_mut(|fields| { + *fields.direction = Direction::Reverse; + }); + TestDBRevIter::new(self) + } +} + +/// An iterator with a reverted direction to the original. The `RevIter` +/// is hosting an iteration which is consuming in the opposing direction. +/// It's not possible to do further manipulation (ex re-reverse) to the +/// iterator. +pub struct TestDBRevIter<'a, K, V> { + iter: TestDBIter<'a, K, V>, +} + +impl<'a, K, V> TestDBRevIter<'a, K, V> { + fn new(iter: TestDBIter<'a, K, V>) -> Self { + Self { iter } + } +} + +impl<'a, K: DeserializeOwned, V: DeserializeOwned> Iterator for TestDBRevIter<'a, K, V> { + type Item = Result<(K, V), RawStoreError>; + + /// Will give the next item backwards + fn next(&mut self) -> Option { + self.iter.next() + } +} + +impl<'a, K: DeserializeOwned> Iterator for TestDBKeys<'a, K> { + type Item = Result; + + fn next(&mut self) -> Option { + let mut out: Option = None; + self.with_mut(|fields| { + let config = bincode::DefaultOptions::new() + .with_big_endian() + .with_fixint_encoding(); + if let Some((raw_key, _)) = fields.iter.next() { + let key: K = config.deserialize(raw_key).ok().unwrap(); + out = Some(Ok(key)); + } + }); + out + } +} + +impl<'a, V: DeserializeOwned> Iterator for TestDBValues<'a, V> { + type Item = Result; + + fn next(&mut self) -> Option { + let mut out: Option = None; + self.with_mut(|fields| { + if let Some((_, raw_value)) = fields.iter.next() { + let value: V = bcs::from_bytes(raw_value).ok().unwrap(); + out = Some(Ok(value)); + } + }); + out + } +} + +impl<'a, K, V> Map<'a, K, V> for TestDB +where + K: Serialize + DeserializeOwned, + V: Serialize + 
DeserializeOwned, +{ + type Error = RawStoreError; + type Iterator = std::iter::Empty<(K, V)>; + type SafeIterator = TestDBIter<'a, K, V>; + type Keys = TestDBKeys<'a, K>; + type Values = TestDBValues<'a, V>; + + fn contains_key(&self, key: &K) -> Result { + let raw_key = be_fix_int_ser(key)?; + let locked = self.rows.read().unwrap(); + Ok(locked.contains_key(&raw_key)) + } + + fn get(&self, key: &K) -> Result, Self::Error> { + let raw_key = be_fix_int_ser(key)?; + let locked = self.rows.read().unwrap(); + let res = locked.get(&raw_key); + Ok(res.map(|raw_value| bcs::from_bytes(raw_value).ok().unwrap())) + } + + fn get_raw_bytes(&self, key: &K) -> Result>, Self::Error> { + let raw_key = be_fix_int_ser(key)?; + let locked = self.rows.read().unwrap(); + let res = locked.get(&raw_key); + Ok(res.cloned()) + } + + fn insert(&self, key: &K, value: &V) -> Result<(), Self::Error> { + let raw_key = be_fix_int_ser(key)?; + let raw_value = bcs::to_bytes(value)?; + let mut locked = self.rows.write().unwrap(); + locked.insert(raw_key, raw_value); + Ok(()) + } + + fn remove(&self, key: &K) -> Result<(), Self::Error> { + let raw_key = be_fix_int_ser(key)?; + let mut locked = self.rows.write().unwrap(); + locked.remove(&raw_key); + Ok(()) + } + + fn clear(&self) -> Result<(), Self::Error> { + let mut locked = self.rows.write().unwrap(); + locked.clear(); + Ok(()) + } + + fn is_empty(&self) -> bool { + let locked = self.rows.read().unwrap(); + locked.is_empty() + } + + fn iter(&'a self) -> Self::Iterator { + unimplemented!("umplemented API"); + } + + fn iter_with_bounds( + &'a self, + _lower_bound: Option, + _upper_bound: Option, + ) -> Self::Iterator { + unimplemented!("umplemented API"); + } + + fn safe_iter(&'a self) -> Self::SafeIterator { + TestDBIterBuilder { + rows: self.rows.read().unwrap(), + iter_builder: |rows: &mut RwLockReadGuard<'a, BTreeMap, Vec>>| rows.iter(), + phantom: PhantomData, + direction: Direction::Forward, + } + .build() + } + + fn keys(&'a self) -> 
Self::Keys { + TestDBKeysBuilder { + rows: self.rows.read().unwrap(), + iter_builder: |rows: &mut RwLockReadGuard<'a, BTreeMap, Vec>>| rows.iter(), + phantom: PhantomData, + } + .build() + } + + fn values(&'a self) -> Self::Values { + TestDBValuesBuilder { + rows: self.rows.read().unwrap(), + iter_builder: |rows: &mut RwLockReadGuard<'a, BTreeMap, Vec>>| rows.iter(), + phantom: PhantomData, + } + .build() + } + + fn try_catch_up_with_primary(&self) -> Result<(), Self::Error> { + Ok(()) + } +} + +impl TryExtend<(J, U)> for TestDB +where + J: Borrow, + U: Borrow, + K: Serialize, + V: Serialize, +{ + type Error = RawStoreError; + + fn try_extend(&mut self, iter: &mut T) -> Result<(), Self::Error> + where + T: Iterator, + { + let mut wb = self.batch(); + wb.insert_batch(self, iter)?; + wb.write() + } + + fn try_extend_from_slice(&mut self, slice: &[(J, U)]) -> Result<(), Self::Error> { + let slice_of_refs = slice.iter().map(|(k, v)| (k.borrow(), v.borrow())); + let mut wb = self.batch(); + wb.insert_batch(self, slice_of_refs)?; + wb.write() + } +} + +pub type DeleteBatchPayload = ( + Arc, Vec>>>, + String, + Vec>, +); +pub type DeleteRangePayload = ( + Arc, Vec>>>, + String, + (Vec, Vec), +); +pub type InsertBatchPayload = ( + Arc, Vec>>>, + String, + Vec<(Vec, Vec)>, +); +type DBAndName = (Arc, Vec>>>, String); + +pub enum WriteBatchOp { + DeleteBatch(DeleteBatchPayload), + DeleteRange(DeleteRangePayload), + InsertBatch(InsertBatchPayload), +} + +#[derive(Default)] +pub struct TestDBWriteBatch { + pub ops: VecDeque, +} + +#[self_referencing] +pub struct DBLocked { + db: Arc, Vec>>>, + #[borrows(db)] + #[covariant] + db_guard: RwLockWriteGuard<'this, BTreeMap, Vec>>, +} + +impl TestDBWriteBatch { + pub fn write(self) -> Result<(), RawStoreError> { + let mut dbs: Vec = self + .ops + .iter() + .map(|op| match op { + WriteBatchOp::DeleteBatch((db, name, _)) => (db.clone(), name.clone()), + WriteBatchOp::DeleteRange((db, name, _)) => (db.clone(), name.clone()), + 
WriteBatchOp::InsertBatch((db, name, _)) => (db.clone(), name.clone()), + }) + .collect(); + dbs.sort_by_key(|(_k, v)| v.clone()); + dbs.dedup_by_key(|(_k, v)| v.clone()); + // lock all databases + let mut db_locks = HashMap::new(); + dbs.iter().for_each(|(db, name)| { + if !db_locks.contains_key(name) { + db_locks.insert( + name.clone(), + DBLockedBuilder { + db: db.clone(), + db_guard_builder: |db: &Arc, Vec>>>| { + db.write().unwrap() + }, + } + .build(), + ); + } + }); + self.ops.iter().for_each(|op| match op { + WriteBatchOp::DeleteBatch((_, id, keys)) => { + let locked = db_locks.get_mut(id).unwrap(); + locked.with_db_guard_mut(|db| { + keys.iter().for_each(|key| { + db.remove(key); + }); + }); + } + WriteBatchOp::DeleteRange((_, id, (from, to))) => { + let locked = db_locks.get_mut(id).unwrap(); + locked.with_db_guard_mut(|db| { + db.retain(|k, _| k < from || k >= to); + }); + } + WriteBatchOp::InsertBatch((_, id, key_values)) => { + let locked = db_locks.get_mut(id).unwrap(); + locked.with_db_guard_mut(|db| { + key_values.iter().for_each(|(k, v)| { + db.insert(k.clone(), v.clone()); + }); + }); + } + }); + // unlock in the reverse order + dbs.iter().rev().for_each(|(_db, id)| { + if db_locks.contains_key(id) { + db_locks.remove(id); + } + }); + Ok(()) + } + /// Deletes a set of keys given as an iterator + pub fn delete_batch, K: Serialize, V>( + &mut self, + db: &TestDB, + purged_vals: impl IntoIterator, + ) -> Result<(), RawStoreError> { + self.ops.push_back(WriteBatchOp::DeleteBatch(( + db.rows.clone(), + db.name.clone(), + purged_vals + .into_iter() + .map(|key| be_fix_int_ser(&key.borrow()).unwrap()) + .collect(), + ))); + Ok(()) + } + /// Deletes a range of keys between `from` (inclusive) and `to` (non-inclusive) + pub fn delete_range( + &mut self, + db: &TestDB, + from: &K, + to: &K, + ) -> Result<(), RawStoreError> { + let raw_from = be_fix_int_ser(from.borrow()).unwrap(); + let raw_to = be_fix_int_ser(to.borrow()).unwrap(); + 
self.ops.push_back(WriteBatchOp::DeleteRange(( + db.rows.clone(), + db.name.clone(), + (raw_from, raw_to), + ))); + Ok(()) + } + /// inserts a range of (key, value) pairs given as an iterator + pub fn insert_batch, K: Serialize, U: Borrow, V: Serialize>( + &mut self, + db: &TestDB, + new_vals: impl IntoIterator, + ) -> Result<(), RawStoreError> { + self.ops.push_back(WriteBatchOp::InsertBatch(( + db.rows.clone(), + db.name.clone(), + new_vals + .into_iter() + .map(|(key, value)| { + ( + be_fix_int_ser(&key.borrow()).unwrap(), + bcs::to_bytes(&value.borrow()).unwrap(), + ) + }) + .collect(), + ))); + Ok(()) + } +} + +#[cfg(test)] +mod test { + use crate::{test_db::TestDB, Map}; + + #[test] + fn test_contains_key() { + let db = TestDB::open(); + db.insert(&123456789, &"123456789".to_string()) + .expect("Failed to insert"); + assert!(db + .contains_key(&123456789) + .expect("Failed to call contains key")); + assert!(!db + .contains_key(&000000000) + .expect("Failed to call contains key")); + } + + #[test] + fn test_get() { + let db = TestDB::open(); + db.insert(&123456789, &"123456789".to_string()) + .expect("Failed to insert"); + assert_eq!( + Some("123456789".to_string()), + db.get(&123456789).expect("Failed to get") + ); + assert_eq!(None, db.get(&000000000).expect("Failed to get")); + } + + #[test] + fn test_get_raw() { + let db = TestDB::open(); + db.insert(&123456789, &"123456789".to_string()) + .expect("Failed to insert"); + + let val_bytes = db + .get_raw_bytes(&123456789) + .expect("Failed to get_raw_bytes") + .unwrap(); + + assert_eq!(bcs::to_bytes(&"123456789".to_string()).unwrap(), val_bytes); + assert_eq!( + None, + db.get_raw_bytes(&000000000) + .expect("Failed to get_raw_bytes") + ); + } + + #[test] + fn test_multi_get() { + let db = TestDB::open(); + db.insert(&123, &"123".to_string()) + .expect("Failed to insert"); + db.insert(&456, &"456".to_string()) + .expect("Failed to insert"); + + let result = db.multi_get([123, 456, 789]).expect("Failed to 
multi get"); + + assert_eq!(result.len(), 3); + assert_eq!(result[0], Some("123".to_string())); + assert_eq!(result[1], Some("456".to_string())); + assert_eq!(result[2], None); + } + + #[test] + fn test_remove() { + let db = TestDB::open(); + db.insert(&123456789, &"123456789".to_string()) + .expect("Failed to insert"); + assert!(db.get(&123456789).expect("Failed to get").is_some()); + + db.remove(&123456789).expect("Failed to remove"); + assert!(db.get(&123456789).expect("Failed to get").is_none()); + } + + #[test] + fn test_iter() { + let db = TestDB::open(); + db.insert(&123456789, &"123456789".to_string()) + .expect("Failed to insert"); + + let mut iter = db.safe_iter(); + assert_eq!(Some(Ok((123456789, "123456789".to_string()))), iter.next()); + assert_eq!(None, iter.next()); + } + + #[test] + fn test_iter_reverse() { + let db = TestDB::open(); + db.insert(&1, &"1".to_string()).expect("Failed to insert"); + db.insert(&2, &"2".to_string()).expect("Failed to insert"); + db.insert(&3, &"3".to_string()).expect("Failed to insert"); + let mut iter = db.safe_iter(); + + assert_eq!(Some(Ok((1, "1".to_string()))), iter.next()); + assert_eq!(Some(Ok((2, "2".to_string()))), iter.next()); + assert_eq!(Some(Ok((3, "3".to_string()))), iter.next()); + assert_eq!(None, iter.next()); + } + + #[test] + fn test_keys() { + let db = TestDB::open(); + + db.insert(&123456789, &"123456789".to_string()) + .expect("Failed to insert"); + + let mut keys = db.keys(); + assert_eq!(Some(Ok(123456789)), keys.next()); + assert_eq!(None, keys.next()); + } + + #[test] + fn test_values() { + let db = TestDB::open(); + + db.insert(&123456789, &"123456789".to_string()) + .expect("Failed to insert"); + + let mut values = db.values(); + assert_eq!(Some(Ok("123456789".to_string())), values.next()); + assert_eq!(None, values.next()); + } + + #[test] + fn test_insert_batch() { + let db = TestDB::open(); + let keys_vals = (1..100).map(|i| (i, i.to_string())); + let mut wb = db.batch(); + 
wb.insert_batch(&db, keys_vals.clone()) + .expect("Failed to batch insert"); + wb.write().expect("Failed to execute batch"); + for (k, v) in keys_vals { + let val = db.get(&k).expect("Failed to get inserted key"); + assert_eq!(Some(v), val); + } + } + + #[test] + fn test_insert_batch_across_cf() { + let db_cf_1 = TestDB::open(); + let keys_vals_1 = (1..100).map(|i| (i, i.to_string())); + + let db_cf_2 = TestDB::open(); + let keys_vals_2 = (1000..1100).map(|i| (i, i.to_string())); + + let mut wb = db_cf_1.batch(); + wb.insert_batch(&db_cf_1, keys_vals_1.clone()) + .expect("Failed to batch insert"); + wb.insert_batch(&db_cf_2, keys_vals_2.clone()) + .expect("Failed to batch insert"); + wb.write().expect("Failed to execute batch"); + for (k, v) in keys_vals_1 { + let val = db_cf_1.get(&k).expect("Failed to get inserted key"); + assert_eq!(Some(v), val); + } + + for (k, v) in keys_vals_2 { + let val = db_cf_2.get(&k).expect("Failed to get inserted key"); + assert_eq!(Some(v), val); + } + } + + #[test] + fn test_delete_batch() { + let db: TestDB = TestDB::open(); + + let keys_vals = (1..100).map(|i| (i, i.to_string())); + let mut wb = db.batch(); + wb.insert_batch(&db, keys_vals) + .expect("Failed to batch insert"); + + // delete the odd-index keys + let deletion_keys = (1..100).step_by(2); + wb.delete_batch(&db, deletion_keys) + .expect("Failed to batch delete"); + + wb.write().expect("Failed to execute batch"); + + for k in db.keys() { + assert_eq!(k.unwrap() % 2, 0); + } + } + + #[test] + fn test_delete_range() { + let db: TestDB = TestDB::open(); + + // Note that the last element is (100, "100".to_owned()) here + let keys_vals = (0..101).map(|i| (i, i.to_string())); + let mut wb = db.batch(); + wb.insert_batch(&db, keys_vals) + .expect("Failed to batch insert"); + + wb.delete_range(&db, &50, &100) + .expect("Failed to delete range"); + + wb.write().expect("Failed to execute batch"); + + for k in 0..50 { + assert!(db.contains_key(&k).expect("Failed to query legal 
key"),); + } + for k in 50..100 { + assert!(!db.contains_key(&k).expect("Failed to query legal key")); + } + + // range operator is not inclusive of to + assert!(db.contains_key(&100).expect("Failed to query legel key")); + } + + #[test] + fn test_clear() { + let db: TestDB = TestDB::open(); + + // Test clear of empty map + let _ = db.clear(); + + let keys_vals = (0..101).map(|i| (i, i.to_string())); + let mut wb = db.batch(); + wb.insert_batch(&db, keys_vals) + .expect("Failed to batch insert"); + + wb.write().expect("Failed to execute batch"); + + // Check we have multiple entries + assert!(db.safe_iter().count() > 1); + let _ = db.clear(); + assert_eq!(db.safe_iter().count(), 0); + // Clear again to ensure safety when clearing empty map + let _ = db.clear(); + assert_eq!(db.safe_iter().count(), 0); + // Clear with one item + let _ = db.insert(&1, &"e".to_string()); + assert_eq!(db.safe_iter().count(), 1); + let _ = db.clear(); + assert_eq!(db.safe_iter().count(), 0); + } + + #[test] + fn test_is_empty() { + let db: TestDB = TestDB::open(); + + // Test empty map is truly empty + assert!(db.is_empty()); + let _ = db.clear(); + assert!(db.is_empty()); + + let keys_vals = (0..101).map(|i| (i, i.to_string())); + let mut wb = db.batch(); + wb.insert_batch(&db, keys_vals) + .expect("Failed to batch insert"); + + wb.write().expect("Failed to execute batch"); + + // Check we have multiple entries and not empty + assert!(db.safe_iter().count() > 1); + assert!(!db.is_empty()); + + // Clear again to ensure empty works after clearing + let _ = db.clear(); + assert_eq!(db.safe_iter().count(), 0); + assert!(db.is_empty()); + } + + #[test] + fn test_multi_insert() { + // Init a DB + let db: TestDB = TestDB::open(); + + // Create kv pairs + let keys_vals = (0..101).map(|i| (i, i.to_string())); + + db.multi_insert(keys_vals.clone()) + .expect("Failed to multi-insert"); + + for (k, v) in keys_vals { + let val = db.get(&k).expect("Failed to get inserted key"); + assert_eq!(Some(v), 
val); + } + } + + #[test] + fn test_multi_remove() { + // Init a DB + let db: TestDB = TestDB::open(); + + // Create kv pairs + let keys_vals = (0..101).map(|i| (i, i.to_string())); + + db.multi_insert(keys_vals.clone()) + .expect("Failed to multi-insert"); + + // Check insertion + for (k, v) in keys_vals.clone() { + let val = db.get(&k).expect("Failed to get inserted key"); + assert_eq!(Some(v), val); + } + + // Remove 50 items + db.multi_remove(keys_vals.clone().map(|kv| kv.0).take(50)) + .expect("Failed to multi-remove"); + assert_eq!(db.safe_iter().count(), 101 - 50); + + // Check that the remaining are present + for (k, v) in keys_vals.skip(50) { + let val = db.get(&k).expect("Failed to get inserted key"); + assert_eq!(Some(v), val); + } + } +} diff --git a/moveos/raw-store/src/traits.rs b/moveos/raw-store/src/traits.rs new file mode 100644 index 000000000..a4154ba7f --- /dev/null +++ b/moveos/raw-store/src/traits.rs @@ -0,0 +1,208 @@ +// Copyright (c) RoochNetwork +// SPDX-License-Identifier: Apache-2.0 + +// Copyright (c) Mysten Labs, Inc. +// SPDX-License-Identifier: Apache-2.0 +use crate::RawStoreError; +use async_trait::async_trait; +use serde::{de::DeserializeOwned, Serialize}; +use std::{borrow::Borrow, collections::BTreeMap, error::Error}; + +pub trait Map<'a, K, V> +where + K: Serialize + DeserializeOwned + ?Sized, + V: Serialize + DeserializeOwned, +{ + type Error: Error; + type Iterator: Iterator; + type SafeIterator: Iterator>; + type Keys: Iterator>; + type Values: Iterator>; + + /// Returns true if the map contains a value for the specified key. + fn contains_key(&self, key: &K) -> Result; + + /// Returns the value for the given key from the map, if it exists. + fn get(&self, key: &K) -> Result, Self::Error>; + + /// Returns the raw value (serialized bytes) for the given key from the map, if it exists. 
+ fn get_raw_bytes(&self, key: &K) -> Result>, Self::Error>; + + /// Returns the value for the given key from the map, if it exists + /// or the given default value if it does not. + /// This method is not thread safe + fn get_or_insert_unsafe V>( + &self, + key: &K, + default: F, + ) -> Result { + self.get(key).and_then(|optv| match optv { + Some(v) => Ok(v), + None => { + self.insert(key, &default())?; + self.get(key).transpose().expect("default just inserted") + } + }) + } + + /// Inserts the given key-value pair into the map. + fn insert(&self, key: &K, value: &V) -> Result<(), Self::Error>; + + /// Removes the entry for the given key from the map. + fn remove(&self, key: &K) -> Result<(), Self::Error>; + + /// Removes every key-value pair from the map. + fn clear(&self) -> Result<(), Self::Error>; + + /// Returns true if the map is empty, otherwise false. + fn is_empty(&self) -> bool; + + /// Returns an iterator visiting each key-value pair in the map. + fn iter(&'a self) -> Self::Iterator; + + /// Returns an iterator visiting each key-value pair in the map. + fn iter_with_bounds(&'a self, lower_bound: Option, upper_bound: Option) + -> Self::Iterator; + + /// Same as `iter` but performs status check + fn safe_iter(&'a self) -> Self::SafeIterator; + + /// Returns an iterator over each key in the map. + fn keys(&'a self) -> Self::Keys; + + /// Returns an iterator over each value in the map. + fn values(&'a self) -> Self::Values; + + /// Returns a vector of values corresponding to the keys provided, non-atomically. + fn multi_get(&self, keys: impl IntoIterator) -> Result>, Self::Error> + where + J: Borrow, + { + keys.into_iter().map(|key| self.get(key.borrow())).collect() + } + + /// Returns a vector of raw values corresponding to the keys provided, non-atomically. 
+ fn multi_get_raw_bytes( + &self, + keys: impl IntoIterator, + ) -> Result>>, Self::Error> + where + J: Borrow, + { + keys.into_iter() + .map(|key| self.get_raw_bytes(key.borrow())) + .collect() + } + + /// Returns a vector of values corresponding to the keys provided, non-atomically. + fn chunked_multi_get( + &self, + keys: impl IntoIterator, + _chunk_size: usize, + ) -> Result>, Self::Error> + where + J: Borrow, + { + keys.into_iter().map(|key| self.get(key.borrow())).collect() + } + + /// Inserts key-value pairs, non-atomically. + fn multi_insert( + &self, + key_val_pairs: impl IntoIterator, + ) -> Result<(), Self::Error> + where + J: Borrow, + U: Borrow, + { + key_val_pairs + .into_iter() + .try_for_each(|(key, value)| self.insert(key.borrow(), value.borrow())) + } + + /// Removes keys, non-atomically. + fn multi_remove(&self, keys: impl IntoIterator) -> Result<(), Self::Error> + where + J: Borrow, + { + keys.into_iter() + .try_for_each(|key| self.remove(key.borrow())) + } + + /// Try to catch up with primary when running as secondary + fn try_catch_up_with_primary(&self) -> Result<(), Self::Error>; +} + +#[async_trait] +pub trait AsyncMap<'a, K, V> +where + K: Serialize + DeserializeOwned + ?Sized + std::marker::Sync, + V: Serialize + DeserializeOwned + std::marker::Sync + std::marker::Send, +{ + type Error: Error; + type Iterator: Iterator>; + type Keys: Iterator>; + type Values: Iterator>; + + /// Returns true if the map contains a value for the specified key. + async fn contains_key(&self, key: &K) -> Result; + + /// Returns the value for the given key from the map, if it exists. + async fn get(&self, key: &K) -> Result, Self::Error>; + + /// Returns the raw value (serialized bytes) for the given key from the map, if it exists. + async fn get_raw_bytes(&self, key: &K) -> Result>, Self::Error>; + + /// Returns true if the map is empty, otherwise false. + async fn is_empty(&self) -> bool; + + /// Returns an iterator visiting each key-value pair in the map. 
+ async fn iter(&'a self) -> Self::Iterator; + + /// Returns an iterator over each key in the map. + async fn keys(&'a self) -> Self::Keys; + + /// Returns an iterator over each value in the map. + async fn values(&'a self) -> Self::Values; + + /// Returns a vector of values corresponding to the keys provided, non-atomically. + async fn multi_get( + &self, + keys: impl IntoIterator + std::marker::Send, + ) -> Result>, Self::Error> + where + J: Borrow; + + /// Try to catch up with primary when running as secondary + async fn try_catch_up_with_primary(&self) -> Result<(), Self::Error>; +} + +pub struct TableSummary { + pub num_keys: u64, + pub key_bytes_total: usize, + pub value_bytes_total: usize, + pub key_hist: hdrhistogram::Histogram, + pub value_hist: hdrhistogram::Histogram, +} + +pub trait RawStoreDebug { + /// Dump a DB table with pagination + fn dump_table( + &self, + table_name: String, + page_size: u16, + page_number: usize, + ) -> eyre::Result>; + + /// Get the name of the DB. This is simply the name of the struct + fn primary_db_name(&self) -> String; + + /// Get a map of table names to key-value types + fn describe_all_tables(&self) -> BTreeMap; + + /// Count the entries in the table + fn count_table_keys(&self, table_name: String) -> eyre::Result; + + /// Return table summary of the input table + fn table_summary(&self, table_name: String) -> eyre::Result; +}