From d9fa5921aee148b29729fb1cd1476de2683c9a77 Mon Sep 17 00:00:00 2001
From: Bai Chuan <muzixinly@gmail.com>
Date: Tue, 11 Jul 2023 23:20:45 +0800
Subject: [PATCH] provide raw store component base on RocksDB (#454)

---
 Cargo.toml                              |   14 +-
 moveos/moveos-store/Cargo.toml          |    3 +-
 moveos/raw-store/Cargo.toml             |   77 +
 moveos/raw-store/src/lib.rs             |   22 +
 moveos/raw-store/src/macros.rs          |  138 ++
 moveos/raw-store/src/metrics.rs         |  885 +++++++++
 moveos/raw-store/src/rocks/errors.rs    |  126 ++
 moveos/raw-store/src/rocks/iter.rs      |  171 ++
 moveos/raw-store/src/rocks/keys.rs      |   71 +
 moveos/raw-store/src/rocks/mod.rs       | 2406 +++++++++++++++++++++++
 moveos/raw-store/src/rocks/safe_iter.rs |  161 ++
 moveos/raw-store/src/rocks/tests.rs     | 1154 +++++++++++
 moveos/raw-store/src/rocks/util.rs      |   81 +
 moveos/raw-store/src/rocks/values.rs    |   47 +
 moveos/raw-store/src/test_db.rs         |  807 ++++++++
 moveos/raw-store/src/traits.rs          |  208 ++
 16 files changed, 6369 insertions(+), 2 deletions(-)
 create mode 100644 moveos/raw-store/Cargo.toml
 create mode 100644 moveos/raw-store/src/lib.rs
 create mode 100644 moveos/raw-store/src/macros.rs
 create mode 100644 moveos/raw-store/src/metrics.rs
 create mode 100644 moveos/raw-store/src/rocks/errors.rs
 create mode 100644 moveos/raw-store/src/rocks/iter.rs
 create mode 100644 moveos/raw-store/src/rocks/keys.rs
 create mode 100644 moveos/raw-store/src/rocks/mod.rs
 create mode 100644 moveos/raw-store/src/rocks/safe_iter.rs
 create mode 100644 moveos/raw-store/src/rocks/tests.rs
 create mode 100644 moveos/raw-store/src/rocks/util.rs
 create mode 100644 moveos/raw-store/src/rocks/values.rs
 create mode 100644 moveos/raw-store/src/test_db.rs
 create mode 100644 moveos/raw-store/src/traits.rs

diff --git a/Cargo.toml b/Cargo.toml
index 4d33e0f50..79d9574c5 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -9,6 +9,7 @@ members = [
     "moveos/moveos-stdlib-builder",
     "moveos/moveos",
     "moveos/moveos-common",
+    "moveos/raw-store",
     "crates/rooch-key",
     "crates/rooch-types",
     "crates/rooch-framework",
@@ -61,6 +62,7 @@ moveos = { path = "moveos/moveos" }
 moveos-cli = { path = "moveos/moveos-cli" }
 moveos-common = { path = "moveos/moveos-common" }
 moveos-verifier = { path = "moveos/moveos-verifier" }
+raw-store = { path = "moveos/raw-store" }
 
 # crates for Rooch
 rooch = { path = "crates/rooch" }
@@ -160,13 +162,23 @@ versions = "4.1.0"
 pretty_assertions = "1.2.0"
 syn = { version = "1.0.104", features = ["full", "extra-traits"] }
 quote = "1.0"
-proc-macro2 = "1.0"
+proc-macro2 = "1.0.47"
 derive-syn-parse = "0.1.5"
 unescape = "0.1.0"
 tempfile = "3.2.0"
 regex = "1.8.4"
 walkdir = "2.3.3"
 
+rocksdb = { version = "0.21.0", features = ["snappy", "lz4", "zstd", "zlib", "multi-threaded-cf"], default-features = false }
+bincode = "1.3.3"
+collectable = "0.0.2"
+fdlimit = "0.2.1"
+tap = "1.0.1"
+num_cpus = "1.14.0"
+prometheus = "0.13.3"
+hdrhistogram = "7.5.1"
+ouroboros = "0.15.5"
+rstest = "0.16.0"
 
 # Note: the BEGIN and END comments below are required for external tooling. Do not remove.
 # BEGIN MOVE DEPENDENCIES
diff --git a/moveos/moveos-store/Cargo.toml b/moveos/moveos-store/Cargo.toml
index 16ae771d4..261df2b83 100644
--- a/moveos/moveos-store/Cargo.toml
+++ b/moveos/moveos-store/Cargo.toml
@@ -26,4 +26,5 @@ move-core-types = { workspace = true }
 move-resource-viewer = { workspace = true }
 
 moveos-types = { workspace = true }
-moveos-stdlib = { workspace = true }
\ No newline at end of file
+moveos-stdlib = { workspace = true }
+raw-store = { workspace = true }
\ No newline at end of file
diff --git a/moveos/raw-store/Cargo.toml b/moveos/raw-store/Cargo.toml
new file mode 100644
index 000000000..df434be27
--- /dev/null
+++ b/moveos/raw-store/Cargo.toml
@@ -0,0 +1,77 @@
+[package]
+name = "raw-store"
+version = "0.1.0"
+
+# Workspace inherited keys
+authors = { workspace = true }
+edition = { workspace = true }
+homepage = { workspace = true }
+license = { workspace = true }
+publish = { workspace = true }
+repository = { workspace = true }
+rust-version = { workspace = true }
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+#[dependencies]
+#bcs = "0.1.4"
+#bincode = "1.3.3"
+#collectable = "0.0.2"
+#eyre = "0.6.8"
+#fdlimit = "0.2.1"
+#once_cell = "1.15.0"
+#tap = "1.0.1"
+#num_cpus = "1.14.0"
+#prometheus = "0.13.3"
+#hdrhistogram = "7.5.1"
+## deactivation of bzip2 due to https://github.com/rust-rocksdb/rust-rocksdb/issues/609
+#rocksdb = { version = "0.20.1", features = ["snappy", "lz4", "zstd", "zlib", "multi-threaded-cf"], default-features = false }
+#serde = { version = "1.0.140", features = ["derive"] }
+#thiserror = "1.0.37"
+#tokio = { workspace = true, features = ["full", "test-util"] }
+#tracing = "0.1.37"
+#ouroboros = "0.15.5"
+#rand = "0.8.5"
+#async-trait = "0.1.57"
+#itertools = "0.10.5"
+
+#sui-macros = { path = "../sui-macros" }
+#workspace-hack = { version = "0.1", path = "../workspace-hack" }
+
+[dependencies]
+anyhow = { workspace = true }
+bcs = { workspace = true }
+#smt = { workspace = true }
+serde = { workspace = true }
+serde_bytes = { workspace = true }
+hex = { workspace = true }
+parking_lot = { workspace = true }
+
+rocksdb = { workspace = true }
+prometheus = { workspace = true }
+tokio = { workspace = true }
+bincode = { workspace = true }
+collectable = { workspace = true }
+once_cell = { workspace = true }
+eyre = { workspace = true }
+fdlimit = { workspace = true }
+tap = { workspace = true }
+num_cpus = { workspace = true }
+hdrhistogram = { workspace = true }
+ouroboros = { workspace = true }
+rand = { workspace = true }
+async-trait = { workspace = true }
+itertools = { workspace = true }
+thiserror = { workspace = true }
+tracing = { workspace = true }
+futures = { workspace = true }
+rstest = { workspace = true }
+tempfile = { workspace = true }
+
+move-core-types = { workspace = true }
+move-resource-viewer = { workspace = true }
+
+# Most packages should depend on sui-simulator instead of directly on msim, but for typed-store
+# that creates a circular dependency.
+#[target.'cfg(msim)'.dependencies]
+#msim = { git = "https://github.com/MystenLabs/mysten-sim.git", rev = "e9011f96b84615b63cd8b5835e606a2fc218a1bd", package = "msim" }
diff --git a/moveos/raw-store/src/lib.rs b/moveos/raw-store/src/lib.rs
new file mode 100644
index 000000000..be501ebdd
--- /dev/null
+++ b/moveos/raw-store/src/lib.rs
@@ -0,0 +1,22 @@
+// Copyright (c) RoochNetwork
+// SPDX-License-Identifier: Apache-2.0
+
+// Copyright (c) 2021, Facebook, Inc. and its affiliates
+// Copyright (c) Mysten Labs, Inc.
+// SPDX-License-Identifier: Apache-2.0
+#![warn(
+    future_incompatible,
+    nonstandard_style,
+    rust_2018_idioms,
+    rust_2021_compatibility
+)]
+
+pub mod traits;
+pub use traits::Map;
+pub mod rocks;
+pub use rocks::RawStoreError;
+pub mod macros;
+pub mod metrics;
+pub mod test_db;
+
+pub type StoreError = rocks::RawStoreError;
diff --git a/moveos/raw-store/src/macros.rs b/moveos/raw-store/src/macros.rs
new file mode 100644
index 000000000..be9ac0c5f
--- /dev/null
+++ b/moveos/raw-store/src/macros.rs
@@ -0,0 +1,138 @@
+// Copyright (c) RoochNetwork
+// SPDX-License-Identifier: Apache-2.0
+
+// Copyright (c) Mysten Labs, Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+use futures::future::BoxFuture;
+use std::collections::HashMap;
+use std::future::Future;
+use std::sync::Arc;
+
+/// Expands to its argument unchanged; the expression is evaluated exactly as written.
+#[macro_export]
+macro_rules! nondeterministic {
+    ($value:expr) => {
+        $value
+    };
+}
+
+type FpCallback = dyn Fn() -> Option<BoxFuture<'static, ()>> + Send + Sync + 'static; // failpoint callback: `None` for sync callbacks, `Some(future)` for async ones
+type FpMap = HashMap<&'static str, Arc<FpCallback>>; // failpoint identifier -> shared callback
+
+fn with_fp_map<T>(func: impl FnOnce(&mut FpMap) -> T) -> T {
+    use once_cell::sync::Lazy;
+    use std::sync::Mutex;
+    // Runs `func` on the process-wide failpoint registry while holding its lock and returns `func`'s result.
+    static MAP: Lazy<Mutex<FpMap>> = Lazy::new(Default::default);
+    let mut map = MAP.lock().unwrap_or_else(|poisoned| poisoned.into_inner()); // a panic inside an earlier `func` must not disable every later failpoint call
+    func(&mut map)
+}
+
+fn get_callback(identifier: &'static str) -> Option<Arc<FpCallback>> {
+    with_fp_map(|registry| registry.get(identifier).map(Arc::clone)) // hand back a cloned `Arc`, so the callback is invoked after the registry lock is released
+}
+
+pub fn handle_fail_point(identifier: &'static str) {
+    // Identifiers with no registered callback are a no-op.
+    if let Some(fp) = get_callback(identifier) {
+        tracing::error!("hit failpoint {}", identifier);
+        let returned = fp();
+        // Nothing here can await, so a callback that hands back a future is a registration error.
+        assert!(returned.is_none(), "sync failpoint must not return future");
+    }
+}
+
+pub async fn handle_fail_point_async(identifier: &'static str) {
+    // Identifiers with no registered callback are a no-op.
+    if let Some(fp) = get_callback(identifier) {
+        tracing::error!("hit async failpoint {}", identifier);
+        fp().expect("async callback must return future").await;
+    }
+}
+
+/// Stores `callback` under `identifier` in the process-wide failpoint registry.
+///
+/// Panics with "duplicate fail point registration" when the identifier was already taken. The
+/// insert happens before the check, so by then the earlier callback has already been replaced.
+fn register_fail_point_impl(identifier: &'static str, callback: Arc<FpCallback>) {
+    with_fp_map(move |registry| {
+        let displaced = registry.insert(identifier, callback);
+        // Any displaced entry means the identifier was registered twice.
+        assert!(displaced.is_none(), "duplicate fail point registration");
+    })
+}
+
+/// Registers a synchronous failpoint: `callback` is stored under `identifier`.
+pub fn register_fail_point(identifier: &'static str, callback: impl Fn() + Sync + Send + 'static) {
+    // The registry stores future-returning closures; a sync callback always reports `None`.
+    let wrapped: Arc<FpCallback> = Arc::new(move || {
+        callback();
+        None
+    });
+    register_fail_point_impl(identifier, wrapped);
+}
+
+/// Registers an async failpoint: each hit calls `callback` and boxes the future it returns.
+pub fn register_fail_point_async<F: Future<Output = ()> + Sync + Send + 'static>(
+    identifier: &'static str,
+    callback: impl Fn() -> F + Sync + Send + 'static,
+) {
+    let wrapped: Arc<FpCallback> = Arc::new(move || Some(Box::pin(callback())));
+    register_fail_point_impl(identifier, wrapped);
+}
+
+/// Registers the same synchronous `callback` under every identifier in `identifiers`.
+pub fn register_fail_points(
+    identifiers: &[&'static str],
+    callback: impl Fn() + Sync + Send + 'static,
+) {
+    // One shared closure; each registration only clones the `Arc`.
+    let shared: Arc<FpCallback> = Arc::new(move || {
+        callback();
+        None
+    });
+    identifiers.iter().for_each(|&id| register_fail_point_impl(id, shared.clone()));
+}
+
+#[cfg(not(fail_points))] // NOTE(review): no `cfg(fail_points)` counterpart is visible here, so call sites would not compile with that cfg set - confirm
+#[macro_export]
+macro_rules! fail_point {
+    ($tag:expr) => {};
+}
+
+#[cfg(not(fail_points))] // expands to nothing; `$tag` is accepted but never evaluated
+#[macro_export]
+macro_rules! fail_point_async {
+    ($tag:expr) => {};
+}
+
+// NOTE(review): this module does not exercise the failpoint helpers above; it only checks that a function defined through a pass-through macro compiles and runs.
+#[cfg(test)]
+mod test {
+    // use super::*;
+
+    // Uncomment to test error messages (NOTE(review): `with_checked_arithmetic` is not defined in the visible source - confirm it exists)
+    // #[with_checked_arithmetic]
+    // struct TestStruct;
+
+    macro_rules! pass_through {
+        ($($tt:tt)*) => {
+            $($tt)*
+        }
+    }
+
+    #[test]
+    fn test_skip_checked_arithmetic() {
+        // `pass_through!` re-emits its tokens unchanged, so `unchecked_add` is defined exactly as written.
+        pass_through! {
+            fn unchecked_add(a: i32, b: i32) -> i32 {
+                a + b
+            }
+        }
+
+        // Called with (1, 2), which cannot overflow, so this passes in debug builds (overflow
+        // checks on) as well as in release builds; the returned value is discarded.
+        unchecked_add(1, 2);
+    }
+}
diff --git a/moveos/raw-store/src/metrics.rs b/moveos/raw-store/src/metrics.rs
new file mode 100644
index 000000000..a86ecde5a
--- /dev/null
+++ b/moveos/raw-store/src/metrics.rs
@@ -0,0 +1,885 @@
+// Copyright (c) RoochNetwork
+// SPDX-License-Identifier: Apache-2.0
+
+// Copyright (c) Mysten Labs, Inc.
+// SPDX-License-Identifier: Apache-2.0
+use once_cell::sync::OnceCell;
+use prometheus::{
+    register_histogram_vec_with_registry, register_int_counter_vec_with_registry,
+    register_int_gauge_vec_with_registry, HistogramVec, IntCounterVec, IntGaugeVec, Registry,
+};
+use rocksdb::perf::set_perf_stats;
+use rocksdb::{PerfContext, PerfMetric, PerfStatsLevel};
+use std::cell::RefCell;
+use std::sync::atomic::{AtomicU64, Ordering};
+use std::sync::Arc;
+use std::time::Duration;
+use tap::TapFallible;
+use tracing::warn;
+
+thread_local! {
+    static PER_THREAD_ROCKS_PERF_CONTEXT: RefCell<PerfContext> = RefCell::new(PerfContext::default()); // one default-constructed perf context per thread
+}
+
+const LATENCY_SEC_BUCKETS: &[f64] = &[
+    0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1., 2.5, 5., 10., 20., 30., 60., 90.,
+]; // bucket bounds in seconds (1 ms to 90 s), passed to every `*_latency_seconds` histogram in this file
+
+#[derive(Debug, Clone)]
+/// Decides when to sample, driven either by an operation count or by a timer.
+/// With a zero duration `sample` hits once per `after_num_ops + 1` calls; otherwise it hits only on the first call after the timer task reset the counter.
+pub struct SamplingInterval {
+    /// Period of the timer task that resets the counter; zero selects the count-only mode.
+    pub once_every_duration: Duration,
+    /// Count mode: operations skipped between samples. Timer mode: the counter must exceed this before a reset.
+    pub after_num_ops: u64,
+    /// Number of `sample` calls since the last reset; shared by all clones of this value.
+    pub counter: Arc<AtomicU64>,
+}
+
+impl Default for SamplingInterval {
+    fn default() -> Self {
+        let one_minute = Duration::from_secs(60); // timer-driven sampling with no operation threshold
+        Self::new(one_minute, 0)
+    }
+}
+
+impl SamplingInterval {
+    pub fn new(once_every_duration: Duration, after_num_ops: u64) -> Self {
+        let counter = Arc::new(AtomicU64::new(1)); // starts at 1, so timer mode does not sample before the first reset
+        if !once_every_duration.is_zero() {
+            let counter = counter.clone(); // handle moved into the reset task; `tokio::task::spawn` below needs a running Tokio runtime
+            tokio::task::spawn(async move {
+                while Arc::strong_count(&counter) > 1 { // stop once this task holds the last reference, instead of looping forever
+                    if counter.load(Ordering::SeqCst) > after_num_ops {
+                        counter.store(0, Ordering::SeqCst); // rearm: the next `sample()` call sees 0 and returns true
+                    }
+                    tokio::time::sleep(once_every_duration).await;
+                }
+            });
+        }
+        SamplingInterval {
+            once_every_duration,
+            after_num_ops,
+            counter,
+        }
+    }
+    pub fn new_from_self(&self) -> SamplingInterval {
+        SamplingInterval::new(self.once_every_duration, self.after_num_ops) // same settings, fresh counter (and a fresh reset task in timer mode)
+    }
+    pub fn sample(&self) -> bool {
+        if self.once_every_duration.is_zero() {
+            self.counter.fetch_add(1, Ordering::Relaxed) % (self.after_num_ops + 1) == 0 // count mode: hit once per `after_num_ops + 1` calls
+        } else {
+            self.counter.fetch_add(1, Ordering::Relaxed) == 0 // timer mode: hit only on the first call after a reset
+        }
+    }
+}
+
+#[derive(Debug)] // gauges below are all labelled by `cf_name`; NOTE(review): several field names carry a `rocskdb` misspelling that is part of the public API
+pub struct ColumnFamilyMetrics {
+    pub rocksdb_total_sst_files_size: IntGaugeVec, // storage size of the SST files
+    pub rocksdb_total_blob_files_size: IntGaugeVec, // storage size of the blob files
+    pub rocksdb_size_all_mem_tables: IntGaugeVec, // memory used by the in-memory buffer
+    pub rocksdb_num_snapshots: IntGaugeVec, // number of snapshots held
+    pub rocksdb_oldest_snapshot_time: IntGaugeVec, // timestamp of the oldest unreleased snapshot
+    pub rocksdb_actual_delayed_write_rate: IntGaugeVec, // current delayed write rate; 0 means no delay
+    pub rocksdb_is_write_stopped: IntGaugeVec, // 1 when writes are stopped
+    pub rocksdb_block_cache_capacity: IntGaugeVec, // block cache capacity
+    pub rocksdb_block_cache_usage: IntGaugeVec, // memory used in the block cache
+    pub rocksdb_block_cache_pinned_usage: IntGaugeVec, // block cache memory held by pinned entries
+    pub rocskdb_estimate_table_readers_mem: IntGaugeVec, // estimated memory for reading SST tables, excluding block cache
+    pub rocksdb_mem_table_flush_pending: IntGaugeVec, // 1 when a memtable flush is pending
+    pub rocskdb_compaction_pending: IntGaugeVec, // 1 when a compaction job is pending
+    pub rocskdb_num_running_compactions: IntGaugeVec, // compactions currently running
+    pub rocksdb_num_running_flushes: IntGaugeVec, // flushes currently running
+    pub rocksdb_estimate_oldest_key_time: IntGaugeVec, // estimated oldest key timestamp
+    pub rocskdb_background_errors: IntGaugeVec, // accumulated RocksDB background errors
+    pub rocksdb_estimated_num_keys: IntGaugeVec, // estimated number of keys
+}
+
+impl ColumnFamilyMetrics {
+    /// Registers one gauge per column-family property on `registry`, each labelled by `cf_name`; panics if any registration fails.
+    pub(crate) fn new(registry: &Registry) -> Self {
+        ColumnFamilyMetrics {
+            rocksdb_total_sst_files_size: register_int_gauge_vec_with_registry!(
+                "rocksdb_total_sst_files_size",
+                "The storage size occupied by the sst files in the column family",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            rocksdb_total_blob_files_size: register_int_gauge_vec_with_registry!(
+                "rocksdb_total_blob_files_size",
+                "The storage size occupied by the blob files in the column family",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            rocksdb_size_all_mem_tables: register_int_gauge_vec_with_registry!(
+                "rocksdb_size_all_mem_tables",
+                "The memory size occupied by the column family's in-memory buffer",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            rocksdb_num_snapshots: register_int_gauge_vec_with_registry!(
+                "rocksdb_num_snapshots",
+                "Number of snapshots held for the column family",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            rocksdb_oldest_snapshot_time: register_int_gauge_vec_with_registry!(
+                "rocksdb_oldest_snapshot_time",
+                "Unix timestamp of the oldest unreleased snapshot",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            rocksdb_actual_delayed_write_rate: register_int_gauge_vec_with_registry!(
+                "rocksdb_actual_delayed_write_rate",
+                "The current actual delayed write rate. 0 means no delay",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            rocksdb_is_write_stopped: register_int_gauge_vec_with_registry!(
+                "rocksdb_is_write_stopped",
+                "A flag indicating whether writes are stopped on this column family. 1 indicates writes have been stopped.",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            rocksdb_block_cache_capacity: register_int_gauge_vec_with_registry!(
+                "rocksdb_block_cache_capacity",
+                "The block cache capacity of the column family.",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            rocksdb_block_cache_usage: register_int_gauge_vec_with_registry!(
+                "rocksdb_block_cache_usage",
+                "The memory size used by the column family in the block cache.",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            rocksdb_block_cache_pinned_usage: register_int_gauge_vec_with_registry!(
+                "rocksdb_block_cache_pinned_usage",
+                "The memory size used by the column family in the block cache where entries are pinned",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            rocskdb_estimate_table_readers_mem: register_int_gauge_vec_with_registry!(
+                "rocskdb_estimate_table_readers_mem", // NOTE(review): exported names keep the `rocskdb` misspelling; renaming would change the metric identity
+                "The estimated memory size used for reading SST tables in this column \
+                family such as filters and index blocks. Note that this number does not \
+                include the memory used in block cache.",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            rocksdb_mem_table_flush_pending: register_int_gauge_vec_with_registry!(
+                "rocksdb_mem_table_flush_pending",
+                "A 1 or 0 flag indicating whether a memtable flush is pending. \
+                If this number is 1, it means a memtable is waiting for being flushed, \
+                but there might be too many L0 files that prevents it from being flushed.",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            rocskdb_compaction_pending: register_int_gauge_vec_with_registry!(
+                "rocskdb_compaction_pending",
+                "A 1 or 0 flag indicating whether a compaction job is pending. \
+                If this number is 1, it means some part of the column family requires \
+                compaction in order to maintain shape of LSM tree, but the compaction \
+                is pending because the desired compaction job is either waiting for \
+                other dependent compactions to be finished or waiting for an available \
+                compaction thread.",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            rocskdb_num_running_compactions: register_int_gauge_vec_with_registry!(
+                "rocskdb_num_running_compactions",
+                "The number of compactions that are currently running for the column family.",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            rocksdb_num_running_flushes: register_int_gauge_vec_with_registry!(
+                "rocksdb_num_running_flushes",
+                "The number of flushes that are currently running for the column family.",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            rocksdb_estimate_oldest_key_time: register_int_gauge_vec_with_registry!(
+                "rocksdb_estimate_oldest_key_time",
+                "Estimation of the oldest key timestamp in the DB. Only available \
+                for FIFO compaction with compaction_options_fifo.allow_compaction = false.",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            rocksdb_estimated_num_keys: register_int_gauge_vec_with_registry!(
+                "rocksdb_estimated_num_keys",
+                "The estimated number of keys in the table",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            rocskdb_background_errors: register_int_gauge_vec_with_registry!(
+                "rocskdb_background_errors",
+                "The accumulated number of RocksDB background errors.",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+        }
+    }
+}
+
+#[derive(Debug)] // per-operation metrics; labelled by `cf_name` except the two batch-commit histograms, which use `db_name`
+pub struct OperationMetrics {
+    pub rocksdb_iter_latency_seconds: HistogramVec, // iterator latency in seconds
+    pub rocksdb_iter_bytes: HistogramVec, // iterator size in bytes
+    pub rocksdb_iter_keys: HistogramVec, // number of keys per iterator
+    pub rocksdb_get_latency_seconds: HistogramVec, // get latency in seconds
+    pub rocksdb_get_bytes: HistogramVec, // bytes returned by a get call
+    pub rocksdb_multiget_latency_seconds: HistogramVec, // multiget latency in seconds
+    pub rocksdb_multiget_bytes: HistogramVec, // bytes returned by a multiget call
+    pub rocksdb_put_latency_seconds: HistogramVec, // put latency in seconds
+    pub rocksdb_put_bytes: HistogramVec, // bytes written by a put call
+    pub rocksdb_delete_latency_seconds: HistogramVec, // delete latency in seconds
+    pub rocksdb_deletes: IntCounterVec, // number of delete calls
+    pub rocksdb_batch_commit_latency_seconds: HistogramVec, // batch commit latency in seconds
+    pub rocksdb_batch_commit_bytes: HistogramVec, // batch commit size in bytes
+}
+
+impl OperationMetrics {
+    pub(crate) fn new(registry: &Registry) -> Self {
+        OperationMetrics {
+            rocksdb_iter_latency_seconds: register_histogram_vec_with_registry!(
+                "rocksdb_iter_latency_seconds",
+                "Rocksdb iter latency in seconds",
+                &["cf_name"],
+                LATENCY_SEC_BUCKETS.to_vec(),
+                registry,
+            )
+            .unwrap(), // a failed registration panics here, and likewise at every `unwrap` below
+            rocksdb_iter_bytes: register_histogram_vec_with_registry!(
+                "rocksdb_iter_bytes", // NOTE(review): no bucket list is passed for the byte/key histograms, unlike the latency ones - confirm the defaults suit these value ranges
+                "Rocksdb iter size in bytes",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            rocksdb_iter_keys: register_histogram_vec_with_registry!(
+                "rocksdb_iter_keys",
+                "Rocksdb iter num keys",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            rocksdb_get_latency_seconds: register_histogram_vec_with_registry!(
+                "rocksdb_get_latency_seconds",
+                "Rocksdb get latency in seconds",
+                &["cf_name"],
+                LATENCY_SEC_BUCKETS.to_vec(),
+                registry,
+            )
+            .unwrap(),
+            rocksdb_get_bytes: register_histogram_vec_with_registry!(
+                "rocksdb_get_bytes",
+                "Rocksdb get call returned data size in bytes",
+                &["cf_name"],
+                registry
+            )
+            .unwrap(),
+            rocksdb_multiget_latency_seconds: register_histogram_vec_with_registry!(
+                "rocksdb_multiget_latency_seconds",
+                "Rocksdb multiget latency in seconds",
+                &["cf_name"],
+                LATENCY_SEC_BUCKETS.to_vec(),
+                registry,
+            )
+            .unwrap(),
+            rocksdb_multiget_bytes: register_histogram_vec_with_registry!(
+                "rocksdb_multiget_bytes",
+                "Rocksdb multiget call returned data size in bytes",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            rocksdb_put_latency_seconds: register_histogram_vec_with_registry!(
+                "rocksdb_put_latency_seconds",
+                "Rocksdb put latency in seconds",
+                &["cf_name"],
+                LATENCY_SEC_BUCKETS.to_vec(),
+                registry,
+            )
+            .unwrap(),
+            rocksdb_put_bytes: register_histogram_vec_with_registry!(
+                "rocksdb_put_bytes",
+                "Rocksdb put call puts data size in bytes",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            rocksdb_delete_latency_seconds: register_histogram_vec_with_registry!(
+                "rocksdb_delete_latency_seconds",
+                "Rocksdb delete latency in seconds",
+                &["cf_name"],
+                LATENCY_SEC_BUCKETS.to_vec(),
+                registry,
+            )
+            .unwrap(),
+            rocksdb_deletes: register_int_counter_vec_with_registry!(
+                "rocksdb_deletes",
+                "Rocksdb delete calls",
+                &["cf_name"],
+                registry
+            )
+            .unwrap(),
+            rocksdb_batch_commit_latency_seconds: register_histogram_vec_with_registry!(
+                "rocksdb_write_batch_commit_latency_seconds", // exported name has an extra `write_` compared with the field name
+                "Rocksdb schema batch commit latency in seconds",
+                &["db_name"], // batch metrics are labelled by database name, not column family
+                LATENCY_SEC_BUCKETS.to_vec(),
+                registry,
+            )
+            .unwrap(),
+            rocksdb_batch_commit_bytes: register_histogram_vec_with_registry!(
+                "rocksdb_batch_commit_bytes",
+                "Rocksdb schema batch commit size in bytes",
+                &["db_name"],
+                registry,
+            )
+            .unwrap(),
+        }
+    }
+}
+
+pub struct RocksDBPerfContext; // guard value: its `Default` impl enables RocksDB perf stats and its `Drop` impl disables them
+
+impl Default for RocksDBPerfContext {
+    // Enables timed perf stats, then resets the calling thread's perf context.
+    fn default() -> Self {
+        set_perf_stats(PerfStatsLevel::EnableTime);
+        // The mutable `RefCell` borrow lives only for the duration of this closure.
+        PER_THREAD_ROCKS_PERF_CONTEXT.with(|cell| cell.borrow_mut().reset());
+        Self
+    }
+}
+
+impl Drop for RocksDBPerfContext {
+    fn drop(&mut self) {
+        set_perf_stats(PerfStatsLevel::Disable); // undoes the `EnableTime` level set when the guard was created
+    }
+}
+
+#[derive(Debug)] // counters for read-path perf-context values; comments below paraphrase the help text visible in `new`, the remaining fields are registered past this chunk
+pub struct ReadPerfContextMetrics {
+    pub user_key_comparison_count: IntCounterVec, // user key comparisons; a sharp rise can indicate an unexpected LSM-tree shape
+    pub block_cache_hit_count: IntCounterVec, // data block reads served from the block cache
+    pub block_read_count: IntCounterVec, // block reads that went to the file system
+    pub block_read_byte: IntCounterVec, // total bytes read from the file system
+    pub block_read_nanos: IntCounterVec, // nanos spent on block reads
+    pub block_checksum_nanos: IntCounterVec, // nanos spent verifying block checksums
+    pub block_decompress_nanos: IntCounterVec, // nanos spent decompressing blocks
+    pub get_read_bytes: IntCounterVec, // bytes of values returned by Get
+    pub multiget_read_bytes: IntCounterVec, // bytes of values returned by MultiGet
+    pub get_snapshot_nanos: IntCounterVec, // time spent getting a snapshot
+    pub get_from_memtable_nanos: IntCounterVec, // time spent reading from memtables
+    pub get_from_memtable_count: IntCounterVec, // number of memtables queried
+    pub get_post_process_nanos: IntCounterVec, // nanos spent after Get() finds a key
+    pub get_from_output_files_nanos: IntCounterVec, // nanos spent reading from output files
+    pub db_mutex_lock_nanos: IntCounterVec, // time spent acquiring the DB mutex
+    pub db_condition_wait_nanos: IntCounterVec, // time spent waiting on a DB-mutex condition variable
+    pub merge_operator_nanos: IntCounterVec, // time spent in the merge operator
+    pub read_index_block_nanos: IntCounterVec, // time spent reading index blocks from block cache or SST file
+    pub read_filter_block_nanos: IntCounterVec,
+    pub new_table_block_iter_nanos: IntCounterVec,
+    pub block_seek_nanos: IntCounterVec,
+    pub find_table_nanos: IntCounterVec,
+    pub bloom_memtable_hit_count: IntCounterVec,
+    pub bloom_memtable_miss_count: IntCounterVec,
+    pub bloom_sst_hit_count: IntCounterVec,
+    pub bloom_sst_miss_count: IntCounterVec,
+    pub key_lock_wait_time: IntCounterVec,
+    pub key_lock_wait_count: IntCounterVec,
+    pub internal_delete_skipped_count: IntCounterVec,
+    pub internal_skipped_count: IntCounterVec,
+}
+
+impl ReadPerfContextMetrics {
+    pub(crate) fn new(registry: &Registry) -> Self {
+        ReadPerfContextMetrics {
+            user_key_comparison_count: register_int_counter_vec_with_registry!(
+                "user_key_comparison_count",
+                "Helps us figure out whether too many comparisons in binary search can be a problem,
+                especially when a more expensive comparator is used. Moreover, since number of comparisons
+                is usually uniform based on the memtable size, the SST file size for Level 0 and size of other
+                levels, an significant increase of the counter can indicate unexpected LSM-tree shape.
+                You may want to check whether flush/compaction can keep up with the write speed",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            block_cache_hit_count: register_int_counter_vec_with_registry!(
+                "block_cache_hit_count",
+                "Tells us how many times we read data blocks from block cache, and block_read_count tells us how many
+                times we have to read blocks from the file system (either block cache is disabled or it is a cache miss).
+                We can evaluate the block cache efficiency by looking at the two counters over time.",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            block_read_count: register_int_counter_vec_with_registry!(
+                "block_read_count",
+                "Tells us how many times we have to read blocks from the file system (either block cache is disabled or it is a cache miss)",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            block_read_byte: register_int_counter_vec_with_registry!(
+                "block_read_byte",
+                "Tells us how many total bytes we read from the file system. It can tell us whether a slow query can be caused by reading
+                large blocks from the file system. Index and bloom filter blocks are usually large blocks. A large block can also be the result
+                of a very large key or value",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            block_read_nanos: register_int_counter_vec_with_registry!(
+                "block_read_nanos",
+                "Total nanos spent on block reads",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            block_checksum_nanos: register_int_counter_vec_with_registry!(
+                "block_checksum_nanos",
+                "Total nanos spent on verifying block checksum",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            block_decompress_nanos: register_int_counter_vec_with_registry!(
+                "block_decompress_nanos",
+                "Total nanos spent on decompressing a block",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            get_read_bytes: register_int_counter_vec_with_registry!(
+                "get_read_bytes",
+                "Total bytes for values returned by Get",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            multiget_read_bytes: register_int_counter_vec_with_registry!(
+                "multiget_read_bytes",
+                "Total bytes for values returned by MultiGet.",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            get_snapshot_nanos: register_int_counter_vec_with_registry!(
+                "get_snapshot_nanos",
+                "Time spent in getting snapshot.",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            get_from_memtable_nanos: register_int_counter_vec_with_registry!(
+                "get_from_memtable_nanos",
+                "Time spent on reading data from memtable.",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            get_from_memtable_count: register_int_counter_vec_with_registry!(
+                "get_from_memtable_count",
+                "Number of memtables queried",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            get_post_process_nanos: register_int_counter_vec_with_registry!(
+                "get_post_process_nanos",
+                "Total nanos spent after Get() finds a key",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            get_from_output_files_nanos: register_int_counter_vec_with_registry!(
+                "get_from_output_files_nanos",
+                "Total nanos reading from output files",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            db_mutex_lock_nanos: register_int_counter_vec_with_registry!(
+                "db_mutex_lock_nanos",
+                "Time spent on acquiring db mutex",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            db_condition_wait_nanos: register_int_counter_vec_with_registry!(
+                "db_condition_wait_nanos",
+                "Time spent waiting with a condition variable created with DB Mutex.",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            merge_operator_nanos: register_int_counter_vec_with_registry!(
+                "merge_operator_nanos",
+                "Time spent on merge operator.",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            read_index_block_nanos: register_int_counter_vec_with_registry!(
+                "read_index_block_nanos",
+                "Time spent on reading index block from block cache or SST file",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            read_filter_block_nanos: register_int_counter_vec_with_registry!(
+                "read_filter_block_nanos",
+                "Time spent on reading filter block from block cache or SST file",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            new_table_block_iter_nanos: register_int_counter_vec_with_registry!(
+                "new_table_block_iter_nanos",
+                "Time spent on creating data block iterator",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            block_seek_nanos: register_int_counter_vec_with_registry!(
+                "block_seek_nanos",
+                "Time spent on seeking a key in data/index blocks",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            find_table_nanos: register_int_counter_vec_with_registry!(
+                "find_table_nanos",
+                "Time spent on finding or creating a table reader",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            bloom_memtable_hit_count: register_int_counter_vec_with_registry!(
+                "bloom_memtable_hit_count",
+                "Total number of mem table bloom hits",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            bloom_memtable_miss_count: register_int_counter_vec_with_registry!(
+                "bloom_memtable_miss_count",
+                "Total number of mem table bloom misses",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            bloom_sst_hit_count: register_int_counter_vec_with_registry!(
+                "bloom_sst_hit_count",
+                "Total number of SST table bloom hits",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            bloom_sst_miss_count: register_int_counter_vec_with_registry!(
+                "bloom_sst_miss_count",
+                "Total number of SST table bloom misses",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            key_lock_wait_time: register_int_counter_vec_with_registry!(
+                "key_lock_wait_time",
+                "Time spent waiting on key locks in transaction lock manager",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            key_lock_wait_count: register_int_counter_vec_with_registry!(
+                "key_lock_wait_count",
+                "Number of times acquiring a lock was blocked by another transaction",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            internal_delete_skipped_count: register_int_counter_vec_with_registry!(
+                "internal_delete_skipped_count",
+                "Total number of deleted keys skipped during iteration",
+                &["cf_name"],
+                registry,
+            )
+                .unwrap(),
+            internal_skipped_count: register_int_counter_vec_with_registry!(
+                "internal_skipped_count",
+                "Totall number of internal keys skipped during iteration",
+                &["cf_name"],
+                registry,
+            )
+                .unwrap(),
+        }
+    }
+
+    /// Turns off perf-stat collection, then adds each read counter of the
+    /// per-thread RocksDB perf context, exactly once, to the matching
+    /// Prometheus counter labelled with `cf_name`.
+    pub fn report_metrics(&self, cf_name: &str) {
+        PER_THREAD_ROCKS_PERF_CONTEXT.with(|perf_context_cell| {
+            set_perf_stats(PerfStatsLevel::Disable);
+            let perf_context = perf_context_cell.borrow();
+            self.user_key_comparison_count
+                .with_label_values(&[cf_name])
+                .inc_by(perf_context.metric(PerfMetric::UserKeyComparisonCount));
+            self.block_cache_hit_count
+                .with_label_values(&[cf_name])
+                .inc_by(perf_context.metric(PerfMetric::BlockCacheHitCount));
+            self.block_read_count
+                .with_label_values(&[cf_name])
+                .inc_by(perf_context.metric(PerfMetric::BlockReadCount));
+            self.block_read_byte
+                .with_label_values(&[cf_name])
+                .inc_by(perf_context.metric(PerfMetric::BlockReadByte));
+            self.block_read_nanos
+                .with_label_values(&[cf_name])
+                .inc_by(perf_context.metric(PerfMetric::BlockReadTime));
+            self.block_checksum_nanos
+                .with_label_values(&[cf_name])
+                .inc_by(perf_context.metric(PerfMetric::BlockChecksumTime));
+            self.block_decompress_nanos
+                .with_label_values(&[cf_name])
+                .inc_by(perf_context.metric(PerfMetric::BlockDecompressTime));
+            self.get_read_bytes
+                .with_label_values(&[cf_name])
+                .inc_by(perf_context.metric(PerfMetric::GetReadBytes));
+            self.multiget_read_bytes
+                .with_label_values(&[cf_name])
+                .inc_by(perf_context.metric(PerfMetric::MultigetReadBytes));
+            self.get_snapshot_nanos
+                .with_label_values(&[cf_name])
+                .inc_by(perf_context.metric(PerfMetric::GetSnapshotTime));
+            self.get_from_memtable_nanos
+                .with_label_values(&[cf_name])
+                .inc_by(perf_context.metric(PerfMetric::GetFromMemtableTime));
+            self.get_from_memtable_count
+                .with_label_values(&[cf_name])
+                .inc_by(perf_context.metric(PerfMetric::GetFromMemtableCount));
+            self.get_post_process_nanos
+                .with_label_values(&[cf_name])
+                .inc_by(perf_context.metric(PerfMetric::GetPostProcessTime));
+            self.get_from_output_files_nanos
+                .with_label_values(&[cf_name])
+                .inc_by(perf_context.metric(PerfMetric::GetFromOutputFilesTime));
+            self.db_mutex_lock_nanos
+                .with_label_values(&[cf_name])
+                .inc_by(perf_context.metric(PerfMetric::DbMutexLockNanos));
+            self.db_condition_wait_nanos
+                .with_label_values(&[cf_name])
+                .inc_by(perf_context.metric(PerfMetric::DbConditionWaitNanos));
+            self.merge_operator_nanos
+                .with_label_values(&[cf_name])
+                .inc_by(perf_context.metric(PerfMetric::MergeOperatorTimeNanos));
+            self.read_index_block_nanos
+                .with_label_values(&[cf_name])
+                .inc_by(perf_context.metric(PerfMetric::ReadIndexBlockNanos));
+            self.read_filter_block_nanos
+                .with_label_values(&[cf_name])
+                .inc_by(perf_context.metric(PerfMetric::ReadFilterBlockNanos));
+            self.new_table_block_iter_nanos
+                .with_label_values(&[cf_name])
+                .inc_by(perf_context.metric(PerfMetric::NewTableBlockIterNanos));
+            self.block_seek_nanos
+                .with_label_values(&[cf_name])
+                .inc_by(perf_context.metric(PerfMetric::BlockSeekNanos));
+            self.find_table_nanos
+                .with_label_values(&[cf_name])
+                .inc_by(perf_context.metric(PerfMetric::FindTableNanos));
+            self.bloom_memtable_hit_count
+                .with_label_values(&[cf_name])
+                .inc_by(perf_context.metric(PerfMetric::BloomMemtableHitCount));
+            self.bloom_memtable_miss_count
+                .with_label_values(&[cf_name])
+                .inc_by(perf_context.metric(PerfMetric::BloomMemtableMissCount));
+            self.bloom_sst_hit_count
+                .with_label_values(&[cf_name])
+                .inc_by(perf_context.metric(PerfMetric::BloomSstHitCount));
+            self.bloom_sst_miss_count
+                .with_label_values(&[cf_name])
+                .inc_by(perf_context.metric(PerfMetric::BloomSstMissCount));
+            self.key_lock_wait_time
+                .with_label_values(&[cf_name])
+                .inc_by(perf_context.metric(PerfMetric::KeyLockWaitTime));
+            self.key_lock_wait_count
+                .with_label_values(&[cf_name])
+                .inc_by(perf_context.metric(PerfMetric::KeyLockWaitCount));
+            self.internal_delete_skipped_count
+                .with_label_values(&[cf_name])
+                .inc_by(perf_context.metric(PerfMetric::InternalDeleteSkippedCount));
+            self.internal_skipped_count
+                .with_label_values(&[cf_name])
+                .inc_by(perf_context.metric(PerfMetric::InternalKeySkippedCount));
+        });
+    }
+}
+
+#[derive(Debug)]
+pub struct WritePerfContextMetrics {
+    pub write_wal_nanos: IntCounterVec,
+    pub write_memtable_nanos: IntCounterVec,
+    pub write_delay_nanos: IntCounterVec,
+    pub write_pre_and_post_process_nanos: IntCounterVec,
+    pub write_db_mutex_lock_nanos: IntCounterVec,
+    pub write_db_condition_wait_nanos: IntCounterVec,
+    pub write_key_lock_wait_nanos: IntCounterVec,
+    pub write_key_lock_wait_count: IntCounterVec,
+}
+
+impl WritePerfContextMetrics {
+    pub(crate) fn new(registry: &Registry) -> Self {
+        WritePerfContextMetrics {
+            write_wal_nanos: register_int_counter_vec_with_registry!(
+                "write_wal_nanos",
+                "Total nanos spent on writing to WAL",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            write_memtable_nanos: register_int_counter_vec_with_registry!(
+                "write_memtable_nanos",
+                "Total nanos spent on writing to memtable",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            write_delay_nanos: register_int_counter_vec_with_registry!(
+                "write_delay_nanos",
+                "Total nanos spent on delaying or throttling write",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            write_pre_and_post_process_nanos: register_int_counter_vec_with_registry!(
+                "write_pre_and_post_process_nanos",
+                "Total nanos spent on writing a record, excluding the above four things",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            write_db_mutex_lock_nanos: register_int_counter_vec_with_registry!(
+                "write_db_mutex_lock_nanos",
+                "Time spent on acquiring db mutex",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            write_db_condition_wait_nanos: register_int_counter_vec_with_registry!(
+                "write_db_condition_wait_nanos",
+                "Time spent waiting with a condition variable created with DB Mutex.",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            write_key_lock_wait_nanos: register_int_counter_vec_with_registry!(
+                "write_key_lock_wait_time",
+                "Time spent waiting on key locks in transaction lock manager",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+            write_key_lock_wait_count: register_int_counter_vec_with_registry!(
+                "write_key_lock_wait_count",
+                "Number of times acquiring a lock was blocked by another transaction",
+                &["cf_name"],
+                registry,
+            )
+            .unwrap(),
+        }
+    }
+    pub fn report_metrics(&self, db_name: &str) {
+        PER_THREAD_ROCKS_PERF_CONTEXT.with(|perf_context_cell| {
+            set_perf_stats(PerfStatsLevel::Disable);
+            let perf_context = perf_context_cell.borrow();
+            self.write_wal_nanos
+                .with_label_values(&[db_name])
+                .inc_by(perf_context.metric(PerfMetric::WriteWalTime));
+            self.write_memtable_nanos
+                .with_label_values(&[db_name])
+                .inc_by(perf_context.metric(PerfMetric::WriteMemtableTime));
+            self.write_delay_nanos
+                .with_label_values(&[db_name])
+                .inc_by(perf_context.metric(PerfMetric::WriteDelayTime));
+            self.write_pre_and_post_process_nanos
+                .with_label_values(&[db_name])
+                .inc_by(perf_context.metric(PerfMetric::WritePreAndPostProcessTime));
+            self.write_db_mutex_lock_nanos
+                .with_label_values(&[db_name])
+                .inc_by(perf_context.metric(PerfMetric::DbMutexLockNanos));
+            self.write_db_condition_wait_nanos
+                .with_label_values(&[db_name])
+                .inc_by(perf_context.metric(PerfMetric::DbConditionWaitNanos));
+            self.write_key_lock_wait_nanos
+                .with_label_values(&[db_name])
+                .inc_by(perf_context.metric(PerfMetric::KeyLockWaitTime));
+            self.write_key_lock_wait_count
+                .with_label_values(&[db_name])
+                .inc_by(perf_context.metric(PerfMetric::KeyLockWaitCount));
+        });
+    }
+}
+
+#[derive(Debug)]
+pub struct DBMetrics {
+    pub op_metrics: OperationMetrics,
+    pub cf_metrics: ColumnFamilyMetrics,
+    pub read_perf_ctx_metrics: ReadPerfContextMetrics,
+    pub write_perf_ctx_metrics: WritePerfContextMetrics,
+}
+
+static ONCE: OnceCell<Arc<DBMetrics>> = OnceCell::new();
+
+impl DBMetrics {
+    fn new(registry: &Registry) -> Self {
+        DBMetrics {
+            op_metrics: OperationMetrics::new(registry),
+            cf_metrics: ColumnFamilyMetrics::new(registry),
+            read_perf_ctx_metrics: ReadPerfContextMetrics::new(registry),
+            write_perf_ctx_metrics: WritePerfContextMetrics::new(registry),
+        }
+    }
+    pub fn init(registry: &Registry) -> &'static Arc<DBMetrics> {
+        // Initialize this before creating any instance of DBMap.
+        // TODO: Remove the static initialization: it means db metrics can only
+        // ever be initialized once, with a single registry, whereas callers may
+        // want to initialize them with different registries. The underlying
+        // metrics cannot be registered twice or prometheus complains, so the
+        // right fix is to pass a DBMetrics instance in everywhere a DBMap is
+        // created.
+        let _ = ONCE
+            .set(Arc::new(DBMetrics::new(registry)))
+            // `set` fails if ONCE is already filled; this happens many times during tests
+            .tap_err(|_| warn!("DBMetrics registry overwritten"));
+        ONCE.get().unwrap()
+    }
+    pub fn get() -> &'static Arc<DBMetrics> {
+        ONCE.get()
+            .unwrap_or_else(|| DBMetrics::init(prometheus::default_registry()))
+    }
+}
diff --git a/moveos/raw-store/src/rocks/errors.rs b/moveos/raw-store/src/rocks/errors.rs
new file mode 100644
index 000000000..393715338
--- /dev/null
+++ b/moveos/raw-store/src/rocks/errors.rs
@@ -0,0 +1,126 @@
+// Copyright (c) RoochNetwork
+// SPDX-License-Identifier: Apache-2.0
+
+// Copyright (c) Mysten Labs, Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+use bincode::ErrorKind as BincodeErrorKind;
+
+use rocksdb::Error as RocksError;
+use serde::{Deserialize, Serialize};
+use std::{fmt, fmt::Display};
+use thiserror::Error;
+
+#[non_exhaustive]
+#[derive(Error, Debug, Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Ord, PartialOrd)]
+pub enum RawStoreError {
+    #[error("rocksdb error: {0}")]
+    RocksDBError(String),
+    #[error("(de)serialization error: {0}")]
+    SerializationError(String),
+    #[error("the column family {0} was not registered with the database")]
+    UnregisteredColumn(String),
+    #[error("a batch operation can't operate across databases")]
+    CrossDBBatch,
+    #[error("Metric reporting thread failed with error")]
+    MetricsReporting,
+    #[error("Transaction should be retried")]
+    RetryableTransactionError,
+}
+
+#[derive(Serialize, Deserialize, Clone, Eq, PartialEq, Hash, Debug, Error)]
+pub(crate) struct RocksErrorDef {
+    message: String,
+}
+
+impl From<RocksError> for RocksErrorDef {
+    fn from(err: RocksError) -> Self {
+        RocksErrorDef {
+            message: err.as_ref().to_string(),
+        }
+    }
+}
+
+impl From<RocksError> for RawStoreError {
+    fn from(err: RocksError) -> Self {
+        RawStoreError::RocksDBError(format!("{err}"))
+    }
+}
+
+impl Display for RocksErrorDef {
+    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
+        self.message.fmt(formatter)
+    }
+}
+
+#[derive(Serialize, Deserialize, Clone, Hash, Eq, PartialEq, Debug, Error)]
+pub(crate) enum BincodeErrorDef {
+    Io(String),
+    InvalidUtf8Encoding(String),
+    InvalidBoolEncoding(u8),
+    InvalidCharEncoding,
+    InvalidTagEncoding(usize),
+    DeserializeAnyNotSupported,
+    SizeLimit,
+    SequenceMustHaveLength,
+    Custom(String),
+}
+
+impl fmt::Display for BincodeErrorDef {
+    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match *self {
+            BincodeErrorDef::Io(ref ioerr) => write!(fmt, "io error: {ioerr}"),
+            BincodeErrorDef::InvalidUtf8Encoding(ref e) => {
+                write!(fmt, "{e}")
+            }
+            BincodeErrorDef::InvalidBoolEncoding(b) => {
+                write!(fmt, "expected 0 or 1, found {b}")
+            }
+            BincodeErrorDef::InvalidCharEncoding => write!(fmt, "{self:?}"),
+            BincodeErrorDef::InvalidTagEncoding(tag) => {
+                write!(fmt, "found {tag}")
+            }
+            BincodeErrorDef::SequenceMustHaveLength => write!(fmt, "{self:?}"),
+            BincodeErrorDef::SizeLimit => write!(fmt, "{self:?}"),
+            BincodeErrorDef::DeserializeAnyNotSupported => write!(
+                fmt,
+                "Bincode does not support the serde::Deserializer::deserialize_any method"
+            ),
+            BincodeErrorDef::Custom(ref s) => s.fmt(fmt),
+        }
+    }
+}
+
+impl From<bincode::Error> for BincodeErrorDef {
+    fn from(err: bincode::Error) -> Self {
+        match err.as_ref() {
+            BincodeErrorKind::Io(ioerr) => BincodeErrorDef::Io(ioerr.to_string()),
+            BincodeErrorKind::InvalidUtf8Encoding(utf8err) => {
+                BincodeErrorDef::InvalidUtf8Encoding(utf8err.to_string())
+            }
+            BincodeErrorKind::InvalidBoolEncoding(byte) => {
+                BincodeErrorDef::InvalidBoolEncoding(*byte)
+            }
+            BincodeErrorKind::InvalidCharEncoding => BincodeErrorDef::InvalidCharEncoding,
+            BincodeErrorKind::InvalidTagEncoding(tag) => BincodeErrorDef::InvalidTagEncoding(*tag),
+            BincodeErrorKind::DeserializeAnyNotSupported => {
+                BincodeErrorDef::DeserializeAnyNotSupported
+            }
+            BincodeErrorKind::SizeLimit => BincodeErrorDef::SizeLimit,
+            BincodeErrorKind::SequenceMustHaveLength => BincodeErrorDef::SequenceMustHaveLength,
+            BincodeErrorKind::Custom(str) => BincodeErrorDef::Custom(str.to_owned()),
+        }
+    }
+}
+
+impl From<bcs::Error> for RawStoreError {
+    fn from(err: bcs::Error) -> Self {
+        RawStoreError::SerializationError(format!("{err}"))
+    }
+}
+
+impl From<bincode::Error> for RawStoreError {
+    fn from(err: bincode::Error) -> Self {
+        RawStoreError::SerializationError(format!("{err}"))
+    }
+}
diff --git a/moveos/raw-store/src/rocks/iter.rs b/moveos/raw-store/src/rocks/iter.rs
new file mode 100644
index 000000000..25114e67c
--- /dev/null
+++ b/moveos/raw-store/src/rocks/iter.rs
@@ -0,0 +1,171 @@
+// Copyright (c) RoochNetwork
+// SPDX-License-Identifier: Apache-2.0
+
+// Copyright (c) Mysten Labs, Inc.
+// SPDX-License-Identifier: Apache-2.0
+use std::marker::PhantomData;
+use std::sync::Arc;
+
+use bincode::Options;
+use prometheus::{Histogram, HistogramTimer};
+use rocksdb::Direction;
+
+use super::{be_fix_int_ser, errors::RawStoreError, RocksDBRawIter};
+use crate::metrics::DBMetrics;
+use crate::metrics::RocksDBPerfContext;
+use serde::{de::DeserializeOwned, Serialize};
+
+/// An iterator over all key-value pairs in a data map.
+pub struct Iter<'a, K, V> {
+    cf_name: String,
+    db_iter: RocksDBRawIter<'a>,
+    // *const here is an equivalent to `impl !Send for Iter` (which is not a stable feature at the moment)
+    _phantom: PhantomData<*const (K, V)>,
+    direction: Direction,
+    is_initialized: bool,
+    _timer: Option<HistogramTimer>,
+    _perf_ctx: Option<RocksDBPerfContext>,
+    bytes_scanned: Option<Histogram>,
+    keys_scanned: Option<Histogram>,
+    db_metrics: Option<Arc<DBMetrics>>,
+    bytes_scanned_counter: usize,
+    keys_returned_counter: usize,
+}
+
+impl<'a, K: DeserializeOwned, V: DeserializeOwned> Iter<'a, K, V> {
+    pub(super) fn new(
+        cf_name: String,
+        db_iter: RocksDBRawIter<'a>,
+        _timer: Option<HistogramTimer>,
+        _perf_ctx: Option<RocksDBPerfContext>,
+        bytes_scanned: Option<Histogram>,
+        keys_scanned: Option<Histogram>,
+        db_metrics: Option<Arc<DBMetrics>>,
+    ) -> Self {
+        Self {
+            cf_name,
+            db_iter,
+            _phantom: PhantomData,
+            direction: Direction::Forward,
+            is_initialized: false,
+            _timer,
+            _perf_ctx,
+            bytes_scanned,
+            keys_scanned,
+            db_metrics,
+            bytes_scanned_counter: 0,
+            keys_returned_counter: 0,
+        }
+    }
+}
+
+impl<'a, K: DeserializeOwned, V: DeserializeOwned> Iterator for Iter<'a, K, V> {
+    type Item = (K, V);
+
+    fn next(&mut self) -> Option<Self::Item> {
+        // For backward compatibility, an iterator that was never positioned
+        // explicitly starts at the first entry of the column family.
+        if !self.is_initialized {
+            self.db_iter.seek_to_first();
+            self.is_initialized = true;
+        }
+        if !self.db_iter.valid() {
+            return None;
+        }
+        let raw_key = self
+            .db_iter
+            .key()
+            .expect("Valid iterator failed to get key");
+        let raw_value = self
+            .db_iter
+            .value()
+            .expect("Valid iterator failed to get value");
+        self.bytes_scanned_counter += raw_key.len() + raw_value.len();
+        self.keys_returned_counter += 1;
+        // Decode both halves before moving the cursor in the configured direction.
+        let key_codec = bincode::DefaultOptions::new()
+            .with_big_endian()
+            .with_fixint_encoding();
+        let key = key_codec.deserialize(raw_key).ok();
+        let value = bcs::from_bytes(raw_value).ok();
+        match self.direction {
+            Direction::Forward => self.db_iter.next(),
+            Direction::Reverse => self.db_iter.prev(),
+        }
+        key.zip(value)
+    }
+}
+
+impl<'a, K, V> Drop for Iter<'a, K, V> {
+    fn drop(&mut self) {
+        if let Some(bytes_scanned) = self.bytes_scanned.take() {
+            bytes_scanned.observe(self.bytes_scanned_counter as f64);
+        }
+        if let Some(keys_scanned) = self.keys_scanned.take() {
+            keys_scanned.observe(self.keys_returned_counter as f64);
+        }
+        if let Some(db_metrics) = self.db_metrics.take() {
+            db_metrics
+                .read_perf_ctx_metrics
+                .report_metrics(&self.cf_name);
+        }
+    }
+}
+
+impl<'a, K: Serialize, V> Iter<'a, K, V> {
+    /// Skips all the elements that are smaller than the given key,
+    /// and either lands on the key or on the first one greater than
+    /// the key.
+    pub fn skip_to(mut self, key: &K) -> Result<Self, RawStoreError> {
+        self.is_initialized = true;
+        self.db_iter.seek(be_fix_int_ser(key)?);
+        Ok(self)
+    }
+
+    /// Moves the iterator to the given element, or to
+    /// the one prior to it if it does not exist. If there is
+    /// no element prior to it, it returns an empty iterator.
+    pub fn skip_prior_to(mut self, key: &K) -> Result<Self, RawStoreError> {
+        self.is_initialized = true;
+        self.db_iter.seek_for_prev(be_fix_int_ser(key)?);
+        Ok(self)
+    }
+
+    /// Seeks to the last key in the database (at this column family).
+    pub fn skip_to_last(mut self) -> Self {
+        self.is_initialized = true;
+        self.db_iter.seek_to_last();
+        self
+    }
+
+    /// Reverses the direction of the iteration and wraps the iterator
+    /// in a new `RevIter` to consume. Every call to its `next` method
+    /// then steps backwards instead of forwards.
+    pub fn reverse(mut self) -> RevIter<'a, K, V> {
+        self.direction = Direction::Reverse;
+        RevIter::new(self)
+    }
+}
+
+/// An iterator running in the reverse direction of the original. The `RevIter`
+/// hosts an `Iter` whose iteration is consumed in the opposite direction.
+/// It's not possible to do further manipulation (e.g. re-reversing) of the
+/// iterator.
+pub struct RevIter<'a, K, V> {
+    iter: Iter<'a, K, V>,
+}
+
+impl<'a, K, V> RevIter<'a, K, V> {
+    fn new(iter: Iter<'a, K, V>) -> Self {
+        Self { iter }
+    }
+}
+
+impl<'a, K: DeserializeOwned, V: DeserializeOwned> Iterator for RevIter<'a, K, V> {
+    type Item = (K, V);
+
+    /// Will give the next item backwards
+    fn next(&mut self) -> Option<Self::Item> {
+        self.iter.next()
+    }
+}
diff --git a/moveos/raw-store/src/rocks/keys.rs b/moveos/raw-store/src/rocks/keys.rs
new file mode 100644
index 000000000..6469fb2f7
--- /dev/null
+++ b/moveos/raw-store/src/rocks/keys.rs
@@ -0,0 +1,71 @@
+// Copyright (c) RoochNetwork
+// SPDX-License-Identifier: Apache-2.0
+
+// Copyright (c) Mysten Labs, Inc.
+// SPDX-License-Identifier: Apache-2.0
+use bincode::Options;
+
+use serde::{de::DeserializeOwned, Serialize};
+use std::marker::PhantomData;
+
+use super::{be_fix_int_ser, RawStoreError, RocksDBRawIter};
+
+/// An iterator over the keys of a prefix.
+pub struct Keys<'a, K> {
+    db_iter: RocksDBRawIter<'a>,
+    _phantom: PhantomData<K>,
+}
+
+impl<'a, K: DeserializeOwned> Keys<'a, K> {
+    pub(crate) fn new(db_iter: RocksDBRawIter<'a>) -> Self {
+        Self {
+            db_iter,
+            _phantom: PhantomData,
+        }
+    }
+}
+
+impl<'a, K: DeserializeOwned> Iterator for Keys<'a, K> {
+    type Item = Result<K, RawStoreError>;
+
+    /// Yields the next key; a key that fails to decode or a RocksDB iterator
+    /// error is surfaced as `Err` rather than silently ending the iteration.
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.db_iter.valid() {
+            let config = bincode::DefaultOptions::new()
+                .with_big_endian()
+                .with_fixint_encoding();
+            let key = self.db_iter.key().map(|k| config.deserialize(k));
+            self.db_iter.next();
+            key.map(|decoded| decoded.map_err(RawStoreError::from))
+        } else {
+            let status = self.db_iter.status();
+            status.err().map(|err| Err(RawStoreError::from(err)))
+        }
+    }
+}
+
+impl<'a, K: Serialize> Keys<'a, K> {
+    /// Skips all the elements that are smaller than the given key,
+    /// and either lands on the key or on the first one greater than
+    /// the key.
+    pub fn skip_to(mut self, key: &K) -> Result<Self, RawStoreError> {
+        self.db_iter.seek(be_fix_int_ser(key)?);
+        Ok(self)
+    }
+
+    /// Moves the iterator to the given element, or to
+    /// the one prior to it if it does not exist. If there is
+    /// no element prior to it, it returns an empty iterator.
+    pub fn skip_prior_to(mut self, key: &K) -> Result<Self, RawStoreError> {
+        self.db_iter.seek_for_prev(be_fix_int_ser(key)?);
+        Ok(self)
+    }
+
+    /// Seeks to the last key in the database (at this column family)
+    /// and hands the repositioned iterator back to the caller.
+    pub fn skip_to_last(mut self) -> Self {
+        self.db_iter.seek_to_last();
+        self
+    }
+}
diff --git a/moveos/raw-store/src/rocks/mod.rs b/moveos/raw-store/src/rocks/mod.rs
new file mode 100644
index 000000000..154fa4c12
--- /dev/null
+++ b/moveos/raw-store/src/rocks/mod.rs
@@ -0,0 +1,2406 @@
+// Copyright (c) RoochNetwork
+// SPDX-License-Identifier: Apache-2.0
+
+// Copyright (c) Mysten Labs, Inc.
+// SPDX-License-Identifier: Apache-2.0
+pub mod errors;
+pub(crate) mod iter;
+pub(crate) mod keys;
+pub(crate) mod safe_iter;
+pub mod util;
+pub(crate) mod values;
+
+use crate::{
+    metrics::{DBMetrics, RocksDBPerfContext, SamplingInterval},
+    traits::{Map, TableSummary},
+};
+use bincode::Options;
+use collectable::TryExtend;
+use itertools::Itertools;
+use rocksdb::{
+    checkpoint::Checkpoint, BlockBasedOptions, BottommostLevelCompaction, Cache, CompactOptions,
+    LiveFile, OptimisticTransactionDB, SnapshotWithThreadMode,
+};
+use rocksdb::{
+    properties, AsColumnFamilyRef, CStrLike, ColumnFamilyDescriptor, DBWithThreadMode, Error,
+    ErrorKind, IteratorMode, MultiThreaded, OptimisticTransactionOptions, ReadOptions, Transaction,
+    WriteBatch, WriteBatchWithTransaction, WriteOptions,
+};
+use serde::{de::DeserializeOwned, Serialize};
+use std::{
+    borrow::Borrow,
+    collections::BTreeMap,
+    env,
+    marker::PhantomData,
+    path::{Path, PathBuf},
+    sync::Arc,
+    time::Duration,
+};
+use std::{collections::HashSet, ffi::CStr};
+use tap::TapFallible;
+use tokio::sync::oneshot;
+use tracing::{error, info, instrument, warn};
+
+use self::{iter::Iter, keys::Keys, values::Values};
+use crate::rocks::safe_iter::SafeIter;
+use crate::{fail_point, nondeterministic};
+pub use errors::RawStoreError;
+
+// Write buffer size per RocksDB instance can be set via the env var below.
+// If the env var is not set, use the default value in MiB.
+const ENV_VAR_DB_WRITE_BUFFER_SIZE: &str = "DB_WRITE_BUFFER_SIZE_MB";
+const DEFAULT_DB_WRITE_BUFFER_SIZE: usize = 1024;
+
+// Write ahead log size per RocksDB instance can be set via the env var below.
+// If the env var is not set, use the default value in MiB.
+const ENV_VAR_DB_WAL_SIZE: &str = "DB_WAL_SIZE_MB";
+const DEFAULT_DB_WAL_SIZE: usize = 1024;
+
+// Environment variable to control behavior of write throughput optimized tables.
+const ENV_VAR_L0_NUM_FILES_COMPACTION_TRIGGER: &str = "L0_NUM_FILES_COMPACTION_TRIGGER";
+const DEFAULT_L0_NUM_FILES_COMPACTION_TRIGGER: usize = 6;
+const ENV_VAR_MAX_WRITE_BUFFER_SIZE_MB: &str = "MAX_WRITE_BUFFER_SIZE_MB";
+const DEFAULT_MAX_WRITE_BUFFER_SIZE_MB: usize = 256;
+const ENV_VAR_MAX_WRITE_BUFFER_NUMBER: &str = "MAX_WRITE_BUFFER_NUMBER";
+const DEFAULT_MAX_WRITE_BUFFER_NUMBER: usize = 6;
+const ENV_VAR_TARGET_FILE_SIZE_BASE_MB: &str = "TARGET_FILE_SIZE_BASE_MB";
+const DEFAULT_TARGET_FILE_SIZE_BASE_MB: usize = 128;
+
+// Set to 1 to disable blob storage for transactions and effects.
+const ENV_VAR_DISABLE_BLOB_STORAGE: &str = "DISABLE_BLOB_STORAGE";
+
+const ENV_VAR_MAX_BACKGROUND_JOBS: &str = "MAX_BACKGROUND_JOBS";
+
+// TODO: remove this after Rust rocksdb has the TOTAL_BLOB_FILES_SIZE property built-in.
+// From https://github.com/facebook/rocksdb/blob/bd80433c73691031ba7baa65c16c63a83aef201a/include/rocksdb/db.h#L1169
+// SAFETY: the byte string below ends with exactly one NUL byte and contains no
+// interior NUL, which is the precondition of `CStr::from_bytes_with_nul_unchecked`.
+const ROCKSDB_PROPERTY_TOTAL_BLOB_FILES_SIZE: &CStr =
+    unsafe { CStr::from_bytes_with_nul_unchecked("rocksdb.total-blob-file-size\0".as_bytes()) };
+
+#[cfg(test)]
+mod tests;
+
+/// A helper macro to reopen multiple column families. The macro returns
+/// a tuple of `DBMap` structs in the same order that the column families
+/// are listed.
+///
+/// The macro refers to `DBMap` and `ReadWriteOptions` through `$crate`, so
+/// callers do not need to import them for the expansion to compile.
+///
+/// # Arguments
+///
+/// * `db` - a `&Arc<RocksDB>`, passed unchanged to `DBMap::reopen`
+/// * `cf;<ty,ty>` - a comma separated list of column families to open. For each
+/// column family a concatenation of column family name (cf) and Key-Value <ty, ty>
+/// should be provided.
+///
+/// # Panics
+///
+/// Panics with `Cannot open <cf> CF.` if any of the column families cannot
+/// be reopened.
+///
+/// # Examples
+///
+/// We successfully open two different column families.
+/// ```
+/// use raw_store::reopen;
+/// use raw_store::rocks::*;
+/// use tempfile::tempdir;
+/// use prometheus::Registry;
+/// use std::sync::Arc;
+/// use raw_store::metrics::DBMetrics;
+/// use core::fmt::Error;
+///
+/// #[tokio::main]
+/// async fn main() -> Result<(), Error> {
+/// const FIRST_CF: &str = "First_CF";
+/// const SECOND_CF: &str = "Second_CF";
+///
+///
+/// // Create the rocks database reference for the desired column families
+/// let rocks = open_cf(tempdir().unwrap(), None, MetricConf::default(), &[FIRST_CF, SECOND_CF]).unwrap();
+///
+/// // Now simply open all the column families for their expected Key-Value types
+/// let (db_map_1, db_map_2) = reopen!(&rocks, FIRST_CF;<i32, String>, SECOND_CF;<i32, String>);
+/// Ok(())
+/// }
+/// ```
+///
+#[macro_export]
+macro_rules! reopen {
+    ( $db:expr, $($cf:expr;<$K:ty, $V:ty>),*) => {
+        (
+            $(
+                $crate::rocks::DBMap::<$K, $V>::reopen(
+                    $db,
+                    Some($cf),
+                    &$crate::rocks::ReadWriteOptions::default(),
+                )
+                // Build the panic message only on failure; the text matches
+                // what `expect` would have produced.
+                .unwrap_or_else(|e| panic!("Cannot open {} CF.: {:?}", $cf, e))
+            ),*
+        )
+    };
+}
+
+/// Repeatedly attempt an Optimistic Transaction until it stops failing with
+/// `RawStoreError::RetryableTransactionError` or the retry budget is spent.
+///
+/// * `retry_transaction!(tx)` allows at most 20 retries.
+/// * `retry_transaction!(tx, max_retries)` takes an `Option` of an integer:
+///   `Some(n)` allows `n` retries, `None` retries without limit (see
+///   `retry_transaction_forever!`).
+///
+/// The macro evaluates to the result of the last attempt. `$transaction` is
+/// re-evaluated on every attempt, and a random delay of 0..50 ms is awaited
+/// between attempts, so the macro can only be used inside `async` code.
+/// The expansion uses the `rand`, `tokio` and `tracing` crates of the caller.
+#[macro_export]
+macro_rules! retry_transaction {
+    ($transaction:expr) => {
+        $crate::retry_transaction!($transaction, Some(20))
+    };
+
+    (
+        $transaction:expr,
+        $max_retries:expr // should be an Option<int type>, None for unlimited
+        $(,)?
+
+    ) => {{
+        use rand::{
+            distributions::{Distribution, Uniform},
+            rngs::ThreadRng,
+        };
+        use tokio::time::{sleep, Duration};
+        use tracing::{error, info};
+
+        let mut retries = 0;
+        let max_retries = $max_retries;
+        loop {
+            let status = $transaction;
+            match status {
+                // Fully qualified so the macro works without the caller importing the error type.
+                Err($crate::rocks::RawStoreError::RetryableTransactionError) => {
+                    retries += 1;
+                    if let Some(max_retries) = max_retries {
+                        if retries > max_retries {
+                            error!(?max_retries, "max retries exceeded");
+                            break status;
+                        }
+                    }
+                    // Randomized delay to help racing transactions get out of each other's way.
+                    let delay = {
+                        let mut rng = ThreadRng::default();
+                        Duration::from_millis(Uniform::new(0, 50).sample(&mut rng))
+                    };
+                    if retries > 10 {
+                        // TODO: monitoring needed?
+                        error!(?delay, ?retries, "excessive transaction retries...");
+                    } else {
+                        info!(
+                            ?delay,
+                            ?retries,
+                            "transaction write conflict detected, sleeping"
+                        );
+                    }
+                    sleep(delay).await;
+                }
+                _ => break status,
+            }
+        }
+    }};
+}
+
+/// Variant of `retry_transaction!` without a retry limit: it forwards to
+/// `retry_transaction!` with `None` as the maximum number of retries.
+#[macro_export]
+macro_rules! retry_transaction_forever {
+    ($transaction:expr) => {
+        $crate::retry_transaction!($transaction, None)
+    };
+}
+
+/// A plain multi-threaded RocksDB handle bundled with its metric settings and path.
+#[derive(Debug)]
+pub struct DBWithThreadModeWrapper {
+    /// The wrapped RocksDB handle that all calls are delegated to.
+    pub underlying: rocksdb::DBWithThreadMode<MultiThreaded>,
+    /// Sampling intervals and the optional database name override used for metrics.
+    pub metric_conf: MetricConf,
+    /// Path associated with this database; presumably the directory it was opened from (TODO confirm).
+    pub db_path: PathBuf,
+}
+
+/// A multi-threaded optimistic-transaction RocksDB handle bundled with its
+/// metric settings and path.
+#[derive(Debug)]
+pub struct OptimisticTransactionDBWrapper {
+    /// The wrapped optimistic transaction database that all calls are delegated to.
+    pub underlying: rocksdb::OptimisticTransactionDB<MultiThreaded>,
+    /// Sampling intervals and the optional database name override used for metrics.
+    pub metric_conf: MetricConf,
+    /// Path associated with this database; presumably the directory it was opened from (TODO confirm).
+    pub db_path: PathBuf,
+}
+
+/// Thin wrapper to unify interface across different db types
+///
+/// Most methods simply forward to the `underlying` handle of the active
+/// variant; operations that only one variant supports return an error on
+/// the other.
+#[derive(Debug)]
+pub enum RocksDB {
+    /// A regular (non-transactional) database.
+    DBWithThreadMode(DBWithThreadModeWrapper),
+    /// A database that supports optimistic transactions.
+    OptimisticTransactionDB(OptimisticTransactionDBWrapper),
+}
+
+// Forwards `self.method(args...)` to the `underlying` handle of whichever
+// `RocksDB` variant `self` is. Arguments must be plain identifiers because
+// the matcher captures them as `ident`.
+macro_rules! delegate_call {
+    ($self:ident.$method:ident($($args:ident),*)) => {
+        match $self {
+            Self::DBWithThreadMode(d) => d.underlying.$method($($args),*),
+            Self::OptimisticTransactionDB(d) => d.underlying.$method($($args),*),
+        }
+    }
+}
+
+impl Drop for RocksDB {
+    /// Cancels all background work of the underlying database when the
+    /// wrapper is dropped, passing `true` for the `wait` argument.
+    fn drop(&mut self) {
+        delegate_call!(self.cancel_all_background_work(/* wait */ true))
+    }
+}
+
+impl RocksDB {
+    /// Returns the metric configuration stored next to the active database variant.
+    fn metric_conf(&self) -> &MetricConf {
+        match self {
+            Self::DBWithThreadMode(db) => &db.metric_conf,
+            Self::OptimisticTransactionDB(db) => &db.metric_conf,
+        }
+    }
+
+    /// Looks up `key` through the underlying database's `get` (no column
+    /// family is passed).
+    pub fn get<K: AsRef<[u8]>>(&self, key: K) -> Result<Option<Vec<u8>>, rocksdb::Error> {
+        delegate_call!(self.get(key))
+    }
+
+    /// Reads several `(column family, key)` pairs in one call using the given
+    /// read options; results are returned one per requested pair.
+    pub fn multi_get_cf<'a, 'b: 'a, K, I, W>(
+        &'a self,
+        keys: I,
+        readopts: &ReadOptions,
+    ) -> Vec<Result<Option<Vec<u8>>, rocksdb::Error>>
+    where
+        K: AsRef<[u8]>,
+        I: IntoIterator<Item = (&'b W, K)>,
+        W: 'b + AsColumnFamilyRef,
+    {
+        delegate_call!(self.multi_get_cf_opt(keys, readopts))
+    }
+
+    /// Queries the integer property `name` of column family `cf`.
+    pub fn property_int_value_cf(
+        &self,
+        cf: &impl AsColumnFamilyRef,
+        name: impl CStrLike,
+    ) -> Result<Option<u64>, rocksdb::Error> {
+        delegate_call!(self.property_int_value_cf(cf, name))
+    }
+
+    /// Reads `key` from column family `cf` as a pinnable slice, using the
+    /// given read options.
+    pub fn get_pinned_cf<K: AsRef<[u8]>>(
+        &self,
+        cf: &impl AsColumnFamilyRef,
+        key: K,
+        readopts: &ReadOptions,
+    ) -> Result<Option<rocksdb::DBPinnableSlice<'_>>, rocksdb::Error> {
+        delegate_call!(self.get_pinned_cf_opt(cf, key, readopts))
+    }
+
+    /// Returns the handle of the column family called `name`, if there is one.
+    pub fn cf_handle(&self, name: &str) -> Option<Arc<rocksdb::BoundColumnFamily<'_>>> {
+        delegate_call!(self.cf_handle(name))
+    }
+
+    /// Creates a column family called `name` with the given options.
+    pub fn create_cf<N: AsRef<str>>(
+        &self,
+        name: N,
+        opts: &rocksdb::Options,
+    ) -> Result<(), rocksdb::Error> {
+        delegate_call!(self.create_cf(name, opts))
+    }
+
+    /// Drops the column family called `name`.
+    pub fn drop_cf(&self, name: &str) -> Result<(), rocksdb::Error> {
+        delegate_call!(self.drop_cf(name))
+    }
+
+    /// Deletes `key` from column family `cf` using the given write options.
+    ///
+    /// The `delete-cf-before` and `delete-cf-after` fail points surround the
+    /// delegated call.
+    pub fn delete_cf<K: AsRef<[u8]>>(
+        &self,
+        cf: &impl AsColumnFamilyRef,
+        key: K,
+        writeopts: &WriteOptions,
+    ) -> Result<(), rocksdb::Error> {
+        fail_point!("delete-cf-before");
+        let outcome = delegate_call!(self.delete_cf_opt(cf, key, writeopts));
+        fail_point!("delete-cf-after");
+        #[allow(clippy::let_and_return)]
+        outcome
+    }
+
+    /// Returns the path reported by the underlying database.
+    pub fn path(&self) -> &Path {
+        delegate_call!(self.path())
+    }
+
+    /// Stores `value` under `key` in column family `cf` using the given
+    /// write options.
+    ///
+    /// The `put-cf-before` and `put-cf-after` fail points surround the
+    /// delegated call.
+    pub fn put_cf<K, V>(
+        &self,
+        cf: &impl AsColumnFamilyRef,
+        key: K,
+        value: V,
+        writeopts: &WriteOptions,
+    ) -> Result<(), rocksdb::Error>
+    where
+        K: AsRef<[u8]>,
+        V: AsRef<[u8]>,
+    {
+        fail_point!("put-cf-before");
+        let outcome = delegate_call!(self.put_cf_opt(cf, key, value, writeopts));
+        fail_point!("put-cf-after");
+        #[allow(clippy::let_and_return)]
+        outcome
+    }
+
+    /// Forwards to the underlying `key_may_exist_cf_opt` check for `key` in
+    /// column family `cf`.
+    pub fn key_may_exist_cf<K: AsRef<[u8]>>(
+        &self,
+        cf: &impl AsColumnFamilyRef,
+        key: K,
+        readopts: &ReadOptions,
+    ) -> bool {
+        delegate_call!(self.key_may_exist_cf_opt(cf, key, readopts))
+    }
+
+    /// Forwards to the underlying `try_catch_up_with_primary`.
+    pub fn try_catch_up_with_primary(&self) -> Result<(), rocksdb::Error> {
+        delegate_call!(self.try_catch_up_with_primary())
+    }
+
+    /// Writes `batch` to the database.
+    ///
+    /// The batch flavour has to match the database flavour (regular batch
+    /// for a regular database, transactional batch for an optimistic
+    /// transaction database); any other pairing yields a `RocksDBError`.
+    /// A failed write returns early through `?`, so the `batch-write-after`
+    /// fail point is only reached after a successful write or a mismatch.
+    pub fn write(&self, batch: RocksDBBatch) -> Result<(), RawStoreError> {
+        fail_point!("batch-write-before");
+        let outcome = match (self, batch) {
+            (Self::DBWithThreadMode(db), RocksDBBatch::Regular(regular)) => {
+                db.underlying.write(regular)?;
+                Ok(())
+            }
+            (Self::OptimisticTransactionDB(db), RocksDBBatch::Transactional(transactional)) => {
+                db.underlying.write(transactional)?;
+                Ok(())
+            }
+            _ => Err(RawStoreError::RocksDBError(
+                "using invalid batch type for the database".to_string(),
+            )),
+        };
+        fail_point!("batch-write-after");
+        #[allow(clippy::let_and_return)]
+        outcome
+    }
+
+    /// Starts a transaction with default settings.
+    ///
+    /// Only available on the optimistic transaction variant; the regular
+    /// variant returns an "operation not supported" error.
+    pub fn transaction_without_snapshot(
+        &self,
+    ) -> Result<Transaction<'_, rocksdb::OptimisticTransactionDB>, RawStoreError> {
+        match self {
+            Self::DBWithThreadMode(_) => Err(RawStoreError::RocksDBError(
+                "operation not supported".to_string(),
+            )),
+            Self::OptimisticTransactionDB(db) => Ok(db.underlying.transaction()),
+        }
+    }
+
+    /// Starts a transaction with the snapshot option enabled and default
+    /// write options.
+    ///
+    /// Only available on the optimistic transaction variant; the regular
+    /// variant returns an "operation not supported" error.
+    pub fn transaction(
+        &self,
+    ) -> Result<Transaction<'_, rocksdb::OptimisticTransactionDB>, RawStoreError> {
+        match self {
+            Self::DBWithThreadMode(_) => Err(RawStoreError::RocksDBError(
+                "operation not supported".to_string(),
+            )),
+            Self::OptimisticTransactionDB(db) => {
+                let mut tx_opts = OptimisticTransactionOptions::new();
+                tx_opts.set_snapshot(true);
+                let write_opts = WriteOptions::default();
+
+                Ok(db.underlying.transaction_opt(&write_opts, &tx_opts))
+            }
+        }
+    }
+
+    /// Creates a raw iterator over column family `cf_handle`, wrapped in the
+    /// `RocksDBRawIter` variant that matches the database flavour.
+    pub fn raw_iterator_cf<'a: 'b, 'b>(
+        &'a self,
+        cf_handle: &impl AsColumnFamilyRef,
+        readopts: ReadOptions,
+    ) -> RocksDBRawIter<'b> {
+        match self {
+            Self::DBWithThreadMode(regular) => {
+                let raw = regular.underlying.raw_iterator_cf_opt(cf_handle, readopts);
+                RocksDBRawIter::DB(raw)
+            }
+            Self::OptimisticTransactionDB(optimistic) => {
+                let raw = optimistic.underlying.raw_iterator_cf_opt(cf_handle, readopts);
+                RocksDBRawIter::OptimisticTransactionDB(raw)
+            }
+        }
+    }
+
+    /// Creates an iterator over column family `cf_handle` in the given mode,
+    /// wrapped in the `RocksDBIter` variant that matches the database flavour.
+    pub fn iterator_cf<'a: 'b, 'b>(
+        &'a self,
+        cf_handle: &impl AsColumnFamilyRef,
+        readopts: ReadOptions,
+        mode: IteratorMode<'_>,
+    ) -> RocksDBIter<'b> {
+        match self {
+            Self::DBWithThreadMode(regular) => {
+                let iter = regular.underlying.iterator_cf_opt(cf_handle, readopts, mode);
+                RocksDBIter::DB(iter)
+            }
+            Self::OptimisticTransactionDB(optimistic) => {
+                let iter = optimistic.underlying.iterator_cf_opt(cf_handle, readopts, mode);
+                RocksDBIter::OptimisticTransactionDB(iter)
+            }
+        }
+    }
+
+    /// Requests compaction of the key range `start..end` in column family `cf`.
+    pub fn compact_range_cf<K: AsRef<[u8]>>(
+        &self,
+        cf: &impl AsColumnFamilyRef,
+        start: Option<K>,
+        end: Option<K>,
+    ) {
+        delegate_call!(self.compact_range_cf(cf, start, end))
+    }
+
+    /// Like `compact_range_cf`, but with bottommost-level compaction set to
+    /// `ForceOptimized`.
+    pub fn compact_range_to_bottom<K: AsRef<[u8]>>(
+        &self,
+        cf: &impl AsColumnFamilyRef,
+        start: Option<K>,
+        end: Option<K>,
+    ) {
+        let mut compact_opts = CompactOptions::default();
+        compact_opts.set_bottommost_level_compaction(BottommostLevelCompaction::ForceOptimized);
+        // `delegate_call!` only accepts identifiers as arguments.
+        let opt = &compact_opts;
+        delegate_call!(self.compact_range_cf_opt(cf, start, end, opt))
+    }
+
+    /// Flushes the underlying database, converting a RocksDB failure into
+    /// `RawStoreError::RocksDBError`.
+    pub fn flush(&self) -> Result<(), RawStoreError> {
+        let flushed = delegate_call!(self.flush());
+        flushed.map_err(|e| RawStoreError::RocksDBError(e.into_string()))
+    }
+
+    /// Takes a snapshot of the database, wrapped in the `RocksDBSnapshot`
+    /// variant that matches the database flavour.
+    pub fn snapshot(&self) -> RocksDBSnapshot<'_> {
+        match self {
+            Self::DBWithThreadMode(regular) => {
+                RocksDBSnapshot::DBWithThreadMode(regular.underlying.snapshot())
+            }
+            Self::OptimisticTransactionDB(optimistic) => {
+                RocksDBSnapshot::OptimisticTransactionDB(optimistic.underlying.snapshot())
+            }
+        }
+    }
+
+    /// Creates a checkpoint of the database at `path`.
+    pub fn checkpoint(&self, path: &Path) -> Result<(), RawStoreError> {
+        let checkpoint = match self {
+            Self::DBWithThreadMode(regular) => Checkpoint::new(&regular.underlying)?,
+            Self::OptimisticTransactionDB(optimistic) => Checkpoint::new(&optimistic.underlying)?,
+        };
+        checkpoint
+            .create_checkpoint(path)
+            .map_err(|e| RawStoreError::RocksDBError(e.to_string()))
+    }
+
+    /// Flushes column family `cf`.
+    pub fn flush_cf(&self, cf: &impl AsColumnFamilyRef) -> Result<(), rocksdb::Error> {
+        delegate_call!(self.flush_cf(cf))
+    }
+
+    /// Applies the `(name, value)` option pairs in `opts` to column family `cf`.
+    pub fn set_options_cf(
+        &self,
+        cf: &impl AsColumnFamilyRef,
+        opts: &[(&str, &str)],
+    ) -> Result<(), rocksdb::Error> {
+        delegate_call!(self.set_options_cf(cf, opts))
+    }
+
+    /// Returns a fresh sampling interval for single reads, derived from the
+    /// configured read sample interval.
+    pub fn get_sampling_interval(&self) -> SamplingInterval {
+        self.metric_conf().read_sample_interval.new_from_self()
+    }
+
+    /// Returns a fresh sampling interval for multi-gets; it is derived from
+    /// the same read sample interval as `get_sampling_interval`.
+    pub fn multiget_sampling_interval(&self) -> SamplingInterval {
+        self.metric_conf().read_sample_interval.new_from_self()
+    }
+
+    /// Returns a fresh sampling interval for writes, derived from the
+    /// configured write sample interval.
+    pub fn write_sampling_interval(&self) -> SamplingInterval {
+        self.metric_conf().write_sample_interval.new_from_self()
+    }
+
+    /// Returns a fresh sampling interval for iterators, derived from the
+    /// configured iterator sample interval.
+    pub fn iter_sampling_interval(&self) -> SamplingInterval {
+        self.metric_conf().iter_sample_interval.new_from_self()
+    }
+
+    /// Returns the database name used for metrics: the configured override
+    /// when present, otherwise the name derived from the database path.
+    pub fn db_name(&self) -> String {
+        match &self.metric_conf().db_name_override {
+            Some(name) => name.clone(),
+            None => self.default_db_name(),
+        }
+    }
+
+    /// Lists the live files reported by the underlying database.
+    pub fn live_files(&self) -> Result<Vec<LiveFile>, Error> {
+        delegate_call!(self.live_files())
+    }
+
+    /// Derives a name from the last component of the database path, falling
+    /// back to `"unknown"` when there is none or it is not valid UTF-8.
+    fn default_db_name(&self) -> String {
+        let last_component = self.path().file_name().and_then(|f| f.to_str());
+        last_component.unwrap_or("unknown").to_string()
+    }
+}
+
+/// A database snapshot, in the flavour matching the `RocksDB` variant it was
+/// taken from.
+pub enum RocksDBSnapshot<'a> {
+    /// Snapshot of a regular database.
+    DBWithThreadMode(rocksdb::Snapshot<'a>),
+    /// Snapshot of an optimistic transaction database.
+    OptimisticTransactionDB(SnapshotWithThreadMode<'a, OptimisticTransactionDB>),
+}
+
+impl<'a> RocksDBSnapshot<'a> {
+    /// Reads several `(column family, key)` pairs from the snapshot with
+    /// explicit read options; results are returned one per requested pair.
+    pub fn multi_get_cf_opt<'b: 'a, K, I, W>(
+        &'a self,
+        keys: I,
+        readopts: ReadOptions,
+    ) -> Vec<Result<Option<Vec<u8>>, rocksdb::Error>>
+    where
+        K: AsRef<[u8]>,
+        I: IntoIterator<Item = (&'b W, K)>,
+        W: 'b + AsColumnFamilyRef,
+    {
+        match self {
+            Self::DBWithThreadMode(snapshot) => snapshot.multi_get_cf_opt(keys, readopts),
+            Self::OptimisticTransactionDB(snapshot) => snapshot.multi_get_cf_opt(keys, readopts),
+        }
+    }
+
+    /// Reads several `(column family, key)` pairs from the snapshot without
+    /// passing read options.
+    pub fn multi_get_cf<'b: 'a, K, I, W>(
+        &'a self,
+        keys: I,
+    ) -> Vec<Result<Option<Vec<u8>>, rocksdb::Error>>
+    where
+        K: AsRef<[u8]>,
+        I: IntoIterator<Item = (&'b W, K)>,
+        W: 'b + AsColumnFamilyRef,
+    {
+        match self {
+            Self::DBWithThreadMode(snapshot) => snapshot.multi_get_cf(keys),
+            Self::OptimisticTransactionDB(snapshot) => snapshot.multi_get_cf(keys),
+        }
+    }
+}
+
+/// A write batch in one of two flavours; `RocksDB::write` only accepts the
+/// flavour that matches the database variant.
+pub enum RocksDBBatch {
+    /// Batch for a regular database.
+    Regular(rocksdb::WriteBatch),
+    /// Batch for an optimistic transaction database.
+    Transactional(rocksdb::WriteBatchWithTransaction<true>),
+}
+
+// Forwards `self.method(args...)` to the inner batch of whichever
+// `RocksDBBatch` variant `self` is. Arguments must be plain identifiers
+// because the matcher captures them as `ident`.
+macro_rules! delegate_batch_call {
+    ($self:ident.$method:ident($($args:ident),*)) => {
+        match $self {
+            Self::Regular(b) => b.$method($($args),*),
+            Self::Transactional(b) => b.$method($($args),*),
+        }
+    }
+}
+
+impl RocksDBBatch {
+    /// Size of the inner batch in bytes, as reported by the inner batch.
+    fn size_in_bytes(&self) -> usize {
+        delegate_batch_call!(self.size_in_bytes())
+    }
+
+    /// Queues the deletion of `key` from column family `cf`.
+    pub fn delete_cf<K: AsRef<[u8]>>(&mut self, cf: &impl AsColumnFamilyRef, key: K) {
+        delegate_batch_call!(self.delete_cf(cf, key))
+    }
+
+    /// Queues a put of `value` under `key` in column family `cf`.
+    pub fn put_cf<K, V>(&mut self, cf: &impl AsColumnFamilyRef, key: K, value: V)
+    where
+        K: AsRef<[u8]>,
+        V: AsRef<[u8]>,
+    {
+        delegate_batch_call!(self.put_cf(cf, key, value))
+    }
+
+    /// Queues a merge of `value` into `key` in column family `cf`.
+    pub fn merge_cf<K, V>(&mut self, cf: &impl AsColumnFamilyRef, key: K, value: V)
+    where
+        K: AsRef<[u8]>,
+        V: AsRef<[u8]>,
+    {
+        delegate_batch_call!(self.merge_cf(cf, key, value))
+    }
+
+    /// Queues the deletion of the key range `from..to` in column family `cf`.
+    ///
+    /// Only regular batches support this; a transactional batch returns an
+    /// "operation not supported" error and is left untouched.
+    pub fn delete_range_cf<K: AsRef<[u8]>>(
+        &mut self,
+        cf: &impl AsColumnFamilyRef,
+        from: K,
+        to: K,
+    ) -> Result<(), RawStoreError> {
+        if let Self::Regular(regular) = self {
+            regular.delete_range_cf(cf, from, to);
+            Ok(())
+        } else {
+            Err(RawStoreError::RocksDBError(
+                "operation not supported".to_string(),
+            ))
+        }
+    }
+}
+
+/// Metric-related settings attached to a database handle.
+#[derive(Debug, Default)]
+pub struct MetricConf {
+    /// Name reported for the database; when `None`, `RocksDB::db_name` falls
+    /// back to a name derived from the database path.
+    pub db_name_override: Option<String>,
+    /// Sampling interval used for both single reads and multi-gets.
+    pub read_sample_interval: SamplingInterval,
+    /// Sampling interval used for writes.
+    pub write_sample_interval: SamplingInterval,
+    /// Sampling interval used for iterators.
+    pub iter_sample_interval: SamplingInterval,
+}
+
+impl MetricConf {
+    /// Builds a configuration that overrides the database name with
+    /// `db_name` and leaves every sampling interval at its default.
+    pub fn with_db_name(db_name: &str) -> Self {
+        Self {
+            db_name_override: Some(db_name.to_string()),
+            ..Self::default()
+        }
+    }
+
+    /// Builds a configuration with the given read sampling interval, no
+    /// name override, and default write and iterator sampling intervals.
+    pub fn with_sampling(read_interval: SamplingInterval) -> Self {
+        Self {
+            read_sample_interval: read_interval,
+            ..Self::default()
+        }
+    }
+}
+const CF_METRICS_REPORT_PERIOD_MILLIS: u64 = 1000;
+const METRICS_ERROR: i64 = -1;
+
+/// An interface to a rocksDB database, keyed by a columnfamily
+///
+/// `K` and `V` are the key and value types of the map; they only appear in
+/// `_phantom`, no key or value is stored in the struct itself.
+#[derive(Clone, Debug)]
+pub struct DBMap<K, V> {
+    /// Shared handle to the database this map reads from and writes to.
+    pub rocksdb: Arc<RocksDB>,
+    // Carries the K/V type parameters without owning any K or V.
+    _phantom: PhantomData<fn(K) -> V>,
+    // the rocksDB ColumnFamily under which the map is stored
+    cf: String,
+    /// Read/write options used by this map's operations.
+    pub opts: ReadWriteOptions,
+    // Metrics sink shared with the background reporting task.
+    db_metrics: Arc<DBMetrics>,
+    // Sampling intervals, each obtained from the database handle in `new`.
+    get_sample_interval: SamplingInterval,
+    multiget_sample_interval: SamplingInterval,
+    write_sample_interval: SamplingInterval,
+    iter_sample_interval: SamplingInterval,
+    // Sender half of the channel the metrics task selects on; the task exits
+    // once the receiver half resolves.
+    _metrics_task_cancel_handle: Arc<oneshot::Sender<()>>,
+}
+
+// NOTE(review): no safety argument is given for this manual impl. `K` and `V`
+// only occur inside `PhantomData<fn(K) -> V>`; confirm whether the remaining
+// fields already make `DBMap` auto-`Send`, in which case this impl is redundant.
+unsafe impl<K: Send, V: Send> Send for DBMap<K, V> {}
+
+impl<K, V> DBMap<K, V> {
+    /// Builds a map over column family `opt_cf` of `db` and starts a
+    /// background task that reports the column family's metrics every
+    /// `CF_METRICS_REPORT_PERIOD_MILLIS` milliseconds.
+    ///
+    /// The task is started with `tokio::task::spawn`, so this must be called
+    /// while a Tokio runtime is active. The existence of the column family
+    /// is not checked here.
+    pub(crate) fn new(db: Arc<RocksDB>, opts: &ReadWriteOptions, opt_cf: &str) -> Self {
+        let db_cloned = db.clone();
+        let db_metrics = DBMetrics::get();
+        let db_metrics_cloned = db_metrics.clone();
+        let cf = opt_cf.to_string();
+        // The sender is stored in the returned map; the task below leaves its
+        // loop as soon as the receiver resolves (presumably when the last
+        // clone of the sender is dropped; nothing in view sends on it).
+        let (sender, mut recv) = tokio::sync::oneshot::channel();
+        tokio::task::spawn(async move {
+            let mut interval =
+                tokio::time::interval(Duration::from_millis(CF_METRICS_REPORT_PERIOD_MILLIS));
+            loop {
+                tokio::select! {
+                    _ = interval.tick() => {
+                        let db = db_cloned.clone();
+                        let cf = cf.clone();
+                        let db_metrics = db_metrics.clone();
+                        // Property reads go through `spawn_blocking` so they
+                        // do not run on the async worker itself.
+                        if let Err(e) = tokio::task::spawn_blocking(move || {
+                            Self::report_metrics(&db, &cf, &db_metrics);
+                        }).await {
+                            error!("Failed to log metrics with error: {}", e);
+                        }
+                    }
+                    _ = &mut recv => break,
+                }
+            }
+            info!("Returning the cf metric logging task for DBMap: {}", &cf);
+        });
+        DBMap {
+            rocksdb: db.clone(),
+            opts: opts.clone(),
+            _phantom: PhantomData,
+            cf: opt_cf.to_string(),
+            db_metrics: db_metrics_cloned,
+            _metrics_task_cancel_handle: Arc::new(sender),
+            get_sample_interval: db.get_sampling_interval(),
+            multiget_sample_interval: db.multiget_sampling_interval(),
+            write_sample_interval: db.write_sampling_interval(),
+            iter_sample_interval: db.iter_sampling_interval(),
+        }
+    }
+
+    /// Opens the database at `path` with a single column family and returns
+    /// a typed map over that column family.
+    ///
+    /// When `opt_cf` is `None`, RocksDB's default column family name is used.
+    /// The resulting map operates on that one column family and parametrizes
+    /// all operations in `DBBatch` when writing across column families.
+    #[instrument(level="debug", skip_all, fields(path = ?path.as_ref(), cf = ?opt_cf), err)]
+    pub fn open<P: AsRef<Path>>(
+        path: P,
+        metric_conf: MetricConf,
+        db_options: Option<rocksdb::Options>,
+        opt_cf: Option<&str>,
+        rw_options: &ReadWriteOptions,
+    ) -> Result<Self, RawStoreError> {
+        let cf_key = opt_cf.unwrap_or(rocksdb::DEFAULT_COLUMN_FAMILY_NAME);
+        let rocksdb = open_cf(path, db_options, metric_conf, &[cf_key])?;
+        Ok(DBMap::new(rocksdb, rw_options, cf_key))
+    }
+
+    /// Attaches a typed map to a column family of an already open database.
+    /// If no column family is passed, the default column family is used.
+    ///
+    /// Returns `RawStoreError::UnregisteredColumn` when the database has no
+    /// column family with that name.
+    ///
+    /// ```
+    ///    use raw_store::rocks::*;
+    ///    use raw_store::metrics::DBMetrics;
+    ///    use tempfile::tempdir;
+    ///    use prometheus::Registry;
+    ///    use std::sync::Arc;
+    ///    use core::fmt::Error;
+    ///    #[tokio::main]
+    ///    async fn main() -> Result<(), Error> {
+    ///    // Open the DB with all needed column families first.
+    ///    let rocks = open_cf(tempdir().unwrap(), None, MetricConf::default(), &["First_CF", "Second_CF"]).unwrap();
+    ///    // Attach the column families to specific maps.
+    ///    let db_cf_1 = DBMap::<u32,u32>::reopen(&rocks, Some("First_CF"), &ReadWriteOptions::default()).expect("Failed to open storage");
+    ///    let db_cf_2 = DBMap::<u32,u32>::reopen(&rocks, Some("Second_CF"), &ReadWriteOptions::default()).expect("Failed to open storage");
+    ///    Ok(())
+    ///    }
+    /// ```
+    #[instrument(level = "debug", skip(db), err)]
+    pub fn reopen(
+        db: &Arc<RocksDB>,
+        opt_cf: Option<&str>,
+        rw_options: &ReadWriteOptions,
+    ) -> Result<Self, RawStoreError> {
+        let cf_key = opt_cf.unwrap_or(rocksdb::DEFAULT_COLUMN_FAMILY_NAME);
+
+        // Only the existence of the column family matters; the handle itself is not kept.
+        if db.cf_handle(cf_key).is_none() {
+            return Err(RawStoreError::UnregisteredColumn(cf_key.to_owned()));
+        }
+
+        Ok(DBMap::new(db.clone(), rw_options, cf_key))
+    }
+
+    /// Creates an empty write batch of the flavour that matches this map's
+    /// database: regular for a plain database, transactional for an
+    /// optimistic transaction database.
+    pub fn batch(&self) -> DBBatch {
+        let empty_batch = match &*self.rocksdb {
+            RocksDB::OptimisticTransactionDB(_) => {
+                RocksDBBatch::Transactional(WriteBatchWithTransaction::<true>::default())
+            }
+            RocksDB::DBWithThreadMode(_) => RocksDBBatch::Regular(WriteBatch::default()),
+        };
+        DBBatch::new(
+            &self.rocksdb,
+            empty_batch,
+            &self.db_metrics,
+            &self.write_sample_interval,
+        )
+    }
+
+    /// Requests compaction of the key range `start..end` of this map's
+    /// column family. Both bounds are serialized with `be_fix_int_ser`; a
+    /// serialization failure is returned before any compaction is requested.
+    pub fn compact_range<J: Serialize>(&self, start: &J, end: &J) -> Result<(), RawStoreError> {
+        let lower = be_fix_int_ser(start)?;
+        let upper = be_fix_int_ser(end)?;
+        let cf = self.cf();
+        self.rocksdb.compact_range_cf(&cf, Some(lower), Some(upper));
+        Ok(())
+    }
+
+    /// Same as `compact_range`, but delegates to
+    /// `RocksDB::compact_range_to_bottom` for the serialized range.
+    pub fn compact_range_to_bottom<J: Serialize>(
+        &self,
+        start: &J,
+        end: &J,
+    ) -> Result<(), RawStoreError> {
+        let lower = be_fix_int_ser(start)?;
+        let upper = be_fix_int_ser(end)?;
+        let cf = self.cf();
+        self.rocksdb
+            .compact_range_to_bottom(&cf, Some(lower), Some(upper));
+        Ok(())
+    }
+
+    /// Returns the handle of the column family backing this map.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the database no longer has a column family with this map's name.
+    pub fn cf(&self) -> Arc<rocksdb::BoundColumnFamily<'_>> {
+        let handle = self.rocksdb.cf_handle(&self.cf);
+        handle.expect("Map-keying column family should have been checked at DB creation")
+    }
+
+    /// Returns an iterator over this map's column family, using the map's
+    /// read options and `IteratorMode::Start`.
+    pub fn iterator_cf(&self) -> RocksDBIter<'_> {
+        let cf = self.cf();
+        let readopts = self.opts.readopts();
+        self.rocksdb.iterator_cf(&cf, readopts, IteratorMode::Start)
+    }
+
+    /// Flushes this map's column family, converting a RocksDB failure into
+    /// `RawStoreError::RocksDBError`.
+    pub fn flush(&self) -> Result<(), RawStoreError> {
+        let cf = self.cf();
+        match self.rocksdb.flush_cf(&cf) {
+            Ok(()) => Ok(()),
+            Err(e) => Err(RawStoreError::RocksDBError(e.into_string())),
+        }
+    }
+
+    /// Applies the `(name, value)` option pairs in `opts` to this map's
+    /// column family.
+    pub fn set_options(&self, opts: &[(&str, &str)]) -> Result<(), rocksdb::Error> {
+        let cf = self.cf();
+        self.rocksdb.set_options_cf(&cf, opts)
+    }
+
+    /// Reads the integer RocksDB property `property_name` of column family `cf`.
+    ///
+    /// Returns `Ok(0)` when the database reports no value for the property
+    /// and an error when the query itself fails. RocksDB reports the value
+    /// as `u64`; a value above `i64::MAX` is clamped to `i64::MAX` instead of
+    /// panicking, because this feeds the periodic metrics reporting in
+    /// `report_metrics`, which must not crash on an out-of-range reading.
+    fn get_int_property(
+        rocksdb: &RocksDB,
+        cf: &impl AsColumnFamilyRef,
+        property_name: &'static std::ffi::CStr,
+    ) -> Result<i64, RawStoreError> {
+        match rocksdb.property_int_value_cf(cf, property_name) {
+            Ok(Some(value)) => Ok(value.try_into().unwrap_or(i64::MAX)),
+            Ok(None) => Ok(0),
+            Err(e) => Err(RawStoreError::RocksDBError(e.into_string())),
+        }
+    }
+
+    fn report_metrics(rocksdb: &Arc<RocksDB>, cf_name: &str, db_metrics: &Arc<DBMetrics>) {
+        let cf = rocksdb.cf_handle(cf_name).expect("Failed to get cf");
+        db_metrics
+            .cf_metrics
+            .rocksdb_total_sst_files_size
+            .with_label_values(&[cf_name])
+            .set(
+                Self::get_int_property(rocksdb, &cf, properties::TOTAL_SST_FILES_SIZE)
+                    .unwrap_or(METRICS_ERROR),
+            );
+        db_metrics
+            .cf_metrics
+            .rocksdb_total_blob_files_size
+            .with_label_values(&[cf_name])
+            .set(
+                Self::get_int_property(rocksdb, &cf, ROCKSDB_PROPERTY_TOTAL_BLOB_FILES_SIZE)
+                    .unwrap_or(METRICS_ERROR),
+            );
+        db_metrics
+            .cf_metrics
+            .rocksdb_size_all_mem_tables
+            .with_label_values(&[cf_name])
+            .set(
+                Self::get_int_property(rocksdb, &cf, properties::SIZE_ALL_MEM_TABLES)
+                    .unwrap_or(METRICS_ERROR),
+            );
+        db_metrics
+            .cf_metrics
+            .rocksdb_num_snapshots
+            .with_label_values(&[cf_name])
+            .set(
+                Self::get_int_property(rocksdb, &cf, properties::NUM_SNAPSHOTS)
+                    .unwrap_or(METRICS_ERROR),
+            );
+        db_metrics
+            .cf_metrics
+            .rocksdb_oldest_snapshot_time
+            .with_label_values(&[cf_name])
+            .set(
+                Self::get_int_property(rocksdb, &cf, properties::OLDEST_SNAPSHOT_TIME)
+                    .unwrap_or(METRICS_ERROR),
+            );
+        db_metrics
+            .cf_metrics
+            .rocksdb_actual_delayed_write_rate
+            .with_label_values(&[cf_name])
+            .set(
+                Self::get_int_property(rocksdb, &cf, properties::ACTUAL_DELAYED_WRITE_RATE)
+                    .unwrap_or(METRICS_ERROR),
+            );
+        db_metrics
+            .cf_metrics
+            .rocksdb_is_write_stopped
+            .with_label_values(&[cf_name])
+            .set(
+                Self::get_int_property(rocksdb, &cf, properties::IS_WRITE_STOPPED)
+                    .unwrap_or(METRICS_ERROR),
+            );
+        db_metrics
+            .cf_metrics
+            .rocksdb_block_cache_capacity
+            .with_label_values(&[cf_name])
+            .set(
+                Self::get_int_property(rocksdb, &cf, properties::BLOCK_CACHE_CAPACITY)
+                    .unwrap_or(METRICS_ERROR),
+            );
+        db_metrics
+            .cf_metrics
+            .rocksdb_block_cache_usage
+            .with_label_values(&[cf_name])
+            .set(
+                Self::get_int_property(rocksdb, &cf, properties::BLOCK_CACHE_USAGE)
+                    .unwrap_or(METRICS_ERROR),
+            );
+        db_metrics
+            .cf_metrics
+            .rocksdb_block_cache_pinned_usage
+            .with_label_values(&[cf_name])
+            .set(
+                Self::get_int_property(rocksdb, &cf, properties::BLOCK_CACHE_PINNED_USAGE)
+                    .unwrap_or(METRICS_ERROR),
+            );
+        db_metrics
+            .cf_metrics
+            .rocskdb_estimate_table_readers_mem
+            .with_label_values(&[cf_name])
+            .set(
+                Self::get_int_property(rocksdb, &cf, properties::ESTIMATE_TABLE_READERS_MEM)
+                    .unwrap_or(METRICS_ERROR),
+            );
+        db_metrics
+            .cf_metrics
+            .rocksdb_estimated_num_keys
+            .with_label_values(&[cf_name])
+            .set(
+                Self::get_int_property(rocksdb, &cf, properties::ESTIMATE_NUM_KEYS)
+                    .unwrap_or(METRICS_ERROR),
+            );
+        db_metrics
+            .cf_metrics
+            .rocksdb_mem_table_flush_pending
+            .with_label_values(&[cf_name])
+            .set(
+                Self::get_int_property(rocksdb, &cf, properties::MEM_TABLE_FLUSH_PENDING)
+                    .unwrap_or(METRICS_ERROR),
+            );
+        db_metrics
+            .cf_metrics
+            .rocskdb_compaction_pending
+            .with_label_values(&[cf_name])
+            .set(
+                Self::get_int_property(rocksdb, &cf, properties::COMPACTION_PENDING)
+                    .unwrap_or(METRICS_ERROR),
+            );
+        db_metrics
+            .cf_metrics
+            .rocskdb_num_running_compactions
+            .with_label_values(&[cf_name])
+            .set(
+                Self::get_int_property(rocksdb, &cf, properties::NUM_RUNNING_COMPACTIONS)
+                    .unwrap_or(METRICS_ERROR),
+            );
+        db_metrics
+            .cf_metrics
+            .rocksdb_num_running_flushes
+            .with_label_values(&[cf_name])
+            .set(
+                Self::get_int_property(rocksdb, &cf, properties::NUM_RUNNING_FLUSHES)
+                    .unwrap_or(METRICS_ERROR),
+            );
+        db_metrics
+            .cf_metrics
+            .rocksdb_estimate_oldest_key_time
+            .with_label_values(&[cf_name])
+            .set(
+                Self::get_int_property(rocksdb, &cf, properties::ESTIMATE_OLDEST_KEY_TIME)
+                    .unwrap_or(METRICS_ERROR),
+            );
+        db_metrics
+            .cf_metrics
+            .rocskdb_background_errors
+            .with_label_values(&[cf_name])
+            .set(
+                Self::get_int_property(rocksdb, &cf, properties::BACKGROUND_ERRORS)
+                    .unwrap_or(METRICS_ERROR),
+            );
+    }
+
+    /// Starts a [`DBTransaction`] on the underlying database via `DBTransaction::new`.
+    pub fn transaction(&self) -> Result<DBTransaction<'_>, RawStoreError> {
+        DBTransaction::new(&self.rocksdb)
+    }
+
+    /// Starts a [`DBTransaction`] on the underlying database via
+    /// `DBTransaction::new_without_snapshot`.
+    pub fn transaction_without_snapshot(&self) -> Result<DBTransaction<'_>, RawStoreError> {
+        DBTransaction::new_without_snapshot(&self.rocksdb)
+    }
+
+    /// Asks the underlying database to create a checkpoint at `path`.
+    pub fn checkpoint_db(&self, path: &Path) -> Result<(), RawStoreError> {
+        self.rocksdb.checkpoint(path)
+    }
+
+    /// Returns a snapshot of the underlying database.
+    ///
+    /// The result is always `Ok`; the `Result` wrapper only shapes the signature.
+    pub fn snapshot(&self) -> Result<RocksDBSnapshot<'_>, RawStoreError> {
+        Ok(self.rocksdb.snapshot())
+    }
+
+    /// Scans the whole table and summarizes it: number of entries, total key and
+    /// value bytes, and histograms of the individual key and value sizes.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the underlying iterator reports a failure, or if
+    /// recording a size into one of the histograms fails.
+    pub fn table_summary(&self) -> eyre::Result<TableSummary> {
+        let mut num_keys = 0;
+        let mut key_bytes_total = 0;
+        let mut value_bytes_total = 0;
+        // The histogram bounds are compile-time constants, so construction cannot fail.
+        let mut key_hist = hdrhistogram::Histogram::<u64>::new_with_max(100000, 2).unwrap();
+        let mut value_hist = hdrhistogram::Histogram::<u64>::new_with_max(100000, 2).unwrap();
+        for entry in self.iterator_cf() {
+            // Surface iterator failures to the caller instead of panicking mid-scan.
+            let (key, value) = entry?;
+            num_keys += 1;
+            key_bytes_total += key.len();
+            value_bytes_total += value.len();
+            key_hist.record(key.len() as u64)?;
+            value_hist.record(value.len() as u64)?;
+        }
+        Ok(TableSummary {
+            num_keys,
+            key_bytes_total,
+            value_bytes_total,
+            key_hist,
+            value_hist,
+        })
+    }
+}
+
+/// Provides a mutable struct to form a collection of database write operations, and execute them.
+///
+/// Batching write and delete operations is faster than performing them one by one and ensures their atomicity,
+///  i.e. they are all written or none is.
+/// This is also true of operations across column families in the same database.
+///
+/// Serializations / Deserialization, and naming of column families is performed by passing a DBMap<K,V>
+/// with each operation.
+///
+/// ```
+/// use raw_store::rocks::*;
+/// use tempfile::tempdir;
+/// use raw_store::Map;
+/// use raw_store::metrics::DBMetrics;
+/// use prometheus::Registry;
+/// use core::fmt::Error;
+/// use std::sync::Arc;
+///
+/// #[tokio::main]
+/// async fn main() -> Result<(), Error> {
+/// let rocks = open_cf(tempfile::tempdir().unwrap(), None, MetricConf::default(), &["First_CF", "Second_CF"]).unwrap();
+///
+/// let db_cf_1 = DBMap::reopen(&rocks, Some("First_CF"), &ReadWriteOptions::default())
+///     .expect("Failed to open storage");
+/// let keys_vals_1 = (1..100).map(|i| (i, i.to_string()));
+///
+/// let db_cf_2 = DBMap::reopen(&rocks, Some("Second_CF"), &ReadWriteOptions::default())
+///     .expect("Failed to open storage");
+/// let keys_vals_2 = (1000..1100).map(|i| (i, i.to_string()));
+///
+/// let mut batch = db_cf_1.batch();
+/// batch
+///     .insert_batch(&db_cf_1, keys_vals_1.clone())
+///     .expect("Failed to batch insert")
+///     .insert_batch(&db_cf_2, keys_vals_2.clone())
+///     .expect("Failed to batch insert");
+///
+/// let _ = batch.write().expect("Failed to execute batch");
+/// for (k, v) in keys_vals_1 {
+///     let val = db_cf_1.get(&k).expect("Failed to get inserted key");
+///     assert_eq!(Some(v), val);
+/// }
+///
+/// for (k, v) in keys_vals_2 {
+///     let val = db_cf_2.get(&k).expect("Failed to get inserted key");
+///     assert_eq!(Some(v), val);
+/// }
+/// Ok(())
+/// }
+/// ```
+///
+pub struct DBBatch {
+    // Database the batch is written to; also compared against each `DBMap`
+    // to reject operations that target a different database.
+    rocksdb: Arc<RocksDB>,
+    // Accumulated write operations, applied by `write`.
+    batch: RocksDBBatch,
+    // Metrics sink used when a `write` call is sampled.
+    db_metrics: Arc<DBMetrics>,
+    // Decides whether a given `write` call reports metrics.
+    write_sample_interval: SamplingInterval,
+}
+
+impl DBBatch {
+    /// Builds a batch bound to the database behind `dbref`.
+    ///
+    /// `dbref`, `db_metrics` and `write_sample_interval` are cloned into the new
+    /// value; `batch` is taken by value. Use `open_cf` to get the DB reference
+    /// or an existing open database.
+    pub fn new(
+        dbref: &Arc<RocksDB>,
+        batch: RocksDBBatch,
+        db_metrics: &Arc<DBMetrics>,
+        write_sample_interval: &SamplingInterval,
+    ) -> Self {
+        let rocksdb = Arc::clone(dbref);
+        let db_metrics = Arc::clone(db_metrics);
+        let write_sample_interval = write_sample_interval.clone();
+        Self {
+            rocksdb,
+            batch,
+            db_metrics,
+            write_sample_interval,
+        }
+    }
+
+    /// Consumes the batch and applies its queued operations to the database.
+    ///
+    /// When the write sampling interval fires, the commit latency, the batch
+    /// size in bytes and the write perf-context metrics are reported under the
+    /// database name.
+    #[instrument(level = "trace", skip_all, err)]
+    pub fn write(self) -> Result<(), RawStoreError> {
+        // Capture everything needed for reporting up front: the batch itself is
+        // moved into the database write below.
+        let sampled = self.write_sample_interval.sample().then(|| {
+            let name = self.rocksdb.db_name();
+            let latency = self
+                .db_metrics
+                .op_metrics
+                .rocksdb_batch_commit_latency_seconds
+                .with_label_values(&[&name])
+                .start_timer();
+            let bytes = self.batch.size_in_bytes();
+            (name, bytes, latency, RocksDBPerfContext::default())
+        });
+        self.rocksdb.write(self.batch)?;
+        // The timer and perf context stay alive until the end of this block.
+        if let Some((name, bytes, _latency, _perf_ctx)) = sampled {
+            let metrics = &self.db_metrics;
+            metrics
+                .op_metrics
+                .rocksdb_batch_commit_bytes
+                .with_label_values(&[&name])
+                .observe(bytes as f64);
+            metrics.write_perf_ctx_metrics.report_metrics(&name);
+        }
+        Ok(())
+    }
+}
+
+// TODO: Remove this entire implementation once we switch to sally
+impl DBBatch {
+    /// Queues a delete for every key yielded by `purged_vals` in the column
+    /// family of `db`.
+    ///
+    /// Returns `RawStoreError::CrossDBBatch` if `db` is not backed by the same
+    /// database handle as this batch, or a serialization error for the first
+    /// key that cannot be encoded (keys before it remain queued).
+    pub fn delete_batch<J: Borrow<K>, K: Serialize, V>(
+        &mut self,
+        db: &DBMap<K, V>,
+        purged_vals: impl IntoIterator<Item = J>,
+    ) -> Result<(), RawStoreError> {
+        let same_db = Arc::ptr_eq(&db.rocksdb, &self.rocksdb);
+        if !same_db {
+            return Err(RawStoreError::CrossDBBatch);
+        }
+
+        for key in purged_vals {
+            let key_bytes = be_fix_int_ser(key.borrow())?;
+            self.batch.delete_cf(&db.cf(), key_bytes);
+        }
+        Ok(())
+    }
+
+    /// Queues the deletion of every key in the half-open range `[from, to)`:
+    /// `from` is included, `to` is not.
+    ///
+    /// Returns `RawStoreError::CrossDBBatch` if `db` is not backed by the same
+    /// database handle as this batch.
+    pub fn delete_range<K: Serialize, V>(
+        &mut self,
+        db: &DBMap<K, V>,
+        from: &K,
+        to: &K,
+    ) -> Result<(), RawStoreError> {
+        let same_db = Arc::ptr_eq(&db.rocksdb, &self.rocksdb);
+        if !same_db {
+            return Err(RawStoreError::CrossDBBatch);
+        }
+
+        // Both bounds are serialized before the column family is looked up.
+        let start = be_fix_int_ser(from)?;
+        let end = be_fix_int_ser(to)?;
+
+        self.batch.delete_range_cf(&db.cf(), start, end)?;
+        Ok(())
+    }
+
+    /// Queues a put for every `(key, value)` pair yielded by `new_vals`.
+    ///
+    /// Keys are encoded with `be_fix_int_ser` and values with BCS. Returns
+    /// `self` so calls can be chained, or `RawStoreError::CrossDBBatch` if `db`
+    /// is not backed by the same database handle as this batch.
+    pub fn insert_batch<J: Borrow<K>, K: Serialize, U: Borrow<V>, V: Serialize>(
+        &mut self,
+        db: &DBMap<K, V>,
+        new_vals: impl IntoIterator<Item = (J, U)>,
+    ) -> Result<&mut Self, RawStoreError> {
+        let same_db = Arc::ptr_eq(&db.rocksdb, &self.rocksdb);
+        if !same_db {
+            return Err(RawStoreError::CrossDBBatch);
+        }
+
+        for (key, value) in new_vals {
+            let key_bytes = be_fix_int_ser(key.borrow())?;
+            let value_bytes = bcs::to_bytes(value.borrow())?;
+            self.batch.put_cf(&db.cf(), key_bytes, value_bytes);
+        }
+        Ok(self)
+    }
+
+    /// Queues a merge for every `(key, value)` pair yielded by `new_vals`.
+    ///
+    /// Keys are encoded with `be_fix_int_ser` and values with BCS. Returns
+    /// `self` so calls can be chained, or `RawStoreError::CrossDBBatch` if `db`
+    /// is not backed by the same database handle as this batch.
+    pub fn merge_batch<J: Borrow<K>, K: Serialize, U: Borrow<V>, V: Serialize>(
+        &mut self,
+        db: &DBMap<K, V>,
+        new_vals: impl IntoIterator<Item = (J, U)>,
+    ) -> Result<&mut Self, RawStoreError> {
+        let same_db = Arc::ptr_eq(&db.rocksdb, &self.rocksdb);
+        if !same_db {
+            return Err(RawStoreError::CrossDBBatch);
+        }
+
+        for (key, value) in new_vals {
+            let key_bytes = be_fix_int_ser(key.borrow())?;
+            let value_bytes = bcs::to_bytes(value.borrow())?;
+            self.batch.merge_cf(&db.cf(), key_bytes, value_bytes);
+        }
+        Ok(self)
+    }
+
+    /// Like `merge_batch`, but the merge operands are supplied as raw bytes and
+    /// are queued as-is, without BCS encoding. This allows merging partial values.
+    ///
+    /// Returns `self` so calls can be chained, or `RawStoreError::CrossDBBatch`
+    /// if `db` is not backed by the same database handle as this batch.
+    pub fn partial_merge_batch<J: Borrow<K>, K: Serialize, V: Serialize, B: AsRef<[u8]>>(
+        &mut self,
+        db: &DBMap<K, V>,
+        new_vals: impl IntoIterator<Item = (J, B)>,
+    ) -> Result<&mut Self, RawStoreError> {
+        let same_db = Arc::ptr_eq(&db.rocksdb, &self.rocksdb);
+        if !same_db {
+            return Err(RawStoreError::CrossDBBatch);
+        }
+        for (key, operand) in new_vals {
+            let key_bytes = be_fix_int_ser(key.borrow())?;
+            self.batch.merge_cf(&db.cf(), key_bytes, operand);
+        }
+        Ok(self)
+    }
+}
+
+/// A transaction on an optimistic-transaction RocksDB database, paired with
+/// the database handle it was created from.
+pub struct DBTransaction<'a> {
+    // Handle of the owning database; compared against each `DBMap` to reject
+    // operations that target a different database.
+    rocksdb: Arc<RocksDB>,
+    // The underlying RocksDB transaction all reads and writes go through.
+    transaction: Transaction<'a, rocksdb::OptimisticTransactionDB>,
+}
+
+impl<'a> DBTransaction<'a> {
+    /// Opens a transaction on `db` through `RocksDB::transaction` and keeps a
+    /// clone of the database handle alongside it.
+    pub fn new(db: &'a Arc<RocksDB>) -> Result<Self, RawStoreError> {
+        let rocksdb = Arc::clone(db);
+        let transaction = db.transaction()?;
+        Ok(Self {
+            rocksdb,
+            transaction,
+        })
+    }
+
+    /// Opens a transaction on `db` through `RocksDB::transaction_without_snapshot`
+    /// and keeps a clone of the database handle alongside it.
+    pub fn new_without_snapshot(db: &'a Arc<RocksDB>) -> Result<Self, RawStoreError> {
+        let rocksdb = Arc::clone(db);
+        let transaction = db.transaction_without_snapshot()?;
+        Ok(Self {
+            rocksdb,
+            transaction,
+        })
+    }
+
+    /// Writes every `(key, value)` pair yielded by `new_vals` into the
+    /// transaction, targeting the column family of `db`.
+    ///
+    /// Keys are encoded with `be_fix_int_ser` and values with BCS. Returns
+    /// `self` so calls can be chained, or `RawStoreError::CrossDBBatch` if `db`
+    /// is not backed by the same database handle as this transaction.
+    pub fn insert_batch<J: Borrow<K>, K: Serialize, U: Borrow<V>, V: Serialize>(
+        &mut self,
+        db: &DBMap<K, V>,
+        new_vals: impl IntoIterator<Item = (J, U)>,
+    ) -> Result<&mut Self, RawStoreError> {
+        let same_db = Arc::ptr_eq(&db.rocksdb, &self.rocksdb);
+        if !same_db {
+            return Err(RawStoreError::CrossDBBatch);
+        }
+
+        for (key, value) in new_vals {
+            let key_bytes = be_fix_int_ser(key.borrow())?;
+            let value_bytes = bcs::to_bytes(value.borrow())?;
+            self.transaction.put_cf(&db.cf(), key_bytes, value_bytes)?;
+        }
+        Ok(self)
+    }
+
+    /// Deletes, within the transaction, every key yielded by `purged_vals` from
+    /// the column family of `db`.
+    ///
+    /// Returns `self` so calls can be chained, or `RawStoreError::CrossDBBatch`
+    /// if `db` is not backed by the same database handle as this transaction.
+    pub fn delete_batch<J: Borrow<K>, K: Serialize, V>(
+        &mut self,
+        db: &DBMap<K, V>,
+        purged_vals: impl IntoIterator<Item = J>,
+    ) -> Result<&mut Self, RawStoreError> {
+        let same_db = Arc::ptr_eq(&db.rocksdb, &self.rocksdb);
+        if !same_db {
+            return Err(RawStoreError::CrossDBBatch);
+        }
+        for key in purged_vals {
+            let key_bytes = be_fix_int_ser(key.borrow())?;
+            self.transaction.delete_cf(&db.cf(), key_bytes)?;
+        }
+        Ok(self)
+    }
+
+    /// Returns the snapshot obtained from the underlying RocksDB transaction.
+    pub fn snapshot(
+        &self,
+    ) -> rocksdb::SnapshotWithThreadMode<'_, Transaction<'a, rocksdb::OptimisticTransactionDB>>
+    {
+        self.transaction.snapshot()
+    }
+
+    /// Reads `key` from the column family of `db` through the transaction's
+    /// `get_for_update_cf_opt` (called with its boolean flag set to `true`) and
+    /// BCS-decodes the stored value.
+    ///
+    /// Returns `Ok(None)` when the key is absent and
+    /// `RawStoreError::CrossDBBatch` if `db` is not backed by the same database
+    /// handle as this transaction.
+    pub fn get_for_update<K: Serialize, V: DeserializeOwned>(
+        &self,
+        db: &DBMap<K, V>,
+        key: &K,
+    ) -> Result<Option<V>, RawStoreError> {
+        let same_db = Arc::ptr_eq(&db.rocksdb, &self.rocksdb);
+        if !same_db {
+            return Err(RawStoreError::CrossDBBatch);
+        }
+        let key_bytes = be_fix_int_ser(key.borrow())?;
+        let stored = self.transaction.get_for_update_cf_opt(
+            &db.cf(),
+            key_bytes,
+            true,
+            &db.opts.readopts(),
+        )?;
+        let decoded = stored
+            .map(|data| bcs::from_bytes::<V>(&data))
+            .transpose()?;
+        Ok(decoded)
+    }
+
+    /// Reads `key` from the column family of `db` through the transaction and
+    /// BCS-decodes the stored value.
+    ///
+    /// Returns `Ok(None)` only when the key is absent.
+    ///
+    /// # Errors
+    ///
+    /// Returns `RawStoreError::RocksDBError` if the read fails, and a
+    /// deserialization error if the stored bytes cannot be decoded as `V`
+    /// (previously such values were silently reported as missing).
+    pub fn get<K: Serialize + DeserializeOwned, V: Serialize + DeserializeOwned>(
+        &self,
+        db: &DBMap<K, V>,
+        key: &K,
+    ) -> Result<Option<V>, RawStoreError> {
+        let key_buf = be_fix_int_ser(key)?;
+        let stored = self
+            .transaction
+            .get_cf_opt(&db.cf(), key_buf, &db.opts.readopts())
+            .map_err(|e| RawStoreError::RocksDBError(e.to_string()))?;
+        match stored {
+            // A value that is present but undecodable is an error, not "not found".
+            Some(bytes) => Ok(Some(bcs::from_bytes::<V>(&bytes)?)),
+            None => Ok(None),
+        }
+    }
+
+    /// Looks up several keys in the column family of `db` through the
+    /// transaction and BCS-decodes each value that is found.
+    ///
+    /// The result has one entry per requested key, `None` marking an absent
+    /// key. The first serialization, read or decoding failure aborts the call.
+    pub fn multi_get<J: Borrow<K>, K: Serialize + DeserializeOwned, V: DeserializeOwned>(
+        &self,
+        db: &DBMap<K, V>,
+        keys: impl IntoIterator<Item = J>,
+    ) -> Result<Vec<Option<V>>, RawStoreError> {
+        let cf = db.cf();
+        let mut lookups = Vec::new();
+        for key in keys {
+            lookups.push((&cf, be_fix_int_ser(key.borrow())?));
+        }
+
+        let fetched = self
+            .transaction
+            .multi_get_cf_opt(lookups, &db.opts.readopts());
+
+        let mut parsed = Vec::new();
+        for entry in fetched {
+            let value = match entry? {
+                Some(data) => Some(bcs::from_bytes(&data)?),
+                None => None,
+            };
+            parsed.push(value);
+        }
+
+        Ok(parsed)
+    }
+
+    /// Builds an `Iter` over the column family of `db`, backed by a raw iterator
+    /// created from this transaction.
+    ///
+    /// No explicit seek is performed here, and none of the optional arguments of
+    /// `Iter::new` (all passed as `None`) are supplied.
+    pub fn iter<K: DeserializeOwned, V: DeserializeOwned>(
+        &'a self,
+        db: &DBMap<K, V>,
+    ) -> Iter<'a, K, V> {
+        let db_iter = self
+            .transaction
+            .raw_iterator_cf_opt(&db.cf(), db.opts.readopts());
+        Iter::new(
+            db.cf.clone(),
+            RocksDBRawIter::OptimisticTransaction(db_iter),
+            None,
+            None,
+            None,
+            None,
+            None,
+        )
+    }
+
+    /// Returns a `Keys` iterator over the column family of `db`, backed by a raw
+    /// iterator created from this transaction and positioned at the first entry.
+    pub fn keys<K: DeserializeOwned, V: DeserializeOwned>(
+        &'a self,
+        db: &DBMap<K, V>,
+    ) -> Keys<'a, K> {
+        let mut db_iter = RocksDBRawIter::OptimisticTransaction(
+            self.transaction
+                .raw_iterator_cf_opt(&db.cf(), db.opts.readopts()),
+        );
+        db_iter.seek_to_first();
+
+        Keys::new(db_iter)
+    }
+
+    /// Returns a `Values` iterator over the column family of `db`, backed by a
+    /// raw iterator created from this transaction and positioned at the first
+    /// entry.
+    pub fn values<K: DeserializeOwned, V: DeserializeOwned>(
+        &'a self,
+        db: &DBMap<K, V>,
+    ) -> Values<'a, V> {
+        let mut db_iter = RocksDBRawIter::OptimisticTransaction(
+            self.transaction
+                .raw_iterator_cf_opt(&db.cf(), db.opts.readopts()),
+        );
+        db_iter.seek_to_first();
+
+        Values::new(db_iter)
+    }
+
+    /// Commits the transaction, consuming it.
+    ///
+    /// RocksDB errors of kind `Busy` or `TryAgain` are reported as
+    /// `RawStoreError::RetryableTransactionError`; every other error is
+    /// converted through `Into`.
+    pub fn commit(self) -> Result<(), RawStoreError> {
+        fail_point!("transaction-commit");
+        if let Err(err) = self.transaction.commit() {
+            return Err(match err.kind() {
+                // empirically, this is what you get when there is a write conflict. it is not
+                // documented whether this is the only time you can get this error.
+                ErrorKind::Busy | ErrorKind::TryAgain => RawStoreError::RetryableTransactionError,
+                _ => err.into(),
+            });
+        }
+        Ok(())
+    }
+}
+
+// Expands `self.method(args...)` into a `match` that forwards the call to the
+// iterator wrapped by whichever `RocksDBRawIter` variant `self` is. Must be
+// used where `Self` names that enum, since the arms are written as `Self::...`.
+macro_rules! delegate_iter_call {
+    ($self:ident.$method:ident($($args:ident),*)) => {
+        match $self {
+            Self::DB(db) => db.$method($($args),*),
+            Self::OptimisticTransactionDB(db) => db.$method($($args),*),
+            Self::OptimisticTransaction(db) => db.$method($($args),*),
+        }
+    }
+}
+
+/// A raw RocksDB iterator over one of the supported sources, so callers can
+/// handle all of them through a single type.
+pub enum RocksDBRawIter<'a> {
+    /// Raw iterator over a plain multi-threaded database.
+    DB(rocksdb::DBRawIteratorWithThreadMode<'a, DBWithThreadMode<MultiThreaded>>),
+    /// Raw iterator over an optimistic-transaction database.
+    OptimisticTransactionDB(
+        rocksdb::DBRawIteratorWithThreadMode<'a, rocksdb::OptimisticTransactionDB<MultiThreaded>>,
+    ),
+    /// Raw iterator created from a transaction of an optimistic-transaction database.
+    OptimisticTransaction(
+        rocksdb::DBRawIteratorWithThreadMode<
+            'a,
+            Transaction<'a, rocksdb::OptimisticTransactionDB<MultiThreaded>>,
+        >,
+    ),
+}
+
+// Every method below forwards to the method of the same name on the wrapped
+// raw iterator; see the rocksdb crate's raw iterator for the exact semantics.
+impl<'a> RocksDBRawIter<'a> {
+    /// Forwards to the wrapped iterator's `valid`.
+    pub fn valid(&self) -> bool {
+        delegate_iter_call!(self.valid())
+    }
+    /// Forwards to the wrapped iterator's `key`.
+    pub fn key(&self) -> Option<&[u8]> {
+        delegate_iter_call!(self.key())
+    }
+    /// Forwards to the wrapped iterator's `value`.
+    pub fn value(&self) -> Option<&[u8]> {
+        delegate_iter_call!(self.value())
+    }
+    /// Forwards to the wrapped iterator's `next`.
+    pub fn next(&mut self) {
+        delegate_iter_call!(self.next())
+    }
+    /// Forwards to the wrapped iterator's `prev`.
+    pub fn prev(&mut self) {
+        delegate_iter_call!(self.prev())
+    }
+    /// Forwards to the wrapped iterator's `seek` with `key`.
+    pub fn seek<K: AsRef<[u8]>>(&mut self, key: K) {
+        delegate_iter_call!(self.seek(key))
+    }
+    /// Forwards to the wrapped iterator's `seek_to_last`.
+    pub fn seek_to_last(&mut self) {
+        delegate_iter_call!(self.seek_to_last())
+    }
+    /// Forwards to the wrapped iterator's `seek_to_first`.
+    pub fn seek_to_first(&mut self) {
+        delegate_iter_call!(self.seek_to_first())
+    }
+    /// Forwards to the wrapped iterator's `seek_for_prev` with `key`.
+    pub fn seek_for_prev<K: AsRef<[u8]>>(&mut self, key: K) {
+        delegate_iter_call!(self.seek_for_prev(key))
+    }
+    /// Forwards to the wrapped iterator's `status`.
+    pub fn status(&self) -> Result<(), rocksdb::Error> {
+        delegate_iter_call!(self.status())
+    }
+}
+
+/// A (non-raw) RocksDB iterator over either a plain multi-threaded database or
+/// an optimistic-transaction database.
+pub enum RocksDBIter<'a> {
+    /// Iterator over a plain multi-threaded database.
+    DB(rocksdb::DBIteratorWithThreadMode<'a, DBWithThreadMode<MultiThreaded>>),
+    /// Iterator over an optimistic-transaction database.
+    OptimisticTransactionDB(
+        rocksdb::DBIteratorWithThreadMode<'a, rocksdb::OptimisticTransactionDB<MultiThreaded>>,
+    ),
+}
+
+// Yields raw `(key, value)` byte pairs, or the error reported by the wrapped
+// iterator, by forwarding `next` to whichever variant is held.
+impl<'a> Iterator for RocksDBIter<'a> {
+    type Item = Result<(Box<[u8]>, Box<[u8]>), Error>;
+    fn next(&mut self) -> Option<Self::Item> {
+        match self {
+            Self::DB(db) => db.next(),
+            Self::OptimisticTransactionDB(db) => db.next(),
+        }
+    }
+}
+
+impl<'a, K, V> Map<'a, K, V> for DBMap<K, V>
+where
+    K: Serialize + DeserializeOwned,
+    V: Serialize + DeserializeOwned,
+{
+    type Error = RawStoreError;
+    type Iterator = Iter<'a, K, V>;
+    type SafeIterator = SafeIter<'a, K, V>;
+    type Keys = Keys<'a, K>;
+    type Values = Values<'a, V>;
+
+    /// Reports whether `key` is present in this column family.
+    #[instrument(level = "trace", skip_all, err)]
+    fn contains_key(&self, key: &K) -> Result<bool, RawStoreError> {
+        let key_bytes = be_fix_int_ser(key)?;
+        let readopts = self.opts.readopts();
+        // [`rocksdb::DBWithThreadMode::key_may_exist_cf`] can have false positives,
+        // but no false negatives, so a negative answer settles the question
+        // without a real read.
+        if !self
+            .rocksdb
+            .key_may_exist_cf(&self.cf(), &key_bytes, &readopts)
+        {
+            return Ok(false);
+        }
+        // A positive answer may be wrong; confirm it with an actual lookup.
+        let found = self
+            .rocksdb
+            .get_pinned_cf(&self.cf(), &key_bytes, &readopts)?;
+        Ok(found.is_some())
+    }
+
+    /// Reads `key` from this column family and BCS-decodes the stored value.
+    ///
+    /// Returns `Ok(None)` when the key is absent. Read latency and the number of
+    /// bytes read are always recorded; read perf-context metrics are reported
+    /// only when the get sampling interval fires.
+    #[instrument(level = "trace", skip_all, err)]
+    fn get(&self, key: &K) -> Result<Option<V>, RawStoreError> {
+        let op_metrics = &self.db_metrics.op_metrics;
+        let _latency = op_metrics
+            .rocksdb_get_latency_seconds
+            .with_label_values(&[&self.cf])
+            .start_timer();
+        // Kept alive until the end of the call; only its presence is inspected.
+        let sampled = self
+            .get_sample_interval
+            .sample()
+            .then(|| RocksDBPerfContext::default());
+        let key_bytes = be_fix_int_ser(key)?;
+        let found = self
+            .rocksdb
+            .get_pinned_cf(&self.cf(), &key_bytes, &self.opts.readopts())?;
+        let bytes_read = found.as_ref().map_or(0.0, |v| v.len() as f64);
+        op_metrics
+            .rocksdb_get_bytes
+            .with_label_values(&[&self.cf])
+            .observe(bytes_read);
+        if sampled.is_some() {
+            self.db_metrics
+                .read_perf_ctx_metrics
+                .report_metrics(&self.cf);
+        }
+        let decoded = found
+            .map(|data| bcs::from_bytes::<V>(&data))
+            .transpose()?;
+        Ok(decoded)
+    }
+
+    /// Reads `key` from this column family and returns a copy of the stored
+    /// bytes without decoding them.
+    ///
+    /// Returns `Ok(None)` when the key is absent. Metrics are recorded exactly
+    /// as for `get`.
+    #[instrument(level = "trace", skip_all, err)]
+    fn get_raw_bytes(&self, key: &K) -> Result<Option<Vec<u8>>, RawStoreError> {
+        let op_metrics = &self.db_metrics.op_metrics;
+        let _latency = op_metrics
+            .rocksdb_get_latency_seconds
+            .with_label_values(&[&self.cf])
+            .start_timer();
+        // Kept alive until the end of the call; only its presence is inspected.
+        let sampled = self
+            .get_sample_interval
+            .sample()
+            .then(|| RocksDBPerfContext::default());
+        let key_bytes = be_fix_int_ser(key)?;
+        let found = self
+            .rocksdb
+            .get_pinned_cf(&self.cf(), &key_bytes, &self.opts.readopts())?;
+        let bytes_read = found.as_ref().map_or(0.0, |v| v.len() as f64);
+        op_metrics
+            .rocksdb_get_bytes
+            .with_label_values(&[&self.cf])
+            .observe(bytes_read);
+        if sampled.is_some() {
+            self.db_metrics
+                .read_perf_ctx_metrics
+                .report_metrics(&self.cf);
+        }
+        Ok(found.map(|data| data.to_vec()))
+    }
+
+    /// Serializes `key` and `value` and writes the pair to this column family.
+    ///
+    /// Put latency is always timed. The payload size and, when the write
+    /// sampling interval fires, the write perf-context metrics are reported
+    /// only after the put has succeeded, mirroring `remove`.
+    #[instrument(level = "trace", skip_all, err)]
+    fn insert(&self, key: &K, value: &V) -> Result<(), RawStoreError> {
+        let _timer = self
+            .db_metrics
+            .op_metrics
+            .rocksdb_put_latency_seconds
+            .with_label_values(&[&self.cf])
+            .start_timer();
+        let perf_ctx = if self.write_sample_interval.sample() {
+            Some(RocksDBPerfContext::default())
+        } else {
+            None
+        };
+        let key_buf = be_fix_int_ser(key)?;
+        let value_buf = bcs::to_bytes(value)?;
+        // Write first: reporting before the put would publish perf-context data
+        // that does not cover this write and count bytes for writes that fail.
+        self.rocksdb
+            .put_cf(&self.cf(), &key_buf, &value_buf, &self.opts.writeopts())?;
+        self.db_metrics
+            .op_metrics
+            .rocksdb_put_bytes
+            .with_label_values(&[&self.cf])
+            .observe((key_buf.len() + value_buf.len()) as f64);
+        if perf_ctx.is_some() {
+            self.db_metrics
+                .write_perf_ctx_metrics
+                .report_metrics(&self.cf);
+        }
+        Ok(())
+    }
+
+    /// Deletes `key` from this column family.
+    ///
+    /// Delete latency is always timed and the delete counter is incremented
+    /// after a successful delete; write perf-context metrics are reported only
+    /// when the write sampling interval fires.
+    #[instrument(level = "trace", skip_all, err)]
+    fn remove(&self, key: &K) -> Result<(), RawStoreError> {
+        let op_metrics = &self.db_metrics.op_metrics;
+        let _latency = op_metrics
+            .rocksdb_delete_latency_seconds
+            .with_label_values(&[&self.cf])
+            .start_timer();
+        // Kept alive until the end of the call; only its presence is inspected.
+        let sampled = self
+            .write_sample_interval
+            .sample()
+            .then(|| RocksDBPerfContext::default());
+        let key_bytes = be_fix_int_ser(key)?;
+        self.rocksdb
+            .delete_cf(&self.cf(), key_bytes, &self.opts.writeopts())?;
+        op_metrics
+            .rocksdb_deletes
+            .with_label_values(&[&self.cf])
+            .inc();
+        if sampled.is_some() {
+            self.db_metrics
+                .write_perf_ctx_metrics
+                .report_metrics(&self.cf);
+        }
+        Ok(())
+    }
+
+    /// Empties the map by dropping its column family and creating it again.
+    ///
+    /// The result of the drop is discarded; only a failure to recreate the
+    /// column family is returned.
+    // NOTE(review): the column family is recreated with `default_db_options()`;
+    // confirm this matches the options it was originally opened with.
+    #[instrument(level = "trace", skip_all, err)]
+    fn clear(&self) -> Result<(), RawStoreError> {
+        let _ = self.rocksdb.drop_cf(&self.cf);
+        self.rocksdb
+            .create_cf(self.cf.clone(), &default_db_options().options)?;
+        Ok(())
+    }
+
+    /// Returns `true` when a fresh `safe_iter` yields no item at all.
+    fn is_empty(&self) -> bool {
+        self.safe_iter().next().is_none()
+    }
+
+    /// Returns an `Iter` over this column family.
+    ///
+    /// The latency timer, the byte and key counters, the optional perf context
+    /// and the metrics handle are all handed over to the returned iterator.
+    fn iter(&'a self) -> Self::Iterator {
+        let op_metrics = &self.db_metrics.op_metrics;
+        let latency = op_metrics
+            .rocksdb_iter_latency_seconds
+            .with_label_values(&[&self.cf])
+            .start_timer();
+        let scanned_bytes = op_metrics
+            .rocksdb_iter_bytes
+            .with_label_values(&[&self.cf]);
+        let scanned_keys = op_metrics
+            .rocksdb_iter_keys
+            .with_label_values(&[&self.cf]);
+        // Only sampled iterations carry a perf context.
+        let perf_ctx = self
+            .iter_sample_interval
+            .sample()
+            .then(|| RocksDBPerfContext::default());
+        let raw_iter = self
+            .rocksdb
+            .raw_iterator_cf(&self.cf(), self.opts.readopts());
+        Iter::new(
+            self.cf.clone(),
+            raw_iter,
+            Some(latency),
+            perf_ctx,
+            Some(scanned_bytes),
+            Some(scanned_keys),
+            Some(self.db_metrics.clone()),
+        )
+    }
+
+    /// Returns a `SafeIter` over this column family, positioned at the first
+    /// entry.
+    ///
+    /// The latency timer, the byte and key counters, the optional perf context
+    /// and the metrics handle are all handed over to the returned iterator.
+    fn safe_iter(&'a self) -> Self::SafeIterator {
+        let op_metrics = &self.db_metrics.op_metrics;
+        let latency = op_metrics
+            .rocksdb_iter_latency_seconds
+            .with_label_values(&[&self.cf])
+            .start_timer();
+        // Only sampled iterations carry a perf context.
+        let perf_ctx = self
+            .iter_sample_interval
+            .sample()
+            .then(|| RocksDBPerfContext::default());
+        let scanned_bytes = op_metrics
+            .rocksdb_iter_bytes
+            .with_label_values(&[&self.cf]);
+        let scanned_keys = op_metrics
+            .rocksdb_iter_keys
+            .with_label_values(&[&self.cf]);
+        let mut raw_iter = self
+            .rocksdb
+            .raw_iterator_cf(&self.cf(), self.opts.readopts());
+        raw_iter.seek_to_first();
+        SafeIter::new(
+            self.cf.clone(),
+            raw_iter,
+            Some(latency),
+            perf_ctx,
+            Some(scanned_bytes),
+            Some(scanned_keys),
+            Some(self.db_metrics.clone()),
+        )
+    }
+
+    /// Returns an iterator visiting each key-value pair in the map. By providing
+    /// bounds for the scan range, the RocksDB scan avoids unnecessary work.
+    ///
+    /// The bounds are applied on top of the map's configured read options, so a
+    /// bounded scan honors the same options as `iter`. A bound of `None` leaves
+    /// that side of the range open.
+    ///
+    /// # Panics
+    ///
+    /// Panics if a provided bound cannot be serialized; the signature has no way
+    /// to report the failure.
+    fn iter_with_bounds(
+        &'a self,
+        lower_bound: Option<K>,
+        upper_bound: Option<K>,
+    ) -> Self::Iterator {
+        let _timer = self
+            .db_metrics
+            .op_metrics
+            .rocksdb_iter_latency_seconds
+            .with_label_values(&[&self.cf])
+            .start_timer();
+        let bytes_scanned = self
+            .db_metrics
+            .op_metrics
+            .rocksdb_iter_bytes
+            .with_label_values(&[&self.cf]);
+        let keys_scanned = self
+            .db_metrics
+            .op_metrics
+            .rocksdb_iter_keys
+            .with_label_values(&[&self.cf]);
+        let _perf_ctx = if self.iter_sample_interval.sample() {
+            Some(RocksDBPerfContext::default())
+        } else {
+            None
+        };
+        // Start from the configured read options rather than the library
+        // defaults, so bounded and unbounded scans read consistently.
+        let mut readopts = self.opts.readopts();
+        if let Some(lower_bound) = lower_bound {
+            let key_buf =
+                be_fix_int_ser(&lower_bound).expect("failed to serialize iterator lower bound");
+            readopts.set_iterate_lower_bound(key_buf);
+        }
+        if let Some(upper_bound) = upper_bound {
+            let key_buf =
+                be_fix_int_ser(&upper_bound).expect("failed to serialize iterator upper bound");
+            readopts.set_iterate_upper_bound(key_buf);
+        }
+        let db_iter = self.rocksdb.raw_iterator_cf(&self.cf(), readopts);
+        Iter::new(
+            self.cf.clone(),
+            db_iter,
+            Some(_timer),
+            _perf_ctx,
+            Some(bytes_scanned),
+            Some(keys_scanned),
+            Some(self.db_metrics.clone()),
+        )
+    }
+
+    /// Returns a `Keys` iterator over this column family, positioned at the
+    /// first entry. No metrics are recorded for this iterator.
+    fn keys(&'a self) -> Self::Keys {
+        let mut db_iter = self
+            .rocksdb
+            .raw_iterator_cf(&self.cf(), self.opts.readopts());
+        db_iter.seek_to_first();
+
+        Keys::new(db_iter)
+    }
+
+    /// Returns a `Values` iterator over this column family, positioned at the
+    /// first entry. No metrics are recorded for this iterator.
+    fn values(&'a self) -> Self::Values {
+        let mut db_iter = self
+            .rocksdb
+            .raw_iterator_cf(&self.cf(), self.opts.readopts());
+        db_iter.seek_to_first();
+
+        Values::new(db_iter)
+    }
+
+    /// Returns the raw stored bytes for each of the given keys, in request
+    /// order, with `None` marking an absent key.
+    ///
+    /// Multi-get latency and the total number of bytes fetched are always
+    /// recorded; read perf-context metrics are reported only when the multiget
+    /// sampling interval fires. The first failed lookup turns the whole call
+    /// into an error.
+    #[instrument(level = "trace", skip_all, err)]
+    fn multi_get_raw_bytes<J>(
+        &self,
+        keys: impl IntoIterator<Item = J>,
+    ) -> Result<Vec<Option<Vec<u8>>>, RawStoreError>
+    where
+        J: Borrow<K>,
+    {
+        let op_metrics = &self.db_metrics.op_metrics;
+        let _latency = op_metrics
+            .rocksdb_multiget_latency_seconds
+            .with_label_values(&[&self.cf])
+            .start_timer();
+        // Kept alive until the end of the call; only its presence is inspected.
+        let sampled = self
+            .multiget_sample_interval
+            .sample()
+            .then(|| RocksDBPerfContext::default());
+        let cf = self.cf();
+        let mut lookups = Vec::new();
+        for key in keys {
+            lookups.push((&cf, be_fix_int_ser(key.borrow())?));
+        }
+        let results = self
+            .rocksdb
+            .multi_get_cf(lookups, &self.opts.readopts());
+        // Failed and absent entries contribute nothing to the byte total.
+        let total_bytes: f64 = results
+            .iter()
+            .map(|entry| match entry {
+                Ok(Some(value)) => value.len() as f64,
+                _ => 0.0,
+            })
+            .sum();
+        op_metrics
+            .rocksdb_multiget_bytes
+            .with_label_values(&[&self.cf])
+            .observe(total_bytes);
+        if sampled.is_some() {
+            self.db_metrics
+                .read_perf_ctx_metrics
+                .report_metrics(&self.cf);
+        }
+        let values = results.into_iter().collect::<Result<_, _>>()?;
+        Ok(values)
+    }
+
+    /// Returns the BCS-decoded value for each of the given keys, in request
+    /// order, with `None` marking an absent key.
+    ///
+    /// The bytes are fetched through `multi_get_raw_bytes`; the first value
+    /// that fails to decode turns the whole call into an error.
+    #[instrument(level = "trace", skip_all, err)]
+    fn multi_get<J>(
+        &self,
+        keys: impl IntoIterator<Item = J>,
+    ) -> Result<Vec<Option<V>>, RawStoreError>
+    where
+        J: Borrow<K>,
+    {
+        let raw_values = self.multi_get_raw_bytes(keys)?;
+        let mut parsed = Vec::with_capacity(raw_values.len());
+        for raw in raw_values {
+            let value = match raw {
+                Some(data) => Some(bcs::from_bytes(&data)?),
+                None => None,
+            };
+            parsed.push(value);
+        }
+
+        Ok(parsed)
+    }
+
+    /// Returns the BCS-decoded value for each of the given keys, in request
+    /// order, fetching at most `chunk_size` keys per lookup. All chunks are
+    /// read from one snapshot taken at the start of the call.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if a key cannot be serialized, the snapshot cannot be
+    /// taken, a lookup fails, or a stored value cannot be decoded.
+    #[instrument(level = "trace", skip_all, err)]
+    fn chunked_multi_get<J>(
+        &self,
+        keys: impl IntoIterator<Item = J>,
+        chunk_size: usize,
+    ) -> Result<Vec<Option<V>>, RawStoreError>
+    where
+        J: Borrow<K>,
+    {
+        let cf = self.cf();
+        // Serialize every key up front so an encoding failure is returned to the
+        // caller instead of panicking in the middle of the chunked reads.
+        let keys_bytes = keys
+            .into_iter()
+            .map(|k| be_fix_int_ser(k.borrow()))
+            .collect::<Result<Vec<_>, _>>()?;
+        let chunked_keys = keys_bytes
+            .into_iter()
+            .map(|k| (&cf, k))
+            .chunks(chunk_size);
+        let snapshot = self.snapshot()?;
+        let mut results = vec![];
+        for chunk in chunked_keys.into_iter() {
+            let chunk_result = snapshot.multi_get_cf(chunk);
+            let values_parsed: Result<Vec<_>, RawStoreError> = chunk_result
+                .into_iter()
+                .map(|value_byte| {
+                    let value_byte = value_byte?;
+                    match value_byte {
+                        Some(data) => Ok(Some(bcs::from_bytes(&data)?)),
+                        None => Ok(None),
+                    }
+                })
+                .collect();
+            results.extend(values_parsed?);
+        }
+        Ok(results)
+    }
+
+    /// Inserts all `key_val_pairs` into this map through a single write batch.
+    ///
+    /// Fails if the pairs cannot be added to the batch or the batch cannot be written.
+    #[instrument(level = "trace", skip_all, err)]
+    fn multi_insert<J, U>(
+        &self,
+        key_val_pairs: impl IntoIterator<Item = (J, U)>,
+    ) -> Result<(), Self::Error>
+    where
+        J: Borrow<K>,
+        U: Borrow<V>,
+    {
+        let mut write_batch = self.batch();
+        write_batch.insert_batch(self, key_val_pairs)?;
+        write_batch.write()
+    }
+
+    /// Removes all `keys` from this map through a single write batch.
+    ///
+    /// Fails if the deletions cannot be added to the batch or the batch cannot be written.
+    #[instrument(level = "trace", skip_all, err)]
+    fn multi_remove<J>(&self, keys: impl IntoIterator<Item = J>) -> Result<(), Self::Error>
+    where
+        J: Borrow<K>,
+    {
+        let mut write_batch = self.batch();
+        write_batch.delete_batch(self, keys)?;
+        write_batch.write()
+    }
+
+    /// Asks the underlying database to catch up with its primary; intended for
+    /// instances opened as a secondary.
+    #[instrument(level = "trace", skip_all, err)]
+    fn try_catch_up_with_primary(&self) -> Result<(), Self::Error> {
+        self.rocksdb.try_catch_up_with_primary()?;
+        Ok(())
+    }
+}
+
+/// Bulk insertion into a `DBMap`: every call stages its pairs in one write
+/// batch and then writes that batch.
+impl<J, K, U, V> TryExtend<(J, U)> for DBMap<K, V>
+where
+    J: Borrow<K>,
+    U: Borrow<V>,
+    K: Serialize,
+    V: Serialize,
+{
+    type Error = RawStoreError;
+
+    /// Inserts every pair produced by `iter`.
+    fn try_extend<T>(&mut self, iter: &mut T) -> Result<(), Self::Error>
+    where
+        T: Iterator<Item = (J, U)>,
+    {
+        let mut write_batch = self.batch();
+        write_batch.insert_batch(self, iter)?;
+        write_batch.write()
+    }
+
+    /// Inserts every pair of `slice`, borrowing keys and values rather than moving them.
+    fn try_extend_from_slice(&mut self, slice: &[(J, U)]) -> Result<(), Self::Error> {
+        let mut write_batch = self.batch();
+        let borrowed_pairs = slice
+            .iter()
+            .map(|(key, value)| (key.borrow(), value.borrow()));
+        write_batch.insert_batch(self, borrowed_pairs)?;
+        write_batch.write()
+    }
+}
+
+/// Reads the environment variable `var_name` and parses it as a `usize`.
+///
+/// Returns `None` when the variable cannot be read, or when its value is not a
+/// valid `usize`; the latter case also logs a warning.
+pub fn read_size_from_env(var_name: &str) -> Option<usize> {
+    let raw = env::var(var_name).ok()?;
+    match raw.parse::<usize>() {
+        Ok(size) => Some(size),
+        Err(e) => {
+            warn!(
+                "Env var {} does not contain valid usize integer: {}",
+                var_name, e
+            );
+            None
+        }
+    }
+}
+
+/// Read/write settings that are turned into RocksDB `ReadOptions` /
+/// `WriteOptions` on demand.
+#[derive(Default, Clone, Debug)]
+pub struct ReadWriteOptions {
+    /// Value passed to `ReadOptions::set_ignore_range_deletions` by `readopts()`.
+    pub ignore_range_deletions: bool,
+}
+
+impl ReadWriteOptions {
+    /// Builds default `ReadOptions` with `ignore_range_deletions` applied.
+    pub fn readopts(&self) -> ReadOptions {
+        let mut opts = ReadOptions::default();
+        opts.set_ignore_range_deletions(self.ignore_range_deletions);
+        opts
+    }
+
+    /// Returns default `WriteOptions`; no field of `self` is consulted.
+    pub fn writeopts(&self) -> WriteOptions {
+        Default::default()
+    }
+}
+
+/// RocksDB options bundled with the read/write options used alongside them.
+///
+/// The `optimize_*` methods consume the value and return the adjusted one, so
+/// they can be chained.
+// TODO: refactor this into a builder pattern, where rocksdb::Options are
+// generated after a call to build().
+#[derive(Default, Clone)]
+pub struct DBOptions {
+    /// The RocksDB options being configured.
+    pub options: rocksdb::Options,
+    /// Read/write options kept next to `options`.
+    pub rw_options: ReadWriteOptions,
+}
+
+impl DBOptions {
+    /// Optimizes lookup performance for tables where no scans are performed.
+    ///
+    /// If a non-trivial number of values can be > 512B in size, it is beneficial to also
+    /// call `optimize_for_large_values_no_scan()`.
+    pub fn optimize_for_point_lookup(mut self, block_cache_size_mb: usize) -> DBOptions {
+        // NOTE: this overwrites the block options.
+        self.options
+            .optimize_for_point_lookup(block_cache_size_mb as u64);
+        self
+    }
+
+    /// Optimizes write and lookup performance for tables which are rarely scanned and have
+    /// large values, by enabling blob files with `min_blob_size` as the minimum blob size.
+    ///
+    /// Returns `self` unchanged when the `ENV_VAR_DISABLE_BLOB_STORAGE` variable is set.
+    /// Sizes derived from environment overrides saturate instead of overflowing.
+    ///
+    /// See <https://rocksdb.org/blog/2021/05/26/integrated-blob-db.html>.
+    pub fn optimize_for_large_values_no_scan(mut self, min_blob_size: u64) -> DBOptions {
+        if env::var(ENV_VAR_DISABLE_BLOB_STORAGE).is_ok() {
+            info!("Large value blob storage optimization is disabled via env var.");
+            return self;
+        }
+
+        // Blob settings.
+        self.options.set_enable_blob_files(true);
+        self.options
+            .set_blob_compression_type(rocksdb::DBCompressionType::Lz4);
+        self.options.set_enable_blob_gc(true);
+        // Since each blob can have non-trivial size overhead, and compression does not work
+        // across blobs, set a min blob size in bytes so small transactions and effects are
+        // kept in sst files.
+        self.options.set_min_blob_size(min_blob_size);
+
+        // Write buffer size: env override or default, given in MiB.
+        let write_buffer_size = read_size_from_env(ENV_VAR_MAX_WRITE_BUFFER_SIZE_MB)
+            .unwrap_or(DEFAULT_MAX_WRITE_BUFFER_SIZE_MB)
+            .saturating_mul(1024 * 1024);
+        self.options.set_write_buffer_size(write_buffer_size);
+        // Since large blobs are not in sst files, reduce the target file size and base level
+        // target size.
+        let target_file_size_base: u64 = 64 << 20;
+        self.options
+            .set_target_file_size_base(target_file_size_base);
+        // Level 1 target size: 64MiB * the level-0 compaction trigger.
+        let max_level_zero_file_num = read_size_from_env(ENV_VAR_L0_NUM_FILES_COMPACTION_TRIGGER)
+            .unwrap_or(DEFAULT_L0_NUM_FILES_COMPACTION_TRIGGER);
+        self.options.set_max_bytes_for_level_base(
+            target_file_size_base.saturating_mul(max_level_zero_file_num as u64),
+        );
+
+        self
+    }
+
+    /// Optimizes tables with a mix of lookup and scan workloads by installing the block
+    /// options from `get_block_options` with a cache of `block_cache_size_mb` MiB.
+    pub fn optimize_for_read(mut self, block_cache_size_mb: usize) -> DBOptions {
+        self.options
+            .set_block_based_table_factory(&get_block_options(block_cache_size_mb));
+        self
+    }
+
+    /// Optimizes a DB receiving significant insertions: both the DB-wide write buffer size
+    /// and the total WAL size are set to `db_max_write_buffer_gb` GiB.
+    ///
+    /// The byte count saturates rather than overflowing or being truncated.
+    pub fn optimize_db_for_write_throughput(mut self, db_max_write_buffer_gb: u64) -> DBOptions {
+        let buffer_bytes = db_max_write_buffer_gb.saturating_mul(1024 * 1024 * 1024);
+        self.options
+            .set_db_write_buffer_size(usize::try_from(buffer_bytes).unwrap_or(usize::MAX));
+        self.options.set_max_total_wal_size(buffer_bytes);
+        self
+    }
+
+    /// Optimizes tables receiving significant insertions.
+    ///
+    /// Every size and count may be overridden through environment variables; values too
+    /// large for the target RocksDB option saturate at that option's maximum instead of
+    /// panicking.
+    pub fn optimize_for_write_throughput(mut self) -> DBOptions {
+        // Write buffer size: env override or default, given in MiB.
+        let write_buffer_size = read_size_from_env(ENV_VAR_MAX_WRITE_BUFFER_SIZE_MB)
+            .unwrap_or(DEFAULT_MAX_WRITE_BUFFER_SIZE_MB)
+            .saturating_mul(1024 * 1024);
+        self.options.set_write_buffer_size(write_buffer_size);
+        // Number of write buffers to keep before slowing down writes.
+        let max_write_buffer_number = read_size_from_env(ENV_VAR_MAX_WRITE_BUFFER_NUMBER)
+            .unwrap_or(DEFAULT_MAX_WRITE_BUFFER_NUMBER);
+        self.options
+            .set_max_write_buffer_number(max_write_buffer_number.try_into().unwrap_or(i32::MAX));
+        // Keep 1 write buffer so recent writes can be read from memory.
+        self.options
+            .set_max_write_buffer_size_to_maintain(write_buffer_size.try_into().unwrap_or(i64::MAX));
+
+        // Compaction trigger for level 0; slowdown and stop triggers are 4x and 5x of it.
+        let max_level_zero_file_num = read_size_from_env(ENV_VAR_L0_NUM_FILES_COMPACTION_TRIGGER)
+            .unwrap_or(DEFAULT_L0_NUM_FILES_COMPACTION_TRIGGER);
+        self.options.set_level_zero_file_num_compaction_trigger(
+            max_level_zero_file_num.try_into().unwrap_or(i32::MAX),
+        );
+        self.options.set_level_zero_slowdown_writes_trigger(
+            max_level_zero_file_num
+                .saturating_mul(4)
+                .try_into()
+                .unwrap_or(i32::MAX),
+        );
+        self.options.set_level_zero_stop_writes_trigger(
+            max_level_zero_file_num
+                .saturating_mul(5)
+                .try_into()
+                .unwrap_or(i32::MAX),
+        );
+
+        // Sst file size: env override or default, given in MiB.
+        self.options.set_target_file_size_base(
+            (read_size_from_env(ENV_VAR_TARGET_FILE_SIZE_BASE_MB)
+                .unwrap_or(DEFAULT_TARGET_FILE_SIZE_BASE_MB) as u64)
+                .saturating_mul(1024 * 1024),
+        );
+
+        // Level 1 target size: write buffer size * level-0 compaction trigger.
+        self.options.set_max_bytes_for_level_base(
+            write_buffer_size.saturating_mul(max_level_zero_file_num) as u64,
+        );
+
+        self
+    }
+
+    /// Optimizes tables receiving significant deletions.
+    // TODO: revisit when intra-epoch pruning is enabled.
+    pub fn optimize_for_pruning(mut self) -> DBOptions {
+        self.options.set_min_write_buffer_number_to_merge(2);
+        self
+    }
+}
+
+/// Creates a default RocksDB option, to be used when RocksDB option is unspecified.
+///
+/// Several values can be overridden through environment variables. Overrides that do not
+/// fit the target RocksDB option saturate at its maximum instead of panicking or being
+/// silently truncated.
+pub fn default_db_options() -> DBOptions {
+    let mut opt = rocksdb::Options::default();
+
+    // One common issue when running tests on Mac is that the default ulimit is too low,
+    // leading to I/O errors such as "Too many open files". Raising fdlimit to bypass it.
+    if let Some(limit) = fdlimit::raise_fd_limit() {
+        // on windows raise_fd_limit return None
+        // A plain `as i32` cast would wrap for very large limits; clamp instead.
+        opt.set_max_open_files(i32::try_from(limit / 8).unwrap_or(i32::MAX));
+    }
+
+    // The table cache is locked for updates and this determines the number
+    // of shards, ie 2^10. Increase in case of lock contentions.
+    opt.set_table_cache_num_shard_bits(10);
+
+    // LSM compression settings
+    opt.set_min_level_to_compress(2);
+    opt.set_compression_type(rocksdb::DBCompressionType::Lz4);
+    opt.set_bottommost_compression_type(rocksdb::DBCompressionType::Zstd);
+    opt.set_bottommost_zstd_max_train_bytes(1024 * 1024, true);
+
+    opt.set_max_background_jobs(
+        read_size_from_env(ENV_VAR_MAX_BACKGROUND_JOBS)
+            .unwrap_or(2)
+            .try_into()
+            .unwrap_or(i32::MAX),
+    );
+
+    // Sui uses multiple RocksDB in a node, so total sizes of write buffers and WAL can be higher
+    // than the limits below.
+    //
+    // RocksDB also exposes the option to configure total write buffer size across multiple instances
+    // via `write_buffer_manager`. But the write buffer flush policy (flushing the buffer receiving
+    // the next write) may not work well. So sticking to per-db write buffer size limit for now.
+    //
+    // The environment variables are only meant to be emergency overrides. They may go away in future.
+    // If you need to modify an option, either update the default value, or override the option in
+    // Sui / Narwhal.
+    opt.set_db_write_buffer_size(
+        read_size_from_env(ENV_VAR_DB_WRITE_BUFFER_SIZE)
+            .unwrap_or(DEFAULT_DB_WRITE_BUFFER_SIZE)
+            .saturating_mul(1024 * 1024),
+    );
+    opt.set_max_total_wal_size(
+        (read_size_from_env(ENV_VAR_DB_WAL_SIZE).unwrap_or(DEFAULT_DB_WAL_SIZE) as u64)
+            .saturating_mul(1024 * 1024),
+    );
+
+    opt.increase_parallelism(4);
+    opt.set_enable_pipelined_write(true);
+
+    opt.set_block_based_table_factory(&get_block_options(128));
+
+    // Set memtable bloomfilter.
+    opt.set_memtable_prefix_bloom_ratio(0.02);
+
+    DBOptions {
+        options: opt,
+        rw_options: ReadWriteOptions::default(),
+    }
+}
+
+/// Builds the block-based table options shared by the default and read-optimized
+/// configurations, with an LRU block cache of `block_cache_size_mb` MiB.
+fn get_block_options(block_cache_size_mb: usize) -> BlockBasedOptions {
+    // Set options mostly similar to those used in optimize_for_point_lookup(),
+    // except non-default binary and hash index, to hopefully reduce lookup latencies
+    // without causing any regression for scanning, with slightly more memory usages.
+    // https://github.com/facebook/rocksdb/blob/11cb6af6e5009c51794641905ca40ce5beec7fee/options/options.cc#L611-L621
+    let mut table_opts = BlockBasedOptions::default();
+
+    // Increase block size to 16KiB.
+    // https://github.com/EighteenZi/rocksdb_wiki/blob/master/Memory-usage-in-RocksDB.md#indexes-and-filter-blocks
+    let block_size_bytes = 16 * 1024;
+    table_opts.set_block_size(block_size_bytes);
+
+    // Configure a block cache.
+    let block_cache = Cache::new_lru_cache(block_cache_size_mb << 20);
+    table_opts.set_block_cache(&block_cache);
+
+    // Set a bloomfilter with 1% false positive rate.
+    table_opts.set_bloom_filter(10.0, false);
+
+    // From https://github.com/EighteenZi/rocksdb_wiki/blob/master/Block-Cache.md#caching-index-and-filter-blocks
+    table_opts.set_pin_l0_filter_and_index_blocks_in_cache(true);
+
+    table_opts
+}
+
+/// Opens a database with options, and a number of column families that are created if they do not exist.
+///
+/// Every column family named in `opt_cfs` is opened with a copy of the database
+/// options (`db_options`, or the defaults from `default_db_options()` when `None`).
+#[instrument(level="debug", skip_all, fields(path = ?path.as_ref(), cf = ?opt_cfs), err)]
+pub fn open_cf<P: AsRef<Path>>(
+    path: P,
+    db_options: Option<rocksdb::Options>,
+    metric_conf: MetricConf,
+    opt_cfs: &[&str],
+) -> Result<Arc<RocksDB>, RawStoreError> {
+    let options = db_options.unwrap_or_else(|| default_db_options().options);
+    let column_descriptors: Vec<_> = opt_cfs
+        .iter()
+        .map(|name| (*name, options.clone()))
+        .collect();
+    // `options` is not used again, so move it instead of cloning it one more time.
+    open_cf_opts(path, Some(options), metric_conf, &column_descriptors)
+}
+
+/// Resolves the database options to open with: the caller's options (or the
+/// defaults when `None`), with creation of a missing database and of missing
+/// column families switched on.
+fn prepare_db_options(db_options: Option<rocksdb::Options>) -> rocksdb::Options {
+    let mut resolved = match db_options {
+        Some(custom) => custom,
+        None => default_db_options().options,
+    };
+    resolved.create_if_missing(true);
+    resolved.create_missing_column_families(true);
+    resolved
+}
+
+/// Opens a database with options, and a number of column families with individual options that are created if they do not exist.
+///
+/// Column families reported by `populate_missing_cfs` for `path` but not listed
+/// in `opt_cfs` are opened as well. The returned handle wraps a
+/// `DBWithThreadMode<MultiThreaded>` and records `metric_conf` and the database path.
+#[instrument(level="debug", skip_all, fields(path = ?path.as_ref()), err)]
+pub fn open_cf_opts<P: AsRef<Path>>(
+    path: P,
+    db_options: Option<rocksdb::Options>,
+    metric_conf: MetricConf,
+    opt_cfs: &[(&str, rocksdb::Options)],
+) -> Result<Arc<RocksDB>, RawStoreError> {
+    let path = path.as_ref();
+    // In the simulator, we intercept the wall clock in the test thread only. This causes problems
+    // because rocksdb uses the simulated clock when creating its background threads, but then
+    // those threads see the real wall clock (because they are not the test thread), which causes
+    // rocksdb to panic. The `nondeterministic` macro evaluates expressions in new threads, which
+    // resolves the issue.
+    //
+    // This is a no-op in non-simulator builds.
+
+    // The column family list is computed before entering the macro.
+    let cfs = populate_missing_cfs(opt_cfs, path)?;
+    nondeterministic!({
+        let options = prepare_db_options(db_options);
+        let rocksdb = {
+            rocksdb::DBWithThreadMode::<MultiThreaded>::open_cf_descriptors(
+                &options,
+                path,
+                cfs.into_iter()
+                    .map(|(name, opts)| ColumnFamilyDescriptor::new(name, opts)),
+            )?
+        };
+        Ok(Arc::new(RocksDB::DBWithThreadMode(
+            DBWithThreadModeWrapper {
+                underlying: rocksdb,
+                metric_conf,
+                db_path: PathBuf::from(path),
+            },
+        )))
+    })
+}
+
+/// Opens an optimistic-transaction database with options, and a number of column families
+/// with individual options that are created if they do not exist.
+///
+/// Same flow as `open_cf_opts`, except that the database is opened as a
+/// `rocksdb::OptimisticTransactionDB<MultiThreaded>` and wrapped in
+/// `RocksDB::OptimisticTransactionDB`.
+#[instrument(level="debug", skip_all, fields(path = ?path.as_ref()), err)]
+pub fn open_cf_opts_transactional<P: AsRef<Path>>(
+    path: P,
+    db_options: Option<rocksdb::Options>,
+    metric_conf: MetricConf,
+    opt_cfs: &[(&str, rocksdb::Options)],
+) -> Result<Arc<RocksDB>, RawStoreError> {
+    let path = path.as_ref();
+    let cfs = populate_missing_cfs(opt_cfs, path)?;
+    // See comment above for explanation of why nondeterministic is necessary here.
+    nondeterministic!({
+        let options = prepare_db_options(db_options);
+        let rocksdb = rocksdb::OptimisticTransactionDB::<MultiThreaded>::open_cf_descriptors(
+            &options,
+            path,
+            cfs.into_iter()
+                .map(|(name, opts)| ColumnFamilyDescriptor::new(name, opts)),
+        )?;
+        Ok(Arc::new(RocksDB::OptimisticTransactionDB(
+            OptimisticTransactionDBWrapper {
+                underlying: rocksdb,
+                metric_conf,
+                db_path: PathBuf::from(path),
+            },
+        )))
+    })
+}
+
+/// Opens the database at `primary_path` as a secondary instance, with a number of column
+/// families with individual options.
+///
+/// * When `secondary_path` is `None`, the secondary path is derived from `primary_path` by
+///   replacing its last component with `SECONDARY`.
+/// * Column families listed for the primary but missing from `opt_cfs` are opened with the
+///   default options; a failure to list them is treated as "no extra column families".
+/// * `try_catch_up_with_primary` is called once before the handle is returned.
+///
+/// The returned wrapper records the secondary path as its `db_path`.
+pub fn open_cf_opts_secondary<P: AsRef<Path>>(
+    primary_path: P,
+    secondary_path: Option<P>,
+    db_options: Option<rocksdb::Options>,
+    metric_conf: MetricConf,
+    opt_cfs: &[(&str, rocksdb::Options)],
+) -> Result<Arc<RocksDB>, RawStoreError> {
+    let primary_path = primary_path.as_ref();
+    let secondary_path = secondary_path.as_ref().map(|p| p.as_ref());
+    // See comment above for explanation of why nondeterministic is necessary here.
+    nondeterministic!({
+        // Customize database options
+        let mut options = db_options.unwrap_or_else(|| default_db_options().options);
+
+        // The outcome of raising the fd limit is intentionally not inspected here.
+        fdlimit::raise_fd_limit();
+        // This is a requirement by RocksDB when opening as secondary
+        options.set_max_open_files(-1);
+
+        let mut opt_cfs: std::collections::HashMap<_, _> = opt_cfs.iter().cloned().collect();
+        let cfs = rocksdb::DBWithThreadMode::<MultiThreaded>::list_cf(&options, primary_path)
+            .ok()
+            .unwrap_or_default();
+
+        let default_db_options = default_db_options();
+        // Add CFs not explicitly listed
+        for cf_key in cfs.iter() {
+            if !opt_cfs.contains_key(&cf_key[..]) {
+                opt_cfs.insert(cf_key, default_db_options.options.clone());
+            }
+        }
+
+        let primary_path = primary_path.to_path_buf();
+        // Default secondary location: sibling of the primary directory named "SECONDARY".
+        let secondary_path = secondary_path.map(|q| q.to_path_buf()).unwrap_or_else(|| {
+            let mut s = primary_path.clone();
+            s.pop();
+            s.push("SECONDARY");
+            s.as_path().to_path_buf()
+        });
+
+        let rocksdb = {
+            options.create_if_missing(true);
+            options.create_missing_column_families(true);
+            let db = rocksdb::DBWithThreadMode::<MultiThreaded>::open_cf_descriptors_as_secondary(
+                &options,
+                &primary_path,
+                &secondary_path,
+                opt_cfs
+                    .iter()
+                    .map(|(name, opts)| ColumnFamilyDescriptor::new(*name, (*opts).clone())),
+            )?;
+            db.try_catch_up_with_primary()?;
+            db
+        };
+        Ok(Arc::new(RocksDB::DBWithThreadMode(
+            DBWithThreadModeWrapper {
+                underlying: rocksdb,
+                metric_conf,
+                db_path: secondary_path,
+            },
+        )))
+    })
+}
+
+/// Lists the column families ("tables") of the database at `path`, leaving out
+/// the `default` column family.
+///
+/// Fails if the column families cannot be listed.
+pub fn list_tables(path: std::path::PathBuf) -> eyre::Result<Vec<String>> {
+    const DB_DEFAULT_CF_NAME: &str = "default";
+
+    let opts = rocksdb::Options::default();
+    let column_families =
+        rocksdb::DBWithThreadMode::<rocksdb::MultiThreaded>::list_cf(&opts, path)?;
+    // The `default` table is not used. Consuming the list avoids cloning each name.
+    Ok(column_families
+        .into_iter()
+        .filter(|name| name != DB_DEFAULT_CF_NAME)
+        .collect())
+}
+
+/// Serializes `t` with bincode using big-endian byte order and fixed-width integers.
+///
+/// This is the key encoding used for RocksDB lookups and iterator seeks in this
+/// module. RocksDB stores keys in BE and has a seek operator on iterators, see
+/// `https://github.com/facebook/rocksdb/wiki/Iterator#introduction`.
+///
+/// Fails if bincode cannot serialize `t`.
+#[inline]
+pub fn be_fix_int_ser<S>(t: &S) -> Result<Vec<u8>, RawStoreError>
+where
+    S: ?Sized + serde::Serialize,
+{
+    let encoder = bincode::DefaultOptions::new()
+        .with_big_endian()
+        .with_fixint_encoding();
+    Ok(encoder.serialize(t)?)
+}
+
+/// Newtype around a `BTreeMap` from `String` keys to `DBOptions`.
+#[derive(Clone)]
+pub struct DBMapTableConfigMap(BTreeMap<String, DBOptions>);
+
+impl DBMapTableConfigMap {
+    /// Wraps `map` without modifying it.
+    pub fn new(map: BTreeMap<String, DBOptions>) -> Self {
+        DBMapTableConfigMap(map)
+    }
+
+    /// Returns a clone of the wrapped map.
+    pub fn to_map(&self) -> BTreeMap<String, DBOptions> {
+        let DBMapTableConfigMap(inner) = self;
+        inner.clone()
+    }
+}
+
+/// How a RocksDB instance is meant to be accessed.
+pub enum RocksDBAccessType {
+    /// Access as the primary instance.
+    Primary,
+    /// Access as a secondary instance.
+    /// NOTE(review): the optional path is presumably the secondary's own
+    /// directory -- confirm against the call sites.
+    Secondary(Option<PathBuf>),
+}
+
+/// Destroys the database at `path` via `rocksdb::DB::destroy`, using default options.
+pub fn safe_drop_db(path: PathBuf) -> Result<(), rocksdb::Error> {
+    let destroy_opts = rocksdb::Options::default();
+    rocksdb::DB::destroy(&destroy_opts, path)
+}
+
+/// Builds the full list of column families to open at `path`.
+///
+/// Column families already listed for the database but absent from `input_cfs`
+/// come first, each with default options, followed by every entry of `input_cfs`
+/// in its original order. A failure to list the existing column families is
+/// treated as "none exist".
+fn populate_missing_cfs(
+    input_cfs: &[(&str, rocksdb::Options)],
+    path: &Path,
+) -> Result<Vec<(String, rocksdb::Options)>, rocksdb::Error> {
+    let requested: HashSet<_> = input_cfs.iter().map(|(name, _)| *name).collect();
+    let on_disk =
+        rocksdb::DBWithThreadMode::<MultiThreaded>::list_cf(&rocksdb::Options::default(), path)
+            .unwrap_or_default();
+
+    let unlisted = on_disk
+        .into_iter()
+        .filter(|cf_name| !requested.contains(cf_name.as_str()))
+        .map(|cf_name| (cf_name, rocksdb::Options::default()));
+    let listed = input_cfs
+        .iter()
+        .map(|(name, opts)| (name.to_string(), opts.clone()));
+
+    Ok(unlisted.chain(listed).collect())
+}
diff --git a/moveos/raw-store/src/rocks/safe_iter.rs b/moveos/raw-store/src/rocks/safe_iter.rs
new file mode 100644
index 000000000..d57360cf0
--- /dev/null
+++ b/moveos/raw-store/src/rocks/safe_iter.rs
@@ -0,0 +1,161 @@
+// Copyright (c) RoochNetwork
+// SPDX-License-Identifier: Apache-2.0
+
+// Copyright (c) Mysten Labs, Inc.
+// SPDX-License-Identifier: Apache-2.0
+use std::{marker::PhantomData, sync::Arc};
+
+use bincode::Options;
+use prometheus::{Histogram, HistogramTimer};
+use rocksdb::Direction;
+
+use crate::metrics::{DBMetrics, RocksDBPerfContext};
+
+use super::{be_fix_int_ser, errors::RawStoreError, RocksDBRawIter};
+use serde::{de::DeserializeOwned, Serialize};
+
+/// An iterator over all key-value pairs in a data map.
+///
+/// Items are `Result`s, so failures are reported to the caller through the item type.
+pub struct SafeIter<'a, K, V> {
+    /// Column family name, used as the label when reporting perf-context metrics on drop.
+    cf_name: String,
+    /// The raw RocksDB cursor being driven.
+    db_iter: RocksDBRawIter<'a>,
+    _phantom: PhantomData<(K, V)>,
+    /// Direction in which `next` moves the cursor.
+    direction: Direction,
+    /// Held only for its lifetime; never read.
+    _timer: Option<HistogramTimer>,
+    /// Held only for its lifetime; never read.
+    _perf_ctx: Option<RocksDBPerfContext>,
+    /// Histogram that receives `bytes_scanned_counter` on drop.
+    bytes_scanned: Option<Histogram>,
+    /// Histogram that receives `keys_returned_counter` on drop.
+    keys_scanned: Option<Histogram>,
+    /// When set, read perf-context metrics are reported on drop.
+    db_metrics: Option<Arc<DBMetrics>>,
+    /// Sum of raw key and value lengths visited so far.
+    bytes_scanned_counter: usize,
+    /// Number of entries visited so far.
+    keys_returned_counter: usize,
+}
+
+impl<'a, K: DeserializeOwned, V: DeserializeOwned> SafeIter<'a, K, V> {
+    /// Wraps a raw RocksDB iterator for the column family `cf_name`.
+    ///
+    /// The new iterator moves forward and starts with both scan counters at zero;
+    /// the timer, perf context, histograms and metrics handle are stored as given.
+    pub(super) fn new(
+        cf_name: String,
+        db_iter: RocksDBRawIter<'a>,
+        _timer: Option<HistogramTimer>,
+        _perf_ctx: Option<RocksDBPerfContext>,
+        bytes_scanned: Option<Histogram>,
+        keys_scanned: Option<Histogram>,
+        db_metrics: Option<Arc<DBMetrics>>,
+    ) -> Self {
+        SafeIter {
+            // Cursor state.
+            db_iter,
+            direction: Direction::Forward,
+            _phantom: PhantomData,
+            // Metrics plumbing.
+            cf_name,
+            db_metrics,
+            bytes_scanned,
+            keys_scanned,
+            bytes_scanned_counter: 0,
+            keys_returned_counter: 0,
+            // Guards kept alive for as long as the iterator lives.
+            _timer,
+            _perf_ctx,
+        }
+    }
+}
+
+impl<'a, K: DeserializeOwned, V: DeserializeOwned> Iterator for SafeIter<'a, K, V> {
+    type Item = Result<(K, V), RawStoreError>;
+
+    /// Yields the entry under the cursor, then moves the cursor one step in the
+    /// current direction.
+    ///
+    /// A key or value that fails to deserialize is reported as `Some(Err(_))`
+    /// instead of silently ending the iteration. The cursor has already moved
+    /// past the offending entry, so the caller may keep iterating. Once the
+    /// cursor is no longer valid, the iterator status decides between a clean
+    /// end (`None`) and a RocksDB error.
+    fn next(&mut self) -> Option<Self::Item> {
+        if !self.db_iter.valid() {
+            return match self.db_iter.status() {
+                Ok(_) => None,
+                Err(err) => Some(Err(RawStoreError::RocksDBError(format!("{err}")))),
+            };
+        }
+
+        let config = bincode::DefaultOptions::new()
+            .with_big_endian()
+            .with_fixint_encoding();
+        let raw_key = self
+            .db_iter
+            .key()
+            .expect("Valid iterator failed to get key");
+        let raw_value = self
+            .db_iter
+            .value()
+            .expect("Valid iterator failed to get value");
+        self.bytes_scanned_counter += raw_key.len() + raw_value.len();
+        self.keys_returned_counter += 1;
+
+        // Decode into owned values before moving the cursor; keep the errors
+        // so they can be handed to the caller.
+        let key: Result<K, RawStoreError> =
+            config.deserialize(raw_key).map_err(RawStoreError::from);
+        let value: Result<V, RawStoreError> =
+            bcs::from_bytes(raw_value).map_err(RawStoreError::from);
+
+        match self.direction {
+            Direction::Forward => self.db_iter.next(),
+            Direction::Reverse => self.db_iter.prev(),
+        }
+
+        Some(key.and_then(|k| value.map(|v| (k, v))))
+    }
+}
+
+impl<'a, K, V> Drop for SafeIter<'a, K, V> {
+    /// Publishes the scan counters to their histograms (when present) and, if a
+    /// metrics handle was supplied, reports the read perf-context metrics for
+    /// this column family.
+    fn drop(&mut self) {
+        let scanned_bytes = self.bytes_scanned_counter as f64;
+        let returned_keys = self.keys_returned_counter as f64;
+
+        if let Some(histogram) = self.bytes_scanned.take() {
+            histogram.observe(scanned_bytes);
+        }
+        if let Some(histogram) = self.keys_scanned.take() {
+            histogram.observe(returned_keys);
+        }
+        if let Some(metrics) = self.db_metrics.take() {
+            metrics.read_perf_ctx_metrics.report_metrics(&self.cf_name);
+        }
+    }
+}
+
+impl<'a, K: Serialize, V> SafeIter<'a, K, V> {
+    /// Skips all the elements that are smaller than the given key,
+    /// and lands either on the key or on the first one greater than
+    /// the key.
+    ///
+    /// Fails if `key` cannot be serialized.
+    pub fn skip_to(mut self, key: &K) -> Result<Self, RawStoreError> {
+        let target = be_fix_int_ser(key)?;
+        self.db_iter.seek(target);
+        Ok(self)
+    }
+
+    /// Moves the iterator to the given element, or to the one prior
+    /// to it if it does not exist. If there is no element prior to
+    /// it, the result is an empty iterator.
+    ///
+    /// Fails if `key` cannot be serialized.
+    pub fn skip_prior_to(mut self, key: &K) -> Result<Self, RawStoreError> {
+        let target = be_fix_int_ser(key)?;
+        self.db_iter.seek_for_prev(target);
+        Ok(self)
+    }
+
+    /// Seeks to the last key in the database (at this column family).
+    pub fn skip_to_last(self) -> Self {
+        let mut positioned = self;
+        positioned.db_iter.seek_to_last();
+        positioned
+    }
+
+    /// Flips the direction of iteration to reverse and wraps the
+    /// iterator in a `SafeRevIter`. Every call to its `next` method
+    /// moves one element backwards from the current position.
+    pub fn reverse(self) -> SafeRevIter<'a, K, V> {
+        let mut reversed = self;
+        reversed.direction = Direction::Reverse;
+        SafeRevIter::new(reversed)
+    }
+}
+
+/// An iterator with a reverted direction to the original. The `SafeRevIter`
+/// is hosting an iteration which is consuming in the opposing direction.
+/// It's not possible to do further manipulation (e.g. re-reverse) to the
+/// iterator.
+pub struct SafeRevIter<'a, K, V> {
+    /// The wrapped iterator, to which `next` is delegated.
+    iter: SafeIter<'a, K, V>,
+}
+
+impl<'a, K, V> SafeRevIter<'a, K, V> {
+    /// Wraps `iter` as-is; the direction is not touched here.
+    fn new(iter: SafeIter<'a, K, V>) -> Self {
+        Self { iter }
+    }
+}
+
+impl<'a, K: DeserializeOwned, V: DeserializeOwned> Iterator for SafeRevIter<'a, K, V> {
+    type Item = Result<(K, V), RawStoreError>;
+
+    /// Will give the next item backwards, by delegating to the wrapped
+    /// `SafeIter`.
+    fn next(&mut self) -> Option<Self::Item> {
+        self.iter.next()
+    }
+}
diff --git a/moveos/raw-store/src/rocks/tests.rs b/moveos/raw-store/src/rocks/tests.rs
new file mode 100644
index 000000000..ee6a8a738
--- /dev/null
+++ b/moveos/raw-store/src/rocks/tests.rs
@@ -0,0 +1,1154 @@
+// Copyright (c) RoochNetwork
+// SPDX-License-Identifier: Apache-2.0
+
+// Copyright (c) Mysten Labs, Inc.
+// SPDX-License-Identifier: Apache-2.0
+use super::*;
+// use crate::rocks::util::{is_ref_count_value, reference_count_merge_operator};
+use crate::{reopen, retry_transaction, retry_transaction_forever};
+use rstest::rstest;
+use serde::Deserialize;
+
+fn temp_dir() -> std::path::PathBuf {
+    // Convert the freshly created `TempDir` handle into a plain path for the caller.
+    let scratch = tempfile::tempdir().expect("Failed to open temporary directory");
+    scratch.into_path()
+}
+
+#[rstest]
+#[tokio::test]
+async fn test_open(#[values(true, false)] is_transactional: bool) {
+    let _map: DBMap<u32, String> = open_map(temp_dir(), None, is_transactional);
+}
+
+#[rstest]
+#[tokio::test]
+async fn test_reopen(#[values(true, false)] is_transactional: bool) {
+    // Populate a fresh map first; its underlying handle is reused for the re-open below.
+    let original = open_map::<_, u32, String>(temp_dir(), None, is_transactional);
+    original
+        .insert(&123456789, &"123456789".to_string())
+        .expect("Failed to insert");
+
+    let reopened =
+        DBMap::<u32, String>::reopen(&original.rocksdb, None, &ReadWriteOptions::default())
+            .expect("Failed to re-open storage");
+    let found = reopened.contains_key(&123456789);
+    assert!(found.expect("Failed to retrieve item in storage"));
+}
+
+#[tokio::test]
+async fn test_reopen_macro() {
+    const FIRST_CF: &str = "First_CF";
+    const SECOND_CF: &str = "Second_CF";
+
+    // Both column families are created up front so the macro can re-open them by name.
+    let cf_names = [FIRST_CF, SECOND_CF];
+    let rocks = open_cf(temp_dir(), None, MetricConf::default(), &cf_names).unwrap();
+
+    let (db_map_1, db_map_2) = reopen!(&rocks, FIRST_CF;<i32, String>, SECOND_CF;<i32, String>);
+
+    // Each handle must be bound to the column family it was requested for.
+    assert_eq!(db_map_1.cf, FIRST_CF);
+    assert_eq!(db_map_2.cf, SECOND_CF);
+
+    // Writes through either handle are expected to succeed.
+    let first_entries = (1..100).map(|i| (i, i.to_string()));
+    let inserted_first = db_map_1.multi_insert(first_entries);
+    assert!(inserted_first.is_ok());
+
+    let second_entries = (1..100).map(|i| (i, i.to_string()));
+    let inserted_second = db_map_2.multi_insert(second_entries);
+    assert!(inserted_second.is_ok());
+}
+
+#[rstest]
+#[tokio::test]
+async fn test_wrong_reopen(#[values(true, false)] is_transactional: bool) {
+    let store = open_rocksdb(temp_dir(), &["foo", "bar", "baz"], is_transactional);
+    let reopened = DBMap::<u8, u8>::reopen(&store, Some("quux"), &ReadWriteOptions::default());
+    assert!(reopened.is_err()); // "quux" is not among the column families opened above
+}
+
+// #[rstest]
+// #[tokio::test]
+// async fn test_contains_key(#[values(true, false)] is_transactional: bool) {
+//     let db = open_map(temp_dir(), None, is_transactional);
+//
+//     db.insert(&123456789, &"123456789".to_string())
+//         .expect("Failed to insert");
+//     assert!(db
+//         .contains_key(&123456789)
+//         .expect("Failed to call contains key"));
+//     assert!(!db
+//         .contains_key(&000000000)
+//         .expect("Failed to call contains key"));
+// }
+
+// #[rstest]
+// #[tokio::test]
+// async fn test_get(#[values(true, false)] is_transactional: bool) {
+//     let db = open_map(temp_dir(), None, is_transactional);
+//
+//     db.insert(&123456789, &"123456789".to_string())
+//         .expect("Failed to insert");
+//     assert_eq!(
+//         Some("123456789".to_string()),
+//         db.get(&123456789).expect("Failed to get")
+//     );
+//     assert_eq!(None, db.get(&000000000).expect("Failed to get"));
+// }
+
+// #[rstest]
+// #[tokio::test]
+// async fn test_get_raw(#[values(true, false)] is_transactional: bool) {
+//     let db = open_map(temp_dir(), None, is_transactional);
+//
+//     db.insert(&123456789, &"123456789".to_string())
+//         .expect("Failed to insert");
+//
+//     let val_bytes = db
+//         .get_raw_bytes(&123456789)
+//         .expect("Failed to get_raw_bytes")
+//         .unwrap();
+//
+//     assert_eq!(bcs::to_bytes(&"123456789".to_string()).unwrap(), val_bytes);
+//     assert_eq!(
+//         None,
+//         db.get_raw_bytes(&000000000)
+//             .expect("Failed to get_raw_bytes")
+//     );
+// }
+
+#[rstest]
+#[tokio::test]
+async fn test_multi_get(#[values(true, false)] is_transactional: bool) {
+    let map = open_map(temp_dir(), None, is_transactional);
+
+    for stored in [123, 456] {
+        map.insert(&stored, &stored.to_string())
+            .expect("Failed to insert");
+    }
+
+    let fetched = map.multi_get([123, 456, 789]).expect("Failed to multi get");
+
+    assert_eq!(fetched.len(), 3);
+    assert_eq!(fetched[0], Some("123".to_string()));
+    assert_eq!(fetched[1], Some("456".to_string()));
+    assert_eq!(fetched[2], None);
+}
+
+// #[rstest]
+// #[tokio::test]
+// async fn test_chunked_multi_get(#[values(true, false)] is_transactional: bool) {
+//     let db = open_map(temp_dir(), None, is_transactional);
+//
+//     db.insert(&123, &"123".to_string())
+//         .expect("Failed to insert");
+//     db.insert(&456, &"456".to_string())
+//         .expect("Failed to insert");
+//
+//     let result = db
+//         .chunked_multi_get([123, 456, 789], 1)
+//         .expect("Failed to chunk multi get");
+//
+//     assert_eq!(result.len(), 3);
+//     assert_eq!(result[0], Some("123".to_string()));
+//     assert_eq!(result[1], Some("456".to_string()));
+//     assert_eq!(result[2], None);
+// }
+
+#[rstest]
+#[tokio::test]
+async fn test_skip(#[values(true, false)] is_transactional: bool) {
+    let map = open_map(temp_dir(), None, is_transactional);
+
+    map.insert(&123, &"123".to_string())
+        .expect("Failed to insert");
+    map.insert(&456, &"456".to_string())
+        .expect("Failed to insert");
+    map.insert(&789, &"789".to_string())
+        .expect("Failed to insert");
+
+    // Seeking to a stored key drops every entry ordered before it
+    let entries: Vec<_> = map.safe_iter().skip_to(&456).expect("Seek failed").collect();
+    assert_eq!(entries.len(), 2);
+    assert_eq!(entries[0], Ok((456, "456".to_string())));
+    assert_eq!(entries[1], Ok((789, "789".to_string())));
+
+    // The keys-only iterator seeks the same way
+    let only_keys: Vec<_> = map.keys().skip_to(&456).expect("Seek failed").collect();
+    assert_eq!(only_keys.len(), 2);
+    assert_eq!(only_keys[0], Ok(456));
+    assert_eq!(only_keys[1], Ok(789));
+
+    // Seeking beyond the largest key leaves nothing to iterate
+    let entries_past_end = map.safe_iter().skip_to(&999).expect("Seek failed").count();
+    assert_eq!(entries_past_end, 0);
+
+    // ... and the keys-only iterator agrees
+    let keys_past_end = map.keys().skip_to(&999).expect("Seek failed").count();
+    assert_eq!(keys_past_end, 0);
+
+    // Jumping to the last entry yields the largest key first
+    let mut from_last = map.safe_iter().skip_to_last();
+    assert_eq!(from_last.next(), Some(Ok((789, "789".to_string()))));
+
+    // ... and the keys-only iterator agrees
+    let mut keys_from_last = map.keys().skip_to_last();
+    assert_eq!(keys_from_last.next(), Some(Ok(789)));
+
+    // Seeking to a key below the smallest stored one keeps all three entries (keys alike)
+    let entries_from_zero = map.safe_iter().skip_to(&0).expect("Skip failed").count();
+    assert_eq!(entries_from_zero, 3);
+
+    let keys_from_zero = map.keys().skip_to(&0).expect("Skip failed").count();
+    assert_eq!(keys_from_zero, 3);
+}
+
+#[rstest]
+#[tokio::test]
+async fn test_skip_to_previous_simple(#[values(true, false)] is_transactional: bool) {
+    let map = open_map(temp_dir(), None, is_transactional);
+
+    map.insert(&123, &"123".to_string())
+        .expect("Failed to insert");
+    map.insert(&456, &"456".to_string())
+        .expect("Failed to insert");
+    map.insert(&789, &"789".to_string())
+        .expect("Failed to insert");
+
+    // Seeking prior to a key past the end lands on the last stored entry
+    let entries: Vec<_> = map
+        .safe_iter()
+        .skip_prior_to(&999)
+        .expect("Seek failed")
+        .collect();
+    assert_eq!(entries.len(), 1);
+    assert_eq!(entries[0], Ok((789, "789".to_string())));
+    // The keys-only iterator lands on the same entry
+    let only_keys: Vec<_> = map
+        .keys()
+        .skip_prior_to(&999)
+        .expect("Seek failed")
+        .collect();
+    assert_eq!(only_keys.len(), 1);
+    assert_eq!(only_keys[0], Ok(789));
+
+    // Seeking prior to a key below the first stored one:
+    // nothing precedes it, so the iterator comes back empty
+    assert_eq!(
+        map.safe_iter()
+            .skip_prior_to(&0)
+            .expect("Seek failed")
+            .count(),
+        0
+    );
+    // The keys-only iterator is empty as well
+    assert_eq!(
+        map.keys().skip_prior_to(&0).expect("Seek failed").count(),
+        0
+    );
+}
+
+#[rstest]
+#[tokio::test]
+async fn test_iter_skip_to_previous_gap(#[values(true, false)] is_transactional: bool) {
+    let map = open_map(temp_dir(), None, is_transactional);
+
+    // Store every key in 1..100 except 50, leaving a gap to seek into
+    for present in (1..100).filter(|candidate| *candidate != 50) {
+        map.insert(&present, &present.to_string()).unwrap();
+    }
+
+    // Seeking prior to the absent key 50 starts the iteration at 49, its closest predecessor
+    let entries_from_gap = map.safe_iter().skip_prior_to(&50).unwrap();
+
+    assert_eq!(
+        (49..50)
+            .chain(51..100)
+            .map(|i| Ok((i, i.to_string())))
+            .collect::<Vec<_>>(),
+        entries_from_gap.collect::<Vec<_>>()
+    );
+
+    // The keys-only iterator starts from the same predecessor
+    let keys_from_gap = map.keys().skip_prior_to(&50).unwrap();
+
+    assert_eq!(
+        (49..50).chain(51..100).map(Ok).collect::<Vec<_>>(),
+        keys_from_gap.collect::<Vec<_>>()
+    );
+}
+
+#[rstest]
+#[tokio::test]
+async fn test_remove(#[values(true, false)] is_transactional: bool) {
+    let map = open_map(temp_dir(), None, is_transactional);
+    map.insert(&123456789, &"123456789".to_string())
+        .expect("Failed to insert");
+    let before_removal = map.get(&123456789).expect("Failed to get");
+    assert!(before_removal.is_some());
+    map.remove(&123456789).expect("Failed to remove");
+    let after_removal = map.get(&123456789).expect("Failed to get");
+    assert!(after_removal.is_none());
+}
+
+#[rstest]
+#[tokio::test]
+async fn test_iter(#[values(true, false)] is_transactional: bool) {
+    let map = open_map(temp_dir(), None, is_transactional);
+    map.insert(&123456789, &"123456789".to_string())
+        .expect("Failed to insert");
+    // The single stored entry is yielded once, after which the iterator is exhausted
+    let mut entries = map.safe_iter();
+    assert_eq!(entries.next(), Some(Ok((123456789, "123456789".to_string()))));
+    assert_eq!(entries.next(), None);
+}
+
+#[rstest]
+#[tokio::test]
+async fn test_iter_reverse(#[values(true, false)] is_transactional: bool) {
+    let map = open_map(temp_dir(), None, is_transactional);
+
+    map.insert(&1, &"1".to_string()).expect("Failed to insert");
+    map.insert(&2, &"2".to_string()).expect("Failed to insert");
+    map.insert(&3, &"3".to_string()).expect("Failed to insert");
+    // Reversing from the last entry walks the whole map backwards
+    let mut from_last = map.safe_iter().skip_to_last().reverse();
+    assert_eq!(from_last.next(), Some(Ok((3, "3".to_string()))));
+    assert_eq!(from_last.next(), Some(Ok((2, "2".to_string()))));
+    assert_eq!(from_last.next(), Some(Ok((1, "1".to_string()))));
+    assert_eq!(from_last.next(), None);
+    // Reversing from a key in the middle only yields that key and its predecessors
+    let mut from_two = map.safe_iter().skip_to(&2).unwrap().reverse();
+    assert_eq!(from_two.next(), Some(Ok((2, "2".to_string()))));
+    assert_eq!(from_two.next(), Some(Ok((1, "1".to_string()))));
+    assert_eq!(from_two.next(), None);
+}
+
+#[rstest]
+#[tokio::test]
+async fn test_keys(#[values(true, false)] is_transactional: bool) {
+    let map = open_map(temp_dir(), None, is_transactional);
+
+    map.insert(&123456789, &"123456789".to_string())
+        .expect("Failed to insert");
+
+    let mut stored_keys = map.keys(); // exactly one key is expected
+    assert_eq!(stored_keys.next(), Some(Ok(123456789)));
+    assert_eq!(stored_keys.next(), None);
+}
+
+#[rstest]
+#[tokio::test]
+async fn test_values(#[values(true, false)] is_transactional: bool) {
+    let map = open_map(temp_dir(), None, is_transactional);
+
+    map.insert(&123456789, &"123456789".to_string())
+        .expect("Failed to insert");
+
+    let mut stored_values = map.values(); // exactly one value is expected
+    assert_eq!(stored_values.next(), Some(Ok("123456789".to_string())));
+    assert_eq!(stored_values.next(), None);
+}
+
+#[rstest]
+#[tokio::test]
+async fn test_try_extend(#[values(true, false)] is_transactional: bool) {
+    let mut db = open_map(temp_dir(), None, is_transactional);
+    let keys_vals = (1..100).map(|i| (i, i.to_string()));
+    // Pass a clone: `try_extend` consumes its argument, and `keys_vals` is re-read below.
+    db.try_extend(&mut keys_vals.clone())
+        .expect("Failed to extend the DB with (k, v) pairs");
+    for (k, v) in keys_vals {
+        let val = db.get(&k).expect("Failed to get inserted key");
+        assert_eq!(Some(v), val);
+    }
+}
+
+#[rstest]
+#[tokio::test]
+async fn test_try_extend_from_slice(#[values(true, false)] is_transactional: bool) {
+    let mut map = open_map(temp_dir(), None, is_transactional);
+    let entries = (1..100).map(|i| (i, i.to_string()));
+    let materialized: Vec<_> = entries.clone().collect();
+    map.try_extend_from_slice(&materialized[..])
+        .expect("Failed to extend the DB with (k, v) pairs");
+    for (key, expected) in entries {
+        let stored = map.get(&key).expect("Failed to get inserted key");
+        assert_eq!(Some(expected), stored);
+    }
+}
+
+#[rstest]
+#[tokio::test]
+async fn test_insert_batch(#[values(true, false)] is_transactional: bool) {
+    let map = open_map(temp_dir(), None, is_transactional);
+    let entries = (1..100).map(|i| (i, i.to_string()));
+    let mut pending = map.batch();
+    pending
+        .insert_batch(&map, entries.clone())
+        .expect("Failed to batch insert");
+    pending.write().expect("Failed to execute batch");
+    for (key, expected) in entries {
+        let stored = map.get(&key).expect("Failed to get inserted key");
+        assert_eq!(Some(expected), stored);
+    }
+}
+
+#[rstest]
+#[tokio::test]
+async fn test_insert_batch_across_cf(#[values(true, false)] is_transactional: bool) {
+    let store = open_rocksdb(temp_dir(), &["First_CF", "Second_CF"], is_transactional);
+
+    // Two maps over different column families of the same underlying database
+    let first_map = DBMap::reopen(&store, Some("First_CF"), &ReadWriteOptions::default())
+        .expect("Failed to open storage");
+    let second_map = DBMap::reopen(&store, Some("Second_CF"), &ReadWriteOptions::default())
+        .expect("Failed to open storage");
+
+    let first_entries = (1..100).map(|i| (i, i.to_string()));
+    let second_entries = (1000..1100).map(|i| (i, i.to_string()));
+    let mut pending = first_map.batch();
+    pending
+        .insert_batch(&first_map, first_entries.clone())
+        .expect("Failed to batch insert")
+        .insert_batch(&second_map, second_entries.clone())
+        .expect("Failed to batch insert");
+
+    pending.write().expect("Failed to execute batch");
+    for (key, expected) in first_entries {
+        let stored = first_map.get(&key).expect("Failed to get inserted key");
+        assert_eq!(Some(expected), stored);
+    }
+
+    for (key, expected) in second_entries {
+        let stored = second_map.get(&key).expect("Failed to get inserted key");
+        assert_eq!(Some(expected), stored);
+    }
+}
+
+#[rstest]
+#[tokio::test]
+async fn test_insert_batch_across_different_db(#[values(true, false)] is_transactional: bool) {
+    let first_store = open_rocksdb(temp_dir(), &["First_CF", "Second_CF"], is_transactional);
+    let second_store = open_rocksdb(temp_dir(), &["First_CF", "Second_CF"], is_transactional);
+
+    let first_map: DBMap<i32, String> =
+        DBMap::reopen(&first_store, Some("First_CF"), &ReadWriteOptions::default())
+            .expect("Failed to open storage");
+    let first_entries = (1..100).map(|i| (i, i.to_string()));
+
+    let second_map: DBMap<i32, String> =
+        DBMap::reopen(&second_store, Some("Second_CF"), &ReadWriteOptions::default())
+            .expect("Failed to open storage");
+    let second_entries = (1000..1100).map(|i| (i, i.to_string()));
+
+    let mut pending = first_map.batch();
+    let cross_db_insert = pending
+        .insert_batch(&first_map, first_entries)
+        .expect("Failed to batch insert")
+        .insert_batch(&second_map, second_entries);
+    assert!(cross_db_insert.is_err()); // the second map lives in another database
+}
+
+// #[tokio::test]
+// async fn test_delete_batch() {
+//     let db = DBMap::<i32, String>::open(
+//         temp_dir(),
+//         MetricConf::default(),
+//         None,
+//         None,
+//         &ReadWriteOptions::default(),
+//     )
+//     .expect("Failed to open storage");
+//
+//     let keys_vals = (1..100).map(|i| (i, i.to_string()));
+//     let mut batch = db.batch();
+//     batch
+//         .insert_batch(&db, keys_vals)
+//         .expect("Failed to batch insert");
+//
+//     // delete the odd-index keys
+//     let deletion_keys = (1..100).step_by(2);
+//     batch
+//         .delete_batch(&db, deletion_keys)
+//         .expect("Failed to batch delete");
+//
+//     batch.write().expect("Failed to execute batch");
+//
+//     for k in db.keys() {
+//         assert_eq!(k.unwrap() % 2, 0);
+//     }
+// }
+
+// #[tokio::test]
+// async fn test_delete_range() {
+//     let db: DBMap<i32, String> = DBMap::open(
+//         temp_dir(),
+//         MetricConf::default(),
+//         None,
+//         None,
+//         &ReadWriteOptions::default(),
+//     )
+//     .expect("Failed to open storage");
+//
+//     // Note that the last element is (100, "100".to_owned()) here
+//     let keys_vals = (0..101).map(|i| (i, i.to_string()));
+//     let mut batch = db.batch();
+//     batch
+//         .insert_batch(&db, keys_vals)
+//         .expect("Failed to batch insert");
+//
+//     batch
+//         .delete_range(&db, &50, &100)
+//         .expect("Failed to delete range");
+//
+//     batch.write().expect("Failed to execute batch");
+//
+//     for k in 0..50 {
+//         assert!(db.contains_key(&k).expect("Failed to query legal key"),);
+//     }
+//     for k in 50..100 {
+//         assert!(!db.contains_key(&k).expect("Failed to query legal key"));
+//     }
+//
+//     // range operator is not inclusive of to
+//     assert!(db.contains_key(&100).expect("Failed to query legal key"));
+// }
+
+#[tokio::test]
+async fn test_clear() {
+    let db = DBMap::<i32, String>::open(
+        temp_dir(),
+        MetricConf::default(),
+        None,
+        Some("table"),
+        &ReadWriteOptions::default(),
+    )
+    .expect("Failed to open storage");
+    // Clear an empty map (NOTE(review): the result of `clear` is discarded here and below)
+    let _ = db.clear();
+
+    let keys_vals = (0..101).map(|i| (i, i.to_string()));
+    let mut insert_batch = db.batch();
+    insert_batch
+        .insert_batch(&db, keys_vals)
+        .expect("Failed to batch insert");
+
+    insert_batch.write().expect("Failed to execute batch");
+
+    // Check the batch stored multiple entries, then clear them all
+    assert!(db.safe_iter().count() > 1);
+    let _ = db.clear();
+    assert_eq!(db.safe_iter().count(), 0);
+    // Clear again to ensure safety when clearing an already emptied map
+    let _ = db.clear();
+    assert_eq!(db.safe_iter().count(), 0);
+    // Clear with exactly one item stored
+    let _ = db.insert(&1, &"e".to_string());
+    assert_eq!(db.safe_iter().count(), 1);
+    let _ = db.clear();
+    assert_eq!(db.safe_iter().count(), 0);
+}
+
+#[rstest]
+#[tokio::test]
+async fn test_iter_with_bounds(#[values(true, false)] is_transactional: bool) {
+    let db = open_map(temp_dir(), None, is_transactional);
+
+    // Add [1, 50) and (50, 100) in the db, i.e. every key of 1..100 except 50
+    for i in 1..100 {
+        if i != 50 {
+            db.insert(&i, &i.to_string()).unwrap();
+        }
+    }
+
+    // Skip prior to will return an iterator starting with an "unexpected" key if the sought one is not in the table
+    let db_iter = db
+        .iter_with_bounds(Some(1), Some(100))
+        .skip_prior_to(&50)
+        .unwrap();
+
+    assert_eq!(
+        (49..50)
+            .chain(51..100)
+            .map(|i| (i, i.to_string()))
+            .collect::<Vec<_>>(),
+        db_iter.collect::<Vec<_>>()
+    );
+
+    // Same logic in the keys iterator (which is not bounded here)
+    let db_iter = db.keys().skip_prior_to(&50).unwrap();
+
+    assert_eq!(
+        (49..50).chain(51..100).map(Ok).collect::<Vec<_>>(),
+        db_iter.collect::<Vec<_>>()
+    );
+
+    // Skip to a key which is not within the bounds (bound is [1, 50)): nothing is yielded
+    let db_iter = db.iter_with_bounds(Some(1), Some(50)).skip_to(&50).unwrap();
+    assert_eq!(Vec::<(i32, String)>::new(), db_iter.collect::<Vec<_>>());
+
+    // Skip to first key in the bound (bound is [1, 50)): the whole bounded range is yielded
+    let db_iter = db.iter_with_bounds(Some(1), Some(50)).skip_to(&1).unwrap();
+    assert_eq!(
+        (1..50).map(|i| (i, i.to_string())).collect::<Vec<_>>(),
+        db_iter.collect::<Vec<_>>()
+    );
+
+    // Skip prior to the excluded upper bound (bound is [1, 50)): only 49 is yielded
+    let db_iter = db
+        .iter_with_bounds(Some(1), Some(50))
+        .skip_prior_to(&50)
+        .unwrap();
+    assert_eq!(vec![(49, "49".to_string())], db_iter.collect::<Vec<_>>());
+}
+
+#[tokio::test]
+async fn test_is_empty() {
+    let db = DBMap::<i32, String>::open(
+        temp_dir(),
+        MetricConf::default(),
+        None,
+        Some("table"),
+        &ReadWriteOptions::default(),
+    )
+    .expect("Failed to open storage");
+
+    // A freshly opened map is empty, and stays empty after being cleared
+    assert!(db.is_empty());
+    let _ = db.clear();
+    assert!(db.is_empty());
+
+    let keys_vals = (0..101).map(|i| (i, i.to_string()));
+    let mut insert_batch = db.batch();
+    insert_batch
+        .insert_batch(&db, keys_vals)
+        .expect("Failed to batch insert");
+
+    insert_batch.write().expect("Failed to execute batch");
+
+    // Check we have multiple entries and the map no longer reports empty
+    assert!(db.safe_iter().count() > 1);
+    assert!(!db.is_empty());
+
+    // Clear again to ensure `is_empty` holds after clearing a populated map
+    let _ = db.clear();
+    assert_eq!(db.safe_iter().count(), 0);
+    assert!(db.is_empty());
+}
+
+#[rstest]
+#[tokio::test]
+async fn test_multi_insert(#[values(true, false)] is_transactional: bool) {
+    // Open a map over the "table" column family
+    let map: DBMap<i32, String> = open_map(temp_dir(), Some("table"), is_transactional);
+    // Keys 0..=100, each paired with its decimal string
+    let entries = (0..101).map(|i| (i, i.to_string()));
+
+    let inserted = map.multi_insert(entries.clone());
+    inserted.expect("Failed to multi-insert");
+
+    for (key, expected) in entries {
+        let stored = map.get(&key).expect("Failed to get inserted key");
+        assert_eq!(Some(expected), stored);
+    }
+}
+
+#[rstest]
+#[tokio::test]
+async fn test_checkpoint(#[values(true, false)] is_transactional: bool) {
+    let path_prefix = temp_dir();
+    let db_path = path_prefix.join("db");
+    let db: DBMap<i32, String> = open_map(db_path, Some("table"), is_transactional);
+    // Create kv pairs that are written before the checkpoint is taken
+    let keys_vals = (0..101).map(|i| (i, i.to_string()));
+
+    db.multi_insert(keys_vals.clone())
+        .expect("Failed to multi-insert");
+    let checkpointed_path = path_prefix.join("checkpointed_db");
+    db.rocksdb
+        .checkpoint(&checkpointed_path)
+        .expect("Failed to create db checkpoint");
+    // Create more kv pairs, written only after the checkpoint was taken
+    let new_keys_vals = (101..201).map(|i| (i, i.to_string()));
+    db.multi_insert(new_keys_vals.clone())
+        .expect("Failed to multi-insert");
+    // Verify checkpoint by opening it as a map of its own
+    let checkpointed_db: DBMap<i32, String> =
+        open_map(checkpointed_path, Some("table"), is_transactional);
+    // Ensure keys inserted before checkpoint are present in original and checkpointed db
+    for (k, v) in keys_vals {
+        let val = db.get(&k).expect("Failed to get inserted key");
+        assert_eq!(Some(v.clone()), val);
+        let val = checkpointed_db.get(&k).expect("Failed to get inserted key");
+        assert_eq!(Some(v), val);
+    }
+    // Ensure keys inserted after checkpoint are only present in original db but not in checkpointed db
+    for (k, v) in new_keys_vals {
+        let val = db.get(&k).expect("Failed to get inserted key");
+        assert_eq!(Some(v.clone()), val);
+        let val = checkpointed_db.get(&k).expect("Failed to get inserted key");
+        assert_eq!(None, val);
+    }
+}
+
+#[rstest]
+#[tokio::test]
+async fn test_multi_remove(#[values(true, false)] is_transactional: bool) {
+    // Open a map over the "table" column family
+    let map: DBMap<i32, String> = open_map(temp_dir(), Some("table"), is_transactional);
+
+    // Keys 0..=100, each paired with its decimal string
+    let entries = (0..101).map(|i| (i, i.to_string()));
+
+    map.multi_insert(entries.clone())
+        .expect("Failed to multi-insert");
+
+    // Every entry must be readable right after the insert
+    for (key, expected) in entries.clone() {
+        let stored = map.get(&key).expect("Failed to get inserted key");
+        assert_eq!(Some(expected), stored);
+    }
+
+    // Drop the first 50 keys in one call
+    let doomed_keys = entries.clone().map(|(key, _)| key).take(50);
+    map.multi_remove(doomed_keys).expect("Failed to multi-remove");
+    assert_eq!(map.safe_iter().count(), 101 - 50);
+    // The entries past the removed prefix must be untouched
+    for (key, expected) in entries.skip(50) {
+        let stored = map.get(&key).expect("Failed to get inserted key");
+        assert_eq!(Some(expected), stored);
+    }
+}
+
+#[tokio::test]
+async fn test_transactional() {
+    let key = "key";
+    let path = temp_dir();
+    let opt = rocksdb::Options::default();
+    let rocksdb =
+        open_cf_opts_transactional(path, None, MetricConf::default(), &[("cf", opt)]).unwrap();
+    let db = DBMap::<String, String>::reopen(&rocksdb, None, &ReadWriteOptions::default())
+        .expect("Failed to re-open storage");
+
+    // two transactions are open at the same time and both write the same key
+    let mut tx1 = db.transaction().expect("failed to initiate transaction");
+    let mut tx2 = db.transaction().expect("failed to initiate transaction");
+
+    tx1.insert_batch(&db, vec![(key.to_string(), "1".to_string())])
+        .unwrap();
+    tx2.insert_batch(&db, vec![(key.to_string(), "2".to_string())])
+        .unwrap();
+
+    tx1.commit().expect("failed to commit first transaction");
+    assert!(tx2.commit().is_err()); // the second commit must lose against the first one
+    assert_eq!(db.get(&key.to_string()).unwrap(), Some("1".to_string()));
+}
+
+#[tokio::test]
+async fn test_transaction_snapshot() {
+    let key = "key".to_string();
+    let path = temp_dir();
+    let opt = rocksdb::Options::default();
+    let rocksdb =
+        open_cf_opts_transactional(path, None, MetricConf::default(), &[("cf", opt)]).unwrap();
+    let db = DBMap::<String, String>::reopen(&rocksdb, None, &ReadWriteOptions::default())
+        .expect("Failed to re-open storage");
+
+    // transaction without set_snapshot succeeds when extraneous write occurs before transaction
+    // write.
+    let mut tx1 = db
+        .transaction_without_snapshot()
+        .expect("failed to initiate transaction");
+    // write occurs after transaction is created but before first write
+    db.insert(&key, &"1".to_string()).unwrap();
+    tx1.insert_batch(&db, vec![(key.to_string(), "2".to_string())])
+        .unwrap();
+    tx1.commit().expect("failed to commit first transaction");
+    assert_eq!(db.get(&key).unwrap().unwrap(), "2".to_string());
+
+    // transaction without set_snapshot fails when extraneous write occurs after transaction
+    // write.
+    let mut tx1 = db
+        .transaction_without_snapshot()
+        .expect("failed to initiate transaction");
+    tx1.insert_batch(&db, vec![(key.to_string(), "2".to_string())])
+        .unwrap();
+    db.insert(&key, &"1".to_string()).unwrap();
+    assert!(matches!(
+        tx1.commit(),
+        Err(RawStoreError::RetryableTransactionError)
+    ));
+    assert_eq!(db.get(&key).unwrap().unwrap(), "1".to_string());
+
+    // failed transaction with set_snapshot
+    let mut tx1 = db.transaction().expect("failed to initiate transaction");
+    // write occurs after transaction is created, so the conflict is detected
+    db.insert(&key, &"1".to_string()).unwrap();
+    tx1.insert_batch(&db, vec![(key.to_string(), "2".to_string())])
+        .unwrap();
+    assert!(matches!(
+        tx1.commit(),
+        Err(RawStoreError::RetryableTransactionError)
+    ));
+
+    let mut tx1 = db.transaction().expect("failed to initiate transaction");
+    tx1.insert_batch(&db, vec![(key.to_string(), "2".to_string())])
+        .unwrap();
+    // no conflicting writes, should succeed this time.
+    tx1.commit().unwrap();
+
+    // when two transactions race, one will fail provided that neither commits before the other
+    // writes.
+    let mut tx1 = db
+        .transaction_without_snapshot()
+        .expect("failed to initiate transaction");
+    let mut tx2 = db
+        .transaction_without_snapshot()
+        .expect("failed to initiate transaction");
+    tx1.insert_batch(&db, vec![(key.to_string(), "1".to_string())])
+        .unwrap();
+    tx2.insert_batch(&db, vec![(key.to_string(), "2".to_string())])
+        .unwrap();
+    // whichever tx is committed first will succeed.
+    tx1.commit().expect("failed to commit");
+    assert!(matches!(
+        tx2.commit(),
+        Err(RawStoreError::RetryableTransactionError)
+    ));
+
+    // IMPORTANT: a race is still possible if one tx commits before the other writes.
+    let mut tx1 = db
+        .transaction_without_snapshot()
+        .expect("failed to initiate transaction");
+    let mut tx2 = db
+        .transaction_without_snapshot()
+        .expect("failed to initiate transaction");
+    tx1.insert_batch(&db, vec![(key.to_string(), "1".to_string())])
+        .unwrap();
+    tx1.commit().expect("failed to commit");
+
+    tx2.insert_batch(&db, vec![(key, "2".to_string())]).unwrap();
+    tx2.commit().expect("failed to commit");
+}
+
+#[tokio::test]
+async fn test_retry_transaction() {
+    let key = "key".to_string();
+    let path = temp_dir();
+    let opt = rocksdb::Options::default();
+    let rocksdb =
+        open_cf_opts_transactional(path, None, MetricConf::default(), &[("cf", opt)]).unwrap();
+    let db = DBMap::<String, String>::reopen(&rocksdb, None, &ReadWriteOptions::default())
+        .expect("Failed to re-open storage");
+
+    let mut conflicts = 0;
+    retry_transaction!({
+        let mut tx1 = db
+            .transaction_without_snapshot()
+            .expect("failed to initiate transaction");
+        tx1.insert_batch(&db, vec![(key.to_string(), "2".to_string())])
+            .unwrap();
+        if conflicts < 3 {
+            db.insert(&key, &"1".to_string()).unwrap();
+        }
+        conflicts += 1;
+        tx1.commit()
+    })
+    // succeeds after we stop causing conflicts
+    .unwrap();
+
+    retry_transaction!({
+        let mut tx1 = db
+            .transaction_without_snapshot()
+            .expect("failed to initiate transaction");
+        tx1.insert_batch(&db, vec![(key.to_string(), "2".to_string())])
+            .unwrap();
+        db.insert(&key, &"1".to_string()).unwrap();
+        tx1.commit()
+    })
+    // fails after hitting maximum number of retries
+    .unwrap_err();
+
+    // obviously we cannot verify that this never times out, this is more just a test to make sure
+    // the macro compiles as expected.
+    tokio::time::timeout(Duration::from_secs(1), async move {
+        retry_transaction_forever!({
+            let mut tx1 = db
+                .transaction_without_snapshot()
+                .expect("failed to initiate transaction");
+            tx1.insert_batch(&db, vec![(key.to_string(), "2".to_string())])
+                .unwrap();
+            db.insert(&key, &"1".to_string()).unwrap();
+            tx1.commit()
+        })
+        // not expected to be reached: every attempt re-creates the conflict, so it keeps retrying
+        .unwrap_err();
+        panic!("should never finish");
+    })
+    .await
+    // must timeout
+    .unwrap_err();
+}
+
+#[tokio::test]
+async fn test_transaction_read_your_write() {
+    let key1 = "key1";
+    let key2 = "key2";
+    let path = temp_dir();
+    let opt = rocksdb::Options::default();
+    let rocksdb =
+        open_cf_opts_transactional(path, None, MetricConf::default(), &[("cf", opt)]).unwrap();
+    let db = DBMap::<String, String>::reopen(&rocksdb, None, &ReadWriteOptions::default())
+        .expect("Failed to re-open storage");
+    db.insert(&key1.to_string(), &"1".to_string()).unwrap(); // written outside any transaction
+    let mut tx = db.transaction().expect("failed to initiate transaction");
+    tx.insert_batch(
+        &db,
+        vec![
+            (key1.to_string(), "11".to_string()),
+            (key2.to_string(), "2".to_string()),
+        ],
+    )
+    .unwrap(); // staged in the transaction only; nothing is committed yet
+    assert_eq!(db.get(&key1.to_string()).unwrap(), Some("1".to_string()));
+    assert_eq!(db.get(&key2.to_string()).unwrap(), None);
+
+    assert_eq!(
+        tx.get(&db, &key1.to_string()).unwrap(),
+        Some("11".to_string())
+    );
+    assert_eq!(
+        tx.get(&db, &key2.to_string()).unwrap(),
+        Some("2".to_string())
+    );
+
+    tx.delete_batch(&db, vec![(key2.to_string())]).unwrap(); // stage a delete of key2 in the tx
+
+    assert_eq!(
+        tx.multi_get(&db, vec![key1.to_string(), key2.to_string()])
+            .unwrap(),
+        vec![Some("11".to_string()), None]
+    );
+    let keys: Vec<String> = tx.keys(&db).map(|x| x.unwrap()).collect();
+    assert_eq!(keys, vec![key1.to_string()]);
+    let values: Vec<_> = tx.values(&db).collect();
+    assert_eq!(values, vec![Ok("11".to_string())]);
+    assert!(tx.commit().is_ok());
+}
+
+/// Opens the same path a second time as a secondary instance and checks that
+/// it only observes a later primary write after an explicit catch-up.
+#[tokio::test]
+async fn open_as_secondary_test() {
+    let primary_path = temp_dir();
+
+    // Primary instance, seeded with keys 0..=100 mapped to their decimal strings.
+    let primary_db = DBMap::<i32, String>::open(
+        primary_path.clone(),
+        MetricConf::default(),
+        None,
+        Some("table"),
+        &ReadWriteOptions::default(),
+    )
+    .expect("Failed to open storage");
+    let seed = (0..101).map(|i| (i, i.to_string()));
+    primary_db
+        .multi_insert(seed.clone())
+        .expect("Failed to multi-insert");
+
+    // Secondary instance over the same path and column family.
+    let cf_options = rocksdb::Options::default();
+    let secondary_store = open_cf_opts_secondary(
+        primary_path,
+        None,
+        None,
+        MetricConf::default(),
+        &[("table", cf_options)],
+    )
+    .unwrap();
+    let secondary_db = DBMap::<i32, String>::reopen(
+        &secondary_store,
+        Some("table"),
+        &ReadWriteOptions::default(),
+    )
+    .unwrap();
+
+    // After catching up, the secondary must expose every seeded pair.
+    secondary_db.try_catch_up_with_primary().unwrap();
+    for (key, expected) in seed {
+        assert_eq!(secondary_db.get(&key).unwrap(), Some(expected));
+    }
+
+    // Overwrite key 0 on the primary ("0" -> "10").
+    primary_db.insert(&0, &"10".to_string()).unwrap();
+
+    // Without another catch-up the secondary still returns the old value.
+    assert_eq!(secondary_db.get(&0).unwrap(), Some("0".to_string()));
+
+    // Once it catches up again the new value becomes visible.
+    secondary_db.try_catch_up_with_primary().unwrap();
+    assert_eq!(secondary_db.get(&0).unwrap(), Some("10".to_string()));
+}
+
+/// Test value type made of a payload and a reference count.
+///
+/// `ref_count` is an `i64` and is declared last, which is the layout that
+/// `reference_count_merge_operator` (rocks/util.rs) documents as required.
+#[derive(Serialize, Deserialize, Copy, Clone)]
+struct ObjectWithRefCount {
+    /// Payload stored next to the counter.
+    value: i64,
+    /// Reference count; intentionally the final field of the struct.
+    ref_count: i64,
+}
+
+/// Queues a partial merge of `value` (encoded as 8 little-endian bytes) for
+/// `key` in a fresh batch on `db` and writes that batch immediately.
+///
+/// Panics if queuing the merge or writing the batch fails.
+fn increment_counter(db: &DBMap<String, ObjectWithRefCount>, key: &str, value: i64) {
+    let delta = (key.to_string(), value.to_le_bytes());
+    let mut batch = db.batch();
+    batch.partial_merge_batch(db, [delta]).unwrap();
+    batch.write().unwrap();
+}
+
+// #[tokio::test]
+// async fn refcount_test() {
+//     let key = "key".to_string();
+//     let mut options = rocksdb::Options::default();
+//     options.set_merge_operator(
+//         "refcount operator",
+//         reference_count_merge_operator,
+//         reference_count_merge_operator,
+//     );
+//     let db = DBMap::<String, ObjectWithRefCount>::open(
+//         temp_dir(),
+//         MetricConf::default(),
+//         Some(options),
+//         None,
+//         &ReadWriteOptions::default(),
+//     )
+//     .expect("failed to open rocksdb");
+//     let object = ObjectWithRefCount {
+//         value: 3,
+//         ref_count: 1,
+//     };
+//     // increment value 10 times
+//     let iterations = 10;
+//     for _ in 0..iterations {
+//         let mut batch = db.batch();
+//         batch.merge_batch(&db, [(key.to_string(), object)]).unwrap();
+//         batch.write().unwrap();
+//     }
+//     let value = db
+//         .get(&key)
+//         .expect("failed to read value")
+//         .expect("value is empty");
+//     assert_eq!(value.value, object.value);
+//     assert_eq!(value.ref_count, iterations);
+//
+//     // decrement value
+//     increment_counter(&db, &key, -1);
+//     let value = db.get(&key).unwrap().unwrap();
+//     assert_eq!(value.value, object.value);
+//     assert_eq!(value.ref_count, iterations - 1);
+// }
+
+// #[tokio::test]
+// async fn refcount_with_compaction_test() {
+//     let key = "key".to_string();
+//     let mut options = rocksdb::Options::default();
+//     options.set_merge_operator(
+//         "refcount operator",
+//         reference_count_merge_operator,
+//         reference_count_merge_operator,
+//     );
+//     let db = DBMap::<String, ObjectWithRefCount>::open(
+//         temp_dir(),
+//         MetricConf::default(),
+//         Some(options),
+//         None,
+//         &ReadWriteOptions::default(),
+//     )
+//     .expect("failed to open rocksdb");
+//
+//     let object = ObjectWithRefCount {
+//         value: 3,
+//         ref_count: 1,
+//     };
+//     let mut batch = db.batch();
+//     batch.merge_batch(&db, [(key.to_string(), object)]).unwrap();
+//     batch.write().unwrap();
+//     // increment value once
+//     increment_counter(&db, &key, 1);
+//     let value = db.get(&key).unwrap().unwrap();
+//     assert_eq!(value.value, object.value);
+//
+//     // decrement value to 0
+//     increment_counter(&db, &key, -1);
+//     increment_counter(&db, &key, -1);
+//     // ref count went to zero. Reading value returns empty array
+//     assert!(db.get(&key).is_err());
+//     let value = db.multi_get_raw_bytes([(&key)]).unwrap()[0]
+//         .clone()
+//         .unwrap();
+//     assert!(value.is_empty());
+//
+//     // refcount increment makes value visible again
+//     increment_counter(&db, &key, 1);
+//     let value = db.get(&key).unwrap().unwrap();
+//     assert_eq!(value.value, object.value);
+//
+//     increment_counter(&db, &key, -1);
+//     db.compact_range(
+//         &object,
+//         &ObjectWithRefCount {
+//             value: 100,
+//             ref_count: 1,
+//         },
+//     )
+//     .unwrap();
+//
+//     increment_counter(&db, &key, 1);
+//     let value = db.get_raw_bytes(&key).unwrap().unwrap();
+//     assert!(is_ref_count_value(&value));
+// }
+
+/// Test helper that opens a `DBMap` at `path`.
+///
+/// * `opt_cf` - column family to use; the transactional branch falls back to
+///   RocksDB's default column family name when it is `None`.
+/// * `is_transactional` - when `true` the store is opened through
+///   `open_cf_opts_transactional` with the default DB options, otherwise
+///   through `DBMap::open`.
+///
+/// Panics with "failed to open rocksdb" if the store cannot be opened.
+fn open_map<P: AsRef<Path>, K, V>(
+    path: P,
+    opt_cf: Option<&str>,
+    is_transactional: bool,
+) -> DBMap<K, V> {
+    if !is_transactional {
+        return DBMap::<K, V>::open(
+            path,
+            MetricConf::default(),
+            None,
+            opt_cf,
+            &ReadWriteOptions::default(),
+        )
+        .expect("failed to open rocksdb");
+    }
+
+    let cf = opt_cf.unwrap_or(rocksdb::DEFAULT_COLUMN_FAMILY_NAME);
+    let store = open_cf_opts_transactional(
+        path,
+        None,
+        MetricConf::default(),
+        &[(cf, default_db_options().options)],
+    )
+    .expect("failed to open rocksdb");
+    DBMap::new(store, &ReadWriteOptions::default(), cf)
+}
+
+/// Test helper that opens a RocksDB instance at `path` with the column
+/// families named in `opt_cfs`.
+///
+/// With `is_transactional` set, every column family gets a copy of the default
+/// DB options and the store is opened via `open_cf_opts_transactional`;
+/// otherwise `open_cf` is used. Panics with "failed to open rocksdb" on failure.
+fn open_rocksdb<P: AsRef<Path>>(path: P, opt_cfs: &[&str], is_transactional: bool) -> Arc<RocksDB> {
+    if !is_transactional {
+        return open_cf(path, None, MetricConf::default(), opt_cfs)
+            .expect("failed to open rocksdb");
+    }
+
+    let options = default_db_options().options;
+    let mut cfs = Vec::with_capacity(opt_cfs.len());
+    for name in opt_cfs {
+        cfs.push((*name, options.clone()));
+    }
+    open_cf_opts_transactional(path, None, MetricConf::default(), &cfs)
+        .expect("failed to open rocksdb")
+}
diff --git a/moveos/raw-store/src/rocks/util.rs b/moveos/raw-store/src/rocks/util.rs
new file mode 100644
index 000000000..f9663343e
--- /dev/null
+++ b/moveos/raw-store/src/rocks/util.rs
@@ -0,0 +1,81 @@
+// Copyright (c) RoochNetwork
+// SPDX-License-Identifier: Apache-2.0
+
+// Copyright (c) Mysten Labs, Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+use rocksdb::{CompactionDecision, MergeOperands};
+use std::cmp::Ordering;
+
+/// Custom RocksDB merge operator for values that end in a reference count.
+///
+/// Important: the reference count must be a 64-bit integer and must be the last
+/// field in the struct declaration. Use with immutable objects only.
+///
+/// The stored value (if any) and every operand are split by
+/// `deserialize_ref_count_value` into an optional payload and an `i64` delta.
+/// The deltas are summed and the result depends on the sign of the total:
+/// * positive: the payload (empty if none was seen) followed by the total as
+///   8 little-endian bytes;
+/// * zero: an empty value;
+/// * negative: only the total as 8 little-endian bytes.
+///
+/// # Panics
+/// Panics if two non-empty payloads disagree, or if an input is non-empty but
+/// shorter than 8 bytes.
+pub fn reference_count_merge_operator(
+    _key: &[u8],
+    stored_value: Option<&[u8]>,
+    operands: &MergeOperands,
+) -> Option<Vec<u8>> {
+    let (mut value, mut ref_count) = match stored_value {
+        Some(bytes) => deserialize_ref_count_value(bytes),
+        None => (None, 0),
+    };
+
+    for operand in operands {
+        let (new_value, delta) = deserialize_ref_count_value(operand);
+        assert!(value.is_none() || new_value.is_none() || value == new_value);
+        // Keep the first payload seen; later operands can only repeat it.
+        value = value.or(new_value);
+        ref_count += delta;
+    }
+
+    let merged = match ref_count.cmp(&0) {
+        Ordering::Less => ref_count.to_le_bytes().to_vec(),
+        Ordering::Equal => Vec::new(),
+        Ordering::Greater => {
+            let mut bytes = value.unwrap_or(b"").to_vec();
+            bytes.extend_from_slice(&ref_count.to_le_bytes());
+            bytes
+        }
+    };
+    Some(merged)
+}
+
+/// Compaction filter callback that removes entries whose value is empty and
+/// keeps every other entry. The level and key are ignored.
+pub fn empty_compaction_filter(_level: u32, _key: &[u8], value: &[u8]) -> CompactionDecision {
+    match value {
+        [] => CompactionDecision::Remove,
+        _ => CompactionDecision::Keep,
+    }
+}
+
+/// Returns `true` when `value` is empty or exactly 8 bytes long, i.e. when it
+/// has room for at most the 64-bit counter and no payload.
+pub fn is_ref_count_value(value: &[u8]) -> bool {
+    matches!(value.len(), 0 | 8)
+}
+
+/// Splits a serialized ref-counted value into `(payload, ref_count)`.
+///
+/// The last 8 bytes are read as a little-endian `i64` counter; whatever comes
+/// before them is the payload, reported as `None` when it is empty. An empty
+/// input yields `(None, 0)`.
+///
+/// # Panics
+/// Panics if `bytes` is non-empty but shorter than 8 bytes.
+fn deserialize_ref_count_value(bytes: &[u8]) -> (Option<&[u8]>, i64) {
+    if bytes.is_empty() {
+        return (None, 0);
+    }
+    assert!(bytes.len() >= 8);
+
+    let payload_len = bytes.len() - 8;
+    let payload = &bytes[..payload_len];
+    let mut counter = [0u8; 8];
+    counter.copy_from_slice(&bytes[payload_len..]);
+
+    let payload = if payload.is_empty() {
+        None
+    } else {
+        Some(payload)
+    };
+    (payload, i64::from_le_bytes(counter))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::deserialize_ref_count_value;
+
+    /// Asserts that `input` decodes to the given payload and reference count.
+    fn check(input: &[u8], payload: Option<&[u8]>, ref_count: i64) {
+        assert_eq!(deserialize_ref_count_value(input), (payload, ref_count));
+    }
+
+    /// Covers the empty input, counter-only inputs (positive and negative) and
+    /// a payload followed by a counter.
+    #[test]
+    fn deserialize_ref_count_value_test() {
+        check(&[], None, 0);
+        check(b"\x01\0\0\0\0\0\0\0", None, 1);
+        check(b"\xff\xff\xff\xff\xff\xff\xff\xff", None, -1);
+        check(b"\xfe\xff\xff\xff\xff\xff\xff\xff", None, -2);
+        check(b"test\x04\0\0\0\0\0\0\0", Some(b"test".as_ref()), 4);
+    }
+}
diff --git a/moveos/raw-store/src/rocks/values.rs b/moveos/raw-store/src/rocks/values.rs
new file mode 100644
index 000000000..78e97c0e8
--- /dev/null
+++ b/moveos/raw-store/src/rocks/values.rs
@@ -0,0 +1,47 @@
+// Copyright (c) RoochNetwork
+// SPDX-License-Identifier: Apache-2.0
+
+// Copyright (c) Mysten Labs, Inc.
+// SPDX-License-Identifier: Apache-2.0
+use std::marker::PhantomData;
+
+use crate::RawStoreError;
+use serde::de::DeserializeOwned;
+
+use super::RocksDBRawIter;
+
+/// An iterator over the values of a prefix.
+///
+/// Each raw value is decoded as BCS into `V` by the `Iterator` implementation;
+/// keys are not decoded.
+pub struct Values<'a, V> {
+    /// Underlying raw RocksDB iterator that supplies the value bytes.
+    db_iter: RocksDBRawIter<'a>,
+    /// Marks the decoded value type; no `V` is stored.
+    _phantom: PhantomData<V>,
+}
+
+impl<'a, V: DeserializeOwned> Values<'a, V> {
+    /// Wraps a raw RocksDB iterator. The iterator is stored as given; this
+    /// constructor does not reposition it.
+    pub(crate) fn new(db_iter: RocksDBRawIter<'a>) -> Self {
+        let _phantom = PhantomData::<V>;
+        Self { db_iter, _phantom }
+    }
+}
+
+impl<'a, V: DeserializeOwned> Iterator for Values<'a, V> {
+    type Item = Result<V, RawStoreError>;
+
+    /// Decodes the value under the cursor and advances the cursor by one entry.
+    ///
+    /// * While the underlying iterator is valid, returns `Some(Ok(value))`, or
+    ///   `Some(Err(..))` when the bytes cannot be decoded as BCS. The cursor is
+    ///   advanced in both cases, so a single corrupt value neither hides the
+    ///   entries after it nor ends the iteration silently.
+    /// * Once the underlying iterator is no longer valid, returns `None`, or
+    ///   `Some(Err(RawStoreError::RocksDBError(..)))` if RocksDB reports an
+    ///   error status.
+    fn next(&mut self) -> Option<Self::Item> {
+        if !self.db_iter.valid() {
+            return self
+                .db_iter
+                .status()
+                .err()
+                .map(|err| Err(RawStoreError::RocksDBError(format!("{err}"))));
+        }
+
+        // Decode into an owned `V` first so the borrow of the raw bytes ends
+        // before the cursor is moved.
+        let decoded = self
+            .db_iter
+            .key()
+            .and_then(|_| self.db_iter.value())
+            .map(|raw| bcs::from_bytes::<V>(raw).map_err(RawStoreError::from));
+
+        self.db_iter.next();
+        decoded
+    }
+}
diff --git a/moveos/raw-store/src/test_db.rs b/moveos/raw-store/src/test_db.rs
new file mode 100644
index 000000000..30b875065
--- /dev/null
+++ b/moveos/raw-store/src/test_db.rs
@@ -0,0 +1,807 @@
+// Copyright (c) RoochNetwork
+// SPDX-License-Identifier: Apache-2.0
+
+// Copyright (c) Mysten Labs, Inc.
+// SPDX-License-Identifier: Apache-2.0
+#![allow(clippy::await_holding_lock)]
+
+use std::{
+    borrow::Borrow,
+    collections::{btree_map::Iter, BTreeMap, HashMap, VecDeque},
+    marker::PhantomData,
+    sync::{Arc, RwLock},
+};
+
+use crate::{
+    rocks::{be_fix_int_ser, RawStoreError},
+    Map,
+};
+use bincode::Options;
+use collectable::TryExtend;
+use ouroboros::self_referencing;
+use rand::distributions::{Alphanumeric, DistString};
+use rocksdb::Direction;
+use serde::{de::DeserializeOwned, Serialize};
+use std::sync::{RwLockReadGuard, RwLockWriteGuard};
+
+/// An in-memory database backed by a `BTreeMap`, exposing the same `Map`
+/// interface as the RocksDB-backed store. This is mainly intended for tests
+/// and for performing benchmark comparisons.
+///
+/// Cloning a `TestDB` clones the `Arc`, so clones share the same rows.
+#[derive(Clone, Debug)]
+pub struct TestDB<K, V> {
+    /// Serialized key -> serialized value, shared behind a read/write lock.
+    pub rows: Arc<RwLock<BTreeMap<Vec<u8>, Vec<u8>>>>,
+    /// Name of this database; `TestDBWriteBatch::write` uses it to identify
+    /// and order the databases it has to lock.
+    pub name: String,
+    /// Ties `K` and `V` to the type without storing either of them.
+    _phantom: PhantomData<fn(K) -> V>,
+}
+
+impl<K, V> TestDB<K, V> {
+    /// Creates an empty in-memory database with a randomly generated
+    /// 16-character alphanumeric name.
+    pub fn open() -> Self {
+        let name = Alphanumeric.sample_string(&mut rand::thread_rng(), 16);
+        let rows = Arc::new(RwLock::new(BTreeMap::new()));
+        Self {
+            rows,
+            name,
+            _phantom: PhantomData,
+        }
+    }
+
+    /// Returns a new, empty write batch. The batch holds no reference to this
+    /// database; every operation queued on it names its own target.
+    pub fn batch(&self) -> TestDBWriteBatch {
+        TestDBWriteBatch {
+            ops: VecDeque::new(),
+        }
+    }
+}
+
+/// Self-referencing iterator over the raw rows of a `TestDB`.
+///
+/// The struct owns the read guard and an iterator borrowing from it, so the
+/// read lock on the rows is held for as long as the iterator is alive.
+#[self_referencing(pub_extras)]
+pub struct TestDBIter<'a, K, V> {
+    // Read guard over the rows; the owner that `iter` borrows from.
+    pub rows: RwLockReadGuard<'a, BTreeMap<Vec<u8>, Vec<u8>>>,
+    // Ordered iterator over the (raw key, raw value) pairs behind the guard.
+    #[borrows(mut rows)]
+    #[covariant]
+    pub iter: Iter<'this, Vec<u8>, Vec<u8>>,
+    // Marks the decoded key/value types; nothing is stored.
+    phantom: PhantomData<(K, V)>,
+    // Requested direction; `Iterator::next` only supports `Direction::Forward`.
+    pub direction: Direction,
+}
+
+/// Self-referencing iterator over the keys of a `TestDB`.
+///
+/// Owns the read guard together with an iterator borrowing from it, so the
+/// read lock on the rows is held for as long as this value is alive.
+#[self_referencing(pub_extras)]
+pub struct TestDBKeys<'a, K> {
+    // Read guard over the rows; the owner that `iter` borrows from.
+    rows: RwLockReadGuard<'a, BTreeMap<Vec<u8>, Vec<u8>>>,
+    // Ordered iterator over the raw pairs; only the key half is decoded.
+    #[borrows(mut rows)]
+    #[covariant]
+    pub iter: Iter<'this, Vec<u8>, Vec<u8>>,
+    // Marks the decoded key type; nothing is stored.
+    phantom: PhantomData<K>,
+}
+
+/// Self-referencing iterator over the values of a `TestDB`.
+///
+/// Owns the read guard together with an iterator borrowing from it, so the
+/// read lock on the rows is held for as long as this value is alive.
+#[self_referencing(pub_extras)]
+pub struct TestDBValues<'a, V> {
+    // Read guard over the rows; the owner that `iter` borrows from.
+    rows: RwLockReadGuard<'a, BTreeMap<Vec<u8>, Vec<u8>>>,
+    // Ordered iterator over the raw pairs; only the value half is decoded.
+    #[borrows(mut rows)]
+    #[covariant]
+    pub iter: Iter<'this, Vec<u8>, Vec<u8>>,
+    // Marks the decoded value type; nothing is stored.
+    phantom: PhantomData<V>,
+}
+
+impl<'a, K: DeserializeOwned, V: DeserializeOwned> Iterator for TestDBIter<'a, K, V> {
+    type Item = Result<(K, V), RawStoreError>;
+
+    /// Returns the next `(key, value)` pair in ascending raw-key order.
+    ///
+    /// Keys are decoded with big-endian, fixed-width bincode and values with
+    /// BCS. The item is always `Ok`.
+    ///
+    /// # Panics
+    /// Panics if the direction is `Direction::Reverse`, or if a key or value
+    /// cannot be decoded.
+    fn next(&mut self) -> Option<Self::Item> {
+        let key_codec = bincode::DefaultOptions::new()
+            .with_big_endian()
+            .with_fixint_encoding();
+        self.with_mut(|fields| {
+            let entry = match fields.direction {
+                Direction::Forward => fields.iter.next(),
+                Direction::Reverse => panic!("Reverse iteration not supported in test db"),
+            };
+            entry.map(|(raw_key, raw_value)| {
+                let key: K = key_codec.deserialize(raw_key).ok().unwrap();
+                let value: V = bcs::from_bytes(raw_value).ok().unwrap();
+                Ok((key, value))
+            })
+        })
+    }
+}
+
+impl<'a, K: Serialize, V> TestDBIter<'a, K, V> {
+    /// Skips all the elements that are smaller than the given key,
+    /// and either lands on the key or the first one greater than
+    /// the key.
+    ///
+    /// The comparison is done on the serialized key bytes, which is also the
+    /// order the underlying map iterates in. Only entries the iterator has not
+    /// yielded yet are considered.
+    ///
+    /// # Errors
+    /// Returns an error if `key` cannot be serialized.
+    pub fn skip_to(mut self, key: &K) -> Result<Self, RawStoreError> {
+        let target = be_fix_int_ser(key)?;
+        self.with_mut(|fields| {
+            // Look ahead on a clone so that the landing entry itself is not
+            // consumed; only the entries strictly below the target are dropped.
+            let below = fields
+                .iter
+                .clone()
+                .take_while(|(raw_key, _)| **raw_key < target)
+                .count();
+            for _ in 0..below {
+                fields.iter.next();
+            }
+        });
+        Ok(self)
+    }
+
+    /// Moves the iterator to the element given or
+    /// the one prior to it if it does not exist. If there is
+    /// no element prior to it, it returns an empty iterator.
+    ///
+    /// Only entries the iterator has not yielded yet are considered.
+    ///
+    /// # Errors
+    /// Returns an error if `key` cannot be serialized.
+    pub fn skip_prior_to(mut self, key: &K) -> Result<Self, RawStoreError> {
+        let target = be_fix_int_ser(key)?;
+        self.with_mut(|fields| {
+            // Number of remaining entries whose key is at or before the target.
+            let at_or_before = fields
+                .iter
+                .clone()
+                .take_while(|(raw_key, _)| **raw_key <= target)
+                .count();
+            let to_discard = match at_or_before {
+                // Nothing at or before the target: drain to an empty iterator.
+                0 => fields.iter.len(),
+                // Keep the last matching entry as the next one to be yielded.
+                n => n - 1,
+            };
+            for _ in 0..to_discard {
+                fields.iter.next();
+            }
+        });
+        Ok(self)
+    }
+
+    /// Seeks to the last key in the database (at this column family).
+    ///
+    /// The last entry is left in place, so the next call to `next` yields it.
+    pub fn skip_to_last(mut self) -> Self {
+        self.with_mut(|fields| {
+            let remaining = fields.iter.len();
+            // Drop everything except the final entry (no-op for 0 or 1 entries).
+            for _ in 1..remaining {
+                fields.iter.next();
+            }
+        });
+        self
+    }
+
+    /// Will make the direction of the iteration reverse and will
+    /// create a new `RevIter` to consume.
+    ///
+    /// NOTE: `Iterator::next` for `TestDBIter` panics when the direction is
+    /// `Direction::Reverse`, so the returned iterator cannot currently be
+    /// advanced.
+    pub fn reverse(mut self) -> TestDBRevIter<'a, K, V> {
+        self.with_mut(|fields| {
+            *fields.direction = Direction::Reverse;
+        });
+        TestDBRevIter::new(self)
+    }
+}
+
+/// An iterator with a reverted direction to the original. The `RevIter`
+/// is hosting an iteration which is meant to consume in the opposing direction.
+/// It's not possible to do further manipulation (ex re-reverse) to the
+/// iterator.
+///
+/// NOTE: the wrapped `TestDBIter` panics in `next` when its direction is
+/// `Direction::Reverse`, so reverse iteration is not actually supported by the
+/// test database.
+pub struct TestDBRevIter<'a, K, V> {
+    /// The wrapped iterator that every call to `next` is forwarded to.
+    iter: TestDBIter<'a, K, V>,
+}
+
+impl<'a, K, V> TestDBRevIter<'a, K, V> {
+    /// Wraps `iter` as-is; the caller is responsible for having set its
+    /// direction beforehand.
+    fn new(iter: TestDBIter<'a, K, V>) -> Self {
+        TestDBRevIter { iter }
+    }
+}
+
+impl<'a, K: DeserializeOwned, V: DeserializeOwned> Iterator for TestDBRevIter<'a, K, V> {
+    type Item = Result<(K, V), RawStoreError>;
+
+    /// Forwards to the wrapped `TestDBIter`.
+    ///
+    /// NOTE: the wrapped iterator panics when its direction is
+    /// `Direction::Reverse`, which is the direction `TestDBIter::reverse` sets
+    /// before building this wrapper.
+    fn next(&mut self) -> Option<Self::Item> {
+        Iterator::next(&mut self.iter)
+    }
+}
+
+impl<'a, K: DeserializeOwned> Iterator for TestDBKeys<'a, K> {
+    type Item = Result<K, RawStoreError>;
+
+    /// Returns the next key in ascending raw-key order, decoded with
+    /// big-endian, fixed-width bincode. The item is always `Ok`.
+    ///
+    /// # Panics
+    /// Panics if a key cannot be decoded.
+    fn next(&mut self) -> Option<Self::Item> {
+        self.with_mut(|fields| {
+            let key_codec = bincode::DefaultOptions::new()
+                .with_big_endian()
+                .with_fixint_encoding();
+            fields.iter.next().map(|(raw_key, _)| {
+                let key: K = key_codec.deserialize(raw_key).ok().unwrap();
+                Ok(key)
+            })
+        })
+    }
+}
+
+impl<'a, V: DeserializeOwned> Iterator for TestDBValues<'a, V> {
+    type Item = Result<V, RawStoreError>;
+
+    /// Returns the next value in ascending raw-key order, decoded from BCS.
+    /// The item is always `Ok`.
+    ///
+    /// # Panics
+    /// Panics if a value cannot be decoded.
+    fn next(&mut self) -> Option<Self::Item> {
+        self.with_mut(|fields| {
+            fields.iter.next().map(|(_, raw_value)| {
+                let value: V = bcs::from_bytes(raw_value).ok().unwrap();
+                Ok(value)
+            })
+        })
+    }
+}
+
+/// `Map` implementation on top of the in-memory rows.
+///
+/// Keys are encoded with `be_fix_int_ser` and values with BCS. Every method
+/// takes the read or write lock for the duration of the call, except the
+/// iterator constructors, whose results keep the read lock while they live.
+/// All methods panic if the lock is poisoned.
+impl<'a, K, V> Map<'a, K, V> for TestDB<K, V>
+where
+    K: Serialize + DeserializeOwned,
+    V: Serialize + DeserializeOwned,
+{
+    type Error = RawStoreError;
+    type Iterator = std::iter::Empty<(K, V)>;
+    type SafeIterator = TestDBIter<'a, K, V>;
+    type Keys = TestDBKeys<'a, K>;
+    type Values = TestDBValues<'a, V>;
+
+    /// Returns whether an entry exists for `key`.
+    fn contains_key(&self, key: &K) -> Result<bool, Self::Error> {
+        let raw_key = be_fix_int_ser(key)?;
+        let locked = self.rows.read().unwrap();
+        Ok(locked.contains_key(&raw_key))
+    }
+
+    /// Returns the decoded value stored under `key`, if any.
+    ///
+    /// A stored value that cannot be decoded is reported as an error rather
+    /// than a panic, in line with the `Result` return type.
+    fn get(&self, key: &K) -> Result<Option<V>, Self::Error> {
+        let raw_key = be_fix_int_ser(key)?;
+        let locked = self.rows.read().unwrap();
+        match locked.get(&raw_key) {
+            Some(raw_value) => Ok(Some(bcs::from_bytes(raw_value)?)),
+            None => Ok(None),
+        }
+    }
+
+    /// Returns a copy of the serialized value stored under `key`, if any.
+    fn get_raw_bytes(&self, key: &K) -> Result<Option<Vec<u8>>, Self::Error> {
+        let raw_key = be_fix_int_ser(key)?;
+        let locked = self.rows.read().unwrap();
+        let res = locked.get(&raw_key);
+        Ok(res.cloned())
+    }
+
+    /// Inserts or overwrites the entry for `key`.
+    fn insert(&self, key: &K, value: &V) -> Result<(), Self::Error> {
+        let raw_key = be_fix_int_ser(key)?;
+        let raw_value = bcs::to_bytes(value)?;
+        let mut locked = self.rows.write().unwrap();
+        locked.insert(raw_key, raw_value);
+        Ok(())
+    }
+
+    /// Removes the entry for `key`; a missing key is not an error.
+    fn remove(&self, key: &K) -> Result<(), Self::Error> {
+        let raw_key = be_fix_int_ser(key)?;
+        let mut locked = self.rows.write().unwrap();
+        locked.remove(&raw_key);
+        Ok(())
+    }
+
+    /// Removes every entry.
+    fn clear(&self) -> Result<(), Self::Error> {
+        let mut locked = self.rows.write().unwrap();
+        locked.clear();
+        Ok(())
+    }
+
+    /// Returns `true` when the database holds no entries.
+    fn is_empty(&self) -> bool {
+        let locked = self.rows.read().unwrap();
+        locked.is_empty()
+    }
+
+    /// Not supported by the test database; always panics. Use `safe_iter`.
+    fn iter(&'a self) -> Self::Iterator {
+        unimplemented!("unimplemented API");
+    }
+
+    /// Not supported by the test database; always panics.
+    fn iter_with_bounds(
+        &'a self,
+        _lower_bound: Option<K>,
+        _upper_bound: Option<K>,
+    ) -> Self::Iterator {
+        unimplemented!("unimplemented API");
+    }
+
+    /// Returns a forward iterator over all entries in raw-key order. The read
+    /// lock is held until the iterator is dropped.
+    fn safe_iter(&'a self) -> Self::SafeIterator {
+        TestDBIterBuilder {
+            rows: self.rows.read().unwrap(),
+            iter_builder: |rows: &mut RwLockReadGuard<'a, BTreeMap<Vec<u8>, Vec<u8>>>| rows.iter(),
+            phantom: PhantomData,
+            direction: Direction::Forward,
+        }
+        .build()
+    }
+
+    /// Returns an iterator over all keys in raw-key order. The read lock is
+    /// held until the iterator is dropped.
+    fn keys(&'a self) -> Self::Keys {
+        TestDBKeysBuilder {
+            rows: self.rows.read().unwrap(),
+            iter_builder: |rows: &mut RwLockReadGuard<'a, BTreeMap<Vec<u8>, Vec<u8>>>| rows.iter(),
+            phantom: PhantomData,
+        }
+        .build()
+    }
+
+    /// Returns an iterator over all values in raw-key order. The read lock is
+    /// held until the iterator is dropped.
+    fn values(&'a self) -> Self::Values {
+        TestDBValuesBuilder {
+            rows: self.rows.read().unwrap(),
+            iter_builder: |rows: &mut RwLockReadGuard<'a, BTreeMap<Vec<u8>, Vec<u8>>>| rows.iter(),
+            phantom: PhantomData,
+        }
+        .build()
+    }
+
+    /// No-op: the test database has no primary to catch up with.
+    fn try_catch_up_with_primary(&self) -> Result<(), Self::Error> {
+        Ok(())
+    }
+}
+
+/// Bulk insertion for `TestDB`: both methods queue all pairs in a single
+/// write batch and apply it in one go.
+impl<J, K, U, V> TryExtend<(J, U)> for TestDB<K, V>
+where
+    J: Borrow<K>,
+    U: Borrow<V>,
+    K: Serialize,
+    V: Serialize,
+{
+    type Error = RawStoreError;
+
+    /// Inserts every `(key, value)` pair produced by `iter`.
+    fn try_extend<T>(&mut self, iter: &mut T) -> Result<(), Self::Error>
+    where
+        T: Iterator<Item = (J, U)>,
+    {
+        let mut batch = self.batch();
+        batch.insert_batch(self, iter)?;
+        batch.write()
+    }
+
+    /// Inserts every `(key, value)` pair of `slice`, borrowing the elements
+    /// instead of consuming them.
+    fn try_extend_from_slice(&mut self, slice: &[(J, U)]) -> Result<(), Self::Error> {
+        let borrowed_pairs = slice
+            .iter()
+            .map(|(key, value)| (Borrow::<K>::borrow(key), Borrow::<V>::borrow(value)));
+        let mut batch = self.batch();
+        batch.insert_batch(self, borrowed_pairs)?;
+        batch.write()
+    }
+}
+
+/// Payload of a queued key deletion: the target database's rows, its name,
+/// and the serialized keys to remove.
+pub type DeleteBatchPayload = (
+    Arc<RwLock<BTreeMap<Vec<u8>, Vec<u8>>>>,
+    String,
+    Vec<Vec<u8>>,
+);
+/// Payload of a queued range deletion: the target database's rows, its name,
+/// and the serialized `(from, to)` bounds (`from` inclusive, `to` exclusive).
+pub type DeleteRangePayload = (
+    Arc<RwLock<BTreeMap<Vec<u8>, Vec<u8>>>>,
+    String,
+    (Vec<u8>, Vec<u8>),
+);
+/// Payload of a queued insertion: the target database's rows, its name, and
+/// the serialized `(key, value)` pairs to insert.
+pub type InsertBatchPayload = (
+    Arc<RwLock<BTreeMap<Vec<u8>, Vec<u8>>>>,
+    String,
+    Vec<(Vec<u8>, Vec<u8>)>,
+);
+/// A database's rows together with its name, as collected by
+/// `TestDBWriteBatch::write` before locking.
+type DBAndName = (Arc<RwLock<BTreeMap<Vec<u8>, Vec<u8>>>>, String);
+
+/// A single operation queued in a `TestDBWriteBatch`. Each variant carries the
+/// rows and name of the database it targets, plus its already serialized data.
+pub enum WriteBatchOp {
+    /// Remove the listed keys.
+    DeleteBatch(DeleteBatchPayload),
+    /// Remove every key `k` with `from <= k < to`.
+    DeleteRange(DeleteRangePayload),
+    /// Insert (or overwrite) the listed key/value pairs.
+    InsertBatch(InsertBatchPayload),
+}
+
+/// An ordered list of pending operations, possibly spanning several `TestDB`
+/// instances. Nothing is applied until `write` is called.
+#[derive(Default)]
+pub struct TestDBWriteBatch {
+    /// Queued operations, in the order they were added.
+    pub ops: VecDeque<WriteBatchOp>,
+}
+
+/// Self-referencing pair of a database's rows and a write guard on them, so
+/// the guard can be stored (e.g. in a map) together with the `Arc` it borrows
+/// from. The write lock is held for as long as this value is alive.
+#[self_referencing]
+pub struct DBLocked {
+    // Shared rows of the database being locked; the owner `db_guard` borrows from.
+    db: Arc<RwLock<BTreeMap<Vec<u8>, Vec<u8>>>>,
+    // Write guard over `db`.
+    #[borrows(db)]
+    #[covariant]
+    db_guard: RwLockWriteGuard<'this, BTreeMap<Vec<u8>, Vec<u8>>>,
+}
+
+impl TestDBWriteBatch {
+    /// Applies every queued operation, in the order it was queued.
+    ///
+    /// All databases referenced by the batch are write-locked up front, in
+    /// ascending order of their names, and stay locked until every operation
+    /// has been applied. The locks are then released in the reverse order.
+    ///
+    /// # Panics
+    /// Panics if one of the locks is poisoned.
+    pub fn write(self) -> Result<(), RawStoreError> {
+        let mut dbs: Vec<DBAndName> = self
+            .ops
+            .iter()
+            .map(|op| match op {
+                WriteBatchOp::DeleteBatch((db, name, _))
+                | WriteBatchOp::DeleteRange((db, name, _))
+                | WriteBatchOp::InsertBatch((db, name, _)) => (db.clone(), name.clone()),
+            })
+            .collect();
+        // One entry per database, ordered by name, so that every batch takes
+        // its locks in the same order.
+        dbs.sort_by(|(_, a), (_, b)| a.cmp(b));
+        dbs.dedup_by(|(_, a), (_, b)| *a == *b);
+
+        // lock all databases
+        let mut db_locks = HashMap::with_capacity(dbs.len());
+        for (db, name) in &dbs {
+            // `entry` guarantees a database is never write-locked twice.
+            db_locks.entry(name.clone()).or_insert_with(|| {
+                DBLockedBuilder {
+                    db: db.clone(),
+                    db_guard_builder: |rows: &Arc<RwLock<BTreeMap<Vec<u8>, Vec<u8>>>>| {
+                        rows.write().unwrap()
+                    },
+                }
+                .build()
+            });
+        }
+
+        for op in &self.ops {
+            match op {
+                WriteBatchOp::DeleteBatch((_, id, keys)) => {
+                    let locked = db_locks.get_mut(id).unwrap();
+                    locked.with_db_guard_mut(|db| {
+                        for key in keys {
+                            db.remove(key);
+                        }
+                    });
+                }
+                WriteBatchOp::DeleteRange((_, id, (from, to))) => {
+                    let locked = db_locks.get_mut(id).unwrap();
+                    locked.with_db_guard_mut(|db| {
+                        // Keep only what lies outside the half-open range [from, to).
+                        db.retain(|k, _| k < from || k >= to);
+                    });
+                }
+                WriteBatchOp::InsertBatch((_, id, key_values)) => {
+                    let locked = db_locks.get_mut(id).unwrap();
+                    locked.with_db_guard_mut(|db| {
+                        for (k, v) in key_values {
+                            db.insert(k.clone(), v.clone());
+                        }
+                    });
+                }
+            }
+        }
+
+        // unlock in the reverse order
+        for (_, name) in dbs.iter().rev() {
+            db_locks.remove(name);
+        }
+        Ok(())
+    }
+
+    /// Deletes a set of keys given as an iterator
+    ///
+    /// # Errors
+    /// Returns an error, and queues nothing, if a key cannot be serialized.
+    pub fn delete_batch<J: Borrow<K>, K: Serialize, V>(
+        &mut self,
+        db: &TestDB<K, V>,
+        purged_vals: impl IntoIterator<Item = J>,
+    ) -> Result<(), RawStoreError> {
+        let raw_keys = purged_vals
+            .into_iter()
+            .map(|key| {
+                let key: &K = key.borrow();
+                be_fix_int_ser(key)
+            })
+            .collect::<Result<Vec<_>, _>>()?;
+        self.ops.push_back(WriteBatchOp::DeleteBatch((
+            db.rows.clone(),
+            db.name.clone(),
+            raw_keys,
+        )));
+        Ok(())
+    }
+
+    /// Deletes a range of keys between `from` (inclusive) and `to` (non-inclusive)
+    ///
+    /// # Errors
+    /// Returns an error, and queues nothing, if a bound cannot be serialized.
+    pub fn delete_range<K: Serialize, V>(
+        &mut self,
+        db: &TestDB<K, V>,
+        from: &K,
+        to: &K,
+    ) -> Result<(), RawStoreError> {
+        let raw_from = be_fix_int_ser(from)?;
+        let raw_to = be_fix_int_ser(to)?;
+        self.ops.push_back(WriteBatchOp::DeleteRange((
+            db.rows.clone(),
+            db.name.clone(),
+            (raw_from, raw_to),
+        )));
+        Ok(())
+    }
+
+    /// inserts a range of (key, value) pairs given as an iterator
+    ///
+    /// # Errors
+    /// Returns an error, and queues nothing, if a key or value cannot be
+    /// serialized.
+    pub fn insert_batch<J: Borrow<K>, K: Serialize, U: Borrow<V>, V: Serialize>(
+        &mut self,
+        db: &TestDB<K, V>,
+        new_vals: impl IntoIterator<Item = (J, U)>,
+    ) -> Result<(), RawStoreError> {
+        let raw_pairs = new_vals
+            .into_iter()
+            .map(|(key, value)| -> Result<(Vec<u8>, Vec<u8>), RawStoreError> {
+                let key: &K = key.borrow();
+                let value: &V = value.borrow();
+                Ok((be_fix_int_ser(key)?, bcs::to_bytes(value)?))
+            })
+            .collect::<Result<Vec<_>, _>>()?;
+        self.ops.push_back(WriteBatchOp::InsertBatch((
+            db.rows.clone(),
+            db.name.clone(),
+            raw_pairs,
+        )));
+        Ok(())
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use crate::{test_db::TestDB, Map};
+
+    #[test]
+    fn test_contains_key() {
+        let db = TestDB::open();
+        db.insert(&123456789, &"123456789".to_string())
+            .expect("Failed to insert");
+        assert!(db
+            .contains_key(&123456789)
+            .expect("Failed to call contains key"));
+        assert!(!db
+            .contains_key(&000000000)
+            .expect("Failed to call contains key"));
+    }
+
+    #[test]
+    fn test_get() {
+        let db = TestDB::open();
+        db.insert(&123456789, &"123456789".to_string())
+            .expect("Failed to insert");
+        assert_eq!(
+            Some("123456789".to_string()),
+            db.get(&123456789).expect("Failed to get")
+        );
+        assert_eq!(None, db.get(&000000000).expect("Failed to get"));
+    }
+
+    #[test]
+    fn test_get_raw() {
+        let db = TestDB::open();
+        db.insert(&123456789, &"123456789".to_string())
+            .expect("Failed to insert");
+
+        let val_bytes = db
+            .get_raw_bytes(&123456789)
+            .expect("Failed to get_raw_bytes")
+            .unwrap();
+
+        assert_eq!(bcs::to_bytes(&"123456789".to_string()).unwrap(), val_bytes);
+        assert_eq!(
+            None,
+            db.get_raw_bytes(&000000000)
+                .expect("Failed to get_raw_bytes")
+        );
+    }
+
+    #[test]
+    fn test_multi_get() {
+        let db = TestDB::open();
+        db.insert(&123, &"123".to_string())
+            .expect("Failed to insert");
+        db.insert(&456, &"456".to_string())
+            .expect("Failed to insert");
+
+        let result = db.multi_get([123, 456, 789]).expect("Failed to multi get");
+
+        assert_eq!(result.len(), 3);
+        assert_eq!(result[0], Some("123".to_string()));
+        assert_eq!(result[1], Some("456".to_string()));
+        assert_eq!(result[2], None);
+    }
+
+    #[test]
+    fn test_remove() {
+        let db = TestDB::open();
+        db.insert(&123456789, &"123456789".to_string())
+            .expect("Failed to insert");
+        assert!(db.get(&123456789).expect("Failed to get").is_some());
+
+        db.remove(&123456789).expect("Failed to remove");
+        assert!(db.get(&123456789).expect("Failed to get").is_none());
+    }
+
+    #[test]
+    fn test_iter() {
+        let db = TestDB::open();
+        db.insert(&123456789, &"123456789".to_string())
+            .expect("Failed to insert");
+
+        let mut iter = db.safe_iter();
+        assert_eq!(Some(Ok((123456789, "123456789".to_string()))), iter.next());
+        assert_eq!(None, iter.next());
+    }
+
+    #[test]
+    fn test_iter_reverse() {
+        let db = TestDB::open();
+        db.insert(&1, &"1".to_string()).expect("Failed to insert");
+        db.insert(&2, &"2".to_string()).expect("Failed to insert");
+        db.insert(&3, &"3".to_string()).expect("Failed to insert");
+        let mut iter = db.safe_iter();
+
+        assert_eq!(Some(Ok((1, "1".to_string()))), iter.next());
+        assert_eq!(Some(Ok((2, "2".to_string()))), iter.next());
+        assert_eq!(Some(Ok((3, "3".to_string()))), iter.next());
+        assert_eq!(None, iter.next());
+    }
+
+    #[test]
+    fn test_keys() {
+        let db = TestDB::open();
+
+        db.insert(&123456789, &"123456789".to_string())
+            .expect("Failed to insert");
+
+        let mut keys = db.keys();
+        assert_eq!(Some(Ok(123456789)), keys.next());
+        assert_eq!(None, keys.next());
+    }
+
+    #[test]
+    fn test_values() {
+        let db = TestDB::open();
+
+        db.insert(&123456789, &"123456789".to_string())
+            .expect("Failed to insert");
+
+        let mut values = db.values();
+        assert_eq!(Some(Ok("123456789".to_string())), values.next());
+        assert_eq!(None, values.next());
+    }
+
+    /// Stages keys 1..100 in one write batch, commits it, then reads every pair back.
+    #[test]
+    fn test_insert_batch() {
+        let store = TestDB::open();
+        let pairs = (1..100).map(|n| (n, n.to_string()));
+        let mut ops = store.batch();
+        ops.insert_batch(&store, pairs.clone())
+            .expect("Failed to batch insert");
+        ops.write().expect("Failed to execute batch");
+        for (key, expected) in pairs {
+            assert_eq!(Some(expected), store.get(&key).expect("Failed to get inserted key"));
+        }
+    }
+
+    /// Stages inserts for two separately opened `TestDB`s in one batch, commits it once,
+    /// and verifies that each instance then returns its own pairs.
+    #[test]
+    fn test_insert_batch_across_cf() {
+        let first = TestDB::open();
+        let second = TestDB::open();
+        let first_pairs = (1..100).map(|n| (n, n.to_string()));
+        let second_pairs = (1000..1100).map(|n| (n, n.to_string()));
+
+        let mut ops = first.batch();
+        ops.insert_batch(&first, first_pairs.clone())
+            .expect("Failed to batch insert");
+        ops.insert_batch(&second, second_pairs.clone())
+            .expect("Failed to batch insert");
+        ops.write().expect("Failed to execute batch");
+        for (key, expected) in first_pairs {
+            let found = first.get(&key).expect("Failed to get inserted key");
+            assert_eq!(Some(expected), found);
+        }
+        for (key, expected) in second_pairs {
+            let found = second.get(&key).expect("Failed to get inserted key");
+            assert_eq!(Some(expected), found);
+        }
+    }
+
+    /// Stages inserts for keys 1..100 and deletes for every odd key in the same batch,
+    /// then checks that the 49 even keys (2, 4, ..., 98) are all that is left.
+    #[test]
+    fn test_delete_batch() {
+        let db: TestDB<i32, String> = TestDB::open();
+        let mut wb = db.batch();
+        wb.insert_batch(&db, (1..100).map(|i| (i, i.to_string())))
+            .expect("Failed to batch insert");
+        // Delete the odd keys: 1, 3, ..., 99.
+        wb.delete_batch(&db, (1..100).step_by(2))
+            .expect("Failed to batch delete");
+        wb.write().expect("Failed to execute batch");
+        // Count survivors so that an empty store cannot pass the parity check vacuously.
+        let mut remaining = 0;
+        for k in db.keys() {
+            assert_eq!(k.unwrap() % 2, 0);
+            remaining += 1;
+        }
+        assert_eq!(remaining, 49);
+    }
+
+    /// Inserts keys 0..=100, deletes the range from 50 to 100 in the same batch, and checks
+    /// that the range is half-open: 50..100 is removed while 100 itself is kept.
+    #[test]
+    fn test_delete_range() {
+        let map: TestDB<i32, String> = TestDB::open();
+
+        // Key 100 is inserted on purpose so the exclusive upper bound can be observed.
+        let mut ops = map.batch();
+        ops.insert_batch(&map, (0..101).map(|n| (n, n.to_string())))
+            .expect("Failed to batch insert");
+        ops.delete_range(&map, &50, &100)
+            .expect("Failed to delete range");
+        ops.write().expect("Failed to execute batch");
+
+        // Keys below the range start survive ...
+        for k in 0..50 {
+            assert!(map.contains_key(&k).expect("Failed to query legal key"));
+        }
+        // ... keys inside the range are gone ...
+        for k in 50..100 {
+            assert!(!map.contains_key(&k).expect("Failed to query legal key"));
+        }
+        // ... and the end bound itself is still there because `to` is exclusive.
+        assert!(map.contains_key(&100).expect("Failed to query legel key"));
+    }
+
+    /// Exercises `clear` on an empty, a populated and a single-entry map; every `clear`
+    /// and `insert` result is unwrapped so a failing call cannot go unnoticed.
+    #[test]
+    fn test_clear() {
+        let db: TestDB<i32, String> = TestDB::open();
+        // Test clear of empty map
+        db.clear().expect("Failed to clear");
+
+        let keys_vals = (0..101).map(|i| (i, i.to_string()));
+        let mut wb = db.batch();
+        wb.insert_batch(&db, keys_vals)
+            .expect("Failed to batch insert");
+        wb.write().expect("Failed to execute batch");
+
+        // Check we have multiple entries
+        assert!(db.safe_iter().count() > 1);
+        db.clear().expect("Failed to clear");
+        assert_eq!(db.safe_iter().count(), 0);
+        // Clear again to ensure safety when clearing empty map
+        db.clear().expect("Failed to clear");
+        assert_eq!(db.safe_iter().count(), 0);
+        // Clear with one item
+        db.insert(&1, &"e".to_string()).expect("Failed to insert");
+        assert_eq!(db.safe_iter().count(), 1);
+        db.clear().expect("Failed to clear");
+        assert_eq!(db.safe_iter().count(), 0);
+    }
+
+    /// Checks `is_empty` on a fresh map, after a batch insert, and again after `clear`.
+    #[test]
+    fn test_is_empty() {
+        let db: TestDB<i32, String> = TestDB::open();
+
+        // Test empty map is truly empty
+        assert!(db.is_empty());
+        db.clear().expect("Failed to clear");
+        assert!(db.is_empty());
+
+        let keys_vals = (0..101).map(|i| (i, i.to_string()));
+        let mut wb = db.batch();
+        wb.insert_batch(&db, keys_vals)
+            .expect("Failed to batch insert");
+        wb.write().expect("Failed to execute batch");
+
+        // Check we have multiple entries and not empty
+        assert!(db.safe_iter().count() > 1);
+        assert!(!db.is_empty());
+
+        // Clear again to ensure empty works after clearing
+        db.clear().expect("Failed to clear");
+        assert_eq!(db.safe_iter().count(), 0);
+        assert!(db.is_empty());
+    }
+
+    /// Writes keys 0..=100 through `multi_insert` and reads each pair back with `get`.
+    #[test]
+    fn test_multi_insert() {
+        let store: TestDB<i32, String> = TestDB::open();
+        let pairs = (0..101).map(|n| (n, n.to_string()));
+
+        // The lazy iterator is cloned so the same pairs can be replayed for verification.
+        store
+            .multi_insert(pairs.clone())
+            .expect("Failed to multi-insert");
+
+        for (key, expected) in pairs {
+            let found = store.get(&key).expect("Failed to get inserted key");
+            assert_eq!(Some(expected), found);
+        }
+    }
+
+    /// Inserts keys 0..=100, removes the first 50 via `multi_remove`, and checks that
+    /// exactly the other 51 entries are still readable.
+    #[test]
+    fn test_multi_remove() {
+        let store: TestDB<i32, String> = TestDB::open();
+        let pairs = (0..101).map(|n| (n, n.to_string()));
+
+        store
+            .multi_insert(pairs.clone())
+            .expect("Failed to multi-insert");
+
+        // Every pair must be readable before anything is removed.
+        for (key, expected) in pairs.clone() {
+            let found = store.get(&key).expect("Failed to get inserted key");
+            assert_eq!(Some(expected), found);
+        }
+
+        // Drop the first 50 keys (0..50) in one call.
+        let doomed = pairs.clone().take(50).map(|(key, _)| key);
+        store.multi_remove(doomed).expect("Failed to multi-remove");
+        assert_eq!(store.safe_iter().count(), 101 - 50);
+
+        // The untouched tail (50..=100) must still be intact.
+        for (key, expected) in pairs.skip(50) {
+            let found = store.get(&key).expect("Failed to get inserted key");
+            assert_eq!(Some(expected), found);
+        }
+    }
+}
diff --git a/moveos/raw-store/src/traits.rs b/moveos/raw-store/src/traits.rs
new file mode 100644
index 000000000..a4154ba7f
--- /dev/null
+++ b/moveos/raw-store/src/traits.rs
@@ -0,0 +1,208 @@
+// Copyright (c) RoochNetwork
+// SPDX-License-Identifier: Apache-2.0
+
+// Copyright (c) Mysten Labs, Inc.
+// SPDX-License-Identifier: Apache-2.0
+use crate::RawStoreError;
+use async_trait::async_trait;
+use serde::{de::DeserializeOwned, Serialize};
+use std::{borrow::Borrow, collections::BTreeMap, error::Error};
+
+/// Synchronous key-value map over serializable keys `K` and values `V`. The bulk helpers
+/// have default implementations that simply loop over the per-key methods.
+pub trait Map<'a, K, V>
+where
+    K: Serialize + DeserializeOwned + ?Sized,
+    V: Serialize + DeserializeOwned,
+{
+    type Error: Error;
+    type Iterator: Iterator<Item = (K, V)>;
+    type SafeIterator: Iterator<Item = Result<(K, V), RawStoreError>>;
+    type Keys: Iterator<Item = Result<K, RawStoreError>>;
+    type Values: Iterator<Item = Result<V, RawStoreError>>;
+
+    /// Returns true if the map contains a value for the specified key.
+    fn contains_key(&self, key: &K) -> Result<bool, Self::Error>;
+
+    /// Returns the value for the given key from the map, if it exists.
+    fn get(&self, key: &K) -> Result<Option<V>, Self::Error>;
+
+    /// Returns the raw value (serialized bytes) for the given key from the map, if it exists.
+    fn get_raw_bytes(&self, key: &K) -> Result<Option<Vec<u8>>, Self::Error>;
+
+    /// Returns the value stored for `key`; if there is none, inserts `default()` and returns
+    /// that same value without reading it back (`default` only runs on a miss).
+    /// This method is not thread safe
+    fn get_or_insert_unsafe<F: FnOnce() -> V>(
+        &self,
+        key: &K,
+        default: F,
+    ) -> Result<V, Self::Error> {
+        if let Some(existing) = self.get(key)? {
+            return Ok(existing);
+        }
+        let value = default();
+        self.insert(key, &value)?;
+        Ok(value)
+    }
+
+    /// Inserts the given key-value pair into the map.
+    fn insert(&self, key: &K, value: &V) -> Result<(), Self::Error>;
+
+    /// Removes the entry for the given key from the map.
+    fn remove(&self, key: &K) -> Result<(), Self::Error>;
+
+    /// Removes every key-value pair from the map.
+    fn clear(&self) -> Result<(), Self::Error>;
+
+    /// Returns true if the map is empty, otherwise false.
+    fn is_empty(&self) -> bool;
+
+    /// Returns an iterator visiting each key-value pair in the map.
+    fn iter(&'a self) -> Self::Iterator;
+
+    /// Like `iter`, but restricted by the optional `lower_bound` / `upper_bound` keys.
+    fn iter_with_bounds(&'a self, lower_bound: Option<K>, upper_bound: Option<K>)
+        -> Self::Iterator;
+
+    /// Same as `iter` but performs status check, yielding each entry as a `Result`.
+    fn safe_iter(&'a self) -> Self::SafeIterator;
+
+    /// Returns an iterator over each key in the map.
+    fn keys(&'a self) -> Self::Keys;
+
+    /// Returns an iterator over each value in the map.
+    fn values(&'a self) -> Self::Values;
+
+    /// Returns a vector of values corresponding to the keys provided, non-atomically.
+    fn multi_get<J>(&self, keys: impl IntoIterator<Item = J>) -> Result<Vec<Option<V>>, Self::Error>
+    where
+        J: Borrow<K>,
+    {
+        keys.into_iter().map(|key| self.get(key.borrow())).collect()
+    }
+
+    /// Returns a vector of raw values corresponding to the keys provided, non-atomically.
+    fn multi_get_raw_bytes<J>(
+        &self,
+        keys: impl IntoIterator<Item = J>,
+    ) -> Result<Vec<Option<Vec<u8>>>, Self::Error>
+    where
+        J: Borrow<K>,
+    {
+        keys.into_iter()
+            .map(|key| self.get_raw_bytes(key.borrow()))
+            .collect()
+    }
+
+    /// Same as `multi_get`: this default implementation ignores `_chunk_size`.
+    fn chunked_multi_get<J>(
+        &self,
+        keys: impl IntoIterator<Item = J>,
+        _chunk_size: usize,
+    ) -> Result<Vec<Option<V>>, Self::Error>
+    where
+        J: Borrow<K>,
+    {
+        keys.into_iter().map(|key| self.get(key.borrow())).collect()
+    }
+
+    /// Inserts key-value pairs one by one, non-atomically; stops at the first error.
+    fn multi_insert<J, U>(
+        &self,
+        key_val_pairs: impl IntoIterator<Item = (J, U)>,
+    ) -> Result<(), Self::Error>
+    where
+        J: Borrow<K>,
+        U: Borrow<V>,
+    {
+        key_val_pairs
+            .into_iter()
+            .try_for_each(|(key, value)| self.insert(key.borrow(), value.borrow()))
+    }
+
+    /// Removes keys one by one, non-atomically; stops at the first error.
+    fn multi_remove<J>(&self, keys: impl IntoIterator<Item = J>) -> Result<(), Self::Error>
+    where
+        J: Borrow<K>,
+    {
+        keys.into_iter().try_for_each(|k| self.remove(k.borrow()))
+    }
+
+    /// Try to catch up with primary when running as secondary
+    fn try_catch_up_with_primary(&self) -> Result<(), Self::Error>;
+}
+
+/// Asynchronous counterpart of `Map` that declares only read operations plus
+/// `try_catch_up_with_primary`; its `Iterator` yields `Result` items.
+#[async_trait]
+pub trait AsyncMap<'a, K, V>
+where
+    K: Serialize + DeserializeOwned + ?Sized + Sync,
+    V: Serialize + DeserializeOwned + Sync + Send,
+{
+    type Error: Error;
+    type Iterator: Iterator<Item = Result<(K, V), RawStoreError>>;
+    type Keys: Iterator<Item = Result<K, RawStoreError>>;
+    type Values: Iterator<Item = Result<V, RawStoreError>>;
+
+    /// Reports whether a value is stored under `key`.
+    async fn contains_key(&self, key: &K) -> Result<bool, Self::Error>;
+
+    /// Fetches the value stored under `key`, or `None` when there is no such entry.
+    async fn get(&self, key: &K) -> Result<Option<V>, Self::Error>;
+
+    /// Fetches the still-serialized bytes stored under `key`, or `None` when absent.
+    async fn get_raw_bytes(&self, key: &K) -> Result<Option<Vec<u8>>, Self::Error>;
+
+    /// Reports whether the map holds no entries at all.
+    async fn is_empty(&self) -> bool;
+
+    /// Produces an iterator over every key-value pair in the map.
+    async fn iter(&'a self) -> Self::Iterator;
+
+    /// Produces an iterator over every key in the map.
+    async fn keys(&'a self) -> Self::Keys;
+
+    /// Produces an iterator over every value in the map.
+    async fn values(&'a self) -> Self::Values;
+
+    /// Looks up each of `keys`, non-atomically, returning one `Option<V>` per key.
+    async fn multi_get<J: Borrow<K>>(
+        &self,
+        keys: impl IntoIterator<Item = J> + Send,
+    ) -> Result<Vec<Option<V>>, Self::Error>;
+
+    /// Try to catch up with primary when running as secondary
+    async fn try_catch_up_with_primary(&self) -> Result<(), Self::Error>;
+}
+
+pub struct TableSummary {
+    pub num_keys: u64, // NOTE(review): presumably the number of keys in the table
+    pub key_bytes_total: usize, // NOTE(review): presumably the summed size of all keys
+    pub value_bytes_total: usize, // NOTE(review): presumably the summed size of all values
+    pub key_hist: hdrhistogram::Histogram<u64>, // presumably per-key sizes; TODO confirm
+    pub value_hist: hdrhistogram::Histogram<u64>, // presumably per-value sizes; TODO confirm
+}
+
+pub trait RawStoreDebug {
+    /// Dump a DB table with pagination; keys and values are returned as `String`s
+    fn dump_table(
+        &self,
+        table_name: String,
+        page_size: u16,
+        page_number: usize,
+    ) -> eyre::Result<BTreeMap<String, String>>;
+
+    /// Get the name of the DB. This is simply the name of the struct
+    fn primary_db_name(&self) -> String;
+
+    /// Get a map from each table name to its `(key type, value type)` pair
+    fn describe_all_tables(&self) -> BTreeMap<String, (String, String)>;
+
+    /// Count the entries in the table named `table_name`
+    fn count_table_keys(&self, table_name: String) -> eyre::Result<usize>;
+
+    /// Return the `TableSummary` of the table named `table_name`
+    fn table_summary(&self, table_name: String) -> eyre::Result<TableSummary>;
+}