diff --git a/Cargo.lock b/Cargo.lock index 122923d2e7..d21cb3d52b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -52,12 +52,6 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "41ed9a86bf92ae6580e0a31281f65a1b1d867c0cc68d5346e2ae128dddfa6a7d" -[[package]] -name = "assert_matches" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b34d609dfbaf33d6889b2b7106d3ca345eacad44200913df5ba02bfd31d2ba9" - [[package]] name = "autocfg" version = "1.1.0" @@ -1019,15 +1013,6 @@ dependencies = [ "libc", ] -[[package]] -name = "numsep" -version = "0.1.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad5c49c3e12c314efb1f43cba136031b657dcd59ee26936ab2be313c5e97da22" -dependencies = [ - "slicestring", -] - [[package]] name = "once_cell" version = "1.18.0" @@ -1486,18 +1471,6 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3" -[[package]] -name = "size" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fed904c7fb2856d868b92464fc8fa597fce366edea1a9cbfaa8cb5fe080bd6d" - -[[package]] -name = "slicestring" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "636b979c5672ac7c2a1120ca0a9a6074cd090dadfec42af6f8a5baea1223d180" - [[package]] name = "smallvec" version = "1.8.0" @@ -1514,7 +1487,6 @@ checksum = "9f1341053f34bb13b5e9590afb7d94b48b48d4b87467ec28e3c238693bb553de" name = "sourmash" version = "0.12.0" dependencies = [ - "assert_matches", "az", "bytecount", "byteorder", @@ -1538,7 +1510,6 @@ dependencies = [ "niffler", "nohash-hasher", "num-iter", - "numsep", "once_cell", "ouroboros", "piz", @@ -1551,7 +1522,6 @@ dependencies = [ "rocksdb", "serde", "serde_json", - "size", "tempfile", "thiserror", "twox-hash", diff --git a/src/core/Cargo.toml b/src/core/Cargo.toml index 73e42057e2..d2b07848bb 100644 --- a/src/core/Cargo.toml +++ b/src/core/Cargo.toml @@ -46,7 +46,6 @@ murmurhash3 = "0.0.5" niffler = { version = "2.3.1", default-features = false, features = [ "gz" ] } nohash-hasher = "0.2.0" num-iter = "0.1.43" -numsep = "0.1.12" once_cell = "1.18.0" ouroboros = "0.18.0" piz = "0.5.0" @@ -56,14 +55,12 @@ rkyv = { version = "0.7.39", optional = true } roaring = "0.10.0" serde = { version = "1.0.168", features = ["derive"] } serde_json = "1.0.107" -size = "0.4.0" thiserror = "1.0" twox-hash = "1.6.0" typed-builder = "0.14.0" vec-collections = "0.4.3" [dev-dependencies] -assert_matches = "1.3.0" criterion = "0.5.1" needletail = { version = "0.5.1", default-features = false } proptest = { version = "1.2.0", default-features = false, features = ["std"]} diff --git a/src/core/src/index/revindex/disk_revindex.rs b/src/core/src/index/revindex/disk_revindex.rs index 5080ca7ce1..5ab4a5c321 100644 --- a/src/core/src/index/revindex/disk_revindex.rs +++ b/src/core/src/index/revindex/disk_revindex.rs @@ -10,10 +10,9 @@ use rocksdb::{ColumnFamilyDescriptor, MergeOperands, Options}; use crate::collection::{Collection, CollectionSet}; use crate::encodings::{Color, Idx}; -use crate::index::revindex::prepare_query; use crate::index::revindex::{ - self as module, stats_for_cf, Datasets, HashToColor, QueryColors, RevIndexOps, DB, HASHES, - MANIFEST, METADATA, STORAGE_SPEC, + self as module, prepare_query, stats_for_cf, Datasets, DbStats, HashToColor, QueryColors, + RevIndexOps, DB, HASHES, MANIFEST, METADATA, STORAGE_SPEC, }; use crate::index::{GatherResult, SigCounter}; use crate::manifest::Manifest; @@ -409,8 +408,8 @@ impl RevIndexOps for RevIndex { Ok(module::RevIndex::Plain(self)) } - fn check(&self, quick: bool) { - stats_for_cf(self.db.clone(), HASHES, true, quick); + fn check(&self, quick: bool) -> DbStats { + stats_for_cf(self.db.clone(), HASHES, true, quick) } fn compact(&self) { diff --git a/src/core/src/index/revindex/mod.rs b/src/core/src/index/revindex/mod.rs index a1f796bb7f..36245c604d 100644 --- a/src/core/src/index/revindex/mod.rs +++ b/src/core/src/index/revindex/mod.rs @@ -8,6 +8,7 @@ use std::sync::Arc; use byteorder::{LittleEndian, WriteBytesExt}; use enum_dispatch::enum_dispatch; +use getset::{Getters, Setters}; use nohash_hasher::BuildNoHashHasher; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; @@ -67,7 +68,7 @@ pub trait RevIndexOps { fn convert(&self, output_db: RevIndex) -> Result<()>; - fn check(&self, quick: bool); + fn check(&self, quick: bool) -> DbStats; fn gather( &self, @@ -381,11 +382,27 @@ impl Datasets { */ } -fn stats_for_cf(db: Arc, cf_name: &str, deep_check: bool, quick: bool) { +#[derive(Getters, Setters, Debug)] +pub struct DbStats { + #[getset(get = "pub")] + total_datasets: usize, + + #[getset(get = "pub")] + total_keys: usize, + + #[getset(get = "pub")] + kcount: usize, + + #[getset(get = "pub")] + vcount: usize, + + #[getset(get = "pub")] + vcounts: histogram::Histogram, +} + +fn stats_for_cf(db: Arc, cf_name: &str, deep_check: bool, quick: bool) -> DbStats { use byteorder::ReadBytesExt; use histogram::Histogram; - use log::info; - use numsep::{separate, Locale}; let cf = db.cf_handle(cf_name).unwrap(); @@ -411,28 +428,12 @@ fn stats_for_cf(db: Arc, cf_name: &str, deep_check: bool, quick: bool) { //println!("Saw {} {:?}", k, value); } - info!("*** {} ***", cf_name); - use size::Size; - let ksize = Size::from_bytes(kcount); - let vsize = Size::from_bytes(vcount); - if !quick && cf_name == COLORS { - info!( - "total datasets: {}", - separate(datasets.len(), Locale::English) - ); - } - info!("total keys: {}", separate(kcount / 8, Locale::English)); - - info!("k: {}", ksize.to_string()); - info!("v: {}", vsize.to_string()); - - if !quick && kcount > 0 && deep_check { - info!("max v: {}", vcounts.maximum().unwrap()); - info!("mean v: {}", vcounts.mean().unwrap()); - info!("stddev: {}", vcounts.stddev().unwrap()); - info!("median v: {}", vcounts.percentile(50.0).unwrap()); - info!("p25 v: {}", vcounts.percentile(25.0).unwrap()); - info!("p75 v: {}", vcounts.percentile(75.0).unwrap()); + DbStats { + total_datasets: datasets.len(), + total_keys: kcount / 8, + kcount, + vcount, + vcounts, } }