diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7fadcff..9fafd21 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,7 +2,7 @@ name: CI on: push: branches: - - master + - main pull_request: jobs: @@ -10,12 +10,12 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@master + uses: actions/checkout@main - uses: actions-rs/toolchain@v1 with: profile: minimal - toolchain: 1.60.0 + toolchain: stable override: true - name: version info @@ -28,7 +28,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@master + uses: actions/checkout@main - uses: actions-rs/toolchain@v1 with: @@ -46,7 +46,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@master + uses: actions/checkout@main - uses: actions-rs/toolchain@v1 with: @@ -63,7 +63,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@master + uses: actions/checkout@main - uses: actions-rs/toolchain@v1 with: diff --git a/.gitignore b/.gitignore index f842fd9..cfee32f 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ Cargo.lock .DS_Store .idea/ old/ +docs/notebook/.python-version diff --git a/Cargo.toml b/Cargo.toml index 7461037..688c99d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,12 +1,24 @@ [package] name = "bfield" -version = "0.2.1" -authors = ["Roderick Bovee "] +description = "B-field datastructure implementation in Rust" +version = "0.3.0" +authors = ["Vincent Prouillet ", "Gerrit Gerritsen ", "Nick Greenfield "] +homepage = "https://github.com/onecodex/rust-bfield/" +repository = "https://github.com/onecodex/rust-bfield/" +readme = "README.md" +keywords = ["B-field", "probabilistic data structures"] +categories = ["data-structures"] edition = "2018" +license = "Apache 2.0" +exclude = [ + ".gitignore", + ".github/*", + "docs/*", +] [dependencies] bincode = "1" -mmap-bitvec = "0.4.0" +mmap-bitvec = "0.4.1" murmurhash3 = "0.0.5" serde = { version = 
"1.0", features = ["derive"] } once_cell = "1.3.1" diff --git a/README.md b/README.md index 9cfcfd6..81ccef7 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,12 @@ # `rust-bfield`, an implementation of the B-field probabilistic key-value data structure +[![Crates.io Version](https://img.shields.io/crates/v/bfield.svg)](https://crates.io/crates/bfield) + The B-field is a novel, probabilistic data structure for storing key-value pairs (or, said differently, it is a probabilistic associative array or map). B-fields support insertion (`insert`) and lookup (`get`) operations, and share a number of mathematical and performance properties with the well-known [Bloom filter](https://doi.org/10.1145/362686.362692). At [One Codex](https://www.onecodex.com), we use the `rust-bfield` crate in bioinformatics applications to efficiently store associations between billions of $k$-length nucleotide substrings (["k-mers"](https://en.wikipedia.org/wiki/K-mer)) and [their taxonomic identity](https://www.ncbi.nlm.nih.gov/taxonomy) _**using only 6-7 bytes per `(kmer, value)` pair**_ for up to 100,000 unique taxonomic IDs (distinct values) and a 0.1% error rate. We hope others are able to use this library (or implementations in other languages) for applications in bioinformatics and beyond. -> _Note: In the [Implementation Details](#implementation-details) section below, we detail the use of this B-field implementation in Rust and use `code` formatting and English parameter names (e.g., we discuss the B-field being a data structure for storing `(key, value)` pairs). In the following [Formal Data Structure Details](#formal-data-structure-details) section, we detail the design and mechanics of the B-field using mathematical notation (i.e., we discuss it as an associate array mapping a set of_ $(x, y)$ _pairs). 
The generated Rust documentation includes both notations for ease of reference._ +> _Note: In the [Implementation Details](#implementation-details) section below, we detail the use of this B-field implementation in Rust and use `code` formatting and English parameter names (e.g., we discuss the B-field being a data structure for storing `(key, value)` pairs). In the following [Formal Data Structure Details](#formal-data-structure-details) section, we detail the design and mechanics of the B-field using mathematical notation (i.e., we discuss it as an associative array mapping a set of_ $(x, y)$ _pairs). The [generated Rust documentation](https://docs.rs/bfield/latest/bfield/) includes both notations for ease of reference._ ## Implementation Details @@ -73,7 +75,7 @@ for p in 0..4u32 { * After creation, a B-field can optionally be loaded from a directory containing the produced `mmap` and related files with the `load` function. And once created or loaded, a B-field can be directly queried using the `get` function, which will either return `None`, `Indeterminate`, or `Some(BFieldValue)` (which is currently an alias for `Some(u32)` see [limitations](#⚠️-current-limitations-of-the-rust-bfield-implementation) below for more details): -```rust +```rust no_run use bfield::BField; // Load based on filename of the first array ".0.bfd" diff --git a/src/bfield.rs b/src/bfield.rs index 1a9ffbf..f37c906 100644 --- a/src/bfield.rs +++ b/src/bfield.rs @@ -7,7 +7,7 @@ use serde::Serialize; use crate::bfield_member::{BFieldLookup, BFieldMember, BFieldVal}; -/// The struct holding the various bfields +/// The `struct` holding the `BField` primary and secondary bit arrays. pub struct BField { members: Vec>, read_only: bool, @@ -18,18 +18,26 @@ unsafe impl Send for BField {} unsafe impl Sync for BField {} impl BField { - /// The (complicated) method to create a bfield. 
- /// The bfield files will be created in `directory` with the given `filename` and the - /// suffixes `(0..n_secondaries).bfd` - /// `size` is the primary bfield size, subsequent bfield sizes will be determined by - /// `secondary_scaledown` and `max_scaledown`. - /// If you set `in_memory` to true, remember to call `persist_to_disk` when it's built to + /// A (rather complex) method for creating a `BField`. + /// + /// This will create a series of `BField` bit array files in `directory` with the given `filename` and the + /// suffixes `(0..n_secondaries).bfd`. If you set `in_memory` to true, remember to call `persist_to_disk` once it's built to /// save it. - /// The params are the following in the paper: - /// `n_hashes` -> k - /// `marker_width` -> v (nu) - /// `n_marker_bits` -> κ (kappa) - /// `secondary_scaledown` -> β (beta) + /// + /// The following parameters are required. See the [README.md](https://github.com/onecodex/rust-bfield/) + /// for additional details as well as the + /// [parameter selection notebook](https://github.com/onecodex/rust-bfield/blob/main/docs/notebook/calculate-parameters.ipynb) + /// for helpful guidance in picking optimal parameters. + /// - `size` is the primary `BField` size, subsequent `BField` sizes will be determined + /// by the `secondary_scaledown` and `max_scaledown` parameters + /// - `n_hashes`. The number of hash functions _k_ to use. + /// - `marker_width` or v (nu). The length of the bit-string to use for encoding each value. + /// - `n_marker_bits` or κ (kappa). The number of 1s to set in each v-length bit-string (also its Hamming weight). + /// - `secondary_scaledown` or β (beta). The scaling factor to use for each subsequent `BField` size. + /// - `max_scaledown`. A maximum scaling factor to use for secondary `BField` sizes, since β raised to the power of + /// `n_secondaries` can be impractically/needlessly small. + /// - `n_secondaries`. The number of secondary `BField`s to create. + /// - `in_memory`. 
Whether to create the `BField` in memory or on disk. #[allow(clippy::too_many_arguments)] pub fn create

( directory: P, @@ -84,7 +92,7 @@ impl BField { }) } - /// Loads the bfield given the path to the "main" db path (eg the one ending with `0.bfd`). + /// Loads the `BField` given the path to the primary array data file (eg the one ending with `0.bfd`). pub fn load>(main_db_path: P, read_only: bool) -> Result { let mut members = Vec::new(); let mut n = 0; @@ -126,8 +134,8 @@ impl BField { Ok(BField { members, read_only }) } - /// Write the current bfields to disk. - /// Only useful if you are creating a bfield in memory + /// Write the current `BField` to disk. + /// Only useful if you are creating a `BField` in memory. pub fn persist_to_disk(self) -> Result { let mut members = Vec::with_capacity(self.members.len()); for m in self.members { @@ -139,32 +147,32 @@ impl BField { }) } - /// Returns (n_hashes, marker_width, n_marker_bits, Vec) + /// Returns `(n_hashes, marker_width, n_marker_bits, Vec)`. pub fn build_params(&self) -> (u8, u8, u8, Vec) { let (_, n_hashes, marker_width, n_marker_bits) = self.members[0].info(); let sizes = self.members.iter().map(|i| i.info().0).collect(); (n_hashes, marker_width, n_marker_bits, sizes) } - /// Returns the params given at build time to the bfields + /// Returns the params given at build time to the `BField` arrays. pub fn params(&self) -> &Option { &self.members[0].params.other } - /// This doesn't actually update the file, so we can use it to e.g. - /// simulate params on an old legacy file that may not actually have - /// them set. + /// ⚠️ Method for setting parameters without actually updating any files on disk. **Only useful for supporting legacy file formats + /// in which these parameters are not saved.** pub fn mock_params(&mut self, params: T) { self.members[0].params.other = Some(params); } - /// This allows an insert of a value into the b-field after the entire - /// b-field build process has been completed. 
- /// - /// It has the very bad downside of potentially knocking other keys out - /// of the b-field by making them indeterminate (which will make them fall - /// back to the secondaries where they don't exist and thus it'll appear - /// as if they were never inserted to begin with) + /// ⚠️ Method for inserting a value into a `BField` + /// after it has been fully built and finalized. + /// **This method should be used with extreme care** + /// as it does not guarantee that keys are properly propagated + /// to secondary arrays and therefore may make lookups of previously + /// set values return an indeterminate result in the primary array, + /// then causing fallback to the secondary arrays where they were never + /// inserted (and returning a false negative). pub fn force_insert(&self, key: &[u8], value: BFieldVal) { debug_assert!(!self.read_only, "Can't insert into read_only bfields"); for secondary in &self.members { @@ -174,8 +182,8 @@ impl BField { } } - /// Insert the given key/value at the given pass - /// Returns whether the value was inserted during this call, eg will return `false` if + /// Insert the given key/value at the given pass (1-indexed `BField` array/member). + /// Returns whether the value was inserted during this call, i.e., will return `false` if /// the value was already present. pub fn insert(&self, key: &[u8], value: BFieldVal, pass: usize) -> bool { debug_assert!(!self.read_only, "Can't insert into read_only bfields"); @@ -195,8 +203,8 @@ impl BField { true } - /// Returns the value of the given key if found, None otherwise. - /// If the value is indeterminate, we still return None. + /// Returns the value of the given key if found, `None` otherwise. + /// The current implementation also returns `None` for indeterminate values. 
pub fn get(&self, key: &[u8]) -> Option { for secondary in self.members.iter() { match secondary.get(key) { @@ -210,8 +218,8 @@ impl BField { None } - /// Get the info of each member - /// Returns Vec<(size, n_hashes, marker_width, n_marker_bits)> + /// Get the info of each secondary array (`BFieldMember`) in the `BField`. + /// Returns `Vec<(size, n_hashes, marker_width, n_marker_bits)>`. pub fn info(&self) -> Vec<(usize, u8, u8, u8)> { self.members.iter().map(|m| m.info()).collect() } @@ -304,6 +312,7 @@ mod tests { } } +// Causes cargo test to run doc tests on all `rust` code blocks #[doc = include_str!("../README.md")] #[cfg(doctest)] -pub struct ReadmeDoctests; +struct ReadmeDoctests; diff --git a/src/combinatorial.rs b/src/combinatorial.rs index 61780f9..28d0117 100644 --- a/src/combinatorial.rs +++ b/src/combinatorial.rs @@ -63,9 +63,9 @@ pub fn unrank(marker: u128) -> usize { value as usize } -/// (Hopefully) fast implementation of a binomial +/// (Hopefully) fast implementation of a binomial. /// -/// This uses a preset group of equations for k < 8 and then falls back to a +/// This function uses a preset group of equations for k < 8 and then falls back to a /// multiplicative implementation that tries to prevent overflows while /// maintaining all results as exact integers. #[inline] diff --git a/src/lib.rs b/src/lib.rs index 64f7338..1e2c966 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,7 +1,15 @@ #![deny(missing_docs)] -//! The bfield datastructure, implemented in Rust. +//! The B-field data structure, implemented in Rust. //! A space-efficient, probabilistic data structure and storage and retrieval method for key-value information. +//! These Rust docs represent some minimal documentation of the crate itself. +//! See the [GitHub README](https://github.com/onecodex/rust-bfield) for an +//! extensive write-up, including the math and design underpinning the B-field +//! data structure, guidance on B-field parameter selection, as well as usage +//! 
examples.[^1] +//! +//! [^1]: These are not embeddable in the Cargo docs as they include MathJax, +//! which is currently unsupported. mod bfield; mod bfield_member;