Skip to content

Commit

Permalink
MRG: standardize on u32 for scaled, and introduce ScaledType (#3364)
Browse files Browse the repository at this point in the history
Makes `scaled` a u32, and change a few others as well.

This PR started because we were mixing u32 and u64 in places, but I
think a switch away from usize (architecture specific & mostly returned
from collection lengths, and so on) to explicit u32/u64 seems good.

Fixes #3363

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Luiz Irber <[email protected]>
  • Loading branch information
3 people authored Nov 5, 2024
1 parent 122c4b7 commit 2cc44e0
Show file tree
Hide file tree
Showing 20 changed files with 169 additions and 169 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 6 additions & 4 deletions include/sourmash.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ typedef struct SourmashSignature SourmashSignature;

typedef struct SourmashZipStorage SourmashZipStorage;

typedef uint32_t ScaledType;

/**
* Represents a string.
*/
Expand Down Expand Up @@ -104,7 +106,7 @@ uint32_t computeparams_num_hashes(const SourmashComputeParameters *ptr);

bool computeparams_protein(const SourmashComputeParameters *ptr);

uint64_t computeparams_scaled(const SourmashComputeParameters *ptr);
ScaledType computeparams_scaled(const SourmashComputeParameters *ptr);

uint64_t computeparams_seed(const SourmashComputeParameters *ptr);

Expand All @@ -122,7 +124,7 @@ void computeparams_set_num_hashes(SourmashComputeParameters *ptr, uint32_t num);

void computeparams_set_protein(SourmashComputeParameters *ptr, bool v);

void computeparams_set_scaled(SourmashComputeParameters *ptr, uint64_t scaled);
void computeparams_set_scaled(SourmashComputeParameters *ptr, uint32_t scaled);

void computeparams_set_seed(SourmashComputeParameters *ptr, uint64_t new_seed);

Expand Down Expand Up @@ -230,7 +232,7 @@ SourmashStr kmerminhash_md5sum(const SourmashKmerMinHash *ptr);

void kmerminhash_merge(SourmashKmerMinHash *ptr, const SourmashKmerMinHash *other);

SourmashKmerMinHash *kmerminhash_new(uint64_t scaled,
SourmashKmerMinHash *kmerminhash_new(uint32_t scaled,
uint32_t k,
HashFunctions hash_function,
uint64_t seed,
Expand Down Expand Up @@ -342,7 +344,7 @@ SourmashRevIndex *revindex_new_with_sigs(const SourmashSignature *const *search_
const SourmashKmerMinHash *const *queries_ptr,
uintptr_t inqueries);

uint64_t revindex_scaled(const SourmashRevIndex *ptr);
ScaledType revindex_scaled(const SourmashRevIndex *ptr);

const SourmashSearchResult *const *revindex_search(const SourmashRevIndex *ptr,
const SourmashSignature *sig_ptr,
Expand Down
2 changes: 1 addition & 1 deletion src/core/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "sourmash"
version = "0.16.0"
version = "0.17.0"
authors = ["Luiz Irber <[email protected]>", "N. Tessa Pierce-Ward <[email protected]>"]
description = "tools for comparing biological sequences with k-mer sketches"
repository = "https://github.com/sourmash-bio/sourmash"
Expand Down
4 changes: 2 additions & 2 deletions src/core/src/ani_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
use roots::{find_root_brent, SimpleConvergency};
use statrs::distribution::{ContinuousCDF, Normal};

use crate::Error;
use crate::{Error, ScaledType};

fn exp_n_mutated(l: f64, k: f64, r1: f64) -> f64 {
let q = r1_to_q(k, r1);
Expand Down Expand Up @@ -93,7 +93,7 @@ pub fn ani_from_containment(containment: f64, ksize: f64) -> f64 {
pub fn ani_ci_from_containment(
containment: f64,
ksize: f64,
scaled: u64,
scaled: ScaledType,
n_unique_kmers: u64,
confidence: Option<f64>,
) -> Result<(f64, f64), Error> {
Expand Down
4 changes: 2 additions & 2 deletions src/core/src/cmd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,8 @@ pub struct ComputeParameters {
singleton: bool,

#[getset(get_copy = "pub", set = "pub")]
#[builder(default = 0u64)]
scaled: u64,
#[builder(default = 0u32)]
scaled: u32,

#[getset(get_copy = "pub", set = "pub")]
#[builder(default = false)]
Expand Down
5 changes: 3 additions & 2 deletions src/core/src/ffi/cmd/compute.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use std::slice;

use crate::cmd::ComputeParameters;
use crate::ScaledType;

use crate::ffi::utils::ForeignObject;

Expand Down Expand Up @@ -155,15 +156,15 @@ pub unsafe extern "C" fn computeparams_set_num_hashes(
}

#[no_mangle]
pub unsafe extern "C" fn computeparams_scaled(ptr: *const SourmashComputeParameters) -> u64 {
pub unsafe extern "C" fn computeparams_scaled(ptr: *const SourmashComputeParameters) -> ScaledType {
let cp = SourmashComputeParameters::as_rust(ptr);
cp.scaled()
}

#[no_mangle]
pub unsafe extern "C" fn computeparams_set_scaled(
ptr: *mut SourmashComputeParameters,
scaled: u64,
scaled: u32,
) {
let cp = SourmashComputeParameters::as_rust_mut(ptr);
cp.set_scaled(scaled);
Expand Down
7 changes: 4 additions & 3 deletions src/core/src/ffi/index/revindex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ use crate::prelude::*;
use crate::signature::{Signature, SigsTrait};
use crate::sketch::minhash::KmerMinHash;
use crate::sketch::Sketch;
use crate::ScaledType;

pub struct SourmashRevIndex;

Expand All @@ -22,8 +23,8 @@ impl ForeignObject for SourmashRevIndex {
// TODO: remove this when it is possible to pass Selection thru the FFI
fn from_template(template: &Sketch) -> Selection {
let (num, scaled) = match template {
Sketch::MinHash(mh) => (mh.num(), mh.scaled() as u32),
Sketch::LargeMinHash(mh) => (mh.num(), mh.scaled() as u32),
Sketch::MinHash(mh) => (mh.num(), mh.scaled()),
Sketch::LargeMinHash(mh) => (mh.num(), mh.scaled()),
_ => unimplemented!(),
};

Expand Down Expand Up @@ -233,7 +234,7 @@ unsafe fn revindex_gather(
}

#[no_mangle]
pub unsafe extern "C" fn revindex_scaled(ptr: *const SourmashRevIndex) -> u64 {
pub unsafe extern "C" fn revindex_scaled(ptr: *const SourmashRevIndex) -> ScaledType {
let revindex = SourmashRevIndex::as_rust(ptr);
if let Sketch::MinHash(mh) = revindex.template() {
mh.scaled()
Expand Down
2 changes: 1 addition & 1 deletion src/core/src/ffi/minhash.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ impl ForeignObject for SourmashKmerMinHash {

#[no_mangle]
pub unsafe extern "C" fn kmerminhash_new(
scaled: u64,
scaled: u32,
k: u32,
hash_function: HashFunctions,
seed: u64,
Expand Down
9 changes: 5 additions & 4 deletions src/core/src/index/linear.rs
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,8 @@ impl LinearIndex {
.internal_location()
.into();
let match_sig = self.collection.sig_for_dataset(dataset_id)?;
let result = self.stats_for_match(match_sig, query, match_size, match_path, round)?;
let result =
self.stats_for_match(match_sig, query, match_size, match_path, round as u32)?;
Ok(result)
}

Expand All @@ -161,7 +162,7 @@ impl LinearIndex {
query: &KmerMinHash,
match_size: usize,
match_path: PathBuf,
gather_result_rank: usize,
gather_result_rank: u32,
) -> Result<GatherResult> {
let template = self.template();

Expand All @@ -176,10 +177,10 @@ impl LinearIndex {
let f_match = match_size as f64 / match_mh.size() as f64;
let filename = match_path.into_string();
let name = match_sig.name();
let unique_intersect_bp = match_mh.scaled() as usize * match_size;
let unique_intersect_bp = (match_mh.scaled() as usize * match_size) as u64;

let (intersect_orig, _) = match_mh.intersection_size(query)?;
let intersect_bp = (match_mh.scaled() * intersect_orig) as usize;
let intersect_bp: u64 = match_mh.scaled() as u64 * intersect_orig;

let f_unique_to_query = intersect_orig as f64 / query.size() as f64;
let match_ = match_sig;
Expand Down
29 changes: 15 additions & 14 deletions src/core/src/index/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ use crate::Result;
#[derive(TypedBuilder, CopyGetters, Getters, Setters, Serialize, Deserialize, Debug, PartialEq)]
pub struct GatherResult {
#[getset(get_copy = "pub")]
intersect_bp: usize,
intersect_bp: u64,

#[getset(get_copy = "pub")]
f_orig_query: f64,
Expand Down Expand Up @@ -72,22 +72,22 @@ pub struct GatherResult {
f_match_orig: f64,

#[getset(get_copy = "pub")]
unique_intersect_bp: usize,
unique_intersect_bp: u64,

#[getset(get_copy = "pub")]
gather_result_rank: usize,
gather_result_rank: u32,

#[getset(get_copy = "pub")]
remaining_bp: usize,
remaining_bp: u64,

#[getset(get_copy = "pub")]
n_unique_weighted_found: usize,
n_unique_weighted_found: u64,

#[getset(get_copy = "pub")]
total_weighted_hashes: usize,
total_weighted_hashes: u64,

#[getset(get_copy = "pub")]
sum_weighted_found: usize,
sum_weighted_found: u64,

#[getset(get_copy = "pub")]
query_containment_ani: f64,
Expand Down Expand Up @@ -212,9 +212,9 @@ pub fn calculate_gather_stats(
remaining_query: KmerMinHash,
match_sig: SigStore,
match_size: usize,
gather_result_rank: usize,
sum_weighted_found: usize,
total_weighted_hashes: usize,
gather_result_rank: u32,
sum_weighted_found: u64,
total_weighted_hashes: u64,
calc_abund_stats: bool,
calc_ani_ci: bool,
confidence: Option<f64>,
Expand Down Expand Up @@ -242,17 +242,18 @@ pub fn calculate_gather_stats(
trace!("query.size: {}", remaining_query.size());

//bp remaining in subtracted query
let remaining_bp = (remaining_query.size() - isect_size) * remaining_query.scaled() as usize;
let remaining_bp =
(remaining_query.size() - isect_size) as u64 * remaining_query.scaled() as u64;

// stats for this match vs original query
let (intersect_orig, _) = match_mh.intersection_size(orig_query).unwrap();
let intersect_bp = (match_mh.scaled() * intersect_orig) as usize;
let intersect_bp = match_mh.scaled() as u64 * intersect_orig;
let f_orig_query = intersect_orig as f64 / orig_query.size() as f64;
let f_match_orig = intersect_orig as f64 / match_mh.size() as f64;

// stats for this match vs current (subtracted) query
let f_match = match_size as f64 / match_mh.size() as f64;
let unique_intersect_bp = match_mh.scaled() as usize * isect_size;
let unique_intersect_bp = match_mh.scaled() as u64 * isect_size as u64;
let f_unique_to_query = isect_size as f64 / orig_query.size() as f64;

// // get ANI values
Expand Down Expand Up @@ -309,7 +310,7 @@ pub fn calculate_gather_stats(
}
};

n_unique_weighted_found = unique_weighted_found as usize;
n_unique_weighted_found = unique_weighted_found;
sum_total_weighted_found = sum_weighted_found + n_unique_weighted_found;
f_unique_weighted = n_unique_weighted_found as f64 / total_weighted_hashes as f64;

Expand Down
4 changes: 2 additions & 2 deletions src/core/src/index/revindex/disk_revindex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -411,7 +411,7 @@ impl RevIndexOps for RevIndex {
let query_mh = KmerMinHash::from(query.clone());

// just calculate essentials here
let gather_result_rank = matches.len();
let gather_result_rank = matches.len() as u32;

// grab the specific intersection:
// Calculate stats
Expand All @@ -422,7 +422,7 @@ impl RevIndexOps for RevIndex {
match_size,
gather_result_rank,
sum_weighted_found,
total_weighted_hashes.try_into().unwrap(),
total_weighted_hashes,
calc_abund_stats,
calc_ani_ci,
ani_confidence_interval_fraction,
Expand Down
1 change: 1 addition & 0 deletions src/core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ cfg_if! {
}

type HashIntoType = u64;
pub type ScaledType = u32;

pub fn _hash_murmur(kmer: &[u8], seed: u64) -> u64 {
murmurhash3_x64_128(kmer, seed).0
Expand Down
6 changes: 3 additions & 3 deletions src/core/src/manifest.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ use crate::encodings::HashFunctions;
use crate::prelude::*;
use crate::signature::SigsTrait;
use crate::sketch::Sketch;
use crate::Result;
use crate::{Result, ScaledType};

/// Individual manifest record, containing information about sketches.
Expand All @@ -38,7 +38,7 @@ pub struct Record {
num: u32,

#[getset(get = "pub")]
scaled: u64,
scaled: ScaledType,

#[getset(get = "pub")]
n_hashes: usize,
Expand Down Expand Up @@ -279,7 +279,7 @@ impl Select for Manifest {
};
valid = if let Some(scaled) = selection.scaled() {
// num sigs have row.scaled = 0, don't include them
valid && row.scaled != 0 && row.scaled <= scaled as u64
valid && row.scaled != 0 && row.scaled <= scaled
} else {
valid
};
Expand Down
8 changes: 4 additions & 4 deletions src/core/src/selection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use typed_builder::TypedBuilder;

use crate::encodings::HashFunctions;
use crate::manifest::Record;
use crate::Result;
use crate::{Result, ScaledType};

#[derive(Default, Debug, TypedBuilder, Clone)]
pub struct Selection {
Expand All @@ -17,7 +17,7 @@ pub struct Selection {
num: Option<u32>,

#[builder(default, setter(strip_option))]
scaled: Option<u32>,
scaled: Option<ScaledType>,

#[builder(default, setter(strip_option))]
containment: Option<bool>,
Expand Down Expand Up @@ -87,11 +87,11 @@ impl Selection {
self.num = Some(num);
}

pub fn scaled(&self) -> Option<u32> {
pub fn scaled(&self) -> Option<ScaledType> {
self.scaled
}

pub fn set_scaled(&mut self, scaled: u32) {
pub fn set_scaled(&mut self, scaled: ScaledType) {
self.scaled = Some(scaled);
}

Expand Down
6 changes: 3 additions & 3 deletions src/core/src/signature.rs
Original file line number Diff line number Diff line change
Expand Up @@ -812,7 +812,7 @@ impl Select for Signature {
// keep compatible scaled if applicable
valid = if let Some(sel_scaled) = selection.scaled() {
match s {
Sketch::MinHash(mh) => valid && mh.scaled() <= sel_scaled as u64,
Sketch::MinHash(mh) => valid && mh.scaled() <= sel_scaled,
// TODO: test LargeMinHash
// Sketch::LargeMinHash(lmh) => valid && lmh.scaled() <= sel_scaled as u64,
_ => valid, // other sketch types or invalid cases
Expand Down Expand Up @@ -841,8 +841,8 @@ impl Select for Signature {
for sketch in self.signatures.iter_mut() {
// TODO: also account for LargeMinHash
if let Sketch::MinHash(mh) = sketch {
if (mh.scaled() as u32) < sel_scaled {
*sketch = Sketch::MinHash(mh.clone().downsample_scaled(sel_scaled as u64)?);
if mh.scaled() < sel_scaled {
*sketch = Sketch::MinHash(mh.clone().downsample_scaled(sel_scaled)?);
}
}
}
Expand Down
Loading

0 comments on commit 2cc44e0

Please sign in to comment.