From 2cc44e026bc147a292a1dcac3ab2bcd046be09fa Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Tue, 5 Nov 2024 11:36:26 -0800
Subject: [PATCH] MRG: standardize on u32 for scaled, and introduce
 `ScaledType` (#3364)

Makes `scaled` a u32, and change a few others as well.

This PR started because we were mixing u32 and u64 in places, but I
think a switch away from usize (architecture specific & mostly returned
from collection lengths, and so on) to explicit u32/u64 seems good.

Fixes https://github.com/sourmash-bio/sourmash/issues/3363

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Luiz Irber <contact+github@luizirber.org>
---
 Cargo.lock                                   |   2 +-
 include/sourmash.h                           |  10 +-
 src/core/Cargo.toml                          |   2 +-
 src/core/src/ani_utils.rs                    |   4 +-
 src/core/src/cmd.rs                          |   4 +-
 src/core/src/ffi/cmd/compute.rs              |   5 +-
 src/core/src/ffi/index/revindex.rs           |   7 +-
 src/core/src/ffi/minhash.rs                  |   2 +-
 src/core/src/index/linear.rs                 |   9 +-
 src/core/src/index/mod.rs                    |  29 +--
 src/core/src/index/revindex/disk_revindex.rs |   4 +-
 src/core/src/lib.rs                          |   1 +
 src/core/src/manifest.rs                     |   6 +-
 src/core/src/selection.rs                    |   8 +-
 src/core/src/signature.rs                    |   6 +-
 src/core/src/sketch/minhash.rs               |  22 +--
 src/core/src/wasm.rs                         |   9 +-
 src/core/tests/minhash.rs                    |   3 +-
 tests/test_minhash.py                        | 197 +++++++++----------
 tests/test_signature.py                      |   8 +-
 20 files changed, 169 insertions(+), 169 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 58856f4c4e..0d35a5a679 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1657,7 +1657,7 @@ checksum = "9f1341053f34bb13b5e9590afb7d94b48b48d4b87467ec28e3c238693bb553de"
 
 [[package]]
 name = "sourmash"
-version = "0.16.0"
+version = "0.17.0"
 dependencies = [
  "az",
  "byteorder",
diff --git a/include/sourmash.h b/include/sourmash.h
index 65585519dd..beb64ea455 100644
--- a/include/sourmash.h
+++ b/include/sourmash.h
@@ -68,6 +68,8 @@ typedef struct SourmashSignature SourmashSignature;
 
 typedef struct SourmashZipStorage SourmashZipStorage;
 
+typedef uint32_t ScaledType;
+
 /**
  * Represents a string.
  */
@@ -104,7 +106,7 @@ uint32_t computeparams_num_hashes(const SourmashComputeParameters *ptr);
 
 bool computeparams_protein(const SourmashComputeParameters *ptr);
 
-uint64_t computeparams_scaled(const SourmashComputeParameters *ptr);
+ScaledType computeparams_scaled(const SourmashComputeParameters *ptr);
 
 uint64_t computeparams_seed(const SourmashComputeParameters *ptr);
 
@@ -122,7 +124,7 @@ void computeparams_set_num_hashes(SourmashComputeParameters *ptr, uint32_t num);
 
 void computeparams_set_protein(SourmashComputeParameters *ptr, bool v);
 
-void computeparams_set_scaled(SourmashComputeParameters *ptr, uint64_t scaled);
+void computeparams_set_scaled(SourmashComputeParameters *ptr, uint32_t scaled);
 
 void computeparams_set_seed(SourmashComputeParameters *ptr, uint64_t new_seed);
 
@@ -230,7 +232,7 @@ SourmashStr kmerminhash_md5sum(const SourmashKmerMinHash *ptr);
 
 void kmerminhash_merge(SourmashKmerMinHash *ptr, const SourmashKmerMinHash *other);
 
-SourmashKmerMinHash *kmerminhash_new(uint64_t scaled,
+SourmashKmerMinHash *kmerminhash_new(uint32_t scaled,
                                      uint32_t k,
                                      HashFunctions hash_function,
                                      uint64_t seed,
@@ -342,7 +344,7 @@ SourmashRevIndex *revindex_new_with_sigs(const SourmashSignature *const *search_
                                          const SourmashKmerMinHash *const *queries_ptr,
                                          uintptr_t inqueries);
 
-uint64_t revindex_scaled(const SourmashRevIndex *ptr);
+ScaledType revindex_scaled(const SourmashRevIndex *ptr);
 
 const SourmashSearchResult *const *revindex_search(const SourmashRevIndex *ptr,
                                                    const SourmashSignature *sig_ptr,
diff --git a/src/core/Cargo.toml b/src/core/Cargo.toml
index 8775b2af88..ad588cb22b 100644
--- a/src/core/Cargo.toml
+++ b/src/core/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "sourmash"
-version = "0.16.0"
+version = "0.17.0"
 authors = ["Luiz Irber <luiz@sourmash.bio>", "N. Tessa Pierce-Ward <tessa@sourmash.bio>"]
 description = "tools for comparing biological sequences with k-mer sketches"
 repository = "https://github.com/sourmash-bio/sourmash"
diff --git a/src/core/src/ani_utils.rs b/src/core/src/ani_utils.rs
index 073cf6050b..6886c481ed 100644
--- a/src/core/src/ani_utils.rs
+++ b/src/core/src/ani_utils.rs
@@ -5,7 +5,7 @@
 use roots::{find_root_brent, SimpleConvergency};
 use statrs::distribution::{ContinuousCDF, Normal};
 
-use crate::Error;
+use crate::{Error, ScaledType};
 
 fn exp_n_mutated(l: f64, k: f64, r1: f64) -> f64 {
     let q = r1_to_q(k, r1);
@@ -93,7 +93,7 @@ pub fn ani_from_containment(containment: f64, ksize: f64) -> f64 {
 pub fn ani_ci_from_containment(
     containment: f64,
     ksize: f64,
-    scaled: u64,
+    scaled: ScaledType,
     n_unique_kmers: u64,
     confidence: Option<f64>,
 ) -> Result<(f64, f64), Error> {
diff --git a/src/core/src/cmd.rs b/src/core/src/cmd.rs
index a760e0f79d..b59e99b912 100644
--- a/src/core/src/cmd.rs
+++ b/src/core/src/cmd.rs
@@ -47,8 +47,8 @@ pub struct ComputeParameters {
     singleton: bool,
 
     #[getset(get_copy = "pub", set = "pub")]
-    #[builder(default = 0u64)]
-    scaled: u64,
+    #[builder(default = 0u32)]
+    scaled: u32,
 
     #[getset(get_copy = "pub", set = "pub")]
     #[builder(default = false)]
diff --git a/src/core/src/ffi/cmd/compute.rs b/src/core/src/ffi/cmd/compute.rs
index a3aa294234..4cecd456e2 100644
--- a/src/core/src/ffi/cmd/compute.rs
+++ b/src/core/src/ffi/cmd/compute.rs
@@ -1,6 +1,7 @@
 use std::slice;
 
 use crate::cmd::ComputeParameters;
+use crate::ScaledType;
 
 use crate::ffi::utils::ForeignObject;
 
@@ -155,7 +156,7 @@ pub unsafe extern "C" fn computeparams_set_num_hashes(
 }
 
 #[no_mangle]
-pub unsafe extern "C" fn computeparams_scaled(ptr: *const SourmashComputeParameters) -> u64 {
+pub unsafe extern "C" fn computeparams_scaled(ptr: *const SourmashComputeParameters) -> ScaledType {
     let cp = SourmashComputeParameters::as_rust(ptr);
     cp.scaled()
 }
@@ -163,7 +164,7 @@ pub unsafe extern "C" fn computeparams_scaled(ptr: *const SourmashComputeParamet
 #[no_mangle]
 pub unsafe extern "C" fn computeparams_set_scaled(
     ptr: *mut SourmashComputeParameters,
-    scaled: u64,
+    scaled: u32,
 ) {
     let cp = SourmashComputeParameters::as_rust_mut(ptr);
     cp.set_scaled(scaled);
diff --git a/src/core/src/ffi/index/revindex.rs b/src/core/src/ffi/index/revindex.rs
index e38bdef7fb..ef0e328139 100644
--- a/src/core/src/ffi/index/revindex.rs
+++ b/src/core/src/ffi/index/revindex.rs
@@ -12,6 +12,7 @@ use crate::prelude::*;
 use crate::signature::{Signature, SigsTrait};
 use crate::sketch::minhash::KmerMinHash;
 use crate::sketch::Sketch;
+use crate::ScaledType;
 
 pub struct SourmashRevIndex;
 
@@ -22,8 +23,8 @@ impl ForeignObject for SourmashRevIndex {
 // TODO: remove this when it is possible to pass Selection thru the FFI
 fn from_template(template: &Sketch) -> Selection {
     let (num, scaled) = match template {
-        Sketch::MinHash(mh) => (mh.num(), mh.scaled() as u32),
-        Sketch::LargeMinHash(mh) => (mh.num(), mh.scaled() as u32),
+        Sketch::MinHash(mh) => (mh.num(), mh.scaled()),
+        Sketch::LargeMinHash(mh) => (mh.num(), mh.scaled()),
         _ => unimplemented!(),
     };
 
@@ -233,7 +234,7 @@ unsafe fn revindex_gather(
 }
 
 #[no_mangle]
-pub unsafe extern "C" fn revindex_scaled(ptr: *const SourmashRevIndex) -> u64 {
+pub unsafe extern "C" fn revindex_scaled(ptr: *const SourmashRevIndex) -> ScaledType {
     let revindex = SourmashRevIndex::as_rust(ptr);
     if let Sketch::MinHash(mh) = revindex.template() {
         mh.scaled()
diff --git a/src/core/src/ffi/minhash.rs b/src/core/src/ffi/minhash.rs
index 7d7c050fb0..55879dd060 100644
--- a/src/core/src/ffi/minhash.rs
+++ b/src/core/src/ffi/minhash.rs
@@ -17,7 +17,7 @@ impl ForeignObject for SourmashKmerMinHash {
 
 #[no_mangle]
 pub unsafe extern "C" fn kmerminhash_new(
-    scaled: u64,
+    scaled: u32,
     k: u32,
     hash_function: HashFunctions,
     seed: u64,
diff --git a/src/core/src/index/linear.rs b/src/core/src/index/linear.rs
index f0690539b9..489e69c86e 100644
--- a/src/core/src/index/linear.rs
+++ b/src/core/src/index/linear.rs
@@ -151,7 +151,8 @@ impl LinearIndex {
             .internal_location()
             .into();
         let match_sig = self.collection.sig_for_dataset(dataset_id)?;
-        let result = self.stats_for_match(match_sig, query, match_size, match_path, round)?;
+        let result =
+            self.stats_for_match(match_sig, query, match_size, match_path, round as u32)?;
         Ok(result)
     }
 
@@ -161,7 +162,7 @@ impl LinearIndex {
         query: &KmerMinHash,
         match_size: usize,
         match_path: PathBuf,
-        gather_result_rank: usize,
+        gather_result_rank: u32,
     ) -> Result<GatherResult> {
         let template = self.template();
 
@@ -176,10 +177,10 @@ impl LinearIndex {
         let f_match = match_size as f64 / match_mh.size() as f64;
         let filename = match_path.into_string();
         let name = match_sig.name();
-        let unique_intersect_bp = match_mh.scaled() as usize * match_size;
+        let unique_intersect_bp = (match_mh.scaled() as usize * match_size) as u64;
 
         let (intersect_orig, _) = match_mh.intersection_size(query)?;
-        let intersect_bp = (match_mh.scaled() * intersect_orig) as usize;
+        let intersect_bp: u64 = match_mh.scaled() as u64 * intersect_orig;
 
         let f_unique_to_query = intersect_orig as f64 / query.size() as f64;
         let match_ = match_sig;
diff --git a/src/core/src/index/mod.rs b/src/core/src/index/mod.rs
index 58330235ee..0bd9d9fec8 100644
--- a/src/core/src/index/mod.rs
+++ b/src/core/src/index/mod.rs
@@ -33,7 +33,7 @@ use crate::Result;
 #[derive(TypedBuilder, CopyGetters, Getters, Setters, Serialize, Deserialize, Debug, PartialEq)]
 pub struct GatherResult {
     #[getset(get_copy = "pub")]
-    intersect_bp: usize,
+    intersect_bp: u64,
 
     #[getset(get_copy = "pub")]
     f_orig_query: f64,
@@ -72,22 +72,22 @@ pub struct GatherResult {
     f_match_orig: f64,
 
     #[getset(get_copy = "pub")]
-    unique_intersect_bp: usize,
+    unique_intersect_bp: u64,
 
     #[getset(get_copy = "pub")]
-    gather_result_rank: usize,
+    gather_result_rank: u32,
 
     #[getset(get_copy = "pub")]
-    remaining_bp: usize,
+    remaining_bp: u64,
 
     #[getset(get_copy = "pub")]
-    n_unique_weighted_found: usize,
+    n_unique_weighted_found: u64,
 
     #[getset(get_copy = "pub")]
-    total_weighted_hashes: usize,
+    total_weighted_hashes: u64,
 
     #[getset(get_copy = "pub")]
-    sum_weighted_found: usize,
+    sum_weighted_found: u64,
 
     #[getset(get_copy = "pub")]
     query_containment_ani: f64,
@@ -212,9 +212,9 @@ pub fn calculate_gather_stats(
     remaining_query: KmerMinHash,
     match_sig: SigStore,
     match_size: usize,
-    gather_result_rank: usize,
-    sum_weighted_found: usize,
-    total_weighted_hashes: usize,
+    gather_result_rank: u32,
+    sum_weighted_found: u64,
+    total_weighted_hashes: u64,
     calc_abund_stats: bool,
     calc_ani_ci: bool,
     confidence: Option<f64>,
@@ -242,17 +242,18 @@ pub fn calculate_gather_stats(
     trace!("query.size: {}", remaining_query.size());
 
     //bp remaining in subtracted query
-    let remaining_bp = (remaining_query.size() - isect_size) * remaining_query.scaled() as usize;
+    let remaining_bp =
+        (remaining_query.size() - isect_size) as u64 * remaining_query.scaled() as u64;
 
     // stats for this match vs original query
     let (intersect_orig, _) = match_mh.intersection_size(orig_query).unwrap();
-    let intersect_bp = (match_mh.scaled() * intersect_orig) as usize;
+    let intersect_bp = match_mh.scaled() as u64 * intersect_orig;
     let f_orig_query = intersect_orig as f64 / orig_query.size() as f64;
     let f_match_orig = intersect_orig as f64 / match_mh.size() as f64;
 
     // stats for this match vs current (subtracted) query
     let f_match = match_size as f64 / match_mh.size() as f64;
-    let unique_intersect_bp = match_mh.scaled() as usize * isect_size;
+    let unique_intersect_bp = match_mh.scaled() as u64 * isect_size as u64;
     let f_unique_to_query = isect_size as f64 / orig_query.size() as f64;
 
     // // get ANI values
@@ -309,7 +310,7 @@ pub fn calculate_gather_stats(
             }
         };
 
-        n_unique_weighted_found = unique_weighted_found as usize;
+        n_unique_weighted_found = unique_weighted_found;
         sum_total_weighted_found = sum_weighted_found + n_unique_weighted_found;
         f_unique_weighted = n_unique_weighted_found as f64 / total_weighted_hashes as f64;
 
diff --git a/src/core/src/index/revindex/disk_revindex.rs b/src/core/src/index/revindex/disk_revindex.rs
index 0d575310aa..46552c2c67 100644
--- a/src/core/src/index/revindex/disk_revindex.rs
+++ b/src/core/src/index/revindex/disk_revindex.rs
@@ -411,7 +411,7 @@ impl RevIndexOps for RevIndex {
             let query_mh = KmerMinHash::from(query.clone());
 
             // just calculate essentials here
-            let gather_result_rank = matches.len();
+            let gather_result_rank = matches.len() as u32;
 
             // grab the specific intersection:
             // Calculate stats
@@ -422,7 +422,7 @@ impl RevIndexOps for RevIndex {
                 match_size,
                 gather_result_rank,
                 sum_weighted_found,
-                total_weighted_hashes.try_into().unwrap(),
+                total_weighted_hashes,
                 calc_abund_stats,
                 calc_ani_ci,
                 ani_confidence_interval_fraction,
diff --git a/src/core/src/lib.rs b/src/core/src/lib.rs
index 8676f683dc..d8f994f87c 100644
--- a/src/core/src/lib.rs
+++ b/src/core/src/lib.rs
@@ -53,6 +53,7 @@ cfg_if! {
 }
 
 type HashIntoType = u64;
+pub type ScaledType = u32;
 
 pub fn _hash_murmur(kmer: &[u8], seed: u64) -> u64 {
     murmurhash3_x64_128(kmer, seed).0
diff --git a/src/core/src/manifest.rs b/src/core/src/manifest.rs
index 2f1eca7e0f..a23776dfdc 100644
--- a/src/core/src/manifest.rs
+++ b/src/core/src/manifest.rs
@@ -15,7 +15,7 @@ use crate::encodings::HashFunctions;
 use crate::prelude::*;
 use crate::signature::SigsTrait;
 use crate::sketch::Sketch;
-use crate::Result;
+use crate::{Result, ScaledType};
 
 /// Individual manifest record, containing information about sketches.
 
@@ -38,7 +38,7 @@ pub struct Record {
     num: u32,
 
     #[getset(get = "pub")]
-    scaled: u64,
+    scaled: ScaledType,
 
     #[getset(get = "pub")]
     n_hashes: usize,
@@ -279,7 +279,7 @@ impl Select for Manifest {
             };
             valid = if let Some(scaled) = selection.scaled() {
                 // num sigs have row.scaled = 0, don't include them
-                valid && row.scaled != 0 && row.scaled <= scaled as u64
+                valid && row.scaled != 0 && row.scaled <= scaled
             } else {
                 valid
             };
diff --git a/src/core/src/selection.rs b/src/core/src/selection.rs
index 536981fa97..3c94c7f0c7 100644
--- a/src/core/src/selection.rs
+++ b/src/core/src/selection.rs
@@ -3,7 +3,7 @@ use typed_builder::TypedBuilder;
 
 use crate::encodings::HashFunctions;
 use crate::manifest::Record;
-use crate::Result;
+use crate::{Result, ScaledType};
 
 #[derive(Default, Debug, TypedBuilder, Clone)]
 pub struct Selection {
@@ -17,7 +17,7 @@ pub struct Selection {
     num: Option<u32>,
 
     #[builder(default, setter(strip_option))]
-    scaled: Option<u32>,
+    scaled: Option<ScaledType>,
 
     #[builder(default, setter(strip_option))]
     containment: Option<bool>,
@@ -87,11 +87,11 @@ impl Selection {
         self.num = Some(num);
     }
 
-    pub fn scaled(&self) -> Option<u32> {
+    pub fn scaled(&self) -> Option<ScaledType> {
         self.scaled
     }
 
-    pub fn set_scaled(&mut self, scaled: u32) {
+    pub fn set_scaled(&mut self, scaled: ScaledType) {
         self.scaled = Some(scaled);
     }
 
diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs
index 4544897054..31fc86f4af 100644
--- a/src/core/src/signature.rs
+++ b/src/core/src/signature.rs
@@ -812,7 +812,7 @@ impl Select for Signature {
             // keep compatible scaled if applicable
             valid = if let Some(sel_scaled) = selection.scaled() {
                 match s {
-                    Sketch::MinHash(mh) => valid && mh.scaled() <= sel_scaled as u64,
+                    Sketch::MinHash(mh) => valid && mh.scaled() <= sel_scaled,
                     // TODO: test LargeMinHash
                     // Sketch::LargeMinHash(lmh) => valid && lmh.scaled() <= sel_scaled as u64,
                     _ => valid, // other sketch types or invalid cases
@@ -841,8 +841,8 @@ impl Select for Signature {
             for sketch in self.signatures.iter_mut() {
                 // TODO: also account for LargeMinHash
                 if let Sketch::MinHash(mh) = sketch {
-                    if (mh.scaled() as u32) < sel_scaled {
-                        *sketch = Sketch::MinHash(mh.clone().downsample_scaled(sel_scaled as u64)?);
+                    if mh.scaled() < sel_scaled {
+                        *sketch = Sketch::MinHash(mh.clone().downsample_scaled(sel_scaled)?);
                     }
                 }
             }
diff --git a/src/core/src/sketch/minhash.rs b/src/core/src/sketch/minhash.rs
index 7341031673..60b1c97c73 100644
--- a/src/core/src/sketch/minhash.rs
+++ b/src/core/src/sketch/minhash.rs
@@ -12,13 +12,13 @@ use serde::ser::{SerializeStruct, Serializer};
 use serde::{Deserialize, Serialize};
 use typed_builder::TypedBuilder;
 
-use crate::_hash_murmur;
 use crate::encodings::HashFunctions;
 use crate::signature::SigsTrait;
 use crate::sketch::hyperloglog::HyperLogLog;
 use crate::Error;
+use crate::{ScaledType, _hash_murmur};
 
-pub fn max_hash_for_scaled(scaled: u64) -> u64 {
+pub fn max_hash_for_scaled(scaled: ScaledType) -> u64 {
     match scaled {
         0 => 0, // scaled == 0 indicates this is a num minhash
         1 => u64::MAX,
@@ -26,10 +26,10 @@ pub fn max_hash_for_scaled(scaled: u64) -> u64 {
     }
 }
 
-pub fn scaled_for_max_hash(max_hash: u64) -> u64 {
+pub fn scaled_for_max_hash(max_hash: u64) -> ScaledType {
     match max_hash {
         0 => 0, // scaled == 0 indicates this is a num minhash
-        _ => (u64::MAX as f64 / max_hash as f64) as u64,
+        _ => (u64::MAX as f64 / max_hash as f64) as ScaledType,
     }
 }
 
@@ -185,7 +185,7 @@ impl<'de> Deserialize<'de> for KmerMinHash {
 
 impl KmerMinHash {
     pub fn new(
-        scaled: u64,
+        scaled: ScaledType,
         ksize: u32,
         hash_function: HashFunctions,
         seed: u64,
@@ -230,7 +230,7 @@ impl KmerMinHash {
         self.max_hash
     }
 
-    pub fn scaled(&self) -> u64 {
+    pub fn scaled(&self) -> ScaledType {
         scaled_for_max_hash(self.max_hash)
     }
 
@@ -770,11 +770,11 @@ impl KmerMinHash {
     // this could be improved by generating an HLL estimate while sketching instead
     // (for scaled minhashes)
     pub fn n_unique_kmers(&self) -> u64 {
-        self.size() as u64 * self.scaled() // + (self.ksize - 1) for bp estimation
+        self.size() as u64 * self.scaled() as u64 // + (self.ksize - 1) for bp estimation
     }
 
     // create a downsampled copy of self
-    pub fn downsample_scaled(self, scaled: u64) -> Result<KmerMinHash, Error> {
+    pub fn downsample_scaled(self, scaled: ScaledType) -> Result<KmerMinHash, Error> {
         if self.scaled() == scaled || self.scaled() == 0 {
             Ok(self)
         } else if self.scaled() > scaled {
@@ -1115,7 +1115,7 @@ impl<'de> Deserialize<'de> for KmerMinHashBTree {
 
 impl KmerMinHashBTree {
     pub fn new(
-        scaled: u64,
+        scaled: ScaledType,
         ksize: u32,
         hash_function: HashFunctions,
         seed: u64,
@@ -1157,7 +1157,7 @@ impl KmerMinHashBTree {
         self.max_hash
     }
 
-    pub fn scaled(&self) -> u64 {
+    pub fn scaled(&self) -> ScaledType {
         scaled_for_max_hash(self.max_hash)
     }
 
@@ -1552,7 +1552,7 @@ impl KmerMinHashBTree {
     }
 
     // create a downsampled copy of self
-    pub fn downsample_scaled(self, scaled: u64) -> Result<KmerMinHashBTree, Error> {
+    pub fn downsample_scaled(self, scaled: ScaledType) -> Result<KmerMinHashBTree, Error> {
         if self.scaled() == scaled || self.scaled() == 0 {
             Ok(self)
         } else if self.scaled() > scaled {
diff --git a/src/core/src/wasm.rs b/src/core/src/wasm.rs
index cd9efec091..c10eda4e6e 100644
--- a/src/core/src/wasm.rs
+++ b/src/core/src/wasm.rs
@@ -12,6 +12,7 @@ use crate::encodings::HashFunctions;
 use crate::signature::Signature as _Signature;
 use crate::signature::SigsTrait;
 use crate::sketch::minhash::KmerMinHash as _KmerMinHash;
+use crate::ScaledType;
 
 #[wasm_bindgen]
 pub struct KmerMinHash(_KmerMinHash);
@@ -32,7 +33,7 @@ impl KmerMinHash {
         dayhoff: bool,
         hp: bool,
         seed: u32,
-        scaled: u32,
+        scaled: ScaledType,
         track_abundance: bool,
     ) -> KmerMinHash {
         // TODO: at most one of (prot, dayhoff, hp) should be true
@@ -48,7 +49,7 @@ impl KmerMinHash {
         };
 
         KmerMinHash(_KmerMinHash::new(
-            scaled as u64,
+            scaled,
             ksize,
             hash_function,
             seed as u64,
@@ -84,8 +85,8 @@ impl ComputeParameters {
     }
 
     #[wasm_bindgen]
-    pub fn set_scaled(&mut self, scaled: u32) {
-        self.0.set_scaled(scaled as u64);
+    pub fn set_scaled(&mut self, scaled: ScaledType) {
+        self.0.set_scaled(scaled);
     }
 
     #[wasm_bindgen]
diff --git a/src/core/tests/minhash.rs b/src/core/tests/minhash.rs
index 040f3506b6..bdbba0cc20 100644
--- a/src/core/tests/minhash.rs
+++ b/src/core/tests/minhash.rs
@@ -12,6 +12,7 @@ use sourmash::sketch::minhash::{
     max_hash_for_scaled, scaled_for_max_hash, KmerMinHash, KmerMinHashBTree,
 };
 use sourmash::sketch::Sketch;
+use sourmash::ScaledType;
 
 // TODO: use f64::EPSILON when we bump MSRV
 const EPSILON: f64 = 0.01;
@@ -328,7 +329,7 @@ fn oracle_mins_scaled(hashes in vec(u64::ANY, 1..10000)) {
 proptest! {
 #[test]
 fn prop_merge(seq1 in "[ACGT]{6,100}", seq2 in "[ACGT]{6,200}") {
-    let scaled: u64 = 10;
+    let scaled: ScaledType = 10;
     let mut a = KmerMinHash::new(scaled, 6, HashFunctions::Murmur64Dna, 42, true, 0);
     let mut b = KmerMinHashBTree::new(scaled, 6, HashFunctions::Murmur64Dna, 42, true, 0);
 
diff --git a/tests/test_minhash.py b/tests/test_minhash.py
index f181381485..654c4ea828 100644
--- a/tests/test_minhash.py
+++ b/tests/test_minhash.py
@@ -63,9 +63,12 @@
 # * nan on empty minhash
 # * define equals
 
-scaled50 = _get_scaled_for_max_hash(50)
-scaled100 = _get_scaled_for_max_hash(100)
-scaled5000 = _get_scaled_for_max_hash(5000)
+scaled2 = _get_scaled_for_max_hash(2**63)
+assert scaled2 == 2
+scaled4 = _get_scaled_for_max_hash(2**62)
+assert scaled4 == 4
+scaled8 = _get_scaled_for_max_hash(2**61)
+assert scaled8 == 8
 
 
 def _kmers_from_all_coding_frames(sequence, ksize):
@@ -473,20 +476,20 @@ def test_size_limit(track_abundance):
 
 
 def test_scaled(track_abundance):
-    # test behavior with scaled
-    scaled = _get_scaled_for_max_hash(35)
+    # test behavior with scaled: cannot add hashes above max hash.
+    scaled = 2**3
     print("XX", scaled, _get_max_hash_for_scaled(scaled))
     mh = MinHash(0, 4, track_abundance=track_abundance, scaled=scaled)
-    assert mh._max_hash == 35
+    assert mh._max_hash == 2**61  # 2**64 / 2**3
 
     mh.add_hash(10)
     mh.add_hash(20)
     mh.add_hash(30)
 
     assert list(sorted(mh.hashes)) == [10, 20, 30]
-    mh.add_hash(40)
+    mh.add_hash(2**62)
     assert list(sorted(mh.hashes)) == [10, 20, 30]
-    mh.add_hash(36)
+    mh.add_hash(2**63)
     assert list(sorted(mh.hashes)) == [10, 20, 30]
 
 
@@ -541,8 +544,8 @@ def test_scaled_num_both(track_abundance):
 
 def test_mh_jaccard_similarity():
     # check actual Jaccard value for a non-trivial case
-    a = MinHash(0, 20, scaled=scaled50, track_abundance=False)
-    b = MinHash(0, 20, scaled=scaled50, track_abundance=False)
+    a = MinHash(0, 20, scaled=scaled2, track_abundance=False)
+    b = MinHash(0, 20, scaled=scaled2, track_abundance=False)
     a.add_many([1, 3, 5, 8])
     b.add_many([1, 3, 5, 6, 8, 10])
 
@@ -553,12 +556,12 @@ def test_mh_similarity_downsample_jaccard_value():
     # check jaccard value after downsampling
 
     # max_hash = 50
-    a = MinHash(0, 20, scaled=scaled50, track_abundance=False)
+    a = MinHash(0, 20, scaled=scaled2, track_abundance=False)
     # max_hash = 100
-    b = MinHash(0, 20, scaled=scaled100, track_abundance=False)
+    b = MinHash(0, 20, scaled=scaled8, track_abundance=False)
 
-    a.add_many([1, 3, 5, 8, 70])
-    b.add_many([1, 3, 5, 6, 8, 10, 70])
+    a.add_many([1, 3, 5, 8, 2**62])
+    b.add_many([1, 3, 5, 6, 8, 10, 2**62])
 
     # the hash=70 will be truncated by downsampling
     assert a.similarity(b, downsample=True) == 4.0 / 6.0
@@ -569,8 +572,8 @@ def test_mh_angular_similarity():
     # https://www.sciencedirect.com/topics/computer-science/cosine-similarity
     # note: angular similarity is 1 - 2*(acos(sim) / pi), when elements
     # are always positive (https://en.wikipedia.org/wiki/Cosine_similarity)
-    a = MinHash(0, 20, scaled=scaled50, track_abundance=True)
-    b = MinHash(0, 20, scaled=scaled50, track_abundance=True)
+    a = MinHash(0, 20, scaled=scaled2, track_abundance=True)
+    b = MinHash(0, 20, scaled=scaled2, track_abundance=True)
     a.set_abundances({1: 5, 3: 3, 5: 2, 8: 2})
     b.set_abundances({1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1})
 
@@ -583,8 +586,8 @@ def test_mh_angular_similarity():
 
 def test_mh_angular_similarity_2():
     # check actual angular similarity for a second non-trivial case
-    a = MinHash(0, 20, scaled=scaled100, track_abundance=True)
-    b = MinHash(0, 20, scaled=scaled100, track_abundance=True)
+    a = MinHash(0, 20, scaled=scaled2, track_abundance=True)
+    b = MinHash(0, 20, scaled=scaled2, track_abundance=True)
     a.set_abundances({1: 5, 3: 3, 5: 2, 8: 2, 70: 70})
     b.set_abundances({1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1, 70: 70})
 
@@ -598,12 +601,12 @@ def test_mh_similarity_downsample_angular_value():
     # test downsample=True argument to MinHash.similarity
 
     # max_hash = 50
-    a = MinHash(0, 20, scaled=scaled50, track_abundance=True)
+    a = MinHash(0, 20, scaled=scaled2, track_abundance=True)
     # max_hash = 100
-    b = MinHash(0, 20, scaled=scaled100, track_abundance=True)
+    b = MinHash(0, 20, scaled=scaled8, track_abundance=True)
 
-    a.set_abundances({1: 5, 3: 3, 5: 2, 8: 2, 70: 70})
-    b.set_abundances({1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1, 70: 70})
+    a.set_abundances({1: 5, 3: 3, 5: 2, 8: 2, 2**62: 70})
+    b.set_abundances({1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1, 2**62: 70})
 
     # the hash=70 will be truncated by downsampling
     sim = a.similarity(b, downsample=True)
@@ -617,8 +620,8 @@ def test_mh_similarity_downsample_angular_value():
 def test_mh_angular_similarity_fail():
     # raise TypeError if calling angular_similarity directly and
     # one or both sketches do not have abundance info
-    a = MinHash(0, 20, scaled=scaled50, track_abundance=True)
-    b = MinHash(0, 20, scaled=scaled50, track_abundance=False)
+    a = MinHash(0, 20, scaled=scaled2, track_abundance=True)
+    b = MinHash(0, 20, scaled=scaled2, track_abundance=False)
     a_values = {1: 5, 3: 3, 5: 2, 8: 2}
     b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1}
 
@@ -634,7 +637,7 @@ def test_mh_angular_similarity_fail():
         in str(exc)
     )
     # both sketches lack track abundance
-    a = MinHash(0, 20, scaled=scaled50, track_abundance=False)
+    a = MinHash(0, 20, scaled=scaled2, track_abundance=False)
     a.add_many(a_values.keys())
     with pytest.raises(TypeError) as exc:
         a.angular_similarity(b)
@@ -649,9 +652,9 @@ def test_mh_similarity_downsample_true(track_abundance):
     # verify sim(a, b) == sim(b, a), with and without ignore_abundance
 
     # max_hash = 50
-    a = MinHash(0, 20, scaled=scaled50, track_abundance=track_abundance)
+    a = MinHash(0, 20, scaled=scaled2, track_abundance=track_abundance)
     # max_hash = 100
-    b = MinHash(0, 20, scaled=scaled100, track_abundance=track_abundance)
+    b = MinHash(0, 20, scaled=scaled8, track_abundance=track_abundance)
 
     a_values = {1: 5, 3: 3, 5: 2, 8: 2}
     b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1}
@@ -677,9 +680,9 @@ def test_mh_similarity_downsample_errors(track_abundance):
     # test downsample=False (default) argument to MinHash.similarity
 
     # max_hash = 50
-    a = MinHash(0, 20, scaled=scaled50, track_abundance=track_abundance)
+    a = MinHash(0, 20, scaled=scaled2, track_abundance=track_abundance)
     # max_hash = 100
-    b = MinHash(0, 20, scaled=scaled100, track_abundance=track_abundance)
+    b = MinHash(0, 20, scaled=scaled8, track_abundance=track_abundance)
 
     a_values = {1: 5, 3: 3, 5: 2, 8: 2}
     b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1}
@@ -869,14 +872,14 @@ def test_mh_count_common_diff_maxhash(track_abundance):
         5,
         is_protein=False,
         track_abundance=track_abundance,
-        scaled=_get_scaled_for_max_hash(1),
+        scaled=scaled8,
     )
     b = MinHash(
         0,
         5,
         is_protein=True,
         track_abundance=track_abundance,
-        scaled=_get_scaled_for_max_hash(2),
+        scaled=scaled4,
     )
 
     with pytest.raises(ValueError):
@@ -1181,13 +1184,9 @@ def test_mh_similarity_diff_seed(track_abundance):
 
 
 def test_mh_compare_diff_max_hash(track_abundance):
-    a = MinHash(
-        0, 5, track_abundance=track_abundance, scaled=_get_max_hash_for_scaled(5)
-    )
+    a = MinHash(0, 5, track_abundance=track_abundance, scaled=scaled2)
 
-    b = MinHash(
-        0, 5, track_abundance=track_abundance, scaled=_get_max_hash_for_scaled(10)
-    )
+    b = MinHash(0, 5, track_abundance=track_abundance, scaled=scaled4)
 
     with pytest.raises(ValueError):
         a.similarity(b)
@@ -1210,12 +1209,8 @@ def test_mh_concat_diff_ksize(track_abundance):
 
 
 def test_mh_concat_diff_max_hash(track_abundance):
-    a = MinHash(
-        0, 5, track_abundance=track_abundance, scaled=_get_max_hash_for_scaled(5)
-    )
-    b = MinHash(
-        0, 5, track_abundance=track_abundance, scaled=_get_max_hash_for_scaled(10)
-    )
+    a = MinHash(0, 5, track_abundance=track_abundance, scaled=scaled8)
+    b = MinHash(0, 5, track_abundance=track_abundance, scaled=scaled4)
 
     with pytest.raises(ValueError):
         a += b
@@ -1514,16 +1509,14 @@ def test_mh_copy_and_clear(track_abundance):
 
 def test_mh_copy_and_clear_with_max_hash(track_abundance):
     # test basic creation of new, empty MinHash w/max_hash param set
-    a = MinHash(
-        0, 10, track_abundance=track_abundance, scaled=_get_scaled_for_max_hash(20)
-    )
-    for i in range(0, 40, 2):
-        a.add_hash(i)
+    a = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled8)
+    for i in range(44, 64):
+        a.add_hash(2**i)
 
     b = a.copy_and_clear()
     assert a.ksize == b.ksize
     assert b.num == a.num
-    assert b._max_hash == 20
+    assert b._max_hash == 2**61
     assert not b.is_protein
     assert b.track_abundance == track_abundance
     assert b.seed == a.seed
@@ -1545,21 +1538,21 @@ def test_pickle_protein(track_abundance):
         10,
         track_abundance=track_abundance,
         is_protein=True,
-        scaled=_get_scaled_for_max_hash(20),
+        scaled=scaled8,
     )
-    for i in range(0, 40, 2):
-        a.add_hash(i)
+    for i in range(44, 64):
+        a.add_hash(2**i)
 
     b = pickle.loads(pickle.dumps(a))
     assert a.ksize == b.ksize
     assert b.num == a.num
     assert b._max_hash == a._max_hash
-    assert b._max_hash == 20
+    assert b._max_hash == 2**61
     assert b.is_protein
     assert b.track_abundance == track_abundance
     assert b.seed == a.seed
     assert len(b.hashes) == len(a.hashes)
-    assert len(b.hashes) == 11
+    assert len(b.hashes) == 18
     assert a.scaled == b.scaled
     assert b.scaled != 0
 
@@ -1571,21 +1564,21 @@ def test_pickle_dayhoff(track_abundance):
         10,
         track_abundance=track_abundance,
         dayhoff=True,
-        scaled=_get_scaled_for_max_hash(20),
+        scaled=scaled8,
     )
-    for i in range(0, 40, 2):
-        a.add_hash(i)
+    for i in range(44, 64):
+        a.add_hash(2**i)
 
     b = pickle.loads(pickle.dumps(a))
     assert a.ksize == b.ksize
     assert b.num == a.num
     assert b._max_hash == a._max_hash
-    assert b._max_hash == 20
+    assert b._max_hash == 2**61
     assert b.dayhoff
     assert b.track_abundance == track_abundance
     assert b.seed == a.seed
     assert len(b.hashes) == len(a.hashes)
-    assert len(b.hashes) == 11
+    assert len(b.hashes) == 18
     assert a.scaled == b.scaled
     assert b.scaled != 0
 
@@ -1597,61 +1590,59 @@ def test_pickle_hp(track_abundance):
         10,
         track_abundance=track_abundance,
         hp=True,
-        scaled=_get_scaled_for_max_hash(20),
+        scaled=scaled8,
     )
-    for i in range(0, 40, 2):
-        a.add_hash(i)
+    for i in range(44, 64):
+        a.add_hash(2**i)
 
     b = pickle.loads(pickle.dumps(a))
     assert a.ksize == b.ksize
     assert b.num == a.num
     assert b._max_hash == a._max_hash
-    assert b._max_hash == 20
+    assert b._max_hash == 2**61
     assert b.hp
     assert b.track_abundance == track_abundance
     assert b.seed == a.seed
     assert len(b.hashes) == len(a.hashes)
-    assert len(b.hashes) == 11
+    assert len(b.hashes) == 18
     assert a.scaled == b.scaled
     assert b.scaled != 0
 
 
 def test_pickle_max_hash(track_abundance):
-    a = MinHash(
-        0, 10, track_abundance=track_abundance, scaled=_get_scaled_for_max_hash(20)
-    )
-    for i in range(0, 40, 2):
-        a.add_hash(i)
+    a = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled8)
+    for i in range(44, 64):
+        a.add_hash(2**i)
 
     b = pickle.loads(pickle.dumps(a))
     assert a.ksize == b.ksize
     assert b.num == a.num
     assert b._max_hash == a._max_hash
-    assert b._max_hash == 20
+    assert b._max_hash == 2**61
     assert not b.is_protein
     assert b.track_abundance == track_abundance
     assert b.seed == a.seed
     assert len(b.hashes) == len(a.hashes)
-    assert len(b.hashes) == 11
+    assert len(b.hashes) == 18
     assert a.scaled == b.scaled
     assert b.scaled != 0
 
 
 def test_pickle_scaled(track_abundance):
-    a = MinHash(0, 10, track_abundance=track_abundance, scaled=922337203685477632)
-    for i in range(0, 40, 2):
-        a.add_hash(i)
+    a = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled8)
+    for i in range(44, 64):
+        a.add_hash(2**i)
 
     b = pickle.loads(pickle.dumps(a))
     assert a.ksize == b.ksize
     assert b.num == a.num
     assert b._max_hash == a._max_hash
-    assert b._max_hash == 20
+    assert b._max_hash == 2**61
     assert not b.is_protein
     assert b.track_abundance == track_abundance
     assert b.seed == a.seed
     assert len(b.hashes) == len(a.hashes)
-    assert len(b.hashes) == 11
+    assert len(b.hashes) == 18
     assert a.scaled == b.scaled
     assert b.scaled != 0
 
@@ -1661,7 +1652,7 @@ def test_minhash_abund_add():
     # std::vector iterators upon vector resizing - in this case, there
     # was also a bug in inserting into the middle of mins when scaled was set.
 
-    a = MinHash(0, 10, track_abundance=True, scaled=scaled5000)
+    a = MinHash(0, 10, track_abundance=True, scaled=scaled8)
 
     n = 0
     for i in range(10, 0, -1):
@@ -1677,7 +1668,7 @@ def test_minhash_abund_capacity_increase():
 
     # this should set capacity to 1000 - see KmerMinHash constructor call
     # to 'reserve' when n > 0 for specific parameter.
-    a = MinHash(0, 10, track_abundance=True, scaled=scaled5000)
+    a = MinHash(0, 10, track_abundance=True, scaled=scaled8)
 
     # 1001 is dependent on the value passed to reserve (currently 1000).
     for i in range(1001, 0, -1):
@@ -1689,8 +1680,8 @@ def test_minhash_abund_merge_flat():
     # of a signature with abundance and a signature without abundance.
     # the correct behavior for now is to calculate simple Jaccard,
     # i.e. 'flatten' both of them.
-    a = MinHash(0, 10, track_abundance=True, scaled=scaled5000)
-    b = MinHash(0, 10, scaled=scaled5000)
+    a = MinHash(0, 10, track_abundance=True, scaled=scaled8)
+    b = MinHash(0, 10, scaled=scaled8)
 
     for i in range(0, 10, 2):
         a.add_hash(i)
@@ -1707,8 +1698,8 @@ def test_minhash_abund_merge_flat_2():
     # this targets a segfault caused by trying to merge
     # a signature with abundance and a signature without abundance.
 
-    a = MinHash(0, 10, track_abundance=True, scaled=scaled5000)
-    b = MinHash(0, 10, scaled=scaled5000)
+    a = MinHash(0, 10, track_abundance=True, scaled=scaled2)
+    b = MinHash(0, 10, scaled=scaled2)
 
     for i in range(0, 10, 2):
         a.add_hash(i)
@@ -1746,7 +1737,7 @@ def test_distance_matrix(track_abundance):
 
 
 def test_remove_many(track_abundance):
-    a = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000)
+    a = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled2)
 
     a.add_many(list(range(0, 100, 2)))
 
@@ -1766,13 +1757,13 @@ def test_remove_many(track_abundance):
 
 
 def test_remove_minhash(track_abundance):
-    original_mh = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000)
-    added_mh = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000)
-    tested_mh = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000)
+    original_mh = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled8)
+    added_mh = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled8)
+    tested_mh = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled8)
 
-    original_mh.add_many(list(range(101)))
-    added_mh.add_many(list(range(101, 201)))  # contains original in it
-    tested_mh.add_many(list(range(201)))  # original + added
+    original_mh.add_many([2**i for i in range(62)])
+    added_mh.add_many([2**63])  # contains original in it
+    tested_mh.add_many([2**i for i in range(63)])  # original + added
 
     # Now we should expect tested_minhash == original_minhash
     # Note we are passing a MinHash object instead of an iterable object
@@ -1789,8 +1780,8 @@ def test_remove_minhash(track_abundance):
 
 
 def test_add_many(track_abundance):
-    a = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000)
-    b = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000)
+    a = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled8)
+    b = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled8)
 
     a.add_many(list(range(0, 100, 2)))
     a.add_many(list(range(0, 100, 2)))  # => abundance = 2
@@ -1807,18 +1798,18 @@ def test_add_many(track_abundance):
 
 
 def test_set_abundances_huge():
-    max_hash = 4000000
+    max_hash = _get_max_hash_for_scaled(2**31)
     a = MinHash(0, 10, track_abundance=True, scaled=_get_scaled_for_max_hash(max_hash))
 
-    hashes = list(range(max_hash))
+    hashes = [2**i for i in range(64)]
     abundances = itertools.repeat(2)
 
     a.set_abundances(dict(zip(hashes, abundances)))
 
 
 def test_try_change_hashes(track_abundance):
-    a = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000)
-    MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000)
+    a = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled8)
+    MinHash(0, 10, track_abundance=track_abundance, scaled=scaled8)
 
     a.add_many(list(range(0, 100, 2)))
 
@@ -1829,9 +1820,9 @@ def test_try_change_hashes(track_abundance):
 
 def test_flatten():
     # test behavior with scaled
-    scaled = _get_scaled_for_max_hash(35)
+    scaled = scaled8
     mh = MinHash(0, 4, track_abundance=True, scaled=scaled)
-    assert mh._max_hash == 35
+    assert mh._max_hash == 2**61
 
     mh.add_hash(10)
     mh.add_hash(10)
@@ -1856,10 +1847,10 @@ def test_flatten():
 
 def test_inflate():
     # test behavior of inflate function
-    scaled = _get_scaled_for_max_hash(35)
+    scaled = scaled2
     mh = MinHash(0, 4, track_abundance=False, scaled=scaled)
     mh2 = MinHash(0, 4, track_abundance=True, scaled=scaled)
-    assert mh._max_hash == 35
+    assert mh._max_hash == 2**63
 
     mh.add_hash(10)
     mh.add_hash(20)
@@ -1891,10 +1882,10 @@ def test_inflate():
 
 def test_inflate_error():
     # test behavior of inflate function with 'self' as an abund sketch
-    scaled = _get_scaled_for_max_hash(35)
+    scaled = scaled2
     mh = MinHash(0, 4, track_abundance=True, scaled=scaled)
     mh2 = MinHash(0, 4, track_abundance=True, scaled=scaled)
-    assert mh._max_hash == 35
+    assert mh._max_hash == 2**63
 
     mh.add_hash(10)
     mh.add_hash(20)
@@ -1928,10 +1919,10 @@ def test_inflate_error():
 
 def test_inflate_not_a_subset():
     # test behavior of inflate function when 'from_mh' is not a subset.
-    scaled = _get_scaled_for_max_hash(35)
+    scaled = scaled2
     mh = MinHash(0, 4, track_abundance=False, scaled=scaled)
     mh2 = MinHash(0, 4, track_abundance=True, scaled=scaled)
-    assert mh._max_hash == 35
+    assert mh._max_hash == 2**63
 
     mh.add_hash(10)
     mh.add_hash(20)
diff --git a/tests/test_signature.py b/tests/test_signature.py
index 6365b58856..0dd412c73e 100644
--- a/tests/test_signature.py
+++ b/tests/test_signature.py
@@ -191,7 +191,7 @@ def test_roundtrip_empty(track_abundance):
 
 
 def test_roundtrip_scaled(track_abundance):
-    e = MinHash(n=0, ksize=20, track_abundance=track_abundance, max_hash=10)
+    e = MinHash(n=0, ksize=20, track_abundance=track_abundance, max_hash=2**61)
     e.add_hash(5)
     sig = SourmashSignature(e)
     s = save_signatures_to_json([sig])
@@ -222,14 +222,14 @@ def test_roundtrip_seed(track_abundance):
 
 def test_similarity_downsample(track_abundance):
     e = MinHash(n=0, ksize=20, track_abundance=track_abundance, max_hash=2**63)
-    f = MinHash(n=0, ksize=20, track_abundance=track_abundance, max_hash=2**2)
+    f = MinHash(n=0, ksize=20, track_abundance=track_abundance, max_hash=2**61)
 
     e.add_hash(1)
-    e.add_hash(5)
+    e.add_hash(2**62)
     assert len(e.hashes) == 2
 
     f.add_hash(1)
-    f.add_hash(5)  # should be discarded due to max_hash
+    f.add_hash(2**62)  # should be discarded due to max_hash
     assert len(f.hashes) == 1
 
     ee = SourmashSignature(e)