From 9e3204f1ad1f8cf05e3e8878c3b8b26cad6d4975 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Mon, 11 Nov 2024 10:05:22 -0800 Subject: [PATCH] cleanup and refactor --- src/fastmultigather.rs | 22 ++++----- src/fastmultigather_rocksdb.rs | 60 ++++++++++++------------ src/manysearch.rs | 18 +++---- src/manysearch_rocksdb.rs | 1 - src/multisearch.rs | 11 ++--- src/python/tests/test_fastmultigather.py | 30 ++++++------ 6 files changed, 69 insertions(+), 73 deletions(-) diff --git a/src/fastmultigather.rs b/src/fastmultigather.rs index b910794b..001c5299 100644 --- a/src/fastmultigather.rs +++ b/src/fastmultigather.rs @@ -30,7 +30,7 @@ pub fn fastmultigather( query_filepath: String, against_filepath: String, threshold_bp: u32, - scaled: Option, + fooscaled: Option, selection: Selection, allow_failed_sigpaths: bool, save_matches: bool, @@ -46,25 +46,23 @@ pub fn fastmultigather( allow_failed_sigpaths, )?; - let scaled = match scaled { + let common_scaled = match fooscaled { Some(s) => s, None => { - let scaled = *query_collection.max_scaled().expect("no records!?"); + let s = *query_collection.max_scaled().expect("no records!?"); eprintln!( "Setting scaled={} based on max scaled in query collection", - scaled + s ); - scaled + s } }; - let common_scaled = scaled; - let mut against_selection = selection; - against_selection.set_scaled(scaled); + against_selection.set_scaled(common_scaled); let threshold_hashes: u64 = { - let x = threshold_bp as u64 / scaled as u64; + let x = threshold_bp as u64 / common_scaled as u64; if x > 0 { x as u64 } else { @@ -111,7 +109,9 @@ pub fn fastmultigather( let query_mh: KmerMinHash = query_sig.try_into().expect("cannot get sketch"); - let query_mh = query_mh.downsample_scaled(common_scaled).expect("fiz"); + let query_mh = query_mh + .downsample_scaled(common_scaled) + .expect("cannot downsample query"); // CTB refactor let query_scaled = query_mh.scaled(); @@ -167,7 +167,7 @@ pub fn fastmultigather( query_name, query_filename, query_mh, - scaled, + common_scaled, matchlist, threshold_hashes, Some(gather_output), diff --git a/src/fastmultigather_rocksdb.rs b/src/fastmultigather_rocksdb.rs index 2473ab4a..ddedf27d 100644 --- a/src/fastmultigather_rocksdb.rs +++ b/src/fastmultigather_rocksdb.rs @@ -30,37 +30,37 @@ pub fn fastmultigather_rocksdb( let db = RevIndex::open(index, true, None)?; println!("Loaded DB"); - let query_collection = load_collection( - &queries_file, - &selection, - ReportType::Query, - allow_failed_sigpaths, - )?; - - // set scaled from query collection. - let scaled = match selection.scaled() { - Some(s) => s, - None => { - let scaled = *query_collection.max_scaled().expect("no records!?"); - eprintln!( - "Setting scaled={} based on max scaled in query collection", - scaled - ); + // grab scaled from the database. + let max_db_scaled = db + .collection() + .manifest() + .iter() + .map(|r| r.scaled()) + .max() + .expect("no records in db?!"); + + let selection_scaled: u32 = match selection.scaled() { + Some(scaled) => { + if *max_db_scaled > scaled { + return Err("Error: database scaled is higher than requested scaled".into()); + } scaled } + None => { + eprintln!("Setting scaled={} from the database", *max_db_scaled); + *max_db_scaled + } }; - let query_set_scaled = scaled; // redundant, but is nicer name - let mut against_selection = selection; - against_selection.set_scaled(scaled); + let mut set_selection = selection; + set_selection.set_scaled(selection_scaled); - // update query collection scaled value, too? - let mut query_collection = query_collection; - if scaled != query_set_scaled { - query_collection = query_collection - .select(&against_selection) - .expect("cannot properly downsample query collection"); - } + let query_collection = load_collection( + &queries_file, + &set_selection, + ReportType::Query, + allow_failed_sigpaths, + )?; // set up a multi-producer, single-consumer channel. let (send, recv) = @@ -83,8 +83,8 @@ pub fn fastmultigather_rocksdb( let send = query_collection .par_iter() .filter_map(|(coll, _idx, record)| { - let threshold = threshold_bp / against_selection.scaled().expect("scaled is not set!?"); - let ksize = against_selection.ksize().expect("ksize not set!?"); + let threshold = threshold_bp / set_selection.scaled().expect("scaled is not set!?"); + let ksize = set_selection.ksize().expect("ksize not set!?"); // query downsampling happens here match coll.sig_from_record(record) { @@ -96,7 +96,7 @@ pub fn fastmultigather_rocksdb( let mut results = vec![]; if let Ok(query_mh) = >::try_into(query_sig) { let query_mh = query_mh - .downsample_scaled(query_set_scaled) + .downsample_scaled(selection_scaled) .expect("cannot downsample!?"); let _ = processed_sigs.fetch_add(1, atomic::Ordering::SeqCst); // Gather! @@ -109,7 +109,7 @@ pub fn fastmultigather_rocksdb( hash_to_color, threshold as usize, &query_mh, - Some(against_selection.clone()), + Some(set_selection.clone()), ); if let Ok(matches) = matches { for match_ in &matches { diff --git a/src/manysearch.rs b/src/manysearch.rs index 861598d8..b18b29ae 100644 --- a/src/manysearch.rs +++ b/src/manysearch.rs @@ -35,19 +35,19 @@ pub fn manysearch( )?; // Figure out what scaled to use - either from selection, or from query. - let scaled: u32; - if let Some(set_scaled) = selection.scaled() { - scaled = set_scaled; + let common_scaled: u32 = if let Some(set_scaled) = selection.scaled() { + set_scaled } else { - scaled = *query_collection.max_scaled().expect("no records!?"); + let s = *query_collection.max_scaled().expect("no records!?"); eprintln!( "Setting scaled={} based on max scaled in query collection", - scaled + s ); + s }; let mut selection = selection; - selection.set_scaled(scaled); + selection.set_scaled(common_scaled); // load all query sketches into memory, downsampling on the way let query_sketchlist = query_collection.load_sketches(&selection)?; @@ -95,14 +95,14 @@ pub fn manysearch( >::try_into(against_sig) { let against_mh = against_mh - .downsample_scaled(scaled) + .downsample_scaled(common_scaled) .expect("cannot downsample search minhash to requested scaled"); for query in query_sketchlist.iter() { // be paranoid and confirm scaled match. - if query.minhash.scaled() != scaled { + if query.minhash.scaled() != common_scaled { panic!("different query scaled"); } - if against_mh.scaled() != scaled { + if against_mh.scaled() != common_scaled { panic!("different against scaled"); } diff --git a/src/manysearch_rocksdb.rs b/src/manysearch_rocksdb.rs index 64d9a66e..d47248b2 100644 --- a/src/manysearch_rocksdb.rs +++ b/src/manysearch_rocksdb.rs @@ -111,7 +111,6 @@ pub fn manysearch_rocksdb( db.matches_from_counter(counter, minimum_containment as usize); // filter the matches for containment - // debug!("FOUND: {} matches for {:?}", matches.len(), query_sig); for (path, overlap) in matches { let containment = overlap as f64 / query_size as f64; if containment >= minimum_containment { diff --git a/src/multisearch.rs b/src/multisearch.rs index 1b5e07b2..27eeab35 100644 --- a/src/multisearch.rs +++ b/src/multisearch.rs @@ -31,23 +31,22 @@ pub fn multisearch( allow_failed_sigpaths, )?; - let scaled = match selection.scaled() { + let expected_scaled = match selection.scaled() { Some(s) => s, None => { - let scaled = *query_collection.max_scaled().expect("no records!?") as u32; + let s = *query_collection.max_scaled().expect("no records!?") as u32; eprintln!( "Setting scaled={} based on max scaled in query collection", - scaled + s ); - scaled + s } }; let ksize = selection.ksize().unwrap() as f64; - let expected_scaled = scaled; // nicer name. let mut new_selection = selection; - new_selection.set_scaled(scaled as u32); + new_selection.set_scaled(expected_scaled); let queries = query_collection.load_sketches(&new_selection)?; diff --git a/src/python/tests/test_fastmultigather.py b/src/python/tests/test_fastmultigather.py index 9a17dd0a..d08873b5 100644 --- a/src/python/tests/test_fastmultigather.py +++ b/src/python/tests/test_fastmultigather.py @@ -2094,24 +2094,22 @@ def test_simple_query_scaled_indexed(runtmp): make_file_list(query_list, [query]) make_file_list(against_list, [sig2, sig47, sig63]) - against_list = index_siglist(runtmp, against_list, runtmp.output("against.rocksdb")) - - runtmp.sourmash( - "scripts", - "fastmultigather", - query_list, - against_list, - "-o", - "foo.csv", - "-t", - "0", - in_directory=runtmp.output(""), + against_list = index_siglist( + runtmp, against_list, runtmp.output("against.rocksdb"), scaled=1000 ) - print(os.listdir(runtmp.output(""))) - - g_output = runtmp.output("foo.csv") - assert os.path.exists(g_output) + with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash( + "scripts", + "fastmultigather", + query_list, + against_list, + "-o", + "foo.csv", + "-t", + "0", + in_directory=runtmp.output(""), + ) def test_equal_matches(runtmp, indexed):