Skip to content

Commit

Permalink
cleanup and refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
ctb committed Nov 11, 2024
1 parent f614e27 commit 9e3204f
Show file tree
Hide file tree
Showing 6 changed files with 69 additions and 73 deletions.
22 changes: 11 additions & 11 deletions src/fastmultigather.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ pub fn fastmultigather(
query_filepath: String,
against_filepath: String,
threshold_bp: u32,
scaled: Option<u32>,
fooscaled: Option<u32>,
selection: Selection,
allow_failed_sigpaths: bool,
save_matches: bool,
Expand All @@ -46,25 +46,23 @@ pub fn fastmultigather(
allow_failed_sigpaths,
)?;

let scaled = match scaled {
let common_scaled = match fooscaled {
Some(s) => s,
None => {
let scaled = *query_collection.max_scaled().expect("no records!?");
let s = *query_collection.max_scaled().expect("no records!?");
eprintln!(
"Setting scaled={} based on max scaled in query collection",
scaled
s
);
scaled
s
}
};

let common_scaled = scaled;

let mut against_selection = selection;
against_selection.set_scaled(scaled);
against_selection.set_scaled(common_scaled);

let threshold_hashes: u64 = {
let x = threshold_bp as u64 / scaled as u64;
let x = threshold_bp as u64 / common_scaled as u64;
if x > 0 {
x as u64
} else {
Expand Down Expand Up @@ -111,7 +109,9 @@ pub fn fastmultigather(

let query_mh: KmerMinHash = query_sig.try_into().expect("cannot get sketch");

let query_mh = query_mh.downsample_scaled(common_scaled).expect("fiz");
let query_mh = query_mh
.downsample_scaled(common_scaled)
.expect("cannot downsample query");

// CTB refactor
let query_scaled = query_mh.scaled();
Expand Down Expand Up @@ -167,7 +167,7 @@ pub fn fastmultigather(
query_name,
query_filename,
query_mh,
scaled,
common_scaled,
matchlist,
threshold_hashes,
Some(gather_output),
Expand Down
60 changes: 30 additions & 30 deletions src/fastmultigather_rocksdb.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,37 +30,37 @@ pub fn fastmultigather_rocksdb(
let db = RevIndex::open(index, true, None)?;
println!("Loaded DB");

let query_collection = load_collection(
&queries_file,
&selection,
ReportType::Query,
allow_failed_sigpaths,
)?;

// set scaled from query collection.
let scaled = match selection.scaled() {
Some(s) => s,
None => {
let scaled = *query_collection.max_scaled().expect("no records!?");
eprintln!(
"Setting scaled={} based on max scaled in query collection",
scaled
);
// grab scaled from the database.
let max_db_scaled = db
.collection()
.manifest()
.iter()
.map(|r| r.scaled())
.max()
.expect("no records in db?!");

let selection_scaled: u32 = match selection.scaled() {
Some(scaled) => {
if *max_db_scaled > scaled {
return Err("Error: database scaled is higher than requested scaled".into());
}
scaled
}
None => {
eprintln!("Setting scaled={} from the database", *max_db_scaled);
*max_db_scaled
}
};
let query_set_scaled = scaled; // redundant, but is nicer name

let mut against_selection = selection;
against_selection.set_scaled(scaled);
let mut set_selection = selection;
set_selection.set_scaled(selection_scaled);

// update query collection scaled value, too?
let mut query_collection = query_collection;
if scaled != query_set_scaled {
query_collection = query_collection
.select(&against_selection)
.expect("cannot properly downsample query collection");
}
let query_collection = load_collection(
&queries_file,
&set_selection,
ReportType::Query,
allow_failed_sigpaths,
)?;

// set up a multi-producer, single-consumer channel.
let (send, recv) =
Expand All @@ -83,8 +83,8 @@ pub fn fastmultigather_rocksdb(
let send = query_collection
.par_iter()
.filter_map(|(coll, _idx, record)| {
let threshold = threshold_bp / against_selection.scaled().expect("scaled is not set!?");
let ksize = against_selection.ksize().expect("ksize not set!?");
let threshold = threshold_bp / set_selection.scaled().expect("scaled is not set!?");
let ksize = set_selection.ksize().expect("ksize not set!?");

// query downsampling happens here
match coll.sig_from_record(record) {
Expand All @@ -96,7 +96,7 @@ pub fn fastmultigather_rocksdb(
let mut results = vec![];
if let Ok(query_mh) = <SigStore as TryInto<KmerMinHash>>::try_into(query_sig) {
let query_mh = query_mh
.downsample_scaled(query_set_scaled)
.downsample_scaled(selection_scaled)
.expect("cannot downsample!?");
let _ = processed_sigs.fetch_add(1, atomic::Ordering::SeqCst);
// Gather!
Expand All @@ -109,7 +109,7 @@ pub fn fastmultigather_rocksdb(
hash_to_color,
threshold as usize,
&query_mh,
Some(against_selection.clone()),
Some(set_selection.clone()),
);
if let Ok(matches) = matches {
for match_ in &matches {
Expand Down
18 changes: 9 additions & 9 deletions src/manysearch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,19 +35,19 @@ pub fn manysearch(
)?;

// Figure out what scaled to use - either from selection, or from query.
let scaled: u32;
if let Some(set_scaled) = selection.scaled() {
scaled = set_scaled;
let common_scaled: u32 = if let Some(set_scaled) = selection.scaled() {
set_scaled
} else {
scaled = *query_collection.max_scaled().expect("no records!?");
let s = *query_collection.max_scaled().expect("no records!?");
eprintln!(
"Setting scaled={} based on max scaled in query collection",
scaled
s
);
s
};

let mut selection = selection;
selection.set_scaled(scaled);
selection.set_scaled(common_scaled);

// load all query sketches into memory, downsampling on the way
let query_sketchlist = query_collection.load_sketches(&selection)?;
Expand Down Expand Up @@ -95,14 +95,14 @@ pub fn manysearch(
<SigStore as TryInto<KmerMinHash>>::try_into(against_sig)
{
let against_mh = against_mh
.downsample_scaled(scaled)
.downsample_scaled(common_scaled)
.expect("cannot downsample search minhash to requested scaled");
for query in query_sketchlist.iter() {
// be paranoid and confirm scaled match.
if query.minhash.scaled() != scaled {
if query.minhash.scaled() != common_scaled {
panic!("different query scaled");
}
if against_mh.scaled() != scaled {
if against_mh.scaled() != common_scaled {
panic!("different against scaled");
}

Expand Down
1 change: 0 additions & 1 deletion src/manysearch_rocksdb.rs
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,6 @@ pub fn manysearch_rocksdb(
db.matches_from_counter(counter, minimum_containment as usize);

// filter the matches for containment
// debug!("FOUND: {} matches for {:?}", matches.len(), query_sig);
for (path, overlap) in matches {
let containment = overlap as f64 / query_size as f64;
if containment >= minimum_containment {
Expand Down
11 changes: 5 additions & 6 deletions src/multisearch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,23 +31,22 @@ pub fn multisearch(
allow_failed_sigpaths,
)?;

let scaled = match selection.scaled() {
let expected_scaled = match selection.scaled() {
Some(s) => s,
None => {
let scaled = *query_collection.max_scaled().expect("no records!?") as u32;
let s = *query_collection.max_scaled().expect("no records!?") as u32;
eprintln!(
"Setting scaled={} based on max scaled in query collection",
scaled
s
);
scaled
s
}
};

let ksize = selection.ksize().unwrap() as f64;
let expected_scaled = scaled; // nicer name.

let mut new_selection = selection;
new_selection.set_scaled(scaled as u32);
new_selection.set_scaled(expected_scaled);

let queries = query_collection.load_sketches(&new_selection)?;

Expand Down
30 changes: 14 additions & 16 deletions src/python/tests/test_fastmultigather.py
Original file line number Diff line number Diff line change
Expand Up @@ -2094,24 +2094,22 @@ def test_simple_query_scaled_indexed(runtmp):

make_file_list(query_list, [query])
make_file_list(against_list, [sig2, sig47, sig63])
against_list = index_siglist(runtmp, against_list, runtmp.output("against.rocksdb"))

runtmp.sourmash(
"scripts",
"fastmultigather",
query_list,
against_list,
"-o",
"foo.csv",
"-t",
"0",
in_directory=runtmp.output(""),
against_list = index_siglist(
runtmp, against_list, runtmp.output("against.rocksdb"), scaled=1000
)

print(os.listdir(runtmp.output("")))

g_output = runtmp.output("foo.csv")
assert os.path.exists(g_output)
with pytest.raises(utils.SourmashCommandFailed):
runtmp.sourmash(
"scripts",
"fastmultigather",
query_list,
against_list,
"-o",
"foo.csv",
"-t",
"0",
in_directory=runtmp.output(""),
)


def test_equal_matches(runtmp, indexed):
Expand Down

0 comments on commit 9e3204f

Please sign in to comment.