From 38d99b6952bf024a60664553d6e5ea18f7891dc1 Mon Sep 17 00:00:00 2001
From: Tessa Pierce Ward <bluegenes@users.noreply.github.com>
Date: Thu, 26 Dec 2024 17:57:43 -0800
Subject: [PATCH 1/2] MRG: modify n simultaneous downloads; update buildutils
 (#154)

This PR integrates the changes to `BuildUtils` (`MultiSelection`
details, minor changes to `BuildCollection` filtering + writing) that
arose from integration into branchwater. It also makes the number of
simultaneous downloads tunable, since I was having trouble when using
the 3 default permits with large eukaryotic genomes.

It also handles changes associated with zipfile handling from
https://github.com/sourmash-bio/sourmash/pull/3431 arising from updating
sourmash core to 0.18.0

ref #134
---
 Cargo.lock                                    |  43 +-
 Cargo.toml                                    |   4 +-
 README.md                                     |  19 +-
 src/directsketch.rs                           | 290 ++++++-------
 src/lib.rs                                    |   8 +-
 .../sourmash_plugin_directsketch/__init__.py  |   8 +-
 src/utils/buildutils.rs                       | 386 +++++++-----------
 src/utils/mod.rs                              |   3 +-
 tests/test_gbsketch.py                        |  15 +-
 tests/test_urlsketch.py                       |   2 +-
 10 files changed, 330 insertions(+), 448 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 9f24d17..58aef4a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -992,13 +992,12 @@ checksum = "a2983372caf4480544083767bf2d27defafe32af49ab4df3a0b7fc90793a3664"
 
 [[package]]
 name = "nalgebra"
-version = "0.32.6"
+version = "0.33.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7b5c17de023a86f59ed79891b2e5d5a94c705dbe904a5b5c9c952ea6221b03e4"
+checksum = "26aecdf64b707efd1310e3544d709c5c0ac61c13756046aaaba41be5c4f66a3b"
 dependencies = [
  "approx",
  "matrixmultiply",
- "nalgebra-macros",
  "num-complex",
  "num-rational",
  "num-traits",
@@ -1008,17 +1007,6 @@ dependencies = [
  "typenum",
 ]
 
-[[package]]
-name = "nalgebra-macros"
-version = "0.2.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "254a5372af8fc138e36684761d3c0cdb758a4410e938babcff1c860ce14ddbfc"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn",
-]
-
 [[package]]
 name = "native-tls"
 version = "0.2.12"
@@ -1082,6 +1070,16 @@ version = "0.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2bf50223579dc7cdcfb3bfcacf7069ff68243f8c363f62ffa99cf000a6b9c451"
 
+[[package]]
+name = "num-bigint"
+version = "0.4.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9"
+dependencies = [
+ "num-integer",
+ "num-traits",
+]
+
 [[package]]
 name = "num-complex"
 version = "0.4.6"
@@ -1117,6 +1115,7 @@ version = "0.4.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824"
 dependencies = [
+ "num-bigint",
  "num-integer",
  "num-traits",
 ]
@@ -1627,9 +1626,9 @@ dependencies = [
 
 [[package]]
 name = "roaring"
-version = "0.10.6"
+version = "0.10.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8f4b84ba6e838ceb47b41de5194a60244fac43d9fe03b71dbe8c5a201081d6d1"
+checksum = "41589aba99537475bf697f2118357cad1c31590c5a1b9f6d9fc4ad6d07503661"
 dependencies = [
  "bytemuck",
  "byteorder",
@@ -1814,9 +1813,9 @@ dependencies = [
 
 [[package]]
 name = "simba"
-version = "0.8.1"
+version = "0.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "061507c94fc6ab4ba1c9a0305018408e312e17c041eb63bef8aa726fa33aceae"
+checksum = "b3a386a501cd104797982c15ae17aafe8b9261315b5d07e3ec803f2ea26be0fa"
 dependencies = [
  "approx",
  "num-complex",
@@ -1864,9 +1863,9 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7"
 
 [[package]]
 name = "sourmash"
-version = "0.17.2"
+version = "0.18.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "54e30f752d984b1d8456024973f8d89772b4ba248f592b77b57d59ad27a232a0"
+checksum = "fec589a91cf0d7d8cde46a51ccf165f32b9b4d709688f69b3fcea14c6f12e6e6"
 dependencies = [
  "az",
  "byteorder",
@@ -1950,9 +1949,9 @@ checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
 
 [[package]]
 name = "statrs"
-version = "0.17.1"
+version = "0.18.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f697a07e4606a0a25c044de247e583a330dbb1731d11bc7350b81f48ad567255"
+checksum = "2a3fe7c28c6512e766b0874335db33c94ad7b8f9054228ae1c2abd47ce7d335e"
 dependencies = [
  "approx",
  "nalgebra",
diff --git a/Cargo.toml b/Cargo.toml
index 05ad101..c3c9c92 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -9,10 +9,10 @@ name = "sourmash_plugin_directsketch"
 crate-type = ["cdylib"]
 
 [dependencies]
-pyo3 = { version = "0.23.3", features = ["extension-module", "anyhow"] }
+pyo3 = { version = "0.23.3", features = ["extension-module","anyhow"]}
 rayon = "1.10.0"
 serde = { version = "1.0.204", features = ["derive"] }
-sourmash = { version = "0.17.2"}
+sourmash = { version = "0.18.0"}
 serde_json = "1.0.134"
 niffler = "2.4.0"
 needletail = "0.5.1"
diff --git a/README.md b/README.md
index 7b24325..566c532 100644
--- a/README.md
+++ b/README.md
@@ -47,8 +47,15 @@ pip install sourmash_plugin_directsketch
 
 ## Usage Considerations
 
-If you're building large databases (over 20k files), we highly recommend you use batched zipfiles (v0.4+) to facilitate restart. If you encounter unexpected failures and are using a single zipfile output (default), `gbsketch`/`urlsketch` will have to re-download and re-sketch all files. If you instead set a batch size using `--batch-size`, e.g. 10000, then `gbsketch`/`urlsketch` can load any batched zips that finished writing, and avoid re-generating those signatures. For `gbsketch`, the batch size represents the number of accessions included in each zip, with all signatures associated with an accession grouped within a single `zip`. For `urlsketch`, the batch size represents the number of total signatures included in each zip. Note that batches will use the `--output` file to build batched filenames, so if you provided `output.zip`, your batches will be `output.1.zip`, `output.2.zip`, etc.
+### Allowing restart with batching
 
+If you're building large databases, we highly recommend you use batched zipfiles (v0.4+) to facilitate restart. If you encounter unexpected failures and are using a single zipfile output (default), `gbsketch`/`urlsketch` will have to re-download and re-sketch all files. If you instead set a batch size using `--batch-size`, then `gbsketch`/`urlsketch` can load any batched zips that finished writing, and avoid re-generating those signatures. For `gbsketch`, the batch size represents the number of accessions included in each zip, with all signatures associated with an accession grouped within a single `zip`. For `urlsketch`, the batch size represents the number of sigs associated with each url provided. Note that batches will use the `--output` file to build batched filenames, so if you provided `output.zip`, your batches will be `output.1.zip`, `output.2.zip`, etc. For small genomes (e.g. microbes), you can keep batch sizes quite large, e.g. 1000s-10000s. For large eukaryotic genomes where download takes much longer, you may want to use smaller batch sizes.
+
+To build a single database after batched sketching, you can use `sig cat` to build a single zipfile (`sourmash sig cat *.zip -o OUTPUT.zip`) or `sig collect` to collect all the zips into a standalone manifest that can be used with sourmash and branchwater commands.
+
+### Memory Requirements
+
+Directsketch downloads the full file, optionally checking the `md5sum`, then performs the sketch. As a result, you will need enough memory to hold up to 3 genomes in memory at once. For microbial genomes, this is trivial. For large eukaryotic genomes (e.g. plants!), be sure to provide sufficient memory. You can tune the number of simultaneous downloads (and thus, the number of genomes that will be in memory simultaneously) with `--n-simultaneous-downloads`.
 
 ## Running the commands
 
@@ -125,7 +132,8 @@ options:
   -f FASTAS, --fastas FASTAS
                         Write fastas here
   --batch-size BATCH_SIZE
-                        Write smaller zipfiles, each containing sigs associated with this number of accessions. This allows gbsketch to recover after unexpected failures, rather than needing to
+                        Write smaller zipfiles, each containing sigs associated with this number of accessions. 
+                        This allows gbsketch to recover after unexpected failures, rather than needing to
                         restart sketching from scratch. Default: write all sigs to single zipfile.
   -k, --keep-fasta      write FASTA files in addition to sketching. Default: do not write FASTA files
   --download-only       just download genomes; do not sketch
@@ -138,6 +146,8 @@ options:
                         number of cores to use (default is all available)
   -r RETRY_TIMES, --retry-times RETRY_TIMES
                         number of times to retry failed downloads
+  -n {1,2,3}, --n-simultaneous-downloads {1,2,3}
+                        number of accessions to download simultaneously (default=1)
   -g, --genomes-only    just download and sketch genome (DNA) files
   -m, --proteomes-only  just download and sketch proteome (protein) files
 ```
@@ -186,7 +196,8 @@ options:
   -o OUTPUT, --output OUTPUT
                         output zip file for the signatures
   --batch-size BATCH_SIZE
-                        Write smaller zipfiles, each containing sigs associated with this number of accessions. This allows urlsketch to recover after unexpected failures, rather than needing to
+                        Write smaller zipfiles, each containing sigs associated with this number of urls.
+                        This allows urlsketch to recover after unexpected failures, rather than needing to
                         restart sketching from scratch. Default: write all sigs to single zipfile.
   -f FASTAS, --fastas FASTAS
                         Write fastas here
@@ -202,6 +213,8 @@ options:
                         number of cores to use (default is all available)
   -r RETRY_TIMES, --retry-times RETRY_TIMES
                         number of times to retry failed downloads
+  -n {1,2,3}, --n-simultaneous-downloads {1,2,3}
+                        number of simultaneous downloads (default=3)
 ```
 
 ## Code of Conduct
diff --git a/src/directsketch.rs b/src/directsketch.rs
index cbac395..0c89e01 100644
--- a/src/directsketch.rs
+++ b/src/directsketch.rs
@@ -22,9 +22,7 @@ use crate::utils::{
     InputMolType, MultiCollection,
 };
 
-use crate::utils::buildutils::{
-    BuildCollection, BuildManifest, MultiBuildCollection, MultiSelect, MultiSelection,
-};
+use crate::utils::buildutils::{BuildCollection, BuildManifest, MultiSelect, MultiSelection};
 use reqwest::Url;
 
 async fn find_genome_directory(
@@ -237,17 +235,14 @@ async fn dl_sketch_assembly_accession(
     location: &PathBuf,
     retry: Option<u32>,
     keep_fastas: bool,
-    sigs: &mut BuildCollection,
+    mut sigs: BuildCollection,
     genomes_only: bool,
     proteomes_only: bool,
     download_only: bool,
-) -> Result<(
-    MultiBuildCollection,
-    Vec<FailedDownload>,
-    Vec<FailedChecksum>,
-)> {
+) -> Result<(BuildCollection, Vec<FailedDownload>, Vec<FailedChecksum>)> {
+    // todo -- move default retry to main function
     let retry_count = retry.unwrap_or(3); // Default retry count
-    let mut built_sigs = MultiBuildCollection::new();
+    let empty_coll = BuildCollection::new();
     let mut download_failures = Vec::<FailedDownload>::new();
     let mut checksum_failures = Vec::<FailedChecksum>::new();
 
@@ -283,7 +278,7 @@ async fn dl_sketch_assembly_accession(
                     download_failures.push(failed_download_protein);
                 }
 
-                return Ok((built_sigs, download_failures, checksum_failures));
+                return Ok((empty_coll, download_failures, checksum_failures));
             }
         };
     let md5sum_url = GenBankFileType::Checksum.url(&base_url, &full_name);
@@ -322,7 +317,7 @@ async fn dl_sketch_assembly_accession(
                 checksum_failures.push(failed_checksum_download);
             }
             // return early from function b/c we can't check any checksums
-            return Ok((built_sigs, download_failures, checksum_failures));
+            return Ok((empty_coll, download_failures, checksum_failures));
         }
     };
 
@@ -383,16 +378,8 @@ async fn dl_sketch_assembly_accession(
             };
         }
     }
-    if !download_only {
-        // remove any template sigs that were not populated
-        sigs.filter_empty();
-        // to do: can we use sigs directly rather than adding to a multibuildcollection, now?
-        if !sigs.is_empty() {
-            built_sigs.add_collection(sigs);
-        }
-    }
 
-    Ok((built_sigs, download_failures, checksum_failures))
+    Ok((sigs, download_failures, checksum_failures))
 }
 
 #[allow(clippy::too_many_arguments)]
@@ -402,17 +389,13 @@ async fn dl_sketch_url(
     location: &PathBuf,
     retry: Option<u32>,
     _keep_fastas: bool,
-    sigs: &mut BuildCollection,
+    mut sigs: BuildCollection,
     _genomes_only: bool,
     _proteomes_only: bool,
     download_only: bool,
-) -> Result<(
-    MultiBuildCollection,
-    Vec<FailedDownload>,
-    Vec<FailedChecksum>,
-)> {
+) -> Result<(BuildCollection, Vec<FailedDownload>, Vec<FailedChecksum>)> {
     let retry_count = retry.unwrap_or(3); // Default retry count
-    let mut built_sigs = MultiBuildCollection::new();
+    let empty_coll = BuildCollection::new();
     let mut download_failures = Vec::<FailedDownload>::new();
     let mut checksum_failures = Vec::<FailedChecksum>::new();
 
@@ -442,12 +425,6 @@ async fn dl_sketch_url(
                         sigs.build_sigs_from_data(data, "protein", name.clone(), filename.clone())?;
                     }
                 };
-                // remove any template sigs that were not populated
-                sigs.filter_empty();
-                // to do: can we use sigs directly rather than adding to a collection, now?
-                if !sigs.is_empty() {
-                    built_sigs.add_collection(sigs);
-                }
             }
         }
         Err(err) => {
@@ -466,6 +443,7 @@ async fn dl_sketch_url(
                     reason: error_message.clone(),
                 };
                 checksum_failures.push(checksum_mismatch);
+                sigs = empty_coll;
             } else {
                 let failed_download = FailedDownload {
                     accession: accession.clone(),
@@ -476,11 +454,12 @@ async fn dl_sketch_url(
                     url: Some(url),
                 };
                 download_failures.push(failed_download);
+                sigs = empty_coll;
             }
         }
     }
 
-    Ok((built_sigs, download_failures, checksum_failures))
+    Ok((sigs, download_failures, checksum_failures))
 }
 
 fn get_current_directory() -> Result<PathBuf> {
@@ -561,34 +540,24 @@ async fn load_existing_zip_batches(outpath: &PathBuf) -> Result<(MultiCollection
         if let Some(file_name) = entry_path.file_name() {
             // Check if the file matches the base zip file or any batched zip file (outpath.zip, outpath.1.zip, etc.)
             if let Some(captures) = zip_file_pattern.captures(file_name) {
-                // Wrap the `from_zipfile` call in `catch_unwind` to prevent panic propagation
-                let result = panic::catch_unwind(|| Collection::from_zipfile(&entry_path));
-                match result {
-                    Ok(Ok(collection)) => {
-                        // Successfully loaded the collection, push to `collections`
+                Collection::from_zipfile(&entry_path)
+                    .map(|collection| {
                         collections.push(collection);
+                        eprintln!("loaded existing collection from {}", &entry_path);
 
-                        // Extract the batch number (if it exists) and update the highest_batch
+                        // Extract batch number if it exists
                         if let Some(batch_str) = captures.get(1) {
                             if let Ok(batch_num) = batch_str.as_str().parse::<usize>() {
                                 highest_batch = max(highest_batch, batch_num);
                             }
                         }
-                    }
-                    Ok(Err(e)) => {
-                        // Handle the case where `from_zipfile` returned an error
+                    })
+                    .unwrap_or_else(|e| {
                         eprintln!(
-                            "Warning: Failed to load zip file '{}'. Error: {:?}",
+                            "Warning: Failed to load zip file '{}'; skipping. Zipfile Error: {:?}",
                             entry_path, e
                         );
-                        continue; // Skip the file and continue
-                    }
-                    Err(_) => {
-                        // The code inside `from_zipfile` panicked
-                        eprintln!("Warning: Invalid zip file '{}'; skipping.", entry_path);
-                        continue; // Skip the file and continue
-                    }
-                }
+                    });
             }
         }
     }
@@ -622,7 +591,7 @@ async fn create_or_get_zip_file(
 }
 
 pub fn zipwriter_handle(
-    mut recv_sigs: tokio::sync::mpsc::Receiver<MultiBuildCollection>,
+    mut recv_sigs: tokio::sync::mpsc::Receiver<BuildCollection>,
     output_sigs: Option<String>,
     batch_size: usize,      // Tunable batch size
     mut batch_index: usize, // starting batch index
@@ -632,13 +601,13 @@ pub fn zipwriter_handle(
         let mut md5sum_occurrences = HashMap::new();
         let mut zip_manifest = BuildManifest::new();
         let mut wrote_sigs = false;
-        let mut file_count = 0; // Count of files in the current batch
+        let mut acc_count = 0; // count the number of accessions (or urls, in urlsketch)
         let mut zip_writer = None;
 
         if let Some(outpath) = output_sigs {
             let outpath: PathBuf = outpath.into();
 
-            while let Some(mut multibuildcoll) = recv_sigs.recv().await {
+            while let Some(mut buildcoll) = recv_sigs.recv().await {
                 if zip_writer.is_none() {
                     // create zip file if needed
                     zip_writer =
@@ -653,29 +622,28 @@ pub fn zipwriter_handle(
 
                 if let Some(zip_writer) = zip_writer.as_mut() {
                     // write all sigs from sigcoll. Note that this method updates each record's internal location
-                    for sigcoll in &mut multibuildcoll.collections {
-                        match sigcoll
-                            .async_write_sigs_to_zip(zip_writer, &mut md5sum_occurrences)
-                            .await
-                        {
-                            Ok(_) => {
-                                file_count += sigcoll.size();
-                                wrote_sigs = true;
-                            }
-                            Err(e) => {
-                                let error = e.context("Error processing signature");
-                                if error_sender.send(error).await.is_err() {
-                                    return;
-                                }
+                    match buildcoll
+                        .async_write_sigs_to_zip(zip_writer, &mut md5sum_occurrences)
+                        .await
+                    {
+                        Ok(_) => {
+                            wrote_sigs = true;
+                        }
+                        Err(e) => {
+                            let error = e.context("Error processing signature");
+                            if error_sender.send(error).await.is_err() {
+                                return;
                             }
                         }
-                        // Add all records from sigcoll manifest
-                        zip_manifest.extend_from_manifest(&sigcoll.manifest);
                     }
+                    // Add all records from buildcoll manifest
+                    zip_manifest.extend_from_manifest(&buildcoll.manifest);
+                    // each buildcoll has accession
+                    acc_count += 1;
                 }
 
                 // if batch size is non-zero and is reached, close the current zip
-                if batch_size > 0 && file_count >= batch_size {
+                if batch_size > 0 && acc_count >= batch_size {
                     eprintln!("writing batch {}", batch_index);
                     if let Some(mut zip_writer) = zip_writer.take() {
                         if let Err(e) = zip_manifest
@@ -692,13 +660,13 @@ pub fn zipwriter_handle(
                     }
                     // Start a new batch
                     batch_index += 1;
-                    file_count = 0;
+                    acc_count = 0;
                     zip_manifest.clear();
                     zip_writer = None; // reset zip_writer so a new zip will be created when needed
                 }
             }
 
-            if file_count > 0 {
+            if acc_count > 0 {
                 // write the final manifest
                 if let Some(mut zip_writer) = zip_writer.take() {
                     if let Err(e) = zip_manifest
@@ -884,12 +852,14 @@ pub async fn gbsketch(
     proteomes_only: bool,
     download_only: bool,
     batch_size: u32,
+    n_permits: usize,
     output_sigs: Option<String>,
 ) -> Result<(), anyhow::Error> {
     let batch_size = batch_size as usize;
     let mut batch_index = 1;
     let mut existing_records_map: HashMap<String, BuildManifest> = HashMap::new();
     let mut filter = false;
+    // if writing sigs, prepare output and look for existing sig batches
     if let Some(ref output_sigs) = output_sigs {
         // Create outpath from output_sigs
         let outpath = PathBuf::from(output_sigs);
@@ -922,7 +892,7 @@ pub async fn gbsketch(
         create_dir_all(&download_path)?;
     }
     // create channels. buffer size here is 4 b/c we can do 3 downloads simultaneously
-    let (send_sigs, recv_sigs) = tokio::sync::mpsc::channel::<MultiBuildCollection>(4);
+    let (send_sigs, recv_sigs) = tokio::sync::mpsc::channel::<BuildCollection>(4);
     let (send_failed, recv_failed) = tokio::sync::mpsc::channel::<FailedDownload>(4);
     let (send_failed_checksums, recv_failed_checksum) =
         tokio::sync::mpsc::channel::<FailedChecksum>(4);
@@ -954,7 +924,7 @@ pub async fn gbsketch(
     handles.push(checksum_failures_handle);
 
     // Worker tasks
-    let semaphore = Arc::new(Semaphore::new(3)); // Limiting concurrent downloads
+    let semaphore = Arc::new(Semaphore::new(n_permits)); // Limiting concurrent downloads
     let client = Arc::new(Client::new());
 
     // Open the file containing the accessions synchronously
@@ -963,52 +933,49 @@ pub async fn gbsketch(
         bail!("No accessions to download and sketch.")
     }
 
-    let sig_template_result = BuildCollection::from_param_str(param_str.as_str());
-    let mut sig_templates = match sig_template_result {
-        Ok(sig_templates) => sig_templates,
-        Err(e) => {
-            bail!("Failed to parse params string: {}", e);
-        }
-    };
-
+    let mut sig_templates = BuildCollection::new();
     let mut genomes_only = genomes_only;
     let mut proteomes_only = proteomes_only;
 
-    // Check if we have dna signature templates and not keep_fastas
-    if sig_templates.dna_size()? == 0 && !keep_fastas {
-        eprintln!("No DNA signature templates provided, and --keep-fasta is not set.");
-        proteomes_only = true;
-    }
-    // Check if we have protein signature templates not keep_fastas
-    if sig_templates.anyprotein_size()? == 0 && !keep_fastas {
-        eprintln!("No protein signature templates provided, and --keep-fasta is not set.");
-        genomes_only = true;
-    }
-    if genomes_only {
-        // select only DNA templates
-        let multiselection = MultiSelection::from_moltypes(vec!["dna"])?;
-        sig_templates = sig_templates.select(&multiselection)?;
-
-        if !download_only {
-            eprintln!("Downloading and sketching genomes only.");
-        } else {
+    if download_only {
+        if genomes_only {
             eprintln!("Downloading genomes only.");
+        } else if proteomes_only {
+            eprintln!("Downloading proteomes only.");
         }
-    } else if proteomes_only {
-        // select only protein templates
-        let multiselection = MultiSelection::from_moltypes(vec!["protein", "dayhoff", "hp"])?;
-        sig_templates = sig_templates.select(&multiselection)?;
-        if !download_only {
+    } else {
+        let sig_template_result = BuildCollection::from_param_str(param_str.as_str());
+        sig_templates = match sig_template_result {
+            Ok(sig_templates) => sig_templates,
+            Err(e) => {
+                bail!("Failed to parse params string: {}", e);
+            }
+        };
+        // Check if we have dna signature templates and not keep_fastas
+        if sig_templates.dna_size()? == 0 && !keep_fastas {
+            eprintln!("No DNA signature templates provided, and --keep-fasta is not set.");
+            proteomes_only = true;
+        }
+        // Check if we have protein signature templates not keep_fastas
+        if sig_templates.anyprotein_size()? == 0 && !keep_fastas {
+            eprintln!("No protein signature templates provided, and --keep-fasta is not set.");
+            genomes_only = true;
+        }
+        if genomes_only {
+            // select only templates built from DNA input
+            let multiselection = MultiSelection::from_input_moltype("DNA")?;
+            sig_templates.select(&multiselection)?;
+            eprintln!("Downloading and sketching genomes only.");
+        } else if proteomes_only {
+            // select only templates built from protein input
+            let multiselection = MultiSelection::from_input_moltype("protein")?;
+            sig_templates.select(&multiselection)?;
             eprintln!("Downloading and sketching proteomes only.");
-        } else {
-            eprintln!("Downloading proteomes only.");
+        }
+        if sig_templates.is_empty() && !download_only {
+            bail!("No signatures to build.")
         }
     }
-
-    if sig_templates.is_empty() && !download_only {
-        bail!("No signatures to build.")
-    }
-
     // report every 1 percent (or every 1, whichever is larger)
     let reporting_threshold = std::cmp::max(n_accs / 100, 1);
 
@@ -1053,7 +1020,7 @@ pub async fn gbsketch(
                 &download_path_clone,
                 Some(retry_times),
                 keep_fastas,
-                &mut sigs,
+                sigs,
                 genomes_only,
                 proteomes_only,
                 download_only,
@@ -1119,6 +1086,7 @@ pub async fn urlsketch(
     keep_fastas: bool,
     download_only: bool,
     batch_size: u32,
+    n_permits: usize,
     output_sigs: Option<String>,
     failed_checksums_csv: Option<String>,
 ) -> Result<(), anyhow::Error> {
@@ -1159,7 +1127,7 @@ pub async fn urlsketch(
     }
 
     // create channels. buffer size here is 4 b/c we can do 3 downloads simultaneously
-    let (send_sigs, recv_sigs) = tokio::sync::mpsc::channel::<MultiBuildCollection>(4);
+    let (send_sigs, recv_sigs) = tokio::sync::mpsc::channel::<BuildCollection>(4);
     let (send_failed, recv_failed) = tokio::sync::mpsc::channel::<FailedDownload>(4);
     let (send_failed_checksums, recv_failed_checksum) =
         tokio::sync::mpsc::channel::<FailedChecksum>(4);
@@ -1197,7 +1165,7 @@ pub async fn urlsketch(
     handles.push(error_handle);
 
     // Worker tasks
-    let semaphore = Arc::new(Semaphore::new(3)); // Limiting concurrent downloads
+    let semaphore = Arc::new(Semaphore::new(n_permits)); // Limiting concurrent downloads
     let client = Arc::new(Client::new());
 
     // Open the file containing the accessions synchronously
@@ -1206,50 +1174,49 @@ pub async fn urlsketch(
         bail!("No accessions to download and sketch.")
     }
 
-    let sig_template_result = BuildCollection::from_param_str(param_str.as_str());
-    let mut sig_templates = match sig_template_result {
-        Ok(sig_templates) => sig_templates,
-        Err(e) => {
-            bail!("Failed to parse params string: {}", e);
-        }
-    };
-
+    // todo: add genomes_only / proteomes_only to the input options
+    let mut sig_templates = BuildCollection::new();
     let mut genomes_only = false;
     let mut proteomes_only = false;
+    let dna_multiselection = MultiSelection::from_input_moltype("dna")?;
+    let protein_multiselection = MultiSelection::from_input_moltype("protein")?;
 
-    // Check if we have dna signature templates and not keep_fastas
-    if sig_templates.dna_size()? == 0 && !keep_fastas {
-        eprintln!("No DNA signature templates provided, and --keep-fasta is not set.");
-        proteomes_only = true;
-    }
-    // Check if we have protein signature templates not keep_fastas
-    if sig_templates.anyprotein_size()? == 0 && !keep_fastas {
-        eprintln!("No protein signature templates provided, and --keep-fasta is not set.");
-        genomes_only = true;
-    }
-    if genomes_only {
-        // select only DNA templates
-        let multiselection = MultiSelection::from_moltypes(vec!["dna"])?;
-        sig_templates = sig_templates.select(&multiselection)?;
-
-        if !download_only {
-            eprintln!("Downloading and sketching genomes only.");
-        } else {
+    if download_only {
+        if genomes_only {
             eprintln!("Downloading genomes only.");
+        } else if proteomes_only {
+            eprintln!("Downloading proteomes only.");
         }
-    } else if proteomes_only {
-        // select only protein templates
-        let multiselection = MultiSelection::from_moltypes(vec!["protein", "dayhoff", "hp"])?;
-        sig_templates = sig_templates.select(&multiselection)?;
-        if !download_only {
+    } else {
+        let sig_template_result = BuildCollection::from_param_str(param_str.as_str());
+        sig_templates = match sig_template_result {
+            Ok(sig_templates) => sig_templates,
+            Err(e) => {
+                bail!("Failed to parse params string: {}", e);
+            }
+        };
+        // Check if we have dna signature templates and not keep_fastas
+        if sig_templates.dna_size()? == 0 && !keep_fastas {
+            eprintln!("No DNA signature templates provided, and --keep-fasta is not set.");
+            proteomes_only = true;
+        }
+        // Check if we have protein signature templates not keep_fastas
+        if sig_templates.anyprotein_size()? == 0 && !keep_fastas {
+            eprintln!("No protein signature templates provided, and --keep-fasta is not set.");
+            genomes_only = true;
+        }
+        if genomes_only {
+            // select only DNA templates
+            sig_templates.select(&dna_multiselection)?;
+            eprintln!("Downloading and sketching genomes only.");
+        } else if proteomes_only {
+            // select only protein templates
+            sig_templates.select(&protein_multiselection)?;
             eprintln!("Downloading and sketching proteomes only.");
-        } else {
-            eprintln!("Downloading proteomes only.");
         }
-    }
-
-    if sig_templates.is_empty() && !download_only {
-        bail!("No signatures to build.")
+        if sig_templates.is_empty() && !download_only {
+            bail!("No signatures to build.")
+        }
     }
 
     // report every 1 percent (or every 1, whichever is larger)
@@ -1267,6 +1234,17 @@ pub async fn urlsketch(
             }
         }
 
+        // eliminate sigs that won't be added to based on moltype
+        // this assumes no translation --> modify as needed if adding that.
+        if accinfo.moltype == InputMolType::Dna {
+            sigs.select(&dna_multiselection)?;
+        } else {
+            sigs.select(&protein_multiselection)?;
+        }
+        if sigs.is_empty() && !download_only {
+            continue;
+        }
+
         let semaphore_clone = Arc::clone(&semaphore);
         let client_clone = Arc::clone(&client);
         let send_sigs = send_sigs.clone();
@@ -1294,7 +1272,7 @@ pub async fn urlsketch(
                 &download_path_clone,
                 Some(retry_times),
                 keep_fastas,
-                &mut sigs,
+                sigs,
                 genomes_only,
                 proteomes_only,
                 download_only,
diff --git a/src/lib.rs b/src/lib.rs
index 648ae69..6c8d251 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -49,7 +49,7 @@ fn set_tokio_thread_pool(num_threads: usize) -> PyResult<usize> {
 
 #[pyfunction]
 #[allow(clippy::too_many_arguments)]
-#[pyo3(signature = (input_csv, param_str, failed_csv, failed_checksums, retry_times, fasta_location, keep_fastas, genomes_only, proteomes_only, download_only, batch_size, output_sigs=None))]
+#[pyo3(signature = (input_csv, param_str, failed_csv, failed_checksums, retry_times, fasta_location, keep_fastas, genomes_only, proteomes_only, download_only, batch_size, n_permits, output_sigs=None))]
 fn do_gbsketch(
     py: Python,
     input_csv: String,
@@ -63,6 +63,7 @@ fn do_gbsketch(
     proteomes_only: bool,
     download_only: bool,
     batch_size: u32,
+    n_permits: usize,
     output_sigs: Option<String>,
 ) -> anyhow::Result<u8> {
     match directsketch::gbsketch(
@@ -78,6 +79,7 @@ fn do_gbsketch(
         proteomes_only,
         download_only,
         batch_size,
+        n_permits,
         output_sigs,
     ) {
         Ok(_) => Ok(0),
@@ -90,7 +92,7 @@ fn do_gbsketch(
 
 #[pyfunction]
 #[allow(clippy::too_many_arguments)]
-#[pyo3(signature = (input_csv, param_str, failed_csv, retry_times, fasta_location, keep_fastas, download_only, batch_size, output_sigs=None, failed_checksums=None))]
+#[pyo3(signature = (input_csv, param_str, failed_csv, retry_times, fasta_location, keep_fastas, download_only, batch_size, n_permits, output_sigs=None, failed_checksums=None))]
 fn do_urlsketch(
     py: Python,
     input_csv: String,
@@ -101,6 +103,7 @@ fn do_urlsketch(
     keep_fastas: bool,
     download_only: bool,
     batch_size: u32,
+    n_permits: usize,
     output_sigs: Option<String>,
     failed_checksums: Option<String>,
 ) -> anyhow::Result<u8> {
@@ -114,6 +117,7 @@ fn do_urlsketch(
         keep_fastas,
         download_only,
         batch_size,
+        n_permits,
         output_sigs,
         failed_checksums,
     ) {
diff --git a/src/python/sourmash_plugin_directsketch/__init__.py b/src/python/sourmash_plugin_directsketch/__init__.py
index da88548..d47d7ac 100644
--- a/src/python/sourmash_plugin_directsketch/__init__.py
+++ b/src/python/sourmash_plugin_directsketch/__init__.py
@@ -65,6 +65,8 @@ def __init__(self, p):
                        help='number of cores to use (default is all available)')
         p.add_argument('-r', '--retry-times', default=1, type=int,
                        help='number of times to retry failed downloads')
+        p.add_argument('-n', '--n-simultaneous-downloads', default=1, type=int, choices = [1, 2, 3],
+                       help='number of accessions to download simultaneously (default=1)')
         group = p.add_mutually_exclusive_group()
         group.add_argument('-g', '--genomes-only', action='store_true', help='just download and sketch genome (DNA) files')
         group.add_argument('-m', '--proteomes-only', action='store_true', help='just download and sketch proteome (protein) files')
@@ -104,6 +106,7 @@ def main(self, args):
                                                            args.proteomes_only,
                                                            args.download_only,
                                                            args.batch_size,
+                                                           args.n_simultaneous_downloads,
                                                            args.output)
         
         if status == 0:
@@ -126,7 +129,7 @@ def __init__(self, p):
         p.add_argument('-o', '--output', default=None,
                        help='output zip file for the signatures')
         p.add_argument('--batch-size', type=non_negative_int, default = 0,
-                       help='Write smaller zipfiles, each containing sigs associated with this number of accessions. \
+                       help='Write smaller zipfiles, each containing sigs associated with this number of urls. \
                             This allows urlsketch to recover after unexpected failures, rather than needing to \
                             restart sketching from scratch. Default: write all sigs to single zipfile.')
         p.add_argument('-f', '--fastas',
@@ -143,6 +146,8 @@ def __init__(self, p):
                        help='number of cores to use (default is all available)')
         p.add_argument('-r', '--retry-times', default=1, type=int,
                        help='number of times to retry failed downloads')
+        p.add_argument('-n', '--n-simultaneous-downloads', default=3, type=int, choices = [1, 2, 3],
+                       help='number of simultaneous downloads (default=3)')
 
 
     def main(self, args):
@@ -176,6 +181,7 @@ def main(self, args):
                                                            args.keep_fasta,
                                                            args.download_only,
                                                            args.batch_size,
+                                                           args.n_simultaneous_downloads,
                                                            args.output,
                                                            args.checksum_fail)
         
diff --git a/src/utils/buildutils.rs b/src/utils/buildutils.rs
index 4120d3d..b4d1d2c 100644
--- a/src/utils/buildutils.rs
+++ b/src/utils/buildutils.rs
@@ -32,14 +32,29 @@ pub struct MultiSelection {
 }
 
 impl MultiSelection {
-    /// Create a `MultiSelection` from a single `Selection`
-    pub fn new(selection: Selection) -> Self {
-        MultiSelection {
-            selections: vec![selection],
-        }
+    pub fn from_moltypes(moltypes: Vec<&str>) -> Result<Self, SourmashError> {
+        let selections: Result<Vec<Selection>, SourmashError> = moltypes
+            .into_iter()
+            .map(|moltype_str| {
+                let moltype = HashFunctions::try_from(moltype_str)?;
+                let mut new_selection = Selection::default(); // Create a default Selection
+                new_selection.set_moltype(moltype); // Set the moltype
+                Ok(new_selection)
+            })
+            .collect();
+
+        Ok(MultiSelection {
+            selections: selections?,
+        })
     }
 
-    pub fn from_moltypes(moltypes: Vec<&str>) -> Result<Self, SourmashError> {
+    pub fn from_input_moltype(input_moltype: &str) -> Result<Self, SourmashError> {
+        // currently we don't allow translation. Will need to change this when we do.
+        // is there a better way to do this?
+        let mut moltypes = vec!["DNA"];
+        if input_moltype == "protein" {
+            moltypes = vec!["protein", "dayhoff", "hp"];
+        }
         let selections: Result<Vec<Selection>, SourmashError> = moltypes
             .into_iter()
             .map(|moltype_str| {
@@ -57,9 +72,7 @@ impl MultiSelection {
 }
 
 pub trait MultiSelect {
-    fn select(self, multi_selection: &MultiSelection) -> Result<Self, SourmashError>
-    where
-        Self: Sized;
+    fn select(&mut self, multi_selection: &MultiSelection) -> Result<(), SourmashError>;
 }
 
 #[derive(Debug, Clone, Getters, Setters, Serialize)]
@@ -122,10 +135,10 @@ where
     }
 }
 
-impl Default for BuildRecord {
-    fn default() -> Self {
-        // Default BuildRecord is DNA default
-        BuildRecord {
+impl BuildRecord {
+    // no general default, but we have defaults for each moltype
+    pub fn default_dna() -> Self {
+        Self {
             internal_location: None,
             md5: None,
             md5short: None,
@@ -142,21 +155,13 @@ impl Default for BuildRecord {
             sequence_added: false,
         }
     }
-}
-
-impl BuildRecord {
-    pub fn default_dna() -> Self {
-        Self {
-            ..Default::default()
-        }
-    }
 
     pub fn default_protein() -> Self {
         Self {
             moltype: "protein".to_string(),
             ksize: 10,
             scaled: 200,
-            ..Default::default()
+            ..Self::default_dna()
         }
     }
 
@@ -165,7 +170,7 @@ impl BuildRecord {
             moltype: "dayhoff".to_string(),
             ksize: 10,
             scaled: 200,
-            ..Default::default()
+            ..Self::default_dna()
         }
     }
 
@@ -174,7 +179,7 @@ impl BuildRecord {
             moltype: "hp".to_string(),
             ksize: 10,
             scaled: 200,
-            ..Default::default()
+            ..Self::default_dna()
         }
     }
 
@@ -189,7 +194,7 @@ impl BuildRecord {
             num: *record.num(),
             scaled: *record.scaled(),
             with_abundance: record.with_abundance(),
-            ..Default::default() // ignore remaining fields
+            ..Self::default_dna() // ignore remaining fields
         }
     }
 
@@ -219,6 +224,16 @@ impl BuildRecord {
 
         valid
     }
+
+    pub fn params(&self) -> (u32, String, bool, u32, u32) {
+        (
+            self.ksize,
+            self.moltype.clone(),
+            self.with_abundance,
+            self.num,
+            self.scaled,
+        )
+    }
 }
 
 impl PartialEq for BuildRecord {
@@ -272,6 +287,10 @@ impl BuildManifest {
         self.records.clear();
     }
 
+    pub fn summarize_params(&self) -> HashSet<(u32, String, bool, u32, u32)> {
+        self.iter().map(|record| record.params()).collect()
+    }
+
     pub fn filter_manifest(&self, other: &BuildManifest) -> Self {
         // Create a HashSet of references to the `BuildRecord`s in `other`
         let pairs: HashSet<_> = other.records.iter().collect();
@@ -307,7 +326,10 @@ impl BuildManifest {
         let mut csv_writer = csv::Writer::from_writer(wtr);
 
         for record in &self.records {
-            csv_writer.serialize(record)?; // Serialize each BuildRecord
+            // don't write empty records (empty template sigs aren't written from BuildCollection)
+            if record.sequence_added {
+                csv_writer.serialize(record)?; // Serialize each BuildRecord
+            }
         }
 
         csv_writer.flush()?; // Ensure all data is written
@@ -344,18 +366,15 @@ impl BuildManifest {
 }
 
 impl MultiSelect for BuildManifest {
-    fn select(self, multi_selection: &MultiSelection) -> Result<Self, SourmashError> {
-        let rows = self.records.iter().filter(|row| {
-            // for each row, check if it matches any of the Selection structs in MultiSelection
+    fn select(&mut self, multi_selection: &MultiSelection) -> Result<(), SourmashError> {
+        // Retain only the records that match any selection
+        self.records.retain(|record| {
             multi_selection
                 .selections
                 .iter()
-                .any(|selection| row.matches_selection(selection))
+                .any(|selection| record.matches_selection(selection))
         });
-
-        Ok(BuildManifest {
-            records: rows.cloned().collect(),
-        })
+        Ok(())
     }
 }
 
@@ -415,25 +434,24 @@ impl BuildCollection {
 
     pub fn dna_size(&self) -> Result<usize, SourmashError> {
         let multiselection = MultiSelection::from_moltypes(vec!["dna"])?;
-        let selected_manifest = self.manifest.clone().select(&multiselection)?;
-
-        Ok(selected_manifest.records.len())
+        let mut mf = self.manifest.clone(); // temporary mutable copy
+        mf.select(&multiselection)?;
+        Ok(mf.records.len())
     }
 
     pub fn protein_size(&self) -> Result<usize, SourmashError> {
         let multiselection = MultiSelection::from_moltypes(vec!["protein"])?;
-        let selected_manifest = self.manifest.clone().select(&multiselection)?;
-
-        Ok(selected_manifest.records.len())
+        let mut mf = self.manifest.clone(); // temporary mutable copy
+        mf.select(&multiselection)?;
+        Ok(mf.records.len())
     }
 
     pub fn anyprotein_size(&self) -> Result<usize, SourmashError> {
         let multiselection = MultiSelection::from_moltypes(vec!["protein", "dayhoff", "hp"])?;
-        let selected_manifest = self.manifest.clone().select(&multiselection)?;
-
-        Ok(selected_manifest.records.len())
+        let mut mf = self.manifest.clone(); // temporary mutable copy
+        mf.select(&multiselection)?;
+        Ok(mf.records.len())
     }
-
     pub fn parse_ksize(value: &str) -> Result<u32, String> {
         value
             .parse::<u32>()
@@ -663,26 +681,6 @@ impl BuildCollection {
         });
     }
 
-    // filter template signatures that had no sequence added
-    // suggested use right before writing signatures
-    pub fn filter_empty(&mut self) {
-        let mut sig_index = 0;
-
-        self.manifest.records.retain(|record| {
-            // Keep only records where `sequence_added` is `true`.
-            let keep = record.sequence_added;
-
-            if !keep {
-                // Remove the corresponding signature at the same index if the record is not kept.
-                self.sigs.remove(sig_index);
-            } else {
-                sig_index += 1; // Only increment if we keep the record and signature.
-            }
-
-            keep
-        });
-    }
-
     pub fn filter(&mut self, params_set: &HashSet<u64>) {
         let mut index = 0;
         while index < self.manifest.records.len() {
@@ -715,6 +713,36 @@ impl BuildCollection {
         self.manifest.records.iter_mut().zip(self.sigs.iter_mut())
     }
 
+    fn build_sigs_from_record(
+        &mut self,
+        input_moltype: &str,
+        record: &SequenceRecord,
+    ) -> Result<()> {
+        // Optionally use `par_iter_mut` for parallel execution
+        self.iter_mut().try_for_each(|(rec, sig)| {
+            if input_moltype == "protein"
+                && (rec.moltype() == HashFunctions::Murmur64Protein
+                    || rec.moltype() == HashFunctions::Murmur64Dayhoff
+                    || rec.moltype() == HashFunctions::Murmur64Hp)
+            {
+                sig.add_protein(&record.seq())
+                    .context("Failed to add protein")?;
+                if !rec.sequence_added {
+                    rec.sequence_added = true;
+                }
+            } else if (input_moltype == "DNA" || input_moltype == "dna")
+                && rec.moltype() == HashFunctions::Murmur64Dna
+            {
+                sig.add_sequence(&record.seq(), true)
+                    .context("Failed to add sequence")?;
+                if !rec.sequence_added {
+                    rec.sequence_added = true;
+                }
+            }
+            Ok(())
+        })
+    }
+
     pub fn build_sigs_from_data(
         &mut self,
         data: Vec<u8>,
@@ -729,24 +757,7 @@ impl BuildCollection {
         // Iterate over FASTA records and add sequences/proteins to sigs
         while let Some(record) = fastx_reader.next() {
             let record = record.context("Failed to read record")?;
-            self.iter_mut().for_each(|(rec, sig)| {
-                if input_moltype == "protein"
-                    && (rec.moltype == "protein" || rec.moltype == "dayhoff" || rec.moltype == "hp")
-                {
-                    sig.add_protein(&record.seq())
-                        .expect("Failed to add protein");
-                    if !rec.sequence_added {
-                        rec.sequence_added = true
-                    }
-                } else if input_moltype == "DNA" && rec.moltype == "DNA" {
-                    sig.add_sequence(&record.seq(), true)
-                        .expect("Failed to add sequence");
-                    // if not force, panics with 'N' in dna sequence
-                    if !rec.sequence_added {
-                        rec.sequence_added = true
-                    }
-                }
-            });
+            self.build_sigs_from_record(input_moltype, &record)?;
         }
 
         // After processing sequences, update sig, record information
@@ -755,42 +766,37 @@ impl BuildCollection {
         Ok(())
     }
 
-    pub fn build_sigs_from_file(
+    pub fn build_sigs_from_file_or_stdin(
         &mut self,
-        input_moltype: &str, // (protein/dna); todo - use hashfns?
+        input_moltype: &str, // "protein" or "DNA"
         name: String,
         filename: String,
-    ) -> Result<()> {
-        let mut fastx_reader = parse_fastx_file(&filename)?;
-        // Iterate over FASTA records and add sequences/proteins to sigs
-        while let Some(record) = fastx_reader.next() {
-            let record = record.context("Failed to read record")?;
-            self.iter_mut().for_each(|(rec, sig)| {
-                if input_moltype == "protein"
-                    && (rec.moltype() == HashFunctions::Murmur64Protein
-                        || rec.moltype() == HashFunctions::Murmur64Dayhoff
-                        || rec.moltype() == HashFunctions::Murmur64Hp)
-                {
-                    sig.add_protein(&record.seq())
-                        .expect("Failed to add protein");
-                    if !rec.sequence_added {
-                        rec.sequence_added = true
-                    }
-                } else {
-                    sig.add_sequence(&record.seq(), true)
-                        .expect("Failed to add sequence");
-                    // if not force, panics with 'N' in dna sequence
-                    if !rec.sequence_added {
-                        rec.sequence_added = true
-                    }
-                }
-            });
+    ) -> Result<u64> {
+        // Create a FASTX reader from the file or stdin
+        let mut fastx_reader = if filename == "-" {
+            let stdin = std::io::stdin();
+            parse_fastx_reader(stdin).context("Failed to parse FASTA/FASTQ data from stdin")?
+        } else {
+            parse_fastx_file(&filename).context("Failed to open file for FASTA/FASTQ data")?
+        };
+
+        // Counter for the number of records processed
+        let mut record_count: u64 = 0;
+
+        // Parse records and add sequences to signatures
+        while let Some(record_result) = fastx_reader.next() {
+            let record = record_result.context("Failed to read a record from input")?;
+
+            self.build_sigs_from_record(input_moltype, &record)?;
+
+            record_count += 1;
         }
 
-        // After processing sequences, update sig, record information
+        // Update signature and record metadata
         self.update_info(name, filename);
 
-        Ok(())
+        // Return the count of records parsed
+        Ok(record_count)
     }
 
     pub fn build_singleton_sigs(
@@ -799,30 +805,11 @@ impl BuildCollection {
         input_moltype: &str, // (protein/dna); todo - use hashfns?
         filename: String,
     ) -> Result<()> {
-        self.iter_mut().for_each(|(rec, sig)| {
-            if input_moltype == "protein"
-                && (rec.moltype() == HashFunctions::Murmur64Protein
-                    || rec.moltype() == HashFunctions::Murmur64Dayhoff
-                    || rec.moltype() == HashFunctions::Murmur64Hp)
-            {
-                sig.add_protein(&record.seq())
-                    .expect("Failed to add protein");
-                if !rec.sequence_added {
-                    rec.sequence_added = true
-                }
-            } else {
-                sig.add_sequence(&record.seq(), true)
-                    .expect("Failed to add sequence");
-                // if not force, panics with 'N' in dna sequence
-                if !rec.sequence_added {
-                    rec.sequence_added = true
-                }
-            }
-        });
+        self.build_sigs_from_record(input_moltype, &record)?;
+        // After processing sequences, update sig, record information
         let record_name = std::str::from_utf8(record.id())
             .expect("could not get record id")
             .to_string();
-        // After processing sequences, update sig, record information
         self.update_info(record_name, filename);
 
         Ok(())
@@ -849,7 +836,6 @@ impl BuildCollection {
         }
     }
 
-    // to do -- use filter_empty to ensure we're not writing empty template sigs??
     pub async fn async_write_sigs_to_zip(
         &mut self, // need mutable to update records
         zip_writer: &mut ZipFileWriter<Compat<File>>,
@@ -857,6 +843,10 @@ impl BuildCollection {
     ) -> Result<()> {
         // iterate over both records and signatures
         for (record, sig) in self.iter_mut() {
+            // skip any empty sig templates (no sequence added)
+            if !record.sequence_added {
+                continue;
+            }
             let md5sum_str = sig.md5sum();
             let count = md5sum_occurrences.entry(md5sum_str.clone()).or_insert(0);
             *count += 1;
@@ -917,57 +907,25 @@ impl<'a> IntoIterator for &'a mut BuildCollection {
 }
 
 impl MultiSelect for BuildCollection {
-    // to do --> think through the best/most efficient way to do this
     // in sourmash core, we don't need to select sigs themselves. Is this due to the way that Idx/Storage work?
-    fn select(mut self, multi_selection: &MultiSelection) -> Result<Self, SourmashError> {
-        // Collect indices while retaining matching records
-        let mut selected_indices = Vec::new();
-        let mut current_index = 0;
-
+    fn select(&mut self, multi_selection: &MultiSelection) -> Result<(), SourmashError> {
+        // Retain records and sigs in place
+        let mut i = 0;
         self.manifest.records.retain(|record| {
             let keep = multi_selection
                 .selections
                 .iter()
                 .any(|selection| record.matches_selection(selection));
 
-            if keep {
-                selected_indices.push(current_index); // Collect the index of the retained record
+            if !keep {
+                self.sigs.remove(i); // Remove corresponding signature
+            } else {
+                i += 1;
             }
-
-            current_index += 1; // Move to the next index
-            keep // Retain the record if it matches the selection
-        });
-
-        // Retain corresponding signatures using the collected indices
-        let mut sig_index = 0;
-        self.sigs.retain(|_sig| {
-            let keep = selected_indices.contains(&sig_index);
-            sig_index += 1;
             keep
         });
 
-        Ok(self)
-    }
-}
-
-#[derive(Debug, Clone)]
-pub struct MultiBuildCollection {
-    pub collections: Vec<BuildCollection>,
-}
-
-impl MultiBuildCollection {
-    pub fn new() -> Self {
-        MultiBuildCollection {
-            collections: Vec::new(),
-        }
-    }
-
-    pub fn is_empty(&self) -> bool {
-        self.collections.is_empty()
-    }
-
-    pub fn add_collection(&mut self, collection: &mut BuildCollection) {
-        self.collections.push(collection.clone())
+        Ok(())
     }
 }
 
@@ -1025,19 +983,19 @@ mod tests {
                 ksize: 31,
                 moltype: "DNA".to_string(),
                 with_abundance: true,
-                ..Default::default()
+                ..BuildRecord::default_dna()
             },
             BuildRecord {
                 ksize: 21,
                 moltype: "DNA".to_string(),
                 with_abundance: true,
-                ..Default::default()
+                ..BuildRecord::default_dna()
             },
             BuildRecord {
                 ksize: 51,
                 moltype: "DNA".to_string(),
                 with_abundance: true,
-                ..Default::default()
+                ..BuildRecord::default_dna()
             },
             BuildRecord::default_protein(),
         ];
@@ -1228,14 +1186,14 @@ mod tests {
             ksize: 21,
             moltype: "DNA".to_string(),
             scaled: 1000,
-            ..Default::default()
+            ..BuildRecord::default_dna()
         };
         let rec3 = BuildRecord {
             ksize: 31,
             moltype: "DNA".to_string(),
             scaled: 1000,
             with_abundance: true,
-            ..Default::default()
+            ..BuildRecord::default_dna()
         };
 
         let bmanifest = BuildManifest {
@@ -1272,7 +1230,7 @@ mod tests {
             moltype: "DNA".to_string(),
             scaled: 1000,
             with_abundance: true,
-            ..Default::default()
+            ..BuildRecord::default_dna()
         };
 
         // Add the DNA record to the collection with a matching moltype.
@@ -1294,7 +1252,7 @@ mod tests {
             moltype: "protein".to_string(),
             scaled: 200,
             with_abundance: false,
-            ..Default::default()
+            ..BuildRecord::default_dna()
         };
 
         // Add the protein record to the collection with a matching moltype.
@@ -1316,17 +1274,9 @@ mod tests {
             moltype: "dayhoff".to_string(),
             scaled: 200,
             with_abundance: true,
-            ..Default::default()
+            ..BuildRecord::default_dna()
         };
 
-        // Attempt to add the non-matching record with "DNA" as input moltype.
-        // this is because we currently don't allow translation
-        // build_collection.add_template_sig_from_record(&non_matching_record, "DNA");
-
-        // Verify that the non-matching record was not added.
-        // assert_eq!(build_collection.manifest.records.len(), 2);
-        // assert_eq!(build_collection.sigs.len(), 2);
-
         // Add the same non-matching record with a matching input moltype.
         build_collection.add_template_sig_from_record(&non_matching_record);
 
@@ -1339,62 +1289,4 @@ mod tests {
         assert_eq!(added_dayhoff_record.ksize, 10);
         assert_eq!(added_dayhoff_record.with_abundance, true);
     }
-
-    #[test]
-    fn test_filter_empty() {
-        // Create a parameter string that generates BuildRecords with different `sequence_added` values.
-        let params_str = "k=31,abund,dna_k=21,protein_k=10,abund";
-
-        // Use `from_param_str` to build a `BuildCollection`.
-        let mut build_collection = BuildCollection::from_param_str(params_str)
-            .expect("Failed to build BuildCollection from params_str");
-
-        // Manually set `sequence_added` for each record to simulate different conditions.
-        build_collection.manifest.records[0].sequence_added = true; // Keep this record.
-        build_collection.manifest.records[1].sequence_added = false; // This record should be removed.
-        build_collection.manifest.records[2].sequence_added = true; // Keep this record.
-
-        // Check initial sizes before filtering.
-        assert_eq!(
-            build_collection.manifest.records.len(),
-            3,
-            "Expected 3 records before filtering, but found {}",
-            build_collection.manifest.records.len()
-        );
-        assert_eq!(
-            build_collection.sigs.len(),
-            3,
-            "Expected 3 signatures before filtering, but found {}",
-            build_collection.sigs.len()
-        );
-
-        // Apply the `filter_empty` method.
-        build_collection.filter_empty();
-
-        // After filtering, only the records with `sequence_added == true` should remain.
-        assert_eq!(
-            build_collection.manifest.records.len(),
-            2,
-            "Expected 2 records after filtering, but found {}",
-            build_collection.manifest.records.len()
-        );
-
-        // Check that the signatures also match the remaining records.
-        assert_eq!(
-            build_collection.sigs.len(),
-            2,
-            "Expected 2 signatures after filtering, but found {}",
-            build_collection.sigs.len()
-        );
-
-        // Verify that the remaining records have `sequence_added == true`.
-        assert!(
-            build_collection
-                .manifest
-                .records
-                .iter()
-                .all(|rec| rec.sequence_added),
-            "All remaining records should have `sequence_added == true`"
-        );
-    }
 }
diff --git a/src/utils/mod.rs b/src/utils/mod.rs
index 041f8be..6c7fa18 100644
--- a/src/utils/mod.rs
+++ b/src/utils/mod.rs
@@ -2,13 +2,12 @@ use anyhow::{anyhow, Result};
 use reqwest::Url;
 use sourmash::collection::Collection;
 use std::collections::HashMap;
-// use std::collections::HashSet;
 use std::fmt;
 
 pub mod buildutils;
 use crate::utils::buildutils::{BuildManifest, BuildRecord};
 
-#[derive(Clone)]
+#[derive(Clone, PartialEq)]
 pub enum InputMolType {
     Dna,
     Protein,
diff --git a/tests/test_gbsketch.py b/tests/test_gbsketch.py
index 9a189f5..f7ead2b 100644
--- a/tests/test_gbsketch.py
+++ b/tests/test_gbsketch.py
@@ -276,22 +276,13 @@ def test_gbsketch_save_fastas(runtmp):
             else:
                 assert sig.md5sum() == ss3.md5sum()
 
+
 def test_gbsketch_download_only(runtmp, capfd):
     acc_csv = get_test_data('acc.csv')
-    output = runtmp.output('simple.zip')
     failed = runtmp.output('failed.csv')
     out_dir = runtmp.output('out_fastas')
     ch_fail = runtmp.output('checksum_dl_failed.csv')
 
-
-    sig1 = get_test_data('GCA_000175535.1.sig.gz')
-    sig2 = get_test_data('GCA_000961135.2.sig.gz')
-    sig3 = get_test_data('GCA_000961135.2.protein.sig.gz')
-    ss1 = sourmash.load_one_signature(sig1, ksize=31)
-    ss2 = sourmash.load_one_signature(sig2, ksize=31)
-    # why does this need ksize =30 and not ksize = 10!???
-    ss3 = sourmash.load_one_signature(sig3, ksize=30, select_moltype='protein')
-
     runtmp.sourmash('scripts', 'gbsketch', acc_csv, '--download-only',
                     '--failed', failed, '-r', '1', '--fastas', out_dir, '--keep-fasta',
                     '--checksum-fail', ch_fail,
@@ -301,6 +292,7 @@ def test_gbsketch_download_only(runtmp, capfd):
     fa_files = os.listdir(out_dir)
     assert set(fa_files) == set(['GCA_000175535.1_genomic.fna.gz', 'GCA_000961135.2_protein.faa.gz', 'GCA_000961135.2_genomic.fna.gz'])
     captured = capfd.readouterr()
+    print(captured)
     assert "Failed to send signatures: channel closed" not in captured.err
 
 
@@ -643,7 +635,6 @@ def test_gbsketch_simple_batched_multiple(runtmp, capfd):
     sig3 = get_test_data('GCA_000961135.2.protein.sig.gz')
     ss1 = sourmash.load_one_signature(sig1, ksize=31)
     ss2 = sourmash.load_one_signature(sig2, ksize=31)
-    # why does this need ksize =30 and not ksize = 10!???
     ss3 = sourmash.load_one_signature(sig3, ksize=30, select_moltype='protein')
 
     runtmp.sourmash('scripts', 'gbsketch', acc_csv, '-o', output,
@@ -791,7 +782,7 @@ def test_gbsketch_simple_batch_restart_with_incomplete_zip(runtmp, capfd):
     assert not os.path.exists(output)  # for now, orig output file should be empty.
     captured = capfd.readouterr()
     print(captured.err)
-    assert f"Warning: Invalid zip file '{out2}'; skipping." in captured.err
+    assert f"Warning: Failed to load zip file '{out2}'; skipping." in captured.err
 
     # we created this one with sig cat
     idx = sourmash.load_file_as_index(out1)
diff --git a/tests/test_urlsketch.py b/tests/test_urlsketch.py
index a1d354c..18685ba 100644
--- a/tests/test_urlsketch.py
+++ b/tests/test_urlsketch.py
@@ -636,7 +636,7 @@ def test_urlsketch_simple_batch_restart_with_incomplete_zip(runtmp, capfd):
     assert not os.path.exists(output) # for now, orig output file should be empty.
     captured = capfd.readouterr()
     print(captured.err)
-    assert f"Warning: Invalid zip file '{out2}'; skipping." in captured.err
+    assert f"Warning: Failed to load zip file '{out2}'; skipping." in captured.err
 
     expected_siginfo = {
         (ss2.name, ss2.md5sum(), ss2.minhash.moltype),

From 3a038dbf4b653c48aefc8cd1290836f153e7cf8e Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Thu, 26 Dec 2024 18:05:44 -0800
Subject: [PATCH 2/2] add version to pyproject toml

---
 pyproject.toml | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index e75dfd2..adb9be8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -2,6 +2,7 @@
 name = "sourmash_plugin_directsketch"
 description = "Download and Sketch GenBank Assembly Datasets"
 readme = "README.md"
+version = "0.4.1"
 requires-python = ">=3.10"
 classifiers = [
     "Programming Language :: Rust",
@@ -11,7 +12,7 @@ classifiers = [
 dependencies = ["sourmash>=4.8.5,<5"]
 
 authors = [
-  { name="N. Tessa Pierce-Ward", orcid="0000-0002-2942-5331" },
+  { name= "N. Tessa Pierce-Ward" },
 ]
 
 [build-system]
@@ -36,4 +37,7 @@ python-source = "src/python"
 macos-deployment-target = "10.14"
 
 [metadata]
-license = { text = "GNU Affero General Public License v3" }
\ No newline at end of file
+license = { text = "GNU Affero General Public License v3" }
+authors = [
+  { name="N. Tessa Pierce-Ward", orcid="0000-0002-2942-5331" },
+]