diff --git a/README.md b/README.md index 2c9edc4..df2ef53 100644 --- a/README.md +++ b/README.md @@ -4,11 +4,16 @@ [![DOI](https://zenodo.org/badge/792101561.svg)](https://zenodo.org/doi/10.5281/zenodo.11165725) -tl;dr - download and sketch NCBI Assembly Datasets by accession +tl;dr - download and sketch data directly ## About -This plugin is an attempt to improve database generation by downloading assemblies, checking md5sum, and sketching to a sourmash zipfile. FASTA files can also be saved if desired. It's quite fast, but still very much at alpha level. Here be dragons. +Commands: + +- `gbsketch` - download and sketch NCBI Assembly Datasets by accession +- `urlsketch` - download and sketch directly from a url + +This plugin is an attempt to improve sourmash database generation by downloading files, checking md5sum if provided or accessible, and sketching to a sourmash zipfile. FASTA files can also be saved if desired. It's quite fast, but still very much at alpha level. Here be dragons. ## Installation @@ -16,7 +21,8 @@ This plugin is an attempt to improve database generation by downloading assembli pip install sourmash_plugin_directsketch ``` -## Usage +## `gbsketch` +download and sketch NCBI Assembly Datasets by accession ### Create an input file @@ -43,15 +49,13 @@ For reference: To run the test accession file at `tests/test-data/acc.csv`, run: ``` -sourmash scripts gbsketch tests/test-data/acc.csv -o test.zip -f out_fastas -k --failed test.failed.csv -p dna,k=21,k=31,scaled=1000,abund -p protein,k=10,scaled=100,abund -r 1 +sourmash scripts gbsketch tests/test-data/acc.csv -o test-gbsketch.zip -f out_fastas -k --failed test.failed.csv -p dna,k=21,k=31,scaled=1000,abund -p protein,k=10,scaled=100,abund -r 1 ``` Full Usage: ``` -usage: gbsketch [-h] [-q] [-d] -o OUTPUT [-f FASTAS] [-k] [--download-only] [--failed FAILED] [-p PARAM_STRING] [-c CORES] - [-r RETRY_TIMES] [-g | -m] - input_csv +usage: gbsketch [-h] [-q] [-d] [-o OUTPUT] [-f FASTAS] [-k] [--download-only] [--failed FAILED] [-p PARAM_STRING] [-c CORES] [-r RETRY_TIMES] [-g | -m] input_csv download and sketch GenBank assembly datasets @@ -66,7 +70,7 @@ options: output zip file for the signatures -f FASTAS, --fastas FASTAS Write fastas here - -k, --keep-fastas write FASTA files in addition to sketching. Default: do not write FASTA files + -k, --keep-fasta write FASTA files in addition to sketching. Default: do not write FASTA files --download-only just download genomes; do not sketch --failed FAILED csv of failed accessions and download links (should be mostly protein). -p PARAM_STRING, --param-string PARAM_STRING @@ -76,7 +80,61 @@ options: -r RETRY_TIMES, --retry-times RETRY_TIMES number of times to retry failed downloads -g, --genomes-only just download and sketch genome (DNA) files - -m, --proteomes-only just download and sketch proteome (protein) files +``` + +## `urlsketch` +download and sketch directly from a url +### Create an input file + +First, create a file, e.g. `acc-url.csv` with identifiers, sketch names, and other required info. +``` +accession,name,moltype,md5sum,download_filename,url +GCA_000961135.2,GCA_000961135.2 Candidatus Aramenus sulfurataquae isolate AZ1-454,dna,47b9fb20c51f0552b87db5d44d5d4566,GCA_000961135.2_genomic.urlsketch.fna.gz,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/961/135/GCA_000961135.2_ASM96113v2/GCA_000961135.2_ASM96113v2_genomic.fna.gz +GCA_000961135.2,GCA_000961135.2 Candidatus Aramenus sulfurataquae isolate AZ1-454,protein,fb7920fb8f3cf5d6ab9b6b754a5976a4,GCA_000961135.2_protein.urlsketch.faa.gz,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/961/135/GCA_000961135.2_ASM96113v2/GCA_000961135.2_ASM96113v2_protein.faa.gz +GCA_000175535.1,GCA_000175535.1 Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14,dna,a1a8f1c6dc56999c73fe298871c963d1,GCA_000175535.1_genomic.urlsketch.fna.gz,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_genomic.fna.gz +``` +> Six columns must be present: +> - `accession` - an accession or unique identifier. Ideally no spaces. +> - `name` - full name for the sketch. +> - `moltype` - is the file 'dna' or 'protein'? +> - `md5sum` - expected md5sum (optional, will be checked after download if provided) +> - `download_filename` - filename for FASTA download. Required if `--keep-fastas`, but useful for signatures, too (saved in sig data). +> - `url` - direct link for the file + +### Run: + +To run the test accession file at `tests/test-data/acc-url.csv`, run: +``` +sourmash scripts urlsketch tests/test-data/acc-url.csv -o test-urlsketch.zip -f out_fastas -k --failed test.failed.csv -p dna,k=21,k=31,scaled=1000,abund -p protein,k=10,scaled=100,abund -r 1 +``` + +Full Usage: +``` +usage: urlsketch [-h] [-q] [-d] [-o OUTPUT] [-f FASTAS] [-k] [--download-only] [--failed FAILED] [-p PARAM_STRING] [-c CORES] [-r RETRY_TIMES] input_csv + +download and sketch GenBank assembly datasets + +positional arguments: + input_csv a txt file or csv file containing accessions in the first column + +options: + -h, --help show this help message and exit + -q, --quiet suppress non-error output + -d, --debug provide debugging output + -o OUTPUT, --output OUTPUT + output zip file for the signatures + -f FASTAS, --fastas FASTAS + Write fastas here + -k, --keep-fasta, --keep-fastq + write FASTA/Q files in addition to sketching. Default: do not write FASTA files + --download-only just download genomes; do not sketch + --failed FAILED csv of failed accessions and download links (should be mostly protein). + -p PARAM_STRING, --param-string PARAM_STRING + parameter string for sketching (default: k=31,scaled=1000) + -c CORES, --cores CORES + number of cores to use (default is all available) + -r RETRY_TIMES, --retry-times RETRY_TIMES + number of times to retry failed downloads ``` ## Code of Conduct diff --git a/pyproject.toml b/pyproject.toml index a1255d6..25ce6f9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,6 +20,7 @@ build-backend = "maturin" [project.entry-points."sourmash.cli_script"] gbsketch = "sourmash_plugin_directsketch:Download_and_Sketch_Assemblies" +urlsketch = "sourmash_plugin_directsketch:Download_and_Sketch_Url" [project.optional-dependencies] test = [ diff --git a/src/directsketch.rs b/src/directsketch.rs index 916070b..597f799 100644 --- a/src/directsketch.rs +++ b/src/directsketch.rs @@ -23,7 +23,8 @@ use sourmash::manifest::{Manifest, Record}; use sourmash::signature::Signature; use crate::utils::{ - build_siginfo, load_gbassembly_info, parse_params_str, GBAssemblyData, GenBankFileType, + build_siginfo, load_accession_info, load_gbassembly_info, parse_params_str, AccessionData, + GBAssemblyData, GenBankFileType, InputMolType, }; use reqwest::Url; @@ -241,12 +242,14 @@ async fn sketch_data( pub struct FailedDownload { accession: String, name: String, - url: Option, moltype: String, + md5sum: Option, + download_filename: Option, + url: Option, } #[allow(clippy::too_many_arguments)] -async fn dl_sketch_accession( +async fn dl_sketch_assembly_accession( client: &Client, accinfo: GBAssemblyData, location: &PathBuf, @@ -275,8 +278,10 @@ async fn dl_sketch_accession( let failed_download_dna = FailedDownload { accession: accession.clone(), name: name.clone(), - url: None, moltype: "dna".to_string(), + md5sum: None, + download_filename: None, + url: None, }; failed.push(failed_download_dna); } @@ -284,8 +289,10 @@ async fn dl_sketch_accession( let failed_download_protein = FailedDownload { accession: accession.clone(), name: name.clone(), - url: None, moltype: "protein".to_string(), + md5sum: None, + download_filename: None, + url: None, }; failed.push(failed_download_protein); } @@ -316,6 +323,7 @@ async fn dl_sketch_accession( for file_type in &file_types { let url = file_type.url(&base_url, &full_name); let expected_md5 = checksums.get(&file_type.server_filename(&full_name)); + let file_name = file_type.filename_to_write(&accession); let data = match download_with_retry(client, &url, expected_md5.map(|x| x.as_str()), retry_count) .await @@ -326,14 +334,15 @@ async fn dl_sketch_accession( let failed_download = FailedDownload { accession: accession.clone(), name: name.clone(), - url: Some(url), moltype: file_type.moltype(), + md5sum: expected_md5.map(|x| x.to_string()), + download_filename: Some(file_name), + url: Some(url), }; failed.push(failed_download); continue; } }; - let file_name = file_type.filename_to_write(&accession); if keep_fastas { let path = location.join(&file_name); @@ -372,6 +381,85 @@ async fn dl_sketch_accession( Ok((sigs, failed)) } +#[allow(clippy::too_many_arguments)] +async fn dl_sketch_url( + client: &Client, + accinfo: AccessionData, + location: &PathBuf, + retry: Option, + _keep_fastas: bool, + dna_sigs: Vec, + prot_sigs: Vec, + _genomes_only: bool, + _proteomes_only: bool, + download_only: bool, +) -> Result<(Vec, Vec)> { + let retry_count = retry.unwrap_or(3); // Default retry count + let mut sigs = Vec::::new(); + let mut failed = Vec::::new(); + + let name = accinfo.name; + let accession = accinfo.accession; + let url = accinfo.url; + let expected_md5 = accinfo.expected_md5sum; + let download_filename = accinfo.download_filename; + let moltype = accinfo.moltype; + + match download_with_retry(client, &url, expected_md5.as_deref(), retry_count) + .await + .ok() + { + Some(data) => { + // check keep_fastas instead?? + if let Some(ref download_filename) = download_filename { + let path = location.join(download_filename); + fs::write(path, &data).context("Failed to write data to file")?; + } + if !download_only { + let filename = download_filename.clone().unwrap_or("".to_string()); + // sketch data + match moltype { + InputMolType::Dna => sigs.extend( + sketch_data( + name.clone(), + filename.clone(), + data, + dna_sigs.clone(), + "dna".to_string(), + ) + .await?, + ), + InputMolType::Protein => { + sigs.extend( + sketch_data( + name.clone(), + filename.clone(), + data, + prot_sigs.clone(), + "protein".to_string(), + ) + .await?, + ); + } + }; + } + } + None => { + let failed_download = FailedDownload { + accession: accession.clone(), + name: name.clone(), + moltype: moltype.to_string(), + md5sum: expected_md5.map(|x| x.to_string()), + download_filename, + url: Some(url), + }; + failed.push(failed_download); + } + } + + Ok((sigs, failed)) +} + async fn write_sig( sig: &Signature, md5sum_occurrences: &mut HashMap, @@ -514,7 +602,10 @@ pub fn failures_handle( let mut writer = BufWriter::new(file); // Attempt to write CSV headers - if let Err(e) = writer.write_all(b"accession,name,moltype,url\n").await { + if let Err(e) = writer + .write_all(b"accession,name,moltype,md5sum,download_filename,url\n") + .await + { let error = Error::new(e).context("Failed to write headers"); let _ = error_sender.send(error).await; return; // Exit the task early after reporting the error @@ -523,15 +614,19 @@ pub fn failures_handle( while let Some(FailedDownload { accession, name, - moltype, + md5sum, + download_filename, url, + moltype, }) = recv_failed.recv().await { let record = format!( - "{},{},{},{:?}\n", + "{},{},{},{},{},{}\n", accession, name, moltype, + md5sum.unwrap_or("".to_string()), + download_filename.unwrap_or("".to_string()), url.map(|u| u.to_string()).unwrap_or("".to_string()) ); // Attempt to write each record @@ -574,7 +669,7 @@ pub fn error_handler( #[tokio::main] #[allow(clippy::too_many_arguments)] -pub async fn download_and_sketch( +pub async fn gbsketch( py: Python, input_csv: String, param_str: String, @@ -642,6 +737,180 @@ pub async fn download_and_sketch( let mut genomes_only = genomes_only; let mut proteomes_only = proteomes_only; + // Check if dna_sig_templates is empty and not keep_fastas + if dna_sig_templates.is_empty() && !keep_fastas { + eprintln!("No DNA signature templates provided, and --keep-fasta is not set."); + proteomes_only = true; + } + // Check if protein_sig_templates is empty and not keep_fastas + if prot_sig_templates.is_empty() && !keep_fastas { + eprintln!("No protein signature templates provided, and --keep-fasta is not set."); + genomes_only = true; + } + if genomes_only { + if !download_only { + eprintln!("Downloading and sketching genomes only."); + } else { + eprintln!("Downloading genomes only."); + } + } else if proteomes_only { + if !download_only { + eprintln!("Downloading and sketching proteomes only."); + } else { + eprintln!("Downloading proteomes only."); + } + } + + // report every 1 percent (or every 1, whichever is larger) + let reporting_threshold = std::cmp::max(n_accs / 100, 1); + + for (i, accinfo) in accession_info.into_iter().enumerate() { + py.check_signals()?; // If interrupted, return an Err automatically + let semaphore_clone = Arc::clone(&semaphore); + let client_clone = Arc::clone(&client); + let send_sigs = send_sigs.clone(); + let send_failed = send_failed.clone(); + let download_path_clone = download_path.clone(); // Clone the path for each task + let send_errors = error_sender.clone(); + + let dna_sigs = dna_sig_templates.clone(); + let prot_sigs = prot_sig_templates.clone(); + + tokio::spawn(async move { + let _permit = semaphore_clone.acquire().await; + // progress report when the permit is available and processing begins + if (i + 1) % reporting_threshold == 0 { + let percent_processed = (((i + 1) as f64 / n_accs as f64) * 100.0).round(); + println!( + "Starting accession {}/{} ({}%)", + (i + 1), + n_accs, + percent_processed + ); + } + // Perform download and sketch + let result = dl_sketch_assembly_accession( + &client_clone, + accinfo.clone(), + &download_path_clone, + Some(retry_times), + keep_fastas, + dna_sigs, + prot_sigs, + genomes_only, + proteomes_only, + download_only, + ) + .await; + match result { + Ok((sigs, failed_downloads)) => { + if !sigs.is_empty() { + if let Err(e) = send_sigs.send(sigs).await { + eprintln!("Failed to send signatures: {}", e); + let _ = send_errors.send(e.into()).await; // Send the error through the channel + } + } + for fail in failed_downloads { + if let Err(e) = send_failed.send(fail).await { + eprintln!("Failed to send failed download info: {}", e); + let _ = send_errors.send(e.into()).await; // Send the error through the channel + } + } + } + Err(e) => { + let _ = send_errors.send(e).await; + } + } + drop(send_errors); + }); + } + // drop senders as we're done sending data + drop(send_sigs); + drop(send_failed); + drop(error_sender); + // Wait for all tasks to complete + for handle in handles { + if let Err(e) = handle.await { + eprintln!("Handle join error: {}.", e); + } + } + // since the only critical error is not having written any sigs + // check this here at end. Bail if we wrote expected sigs but wrote none. + if critical_error_flag.load(Ordering::SeqCst) & !download_only { + bail!("No signatures written, exiting."); + } + + Ok(()) +} + +#[tokio::main] +#[allow(clippy::too_many_arguments)] +pub async fn urlsketch( + py: Python, + input_csv: String, + param_str: String, + failed_csv: String, + retry_times: u32, + fasta_location: String, + keep_fastas: bool, + download_only: bool, + output_sigs: Option, +) -> Result<(), anyhow::Error> { + // if sig output provided but doesn't end in zip, bail + if let Some(ref output_sigs) = output_sigs { + if Path::new(&output_sigs) + .extension() + .map_or(true, |ext| ext != "zip") + { + bail!("Output must be a zip file."); + } + } + // set up fasta download path + let download_path = PathBuf::from(fasta_location); + if !download_path.exists() { + create_dir_all(&download_path)?; + } + + // create channels. buffer size here is 4 b/c we can do 3 downloads simultaneously + let (send_sigs, recv_sigs) = tokio::sync::mpsc::channel::>(4); + let (send_failed, recv_failed) = tokio::sync::mpsc::channel::(4); + // Error channel for handling task errors + let (error_sender, error_receiver) = tokio::sync::mpsc::channel::(1); + + // Set up collector/writing tasks + let mut handles = Vec::new(); + let sig_handle = sigwriter_handle(recv_sigs, output_sigs, error_sender.clone()); + let failures_handle = failures_handle(failed_csv, recv_failed, error_sender.clone()); + let critical_error_flag = Arc::new(AtomicBool::new(false)); + let error_handle = error_handler(error_receiver, critical_error_flag.clone()); + handles.push(sig_handle); + handles.push(failures_handle); + handles.push(error_handle); + + // Worker tasks + let semaphore = Arc::new(Semaphore::new(3)); // Limiting concurrent downloads + let client = Arc::new(Client::new()); + + // Open the file containing the accessions synchronously + let (accession_info, n_accs) = load_accession_info(input_csv, keep_fastas)?; + if n_accs == 0 { + bail!("No accessions to download and sketch.") + } + + // parse param string into params_vec, print error if fail + let param_result = parse_params_str(param_str); + let params_vec = match param_result { + Ok(params) => params, + Err(e) => { + bail!("Failed to parse params string: {}", e); + } + }; + let dna_sig_templates = build_siginfo(¶ms_vec, "DNA"); + let prot_sig_templates = build_siginfo(¶ms_vec, "protein"); + + let mut genomes_only = false; + let mut proteomes_only = false; + // Check if dna_sig_templates is empty and not keep_fastas if dna_sig_templates.is_empty() && !keep_fastas { eprintln!("No DNA signature templates provided, and --keep-fastas is not set."); @@ -694,7 +963,7 @@ pub async fn download_and_sketch( ); } // Perform download and sketch - let result = dl_sketch_accession( + let result = dl_sketch_url( &client_clone, accinfo.clone(), &download_path_clone, diff --git a/src/lib.rs b/src/lib.rs index 7d7b7e4..88e77ca 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -62,7 +62,7 @@ fn do_gbsketch( download_only: bool, output_sigs: Option, ) -> anyhow::Result { - match directsketch::download_and_sketch( + match directsketch::gbsketch( py, input_csv, param_str, @@ -83,9 +83,42 @@ fn do_gbsketch( } } +#[pyfunction] +#[allow(clippy::too_many_arguments)] +fn do_urlsketch( + py: Python, + input_csv: String, + param_str: String, + failed_csv: String, + retry_times: u32, + fasta_location: String, + keep_fastas: bool, + download_only: bool, + output_sigs: Option, +) -> anyhow::Result { + match directsketch::urlsketch( + py, + input_csv, + param_str, + failed_csv, + retry_times, + fasta_location, + keep_fastas, + download_only, + output_sigs, + ) { + Ok(_) => Ok(0), + Err(e) => { + eprintln!("Error: {e}"); + Ok(1) + } + } +} + #[pymodule] fn sourmash_plugin_directsketch(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_function(wrap_pyfunction!(do_gbsketch, m)?)?; + m.add_function(wrap_pyfunction!(do_urlsketch, m)?)?; m.add_function(wrap_pyfunction!(set_tokio_thread_pool, m)?)?; Ok(()) } diff --git a/src/python/sourmash_plugin_directsketch/__init__.py b/src/python/sourmash_plugin_directsketch/__init__.py index bdeade4..bd52812 100644 --- a/src/python/sourmash_plugin_directsketch/__init__.py +++ b/src/python/sourmash_plugin_directsketch/__init__.py @@ -43,7 +43,7 @@ def __init__(self, p): help='output zip file for the signatures') p.add_argument('-f', '--fastas', help='Write fastas here', default = '.') - p.add_argument('-k', '--keep-fastas', action='store_true', + p.add_argument('-k', '--keep-fasta', action='store_true', help="write FASTA files in addition to sketching. Default: do not write FASTA files") p.add_argument('--download-only', help='just download genomes; do not sketch', action='store_true') p.add_argument('--failed',help='csv of failed accessions and download links (should be mostly protein).') @@ -64,8 +64,8 @@ def main(self, args): args.param_string = ["k=31,scaled=1000"] notify(f"params: {args.param_string}") - if args.download_only and not args.keep_fastas: - notify("Error: '--download-only' requires '--keep-fastas'.") + if args.download_only and not args.keep_fasta: + notify("Error: '--download-only' requires '--keep-fasta'.") sys.exit(-1) if args.output is None and not args.download_only: notify("Error: output signature zipfile is required if not using '--download-only'.") @@ -78,7 +78,7 @@ def main(self, args): num_threads = set_thread_pool(args.cores) - notify(f"downloading and sketching all accessions in '{args.input_csv} using {num_threads} threads") + notify(f"downloading and sketching all accessions in '{args.input_csv} using {args.retry_times} retries and {num_threads} threads") super().main(args) status = sourmash_plugin_directsketch.do_gbsketch(args.input_csv, @@ -86,7 +86,7 @@ def main(self, args): args.failed, args.retry_times, args.fastas, - args.keep_fastas, + args.keep_fasta, args.genomes_only, args.proteomes_only, args.download_only, @@ -96,7 +96,72 @@ def main(self, args): notify("...gbsketch is done!") if args.output is not None: notify(f"Sigs in '{args.output}'.") - if args.keep_fastas: + if args.keep_fasta: + notify(f"FASTAs in '{args.fastas}'.") + + return status + + +class Download_and_Sketch_Url(CommandLinePlugin): + command = 'urlsketch' + description = 'download and sketch GenBank assembly datasets' + + def __init__(self, p): + super().__init__(p) + p.add_argument('input_csv', help="a txt file or csv file containing accessions in the first column") + p.add_argument('-o', '--output', default=None, + help='output zip file for the signatures') + p.add_argument('-f', '--fastas', + help='Write fastas here', default = '.') + p.add_argument('-k', '--keep-fasta', '--keep-fastq', action='store_true', + help="write FASTA/Q files in addition to sketching. Default: do not write FASTA files") + p.add_argument('--download-only', help='just download genomes; do not sketch', action='store_true') + p.add_argument('--failed',help='csv of failed accessions and download links (should be mostly protein).') + p.add_argument('-p', '--param-string', action='append', type=str, default=[], + help='parameter string for sketching (default: k=31,scaled=1000)') + p.add_argument('-c', '--cores', default=0, type=int, + help='number of cores to use (default is all available)') + p.add_argument('-r', '--retry-times', default=1, type=int, + help='number of times to retry failed downloads') + + + def main(self, args): + print_version() + if not args.param_string: + args.param_string = ["k=31,scaled=1000"] + notify(f"params: {args.param_string}") + + if args.download_only and not args.keep_fasta: + notify("Error: '--download-only' requires '--keep-fasta'.") + sys.exit(-1) + if args.output is None and not args.download_only: + notify("Error: output signature zipfile is required if not using '--download-only'.") + sys.exit(-1) + + # convert to a single string for easier rust handling + args.param_string = "_".join(args.param_string) + # lowercase the param string + args.param_string = args.param_string.lower() + + num_threads = set_thread_pool(args.cores) + + notify(f"downloading and sketching all accessions in '{args.input_csv} using {num_threads} threads") + + super().main(args) + status = sourmash_plugin_directsketch.do_urlsketch(args.input_csv, + args.param_string, + args.failed, + args.retry_times, + args.fastas, + args.keep_fasta, + args.download_only, + args.output) + + if status == 0: + notify("...gbsketch is done!") + if args.output is not None: + notify(f"Sigs in '{args.output}'.") + if args.keep_fasta: notify(f"FASTAs in '{args.fastas}'.") return status diff --git a/src/utils.rs b/src/utils.rs index 63dc9b7..ab91ac7 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -2,6 +2,7 @@ use anyhow::{anyhow, Result}; use reqwest::Url; use sourmash::cmd::ComputeParameters; use sourmash::signature::Signature; +use std::fmt; use std::hash::Hash; use std::hash::Hasher; @@ -11,6 +12,17 @@ pub enum InputMolType { Protein, } +impl InputMolType {} + +impl fmt::Display for InputMolType { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + InputMolType::Dna => write!(f, "dna"), + InputMolType::Protein => write!(f, "protein"), + } + } +} + impl std::str::FromStr for InputMolType { type Err = (); @@ -77,9 +89,10 @@ impl GenBankFileType { pub struct AccessionData { pub accession: String, pub name: String, - pub input_moltype: InputMolType, + pub moltype: InputMolType, pub url: reqwest::Url, - pub md5sum: Option, + pub expected_md5sum: Option, + pub download_filename: Option, // need to require this if --keep-fastas are used } #[derive(Clone)] @@ -160,8 +173,10 @@ pub fn load_gbassembly_info(input_csv: String) -> Result<(Vec, u Ok((results, row_count)) } -#[allow(dead_code)] -pub fn load_accession_info(input_csv: String) -> Result<(Vec, usize)> { +pub fn load_accession_info( + input_csv: String, + keep_fasta: bool, +) -> Result<(Vec, usize)> { let mut results = Vec::new(); let mut row_count = 0; let mut processed_rows = std::collections::HashSet::new(); @@ -172,7 +187,14 @@ pub fn load_accession_info(input_csv: String) -> Result<(Vec, usi // Check column names let header = rdr.headers()?; - let expected_header = vec!["accession", "name", "input_moltype", "url", "md5sum"]; + let expected_header = vec![ + "accession", + "name", + "moltype", + "md5sum", + "download_filename", + "url", + ]; if header != expected_header { return Err(anyhow!( "Invalid column names in CSV file. Columns should be: {:?}", @@ -199,13 +221,18 @@ pub fn load_accession_info(input_csv: String) -> Result<(Vec, usi .get(1) .ok_or_else(|| anyhow!("Missing 'name' field"))? .to_string(); - let input_moltype = record + let moltype = record .get(2) - .ok_or_else(|| anyhow!("Missing 'input_moltype' field"))? + .ok_or_else(|| anyhow!("Missing 'moltype' field"))? .parse::() - .map_err(|_| anyhow!("Invalid 'input_moltype' value"))?; + .map_err(|_| anyhow!("Invalid 'moltype' value"))?; + let expected_md5sum = record.get(3).map(|s| s.to_string()); + let download_filename = record.get(4).map(|s| s.to_string()); + if keep_fasta && download_filename.is_none() { + return Err(anyhow!("Missing 'download_filename' field")); + } let url = record - .get(3) + .get(5) .ok_or_else(|| anyhow!("Missing 'url' field"))? .split(',') .filter_map(|s| { @@ -218,19 +245,18 @@ pub fn load_accession_info(input_csv: String) -> Result<(Vec, usi }) .next() .ok_or_else(|| anyhow!("Invalid 'url' value"))?; - let md5sum = record.get(4).map(|s| s.to_string()); - // count entries with url and md5sum - if md5sum.is_some() { + if expected_md5sum.is_some() { md5sum_count += 1; } // store accession data results.push(AccessionData { - accession: acc.to_string(), - name: name.to_string(), - input_moltype, + accession: acc, + name, + moltype, url, - md5sum, + expected_md5sum, + download_filename, }); } diff --git a/tests/test-data/acc-url.csv b/tests/test-data/acc-url.csv new file mode 100644 index 0000000..8c3a87e --- /dev/null +++ b/tests/test-data/acc-url.csv @@ -0,0 +1,4 @@ +accession,name,moltype,md5sum,download_filename,url +GCA_000961135.2,GCA_000961135.2 Candidatus Aramenus sulfurataquae isolate AZ1-454,dna,47b9fb20c51f0552b87db5d44d5d4566,GCA_000961135.2_genomic.urlsketch.fna.gz,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/961/135/GCA_000961135.2_ASM96113v2/GCA_000961135.2_ASM96113v2_genomic.fna.gz +GCA_000961135.2,GCA_000961135.2 Candidatus Aramenus sulfurataquae isolate AZ1-454,protein,fb7920fb8f3cf5d6ab9b6b754a5976a4,GCA_000961135.2_protein.urlsketch.faa.gz,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/961/135/GCA_000961135.2_ASM96113v2/GCA_000961135.2_ASM96113v2_protein.faa.gz +GCA_000175535.1,GCA_000175535.1 Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14,dna,a1a8f1c6dc56999c73fe298871c963d1,GCA_000175535.1_genomic.urlsketch.fna.gz,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_genomic.fna.gz diff --git a/tests/test_gbsketch.py b/tests/test_gbsketch.py index ecfc085..3308e40 100644 --- a/tests/test_gbsketch.py +++ b/tests/test_gbsketch.py @@ -57,13 +57,16 @@ def test_gbsketch_simple(runtmp): assert sig.md5sum() == ss3.md5sum() assert os.path.exists(failed) with open(failed, 'r') as failF: - next(failF) # skip header line - for line in failF: - acc, name, moltype, url = line.strip().split(',') - assert acc == "GCA_000175535.1" - assert name == "GCA_000175535.1 Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14" - assert moltype == "protein" - assert url == '"https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_protein.faa.gz"' + fail_lines = failF.readlines() + print(fail_lines) + assert len(fail_lines) == 2 + assert fail_lines[0] == "accession,name,moltype,md5sum,download_filename,url\n" + acc, name, moltype, md5sum, download_filename, url = fail_lines[1].strip().split(',') + assert acc == "GCA_000175535.1" + assert name == "GCA_000175535.1 Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14" + assert moltype == "protein" + assert download_filename == "GCA_000175535.1_protein.faa.gz" + assert url == "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_protein.faa.gz" def test_gbsketch_simple_url(runtmp): @@ -186,7 +189,7 @@ def test_gbsketch_genomes_only_via_params(runtmp, capfd): elif 'GCA_000961135.2' in sig.name: assert sig.name == ss2.name assert sig.md5sum() == ss2.md5sum() - assert 'No protein signature templates provided, and --keep-fastas is not set.' in captured.err + assert 'No protein signature templates provided, and --keep-fasta is not set.' in captured.err assert 'Downloading and sketching genomes only.' in captured.err @@ -215,7 +218,7 @@ def test_gbsketch_proteomes_only_via_params(runtmp, capfd): for sig in sigs: assert 'GCA_000961135.2' in sig.name assert sig.md5sum() == ss3.md5sum() - assert 'No DNA signature templates provided, and --keep-fastas is not set.' in captured.err + assert 'No DNA signature templates provided, and --keep-fasta is not set.' in captured.err assert 'Downloading and sketching proteomes only.' in captured.err @@ -235,7 +238,7 @@ def test_gbsketch_save_fastas(runtmp): ss3 = sourmash.load_one_signature(sig3, ksize=30, select_moltype='protein') runtmp.sourmash('scripts', 'gbsketch', acc_csv, '-o', output, - '--failed', failed, '-r', '1', '--fastas', out_dir, '--keep-fastas', + '--failed', failed, '-r', '1', '--fastas', out_dir, '--keep-fasta', '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") assert os.path.exists(output) @@ -274,7 +277,7 @@ def test_gbsketch_download_only(runtmp, capfd): ss3 = sourmash.load_one_signature(sig3, ksize=30, select_moltype='protein') runtmp.sourmash('scripts', 'gbsketch', acc_csv, '--download-only', - '--failed', failed, '-r', '1', '--fastas', out_dir, '--keep-fastas', + '--failed', failed, '-r', '1', '--fastas', out_dir, '--keep-fasta', '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") assert not runtmp.last_result.out # stdout should be empty diff --git a/tests/test_urlsketch.py b/tests/test_urlsketch.py new file mode 100644 index 0000000..64d1f1f --- /dev/null +++ b/tests/test_urlsketch.py @@ -0,0 +1,321 @@ +""" +Tests for urlsketch +""" +import os +import pytest + +import sourmash +import sourmash_tst_utils as utils +from sourmash_tst_utils import SourmashCommandFailed + + + +def get_test_data(filename): + thisdir = os.path.dirname(__file__) + return os.path.join(thisdir, 'test-data', filename) + +def test_installed(runtmp): + with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'urlsketch') + + assert 'usage: urlsketch' in runtmp.last_result.err + + +def test_urlsketch_simple(runtmp): + acc_csv = get_test_data('acc-url.csv') + output = runtmp.output('simple.zip') + failed = runtmp.output('failed.csv') + + sig1 = get_test_data('GCA_000175535.1.sig.gz') + sig2 = get_test_data('GCA_000961135.2.sig.gz') + sig3 = get_test_data('GCA_000961135.2.protein.sig.gz') + ss1 = sourmash.load_one_signature(sig1, ksize=31) + ss2 = sourmash.load_one_signature(sig2, ksize=31) + ss3 = sourmash.load_one_signature(sig3, ksize=30, select_moltype='protein') + + runtmp.sourmash('scripts', 'urlsketch', acc_csv, '-o', output, + '--failed', failed, '-r', '1', + '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") + + assert os.path.exists(output) + assert not runtmp.last_result.out # stdout should be empty + + idx = sourmash.load_file_as_index(output) + sigs = list(idx.signatures()) + + assert len(sigs) == 3 + for sig in sigs: + if 'GCA_000175535.1' in sig.name: + assert sig.name == ss1.name + assert sig.md5sum() == ss1.md5sum() + elif 'GCA_000961135.2' in sig.name: + assert sig.name == ss2.name + if sig.minhash.moltype == 'DNA': + assert sig.md5sum() == ss2.md5sum() + else: + assert sig.md5sum() == ss3.md5sum() + assert os.path.exists(failed) + with open(failed, 'r') as failF: + header = next(failF).strip() + assert header == "accession,name,moltype,md5sum,download_filename,url" + for line in failF: + print(line) + acc, name, moltype, md5sum, download_filename, url = line.strip().split(',') + assert acc == "GCA_000175535.1" + assert name == "GCA_000175535.1 Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14" + assert moltype == "protein" + assert download_filename == "GCA_000175535.1_protein.faa.gz" + assert url == "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_protein.faa.gz" + + +def test_urlsketch_save_fastas(runtmp): + acc_csv = get_test_data('acc-url.csv') + output = runtmp.output('simple.zip') + failed = runtmp.output('failed.csv') + out_dir = runtmp.output('out_fastas') + + + sig1 = get_test_data('GCA_000175535.1.sig.gz') + sig2 = get_test_data('GCA_000961135.2.sig.gz') + sig3 = get_test_data('GCA_000961135.2.protein.sig.gz') + ss1 = sourmash.load_one_signature(sig1, ksize=31) + ss2 = sourmash.load_one_signature(sig2, ksize=31) + # why does this need ksize =30 and not ksize = 10!??? + ss3 = sourmash.load_one_signature(sig3, ksize=30, select_moltype='protein') + + runtmp.sourmash('scripts', 'urlsketch', acc_csv, '-o', output, + '--failed', failed, '-r', '1', '--fastas', out_dir, '--keep-fasta', + '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") + + assert os.path.exists(output) + assert not runtmp.last_result.out # stdout should be empty + fa_files = os.listdir(out_dir) + assert set(fa_files) == set(['GCA_000175535.1_genomic.urlsketch.fna.gz', 'GCA_000961135.2_protein.urlsketch.faa.gz', 'GCA_000961135.2_genomic.urlsketch.fna.gz']) + + idx = sourmash.load_file_as_index(output) + sigs = list(idx.signatures()) + + assert len(sigs) == 3 + for sig in sigs: + if 'GCA_000175535.1' in sig.name: + assert sig.name == ss1.name + assert sig.md5sum() == ss1.md5sum() + elif 'GCA_000961135.2' in sig.name: + assert sig.name == ss2.name + if sig.minhash.moltype == 'DNA': + assert sig.md5sum() == ss2.md5sum() + else: + assert sig.md5sum() == ss3.md5sum() + + +def test_urlsketch_download_only(runtmp, capfd): + acc_csv = get_test_data('acc-url.csv') + output = runtmp.output('simple.zip') + failed = runtmp.output('failed.csv') + out_dir = runtmp.output('out_fastas') + + + sig1 = get_test_data('GCA_000175535.1.sig.gz') + sig2 = get_test_data('GCA_000961135.2.sig.gz') + sig3 = get_test_data('GCA_000961135.2.protein.sig.gz') + ss1 = sourmash.load_one_signature(sig1, ksize=31) + ss2 = sourmash.load_one_signature(sig2, ksize=31) + # why does this need ksize =30 and not ksize = 10!??? + ss3 = sourmash.load_one_signature(sig3, ksize=30, select_moltype='protein') + + runtmp.sourmash('scripts', 'urlsketch', acc_csv, '--download-only', + '--failed', failed, '-r', '1', '--fastas', out_dir, '--keep-fasta', + '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") + + assert not runtmp.last_result.out # stdout should be empty + fa_files = os.listdir(out_dir) + assert set(fa_files) == set(['GCA_000175535.1_genomic.urlsketch.fna.gz', 'GCA_000961135.2_protein.urlsketch.faa.gz', 'GCA_000961135.2_genomic.urlsketch.fna.gz']) + captured = capfd.readouterr() + assert "Failed to send signatures: channel closed" not in captured.err + + +def test_urlsketch_bad_acc(runtmp): + acc_csv = get_test_data('acc-url.csv') + acc_mod = runtmp.output('acc_mod.csv') + with open(acc_csv, 'r') as inF, open(acc_mod, 'w') as outF: + lines = inF.readlines() + for line in lines: + # if this acc exist in line, copy it and write an extra line with an invalid accession + outF.write(line) + print(line) + if "GCA_000175535.1" in line: + mod_line = line.replace('GCA_000175535.1', 'GCA_0001755559.1') # add extra digit - should not be valid + print(mod_line) + outF.write(mod_line) + + output = runtmp.output('simple.zip') + failed = runtmp.output('failed.csv') + + sig1 = get_test_data('GCA_000175535.1.sig.gz') + sig2 = get_test_data('GCA_000961135.2.sig.gz') + sig3 = get_test_data('GCA_000961135.2.protein.sig.gz') + ss1 = sourmash.load_one_signature(sig1, ksize=31) + ss2 = sourmash.load_one_signature(sig2, ksize=31) + # why does this need ksize =30 and not ksize = 10!??? + ss3 = sourmash.load_one_signature(sig3, ksize=30, select_moltype='protein') + + runtmp.sourmash('scripts', 'urlsketch', acc_mod, '-o', output, + '--failed', failed, '-r', '1', + '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") + + assert os.path.exists(output) + assert not runtmp.last_result.out # stdout should be empty + + # Open the failed file + assert os.path.exists(failed) + with open(failed, 'r') as acc_file: + # Read the lines of the file + lines = acc_file.readlines() + # Check if the modified accession exists in the first column of any line + for line in lines: + print(line) + if "GCA_0001755559.1" in line.split(',')[0]: + assert True + break + else: + assert False, "Modified accession not found" + + idx = sourmash.load_file_as_index(output) + sigs = list(idx.signatures()) + + assert len(sigs) == 3 + for sig in sigs: + if 'GCA_000175535.1' in sig.name: + assert sig.name == ss1.name + assert sig.md5sum() == ss1.md5sum() + elif 'GCA_000961135.2' in sig.name: + assert sig.name == ss2.name + if sig.minhash.moltype == 'DNA': + assert sig.md5sum() == ss2.md5sum() + else: + assert sig.md5sum() == ss3.md5sum() + +def test_urlsketch_missing_accfile(runtmp, capfd): + acc_csv = runtmp.output('acc1.csv') + output = runtmp.output('simple.zip') + failed = runtmp.output('failed.csv') + + with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'urlsketch', acc_csv, '-o', output, + '--failed', failed, '-r', '1', + '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") + + captured = capfd.readouterr() + print(captured.err) + assert "Error: No such file or directory" in captured.err + + +def test_urlsketch_empty_accfile(runtmp, capfd): + acc_csv = get_test_data('acc1.csv') + with open(acc_csv, 'w') as file: + file.write('') + output = runtmp.output('simple.zip') + failed = runtmp.output('failed.csv') + + with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'urlsketch', acc_csv, '-o', output, + '--failed', failed, '-r', '1', + '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") + + captured = capfd.readouterr() + print(captured.err) + assert 'Error: Invalid column names in CSV file. Columns should be: ["accession", "name", "moltype", "md5sum", "download_filename", "url"]' in captured.err + + +def test_urlsketch_bad_acc_fail(runtmp, capfd): + acc_csv = get_test_data('acc-url.csv') + acc_mod = runtmp.output('acc_mod.csv') + with open(acc_csv, 'r') as inF, open(acc_mod, 'w') as outF: + lines = inF.readlines() + outF.write(lines[0]) # write the header line + for line in lines: + # if this acc exist in line, copy it and write + if "GCA_000175535.1" in line: + mod_line = line.replace('GCA_000175535.1', 'GCA_0001755559.1') # add extra digit - should not be valid + print(mod_line) + outF.write(mod_line) + + output = runtmp.output('simple.zip') + failed = runtmp.output('failed.csv') + + with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'urlsketch', acc_mod, '-o', output, + '--failed', failed, '-r', '1', + '--param-str', "dna,k=31,scaled=1000") + + captured = capfd.readouterr() + print(captured.out) + print(captured.err) + assert "Error: No signatures written, exiting." in captured.err + + + +def test_urlsketch_missing_output(runtmp): + # no output sig zipfile provided but also not --download-only + acc_csv = runtmp.output('acc1.csv') + output = runtmp.output('simple.zip') + failed = runtmp.output('failed.csv') + + with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'urlsketch', acc_csv, + '--failed', failed, '-r', '1', + '--param-str', "dna,k=31,scaled=1000") + + assert "Error: output signature zipfile is required if not using '--download-only'." in runtmp.last_result.err + + +def test_urlsketch_from_gbsketch_failed(runtmp, capfd): + acc_csv = get_test_data('acc.csv') + output = runtmp.output('simple.zip') + failed = runtmp.output('failed.csv') + + runtmp.sourmash('scripts', 'gbsketch', acc_csv, '-o', output, + '--failed', failed, '-r', '1', + '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") + + assert os.path.exists(failed) + with open(failed, 'r') as failF: + fail_lines = failF.readlines() + assert len(fail_lines) == 2 + assert fail_lines[0] == "accession,name,moltype,md5sum,download_filename,url\n" + acc, name, moltype, md5sum, download_filename, url = fail_lines[1].strip().split(',') + assert acc == "GCA_000175535.1" + assert name == "GCA_000175535.1 Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14" + assert moltype == "protein" + assert download_filename == "GCA_000175535.1_protein.faa.gz" + assert url == "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_protein.faa.gz" + assert not runtmp.last_result.out # stdout should be empty + + out2 = runtmp.output('failed-retry.zip') + fail2 = runtmp.output('fail2.csv') + with pytest.raises(utils.SourmashCommandFailed): + + runtmp.sourmash('scripts', 'urlsketch', failed, '-o', out2, + '--failed', fail2, '-r', '1', + '-p', "protein,k=10,scaled=200") + captured = capfd.readouterr() + print(captured.out) + print(captured.err) + assert "Error: No signatures written, exiting." in captured.err + + # since no protein file exists, fail2 should just be the same as failed + assert os.path.exists(fail2) + with open(fail2, 'r') as failF: + header = next(failF).strip() + assert header == "accession,name,moltype,md5sum,download_filename,url" + for line in failF: + print(line) + acc, name, moltype, md5sum, download_filename, url = line.strip().split(',') + assert acc == "GCA_000175535.1" + assert name == "GCA_000175535.1 Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14" + assert moltype == "protein" + assert download_filename == "GCA_000175535.1_protein.faa.gz" + assert url == "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_protein.faa.gz" + + +# def test_urlsketch_from_urlsketch_failed(runtmp, capfd):