diff --git a/src/directsketch.rs b/src/directsketch.rs index c994cad..860fc4a 100644 --- a/src/directsketch.rs +++ b/src/directsketch.rs @@ -194,6 +194,7 @@ async fn dl_sketch_accession( prot_sigs: Vec, genomes_only: bool, proteomes_only: bool, + download_only: bool, ) -> Result<(Vec, Vec)> { let retry_count = retry.unwrap_or(3); // Default retry count let mut sigs = Vec::::new(); @@ -259,30 +260,33 @@ async fn dl_sketch_accession( let path = location.join(&file_name); fs::write(&path, &data).context("Failed to write data to file")?; } - match file_type { - GenBankFileType::Genomic => sigs.extend( - sketch_data( - name.as_str(), - file_name.as_str(), - data, - dna_sigs.clone(), - "dna", - ) - .await?, - ), - GenBankFileType::Protein => { - sigs.extend( + if !download_only { + // sketch data + match file_type { + GenBankFileType::Genomic => sigs.extend( sketch_data( name.as_str(), file_name.as_str(), data, - prot_sigs.clone(), - "protein", + dna_sigs.clone(), + "dna", ) .await?, - ); - } - _ => {} // Do nothing for other file types + ), + GenBankFileType::Protein => { + sigs.extend( + sketch_data( + name.as_str(), + file_name.as_str(), + data, + prot_sigs.clone(), + "protein", + ) + .await?, + ); + } + _ => {} // Do nothing for other file types + }; } } @@ -345,6 +349,7 @@ pub async fn download_and_sketch( keep_fastas: bool, genomes_only: bool, proteomes_only: bool, + download_only: bool, ) -> Result<(), anyhow::Error> { let download_path = PathBuf::from(fasta_location); if !download_path.exists() { @@ -358,6 +363,13 @@ pub async fn download_and_sketch( { bail!("Output must be a zip file."); } + // start zip file; set up trackers + let outpath: PathBuf = output_sigs.into(); + let mut file = File::create(outpath).await?; + let mut zipF = ZipFileWriter::new(&mut file); + + let mut manifest_rows: Vec = Vec::new(); + let mut md5sum_occurrences: HashMap = HashMap::new(); // Open the file containing the accessions synchronously let (accession_info, n_accs) = load_accession_info(input_csv)?; @@ -377,14 +389,6 @@ pub async fn download_and_sketch( let dna_sig_templates = build_siginfo(¶ms_vec, "DNA"); let prot_sig_templates = build_siginfo(¶ms_vec, "protein"); - // start zip file; set up trackers - let outpath: PathBuf = output_sigs.into(); - let mut file = File::create(outpath).await?; - let mut zipF = ZipFileWriter::new(&mut file); - - let mut manifest_rows: Vec = Vec::new(); - let mut md5sum_occurrences: HashMap = HashMap::new(); - // failures let file = std::fs::File::create(failed_csv)?; let mut failed_writer = csv::Writer::from_writer(file); @@ -421,6 +425,7 @@ pub async fn download_and_sketch( prot_sig_templates.clone(), genomes_only, proteomes_only, + download_only, ) .await; @@ -443,7 +448,7 @@ pub async fn download_and_sketch( } } // if no signatures were written, bail so user knows something went wrong - if !wrote_sigs { + if !wrote_sigs && !download_only { bail!("No signatures written.") } diff --git a/src/lib.rs b/src/lib.rs index 7318c6d..af66c96 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -35,6 +35,7 @@ fn do_gbsketch( keep_fastas: bool, genomes_only: bool, proteomes_only: bool, + download_only: bool, ) -> anyhow::Result { // let runtime = tokio::runtime::Runtime::new().unwrap(); @@ -52,6 +53,7 @@ fn do_gbsketch( keep_fastas, genomes_only, proteomes_only, + download_only, ) { Ok(_) => Ok(0), Err(e) => { diff --git a/src/python/sourmash_plugin_directsketch/__init__.py b/src/python/sourmash_plugin_directsketch/__init__.py index 6ee00ba..57b4389 100644 --- a/src/python/sourmash_plugin_directsketch/__init__.py +++ b/src/python/sourmash_plugin_directsketch/__init__.py @@ -1,5 +1,6 @@ #! /usr/bin/env python import os +import sys from sourmash.logging import notify from sourmash.plugins import CommandLinePlugin import importlib.metadata @@ -44,6 +45,7 @@ def __init__(self, p): help='Write fastas here', default = '.') p.add_argument('-k', '--keep-fastas', action='store_true', help="write FASTA files in addition to sketching. Default: do not write FASTA files") + p.add_argument('--download-only', help='just download genomes; do not sketch', action='store_true') p.add_argument('--failed',help='csv of failed accessions and download links (should be mostly protein).') p.add_argument('-p', '--param-string', action='append', type=str, default=[], help='parameter string for sketching (default: k=31,scaled=1000)') @@ -55,12 +57,18 @@ def __init__(self, p): group.add_argument('-g', '--genomes-only', action='store_true', help='just download and sketch genome (DNA) files') group.add_argument('-m', '--proteomes-only', action='store_true', help='just download and sketch proteome (protein) files') + def main(self, args): print_version() if not args.param_string: args.param_string = ["k=31,scaled=1000"] notify(f"params: {args.param_string}") + if args.download_only and not args.keep_fastas: + notify("Error: '--download-only' requires '--keep-fastas'.") + sys.exit(-1) + + # convert to a single string for easier rust handling args.param_string = "_".join(args.param_string) # lowercase the param string @@ -79,7 +87,8 @@ def main(self, args): args.fastas, args.keep_fastas, args.genomes_only, - args.proteomes_only) + args.proteomes_only, + args.download_only) if status == 0: notify(f"...gbsketch is done! Sigs in '{args.output}'. Fastas in '{args.fastas}'.") diff --git a/tests/test_gbsketch.py b/tests/test_gbsketch.py index dcd624f..5559d34 100644 --- a/tests/test_gbsketch.py +++ b/tests/test_gbsketch.py @@ -150,6 +150,31 @@ def test_gbsketch_save_fastas(runtmp): else: assert sig.md5sum() == ss3.md5sum() +def test_gbsketch_download_only(runtmp): + acc_csv = get_test_data('acc.csv') + output = runtmp.output('simple.zip') + failed = runtmp.output('failed.csv') + out_dir = runtmp.output('out_fastas') + + + sig1 = get_test_data('GCA_000175555.1.sig.gz') + sig2 = get_test_data('GCA_000961135.2.sig.gz') + sig3 = get_test_data('GCA_000961135.2.protein.sig.gz') + ss1 = sourmash.load_one_signature(sig1, ksize=31) + ss2 = sourmash.load_one_signature(sig2, ksize=31) + # why does this need ksize =30 and not ksize = 10!??? + ss3 = sourmash.load_one_signature(sig3, ksize=30, select_moltype='protein') + + runtmp.sourmash('scripts', 'gbsketch', acc_csv, '-o', output, '--download-only', + '--failed', failed, '-r', '1', '--fastas', out_dir, '--keep-fastas', + '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") + + assert os.path.exists(output) # would be better if this didn't exist + assert not runtmp.last_result.out # stdout should be empty + fa_files = os.listdir(out_dir) + assert set(fa_files) == set(['GCA_000175555.1_genomic.fna.gz', 'GCA_000961135.2_protein.faa.gz', 'GCA_000961135.2_genomic.fna.gz']) + + def test_gbsketch_bad_acc(runtmp): acc_csv = get_test_data('acc.csv') acc_mod = runtmp.output('acc_mod.csv')