Skip to content

Commit

Permalink
add --download-only
Browse files Browse the repository at this point in the history
  • Loading branch information
bluegenes committed Apr 30, 2024
1 parent ed9ec2e commit 6a66169
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 28 deletions.
59 changes: 32 additions & 27 deletions src/directsketch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,7 @@ async fn dl_sketch_accession(
prot_sigs: Vec<Signature>,
genomes_only: bool,
proteomes_only: bool,
download_only: bool,
) -> Result<(Vec<Signature>, Vec<FailedDownload>)> {
let retry_count = retry.unwrap_or(3); // Default retry count
let mut sigs = Vec::<Signature>::new();
Expand Down Expand Up @@ -259,30 +260,33 @@ async fn dl_sketch_accession(
let path = location.join(&file_name);
fs::write(&path, &data).context("Failed to write data to file")?;
}
match file_type {
GenBankFileType::Genomic => sigs.extend(
sketch_data(
name.as_str(),
file_name.as_str(),
data,
dna_sigs.clone(),
"dna",
)
.await?,
),
GenBankFileType::Protein => {
sigs.extend(
if !download_only {
// sketch data
match file_type {
GenBankFileType::Genomic => sigs.extend(
sketch_data(
name.as_str(),
file_name.as_str(),
data,
prot_sigs.clone(),
"protein",
dna_sigs.clone(),
"dna",
)
.await?,
);
}
_ => {} // Do nothing for other file types
),
GenBankFileType::Protein => {
sigs.extend(
sketch_data(
name.as_str(),
file_name.as_str(),
data,
prot_sigs.clone(),
"protein",
)
.await?,
);
}
_ => {} // Do nothing for other file types
};
}
}

Expand Down Expand Up @@ -345,6 +349,7 @@ pub async fn download_and_sketch(
keep_fastas: bool,
genomes_only: bool,
proteomes_only: bool,
download_only: bool,
) -> Result<(), anyhow::Error> {
let download_path = PathBuf::from(fasta_location);
if !download_path.exists() {
Expand All @@ -358,6 +363,13 @@ pub async fn download_and_sketch(
{
bail!("Output must be a zip file.");
}
// start zip file; set up trackers
let outpath: PathBuf = output_sigs.into();
let mut file = File::create(outpath).await?;
let mut zipF = ZipFileWriter::new(&mut file);

let mut manifest_rows: Vec<Record> = Vec::new();
let mut md5sum_occurrences: HashMap<String, usize> = HashMap::new();

// Open the file containing the accessions synchronously
let (accession_info, n_accs) = load_accession_info(input_csv)?;
Expand All @@ -377,14 +389,6 @@ pub async fn download_and_sketch(
let dna_sig_templates = build_siginfo(&params_vec, "DNA");
let prot_sig_templates = build_siginfo(&params_vec, "protein");

// start zip file; set up trackers
let outpath: PathBuf = output_sigs.into();
let mut file = File::create(outpath).await?;
let mut zipF = ZipFileWriter::new(&mut file);

let mut manifest_rows: Vec<Record> = Vec::new();
let mut md5sum_occurrences: HashMap<String, usize> = HashMap::new();

// failures
let file = std::fs::File::create(failed_csv)?;
let mut failed_writer = csv::Writer::from_writer(file);
Expand Down Expand Up @@ -421,6 +425,7 @@ pub async fn download_and_sketch(
prot_sig_templates.clone(),
genomes_only,
proteomes_only,
download_only,
)
.await;

Expand All @@ -443,7 +448,7 @@ pub async fn download_and_sketch(
}
}
// if no signatures were written, bail so user knows something went wrong
if !wrote_sigs {
if !wrote_sigs && !download_only {
bail!("No signatures written.")
}

Expand Down
2 changes: 2 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ fn do_gbsketch(
keep_fastas: bool,
genomes_only: bool,
proteomes_only: bool,
download_only: bool,
) -> anyhow::Result<u8> {
// let runtime = tokio::runtime::Runtime::new().unwrap();

Expand All @@ -52,6 +53,7 @@ fn do_gbsketch(
keep_fastas,
genomes_only,
proteomes_only,
download_only,
) {
Ok(_) => Ok(0),
Err(e) => {
Expand Down
11 changes: 10 additions & 1 deletion src/python/sourmash_plugin_directsketch/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#! /usr/bin/env python
import os
import sys
from sourmash.logging import notify
from sourmash.plugins import CommandLinePlugin
import importlib.metadata
Expand Down Expand Up @@ -44,6 +45,7 @@ def __init__(self, p):
help='Write fastas here', default = '.')
p.add_argument('-k', '--keep-fastas', action='store_true',
help="write FASTA files in addition to sketching. Default: do not write FASTA files")
p.add_argument('--download-only', help='just download genomes; do not sketch', action='store_true')
p.add_argument('--failed',help='csv of failed accessions and download links (should be mostly protein).')
p.add_argument('-p', '--param-string', action='append', type=str, default=[],
help='parameter string for sketching (default: k=31,scaled=1000)')
Expand All @@ -55,12 +57,18 @@ def __init__(self, p):
group.add_argument('-g', '--genomes-only', action='store_true', help='just download and sketch genome (DNA) files')
group.add_argument('-m', '--proteomes-only', action='store_true', help='just download and sketch proteome (protein) files')


def main(self, args):
print_version()
if not args.param_string:
args.param_string = ["k=31,scaled=1000"]
notify(f"params: {args.param_string}")

if args.download_only and not args.keep_fastas:
notify("Error: '--download-only' requires '--keep-fastas'.")
sys.exit(-1)


# convert to a single string for easier rust handling
args.param_string = "_".join(args.param_string)
# lowercase the param string
Expand All @@ -79,7 +87,8 @@ def main(self, args):
args.fastas,
args.keep_fastas,
args.genomes_only,
args.proteomes_only)
args.proteomes_only,
args.download_only)

if status == 0:
notify(f"...gbsketch is done! Sigs in '{args.output}'. Fastas in '{args.fastas}'.")
Expand Down
25 changes: 25 additions & 0 deletions tests/test_gbsketch.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,31 @@ def test_gbsketch_save_fastas(runtmp):
else:
assert sig.md5sum() == ss3.md5sum()

def test_gbsketch_download_only(runtmp):
    """Check that `--download-only` saves the FASTA files without sketching.

    Runs `gbsketch` on the test accession CSV with `--download-only` and
    `--keep-fastas`, then verifies that stdout is empty and that exactly the
    expected genomic/protein FASTA files were written to the `--fastas`
    directory. The output zip path is still expected to exist.
    """
    acc_csv = get_test_data('acc.csv')
    output = runtmp.output('simple.zip')
    failed = runtmp.output('failed.csv')
    out_dir = runtmp.output('out_fastas')

    # No reference signatures are loaded here: this test never compares
    # sketches, it only inspects the downloaded files.
    runtmp.sourmash('scripts', 'gbsketch', acc_csv, '-o', output, '--download-only',
                    '--failed', failed, '-r', '1', '--fastas', out_dir, '--keep-fastas',
                    '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200")

    assert os.path.exists(output) # would be better if this didn't exist
    assert not runtmp.last_result.out # stdout should be empty
    fa_files = os.listdir(out_dir)
    assert set(fa_files) == {
        'GCA_000175555.1_genomic.fna.gz',
        'GCA_000961135.2_protein.faa.gz',
        'GCA_000961135.2_genomic.fna.gz',
    }


def test_gbsketch_bad_acc(runtmp):
acc_csv = get_test_data('acc.csv')
acc_mod = runtmp.output('acc_mod.csv')
Expand Down

0 comments on commit 6a66169

Please sign in to comment.