diff --git a/src/utils.rs b/src/utils.rs index 6bb026c..709a9cb 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -127,13 +127,13 @@ pub fn load_gbassembly_info(input_csv: String) -> Result<(Vec, u .get(1) .ok_or_else(|| anyhow!("Missing 'name' field"))? .to_string(); - // optionally get url - let url = record.get(3).and_then(|s| { + let url = record.get(2).and_then(|s| { if s.is_empty() { None } else { - reqwest::Url::parse(s).map_err(|_| ()).ok() + let trimmed_s = s.trim_end_matches("/"); + reqwest::Url::parse(trimmed_s).map_err(|_| ()).ok() } }); diff --git a/tests/test_gbsketch.py b/tests/test_gbsketch.py index aa865a2..0f716a5 100644 --- a/tests/test_gbsketch.py +++ b/tests/test_gbsketch.py @@ -56,6 +56,43 @@ def test_gbsketch_simple(runtmp): else: assert sig.md5sum() == ss3.md5sum() + +def test_gbsketch_simple_url(runtmp): + acc_csv = get_test_data('acc-with-url.csv') + output = runtmp.output('simple.zip') + failed = runtmp.output('failed.csv') + + sig1 = get_test_data('GCA_000175535.1.sig.gz') + sig2 = get_test_data('GCA_000961135.2.sig.gz') + sig3 = get_test_data('GCA_000961135.2.protein.sig.gz') + ss1 = sourmash.load_one_signature(sig1, ksize=31) + ss2 = sourmash.load_one_signature(sig2, ksize=31) + # NOTE(review): the protein reference sketch is loaded with ksize=30 although it is sketched with protein k=10 below; presumably the loader selects protein sketches by the nucleotide-equivalent k (3 * protein k) -- TODO confirm against the sourmash API docs
+ ss3 = sourmash.load_one_signature(sig3, ksize=30, select_moltype='protein') + + runtmp.sourmash('scripts', 'gbsketch', acc_csv, '-o', output, + '--failed', failed, '-r', '1', + '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") + + assert os.path.exists(output) + assert not runtmp.last_result.out # stdout should be empty + + idx = sourmash.load_file_as_index(output) + sigs = list(idx.signatures()) + + assert len(sigs) == 3 + for sig in sigs: + if 'GCA_000175535.1' in sig.name: + assert sig.name == ss1.name + assert sig.md5sum() == ss1.md5sum() + elif 'GCA_000961135.2' in sig.name: + assert sig.name == ss2.name + if sig.minhash.moltype == 'DNA': + assert sig.md5sum() == ss2.md5sum() + else: + assert sig.md5sum() == ss3.md5sum() + + def test_gbsketch_genomes_only(runtmp): acc_csv = get_test_data('acc.csv') output = runtmp.output('simple.zip')