From 8675842541cde4a8b52e328ce9706b939551b7be Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward"
Date: Tue, 21 May 2024 07:36:47 +0100
Subject: [PATCH] init

---
Review notes (ignored by `git am`):
  * sketch_data: the niffler reader is now handed directly to needletail.
    The previous revision wrapped it in an `Arc` and then took `&mut` of the
    immutable binding, which cannot compile and does not implement `Read`.
  * NOTE(review): the new test URL ends in `.tar.xz`; xz decompression alone
    yields a tar stream, not bare FASTA -- confirm the parser copes with it.
  * NOTE(review): the post-image blob id on the src/directsketch.rs `index`
    line was carried over from the previous revision and not recomputed.

 src/directsketch.rs            |  6 +++++-
 tests/test-data/acc-url-xz.csv |  2 ++
 tests/test_urlsketch.py        | 24 +++++++++++++++++++++++-
 3 files changed, 30 insertions(+), 2 deletions(-)
 create mode 100644 tests/test-data/acc-url-xz.csv

diff --git a/src/directsketch.rs b/src/directsketch.rs
index 597f799..7c35b40 100644
--- a/src/directsketch.rs
+++ b/src/directsketch.rs
@@ -210,9 +210,13 @@ async fn sketch_data(
     moltype: String,
 ) -> Result> {
     tokio::task::spawn_blocking(move || {
         let cursor = Cursor::new(compressed_data);
+        // Let niffler sniff the compression format and return a decompressing
+        // reader. The `send` variant is used because the FASTX parser needs a
+        // reader that is `Send`.
+        let (reader, _format) = niffler::send::get_reader(Box::new(cursor))
+            .context("Failed to detect compression format")?;
         let mut fastx_reader =
-            parse_fastx_reader(cursor).context("Failed to parse FASTA/FASTQ data")?;
+            parse_fastx_reader(reader).context("Failed to parse FASTA/FASTQ data")?;
 
         let mut set_name = false;
         while let Some(record) = fastx_reader.next() {
diff --git a/tests/test-data/acc-url-xz.csv b/tests/test-data/acc-url-xz.csv
new file mode 100644
index 0000000..38fb5ba
--- /dev/null
+++ b/tests/test-data/acc-url-xz.csv
@@ -0,0 +1,2 @@
+accession,name,moltype,md5sum,download_filename,url
+achromobacter_xylosoxidans,achromobacter_xylosoxidans,dna,,achromobacter_xylosoxidans__01.asm.genomic.fna.tar.xz,https://ftp.ebi.ac.uk/pub/databases/AllTheBacteria/Releases/0.2/assembly/achromobacter_xylosoxidans__01.asm.tar.xz
diff --git a/tests/test_urlsketch.py b/tests/test_urlsketch.py
index 64d1f1f..177f855 100644
--- a/tests/test_urlsketch.py
+++ b/tests/test_urlsketch.py
@@ -318,4 +318,26 @@ def test_urlsketch_from_gbsketch_failed(runtmp, capfd):
             assert url == "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_protein.faa.gz"
 
 
-# def test_urlsketch_from_urlsketch_failed(runtmp, capfd):
+def test_urlsketch_tarxz(runtmp):
+    acc_csv = get_test_data('acc-url-xz.csv')
+    output = runtmp.output('simple.zip')
+    failed = runtmp.output('failed.csv')
+
+    # sig3 = get_test_data('GCA_000961135.2.protein.sig.gz')
+    # ss1 = sourmash.load_one_signature(sig1, ksize=31)
+
+    runtmp.sourmash('scripts', 'urlsketch', acc_csv, '-o', output,
+                    '--failed', failed, '-r', '1',
+                    '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200")
+
+    assert os.path.exists(output)
+    assert not runtmp.last_result.out # stdout should be empty
+
+    # idx = sourmash.load_file_as_index(output)
+    # sigs = list(idx.signatures())
+
+    # assert len(sigs) == 1
+    # for sig in sigs:
+    #     if 'GCA_000175535.1' in sig.name:
+    #         assert sig.name == ss1.name
+    #         assert sig.md5sum() == ss1.md5sum()