Skip to content

Commit

Permalink
init
Browse files Browse the repository at this point in the history
  • Loading branch information
bluegenes committed May 21, 2024
1 parent c328cc3 commit 1a4e004
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 3 deletions.
8 changes: 6 additions & 2 deletions src/directsketch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -210,9 +210,13 @@ async fn sketch_data(
moltype: String,
) -> Result<Vec<Signature>> {
tokio::task::spawn_blocking(move || {

let cursor = Cursor::new(compressed_data);
let mut fastx_reader =
parse_fastx_reader(cursor).context("Failed to parse FASTA/FASTQ data")?;
// use niffler to get decompressed reader
let (mut reader, compression) = niffler::get_reader(Box::new(cursor))?;
let mut fastx_reader = parse_fastx_reader(&mut reader).context("Failed to parse FASTA/FASTQ data")?;
// let mut fastx_reader =
// parse_fastx_reader(cursor).context("Failed to parse FASTA/FASTQ data")?;

let mut set_name = false;
while let Some(record) = fastx_reader.next() {
Expand Down
2 changes: 2 additions & 0 deletions tests/test-data/acc-url-xz.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
accession,name,moltype,md5sum,download_filename,url
achromobacter_xylosoxidans,achromobacter_xylosoxidans,dna,,achromobacter_xylosoxidans__01.asm.genomic.fna.tar.xz,https://ftp.ebi.ac.uk/pub/databases/AllTheBacteria/Releases/0.2/assembly/achromobacter_xylosoxidans__01.asm.tar.xz
24 changes: 23 additions & 1 deletion tests/test_urlsketch.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,4 +318,26 @@ def test_urlsketch_from_gbsketch_failed(runtmp, capfd):
assert url == "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_protein.faa.gz"


# def test_urlsketch_from_urlsketch_failed(runtmp, capfd):
def test_urlsketch_tarxz(runtmp):
    """Run `urlsketch` on the `acc-url-xz.csv` test input.

    Only checks that the requested output zip exists afterwards and that
    the command wrote nothing to stdout.
    """
    input_csv = get_test_data('acc-url-xz.csv')
    zip_path = runtmp.output('simple.zip')
    failed_csv = runtmp.output('failed.csv')

    # TODO(review): no reference signature is compared here yet; the
    # signature-level checks remain disabled.

    cli_args = [
        'scripts', 'urlsketch', input_csv,
        '-o', zip_path,
        '--failed', failed_csv,
        '-r', '1',
        '--param-str', "dna,k=31,scaled=1000",
        '-p', "protein,k=10,scaled=200",
    ]
    runtmp.sourmash(*cli_args)

    assert os.path.exists(zip_path)
    # stdout should be empty
    assert not runtmp.last_result.out

# idx = sourmash.load_file_as_index(output)
# sigs = list(idx.signatures())

# assert len(sigs) == 1
# for sig in sigs:
# if 'GCA_000175535.1' in sig.name:
# assert sig.name == ss1.name
# assert sig.md5sum() == ss1.md5sum()

0 comments on commit 1a4e004

Please sign in to comment.