From bcadb0b4bf716426a0c2251c605ce620d772e795 Mon Sep 17 00:00:00 2001 From: jeff-k Date: Sun, 12 May 2024 18:30:57 +0100 Subject: [PATCH] cargo release with first fasta/fastq capabilities --- README.md | 33 ++++++++++++++++++++--- src/lib.rs | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 107 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 5a6dcfd..d287c2b 100644 --- a/README.md +++ b/README.md @@ -9,8 +9,7 @@ #### This crate is in early development. Contributions are very welcome. -Webassembly example: (https://jeff-k.github.io/fqdemo/)[Remove non M. TB reads from streaming fastqs], (https://jeff-k.github.io/amplicon-tiling/)[amplicon bases SARS-CoV-2 assembly] - +Webassembly examples: [Remove non M. TB reads from streaming fastqs](https://jeff-k.github.io/fqdemo/), [amplicon based SARS-CoV-2 assembly](https://jeff-k.github.io/amplicon-tiling/) ## Features @@ -28,6 +27,7 @@ Records can be read into custom types: `pub struct Fastq` sequences @@ -51,7 +51,6 @@ for zipped in fq1.zip(fq2) { // check that the description fields are equal up to the last character if r1.fields[..r1.fields.len() - 1] != r2.fields[..r2.fields.len() - 1] { eprintln!("reads do not have the same names"); - exit(1); } } _ => { @@ -68,6 +67,34 @@ $ cargo build --example fqcheck --release $ target/release/examples/fqcheck r1.fq.gz r2.fq.gz ``` +### Count amino acid k-mers + +```rust +// this opens a gzipped data stream and parses it into `Records` with `Seq` sequence fields +let faa: Fasta, Seq> = + Fasta::new(BufReader::new(File::open(&faa_file).unwrap())); + +// we can convert amino acid k-mers directly into usizes and use them to index into a table +let mut histogram = Box::new([0u64; 1 << (K * Amino::BITS as usize)]); + +for contig in faa { + // here "contig" is a fasta record + for kmer in contig.unwrap().seq.kmers::() { + histogram[usize::from(kmer)] += 1; + } +} +``` + + +To run the `aminokmers` example program with fasta file `proteins.faa`: + +``` +$ cargo build --example fqcheck --release +$ target/release/examples/aminokmers proteins.faa +``` + +## Roadmap + input streams: * fastq diff --git a/src/lib.rs b/src/lib.rs index ede760d..973f5e0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,80 @@ +//! # bio-steams +//! +//! ### Types and datastructures for streaming genomics data +//! +//! #### This crate is in early development. Contributions are very welcome. +//! +//! Webassembly examples: [Remove non M. TB reads from streaming fastqs](https://jeff-k.github.io/fqdemo/), [amplicon based SARS-CoV-2 assembly](https://jeff-k.github.io/amplicon-tiling/) +//! +//! ## Features +//! +//! Shared `Record` type by `Fastq` and `Fasta` streams: +//! +//! ``` +//! use bio_streams::record::Phred; +//! +//! pub struct Record TryFrom<&'a [u8]> = Vec> { +//! pub fields: Vec, +//! pub seq: T, +//! pub quality: Option>, // fasta records set quality to `None`//! +//! } +//! ``` +//! +//! Records can be read into custom types: `pub struct Fastq>` +//! +//! ## Examples +//! +//! ### Stream a pair of fastqs and check some conditions on their name fields +//! ```text +//! // Open a pair of gzipped fastq files as streams of `Record`s with `Seq` sequences +//! +//! let fq1: Fastq>> = Fastq::new(BufReader::new( +//! MultiGzDecoder::new(File::open(&file1).unwrap()), +//! )); +//! +//! let fq2: Fastq>> = Fastq::new(BufReader::new( +//! MultiGzDecoder::new(File::open(&file2).unwrap()), +//! )); +//! +//! for zipped in fq1.zip(fq2) { +//! match zipped { +//! (Ok(r1), Ok(r2)) => { +//! // check that the last characters of the name strings are 1 and 2 +//! if r1.fields[r1.fields.len() - 1] != b'1' || r2.fields[r2.fields.len() - 1] != b'2' +//! { +//! eprintln!("paired records do not end in 1/2"); +//! } +//! +//! // check that the description fields are equal up to the last character +//! if r1.fields[..r1.fields.len() - 1] != r2.fields[..r2.fields.len() - 1] { +//! eprintln!("reads do not have the same names"); +//! } +//! } +//! _ => { +//! eprintln!("Parse error in fastq files"); +//! } +//! } +//! } +//! ``` +//! +//! ### Count amino acid k-mers +//! +//! ```text +//! // this opens a gzipped data stream and parses it into `Records` with `Seq` sequence fields +//! let faa: Fasta, Seq> = +//! Fasta::new(BufReader::new(File::open(&faa_file).unwrap())); +//! +//! // we can convert amino acid k-mers directly into usizes and use them to index into a table +//! let mut histogram = Box::new([0u64; 1 << (K * Amino::BITS as usize)]); +//! +//! for contig in faa { +//! // here "contig" is a fasta record +//! for kmer in contig.unwrap().seq.kmers::() { +//! histogram[usize::from(kmer)] += 1; +//! } +//! } +//! ``` +//! extern crate bio_seq; pub mod fasta;