From bcadb0b4bf716426a0c2251c605ce620d772e795 Mon Sep 17 00:00:00 2001
From: jeff-k <jeff_k@fastmail.com>
Date: Sun, 12 May 2024 18:30:57 +0100
Subject: [PATCH] cargo release with first fasta/fastq capabilities

---
 README.md  | 33 ++++++++++++++++++++---
 src/lib.rs | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 107 insertions(+), 3 deletions(-)
diff --git a/README.md b/README.md
index 5a6dcfd..d287c2b 100644
--- a/README.md
+++ b/README.md
@@ -9,8 +9,7 @@
 
 #### This crate is in early development. Contributions are very welcome.
 
-Webassembly example: (https://jeff-k.github.io/fqdemo/)[Remove non M. TB reads from streaming fastqs], (https://jeff-k.github.io/amplicon-tiling/)[amplicon bases SARS-CoV-2 assembly]
-</div>
+Webassembly examples: [Remove non M. TB reads from streaming fastqs](https://jeff-k.github.io/fqdemo/), [amplicon based SARS-CoV-2 assembly](https://jeff-k.github.io/amplicon-tiling/)</div>
 
 ## Features
 
@@ -28,6 +27,7 @@ Records can be read into custom types: `pub struct Fastq<R: BufRead, T = Seq<Dna
 
 ## Examples
 
+### Stream a pair of fastqs and check some conditions on their name fields
 ```rust
 // Open a pair of gzipped fastq files as streams of `Record`s with `Seq<Dna>` sequences
 
@@ -51,7 +51,6 @@ for zipped in fq1.zip(fq2) {
             // check that the description fields are equal up to the last character
             if r1.fields[..r1.fields.len() - 1] != r2.fields[..r2.fields.len() - 1] {
                 eprintln!("reads do not have the same names");
-                exit(1);
             }
         }
         _ => {
@@ -68,6 +67,34 @@ $ cargo build --example fqcheck --release
 $ target/release/examples/fqcheck r1.fq.gz r2.fq.gz
 ```
 
+### Count amino acid k-mers
+
+```rust
+// this opens a gzipped data stream and parses it into `Records` with `Seq<Amino>` sequence fields
+let faa: Fasta<BufReader<File>, Seq<Amino>> =
+    Fasta::new(BufReader::new(File::open(&faa_file).unwrap()));
+
+// we can convert amino acid k-mers directly into usizes and use them to index into a table
+let mut histogram = Box::new([0u64; 1 << (K * Amino::BITS as usize)]);
+
+for contig in faa {
+    // here "contig" is a fasta record
+    for kmer in contig.unwrap().seq.kmers::<K>() {
+        histogram[usize::from(kmer)] += 1;
+    }
+}
+```
+
+
+To run the `aminokmers` example program with fasta file `proteins.faa`:
+
+```
+$ cargo build --example fqcheck --release
+$ target/release/examples/aminokmers proteins.faa
+```
+
+## Roadmap
+
 input streams:
 
 * fastq
diff --git a/src/lib.rs b/src/lib.rs
index ede760d..973f5e0 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,3 +1,80 @@
+//! # bio-steams
+//!
+//! ### Types and datastructures for streaming genomics data
+//!
+//! #### This crate is in early development. Contributions are very welcome.
+//!
+//! Webassembly examples: [Remove non M. TB reads from streaming fastqs](https://jeff-k.github.io/fqdemo/), [amplicon based SARS-CoV-2 assembly](https://jeff-k.github.io/amplicon-tiling/)
+//!
+//! ## Features
+//!
+//! Shared `Record` type by `Fastq` and `Fasta` streams:
+//!
+//! ```
+//! use bio_streams::record::Phred;
+//!
+//! pub struct Record<T: for<'a> TryFrom<&'a [u8]> = Vec<u8>> {
+//!    pub fields: Vec<u8>,
+//!    pub seq: T,
+//!    pub quality: Option<Vec<Phred>>, // fasta records set quality to `None`//!
+//! }
+//! ```
+//!
+//! Records can be read into custom types: `pub struct Fastq<R: BufRead, T = Seq<Dna>>`
+//!
+//! ## Examples
+//!
+//! ### Stream a pair of fastqs and check some conditions on their name fields
+//! ```text
+//! // Open a pair of gzipped fastq files as streams of `Record`s with `Seq<Dna>` sequences
+//!
+//! let fq1: Fastq<BufReader<MultiGzDecoder<File>>> = Fastq::new(BufReader::new(
+//!    MultiGzDecoder::new(File::open(&file1).unwrap()),
+//! ));
+//!
+//! let fq2: Fastq<BufReader<MultiGzDecoder<File>>> = Fastq::new(BufReader::new(
+//!    MultiGzDecoder::new(File::open(&file2).unwrap()),
+//! ));
+//!
+//! for zipped in fq1.zip(fq2) {
+//!     match zipped {
+//!         (Ok(r1), Ok(r2)) => {
+//!            // check that the last characters of the name strings are 1 and 2
+//!            if r1.fields[r1.fields.len() - 1] != b'1' || r2.fields[r2.fields.len() - 1] != b'2'
+//!            {
+//!                eprintln!("paired records do not end in 1/2");
+//!            }
+//!
+//!            // check that the description fields are equal up to the last character
+//!            if r1.fields[..r1.fields.len() - 1] != r2.fields[..r2.fields.len() - 1] {
+//!                eprintln!("reads do not have the same names");
+//!            }
+//!         }
+//!         _ => {
+//!             eprintln!("Parse error in fastq files");
+//!         }
+//!     }
+//! }
+//! ```
+//!
+//! ### Count amino acid k-mers
+//!
+//! ```text
+//! // this opens a gzipped data stream and parses it into `Records` with `Seq<Amino>` sequence fields
+//! let faa: Fasta<BufReader<File>, Seq<Amino>> =
+//!     Fasta::new(BufReader::new(File::open(&faa_file).unwrap()));
+//!
+//! // we can convert amino acid k-mers directly into usizes and use them to index into a table
+//! let mut histogram = Box::new([0u64; 1 << (K * Amino::BITS as usize)]);
+//!
+//! for contig in faa {
+//!    // here "contig" is a fasta record
+//!    for kmer in contig.unwrap().seq.kmers::<K>() {
+//!        histogram[usize::from(kmer)] += 1;
+//!    }
+//! }
+//! ```
+//!
 extern crate bio_seq;
 
 pub mod fasta;