Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add histo for frequency counts #29

Merged
merged 4 commits into from
Sep 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 57 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -186,8 +186,63 @@ impl KmerCountTable {
// Default sort by count
// Option sort on keys

/// Computes the frequency histogram of the counts stored in the table.
///
/// Returns a vector of `(frequency, count)` tuples in ascending order of
/// `frequency`, where `frequency` is a count value held in the table and
/// `count` is the number of table entries holding exactly that value.
///
/// If `zero` is true (the default set by the pyo3 signature), every
/// frequency from 0 up to the largest observed count is listed, with a
/// `count` of 0 for frequencies no entry has; an empty table therefore
/// yields `[(0, 0)]`. If `zero` is false, only observed frequencies are
/// listed and an empty table yields an empty vector.
#[pyo3(signature = (zero=true))]
pub fn histo(&self, zero: bool) -> Vec<(u64, u64)> {
    // Tally how many table entries hold each count value.
    let mut freq_count: HashMap<u64, u64> = HashMap::new();
    for &count in self.counts.values() {
        *freq_count.entry(count).or_insert(0) += 1;
    }

    if zero {
        // Dense output. The largest tally key equals the largest stored
        // count, so the table does not need to be scanned a second time.
        let max_count = freq_count.keys().copied().max().unwrap_or(0);
        (0..=max_count)
            .map(|freq| (freq, freq_count.get(&freq).copied().unwrap_or(0)))
            .collect()
    } else {
        // Sparse output. HashMap iteration order is arbitrary, so sort by
        // frequency; keys are unique, which makes an unstable sort safe.
        let mut histo_vec: Vec<(u64, u64)> = freq_count.into_iter().collect();
        histo_vec.sort_unstable_by_key(|&(frequency, _)| frequency);
        histo_vec
    }
}

/// Smallest count stored in the counts HashMap.
/// An empty HashMap yields 0.
#[getter]
pub fn min(&self) -> u64 {
    // `min()` is `None` only when there are no values, which maps to 0.
    self.counts.values().copied().min().unwrap_or(0)
}

/// Largest count stored in the counts HashMap.
/// An empty HashMap yields 0.
#[getter]
pub fn max(&self) -> u64 {
    // `max()` is `None` only when there are no values, which maps to 0.
    self.counts.values().copied().max().unwrap_or(0)
}

// Getter for the 'hashes' attribute, returning all hash keys in the table
#[getter]
Expand Down
125 changes: 125 additions & 0 deletions src/python/tests/test_histo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
import pytest
import oxli


@pytest.fixture
def kmer_count_table():
    """Provide a freshly constructed KmerCountTable with ksize=4."""
    return oxli.KmerCountTable(ksize=4)


def test_min_empty_table(kmer_count_table):
    """min on an empty KmerCountTable.

    Edge case: with nothing counted yet, min is expected to be 0.
    """
    observed = kmer_count_table.min
    assert observed == 0, "min should return 0 for an empty table"


def test_max_empty_table(kmer_count_table):
    """max on an empty KmerCountTable.

    Edge case: with nothing counted yet, max is expected to be 0.
    """
    observed = kmer_count_table.max
    assert observed == 0, "max should return 0 for an empty table"


def test_min_non_empty_table(kmer_count_table):
    """min on a KmerCountTable that holds two entries with different counts."""
    # "AAAA" and "TTTT" share one canonical hash, so that entry reaches 2.
    for kmer in ("AAAA", "TTTT"):
        kmer_count_table.count(kmer)
    # Consuming "CCCCCC" counts "CCCC" 3 times.
    kmer_count_table.consume("CCCCCC")

    lowest = kmer_count_table.min
    assert lowest == 2, "min should return the minimum count value, in this case 2"


def test_max_non_empty_table(kmer_count_table):
    """max on a KmerCountTable that holds two entries with different counts."""
    # "AAAA" and "TTTT" share one canonical hash (count 2); "CCCC" is a
    # distinct k-mer (count 1).
    for kmer in ("AAAA", "TTTT", "CCCC"):
        kmer_count_table.count(kmer)

    highest = kmer_count_table.max
    assert highest == 2, "max should return the maximum count value, in this case 2"


def test_histo_zero_false_empty_table(kmer_count_table):
    """histo(zero=False) on an empty KmerCountTable.

    Edge case: with nothing counted, the histogram is expected to be an
    empty list.
    """
    histogram = kmer_count_table.histo(zero=False)
    assert histogram == [], "histo() should return an empty list for an empty table"


def test_histo_zero_true_empty_table(kmer_count_table):
    """histo(zero=True) on an empty KmerCountTable.

    Edge case: with nothing counted, the histogram is expected to hold only
    the zero bin, i.e. [(0, 0)].
    """
    expected = [(0, 0)]
    histogram = kmer_count_table.histo(zero=True)
    assert (
        histogram == expected
    ), "histo(zero=True) should return [(0, 0)] for an empty table"


def test_histo_zero_false_non_empty_table(kmer_count_table):
    """
    histo(zero=False) on a non-empty KmerCountTable.
    The histogram is expected to list observed frequencies only.
    """
    # "AAAA" twice plus "TTTT" (same canonical hash) brings one entry to 3;
    # "CCCC" is a distinct k-mer counted once.
    for kmer in ("AAAA", "AAAA", "TTTT", "CCCC"):
        kmer_count_table.count(kmer)

    # One k-mer observed once, one observed three times.
    expected_histo = [(1, 1), (3, 1)]
    histogram = kmer_count_table.histo(zero=False)
    assert (
        histogram == expected_histo
    ), "histo(zero=False) should only return observed frequencies"


def test_histo_zero_true_non_empty_table(kmer_count_table):
    """
    histo(zero=True) on a non-empty KmerCountTable.
    The histogram is expected to list every frequency from 0 up to the
    maximum count, including frequencies that no k-mer has.
    """
    # "AAAA" twice plus "TTTT" (same canonical hash) brings one entry to 3;
    # "CCCC" is a distinct k-mer counted once.
    for kmer in ("AAAA", "AAAA", "TTTT", "CCCC"):
        kmer_count_table.count(kmer)

    expected_histo = [
        (0, 0),  # zero bin
        (1, 1),  # one k-mer observed once
        (2, 0),  # none observed twice
        (3, 1),  # one k-mer observed three times
    ]
    histogram = kmer_count_table.histo(zero=True)
    assert (
        histogram == expected_histo
    ), "histo(zero=True) should include all frequencies up to max"


def test_histo_with_large_max_count(kmer_count_table):
    """Test histo() when a single k-mer has been counted several times.

    Edge case: with zero=True the histogram must list every empty bin
    between 0 and the maximum observed count.
    """
    for _ in range(5):
        kmer_count_table.count("AAAA")  # Add the same k-mer 5 times

    expected_histo = [
        (0, 0),
        (1, 0),
        (2, 0),
        (3, 0),
        (4, 0),
        (5, 1),
    ]  # 1 k-mer with count 5, all lower bins empty
    assert (
        kmer_count_table.histo(zero=True) == expected_histo
    ), "histo() include all zero counts up to max observed count."
Loading