Skip to content

Commit

Permalink
Add cosine() tests
Browse files Browse the repository at this point in the history
  • Loading branch information
Adamtaranto committed Sep 22, 2024
1 parent 13555a6 commit da1c480
Showing 1 changed file with 162 additions and 0 deletions.
162 changes: 162 additions & 0 deletions src/python/tests/test_metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
from math import isclose
from oxli import KmerCountTable
from scipy.spatial.distance import cosine
import numpy as np
import pytest


def test_cosine_similarity_identical_tables():
"""
Test cosine similarity for two identical KmerCountTable objects.
The cosine similarity should be 1.0 because the vectors representing
the k-mer counts are exactly the same, meaning the angle between them
is 0 degrees (cos(0) = 1).
"""
kct1 = KmerCountTable(ksize=4)
kct2 = KmerCountTable(ksize=4)

# Manually set k-mer counts
kct1["AAAA"] = 5
kct1["AATT"] = 3
kct1["GGGG"] = 1
kct1["CCAA"] = 4
kct1["ATTG"] = 6

# Copy the exact same counts to kct2
kct2["AAAA"] = 5
kct2["AATT"] = 3
kct2["GGGG"] = 1
kct2["CCAA"] = 4
kct2["ATTG"] = 6

# Cosine similarity between identical tables should be 1.0
# Allow value within 0.001%
assert isclose(kct1.cosine(kct2), 1.0, rel_tol=1e-5)

# Using scipy to calculate the expected value
vector1 = [5, 3, 1, 4, 6]
vector2 = [5, 3, 1, 4, 6]
expected_cosine_sim = 1 - cosine(vector1, vector2)

# Allow value within 0.001%
assert isclose(kct1.cosine(kct2), expected_cosine_sim, rel_tol=1e-5)


def test_cosine_similarity_different_tables():
"""
Test cosine similarity for two different KmerCountTable objects.
The cosine similarity will be less than 1.0 since the vectors
are not identical. We will calculate the expected cosine
similarity using scipy.
"""
kct1 = KmerCountTable(ksize=4)
kct2 = KmerCountTable(ksize=4)

# Manually set k-mer counts for kct1
kct1["AAAA"] = 4
kct1["AATT"] = 3
kct1["GGGG"] = 1
kct1["CCAA"] = 4
kct1["ATTG"] = 6

# Manually set different counts for kct2
kct2["AAAA"] = 5
kct2["AATT"] = 3
kct2["GGGG"] = 1
kct2["CCAA"] = 4
kct2["ATTG"] = 0

# Using scipy to calculate the expected value
vector1 = [4, 3, 1, 4, 6]
vector2 = [5, 3, 1, 4, 0]
expected_cosine_sim = 1 - cosine(vector1, vector2)

# Allow value within 0.001%
assert isclose(kct1.cosine(kct2), expected_cosine_sim, rel_tol=1e-5)


def test_cosine_similarity_empty_table():
"""
Test cosine similarity for two KmerCountTable objects where one is empty.
The cosine similarity should be 0.0 because the dot product with an
empty table will result in zero, making the numerator of the cosine
similarity formula zero.
"""
kct1 = KmerCountTable(ksize=4)
kct2 = KmerCountTable(ksize=4)

# Set counts for kct1
kct1["AAAA"] = 5
kct1["TTTG"] = 10

# Leave kct2 empty

# Cosine similarity should be 0 since one table is empty
assert kct1.cosine(kct2) == 0.0

# Set kct2 with 1 non-overlapping kmer
kct2["ATTG"] = 1

# Cosine similarity should be 0 since no shared kmers
assert kct1.cosine(kct2) == 0.0

# Using scipy for comparison
vector1 = [5, 10, 0]
vector2 = [0, 0, 1] # Representing the empty table with no overlap
expected_cosine_sim = 1 - cosine(vector1, vector2)

assert isclose(kct1.cosine(kct2), expected_cosine_sim, rel_tol=1e-5)


def test_cosine_similarity_both_empty():
"""
Test cosine similarity for two empty KmerCountTable objects.
"""
# Both tables are empty
kct1 = KmerCountTable(ksize=4)
kct2 = KmerCountTable(ksize=4)

# Cosine similarity should be 0.0 for two empty tables
assert kct1.cosine(kct2) == 0.0


def test_cosine_similarity_partial_overlap():
"""
Test cosine similarity for two KmerCountTable objects with partial overlap in k-mers.
The cosine similarity should be less than 1.0 but greater than 0.0 because
the tables have overlapping k-mers, but their counts differ.
"""
kct1 = KmerCountTable(ksize=4)
kct2 = KmerCountTable(ksize=4)

# Manually set k-mer counts for kct1
kct1["AAAA"] = 0 # Not in kct2
kct1["AATT"] = 3
kct1["GGGG"] = 1
kct1["CCAA"] = 4
kct1["ATTG"] = 0
kct1["AGAT"] = 0 # Set but not in either

# Manually set k-mer counts for kct2
kct2["AAAA"] = 5
kct2["AATT"] = 4 # Diff value to kct1
kct2["GGGG"] = 1
kct2["CCAA"] = 4
kct2["ATTG"] = 1 # Not in kct1

# Using scipy for comparison
vector1 = [0, 3, 1, 4, 0, 0]
vector2 = [5, 4, 1, 4, 1, 0]
expected_cosine_sim = 1 - cosine(vector1, vector2)

# Cosine similarity is expected to be > 0 but < 1
assert isclose(kct1.cosine(kct2), expected_cosine_sim, rel_tol=1e-5)


# Jaccard

# Jaccard with setops

0 comments on commit da1c480

Please sign in to comment.