diff --git a/intact/intact.py b/intact/intact.py index 2623c7f..311a29a 100644 --- a/intact/intact.py +++ b/intact/intact.py @@ -8,6 +8,7 @@ import tempfile import csv from dataclasses import dataclass +from collections import Counter from Bio import AlignIO, Seq, SeqIO, SeqRecord from scipy.stats import fisher_exact @@ -159,6 +160,12 @@ def is_sorted(lst): return True +def most_frequent_element(lst): + counter = Counter(lst) + most_common = counter.most_common(1) + return most_common[0][0] if most_common else None + + def remove_5_prime(blast_rows): # HIV 5' region can easily map to its 3' region because they are identical. # Such a maping would not constitute a scramble, so we ignore the 5' region for this check. @@ -194,7 +201,7 @@ def is_scrambled(seqid, blast_rows): return None ignored_5_prime.sort(key=lambda x: x.qstart) - direction = ignored_5_prime[0].sstrand + direction = most_frequent_element(x.sstrand for x in ignored_5_prime) if direction == "plus" and is_sorted(x.sstart for x in ignored_5_prime): return None elif direction == "minus" and is_sorted(x.send for x in reversed(ignored_5_prime)): diff --git a/tests/test_scramble.py b/tests/test_scramble.py index 7b04cab..15f1fe2 100644 --- a/tests/test_scramble.py +++ b/tests/test_scramble.py @@ -3,7 +3,19 @@ import os import intact.intact as intact -from intact.intact import is_scrambled, contains_internal_inversion, IntactnessError +from intact.intact import is_scrambled, contains_internal_inversion, IntactnessError, most_frequent_element + +@pytest.mark.parametrize("lst, expected", [ + ([1, 2, 3, 4, 2, 2, 3, 1, 4, 4, 4], 4), + ([1, 2, 3, 4, 2, 2, 3, 1], 2), + ([], None), + ([5], 5), + ([1, 2, 3, 4, 5], 1), + ([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 1), + ([1, 2, 3, 2, 4, 5, 4, 6, 6, 6], 6) +]) +def test_most_frequent_element(lst, expected): + assert most_frequent_element(lst) == expected @pytest.mark.parametrize("lst, expected", [ ([1, 2, 3, 4, 5], True), @@ -114,7 +126,7 @@ def test_is_scrambled_mixed_direction(): ] result = is_scrambled("id", blast_rows) or contains_internal_inversion("id", blast_rows) assert isinstance(result, IntactnessError) - assert result.error == intact.INTERNALINVERSION_ERROR + assert result.error == intact.SCRAMBLE_ERROR def test_is_scrambled_single_row_plus_strand(): # Test case with a single row aligned in the "plus" strand