Skip to content

Commit

Permalink
use most frequent element to determine scramble check direction
Browse files Browse the repository at this point in the history
  • Loading branch information
Donaim committed Jun 6, 2023
1 parent 94902f9 commit 67ac930
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 3 deletions.
9 changes: 8 additions & 1 deletion intact/intact.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import tempfile
import csv
from dataclasses import dataclass
from collections import Counter
from Bio import AlignIO, Seq, SeqIO, SeqRecord
from scipy.stats import fisher_exact

Expand Down Expand Up @@ -159,6 +160,12 @@ def is_sorted(lst):
return True


def most_frequent_element(lst):
    """Return the element that occurs most often in ``lst``.

    ``lst`` may be any iterable of hashable items (a generator works too).
    If several elements share the highest count, the one encountered first
    is returned. An empty iterable yields ``None``.
    """
    tally = Counter(lst)
    if not tally:
        return None
    # most_common(1) is a one-item list holding an (element, count) pair.
    [(winner, _)] = tally.most_common(1)
    return winner


def remove_5_prime(blast_rows):
# HIV 5' region can easily map to its 3' region because they are identical.
# Such a mapping would not constitute a scramble, so we ignore the 5' region for this check.
Expand Down Expand Up @@ -194,7 +201,7 @@ def is_scrambled(seqid, blast_rows):
return None

ignored_5_prime.sort(key=lambda x: x.qstart)
direction = ignored_5_prime[0].sstrand
direction = most_frequent_element(x.sstrand for x in ignored_5_prime)
if direction == "plus" and is_sorted(x.sstart for x in ignored_5_prime):
return None
elif direction == "minus" and is_sorted(x.send for x in reversed(ignored_5_prime)):
Expand Down
16 changes: 14 additions & 2 deletions tests/test_scramble.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,19 @@
import os

import intact.intact as intact
from intact.intact import is_scrambled, contains_internal_inversion, IntactnessError
from intact.intact import is_scrambled, contains_internal_inversion, IntactnessError, most_frequent_element

@pytest.mark.parametrize("lst, expected", [
    pytest.param([1, 2, 3, 4, 2, 2, 3, 1, 4, 4, 4], 4),
    pytest.param([1, 2, 3, 4, 2, 2, 3, 1], 2),
    pytest.param([], None),
    pytest.param([5], 5),
    pytest.param([1, 2, 3, 4, 5], 1),
    pytest.param([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 1),
    pytest.param([1, 2, 3, 2, 4, 5, 4, 6, 6, 6], 6),
])
def test_most_frequent_element(lst, expected):
    """Check most_frequent_element against each (input, expected) pair.

    The cases cover a clear winner, an empty list (expects None), a single
    element, and lists where every element is unique (expects the first).
    """
    winner = most_frequent_element(lst)
    assert winner == expected

@pytest.mark.parametrize("lst, expected", [
([1, 2, 3, 4, 5], True),
Expand Down Expand Up @@ -114,7 +126,7 @@ def test_is_scrambled_mixed_direction():
]
result = is_scrambled("id", blast_rows) or contains_internal_inversion("id", blast_rows)
assert isinstance(result, IntactnessError)
assert result.error == intact.INTERNALINVERSION_ERROR
assert result.error == intact.SCRAMBLE_ERROR

def test_is_scrambled_single_row_plus_strand():
# Test case with a single row aligned in the "plus" strand
Expand Down

0 comments on commit 67ac930

Please sign in to comment.