Skip to content

Commit

Permalink
move blast checks into the main intact.py file
Browse files Browse the repository at this point in the history
  • Loading branch information
Donaim committed Jun 5, 2023
1 parent b979e78 commit 619bcd0
Show file tree
Hide file tree
Showing 4 changed files with 128 additions and 143 deletions.
135 changes: 0 additions & 135 deletions intact/blastit.py

This file was deleted.

120 changes: 120 additions & 0 deletions intact/intact.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
import subprocess
import sys
import uuid
import tempfile
import csv
from dataclasses import dataclass
from Bio import AlignIO, Seq, SeqIO, SeqRecord
from scipy.stats import fisher_exact

Expand Down Expand Up @@ -57,6 +60,123 @@ def __init__(self, start, end, deleted_count, inserted_count):
self.deleted_count = deleted_count
self.inserted_count = inserted_count

@dataclass
class BlastRow:
qseqid: str
qlen: int
sseqid: str
sgi: str
slen: int
qstart: int
qend: int
sstart: int
send: int
evalue: float
bitscore: float
length: int
pident: float
nident: float
btop: int
stitle: str
sstrand: str


def init_blast_row(row):
it = iter(row)
return BlastRow(
qseqid=next(it),
qlen=int(next(it)),
sseqid=next(it),
sgi=next(it),
slen=int(next(it)),
qstart=int(next(it)),
qend=int(next(it)),
sstart=int(next(it)),
send=int(next(it)),
evalue=float(next(it)),
bitscore=float(next(it)),
length=int(next(it)),
pident=float(next(it)),
nident=float(next(it)),
btop=next(it),
stitle=next(it),
sstrand=next(it),
)

def iterate_blast_rows_from_tsv(file_path):
previous_key = None
values = []

for row in iterate_values_from_tsv(file_path):
key = row[0]
typed = init_blast_row(row)

if key != previous_key and previous_key is not None:
yield values
values = []

values.append(typed)
previous_key = key

if values:
yield values


def blast_interate(subtype, input_file):
with tempfile.NamedTemporaryFile() as output_file:
db_file = st.alignment_file(subtype)
wrappers.blast(db_file, input_file, output_file.name)
for seq in iterate_blast_rows_from_tsv(output_file.name):
yield seq


def is_sorted(lst):
last = None
for x in lst:
if last is not None and x < last:
return False
last = x
return True


def check_scramble(blast_rows):
# HIV 5' region can easily map to its 3' region because they are identical.
# Such a maping would not constitute a scramble, so we ignore the 5' region for this check.
ignored_5_prime = [x for x in blast_rows if x.sstart > 622 and x.send > 622]

if not ignored_5_prime:
# No alignment.
# It should be an error normally, yet not a scramble error.
return None

all_same = len(set(x.sstrand for x in ignored_5_prime)) == 1
if not all_same:
# Some parts of the sequence were aligned
# in forward direction (plus)
# and some in reverse (minus).
# This indicates an internal inversion.
return "mix"

ignored_5_prime.sort(key=lambda x: x.qstart)
direction = ignored_5_prime[0].sstrand
if direction == "plus" and is_sorted(x.sstart for x in ignored_5_prime):
return None
elif direction == "minus" and is_sorted(x.send for x in reversed(ignored_5_prime)):
return None
else:
return direction + "Scramble"


def check_nonhiv(blast_rows):
aligned_length = sum(abs(x.qend - x.qstart) + 1 for x in blast_rows)
total_length = blast_rows[0].qlen if blast_rows else 1
ratio = aligned_length / total_length

if ratio < 0.8:
return "CHIMERA"
else:
return None

def _getPositions(pattern, string):
"""
Hidden function used to get interator of all intances of pattern in string
Expand Down
4 changes: 2 additions & 2 deletions tests/test_nonhiv.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
import pytest
import os

import intact.blastit as blastit
from intact.blastit import check_scramble, check_nonhiv
import intact.intact as intact
from intact.intact import check_scramble, check_nonhiv

class BlastRow:
def __init__(self, qstart, qend, qlen):
Expand Down
12 changes: 6 additions & 6 deletions tests/test_scramble.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
import pytest
import os

import intact.blastit as blastit
from intact.blastit import check_scramble, check_nonhiv
import intact.intact as intact
from intact.intact import check_scramble, check_nonhiv

@pytest.mark.parametrize("lst, expected", [
([1, 2, 3, 4, 5], True),
Expand All @@ -13,24 +13,24 @@
([1, 1, 2, 2, 3, 3], True),
])
def test_is_sorted(lst, expected):
assert blastit.is_sorted(lst) == expected
assert intact.is_sorted(lst) == expected

# def test_blast(tmp_path):
# input_file = "tests/data-large.fasta"
# output_file = os.path.join(tmp_path, "out.tsv")

# # blastit.blast(
# # intact.blast(
# # subtype="B",
# # input_file=input_file,
# # output_file=output_file,
# # )

# it = blastit.blast_interate(subtype="B", input_file=input_file)
# it = intact.blast_interate(subtype="B", input_file=input_file)
# for v in it:
# print("\n\n\n")
# print("---------------------------")
# print(v[0].qseqid)
# print(blastit.check_scramble(v))
# print(intact.check_scramble(v))

# assert False

Expand Down

0 comments on commit 619bcd0

Please sign in to comment.