Skip to content
This repository has been archived by the owner on Mar 19, 2024. It is now read-only.

Commit

Permalink
move find_orf into its own source file
Browse files Browse the repository at this point in the history
  • Loading branch information
Donaim committed Sep 22, 2023
1 parent 3a216a2 commit 87e361f
Show file tree
Hide file tree
Showing 4 changed files with 79 additions and 64 deletions.
66 changes: 2 additions & 64 deletions intact/intact.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from util.expected_orf import ExpectedORF
from util.translate_to_aminoacids import translate_to_aminoacids
from util.get_biggest_protein import get_biggest_protein
from util.find_orf import find_orf


WRONGORFNUMBER_ERROR = "WrongORFNumber"
Expand Down Expand Up @@ -432,14 +433,6 @@ def has_rev_response_element(alignment, rre_locus, rre_tolerance):
#/end def has_rev_response_element


def has_start_codon(orf):
return orf.aminoacids[0] == "M"


def has_stop_codon(orf):
return orf.aminoacids[-1] == "*"


def has_reading_frames(
aligned_sequence, is_small,
expected, error_bar, reverse = False
Expand All @@ -450,61 +443,6 @@ def has_reading_frames(
errors = []
matches = []

try:
query_aminoacids_table = [translate_to_aminoacids(sequence.seq, i) for i in range(3)]
except Bio.Data.CodonTable.TranslationError as e:
log.error(e)
err = IntactnessError(sequence.id, INVALID_CODON, e.args[0])
errors.append(err)
return matches, errors

def find_closest(aminoacids, start, direction, target):
distance = 0
n = len(aminoacids) - 1

while start + distance >= 0 and start + distance <= n:
if aminoacids[start + distance] == target:
return start + distance
distance += direction

if target == '*':
return n
else:
return 0

def find_candidate_positions(e):
q_start = ReferenceIndex(e.start).mapto(aligned_sequence)
q_end = ReferenceIndex(e.end).mapto(aligned_sequence)
expected_aminoacids = e.aminoacids
expected_protein = expected_aminoacids.strip("*")
q_start_a = q_start // 3
q_end_a = q_end // 3
n = len(sequence.seq) - 1
visited_set = set()

for frame in range(3):
aminoacids = query_aminoacids_table[frame]
for start_direction in [-1, +1]:
for end_direction in [-1, +1]:
closest_start_a = q_start_a if not has_start_codon(e) else find_closest(aminoacids, q_start_a, start_direction, 'M')
closest_end_a = q_end_a if not has_stop_codon(e) else find_closest(aminoacids, q_end_a, end_direction, '*')
got_aminoacids = aminoacids[closest_start_a:closest_end_a + 1]
if got_aminoacids in visited_set:
continue
else:
visited_set.add(got_aminoacids)

closest_start = min(n, (closest_start_a * 3) + frame)
closest_end = min(n + 1, (closest_end_a * 3) + 3 + frame)
got_protein = get_biggest_protein(has_start_codon(e), got_aminoacids)
dist = detailed_aligner.align(got_protein, expected_protein).distance()
yield CandidateORF(e.name, closest_start, closest_end, e.start, e.end,
"forward", dist, got_protein, got_aminoacids)

def find_real_correspondence(e):
candidates = find_candidate_positions(e)
return min(candidates, key=lambda x: x.distance)

def get_indel_impact(alignment):
shift = 0
impacted = 0
Expand All @@ -528,7 +466,7 @@ def get_indel_impact(alignment):
return impacted

for e in expected:
best_match = find_real_correspondence(e)
best_match = find_orf(aligned_sequence, e)
matches.append(best_match)

got_protein = best_match.protein
Expand Down
1 change: 1 addition & 0 deletions util/aligned_sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from util.candidate_orf import CandidateORF
from util.expected_orf import ExpectedORF
from util.reference_index import ReferenceIndex
from util.find_orf import find_orf

@dataclass
class AlignedSequence:
Expand Down
66 changes: 66 additions & 0 deletions util/find_orf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
from util.reference_index import ReferenceIndex
from util.get_query_aminoacids_table import get_query_aminoacids_table
from util.get_biggest_protein import get_biggest_protein
from util.candidate_orf import CandidateORF

import util.detailed_aligner as detailed_aligner

def has_start_codon(orf):
return orf.aminoacids[0] == "M"


def has_stop_codon(orf):
return orf.aminoacids[-1] == "*"


def find_closest(aminoacids, start, direction, target):
distance = 0
n = len(aminoacids) - 1

while start + distance >= 0 and start + distance <= n:
if aminoacids[start + distance] == target:
return start + distance
distance += direction

if target == '*':
return n
else:
return 0


def find_candidate_positions(aligned_sequence, e):
q_start = ReferenceIndex(e.start).mapto(aligned_sequence)
q_end = ReferenceIndex(e.end).mapto(aligned_sequence)
expected_aminoacids = e.aminoacids
expected_protein = expected_aminoacids.strip("*")
q_start_a = q_start // 3
q_end_a = q_end // 3
n = len(aligned_sequence.this.seq) - 1
visited_set = set()
query_aminoacids_table = get_query_aminoacids_table(aligned_sequence.this)
if isinstance(query_aminoacids_table, Exception):
raise query_aminoacids_table

for frame in range(3):
aminoacids = query_aminoacids_table[frame]
for start_direction in [-1, +1]:
for end_direction in [-1, +1]:
closest_start_a = q_start_a if not has_start_codon(e) else find_closest(aminoacids, q_start_a, start_direction, 'M')
closest_end_a = q_end_a if not has_stop_codon(e) else find_closest(aminoacids, q_end_a, end_direction, '*')
got_aminoacids = aminoacids[closest_start_a:closest_end_a + 1]
if got_aminoacids in visited_set:
continue
else:
visited_set.add(got_aminoacids)

closest_start = min(n, (closest_start_a * 3) + frame)
closest_end = min(n + 1, (closest_end_a * 3) + 3 + frame)
got_protein = get_biggest_protein(has_start_codon(e), got_aminoacids)
dist = detailed_aligner.align(got_protein, expected_protein).distance()
yield CandidateORF(e.name, closest_start, closest_end, e.start, e.end,
"forward", dist, got_protein, got_aminoacids)


def find_orf(aligned_sequence, e):
candidates = find_candidate_positions(aligned_sequence, e)
return min(candidates, key=lambda x: x.distance)
10 changes: 10 additions & 0 deletions util/get_query_aminoacids_table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from util.translate_to_aminoacids import translate_to_aminoacids

def get_query_aminoacids_table(sequence):
if not hasattr(sequence, "query_aminoacids_table"):
try:
sequence.query_aminoacids_table = [translate_to_aminoacids(sequence.seq, i) for i in range(3)]
except Bio.Data.CodonTable.TranslationError as e:
sequence.query_aminoacids_table = e

return sequence.query_aminoacids_table

0 comments on commit 87e361f

Please sign in to comment.