Skip to content
This repository has been archived by the owner on Mar 19, 2024. It is now read-only.

Commit

Permalink
move shared classes to their own source files
Browse files Browse the repository at this point in the history
  • Loading branch information
Donaim committed Sep 22, 2023
1 parent 59c576a commit 3a216a2
Show file tree
Hide file tree
Showing 7 changed files with 99 additions and 79 deletions.
78 changes: 7 additions & 71 deletions intact/intact.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,13 @@
import util.log as log
import util.coordinates as coords
import util.detailed_aligner as detailed_aligner
from util.aligned_sequence import AlignedSequence, ReferenceIndex
from util.aligned_sequence import AlignedSequence
from util.reference_index import ReferenceIndex
from util.blastrow import BlastRow
from util.candidate_orf import CandidateORF
from util.expected_orf import ExpectedORF
from util.translate_to_aminoacids import translate_to_aminoacids
from util.get_biggest_protein import get_biggest_protein


WRONGORFNUMBER_ERROR = "WrongORFNumber"
Expand Down Expand Up @@ -46,54 +51,6 @@ class IntactnessError:
message: str


@dataclass
class ExpectedORF:
    """Record describing an open reading frame as it is expected to appear.

    Instances are normally built with :meth:`subtyped`, which fills every
    field from an aligned sequence.
    """

    name: str
    # When built by `subtyped`, `start` and `end` are the indexes returned by
    # `ReferenceIndex.mapto`, not the raw arguments passed by the caller.
    start: int
    end: int
    deletion_tolerence: int
    nucleotides: str
    aminoacids: str
    protein: str

    @staticmethod
    def subtyped(aligned_sequence, name, start, end, deletion_tolerence):
        """Build an ``ExpectedORF`` for ``aligned_sequence``.

        ``start`` and ``end`` are first decremented by one when they are at
        or beyond position 5772, then mapped onto ``aligned_sequence`` via
        ``ReferenceIndex.mapto``. The nucleotides between the mapped indexes
        are translated and the longest protein fragment is selected with
        ``get_biggest_protein``.
        """
        # Presumably the position of the defective insertion in vpr that the
        # variable name in the original code refers to -- TODO confirm.
        insertion_pos = 5772

        def shifted(position):
            if position >= insertion_pos:
                return position - 1
            return position

        ref_start = shifted(start)
        ref_end = shifted(end)

        # The decrement is needed because the original "start" is 1-based.
        mapped_start = ReferenceIndex(ref_start - 1).mapto(aligned_sequence)
        mapped_end = ReferenceIndex(ref_end).mapto(aligned_sequence)

        orf_nucleotides = str(aligned_sequence.this.seq[mapped_start:mapped_end])
        orf_aminoacids = translate(orf_nucleotides)

        # NOTE(review): this slice applies the shifted but *unmapped*
        # coordinates to `this.seq`, whereas `orf_nucleotides` uses the mapped
        # ones -- confirm this is intended.
        unmapped_slice = aligned_sequence.this.seq[(ref_start - 1):ref_end]
        starts_with_m = translate(unmapped_slice).startswith("M")

        return ExpectedORF(
            name=name,
            start=mapped_start,
            end=mapped_end,
            deletion_tolerence=deletion_tolerence,
            nucleotides=orf_nucleotides,
            aminoacids=orf_aminoacids,
            protein=get_biggest_protein(starts_with_m, orf_aminoacids),
        )


@dataclass
class CandidateORF:
    """Plain record holding the attributes of one candidate ORF.

    The class only stores values; their meaning is determined by the code
    that constructs instances. Field order is part of the positional
    constructor signature and must not change.
    """

    name: str
    start: int
    end: int
    subtype_start: int
    subtype_end: int
    orientation: str
    distance: float
    protein: str
    aminoacids: str


@dataclass
class FoundORF:
name: str
Expand Down Expand Up @@ -475,21 +432,6 @@ def has_rev_response_element(alignment, rre_locus, rre_tolerance):
#/end def has_rev_response_element


def get_biggest_protein(has_start_codon, aminoacids):
    """Return the longest fragment of ``aminoacids`` between ``"*"`` marks.

    ``aminoacids`` is split on ``"*"``. When ``has_start_codon`` is truthy,
    each fragment is first cut down to begin at its first ``"M"``; fragments
    without an ``"M"`` become empty. On ties the earliest fragment wins.
    """
    fragments = aminoacids.split("*")

    if has_start_codon:
        trimmed = []
        for fragment in fragments:
            first_m = fragment.find("M")
            trimmed.append(fragment[first_m:] if first_m >= 0 else "")
        fragments = trimmed

    return max(fragments, key=len)



def has_start_codon(orf):
    """Return whether ``orf.aminoacids`` begins with ``"M"``.

    An empty amino-acid sequence yields ``False``.
    """
    # Slice instead of indexing so an empty sequence does not raise IndexError.
    return orf.aminoacids[:1] == "M"

Expand All @@ -498,12 +440,6 @@ def has_stop_codon(orf):
return orf.aminoacids[-1] == "*"


def translate(seq, frame = 0, to_stop = False):
    """Translate ``seq`` starting at offset ``frame`` via ``Seq.translate``.

    The sequence is sliced from ``frame`` onwards and padded with ``"N"``
    until its length is a multiple of three, so the final codon is always
    complete. ``to_stop`` is forwarded unchanged to ``Seq.translate``.
    """
    framed = seq[frame:]
    # Number of characters missing to complete the last codon (0, 1 or 2).
    missing = -len(framed) % 3
    framed += "N" * missing
    return Seq.translate(framed, to_stop=to_stop)


def has_reading_frames(
aligned_sequence, is_small,
expected, error_bar, reverse = False
Expand All @@ -515,7 +451,7 @@ def has_reading_frames(
matches = []

try:
query_aminoacids_table = [translate(sequence.seq, i) for i in range(3)]
query_aminoacids_table = [translate_to_aminoacids(sequence.seq, i) for i in range(3)]
except Bio.Data.CodonTable.TranslationError as e:
log.error(e)
err = IntactnessError(sequence.id, INVALID_CODON, e.args[0])
Expand Down
20 changes: 12 additions & 8 deletions util/aligned_sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,16 @@

import util.coordinates as coords
import util.wrappers as wrappers

@dataclass
class ReferenceIndex:
    """Wrapper around a single integer index.

    Presumably the index is expressed in reference-sequence coordinates, as
    the class name suggests -- confirm against callers.
    """

    value: int

    def mapto(self, aligned):
        """Return ``aligned.map_index(self.value)``."""
        mapped = aligned.map_index(self.value)
        return mapped
from util.candidate_orf import CandidateORF
from util.expected_orf import ExpectedORF
from util.reference_index import ReferenceIndex

@dataclass
class AlignedSequence:
this: Seq
reference: Seq
alignment: (str, str) = dataclasses.field(default=None)
orfs: dict[str, CandidateORF] = dataclasses.field(default=None)
alignment: (str, str) = dataclasses.field(default=None)
coordinates_mapping: list[int] = dataclasses.field(default=None)


Expand Down Expand Up @@ -82,3 +79,10 @@ def reverse(self):

def alignment_score(self):
return sum([a==b for a, b in zip(self.get_alignment()[0].seq, self.get_alignment()[1].seq)])


def get_orf(self, expected_orf: ExpectedORF):
    """Return the ORF for ``expected_orf``, computing it at most once.

    Results are cached in ``self.orfs`` keyed by ``expected_orf.name``; a
    cached entry is returned without calling ``find_orf`` again.
    """
    # `orfs` defaults to None on the dataclass, so create the cache lazily;
    # a membership test against None would raise TypeError.
    if self.orfs is None:
        self.orfs = {}

    if expected_orf.name not in self.orfs:
        # NOTE(review): `find_orf` is neither defined nor imported in the
        # visible part of this module -- confirm where it comes from.
        self.orfs[expected_orf.name] = find_orf(expected_orf)

    return self.orfs[expected_orf.name]
13 changes: 13 additions & 0 deletions util/candidate_orf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from dataclasses import dataclass

@dataclass
class CandidateORF:
    """Plain record holding the attributes of one candidate ORF.

    The class only stores values; their meaning is determined by the code
    that constructs instances. Field order is part of the positional
    constructor signature and must not change.
    """

    name: str
    start: int
    end: int
    subtype_start: int
    subtype_end: int
    orientation: str
    distance: float
    protein: str
    aminoacids: str
40 changes: 40 additions & 0 deletions util/expected_orf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from dataclasses import dataclass
from util.reference_index import ReferenceIndex
from util.translate_to_aminoacids import translate_to_aminoacids
from util.get_biggest_protein import get_biggest_protein


@dataclass
class ExpectedORF:
    """Record describing an open reading frame as it is expected to appear.

    Instances are normally built with :meth:`subtyped`, which fills every
    field from an aligned sequence.
    """

    name: str
    # When built by `subtyped`, `start` and `end` are the indexes returned by
    # `ReferenceIndex.mapto`, not the raw arguments passed by the caller.
    start: int
    end: int
    deletion_tolerence: int
    nucleotides: str
    aminoacids: str
    protein: str

    @staticmethod
    def subtyped(aligned_sequence, name, start, end, deletion_tolerence):
        """Build an ``ExpectedORF`` for ``aligned_sequence``.

        ``start`` and ``end`` are first decremented by one when they are at
        or beyond position 5772, then mapped onto ``aligned_sequence`` via
        ``ReferenceIndex.mapto``. The nucleotides between the mapped indexes
        are translated and the longest protein fragment is selected with
        ``get_biggest_protein``.
        """
        # Presumably the position of the defective insertion in vpr that the
        # variable name in the original code refers to -- TODO confirm.
        insertion_pos = 5772

        def shifted(position):
            if position >= insertion_pos:
                return position - 1
            return position

        ref_start = shifted(start)
        ref_end = shifted(end)

        # The decrement is needed because the original "start" is 1-based.
        mapped_start = ReferenceIndex(ref_start - 1).mapto(aligned_sequence)
        mapped_end = ReferenceIndex(ref_end).mapto(aligned_sequence)

        orf_nucleotides = str(aligned_sequence.this.seq[mapped_start:mapped_end])
        orf_aminoacids = translate_to_aminoacids(orf_nucleotides)

        # NOTE(review): this slice applies the shifted but *unmapped*
        # coordinates to `this.seq`, whereas `orf_nucleotides` uses the mapped
        # ones -- confirm this is intended.
        unmapped_slice = aligned_sequence.this.seq[(ref_start - 1):ref_end]
        starts_with_m = translate_to_aminoacids(unmapped_slice).startswith("M")

        return ExpectedORF(
            name=name,
            start=mapped_start,
            end=mapped_end,
            deletion_tolerence=deletion_tolerence,
            nucleotides=orf_nucleotides,
            aminoacids=orf_aminoacids,
            protein=get_biggest_protein(starts_with_m, orf_aminoacids),
        )

13 changes: 13 additions & 0 deletions util/get_biggest_protein.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@

def get_biggest_protein(has_start_codon, aminoacids):
    """Return the longest fragment of ``aminoacids`` between ``"*"`` marks.

    ``aminoacids`` is split on ``"*"``. When ``has_start_codon`` is truthy,
    each fragment is first cut down to begin at its first ``"M"``; fragments
    without an ``"M"`` become empty. On ties the earliest fragment wins.
    """
    fragments = aminoacids.split("*")

    if has_start_codon:
        trimmed = []
        for fragment in fragments:
            first_m = fragment.find("M")
            trimmed.append(fragment[first_m:] if first_m >= 0 else "")
        fragments = trimmed

    return max(fragments, key=len)
8 changes: 8 additions & 0 deletions util/reference_index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from dataclasses import dataclass

@dataclass
class ReferenceIndex:
    """Wrapper around a single integer index.

    Presumably the index is expressed in reference-sequence coordinates, as
    the class name suggests -- confirm against callers.
    """

    value: int

    def mapto(self, aligned):
        """Return ``aligned.map_index(self.value)``."""
        mapped = aligned.map_index(self.value)
        return mapped
6 changes: 6 additions & 0 deletions util/translate_to_aminoacids.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from Bio import Seq

def translate_to_aminoacids(seq, frame = 0, to_stop = False):
    """Translate ``seq`` starting at offset ``frame`` via ``Seq.translate``.

    The sequence is sliced from ``frame`` onwards and padded with ``"N"``
    until its length is a multiple of three, so the final codon is always
    complete. ``to_stop`` is forwarded unchanged to ``Seq.translate``.
    """
    framed = seq[frame:]
    # Number of characters missing to complete the last codon (0, 1 or 2).
    missing = -len(framed) % 3
    framed += "N" * missing
    return Seq.translate(framed, to_stop=to_stop)

0 comments on commit 3a216a2

Please sign in to comment.