Skip to content

Commit

Permalink
Filter fusion artifacts (resolves #29)
Browse files Browse the repository at this point in the history
resolves #29
related to BD2KGenomics/protect#237

Fusion calling often has FPS related to homology, especially among
mitochondrial  and (lnc,etc) RNA genes. Furthermore, transcriptional
readthroughs are usually oncologically irrelevant. This PR adds
filtering flags for mitochondrial, Immunoglobulin, RP11- (lncRNA) and
transcriptional readthrough filtering.
  • Loading branch information
arkal committed Dec 6, 2017
1 parent 5cf7d66 commit fd2f299
Show file tree
Hide file tree
Showing 5 changed files with 210 additions and 92 deletions.
84 changes: 83 additions & 1 deletion common.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,4 +180,86 @@ def __eq__(self, other):
# The number of reads at the position
'coverage',
# The variant allele frequency
'vaf'))
'vaf'))

# Standard Genetic Code from NCBI
amino = 'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'
base1 = 'TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG'
base2 = 'TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG'
base3 = 'TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG'
genetic_code = {''.join([b1, b2, b3]): aa
for aa, b1, b2, b3 in zip(amino, base1, base2, base3)}


BEDPE = collections.namedtuple('BEDPE',
'chrom1, start1, end1, '
'chrom2, start2, end2, '
'name, '
'score, '
'strand1, strand2, '
'junctionSeq1, junctionSeq2, '
'hugo1, hugo2')


def get_exons(genome_file, annotation_file, genes_of_interest):
"""
Generates list of GTFRecord objects for each transcript
:param file genome_file: Reference genome FASTA file
:param file annotation_file: Genome annotation file (GTF)
:param set(str) genes_of_interest: The genes in this sample that might need translation.
:return: GTFRecord exons
:rtype: dict
"""
annotation_file.seek(0)
chroms = {}
exons = collections.defaultdict(list)
for header, comment, seq in read_fasta(genome_file, 'ACGTN'):
chroms[header] = seq

for line in annotation_file:
if line.startswith('#'):
continue
else:
gtf = GTFRecord(line)
if gtf.feature == 'exon' and gtf.gene_name in genes_of_interest:
gtf.sequence = chroms[gtf.seqname][gtf.start - 1: gtf.end]
exons[gtf.transcript_id].append(gtf)
return exons


def read_genes_from_gtf(gtf_file):
"""
Read the gene annotations into a dict
:param file gtf_file: A file handle ot the annotation file.
:returns: A dict of a gtf record for each gene
:rtype: dict(string, GTFRecord)
"""
gene_annotations = {}
for line in gtf_file:
if line.startswith('#'):
continue
else:
gtf = GTFRecord(line)
if gtf.feature == 'gene':
gene_annotations[gtf.gene_name] = gtf
return gene_annotations


def translate(seq):
"""
Translates DNA sequence into protein sequence using globally defined genetic code
:param str seq: DNA sequence
:returns: Translated sequence
:rtype: str
>>> translate('ATGTTTCGTT')
'MFR'
"""
start = 0
n = len(seq)
codons = (seq[i: i+3] for i in range(start, n - n % 3, 3))
protein = [genetic_code[codon] for codon in codons]
return ''.join(protein)
118 changes: 62 additions & 56 deletions fusion.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,14 @@
from __future__ import print_function

import collections
import csv
import logging
import pickle
import re
import string
from cStringIO import StringIO

import swalign

from common import read_fasta, GTFRecord, trans

# Standard Genetic Code from NCBI
amino = 'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'
base1 = 'TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG'
base2 = 'TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG'
base3 = 'TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG'
genetic_code = {''.join([b1, b2, b3]): aa
for aa, b1, b2, b3 in zip(amino, base1, base2, base3)}
from common import read_fasta, trans, BEDPE, translate


def get_transcriptome_data(infile):
Expand Down Expand Up @@ -49,56 +40,53 @@ def get_transcriptome_data(infile):
return transcript_cds, gene_transcripts


def get_exons(genome_file, annotation_file):
def rna_gene_in_bedpe(record):
"""
Generates list of GTFRecord objects for each transcript
Determine if one of the two candidates in a BEDPE line is an rna gene.
:param file genome_file: Reference genome FASTA file
:param file annotation_file: Genome annotation file (GTF)
:return: GTFRecord exons
:rtype: dict
:param BEDPE record: A BEDPE line from the input file
:returns: True if one of the candidates is an RNA gene and False if not
:rtype: bool
"""
chroms = {}
exons = collections.defaultdict(list)
for header, comment, seq in read_fasta(genome_file, 'ACGTN'):
chroms[header] = seq
# We will accept fusions that have an RP11- (lncRNA) 3' partner since they can still be
# translated. This is a heuristic.
return 'RP11-' in record.hugo1

for line in annotation_file:
if line.startswith('#'):
continue
else:
gtf = GTFRecord(line)
if gtf.feature == 'exon':
gtf.sequence = chroms[gtf.seqname][gtf.start - 1: gtf.end]
exons[gtf.transcript_id].append(gtf)
return exons


def translate(seq):
def readthrough_in_bedpe(record, annotation, rt_threshold):
"""
Translates DNA sequence into protein sequence using globally defined genetic code
:param str seq: DNA sequence
:returns: Translated sequence
:rtype: str
>>> translate('ATGTTTCGTT')
'MFR'
Determine if the two genes in the record are within `rt_threshold` bp of each other on the same
chromosome.
:param BEDPE record: A BEDPE line from the input file
:param dict(str, GTFRecord) annotation: see `read_fusions:gene_annotations`
:param rt_threshold: The genomic distance on the same chromosome below which we will call a
candidate fusion a readthrough.
:returns: True if the pair is considered a readthrough and False if not
:rtype: bool
"""
start = 0
n = len(seq)
codons = (seq[i: i+3] for i in range(start, n - n % 3, 3))
protein = [genetic_code[codon] for codon in codons]
return ''.join(protein)
return (record.chrom1 == record.chrom2 and
((annotation[record.hugo1].start <= annotation[record.hugo2].start <=
annotation[record.hugo1].end + rt_threshold) or
(annotation[record.hugo2].start <= annotation[record.hugo1].start <=
annotation[record.hugo2].end + rt_threshold)))


def read_fusions(fusion_file):
def read_fusions(fusion_file, gene_annotations, filter_mt, filter_ig, filter_rg, filter_rt,
rt_threshold, out_bedpe):
"""
Reads in gene fusion predictions in modified BEDPE format.
In addition to the basic BEDPE features, this function requires the fusion
junction sequences and HUGO names for the donor and acceptor genes.
:param file fusion_file: Fusion calls in BEDPE format
:param dict(str, GTFRecord) gene_annotations: The gene annotations from the gtf
:param bool filter_mt: Filter mitochondrial events?
:param bool filter_ig: Filter immunoglobulin pairs?
:param bool filter_rg: Filter RNA-Gene events?
:param bool filter_rt: Filter transcriptional read-throughs?
:param int rt_threshold: Distance threshold to call a readthrough
:param file out_bedpe: A file handle to an output BEDPE file
:returns: list of BEDPE namedtuples
:rtype: list
Expand All @@ -118,31 +106,49 @@ def read_fusions(fusion_file):
hugo1: HUGO name for first feature
hugo2: HUGO name for second feature
"""
BEDPE = collections.namedtuple('BEDPE',
'chrom1, start1, end1, '
'chrom2, start2, end2, '
'name, score, '
'strand1, strand2, '
'junctionSeq1, junctionSeq2, '
'hugo1, hugo2')

calls = []

for line in csv.reader(fusion_file, delimiter='\t'):
if line[0].startswith('#'):
print('\t'.join(line), file=out_bedpe)
continue
try:
calls.append(BEDPE(*line))

record = BEDPE(*line)
except TypeError:
raise ValueError("ERROR: fusion file is malformed.\n{}".format(read_fusions.__doc__))

if filter_mt and 'M' in record.chrom1 or 'M' in record.chrom2:
logging.warning("Rejecting %s-%s for containing a Mitochondrial gene.", record.hugo1,
record.hugo2)
continue
elif filter_ig and record.hugo1.startswith('IG') and record.hugo2.startswith('IG'):
# This will drop some Insulin-like growth factor (IGF) proteins but they have a lot of
# homology too so its ok.
logging.warning("Rejecting %s-%s an an Immunoglobulin gene pair.", record.hugo1,
record.hugo2)
continue
elif filter_rg and rna_gene_in_bedpe(record):
logging.warning("Rejecting %s-%s for containing a 5' RNA gene.", record.hugo1,
record.hugo2)
continue
elif filter_rt and readthrough_in_bedpe(record, gene_annotations, rt_threshold):
logging.warning("Rejecting %s-%s as a potential readthrough.", record.hugo1,
record.hugo2)
continue
else:
logging.info("Accepting %s-%s for further study.", record.hugo1, record.hugo2)
print('\t'.join(line), file=out_bedpe)
calls.append(record)

return calls

# Namedtuple for storing alignment metrics
# Neeeds to be global for pickling
# Needs to be global for pickling
AlignStats = collections.namedtuple('AlignStats',
'qstart, qstop, rstart, rstop, insertions, deletions')


def align_filter(ref, query, mode, mismatches_per_kb=1):
"""
Aligns query to reference CDS sequence using the Smith-Waterman algorithm.
Expand Down
3 changes: 3 additions & 0 deletions test/test_input/test_fusions.bedpe
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,6 @@ chr8 . 42968214 chr10 43116584 . ENSG00000168172-ENSG00000165731 1_1 + + TTGGAGG
chr21 . 41494381 chr7 13935844 . ENSG00000184012-ENSG00000006468 1_1 - - CCGGAAAACCCCTATCCCGCACAGCCCACTGTGGTCCCCACTGTCTACGAGGTGCATCCGGCTCAGTACTACCCGTCCCCCGTGCCCCAGTACGCCCCGAGGGTCCTGACGCAGGCTTCCAACCCCGTCGTCTGCACGCAGCCCAAATCC CCATCCAGCACGCCAGTGTCCCCACTGCATCATGCATCTCCAAACTCAACTCATACACCGAAACCTGACCGGGCCTTCCCAGCTCACCTCCCTCCATCGCAGTCCATACCAGATAGCAGCTACCCCATGGACCACAGATTTCGCCGCCAGCTTTCTGAACCCTGTAACTCCTTTCCTCCTTTGCCGACGATGCCAAGGGAAGGACGTCCTATGTACCAACGCCAGATGTCTGAGCCAAACATCCCCTTCCCACCACAAGGCTTTAAGCAGGAGTACCACGACCCAGTGTATGAACACAACACCATGGTTGGCAGTGCGGCCAGCCAAAGCTTTCCCCCTCCTCTGATGATTAAACAGGAACCCAGAGATTTTGCATATGACTCAGAAGTGCCTAGCTGCCACTCCATTTATATGAGGCAAGAAGGCTTCCTGGCTCATCCCAGCAGAACAGAAGGCTGTATGTTTGAAAAGGGCCCCAGGCAGTTTTATGATGACACCTGTGTTGTCCCAGAAAAATTCGATGGAGACATCAAACAAGAGCCAGGAATGTATCGGGAAGGACCCACATACCAACGGCGAGGATCACTTCAGCTCTGGCAGTTTTTGGTAGCTCTTCTGGATGACCCTTCAAATTCTCATTTTATTGCCTGGACTGGTCGAGGCATGGAATTTAAACTGATTGAGCCTGAAGAGGTGGCCCGACGTTGGGGCATTCAGAAAAACAGGCCAGCTATGAACTATGATAAACTTAGCCGTTCACTCCGCTATTACTATGAGAAAGGAATTATGCAAAAGGTGGCTGGAGAGAGATATGTCTACAAGTTTGTGTGTGATCCAGAAGCCCTTTTCTCCATGGCCTTTCCAGATAATCAGCGTCCACTGCTGAAGACAGACATGGAACGTCACATCAACGAGGAGGACACAGTGCCTCTTTCTCACTTTGATGAGAGCATGGCCTACATGCCGGAAGGGGGCTGCTGCAACCCCCACCCCTACAACGAAGGCTACGTGTATTAACACAAGTGACAGTCAAGCAGGGCGTT TMPRSS2 ETV1
chr21 . 41494381 chr7 13935844 . ENSG00000184012-ENSG00000006468 1_1 - - CCGGAAAACCCCTATCCCGCACAGCCCACTGTGGTCCCCACTGTCTACGAGGTGCATCCGGCTCAGTACTACCCGTCCCCCGTGCCCCAGTACGCCCCGAGGGTCCTGACGCAGGCTTCCAACCCCGTCGTCTGCACGCAGCCCAAATCC CCATCCAGCACGCCAGTGTCCCCACTGCATCATGCATCTCCAAACTCAACTCATACACCGAAACCTGACCGGGCCTTCCCAGCTCACCTCCCTCCATCGCAGTCCATACCAGATAGCAGCTACCCCATGGACCACAGATTTCGCCGCCAGCTTTCTGAACCCTGTAACTCCTTTCCTCCTTTGCCGACGATGCCAAGGGAAGGACGTCCTATGTACCAACGCCAGATGTCTGAGCCAAACATCCCCTTCCCACCACAAGGCTTTAAGCAGGAGTACCACGACCCAGTGTATGAACACAACACCATGGTTGGCAGTGCGGCCA TMPRSS2 ETV1
chr21 . 42870046 chr21 39817544 . ENSG00000184012-ENSG00000157554 1_1 - - . . TMPRSS2 ERG
chr21 . 46493768 chr21 46511596 . ENSG00000197381-ENSG00000268805 1_1 - - THIS_IS_A_READTHROUGH . ADARB1 PRED57
chr21 . 16904219 chr21 16914235 . ENSG00000230481-ENSG00000253691 1_1 - - THIS_IS_AN_IG . IGKV1OR22-5 IGKV2OR22-4
chr21 . 10397644 chr21 39817544 . ENSG00000278106-ENSG00000157554 1_1 - - THIS_IS_AN_RP11 . RP11-589M2.1 ERG
3 changes: 3 additions & 0 deletions test/test_transgene.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,9 @@ def _test_transgene(self, use_RNA=False, use_DNA=False, fusions=False):
'annotation.gtf')) if fusions else None
params.genome_file = open(self._get_input_path('test_input/chr21.fa')) if fusions else None

params.filter_mt = params.filter_rg = params.filter_ig = params.filter_rt = True
params.rt_threshold = 100000

params.cores = 3
transgene.main(params)
output = {'9mer': {'fasta': 'unit_test_tumor_9_mer_snpeffed.faa',
Expand Down
Loading

0 comments on commit fd2f299

Please sign in to comment.