Skip to content

Commit

Permalink
Draft conversion logic.
Browse files Browse the repository at this point in the history
  • Loading branch information
ielis committed Oct 3, 2023
1 parent a51d810 commit a56193c
Show file tree
Hide file tree
Showing 5 changed files with 1,858 additions and 42 deletions.
40 changes: 40 additions & 0 deletions src/genophenocorr/preprocessing/_test_variant.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import json
import os

import pytest

from pkg_resources import resource_filename
Expand Down Expand Up @@ -126,3 +129,40 @@ def test_cache_from_older_file(oldfile_cache_annotator, pp_vc_finder, variant_an
cached_file_results = oldfile_cache_annotator.annotate(var_coords)
assert var_anno_results == cached_file_results


class TestVepFunctionalAnnotator:

TEST_DATA_DIR = resource_filename(__name__, os.path.join('test_data', 'vep_response'))

def test__process_item_missense(self, variant_annotator: VepFunctionalAnnotator):
response = self._load_response_json('missense.json')
first = response[0]

annotations = list(filter(lambda i: i is not None, map(lambda item: variant_annotator._process_item(item), first['transcript_consequences'])))

ann_by_tx = {ann.transcript_id: ann for ann in annotations}

assert {'NM_013275.6', 'NM_001256183.2', 'NM_001256182.2'} == set(ann_by_tx.keys())

preferred = ann_by_tx['NM_013275.6']
assert preferred.transcript_id == 'NM_013275.6'
assert preferred.is_preferred == True
assert preferred.hgvsc_id == 'NM_013275.6:c.7407C>G'
assert preferred.variant_effects == ('stop_gained',)

def test__process_item_deletion(self, variant_annotator: VepFunctionalAnnotator):
response = self._load_response_json('deletion.json')
first = response[0]

annotations = [variant_annotator._process_item(item) for item in first['transcript_consequences']]

# TODO - finish
for a in annotations:
if a is not None:
print(a)
print('Done')

def _load_response_json(self, test_name: str):
response_fpath = os.path.join(self.TEST_DATA_DIR, test_name)
with open(response_fpath) as fh:
return json.load(fh)
90 changes: 48 additions & 42 deletions src/genophenocorr/preprocessing/_variant.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,8 @@
import hpotk
import requests


from genophenocorr.model import VariantCoordinates, TranscriptAnnotation, Variant, TranscriptInfoAware, TranscriptCoordinates

from genophenocorr.model import VariantCoordinates, TranscriptAnnotation, Variant, TranscriptInfoAware, \
TranscriptCoordinates
from ._api import FunctionalAnnotator, ProteinMetadataService, TranscriptCoordinateService


Expand Down Expand Up @@ -50,22 +49,25 @@ def verify_start_end_coordinates(vc: VariantCoordinates):


class VepFunctionalAnnotator(FunctionalAnnotator):
"""A class that peforms functional annotation of variant coordinates using Variant Effect Predictor (VEP) REST API.
"""A class that performs functional annotation of variant coordinates using Variant Effect Predictor (VEP) REST API.
Methods:
annotate(variant_coordinates: VariantCoordinates): the variant to annotate.
"""

def __init__(self, protein_annotator: ProteinMetadataService):
def __init__(self, protein_annotator: ProteinMetadataService,
include_computational_txs: bool = False):
"""Constructs all necessary attributes for a VepFunctionalAnnotator object
Args:
protein_annotator (ProteinMetadataService): A ProteinMetadataService object for ProteinMetadata creation
include_computational_txs (bool): Include computational transcripts, such as RefSEq `XM_`.
"""
self._logging = logging.getLogger(__name__)
self._protein_annotator = protein_annotator
self._url = 'https://rest.ensembl.org/vep/human/region/%s?LoF=1&canonical=1&domains=1&hgvs=1' \
'&mutfunc=1&numbers=1&protein=1&refseq=1&mane=1&transcript_version=1&variant_class=1'
self._include_computational_txs = include_computational_txs

def annotate(self, variant_coordinates: VariantCoordinates) -> typing.Sequence[TranscriptAnnotation]:
"""Perform functional annotation using Variant Effect Predictor (VEP) REST API.
Expand All @@ -79,46 +81,50 @@ def annotate(self, variant_coordinates: VariantCoordinates) -> typing.Sequence[T
# canon_tx = None
annotations = []
for trans in variant.get('transcript_consequences'):
trans_id = trans.get('transcript_id')
if not trans_id.startswith('NM'):
continue
# TODO - implement
is_preferred = False
# if trans.get('canonical') == 1:
# canon_tx = trans_id
hgvsc_id = trans.get('hgvsc')
consequences = trans.get('consequence_terms')
gene_name = trans.get('gene_symbol')
protein_id = trans.get('protein_id')
protein = self._protein_annotator.annotate(protein_id)
protein_effect_start = trans.get('protein_start')
protein_effect_end = trans.get('protein_end')
if protein_effect_start is None and protein_effect_end is not None:
protein_effect_start = 1
if protein_effect_end is not None:
protein_effect_end = int(protein_effect_end)
if protein_effect_start is not None:
protein_effect_start = int(protein_effect_start)
exons_effected = trans.get('exon')
if exons_effected is not None:
exons_effected = exons_effected.split('/')[0].split('-')
if len(exons_effected) == 2:
exons_effected = range(int(exons_effected[0]), int(exons_effected[1]) + 1)
exons_effected = [int(x) for x in exons_effected]
annotations.append(
TranscriptAnnotation(gene_name,
trans_id,
hgvsc_id,
is_preferred,
consequences,
exons_effected,
protein,
protein_effect_start,
protein_effect_end)
)
annotation = self._process_item(trans)
if annotation is not None:
annotations.append(annotation)

return annotations

def _process_item(self, item) -> typing.Optional[TranscriptAnnotation]:
"""
Parse one transcript annotation from the JSON response.
"""
trans_id = item.get('transcript_id')
if not self._include_computational_txs and not trans_id.startswith('NM_'):
# Skipping a computational transcript
return None
is_preferred = True if 'canonical' in item and item['canonical'] else False
hgvsc_id = item.get('hgvsc')
consequences = item.get('consequence_terms')
gene_name = item.get('gene_symbol')
protein_id = item.get('protein_id')
protein = self._protein_annotator.annotate(protein_id)
protein_effect_start = item.get('protein_start')
protein_effect_end = item.get('protein_end')
if protein_effect_start is None and protein_effect_end is not None:
protein_effect_start = 1
if protein_effect_end is not None:
protein_effect_end = int(protein_effect_end)
if protein_effect_start is not None:
protein_effect_start = int(protein_effect_start)
exons_effected = item.get('exon')
if exons_effected is not None:
exons_effected = exons_effected.split('/')[0].split('-')
if len(exons_effected) == 2:
exons_effected = range(int(exons_effected[0]), int(exons_effected[1]) + 1)
exons_effected = [int(x) for x in exons_effected]
return TranscriptAnnotation(gene_name,
trans_id,
hgvsc_id,
is_preferred,
consequences,
exons_effected,
protein,
protein_effect_start,
protein_effect_end)

def _query_vep(self, variant_coordinates) -> dict:
api_url = self._url % (verify_start_end_coordinates(variant_coordinates))
r = requests.get(api_url, headers={'Content-Type': 'application/json'})
Expand Down
18 changes: 18 additions & 0 deletions src/genophenocorr/preprocessing/test_data/vep_response/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# VEP responses

The folder contains JSON files with example VEP responses.

## `missense.json`

QUERY:
```shell
curl 'https://rest.ensembl.org/vep/human/region/16:89279135-89279135/C?LoF=1&canonical=1&domains=1&hgvs=1&mutfunc=1&numbers=1&protein=1&refseq=1&mane=1&transcript_version=1&variant_class=1' \
-H 'Content-type:application/json' | python3 -m json.tool > missense.json
```

## `deletion.json`

```shell
curl 'https://rest.ensembl.org/vep/human/region/16:89279135-89279135/C?LoF=1&canonical=1&domains=1&hgvs=1&mutfunc=1&numbers=1&protein=1&refseq=1&mane=1&transcript_version=1&variant_class=1' \
-H 'Content-type:application/json' | python3 -m json.tool > deletion.json
```
Loading

0 comments on commit a56193c

Please sign in to comment.