Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Moved VariantEffects so it's using the Enum #78

Merged
merged 15 commits into from
Oct 19, 2023
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions docs/tutorial.rst
Original file line number Diff line number Diff line change
Expand Up @@ -51,15 +51,15 @@ We can then view the data using the list commands.
[('NP_09876.5', 26)]
>>> tx_dict = cohort.list_data_by_tx('NM_1234.5')
>>> sorted(tx_dict['NM_1234.5'].items())
[('frameshift_variant', 1), ('missense_variant', 1)]
[('FRAMESHIFT_VARIANT', 1), ('MISSENSE_VARIANT', 1)]

Using the counts, we can choose and run what analyses we want.
For instance, we can partition the patients into two groups based on presence/absence of a *frameshift* variant:

.. doctest:: tutorial

>>> from genophenocorr.analysis import CohortAnalysis
>>> from genophenocorr.constants import VariantEffect
>>> from genophenocorr.model import VariantEffect
>>> cohort_analysis = CohortAnalysis(cohort, 'NM_1234.5', hpo, include_unmeasured=False)
>>> frameshift = cohort_analysis.compare_by_variant_type(VariantEffect.FRAMESHIFT_VARIANT)
>>> frameshift # doctest: +NORMALIZE_WHITESPACE
Expand Down
2,711 changes: 1,427 additions & 1,284 deletions notebooks/KBG/KBG_Martinez_PMID_36446582_RunGenoPhenoCorr.ipynb

Large diffs are not rendered by default.

2 changes: 0 additions & 2 deletions src/genophenocorr/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,4 @@
from . import preprocessing
from . import view

from .constants import VariantEffect

__version__ = "0.1.1dev"
3 changes: 1 addition & 2 deletions src/genophenocorr/analysis/_analyzers.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,7 @@
from pandas import DataFrame, MultiIndex
from collections import Counter, namedtuple

from genophenocorr.constants import VariantEffect
from genophenocorr.model import Cohort, FeatureType
from genophenocorr.model import Cohort, FeatureType, VariantEffect

from .predicate import VariantEffectPredicate, HPOPresentPredicate, VariantPredicate, ExonPredicate, ProtFeatureTypePredicate, ProtFeaturePredicate
from .predicate import HOMOZYGOUS, HETEROZYGOUS, NO_VARIANT
Expand Down
5 changes: 2 additions & 3 deletions src/genophenocorr/analysis/predicate/_all_predicates.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@

import hpotk

from genophenocorr.constants import VariantEffect
from genophenocorr.model import Patient, FeatureType, Genotype
from genophenocorr.model import Patient, FeatureType, Genotype, VariantEffect
from ._api import PolyPredicate, PatientCategory


Expand Down Expand Up @@ -105,7 +104,7 @@ def test(self, patient: Patient, query: VariantEffect) -> typing.Optional[Patien
for trans in var.tx_annotations:
if trans.transcript_id == self._transcript:
for var_eff in trans.variant_effects:
if var_eff == str(query):
if var_eff == query:
vars.add(var)
if len(vars) == 1:
for v in vars:
Expand Down
1 change: 0 additions & 1 deletion src/genophenocorr/constants/__init__.py

This file was deleted.

8 changes: 4 additions & 4 deletions src/genophenocorr/data/_toy.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def get_toy_cohort() -> Cohort:
prot = ProteinMetadata('NP_09876.5', 'FakeProtein', [prot_feat_1, prot_feat_2])

snv = Variant.create_variant_from_scratch(VariantCoordinates(make_region(280, 281), 'A', 'G', 0), 'FakeGene',
'NM_1234.5', 'NM_1234.5:c.180A>G', False, ['missense_variant'], [1],
'NM_1234.5', 'NM_1234.5:c.180A>G', False, [VariantEffect.MISSENSE_VARIANT], [1],
[prot], 60, 60,
Genotypes.from_mapping({
'A': Genotype.HETEROZYGOUS, 'B': Genotype.HETEROZYGOUS,
Expand All @@ -54,7 +54,7 @@ def get_toy_cohort() -> Cohort:
)
deletion = Variant.create_variant_from_scratch(VariantCoordinates(make_region(360, 363), 'TTC', 'T', -2),
'FakeGene', 'NM_1234.5', 'NM_1234.5:c.261_263del',
False, ['frameshift_variant'],
False, [VariantEffect.FRAMESHIFT_VARIANT],
[2], [prot], 86, 87,
Genotypes.from_mapping({
'D': Genotype.HETEROZYGOUS, 'F': Genotype.HETEROZYGOUS,
Expand All @@ -69,11 +69,11 @@ def get_toy_cohort() -> Cohort:
)
het_dup = Variant.create_variant_from_scratch(
VariantCoordinates(make_region(175, 176), 'T', 'TG', 1), 'FakeGene', 'NM_1234.5',
'NM_1234.5:c.75A>G', False, ['frameshift_variant'], [1], [prot], 25, 25,
'NM_1234.5:c.75A>G', False, [VariantEffect.FRAMESHIFT_VARIANT], [1], [prot], 25, 25,
Genotypes.empty()) # Not used in the patients below, hence `empty()`.
hom_dup = Variant.create_variant_from_scratch(
VariantCoordinates(make_region(175, 176), 'T', 'TG', 1),'FakeGene', 'NM_1234.5',
'NM_1234.5:c.75A>G', False, ['frameshift_variant'], [1], [prot], 25, 25,
'NM_1234.5:c.75A>G', False, [VariantEffect.FRAMESHIFT_VARIANT], [1], [prot], 25, 25,
Genotypes.empty()) # Not used in the patients below, hence `empty()`.

patients = (
Expand Down
2 changes: 2 additions & 0 deletions src/genophenocorr/model/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,13 @@
from ._protein import FeatureInfo, FeatureType, ProteinFeature, ProteinMetadata
from ._tx import TranscriptCoordinates
from ._variant import VariantCoordinates, TranscriptAnnotation, TranscriptInfoAware, Variant
from ._variant_effects import VariantEffect

__all__ = [
'Cohort', 'Patient',
'Phenotype',
'Variant', 'VariantCoordinates', 'Genotype', 'Genotypes', 'Genotyped',
'TranscriptAnnotation', 'TranscriptInfoAware', 'TranscriptCoordinates',
'ProteinMetadata', 'ProteinFeature', 'FeatureInfo', 'FeatureType',
'VariantEffect'
]
2 changes: 1 addition & 1 deletion src/genophenocorr/model/_cohort.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ def list_data_by_tx(self, transcript=None):
for var in self.all_variants:
for trans in var.tx_annotations:
if trans.transcript_id in var_type_dict:
var_type_dict.get(trans.transcript_id).update(trans.variant_effects)
var_type_dict.get(trans.transcript_id).update([var_eff.name for var_eff in trans.variant_effects])
too_small = []
for tx_id, var_effect_counter in var_type_dict.items():
if len(var_effect_counter) <= 1:
Expand Down
7 changes: 4 additions & 3 deletions src/genophenocorr/model/_variant.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from .genome import GenomicRegion
from ._gt import Genotyped, Genotypes
from ._protein import ProteinMetadata
from ._variant_effects import VariantEffect


class TranscriptInfoAware(metaclass=abc.ABCMeta):
Expand Down Expand Up @@ -52,7 +53,7 @@ def __init__(self, gene_id: str,
tx_id: str,
hgvsc: typing.Optional[str],
is_preferred: bool,
variant_effects,
variant_effects: typing.Optional[typing.Sequence[VariantEffect]],
affected_exons: typing.Optional[typing.Sequence[int]],
affected_protein: typing.Sequence[ProteinMetadata],
protein_effect_start: typing.Optional[int],
Expand Down Expand Up @@ -115,7 +116,7 @@ def hgvsc_id(self) -> typing.Optional[str]:
return self._hgvsc_id

@property
def variant_effects(self) -> typing.Sequence[str]:
def variant_effects(self) -> typing.Sequence[VariantEffect]:
"""
Returns:
Sequence[string]: A sequence of variant effects.
Expand Down Expand Up @@ -377,7 +378,7 @@ def create_variant_from_scratch(variant_coordinates: VariantCoordinates,
trans_id: str,
hgvsc_id: str,
is_preferred: bool,
consequences: typing.Sequence[str],
consequences: typing.Sequence[VariantEffect],
exons_effected: typing.Sequence[int],
protein: typing.Sequence[ProteinMetadata],
protein_effect_start: int,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,4 +43,12 @@ class VariantEffect(Enum):
SEQUENCE_VARIANT = "SO:0001060"

def __str__(self) -> str:
return self.name.lower()
return self.name.lower()

def __eq__(self, other) -> bool:
return isinstance(other, VariantEffect) \
and self.value == other.value \
and self.name == other.name

def __hash__(self) -> int:
return hash((self.value, self.name))
10 changes: 1 addition & 9 deletions src/genophenocorr/preprocessing/_phenopacket.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,15 +53,7 @@ def find_coordinates(self, item: GenomicInterpretation) -> typing.Tuple[VariantC
alt = '<DUP>'
else:
alt = '<DEL>'
chrom = re.findall(r'NC_0000(\d{2})\.\d*',
variant_descriptor.variation.copy_number.allele.sequence_location.sequence_id)[0]
if chrom.startswith('0'):
chrom = str(int(chrom))
elif chrom == '23':
chrom = 'X'
elif chrom == '24':
chrom = 'Y'
contig = self._build.contig_by_name(chrom)
contig = self._build._contig_by_name[variant_descriptor.variation.copy_number.allele.sequence_location.sequence_id.split(':')[1]]
elif len(variant_descriptor.vcf_record.chrom) != 0 and len(
variant_descriptor.variation.copy_number.allele.sequence_location.sequence_id) == 0:
ref = variant_descriptor.vcf_record.ref
Expand Down
6 changes: 3 additions & 3 deletions src/genophenocorr/preprocessing/_test_vep.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import pytest

from genophenocorr.model import VariantCoordinates
from genophenocorr.model import VariantCoordinates, VariantEffect
from genophenocorr.model.genome import GenomicRegion, Strand, GRCh38

from ._vep import verify_start_end_coordinates, VepFunctionalAnnotator
Expand Down Expand Up @@ -79,7 +79,7 @@ def test__process_item_missense(self, variant_annotator: VepFunctionalAnnotator)
assert preferred.transcript_id == 'NM_013275.6'
assert preferred.is_preferred is True
assert preferred.hgvsc_id == 'NM_013275.6:c.7407C>G'
assert preferred.variant_effects == ('stop_gained',)
assert preferred.variant_effects == (VariantEffect.STOP_GAINED,)
assert preferred.overlapping_exons == (9,)

def test__process_item_deletion(self, variant_annotator: VepFunctionalAnnotator):
Expand All @@ -99,7 +99,7 @@ def test__process_item_deletion(self, variant_annotator: VepFunctionalAnnotator)
assert preferred.transcript_id == 'NM_013275.6'
assert preferred.is_preferred is True
assert preferred.hgvsc_id == 'NM_013275.6:c.2408_2412del'
assert preferred.variant_effects == ('frameshift_variant',)
assert preferred.variant_effects == (VariantEffect.FRAMESHIFT_VARIANT,)
assert preferred.overlapping_exons == (9,)

def _load_response_json(self, test_name: str):
Expand Down
11 changes: 9 additions & 2 deletions src/genophenocorr/preprocessing/_vep.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import requests

from genophenocorr.model import VariantCoordinates, TranscriptAnnotation
from genophenocorr.model import VariantCoordinates, TranscriptAnnotation, VariantEffect
from ._api import FunctionalAnnotator, ProteinMetadataService


Expand Down Expand Up @@ -90,7 +90,14 @@ def _process_item(self, item) -> typing.Optional[TranscriptAnnotation]:
return None
is_preferred = True if 'canonical' in item and item['canonical'] else False
hgvsc_id = item.get('hgvsc')
var_effects = []
consequences = item.get('consequence_terms')
for con in consequences:
if con[0] == '5':
con = "FIVE_PRIME_UTR_VARIANT"
if con[0] == '3':
con = 'THREE_PRIME_UTR_VARIANT'
var_effects.append(VariantEffect[con.upper()])
Copy link
Member

@ielis ielis Oct 11, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you should make the line #100 a little more verbose and add code to handle failure. What if VEP returns a value that does not have a corresponding enum member? I think we should log a warning and ask the user to report missing variant effect to developers (make a Github issue), so that we can add it.

I think you should put the logic for decoding str -> VariantEffect into a separate private method, e.g.

def _parse_variant_effect(self, effect: str) -> typing.Optional[VariantEffect]:
  # TODO implement
  pass

Basically, the code that you have in the loop should be the method's body. Then, the loop applies the method and appends the variant effect to var_effects if it is not None. The warning can be made in the method's body.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ielis I have made this change, let me know if this is what you were thinking. I'll merge the changes once I get the okay!

gene_name = item.get('gene_symbol')
protein_id = item.get('protein_id')
protein = self._protein_annotator.annotate(protein_id)
Expand All @@ -112,7 +119,7 @@ def _process_item(self, item) -> typing.Optional[TranscriptAnnotation]:
trans_id,
hgvsc_id,
is_preferred,
consequences,
var_effects,
exons_effected,
protein,
protein_effect_start,
Expand Down
16 changes: 8 additions & 8 deletions tests/fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,44 +36,44 @@ def toy_cohort() -> Cohort:

dup = Variant(VariantCoordinates(make_region("16", 89279849, 89279850), ref='G', alt='GC', change_length=1),
[
TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.6691dup', False, ['frameshift_variant'], [9],
TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.6691dup', False, [VariantEffect.FRAMESHIFT_VARIANT], [9],
[prot], 2231, 2231)
],
Genotypes.from_mapping({'HetSingleVar': Genotype.HETEROZYGOUS}))
indel = Variant(VariantCoordinates(make_region("16", 89284600, 89284602), ref='GG', alt='A', change_length=-1),
[
TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.1940_1941delinsT', False, ['frameshift_variant'],
TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.1940_1941delinsT', False, [VariantEffect.FRAMESHIFT_VARIANT],
[9], [prot], 647, 647)
],
Genotypes.from_mapping({'HetDoubleVar1': Genotype.HETEROZYGOUS}))
snv_stop_gain = Variant(VariantCoordinates(make_region("16", 89280751, 89280752), ref='G', alt='T', change_length=0),
[
TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.5790C>A', False, ['stop_gained'], [9], [prot],
TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.5790C>A', False, [VariantEffect.STOP_GAINED], [9], [prot],
1930, 1930)],
Genotypes.from_mapping({'HetDoubleVar1': Genotype.HETEROZYGOUS}))
snv_missense = Variant(VariantCoordinates(make_region("16", 89275127, 89275128), ref='G', alt='A', change_length=0),
[
TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.7534C>T', False, ['missense_variant'], [10],
TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.7534C>T', False, [VariantEffect.MISSENSE_VARIANT], [10],
[prot], 2512, 2512)
],
Genotypes.from_mapping({'HetDoubleVar2': Genotype.HETEROZYGOUS}))
del_frameshift = Variant(VariantCoordinates(make_region("16", 89279707, 89279725), ref='AGTGTTCGGGGCGGGGCC', alt='A', change_length=-17),
[
TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.6817_6833del', False, ['frameshift_variant'],
TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.6817_6833del', False, [VariantEffect.FRAMESHIFT_VARIANT],
[9], [prot], 2273, 2278)
],
Genotypes.from_mapping({'HetDoubleVar2': Genotype.HETEROZYGOUS}))
del_small = Variant(VariantCoordinates(make_region("16", 89279457, 89279459), ref='TG', alt='T', change_length=-1),
[
TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.7083del', False, ['frameshift_variant'], [9],
TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.7083del', False, [VariantEffect.FRAMESHIFT_VARIANT], [9],
[prot], 2361, 2362)
],
Genotypes.from_mapping({'HomoVar': Genotype.HOMOZYGOUS_ALTERNATE}))
del_large = Variant(VariantCoordinates(make_region("16", 89_190_070, 89_439_815), ref='N', alt='<DEL>', change_length=-249_745),
[
TranscriptAnnotation('ANKRD11', 'NM_013275.6', None, False,
['stop_lost', 'feature_truncation', 'coding_sequence_variant', '5_prime_UTR_variant',
'3_prime_UTR_variant', 'intron_variant'], [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
[VariantEffect.STOP_LOST, VariantEffect.FEATURE_TRUNCATION, VariantEffect.CODING_SEQUENCE_VARIANT, VariantEffect.FIVE_PRIME_UTR_VARIANT,
VariantEffect.THREE_PRIME_UTR_VARIANT, VariantEffect.INTRON_VARIANT], [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
[prot], None, None)
],
Genotypes.from_mapping({'LargeCNV': Genotype.HETEROZYGOUS}))
Expand Down
3 changes: 1 addition & 2 deletions tests/genome_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@

import pytest

from genophenocorr import VariantEffect
from genophenocorr.model import TranscriptCoordinates, ProteinMetadata, Variant, VariantCoordinates, TranscriptAnnotation, Genotypes
from genophenocorr.model import TranscriptCoordinates, ProteinMetadata, Variant, VariantCoordinates, TranscriptAnnotation, Genotypes, VariantEffect
from genophenocorr.model.genome import Contig, GenomicRegion, Strand


Expand Down
3 changes: 1 addition & 2 deletions tests/test_predicates.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@
import pytest

from genophenocorr.analysis.predicate import *
from genophenocorr.constants import VariantEffect
from genophenocorr.model import Cohort, Patient, FeatureType
from genophenocorr.model import Cohort, Patient, FeatureType, VariantEffect
from .fixtures import toy_hpo, toy_cohort


Expand Down