Skip to content

Commit

Permalink
Merge pull request #85 from monarch-initiative/cohort_viewer
Browse files Browse the repository at this point in the history
Created Cohort Viewer that shows Excluded Patients
  • Loading branch information
lnrekerle authored Oct 31, 2023
2 parents dfe5c50 + 016629e commit 60328e5
Show file tree
Hide file tree
Showing 12 changed files with 1,766 additions and 1,473 deletions.
2,804 changes: 1,429 additions & 1,375 deletions notebooks/KBG/KBG_Martinez_PMID_36446582_RunGenoPhenoCorr.ipynb

Large diffs are not rendered by default.

32 changes: 17 additions & 15 deletions src/genophenocorr/analysis/predicate/_all_predicates.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,22 +12,22 @@ class HPOPresentPredicate(PolyPredicate[hpotk.TermId]):
name='Observed',
description="""
The sample *is* annotated with the tested phenotype feature `q`.
This is either because the sample is annotated with `q` (exact match),
or because one of sample's annotations is a descendant `q` (annotation propagation).
For instance, we tested for a Seizure and the sample *had* a Clonic seizure
For instance, we tested for a Seizure and the sample *had* a Clonic seizure
(a descendant of Seizure).
""")

NOT_OBSERVED = PatientCategory(cat_id=1,
name='Not observed',
description="""
We are particular about the sample *not* having the tested feature `q`.
In other words, `q` was *excluded* in the sample or the sample is annotated with an excluded ancestor of `q`.
For instance, we tested for a Clonic seizure and the sample did *not* have any Seizure, which implies
*not* Clonic seizure.
For instance, we tested for a Clonic seizure and the sample did *not* have any Seizure, which implies
*not* Clonic seizure.
""")

NOT_MEASURED = PatientCategory(cat_id=2,
Expand All @@ -36,7 +36,7 @@ class HPOPresentPredicate(PolyPredicate[hpotk.TermId]):
We do not know if the sample has or has not the tested feature.
""")

def __init__(self,
def __init__(self,
hpo: hpotk.MinimalOntology) -> None:
self._hpo = hpotk.util.validate_instance(hpo, hpotk.MinimalOntology, 'hpo')

Expand Down Expand Up @@ -69,13 +69,13 @@ def test(self, patient: Patient, query: hpotk.TermId) -> typing.Optional[Patient
HETEROZYGOUS = PatientCategory(cat_id=0,
name='Heterozygous',
description="""
This sample has the tested attribute on one allele.
This sample has the tested attribute on one allele.
""")

HOMOZYGOUS = PatientCategory(cat_id=1,
name='Homozygous',
description="""
This sample has the tested attribute on both alleles.
This sample has the tested attribute on both alleles.
""")

NO_VARIANT = PatientCategory(cat_id=2,
Expand All @@ -87,7 +87,7 @@ def test(self, patient: Patient, query: hpotk.TermId) -> typing.Optional[Patient


class VariantEffectPredicate(PolyPredicate[VariantEffect]):

def __init__(self, transcript:str) -> None:
self._transcript = transcript

Expand Down Expand Up @@ -205,11 +205,12 @@ def test(self, patient: Patient, query:FeatureType) -> typing.Optional[PatientCa
for var in patient.variants:
for trans in var.tx_annotations:
if trans.transcript_id == self._transcript:
if trans.protein_effect_location is not None and trans.protein_effect_location[0] is not None and trans.protein_effect_location[1] is not None:
protein_location = trans.protein_effect_location
if protein_location is not None:
for prot in trans.protein_affected:
for feat in prot.protein_features:
if feat.feature_type == query:
if len(list(range(max(trans.protein_effect_location[0], feat.info.start), min(trans.protein_effect_location[1], feat.info.end) + 1))) > 0:
if len(list(range(max(protein_location.start, feat.info.start), min(protein_location.end, feat.info.end) + 1))) > 0:
vars.add(var)
if len(vars) == 1:
for v in vars:
Expand Down Expand Up @@ -249,11 +250,12 @@ def test(self, patient: Patient, query: str) -> typing.Optional[PatientCategory]
for var in patient.variants:
for trans in var.tx_annotations:
if trans.transcript_id == self._transcript:
if trans.protein_effect_location is not None and trans.protein_effect_location[0] is not None and trans.protein_effect_location[1] is not None:
protein_location = trans.protein_effect_location
if protein_location is not None:
for prot in trans.protein_affected:
for feat in prot.protein_features:
if feat.info.name == query:
if len(list(range(max(trans.protein_effect_location[0], feat.info.start), min(trans.protein_effect_location[1], feat.info.end) + 1))) > 0:
if len(list(range(max(protein_location.start, feat.info.start), min(protein_location.end, feat.info.end) + 1))) > 0:
vars.add(var)
if len(vars) == 1:
for v in vars:
Expand All @@ -268,4 +270,4 @@ def test(self, patient: Patient, query: str) -> typing.Optional[PatientCategory]
elif len(vars) > 1:
return HOMOZYGOUS
else:
return NO_VARIANT
return NO_VARIANT
43 changes: 39 additions & 4 deletions src/genophenocorr/model/_cohort.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def __hash__(self) -> int:
class Cohort(typing.Sized):

@staticmethod
def from_patients(members: typing.Sequence[Patient]):
def from_patients(members: typing.Sequence[Patient], include_patients_with_no_HPO: bool = False):
"""
Create a cohort from a sequence of patients.
Expand All @@ -99,17 +99,21 @@ def from_patients(members: typing.Sequence[Patient]):
cohort_variants, cohort_phenotypes, cohort_proteins = set(), set(), set() # , cohort_proteins
var_counts, pheno_count, prot_counts = Counter(), Counter(), Counter() # , prot_counts
members = set(members)
excluded_members = []
for patient in members:
if len(patient.phenotypes) == 0 and not include_patients_with_no_HPO:
excluded_members.append(patient)
continue
cohort_phenotypes.update(patient.phenotypes)
cohort_variants.update(patient.variants)
var_counts.update([var.variant_coordinates.variant_key for var in patient.variants])
cohort_phenotypes.update(patient.phenotypes)
pheno_count.update([pheno.identifier.value for pheno in patient.phenotypes if pheno.observed == True])
cohort_proteins.update(patient.proteins)
prot_counts.update([prot.protein_id for prot in patient.proteins])
all_counts = {'patients': len(members), 'variants': var_counts, 'phenotypes': pheno_count,
'proteins': prot_counts} # 'proteins':prot_counts
return Cohort(members, cohort_phenotypes, cohort_variants, cohort_proteins,
all_counts) # cohort_proteins, all_counts
all_counts, excluded_members) # cohort_proteins, all_counts

"""This class creates a collection of patients and makes it easier to determine overlapping diseases,
phenotypes, variants, and proteins among the patients. If a list of JSON files is given, it will
Expand All @@ -130,7 +134,7 @@ def from_patients(members: typing.Sequence[Patient]):
list_data_by_tx(transcript:Optional[string]): A list and count of all the variants effects found for all transcripts or a given transcript if transcript is not None.
"""

def __init__(self, patient_set: typing.Set[Patient], phenotype_set, variant_set, protein_set, counts_dict,
def __init__(self, patient_set: typing.Set[Patient], phenotype_set, variant_set, protein_set, counts_dict, excluded_members,
recessive=False):
"""Constructs all necessary attributes for a Cohort object
Expand All @@ -151,6 +155,7 @@ def __init__(self, patient_set: typing.Set[Patient], phenotype_set, variant_set,
self._protein_set = protein_set
self._variant_set = variant_set
self._all_counts_dict = counts_dict
self._excluded_members = excluded_members
self._recessive = recessive

@property
Expand Down Expand Up @@ -196,6 +201,10 @@ def all_transcripts(self):
all_trans.update([trans.transcript_id for trans in var.tx_annotations])
return all_trans

@property
def all_excluded_patients(self):
return self._excluded_members

@property
def total_patient_count(self):
"""
Expand Down Expand Up @@ -261,5 +270,31 @@ def list_data_by_tx(self, transcript=None):
del var_type_dict[tx_id]
return var_type_dict

def get_excluded_ids(self):
return [ex.patient_id for ex in self.all_excluded_patients]

def get_excluded_count(self):
return len(self.all_excluded_patients)

def get_protein_features_affected(self, transcript):
all_features = Counter()
protein_set = set()
var_coords = []
for var in self.all_variants:
for tx in var.tx_annotations:
if tx.transcript_id == transcript:
protein_set.add(tx.protein_affected)
if tx.protein_effect_location is None or tx.protein_effect_location[0] is None or tx.protein_effect_location[1] is None:
continue
else:
var_coords.append(tx.protein_effect_location)
if len(protein_set) != 1:
raise ValueError(f"Found more than 1 protein: {protein_set}")
else:
protein = list(protein_set)[0][0]
for pair in var_coords:
all_features.update(list(protein.get_features_variant_overlaps(pair[0], pair[1])))
return all_features

def __len__(self) -> int:
return len(self._patient_set)
17 changes: 17 additions & 0 deletions src/genophenocorr/model/_protein.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from .genome import Region



class FeatureInfo:
"""A class that represents a protein feature
(e.g. a repeated sequence given the name "ANK 1" in protein "Ankyrin repeat domain-containing protein 11")
Expand Down Expand Up @@ -242,6 +243,22 @@ def motifs(self) -> typing.Iterable[ProteinFeature]:
"""
return filter(lambda f: f.feature_type == FeatureType.MOTIF, self.protein_features)

def get_features_variant_overlaps(self, region: Region) -> typing.Collection[ProteinFeature]:
"""
Get a collection of protein features that overlap with the `region`.
Args:
region: the query region.
Returns:
Collection[ProteinFeature]: a collection of overlapping protein features.
"""
affected_features = set()
for feat in self.protein_features:
if feat.info.region.overlaps_with_region(region):
affected_features.add(feat)

return affected_features

def __str__(self) -> str:
return f"ProteinMetadata(id={self.protein_id}, " \
f"label={self.label}, " \
Expand Down
59 changes: 25 additions & 34 deletions src/genophenocorr/model/_variant.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import hpotk

from .genome import GenomicRegion
from .genome import Region, GenomicRegion
from ._gt import Genotyped, Genotypes
from ._protein import ProteinMetadata
from ._variant_effects import VariantEffect
Expand Down Expand Up @@ -38,49 +38,38 @@ def transcript_id(self) -> str:
class TranscriptAnnotation(TranscriptInfoAware):
"""Class that represents results of the functional annotation of a variant with respect to single transcript of a gene.
Attributes:
Args:
gene_id (string): The gene symbol associated with the transcript
transcript_id (string): The transcript ID
hgvsc_id (string): The HGVS "coding-DNA" ID if available, else None
tx_id (string): The transcript ID
hgvsc (string): The HGVS "coding-DNA" ID if available, else None
is_preferred (bool): The transcript is a MANE transcript, canonical Ensembl transcript, etc.
variant_effects (Sequence[string]): A sequence of predicted effects given by VEP
overlapping_exons (Sequence[integer]): A sequence of exons affected by the variant. Returns None if none are affected.
protein_affected (ProteinMetadata): A ProteinMetadata object representing the protein affected by this transcript
protein_effect_location (Tuple(integer, integer)): The start and end coordinates of the effect on the protein sequence.
variant_effects (Iterable[string]): An iterable of predicted effects given by VEP
affected_exons (Iterable[integer]): An iterable of exons affected by the variant. Returns None if none are affected.
affected_protein (ProteinMetadata): A ProteinMetadata object representing the protein affected by this transcript
protein_effect_coordinates (Region, optional): An optional :class:`Region` with start and end coordinates
of the effect on the protein sequence.
"""

def __init__(self, gene_id: str,
tx_id: str,
hgvsc: typing.Optional[str],
is_preferred: bool,
variant_effects: typing.Iterable[VariantEffect],
affected_exons: typing.Optional[typing.Sequence[int]],
affected_protein: typing.Sequence[ProteinMetadata],
protein_effect_start: typing.Optional[int],
protein_effect_end: typing.Optional[int]):
"""Constructs all necessary attributes for a TranscriptAnnotation object
Args:
gene_id (string): The gene symbol associated with the transcript
tx_id (string): The transcript ID
hgvsc (string, Optional): The HGVS "coding-DNA" ID if available, else None
variant_effects (Iterable[string]): An iterable of predicted effects given by functional annotator
affected_exons (Sequence[integer], Optional): A sequence of exons affected by the variant. Returns None if none are affected.
affected_protein (Sequence[ProteinMetadata]): A ProteinMetadata object representing the protein affected by this transcript
protein_effect_start (integer, Optional): The start coordinate of the effect on the protein sequence.
protein_effect_end (integer, Optional): The end coordinate of the effect on the protein sequence.
"""
self._gene_id = gene_id
self._tx_id = tx_id
self._hgvsc_id = hgvsc
self._is_preferred = is_preferred
affected_exons: typing.Optional[typing.Iterable[int]],
affected_protein: typing.Iterable[ProteinMetadata],
protein_effect_coordinates: typing.Optional[Region]):
self._gene_id = hpotk.util.validate_instance(gene_id, str, 'gene_id')
self._tx_id = hpotk.util.validate_instance(tx_id, str, 'tx_id')
self._hgvsc_id = hpotk.util.validate_optional_instance(hgvsc, str, 'hgvsc')
self._is_preferred = hpotk.util.validate_instance(is_preferred, bool, 'is_preferred')
self._variant_effects = tuple(variant_effects)
if affected_exons is not None:
self._affected_exons = tuple(affected_exons)
else:
self._affected_exons = None
self._affected_protein = tuple(affected_protein)
self._protein_effect_location = (protein_effect_start, protein_effect_end)
self._protein_effect_location = hpotk.util.validate_optional_instance(protein_effect_coordinates, Region,
'protein_effect_coordinates')

@property
def gene_id(self) -> str:
Expand Down Expand Up @@ -142,10 +131,11 @@ def protein_affected(self) -> typing.Sequence[ProteinMetadata]:
return self._affected_protein

@property
def protein_effect_location(self) -> typing.Tuple[int, int]:
def protein_effect_location(self) -> typing.Optional[Region]:
"""
Returns:
Tuple(integer, integer): The start and end position on the protein sequence that the variant effects. (e.g. (1234, 1235))
Region: a :class:`genophenocorr.model.genome.Region` with start and end position on the protein sequence
that the variant affects.
"""
return self._protein_effect_location

Expand Down Expand Up @@ -381,9 +371,10 @@ def create_variant_from_scratch(variant_coordinates: VariantCoordinates,
protein_effect_start: int,
protein_effect_end: int,
genotypes: Genotypes):
transcript = TranscriptAnnotation(gene_name, trans_id, hgvsc_id, is_preferred, consequences, exons_effected, protein,
protein_effect_start, protein_effect_end)
return Variant(variant_coordinates, [transcript], genotypes)
protein_effect = Region(protein_effect_start, protein_effect_end)
transcript = TranscriptAnnotation(gene_name, trans_id, hgvsc_id, is_preferred, consequences, exons_effected,
protein, protein_effect)
return Variant(variant_coordinates, (transcript,), genotypes)

def __init__(self, var_coordinates: VariantCoordinates,
tx_annotations: typing.Iterable[TranscriptAnnotation],
Expand Down
Loading

0 comments on commit 60328e5

Please sign in to comment.