Merge pull request #85 from monarch-initiative/cohort_viewer

Created Cohort Viewer that shows Excluded Patients
monarch-initiative · Oct 31, 2023 · 60328e5 · 60328e5
2 parents dfe5c50 + 016629e
commit 60328e5
Show file tree

Hide file tree

Showing 12 changed files with 1,766 additions and 1,473 deletions.
diff --git a/notebooks/KBG/KBG_Martinez_PMID_36446582_RunGenoPhenoCorr.ipynb b/notebooks/KBG/KBG_Martinez_PMID_36446582_RunGenoPhenoCorr.ipynb
diff --git a/src/genophenocorr/analysis/predicate/_all_predicates.py b/src/genophenocorr/analysis/predicate/_all_predicates.py
@@ -12,22 +12,22 @@ class HPOPresentPredicate(PolyPredicate[hpotk.TermId]):
                                name='Observed',
                                description="""
                                The sample *is* annotated with the tested phenotype feature `q`.
-                               
+
                                This is either because the sample is annotated with `q` (exact match),
                                or because one of sample's annotations is a descendant `q` (annotation propagation).
-                               For instance, we tested for a Seizure and the sample *had* a Clonic seizure 
+                               For instance, we tested for a Seizure and the sample *had* a Clonic seizure
                                (a descendant of Seizure).
                                """)
 
     NOT_OBSERVED = PatientCategory(cat_id=1,
                                    name='Not observed',
                                    description="""
                                    We are particular about the sample *not* having the tested feature `q`.
-                                   
+
                                    In other words, `q` was *excluded* in the sample or the sample is annotated with an excluded ancestor of `q`.
-                                   
-                                   For instance, we tested for a Clonic seizure and the sample did *not* have any Seizure, which implies 
-                                   *not* Clonic seizure.  
+
+                                   For instance, we tested for a Clonic seizure and the sample did *not* have any Seizure, which implies
+                                   *not* Clonic seizure.
                                    """)
 
     NOT_MEASURED = PatientCategory(cat_id=2,
@@ -36,7 +36,7 @@ class HPOPresentPredicate(PolyPredicate[hpotk.TermId]):
                                    We do not know if the sample has or has not the tested feature.
                                    """)
 
-    def __init__(self, 
+    def __init__(self,
                  hpo: hpotk.MinimalOntology) -> None:
         self._hpo = hpotk.util.validate_instance(hpo, hpotk.MinimalOntology, 'hpo')
 
@@ -69,13 +69,13 @@ def test(self, patient: Patient, query: hpotk.TermId) -> typing.Optional[Patient
 HETEROZYGOUS = PatientCategory(cat_id=0,
                             name='Heterozygous',
                             description="""
-                            This sample has the tested attribute on one allele. 
+                            This sample has the tested attribute on one allele.
                             """)
 
 HOMOZYGOUS = PatientCategory(cat_id=1,
                                 name='Homozygous',
                                 description="""
-                                This sample has the tested attribute on both alleles. 
+                                This sample has the tested attribute on both alleles.
                                 """)
 
 NO_VARIANT = PatientCategory(cat_id=2,
@@ -87,7 +87,7 @@ def test(self, patient: Patient, query: hpotk.TermId) -> typing.Optional[Patient
 
 
 class VariantEffectPredicate(PolyPredicate[VariantEffect]):
-    
+
     def __init__(self, transcript:str) -> None:
         self._transcript = transcript
 
@@ -205,11 +205,12 @@ def test(self, patient: Patient, query:FeatureType) -> typing.Optional[PatientCa
         for var in patient.variants:
             for trans in var.tx_annotations:
                 if trans.transcript_id == self._transcript:
-                    if trans.protein_effect_location is not None and trans.protein_effect_location[0] is not None and trans.protein_effect_location[1] is not None:
+                    protein_location = trans.protein_effect_location
+                    if protein_location is not None:
                         for prot in trans.protein_affected:
                             for feat in prot.protein_features:
                                 if feat.feature_type == query:
-                                    if len(list(range(max(trans.protein_effect_location[0], feat.info.start), min(trans.protein_effect_location[1], feat.info.end) + 1))) > 0:
+                                    if len(list(range(max(protein_location.start, feat.info.start), min(protein_location.end, feat.info.end) + 1))) > 0:
                                         vars.add(var)
         if len(vars) == 1:
             for v in vars:
@@ -249,11 +250,12 @@ def test(self, patient: Patient, query: str) -> typing.Optional[PatientCategory]
         for var in patient.variants:
             for trans in var.tx_annotations:
                 if trans.transcript_id == self._transcript:
-                    if trans.protein_effect_location is not None and trans.protein_effect_location[0] is not None and trans.protein_effect_location[1] is not None:
+                    protein_location = trans.protein_effect_location
+                    if protein_location is not None:
                         for prot in trans.protein_affected:
                             for feat in prot.protein_features:
                                 if feat.info.name == query:
-                                    if len(list(range(max(trans.protein_effect_location[0], feat.info.start), min(trans.protein_effect_location[1], feat.info.end) + 1))) > 0:
+                                    if len(list(range(max(protein_location.start, feat.info.start), min(protein_location.end, feat.info.end) + 1))) > 0:
                                         vars.add(var)
         if len(vars) == 1:
             for v in vars:
@@ -268,4 +270,4 @@ def test(self, patient: Patient, query: str) -> typing.Optional[PatientCategory]
         elif len(vars) > 1:
             return HOMOZYGOUS
         else:
-            return NO_VARIANT
+            return NO_VARIANT
diff --git a/src/genophenocorr/model/_cohort.py b/src/genophenocorr/model/_cohort.py
@@ -89,7 +89,7 @@ def __hash__(self) -> int:
 class Cohort(typing.Sized):
 
     @staticmethod
-    def from_patients(members: typing.Sequence[Patient]):
+    def from_patients(members: typing.Sequence[Patient], include_patients_with_no_HPO: bool = False):
         """
         Create a cohort from a sequence of patients.
 
@@ -99,17 +99,21 @@ def from_patients(members: typing.Sequence[Patient]):
         cohort_variants, cohort_phenotypes, cohort_proteins = set(), set(), set()  # , cohort_proteins
         var_counts, pheno_count, prot_counts = Counter(), Counter(), Counter()  # , prot_counts
         members = set(members)
+        excluded_members = []
         for patient in members:
+            if len(patient.phenotypes) == 0 and not include_patients_with_no_HPO:
+                excluded_members.append(patient)
+                continue
+            cohort_phenotypes.update(patient.phenotypes)
             cohort_variants.update(patient.variants)
             var_counts.update([var.variant_coordinates.variant_key for var in patient.variants])
-            cohort_phenotypes.update(patient.phenotypes)
             pheno_count.update([pheno.identifier.value for pheno in patient.phenotypes if pheno.observed == True])
             cohort_proteins.update(patient.proteins)
             prot_counts.update([prot.protein_id for prot in patient.proteins])
         all_counts = {'patients': len(members), 'variants': var_counts, 'phenotypes': pheno_count,
                       'proteins': prot_counts}  # 'proteins':prot_counts
         return Cohort(members, cohort_phenotypes, cohort_variants, cohort_proteins,
-                      all_counts)  # cohort_proteins, all_counts
+                      all_counts, excluded_members)  # cohort_proteins, all_counts
 
     """This class creates a collection of patients and makes it easier to determine overlapping diseases, 
     phenotypes, variants, and proteins among the patients. If a list of JSON files is given, it will
@@ -130,7 +134,7 @@ def from_patients(members: typing.Sequence[Patient]):
         list_data_by_tx(transcript:Optional[string]): A list and count of all the variants effects found for all transcripts or a given transcript if transcript is not None.
     """
 
-    def __init__(self, patient_set: typing.Set[Patient], phenotype_set, variant_set, protein_set, counts_dict,
+    def __init__(self, patient_set: typing.Set[Patient], phenotype_set, variant_set, protein_set, counts_dict, excluded_members,
                  recessive=False):
         """Constructs all necessary attributes for a Cohort object
 
@@ -151,6 +155,7 @@ def __init__(self, patient_set: typing.Set[Patient], phenotype_set, variant_set,
         self._protein_set = protein_set
         self._variant_set = variant_set
         self._all_counts_dict = counts_dict
+        self._excluded_members = excluded_members
         self._recessive = recessive
 
     @property
@@ -196,6 +201,10 @@ def all_transcripts(self):
             all_trans.update([trans.transcript_id for trans in var.tx_annotations])
         return all_trans
 
+    @property
+    def all_excluded_patients(self):
+        return self._excluded_members
+
     @property
     def total_patient_count(self):
         """
@@ -261,5 +270,31 @@ def list_data_by_tx(self, transcript=None):
             del var_type_dict[tx_id]
         return var_type_dict
 
+    def get_excluded_ids(self):
+        return [ex.patient_id for ex in self.all_excluded_patients]
+
+    def get_excluded_count(self):
+        return len(self.all_excluded_patients)
+
+    def get_protein_features_affected(self, transcript):
+        all_features = Counter()
+        protein_set = set()
+        var_coords = []
+        for var in self.all_variants:
+            for tx in var.tx_annotations:
+                if tx.transcript_id == transcript:
+                    protein_set.add(tx.protein_affected)
+                    if tx.protein_effect_location is None or tx.protein_effect_location[0] is None or tx.protein_effect_location[1] is None:
+                        continue
+                    else:
+                        var_coords.append(tx.protein_effect_location)
+        if len(protein_set) != 1:
+            raise ValueError(f"Found more than 1 protein: {protein_set}")
+        else:
+            protein = list(protein_set)[0][0]
+        for pair in var_coords:
+            all_features.update(list(protein.get_features_variant_overlaps(pair[0], pair[1])))
+        return all_features
+
     def __len__(self) -> int:
         return len(self._patient_set)
diff --git a/src/genophenocorr/model/_protein.py b/src/genophenocorr/model/_protein.py
@@ -7,6 +7,7 @@
 from .genome import Region
 
 
+
 class FeatureInfo:
     """A class that represents a protein feature
     (e.g. a repeated sequence given the name "ANK 1" in protein "Ankyrin repeat domain-containing protein 11")
@@ -242,6 +243,22 @@ def motifs(self) -> typing.Iterable[ProteinFeature]:
         """
         return filter(lambda f: f.feature_type == FeatureType.MOTIF, self.protein_features)
 
+    def get_features_variant_overlaps(self, region: Region) -> typing.Collection[ProteinFeature]:
+        """
+        Get a collection of protein features that overlap with the `region`.
+        Args:
+            region: the query region.
+
+        Returns:
+            Collection[ProteinFeature]: a collection of overlapping protein features.
+        """
+        affected_features = set()
+        for feat in self.protein_features:
+            if feat.info.region.overlaps_with_region(region):
+                affected_features.add(feat)
+
+        return affected_features
+
     def __str__(self) -> str:
         return f"ProteinMetadata(id={self.protein_id}, " \
                f"label={self.label}, " \

diff --git a/src/genophenocorr/model/_variant.py b/src/genophenocorr/model/_variant.py
@@ -4,7 +4,7 @@
 
 import hpotk
 
-from .genome import GenomicRegion
+from .genome import Region, GenomicRegion
 from ._gt import Genotyped, Genotypes
 from ._protein import ProteinMetadata
 from ._variant_effects import VariantEffect
@@ -38,49 +38,38 @@ def transcript_id(self) -> str:
 class TranscriptAnnotation(TranscriptInfoAware):
     """Class that represents results of the functional annotation of a variant with respect to single transcript of a gene.
 
-    Attributes:
+    Args:
         gene_id (string): The gene symbol associated with the transcript
-        transcript_id (string): The transcript ID
-        hgvsc_id (string): The HGVS "coding-DNA" ID if available, else None
+        tx_id (string): The transcript ID
+        hgvsc (string): The HGVS "coding-DNA" ID if available, else None
         is_preferred (bool): The transcript is a MANE transcript, canonical Ensembl transcript, etc.
-        variant_effects (Sequence[string]): A sequence of predicted effects given by VEP
-        overlapping_exons (Sequence[integer]): A sequence of exons affected by the variant. Returns None if none are affected.
-        protein_affected (ProteinMetadata): A ProteinMetadata object representing the protein affected by this transcript
-        protein_effect_location (Tuple(integer, integer)): The start and end coordinates of the effect on the protein sequence.
+        variant_effects (Iterable[string]): An iterable of predicted effects given by VEP
+        affected_exons (Iterable[integer]): An iterable of exons affected by the variant. Returns None if none are affected.
+        affected_protein (ProteinMetadata): A ProteinMetadata object representing the protein affected by this transcript
+        protein_effect_coordinates (Region, optional): An optional :class:`Region` with start and end coordinates
+         of the effect on the protein sequence.
     """
 
     def __init__(self, gene_id: str,
                  tx_id: str,
                  hgvsc: typing.Optional[str],
                  is_preferred: bool,
                  variant_effects: typing.Iterable[VariantEffect],
-                 affected_exons: typing.Optional[typing.Sequence[int]],
-                 affected_protein: typing.Sequence[ProteinMetadata],
-                 protein_effect_start: typing.Optional[int],
-                 protein_effect_end: typing.Optional[int]):
-        """Constructs all necessary attributes for a TranscriptAnnotation object
-
-        Args:
-            gene_id (string): The gene symbol associated with the transcript
-            tx_id (string): The transcript ID
-            hgvsc (string, Optional): The HGVS "coding-DNA" ID if available, else None
-            variant_effects (Iterable[string]): An iterable of predicted effects given by functional annotator
-            affected_exons (Sequence[integer], Optional): A sequence of exons affected by the variant. Returns None if none are affected.
-            affected_protein (Sequence[ProteinMetadata]): A ProteinMetadata object representing the protein affected by this transcript
-            protein_effect_start (integer, Optional): The start coordinate of the effect on the protein sequence.
-            protein_effect_end (integer, Optional): The end coordinate of the effect on the protein sequence.
-        """
-        self._gene_id = gene_id
-        self._tx_id = tx_id
-        self._hgvsc_id = hgvsc
-        self._is_preferred = is_preferred
+                 affected_exons: typing.Optional[typing.Iterable[int]],
+                 affected_protein: typing.Iterable[ProteinMetadata],
+                 protein_effect_coordinates: typing.Optional[Region]):
+        self._gene_id = hpotk.util.validate_instance(gene_id, str, 'gene_id')
+        self._tx_id = hpotk.util.validate_instance(tx_id, str, 'tx_id')
+        self._hgvsc_id = hpotk.util.validate_optional_instance(hgvsc, str, 'hgvsc')
+        self._is_preferred = hpotk.util.validate_instance(is_preferred, bool, 'is_preferred')
         self._variant_effects = tuple(variant_effects)
         if affected_exons is not None:
             self._affected_exons = tuple(affected_exons)
         else:
             self._affected_exons = None
         self._affected_protein = tuple(affected_protein)
-        self._protein_effect_location = (protein_effect_start, protein_effect_end)
+        self._protein_effect_location = hpotk.util.validate_optional_instance(protein_effect_coordinates, Region,
+                                                                             'protein_effect_coordinates')
 
     @property
     def gene_id(self) -> str:
@@ -142,10 +131,11 @@ def protein_affected(self) -> typing.Sequence[ProteinMetadata]:
         return self._affected_protein
 
     @property
-    def protein_effect_location(self) -> typing.Tuple[int, int]:
+    def protein_effect_location(self) -> typing.Optional[Region]:
         """
         Returns:
-            Tuple(integer, integer): The start and end position on the protein sequence that the variant effects. (e.g. (1234, 1235))
+            Region: a :class:`genophenocorr.model.genome.Region` with start and end position on the protein sequence
+            that the variant affects.
         """
         return self._protein_effect_location
 
@@ -381,9 +371,10 @@ def create_variant_from_scratch(variant_coordinates: VariantCoordinates,
                                     protein_effect_start: int,
                                     protein_effect_end: int,
                                     genotypes: Genotypes):
-        transcript = TranscriptAnnotation(gene_name, trans_id, hgvsc_id, is_preferred, consequences, exons_effected, protein,
-                                          protein_effect_start, protein_effect_end)
-        return Variant(variant_coordinates, [transcript], genotypes)
+        protein_effect = Region(protein_effect_start, protein_effect_end)
+        transcript = TranscriptAnnotation(gene_name, trans_id, hgvsc_id, is_preferred, consequences, exons_effected,
+                                          protein, protein_effect)
+        return Variant(variant_coordinates, (transcript,), genotypes)
 
     def __init__(self, var_coordinates: VariantCoordinates,
                  tx_annotations: typing.Iterable[TranscriptAnnotation],