monarch-initiative · lnrekerle · Dec 13, 2023 · Oct 27, 2023 · Nov 10, 2023 · Nov 30, 2023
diff --git a/notebooks/KBG/KBG_Martinez_PMID_36446582_RunGenoPhenoCorr.ipynb b/notebooks/KBG/KBG_Martinez_PMID_36446582_RunGenoPhenoCorr.ipynb
diff --git a/notebooks/RPGRIP1/RPGRIP1_Beryoskin_PMID_34722527_RunGenoPhenoCorr.ipynb b/notebooks/RPGRIP1/RPGRIP1_Beryoskin_PMID_34722527_RunGenoPhenoCorr.ipynb
diff --git a/notebooks/STXBP1/STXBP1.ipynb b/notebooks/STXBP1/STXBP1.ipynb
diff --git a/notebooks/SUOX/SUOX_Li_PMID36303223_RunGenoPhenoCorr.ipynb b/notebooks/SUOX/SUOX_Li_PMID36303223_RunGenoPhenoCorr.ipynb
diff --git a/src/genophenocorr/analysis/_analyzers.py b/src/genophenocorr/analysis/_analyzers.py
@@ -242,7 +242,7 @@ def compare_by_variant_effect(self, effect: VariantEffect, tx_id: str):
         self._transcript = previous
         return result
 
-    def compare_by_variant_type(self, var_type1:VariantEffect, var_type2:VariantEffect = None):
+    def compare_by_variant_type(self, var_type1:VariantEffect, var_type2:typing.Optional[VariantEffect] = None):
         """Runs Fisher Exact analysis, finds any correlation between given variant effects across phenotypes.
 
         Args:
@@ -309,7 +309,7 @@ def compare_by_variant(self, variant1:str, variant2:str = None):
             final_df.insert(5, ('', "Corrected p-values"), corrected_pvals, True)
         return final_df.sort_values([('', 'Corrected p-values'), ('', 'p-value')])
 
-    def compare_by_exon(self, exon1:int, exon2:int = None):
+    def compare_by_exon(self, exon1:int, exon2:typing.Optional[int] = None):
         """Runs Fisher Exact analysis, finds any correlation between given exons across phenotypes.
 
         Args:
@@ -341,7 +341,7 @@ def compare_by_exon(self, exon1:int, exon2:int = None):
             final_df.insert(5, ('', "Corrected p-values"), corrected_pvals, True)
         return final_df.sort_values([('', 'Corrected p-values'), ('', 'p-value')])
 
-    def compare_by_protein_feature_type(self, feature1:FeatureType, feature2:FeatureType = None):
+    def compare_by_protein_feature_type(self, feature1:FeatureType, feature2:typing.Optional[FeatureType] = None):
         """Runs Fisher Exact analysis, finds any correlation between given feature type across phenotypes.
 
         Args:
@@ -374,7 +374,7 @@ def compare_by_protein_feature_type(self, feature1:FeatureType, feature2:Feature
         return final_df.sort_values([('', 'Corrected p-values'), ('', 'p-value')])
 
 
-    def compare_by_protein_feature(self, feature1:str, feature2:str = None):
+    def compare_by_protein_feature(self, feature1:str, feature2:typing.Optional[str] = None):
         """Runs Fisher Exact analysis, finds any correlation between given feature and phenotypes.
 
         Args:

diff --git a/src/genophenocorr/analysis/_commie.py b/src/genophenocorr/analysis/_commie.py
@@ -14,7 +14,7 @@
 from .predicate.phenotype import PropagatingPhenotypeBooleanPredicateFactory, PhenotypePredicateFactory
 
 from ._api import CohortAnalysis, GenotypePhenotypeAnalysisResult
-from ._stats import run_fisher_exact
+from ._stats import run_fisher_exact, run_recessive_fisher_exact
 
 
 def _filter_rare_phenotypes_using_hierarchy(patients: typing.Collection[Patient],
@@ -193,7 +193,12 @@ def _run_gp_analysis(self, patients: typing.Iterable[Patient],
         for pf in phenotypic_features:
             counts = all_counts.loc[pf]
             # TODO - this is where we must fail unless we have the contingency table of the right size!
-            pvals[pf] = run_fisher_exact(counts)
+            if counts.shape == (2, 2):
+                pvals[pf] = run_fisher_exact(counts)
+            elif counts.shape == (3, 2):
+                pvals[pf] = run_recessive_fisher_exact(counts)
+            else:
+                raise ValueError(f"Invalid number of categories. A {counts.shape} table was created. Only (2, 2) and (3, 2) are valid sizes.")
 
         # 3) Multiple correction
         if self._correction is not None:

diff --git a/src/genophenocorr/analysis/_config.py b/src/genophenocorr/analysis/_config.py
@@ -9,6 +9,19 @@
 from ._commie import CommunistCohortAnalysis
 
 
+P_VAL_OPTIONS = ['bonferroni', 'b', 
+                 'sidak', 's', 
+                 'holm-sidak', 'hs', 
+                 'holm', 'h', 
+                 'simes-hochberg', 'sh', 
+                 'hommel', 'ho', 
+                 'fdr_bh', 
+                 'fdr_by', 
+                 'fdr_tsbh', 
+                 'fdr_tsbky',
+                 'fdr_gbs',
+                 None]
+
 class CohortAnalysisConfiguration:
     """
     `CohortAnalysisConfiguration` is a value class for storing :class:`genophenocorr.analysis.CohortAnalysis`
@@ -99,25 +112,35 @@ def missing_implies_excluded(self, missing_implies_excluded: bool):
         if isinstance(missing_implies_excluded, bool):
             self._missing_implies_excluded = missing_implies_excluded
         else:
-            self._logger.warning('Ignoring invalid `missing_implies_excluded` value %s',
-                                 missing_implies_excluded)
+            self._logger.warning('Ignoring invalid `missing_implies_excluded` value %s. Using %s.',
+                                 missing_implies_excluded, self._missing_implies_excluded)
 
         return self
 
     def pval_correction(self, pval_correction: typing.Optional[str]):
         """
         Set `pval_correction` option.
         """
-        # TODO - check admissible values
-        self._pval_correction = pval_correction
+        if pval_correction in P_VAL_OPTIONS:
+            self._pval_correction = pval_correction
+        else:
+            self._logger.warning('Ignoring invalid `pval_correction` value %s. Using %s correction.', pval_correction, self._pval_correction)
         return self
 
     def min_perc_patients_w_hpo(self, min_perc_patients_w_hpo: float):
         """
         Set `min_perc_patients_w_hpo` option.
         """
-        # TODO - check float in range [0,1)
-        self._min_perc_patients_w_hpo = min_perc_patients_w_hpo
+        if not isinstance(min_perc_patients_w_hpo, float):
+            try:
+                min_perc_patients_w_hpo = float(min_perc_patients_w_hpo)
+            except ValueError:
+                self._logger.warning("min_perc_patients_w_hpo must be a number, but was %s. Using %f", min_perc_patients_w_hpo, self._min_perc_patients_w_hpo)
+                return self
+        if min_perc_patients_w_hpo > 1 or min_perc_patients_w_hpo <= 0:
+            self._logger.warning("min_perc_patients_w_hpo must be greater than 0 and at most 1, but was %f. Using %f", min_perc_patients_w_hpo, self._min_perc_patients_w_hpo)
+        else:
+            self._min_perc_patients_w_hpo = min_perc_patients_w_hpo
         return self
 
     def include_sv(self, include_sv: bool):
@@ -127,7 +150,7 @@ def include_sv(self, include_sv: bool):
         if isinstance(include_sv, bool):
             self._include_sv = include_sv
         else:
-            self._logger.warning('Ignoring invalid `include_sv` value %s', include_sv)
+            self._logger.warning('Ignoring invalid `include_sv` value %s. Using %s', include_sv, self._include_sv)
         return self
 
     def build(self) -> CohortAnalysisConfiguration:

diff --git a/src/genophenocorr/model/_cohort.py b/src/genophenocorr/model/_cohort.py
@@ -101,7 +101,7 @@ def __hash__(self) -> int:
 class Cohort(typing.Sized):
 
     @staticmethod
-    def from_patients(members: typing.Sequence[Patient], include_patients_with_no_HPO: bool = False):
+    def from_patients(members: typing.Sequence[Patient], include_patients_with_no_HPO: bool = False, include_patients_with_no_variants: bool = False):
         """
         Create a cohort from a sequence of patients.
 
@@ -116,6 +116,9 @@ def from_patients(members: typing.Sequence[Patient], include_patients_with_no_HP
             if len(patient.phenotypes) == 0 and not include_patients_with_no_HPO:
                 excluded_members.append(patient)
                 continue
+            if len(patient.variants) == 0 and not include_patients_with_no_variants:
+                excluded_members.append(patient)
+                continue
             cohort_phenotypes.update(patient.phenotypes)
             cohort_variants.update(patient.variants)
             var_counts.update([var.variant_coordinates.variant_key for var in patient.variants])

diff --git a/src/genophenocorr/model/_protein.py b/src/genophenocorr/model/_protein.py
@@ -102,6 +102,9 @@ def info(self) -> FeatureInfo:
     def feature_type(self) -> FeatureType:
         pass
 
+    def to_string(self) -> str:
+        return f"{self.feature_type.name}-{self.info.name}-{self.info.region}"
+
 
 class SimpleProteinFeature(ProteinFeature):
     """A class that represents a protein feature

diff --git a/src/genophenocorr/preprocessing/_phenopacket.py b/src/genophenocorr/preprocessing/_phenopacket.py
@@ -181,15 +181,18 @@ def _add_variants(self, sample_id: str, pp: Phenopacket) -> typing.Sequence[Vari
             for genomic_interp in interp.diagnosis.genomic_interpretations:
                 vc, gt = self._coord_finder.find_coordinates(genomic_interp)
                 if "N" in vc.alt:
-                    self._logger.warning(f'Patient {pp.id} has unknown alternative variant {vc.alt} and will not be included.')
+                    self._logger.warning('Patient %s has unknown alternative variant %s, this variant will not be included.', pp.id, vc.variant_key)
                     continue
                 tx_annotations = self._func_ann.annotate(vc)
+                if tx_annotations is None:
+                    self._logger.warning("Patient %s has an error with variant %s, this variant will not be included.", pp.id, vc.variant_key)
+                    continue
                 genotype = Genotypes.single(sample_id, gt)
                 variant = Variant(vc, tx_annotations, genotype)
                 variants_list.append(variant)
 
         if len(variants_list) == 0:
-            self._logger.warning(f'Expected at least one variant per patient, but received none for patient {pp.id}')
+            self._logger.warning('Expected at least one variant per patient, but received none for patient %s', pp.id)
         return variants_list
 
     def _add_phenotypes(self, pp: Phenopacket) -> typing.Sequence[Phenotype]:

diff --git a/src/genophenocorr/preprocessing/_phenotype.py b/src/genophenocorr/preprocessing/_phenotype.py
@@ -48,8 +48,9 @@ def create_phenotype(self, term_ids: typing.Iterable[typing.Tuple[str, bool]]) -
         for term_id, observed in term_ids:
             term = self._hpo.get_term(term_id)
             if term is None:
-                raise ValueError(f'Term ID {term_id} is not present in HPO v{self._hpo.version}')
-            terms.append((term, observed))
+                self._logger.warning("Term %s cannot be found in HPO version %s. It will be ignored.", term_id, self._hpo.version)
+            else:
+                terms.append((term, observed))
         validation_results = self._validator.validate_all([term[0] for term in terms])
         if validation_results.is_ok:
             return tuple(Phenotype.from_term(term, observed) for term, observed in terms)

diff --git a/src/genophenocorr/preprocessing/_vep.py b/src/genophenocorr/preprocessing/_vep.py
@@ -8,7 +8,6 @@
 from genophenocorr.model.genome import Region
 from ._api import FunctionalAnnotator, ProteinMetadataService
 
-
 def verify_start_end_coordinates(vc: VariantCoordinates):
     """
     Converts the 0-based VariantCoordinates to ones that will be interpreted
@@ -58,13 +57,15 @@ class VepFunctionalAnnotator(FunctionalAnnotator):
 
     def __init__(self, protein_annotator: ProteinMetadataService,
                  include_computational_txs: bool = False):
-        self._logging = logging.getLogger(__name__)
+        self._logger = logging.getLogger(__name__)
         self._protein_annotator = protein_annotator
         self._url = 'https://rest.ensembl.org/vep/human/region/%s?LoF=1&canonical=1' \
                     '&domains=1&hgvs=1' \
                     '&mutfunc=1&numbers=1&protein=1&refseq=1&mane=1' \
                     '&transcript_version=1&variant_class=1'
         self._include_computational_txs = include_computational_txs
+        self._slice_effects = [VariantEffect.SPLICE_ACCEPTOR_VARIANT, VariantEffect.SPLICE_DONOR_VARIANT, VariantEffect.SPLICE_DONOR_5TH_BASE_VARIANT, VariantEffect.SPLICE_POLYPYRIMIDINE_TRACT_VARIANT]
+
 
     def annotate(self, variant_coordinates: VariantCoordinates) -> typing.Sequence[TranscriptAnnotation]:
         """Perform functional annotation using Variant Effect Predictor (VEP) REST API.
@@ -78,9 +79,8 @@ def annotate(self, variant_coordinates: VariantCoordinates) -> typing.Sequence[T
         response = self._query_vep(variant_coordinates)
         annotations = []
         if 'transcript_consequences' not in response:
-            raise ValueError(
-                f'The VEP response lacked the required `transcript_consequences` field')
-
+            self._logger.error('The VEP response lacked the required `transcript_consequences` field. %s', response)
+            return None
         for trans in response['transcript_consequences']:
             annotation = self._process_item(trans)
             if annotation is not None:
@@ -97,7 +97,7 @@ def _parse_variant_effect(self, effect: str) -> typing.Optional[VariantEffect]:
         try:
             var_effect = VariantEffect[effect]
         except KeyError:
-            self._logging.warning("VariantEffect %s was not found in our record of possible effects. Please report this issue to the genophenocorr GitHub." , effect)
+            self._logger.warning("VariantEffect %s was not found in our record of possible effects. Please report this issue to the genophenocorr GitHub.", effect)
             return None
         return var_effect
 
@@ -118,7 +118,6 @@ def _process_item(self, item: typing.Dict) -> typing.Optional[TranscriptAnnotati
             if var_effect is not None:
                 var_effects.append(var_effect)
         gene_name = item.get('gene_symbol')
-
         exons_effected = item.get('exon')
         if exons_effected is not None:
             exons_effected = exons_effected.split('/')[0].split('-')
@@ -132,11 +131,8 @@ def _process_item(self, item: typing.Dict) -> typing.Optional[TranscriptAnnotati
         protein_effect_start = item.get('protein_start')
         protein_effect_end = item.get('protein_end')
         if protein_effect_start is None or protein_effect_end is None:
-            # Does this ever happen? Let's log a warning for now and address the absence of a coordinate later,
-            # if we see a lot of these warnings popping out.
-            # Note that Lauren's version of the code had a special branch for missing start, where she set the variable
-            # to `1` (1-based coordinate).
-            self._logging.warning('Missing start/end coordinate for %s on protein %s', hgvsc_id, protein_id)
+            if not any(ve in var_effects for ve in self._slice_effects):
+                self._logger.warning('Missing start/end coordinate for %s on protein %s. Protein effect will not be included.', hgvsc_id, protein_id)
             protein_effect = None
         else:
             # The coordinates are in 1-based system and we need 0-based.
@@ -158,15 +154,17 @@ def _query_vep(self, variant_coordinates: VariantCoordinates) -> dict:
         api_url = self._url % (verify_start_end_coordinates(variant_coordinates))
         r = requests.get(api_url, headers={'Content-Type': 'application/json'})
         if not r.ok:
-            self._logging.error(f"Expected a result but got an Error for variant: {variant_coordinates.variant_key}")
-            r.raise_for_status()
+            self._logger.error("Expected a result but got an Error for variant: %s", variant_coordinates.variant_key)
+            self._logger.error(r.text)
+            return None
         results = r.json()
         if not isinstance(results, list):
-            self._logging.error(results.get('error'))
+            self._logger.error(results.get('error'))
             raise ConnectionError(
                 f"Expected a result but got an Error. See log for details.")
         if len(results) > 1:
-            self._logging.error([result.id for result in results])
+            self._logger.error("Expected only one variant per request but received %s different variants.", len(results))
+            self._logger.error([result.id for result in results])
             raise ValueError(
                 f"Expected only one variant per request but received {len(results)} "
                 f"different variants.")