Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Error messages #96

Merged
merged 9 commits into from
Dec 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4,171 changes: 1,883 additions & 2,288 deletions notebooks/KBG/KBG_Martinez_PMID_36446582_RunGenoPhenoCorr.ipynb

Large diffs are not rendered by default.

645 changes: 317 additions & 328 deletions notebooks/RPGRIP1/RPGRIP1_Beryoskin_PMID_34722527_RunGenoPhenoCorr.ipynb

Large diffs are not rendered by default.

1,365 changes: 1,319 additions & 46 deletions notebooks/STXBP1/STXBP1.ipynb

Large diffs are not rendered by default.

256 changes: 135 additions & 121 deletions notebooks/SUOX/SUOX_Li_PMID36303223_RunGenoPhenoCorr.ipynb

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions src/genophenocorr/analysis/_analyzers.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ def compare_by_variant_effect(self, effect: VariantEffect, tx_id: str):
self._transcript = previous
return result

def compare_by_variant_type(self, var_type1:VariantEffect, var_type2:VariantEffect = None):
def compare_by_variant_type(self, var_type1:VariantEffect, var_type2:typing.Optional[VariantEffect] = None):
"""Runs Fisher Exact analysis, finds any correlation between given variant effects across phenotypes.

Args:
Expand Down Expand Up @@ -309,7 +309,7 @@ def compare_by_variant(self, variant1:str, variant2:str = None):
final_df.insert(5, ('', "Corrected p-values"), corrected_pvals, True)
return final_df.sort_values([('', 'Corrected p-values'), ('', 'p-value')])

def compare_by_exon(self, exon1:int, exon2:int = None):
def compare_by_exon(self, exon1:int, exon2:typing.Optional[int] = None):
"""Runs Fisher Exact analysis, finds any correlation between given exons across phenotypes.

Args:
Expand Down Expand Up @@ -341,7 +341,7 @@ def compare_by_exon(self, exon1:int, exon2:int = None):
final_df.insert(5, ('', "Corrected p-values"), corrected_pvals, True)
return final_df.sort_values([('', 'Corrected p-values'), ('', 'p-value')])

def compare_by_protein_feature_type(self, feature1:FeatureType, feature2:FeatureType = None):
def compare_by_protein_feature_type(self, feature1:FeatureType, feature2:typing.Optional[FeatureType] = None):
"""Runs Fisher Exact analysis, finds any correlation between given feature type across phenotypes.

Args:
Expand Down Expand Up @@ -374,7 +374,7 @@ def compare_by_protein_feature_type(self, feature1:FeatureType, feature2:Feature
return final_df.sort_values([('', 'Corrected p-values'), ('', 'p-value')])


def compare_by_protein_feature(self, feature1:str, feature2:str = None):
def compare_by_protein_feature(self, feature1:str, feature2:typing.Optional[str] = None):
"""Runs Fisher Exact analysis, finds any correlation between given feature and phenotypes.

Args:
Expand Down
9 changes: 7 additions & 2 deletions src/genophenocorr/analysis/_commie.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from .predicate.phenotype import PropagatingPhenotypeBooleanPredicateFactory, PhenotypePredicateFactory

from ._api import CohortAnalysis, GenotypePhenotypeAnalysisResult
from ._stats import run_fisher_exact
from ._stats import run_fisher_exact, run_recessive_fisher_exact


def _filter_rare_phenotypes_using_hierarchy(patients: typing.Collection[Patient],
Expand Down Expand Up @@ -193,7 +193,12 @@ def _run_gp_analysis(self, patients: typing.Iterable[Patient],
for pf in phenotypic_features:
counts = all_counts.loc[pf]
# TODO - this is where we must fail unless we have the contingency table of the right size!
pvals[pf] = run_fisher_exact(counts)
if counts.shape == (2, 2):
pvals[pf] = run_fisher_exact(counts)
elif counts.shape == (3, 2):
pvals[pf] = run_recessive_fisher_exact(counts)
else:
raise ValueError(f"Invalid number of categories. A {counts.shape} table was created. Only (2, 2) and (3, 2) are valid sizes.")

# 3) Multiple correction
if self._correction is not None:
Expand Down
37 changes: 30 additions & 7 deletions src/genophenocorr/analysis/_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,19 @@
from ._commie import CommunistCohortAnalysis


P_VAL_OPTIONS = ['bonferroni', 'b',
'sidak', 's',
'holm-sidak', 'hs',
'holm', 'h',
'simes-hochberg', 'sh',
'hommel', 'ho',
'fdr_bh',
'fdr_by',
'fdr_tsbh',
'fdr_tsbky',
'fdr_gbs',
None]

class CohortAnalysisConfiguration:
"""
`CohortAnalysisConfiguration` is a value class for storing :class:`genophenocorr.analysis.CohortAnalysis`
Expand Down Expand Up @@ -99,25 +112,35 @@ def missing_implies_excluded(self, missing_implies_excluded: bool):
if isinstance(missing_implies_excluded, bool):
self._missing_implies_excluded = missing_implies_excluded
else:
self._logger.warning('Ignoring invalid `missing_implies_excluded` value %s',
missing_implies_excluded)
self._logger.warning('Ignoring invalid `missing_implies_excluded` value %s. Using %s.',
missing_implies_excluded, self._missing_implies_excluded)

return self

def pval_correction(self, pval_correction: typing.Optional[str]):
"""
Set `pval_correction` option.
"""
# TODO - check admissible values
self._pval_correction = pval_correction
if pval_correction in P_VAL_OPTIONS:
lnrekerle marked this conversation as resolved.
Show resolved Hide resolved
self._pval_correction = pval_correction
else:
self._logger.warning('Ignoring invalid `pval_correction` value %s. Using %s correction.', pval_correction, self._pval_correction)
return self

def min_perc_patients_w_hpo(self, min_perc_patients_w_hpo: float):
"""
Set `min_perc_patients_w_hpo` option.
"""
# TODO - check float in range [0,1)
self._min_perc_patients_w_hpo = min_perc_patients_w_hpo
if not isinstance(min_perc_patients_w_hpo, float):
try:
min_perc_patients_w_hpo = float(min_perc_patients_w_hpo)
except ValueError:
self._logger.warning("min_perc_patients_w_hpo must be a number, but was %s. Using %f", min_perc_patients_w_hpo, self._min_perc_patients_w_hpo)
return self
if min_perc_patients_w_hpo > 1 or min_perc_patients_w_hpo <= 0:
lnrekerle marked this conversation as resolved.
Show resolved Hide resolved
self._logger.warning("min_perc_patients_w_hpo must be greater than 0 and at most 1, but was %f. Using %f", min_perc_patients_w_hpo, self._min_perc_patients_w_hpo)
else:
self._min_perc_patients_w_hpo = min_perc_patients_w_hpo
return self

def include_sv(self, include_sv: bool):
Expand All @@ -127,7 +150,7 @@ def include_sv(self, include_sv: bool):
if isinstance(include_sv, bool):
self._include_sv = include_sv
else:
self._logger.warning('Ignoring invalid `include_sv` value %s', include_sv)
self._logger.warning('Ignoring invalid `include_sv` value %s. Using %s', include_sv, self._include_sv)
return self

def build(self) -> CohortAnalysisConfiguration:
Expand Down
5 changes: 4 additions & 1 deletion src/genophenocorr/model/_cohort.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ def __hash__(self) -> int:
class Cohort(typing.Sized):

@staticmethod
def from_patients(members: typing.Sequence[Patient], include_patients_with_no_HPO: bool = False):
def from_patients(members: typing.Sequence[Patient], include_patients_with_no_HPO: bool = False, include_patients_with_no_variants: bool = False):
"""
Create a cohort from a sequence of patients.

Expand All @@ -116,6 +116,9 @@ def from_patients(members: typing.Sequence[Patient], include_patients_with_no_HP
if len(patient.phenotypes) == 0 and not include_patients_with_no_HPO:
excluded_members.append(patient)
continue
if len(patient.variants) == 0 and not include_patients_with_no_variants:
excluded_members.append(patient)
continue
cohort_phenotypes.update(patient.phenotypes)
cohort_variants.update(patient.variants)
var_counts.update([var.variant_coordinates.variant_key for var in patient.variants])
Expand Down
3 changes: 3 additions & 0 deletions src/genophenocorr/model/_protein.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,9 @@ def info(self) -> FeatureInfo:
def feature_type(self) -> FeatureType:
pass

def to_string(self) -> str:
return f"{self.feature_type.name}-{self.info.name}-{self.info.region}"


class SimpleProteinFeature(ProteinFeature):
"""A class that represents a protein feature
Expand Down
7 changes: 5 additions & 2 deletions src/genophenocorr/preprocessing/_phenopacket.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,15 +181,18 @@ def _add_variants(self, sample_id: str, pp: Phenopacket) -> typing.Sequence[Vari
for genomic_interp in interp.diagnosis.genomic_interpretations:
vc, gt = self._coord_finder.find_coordinates(genomic_interp)
if "N" in vc.alt:
self._logger.warning(f'Patient {pp.id} has unknown alternative variant {vc.alt} and will not be included.')
self._logger.warning('Patient %s has unknown alternative variant %s, this variant will not be included.', pp.id, vc.variant_key)
continue
tx_annotations = self._func_ann.annotate(vc)
if tx_annotations is None:
self._logger.warning("Patient %s has an error with variant %s, this variant will not be included.", pp.id, vc.variant_key)
continue
genotype = Genotypes.single(sample_id, gt)
variant = Variant(vc, tx_annotations, genotype)
variants_list.append(variant)

if len(variants_list) == 0:
self._logger.warning(f'Expected at least one variant per patient, but received none for patient {pp.id}')
self._logger.warning('Expected at least one variant per patient, but received none for patient %s', pp.id)
return variants_list

def _add_phenotypes(self, pp: Phenopacket) -> typing.Sequence[Phenotype]:
Expand Down
5 changes: 3 additions & 2 deletions src/genophenocorr/preprocessing/_phenotype.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,9 @@ def create_phenotype(self, term_ids: typing.Iterable[typing.Tuple[str, bool]]) -
for term_id, observed in term_ids:
term = self._hpo.get_term(term_id)
if term is None:
raise ValueError(f'Term ID {term_id} is not present in HPO v{self._hpo.version}')
terms.append((term, observed))
self._logger.warning("Term %s cannot be found in HPO version %s. It will be ignored.", term_id, self._hpo.version)
lnrekerle marked this conversation as resolved.
Show resolved Hide resolved
else:
terms.append((term, observed))
validation_results = self._validator.validate_all([term[0] for term in terms])
if validation_results.is_ok:
return tuple(Phenotype.from_term(term, observed) for term, observed in terms)
Expand Down
30 changes: 14 additions & 16 deletions src/genophenocorr/preprocessing/_vep.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from genophenocorr.model.genome import Region
from ._api import FunctionalAnnotator, ProteinMetadataService


def verify_start_end_coordinates(vc: VariantCoordinates):
"""
Converts the 0-based VariantCoordinates to ones that will be interpreted
Expand Down Expand Up @@ -58,13 +57,15 @@ class VepFunctionalAnnotator(FunctionalAnnotator):

def __init__(self, protein_annotator: ProteinMetadataService,
include_computational_txs: bool = False):
self._logging = logging.getLogger(__name__)
self._logger = logging.getLogger(__name__)
self._protein_annotator = protein_annotator
self._url = 'https://rest.ensembl.org/vep/human/region/%s?LoF=1&canonical=1' \
'&domains=1&hgvs=1' \
'&mutfunc=1&numbers=1&protein=1&refseq=1&mane=1' \
'&transcript_version=1&variant_class=1'
self._include_computational_txs = include_computational_txs
self._slice_effects = [VariantEffect.SPLICE_ACCEPTOR_VARIANT, VariantEffect.SPLICE_DONOR_VARIANT, VariantEffect.SPLICE_DONOR_5TH_BASE_VARIANT, VariantEffect.SPLICE_POLYPYRIMIDINE_TRACT_VARIANT]


def annotate(self, variant_coordinates: VariantCoordinates) -> typing.Sequence[TranscriptAnnotation]:
"""Perform functional annotation using Variant Effect Predictor (VEP) REST API.
Expand All @@ -78,9 +79,8 @@ def annotate(self, variant_coordinates: VariantCoordinates) -> typing.Sequence[T
response = self._query_vep(variant_coordinates)
annotations = []
if 'transcript_consequences' not in response:
raise ValueError(
f'The VEP response lacked the required `transcript_consequences` field')

self._logger.error('The VEP response lacked the required `transcript_consequences` field. %s', response)
return None
for trans in response['transcript_consequences']:
annotation = self._process_item(trans)
if annotation is not None:
Expand All @@ -97,7 +97,7 @@ def _parse_variant_effect(self, effect: str) -> typing.Optional[VariantEffect]:
try:
var_effect = VariantEffect[effect]
except KeyError:
self._logging.warning("VariantEffect %s was not found in our record of possible effects. Please report this issue to the genophenocorr GitHub." , effect)
self._logger.warning("VariantEffect %s was not found in our record of possible effects. Please report this issue to the genophenocorr GitHub.", effect)
return None
return var_effect

Expand All @@ -118,7 +118,6 @@ def _process_item(self, item: typing.Dict) -> typing.Optional[TranscriptAnnotati
if var_effect is not None:
var_effects.append(var_effect)
gene_name = item.get('gene_symbol')

exons_effected = item.get('exon')
if exons_effected is not None:
exons_effected = exons_effected.split('/')[0].split('-')
Expand All @@ -132,11 +131,8 @@ def _process_item(self, item: typing.Dict) -> typing.Optional[TranscriptAnnotati
protein_effect_start = item.get('protein_start')
protein_effect_end = item.get('protein_end')
if protein_effect_start is None or protein_effect_end is None:
# Does this ever happen? Let's log a warning for now and address the absence of a coordinate later,
# if we see a lot of these warnings popping out.
# Note that Lauren's version of the code had a special branch for missing start, where she set the variable
# to `1` (1-based coordinate).
self._logging.warning('Missing start/end coordinate for %s on protein %s', hgvsc_id, protein_id)
if not any(ve in var_effects for ve in self._slice_effects):
self._logger.warning('Missing start/end coordinate for %s on protein %s. Protein effect will not be included.', hgvsc_id, protein_id)
protein_effect = None
else:
# The coordinates are in 1-based system and we need 0-based.
Expand All @@ -158,15 +154,17 @@ def _query_vep(self, variant_coordinates: VariantCoordinates) -> dict:
api_url = self._url % (verify_start_end_coordinates(variant_coordinates))
r = requests.get(api_url, headers={'Content-Type': 'application/json'})
if not r.ok:
self._logging.error(f"Expected a result but got an Error for variant: {variant_coordinates.variant_key}")
r.raise_for_status()
self._logger.error("Expected a result but got an Error for variant: %s", variant_coordinates.variant_key)
self._logger.error(r.text)
return None
results = r.json()
if not isinstance(results, list):
self._logging.error(results.get('error'))
self._logger.error(results.get('error'))
raise ConnectionError(
f"Expected a result but got an Error. See log for details.")
if len(results) > 1:
self._logging.error([result.id for result in results])
self._logger.error("Expected only one variant per request but received %s different variants.", len(results))
self._logger.error([result.id for result in results])
raise ValueError(
f"Expected only one variant per request but received {len(results)} "
f"different variants.")
Expand Down