Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Error messages #96

Merged
merged 9 commits into from
Dec 13, 2023
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4,171 changes: 1,883 additions & 2,288 deletions notebooks/KBG/KBG_Martinez_PMID_36446582_RunGenoPhenoCorr.ipynb

Large diffs are not rendered by default.

645 changes: 317 additions & 328 deletions notebooks/RPGRIP1/RPGRIP1_Beryoskin_PMID_34722527_RunGenoPhenoCorr.ipynb

Large diffs are not rendered by default.

1,365 changes: 1,319 additions & 46 deletions notebooks/STXBP1/STXBP1.ipynb

Large diffs are not rendered by default.

256 changes: 135 additions & 121 deletions notebooks/SUOX/SUOX_Li_PMID36303223_RunGenoPhenoCorr.ipynb

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions src/genophenocorr/analysis/_analyzers.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ def compare_by_variant_effect(self, effect: VariantEffect, tx_id: str):
self._transcript = previous
return result

def compare_by_variant_type(self, var_type1:VariantEffect, var_type2:VariantEffect = None):
def compare_by_variant_type(self, var_type1:VariantEffect, var_type2:typing.Optional[VariantEffect] = None):
"""Runs Fisher Exact analysis, finds any correlation between given variant effects across phenotypes.

Args:
Expand Down Expand Up @@ -309,7 +309,7 @@ def compare_by_variant(self, variant1:str, variant2:str = None):
final_df.insert(5, ('', "Corrected p-values"), corrected_pvals, True)
return final_df.sort_values([('', 'Corrected p-values'), ('', 'p-value')])

def compare_by_exon(self, exon1:int, exon2:int = None):
def compare_by_exon(self, exon1:int, exon2:typing.Optional[int] = None):
"""Runs Fisher Exact analysis, finds any correlation between given exons across phenotypes.

Args:
Expand Down Expand Up @@ -341,7 +341,7 @@ def compare_by_exon(self, exon1:int, exon2:int = None):
final_df.insert(5, ('', "Corrected p-values"), corrected_pvals, True)
return final_df.sort_values([('', 'Corrected p-values'), ('', 'p-value')])

def compare_by_protein_feature_type(self, feature1:FeatureType, feature2:FeatureType = None):
def compare_by_protein_feature_type(self, feature1:FeatureType, feature2:typing.Optional[FeatureType] = None):
"""Runs Fisher Exact analysis, finds any correlation between given feature type across phenotypes.

Args:
Expand Down Expand Up @@ -374,7 +374,7 @@ def compare_by_protein_feature_type(self, feature1:FeatureType, feature2:Feature
return final_df.sort_values([('', 'Corrected p-values'), ('', 'p-value')])


def compare_by_protein_feature(self, feature1:str, feature2:str = None):
def compare_by_protein_feature(self, feature1:str, feature2:typing.Optional[str] = None):
"""Runs Fisher Exact analysis, finds any correlation between given feature and phenotypes.

Args:
Expand Down
18 changes: 15 additions & 3 deletions src/genophenocorr/analysis/_commie.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from .predicate.phenotype import PropagatingPhenotypeBooleanPredicateFactory, PhenotypePredicateFactory

from ._api import CohortAnalysis, GenotypePhenotypeAnalysisResult
from ._stats import run_fisher_exact
from ._stats import run_fisher_exact, run_recessive_fisher_exact


def _filter_rare_phenotypes_using_hierarchy(patients: typing.Collection[Patient],
Expand Down Expand Up @@ -106,15 +106,22 @@ def __init__(self, cohort: Cohort,
missing_implies_excluded: bool = False,
include_sv: bool = False,
p_val_correction: typing.Optional[str] = None,
min_perc_patients_w_hpo: typing.Union[float, int] = .1):
min_perc_patients_w_hpo: typing.Union[float, int] = .1,
recessive: bool = False):
lnrekerle marked this conversation as resolved.
Show resolved Hide resolved
if not isinstance(cohort, Cohort):
raise ValueError(f"cohort must be type Cohort but was type {type(cohort)}")

self._logger = logging.getLogger(__name__)
handler = logging.FileHandler(f"{__name__}.log", mode='w')
lnrekerle marked this conversation as resolved.
Show resolved Hide resolved
formatter = logging.Formatter("%(name)s %(asctime)s %(levelname)s %(message)s")
handler.setFormatter(formatter)
self._logger.addHandler(handler)
self._hpo = hpotk.util.validate_instance(hpo, hpotk.MinimalOntology, 'hpo')
self._phenotype_predicate_factory = PropagatingPhenotypeBooleanPredicateFactory(self._hpo,
missing_implies_excluded)
self._correction = p_val_correction
#TODO: For recessive tests, we need new predicates that return 3 categories
self._recessive = recessive
self._patient_list = list(cohort.all_patients) \
if include_sv \
else [pat for pat in cohort.all_patients if not all(var.variant_coordinates.is_structural() for var in pat.variants)]
Expand Down Expand Up @@ -193,7 +200,12 @@ def _run_gp_analysis(self, patients: typing.Iterable[Patient],
for pf in phenotypic_features:
counts = all_counts.loc[pf]
# TODO - this is where we must fail unless we have the contingency table of the right size!
pvals[pf] = run_fisher_exact(counts)
if counts.shape == (2, 2):
pvals[pf] = run_fisher_exact(counts)
elif counts.shape == (3, 2):
pvals[pf] = run_recessive_fisher_exact(counts)
else:
raise ValueError(f"Invalid number of categories. A {counts.shape} table was created. Only (2, 2) and (3, 2) are valid sizes.")

# 3) Multiple correction
if self._correction is not None:
Expand Down
62 changes: 54 additions & 8 deletions src/genophenocorr/analysis/_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,19 @@
from ._commie import CommunistCohortAnalysis


P_VAL_OPTIONS = ['bonferroni', 'b',
'sidak', 's',
'holm-sidak', 'hs',
'holm', 'h',
'simes-hochberg', 'sh',
'hommel', 'ho',
'fdr_bh',
'fdr_by',
'fdr_tsbh',
'fdr_tsbky',
'fdr_gbs',
None]

class CohortAnalysisConfiguration:
"""
`CohortAnalysisConfiguration` is a value class for storing :class:`genophenocorr.analysis.CohortAnalysis`
Expand All @@ -31,11 +44,13 @@ class CohortAnalysisConfiguration:
def __init__(self, missing_implies_excluded: bool,
pval_correction: typing.Optional[str],
min_perc_patients_w_hpo: float,
include_sv: bool):
include_sv: bool,
recessive: bool):
lnrekerle marked this conversation as resolved.
Show resolved Hide resolved
self._missing_implies_excluded = missing_implies_excluded
self._pval_correction = pval_correction
self._min_perc_patients_w_hpo = min_perc_patients_w_hpo
self._include_sv = include_sv
self._recessive = recessive

@staticmethod
def builder():
Expand Down Expand Up @@ -74,6 +89,15 @@ def include_sv(self) -> bool:
(i.e. the variants that use symbolic VCF notation).
"""
return self._include_sv

@property
def recessive(self) -> bool:
"""
`True` if we want to test the correlation between heterozygous variants,
homozygous variants, and no variants (i.e. comparing frameshift variants on
both alleles, frameshift variant on one allele, and no frameshift variants)
"""
return self._recessive


class CohortAnalysisConfigurationBuilder:
Expand All @@ -87,10 +111,15 @@ class CohortAnalysisConfigurationBuilder:

def __init__(self):
self._logger = logging.getLogger(__name__)
handler = logging.FileHandler(f"{__name__}.log", mode='w')
lnrekerle marked this conversation as resolved.
Show resolved Hide resolved
formatter = logging.Formatter("%(name)s %(asctime)s %(levelname)s %(message)s")
handler.setFormatter(formatter)
self._logger.addHandler(handler)
self._missing_implies_excluded = False
self._pval_correction = 'bonferroni'
self._min_perc_patients_w_hpo = .1
self._include_sv = False
self._recessive = False

def missing_implies_excluded(self, missing_implies_excluded: bool):
"""
Expand All @@ -108,16 +137,20 @@ def pval_correction(self, pval_correction: typing.Optional[str]):
"""
Set `pval_correction` option.
"""
# TODO - check admissible values
self._pval_correction = pval_correction
if pval_correction in P_VAL_OPTIONS:
lnrekerle marked this conversation as resolved.
Show resolved Hide resolved
self._pval_correction = pval_correction
else:
self._logger.warning('Ignoring invalid `pval_correction` value %s. Not doing p-value correction.', pval_correction)
return self

def min_perc_patients_w_hpo(self, min_perc_patients_w_hpo: float):
"""
Set `min_perc_patients_w_hpo` option.
"""
# TODO - check float in range [0,1)
self._min_perc_patients_w_hpo = min_perc_patients_w_hpo
if min_perc_patients_w_hpo > 1 or min_perc_patients_w_hpo <= 0:
lnrekerle marked this conversation as resolved.
Show resolved Hide resolved
self._logger.warning("min_perc_patients_w_hpo must be greater than 0 and at most 1. Using default of 0.1")
lnrekerle marked this conversation as resolved.
Show resolved Hide resolved
else:
self._min_perc_patients_w_hpo = min_perc_patients_w_hpo
return self

def include_sv(self, include_sv: bool):
Expand All @@ -127,17 +160,29 @@ def include_sv(self, include_sv: bool):
if isinstance(include_sv, bool):
self._include_sv = include_sv
else:
self._logger.warning('Ignoring invalid `include_sv` value %s', include_sv)
self._logger.warning('Ignoring invalid `include_sv` value %s. Defaulting to not include large structural variants.', include_sv)
return self

def recessive(self, recessive: bool):
"""
Set `recessive` option.
"""
if isinstance(recessive, bool):
self._recessive = recessive
else:
self._logger.warning('Ignoring invalid `recessive` value %s. Defaulting to a dominant test.', recessive)
return self


def build(self) -> CohortAnalysisConfiguration:
"""
Build the configuration.
"""
return CohortAnalysisConfiguration(self._missing_implies_excluded,
self._pval_correction,
self._min_perc_patients_w_hpo,
self._include_sv)
self._include_sv,
self._recessive)


def configure_cohort_analysis(cohort: Cohort,
Expand All @@ -158,4 +203,5 @@ def configure_cohort_analysis(cohort: Cohort,
missing_implies_excluded=config.missing_implies_excluded,
include_sv=config.include_sv,
p_val_correction=config.pval_correction,
min_perc_patients_w_hpo=config.min_perc_patients_w_hpo)
min_perc_patients_w_hpo=config.min_perc_patients_w_hpo,
recessive=config.recessive)
5 changes: 4 additions & 1 deletion src/genophenocorr/model/_cohort.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ def __hash__(self) -> int:
class Cohort(typing.Sized):

@staticmethod
def from_patients(members: typing.Sequence[Patient], include_patients_with_no_HPO: bool = False):
def from_patients(members: typing.Sequence[Patient], include_patients_with_no_HPO: bool = False, include_patients_with_no_variants: bool = False):
"""
Create a cohort from a sequence of patients.

Expand All @@ -116,6 +116,9 @@ def from_patients(members: typing.Sequence[Patient], include_patients_with_no_HP
if len(patient.phenotypes) == 0 and not include_patients_with_no_HPO:
excluded_members.append(patient)
continue
if len(patient.variants) == 0 and not include_patients_with_no_variants:
excluded_members.append(patient)
continue
cohort_phenotypes.update(patient.phenotypes)
cohort_variants.update(patient.variants)
var_counts.update([var.variant_coordinates.variant_key for var in patient.variants])
Expand Down
3 changes: 3 additions & 0 deletions src/genophenocorr/model/_protein.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,9 @@ def info(self) -> FeatureInfo:
def feature_type(self) -> FeatureType:
pass

def to_string(self) -> str:
return f"{self.feature_type.name}-{self.info.name}-{self.info.region}"


class SimpleProteinFeature(ProteinFeature):
"""A class that represents a protein feature
Expand Down
15 changes: 13 additions & 2 deletions src/genophenocorr/preprocessing/_phenopacket.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ class PhenopacketVariantCoordinateFinder(VariantCoordinateFinder[GenomicInterpre
def __init__(self, build: GenomeBuild,
hgvs_coordinate_finder: VariantCoordinateFinder[str]):
self._logger = logging.getLogger(__name__)
handler = logging.FileHandler(f"{__name__}.log", mode='w')
lnrekerle marked this conversation as resolved.
Show resolved Hide resolved
formatter = logging.Formatter("%(name)s %(asctime)s %(levelname)s %(message)s")
handler.setFormatter(formatter)
self._logger.addHandler(handler)
self._build = hpotk.util.validate_instance(build, GenomeBuild, 'build')
self._hgvs_finder = hpotk.util.validate_instance(hgvs_coordinate_finder, VariantCoordinateFinder,
'hgvs_coordinate_finder')
Expand Down Expand Up @@ -140,6 +144,10 @@ def __init__(self, build: GenomeBuild,
var_func_ann: FunctionalAnnotator,
hgvs_coordinate_finder: VariantCoordinateFinder[str]):
self._logger = logging.getLogger(__name__)
handler = logging.FileHandler(f"{__name__}.log", mode='w')
lnrekerle marked this conversation as resolved.
Show resolved Hide resolved
formatter = logging.Formatter("%(name)s %(asctime)s %(levelname)s %(message)s")
handler.setFormatter(formatter)
self._logger.addHandler(handler)
# Violates DI, but it is specific to this class, so I'll leave it "as is".
self._coord_finder = PhenopacketVariantCoordinateFinder(build, hgvs_coordinate_finder)
self._phenotype_creator = hpotk.util.validate_instance(phenotype_creator, PhenotypeCreator, 'phenotype_creator')
Expand Down Expand Up @@ -181,15 +189,18 @@ def _add_variants(self, sample_id: str, pp: Phenopacket) -> typing.Sequence[Vari
for genomic_interp in interp.diagnosis.genomic_interpretations:
vc, gt = self._coord_finder.find_coordinates(genomic_interp)
if "N" in vc.alt:
self._logger.warning(f'Patient {pp.id} has unknown alternative variant {vc.alt} and will not be included.')
self._logger.warning('Patient %s has unknown alternative variant %s, this variant will not be included.', pp.id, vc.variant_key)
continue
tx_annotations = self._func_ann.annotate(vc)
if tx_annotations is None:
self._logger.error("Patient %s has an error with variant %s, this variant will not be included.", pp.id, vc.variant_key)
lnrekerle marked this conversation as resolved.
Show resolved Hide resolved
continue
genotype = Genotypes.single(sample_id, gt)
variant = Variant(vc, tx_annotations, genotype)
variants_list.append(variant)

if len(variants_list) == 0:
self._logger.warning(f'Expected at least one variant per patient, but received none for patient {pp.id}')
self._logger.warning('Expected at least one variant per patient, but received none for patient %s', pp.id)
return variants_list

def _add_phenotypes(self, pp: Phenopacket) -> typing.Sequence[Phenotype]:
Expand Down
9 changes: 7 additions & 2 deletions src/genophenocorr/preprocessing/_phenotype.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@ def __init__(self, hpo: hpotk.MinimalOntology,
validator (hpotk.validate.ValidationRunner): A ValidationRunner object
"""
self._logger = logging.getLogger(__name__)
handler = logging.FileHandler(f"{__name__}.log", mode='w')
lnrekerle marked this conversation as resolved.
Show resolved Hide resolved
formatter = logging.Formatter("%(name)s %(asctime)s %(levelname)s %(message)s")
handler.setFormatter(formatter)
self._logger.addHandler(handler)
self._hpo = hpotk.util.validate_instance(hpo, hpotk.MinimalOntology, 'hpo')
self._validator = hpotk.util.validate_instance(validator, hpotk.validate.ValidationRunner, 'validator')

Expand All @@ -48,8 +52,9 @@ def create_phenotype(self, term_ids: typing.Iterable[typing.Tuple[str, bool]]) -
for term_id, observed in term_ids:
term = self._hpo.get_term(term_id)
if term is None:
raise ValueError(f'Term ID {term_id} is not present in HPO v{self._hpo.version}')
terms.append((term, observed))
self._logger.warning("Term %s cannot be found in HPO version %s. It will be ignored.", term_id, self._hpo.version)
lnrekerle marked this conversation as resolved.
Show resolved Hide resolved
else:
terms.append((term, observed))
validation_results = self._validator.validate_all([term[0] for term in terms])
if validation_results.is_ok:
return tuple(Phenotype.from_term(term, observed) for term, observed in terms)
Expand Down
4 changes: 4 additions & 0 deletions src/genophenocorr/preprocessing/_uniprot.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@ def __init__(self):
"""Constructs all necessary attributes for a UniprotProteinMetadataService object
"""
self._logger = logging.getLogger(__name__)
handler = logging.FileHandler(f"{__name__}.log", mode='w')
lnrekerle marked this conversation as resolved.
Show resolved Hide resolved
formatter = logging.Formatter("%(name)s %(asctime)s %(levelname)s %(message)s")
handler.setFormatter(formatter)
self._logger.addHandler(handler)
self._url = 'https://rest.uniprot.org/uniprotkb/search?query=(%s)AND(reviewed:true)&fields=accession,id,' \
'gene_names,gene_primary,protein_name,ft_domain,ft_motif,ft_region,ft_repeat,xref_refseq'

Expand Down
Loading