diff --git a/.gitignore b/.gitignore index 05be3cf74..de067b776 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +# Cache with transcript/protein pickle files +.genophenocorr_cache + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/docs/conf.py b/docs/conf.py index 68648698e..6e87c3b05 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +import doctest import os import sys @@ -175,13 +176,17 @@ doctest_path = [genophenocorr_src] doctest_test_doctest_blocks = "" -# Nothing special here doctest_global_setup = """ # For printing data frames "as is". import pandas as pd pd.set_option('expand_frame_repr', False) """ +doctest_default_flags = (doctest.REPORT_ONLY_FIRST_FAILURE + | doctest.ELLIPSIS + | doctest.IGNORE_EXCEPTION_DETAIL + | doctest.DONT_ACCEPT_TRUE_FOR_1) + # -- Intersphinx setup -------------------------------------------------------- intersphinx_mapping = { "python": ("https://docs.python.org/3/", None), diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 9f42dcc14..13c9f3980 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -46,12 +46,12 @@ We can then view the data using the list commands. >>> sorted(cohort.list_all_phenotypes()) [('HP:0001166', 14), ('HP:0001250', 20), ('HP:0001257', 17)] >>> sorted(cohort.list_all_variants()) - [('HetVar1', 13), ('HetVar2', 11), ('HomVar1', 3), ('HomVar2', 2)] + [('1_281_281_A_G', 16), ('1_361_363_TTC_T', 13)] >>> sorted(cohort.list_all_proteins()) [('NP_09876.5', 26)] >>> tx_dict = cohort.list_data_by_tx('NM_1234.5') >>> sorted(tx_dict['NM_1234.5'].items()) - [('frameshift_variant', 2), ('missense_variant', 2)] + [('frameshift_variant', 1), ('missense_variant', 1)] Using the counts, we can choose and run what analyses we want. For instance, we can partition the patients into two groups based on presence/absence of a *frameshift* variant: diff --git a/docs/user-guide/input-data.rst b/docs/user-guide/input-data.rst index 965e8b267..c5f2afabb 100644 --- a/docs/user-guide/input-data.rst +++ b/docs/user-guide/input-data.rst @@ -58,7 +58,8 @@ For the purpose of this example, we will use a folder `simple_cohort` with 5 exa .. doctest:: input-data - >>> simple_cohort_path = 'data/simple_cohort' + >>> import os + >>> simple_cohort_path = os.path.join(os.getcwd(), 'data', 'simple_cohort') Here we walk the file system, load all phenopacket JSON files, and transform the phenopackets into instances of :class:`genophenocorr.model.Patient`: @@ -76,8 +77,8 @@ Here we walk the file system, load all phenopacket JSON files, and transform the ... pp_path = os.path.join(dirpath, filename) ... with open(pp_path) as fh: ... pp = Parse(fh.read(), Phenopacket()) - ... patient = patient_creator.create_patient(pp) - ... patients.append(patient) + ... patient = patient_creator.create_patient(pp) + ... patients.append(patient) >>> f'Loaded {len(patients)} phenopackets' diff --git a/src/genophenocorr/analysis/predicate/_all_predicates.py b/src/genophenocorr/analysis/predicate/_all_predicates.py index 9c7c3d767..7a3208ed4 100644 --- a/src/genophenocorr/analysis/predicate/_all_predicates.py +++ b/src/genophenocorr/analysis/predicate/_all_predicates.py @@ -3,7 +3,7 @@ import hpotk from genophenocorr.constants import VariantEffect -from genophenocorr.model import Patient, FeatureType +from genophenocorr.model import Patient, FeatureType, Genotype from ._api import PolyPredicate, PatientCategory @@ -95,7 +95,7 @@ def __init__(self, transcript:str) -> None: def categories(self) -> typing.Sequence[PatientCategory]: return HETEROZYGOUS, HOMOZYGOUS, NO_VARIANT - def test(self, patient: Patient, query:VariantEffect) -> typing.Optional[PatientCategory]: + def test(self, patient: Patient, query: VariantEffect) -> typing.Optional[PatientCategory]: if not isinstance(patient, Patient): raise ValueError(f"patient must be type Patient but was type {type(patient)}") if not isinstance(query, VariantEffect): @@ -109,11 +109,13 @@ def test(self, patient: Patient, query:VariantEffect) -> typing.Optional[Patient vars.add(var) if len(vars) == 1: for v in vars: - if v.genotype == "heterozygous": + gt = v.genotype_for_sample(patient.patient_id) + if gt == Genotype.HETEROZYGOUS: return HETEROZYGOUS - elif v.genotype == "homozygous": + elif gt == Genotype.HOMOZYGOUS_ALTERNATE: return HOMOZYGOUS else: + # TODO - is this really what we want to return here? return HETEROZYGOUS elif len(vars) > 1: return HOMOZYGOUS @@ -135,16 +137,17 @@ def test(self, patient: Patient, query: str) -> typing.Optional[PatientCategory] raise ValueError(f"query must be type string but was type {type(query)}") vars = set() for var in patient.variants: - #print(f"{var.variant_string} == {query}") - if var.variant_string == query: + if var.variant_coordinates.variant_key == query: vars.add(var) if len(vars) == 1: for v in vars: - if v.genotype == "heterozygous": + gt = v.genotype_for_sample(patient.patient_id) + if gt == Genotype.HETEROZYGOUS: return HETEROZYGOUS - elif v.genotype == "homozygous": + elif gt == Genotype.HOMOZYGOUS_ALTERNATE: return HOMOZYGOUS else: + # TODO - is this really what we want to return here? return HETEROZYGOUS elif len(vars) > 1: return HOMOZYGOUS @@ -173,11 +176,13 @@ def test(self, patient: Patient, query: int) -> typing.Optional[PatientCategory] vars.add(var) if len(vars) == 1: for v in vars: - if v.genotype == "heterozygous": + gt = v.genotype_for_sample(patient.patient_id) + if gt == Genotype.HETEROZYGOUS: return HETEROZYGOUS - elif v.genotype == "homozygous": + elif gt == Genotype.HOMOZYGOUS_ALTERNATE: return HOMOZYGOUS else: + # TODO - is this really what we want to return here? return HETEROZYGOUS elif len(vars) > 1: return HOMOZYGOUS @@ -209,11 +214,13 @@ def test(self, patient: Patient, query:FeatureType) -> typing.Optional[PatientCa vars.add(var) if len(vars) == 1: for v in vars: - if v.genotype == "heterozygous": + gt = v.genotype_for_sample(patient.patient_id) + if gt == Genotype.HETEROZYGOUS: return HETEROZYGOUS - elif v.genotype == "homozygous": + elif gt == Genotype.HOMOZYGOUS_ALTERNATE: return HOMOZYGOUS else: + # TODO - is this really what we want to return here? return HETEROZYGOUS elif len(vars) > 1: return HOMOZYGOUS @@ -251,11 +258,13 @@ def test(self, patient: Patient, query: str) -> typing.Optional[PatientCategory] vars.add(var) if len(vars) == 1: for v in vars: - if v.genotype == "heterozygous": + gt = v.genotype_for_sample(patient.patient_id) + if gt == Genotype.HETEROZYGOUS: return HETEROZYGOUS - elif v.genotype == "homozygous": + elif gt == Genotype.HOMOZYGOUS_ALTERNATE: return HOMOZYGOUS else: + # TODO - is this really what we want to return here? return HETEROZYGOUS elif len(vars) > 1: return HOMOZYGOUS diff --git a/src/genophenocorr/data/_toy.py b/src/genophenocorr/data/_toy.py index c85f850e5..9843d06ed 100644 --- a/src/genophenocorr/data/_toy.py +++ b/src/genophenocorr/data/_toy.py @@ -36,166 +36,175 @@ def get_toy_cohort() -> Cohort: prot_feat_2 = ProteinFeature.create(FeatureInfo('region', Region(50, 100)), FeatureType.REGION) prot = ProteinMetadata('NP_09876.5', 'FakeProtein', [prot_feat_1, prot_feat_2]) - het_snv = Variant.create_variant_from_scratch( - 'HetVar1', 'SNV', - VariantCoordinates(make_region(280, 281), 'A', 'G', 0, 'heterozygous'), - 'FakeGene', 'NM_1234.5', 'NM_1234.5:c.180A>G', False, ['missense_variant'], - [1], [prot], 60, 60) - het_del = Variant.create_variant_from_scratch( - 'HetVar2', 'indel', - VariantCoordinates(make_region(360, 363), 'TTC', 'T', -2, 'heterozygous'), - 'FakeGene', 'NM_1234.5', 'NM_1234.5:c.261_263del', False, ['frameshift_variant'], - [2], [prot], 86, 87) + snv = Variant.create_variant_from_scratch(VariantCoordinates(make_region(280, 281), 'A', 'G', 0), 'FakeGene', + 'NM_1234.5', 'NM_1234.5:c.180A>G', False, ['missense_variant'], [1], + [prot], 60, 60, + Genotypes.from_mapping({ + 'A': Genotype.HETEROZYGOUS, 'B': Genotype.HETEROZYGOUS, + 'C': Genotype.HOMOZYGOUS_ALTERNATE, + 'D': Genotype.HETEROZYGOUS, 'E': Genotype.HETEROZYGOUS, + 'G': Genotype.HETEROZYGOUS, 'J': Genotype.HETEROZYGOUS, + 'K': Genotype.HETEROZYGOUS, 'M': Genotype.HETEROZYGOUS, + 'N': Genotype.HOMOZYGOUS_ALTERNATE, + 'P': Genotype.HETEROZYGOUS, + 'Q': Genotype.HOMOZYGOUS_ALTERNATE, + 'R': Genotype.HETEROZYGOUS, 'T': Genotype.HETEROZYGOUS, + 'V': Genotype.HETEROZYGOUS, 'Y': Genotype.HETEROZYGOUS, + }) + ) + deletion = Variant.create_variant_from_scratch(VariantCoordinates(make_region(360, 363), 'TTC', 'T', -2), + 'FakeGene', 'NM_1234.5', 'NM_1234.5:c.261_263del', + False, ['frameshift_variant'], + [2], [prot], 86, 87, + Genotypes.from_mapping({ + 'D': Genotype.HETEROZYGOUS, 'F': Genotype.HETEROZYGOUS, + 'G': Genotype.HETEROZYGOUS, 'H': Genotype.HETEROZYGOUS, + 'I': Genotype.HOMOZYGOUS_ALTERNATE, + 'L': Genotype.HETEROZYGOUS, 'O': Genotype.HETEROZYGOUS, + 'R': Genotype.HETEROZYGOUS, + 'S': Genotype.HOMOZYGOUS_ALTERNATE, + 'U': Genotype.HETEROZYGOUS, 'W': Genotype.HETEROZYGOUS, + 'X': Genotype.HETEROZYGOUS, 'Z': Genotype.HETEROZYGOUS, + }) + ) het_dup = Variant.create_variant_from_scratch( - 'HetVar3', 'insertion', - VariantCoordinates(make_region(175, 176), 'T', 'TG', 1, 'heterozygous'), - 'FakeGene', 'NM_1234.5', 'NM_1234.5:c.75A>G', False, ['frameshift_variant'], - [1], [prot], 25, 25) - hom_snv = Variant.create_variant_from_scratch( - 'HomVar1', 'SNV', - VariantCoordinates(make_region(280, 281), 'A', 'G', 0, 'homozygous'), - 'FakeGene', 'NM_1234.5', 'NM_1234.5:c.180A>G', False, ['missense_variant'], - [1], [prot], 60, 60) - hom_del = Variant.create_variant_from_scratch( - 'HomVar2', 'indel', - VariantCoordinates(make_region(360, 363), 'TTC', 'T', -2, 'homozygous'), - 'FakeGene', 'NM_1234.5', 'NM_1234.5:c.261_263del', False, ['frameshift_variant'], - [2], [prot], 86, 87) + VariantCoordinates(make_region(175, 176), 'T', 'TG', 1), 'FakeGene', 'NM_1234.5', + 'NM_1234.5:c.75A>G', False, ['frameshift_variant'], [1], [prot], 25, 25, + Genotypes.empty()) # Not used in the patients below, hence `empty()`. hom_dup = Variant.create_variant_from_scratch( - 'HomVar3', 'insertion', - VariantCoordinates(make_region(175, 176), 'T', 'TG', 1,'homozygous'), - 'FakeGene', 'NM_1234.5', 'NM_1234.5:c.75A>G', False, ['frameshift_variant'], - [1], [prot], 25, 25) + VariantCoordinates(make_region(175, 176), 'T', 'TG', 1),'FakeGene', 'NM_1234.5', + 'NM_1234.5:c.75A>G', False, ['frameshift_variant'], [1], [prot], 25, 25, + Genotypes.empty()) # Not used in the patients below, hence `empty()`. patients = ( Patient('A', phenotypes=(arachnodactyly_T, spasticity_F, seizure_T), - variants=[het_snv], + variants=[snv], proteins=[prot] ), Patient('B', phenotypes=(arachnodactyly_T, seizure_T, spasticity_T), - variants=[het_snv], + variants=[snv], proteins=[prot] ), Patient('C', phenotypes=(arachnodactyly_F, spasticity_T, seizure_T), - variants=[hom_snv], + variants=[snv], proteins=[prot] ), Patient('D', phenotypes=(arachnodactyly_T, spasticity_T, seizure_T), - variants=[het_snv, het_del], + variants=[snv, deletion], proteins=[prot] ), Patient('E', phenotypes=(arachnodactyly_T, spasticity_T, seizure_F), - variants=[het_snv], + variants=[snv], proteins=[prot] ), Patient('F', phenotypes=(arachnodactyly_F, spasticity_F, seizure_T), - variants=[het_del], + variants=[deletion], proteins=[prot] ), Patient('G', phenotypes=(arachnodactyly_T, seizure_T, spasticity_T), - variants=[het_snv, het_del], + variants=[snv, deletion], proteins=[prot] ), Patient('H', phenotypes=(arachnodactyly_T, seizure_T, spasticity_F), - variants=[het_del], + variants=[deletion], proteins=[prot] ), Patient('I', phenotypes=(arachnodactyly_F, spasticity_F, seizure_T), - variants=[hom_del], + variants=[deletion], proteins=[prot] ), Patient('J', phenotypes=(arachnodactyly_T, seizure_T, spasticity_T), - variants=[het_snv], + variants=[snv], proteins=[prot] ), Patient('K', phenotypes=(arachnodactyly_F, spasticity_T, seizure_T), - variants=[het_snv], + variants=[snv], proteins=[prot] ), Patient('L', phenotypes=(arachnodactyly_F, seizure_F, spasticity_F), - variants=[het_del], + variants=[deletion], proteins=[prot] ), Patient('M', phenotypes=(arachnodactyly_T, seizure_F, spasticity_T), - variants=[het_snv], + variants=[snv], proteins=[prot] ), Patient('N', phenotypes=(arachnodactyly_F, seizure_T, spasticity_F), - variants=[hom_snv], + variants=[snv], proteins=[prot] ), Patient('O', phenotypes=(arachnodactyly_F, seizure_F, spasticity_T), - variants=[het_del], + variants=[deletion], proteins=[prot] ), Patient('P', phenotypes=(arachnodactyly_T, seizure_T, spasticity_F), - variants=[het_snv], + variants=[snv], proteins=[prot] ), Patient('Q', phenotypes=(arachnodactyly_T, seizure_F, spasticity_F), - variants=[hom_snv], + variants=[snv], proteins=[prot] ), Patient('R', phenotypes=(arachnodactyly_T, seizure_T, spasticity_F), - variants=[het_snv, het_del], + variants=[snv, deletion], proteins=[prot] ), Patient('S', phenotypes=(arachnodactyly_F, seizure_T, spasticity_T), - variants=[hom_del], + variants=[deletion], proteins=[prot] ), Patient('T', phenotypes=(arachnodactyly_T, seizure_F, spasticity_T), - variants=[het_snv], + variants=[snv], proteins=[prot] ), Patient('U', phenotypes=(arachnodactyly_F, seizure_T, spasticity_T), - variants=[het_del], + variants=[deletion], proteins=[prot] ), Patient('V', phenotypes=(arachnodactyly_T, seizure_T, spasticity_T), - variants=[het_snv], + variants=[snv], proteins=[prot] ), Patient('W', phenotypes=(arachnodactyly_F, seizure_T, spasticity_T), - variants=[het_del], + variants=[deletion], proteins=[prot] ), Patient('X', phenotypes=(arachnodactyly_F, seizure_T, spasticity_T), - variants=[het_del], + variants=[deletion], proteins=[prot] ), Patient('Y', phenotypes=(arachnodactyly_T, seizure_T, spasticity_T), - variants=[het_snv], + variants=[snv], proteins=[prot] ), Patient('Z', phenotypes=(arachnodactyly_F, seizure_T, spasticity_T), - variants=[het_del], + variants=[deletion], proteins=[prot] ), ) diff --git a/src/genophenocorr/model/__init__.py b/src/genophenocorr/model/__init__.py index 3a0d70357..eef87c8ce 100644 --- a/src/genophenocorr/model/__init__.py +++ b/src/genophenocorr/model/__init__.py @@ -1,17 +1,21 @@ """ The `genophenocorr.model` package defines data model classes used in genophenocorr. We start with the top-level elements, -such as :class:`Cohort` and :class:`Patient`, and we follow with data classes for phenotype, genotype, and protein info. +such as :class:`Cohort` and :class:`Patient`, and we follow with data classes for phenotype, genotype, transcript, +and protein info. """ from . import genome from ._cohort import Cohort, Patient +from ._gt import Genotype, Genotypes, Genotyped from ._phenotype import Phenotype from ._protein import FeatureInfo, FeatureType, ProteinFeature, ProteinMetadata +from ._tx import TranscriptCoordinates from ._variant import VariantCoordinates, TranscriptAnnotation, TranscriptInfoAware, Variant __all__ = [ 'Cohort', 'Patient', 'Phenotype', - 'Variant', 'VariantCoordinates', 'TranscriptAnnotation', 'TranscriptInfoAware', + 'Variant', 'VariantCoordinates', 'Genotype', 'Genotypes', 'Genotyped', + 'TranscriptAnnotation', 'TranscriptInfoAware', 'TranscriptCoordinates', 'ProteinMetadata', 'ProteinFeature', 'FeatureInfo', 'FeatureType', ] diff --git a/src/genophenocorr/model/_cohort.py b/src/genophenocorr/model/_cohort.py index ab61f6c7f..63fe5ffaa 100644 --- a/src/genophenocorr/model/_cohort.py +++ b/src/genophenocorr/model/_cohort.py @@ -68,7 +68,7 @@ def proteins(self) -> typing.Sequence[ProteinMetadata]: def __str__(self) -> str: return (f"Patient(" f"patient_id:{self.patient_id}, " - f"variants:{[var.variant_string for var in self.variants]}, " + f"variants:{self.variants}, " f"phenotypes:{[pheno.identifier for pheno in self.phenotypes]}, " f"proteins:{[prot.protein_id for prot in self.proteins]})") @@ -101,7 +101,7 @@ def from_patients(members: typing.Sequence[Patient]): members = set(members) for patient in members: cohort_variants.update(patient.variants) - var_counts.update([var.variant_string for var in patient.variants]) + var_counts.update([var.variant_coordinates.variant_key for var in patient.variants]) cohort_phenotypes.update(patient.phenotypes) pheno_count.update([pheno.identifier.value for pheno in patient.phenotypes if pheno.observed == True]) cohort_proteins.update(patient.proteins) diff --git a/src/genophenocorr/model/_gt.py b/src/genophenocorr/model/_gt.py new file mode 100644 index 000000000..dd9159d92 --- /dev/null +++ b/src/genophenocorr/model/_gt.py @@ -0,0 +1,163 @@ +import abc +import bisect +import enum +import typing +from typing import Iterator + +import numpy as np + + +class Genotype(enum.Enum): + """ + `Genotype` represents state of a variable locus in a diploid genome. + """ + + NO_CALL = ('.',) + HOMOZYGOUS_REFERENCE = ('0/0',) + HETEROZYGOUS = ('0/1',) + HOMOZYGOUS_ALTERNATE = ('1/1',) + HEMIZYGOUS = ('1',) + + def __init__(self, code: str): + self._code = code + + @property + def code(self) -> str: + return self._code + + def __str__(self): + return self.code + + +class Genotypes(typing.Sized, typing.Iterable): + """ + `Genotypes` is a container for mapping between sample ID and its genotype. + + Use one of the static methods to create an instance: + + >>> gts = Genotypes.single('A', Genotype.HETEROZYGOUS) + >>> gts = Genotypes.from_mapping({'A': Genotype.HETEROZYGOUS, 'B': Genotype.HOMOZYGOUS_ALTERNATE}) + + There are 2 genotypes in the container: + + >>> len(gts) + 2 + + You can get a genotype for a sample ID: + + >>> gts.for_sample('A') + Genotype.HETEROZYGOUS + + You will get `None` if the sample is not present: + + >>> gts.for_sample('UNKNOWN') + None + + You can iterate over sample-genotype pairs: + + >>> for sample_id, gt in gts: + ... print(sample_id, gt) + A 0/1 + B 1/1 + """ + + @staticmethod + def empty(): + return EMPTY + + @staticmethod + def single(sample_id: str, genotype: Genotype): + """ + A shortcut for creating `Genotypes` for a single sample: + + >>> gts = Genotypes.single('A', Genotype.HOMOZYGOUS_ALTERNATE) + + >>> assert len(gts) == 1 + >>> assert gts.for_sample('A') == Genotype.HOMOZYGOUS_ALTERNATE + """ + + return Genotypes((sample_id,), (genotype,)) + + @staticmethod + def from_mapping(mapping: typing.Mapping[str, Genotype]): + """ + Create `Genotypes` from mapping between sample IDs and genotypes. + + >>> gts = Genotypes.from_mapping({'A': Genotype.HETEROZYGOUS, 'B': Genotype.HOMOZYGOUS_ALTERNATE}) + + >>> assert len(gts) == 2 + """ + + return Genotypes(*Genotypes._preprocess_mapping(mapping)) + + @staticmethod + def _preprocess_mapping(genotypes: typing.Mapping[str, Genotype]) -> typing.Tuple[typing.Sequence[str], typing.Sequence[Genotype]]: + samples = np.empty(shape=(len(genotypes),), dtype=object) + gts = np.empty(shape=(len(genotypes),), dtype=object) + + for i, (sample, gt) in enumerate(genotypes.items()): + samples[i] = sample + gts[i] = gt + + indices = np.argsort(samples) + + return samples[indices], gts[indices] + + def __init__(self, samples: typing.Iterable[str], genotypes: typing.Iterable[Genotype]): + self._samples = tuple(samples) + self._gts = tuple(genotypes) + if len(self._samples) != len(self._gts): + raise ValueError(f'Mismatch between the sample and genotype count: {len(self._samples)} != {len(self._gts)}') + + def for_sample(self, sample_id: str) -> typing.Optional[Genotype]: + """ + Get a genotype for a sample or `None` if the genotype is not present. + + :param sample_id: a `str` with sample's identifier. + """ + idx = bisect.bisect_left(self._samples, sample_id) + if idx != len(self._samples) and self._samples[idx] == sample_id: + return self._gts[idx] + return None + + def __len__(self) -> int: + return len(self._samples) + + def __iter__(self) -> Iterator[typing.Tuple[str, Genotype]]: + return zip(self._samples, self._gts) + + def __hash__(self): + return hash((self._samples, self._gts)) + + def __eq__(self, other): + return (isinstance(other, Genotypes) + and self._samples == other._samples + and self._gts == other._gts) + + def __str__(self): + return f'Genotypes({["{}={}".format(sid, gt) for sid, gt in zip(self._samples, self._gts)]})' + + def __repr__(self): + return f'Genotypes(samples={self._samples}, genotypes={self._gts})' + + +EMPTY = Genotypes((), ()) + + +class Genotyped(metaclass=abc.ABCMeta): + """ + `Genotyped` entities + """ + + @property + @abc.abstractmethod + def genotypes(self) -> Genotypes: + pass + + def genotype_for_sample(self, sample_id: str) -> typing.Optional[Genotype]: + """ + Get a genotype for a sample or `None` if the genotype is not present. + + :param sample_id: a `str` with sample's identifier. + """ + return self.genotypes.for_sample(sample_id) diff --git a/src/genophenocorr/model/_protein.py b/src/genophenocorr/model/_protein.py index e3f8df282..39c126f1c 100644 --- a/src/genophenocorr/model/_protein.py +++ b/src/genophenocorr/model/_protein.py @@ -17,8 +17,6 @@ class FeatureInfo: """ def __init__(self, name: str, region: Region): - if not isinstance(name, str): - raise ValueError(f"name must be type string but was type {type(name)}") self._name = hpotk.util.validate_instance(name, str, 'name') self._region = hpotk.util.validate_instance(region, Region, 'region') @@ -30,14 +28,6 @@ def name(self) -> str: """ return self._name - @property - def region(self) -> Region: - """ - Returns: - Region: a protein region spanned by the feature. - """ - return self._region - @property def start(self) -> int: """ @@ -54,6 +44,14 @@ def end(self) -> int: """ return self._region.end + @property + def region(self) -> Region: + """ + Returns: + Region: a protein region spanned by the feature. + """ + return self._region + def __len__(self): return len(self._region) diff --git a/src/genophenocorr/model/_test_gt.py b/src/genophenocorr/model/_test_gt.py new file mode 100644 index 000000000..6f5b46c33 --- /dev/null +++ b/src/genophenocorr/model/_test_gt.py @@ -0,0 +1,30 @@ +from ._gt import Genotypes, Genotype + + +class TestGenotypes: + + def test_genotypes(self): + gts = Genotypes.from_mapping({ + 'A': Genotype.HETEROZYGOUS, + 'C': Genotype.HEMIZYGOUS, + 'B': Genotype.HOMOZYGOUS_ALTERNATE, + 'E': Genotype.NO_CALL, + 'D': Genotype.HOMOZYGOUS_REFERENCE, + }) + assert gts.for_sample('A') == Genotype.HETEROZYGOUS + assert gts.for_sample('B') == Genotype.HOMOZYGOUS_ALTERNATE + assert gts.for_sample('C') == Genotype.HEMIZYGOUS + assert gts.for_sample('D') == Genotype.HOMOZYGOUS_REFERENCE + assert gts.for_sample('E') == Genotype.NO_CALL + assert gts.for_sample('X') is None + + def test_iteration(self): + gts = Genotypes.from_mapping({ + 'A': Genotype.HETEROZYGOUS, + 'C': Genotype.HEMIZYGOUS, + 'D': Genotype.HOMOZYGOUS_REFERENCE, + }) + items = [(sample_id, gt) for sample_id, gt in gts] + assert items[0] == ('A', Genotype.HETEROZYGOUS) + assert items[1] == ('C', Genotype.HEMIZYGOUS) + assert items[2] == ('D', Genotype.HOMOZYGOUS_REFERENCE) diff --git a/src/genophenocorr/model/genome/_tx.py b/src/genophenocorr/model/_tx.py similarity index 91% rename from src/genophenocorr/model/genome/_tx.py rename to src/genophenocorr/model/_tx.py index 2c7cd2d97..d8d9b6152 100644 --- a/src/genophenocorr/model/genome/_tx.py +++ b/src/genophenocorr/model/_tx.py @@ -2,7 +2,7 @@ import hpotk -from ._genome import GenomicRegion +from .genome import GenomicRegion class TranscriptCoordinates: @@ -19,7 +19,7 @@ def __init__(self, identifier: str, self._id = hpotk.util.validate_instance(identifier, str, 'identifier') self._region = hpotk.util.validate_instance(region, GenomicRegion, 'region') self._exons = tuple(exons) - # TODO - check + # TODO - check CDS self._cds_start = cds_start self._cds_end = cds_end @@ -60,6 +60,9 @@ def __eq__(self, other): and self.cds_start == other.cds_start and self.cds_end == other.cds_end) + def __hash__(self): + return hash((self._id, self._region, self._exons, self._cds_start, self._cds_end)) + def __str__(self): return f'TranscriptCoordinates(identifier={self.identifier}, region={self.region})' diff --git a/src/genophenocorr/model/_variant.py b/src/genophenocorr/model/_variant.py index 3e492da95..bd94cc2d0 100644 --- a/src/genophenocorr/model/_variant.py +++ b/src/genophenocorr/model/_variant.py @@ -1,164 +1,14 @@ import abc import typing +import warnings -import hpotk.util +import hpotk from .genome import GenomicRegion +from ._gt import Genotyped, Genotypes from ._protein import ProteinMetadata -class VariantCoordinates: - """A representation of coordinates of sequence and symbolic variants. - The breakend variants are not supported. - - Attributes: - chrom (string): Chromosome name such as `1`, `X`, `MT` - start (integer): The 0-based starting coordinate of the ref allele - end (integer): The 0-based ending coordinate of the ref allele - ref (string): The reference allele - alt (string): The alternate allele - change_length (integer): The change between the ref and alt alleles due to the variant presence - genotype (string): The genotype of the variant (e.g. Heterozygous, Homozygous) - Methods: - is_structural: Returns True if the variant coordinates use structural variant notation - as_string: Returns a readable representation of the variant coordinate - """ - - # TODO - should use/extend `genophenocorr.model.genome.GenomicRegion` - - def __init__(self, region: GenomicRegion, ref: str, alt: str, change_length: int, genotype: str): - """Constructs all necessary attributes for a VariantCoordinates object - - Args: - chrom (string): Chromosome variant coordinates are located on - start (integer): The 0-based starting coordinate of the ref allele - end (integer): The 0-based ending coordinate of the ref allele - ref (string): The reference allele - alt (string): The alternate allele - change_length (integer): The change between the ref and alt alleles due to the variant presence - genotype (string): The genotype of the variant (e.g. Heterozygous, Homozygous) - """ - # TODO(lnrekerle) - instance/type check - # TODO - id? - self._region = hpotk.util.validate_instance(region, GenomicRegion, 'region') - self._ref = ref - self._alt = alt - self._change_length = change_length - self._genotype = genotype - - @property - def chrom(self) -> str: - """ - Returns: - string: The label of the chromosome/contig where the variant is located. - """ - return self._region.contig.name - - @property - def start(self) -> int: - """ - Returns: - integer: The 0-based start coordinate (excluded) of the ref allele. - """ - return self._region.start - - @property - def end(self) -> int: - """ - Returns: - integer: The 0-based end coordinate (included) of the ref allele. - """ - return self._region.end - - @property - def region(self) -> GenomicRegion: - """ - Returns: - GenomicRegion: The genomic region spanned by the ref allele. - """ - return self._region - - @property - def ref(self) -> str: - """ - Returns: - string: The reference allele (e.g. "A", "N"). The allele may be an empty string. - """ - return self._ref - - @property - def alt(self) -> str: - """ - Returns: - string: The alternate allele (e.g. "A", "GG", ""). The allele may be an empty string for sequence variants. - The symbolic alternate allele follow the VCF notation and use the `<` and `>` characters - (e.g. "", ""). - """ - return self._alt - - @property - def change_length(self) -> int: - """ - Returns: - integer: The change between the ref and alt alleles due to the variant presence. SNVs lead to change length of zero, - deletions and insertions/duplications lead to negative and positive change lengths, respectively. - """ - return self._change_length - - @property - def genotype(self) -> str: - # TODO - add a doc string with an example. Do we return `0/1`, or `HET`, or GENO_0000135 - # (http://purl.obolibrary.org/obo/GENO_0000135)? All these are strings. - return self._genotype - - def is_structural(self) -> bool: - """Checks if the variant coordinates use structural variant notation - (e.g. `chr5 101 . N . . SVTYPE=DEL;END=120;SVLEN=-10`) - as opposed to the sequence/literal notation (`chr5 101 . NACGTACGTAC N`). - - Returns: - boolean: True if the variant coordinates use structural variant notation - """ - return len(self._alt) != 0 and self._alt.startswith('<') and self._alt.endswith('>') - - def as_string(self) -> str: - """ - Returns: - string: A readable representation of the variant coordinates - """ - return f"{self.chrom}_{self.start}_{self.end}_{self.ref}_{self.alt}_{self.genotype}".replace('<', '').replace( - '>', '') - - def __len__(self): - """ - Get the number of bases on the ref allele that are affected by the variant. - """ - return len(self._region) - - def __eq__(self, other) -> bool: - return isinstance(other, VariantCoordinates) \ - and self.alt == other.alt \ - and self.ref == other.ref \ - and self.chrom == other.chrom \ - and self.start == other.start \ - and self.end == other.end \ - and self.change_length == other.change_length \ - and self.genotype == other.genotype - - def __str__(self) -> str: - return f"VariantCoordinates(chrom={self.chrom}, " \ - f"start={self.start}, end={self.end}, " \ - f"ref={self.ref}, alt={self.alt}, " \ - f"change_length={self.change_length}, " \ - f"genotype={self.genotype})" - - def __repr__(self) -> str: - return str(self) - - def __hash__(self) -> int: - return hash((self._region, self._ref, self._alt, self._change_length, self._genotype)) - - class TranscriptInfoAware(metaclass=abc.ABCMeta): """ The implementors know about basic gene/transcript identifiers. @@ -276,7 +126,8 @@ def variant_effects(self) -> typing.Sequence[str]: def overlapping_exons(self) -> typing.Sequence[int]: """ Returns: - Sequence[integer]: A sequence of IDs of the exons that overlap with the variant. + Sequence[integer]: A sequence of 1-based exon indices (the index of the 1st exon is `1`) + that overlap with the variant. """ return self._affected_exons @@ -325,21 +176,202 @@ def __hash__(self) -> int: self.protein_affected, self.protein_effect_location)) -class Variant: + +class VariantCoordinates: + """A representation of coordinates of sequence and symbolic variants. + The breakend variants are not supported. + + Attributes: + region (GenomicRegion): The region spanned by the variant reference allele + ref (string): The reference allele + alt (string): The alternate allele + change_length (integer): The change between the ref and alt alleles due to the variant presence + """ + + def __init__(self, region: GenomicRegion, ref: str, alt: str, change_length: int): + self._region = hpotk.util.validate_instance(region, GenomicRegion, 'region') + self._ref = hpotk.util.validate_instance(ref, str, 'ref') + self._alt = hpotk.util.validate_instance(alt, str, 'alt') + self._change_length = hpotk.util.validate_instance(change_length, int, 'change_length') + + @property + def chrom(self) -> str: + """ + Returns: + string: The label of the chromosome/contig where the variant is located. + """ + return self._region.contig.name + + @property + def start(self) -> int: + """ + Returns: + integer: The 0-based start coordinate (excluded) of the ref allele. + """ + return self._region.start + + @property + def end(self) -> int: + """ + Returns: + integer: The 0-based end coordinate (included) of the ref allele. + """ + return self._region.end + + @property + def region(self) -> GenomicRegion: + """ + Returns: + GenomicRegion: The genomic region spanned by the ref allele. + """ + return self._region + + @property + def ref(self) -> str: + """ + Returns: + string: The reference allele (e.g. "A", "N"). The allele may be an empty string. + """ + return self._ref + + @property + def alt(self) -> str: + """ + Returns: + string: The alternate allele (e.g. "A", "GG", ""). The allele may be an empty string for sequence variants. + The symbolic alternate allele follow the VCF notation and use the `<` and `>` characters + (e.g. "", ""). + """ + return self._alt + + @property + def change_length(self) -> int: + """ + Returns: + integer: The change between the ref and alt alleles due to the variant presence. SNVs lead to change length of zero, + deletions and insertions/duplications lead to negative and positive change lengths, respectively. + """ + return self._change_length + + @property + def variant_key(self) -> str: + """ + Get a readable representation of the variant's coordinates. + + For instance, `X_12345_12345_C_G` for sequence variant or `22_10001_20000_INV` + Note that both start and end coordinates use 1-based (included) coordinate system. + """ + if self.is_structural(): + return f'{self.chrom}_{self.start + 1}_{self.end}_{self.alt[1:-1]}' + else: + return f'{self.chrom}_{self.start + 1}_{self.end}_{self.ref}_{self.alt}' + + @property + def variant_class(self) -> str: + """ + Returns: + string: The variant class. (e.g. `DUP`, `SNV`, `INS`, `MNV`, `INV`, ...) + """ + + if self.is_structural(): + # Expecting a `str` like , , , , ... + return self.alt[1:-1] + else: + if len(self.ref) > len(self.alt): + return 'DEL' + elif len(self.ref) < len(self.alt): + # may also be a duplication, but it's hard to say from this + return 'INS' + else: + if len(self.ref) == 1: + return 'SNV' + else: + return 'MNV' + + def is_structural(self) -> bool: + """Checks if the variant coordinates use structural variant notation + (e.g. `chr5 101 . N . . SVTYPE=DEL;END=120;SVLEN=-10`) + as opposed to the sequence/literal notation (`chr5 101 . NACGTACGTAC N`). + + Returns: + boolean: True if the variant coordinates use structural variant notation + """ + return len(self._alt) != 0 and self._alt.startswith('<') and self._alt.endswith('>') + + def __len__(self): + """ + Get the number of bases on the ref allele that are affected by the variant. + """ + return len(self._region) + + def __eq__(self, other) -> bool: + return isinstance(other, VariantCoordinates) \ + and self.alt == other.alt \ + and self.ref == other.ref \ + and self.chrom == other.chrom \ + and self.start == other.start \ + and self.end == other.end \ + and self.change_length == other.change_length + + def __str__(self) -> str: + return f"VariantCoordinates(chrom={self.chrom}, " \ + f"start={self.start}, end={self.end}, " \ + f"ref={self.ref}, alt={self.alt}, " \ + f"change_length={self.change_length})" + + def __repr__(self) -> str: + return str(self) + + def __hash__(self) -> int: + return hash((self._region, self._ref, self._alt, self._change_length)) + + +class VariantCoordinateAware(metaclass=abc.ABCMeta): + + @property + @abc.abstractmethod + def variant_coordinates(self) -> VariantCoordinates: + pass + + @property + def variant_string(self) -> str: + warnings.warn('variant_string` was deprecated and will be removed in v0.2.0. ' + 'Use `variant_coordinates.variant_key` instead', DeprecationWarning, stacklevel=2) + # TODO[0.2.0] - remove + return self.variant_coordinates.variant_key + + @property + def variant_class(self) -> str: + """ + Returns: + string: The variant class. (e.g. `DUP`, `SNV`, `INS`, `MNV`, `INV`, ...) + """ + return self.variant_coordinates.variant_class + + +class FunctionalAnnotationAware(metaclass=abc.ABCMeta): + + @property + @abc.abstractmethod + def tx_annotations(self) -> typing.Sequence[TranscriptAnnotation]: + pass + + +class Variant(VariantCoordinateAware, FunctionalAnnotationAware, Genotyped): """Class that represents results of the functional annotation of a variant with all included transcripts. + :param var_coordinates: the coordinates of the variant. + :param tx_annotations: an iterable of functional annotations. + :param genotypes: the genotypes Attributes: - variant_coordinates (VariantCoordinates): A VariantCoordinates object with coordinates for this Variant + variant_coordinates (VariantCoordinates): variant_string (string): A readable representation of the variant coordinates - genotype (string, Optional): The genotype of the variant (e.g. Homozygous, Heterozygous) tx_annotations (Sequence[TranscriptAnnotation], Optional): A sequence of TranscriptAnnotation objects representing transcripts affected by this variant variant_class (string): The variant class (e.g. Duplication, SNV, etc.) """ @staticmethod - def create_variant_from_scratch(variant_id: str, - variant_class: str, - variant_coordinates: VariantCoordinates, + def create_variant_from_scratch(variant_coordinates: VariantCoordinates, gene_name: str, trans_id: str, hgvsc_id: str, @@ -348,33 +380,25 @@ def create_variant_from_scratch(variant_id: str, exons_effected: typing.Sequence[int], protein: typing.Sequence[ProteinMetadata], protein_effect_start: int, - protein_effect_end: int): + protein_effect_end: int, + genotypes: Genotypes): transcript = TranscriptAnnotation(gene_name, trans_id, hgvsc_id, is_preferred, consequences, exons_effected, protein, protein_effect_start, protein_effect_end) - return Variant(variant_id, variant_class, variant_coordinates, [transcript], variant_coordinates.genotype) + return Variant(variant_coordinates, [transcript], genotypes) - def __init__(self, var_id: str, - var_class: str, - var_coordinates: VariantCoordinates, - tx_annotations: typing.Optional[typing.Sequence[TranscriptAnnotation]], - genotype: typing.Optional[str]): + def __init__(self, var_coordinates: VariantCoordinates, + tx_annotations: typing.Iterable[TranscriptAnnotation], + genotypes: Genotypes): """Constructs all necessary attributes for a Variant object Args: var_coordinates (VariantCoordinates): A VariantCoordinates object with coordinates for this Variant - var_id (string): A readable representation of the variant coordinates - genotype (string, Optional): The genotype of the variant (e.g. Homozygous, Heterozygous) - tx_annotations (Sequence[TranscriptAnnotation], Optional): A sequence of TranscriptAnnotation objects representing transcripts affected by this variant - var_class (string): The variant class (e.g. Duplication, SNV, etc.) + tx_annotations (typing.Sequence[TranscriptAnnotation]): A sequence of TranscriptAnnotation objects representing transcripts affected by this variant + genotypes (Genotypes): genotypes container """ - self._id = var_id self._var_coordinates = var_coordinates - self._var_class = var_class - if tx_annotations is None: - self._tx_annotations = None - else: - self._tx_annotations = tuple(tx_annotations) - self._genotype = genotype + self._tx_annotations = tuple(tx_annotations) + self._gts = genotypes @property def variant_coordinates(self) -> VariantCoordinates: @@ -384,26 +408,6 @@ def variant_coordinates(self) -> VariantCoordinates: """ return self._var_coordinates - @property - def variant_string(self) -> str: - """ - Returns: - string: A readable representation of the variant's coordinates. - Format - "Chromosome_Start_Reference/Alternative" or - "Chromosome_Start_StructuralType" - """ - return self._id - - @property - def genotype(self) -> str: - """Optional parameter. Required for recessive tests. - Possible values: Heterozygous, Homozygous, Hemizygous - - Returns: - string: Genotype of the variant - """ - return self._genotype - @property def tx_annotations(self) -> typing.Sequence[TranscriptAnnotation]: """A collection of TranscriptAnnotations that each represent results of the functional annotation @@ -415,32 +419,23 @@ def tx_annotations(self) -> typing.Sequence[TranscriptAnnotation]: return self._tx_annotations @property - def variant_class(self) -> str: - """ - Returns: - string: The variant class. (e.g. Duplication, SNV, Deletion, etc.) - """ - return self._var_class + def genotypes(self) -> Genotypes: + return self._gts def __eq__(self, other) -> bool: return isinstance(other, Variant) \ - and self.variant_string == other.variant_string \ - and self.genotype == other.genotype \ - and self.variant_class == other.variant_class \ and self.variant_coordinates == other.variant_coordinates \ - and self.tx_annotations == other.tx_annotations + and self.tx_annotations == other.tx_annotations \ + and self.genotypes == other.genotypes def __hash__(self) -> int: return hash( - (self.variant_coordinates, self.variant_string, self.variant_class, self.genotype, self.tx_annotations)) + (self.variant_coordinates, self.tx_annotations, self.genotypes)) def __repr__(self) -> str: return str(self) def __str__(self) -> str: - return f"Variant(variant_coordinates:{str(self.variant_coordinates)}," \ - f"variant_string:{self.variant_string}," \ - f"genotype:{self.genotype}," \ - f"tx_annotations:{self.tx_annotations}," \ - f"variant_class:{self.variant_class})" - + return (f"Variant(variant_coordinates:{str(self.variant_coordinates)}, " + f"tx_annotations:{self.tx_annotations}, " + f"genotypes:{self.genotypes})") diff --git a/src/genophenocorr/model/genome/__init__.py b/src/genophenocorr/model/genome/__init__.py index 88519c262..3d01fba84 100644 --- a/src/genophenocorr/model/genome/__init__.py +++ b/src/genophenocorr/model/genome/__init__.py @@ -7,13 +7,8 @@ The module provides *GRCh37.p13* and *GRCh38.p13*, the two most commonly used human genome builds. -Last, we provide models of :class:`TranscriptCoordinates` and :class:`ProteinCoordinates` to work with -their coordinates. - The classes are largely a port of `Svart `_ library. """ from ._builds import GRCh37, GRCh38 from ._genome import Contig, GenomeBuild, Strand, Stranded, Transposable, GenomicRegion, Region -from ._protein import ProteinCoordinates -from ._tx import TranscriptCoordinates diff --git a/src/genophenocorr/model/genome/_genome.py b/src/genophenocorr/model/genome/_genome.py index 98c26ecd5..16c426807 100644 --- a/src/genophenocorr/model/genome/_genome.py +++ b/src/genophenocorr/model/genome/_genome.py @@ -239,14 +239,14 @@ def __init__(self, contig: Contig, start: int, end: int, strand: Strand): def contig(self) -> Contig: return self._contig - def start_on_strand(self, strand: Strand) -> int: - if self.strand == strand: + def start_on_strand(self, other: Strand) -> int: + if self.strand == other: return self.start else: return len(self.contig) - self.end - def end_on_strand(self, strand: Strand) -> int: - if self.strand == strand: + def end_on_strand(self, other: Strand) -> int: + if self.strand == other: return self.end else: return len(self.contig) - self.start diff --git a/src/genophenocorr/model/genome/_protein.py b/src/genophenocorr/model/genome/_protein.py deleted file mode 100644 index bfc2133a7..000000000 --- a/src/genophenocorr/model/genome/_protein.py +++ /dev/null @@ -1,30 +0,0 @@ -import hpotk.util - -from ._genome import Region - - -class ProteinCoordinates: - - def __init__(self, identifier: str, - region: Region): - self._id = hpotk.util.validate_instance(identifier, str, 'identifier') - self._region = hpotk.util.validate_instance(region, Region, 'region') - - @property - def identifier(self) -> str: - return self._id - - @property - def region(self) -> Region: - return self._region - - def __eq__(self, other): - return (isinstance(other, ProteinCoordinates) - and self.identifier == other.identifier - and self.region == other.region) - - def __str__(self): - return f'ProteinCoordinates(identifier={self.identifier}, region={self.region})' - - def __repr__(self): - return str(self) diff --git a/src/genophenocorr/preprocessing/_api.py b/src/genophenocorr/preprocessing/_api.py index 86af5bf54..241f4b4ee 100644 --- a/src/genophenocorr/preprocessing/_api.py +++ b/src/genophenocorr/preprocessing/_api.py @@ -1,8 +1,7 @@ import abc import typing -from genophenocorr.model import Variant, VariantCoordinates, ProteinMetadata, TranscriptInfoAware -from genophenocorr.model.genome import TranscriptCoordinates +from genophenocorr.model import VariantCoordinates, Genotype, ProteinMetadata, TranscriptInfoAware, TranscriptCoordinates, TranscriptAnnotation T = typing.TypeVar('T') @@ -10,9 +9,9 @@ class VariantCoordinateFinder(typing.Generic[T], metaclass=abc.ABCMeta): @abc.abstractmethod - def find_coordinates(self, item: T) -> VariantCoordinates: + def find_coordinates(self, item: T) -> typing.Tuple[VariantCoordinates, Genotype]: """ - Determine :class:`VariantCoordinates` from an `item` of some sort. + Determine :class:`VariantCoordinates` and :class:`Genotype` from an `item` of some sort. """ pass @@ -20,7 +19,7 @@ def find_coordinates(self, item: T) -> VariantCoordinates: class FunctionalAnnotator(metaclass=abc.ABCMeta): @abc.abstractmethod - def annotate(self, variant_coordinates: VariantCoordinates) -> Variant: + def annotate(self, variant_coordinates: VariantCoordinates) -> typing.Sequence[TranscriptAnnotation]: pass diff --git a/src/genophenocorr/preprocessing/_config.py b/src/genophenocorr/preprocessing/_config.py index 0fe9f977b..63d321479 100644 --- a/src/genophenocorr/preprocessing/_config.py +++ b/src/genophenocorr/preprocessing/_config.py @@ -36,7 +36,7 @@ def configure_caching_patient_creator(hpo: hpotk.MinimalOntology, Choose from ``{'UNIPROT'}`` (just one fallback implementation is available at the moment). """ if cache_dir is None: - cache_dir = os.path.join(os.getcwd(), '.cache') + cache_dir = os.path.join(os.getcwd(), '.genophenocorr_cache') if genome_build == 'GRCh38.p13': build = GRCh38 diff --git a/src/genophenocorr/preprocessing/_patient.py b/src/genophenocorr/preprocessing/_patient.py index eec614b86..58eae22a6 100644 --- a/src/genophenocorr/preprocessing/_patient.py +++ b/src/genophenocorr/preprocessing/_patient.py @@ -1,13 +1,8 @@ import abc -import logging -import hpotk import typing -# pyright: reportGeneralTypeIssues=false -from genophenocorr.model import ProteinMetadata, Phenotype, Patient, Variant -from ._phenotype import PhenotypeCreator -from ._api import FunctionalAnnotator +from genophenocorr.model import Patient T = typing.TypeVar('T') diff --git a/src/genophenocorr/preprocessing/_phenopacket.py b/src/genophenocorr/preprocessing/_phenopacket.py index f2721c976..bcccddcd9 100644 --- a/src/genophenocorr/preprocessing/_phenopacket.py +++ b/src/genophenocorr/preprocessing/_phenopacket.py @@ -9,7 +9,8 @@ from google.protobuf.json_format import Parse from phenopackets import GenomicInterpretation, Phenopacket -from genophenocorr.model import Phenotype, ProteinMetadata, VariantCoordinates, Variant, Patient, Cohort +from genophenocorr.model import Phenotype, ProteinMetadata, VariantCoordinates, Variant, Genotype, Genotypes +from genophenocorr.model import Patient, Cohort from genophenocorr.model.genome import GenomeBuild, GenomicRegion, Strand from ._patient import PatientCreator @@ -29,18 +30,16 @@ def __init__(self, build: GenomeBuild): self._logger = logging.getLogger(__name__) self._build = build - def find_coordinates(self, item: GenomicInterpretation) -> VariantCoordinates: + def find_coordinates(self, item: GenomicInterpretation) -> typing.Tuple[VariantCoordinates, Genotype]: """Creates a VariantCoordinates object from the data in a given Phenopacket Args: - item (GenomicInterpretation): A Phenopacket object + item (GenomicInterpretation): a genomic interpretation element from Phenopacket Schema Returns: - VariantCoordinates: A VariantCoordinates object + tuple[VariantCall, Genotype]: A tuple of :class:`VariantCoordinates` and :class:`Genotype` """ if not isinstance(item, GenomicInterpretation): raise ValueError(f"item must be a Phenopacket GenomicInterpretation but was type {type(item)}") - chrom, ref, alt, genotype = None, None, None, None - start, end = 0, 0 variant_descriptor = item.variant_interpretation.variation_descriptor if len(variant_descriptor.vcf_record.chrom) == 0 and len( variant_descriptor.variation.copy_number.allele.sequence_location.sequence_id) != 0: @@ -77,8 +76,25 @@ def find_coordinates(self, item: GenomicInterpretation) -> VariantCoordinates: if any(field is None for field in (contig, ref, alt, genotype)): raise ValueError(f'Cannot determine variant coordinate from genomic interpretation {item}') region = GenomicRegion(contig, start, end, Strand.POSITIVE) - return VariantCoordinates(region, ref, alt, len(alt) - len(ref), genotype) + vc = VariantCoordinates(region, ref, alt, len(alt) - len(ref)) + gt = self._map_geno_genotype_label(genotype) + + return vc, gt + + @staticmethod + def _map_geno_genotype_label(genotype: str) -> Genotype: + """ + Mapping from labels of the relevant GENO terms that is valid as of Oct 2nd, 2023. + """ + if genotype in ('heterozygous', 'compound heterozygous', 'simple heterozygous'): + return Genotype.HETEROZYGOUS + elif genotype == 'homozygous': + return Genotype.HOMOZYGOUS_ALTERNATE + elif genotype in ('hemizygous', 'hemizygous X-linked', 'hemizygous Y-linked'): + return Genotype.HEMIZYGOUS + else: + raise ValueError(f'Unknown genotype {genotype}') class PhenopacketPatientCreator(PatientCreator[Phenopacket]): @@ -112,12 +128,22 @@ def create_patient(self, item: Phenopacket) -> Patient: Returns: Patient: A Patient object """ + sample_id = self._extract_id(item) phenotypes = self._add_phenotypes(item) - variants = self._add_variants(item) + variants = self._add_variants(sample_id, item) protein_data = self._add_protein_data(variants) return Patient(item.id, phenotypes, variants, protein_data) - def _add_variants(self, pp: Phenopacket) -> typing.Sequence[Variant]: + @staticmethod + def _extract_id(pp: Phenopacket): + subject = pp.subject + if len(subject.id) > 0: + return subject.id + else: + return pp.id + + + def _add_variants(self, sample_id: str, pp: Phenopacket) -> typing.Sequence[Variant]: """Creates a list of Variant objects from the data in a given Phenopacket Args: @@ -127,16 +153,16 @@ def _add_variants(self, pp: Phenopacket) -> typing.Sequence[Variant]: """ variants_list = [] for i, interp in enumerate(pp.interpretations): - if hasattr(interp, 'diagnosis') and interp.diagnosis is not None: - for genomic_interp in interp.diagnosis.genomic_interpretations: - vc = self._coord_finder.find_coordinates(genomic_interp) - if "N" in vc.alt: - self._logger.warning(f'Patient {pp.id} has unknown alternative variant {vc.alt} and will not be included.') - continue - variant = self._func_ann.annotate(vc) - variants_list.append(variant) - else: - self._logger.warning(f'No diagnosis in interpretation #{i} of phenopacket {pp.id}') + for genomic_interp in interp.diagnosis.genomic_interpretations: + vc, gt = self._coord_finder.find_coordinates(genomic_interp) + if "N" in vc.alt: + self._logger.warning(f'Patient {pp.id} has unknown alternative variant {vc.alt} and will not be included.') + continue + tx_annotations = self._func_ann.annotate(vc) + genotype = Genotypes.single(sample_id, gt) + variant = Variant(vc, tx_annotations, genotype) + variants_list.append(variant) + if len(variants_list) == 0: self._logger.warning(f'Expected at least one variant per patient, but received none for patient {pp.id}') return variants_list @@ -157,13 +183,13 @@ def _add_phenotypes(self, pp: Phenopacket) -> typing.Sequence[Phenotype]: return [] return self._phenotype_creator.create_phenotype(hpo_id_list) - def _add_protein_data(self, variants: typing.Sequence[Variant]) -> typing.Sequence[ProteinMetadata]: + def _add_protein_data(self, variants: typing.Sequence[Variant]) -> typing.Collection[ProteinMetadata]: """Creates a list of ProteinMetadata objects from a given list of Variant objects Args: variants (Sequence[Variant]): A list of Variant objects Returns: - Sequence[ProteinMetadata]: A list of ProteinMetadata objects + Collection[ProteinMetadata]: A list of ProteinMetadata objects """ final_prots = set() for var in variants: diff --git a/src/genophenocorr/preprocessing/_test_variant.py b/src/genophenocorr/preprocessing/_test_variant.py index 0b67cbebe..db3c276ad 100644 --- a/src/genophenocorr/preprocessing/_test_variant.py +++ b/src/genophenocorr/preprocessing/_test_variant.py @@ -57,7 +57,7 @@ def test_verify_start_end_coordinates(contig_name, start, end, ref, alt, chlen, expected): contig = GRCh38.contig_by_name(contig_name) region = GenomicRegion(contig, start, end, Strand.POSITIVE) - vc = VariantCoordinates(region, ref, alt, chlen, 'Whatever') + vc = VariantCoordinates(region, ref, alt, chlen) out = verify_start_end_coordinates(vc) assert out == expected @@ -68,19 +68,21 @@ def pp_vc_finder() -> PhenopacketVariantCoordinateFinder: @pytest.mark.parametrize("pp_path, expected", - [('test_data/deletion_test.json', '16_89284128_89284134_CTTTTT_C_heterozygous'), - ('test_data/insertion_test.json', '16_89280828_89280830_C_CA_heterozygous'), - ('test_data/missense_test.json', '16_89279134_89279135_G_C_heterozygous'), - ('test_data/duplication_test.json', '16_89279849_89279851_G_GC_heterozygous'), - ('test_data/delinsert_test.json', '16_89284600_89284602_GG_A_heterozygous'), - ('test_data/CVDup_test.json', '16_89284523_89373231_N_DUP_heterozygous'), - ('test_data/CVDel_test.json', '16_89217281_89506042_N_DEL_heterozygous') + [('test_data/deletion_test.json', '16_89284129_89284134_CTTTTT_C'), + ('test_data/insertion_test.json', '16_89280829_89280830_C_CA'), + ('test_data/missense_test.json', '16_89279135_89279135_G_C'), + ('test_data/duplication_test.json', '16_89279850_89279851_G_GC'), + ('test_data/delinsert_test.json', '16_89284601_89284602_GG_A'), + ('test_data/CVDup_test.json', '16_89284524_89373231_DUP'), + ('test_data/CVDel_test.json', '16_89217282_89506042_DEL') ]) def test_find_coordinates(pp_path, expected, pp_vc_finder): fname = resource_filename(__name__, pp_path) gi = read_genomic_interpretation_json(fname) - assert pp_vc_finder.find_coordinates(gi).as_string() == expected + vc, gt = pp_vc_finder.find_coordinates(gi) + + assert expected == vc.variant_key def read_genomic_interpretation_json(fpath: str) -> GenomicInterpretation: @@ -102,7 +104,7 @@ def caching_annotator(variant_annotator, tmp_path): def test_caching_full_circle(caching_annotator, pp_vc_finder, variant_annotator): fname = resource_filename(__name__, 'test_data/missense_test.json') gi = read_genomic_interpretation_json(fname) - var_coords = pp_vc_finder.find_coordinates(gi) + var_coords, gt = pp_vc_finder.find_coordinates(gi) var_anno_results = variant_annotator.annotate(var_coords) cache_anno_results = caching_annotator.annotate(var_coords) assert var_anno_results == cache_anno_results @@ -119,7 +121,7 @@ def oldfile_cache_annotator(variant_annotator): def test_cache_from_older_file(oldfile_cache_annotator, pp_vc_finder, variant_annotator): fname = resource_filename(__name__, 'test_data/missense_test.json') gi = read_genomic_interpretation_json(fname) - var_coords = pp_vc_finder.find_coordinates(gi) + var_coords, gt = pp_vc_finder.find_coordinates(gi) var_anno_results = variant_annotator.annotate(var_coords) cached_file_results = oldfile_cache_annotator.annotate(var_coords) assert var_anno_results == cached_file_results diff --git a/src/genophenocorr/preprocessing/_variant.py b/src/genophenocorr/preprocessing/_variant.py index fcfa65743..a5c6400ec 100644 --- a/src/genophenocorr/preprocessing/_variant.py +++ b/src/genophenocorr/preprocessing/_variant.py @@ -7,8 +7,7 @@ import requests -from genophenocorr.model import VariantCoordinates, TranscriptAnnotation, Variant, TranscriptInfoAware -from genophenocorr.model.genome import TranscriptCoordinates +from genophenocorr.model import VariantCoordinates, TranscriptAnnotation, Variant, TranscriptInfoAware, TranscriptCoordinates from ._api import FunctionalAnnotator, ProteinMetadataService, TranscriptCoordinateService @@ -51,10 +50,10 @@ def verify_start_end_coordinates(vc: VariantCoordinates): class VepFunctionalAnnotator(FunctionalAnnotator): - """A class that creates a Variant object using variant coordinates and Variant Effect Predictor (VEP) REST API + """A class that peforms functional annotation of variant coordinates using Variant Effect Predictor (VEP) REST API. Methods: - annotate(variant_coordinates: VariantCoordinates): Creates a Variant object from VariantCoordinates + annotate(variant_coordinates: VariantCoordinates): the variant to annotate. """ def __init__(self, protein_annotator: ProteinMetadataService): @@ -68,19 +67,17 @@ def __init__(self, protein_annotator: ProteinMetadataService): self._url = 'https://rest.ensembl.org/vep/human/region/%s?LoF=1&canonical=1&domains=1&hgvs=1' \ '&mutfunc=1&numbers=1&protein=1&refseq=1&mane=1&transcript_version=1&variant_class=1' - def annotate(self, variant_coordinates: VariantCoordinates) -> Variant: - """Creates a Variant object by searching variant coordinates with Variant Effect Predictor (VEP) REST API. + def annotate(self, variant_coordinates: VariantCoordinates) -> typing.Sequence[TranscriptAnnotation]: + """Perform functional annotation using Variant Effect Predictor (VEP) REST API. Args: variant_coordinates (VariantCoordinates): A VariantCoordinates object Returns: - Variant: A Variant object + typing.Sequence[TranscriptAnnotation]: A sequence of transcript annotations for the variant coordinates """ variant = self._query_vep(variant_coordinates) - variant_id = variant.get('id') - variant_class = variant.get('variant_class') # canon_tx = None - transcript_list = [] + annotations = [] for trans in variant.get('transcript_consequences'): trans_id = trans.get('transcript_id') if not trans_id.startswith('NM'): @@ -108,7 +105,7 @@ def annotate(self, variant_coordinates: VariantCoordinates) -> Variant: if len(exons_effected) == 2: exons_effected = range(int(exons_effected[0]), int(exons_effected[1]) + 1) exons_effected = [int(x) for x in exons_effected] - transcript_list.append( + annotations.append( TranscriptAnnotation(gene_name, trans_id, hgvsc_id, @@ -119,8 +116,8 @@ def annotate(self, variant_coordinates: VariantCoordinates) -> Variant: protein_effect_start, protein_effect_end) ) - return Variant(variant_id, variant_class, variant_coordinates, transcript_list, - variant_coordinates.genotype) + + return annotations def _query_vep(self, variant_coordinates) -> dict: api_url = self._url % (verify_start_end_coordinates(variant_coordinates)) @@ -156,7 +153,7 @@ def __init__(self, datadir: str): raise ValueError(f'datadir {datadir} must be an existing directory') self._datadir = datadir - def get_annotations(self, variant_coordinates: VariantCoordinates) -> typing.Optional[Variant]: + def get_annotations(self, variant_coordinates: VariantCoordinates) -> typing.Optional[typing.Sequence[TranscriptAnnotation]]: """Searches a given data directory for a pickle file with given variant coordinates and returns Variant from file. Returns None if no file is found. Args: @@ -169,27 +166,29 @@ def get_annotations(self, variant_coordinates: VariantCoordinates) -> typing.Opt else: return None - def store_annotations(self, variant_coordinates: VariantCoordinates, annotation: Variant): + def store_annotations(self, variant_coordinates: VariantCoordinates, annotations: typing.Sequence[TranscriptAnnotation]): """Creates a pickle file with the given variant coordinates in the file name. Loads the Variant object given into the file for storage. Args: variant_coordinates (VariantCoordinates): The variant_coordinates associated with the desired Variant - annotation (Variant): A Variant object that will be stored under the given variant coordinates + annotations (typing.Sequence[TranscriptAnnotation]): Annotations that will be stored under the given variant coordinates """ fpath = self._create_file_name(variant_coordinates) with open(fpath, 'wb') as f: - pickle.dump(annotation, f) + pickle.dump(annotations, f) - def _create_file_name(self, variant_coordinates: VariantCoordinates) -> str: + def _create_file_name(self, vc: VariantCoordinates) -> str: """Creates a file name with full location and the variant coordinates (e.g. "/path/to/desired/directory/1_2345_G_C_heterozygous.pickle") Args: - variant_coordinates (VariantCoordinates): The variant coordinates associated with the Variant + vc (VariantCoordinates): The variant coordinates associated with the Variant """ - if len(variant_coordinates.as_string()) <= 50: - fname = f'{variant_coordinates.as_string()}.pickle' + vk = vc.variant_key + if len(vk) <= 50: + fname = f'{vk}.pickle' else: - fname = f'{variant_coordinates.chrom}_{variant_coordinates.start}_{variant_coordinates.end}_{variant_coordinates.genotype}.pickle' + # long INDELs in sequence notation + fname = f'{vc.chrom}_{vc.start}_{vc.end}_{vc.variant_class}.pickle' return os.path.join(self._datadir, fname) @@ -210,7 +209,7 @@ def __init__(self, cache: VariantAnnotationCache, fallback: FunctionalAnnotator) self._cache = hpotk.util.validate_instance(cache, VariantAnnotationCache, 'cache') self._fallback = hpotk.util.validate_instance(fallback, FunctionalAnnotator, 'fallback') - def annotate(self, variant_coordinates: VariantCoordinates) -> Variant: + def annotate(self, variant_coordinates: VariantCoordinates) -> typing.Sequence[TranscriptAnnotation]: """Gets Variant for given variant coordinates Args: diff --git a/src/genophenocorr/view/__init__.py b/src/genophenocorr/view/__init__.py index b8bc74954..e456a6616 100644 --- a/src/genophenocorr/view/__init__.py +++ b/src/genophenocorr/view/__init__.py @@ -1,7 +1,7 @@ from ._cohort import CohortViewer -from ._tx import VariantTranscriptArtist +from ._txp import VariantTranscriptProteinArtist __all__ = [ 'CohortViewer', - 'VariantTranscriptArtist' + 'VariantTranscriptProteinArtist' ] diff --git a/src/genophenocorr/view/_tx.py b/src/genophenocorr/view/_tx.py deleted file mode 100644 index afe053370..000000000 --- a/src/genophenocorr/view/_tx.py +++ /dev/null @@ -1,14 +0,0 @@ -import typing - -from genophenocorr.model import Variant -from genophenocorr.model.genome import TranscriptCoordinates - - -class VariantTranscriptArtist: - """ - `VariantTranscriptArtist` creates a graphic to show distributions of variants across a provided transcript. - """ - - def draw_variants(self, tx: TranscriptCoordinates, variants: typing.Sequence[Variant]): - # TODO - implement - pass diff --git a/src/genophenocorr/view/_txp.py b/src/genophenocorr/view/_txp.py new file mode 100644 index 000000000..dae2e7718 --- /dev/null +++ b/src/genophenocorr/view/_txp.py @@ -0,0 +1,16 @@ +import typing + +from genophenocorr.model import Variant, TranscriptCoordinates, ProteinMetadata + + +class VariantTranscriptProteinArtist: + """ + `VariantTranscriptProteinArtist` creates a graphic to show distributions of variants across a provided transcript + and its protein product. + """ + + def draw_variants(self, variants: typing.Sequence[Variant], + tx: TranscriptCoordinates, + protein: ProteinMetadata): + # TODO - Peter implement + pass diff --git a/tests/fixtures.py b/tests/fixtures.py index fa584e7c6..f1ca20624 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -32,68 +32,76 @@ def toy_cohort() -> Cohort: ProteinFeature.create(feature_type=FeatureType.REGION, info=FeatureInfo('Disordered', Region(start=2131, end=2406))), ProteinFeature.create(feature_type=FeatureType.REGION, info=FeatureInfo('Important for protein degradation', Region(start=2369, end=2663))))) - HetSingleVar = [Variant('16_89279851_-/C', 'insertion', - VariantCoordinates(make_region("16", 89279849, 89279851), - ref='G', alt='GC', change_length=1, genotype='heterozygous'), - [TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.6691dup', False, ['frameshift_variant'], - [9], [prot], 2231, 2231)],genotype='heterozygous')] - HetDoubleVar1 = [Variant('16_89284601_GG/A', 'indel', - VariantCoordinates(make_region("16", 89284600, 89284602), - ref='GG', alt='A', change_length=-1, genotype='heterozygous'), - [TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.1940_1941delinsT', False, ['frameshift_variant'], - [9], [prot], 647, 647)],genotype='heterozygous'), - Variant('16_89280752_G/T', 'SNV', - VariantCoordinates(make_region("16", 89280751, 89280752), - ref='G', alt='T', change_length=0, genotype='heterozygous'), - [TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.5790C>A', False, ['stop_gained'], - [9], [prot], 1930, 1930)],genotype='heterozygous')] - HetDoubleVar2 = [Variant('16_89275128_G/A', 'SNV', - VariantCoordinates(make_region("16", 89275127, 89275128), - ref='G', alt='A', change_length=0, genotype='heterozygous'), - [TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.7534C>T', False, ['missense_variant'], - [10], [prot], 2512, 2512)],genotype='heterozygous'), - Variant('16_89279708_AGTGTTCGGGGCGGGGCC/A', 'indel', - VariantCoordinates(make_region("16", 89279707, 89279725), - ref='AGTGTTCGGGGCGGGGCC', alt='A', change_length=-17, genotype='heterozygous'), - [TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.6817_6833del', False, ['frameshift_variant'], - [9], [prot], 2273, 2278)],genotype='heterozygous')] - HomoVar = [Variant('16_89279458_TG/T', 'indel', - VariantCoordinates(make_region("16", 89279457, 89279459), - ref='TG', alt='T', change_length=-1, genotype='homozygous'), - [TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.7083del', False, ['frameshift_variant'], - [9], [prot], 2361, 2362)],genotype='homozygous')] - LargeCNV = [Variant('16_89190071_deletion', 'deletion', - VariantCoordinates(make_region("16", 89190070, 89439815), - ref='N', alt='', change_length=4, genotype='heterozygous'), - [TranscriptAnnotation('ANKRD11', 'NM_013275.6', None, False, ['stop_lost', 'feature_truncation', 'coding_sequence_variant', '5_prime_UTR_variant', '3_prime_UTR_variant', 'intron_variant'], - [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], [prot], None, None)],genotype='heterozygous')] - phenos = get_test_phenotypes() + dup = Variant(VariantCoordinates(make_region("16", 89279849, 89279850), ref='G', alt='GC', change_length=1), + [ + TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.6691dup', False, ['frameshift_variant'], [9], + [prot], 2231, 2231) + ], + Genotypes.from_mapping({'HetSingleVar': Genotype.HETEROZYGOUS})) + indel = Variant(VariantCoordinates(make_region("16", 89284600, 89284602), ref='GG', alt='A', change_length=-1), + [ + TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.1940_1941delinsT', False, ['frameshift_variant'], + [9], [prot], 647, 647) + ], + Genotypes.from_mapping({'HetDoubleVar1': Genotype.HETEROZYGOUS})) + snv_stop_gain = Variant(VariantCoordinates(make_region("16", 89280751, 89280752), ref='G', alt='T', change_length=0), + [ + TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.5790C>A', False, ['stop_gained'], [9], [prot], + 1930, 1930)], + Genotypes.from_mapping({'HetDoubleVar1': Genotype.HETEROZYGOUS})) + snv_missense = Variant(VariantCoordinates(make_region("16", 89275127, 89275128), ref='G', alt='A', change_length=0), + [ + TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.7534C>T', False, ['missense_variant'], [10], + [prot], 2512, 2512) + ], + Genotypes.from_mapping({'HetDoubleVar2': Genotype.HETEROZYGOUS})) + del_frameshift = Variant(VariantCoordinates(make_region("16", 89279707, 89279725), ref='AGTGTTCGGGGCGGGGCC', alt='A', change_length=-17), + [ + TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.6817_6833del', False, ['frameshift_variant'], + [9], [prot], 2273, 2278) + ], + Genotypes.from_mapping({'HetDoubleVar2': Genotype.HETEROZYGOUS})) + del_small = Variant(VariantCoordinates(make_region("16", 89279457, 89279459), ref='TG', alt='T', change_length=-1), + [ + TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.7083del', False, ['frameshift_variant'], [9], + [prot], 2361, 2362) + ], + Genotypes.from_mapping({'HomoVar': Genotype.HOMOZYGOUS_ALTERNATE})) + del_large = Variant(VariantCoordinates(make_region("16", 89_190_070, 89_439_815), ref='N', alt='', change_length=-249_745), + [ + TranscriptAnnotation('ANKRD11', 'NM_013275.6', None, False, + ['stop_lost', 'feature_truncation', 'coding_sequence_variant', '5_prime_UTR_variant', + '3_prime_UTR_variant', 'intron_variant'], [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], + [prot], None, None) + ], + Genotypes.from_mapping({'LargeCNV': Genotype.HETEROZYGOUS})) + patients = ( Patient('HetSingleVar', phenotypes=(phenos['arachnodactyly_T'], phenos['spasticity_F'], phenos['focal_clonic_seizure_T']), - variants=HetSingleVar, + variants=(dup,), proteins=[prot] ), Patient('HetDoubleVar1', phenotypes=(phenos['arachnodactyly_T'], phenos['seizure_T'], phenos['spasticity_T']), - variants=HetDoubleVar1, + variants=(indel, snv_stop_gain), proteins=[prot] ), Patient('HetDoubleVar2', phenotypes=(phenos['arachnodactyly_F'], phenos['spasticity_T'], phenos['seizure_T']), - variants=HetDoubleVar2, + variants=(snv_missense, del_frameshift), proteins=[prot] ), Patient('HomoVar', phenotypes=(phenos['arachnodactyly_T'], phenos['spasticity_T'], phenos['seizure_T']), - variants=HomoVar, + variants=(del_small,), proteins=[prot] ), Patient('LargeCNV', phenotypes=(phenos['arachnodactyly_T'], phenos['spasticity_T'], phenos['seizure_F']), - variants=LargeCNV, + variants=(del_large,), proteins=[prot] ), ) diff --git a/tests/genome_data.py b/tests/genome_data.py new file mode 100644 index 000000000..0e2bb1692 --- /dev/null +++ b/tests/genome_data.py @@ -0,0 +1,91 @@ +import typing + +import pytest + +from genophenocorr import VariantEffect +from genophenocorr.model import TranscriptCoordinates, ProteinMetadata, Variant, VariantCoordinates, TranscriptAnnotation, Genotypes +from genophenocorr.model.genome import Contig, GenomicRegion, Strand + + +@pytest.fixture +def toy_contig() -> Contig: + return Contig('1', 'GB_ACC', 'REFSEQ_ACC', 'USCD_ACC', 2_000) + + +@pytest.fixture +def toy_tx_positive(toy_contig: Contig) -> TranscriptCoordinates: + return TranscriptCoordinates('fake-tx-pos', + GenomicRegion(toy_contig, 200, 800, Strand.POSITIVE), + ( + GenomicRegion(toy_contig, 200, 250, Strand.POSITIVE), + GenomicRegion(toy_contig, 300, 340, Strand.POSITIVE), + GenomicRegion(toy_contig, 510, 540, Strand.POSITIVE), + GenomicRegion(toy_contig, 650, 800, Strand.POSITIVE) + ), + 310, 740) + + +@pytest.fixture +def toy_protein_positive() -> ProteinMetadata: + return ProteinMetadata('fake-prot-pos', 'FAKE PROTEIN ON POSITIVE STRAND', ()) + + +@pytest.fixture +def toy_tx_negative(toy_contig: Contig) -> TranscriptCoordinates: + return TranscriptCoordinates('fake-tx-neg', + GenomicRegion(toy_contig, 100, 600, Strand.NEGATIVE), + ( + GenomicRegion(toy_contig, 100, 180, Strand.NEGATIVE), + GenomicRegion(toy_contig, 210, 270, Strand.NEGATIVE), + GenomicRegion(toy_contig, 400, 450, Strand.NEGATIVE), + GenomicRegion(toy_contig, 550, 600, Strand.NEGATIVE) + ), + 130, 430) + + +@pytest.fixture +def toy_protein_negative() -> ProteinMetadata: + return ProteinMetadata('fake-prot-neg', 'FAKE PROTEIN ON NEGATIVE STRAND', ()) + + +@pytest.fixture +def toy_variants(toy_contig: Contig) -> typing.Sequence[Variant]: + return ( + Variant( + VariantCoordinates(GenomicRegion(toy_contig, 325, 326, Strand.POSITIVE),'C', 'T', 0), + ( + TranscriptAnnotation('some-gene', 'fake-tx-pos', 'fake-tx-pos-hgvsc:v1', + True, (VariantEffect.MISSENSE_VARIANT,), + (2,), + (), None, None), + ), Genotypes.empty() + ), + Variant(VariantCoordinates(GenomicRegion(toy_contig, 530, 531, Strand.POSITIVE), + 'C', 'CCT', 2), + ( + TranscriptAnnotation('some-gene', 'fake-tx-pos', 'fake-tx-pos-hgvsc:v2', + True, (VariantEffect.FRAMESHIFT_VARIANT,), + (3,), + (), None, None), + ), Genotypes.empty() + ), + Variant(VariantCoordinates(GenomicRegion(toy_contig, 160, 161, Strand.NEGATIVE).with_strand(Strand.POSITIVE), + 'G', 'A', 0), + ( + TranscriptAnnotation('other-gene', 'fake-tx-neg', 'fake-tx-neg-hgvsc:v3', + True, (VariantEffect.SYNONYMOUS_VARIANT,), + (1,), + (), None, None), + ), Genotypes.empty() + ), + Variant(VariantCoordinates(GenomicRegion(toy_contig, 570, 574, Strand.NEGATIVE).with_strand(Strand.POSITIVE), + 'CTGA', 'C', -3), + ( + TranscriptAnnotation('other-gene', 'fake-tx-neg', 'fake-tx-neg-hgvsc:v4', + True, (VariantEffect.THREE_PRIME_UTR_VARIANT,), + (4,), + (), None, None), + ), Genotypes.empty() + ) + + ) diff --git a/tests/test_predicates.py b/tests/test_predicates.py index 08f651f0c..6c2303874 100644 --- a/tests/test_predicates.py +++ b/tests/test_predicates.py @@ -66,13 +66,17 @@ def test_VariantEffectPredicate(patient_id: str, @pytest.mark.parametrize('patient_id, variant, hasVarResult', - (['HetSingleVar', '16_89279851_-/C', HETEROZYGOUS], - ['HetSingleVar', '16_89279708_AGTGTTCGGGGCGGGGCC/A', NO_VARIANT], - ['HetDoubleVar1', '16_89284601_GG/A', HETEROZYGOUS], - ['HetDoubleVar1', '16_89280752_G/T', HETEROZYGOUS], - ['HomoVar', '16_89280752_G/T', NO_VARIANT], - ['HomoVar', '16_89279458_TG/T', HOMOZYGOUS], - ['LargeCNV', '16_89190071_deletion', HETEROZYGOUS])) + (['HetSingleVar', '16_89279850_89279850_G_GC', HETEROZYGOUS], + # the `HetSingleVar` patient does NOT have the variant. + ['HetSingleVar', '16_89279708_89279725_AGTGTTCGGGGCGGGGCC_A', NO_VARIANT], + # but `HetDoubleVar2` below DOES have the variant. + ['HetDoubleVar2', '16_89279708_89279725_AGTGTTCGGGGCGGGGCC_A', HETEROZYGOUS], + ['HetDoubleVar1', '16_89284601_89284602_GG_A', HETEROZYGOUS], + ['HetDoubleVar1', '16_89280752_89280752_G_T', HETEROZYGOUS], + # the `HomoVar` patient does NOT have the variant + ['HomoVar', '16_89280752_89280752_G_T', NO_VARIANT], + ['HomoVar', '16_89279458_89279459_TG_T', HOMOZYGOUS], + ['LargeCNV', '16_89190071_89439815_DEL', HETEROZYGOUS])) def test_VariantPredicate(patient_id, variant, hasVarResult, toy_cohort): predicate = VariantPredicate('NM_013275.6') patient = find_patient(patient_id, toy_cohort) diff --git a/tests/test_view.py b/tests/test_view.py new file mode 100644 index 000000000..86f4b0926 --- /dev/null +++ b/tests/test_view.py @@ -0,0 +1,20 @@ +from genophenocorr.view import VariantTranscriptProteinArtist + + +from .genome_data import * + + +def test_draw_variants_on_transcript_pos(toy_variants: typing.Sequence[Variant], + toy_tx_positive: TranscriptCoordinates, + toy_protein_positive: ProteinMetadata): + artist = VariantTranscriptProteinArtist() + output = artist.draw_variants(toy_variants, toy_tx_positive, toy_protein_positive) + print(output) + + +def test_draw_variants_on_transcript_neg(toy_variants: typing.Sequence[Variant], + toy_tx_negative: TranscriptCoordinates, + toy_protein_negative: ProteinMetadata): + artist = VariantTranscriptProteinArtist() + output = artist.draw_variants(toy_variants, toy_tx_negative, toy_protein_negative) + print(output)