diff --git a/docs/installation.rst b/docs/installation.rst index 116dc747..8f8d76ef 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -32,3 +32,22 @@ The *latest* release can be installed by cloning the GitHub repository:: The code above will clone the source code from GitHub repository, switch to the `develop` branch with the latest features, and install the library into the current Python (virtual) environment. + +Run tests +^^^^^^^^^ + +Running tests can be done as an optional step after installation. However, some additional +libraries are required to run tests, hence we must do one more install, this time with `test` option enabled:: + + python3 -m pip install .[test] + +Then, running the tests is as simple as:: + + pytest + +This will run the unit and integration tests that do *not* require internet access. To run the "online" tests, +we add the ``--runonline`` option to the command line invocation:: + + pytest --runonline + +That's all about testing! diff --git a/src/genophenocorr/preprocessing/__init__.py b/src/genophenocorr/preprocessing/__init__.py index dab7ec5d..6770ef6a 100644 --- a/src/genophenocorr/preprocessing/__init__.py +++ b/src/genophenocorr/preprocessing/__init__.py @@ -1,7 +1,7 @@ from ._api import VariantCoordinateFinder, FunctionalAnnotator, ProteinMetadataService -from ._config import configure_caching_patient_creator +from ._config import configure_caching_patient_creator, configure_patient_creator from ._patient import PatientCreator -from ._phenopacket import PhenopacketVariantCoordinateFinder, PhenopacketPatientCreator, load_phenopacket_folder +from ._phenopacket import PhenopacketVariantCoordinateFinder, PhenopacketPatientCreator, load_phenopacket_folder, load_phenopacket from ._phenotype import PhenotypeCreator, PhenotypeValidationException from ._protein import ProteinAnnotationCache, ProtCachingFunctionalAnnotator from ._uniprot import UniprotProteinMetadataService @@ -11,10 +11,10 @@ __all__ = [ 
'VariantCoordinateFinder', 'FunctionalAnnotator', 'ProteinMetadataService', 'PatientCreator', - 'PhenopacketVariantCoordinateFinder', 'PhenopacketPatientCreator', 'load_phenopacket_folder', + 'PhenopacketVariantCoordinateFinder', 'PhenopacketPatientCreator', 'load_phenopacket_folder', 'load_phenopacket', 'PhenotypeCreator', 'PhenotypeValidationException', 'ProteinAnnotationCache', 'ProtCachingFunctionalAnnotator', 'UniprotProteinMetadataService', 'VepFunctionalAnnotator', 'VariantAnnotationCache', 'VarCachingFunctionalAnnotator', - 'configure_caching_patient_creator' + 'configure_caching_patient_creator', 'configure_patient_creator' ] diff --git a/src/genophenocorr/preprocessing/_config.py b/src/genophenocorr/preprocessing/_config.py index 628c9e7f..1c90cd05 100644 --- a/src/genophenocorr/preprocessing/_config.py +++ b/src/genophenocorr/preprocessing/_config.py @@ -3,15 +3,16 @@ import hpotk -from genophenocorr.model.genome import GRCh37, GRCh38 +from genophenocorr.model.genome import GRCh37, GRCh38, GenomeBuild +from ._api import FunctionalAnnotator, ProteinMetadataService from ._phenotype import PhenotypeCreator from ._phenopacket import PhenopacketPatientCreator -from ._api import FunctionalAnnotator from ._protein import ProteinAnnotationCache, ProtCachingFunctionalAnnotator from ._uniprot import UniprotProteinMetadataService from ._variant import VarCachingFunctionalAnnotator, VariantAnnotationCache from ._vep import VepFunctionalAnnotator +from ._vv import VVHgvsVariantCoordinateFinder def configure_caching_patient_creator(hpo: hpotk.MinimalOntology, @@ -23,7 +24,7 @@ def configure_caching_patient_creator(hpo: hpotk.MinimalOntology, """ A convenience function for configuring a caching :class:`genophenocorr.preprocessing.PhenopacketPatientCreator`. - To create the patient creator, we need hpo-toolkit's representation of HPO, the validator + To create the patient creator, we need hpo-toolkit's representation of HPO. Other options are optional. 
:param hpo: a HPO instance. :param genome_build: name of the genome build to use, choose from `{'GRCh37.p13', 'GRCh38.p13'}`. @@ -39,17 +40,50 @@ def configure_caching_patient_creator(hpo: hpotk.MinimalOntology, if cache_dir is None: cache_dir = os.path.join(os.getcwd(), '.genophenocorr_cache') + build = _configure_build(genome_build) + phenotype_creator = _setup_phenotype_creator(hpo, validation_runner) + functional_annotator = _configure_functional_annotator(cache_dir, variant_fallback, protein_fallback) + hgvs_annotator = VVHgvsVariantCoordinateFinder(build) + return PhenopacketPatientCreator(build, phenotype_creator, functional_annotator, hgvs_annotator) + + +def configure_patient_creator(hpo: hpotk.MinimalOntology, + genome_build: str = 'GRCh38.p13', + validation_runner: typing.Optional[hpotk.validate.ValidationRunner] = None, + variant_fallback: str = 'VEP', + protein_fallback: str = 'UNIPROT') -> PhenopacketPatientCreator: + """ + A convenience function for configuring a non-caching :class:`genophenocorr.preprocessing.PhenopacketPatientCreator`. + + To create the patient creator, we need hpo-toolkit's representation of HPO. Other options are optional. + In contrast to `configure_caching_patient_creator`, there is no `cache_dir` option + because the annotators are used directly, without setting up the annotation caches. + + :param hpo: a HPO instance. + :param genome_build: name of the genome build to use, choose from `{'GRCh37.p13', 'GRCh38.p13'}`. + :param validation_runner: an instance of the validation runner. + :param variant_fallback: the variant annotator to use (queried directly, no cache lookup is done). + Choose from ``{'VEP'}`` (just one fallback implementation is available at the moment). + :param protein_fallback: the protein metadata annotator to use (queried directly, no cache lookup is done). + Choose from ``{'UNIPROT'}`` (just one fallback implementation is available at the moment). 
+ """ + build = _configure_build(genome_build) + + phenotype_creator = _setup_phenotype_creator(hpo, validation_runner) + protein_metadata_service = _configure_fallback_protein_service(protein_fallback) + functional_annotator = _configure_fallback_functional(protein_metadata_service, variant_fallback) + hgvs_annotator = VVHgvsVariantCoordinateFinder(build) + return PhenopacketPatientCreator(build, phenotype_creator, functional_annotator, hgvs_annotator) + + +def _configure_build(genome_build: str) -> GenomeBuild: if genome_build == 'GRCh38.p13': - build = GRCh38 + return GRCh38 elif genome_build == 'GRCh37.p13': - build = GRCh37 + return GRCh37 else: raise ValueError(f'Unknown build {genome_build}. Choose from [\'GRCh37.p13\', \'GRCh38.p13\']') - phenotype_creator = _setup_phenotype_creator(hpo, validation_runner) - functional_annotator = _configure_functional_annotator(cache_dir, variant_fallback, protein_fallback) - return PhenopacketPatientCreator(build, phenotype_creator, functional_annotator) - def _setup_phenotype_creator(hpo: hpotk.MinimalOntology, validator: typing.Optional[hpotk.validate.ValidationRunner]) -> PhenotypeCreator: @@ -70,23 +104,17 @@ def _configure_functional_annotator(cache_dir: str, protein_fallback: str) -> FunctionalAnnotator: # (1) ProteinMetadataService # Setup fallback - if protein_fallback == 'UNIPROT': - fallback1 = UniprotProteinMetadataService() - else: - raise ValueError(f'Unknown protein fallback annotator type {protein_fallback}') + protein_fallback = _configure_fallback_protein_service(protein_fallback) # Setup protein metadata cache prot_cache_dir = os.path.join(cache_dir, 'protein_cache') os.makedirs(prot_cache_dir, exist_ok=True) prot_cache = ProteinAnnotationCache(prot_cache_dir) # Assemble the final protein metadata service - protein_metadata_service = ProtCachingFunctionalAnnotator(prot_cache, fallback1) + protein_metadata_service = ProtCachingFunctionalAnnotator(prot_cache, protein_fallback) # (2) FunctionalAnnotator # 
Setup fallback - if variant_fallback == 'VEP': - fallback = VepFunctionalAnnotator(protein_metadata_service) - else: - raise ValueError(f'Unknown variant fallback annotator type {variant_fallback}') + fallback = _configure_fallback_functional(protein_metadata_service, variant_fallback) # Setup variant cache var_cache_dir = os.path.join(cache_dir, 'variant_cache') @@ -97,3 +125,20 @@ def _configure_functional_annotator(cache_dir: str, return VarCachingFunctionalAnnotator(var_cache, fallback) +def _configure_fallback_protein_service(protein_fallback: str) -> ProteinMetadataService: + if protein_fallback == 'UNIPROT': + fallback1 = UniprotProteinMetadataService() + else: + raise ValueError(f'Unknown protein fallback annotator type {protein_fallback}') + return fallback1 + + +def _configure_fallback_functional(protein_metadata_service: ProteinMetadataService, + variant_fallback: str) -> FunctionalAnnotator: + if variant_fallback == 'VEP': + fallback = VepFunctionalAnnotator(protein_metadata_service) + else: + raise ValueError(f'Unknown variant fallback annotator type {variant_fallback}') + return fallback + + diff --git a/src/genophenocorr/preprocessing/_phenopacket.py b/src/genophenocorr/preprocessing/_phenopacket.py index d926d3d7..de6375a4 100644 --- a/src/genophenocorr/preprocessing/_phenopacket.py +++ b/src/genophenocorr/preprocessing/_phenopacket.py @@ -1,5 +1,4 @@ import logging -import re import os import typing @@ -20,15 +19,19 @@ class PhenopacketVariantCoordinateFinder(VariantCoordinateFinder[GenomicInterpretation]): - """A class that creates VariantCoordinates from a Phenopacket + """ + `PhenopacketVariantCoordinateFinder` figures out :class:`genophenocorr.model.VariantCoordinates` + and :class:`genophenocorr.model.Genotype` from `GenomicInterpretation` element of Phenopacket Schema. 
- Methods: - find_coordinates(item:GenomicInterpretation): Creates VariantCoordinates from the data in a given Phenopacket + :param build: genome build to use in `VariantCoordinates` + :param hgvs_coordinate_finder: the coordinate finder to use for parsing HGVS expressions """ - def __init__(self, build: GenomeBuild): - """Constructs all necessary attributes for a PhenopacketVariantCoordinateFinder object""" + def __init__(self, build: GenomeBuild, + hgvs_coordinate_finder: VariantCoordinateFinder[str]): self._logger = logging.getLogger(__name__) - self._build = build + self._build = hpotk.util.validate_instance(build, GenomeBuild, 'build') + self._hgvs_finder = hpotk.util.validate_instance(hgvs_coordinate_finder, VariantCoordinateFinder, + 'hgvs_coordinate_finder') def find_coordinates(self, item: GenomicInterpretation) -> typing.Tuple[VariantCoordinates, Genotype]: """Creates a VariantCoordinates object from the data in a given Phenopacket @@ -40,41 +43,78 @@ def find_coordinates(self, item: GenomicInterpretation) -> typing.Tuple[VariantC """ if not isinstance(item, GenomicInterpretation): raise ValueError(f"item must be a Phenopacket GenomicInterpretation but was type {type(item)}") + variant_descriptor = item.variant_interpretation.variation_descriptor - if len(variant_descriptor.vcf_record.chrom) == 0 and len( - variant_descriptor.variation.copy_number.allele.sequence_location.sequence_id) != 0: + + vc = None + if self._vcf_is_available(variant_descriptor.vcf_record): + # We have a VCF record. + contig = self._build.contig_by_name(variant_descriptor.vcf_record.chrom) + start = int(variant_descriptor.vcf_record.pos) - 1 + ref = variant_descriptor.vcf_record.ref + alt = variant_descriptor.vcf_record.alt + end = start + len(ref) + change_length = end - start + + region = GenomicRegion(contig, start, end, Strand.POSITIVE) + vc = VariantCoordinates(region, ref, alt, change_length) + elif self._cnv_is_available(variant_descriptor.variation): + # We have a CNV. 
+ variation = variant_descriptor.variation + seq_location = variation.copy_number.allele.sequence_location + refseq_contig_name = seq_location.sequence_id.split(':')[1] + contig = self._build.contig_by_name(refseq_contig_name) + + # Assuming SV coordinates are 1-based (VCF style), + # so we subtract 1 to transform to 0-based coordinate system + start = int(seq_location.sequence_interval.start_number.value) - 1 + end = int(seq_location.sequence_interval.end_number.value) ref = 'N' - start = int( - variant_descriptor.variation.copy_number.allele.sequence_location.sequence_interval.start_number.value) - end = int( - variant_descriptor.variation.copy_number.allele.sequence_location.sequence_interval.end_number.value) - number = int(variant_descriptor.variation.copy_number.number.value) + number = int(variation.copy_number.number.value) if number > 2: alt = '<DUP>' else: alt = '<DEL>' - refseq_contig_name = variant_descriptor.variation.copy_number.allele.sequence_location.sequence_id.split(':')[1] - contig = self._build.contig_by_name(refseq_contig_name) - elif len(variant_descriptor.vcf_record.chrom) != 0 and len( - variant_descriptor.variation.copy_number.allele.sequence_location.sequence_id) == 0: - ref = variant_descriptor.vcf_record.ref - alt = variant_descriptor.vcf_record.alt - start = int(variant_descriptor.vcf_record.pos) - 1 - end = int(variant_descriptor.vcf_record.pos) + abs(len(alt) - len(ref)) - contig = self._build.contig_by_name(variant_descriptor.vcf_record.chrom[3:]) - else: - raise ValueError('Expected a VCF record or a VRS CNV but did not find one') + change_length = end - start + + region = GenomicRegion(contig, start, end, Strand.POSITIVE) + vc = VariantCoordinates(region, ref, alt, change_length) + elif len(variant_descriptor.expressions) > 0: + # We have some expressions. Let's try to find the 1st expression with `hgvs.c` syntax. 
+ for expression in variant_descriptor.expressions: + if expression.syntax == 'hgvs.c': + vc = self._hgvs_finder.find_coordinates(expression.value) + break + + if vc is None: + raise ValueError('Expected a VCF record, a VRS CNV, or an expression with `hgvs.c` ' + 'but did not find one') + + # Last, parse genotype. genotype = variant_descriptor.allelic_state.label - - if any(field is None for field in (contig, ref, alt, genotype)): - raise ValueError(f'Cannot determine variant coordinate from genomic interpretation {item}') - region = GenomicRegion(contig, start, end, Strand.POSITIVE) - - vc = VariantCoordinates(region, ref, alt, len(alt) - len(ref)) gt = self._map_geno_genotype_label(genotype) return vc, gt + @staticmethod + def _vcf_is_available(vcf_record) -> bool: + """ + Check if we can parse data out of VCF record. + """ + return (vcf_record.genome_assembly != '' + and vcf_record.chrom != '' + and vcf_record.pos >= 0 + and vcf_record.ref != '' + and vcf_record.alt != '') + + @staticmethod + def _cnv_is_available(variation): + seq_location = variation.copy_number.allele.sequence_location + return (seq_location.sequence_id != '' + and seq_location.sequence_interval.start_number.value >= 0 + and seq_location.sequence_interval.end_number.value >= 0 + and variation.copy_number.number.value != '') + @staticmethod def _map_geno_genotype_label(genotype: str) -> Genotype: """ @@ -91,25 +131,17 @@ def _map_geno_genotype_label(genotype: str) -> Genotype: class PhenopacketPatientCreator(PatientCreator[Phenopacket]): - """A class that creates a Patient object - - Methods: - create_patient(item:Phenopacket): Creates a Patient from the data in a given Phenopacket + """ + `PhenopacketPatientCreator` transforms `Phenopacket` into :class:`genophenocorr.model.Patient`. 
""" def __init__(self, build: GenomeBuild, phenotype_creator: PhenotypeCreator, - var_func_ann: FunctionalAnnotator): - """Constructs all necessary attributes for a PhenopacketPatientCreator object - - Args: - build (GenomeBuild): A genome build to use to load variant coordinates. - phenotype_creator (PhenotypeCreator): A PhenotypeCreator object for Phenotype creation - var_func_ann (FunctionalAnnotator): A FunctionalAnnotator object for Variant creation - """ + var_func_ann: FunctionalAnnotator, + hgvs_coordinate_finder: VariantCoordinateFinder[str]): self._logger = logging.getLogger(__name__) # Violates DI, but it is specific to this class, so I'll leave it "as is". - self._coord_finder = PhenopacketVariantCoordinateFinder(build) + self._coord_finder = PhenopacketVariantCoordinateFinder(build, hgvs_coordinate_finder) self._phenotype_creator = hpotk.util.validate_instance(phenotype_creator, PhenotypeCreator, 'phenotype_creator') self._func_ann = hpotk.util.validate_instance(var_func_ann, FunctionalAnnotator, 'var_func_ann') @@ -224,11 +256,16 @@ def _load_phenopacket_dir(pp_dir: str) -> typing.Sequence[Phenopacket]: for patient_file in os.listdir(pp_dir): if patient_file.endswith('.json'): phenopacket_path = os.path.join(pp_dir, patient_file) - pp = _load_phenopacket(phenopacket_path) + pp = load_phenopacket(phenopacket_path) patients.append(pp) return patients -def _load_phenopacket(phenopacket_path: str) -> Phenopacket: +def load_phenopacket(phenopacket_path: str) -> Phenopacket: + """ + Load phenopacket JSON file. + + :param phenopacket_path: a `str` pointing to phenopacket JSON file. 
+ """ with open(phenopacket_path) as f: return Parse(f.read(), Phenopacket()) diff --git a/src/genophenocorr/preprocessing/_test_variant.py b/src/genophenocorr/preprocessing/_test_variant.py index 4682373c..253dc1f1 100644 --- a/src/genophenocorr/preprocessing/_test_variant.py +++ b/src/genophenocorr/preprocessing/_test_variant.py @@ -7,28 +7,42 @@ # pyright: reportGeneralTypeIssues=false from phenopackets import Phenopacket, GenomicInterpretation -from genophenocorr.model.genome import GRCh38 +from genophenocorr.model.genome import GenomeBuild, GRCh38 +from ._api import VariantCoordinateFinder from ._phenopacket import PhenopacketVariantCoordinateFinder from ._protein import ProteinAnnotationCache, ProtCachingFunctionalAnnotator from ._uniprot import UniprotProteinMetadataService from ._variant import VariantAnnotationCache, VarCachingFunctionalAnnotator from ._vep import VepFunctionalAnnotator +from ._vv import VVHgvsVariantCoordinateFinder @pytest.fixture -def pp_vc_finder() -> PhenopacketVariantCoordinateFinder: - return PhenopacketVariantCoordinateFinder(GRCh38) +def build() -> GenomeBuild: + return GRCh38 + + +@pytest.fixture +def hgvs_vc_finder(build: GenomeBuild) -> VariantCoordinateFinder: + return VVHgvsVariantCoordinateFinder(build) + + +@pytest.fixture +def pp_vc_finder(build: GenomeBuild, + hgvs_vc_finder: VariantCoordinateFinder) -> PhenopacketVariantCoordinateFinder: + return PhenopacketVariantCoordinateFinder(build, hgvs_vc_finder) @pytest.mark.parametrize("pp_path, expected", [('test_data/deletion_test.json', '16_89284129_89284134_CTTTTT_C'), - ('test_data/insertion_test.json', '16_89280829_89280830_C_CA'), + ('test_data/insertion_test.json', '16_89280829_89280829_C_CA'), ('test_data/missense_test.json', '16_89279135_89279135_G_C'), - ('test_data/duplication_test.json', '16_89279850_89279851_G_GC'), + ('test_data/missense_hgvs_test.json', '16_89279135_89279135_G_C'), + ('test_data/duplication_test.json', '16_89279850_89279850_G_GC'), 
('test_data/delinsert_test.json', '16_89284601_89284602_GG_A'), - ('test_data/CVDup_test.json', '16_89284524_89373231_DUP'), - ('test_data/CVDel_test.json', '16_89217282_89506042_DEL') + ('test_data/CVDup_test.json', '16_89284523_89373231_DUP'), + ('test_data/CVDel_test.json', '16_89217281_89506042_DEL') ]) def test_find_coordinates(pp_path, expected, pp_vc_finder): fname = resource_filename(__name__, pp_path) @@ -36,7 +50,7 @@ def test_find_coordinates(pp_path, expected, pp_vc_finder): vc, gt = pp_vc_finder.find_coordinates(gi) - assert expected == vc.variant_key + assert vc.variant_key == expected def read_genomic_interpretation_json(fpath: str) -> GenomicInterpretation: @@ -55,6 +69,7 @@ def caching_annotator(variant_annotator, tmp_path): vac = VariantAnnotationCache(tmp_path) return VarCachingFunctionalAnnotator(vac, variant_annotator) +@pytest.mark.online def test_caching_full_circle(caching_annotator, pp_vc_finder, variant_annotator): fname = resource_filename(__name__, 'test_data/missense_test.json') gi = read_genomic_interpretation_json(fname) diff --git a/src/genophenocorr/preprocessing/_test_vep.py b/src/genophenocorr/preprocessing/_test_vep.py index 580db5de..a0afdf39 100644 --- a/src/genophenocorr/preprocessing/_test_vep.py +++ b/src/genophenocorr/preprocessing/_test_vep.py @@ -59,6 +59,7 @@ def test_verify_start_end_coordinates(contig_name, start, end, ref, alt, chlen, assert out == expected +@pytest.mark.online # Online due to using real Uniprot service class TestVepFunctionalAnnotator: TEST_DATA_DIR = resource_filename(__name__, os.path.join('test_data', 'vep_response')) diff --git a/src/genophenocorr/preprocessing/_test_vv.py b/src/genophenocorr/preprocessing/_test_vv.py index c8fe7039..4ee9d292 100644 --- a/src/genophenocorr/preprocessing/_test_vv.py +++ b/src/genophenocorr/preprocessing/_test_vv.py @@ -90,6 +90,11 @@ def test_load_hgvs_SURF2_ins(self, coordinate_finder: VVHgvsVariantCoordinateFin assert vc.alt == 'TAGC' assert vc.change_length == 
3 + @pytest.mark.skip('Online tests are disabled by default') + def test_find_coordinates(self, coordinate_finder: VVHgvsVariantCoordinateFinder): + vc = coordinate_finder.find_coordinates('NM_013275.6:c.7407C>G') + print(vc) + def load_response_json(path: str): with open(path) as fh: diff --git a/src/genophenocorr/preprocessing/_vep.py b/src/genophenocorr/preprocessing/_vep.py index 2de04f41..51f29297 100644 --- a/src/genophenocorr/preprocessing/_vep.py +++ b/src/genophenocorr/preprocessing/_vep.py @@ -39,7 +39,6 @@ def verify_start_end_coordinates(vc: VariantCoordinates): if len(vc.ref) == 1 and len(vc.alt) != 1: # INS/DUP start = start + 1 # we must "trim" - end = end - 1 alt = vc.alt[1:] # 100 AC AGT # MNV @@ -67,8 +66,7 @@ def __init__(self, protein_annotator: ProteinMetadataService, '&transcript_version=1&variant_class=1' self._include_computational_txs = include_computational_txs - def annotate(self, variant_coordinates: VariantCoordinates) -> typing.Sequence[ - TranscriptAnnotation]: + def annotate(self, variant_coordinates: VariantCoordinates) -> typing.Sequence[TranscriptAnnotation]: """Perform functional annotation using Variant Effect Predictor (VEP) REST API. Args: diff --git a/src/genophenocorr/preprocessing/_vv.py b/src/genophenocorr/preprocessing/_vv.py index af582186..1c3cfbc3 100644 --- a/src/genophenocorr/preprocessing/_vv.py +++ b/src/genophenocorr/preprocessing/_vv.py @@ -5,45 +5,40 @@ import hpotk import requests -from genophenocorr.model import VariantCoordinates, Genotype +from genophenocorr.model import VariantCoordinates from genophenocorr.model.genome import GenomeBuild, GenomicRegion, Strand from ._api import VariantCoordinateFinder -class VVHgvsVariantCoordinateFinder(VariantCoordinateFinder[typing.Tuple[str, str]]): +class VVHgvsVariantCoordinateFinder(VariantCoordinateFinder[str]): """ `VVHgvsVariantCoordinateFinder` uses Variant Validator's REST API to build :class:`VariantCoordinates` from an HGVS string. 
+ The finder takes an HGVS `str` (e.g. `NM_005912.3:c.253A>G`) and extracts the variant coordinates from the response. - * HGVS `str` (e.g. `NM_005912.3:c.253A>G`) - * genotype `str` (e.g. `heterozygous`) - - and extracts the variant coordinates from the response. - - :param genome_build: the genome build to use to construct :class:`VariantCoordinates`. - :param timeout: the REST API request timeout (default: 10). + :param genome_build: the genome build to use to construct :class:`VariantCoordinates` + :param timeout: the REST API request timeout in seconds (default: 30) """ - def __init__(self, genome_build: GenomeBuild, timeout: int = 10): + def __init__(self, genome_build: GenomeBuild, timeout: int = 30): self._build = hpotk.util.validate_instance(genome_build, GenomeBuild, 'genome_build') self._timeout = timeout self._url = 'https://rest.variantvalidator.org/VariantValidator/variantvalidator/%s/%s/%s' self._headers = {'Content-type': 'application/json'} self._hgvs_pattern = re.compile(r'^(?P<tx>NM_\d+\.\d+):c.\d+(_\d+)?.*') - def find_coordinates(self, item: typing.Tuple[str, str]) -> typing.Tuple[VariantCoordinates, Genotype]: + def find_coordinates(self, item: str) -> VariantCoordinates: """ Extracts variant coordinates from an HGVS string using Variant Validator's REST API. 
- :param item: Tuple of hgvs string and genotype string - :return: variant coordinates and genotype + + :param item: a hgvs string + :return: variant coordinates """ - hgvs, genotype = item - matcher = self._hgvs_pattern.match(hgvs) + matcher = self._hgvs_pattern.match(item) if matcher: transcript = matcher.group('tx') - request_url = self._url % (self._build.genome_build_id.major_assembly, hgvs, transcript) + request_url = self._url % (self._build.genome_build_id.major_assembly, item, transcript) try: response = requests.get(request_url, headers=self._headers, timeout=self._timeout) @@ -51,15 +46,14 @@ def find_coordinates(self, item: typing.Tuple[str, str]) -> typing.Tuple[Variant response = response.json() variant_coordinates = self._extract_variant_coordinates(response) except (requests.exceptions.RequestException, VariantValidatorDecodeException) as e: - raise ValueError(f'Error processing {hgvs}', e) + raise ValueError(f'Error processing {item}', e) else: # The HGVS did not pass validation by a regular expression. # Please submit an issue to our GitHub tracker if you believe # the HGVS expression is correct. 
- raise ValueError(f'Invalid HGVS string: {hgvs}') + raise ValueError(f'Invalid HGVS string: {item}') - genotype = Genotype[genotype.upper()] - return variant_coordinates, genotype + return variant_coordinates def _extract_variant_coordinates(self, response: typing.Dict) -> typing.Optional[VariantCoordinates]: """ diff --git a/src/genophenocorr/preprocessing/test_data/CVDel_test.json b/src/genophenocorr/preprocessing/test_data/CVDel_test.json index 13a44ad3..8597e9cb 100644 --- a/src/genophenocorr/preprocessing/test_data/CVDel_test.json +++ b/src/genophenocorr/preprocessing/test_data/CVDel_test.json @@ -1,41 +1,41 @@ - { - "subjectOrBiosampleId": "Gnazzo, 2020_P29", - "interpretationStatus": "CAUSATIVE", - "variantInterpretation": { - "variationDescriptor": { - "variation": { - "copyNumber": { - "allele": { - "sequenceLocation": { - "sequenceId": "refseq:NC_000016.10", - "sequenceInterval": { - "startNumber": { - "value": "89217281" - }, - "endNumber": { - "value": "89506042" - } - } - } - }, - "number": { - "value": "1" - } - } - }, - "description": "16q24.3(89217281_89506042)x1", - "geneContext": { - "valueId": "HGNC:21316", - "symbol": "ANKRD11" - }, - "structuralType": { - "id": "SO:1000029", - "label": "chromosomal_deletion" - }, - "allelicState": { - "id": "GENO:0000135", - "label": "heterozygous" - } - } +{ + "subjectOrBiosampleId": "Gnazzo, 2020_P29", + "interpretationStatus": "CAUSATIVE", + "variantInterpretation": { + "variationDescriptor": { + "variation": { + "copyNumber": { + "allele": { + "sequenceLocation": { + "sequenceId": "refseq:NC_000016.10", + "sequenceInterval": { + "startNumber": { + "value": "89217281" + }, + "endNumber": { + "value": "89506042" } + } } + }, + "number": { + "value": "1" + } + } + }, + "description": "16q24.3(89217281_89506042)x1", + "geneContext": { + "valueId": "HGNC:21316", + "symbol": "ANKRD11" + }, + "structuralType": { + "id": "SO:1000029", + "label": "chromosomal_deletion" + }, + "allelicState": { + "id": 
"GENO:0000135", + "label": "heterozygous" + } + } + } +} diff --git a/src/genophenocorr/preprocessing/test_data/CVDup_test.json b/src/genophenocorr/preprocessing/test_data/CVDup_test.json index 6f8718d5..1c47b0b0 100644 --- a/src/genophenocorr/preprocessing/test_data/CVDup_test.json +++ b/src/genophenocorr/preprocessing/test_data/CVDup_test.json @@ -1,42 +1,41 @@ - - { - "subjectOrBiosampleId": "Crippa2015_P3", - "interpretationStatus": "CAUSATIVE", - "variantInterpretation": { - "variationDescriptor": { - "variation": { - "copyNumber": { - "allele": { - "sequenceLocation": { - "sequenceId": "refseq:NC_000016.10", - "sequenceInterval": { - "startNumber": { - "value": "89284523" - }, - "endNumber": { - "value": "89373231" - } - } - } - }, - "number": { - "value": "3" - } - } - }, - "description": "16q24.3(89284523_89373231)x3", - "geneContext": { - "valueId": "HGNC:21316", - "symbol": "ANKRD11" - }, - "structuralType": { - "id": "SO:1000029", - "label": "chromosomal_deletion" - }, - "allelicState": { - "id": "GENO:0000135", - "label": "heterozygous" - } - } +{ + "subjectOrBiosampleId": "Crippa2015_P3", + "interpretationStatus": "CAUSATIVE", + "variantInterpretation": { + "variationDescriptor": { + "variation": { + "copyNumber": { + "allele": { + "sequenceLocation": { + "sequenceId": "refseq:NC_000016.10", + "sequenceInterval": { + "startNumber": { + "value": "89284523" + }, + "endNumber": { + "value": "89373231" } + } } + }, + "number": { + "value": "3" + } + } + }, + "description": "16q24.3(89284523_89373231)x3", + "geneContext": { + "valueId": "HGNC:21316", + "symbol": "ANKRD11" + }, + "structuralType": { + "id": "SO:1000029", + "label": "chromosomal_deletion" + }, + "allelicState": { + "id": "GENO:0000135", + "label": "heterozygous" + } + } + } +} diff --git a/src/genophenocorr/preprocessing/test_data/deletion_test.json b/src/genophenocorr/preprocessing/test_data/deletion_test.json index 290d21b5..d1aa2b4f 100644 --- 
a/src/genophenocorr/preprocessing/test_data/deletion_test.json +++ b/src/genophenocorr/preprocessing/test_data/deletion_test.json @@ -1,35 +1,34 @@ - - { - "subjectOrBiosampleId": "Bianchi, 2018", - "interpretationStatus": "CAUSATIVE", - "variantInterpretation": { - "variationDescriptor": { - "geneContext": { - "valueId": "HGNC:21316", - "symbol": "ANKRD11" - }, - "expressions": [ - { - "syntax": "hgvs.c", - "value": "NM_013275.6:c.2408_2412del" - }, - { - "syntax": "hgvs.g", - "value": "NC_000016.10:g.89284131_89284135del" - } - ], - "vcfRecord": { - "genomeAssembly": "hg38", - "chrom": "chr16", - "pos": "89284129", - "ref": "CTTTTT", - "alt": "C" - }, - "moleculeContext": "genomic", - "allelicState": { - "id": "GENO:0000135", - "label": "heterozygous" - } - } - } - } \ No newline at end of file +{ + "subjectOrBiosampleId": "Bianchi, 2018", + "interpretationStatus": "CAUSATIVE", + "variantInterpretation": { + "variationDescriptor": { + "geneContext": { + "valueId": "HGNC:21316", + "symbol": "ANKRD11" + }, + "expressions": [ + { + "syntax": "hgvs.c", + "value": "NM_013275.6:c.2408_2412del" + }, + { + "syntax": "hgvs.g", + "value": "NC_000016.10:g.89284131_89284135del" + } + ], + "vcfRecord": { + "genomeAssembly": "hg38", + "chrom": "chr16", + "pos": "89284129", + "ref": "CTTTTT", + "alt": "C" + }, + "moleculeContext": "genomic", + "allelicState": { + "id": "GENO:0000135", + "label": "heterozygous" + } + } + } +} diff --git a/src/genophenocorr/preprocessing/test_data/delinsert_test.json b/src/genophenocorr/preprocessing/test_data/delinsert_test.json index 50819252..c939724b 100644 --- a/src/genophenocorr/preprocessing/test_data/delinsert_test.json +++ b/src/genophenocorr/preprocessing/test_data/delinsert_test.json @@ -1,35 +1,34 @@ - - { - "subjectOrBiosampleId": "KBG7", - "interpretationStatus": "CAUSATIVE", - "variantInterpretation": { - "variationDescriptor": { - "geneContext": { - "valueId": "HGNC:21316", - "symbol": "ANKRD11" - }, - "expressions": [ - { - 
"syntax": "hgvs.c", - "value": "NM_013275.6:c.1940_1941delinsT" - }, - { - "syntax": "hgvs.g", - "value": "NC_000016.10:g.89284601_89284602delinsA" - } - ], - "vcfRecord": { - "genomeAssembly": "hg38", - "chrom": "chr16", - "pos": "89284601", - "ref": "GG", - "alt": "A" - }, - "moleculeContext": "genomic", - "allelicState": { - "id": "GENO:0000135", - "label": "heterozygous" - } - } - } - } \ No newline at end of file +{ + "subjectOrBiosampleId": "KBG7", + "interpretationStatus": "CAUSATIVE", + "variantInterpretation": { + "variationDescriptor": { + "geneContext": { + "valueId": "HGNC:21316", + "symbol": "ANKRD11" + }, + "expressions": [ + { + "syntax": "hgvs.c", + "value": "NM_013275.6:c.1940_1941delinsT" + }, + { + "syntax": "hgvs.g", + "value": "NC_000016.10:g.89284601_89284602delinsA" + } + ], + "vcfRecord": { + "genomeAssembly": "hg38", + "chrom": "chr16", + "pos": "89284601", + "ref": "GG", + "alt": "A" + }, + "moleculeContext": "genomic", + "allelicState": { + "id": "GENO:0000135", + "label": "heterozygous" + } + } + } +} diff --git a/src/genophenocorr/preprocessing/test_data/duplication_test.json b/src/genophenocorr/preprocessing/test_data/duplication_test.json index d3cbc56c..a5c05a9d 100644 --- a/src/genophenocorr/preprocessing/test_data/duplication_test.json +++ b/src/genophenocorr/preprocessing/test_data/duplication_test.json @@ -1,35 +1,34 @@ - - { - "subjectOrBiosampleId": "KBG6", - "interpretationStatus": "CAUSATIVE", - "variantInterpretation": { - "variationDescriptor": { - "geneContext": { - "valueId": "HGNC:21316", - "symbol": "ANKRD11" - }, - "expressions": [ - { - "syntax": "hgvs.c", - "value": "NM_013275.6:c.6691dup" - }, - { - "syntax": "hgvs.g", - "value": "NC_000016.10:g.89279853dup" - } - ], - "vcfRecord": { - "genomeAssembly": "hg38", - "chrom": "chr16", - "pos": "89279850", - "ref": "G", - "alt": "GC" - }, - "moleculeContext": "genomic", - "allelicState": { - "id": "GENO:0000135", - "label": "heterozygous" - } - } - } - } \ No newline at 
end of file +{ + "subjectOrBiosampleId": "KBG6", + "interpretationStatus": "CAUSATIVE", + "variantInterpretation": { + "variationDescriptor": { + "geneContext": { + "valueId": "HGNC:21316", + "symbol": "ANKRD11" + }, + "expressions": [ + { + "syntax": "hgvs.c", + "value": "NM_013275.6:c.6691dup" + }, + { + "syntax": "hgvs.g", + "value": "NC_000016.10:g.89279853dup" + } + ], + "vcfRecord": { + "genomeAssembly": "hg38", + "chrom": "chr16", + "pos": "89279850", + "ref": "G", + "alt": "GC" + }, + "moleculeContext": "genomic", + "allelicState": { + "id": "GENO:0000135", + "label": "heterozygous" + } + } + } +} diff --git a/src/genophenocorr/preprocessing/test_data/insertion_test.json b/src/genophenocorr/preprocessing/test_data/insertion_test.json index 10d355df..89f77d47 100644 --- a/src/genophenocorr/preprocessing/test_data/insertion_test.json +++ b/src/genophenocorr/preprocessing/test_data/insertion_test.json @@ -1,35 +1,34 @@ - - { - "subjectOrBiosampleId": "Gnazzo, 2020_P23", - "interpretationStatus": "CAUSATIVE", - "variantInterpretation": { - "variationDescriptor": { - "geneContext": { - "valueId": "HGNC:21316", - "symbol": "ANKRD11" - }, - "expressions": [ - { - "syntax": "hgvs.c", - "value": "NM_013275.6:c.5712_5713insT" - }, - { - "syntax": "hgvs.g", - "value": "NC_000016.10:g.89280829_89280830insA" - } - ], - "vcfRecord": { - "genomeAssembly": "hg38", - "chrom": "chr16", - "pos": "89280829", - "ref": "C", - "alt": "CA" - }, - "moleculeContext": "genomic", - "allelicState": { - "id": "GENO:0000135", - "label": "heterozygous" - } - } - } - } +{ + "subjectOrBiosampleId": "Gnazzo, 2020_P23", + "interpretationStatus": "CAUSATIVE", + "variantInterpretation": { + "variationDescriptor": { + "geneContext": { + "valueId": "HGNC:21316", + "symbol": "ANKRD11" + }, + "expressions": [ + { + "syntax": "hgvs.c", + "value": "NM_013275.6:c.5712_5713insT" + }, + { + "syntax": "hgvs.g", + "value": "NC_000016.10:g.89280829_89280830insA" + } + ], + "vcfRecord": { + 
"genomeAssembly": "hg38", + "chrom": "chr16", + "pos": "89280829", + "ref": "C", + "alt": "CA" + }, + "moleculeContext": "genomic", + "allelicState": { + "id": "GENO:0000135", + "label": "heterozygous" + } + } + } +} diff --git a/src/genophenocorr/preprocessing/test_data/missense_hgvs_test.json b/src/genophenocorr/preprocessing/test_data/missense_hgvs_test.json new file mode 100644 index 00000000..f9c38e4c --- /dev/null +++ b/src/genophenocorr/preprocessing/test_data/missense_hgvs_test.json @@ -0,0 +1,27 @@ +{ + "subjectOrBiosampleId": "KBG64", + "interpretationStatus": "CAUSATIVE", + "variantInterpretation": { + "variationDescriptor": { + "geneContext": { + "valueId": "HGNC:21316", + "symbol": "ANKRD11" + }, + "expressions": [ + { + "syntax": "hgvs.c", + "value": "NM_013275.6:c.7407C>G" + }, + { + "syntax": "hgvs.g", + "value": "NC_000016.10:g.89279135G>C" + } + ], + "moleculeContext": "genomic", + "allelicState": { + "id": "GENO:0000135", + "label": "heterozygous" + } + } + } +} diff --git a/src/genophenocorr/preprocessing/test_data/missense_test.json b/src/genophenocorr/preprocessing/test_data/missense_test.json index d11304d4..1c545153 100644 --- a/src/genophenocorr/preprocessing/test_data/missense_test.json +++ b/src/genophenocorr/preprocessing/test_data/missense_test.json @@ -1,35 +1,34 @@ - - { - "subjectOrBiosampleId": "KBG64", - "interpretationStatus": "CAUSATIVE", - "variantInterpretation": { - "variationDescriptor": { - "geneContext": { - "valueId": "HGNC:21316", - "symbol": "ANKRD11" - }, - "expressions": [ - { - "syntax": "hgvs.c", - "value": "NM_013275.6:c.7407C>G" - }, - { - "syntax": "hgvs.g", - "value": "NC_000016.10:g.89279135G>C" - } - ], - "vcfRecord": { - "genomeAssembly": "hg38", - "chrom": "chr16", - "pos": "89279135", - "ref": "G", - "alt": "C" - }, - "moleculeContext": "genomic", - "allelicState": { - "id": "GENO:0000135", - "label": "heterozygous" - } - } - } - } \ No newline at end of file +{ + "subjectOrBiosampleId": "KBG64", + 
"interpretationStatus": "CAUSATIVE", + "variantInterpretation": { + "variationDescriptor": { + "geneContext": { + "valueId": "HGNC:21316", + "symbol": "ANKRD11" + }, + "expressions": [ + { + "syntax": "hgvs.c", + "value": "NM_013275.6:c.7407C>G" + }, + { + "syntax": "hgvs.g", + "value": "NC_000016.10:g.89279135G>C" + } + ], + "vcfRecord": { + "genomeAssembly": "hg38", + "chrom": "chr16", + "pos": "89279135", + "ref": "G", + "alt": "C" + }, + "moleculeContext": "genomic", + "allelicState": { + "id": "GENO:0000135", + "label": "heterozygous" + } + } + } +} diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 00000000..9830291e --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,23 @@ +# content of conftest.py + +import pytest + + +def pytest_addoption(parser): + parser.addoption( + "--runonline", action="store_true", default=False, help="run online tests" + ) + + +def pytest_configure(config): + config.addinivalue_line("markers", "online: mark test that require internet access to run") + + +def pytest_collection_modifyitems(config, items): + if config.getoption("--runonline"): + # --runonline given in cli: do not skip online tests + return + skip_online = pytest.mark.skip(reason="need --runonline option to run") + for item in items: + if "online" in item.keywords: + item.add_marker(skip_online) diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py new file mode 100644 index 00000000..f1be4c66 --- /dev/null +++ b/tests/test_preprocessing.py @@ -0,0 +1,24 @@ +import hpotk +import pytest + + +from genophenocorr.preprocessing import PhenopacketPatientCreator +from genophenocorr.preprocessing import configure_patient_creator, load_phenopacket + + +class TestPhenopacketPatientCreator: + + @pytest.fixture + def hpo(self) -> hpotk.MinimalOntology: + return hpotk.load_minimal_ontology('testingDefaults/hp.json') + + @pytest.fixture + def phenopacket_patient_creator(self, hpo: hpotk.MinimalOntology) -> PhenopacketPatientCreator: + return 
configure_patient_creator(hpo) + + @pytest.mark.skip('Skipping online test') + def test_load_phenopacket(self, phenopacket_patient_creator: PhenopacketPatientCreator): + pp = load_phenopacket('../docs/data/simple_cohort/PMID_36446582_KBG12.json') + patient = phenopacket_patient_creator.create_patient(pp) + print(patient) +