Skip to content

Commit

Permalink
Merge pull request #94 from monarch-initiative/fix-change-length-bug
Browse files Browse the repository at this point in the history
Fix change length bug
  • Loading branch information
ielis authored Nov 1, 2023
2 parents 60328e5 + f86427c commit 343efa5
Show file tree
Hide file tree
Showing 19 changed files with 536 additions and 354 deletions.
19 changes: 19 additions & 0 deletions docs/installation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,22 @@ The *latest* release can be installed by cloning the GitHub repository::
The code above will clone the source code from GitHub repository, switch to the `develop` branch with the latest features,
and install the library into the current Python (virtual) environment.


Run tests
^^^^^^^^^

Running tests can be done as an optional step after installation. However, some additional
libraries are required to run tests, hence we must do one more install, this time with `test` option enabled::

python3 -m pip install .[test]

Then, running the tests is as simple as::

pytest

This will run the unit and integration tests that do *not* require internet access. To run the "online" tests,
we add ``--runonlin`` option to the command line invocation::

pytest --runonline

That's all about testing!
8 changes: 4 additions & 4 deletions src/genophenocorr/preprocessing/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from ._api import VariantCoordinateFinder, FunctionalAnnotator, ProteinMetadataService
from ._config import configure_caching_patient_creator
from ._config import configure_caching_patient_creator, configure_patient_creator
from ._patient import PatientCreator
from ._phenopacket import PhenopacketVariantCoordinateFinder, PhenopacketPatientCreator, load_phenopacket_folder
from ._phenopacket import PhenopacketVariantCoordinateFinder, PhenopacketPatientCreator, load_phenopacket_folder, load_phenopacket
from ._phenotype import PhenotypeCreator, PhenotypeValidationException
from ._protein import ProteinAnnotationCache, ProtCachingFunctionalAnnotator
from ._uniprot import UniprotProteinMetadataService
Expand All @@ -11,10 +11,10 @@
__all__ = [
'VariantCoordinateFinder', 'FunctionalAnnotator', 'ProteinMetadataService',
'PatientCreator',
'PhenopacketVariantCoordinateFinder', 'PhenopacketPatientCreator', 'load_phenopacket_folder',
'PhenopacketVariantCoordinateFinder', 'PhenopacketPatientCreator', 'load_phenopacket_folder', 'load_phenopacket',
'PhenotypeCreator', 'PhenotypeValidationException',
'ProteinAnnotationCache', 'ProtCachingFunctionalAnnotator',
'UniprotProteinMetadataService',
'VepFunctionalAnnotator', 'VariantAnnotationCache', 'VarCachingFunctionalAnnotator',
'configure_caching_patient_creator'
'configure_caching_patient_creator', 'configure_patient_creator'
]
81 changes: 63 additions & 18 deletions src/genophenocorr/preprocessing/_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,16 @@

import hpotk

from genophenocorr.model.genome import GRCh37, GRCh38
from genophenocorr.model.genome import GRCh37, GRCh38, GenomeBuild

from ._api import FunctionalAnnotator, ProteinMetadataService
from ._phenotype import PhenotypeCreator
from ._phenopacket import PhenopacketPatientCreator
from ._api import FunctionalAnnotator
from ._protein import ProteinAnnotationCache, ProtCachingFunctionalAnnotator
from ._uniprot import UniprotProteinMetadataService
from ._variant import VarCachingFunctionalAnnotator, VariantAnnotationCache
from ._vep import VepFunctionalAnnotator
from ._vv import VVHgvsVariantCoordinateFinder


def configure_caching_patient_creator(hpo: hpotk.MinimalOntology,
Expand All @@ -23,7 +24,7 @@ def configure_caching_patient_creator(hpo: hpotk.MinimalOntology,
"""
A convenience function for configuring a caching :class:`genophenocorr.preprocessing.PhenopacketPatientCreator`.
To create the patient creator, we need hpo-toolkit's representation of HPO, the validator
To create the patient creator, we need hpo-toolkit's representation of HPO. Other options are optional.
:param hpo: a HPO instance.
:param genome_build: name of the genome build to use, choose from `{'GRCh37.p13', 'GRCh38.p13'}`.
Expand All @@ -39,17 +40,50 @@ def configure_caching_patient_creator(hpo: hpotk.MinimalOntology,
if cache_dir is None:
cache_dir = os.path.join(os.getcwd(), '.genophenocorr_cache')

build = _configure_build(genome_build)
phenotype_creator = _setup_phenotype_creator(hpo, validation_runner)
functional_annotator = _configure_functional_annotator(cache_dir, variant_fallback, protein_fallback)
hgvs_annotator = VVHgvsVariantCoordinateFinder(build)
return PhenopacketPatientCreator(build, phenotype_creator, functional_annotator, hgvs_annotator)


def configure_patient_creator(hpo: hpotk.MinimalOntology,
genome_build: str = 'GRCh38.p13',
validation_runner: typing.Optional[hpotk.validate.ValidationRunner] = None,
variant_fallback: str = 'VEP',
protein_fallback: str = 'UNIPROT') -> PhenopacketPatientCreator:
"""
A convenience function for configuring a non-caching :class:`genophenocorr.preprocessing.PhenopacketPatientCreator`.
To create the patient creator, we need hpo-toolkit's representation of HPO. Other options are optional
:param hpo: a HPO instance.
:param genome_build: name of the genome build to use, choose from `{'GRCh37.p13', 'GRCh38.p13'}`.
:param validation_runner: an instance of the validation runner.
if the data should be cached in `.cache` folder in the current working directory.
In any case, the directory will be created if it does not exist (including non-existing parents).
:param variant_fallback: the fallback variant annotator to use if we cannot find the annotation locally.
Choose from ``{'VEP'}`` (just one fallback implementation is available at the moment).
:param protein_fallback: the fallback protein metadata annotator to use if we cannot find the annotation locally.
Choose from ``{'UNIPROT'}`` (just one fallback implementation is available at the moment).
"""
build = _configure_build(genome_build)

phenotype_creator = _setup_phenotype_creator(hpo, validation_runner)
protein_metadata_service = _configure_fallback_protein_service(protein_fallback)
functional_annotator = _configure_fallback_functional(protein_metadata_service, variant_fallback)
hgvs_annotator = VVHgvsVariantCoordinateFinder(build)
return PhenopacketPatientCreator(build, phenotype_creator, functional_annotator, hgvs_annotator)


def _configure_build(genome_build: str) -> GenomeBuild:
if genome_build == 'GRCh38.p13':
build = GRCh38
return GRCh38
elif genome_build == 'GRCh37.p13':
build = GRCh37
return GRCh37
else:
raise ValueError(f'Unknown build {genome_build}. Choose from [\'GRCh37.p13\', \'GRCh38.p13\']')

phenotype_creator = _setup_phenotype_creator(hpo, validation_runner)
functional_annotator = _configure_functional_annotator(cache_dir, variant_fallback, protein_fallback)
return PhenopacketPatientCreator(build, phenotype_creator, functional_annotator)


def _setup_phenotype_creator(hpo: hpotk.MinimalOntology,
validator: typing.Optional[hpotk.validate.ValidationRunner]) -> PhenotypeCreator:
Expand All @@ -70,23 +104,17 @@ def _configure_functional_annotator(cache_dir: str,
protein_fallback: str) -> FunctionalAnnotator:
# (1) ProteinMetadataService
# Setup fallback
if protein_fallback == 'UNIPROT':
fallback1 = UniprotProteinMetadataService()
else:
raise ValueError(f'Unknown protein fallback annotator type {protein_fallback}')
protein_fallback = _configure_fallback_protein_service(protein_fallback)
# Setup protein metadata cache
prot_cache_dir = os.path.join(cache_dir, 'protein_cache')
os.makedirs(prot_cache_dir, exist_ok=True)
prot_cache = ProteinAnnotationCache(prot_cache_dir)
# Assemble the final protein metadata service
protein_metadata_service = ProtCachingFunctionalAnnotator(prot_cache, fallback1)
protein_metadata_service = ProtCachingFunctionalAnnotator(prot_cache, protein_fallback)

# (2) FunctionalAnnotator
# Setup fallback
if variant_fallback == 'VEP':
fallback = VepFunctionalAnnotator(protein_metadata_service)
else:
raise ValueError(f'Unknown variant fallback annotator type {variant_fallback}')
fallback = _configure_fallback_functional(protein_metadata_service, variant_fallback)

# Setup variant cache
var_cache_dir = os.path.join(cache_dir, 'variant_cache')
Expand All @@ -97,3 +125,20 @@ def _configure_functional_annotator(cache_dir: str,
return VarCachingFunctionalAnnotator(var_cache, fallback)


def _configure_fallback_protein_service(protein_fallback: str) -> ProteinMetadataService:
if protein_fallback == 'UNIPROT':
fallback1 = UniprotProteinMetadataService()
else:
raise ValueError(f'Unknown protein fallback annotator type {protein_fallback}')
return fallback1


def _configure_fallback_functional(protein_metadata_service: ProteinMetadataService,
variant_fallback: str) -> FunctionalAnnotator:
if variant_fallback == 'VEP':
fallback = VepFunctionalAnnotator(protein_metadata_service)
else:
raise ValueError(f'Unknown variant fallback annotator type {variant_fallback}')
return fallback


129 changes: 83 additions & 46 deletions src/genophenocorr/preprocessing/_phenopacket.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import logging
import re
import os
import typing

Expand All @@ -20,15 +19,19 @@


class PhenopacketVariantCoordinateFinder(VariantCoordinateFinder[GenomicInterpretation]):
"""A class that creates VariantCoordinates from a Phenopacket
"""
`PhenopacketVariantCoordinateFinder` figures out :class:`genophenocorr.model.VariantCoordinates`
and :class:`genophenocorr.model.Genotype` from `GenomicInterpretation` element of Phenopacket Schema.
Methods:
find_coordinates(item:GenomicInterpretation): Creates VariantCoordinates from the data in a given Phenopacket
:param build: genome build to use in `VariantCoordinates
:param hgvs_coordinate_finder: the coordinate finder to use for parsing HGVS expressions
"""
def __init__(self, build: GenomeBuild):
"""Constructs all necessary attributes for a PhenopacketVariantCoordinateFinder object"""
def __init__(self, build: GenomeBuild,
hgvs_coordinate_finder: VariantCoordinateFinder[str]):
self._logger = logging.getLogger(__name__)
self._build = build
self._build = hpotk.util.validate_instance(build, GenomeBuild, 'build')
self._hgvs_finder = hpotk.util.validate_instance(hgvs_coordinate_finder, VariantCoordinateFinder,
'hgvs_coordinate_finder')

def find_coordinates(self, item: GenomicInterpretation) -> typing.Tuple[VariantCoordinates, Genotype]:
"""Creates a VariantCoordinates object from the data in a given Phenopacket
Expand All @@ -40,41 +43,78 @@ def find_coordinates(self, item: GenomicInterpretation) -> typing.Tuple[VariantC
"""
if not isinstance(item, GenomicInterpretation):
raise ValueError(f"item must be a Phenopacket GenomicInterpretation but was type {type(item)}")

variant_descriptor = item.variant_interpretation.variation_descriptor
if len(variant_descriptor.vcf_record.chrom) == 0 and len(
variant_descriptor.variation.copy_number.allele.sequence_location.sequence_id) != 0:

vc = None
if self._vcf_is_available(variant_descriptor.vcf_record):
# We have a VCF record.
contig = self._build.contig_by_name(variant_descriptor.vcf_record.chrom)
start = int(variant_descriptor.vcf_record.pos) - 1
ref = variant_descriptor.vcf_record.ref
alt = variant_descriptor.vcf_record.alt
end = start + len(ref)
change_length = end - start

region = GenomicRegion(contig, start, end, Strand.POSITIVE)
vc = VariantCoordinates(region, ref, alt, change_length)
elif self._cnv_is_available(variant_descriptor.variation):
# We have a CNV.
variation = variant_descriptor.variation
seq_location = variation.copy_number.allele.sequence_location
refseq_contig_name = seq_location.sequence_id.split(':')[1]
contig = self._build.contig_by_name(refseq_contig_name)

# Assuming SV coordinates are 1-based (VCF style),
# so we subtract 1 to transform to 0-based coordinate system
start = int(seq_location.sequence_interval.start_number.value) - 1
end = int(seq_location.sequence_interval.end_number.value)
ref = 'N'
start = int(
variant_descriptor.variation.copy_number.allele.sequence_location.sequence_interval.start_number.value)
end = int(
variant_descriptor.variation.copy_number.allele.sequence_location.sequence_interval.end_number.value)
number = int(variant_descriptor.variation.copy_number.number.value)
number = int(variation.copy_number.number.value)
if number > 2:
alt = '<DUP>'
else:
alt = '<DEL>'
refseq_contig_name = variant_descriptor.variation.copy_number.allele.sequence_location.sequence_id.split(':')[1]
contig = self._build.contig_by_name(refseq_contig_name)
elif len(variant_descriptor.vcf_record.chrom) != 0 and len(
variant_descriptor.variation.copy_number.allele.sequence_location.sequence_id) == 0:
ref = variant_descriptor.vcf_record.ref
alt = variant_descriptor.vcf_record.alt
start = int(variant_descriptor.vcf_record.pos) - 1
end = int(variant_descriptor.vcf_record.pos) + abs(len(alt) - len(ref))
contig = self._build.contig_by_name(variant_descriptor.vcf_record.chrom[3:])
else:
raise ValueError('Expected a VCF record or a VRS CNV but did not find one')
change_length = end - start

region = GenomicRegion(contig, start, end, Strand.POSITIVE)
vc = VariantCoordinates(region, ref, alt, change_length)
elif len(variant_descriptor.expressions) > 0:
# We have some expressions. Let's try to find the 1st expression with `hgvs.c` syntax.
for expression in variant_descriptor.expressions:
if expression.syntax == 'hgvs.c':
vc = self._hgvs_finder.find_coordinates(expression.value)
break

if vc is None:
raise ValueError('Expected a VCF record, a VRS CNV, or an expression with `hgvs.c` '
'but did not find one')

# Last, parse genotype.
genotype = variant_descriptor.allelic_state.label

if any(field is None for field in (contig, ref, alt, genotype)):
raise ValueError(f'Cannot determine variant coordinate from genomic interpretation {item}')
region = GenomicRegion(contig, start, end, Strand.POSITIVE)

vc = VariantCoordinates(region, ref, alt, len(alt) - len(ref))
gt = self._map_geno_genotype_label(genotype)

return vc, gt

@staticmethod
def _vcf_is_available(vcf_record) -> bool:
"""
Check if we can parse data out of VCF record.
"""
return (vcf_record.genome_assembly != ''
and vcf_record.chrom != ''
and vcf_record.pos >= 0
and vcf_record.ref != ''
and vcf_record.alt != '')

@staticmethod
def _cnv_is_available(variation):
seq_location = variation.copy_number.allele.sequence_location
return (seq_location.sequence_id != ''
and seq_location.sequence_interval.start_number.value >= 0
and seq_location.sequence_interval.end_number.value >= 0
and variation.copy_number.number.value != '')

@staticmethod
def _map_geno_genotype_label(genotype: str) -> Genotype:
"""
Expand All @@ -91,25 +131,17 @@ def _map_geno_genotype_label(genotype: str) -> Genotype:


class PhenopacketPatientCreator(PatientCreator[Phenopacket]):
"""A class that creates a Patient object
Methods:
create_patient(item:Phenopacket): Creates a Patient from the data in a given Phenopacket
"""
`PhenopacketPatientCreator` transforms `Phenopacket` into :class:`genophenocorr.model.Patient`.
"""

def __init__(self, build: GenomeBuild,
phenotype_creator: PhenotypeCreator,
var_func_ann: FunctionalAnnotator):
"""Constructs all necessary attributes for a PhenopacketPatientCreator object
Args:
build (GenomeBuild): A genome build to use to load variant coordinates.
phenotype_creator (PhenotypeCreator): A PhenotypeCreator object for Phenotype creation
var_func_ann (FunctionalAnnotator): A FunctionalAnnotator object for Variant creation
"""
var_func_ann: FunctionalAnnotator,
hgvs_coordinate_finder: VariantCoordinateFinder[str]):
self._logger = logging.getLogger(__name__)
# Violates DI, but it is specific to this class, so I'll leave it "as is".
self._coord_finder = PhenopacketVariantCoordinateFinder(build)
self._coord_finder = PhenopacketVariantCoordinateFinder(build, hgvs_coordinate_finder)
self._phenotype_creator = hpotk.util.validate_instance(phenotype_creator, PhenotypeCreator, 'phenotype_creator')
self._func_ann = hpotk.util.validate_instance(var_func_ann, FunctionalAnnotator, 'var_func_ann')

Expand Down Expand Up @@ -224,11 +256,16 @@ def _load_phenopacket_dir(pp_dir: str) -> typing.Sequence[Phenopacket]:
for patient_file in os.listdir(pp_dir):
if patient_file.endswith('.json'):
phenopacket_path = os.path.join(pp_dir, patient_file)
pp = _load_phenopacket(phenopacket_path)
pp = load_phenopacket(phenopacket_path)
patients.append(pp)
return patients


def _load_phenopacket(phenopacket_path: str) -> Phenopacket:
def load_phenopacket(phenopacket_path: str) -> Phenopacket:
"""
Load phenopacket JSON file.
:param phenopacket_path: a `str` pointing to phenopacket JSON file.
"""
with open(phenopacket_path) as f:
return Parse(f.read(), Phenopacket())
Loading

0 comments on commit 343efa5

Please sign in to comment.