Skip to content

Commit

Permalink
Merge pull request #77 from BIH-CEI/get-variant-coordinates-from-hgvs
Browse files Browse the repository at this point in the history
HGVSVariantCoordinatesFinder
  • Loading branch information
ielis authored Oct 24, 2023
2 parents 4740fc3 + d97f60a commit dfe5c50
Show file tree
Hide file tree
Showing 16 changed files with 1,150 additions and 32 deletions.
4 changes: 4 additions & 0 deletions .editorconfig
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
[*]
max_line_length = 120
trim_trailing_whitespace = true
insert_final_newline = true
2 changes: 1 addition & 1 deletion src/genophenocorr/model/genome/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,4 @@
"""

from ._builds import GRCh37, GRCh38
from ._genome import Contig, GenomeBuild, Strand, Stranded, Transposable, GenomicRegion, Region
from ._genome import Contig, GenomeBuild, GenomeBuildIdentifier, Strand, Stranded, Transposable, GenomicRegion, Region
8 changes: 4 additions & 4 deletions src/genophenocorr/model/genome/_builds.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import platform
import warnings

from ._genome import Contig, GenomeBuild
from ._genome import Contig, GenomeBuild, GenomeBuildIdentifier

major, minor, patch = platform.python_version_tuple()

Expand All @@ -20,7 +20,7 @@ def resource_loader(path: str):
warnings.warn(f'Untested Python version v{major}.{minor}.{patch}')


def read_assembly_report(identifier: str, path: str) -> GenomeBuild:
def read_assembly_report(identifier: GenomeBuildIdentifier, path: str) -> GenomeBuild:
contigs = []
with resource_loader(path) as fp:
for line in fp:
Expand All @@ -38,12 +38,12 @@ def read_assembly_report(identifier: str, path: str) -> GenomeBuild:
return GenomeBuild(identifier, contigs)


GRCh37 = read_assembly_report('GRCh37.p13', 'GCF_000001405.25_GRCh37.p13_assembly_report.tsv')
GRCh37 = read_assembly_report(GenomeBuildIdentifier('GRCh37', 'p13'), 'GCF_000001405.25_GRCh37.p13_assembly_report.tsv')
"""
The `GRCh37.p13` genomic build.
"""

GRCh38 = read_assembly_report('GRCh38.p13', 'GCF_000001405.39_GRCh38.p13_assembly_report.tsv')
GRCh38 = read_assembly_report(GenomeBuildIdentifier('GRCh38', 'p13'), 'GCF_000001405.39_GRCh38.p13_assembly_report.tsv')
"""
The `GRCh38.p13` genomic build.
"""
101 changes: 97 additions & 4 deletions src/genophenocorr/model/genome/_genome.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,20 @@


class Contig(typing.Sized):
"""
`Contig` represents identifiers and length of a contiguous sequence of genome assembly.
The identifiers include:
* :attr:`name` e.g. `1`
* :attr:`genbank_acc` e.g. `CM000663.2`
* :attr:`refseq_name` e.g. `NC_000001.11`
* :attr:`ucsc_name` e.g. `chr1`
The length of a `Contig` represents the number of bases of the contig sequence.
You should not try to create a `Contig` on your own, but always get it from a :class:`GenomeBuild`.
"""

def __init__(self, name: str, gb_acc: str, refseq_name: str, ucsc_name: str, length: int):
self._name = hpotk.util.validate_instance(name, str, 'name')
Expand Down Expand Up @@ -54,10 +68,73 @@ def __hash__(self):
return hash((self.name, self.refseq_name, self.ucsc_name, len(self)))


class GenomeBuildIdentifier:
"""
Identifier of the genome build consisting of :attr:`major_assembly` (e.g. GRCh38) and :attr:`patch` (e.g. p13).
:param major_assembly: major assembly `str`
:param patch: assembly patch `str`
"""

def __init__(self, major_assembly: str, patch: str):
self._major_assembly = major_assembly
self._patch = patch

@property
def major_assembly(self) -> str:
"""
Get major assembly, e.g. `GRCh38`.
"""
return self._major_assembly

@property
def patch(self) -> str:
"""
Get assembly patch , e.g. `p13`.
"""
return self._patch

@property
def identifier(self):
"""
Get genome build identifier consisting of major assembly + patch, e.g. `GRCh38.p13`
"""
return self._major_assembly + '.' + self._patch

def __eq__(self, other):
return (isinstance(other, GenomeBuildIdentifier)
and self.major_assembly == other.major_assembly
and self.patch == other.patch)

def __hash__(self):
return hash((self.major_assembly, self.patch))

def __str__(self):
return f"GenomeBuildIdentifier(prefix={self._major_assembly}, patch={self._patch})"

def __repr__(self):
return f"GenomeBuildIdentifier(prefix={self._major_assembly}, patch={self._patch})"


class GenomeBuild:
"""
`GenomeBuild` is a container for the :attr:`genome_build_id` and the :attr:`contigs` of the build.
The build supports retrieving contig by various identifiers:
.. doctest:: genome-build
def __init__(self, identifier: str, contigs: typing.Iterable[Contig]):
self._id = identifier
>>> from genophenocorr.model.genome import GRCh38
>>> chr1 = GRCh38.contig_by_name('1') # by sequence name
>>> assert chr1 == GRCh38.contig_by_name('CM000663.2') # by GenBank identifier
>>> assert chr1 == GRCh38.contig_by_name('NC_000001.11') # by RefSeq accession
>>> assert chr1 == GRCh38.contig_by_name('chr1') # by UCSC name
"""

def __init__(self, identifier: GenomeBuildIdentifier, contigs: typing.Iterable[Contig]):
self._id = hpotk.util.validate_instance(identifier, GenomeBuildIdentifier, 'identifier')
self._contigs = tuple(contigs)
self._contig_by_name = {}
for contig in self._contigs:
Expand All @@ -68,23 +145,39 @@ def __init__(self, identifier: str, contigs: typing.Iterable[Contig]):

@property
def identifier(self) -> str:
return self.genome_build_id.identifier

@property
def genome_build_id(self) -> GenomeBuildIdentifier:
return self._id

@property
def contigs(self) -> typing.Sequence[Contig]:
return self._contigs

def contig_by_name(self, name: str) -> typing.Optional[Contig]:
"""
Get a contig with `name`.
The name can come in various formats:
* sequence name, e.g. `1`
* GenBank accession, e.g. `CM000663.2`
* RefSeq accession, e.g. `NC_000001.11`
* UCSC name, e.g. `chr1`
:param name: a `str` with contig name.
"""
try:
return self._contig_by_name[name]
except KeyError:
return None

def __str__(self):
return f"GenomeBuild(identifier={self.identifier}, n_contigs={len(self.contigs)})"
return f"GenomeBuild(identifier={self._id.identifier}, n_contigs={len(self.contigs)})"

def __repr__(self):
return f"GenomeBuild(identifier={self.identifier}, contigs={self.contigs})"
return f"GenomeBuild(identifier={self._id.identifier}, contigs={self.contigs})"


class Region(typing.Sized):
Expand Down
10 changes: 7 additions & 3 deletions src/genophenocorr/model/genome/_test_builds.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import pytest

from ._builds import read_assembly_report, GRCh37, GRCh38
from ._builds import read_assembly_report, GRCh37, GRCh38, GenomeBuildIdentifier


def test_read_assembly_report():
build = read_assembly_report('GRCh37.p13', 'GCF_000001405.25_GRCh37.p13_assembly_report.tsv')
build = read_assembly_report(GenomeBuildIdentifier('GRCh37', 'p13'), 'GCF_000001405.25_GRCh37.p13_assembly_report.tsv')
assert build is not None


Expand All @@ -14,7 +14,9 @@ def test_read_assembly_report():
('X', 'CM000685.1', 'NC_000023.10', 'chrX', 155_270_560),
])
def test_hg19(name, genbank, refseq, ucsc, length):
assert GRCh37.identifier == 'GRCh37.p13'
assert GRCh37.genome_build_id.major_assembly == 'GRCh37'
assert GRCh37.genome_build_id.patch == 'p13'
assert GRCh37.genome_build_id.identifier == 'GRCh37.p13'
contig = GRCh37.contig_by_name(name)
assert contig.name == name
assert contig.genbank_acc == genbank
Expand All @@ -29,6 +31,8 @@ def test_hg19(name, genbank, refseq, ucsc, length):
('X', 'CM000685.2', 'NC_000023.11', 'chrX', 156_040_895),
])
def test_hg38(name, genbank, refseq, ucsc, length):
assert GRCh38.genome_build_id.major_assembly == 'GRCh38'
assert GRCh38.genome_build_id.patch == 'p13'
assert GRCh38.identifier == 'GRCh38.p13'
contig = GRCh38.contig_by_name(name)
assert contig.name == name
Expand Down
14 changes: 8 additions & 6 deletions src/genophenocorr/preprocessing/_test_vep.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,8 @@ class TestVepFunctionalAnnotator:
TEST_DATA_DIR = resource_filename(__name__, os.path.join('test_data', 'vep_response'))

def test__process_item_missense(self, variant_annotator: VepFunctionalAnnotator):
response = self._load_response_json('missense.json')
response_fpath = os.path.join(self.TEST_DATA_DIR, 'missense.json')
response = load_response_json(response_fpath)

annotations = [variant_annotator._process_item(item) for item in response[0]['transcript_consequences']]

Expand All @@ -83,7 +84,8 @@ def test__process_item_missense(self, variant_annotator: VepFunctionalAnnotator)
assert preferred.overlapping_exons == (9,)

def test__process_item_deletion(self, variant_annotator: VepFunctionalAnnotator):
response = self._load_response_json('deletion.json')
response_fpath = os.path.join(self.TEST_DATA_DIR, 'deletion.json')
response = load_response_json(response_fpath)

annotations = [variant_annotator._process_item(item) for item in response[0]['transcript_consequences']]

Expand All @@ -102,7 +104,7 @@ def test__process_item_deletion(self, variant_annotator: VepFunctionalAnnotator)
assert preferred.variant_effects == (VariantEffect.FRAMESHIFT_VARIANT,)
assert preferred.overlapping_exons == (9,)

def _load_response_json(self, test_name: str):
response_fpath = os.path.join(self.TEST_DATA_DIR, test_name)
with open(response_fpath) as fh:
return json.load(fh)

def load_response_json(path: str):
with open(path) as fh:
return json.load(fh)
96 changes: 96 additions & 0 deletions src/genophenocorr/preprocessing/_test_vv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import json
import os

from pkg_resources import resource_filename

import pytest


from genophenocorr.model.genome import GRCh38

from ._vv import VVHgvsVariantCoordinateFinder


@pytest.fixture
def coordinate_finder() -> VVHgvsVariantCoordinateFinder:
return VVHgvsVariantCoordinateFinder(genome_build=GRCh38)


class TestVVHgvsVariantCoordinateFinder:
TEST_DATA_DIR = resource_filename(__name__, os.path.join('test_data', 'vv_response'))

def test_load_hgvs_MC4R_missense(self, coordinate_finder: VVHgvsVariantCoordinateFinder):
response_fpath = os.path.join(self.TEST_DATA_DIR, 'hgvs_MC4R_missense.json')
response = load_response_json(response_fpath)

vc = coordinate_finder._extract_variant_coordinates(response)

assert vc is not None
assert vc.chrom == '18'
assert vc.start == 60_372_096
assert vc.end == 60_372_097
assert vc.ref == 'T'
assert vc.alt == 'C'
assert vc.change_length == 0


def test_load_hgvs_SURF2_del(self, coordinate_finder: VVHgvsVariantCoordinateFinder):
response_fpath = os.path.join(self.TEST_DATA_DIR, 'hgvs_SURF2_del.json')
response = load_response_json(response_fpath)

vc = coordinate_finder._extract_variant_coordinates(response)

assert vc is not None
assert vc.chrom == '9'
assert vc.start == 133_357_809
assert vc.end == 133_357_813
assert vc.ref == 'TAAA'
assert vc.alt == 'T'
assert vc.change_length == -3

def test_load_hgvs_SURF2_dup(self, coordinate_finder: VVHgvsVariantCoordinateFinder):
response_fpath = os.path.join(self.TEST_DATA_DIR, 'hgvs_SURF2_dup.json')
response = load_response_json(response_fpath)

vc = coordinate_finder._extract_variant_coordinates(response)

assert vc is not None
assert vc.chrom == '9'
assert vc.start == 133_357_809
assert vc.end == 133_357_810
assert vc.ref == 'T'
assert vc.alt == 'TAAA'
assert vc.change_length == 3

def test_load_hgvs_MC4R_dup(self, coordinate_finder: VVHgvsVariantCoordinateFinder):
response_fpath = os.path.join(self.TEST_DATA_DIR, 'hgvs_MC4R_dup.json')
response = load_response_json(response_fpath)

vc = coordinate_finder._extract_variant_coordinates(response)

assert vc is not None
assert vc.chrom == '18'
assert vc.start == 60_372_345
assert vc.end == 60_372_346
assert vc.ref == 'C'
assert vc.alt == 'CCAT'
assert vc.change_length == 3

def test_load_hgvs_SURF2_ins(self, coordinate_finder: VVHgvsVariantCoordinateFinder):
response_fpath = os.path.join(self.TEST_DATA_DIR, 'hgvs_SURF2_ins.json')
response = load_response_json(response_fpath)

vc = coordinate_finder._extract_variant_coordinates(response)

assert vc is not None
assert vc.chrom == '9'
assert vc.start == 133_357_809
assert vc.end == 133_357_810
assert vc.ref == 'T'
assert vc.alt == 'TAGC'
assert vc.change_length == 3


def load_response_json(path: str):
with open(path) as fh:
return json.load(fh)
Loading

0 comments on commit dfe5c50

Please sign in to comment.