Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prepare logic for fetching transcript regions for visualization #67

Merged
merged 20 commits into from
Sep 29, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions .github/workflows/python_ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,16 @@ on:
jobs:
build:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['3.8', '3.9', '3.10', '3.11']

steps:
- uses: actions/checkout@v4
- name: Initialize Python 3.11
- name: Initialize Python
uses: actions/[email protected]
with:
python-version: "3.11"
python-version: ${{ matrix.python-version }}
- name: Install package
run: |
python3 -m pip install .[test,docs]
Expand Down
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
include src/genophenocorr/model/genome/GCF_*.tsv
9 changes: 6 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ authors = [
]
description = "Search for genotype-phenotype correlations with GA4GH phenopackets"
readme = "README.md"
requires-python = ">=3.5"
requires-python = ">=3.8"
keywords = [
"Global Alliance for Genomics and Health",
"GA4GH Phenopacket Schema",
Expand All @@ -24,7 +24,10 @@ classifiers = [
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
"Development Status :: 3 - Alpha",
"Programming Language :: Python :: 3.5",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Intended Audience :: Science/Research",
"Topic :: Scientific/Engineering :: Bio-Informatics"
]
Expand All @@ -34,7 +37,7 @@ dependencies = [
"pandas>=2.0.0",
"phenopackets>=2.0.2",
"requests>=2.25.0",
"scipy>=1.11",
"scipy>=1.10",
"statsmodels>=0.13.0",
"numpy>=1.23"
]
Expand Down
35 changes: 21 additions & 14 deletions src/genophenocorr/data/_toy.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
from hpotk import TermId

from genophenocorr.model import *
from genophenocorr.model.genome import Contig, GenomicRegion, Region, Strand

CONTIG = Contig('1', 'GB_ACC', 'REFSEQ_NAME', 'UCSC_NAME', 1_000)


def make_region(start: int, end: int) -> GenomicRegion:
    """
    Create a genomic region located on the module-level toy `CONTIG`.

    Args:
        start (integer): The start coordinate handed to `GenomicRegion`
        end (integer): The end coordinate handed to `GenomicRegion`

    Returns:
        GenomicRegion: a region on `CONTIG` with `Strand.POSITIVE`.
    """
    # All toy variants share the same contig and strand; only the span differs.
    region = GenomicRegion(CONTIG, start, end, Strand.POSITIVE)
    return region


def get_toy_cohort() -> Cohort:
Expand All @@ -25,39 +32,39 @@ def get_toy_cohort() -> Cohort:
spasticity_F = Phenotype(TermId.from_curie('HP:0001257'), 'Spasticity', False)


prot_feat_1 = ProteinFeature.create(FeatureInfo('domain', 1, 75), FeatureType.DOMAIN)
prot_feat_2 = ProteinFeature.create(FeatureInfo('region', 50, 100), FeatureType.REGION)
prot_feat_1 = ProteinFeature.create(FeatureInfo('domain', Region(1, 75)), FeatureType.DOMAIN)
prot_feat_2 = ProteinFeature.create(FeatureInfo('region', Region(50, 100)), FeatureType.REGION)
prot = ProteinMetadata('NP_09876.5', 'FakeProtein', [prot_feat_1, prot_feat_2])

het_snv = Variant.create_variant_from_scratch(
'HetVar1', 'SNV',
VariantCoordinates('chr1', 280, 281, 'A', 'G', 0, 'heterozygous'),
'FakeGene', 'NM_1234.5', 'NM_1234.5:c.180A>G', ['missense_variant'],
VariantCoordinates(make_region(280, 281), 'A', 'G', 0, 'heterozygous'),
'FakeGene', 'NM_1234.5', 'NM_1234.5:c.180A>G', False, ['missense_variant'],
[1], [prot], 60, 60)
het_del = Variant.create_variant_from_scratch(
'HetVar2', 'indel',
VariantCoordinates('chr1', 360, 363, 'TTC', 'T', -2, 'heterozygous'),
'FakeGene', 'NM_1234.5', 'NM_1234.5:c.261_263del', ['frameshift_variant'],
VariantCoordinates(make_region(360, 363), 'TTC', 'T', -2, 'heterozygous'),
'FakeGene', 'NM_1234.5', 'NM_1234.5:c.261_263del', False, ['frameshift_variant'],
[2], [prot], 86, 87)
het_dup = Variant.create_variant_from_scratch(
'HetVar3', 'insertion',
VariantCoordinates('chr1', 175, 176, 'T', 'TG', 1, 'heterozygous'),
'FakeGene', 'NM_1234.5', 'NM_1234.5:c.75A>G', ['frameshift_variant'],
VariantCoordinates(make_region(175, 176), 'T', 'TG', 1, 'heterozygous'),
'FakeGene', 'NM_1234.5', 'NM_1234.5:c.75A>G', False, ['frameshift_variant'],
[1], [prot], 25, 25)
hom_snv = Variant.create_variant_from_scratch(
'HomVar1', 'SNV',
VariantCoordinates('chr1', 280, 281, 'A', 'G', 0, 'homozygous'),
'FakeGene', 'NM_1234.5', 'NM_1234.5:c.180A>G', ['missense_variant'],
VariantCoordinates(make_region(280, 281), 'A', 'G', 0, 'homozygous'),
'FakeGene', 'NM_1234.5', 'NM_1234.5:c.180A>G', False, ['missense_variant'],
[1], [prot], 60, 60)
hom_del = Variant.create_variant_from_scratch(
'HomVar2', 'indel',
VariantCoordinates('chr1', 360, 363, 'TTC', 'T', -2, 'homozygous'),
'FakeGene', 'NM_1234.5', 'NM_1234.5:c.261_263del', ['frameshift_variant'],
VariantCoordinates(make_region(360, 363), 'TTC', 'T', -2, 'homozygous'),
'FakeGene', 'NM_1234.5', 'NM_1234.5:c.261_263del', False, ['frameshift_variant'],
[2], [prot], 86, 87)
hom_dup = Variant.create_variant_from_scratch(
'HomVar3', 'insertion',
VariantCoordinates('chr1', 175, 176, 'T', 'TG', 1,'homozygous'),
'FakeGene', 'NM_1234.5', 'NM_1234.5:c.75A>G', ['frameshift_variant'],
VariantCoordinates(make_region(175, 176), 'T', 'TG', 1,'homozygous'),
'FakeGene', 'NM_1234.5', 'NM_1234.5:c.75A>G', False, ['frameshift_variant'],
[1], [prot], 25, 25)

patients = (
Expand Down
5 changes: 3 additions & 2 deletions src/genophenocorr/model/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,16 @@
The `genophenocorr.model` package defines data model classes used in genophenocorr. We start with the top-level elements,
such as :class:`Cohort` and :class:`Patient`, and we follow with data classes for phenotype, genotype, and protein info.
"""
from . import genome

from ._cohort import Cohort, Patient
from ._phenotype import Phenotype
from ._protein import FeatureInfo, FeatureType, ProteinFeature, ProteinMetadata
from ._variant import VariantCoordinates, TranscriptAnnotation, Variant
from ._variant import VariantCoordinates, TranscriptAnnotation, TranscriptInfoAware, Variant

__all__ = [
'Cohort', 'Patient',
'Phenotype',
'Variant', 'TranscriptAnnotation', 'VariantCoordinates',
'Variant', 'VariantCoordinates', 'TranscriptAnnotation', 'TranscriptInfoAware',
'ProteinMetadata', 'ProteinFeature', 'FeatureInfo', 'FeatureType',
]
49 changes: 22 additions & 27 deletions src/genophenocorr/model/_protein.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,37 +2,25 @@
import enum
import typing

import hpotk

from .genome import Region


class FeatureInfo:
"""A class that represents a protein feature
(e.g. a repeated sequence given the name "ANK 1" in protein "Ankyrin repeat domain-containing protein 11")

Attributes:
name (string): The given name or description of the protein feature
start (integer): The starting position of the feature on the protein sequence
end (integer): The ending position of the feature on the protein sequence
region (Region): The protein feature region coordinates
"""

def __init__(self, name: str, start: int, end: int):
"""Constructs all necessary attributes for a FeatureInfo object

Args:
name (string): The given name or description of the protein feature
start (integer): The starting position of the feature on the protein sequence
end (integer): The ending position of the feature on the protein sequence
"""
def __init__(self, name: str, region: Region):
if not isinstance(name, str):
raise ValueError(f"name must be type string but was type {type(name)}")
self._name = name
if not isinstance(start, int):
raise ValueError(f"start must be an integer but was type {type(start)}")
self._start = start
if not isinstance(end, int):
raise ValueError(f"end must be an integer but was type {type(end)}")
self._end = end

if self._start > self._end:
raise ValueError(f"The start value must come before end but {self._start} is greater than {self._end}")
self._name = hpotk.util.validate_instance(name, str, 'name')
self._region = hpotk.util.validate_instance(region, Region, 'region')

@property
def name(self) -> str:
Expand All @@ -42,33 +30,40 @@ def name(self) -> str:
"""
return self._name

@property
def region(self) -> Region:
"""
Returns:
Region: a protein region spanned by the feature.
"""
return self._region

@property
def start(self) -> int:
"""
Returns:
integer: A 0-based (excluded) start coordinate of the protein feature.
"""
return self._start
return self._region.start

@property
def end(self) -> int:
"""
Returns:
integer: A 0-based (included) end coordinate of the protein feature.
"""
return self._end
return self._region.end

def __len__(self):
return self._end - self._start
return len(self._region)

def __eq__(self, other) -> bool:
return isinstance(other, FeatureInfo) \
and self.name == other.name \
and self.start == other.start \
and self.end == other.end
and self.region == other.region

def __hash__(self):
return hash((self._name, self._start, self._end))
return hash((self._name, self._region))

def __str__(self) -> str:
return f"FeatureInfo(name={self.name}, start={self.start}, end={self.end})"
Expand Down Expand Up @@ -264,4 +259,4 @@ def __hash__(self) -> int:
return hash((self.protein_id, self.label, self._features))

def __repr__(self) -> str:
return str(self)
return str(self)
Loading