Ps proteindna annotations 2 #10

Open · wants to merge 17 commits into master

Changes from all commits
2 changes: 1 addition & 1 deletion .gitignore
@@ -1,5 +1,5 @@
**/.DS_Store

.vscode/
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
2 changes: 1 addition & 1 deletion altanalyze3/bin/altanalyze3
@@ -12,4 +12,4 @@ def main(args=None):


if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))
sys.exit(main(sys.argv[1:]))
@@ -2,16 +2,16 @@
This is a generalized Python module for getting data from Ensembl using the BioMart server.
"""

from __future__ import absolute_import, division, print_function
import requests

from future.utils import native_str
from builtins import *
from xml.etree import ElementTree
import pandas as pd
from io import StringIO
from xml.etree.ElementTree import fromstring as xml_from_string

import math
import logging
from altanalyze3.utilities.helpers import (
TimeIt
)

DEFAULT_HOST = 'http://www.biomart.org'
DEFAULT_PATH = '/biomart/martservice'
@@ -26,7 +26,6 @@ class ServerBase(object):
path (str): Path to the biomart service on the host.
port (str): Port to connect to on the host.
url (str): Url used to connect to the biomart service.
use_cache (bool): Whether to cache requests to biomart.
"""

def __init__(self, host=None, path=None, port=None):
@@ -38,9 +37,9 @@ def __init__(self, host=None, path=None, port=None):
use_cache (bool): Whether to cache requests.
"""
# Use defaults if arg is None.
host = host or DEFAULT_HOST
path = path or DEFAULT_PATH
port = port or DEFAULT_PORT
host = DEFAULT_HOST if host is None else host
path = DEFAULT_PATH if path is None else path
port = DEFAULT_PORT if port is None else port

# Add http prefix and remove trailing slash.
host = self._add_http_prefix(host)
@@ -82,8 +81,7 @@ def _add_http_prefix(url, prefix='http://'):

@staticmethod
def _remove_trailing_slash(url):
if url.endswith('/'):
url = url[:-1]
url = url.rstrip("/")
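# e.g. "http://www.ensembl.org/" -> "http://www.ensembl.org"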
return url

def get(self, **params):
@@ -121,22 +119,7 @@ class Dataset(ServerBase):
host (str): Url of host to connect to.
path (str): Path on the host to access to the biomart service.
port (int): Port to use for the connection.
use_cache (bool): Whether to cache requests.
virtual_schema (str): The virtual schema of the dataset.
Examples:
Directly connecting to a dataset:
>>> dataset = Dataset(name='hsapiens_gene_ensembl',
>>> host='http://www.ensembl.org')
Querying the dataset:
>>> dataset.query(attributes=['ensembl_gene_id',
>>> 'external_gene_name'],
>>> filters={'chromosome_name': ['1','2']})
Listing available attributes:
>>> dataset.attributes
>>> dataset.list_attributes()
Listing available filters:
>>> dataset.filters
>>> dataset.list_filters()
"""

def __init__(self,
@@ -145,7 +128,7 @@ def __init__(self,
host=None,
path=None,
port=None,
virtual_schema=DEFAULT_SCHEMA):
virtual_schema=DEFAULT_SCHEMA, location=None):
super().__init__(host=host, path=path, port=port)

self._name = name
@@ -155,6 +138,7 @@ def __init__(self,
self._attributes = None
self._default_attributes = None
self._datatype = None
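# location is presumably a pathlib.Path; query() uses it to derive the CSV output path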
self.location = location

@property
def name(self):
@@ -258,13 +242,20 @@ def _attributes_from_xml(xml):
description=attrib.get('description', ''),
default=default)

# Called once per exon of a transcript: converts a CDS nucleotide
# position into the corresponding amino-acid position.
def calculate_aa_positions(self, cds_pos):
# Each codon spans three nucleotides, so round up.
aa_position = math.ceil(cds_pos / 3)
return aa_position
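# For example, CDS position 254 maps to amino acid 85
# (math.ceil(254 / 3) == 85), matching aa_stop in the regenerated CSV below.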

def query(self,
attributes=None,
filters=None,
only_unique=True,
use_attr_names=False,
dtypes=None,
datatype=None
datatype=None,
):
"""Queries the dataset to retrieve the contained data.
Args:
@@ -314,6 +305,8 @@ def query(self,
dataset.set('name', self.name)
dataset.set('interface', 'default')

csv_location = self.location.with_suffix(".csv")
logging.info(f"Saving protein coordinates to {csv_location}")
# Default to default attributes if none requested.
if attributes is None:
attributes = list(self.default_attributes.keys())
@@ -350,13 +343,17 @@ def query(self,
try:
result = pd.read_csv(StringIO(response.text),
sep='\t', dtype=dtypes)
if (datatype == "protein_coordinates"):
result.to_csv(
'Hs_ProteinCoordinates_build_100_38.csv', sep='\t')
elif(datatype == "protein_feature"):
result.to_csv(
'Hs_ProteinFeatures_build_100_38.csv', sep='\t')
# A TypeError is raised if a data type is not understood by pandas
# Calculate the amino-acid start and end positions from the CDS coordinates
result = result.dropna(subset=['CDS start', 'CDS end'])
cds_start = result['CDS start'].astype(int)
cds_stop = result['CDS end'].astype(int)
result["aa_start"] = cds_start.apply(self.calculate_aa_positions)
result["aa_stop"] = cds_stop.apply(self.calculate_aa_positions)
result.to_csv(csv_location, sep='\t')

# A TypeError is raised if a data type is not understood by pandas
except TypeError as err:
raise ValueError("Invalid data type in dtypes") from err

@@ -367,7 +364,6 @@ def query(self,
for attr in attributes
}
result.rename(columns=column_map, inplace=True)

return result

@staticmethod
@@ -490,14 +486,9 @@ def __repr__(self):
.format(self.name, self.type))


dataset = Dataset(name='apolyacanthus_gene_ensembl',
host='http://www.ensembl.org')

# Protein Coordinates
dataset.query(attributes=["ensembl_transcript_id", "ensembl_exon_id", "ensembl_peptide_id", "start_position",
"end_position", "transcript_start", "transcript_end", "cdd", "cdd_start", "cdd_end"], datatype='protein_coordinates')


# Protein Features
dataset.query(attributes=["ensembl_gene_id", "ensembl_gene_id_version", "ensembl_transcript_id_version",
"interpro", "interpro_description", "interpro_start", "interpro_end", "cdd", "cdd_start", "cdd_end"], datatype='protein_feature')
def protein_coordinates(args):
with TimeIt():
# args.output comes from the common CLI arguments and supplies the CSV location
dataset = Dataset(name=args.name, host=args.host, location=args.output)
logging.info(f"Getting data from {args.host} for species {args.name}")
dataset.query(attributes=args.attributes)
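
A minimal sketch of how this entry point might be driven once the CLI flags are wired up; the Namespace fields mirror the commented-out parser arguments in parser.py below, and the output path is illustrative:

from argparse import Namespace
from pathlib import Path

args = Namespace(
    name="apolyacanthus_gene_ensembl",
    host="https://www.ensembl.org",
    attributes=["ensembl_transcript_id", "ensembl_exon_id",
                "cds_start", "cds_end"],
    output=Path("results/apolyacanthus_protein_coordinates"),
)
protein_coordinates(args)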
36 changes: 34 additions & 2 deletions altanalyze3/utilities/parser.py
@@ -5,6 +5,7 @@
from altanalyze3.utilities.helpers import get_version
from altanalyze3.components.intron_count.main import count_introns
from altanalyze3.components.junction_count.main import count_junctions
# from altanalyze3.components.junction_count.main import protein_coordinates
from altanalyze3.utilities.io import get_all_bam_chr
from altanalyze3.utilities.constants import (
IntRetCat,
@@ -52,7 +53,7 @@ def get_parser(self):
subparsers = general_parser.add_subparsers()
subparsers.required = True
# Global parameters for all components of the tool
general_parser.add_argument(
general_parser.add_argument(
"--version",
action="version",
version=get_version(),
@@ -150,6 +151,37 @@ def get_parser(self):
help="Export processed reads into the BAM file. Default: False",
action="store_true"
)
self.add_common_arguments(intron_parser)

# Protein Domain Annotation parser
protein_coordinates_parser = subparsers.add_parser(
"proteincoordinates",
parents=[parent_parser],
help="Get Protein to Domain annotations"
)
# TODO
# protein_coordinates_parser.set_defaults(func=protein_coordinates)
# protein_coordinates_parser.add_argument(
# "--name",
# help="name of species eg. apolyacanthus_gene_ensembl",
# type=str,
# required=True,
# )
# protein_coordinates_parser.add_argument(
# "--host",
# help="Select the host from where you want to import data",
# type=str,
# default="https://www.ensembl.org"
# )
# protein_coordinates_parser.add_argument(
# "--attributes",
# help="Export certain coordinates or features from Ensembl",
# nargs="*",
# default=["ensembl_transcript_id", "ensembl_exon_id", "ensembl_peptide_id", "start_position",
# "end_position", "transcript_start", "transcript_end", "cds_start", "cds_end"]
# )
# self.add_common_arguments(protein_coordinates_parser)

self.add_common_arguments(junction_parser)
return general_parser
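
Assuming the commented-out flags above get enabled, the new subcommand would be invoked along these lines (flag names are taken from the commented block and may change before merge):

altanalyze3 proteincoordinates --name apolyacanthus_gene_ensembl --host https://www.ensembl.org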

@@ -200,4 +232,4 @@ def assert_common_args(self):
self.args.output.parent.mkdir(parents=True, exist_ok=True) # safety measure, shouldn't fail
self.args.chr = get_all_bam_chr(self.args.bam, self.args.threads) \
if len(self.args.chr) == 0 else [c if c.startswith("chr") else f"chr{c}" for c in self.args.chr]
self.args.loglevel = getattr(logging, self.args.loglevel.upper())
self.args.loglevel = getattr(logging, self.args.loglevel.upper())
30 changes: 18 additions & 12 deletions docs/Hs_ProteinCoordinates_build_100_38.csv
@@ -1,12 +1,18 @@
,Exon stable ID,Gene start (bp),Gene end (bp),Gene name,Protein stable ID,Transcript start (bp),Transcript end (bp),CDD start,CDD end
0,ENSAPOE00000120411,288439,298458,zgc:63972,ENSAPOP00000020934,288439,298458,,
1,ENSAPOE00000120412,288439,298458,zgc:63972,ENSAPOP00000020934,288439,298458,,
2,ENSAPOE00000120413,288439,298458,zgc:63972,ENSAPOP00000020934,288439,298458,,
3,ENSAPOE00000120414,288439,298458,zgc:63972,ENSAPOP00000020934,288439,298458,,
4,ENSAPOE00000120415,288439,298458,zgc:63972,ENSAPOP00000020934,288439,298458,,
5,ENSAPOE00000120416,288439,298458,zgc:63972,ENSAPOP00000020934,288439,298458,,
6,ENSAPOE00000000170,290637,294901,,ENSAPOP00000020929,290637,294901,,
7,ENSAPOE00000000171,290637,294901,,ENSAPOP00000020929,290637,294901,,
8,ENSAPOE00000000174,290637,294901,,ENSAPOP00000020929,290637,294901,,
9,ENSAPOE00000000177,290637,294901,,ENSAPOP00000020929,290637,294901,,
10,ENSAPOE00000000280,290637,294901,,ENSAPOP00000020929,290637,294901,,
,Transcript stable ID,Exon stable ID,Gene start (bp),Gene end (bp),Transcript start (bp),Transcript end (bp),CDS start,CDS end,aa_start,aa_stop
0,ENSAPOT00000017612,ENSAPOE00000120411,288439,298458,288439,298458,1.0,105.0,1,35
1,ENSAPOT00000017612,ENSAPOE00000120412,288439,298458,288439,298458,106.0,254.0,36,85
2,ENSAPOT00000017612,ENSAPOE00000120413,288439,298458,288439,298458,255.0,314.0,85,105
3,ENSAPOT00000017612,ENSAPOE00000120414,288439,298458,288439,298458,315.0,360.0,105,120
4,ENSAPOT00000017612,ENSAPOE00000120415,288439,298458,288439,298458,361.0,410.0,121,137
5,ENSAPOT00000017612,ENSAPOE00000120416,288439,298458,288439,298458,411.0,513.0,137,171
6,ENSAPOT00000017559,ENSAPOE00000000170,290637,294901,290637,294901,1.0,47.0,1,16
7,ENSAPOT00000017559,ENSAPOE00000000171,290637,294901,290637,294901,48.0,103.0,16,35
8,ENSAPOT00000017559,ENSAPOE00000000174,290637,294901,290637,294901,104.0,165.0,35,55
9,ENSAPOT00000017559,ENSAPOE00000000177,290637,294901,290637,294901,166.0,303.0,56,101
10,ENSAPOT00000017559,ENSAPOE00000000280,290637,294901,290637,294901,304.0,380.0,102,127
11,ENSAPOT00000017559,ENSAPOE00000120530,290637,294901,290637,294901,381.0,396.0,127,132
12,ENSAPOT00000017555,ENSAPOE00000000178,310862,317808,310862,317808,499.0,646.0,167,216
13,ENSAPOT00000017555,ENSAPOE00000000181,310862,317808,310862,317808,647.0,819.0,216,273
14,ENSAPOT00000017555,ENSAPOE00000000198,310862,317808,310862,317808,161.0,347.0,54,116
15,ENSAPOT00000017555,ENSAPOE00000000201,310862,317808,310862,317808,348.0,498.0,116,166
16,ENSAPOT00000017555,ENSAPOE00000000406,310862,317808,310862,317808,958.0,1085.0,320,362
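
The new aa_start and aa_stop columns follow the ceil-division rule from calculate_aa_positions; a quick sanity check against the first regenerated row (CDS 1.0 to 105.0), as a sketch:

import math

cds_start, cds_end = 1, 105           # first exon of ENSAPOT00000017612
assert math.ceil(cds_start / 3) == 1  # aa_start column
assert math.ceil(cds_end / 3) == 35   # aa_stop column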