diff --git a/.gitignore b/.gitignore
index e3f4f6e..ddb8b28 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,5 @@
 **/.DS_Store
-
+.vscode/
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
diff --git a/altanalyze3/bin/altanalyze3 b/altanalyze3/bin/altanalyze3
index c054719..97d762b 100644
--- a/altanalyze3/bin/altanalyze3
+++ b/altanalyze3/bin/altanalyze3
@@ -12,4 +12,4 @@ def main(args=None):
 
 
 if __name__ == "__main__":
-    sys.exit(main(sys.argv[1:]))
\ No newline at end of file
+    sys.exit(main(sys.argv[1:]))
diff --git a/altanalyze3/components/annotation/ensembl_biomart.py b/altanalyze3/components/annotation/main.py
similarity index 88%
rename from altanalyze3/components/annotation/ensembl_biomart.py
rename to altanalyze3/components/annotation/main.py
index 3d36cbc..e25f61c 100644
--- a/altanalyze3/components/annotation/ensembl_biomart.py
+++ b/altanalyze3/components/annotation/main.py
@@ -2,16 +2,17 @@
 This is a generalized python module for getting data from Ensemble using Biomart server.
 """
-from __future__ import absolute_import, division, print_function
 import requests
-
 from future.utils import native_str
-from builtins import *
 from xml.etree import ElementTree
 import pandas as pd
 from io import StringIO
 from xml.etree.ElementTree import fromstring as xml_from_string
-
+import math
+import logging
+from altanalyze3.utilities.helpers import (
+    TimeIt
+)
 
 
 DEFAULT_HOST = 'http://www.biomart.org'
 DEFAULT_PATH = '/biomart/martservice'
@@ -26,7 +26,6 @@ class ServerBase(object):
         path (str): Path to the biomart service on the host.
         port (str): Port to connect to on the host.
         url (str): Url used to connect to the biomart service.
-        use_cache (bool): Whether to cache requests to biomart.
     """
 
     def __init__(self, host=None, path=None, port=None):
@@ -38,9 +37,9 @@ def __init__(self, host=None, path=None, port=None):
             use_cache (bool): Whether to cache requests.
         """
         # Use defaults if arg is None.
-        host = host or DEFAULT_HOST
-        path = path or DEFAULT_PATH
-        port = port or DEFAULT_PORT
+        host = DEFAULT_HOST if host is None else host
+        path = DEFAULT_PATH if path is None else path
+        port = DEFAULT_PORT if port is None else port
 
         # Add http prefix and remove trailing slash.
         host = self._add_http_prefix(host)
@@ -82,8 +81,7 @@ def _add_http_prefix(url, prefix='http://'):
 
     @staticmethod
     def _remove_trailing_slash(url):
-        if url.endswith('/'):
-            url = url[:-1]
+        url = url.strip("/")
         return url
 
     def get(self, **params):
@@ -121,22 +119,7 @@ class Dataset(ServerBase):
        host (str): Url of host to connect to.
        path (str): Path on the host to access to the biomart service.
        port (int): Port to use for the connection.
-       use_cache (bool): Whether to cache requests.
        virtual_schema (str): The virtual schema of the dataset.
-    Examples:
-        Directly connecting to a dataset:
-            >>> dataset = Dataset(name='hsapiens_gene_ensembl',
-            >>>                   host='http://www.ensembl.org')
-        Querying the dataset:
-            >>> dataset.query(attributes=['ensembl_gene_id',
-            >>>                           'external_gene_name'],
-            >>>               filters={'chromosome_name': ['1','2']})
-        Listing available attributes:
-            >>> dataset.attributes
-            >>> dataset.list_attributes()
-        Listing available filters:
-            >>> dataset.filters
-            >>> dataset.list_filters()
     """
 
     def __init__(self,
@@ -145,7 +128,7 @@ def __init__(self,
                  host=None,
                  path=None,
                  port=None,
-                 virtual_schema=DEFAULT_SCHEMA):
+                 virtual_schema=DEFAULT_SCHEMA, location=None):
         super().__init__(host=host, path=path, port=port)
 
         self._name = name
@@ -155,6 +138,7 @@ def __init__(self,
         self._attributes = None
         self._default_attributes = None
         self._datatype = None
+        self.location = location
 
     @property
     def name(self):
@@ -258,13 +242,20 @@ def _attributes_from_xml(xml):
                 description=attrib.get('description', ''),
                 default=default)
 
+    # on loop for each exon in one transcript
+    def calculate_aa_positions(self, cds_pos):
+        # check if new transcript
+
+        aa_position = math.ceil((cds_pos) / 3)
+        return aa_position
+
     def query(self,
               attributes=None,
               filters=None,
              only_unique=True,
              use_attr_names=False,
              dtypes=None,
-             datatype=None
+             datatype=None,
              ):
        """Queries the dataset to retrieve the contained data.
 
        Args:
@@ -314,6 +305,8 @@ def query(self,
        dataset.set('name', self.name)
        dataset.set('interface', 'default')
 
+        csv_location = self.location.with_suffix(".csv")
+        logging.info(f"""Save protein coordinates to {csv_location}""")
        # Default to default attributes if none requested.
        if attributes is None:
            attributes = list(self.default_attributes.keys())
@@ -350,13 +343,17 @@ def query(self,
        try:
            result = pd.read_csv(StringIO(response.text),
                                 sep='\t', dtype=dtypes)
-            if (datatype == "protein_coordinates"):
-                result.to_csv(
-                    'Hs_ProteinCoordinates_build_100_38.csv', sep='\t')
-            elif(datatype == "protein_feature"):
-                result.to_csv(
-                    'Hs_ProteinFeatures_build_100_38.csv', sep='\t')
-        # Type error is raised of a data type is not understood by pandas
+            # calculate the aa_nt_start and end positions
+            result = result.dropna(subset=['CDS start'])
+            result = result.dropna(subset=['CDS end'])
+            cds_start = result['CDS start'].astype(int)
+            cds_stop = result['CDS end'].astype(int)
+            result["aa_start"] = cds_start.apply(lambda x: math.ceil((x) / 3))
+            result["aa_stop"] = cds_stop.apply(lambda x: math.ceil((x) / 3))
+            with csv_location.open("w") as out_handler:
+                result.to_csv(out_handler, sep="\t")
+
+        # Type error is raised if a data type is not understood by Pandas
        except TypeError as err:
            raise ValueError("Non valid data type is used in dtypes")
 
@@ -367,7 +364,6 @@ def query(self,
                for attr in attributes
            }
            result.rename(columns=column_map, inplace=True)
-
        return result
 
    @staticmethod
@@ -490,14 +486,9 @@ def __repr__(self):
                .format(self.name, self.type))
 
 
-dataset = Dataset(name='apolyacanthus_gene_ensembl',
-                  host='http://www.ensembl.org')
-
-# Protein Coordinates
-dataset.query(attributes=["ensembl_transcript_id", "ensembl_exon_id", "ensembl_peptide_id", "start_position",
-                          "end_position", "transcript_start", "transcript_end", "cdd", "cdd_start", "cdd_end"], datatype='protein_coordinates')
-
-
-# Protein Features
-dataset.query(attributes=["ensembl_gene_id", "ensembl_gene_id_version", "ensembl_transcript_id_version",
-                          "interpro", "interpro_description", "interpro_start", "interpro_end", "cdd", "cdd_start", "cdd_end"], datatype='protein_feature')
+def protein_coordinates(args):
+    with TimeIt():
+        dataset = Dataset(name=args.name, host=args.host, location=args.output)
+        logging.info(
+            f"""Getting Data from {args.host} for given species {args.name}""")
+        dataset.query(attributes=args.attributes)
diff --git a/altanalyze3/utilities/parser.py b/altanalyze3/utilities/parser.py
index 7d55cb7..1e555b8 100644
--- a/altanalyze3/utilities/parser.py
+++ b/altanalyze3/utilities/parser.py
@@ -5,6 +5,7 @@
 from altanalyze3.utilities.helpers import get_version
 from altanalyze3.components.intron_count.main import count_introns
 from altanalyze3.components.junction_count.main import count_junctions
+# from altanalyze3.components.annotation.main import protein_coordinates
 from altanalyze3.utilities.io import get_all_bam_chr
 from altanalyze3.utilities.constants import (
     IntRetCat,
@@ -52,7 +53,7 @@ def get_parser(self):
         subparsers = general_parser.add_subparsers()
         subparsers.required = True
         # Global parameters for all components of the tool
-        general_parser.add_argument(
+        general_parser.add_argument(
             "--version",
             action="version",
             version=get_version(),
@@ -150,6 +151,37 @@ def get_parser(self):
             help="Export processed reads into the BAM file. Default: False",
             action="store_true"
         )
+        self.add_common_arguments(intron_parser)
+
+        # Protein Domain Annotation parser
+        protein_coordinates_parser = subparsers.add_parser(
+            "proteincoordinates",
+            parents=[parent_parser],
+            help="Get Protein to Domain annotations"
+        )
+        # TO-DO
+        # protein_coordinates_parser.set_defaults(func=protein_coordinates)
+        # protein_coordinates_parser.add_argument(
+        #     "--name",
+        #     help="name of species e.g. apolyacanthus_gene_ensembl",
+        #     type=str,
+        #     required=True,
+        # )
+        # protein_coordinates_parser.add_argument(
+        #     "--host",
+        #     help="Select the host from where you want to import data",
+        #     type=str,
+        #     default="https://www.ensembl.org"
+        # )
+        # protein_coordinates_parser.add_argument(
+        #     "--attributes",
+        #     help="Export certain coordinates or features from Ensembl",
+        #     nargs="*",
+        #     default=["ensembl_transcript_id", "ensembl_exon_id", "ensembl_peptide_id", "start_position",
+        #              "end_position", "transcript_start", "transcript_end", "cds_start", "cds_end"]
+        # )
+        # self.add_common_arguments(protein_coordinates_parser)
+
         self.add_common_arguments(junction_parser)
 
         return general_parser
@@ -200,4 +232,4 @@ def assert_common_args(self):
         self.args.output.parent.mkdir(parents=True, exist_ok=True)  # safety measure, shouldn't fail
         self.args.chr = get_all_bam_chr(self.args.bam, self.args.threads) \
             if len(self.args.chr) == 0 else [c if c.startswith("chr") else f"chr{c}" for c in self.args.chr]
-        self.args.loglevel = getattr(logging, self.args.loglevel.upper())
\ No newline at end of file
+        self.args.loglevel = getattr(logging, self.args.loglevel.upper())
diff --git a/docs/Hs_ProteinCoordinates_build_100_38.csv b/docs/Hs_ProteinCoordinates_build_100_38.csv
index e659e5d..efbb571 100644
--- a/docs/Hs_ProteinCoordinates_build_100_38.csv
+++ b/docs/Hs_ProteinCoordinates_build_100_38.csv
@@ -1,12 +1,18 @@
-,Exon stable ID,Gene start (bp),Gene end (bp),Gene name,Protein stable ID,Transcript start (bp),Transcript end (bp),CDD start,CDD end
-0,ENSAPOE00000120411,288439,298458,zgc:63972,ENSAPOP00000020934,288439,298458,,
-1,ENSAPOE00000120412,288439,298458,zgc:63972,ENSAPOP00000020934,288439,298458,,
-2,ENSAPOE00000120413,288439,298458,zgc:63972,ENSAPOP00000020934,288439,298458,,
-3,ENSAPOE00000120414,288439,298458,zgc:63972,ENSAPOP00000020934,288439,298458,,
-4,ENSAPOE00000120415,288439,298458,zgc:63972,ENSAPOP00000020934,288439,298458,,
-5,ENSAPOE00000120416,288439,298458,zgc:63972,ENSAPOP00000020934,288439,298458,,
-6,ENSAPOE00000000170,290637,294901,,ENSAPOP00000020929,290637,294901,,
-7,ENSAPOE00000000171,290637,294901,,ENSAPOP00000020929,290637,294901,,
-8,ENSAPOE00000000174,290637,294901,,ENSAPOP00000020929,290637,294901,,
-9,ENSAPOE00000000177,290637,294901,,ENSAPOP00000020929,290637,294901,,
-10,ENSAPOE00000000280,290637,294901,,ENSAPOP00000020929,290637,294901,,
\ No newline at end of file
+,Transcript stable ID,Exon stable ID,Gene start (bp),Gene end (bp),Transcript start (bp),Transcript end (bp),CDS start,CDS end,aa_start,aa_stop
+0,ENSAPOT00000017612,ENSAPOE00000120411,288439,298458,288439,298458,1.0,105.0,1,35
+1,ENSAPOT00000017612,ENSAPOE00000120412,288439,298458,288439,298458,106.0,254.0,36,85
+2,ENSAPOT00000017612,ENSAPOE00000120413,288439,298458,288439,298458,255.0,314.0,85,105
+3,ENSAPOT00000017612,ENSAPOE00000120414,288439,298458,288439,298458,315.0,360.0,105,120
+4,ENSAPOT00000017612,ENSAPOE00000120415,288439,298458,288439,298458,361.0,410.0,121,137
+5,ENSAPOT00000017612,ENSAPOE00000120416,288439,298458,288439,298458,411.0,513.0,137,171
+6,ENSAPOT00000017559,ENSAPOE00000000170,290637,294901,290637,294901,1.0,47.0,1,16
+7,ENSAPOT00000017559,ENSAPOE00000000171,290637,294901,290637,294901,48.0,103.0,16,35
+8,ENSAPOT00000017559,ENSAPOE00000000174,290637,294901,290637,294901,104.0,165.0,35,55
+9,ENSAPOT00000017559,ENSAPOE00000000177,290637,294901,290637,294901,166.0,303.0,56,101
+10,ENSAPOT00000017559,ENSAPOE00000000280,290637,294901,290637,294901,304.0,380.0,102,127
+11,ENSAPOT00000017559,ENSAPOE00000120530,290637,294901,290637,294901,381.0,396.0,127,132
+12,ENSAPOT00000017555,ENSAPOE00000000178,310862,317808,310862,317808,499.0,646.0,167,216
+13,ENSAPOT00000017555,ENSAPOE00000000181,310862,317808,310862,317808,647.0,819.0,216,273
+14,ENSAPOT00000017555,ENSAPOE00000000198,310862,317808,310862,317808,161.0,347.0,54,116
+15,ENSAPOT00000017555,ENSAPOE00000000201,310862,317808,310862,317808,348.0,498.0,116,166
+16,ENSAPOT00000017555,ENSAPOE00000000406,310862,317808,310862,317808,958.0,1085.0,320,362
\ No newline at end of file
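
Editor's note (illustration, not part of the patch): the new `aa_start`/`aa_stop` columns in the CSV come from the conversion added in `query()` and `calculate_aa_positions()`, which maps a CDS nucleotide position to an amino-acid position as `ceil(cds / 3)`, since one residue spans three coding nucleotides. A minimal standalone sketch of that arithmetic, using sample values copied from the first rows of the updated CSV; the DataFrame here is hand-built for illustration, not a BioMart response:

```python
import math
import pandas as pd

# Sample exon records copied from the first rows of the updated
# Hs_ProteinCoordinates_build_100_38.csv (transcript ENSAPOT00000017612).
records = pd.DataFrame({
    "CDS start": [1.0, 106.0, 255.0],
    "CDS end": [105.0, 254.0, 314.0],
})

# Same arithmetic as the added query() code: amino-acid position = ceil(CDS position / 3).
records["aa_start"] = records["CDS start"].astype(int).apply(lambda x: math.ceil(x / 3))
records["aa_stop"] = records["CDS end"].astype(int).apply(lambda x: math.ceil(x / 3))

print(records)
# CDS 1-105   -> aa 1-35
# CDS 106-254 -> aa 36-85
# CDS 255-314 -> aa 85-105  (matches rows 0-2 of the CSV above)
```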