Ps proteindna annotations 2 #10

Open · wants to merge 17 commits into master

Changes from all commits
2 changes: 1 addition & 1 deletion .gitignore
@@ -1,5 +1,5 @@
**/.DS_Store

.vscode/
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
2 changes: 1 addition & 1 deletion altanalyze3/bin/altanalyze3
@@ -12,4 +12,4 @@ def main(args=None):


if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))
sys.exit(main(sys.argv[1:]))
@@ -2,16 +2,16 @@
This is a generalized Python module for getting data from Ensembl using the BioMart server.
"""

from __future__ import absolute_import, division, print_function
import requests

from future.utils import native_str
from builtins import *
from xml.etree import ElementTree
import pandas as pd
from io import StringIO
from xml.etree.ElementTree import fromstring as xml_from_string

import math
import logging
from altanalyze3.utilities.helpers import (
TimeIt
)

DEFAULT_HOST = 'http://www.biomart.org'
DEFAULT_PATH = '/biomart/martservice'
@@ -26,7 +26,6 @@ class ServerBase(object):
path (str): Path to the biomart service on the host.
port (str): Port to connect to on the host.
url (str): Url used to connect to the biomart service.
use_cache (bool): Whether to cache requests to biomart.
"""

def __init__(self, host=None, path=None, port=None):
@@ -38,9 +37,9 @@ def __init__(self, host=None, path=None, port=None):
use_cache (bool): Whether to cache requests.
"""
# Use defaults if arg is None.
host = host or DEFAULT_HOST
path = path or DEFAULT_PATH
port = port or DEFAULT_PORT
host = DEFAULT_HOST if host is None else host
path = DEFAULT_PATH if path is None else path
port = DEFAULT_PORT if port is None else port

# Add http prefix and remove trailing slash.
host = self._add_http_prefix(host)
@@ -82,8 +81,7 @@ def _add_http_prefix(url, prefix='http://'):

@staticmethod
def _remove_trailing_slash(url):
if url.endswith('/'):
url = url[:-1]
url = url.rstrip("/")
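# e.g. "http://www.ensembl.org/" -> "http://www.ensembl.org"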
return url

def get(self, **params):
@@ -121,22 +119,7 @@ class Dataset(ServerBase):
host (str): Url of host to connect to.
path (str): Path on the host to access to the biomart service.
port (int): Port to use for the connection.
use_cache (bool): Whether to cache requests.
virtual_schema (str): The virtual schema of the dataset.
Examples:
Directly connecting to a dataset:
>>> dataset = Dataset(name='hsapiens_gene_ensembl',
>>> host='http://www.ensembl.org')
Querying the dataset:
>>> dataset.query(attributes=['ensembl_gene_id',
>>> 'external_gene_name'],
>>> filters={'chromosome_name': ['1','2']})
Listing available attributes:
>>> dataset.attributes
>>> dataset.list_attributes()
Listing available filters:
>>> dataset.filters
>>> dataset.list_filters()
"""

def __init__(self,
@@ -145,7 +128,7 @@ def __init__(self,
host=None,
path=None,
port=None,
virtual_schema=DEFAULT_SCHEMA):
virtual_schema=DEFAULT_SCHEMA, location=None):
super().__init__(host=host, path=path, port=port)

self._name = name
@@ -155,6 +138,7 @@ def __init__(self,
self._attributes = None
self._default_attributes = None
self._datatype = None
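# location is presumably a pathlib.Path; query() uses it to derive the CSV output path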
self.location = location

@property
def name(self):
@@ -258,13 +242,20 @@ def _attributes_from_xml(xml):
description=attrib.get('description', ''),
default=default)

# Called once per exon of a transcript: converts a CDS nucleotide
# position into the corresponding amino-acid position.
def calculate_aa_positions(self, cds_pos):
# Each codon spans three nucleotides, so round up.
aa_position = math.ceil(cds_pos / 3)
return aa_position
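# For example, CDS position 254 maps to amino acid 85
# (math.ceil(254 / 3) == 85), matching aa_stop in the regenerated CSV below.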

def query(self,
attributes=None,
filters=None,
only_unique=True,
use_attr_names=False,
dtypes=None,
datatype=None
datatype=None,
):
"""Queries the dataset to retrieve the contained data.
Args:
@@ -314,6 +305,8 @@ def query(self,
dataset.set('name', self.name)
dataset.set('interface', 'default')

csv_location = self.location.with_suffix(".csv")
logging.info(f"Saving protein coordinates to {csv_location}")
# Default to default attributes if none requested.
if attributes is None:
attributes = list(self.default_attributes.keys())
@@ -350,13 +343,17 @@ def query(self,
try:
result = pd.read_csv(StringIO(response.text),
sep='\t', dtype=dtypes)
if (datatype == "protein_coordinates"):
result.to_csv(
'Hs_ProteinCoordinates_build_100_38.csv', sep='\t')
elif(datatype == "protein_feature"):
result.to_csv(
'Hs_ProteinFeatures_build_100_38.csv', sep='\t')
# A TypeError is raised if a data type is not understood by pandas
# Calculate the amino-acid start and end positions from the CDS coordinates
result = result.dropna(subset=['CDS start', 'CDS end'])
cds_start = result['CDS start'].astype(int)
cds_stop = result['CDS end'].astype(int)
result["aa_start"] = cds_start.apply(self.calculate_aa_positions)
result["aa_stop"] = cds_stop.apply(self.calculate_aa_positions)
result.to_csv(csv_location, sep='\t')

# A TypeError is raised if a data type is not understood by pandas
except TypeError as err:
raise ValueError("Invalid data type in dtypes") from err

@@ -367,7 +364,6 @@ def query(self,
for attr in attributes
}
result.rename(columns=column_map, inplace=True)

return result

@staticmethod
@@ -490,14 +486,9 @@ def __repr__(self):
.format(self.name, self.type))


dataset = Dataset(name='apolyacanthus_gene_ensembl',
host='http://www.ensembl.org')

# Protein Coordinates
dataset.query(attributes=["ensembl_transcript_id", "ensembl_exon_id", "ensembl_peptide_id", "start_position",
"end_position", "transcript_start", "transcript_end", "cdd", "cdd_start", "cdd_end"], datatype='protein_coordinates')


# Protein Features
dataset.query(attributes=["ensembl_gene_id", "ensembl_gene_id_version", "ensembl_transcript_id_version",
"interpro", "interpro_description", "interpro_start", "interpro_end", "cdd", "cdd_start", "cdd_end"], datatype='protein_feature')
def protein_coordinates(args):
with TimeIt():
# args.output comes from the common CLI arguments and supplies the CSV location
dataset = Dataset(name=args.name, host=args.host, location=args.output)
logging.info(f"Getting data from {args.host} for species {args.name}")
dataset.query(attributes=args.attributes)
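
A minimal sketch of how this entry point might be driven once the CLI flags are wired up; the Namespace fields mirror the commented-out parser arguments in parser.py below, and the output path is illustrative:

from argparse import Namespace
from pathlib import Path

args = Namespace(
    name="apolyacanthus_gene_ensembl",
    host="https://www.ensembl.org",
    attributes=["ensembl_transcript_id", "ensembl_exon_id",
                "cds_start", "cds_end"],
    output=Path("results/apolyacanthus_protein_coordinates"),
)
protein_coordinates(args)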
36 changes: 34 additions & 2 deletions altanalyze3/utilities/parser.py
@@ -5,6 +5,7 @@
from altanalyze3.utilities.helpers import get_version
from altanalyze3.components.intron_count.main import count_introns
from altanalyze3.components.junction_count.main import count_junctions
# from altanalyze3.components.junction_count.main import protein_coordinates
from altanalyze3.utilities.io import get_all_bam_chr
from altanalyze3.utilities.constants import (
IntRetCat,
@@ -52,7 +53,7 @@ def get_parser(self):
subparsers = general_parser.add_subparsers()
subparsers.required = True
# Global parameters for all components of the tool
general_parser.add_argument(
general_parser.add_argument(
"--version",
action="version",
version=get_version(),
@@ -150,6 +151,37 @@ def get_parser(self):
help="Export processed reads into the BAM file. Default: False",
action="store_true"
)
self.add_common_arguments(intron_parser)

# Protein Domain Annotation parser
protein_coordinates_parser = subparsers.add_parser(
"proteincoordinates",
parents=[parent_parser],
help="Get Protein to Domain annotations"
)
# TODO
# protein_coordinates_parser.set_defaults(func=protein_coordinates)
# protein_coordinates_parser.add_argument(
# "--name",
# help="name of species eg. apolyacanthus_gene_ensembl",
# type=str,
# required=True,
# )
# protein_coordinates_parser.add_argument(
# "--host",
# help="Select the host from where you want to import data",
# type=str,
# default="https://www.ensembl.org"
# )
# protein_coordinates_parser.add_argument(
# "--attributes",
# help="Export certain coordinates or features from Ensembl",
# nargs="*",
# default=["ensembl_transcript_id", "ensembl_exon_id", "ensembl_peptide_id", "start_position",
# "end_position", "transcript_start", "transcript_end", "cds_start", "cds_end"]
# )
# self.add_common_arguments(protein_coordinates_parser)

self.add_common_arguments(junction_parser)
return general_parser
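
Assuming the commented-out flags above get enabled, the new subcommand would be invoked along these lines (flag names are taken from the commented block and may change before merge):

altanalyze3 proteincoordinates --name apolyacanthus_gene_ensembl --host https://www.ensembl.org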

@@ -200,4 +232,4 @@ def assert_common_args(self):
self.args.output.parent.mkdir(parents=True, exist_ok=True) # safety measure, shouldn't fail
self.args.chr = get_all_bam_chr(self.args.bam, self.args.threads) \
if len(self.args.chr) == 0 else [c if c.startswith("chr") else f"chr{c}" for c in self.args.chr]
self.args.loglevel = getattr(logging, self.args.loglevel.upper())
self.args.loglevel = getattr(logging, self.args.loglevel.upper())
30 changes: 18 additions & 12 deletions docs/Hs_ProteinCoordinates_build_100_38.csv
@@ -1,12 +1,18 @@
,Exon stable ID,Gene start (bp),Gene end (bp),Gene name,Protein stable ID,Transcript start (bp),Transcript end (bp),CDD start,CDD end
0,ENSAPOE00000120411,288439,298458,zgc:63972,ENSAPOP00000020934,288439,298458,,
1,ENSAPOE00000120412,288439,298458,zgc:63972,ENSAPOP00000020934,288439,298458,,
2,ENSAPOE00000120413,288439,298458,zgc:63972,ENSAPOP00000020934,288439,298458,,
3,ENSAPOE00000120414,288439,298458,zgc:63972,ENSAPOP00000020934,288439,298458,,
4,ENSAPOE00000120415,288439,298458,zgc:63972,ENSAPOP00000020934,288439,298458,,
5,ENSAPOE00000120416,288439,298458,zgc:63972,ENSAPOP00000020934,288439,298458,,
6,ENSAPOE00000000170,290637,294901,,ENSAPOP00000020929,290637,294901,,
7,ENSAPOE00000000171,290637,294901,,ENSAPOP00000020929,290637,294901,,
8,ENSAPOE00000000174,290637,294901,,ENSAPOP00000020929,290637,294901,,
9,ENSAPOE00000000177,290637,294901,,ENSAPOP00000020929,290637,294901,,
10,ENSAPOE00000000280,290637,294901,,ENSAPOP00000020929,290637,294901,,
,Transcript stable ID,Exon stable ID,Gene start (bp),Gene end (bp),Transcript start (bp),Transcript end (bp),CDS start,CDS end,aa_start,aa_stop
0,ENSAPOT00000017612,ENSAPOE00000120411,288439,298458,288439,298458,1.0,105.0,1,35
1,ENSAPOT00000017612,ENSAPOE00000120412,288439,298458,288439,298458,106.0,254.0,36,85
2,ENSAPOT00000017612,ENSAPOE00000120413,288439,298458,288439,298458,255.0,314.0,85,105
3,ENSAPOT00000017612,ENSAPOE00000120414,288439,298458,288439,298458,315.0,360.0,105,120
4,ENSAPOT00000017612,ENSAPOE00000120415,288439,298458,288439,298458,361.0,410.0,121,137
5,ENSAPOT00000017612,ENSAPOE00000120416,288439,298458,288439,298458,411.0,513.0,137,171
6,ENSAPOT00000017559,ENSAPOE00000000170,290637,294901,290637,294901,1.0,47.0,1,16
7,ENSAPOT00000017559,ENSAPOE00000000171,290637,294901,290637,294901,48.0,103.0,16,35
8,ENSAPOT00000017559,ENSAPOE00000000174,290637,294901,290637,294901,104.0,165.0,35,55
9,ENSAPOT00000017559,ENSAPOE00000000177,290637,294901,290637,294901,166.0,303.0,56,101
10,ENSAPOT00000017559,ENSAPOE00000000280,290637,294901,290637,294901,304.0,380.0,102,127
11,ENSAPOT00000017559,ENSAPOE00000120530,290637,294901,290637,294901,381.0,396.0,127,132
12,ENSAPOT00000017555,ENSAPOE00000000178,310862,317808,310862,317808,499.0,646.0,167,216
13,ENSAPOT00000017555,ENSAPOE00000000181,310862,317808,310862,317808,647.0,819.0,216,273
14,ENSAPOT00000017555,ENSAPOE00000000198,310862,317808,310862,317808,161.0,347.0,54,116
15,ENSAPOT00000017555,ENSAPOE00000000201,310862,317808,310862,317808,348.0,498.0,116,166
16,ENSAPOT00000017555,ENSAPOE00000000406,310862,317808,310862,317808,958.0,1085.0,320,362
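
The new aa_start and aa_stop columns follow the ceil-division rule from calculate_aa_positions; a quick sanity check against the first regenerated row (CDS 1.0 to 105.0), as a sketch:

import math

cds_start, cds_end = 1, 105           # first exon of ENSAPOT00000017612
assert math.ceil(cds_start / 3) == 1  # aa_start column
assert math.ceil(cds_end / 3) == 35   # aa_stop column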