From 8f301498d6c3e75e1b4538ce7129ff1346034c8b Mon Sep 17 00:00:00 2001
From: Preeti
Date: Wed, 1 Jun 2022 16:04:10 -0400
Subject: [PATCH 01/16] add amino acid nucleotide position calculation

---
 .../components/annotation/ensembl_biomart.py     | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/altanalyze3/components/annotation/ensembl_biomart.py b/altanalyze3/components/annotation/ensembl_biomart.py
index 3d36cbc..54a1cb5 100644
--- a/altanalyze3/components/annotation/ensembl_biomart.py
+++ b/altanalyze3/components/annotation/ensembl_biomart.py
@@ -2,11 +2,9 @@
 This is a generalized python module for getting data from Ensemble using Biomart server.
 """
 
-from __future__ import absolute_import, division, print_function
-import requests
+import requests
 from future.utils import native_str
-from builtins import *
 from xml.etree import ElementTree
 import pandas as pd
 from io import StringIO
@@ -370,6 +368,18 @@ def query(self,
 
         return result
 
+    # on loop for each exon in one transcript
+    # by default initialize the first aa start, aa_nt_start = 1
+    def calculate_aa_positions(enst_id_new, enst_id_old, cds_start, cds_stop):
+        # check if new transcript
+        aa_stop = math.ceil((cds_stop - cds_start + 1) / 3)
+        if enst_id_new != enst_id_old:
+            aa_start = 1
+        # check if the last codon has less than three nucleotides
+        elif (cds_stop - cds_start + 1) % 3 != 0:
+            aa_start = aa_stop
+        return aa_stop, aa_start
+
     @staticmethod
     def _add_attr_node(root, attr):
         attr_el = ElementTree.SubElement(root, 'Attribute')

From e9df4df93c16899fc0a5fa9f3d5487f4e9103847 Mon Sep 17 00:00:00 2001
From: Preeti
Date: Wed, 1 Jun 2022 16:18:25 -0400
Subject: [PATCH 02/16] code review changes - use strip to remove trailing
 slash in url

---
 altanalyze3/components/annotation/ensembl_biomart.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/altanalyze3/components/annotation/ensembl_biomart.py b/altanalyze3/components/annotation/ensembl_biomart.py
index 54a1cb5..4eebf4e 100644
--- a/altanalyze3/components/annotation/ensembl_biomart.py
+++ b/altanalyze3/components/annotation/ensembl_biomart.py
@@ -80,8 +80,7 @@ def _add_http_prefix(url, prefix='http://'):
 
     @staticmethod
     def _remove_trailing_slash(url):
-        if url.endswith('/'):
-            url = url[:-1]
+        url = url.strip("/")
         return url
 
     def get(self, **params):

From 18f2e29de6af1151d4847f384bddd0c4737618ef Mon Sep 17 00:00:00 2001
From: Preeti
Date: Wed, 1 Jun 2022 18:01:39 -0400
Subject: [PATCH 03/16] check if the value of host is None

---
 altanalyze3/components/annotation/ensembl_biomart.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/altanalyze3/components/annotation/ensembl_biomart.py b/altanalyze3/components/annotation/ensembl_biomart.py
index 4eebf4e..ab00aa5 100644
--- a/altanalyze3/components/annotation/ensembl_biomart.py
+++ b/altanalyze3/components/annotation/ensembl_biomart.py
@@ -36,9 +36,9 @@ def __init__(self, host=None, path=None, port=None):
             use_cache (bool): Whether to cache requests.
         """
         # Use defaults if arg is None.
-        host = host or DEFAULT_HOST
-        path = path or DEFAULT_PATH
-        port = port or DEFAULT_PORT
+        host = DEFAULT_HOST if host is None else host
+        path = DEFAULT_PATH if path is None else path
+        port = DEFAULT_PORT if port is None else port
 
         # Add http prefix and remove trailing slash.
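        # (str.strip returns a new string rather than mutating in place,
        # hence the assignment inside _remove_trailing_slash)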
host = self._add_http_prefix(host) From a59eda97117fafe35acca613731759adcd0d4fe2 Mon Sep 17 00:00:00 2001 From: Preeti Date: Thu, 2 Jun 2022 10:55:17 -0400 Subject: [PATCH 04/16] add a function to parse args --- .../components/annotation/ensembl_biomart.py | 512 ------------------ 1 file changed, 512 deletions(-) delete mode 100644 altanalyze3/components/annotation/ensembl_biomart.py diff --git a/altanalyze3/components/annotation/ensembl_biomart.py b/altanalyze3/components/annotation/ensembl_biomart.py deleted file mode 100644 index ab00aa5..0000000 --- a/altanalyze3/components/annotation/ensembl_biomart.py +++ /dev/null @@ -1,512 +0,0 @@ -""" -This is a generalized python module for getting data from Ensemble using Biomart server. -""" - - -import requests -from future.utils import native_str -from xml.etree import ElementTree -import pandas as pd -from io import StringIO -from xml.etree.ElementTree import fromstring as xml_from_string - - -DEFAULT_HOST = 'http://www.biomart.org' -DEFAULT_PATH = '/biomart/martservice' -DEFAULT_PORT = 80 -DEFAULT_SCHEMA = 'default' - - -class ServerBase(object): - """Base class that handles requests to the biomart server. - Attributes: - host (str): Host to connect to for the biomart service. - path (str): Path to the biomart service on the host. - port (str): Port to connect to on the host. - url (str): Url used to connect to the biomart service. - use_cache (bool): Whether to cache requests to biomart. - """ - - def __init__(self, host=None, path=None, port=None): - """ServerBase constructor. - Args: - host (str): Url of host to connect to. - path (str): Path on the host to access to the biomart service. - port (int): Port to use for the connection. - use_cache (bool): Whether to cache requests. - """ - # Use defaults if arg is None. - host = DEFAULT_HOST if host is None else host - path = DEFAULT_PATH if path is None else path - port = DEFAULT_PORT if port is None else port - - # Add http prefix and remove trailing slash. - host = self._add_http_prefix(host) - host = self._remove_trailing_slash(host) - - # Ensure path starts with slash. - if not path.startswith('/'): - path = '/' + path - - self._host = host - self._path = path - self._port = port - - @property - def host(self): - """Host to connect to for the biomart service.""" - return self._host - - @property - def path(self): - """Path to the biomart service on the host.""" - return self._path - - @property - def port(self): - """Port to connect to on the host.""" - return self._port - - @property - def url(self): - """Url used to connect to the biomart service.""" - return '{}:{}{}'.format(self._host, self._port, self._path) - - @staticmethod - def _add_http_prefix(url, prefix='http://'): - if not url.startswith('http://') or url.startswith('https://'): - url = prefix + url - return url - - @staticmethod - def _remove_trailing_slash(url): - url.strip("/") - return url - - def get(self, **params): - """Performs get request to the biomart service. - Args: - **params (dict of str: any): Arbitrary keyword arguments, which - are added as parameters to the get request to biomart. - Returns: - requests.models.Response: Response from biomart for the request. - """ - - r = requests.get(self.url, params=params) - r.raise_for_status() - return r - - -class BiomartException(Exception): - """Basic exception class for biomart exceptions.""" - pass - - -class Dataset(ServerBase): - """Class representing a biomart dataset. - This class is responsible for handling queries to biomart - datasets. 
Queries can select a subset of attributes and can be filtered - using any available filters. A list of valid attributes is available in - the attributes property. If no attributes are given, a set of default - attributes is used. A list of valid filters is available in the filters - property. The type of value that can be specified for a given filter - depends on the filter as some filters accept single values, whilst others - can take lists of values. - Args: - name (str): Id of the dataset. - display_name (str): Display name of the dataset. - host (str): Url of host to connect to. - path (str): Path on the host to access to the biomart service. - port (int): Port to use for the connection. - use_cache (bool): Whether to cache requests. - virtual_schema (str): The virtual schema of the dataset. - Examples: - Directly connecting to a dataset: - >>> dataset = Dataset(name='hsapiens_gene_ensembl', - >>> host='http://www.ensembl.org') - Querying the dataset: - >>> dataset.query(attributes=['ensembl_gene_id', - >>> 'external_gene_name'], - >>> filters={'chromosome_name': ['1','2']}) - Listing available attributes: - >>> dataset.attributes - >>> dataset.list_attributes() - Listing available filters: - >>> dataset.filters - >>> dataset.list_filters() - """ - - def __init__(self, - name, - display_name='', - host=None, - path=None, - port=None, - virtual_schema=DEFAULT_SCHEMA): - super().__init__(host=host, path=path, port=port) - - self._name = name - self._display_name = display_name - self._virtual_schema = virtual_schema - self._filters = None - self._attributes = None - self._default_attributes = None - self._datatype = None - - @property - def name(self): - """Name of the dataset (used as dataset id).""" - return self._name - - @property - def display_name(self): - """Display name of the dataset.""" - return self._display_name - - @property - def filters(self): - """List of filters available for the dataset.""" - if self._filters is None: - self._filters, self._attributes = self._fetch_configuration() - return self._filters - - @property - def attributes(self): - """List of attributes available for the dataset (cached).""" - if self._attributes is None: - self._filters, self._attributes = self._fetch_configuration() - return self._attributes - - @property - def default_attributes(self): - """List of default attributes for the dataset.""" - if self._default_attributes is None: - self._default_attributes = { - name: attr - for name, attr in self.attributes.items() - if attr.default is True - } - return self._default_attributes - - def list_attributes(self): - """Lists available attributes in a readable DataFrame format. - Returns: - pd.DataFrame: Frame listing available attributes. - """ - - def _row_gen(attributes): - for attr in attributes.values(): - yield (attr.name, attr.display_name, attr.description) - - return pd.DataFrame.from_records( - _row_gen(self.attributes), - columns=['name', 'display_name', 'description']) - - def list_filters(self): - """Lists available filters in a readable DataFrame format. - Returns: - pd.DataFrame: Frame listing available filters. - """ - - def _row_gen(attributes): - for attr in attributes.values(): - yield (attr.name, attr.type, attr.description) - - return pd.DataFrame.from_records( - _row_gen(self.filters), columns=['name', 'type', 'description']) - - def _fetch_configuration(self): - # Get datasets using biomart. - response = self.get(type='configuration', dataset=self._name) - - # Check response for problems. 
- if 'Problem retrieving configuration' in response.text: - raise BiomartException('Failed to retrieve dataset configuration, ' - 'check the dataset name and schema.') - - # Get filters and attributes from xml. - xml = ElementTree.fromstring(response.content) - - filters = {f.name: f for f in self._filters_from_xml(xml)} - attributes = {a.name: a for a in self._attributes_from_xml(xml)} - - return filters, attributes - - @staticmethod - def _filters_from_xml(xml): - for node in xml.iter('FilterDescription'): - attrib = node.attrib - yield Filter( - name=attrib['internalName'], type=attrib.get('type', '')) - - @staticmethod - def _attributes_from_xml(xml): - for page_index, page in enumerate(xml.iter('AttributePage')): - for desc in page.iter('AttributeDescription'): - attrib = desc.attrib - - # Default attributes can only be from the first page. - default = (page_index == 0 and - attrib.get('default', '') == 'true') - - yield Attribute( - name=attrib['internalName'], - display_name=attrib.get('displayName', ''), - description=attrib.get('description', ''), - default=default) - - def query(self, - attributes=None, - filters=None, - only_unique=True, - use_attr_names=False, - dtypes=None, - datatype=None - ): - """Queries the dataset to retrieve the contained data. - Args: - attributes (list[str]): Names of attributes to fetch in query. - Attribute names must correspond to valid attributes. See - the attributes property for a list of valid attributes. - filters (dict[str,any]): Dictionary of filters --> values - to filter the dataset by. Filter names and values must - correspond to valid filters and filter values. See the - filters property for a list of valid filters. - only_unique (bool): Whether to return only rows containing - unique values (True) or to include duplicate rows (False). - use_attr_names (bool): Whether to use the attribute names - as column names in the result (True) or the attribute - display names (False). - dtypes (dict[str,any]): Dictionary of attributes --> data types - to describe to pandas how the columns should be handled - Returns: - pandas.DataFrame: DataFrame containing the query results. - """ - - # Example query from Ensembl biomart: - # - # - # - # - # - # - # - # - # - # - # - # - - # Setup query element. - root = ElementTree.Element('Query') - root.set('virtualSchemaName', self._virtual_schema) - root.set('formatter', 'TSV') - root.set('header', '1') - root.set('uniqueRows', native_str(int(only_unique))) - root.set('datasetConfigVersion', '0.6') - - # Add dataset element. - dataset = ElementTree.SubElement(root, 'Dataset') - dataset.set('name', self.name) - dataset.set('interface', 'default') - - # Default to default attributes if none requested. - if attributes is None: - attributes = list(self.default_attributes.keys()) - - # Add attribute elements. - for name in attributes: - try: - attr = self.attributes[name] - self._add_attr_node(dataset, attr) - except KeyError: - raise BiomartException( - 'Unknown attribute {}, check dataset attributes ' - 'for a list of valid attributes.'.format(name)) - - if filters is not None: - # Add filter elements. - for name, value in filters.items(): - try: - filter_ = self.filters[name] - self._add_filter_node(dataset, filter_, value) - except KeyError: - raise BiomartException( - 'Unknown filter {}, check dataset filters ' - 'for a list of valid filters.'.format(name)) - - # Fetch response. - response = self.get(query=ElementTree.tostring(root)) - - # Raise exception if an error occurred. 
- if 'Query ERROR' in response.text: - raise BiomartException(response.text) - - # Parse results into a DataFrame. - try: - result = pd.read_csv(StringIO(response.text), - sep='\t', dtype=dtypes) - if (datatype == "protein_coordinates"): - result.to_csv( - 'Hs_ProteinCoordinates_build_100_38.csv', sep='\t') - elif(datatype == "protein_feature"): - result.to_csv( - 'Hs_ProteinFeatures_build_100_38.csv', sep='\t') - # Type error is raised of a data type is not understood by pandas - except TypeError as err: - raise ValueError("Non valid data type is used in dtypes") - - if use_attr_names: - # Rename columns with attribute names instead of display names. - column_map = { - self.attributes[attr].display_name: attr - for attr in attributes - } - result.rename(columns=column_map, inplace=True) - - return result - - # on loop for each exon in one transcript - # by default initialize the first aa start, aa_nt_start = 1 - def calculate_aa_positions(enst_id_new, enst_id_old, cds_start, cds_stop): - # check if new transcript - aa_stop = math.ceil((cds_stop - cds_start + 1) / 3) - if enst_id_new != enst_id_old: - aa_start = 1 - # check if the last codon has less than three neucleotides - elif (cds_stop - cds_start + 1) % 3 != 0: - aa_start = aa_stop - return aa_stop, aa_start - - @staticmethod - def _add_attr_node(root, attr): - attr_el = ElementTree.SubElement(root, 'Attribute') - attr_el.set('name', attr.name) - - @staticmethod - def _add_filter_node(root, filter_, value): - """Adds filter xml node to root.""" - filter_el = ElementTree.SubElement(root, 'Filter') - filter_el.set('name', filter_.name) - - # Set filter value depending on type. - if filter_.type == 'boolean': - # Boolean case. - if value is True or value.lower() in {'included', 'only'}: - filter_el.set('excluded', '0') - elif value is False or value.lower() == 'excluded': - filter_el.set('excluded', '1') - else: - raise ValueError('Invalid value for boolean filter ({})' - .format(value)) - elif isinstance(value, list) or isinstance(value, tuple): - # List case. - filter_el.set('value', ','.join(map(str, value))) - else: - # Default case. - filter_el.set('value', str(value)) - - def __repr__(self): - return ('' - .format(self._name, self._display_name)) - - -class Attribute(object): - """Biomart dataset attribute. - Attributes: - name (str): Attribute name. - display_name (str): Attribute display name. - description (str): Attribute description. - """ - - def __init__(self, name, display_name='', description='', default=False): - """Attribute constructor. - Args: - name (str): Attribute name. - display_name (str): Attribute display name. - description (str): Attribute description. - default (bool): Whether the attribute is a default - attribute of the corresponding datasets. - """ - self._name = name - self._display_name = display_name - self._description = description - self._default = default - - @property - def name(self): - """Name of the attribute.""" - return self._name - - @property - def display_name(self): - """Display name of the attribute.""" - return self._display_name - - @property - def description(self): - """Description of the attribute.""" - return self._description - - @property - def default(self): - """Whether this is a default attribute.""" - return self._default - - def __repr__(self): - return (('') - .format(self._name, self._display_name, self._description)) - - -class Filter(object): - """Biomart dataset filter. - Attributes: - name (str): Filter name. - type (str): Type of the filter (boolean, int, etc.). 
- description (str): Filter description. - """ - - def __init__(self, name, type, description=''): - """ Filter constructor. - Args: - name (str): Filter name. - type (str): Type of the filter (boolean, int, etc.). - description (str): Filter description. - """ - self._name = name - self._type = type - self._description = description - - @property - def name(self): - """Filter name.""" - return self._name - - @property - def type(self): - """Filter type.""" - return self._type - - @property - def description(self): - """Filter description.""" - return self._description - - def __repr__(self): - return ('' - .format(self.name, self.type)) - - -dataset = Dataset(name='apolyacanthus_gene_ensembl', - host='http://www.ensembl.org') - -# Protein Coordinates -dataset.query(attributes=["ensembl_transcript_id", "ensembl_exon_id", "ensembl_peptide_id", "start_position", - "end_position", "transcript_start", "transcript_end", "cdd", "cdd_start", "cdd_end"], datatype='protein_coordinates') - - -# Protein Features -dataset.query(attributes=["ensembl_gene_id", "ensembl_gene_id_version", "ensembl_transcript_id_version", - "interpro", "interpro_description", "interpro_start", "interpro_end", "cdd", "cdd_start", "cdd_end"], datatype='protein_feature') From c18c8607d441a48b38b1413adf342574d0cca10b Mon Sep 17 00:00:00 2001 From: Preeti Date: Thu, 2 Jun 2022 10:55:42 -0400 Subject: [PATCH 05/16] add args for getting protein coordinates --- altanalyze3/utilities/parser.py | 37 ++++++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/altanalyze3/utilities/parser.py b/altanalyze3/utilities/parser.py index 45246d8..8c2dc5d 100644 --- a/altanalyze3/utilities/parser.py +++ b/altanalyze3/utilities/parser.py @@ -4,6 +4,7 @@ from altanalyze3.utilities.helpers import get_version from altanalyze3.components.junction_count.main import count_junctions from altanalyze3.components.intron_count.main import count_introns +from altanalyze3.components.annotation.main import protein_coordinates from altanalyze3.utilities.io import get_all_bam_chr from altanalyze3.utilities.constants import IntRetCat @@ -25,7 +26,8 @@ def set_args_as_attributes(self): def add_common_arguments(self, parser): self.common_arguments = [ - ("--loglevel", "Logging level. Default: info", str, "info", ["fatal", "error", "warning", "info", "debug"]), + ("--loglevel", "Logging level. Default: info", str, + "info", ["fatal", "error", "warning", "info", "debug"]), ("--threads", "Number of threads to run in parallel where applicable", int, 1, None), ("--cpus", "Number of processes to run in parallel where applicable", int, 1, None), ("--output", "Output prefix", str, "results", None) @@ -49,7 +51,7 @@ def get_parser(self): subparsers = general_parser.add_subparsers() subparsers.required = True # Global parameters for all components of the tool - general_parser.add_argument( + general_parser.add_argument( "--version", action="version", version=get_version(), @@ -147,6 +149,35 @@ def get_parser(self): action="store_true" ) self.add_common_arguments(intron_parser) + + # Protein Domain Annotation parser + protein_coordinates_parser = subparsers.add_parser( + "proteincoordinates", + parents=[parent_parser], + help="Get Protein to Domain annotations" + ) + protein_coordinates_parser.set_defaults(func=get_protein_coordinates) + protein_coordinates_parser.add_argument( + "--name", + help="name of species eg. 
apolyacanthus_gene_ensembl", + type=str, + required=True, + ) + protein_coordinates_parser.add_argument( + "--host", + help="Select the host from where you want to import data", + type=str, + default="http://www.ensembl.org" + ) + protein_coordinates_parser.add_argument( + "--attributes", + help="Export certain coordinates or features from Ensembl", + type=str, + default=["ensembl_transcript_id", "ensembl_exon_id", "ensembl_peptide_id", "start_position", + "end_position", "transcript_start", "transcript_end", "cdd", "cdd_start", "cdd_end"] + ) + self.add_common_arguments(protein_coordinates_parser) + return general_parser def resolve_path(self, selected=None): @@ -193,4 +224,4 @@ def assert_args_for_count_introns(self): def assert_common_args(self): self.args.chr = get_all_bam_chr(self.args.bam, self.args.threads) \ if len(self.args.chr) == 0 else [c if c.startswith("chr") else f"chr{c}" for c in self.args.chr] - self.args.loglevel = getattr(logging, self.args.loglevel.upper()) \ No newline at end of file + self.args.loglevel = getattr(logging, self.args.loglevel.upper()) From 03ed5a68642f6120592ad62fca38aace988a8a29 Mon Sep 17 00:00:00 2001 From: Preeti Date: Thu, 2 Jun 2022 10:56:08 -0400 Subject: [PATCH 06/16] rename the file to main --- altanalyze3/components/annotation/main.py | 511 ++++++++++++++++++++++ 1 file changed, 511 insertions(+) create mode 100644 altanalyze3/components/annotation/main.py diff --git a/altanalyze3/components/annotation/main.py b/altanalyze3/components/annotation/main.py new file mode 100644 index 0000000..c9c944c --- /dev/null +++ b/altanalyze3/components/annotation/main.py @@ -0,0 +1,511 @@ +""" +This is a generalized python module for getting data from Ensemble using Biomart server. +""" + + +import requests +from future.utils import native_str +from xml.etree import ElementTree +import pandas as pd +from io import StringIO +from xml.etree.ElementTree import fromstring as xml_from_string +from altanalyze3.utilities.helpers import ( + TimeIt +) + +DEFAULT_HOST = 'http://www.biomart.org' +DEFAULT_PATH = '/biomart/martservice' +DEFAULT_PORT = 80 +DEFAULT_SCHEMA = 'default' + + +class ServerBase(object): + """Base class that handles requests to the biomart server. + Attributes: + host (str): Host to connect to for the biomart service. + path (str): Path to the biomart service on the host. + port (str): Port to connect to on the host. + url (str): Url used to connect to the biomart service. + use_cache (bool): Whether to cache requests to biomart. + """ + + def __init__(self, host=None, path=None, port=None): + """ServerBase constructor. + Args: + host (str): Url of host to connect to. + path (str): Path on the host to access to the biomart service. + port (int): Port to use for the connection. + use_cache (bool): Whether to cache requests. + """ + # Use defaults if arg is None. + host = DEFAULT_HOST if host is None else host + path = DEFAULT_PATH if path is None else path + port = DEFAULT_PORT if port is None else port + + # Add http prefix and remove trailing slash. + host = self._add_http_prefix(host) + host = self._remove_trailing_slash(host) + + # Ensure path starts with slash. 
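+        # (the url property joins host, port and path directly, so a missing
+        # leading slash would otherwise produce a malformed service url)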
+ if not path.startswith('/'): + path = '/' + path + + self._host = host + self._path = path + self._port = port + + @property + def host(self): + """Host to connect to for the biomart service.""" + return self._host + + @property + def path(self): + """Path to the biomart service on the host.""" + return self._path + + @property + def port(self): + """Port to connect to on the host.""" + return self._port + + @property + def url(self): + """Url used to connect to the biomart service.""" + return '{}:{}{}'.format(self._host, self._port, self._path) + + @staticmethod + def _add_http_prefix(url, prefix='http://'): + if not url.startswith('http://') or url.startswith('https://'): + url = prefix + url + return url + + @staticmethod + def _remove_trailing_slash(url): + url.strip("/") + return url + + def get(self, **params): + """Performs get request to the biomart service. + Args: + **params (dict of str: any): Arbitrary keyword arguments, which + are added as parameters to the get request to biomart. + Returns: + requests.models.Response: Response from biomart for the request. + """ + + r = requests.get(self.url, params=params) + r.raise_for_status() + return r + + +class BiomartException(Exception): + """Basic exception class for biomart exceptions.""" + pass + + +class Dataset(ServerBase): + """Class representing a biomart dataset. + This class is responsible for handling queries to biomart + datasets. Queries can select a subset of attributes and can be filtered + using any available filters. A list of valid attributes is available in + the attributes property. If no attributes are given, a set of default + attributes is used. A list of valid filters is available in the filters + property. The type of value that can be specified for a given filter + depends on the filter as some filters accept single values, whilst others + can take lists of values. + Args: + name (str): Id of the dataset. + display_name (str): Display name of the dataset. + host (str): Url of host to connect to. + path (str): Path on the host to access to the biomart service. + port (int): Port to use for the connection. + use_cache (bool): Whether to cache requests. + virtual_schema (str): The virtual schema of the dataset. 
+ Examples: + Directly connecting to a dataset: + >>> dataset = Dataset(name='hsapiens_gene_ensembl', + >>> host='http://www.ensembl.org') + Querying the dataset: + >>> dataset.query(attributes=['ensembl_gene_id', + >>> 'external_gene_name'], + >>> filters={'chromosome_name': ['1','2']}) + Listing available attributes: + >>> dataset.attributes + >>> dataset.list_attributes() + Listing available filters: + >>> dataset.filters + >>> dataset.list_filters() + """ + + def __init__(self, + name, + display_name='', + host=None, + path=None, + port=None, + virtual_schema=DEFAULT_SCHEMA): + super().__init__(host=host, path=path, port=port) + + self._name = name + self._display_name = display_name + self._virtual_schema = virtual_schema + self._filters = None + self._attributes = None + self._default_attributes = None + self._datatype = None + + @property + def name(self): + """Name of the dataset (used as dataset id).""" + return self._name + + @property + def display_name(self): + """Display name of the dataset.""" + return self._display_name + + @property + def filters(self): + """List of filters available for the dataset.""" + if self._filters is None: + self._filters, self._attributes = self._fetch_configuration() + return self._filters + + @property + def attributes(self): + """List of attributes available for the dataset (cached).""" + if self._attributes is None: + self._filters, self._attributes = self._fetch_configuration() + return self._attributes + + @property + def default_attributes(self): + """List of default attributes for the dataset.""" + if self._default_attributes is None: + self._default_attributes = { + name: attr + for name, attr in self.attributes.items() + if attr.default is True + } + return self._default_attributes + + def list_attributes(self): + """Lists available attributes in a readable DataFrame format. + Returns: + pd.DataFrame: Frame listing available attributes. + """ + + def _row_gen(attributes): + for attr in attributes.values(): + yield (attr.name, attr.display_name, attr.description) + + return pd.DataFrame.from_records( + _row_gen(self.attributes), + columns=['name', 'display_name', 'description']) + + def list_filters(self): + """Lists available filters in a readable DataFrame format. + Returns: + pd.DataFrame: Frame listing available filters. + """ + + def _row_gen(attributes): + for attr in attributes.values(): + yield (attr.name, attr.type, attr.description) + + return pd.DataFrame.from_records( + _row_gen(self.filters), columns=['name', 'type', 'description']) + + def _fetch_configuration(self): + # Get datasets using biomart. + response = self.get(type='configuration', dataset=self._name) + + # Check response for problems. + if 'Problem retrieving configuration' in response.text: + raise BiomartException('Failed to retrieve dataset configuration, ' + 'check the dataset name and schema.') + + # Get filters and attributes from xml. 
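+        # (filters come from FilterDescription nodes and attributes from
+        # AttributeDescription nodes, via the two static helpers below)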
+ xml = ElementTree.fromstring(response.content) + + filters = {f.name: f for f in self._filters_from_xml(xml)} + attributes = {a.name: a for a in self._attributes_from_xml(xml)} + + return filters, attributes + + @staticmethod + def _filters_from_xml(xml): + for node in xml.iter('FilterDescription'): + attrib = node.attrib + yield Filter( + name=attrib['internalName'], type=attrib.get('type', '')) + + @staticmethod + def _attributes_from_xml(xml): + for page_index, page in enumerate(xml.iter('AttributePage')): + for desc in page.iter('AttributeDescription'): + attrib = desc.attrib + + # Default attributes can only be from the first page. + default = (page_index == 0 and + attrib.get('default', '') == 'true') + + yield Attribute( + name=attrib['internalName'], + display_name=attrib.get('displayName', ''), + description=attrib.get('description', ''), + default=default) + + def query(self, + attributes=None, + filters=None, + only_unique=True, + use_attr_names=False, + dtypes=None, + datatype=None + ): + """Queries the dataset to retrieve the contained data. + Args: + attributes (list[str]): Names of attributes to fetch in query. + Attribute names must correspond to valid attributes. See + the attributes property for a list of valid attributes. + filters (dict[str,any]): Dictionary of filters --> values + to filter the dataset by. Filter names and values must + correspond to valid filters and filter values. See the + filters property for a list of valid filters. + only_unique (bool): Whether to return only rows containing + unique values (True) or to include duplicate rows (False). + use_attr_names (bool): Whether to use the attribute names + as column names in the result (True) or the attribute + display names (False). + dtypes (dict[str,any]): Dictionary of attributes --> data types + to describe to pandas how the columns should be handled + Returns: + pandas.DataFrame: DataFrame containing the query results. + """ + + # Example query from Ensembl biomart: + # + # + # + # + # + # + # + # + # + # + # + # + + # Setup query element. + root = ElementTree.Element('Query') + root.set('virtualSchemaName', self._virtual_schema) + root.set('formatter', 'TSV') + root.set('header', '1') + root.set('uniqueRows', native_str(int(only_unique))) + root.set('datasetConfigVersion', '0.6') + + # Add dataset element. + dataset = ElementTree.SubElement(root, 'Dataset') + dataset.set('name', self.name) + dataset.set('interface', 'default') + + # Default to default attributes if none requested. + if attributes is None: + attributes = list(self.default_attributes.keys()) + + # Add attribute elements. + for name in attributes: + try: + attr = self.attributes[name] + self._add_attr_node(dataset, attr) + except KeyError: + raise BiomartException( + 'Unknown attribute {}, check dataset attributes ' + 'for a list of valid attributes.'.format(name)) + + if filters is not None: + # Add filter elements. + for name, value in filters.items(): + try: + filter_ = self.filters[name] + self._add_filter_node(dataset, filter_, value) + except KeyError: + raise BiomartException( + 'Unknown filter {}, check dataset filters ' + 'for a list of valid filters.'.format(name)) + + # Fetch response. + response = self.get(query=ElementTree.tostring(root)) + + # Raise exception if an error occurred. + if 'Query ERROR' in response.text: + raise BiomartException(response.text) + + # Parse results into a DataFrame. 
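+        # (biomart answers with tab-separated text, so the response body is
+        # wrapped in StringIO and handed to pandas.read_csv with sep='\t')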
+ try: + result = pd.read_csv(StringIO(response.text), + sep='\t', dtype=dtypes) + # calculate the aa_nt_start and end positions + + if (datatype == "protein_coordinates"): + result.to_csv( + 'Hs_ProteinCoordinates_build_100_38.csv', sep='\t') + elif(datatype == "protein_feature"): + result.to_csv( + 'Hs_ProteinFeatures_build_100_38.csv', sep='\t') + # Type error is raised of a data type is not understood by pandas + except TypeError as err: + raise ValueError("Non valid data type is used in dtypes") + + if use_attr_names: + # Rename columns with attribute names instead of display names. + column_map = { + self.attributes[attr].display_name: attr + for attr in attributes + } + result.rename(columns=column_map, inplace=True) + + return result + + # on loop for each exon in one transcript + # by default initialize the first aa start, aa_nt_start = 1 + def calculate_aa_positions(enst_id_new, enst_id_old, cds_start, cds_stop): + # check if new transcript + aa_stop = math.ceil((cds_stop - cds_start + 1) / 3) + if enst_id_new != enst_id_old: + aa_start = 1 + # check if the last codon has less than three neucleotides + elif (cds_stop - cds_start + 1) % 3 != 0: + aa_start = aa_stop + return aa_stop, aa_start + + @staticmethod + def _add_attr_node(root, attr): + attr_el = ElementTree.SubElement(root, 'Attribute') + attr_el.set('name', attr.name) + + @staticmethod + def _add_filter_node(root, filter_, value): + """Adds filter xml node to root.""" + filter_el = ElementTree.SubElement(root, 'Filter') + filter_el.set('name', filter_.name) + + # Set filter value depending on type. + if filter_.type == 'boolean': + # Boolean case. + if value is True or value.lower() in {'included', 'only'}: + filter_el.set('excluded', '0') + elif value is False or value.lower() == 'excluded': + filter_el.set('excluded', '1') + else: + raise ValueError('Invalid value for boolean filter ({})' + .format(value)) + elif isinstance(value, list) or isinstance(value, tuple): + # List case. + filter_el.set('value', ','.join(map(str, value))) + else: + # Default case. + filter_el.set('value', str(value)) + + def __repr__(self): + return ('' + .format(self._name, self._display_name)) + + +class Attribute(object): + """Biomart dataset attribute. + Attributes: + name (str): Attribute name. + display_name (str): Attribute display name. + description (str): Attribute description. + """ + + def __init__(self, name, display_name='', description='', default=False): + """Attribute constructor. + Args: + name (str): Attribute name. + display_name (str): Attribute display name. + description (str): Attribute description. + default (bool): Whether the attribute is a default + attribute of the corresponding datasets. + """ + self._name = name + self._display_name = display_name + self._description = description + self._default = default + + @property + def name(self): + """Name of the attribute.""" + return self._name + + @property + def display_name(self): + """Display name of the attribute.""" + return self._display_name + + @property + def description(self): + """Description of the attribute.""" + return self._description + + @property + def default(self): + """Whether this is a default attribute.""" + return self._default + + def __repr__(self): + return (('') + .format(self._name, self._display_name, self._description)) + + +class Filter(object): + """Biomart dataset filter. + Attributes: + name (str): Filter name. + type (str): Type of the filter (boolean, int, etc.). + description (str): Filter description. 
+ """ + + def __init__(self, name, type, description=''): + """ Filter constructor. + Args: + name (str): Filter name. + type (str): Type of the filter (boolean, int, etc.). + description (str): Filter description. + """ + self._name = name + self._type = type + self._description = description + + @property + def name(self): + """Filter name.""" + return self._name + + @property + def type(self): + """Filter type.""" + return self._type + + @property + def description(self): + """Filter description.""" + return self._description + + def __repr__(self): + return ('' + .format(self.name, self.type)) + + +def protein_coordinates(args): + with TimeIt(): + dataset = Dataset(name={args.name}, host={args.host}) + logging.info( + f"""Getting Data from {args.host} for given species {args.name}""") + dataset.query(attributes=args.attributes) From 66a5c4747ae2bde0ad1e140cfc719b78c2bfa6dc Mon Sep 17 00:00:00 2001 From: Preeti Date: Thu, 2 Jun 2022 21:14:29 -0400 Subject: [PATCH 07/16] code review comments --- altanalyze3/components/annotation/main.py | 10 ++++++---- altanalyze3/utilities/parser.py | 6 +++--- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/altanalyze3/components/annotation/main.py b/altanalyze3/components/annotation/main.py index c9c944c..92805ec 100644 --- a/altanalyze3/components/annotation/main.py +++ b/altanalyze3/components/annotation/main.py @@ -350,6 +350,11 @@ def query(self, result = pd.read_csv(StringIO(response.text), sep='\t', dtype=dtypes) # calculate the aa_nt_start and end positions + cds_start = result["cdd_start"] + cds_end = result["cdd_end"] + enst_id_old = result["Exon stable ID"] + calculate_aa_positions( + enst_id_new, enst_id_old, cds_start, cds_stop) if (datatype == "protein_coordinates"): result.to_csv( @@ -368,16 +373,13 @@ def query(self, for attr in attributes } result.rename(columns=column_map, inplace=True) - return result # on loop for each exon in one transcript # by default initialize the first aa start, aa_nt_start = 1 - def calculate_aa_positions(enst_id_new, enst_id_old, cds_start, cds_stop): + def calculate_aa_positions(cds_start, cds_stop): # check if new transcript aa_stop = math.ceil((cds_stop - cds_start + 1) / 3) - if enst_id_new != enst_id_old: - aa_start = 1 # check if the last codon has less than three neucleotides elif (cds_stop - cds_start + 1) % 3 != 0: aa_start = aa_stop diff --git a/altanalyze3/utilities/parser.py b/altanalyze3/utilities/parser.py index 8c2dc5d..6bdc762 100644 --- a/altanalyze3/utilities/parser.py +++ b/altanalyze3/utilities/parser.py @@ -167,14 +167,14 @@ def get_parser(self): "--host", help="Select the host from where you want to import data", type=str, - default="http://www.ensembl.org" + default="https://www.ensembl.org" ) protein_coordinates_parser.add_argument( "--attributes", help="Export certain coordinates or features from Ensembl", - type=str, + nargs="*", default=["ensembl_transcript_id", "ensembl_exon_id", "ensembl_peptide_id", "start_position", - "end_position", "transcript_start", "transcript_end", "cdd", "cdd_start", "cdd_end"] + "end_position", "transcript_start", "transcript_end", "cds_start", "cds_end"] ) self.add_common_arguments(protein_coordinates_parser) From e8fd70e335b3dc5769f2030ba465728e6f1780ea Mon Sep 17 00:00:00 2001 From: Preeti Date: Mon, 6 Jun 2022 11:28:40 -0400 Subject: [PATCH 08/16] correct values of aa_start and aa_end positions being calculated --- altanalyze3/components/annotation/main.py | 62 ++++++++++++++--------- 1 file changed, 38 insertions(+), 24 deletions(-) 
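The rule this patch converges on is easy to state in isolation: with 1-based
CDS coordinates, nucleotide position p falls in codon ceil(p / 3). An exon
whose coding sequence ends mid-codon therefore shares that amino acid with the
next exon, which is why consecutive rows in the sample file of a later patch
can repeat a position (e.g. aa_stop 85 followed by aa_start 85). A minimal
self-contained sketch of the mapping (the helper name is illustrative, not
part of the module):

    import math

    def cds_to_aa(cds_pos):
        # 1-based CDS nucleotide position -> 1-based amino-acid position;
        # codons cover nucleotides 1-3, 4-6, ..., so ceil(pos / 3) is the
        # codon (amino-acid) index.
        return math.ceil(cds_pos / 3)

    # spot checks against the sample rows: CDS 1-105 -> aa 1-35,
    # CDS 106-254 -> aa 36-85, CDS 255-314 -> aa 85-105
    assert (cds_to_aa(1), cds_to_aa(105)) == (1, 35)
    assert (cds_to_aa(106), cds_to_aa(254)) == (36, 85)
    assert (cds_to_aa(255), cds_to_aa(314)) == (85, 105)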
diff --git a/altanalyze3/components/annotation/main.py b/altanalyze3/components/annotation/main.py index 92805ec..17604df 100644 --- a/altanalyze3/components/annotation/main.py +++ b/altanalyze3/components/annotation/main.py @@ -9,9 +9,10 @@ import pandas as pd from io import StringIO from xml.etree.ElementTree import fromstring as xml_from_string -from altanalyze3.utilities.helpers import ( - TimeIt -) +import math +# from altanalyze3.utilities.helpers import ( +# TimeIt +# ) DEFAULT_HOST = 'http://www.biomart.org' DEFAULT_PATH = '/biomart/martservice' @@ -257,6 +258,13 @@ def _attributes_from_xml(xml): description=attrib.get('description', ''), default=default) + # on loop for each exon in one transcript + def calculate_aa_positions(self, cds_pos): + # check if new transcript + + aa_position = math.ceil((cds_pos) / 3) + return aa_position + def query(self, attributes=None, filters=None, @@ -349,12 +357,20 @@ def query(self, try: result = pd.read_csv(StringIO(response.text), sep='\t', dtype=dtypes) + # calculate the aa_nt_start and end positions - cds_start = result["cdd_start"] - cds_end = result["cdd_end"] - enst_id_old = result["Exon stable ID"] - calculate_aa_positions( - enst_id_new, enst_id_old, cds_start, cds_stop) + + result = result.dropna(subset=['CDS start']) + result = result.dropna(subset=['CDS end']) + cds_start = result['CDS start'].astype(int) + cds_stop = result['CDS end'].astype(int) + result["aa_start"] = cds_start.apply( + lambda x: math.ceil((x) / 3)) + result["aa_stop"] = cds_stop.apply( + lambda x: math.ceil((x) / 3)) + # aa_start = self.calculate_aa_positions(cds_start).astype(float) + # aa_stop = self.calculate_aa_positions(cds_stop) + # aa_start = result["aa_start"] if (datatype == "protein_coordinates"): result.to_csv( @@ -375,16 +391,6 @@ def query(self, result.rename(columns=column_map, inplace=True) return result - # on loop for each exon in one transcript - # by default initialize the first aa start, aa_nt_start = 1 - def calculate_aa_positions(cds_start, cds_stop): - # check if new transcript - aa_stop = math.ceil((cds_stop - cds_start + 1) / 3) - # check if the last codon has less than three neucleotides - elif (cds_stop - cds_start + 1) % 3 != 0: - aa_start = aa_stop - return aa_stop, aa_start - @staticmethod def _add_attr_node(root, attr): attr_el = ElementTree.SubElement(root, 'Attribute') @@ -505,9 +511,17 @@ def __repr__(self): .format(self.name, self.type)) -def protein_coordinates(args): - with TimeIt(): - dataset = Dataset(name={args.name}, host={args.host}) - logging.info( - f"""Getting Data from {args.host} for given species {args.name}""") - dataset.query(attributes=args.attributes) +# def protein_coordinates(args): +# with TimeIt(): +# dataset = Dataset(name={args.name}, host={args.host}) +# logging.info( +# f"""Getting Data from {args.host} for given species {args.name}""") +# dataset.query(attributes=args.attributes) + + +dataset = Dataset(name='apolyacanthus_gene_ensembl', + host='http://www.ensembl.org') + +# Protein Coordinates +dataset.query(attributes=["ensembl_transcript_id", "ensembl_exon_id", "start_position", + "end_position", "transcript_start", "transcript_end", "cds_start", "cds_end"], datatype='protein_coordinates') From a7ee6c6ae124f99152b77e168d4042f79ba0f324 Mon Sep 17 00:00:00 2001 From: Preeti Date: Mon, 6 Jun 2022 11:30:15 -0400 Subject: [PATCH 09/16] sample file updated --- docs/Hs_ProteinCoordinates_build_100_38.csv | 30 ++++++++++++--------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git 
a/docs/Hs_ProteinCoordinates_build_100_38.csv b/docs/Hs_ProteinCoordinates_build_100_38.csv index e659e5d..3516b2c 100644 --- a/docs/Hs_ProteinCoordinates_build_100_38.csv +++ b/docs/Hs_ProteinCoordinates_build_100_38.csv @@ -1,12 +1,18 @@ -,Exon stable ID,Gene start (bp),Gene end (bp),Gene name,Protein stable ID,Transcript start (bp),Transcript end (bp),CDD start,CDD end -0,ENSAPOE00000120411,288439,298458,zgc:63972,ENSAPOP00000020934,288439,298458,, -1,ENSAPOE00000120412,288439,298458,zgc:63972,ENSAPOP00000020934,288439,298458,, -2,ENSAPOE00000120413,288439,298458,zgc:63972,ENSAPOP00000020934,288439,298458,, -3,ENSAPOE00000120414,288439,298458,zgc:63972,ENSAPOP00000020934,288439,298458,, -4,ENSAPOE00000120415,288439,298458,zgc:63972,ENSAPOP00000020934,288439,298458,, -5,ENSAPOE00000120416,288439,298458,zgc:63972,ENSAPOP00000020934,288439,298458,, -6,ENSAPOE00000000170,290637,294901,,ENSAPOP00000020929,290637,294901,, -7,ENSAPOE00000000171,290637,294901,,ENSAPOP00000020929,290637,294901,, -8,ENSAPOE00000000174,290637,294901,,ENSAPOP00000020929,290637,294901,, -9,ENSAPOE00000000177,290637,294901,,ENSAPOP00000020929,290637,294901,, -10,ENSAPOE00000000280,290637,294901,,ENSAPOP00000020929,290637,294901,, \ No newline at end of file +Transcript stable ID Exon stable ID Gene start (bp) Gene end (bp) Transcript start (bp) Transcript end (bp) CDS start CDS end aa_start aa_stop +0 ENSAPOT00000017612 ENSAPOE00000120411 288439 298458 288439 298458 1.0 105.0 1 35 +1 ENSAPOT00000017612 ENSAPOE00000120412 288439 298458 288439 298458 106.0 254.0 36 85 +2 ENSAPOT00000017612 ENSAPOE00000120413 288439 298458 288439 298458 255.0 314.0 85 105 +3 ENSAPOT00000017612 ENSAPOE00000120414 288439 298458 288439 298458 315.0 360.0 105 120 +4 ENSAPOT00000017612 ENSAPOE00000120415 288439 298458 288439 298458 361.0 410.0 121 137 +5 ENSAPOT00000017612 ENSAPOE00000120416 288439 298458 288439 298458 411.0 513.0 137 171 +6 ENSAPOT00000017559 ENSAPOE00000000170 290637 294901 290637 294901 1.0 47.0 1 16 +7 ENSAPOT00000017559 ENSAPOE00000000171 290637 294901 290637 294901 48.0 103.0 16 35 +8 ENSAPOT00000017559 ENSAPOE00000000174 290637 294901 290637 294901 104.0 165.0 35 55 +9 ENSAPOT00000017559 ENSAPOE00000000177 290637 294901 290637 294901 166.0 303.0 56 101 +10 ENSAPOT00000017559 ENSAPOE00000000280 290637 294901 290637 294901 304.0 380.0 102 127 +11 ENSAPOT00000017559 ENSAPOE00000120530 290637 294901 290637 294901 381.0 396.0 127 132 +12 ENSAPOT00000017555 ENSAPOE00000000178 310862 317808 310862 317808 499.0 646.0 167 216 +13 ENSAPOT00000017555 ENSAPOE00000000181 310862 317808 310862 317808 647.0 819.0 216 273 +14 ENSAPOT00000017555 ENSAPOE00000000198 310862 317808 310862 317808 161.0 347.0 54 116 +15 ENSAPOT00000017555 ENSAPOE00000000201 310862 317808 310862 317808 348.0 498.0 116 166 +16 ENSAPOT00000017555 ENSAPOE00000000406 310862 317808 310862 317808 958.0 1085.0 320 362 \ No newline at end of file From abbe0fcd07bb5c8b39c6efb8c48a6f76695a8656 Mon Sep 17 00:00:00 2001 From: Preeti Date: Mon, 6 Jun 2022 11:36:26 -0400 Subject: [PATCH 10/16] add commas --- docs/Hs_ProteinCoordinates_build_100_38.csv | 36 ++++++++++----------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/docs/Hs_ProteinCoordinates_build_100_38.csv b/docs/Hs_ProteinCoordinates_build_100_38.csv index 3516b2c..efbb571 100644 --- a/docs/Hs_ProteinCoordinates_build_100_38.csv +++ b/docs/Hs_ProteinCoordinates_build_100_38.csv @@ -1,18 +1,18 @@ -Transcript stable ID Exon stable ID Gene start (bp) Gene end (bp) Transcript start (bp) Transcript 
end (bp) CDS start CDS end aa_start aa_stop -0 ENSAPOT00000017612 ENSAPOE00000120411 288439 298458 288439 298458 1.0 105.0 1 35 -1 ENSAPOT00000017612 ENSAPOE00000120412 288439 298458 288439 298458 106.0 254.0 36 85 -2 ENSAPOT00000017612 ENSAPOE00000120413 288439 298458 288439 298458 255.0 314.0 85 105 -3 ENSAPOT00000017612 ENSAPOE00000120414 288439 298458 288439 298458 315.0 360.0 105 120 -4 ENSAPOT00000017612 ENSAPOE00000120415 288439 298458 288439 298458 361.0 410.0 121 137 -5 ENSAPOT00000017612 ENSAPOE00000120416 288439 298458 288439 298458 411.0 513.0 137 171 -6 ENSAPOT00000017559 ENSAPOE00000000170 290637 294901 290637 294901 1.0 47.0 1 16 -7 ENSAPOT00000017559 ENSAPOE00000000171 290637 294901 290637 294901 48.0 103.0 16 35 -8 ENSAPOT00000017559 ENSAPOE00000000174 290637 294901 290637 294901 104.0 165.0 35 55 -9 ENSAPOT00000017559 ENSAPOE00000000177 290637 294901 290637 294901 166.0 303.0 56 101 -10 ENSAPOT00000017559 ENSAPOE00000000280 290637 294901 290637 294901 304.0 380.0 102 127 -11 ENSAPOT00000017559 ENSAPOE00000120530 290637 294901 290637 294901 381.0 396.0 127 132 -12 ENSAPOT00000017555 ENSAPOE00000000178 310862 317808 310862 317808 499.0 646.0 167 216 -13 ENSAPOT00000017555 ENSAPOE00000000181 310862 317808 310862 317808 647.0 819.0 216 273 -14 ENSAPOT00000017555 ENSAPOE00000000198 310862 317808 310862 317808 161.0 347.0 54 116 -15 ENSAPOT00000017555 ENSAPOE00000000201 310862 317808 310862 317808 348.0 498.0 116 166 -16 ENSAPOT00000017555 ENSAPOE00000000406 310862 317808 310862 317808 958.0 1085.0 320 362 \ No newline at end of file +,Transcript stable ID,Exon stable ID,Gene start (bp),Gene end (bp),Transcript start (bp),Transcript end (bp),CDS start,CDS end,aa_start,aa_stop +0,ENSAPOT00000017612,ENSAPOE00000120411,288439,298458,288439,298458,1.0,105.0,1,35 +1,ENSAPOT00000017612,ENSAPOE00000120412,288439,298458,288439,298458,106.0,254.0,36,85 +2,ENSAPOT00000017612,ENSAPOE00000120413,288439,298458,288439,298458,255.0,314.0,85,105 +3,ENSAPOT00000017612,ENSAPOE00000120414,288439,298458,288439,298458,315.0,360.0,105,120 +4,ENSAPOT00000017612,ENSAPOE00000120415,288439,298458,288439,298458,361.0,410.0,121,137 +5,ENSAPOT00000017612,ENSAPOE00000120416,288439,298458,288439,298458,411.0,513.0,137,171 +6,ENSAPOT00000017559,ENSAPOE00000000170,290637,294901,290637,294901,1.0,47.0,1,16 +7,ENSAPOT00000017559,ENSAPOE00000000171,290637,294901,290637,294901,48.0,103.0,16,35 +8,ENSAPOT00000017559,ENSAPOE00000000174,290637,294901,290637,294901,104.0,165.0,35,55 +9,ENSAPOT00000017559,ENSAPOE00000000177,290637,294901,290637,294901,166.0,303.0,56,101 +10,ENSAPOT00000017559,ENSAPOE00000000280,290637,294901,290637,294901,304.0,380.0,102,127 +11,ENSAPOT00000017559,ENSAPOE00000120530,290637,294901,290637,294901,381.0,396.0,127,132 +12,ENSAPOT00000017555,ENSAPOE00000000178,310862,317808,310862,317808,499.0,646.0,167,216 +13,ENSAPOT00000017555,ENSAPOE00000000181,310862,317808,310862,317808,647.0,819.0,216,273 +14,ENSAPOT00000017555,ENSAPOE00000000198,310862,317808,310862,317808,161.0,347.0,54,116 +15,ENSAPOT00000017555,ENSAPOE00000000201,310862,317808,310862,317808,348.0,498.0,116,166 +16,ENSAPOT00000017555,ENSAPOE00000000406,310862,317808,310862,317808,958.0,1085.0,320,362 \ No newline at end of file From 65e5d5cd1c37ca8b2ed1f013ea1dfc39cc838415 Mon Sep 17 00:00:00 2001 From: Preeti Date: Mon, 6 Jun 2022 13:02:36 -0400 Subject: [PATCH 11/16] add subparser for protein coordinates and features --- altanalyze3/components/annotation/main.py | 31 ++++++++--------------- 1 file changed, 10 insertions(+), 21 
deletions(-)

diff --git a/altanalyze3/components/annotation/main.py b/altanalyze3/components/annotation/main.py
index 17604df..9a9ab81 100644
--- a/altanalyze3/components/annotation/main.py
+++ b/altanalyze3/components/annotation/main.py
@@ -10,9 +10,10 @@
 from io import StringIO
 from xml.etree.ElementTree import fromstring as xml_from_string
 import math
-# from altanalyze3.utilities.helpers import (
-#     TimeIt
-# )
+import logging
+from altanalyze3.utilities.helpers import (
+    TimeIt
+)
 
 DEFAULT_HOST = 'http://www.biomart.org'
 DEFAULT_PATH = '/biomart/martservice'
@@ -368,13 +368,10 @@ def query(self,
                 lambda x: math.ceil((x) / 3))
             result["aa_stop"] = cds_stop.apply(
                 lambda x: math.ceil((x) / 3))
-            # aa_start = self.calculate_aa_positions(cds_start).astype(float)
-            # aa_stop = self.calculate_aa_positions(cds_stop)
-            # aa_start = result["aa_start"]
 
             if (datatype == "protein_coordinates"):
                 result.to_csv(
-                    'Hs_ProteinCoordinates_build_100_38.csv', sep='\t')
+                    'Hs_ProteinCoordinates_build_100_38.csv')
             elif(datatype == "protein_feature"):
                 result.to_csv(
                     'Hs_ProteinFeatures_build_100_38.csv', sep='\t')
@@ -511,17 +508,9 @@ def __repr__(self):
             .format(self.name, self.type))
 
 
-# def protein_coordinates(args):
-#     with TimeIt():
-#         dataset = Dataset(name={args.name}, host={args.host})
-#         logging.info(
-#             f"""Getting Data from {args.host} for given species {args.name}""")
-#         dataset.query(attributes=args.attributes)
-
-
-dataset = Dataset(name='apolyacanthus_gene_ensembl',
-                  host='http://www.ensembl.org')
-
-# Protein Coordinates
-dataset.query(attributes=["ensembl_transcript_id", "ensembl_exon_id", "start_position",
-                          "end_position", "transcript_start", "transcript_end", "cds_start", "cds_end"], datatype='protein_coordinates')
+def protein_coordinates(args):
+    with TimeIt():
+        dataset = Dataset(name=args.name, host=args.host)
+        logging.info(
+            f"""Getting Data from {args.host} for given species {args.name}""")
+        dataset.query(attributes=args.attributes)

From 618551b11a4a587bfe982fe77522c715f1ffb5f5 Mon Sep 17 00:00:00 2001
From: Preeti
Date: Wed, 8 Jun 2022 14:37:47 -0400
Subject: [PATCH 12/16] rename

---
 altanalyze3/utilities/parser.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/altanalyze3/utilities/parser.py b/altanalyze3/utilities/parser.py
index 6bdc762..2f92501 100644
--- a/altanalyze3/utilities/parser.py
+++ b/altanalyze3/utilities/parser.py
@@ -156,7 +156,7 @@ def get_parser(self):
             parents=[parent_parser],
             help="Get Protein to Domain annotations"
         )
-        protein_coordinates_parser.set_defaults(func=get_protein_coordinates)
+        protein_coordinates_parser.set_defaults(func=protein_coordinates)
         protein_coordinates_parser.add_argument(
             "--name",
             help="name of species eg. apolyacanthus_gene_ensembl",

From 053534747ea97a7bf9cc6f1b565e64098f18583f Mon Sep 17 00:00:00 2001
From: Preeti
Date: Mon, 13 Jun 2022 14:46:34 -0400
Subject: [PATCH 13/16] add lambda functions to get aa start and stop columns

---
 altanalyze3/components/annotation/main.py | 18 +++++-------------
 1 file changed, 5 insertions(+), 13 deletions(-)

diff --git a/altanalyze3/components/annotation/main.py b/altanalyze3/components/annotation/main.py
index 9a9ab81..ee5d075 100644
--- a/altanalyze3/components/annotation/main.py
+++ b/altanalyze3/components/annotation/main.py
@@ -357,24 +357,16 @@ def query(self,
         try:
             result = pd.read_csv(StringIO(response.text),
                                  sep='\t', dtype=dtypes)
-
-            # calculate the aa_nt_start and end positions
-
             result = result.dropna(subset=['CDS start'])
             result = result.dropna(subset=['CDS end'])
             cds_start = result['CDS start'].astype(int)
             cds_stop = result['CDS end'].astype(int)
-            result["aa_start"] = cds_start.apply(
-                lambda x: math.ceil((x) / 3))
-            result["aa_stop"] = cds_stop.apply(
-                lambda x: math.ceil((x) / 3))
-
-            if (datatype == "protein_coordinates"):
-                result.to_csv(
-                    'Hs_ProteinCoordinates_build_100_38.csv')
-            elif(datatype == "protein_feature"):
-                result.to_csv(
-                    'Hs_ProteinFeatures_build_100_38.csv', sep='\t')
+            result["aa_start"] = cds_start.apply(lambda x: math.ceil((x) / 3))
+            result["aa_stop"] = cds_stop.apply(lambda x: math.ceil((x) / 3))
+
+            result.to_csv('Hs_ProteinCoordinates_build_100_38.csv', sep='\t')
+            result.to_csv('Hs_ProteinFeatures_build_100_38.csv', sep='\t')
 
             # Type error is raised of a data type is not understood by pandas
         except TypeError as err:
             raise ValueError("Non valid data type is used in dtypes")

From 414bf7a61d038c145473a12c107833beb46c33d7 Mon Sep 17 00:00:00 2001
From: Preeti
Date: Thu, 16 Jun 2022 11:38:33 -0400
Subject: [PATCH 14/16] revised changes - args

---
 altanalyze3/bin/altanalyze3               |  2 +-
 altanalyze3/components/annotation/main.py | 30 ++++++----------
 2 files changed, 9 insertions(+), 23 deletions(-)

diff --git a/altanalyze3/bin/altanalyze3 b/altanalyze3/bin/altanalyze3
index c054719..97d762b 100644
--- a/altanalyze3/bin/altanalyze3
+++ b/altanalyze3/bin/altanalyze3
@@ -12,4 +12,4 @@ def main(args=None):
 
 
 if __name__ == "__main__":
-    sys.exit(main(sys.argv[1:]))
\ No newline at end of file
+    sys.exit(main(sys.argv[1:]))

diff --git a/altanalyze3/components/annotation/main.py b/altanalyze3/components/annotation/main.py
index ee5d075..e25f61c 100644
--- a/altanalyze3/components/annotation/main.py
+++ b/altanalyze3/components/annotation/main.py
@@ -2,7 +2,6 @@
 This is a generalized python module for getting data from Ensemble using Biomart server.
 """
 
-
 import requests
 from future.utils import native_str
 from xml.etree import ElementTree
@@ -27,7 +26,6 @@ class ServerBase(object):
         path (str): Path to the biomart service on the host.
         port (str): Port to connect to on the host.
         url (str): Url used to connect to the biomart service.
-        use_cache (bool): Whether to cache requests to biomart.
     """
 
     def __init__(self, host=None, path=None, port=None):
@@ -121,22 +119,7 @@ class Dataset(ServerBase):
         host (str): Url of host to connect to.
         path (str): Path on the host to access to the biomart service.
         port (int): Port to use for the connection.
-        use_cache (bool): Whether to cache requests.
         virtual_schema (str): The virtual schema of the dataset.
-    Examples:
-        Directly connecting to a dataset:
-            >>> dataset = Dataset(name='hsapiens_gene_ensembl',
-            >>>                   host='http://www.ensembl.org')
-        Querying the dataset:
-            >>> dataset.query(attributes=['ensembl_gene_id',
-            >>>                           'external_gene_name'],
-            >>>               filters={'chromosome_name': ['1','2']})
-        Listing available attributes:
-            >>> dataset.attributes
-            >>> dataset.list_attributes()
-        Listing available filters:
-            >>> dataset.filters
-            >>> dataset.list_filters()
     """
 
     def __init__(self,
                  name,
                  display_name='',
                  host=None,
                  path=None,
                  port=None,
-                 virtual_schema=DEFAULT_SCHEMA):
+                 virtual_schema=DEFAULT_SCHEMA, location=None):
         super().__init__(host=host, path=path, port=port)
 
         self._name = name
         self._display_name = display_name
         self._virtual_schema = virtual_schema
         self._filters = None
         self._attributes = None
         self._default_attributes = None
         self._datatype = None
+        self.location = location
 
     @property
     def name(self):
@@ -271,7 +255,7 @@ def query(self,
                 only_unique=True,
                 use_attr_names=False,
                 dtypes=None,
-                datatype=None
+                datatype=None,
                 ):
         """Queries the dataset to retrieve the contained data.
         Args:
@@ -321,6 +305,8 @@
         dataset.set('name', self.name)
         dataset.set('interface', 'default')
 
+        csv_location = self.location.with_suffix(".csv")
+        logging.info(f"""Save protein coordinates reads to {csv_location}""")
         # Default to default attributes if none requested.
         if attributes is None:
             attributes = list(self.default_attributes.keys())
@@ -364,10 +350,10 @@ def query(self,
             cds_stop = result['CDS end'].astype(int)
             result["aa_start"] = cds_start.apply(lambda x: math.ceil((x) / 3))
             result["aa_stop"] = cds_stop.apply(lambda x: math.ceil((x) / 3))
-            result.to_csv('Hs_ProteinCoordinates_build_100_38.csv', sep='\t')
-            result.to_csv('Hs_ProteinFeatures_build_100_38.csv', sep='\t')
+            with csv_location.open("w") as out_handler:
+                out_handler.write(result.to_csv(sep='\t'))
 
-            # Type error is raised of a data type is not understood by pandas
+            # Type error is raised if a data type is not understood by Pandas
         except TypeError as err:
             raise ValueError("Non valid data type is used in dtypes")

From ec88fd170abd6ec111f93adad18c0668aa426926 Mon Sep 17 00:00:00 2001
From: Preeti Singh
Date: Fri, 28 Oct 2022 06:45:44 -0400
Subject: [PATCH 15/16] remove dead code

---
 altanalyze3/utilities/parser.py | 59 +++++++++++++--------------------
 1 file changed, 23 insertions(+), 36 deletions(-)

diff --git a/altanalyze3/utilities/parser.py b/altanalyze3/utilities/parser.py
index 6bcc27d..1e555b8 100644
--- a/altanalyze3/utilities/parser.py
+++ b/altanalyze3/utilities/parser.py
@@ -4,11 +4,8 @@ import argparse
 from altanalyze3.utilities.helpers import get_version
 from altanalyze3.components.intron_count.main import count_introns
-<<<<<<< HEAD
-from altanalyze3.components.annotation.main import protein_coordinates
-=======
 from altanalyze3.components.junction_count.main import count_junctions
->>>>>>> master
+# from altanalyze3.components.annotation.main import protein_coordinates
 from altanalyze3.utilities.io import get_all_bam_chr
 from altanalyze3.utilities.constants import (
     IntRetCat,
@@ -31,19 +28,11 @@ def set_args_as_attributes(self):
 
     def add_common_arguments(self, parser):
         self.common_arguments = [
-<<<<<<< HEAD
-            ("--loglevel", "Logging level. Default: info", str,
-             "info", ["fatal", "error", "warning", "info", "debug"]),
-            ("--threads", "Number of threads to run in parallel where applicable", int, 1, None),
-            ("--cpus", "Number of processes to run in parallel where applicable", int, 1, None),
-            ("--output", "Output prefix", str, "results", None)
-=======
             ("--loglevel", "Logging level. Default: info", str, "info", ["fatal", "error", "warning", "info", "debug"]),
             ("--threads", "Number of threads to run in parallel where applicable. Default: 1", int, 1, None),
             ("--cpus", "Number of processes to run in parallel where applicable. Default: 1", int, 1, None),
             ("--tmp", "Temporary files location. Default: tmp", str, "tmp", None),
             ("--output", "Output prefix. Default: results", str, "results", None)
->>>>>>> master
         ]
         for param in self.common_arguments:
             parser.add_argument(
@@ -162,7 +151,6 @@ def get_parser(self):
             help="Export processed reads into the BAM file. Default: False",
             action="store_true"
         )
-<<<<<<< HEAD
         self.add_common_arguments(intron_parser)
 
         # Protein Domain Annotation parser
         protein_coordinates_parser = subparsers.add_parser(
             "proteincoordinates",
             parents=[parent_parser],
             help="Get Protein to Domain annotations"
         )
-        protein_coordinates_parser.set_defaults(func=protein_coordinates)
-        protein_coordinates_parser.add_argument(
-            "--name",
-            help="name of species eg. apolyacanthus_gene_ensembl",
-            type=str,
-            required=True,
-        )
-        protein_coordinates_parser.add_argument(
-            "--host",
-            help="Select the host from where you want to import data",
-            type=str,
-            default="https://www.ensembl.org"
-        )
-        protein_coordinates_parser.add_argument(
-            "--attributes",
-            help="Export certain coordinates or features from Ensembl",
-            nargs="*",
-            default=["ensembl_transcript_id", "ensembl_exon_id", "ensembl_peptide_id", "start_position",
-                     "end_position", "transcript_start", "transcript_end", "cds_start", "cds_end"]
-        )
-        self.add_common_arguments(protein_coordinates_parser)
+        # TODO
+        # protein_coordinates_parser.set_defaults(func=protein_coordinates)
+        # protein_coordinates_parser.add_argument(
+        #     "--name",
+        #     help="name of species eg. apolyacanthus_gene_ensembl",
+        #     type=str,
+        #     required=True,
+        # )
+        # protein_coordinates_parser.add_argument(
+        #     "--host",
+        #     help="Select the host from where you want to import data",
+        #     type=str,
+        #     default="https://www.ensembl.org"
+        # )
+        # protein_coordinates_parser.add_argument(
+        #     "--attributes",
+        #     help="Export certain coordinates or features from Ensembl",
+        #     nargs="*",
+        #     default=["ensembl_transcript_id", "ensembl_exon_id", "ensembl_peptide_id", "start_position",
+        #              "end_position", "transcript_start", "transcript_end", "cds_start", "cds_end"]
+        # )
+        # self.add_common_arguments(protein_coordinates_parser)
 
-=======
         self.add_common_arguments(junction_parser)
->>>>>>> master
         return general_parser

From 733cf47c060e2e7d89a3d7a428730f3a2e9b1206 Mon Sep 17 00:00:00 2001
From: Preeti Singh
Date: Fri, 28 Oct 2022 07:07:18 -0400
Subject: [PATCH 16/16] update gitignore to have .vscode

---
 .gitignore | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index e3f4f6e..ddb8b28 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,5 @@
 **/.DS_Store
-
+.vscode/
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
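For reference, the host normalization that ServerBase performs across this
series, condensed into a standalone sketch (the function name is illustrative,
not part of the module). Two Python details matter here: str.startswith
accepts a tuple of prefixes, which covers both schemes in a single test, and
str.strip/str.rstrip return a new string, so the result must be assigned or
returned rather than called for its side effect.

    def normalize_host(url, prefix='http://'):
        # add a scheme if none is present, then drop any trailing slash
        if not url.startswith(('http://', 'https://')):
            url = prefix + url
        return url.rstrip('/')

    assert normalize_host('www.ensembl.org/') == 'http://www.ensembl.org'
    assert normalize_host('https://www.ensembl.org') == 'https://www.ensembl.org'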