From 8f301498d6c3e75e1b4538ce7129ff1346034c8b Mon Sep 17 00:00:00 2001
From: Preeti
Date: Wed, 1 Jun 2022 16:04:10 -0400
Subject: [PATCH 01/16] add amino acid nucleotide position calculation

---
 .../components/annotation/ensembl_biomart.py     | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/altanalyze3/components/annotation/ensembl_biomart.py b/altanalyze3/components/annotation/ensembl_biomart.py
index 3d36cbc..54a1cb5 100644
--- a/altanalyze3/components/annotation/ensembl_biomart.py
+++ b/altanalyze3/components/annotation/ensembl_biomart.py
@@ -2,11 +2,9 @@
 This is a generalized python module for getting data from Ensemble using Biomart server.
 """
 
-from __future__ import absolute_import, division, print_function
-import requests
+import requests
 from future.utils import native_str
-from builtins import *
 from xml.etree import ElementTree
 import pandas as pd
 from io import StringIO
@@ -370,6 +368,18 @@ def query(self,
 
         return result
 
+    # on loop for each exon in one transcript
+    # by default initialize the first aa start, aa_nt_start = 1
+    def calculate_aa_positions(enst_id_new, enst_id_old, cds_start, cds_stop):
+        # check if new transcript
+        aa_stop = math.ceil((cds_stop - cds_start + 1) / 3)
+        if enst_id_new != enst_id_old:
+            aa_start = 1
+        # check if the last codon has less than three nucleotides
+        elif (cds_stop - cds_start + 1) % 3 != 0:
+            aa_start = aa_stop
+        return aa_stop, aa_start
+
     @staticmethod
     def _add_attr_node(root, attr):
         attr_el = ElementTree.SubElement(root, 'Attribute')

From e9df4df93c16899fc0a5fa9f3d5487f4e9103847 Mon Sep 17 00:00:00 2001
From: Preeti
Date: Wed, 1 Jun 2022 16:18:25 -0400
Subject: [PATCH 02/16] code review changes - use strip to remove trailing
 slash in url

---
 altanalyze3/components/annotation/ensembl_biomart.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/altanalyze3/components/annotation/ensembl_biomart.py b/altanalyze3/components/annotation/ensembl_biomart.py
index 54a1cb5..4eebf4e 100644
--- a/altanalyze3/components/annotation/ensembl_biomart.py
+++ b/altanalyze3/components/annotation/ensembl_biomart.py
@@ -80,8 +80,7 @@ def _add_http_prefix(url, prefix='http://'):
 
     @staticmethod
     def _remove_trailing_slash(url):
-        if url.endswith('/'):
-            url = url[:-1]
+        url = url.strip("/")
         return url
 
     def get(self, **params):

From 18f2e29de6af1151d4847f384bddd0c4737618ef Mon Sep 17 00:00:00 2001
From: Preeti
Date: Wed, 1 Jun 2022 18:01:39 -0400
Subject: [PATCH 03/16] check if the value of host is None

---
 altanalyze3/components/annotation/ensembl_biomart.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/altanalyze3/components/annotation/ensembl_biomart.py b/altanalyze3/components/annotation/ensembl_biomart.py
index 4eebf4e..ab00aa5 100644
--- a/altanalyze3/components/annotation/ensembl_biomart.py
+++ b/altanalyze3/components/annotation/ensembl_biomart.py
@@ -36,9 +36,9 @@ def __init__(self, host=None, path=None, port=None):
             use_cache (bool): Whether to cache requests.
         """
         # Use defaults if arg is None.
-        host = host or DEFAULT_HOST
-        path = path or DEFAULT_PATH
-        port = port or DEFAULT_PORT
+        host = DEFAULT_HOST if host is None else host
+        path = DEFAULT_PATH if path is None else path
+        port = DEFAULT_PORT if port is None else port
 
         # Add http prefix and remove trailing slash.
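        # (str.strip returns a new string rather than mutating in place,
        # hence the assignment inside _remove_trailing_slash)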
host = self._add_http_prefix(host) From a59eda97117fafe35acca613731759adcd0d4fe2 Mon Sep 17 00:00:00 2001 From: Preeti Date: Thu, 2 Jun 2022 10:55:17 -0400 Subject: [PATCH 04/16] add a function to parse args --- .../components/annotation/ensembl_biomart.py | 512 ------------------ 1 file changed, 512 deletions(-) delete mode 100644 altanalyze3/components/annotation/ensembl_biomart.py diff --git a/altanalyze3/components/annotation/ensembl_biomart.py b/altanalyze3/components/annotation/ensembl_biomart.py deleted file mode 100644 index ab00aa5..0000000 --- a/altanalyze3/components/annotation/ensembl_biomart.py +++ /dev/null @@ -1,512 +0,0 @@ -""" -This is a generalized python module for getting data from Ensemble using Biomart server. -""" - - -import requests -from future.utils import native_str -from xml.etree import ElementTree -import pandas as pd -from io import StringIO -from xml.etree.ElementTree import fromstring as xml_from_string - - -DEFAULT_HOST = 'http://www.biomart.org' -DEFAULT_PATH = '/biomart/martservice' -DEFAULT_PORT = 80 -DEFAULT_SCHEMA = 'default' - - -class ServerBase(object): - """Base class that handles requests to the biomart server. - Attributes: - host (str): Host to connect to for the biomart service. - path (str): Path to the biomart service on the host. - port (str): Port to connect to on the host. - url (str): Url used to connect to the biomart service. - use_cache (bool): Whether to cache requests to biomart. - """ - - def __init__(self, host=None, path=None, port=None): - """ServerBase constructor. - Args: - host (str): Url of host to connect to. - path (str): Path on the host to access to the biomart service. - port (int): Port to use for the connection. - use_cache (bool): Whether to cache requests. - """ - # Use defaults if arg is None. - host = DEFAULT_HOST if host is None else host - path = DEFAULT_PATH if path is None else path - port = DEFAULT_PORT if port is None else port - - # Add http prefix and remove trailing slash. - host = self._add_http_prefix(host) - host = self._remove_trailing_slash(host) - - # Ensure path starts with slash. - if not path.startswith('/'): - path = '/' + path - - self._host = host - self._path = path - self._port = port - - @property - def host(self): - """Host to connect to for the biomart service.""" - return self._host - - @property - def path(self): - """Path to the biomart service on the host.""" - return self._path - - @property - def port(self): - """Port to connect to on the host.""" - return self._port - - @property - def url(self): - """Url used to connect to the biomart service.""" - return '{}:{}{}'.format(self._host, self._port, self._path) - - @staticmethod - def _add_http_prefix(url, prefix='http://'): - if not url.startswith('http://') or url.startswith('https://'): - url = prefix + url - return url - - @staticmethod - def _remove_trailing_slash(url): - url.strip("/") - return url - - def get(self, **params): - """Performs get request to the biomart service. - Args: - **params (dict of str: any): Arbitrary keyword arguments, which - are added as parameters to the get request to biomart. - Returns: - requests.models.Response: Response from biomart for the request. - """ - - r = requests.get(self.url, params=params) - r.raise_for_status() - return r - - -class BiomartException(Exception): - """Basic exception class for biomart exceptions.""" - pass - - -class Dataset(ServerBase): - """Class representing a biomart dataset. - This class is responsible for handling queries to biomart - datasets. 
Queries can select a subset of attributes and can be filtered - using any available filters. A list of valid attributes is available in - the attributes property. If no attributes are given, a set of default - attributes is used. A list of valid filters is available in the filters - property. The type of value that can be specified for a given filter - depends on the filter as some filters accept single values, whilst others - can take lists of values. - Args: - name (str): Id of the dataset. - display_name (str): Display name of the dataset. - host (str): Url of host to connect to. - path (str): Path on the host to access to the biomart service. - port (int): Port to use for the connection. - use_cache (bool): Whether to cache requests. - virtual_schema (str): The virtual schema of the dataset. - Examples: - Directly connecting to a dataset: - >>> dataset = Dataset(name='hsapiens_gene_ensembl', - >>> host='http://www.ensembl.org') - Querying the dataset: - >>> dataset.query(attributes=['ensembl_gene_id', - >>> 'external_gene_name'], - >>> filters={'chromosome_name': ['1','2']}) - Listing available attributes: - >>> dataset.attributes - >>> dataset.list_attributes() - Listing available filters: - >>> dataset.filters - >>> dataset.list_filters() - """ - - def __init__(self, - name, - display_name='', - host=None, - path=None, - port=None, - virtual_schema=DEFAULT_SCHEMA): - super().__init__(host=host, path=path, port=port) - - self._name = name - self._display_name = display_name - self._virtual_schema = virtual_schema - self._filters = None - self._attributes = None - self._default_attributes = None - self._datatype = None - - @property - def name(self): - """Name of the dataset (used as dataset id).""" - return self._name - - @property - def display_name(self): - """Display name of the dataset.""" - return self._display_name - - @property - def filters(self): - """List of filters available for the dataset.""" - if self._filters is None: - self._filters, self._attributes = self._fetch_configuration() - return self._filters - - @property - def attributes(self): - """List of attributes available for the dataset (cached).""" - if self._attributes is None: - self._filters, self._attributes = self._fetch_configuration() - return self._attributes - - @property - def default_attributes(self): - """List of default attributes for the dataset.""" - if self._default_attributes is None: - self._default_attributes = { - name: attr - for name, attr in self.attributes.items() - if attr.default is True - } - return self._default_attributes - - def list_attributes(self): - """Lists available attributes in a readable DataFrame format. - Returns: - pd.DataFrame: Frame listing available attributes. - """ - - def _row_gen(attributes): - for attr in attributes.values(): - yield (attr.name, attr.display_name, attr.description) - - return pd.DataFrame.from_records( - _row_gen(self.attributes), - columns=['name', 'display_name', 'description']) - - def list_filters(self): - """Lists available filters in a readable DataFrame format. - Returns: - pd.DataFrame: Frame listing available filters. - """ - - def _row_gen(attributes): - for attr in attributes.values(): - yield (attr.name, attr.type, attr.description) - - return pd.DataFrame.from_records( - _row_gen(self.filters), columns=['name', 'type', 'description']) - - def _fetch_configuration(self): - # Get datasets using biomart. - response = self.get(type='configuration', dataset=self._name) - - # Check response for problems. 
- if 'Problem retrieving configuration' in response.text: - raise BiomartException('Failed to retrieve dataset configuration, ' - 'check the dataset name and schema.') - - # Get filters and attributes from xml. - xml = ElementTree.fromstring(response.content) - - filters = {f.name: f for f in self._filters_from_xml(xml)} - attributes = {a.name: a for a in self._attributes_from_xml(xml)} - - return filters, attributes - - @staticmethod - def _filters_from_xml(xml): - for node in xml.iter('FilterDescription'): - attrib = node.attrib - yield Filter( - name=attrib['internalName'], type=attrib.get('type', '')) - - @staticmethod - def _attributes_from_xml(xml): - for page_index, page in enumerate(xml.iter('AttributePage')): - for desc in page.iter('AttributeDescription'): - attrib = desc.attrib - - # Default attributes can only be from the first page. - default = (page_index == 0 and - attrib.get('default', '') == 'true') - - yield Attribute( - name=attrib['internalName'], - display_name=attrib.get('displayName', ''), - description=attrib.get('description', ''), - default=default) - - def query(self, - attributes=None, - filters=None, - only_unique=True, - use_attr_names=False, - dtypes=None, - datatype=None - ): - """Queries the dataset to retrieve the contained data. - Args: - attributes (list[str]): Names of attributes to fetch in query. - Attribute names must correspond to valid attributes. See - the attributes property for a list of valid attributes. - filters (dict[str,any]): Dictionary of filters --> values - to filter the dataset by. Filter names and values must - correspond to valid filters and filter values. See the - filters property for a list of valid filters. - only_unique (bool): Whether to return only rows containing - unique values (True) or to include duplicate rows (False). - use_attr_names (bool): Whether to use the attribute names - as column names in the result (True) or the attribute - display names (False). - dtypes (dict[str,any]): Dictionary of attributes --> data types - to describe to pandas how the columns should be handled - Returns: - pandas.DataFrame: DataFrame containing the query results. - """ - - # Example query from Ensembl biomart: - # - # - # - # - # - # - # - # - # - # - # - # - - # Setup query element. - root = ElementTree.Element('Query') - root.set('virtualSchemaName', self._virtual_schema) - root.set('formatter', 'TSV') - root.set('header', '1') - root.set('uniqueRows', native_str(int(only_unique))) - root.set('datasetConfigVersion', '0.6') - - # Add dataset element. - dataset = ElementTree.SubElement(root, 'Dataset') - dataset.set('name', self.name) - dataset.set('interface', 'default') - - # Default to default attributes if none requested. - if attributes is None: - attributes = list(self.default_attributes.keys()) - - # Add attribute elements. - for name in attributes: - try: - attr = self.attributes[name] - self._add_attr_node(dataset, attr) - except KeyError: - raise BiomartException( - 'Unknown attribute {}, check dataset attributes ' - 'for a list of valid attributes.'.format(name)) - - if filters is not None: - # Add filter elements. - for name, value in filters.items(): - try: - filter_ = self.filters[name] - self._add_filter_node(dataset, filter_, value) - except KeyError: - raise BiomartException( - 'Unknown filter {}, check dataset filters ' - 'for a list of valid filters.'.format(name)) - - # Fetch response. - response = self.get(query=ElementTree.tostring(root)) - - # Raise exception if an error occurred. 
- if 'Query ERROR' in response.text: - raise BiomartException(response.text) - - # Parse results into a DataFrame. - try: - result = pd.read_csv(StringIO(response.text), - sep='\t', dtype=dtypes) - if (datatype == "protein_coordinates"): - result.to_csv( - 'Hs_ProteinCoordinates_build_100_38.csv', sep='\t') - elif(datatype == "protein_feature"): - result.to_csv( - 'Hs_ProteinFeatures_build_100_38.csv', sep='\t') - # Type error is raised of a data type is not understood by pandas - except TypeError as err: - raise ValueError("Non valid data type is used in dtypes") - - if use_attr_names: - # Rename columns with attribute names instead of display names. - column_map = { - self.attributes[attr].display_name: attr - for attr in attributes - } - result.rename(columns=column_map, inplace=True) - - return result - - # on loop for each exon in one transcript - # by default initialize the first aa start, aa_nt_start = 1 - def calculate_aa_positions(enst_id_new, enst_id_old, cds_start, cds_stop): - # check if new transcript - aa_stop = math.ceil((cds_stop - cds_start + 1) / 3) - if enst_id_new != enst_id_old: - aa_start = 1 - # check if the last codon has less than three neucleotides - elif (cds_stop - cds_start + 1) % 3 != 0: - aa_start = aa_stop - return aa_stop, aa_start - - @staticmethod - def _add_attr_node(root, attr): - attr_el = ElementTree.SubElement(root, 'Attribute') - attr_el.set('name', attr.name) - - @staticmethod - def _add_filter_node(root, filter_, value): - """Adds filter xml node to root.""" - filter_el = ElementTree.SubElement(root, 'Filter') - filter_el.set('name', filter_.name) - - # Set filter value depending on type. - if filter_.type == 'boolean': - # Boolean case. - if value is True or value.lower() in {'included', 'only'}: - filter_el.set('excluded', '0') - elif value is False or value.lower() == 'excluded': - filter_el.set('excluded', '1') - else: - raise ValueError('Invalid value for boolean filter ({})' - .format(value)) - elif isinstance(value, list) or isinstance(value, tuple): - # List case. - filter_el.set('value', ','.join(map(str, value))) - else: - # Default case. - filter_el.set('value', str(value)) - - def __repr__(self): - return ('' - .format(self._name, self._display_name)) - - -class Attribute(object): - """Biomart dataset attribute. - Attributes: - name (str): Attribute name. - display_name (str): Attribute display name. - description (str): Attribute description. - """ - - def __init__(self, name, display_name='', description='', default=False): - """Attribute constructor. - Args: - name (str): Attribute name. - display_name (str): Attribute display name. - description (str): Attribute description. - default (bool): Whether the attribute is a default - attribute of the corresponding datasets. - """ - self._name = name - self._display_name = display_name - self._description = description - self._default = default - - @property - def name(self): - """Name of the attribute.""" - return self._name - - @property - def display_name(self): - """Display name of the attribute.""" - return self._display_name - - @property - def description(self): - """Description of the attribute.""" - return self._description - - @property - def default(self): - """Whether this is a default attribute.""" - return self._default - - def __repr__(self): - return (('') - .format(self._name, self._display_name, self._description)) - - -class Filter(object): - """Biomart dataset filter. - Attributes: - name (str): Filter name. - type (str): Type of the filter (boolean, int, etc.). 
- description (str): Filter description. - """ - - def __init__(self, name, type, description=''): - """ Filter constructor. - Args: - name (str): Filter name. - type (str): Type of the filter (boolean, int, etc.). - description (str): Filter description. - """ - self._name = name - self._type = type - self._description = description - - @property - def name(self): - """Filter name.""" - return self._name - - @property - def type(self): - """Filter type.""" - return self._type - - @property - def description(self): - """Filter description.""" - return self._description - - def __repr__(self): - return ('' - .format(self.name, self.type)) - - -dataset = Dataset(name='apolyacanthus_gene_ensembl', - host='http://www.ensembl.org') - -# Protein Coordinates -dataset.query(attributes=["ensembl_transcript_id", "ensembl_exon_id", "ensembl_peptide_id", "start_position", - "end_position", "transcript_start", "transcript_end", "cdd", "cdd_start", "cdd_end"], datatype='protein_coordinates') - - -# Protein Features -dataset.query(attributes=["ensembl_gene_id", "ensembl_gene_id_version", "ensembl_transcript_id_version", - "interpro", "interpro_description", "interpro_start", "interpro_end", "cdd", "cdd_start", "cdd_end"], datatype='protein_feature') From c18c8607d441a48b38b1413adf342574d0cca10b Mon Sep 17 00:00:00 2001 From: Preeti Date: Thu, 2 Jun 2022 10:55:42 -0400 Subject: [PATCH 05/16] add args for getting protein coordinates --- altanalyze3/utilities/parser.py | 37 ++++++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/altanalyze3/utilities/parser.py b/altanalyze3/utilities/parser.py index 45246d8..8c2dc5d 100644 --- a/altanalyze3/utilities/parser.py +++ b/altanalyze3/utilities/parser.py @@ -4,6 +4,7 @@ from altanalyze3.utilities.helpers import get_version from altanalyze3.components.junction_count.main import count_junctions from altanalyze3.components.intron_count.main import count_introns +from altanalyze3.components.annotation.main import protein_coordinates from altanalyze3.utilities.io import get_all_bam_chr from altanalyze3.utilities.constants import IntRetCat @@ -25,7 +26,8 @@ def set_args_as_attributes(self): def add_common_arguments(self, parser): self.common_arguments = [ - ("--loglevel", "Logging level. Default: info", str, "info", ["fatal", "error", "warning", "info", "debug"]), + ("--loglevel", "Logging level. Default: info", str, + "info", ["fatal", "error", "warning", "info", "debug"]), ("--threads", "Number of threads to run in parallel where applicable", int, 1, None), ("--cpus", "Number of processes to run in parallel where applicable", int, 1, None), ("--output", "Output prefix", str, "results", None) @@ -49,7 +51,7 @@ def get_parser(self): subparsers = general_parser.add_subparsers() subparsers.required = True # Global parameters for all components of the tool - general_parser.add_argument( + general_parser.add_argument( "--version", action="version", version=get_version(), @@ -147,6 +149,35 @@ def get_parser(self): action="store_true" ) self.add_common_arguments(intron_parser) + + # Protein Domain Annotation parser + protein_coordinates_parser = subparsers.add_parser( + "proteincoordinates", + parents=[parent_parser], + help="Get Protein to Domain annotations" + ) + protein_coordinates_parser.set_defaults(func=get_protein_coordinates) + protein_coordinates_parser.add_argument( + "--name", + help="name of species eg. 
apolyacanthus_gene_ensembl", + type=str, + required=True, + ) + protein_coordinates_parser.add_argument( + "--host", + help="Select the host from where you want to import data", + type=str, + default="http://www.ensembl.org" + ) + protein_coordinates_parser.add_argument( + "--attributes", + help="Export certain coordinates or features from Ensembl", + type=str, + default=["ensembl_transcript_id", "ensembl_exon_id", "ensembl_peptide_id", "start_position", + "end_position", "transcript_start", "transcript_end", "cdd", "cdd_start", "cdd_end"] + ) + self.add_common_arguments(protein_coordinates_parser) + return general_parser def resolve_path(self, selected=None): @@ -193,4 +224,4 @@ def assert_args_for_count_introns(self): def assert_common_args(self): self.args.chr = get_all_bam_chr(self.args.bam, self.args.threads) \ if len(self.args.chr) == 0 else [c if c.startswith("chr") else f"chr{c}" for c in self.args.chr] - self.args.loglevel = getattr(logging, self.args.loglevel.upper()) \ No newline at end of file + self.args.loglevel = getattr(logging, self.args.loglevel.upper()) From 03ed5a68642f6120592ad62fca38aace988a8a29 Mon Sep 17 00:00:00 2001 From: Preeti Date: Thu, 2 Jun 2022 10:56:08 -0400 Subject: [PATCH 06/16] rename the file to main --- altanalyze3/components/annotation/main.py | 511 ++++++++++++++++++++++ 1 file changed, 511 insertions(+) create mode 100644 altanalyze3/components/annotation/main.py diff --git a/altanalyze3/components/annotation/main.py b/altanalyze3/components/annotation/main.py new file mode 100644 index 0000000..c9c944c --- /dev/null +++ b/altanalyze3/components/annotation/main.py @@ -0,0 +1,511 @@ +""" +This is a generalized python module for getting data from Ensemble using Biomart server. +""" + + +import requests +from future.utils import native_str +from xml.etree import ElementTree +import pandas as pd +from io import StringIO +from xml.etree.ElementTree import fromstring as xml_from_string +from altanalyze3.utilities.helpers import ( + TimeIt +) + +DEFAULT_HOST = 'http://www.biomart.org' +DEFAULT_PATH = '/biomart/martservice' +DEFAULT_PORT = 80 +DEFAULT_SCHEMA = 'default' + + +class ServerBase(object): + """Base class that handles requests to the biomart server. + Attributes: + host (str): Host to connect to for the biomart service. + path (str): Path to the biomart service on the host. + port (str): Port to connect to on the host. + url (str): Url used to connect to the biomart service. + use_cache (bool): Whether to cache requests to biomart. + """ + + def __init__(self, host=None, path=None, port=None): + """ServerBase constructor. + Args: + host (str): Url of host to connect to. + path (str): Path on the host to access to the biomart service. + port (int): Port to use for the connection. + use_cache (bool): Whether to cache requests. + """ + # Use defaults if arg is None. + host = DEFAULT_HOST if host is None else host + path = DEFAULT_PATH if path is None else path + port = DEFAULT_PORT if port is None else port + + # Add http prefix and remove trailing slash. + host = self._add_http_prefix(host) + host = self._remove_trailing_slash(host) + + # Ensure path starts with slash. 
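+        # (the url property joins host, port and path directly, so a missing
+        # leading slash would otherwise produce a malformed service url)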
+ if not path.startswith('/'): + path = '/' + path + + self._host = host + self._path = path + self._port = port + + @property + def host(self): + """Host to connect to for the biomart service.""" + return self._host + + @property + def path(self): + """Path to the biomart service on the host.""" + return self._path + + @property + def port(self): + """Port to connect to on the host.""" + return self._port + + @property + def url(self): + """Url used to connect to the biomart service.""" + return '{}:{}{}'.format(self._host, self._port, self._path) + + @staticmethod + def _add_http_prefix(url, prefix='http://'): + if not url.startswith('http://') or url.startswith('https://'): + url = prefix + url + return url + + @staticmethod + def _remove_trailing_slash(url): + url.strip("/") + return url + + def get(self, **params): + """Performs get request to the biomart service. + Args: + **params (dict of str: any): Arbitrary keyword arguments, which + are added as parameters to the get request to biomart. + Returns: + requests.models.Response: Response from biomart for the request. + """ + + r = requests.get(self.url, params=params) + r.raise_for_status() + return r + + +class BiomartException(Exception): + """Basic exception class for biomart exceptions.""" + pass + + +class Dataset(ServerBase): + """Class representing a biomart dataset. + This class is responsible for handling queries to biomart + datasets. Queries can select a subset of attributes and can be filtered + using any available filters. A list of valid attributes is available in + the attributes property. If no attributes are given, a set of default + attributes is used. A list of valid filters is available in the filters + property. The type of value that can be specified for a given filter + depends on the filter as some filters accept single values, whilst others + can take lists of values. + Args: + name (str): Id of the dataset. + display_name (str): Display name of the dataset. + host (str): Url of host to connect to. + path (str): Path on the host to access to the biomart service. + port (int): Port to use for the connection. + use_cache (bool): Whether to cache requests. + virtual_schema (str): The virtual schema of the dataset. 
+ Examples: + Directly connecting to a dataset: + >>> dataset = Dataset(name='hsapiens_gene_ensembl', + >>> host='http://www.ensembl.org') + Querying the dataset: + >>> dataset.query(attributes=['ensembl_gene_id', + >>> 'external_gene_name'], + >>> filters={'chromosome_name': ['1','2']}) + Listing available attributes: + >>> dataset.attributes + >>> dataset.list_attributes() + Listing available filters: + >>> dataset.filters + >>> dataset.list_filters() + """ + + def __init__(self, + name, + display_name='', + host=None, + path=None, + port=None, + virtual_schema=DEFAULT_SCHEMA): + super().__init__(host=host, path=path, port=port) + + self._name = name + self._display_name = display_name + self._virtual_schema = virtual_schema + self._filters = None + self._attributes = None + self._default_attributes = None + self._datatype = None + + @property + def name(self): + """Name of the dataset (used as dataset id).""" + return self._name + + @property + def display_name(self): + """Display name of the dataset.""" + return self._display_name + + @property + def filters(self): + """List of filters available for the dataset.""" + if self._filters is None: + self._filters, self._attributes = self._fetch_configuration() + return self._filters + + @property + def attributes(self): + """List of attributes available for the dataset (cached).""" + if self._attributes is None: + self._filters, self._attributes = self._fetch_configuration() + return self._attributes + + @property + def default_attributes(self): + """List of default attributes for the dataset.""" + if self._default_attributes is None: + self._default_attributes = { + name: attr + for name, attr in self.attributes.items() + if attr.default is True + } + return self._default_attributes + + def list_attributes(self): + """Lists available attributes in a readable DataFrame format. + Returns: + pd.DataFrame: Frame listing available attributes. + """ + + def _row_gen(attributes): + for attr in attributes.values(): + yield (attr.name, attr.display_name, attr.description) + + return pd.DataFrame.from_records( + _row_gen(self.attributes), + columns=['name', 'display_name', 'description']) + + def list_filters(self): + """Lists available filters in a readable DataFrame format. + Returns: + pd.DataFrame: Frame listing available filters. + """ + + def _row_gen(attributes): + for attr in attributes.values(): + yield (attr.name, attr.type, attr.description) + + return pd.DataFrame.from_records( + _row_gen(self.filters), columns=['name', 'type', 'description']) + + def _fetch_configuration(self): + # Get datasets using biomart. + response = self.get(type='configuration', dataset=self._name) + + # Check response for problems. + if 'Problem retrieving configuration' in response.text: + raise BiomartException('Failed to retrieve dataset configuration, ' + 'check the dataset name and schema.') + + # Get filters and attributes from xml. 
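+        # (filters come from FilterDescription nodes and attributes from
+        # AttributeDescription nodes, via the two static helpers below)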
+ xml = ElementTree.fromstring(response.content) + + filters = {f.name: f for f in self._filters_from_xml(xml)} + attributes = {a.name: a for a in self._attributes_from_xml(xml)} + + return filters, attributes + + @staticmethod + def _filters_from_xml(xml): + for node in xml.iter('FilterDescription'): + attrib = node.attrib + yield Filter( + name=attrib['internalName'], type=attrib.get('type', '')) + + @staticmethod + def _attributes_from_xml(xml): + for page_index, page in enumerate(xml.iter('AttributePage')): + for desc in page.iter('AttributeDescription'): + attrib = desc.attrib + + # Default attributes can only be from the first page. + default = (page_index == 0 and + attrib.get('default', '') == 'true') + + yield Attribute( + name=attrib['internalName'], + display_name=attrib.get('displayName', ''), + description=attrib.get('description', ''), + default=default) + + def query(self, + attributes=None, + filters=None, + only_unique=True, + use_attr_names=False, + dtypes=None, + datatype=None + ): + """Queries the dataset to retrieve the contained data. + Args: + attributes (list[str]): Names of attributes to fetch in query. + Attribute names must correspond to valid attributes. See + the attributes property for a list of valid attributes. + filters (dict[str,any]): Dictionary of filters --> values + to filter the dataset by. Filter names and values must + correspond to valid filters and filter values. See the + filters property for a list of valid filters. + only_unique (bool): Whether to return only rows containing + unique values (True) or to include duplicate rows (False). + use_attr_names (bool): Whether to use the attribute names + as column names in the result (True) or the attribute + display names (False). + dtypes (dict[str,any]): Dictionary of attributes --> data types + to describe to pandas how the columns should be handled + Returns: + pandas.DataFrame: DataFrame containing the query results. + """ + + # Example query from Ensembl biomart: + # + # + # + # + # + # + # + # + # + # + # + # + + # Setup query element. + root = ElementTree.Element('Query') + root.set('virtualSchemaName', self._virtual_schema) + root.set('formatter', 'TSV') + root.set('header', '1') + root.set('uniqueRows', native_str(int(only_unique))) + root.set('datasetConfigVersion', '0.6') + + # Add dataset element. + dataset = ElementTree.SubElement(root, 'Dataset') + dataset.set('name', self.name) + dataset.set('interface', 'default') + + # Default to default attributes if none requested. + if attributes is None: + attributes = list(self.default_attributes.keys()) + + # Add attribute elements. + for name in attributes: + try: + attr = self.attributes[name] + self._add_attr_node(dataset, attr) + except KeyError: + raise BiomartException( + 'Unknown attribute {}, check dataset attributes ' + 'for a list of valid attributes.'.format(name)) + + if filters is not None: + # Add filter elements. + for name, value in filters.items(): + try: + filter_ = self.filters[name] + self._add_filter_node(dataset, filter_, value) + except KeyError: + raise BiomartException( + 'Unknown filter {}, check dataset filters ' + 'for a list of valid filters.'.format(name)) + + # Fetch response. + response = self.get(query=ElementTree.tostring(root)) + + # Raise exception if an error occurred. + if 'Query ERROR' in response.text: + raise BiomartException(response.text) + + # Parse results into a DataFrame. 
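+        # (biomart answers with tab-separated text, so the response body is
+        # wrapped in StringIO and handed to pandas.read_csv with sep='\t')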
+ try: + result = pd.read_csv(StringIO(response.text), + sep='\t', dtype=dtypes) + # calculate the aa_nt_start and end positions + + if (datatype == "protein_coordinates"): + result.to_csv( + 'Hs_ProteinCoordinates_build_100_38.csv', sep='\t') + elif(datatype == "protein_feature"): + result.to_csv( + 'Hs_ProteinFeatures_build_100_38.csv', sep='\t') + # Type error is raised of a data type is not understood by pandas + except TypeError as err: + raise ValueError("Non valid data type is used in dtypes") + + if use_attr_names: + # Rename columns with attribute names instead of display names. + column_map = { + self.attributes[attr].display_name: attr + for attr in attributes + } + result.rename(columns=column_map, inplace=True) + + return result + + # on loop for each exon in one transcript + # by default initialize the first aa start, aa_nt_start = 1 + def calculate_aa_positions(enst_id_new, enst_id_old, cds_start, cds_stop): + # check if new transcript + aa_stop = math.ceil((cds_stop - cds_start + 1) / 3) + if enst_id_new != enst_id_old: + aa_start = 1 + # check if the last codon has less than three neucleotides + elif (cds_stop - cds_start + 1) % 3 != 0: + aa_start = aa_stop + return aa_stop, aa_start + + @staticmethod + def _add_attr_node(root, attr): + attr_el = ElementTree.SubElement(root, 'Attribute') + attr_el.set('name', attr.name) + + @staticmethod + def _add_filter_node(root, filter_, value): + """Adds filter xml node to root.""" + filter_el = ElementTree.SubElement(root, 'Filter') + filter_el.set('name', filter_.name) + + # Set filter value depending on type. + if filter_.type == 'boolean': + # Boolean case. + if value is True or value.lower() in {'included', 'only'}: + filter_el.set('excluded', '0') + elif value is False or value.lower() == 'excluded': + filter_el.set('excluded', '1') + else: + raise ValueError('Invalid value for boolean filter ({})' + .format(value)) + elif isinstance(value, list) or isinstance(value, tuple): + # List case. + filter_el.set('value', ','.join(map(str, value))) + else: + # Default case. + filter_el.set('value', str(value)) + + def __repr__(self): + return ('' + .format(self._name, self._display_name)) + + +class Attribute(object): + """Biomart dataset attribute. + Attributes: + name (str): Attribute name. + display_name (str): Attribute display name. + description (str): Attribute description. + """ + + def __init__(self, name, display_name='', description='', default=False): + """Attribute constructor. + Args: + name (str): Attribute name. + display_name (str): Attribute display name. + description (str): Attribute description. + default (bool): Whether the attribute is a default + attribute of the corresponding datasets. + """ + self._name = name + self._display_name = display_name + self._description = description + self._default = default + + @property + def name(self): + """Name of the attribute.""" + return self._name + + @property + def display_name(self): + """Display name of the attribute.""" + return self._display_name + + @property + def description(self): + """Description of the attribute.""" + return self._description + + @property + def default(self): + """Whether this is a default attribute.""" + return self._default + + def __repr__(self): + return (('') + .format(self._name, self._display_name, self._description)) + + +class Filter(object): + """Biomart dataset filter. + Attributes: + name (str): Filter name. + type (str): Type of the filter (boolean, int, etc.). + description (str): Filter description. 
+ """ + + def __init__(self, name, type, description=''): + """ Filter constructor. + Args: + name (str): Filter name. + type (str): Type of the filter (boolean, int, etc.). + description (str): Filter description. + """ + self._name = name + self._type = type + self._description = description + + @property + def name(self): + """Filter name.""" + return self._name + + @property + def type(self): + """Filter type.""" + return self._type + + @property + def description(self): + """Filter description.""" + return self._description + + def __repr__(self): + return ('' + .format(self.name, self.type)) + + +def protein_coordinates(args): + with TimeIt(): + dataset = Dataset(name={args.name}, host={args.host}) + logging.info( + f"""Getting Data from {args.host} for given species {args.name}""") + dataset.query(attributes=args.attributes) From 66a5c4747ae2bde0ad1e140cfc719b78c2bfa6dc Mon Sep 17 00:00:00 2001 From: Preeti Date: Thu, 2 Jun 2022 21:14:29 -0400 Subject: [PATCH 07/16] code review comments --- altanalyze3/components/annotation/main.py | 10 ++++++---- altanalyze3/utilities/parser.py | 6 +++--- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/altanalyze3/components/annotation/main.py b/altanalyze3/components/annotation/main.py index c9c944c..92805ec 100644 --- a/altanalyze3/components/annotation/main.py +++ b/altanalyze3/components/annotation/main.py @@ -350,6 +350,11 @@ def query(self, result = pd.read_csv(StringIO(response.text), sep='\t', dtype=dtypes) # calculate the aa_nt_start and end positions + cds_start = result["cdd_start"] + cds_end = result["cdd_end"] + enst_id_old = result["Exon stable ID"] + calculate_aa_positions( + enst_id_new, enst_id_old, cds_start, cds_stop) if (datatype == "protein_coordinates"): result.to_csv( @@ -368,16 +373,13 @@ def query(self, for attr in attributes } result.rename(columns=column_map, inplace=True) - return result # on loop for each exon in one transcript # by default initialize the first aa start, aa_nt_start = 1 - def calculate_aa_positions(enst_id_new, enst_id_old, cds_start, cds_stop): + def calculate_aa_positions(cds_start, cds_stop): # check if new transcript aa_stop = math.ceil((cds_stop - cds_start + 1) / 3) - if enst_id_new != enst_id_old: - aa_start = 1 # check if the last codon has less than three neucleotides elif (cds_stop - cds_start + 1) % 3 != 0: aa_start = aa_stop diff --git a/altanalyze3/utilities/parser.py b/altanalyze3/utilities/parser.py index 8c2dc5d..6bdc762 100644 --- a/altanalyze3/utilities/parser.py +++ b/altanalyze3/utilities/parser.py @@ -167,14 +167,14 @@ def get_parser(self): "--host", help="Select the host from where you want to import data", type=str, - default="http://www.ensembl.org" + default="https://www.ensembl.org" ) protein_coordinates_parser.add_argument( "--attributes", help="Export certain coordinates or features from Ensembl", - type=str, + nargs="*", default=["ensembl_transcript_id", "ensembl_exon_id", "ensembl_peptide_id", "start_position", - "end_position", "transcript_start", "transcript_end", "cdd", "cdd_start", "cdd_end"] + "end_position", "transcript_start", "transcript_end", "cds_start", "cds_end"] ) self.add_common_arguments(protein_coordinates_parser) From e8fd70e335b3dc5769f2030ba465728e6f1780ea Mon Sep 17 00:00:00 2001 From: Preeti Date: Mon, 6 Jun 2022 11:28:40 -0400 Subject: [PATCH 08/16] correct values of aa_start and aa_end positions being calculated --- altanalyze3/components/annotation/main.py | 62 ++++++++++++++--------- 1 file changed, 38 insertions(+), 24 deletions(-) 
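The rule this patch converges on is easy to state in isolation: with 1-based
CDS coordinates, nucleotide position p falls in codon ceil(p / 3). An exon
whose coding sequence ends mid-codon therefore shares that amino acid with the
next exon, which is why consecutive rows in the sample file of a later patch
can repeat a position (e.g. aa_stop 85 followed by aa_start 85). A minimal
self-contained sketch of the mapping (the helper name is illustrative, not
part of the module):

    import math

    def cds_to_aa(cds_pos):
        # 1-based CDS nucleotide position -> 1-based amino-acid position;
        # codons cover nucleotides 1-3, 4-6, ..., so ceil(pos / 3) is the
        # codon (amino-acid) index.
        return math.ceil(cds_pos / 3)

    # spot checks against the sample rows: CDS 1-105 -> aa 1-35,
    # CDS 106-254 -> aa 36-85, CDS 255-314 -> aa 85-105
    assert (cds_to_aa(1), cds_to_aa(105)) == (1, 35)
    assert (cds_to_aa(106), cds_to_aa(254)) == (36, 85)
    assert (cds_to_aa(255), cds_to_aa(314)) == (85, 105)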
diff --git a/altanalyze3/components/annotation/main.py b/altanalyze3/components/annotation/main.py index 92805ec..17604df 100644 --- a/altanalyze3/components/annotation/main.py +++ b/altanalyze3/components/annotation/main.py @@ -9,9 +9,10 @@ import pandas as pd from io import StringIO from xml.etree.ElementTree import fromstring as xml_from_string -from altanalyze3.utilities.helpers import ( - TimeIt -) +import math +# from altanalyze3.utilities.helpers import ( +# TimeIt +# ) DEFAULT_HOST = 'http://www.biomart.org' DEFAULT_PATH = '/biomart/martservice' @@ -257,6 +258,13 @@ def _attributes_from_xml(xml): description=attrib.get('description', ''), default=default) + # on loop for each exon in one transcript + def calculate_aa_positions(self, cds_pos): + # check if new transcript + + aa_position = math.ceil((cds_pos) / 3) + return aa_position + def query(self, attributes=None, filters=None, @@ -349,12 +357,20 @@ def query(self, try: result = pd.read_csv(StringIO(response.text), sep='\t', dtype=dtypes) + # calculate the aa_nt_start and end positions - cds_start = result["cdd_start"] - cds_end = result["cdd_end"] - enst_id_old = result["Exon stable ID"] - calculate_aa_positions( - enst_id_new, enst_id_old, cds_start, cds_stop) + + result = result.dropna(subset=['CDS start']) + result = result.dropna(subset=['CDS end']) + cds_start = result['CDS start'].astype(int) + cds_stop = result['CDS end'].astype(int) + result["aa_start"] = cds_start.apply( + lambda x: math.ceil((x) / 3)) + result["aa_stop"] = cds_stop.apply( + lambda x: math.ceil((x) / 3)) + # aa_start = self.calculate_aa_positions(cds_start).astype(float) + # aa_stop = self.calculate_aa_positions(cds_stop) + # aa_start = result["aa_start"] if (datatype == "protein_coordinates"): result.to_csv( @@ -375,16 +391,6 @@ def query(self, result.rename(columns=column_map, inplace=True) return result - # on loop for each exon in one transcript - # by default initialize the first aa start, aa_nt_start = 1 - def calculate_aa_positions(cds_start, cds_stop): - # check if new transcript - aa_stop = math.ceil((cds_stop - cds_start + 1) / 3) - # check if the last codon has less than three neucleotides - elif (cds_stop - cds_start + 1) % 3 != 0: - aa_start = aa_stop - return aa_stop, aa_start - @staticmethod def _add_attr_node(root, attr): attr_el = ElementTree.SubElement(root, 'Attribute') @@ -505,9 +511,17 @@ def __repr__(self): .format(self.name, self.type)) -def protein_coordinates(args): - with TimeIt(): - dataset = Dataset(name={args.name}, host={args.host}) - logging.info( - f"""Getting Data from {args.host} for given species {args.name}""") - dataset.query(attributes=args.attributes) +# def protein_coordinates(args): +# with TimeIt(): +# dataset = Dataset(name={args.name}, host={args.host}) +# logging.info( +# f"""Getting Data from {args.host} for given species {args.name}""") +# dataset.query(attributes=args.attributes) + + +dataset = Dataset(name='apolyacanthus_gene_ensembl', + host='http://www.ensembl.org') + +# Protein Coordinates +dataset.query(attributes=["ensembl_transcript_id", "ensembl_exon_id", "start_position", + "end_position", "transcript_start", "transcript_end", "cds_start", "cds_end"], datatype='protein_coordinates') From a7ee6c6ae124f99152b77e168d4042f79ba0f324 Mon Sep 17 00:00:00 2001 From: Preeti Date: Mon, 6 Jun 2022 11:30:15 -0400 Subject: [PATCH 09/16] sample file updated --- docs/Hs_ProteinCoordinates_build_100_38.csv | 30 ++++++++++++--------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git 
a/docs/Hs_ProteinCoordinates_build_100_38.csv b/docs/Hs_ProteinCoordinates_build_100_38.csv index e659e5d..3516b2c 100644 --- a/docs/Hs_ProteinCoordinates_build_100_38.csv +++ b/docs/Hs_ProteinCoordinates_build_100_38.csv @@ -1,12 +1,18 @@ -,Exon stable ID,Gene start (bp),Gene end (bp),Gene name,Protein stable ID,Transcript start (bp),Transcript end (bp),CDD start,CDD end -0,ENSAPOE00000120411,288439,298458,zgc:63972,ENSAPOP00000020934,288439,298458,, -1,ENSAPOE00000120412,288439,298458,zgc:63972,ENSAPOP00000020934,288439,298458,, -2,ENSAPOE00000120413,288439,298458,zgc:63972,ENSAPOP00000020934,288439,298458,, -3,ENSAPOE00000120414,288439,298458,zgc:63972,ENSAPOP00000020934,288439,298458,, -4,ENSAPOE00000120415,288439,298458,zgc:63972,ENSAPOP00000020934,288439,298458,, -5,ENSAPOE00000120416,288439,298458,zgc:63972,ENSAPOP00000020934,288439,298458,, -6,ENSAPOE00000000170,290637,294901,,ENSAPOP00000020929,290637,294901,, -7,ENSAPOE00000000171,290637,294901,,ENSAPOP00000020929,290637,294901,, -8,ENSAPOE00000000174,290637,294901,,ENSAPOP00000020929,290637,294901,, -9,ENSAPOE00000000177,290637,294901,,ENSAPOP00000020929,290637,294901,, -10,ENSAPOE00000000280,290637,294901,,ENSAPOP00000020929,290637,294901,, \ No newline at end of file +Transcript stable ID Exon stable ID Gene start (bp) Gene end (bp) Transcript start (bp) Transcript end (bp) CDS start CDS end aa_start aa_stop +0 ENSAPOT00000017612 ENSAPOE00000120411 288439 298458 288439 298458 1.0 105.0 1 35 +1 ENSAPOT00000017612 ENSAPOE00000120412 288439 298458 288439 298458 106.0 254.0 36 85 +2 ENSAPOT00000017612 ENSAPOE00000120413 288439 298458 288439 298458 255.0 314.0 85 105 +3 ENSAPOT00000017612 ENSAPOE00000120414 288439 298458 288439 298458 315.0 360.0 105 120 +4 ENSAPOT00000017612 ENSAPOE00000120415 288439 298458 288439 298458 361.0 410.0 121 137 +5 ENSAPOT00000017612 ENSAPOE00000120416 288439 298458 288439 298458 411.0 513.0 137 171 +6 ENSAPOT00000017559 ENSAPOE00000000170 290637 294901 290637 294901 1.0 47.0 1 16 +7 ENSAPOT00000017559 ENSAPOE00000000171 290637 294901 290637 294901 48.0 103.0 16 35 +8 ENSAPOT00000017559 ENSAPOE00000000174 290637 294901 290637 294901 104.0 165.0 35 55 +9 ENSAPOT00000017559 ENSAPOE00000000177 290637 294901 290637 294901 166.0 303.0 56 101 +10 ENSAPOT00000017559 ENSAPOE00000000280 290637 294901 290637 294901 304.0 380.0 102 127 +11 ENSAPOT00000017559 ENSAPOE00000120530 290637 294901 290637 294901 381.0 396.0 127 132 +12 ENSAPOT00000017555 ENSAPOE00000000178 310862 317808 310862 317808 499.0 646.0 167 216 +13 ENSAPOT00000017555 ENSAPOE00000000181 310862 317808 310862 317808 647.0 819.0 216 273 +14 ENSAPOT00000017555 ENSAPOE00000000198 310862 317808 310862 317808 161.0 347.0 54 116 +15 ENSAPOT00000017555 ENSAPOE00000000201 310862 317808 310862 317808 348.0 498.0 116 166 +16 ENSAPOT00000017555 ENSAPOE00000000406 310862 317808 310862 317808 958.0 1085.0 320 362 \ No newline at end of file From abbe0fcd07bb5c8b39c6efb8c48a6f76695a8656 Mon Sep 17 00:00:00 2001 From: Preeti Date: Mon, 6 Jun 2022 11:36:26 -0400 Subject: [PATCH 10/16] add commas --- docs/Hs_ProteinCoordinates_build_100_38.csv | 36 ++++++++++----------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/docs/Hs_ProteinCoordinates_build_100_38.csv b/docs/Hs_ProteinCoordinates_build_100_38.csv index 3516b2c..efbb571 100644 --- a/docs/Hs_ProteinCoordinates_build_100_38.csv +++ b/docs/Hs_ProteinCoordinates_build_100_38.csv @@ -1,18 +1,18 @@ -Transcript stable ID Exon stable ID Gene start (bp) Gene end (bp) Transcript start (bp) Transcript 
end (bp) CDS start CDS end aa_start aa_stop -0 ENSAPOT00000017612 ENSAPOE00000120411 288439 298458 288439 298458 1.0 105.0 1 35 -1 ENSAPOT00000017612 ENSAPOE00000120412 288439 298458 288439 298458 106.0 254.0 36 85 -2 ENSAPOT00000017612 ENSAPOE00000120413 288439 298458 288439 298458 255.0 314.0 85 105 -3 ENSAPOT00000017612 ENSAPOE00000120414 288439 298458 288439 298458 315.0 360.0 105 120 -4 ENSAPOT00000017612 ENSAPOE00000120415 288439 298458 288439 298458 361.0 410.0 121 137 -5 ENSAPOT00000017612 ENSAPOE00000120416 288439 298458 288439 298458 411.0 513.0 137 171 -6 ENSAPOT00000017559 ENSAPOE00000000170 290637 294901 290637 294901 1.0 47.0 1 16 -7 ENSAPOT00000017559 ENSAPOE00000000171 290637 294901 290637 294901 48.0 103.0 16 35 -8 ENSAPOT00000017559 ENSAPOE00000000174 290637 294901 290637 294901 104.0 165.0 35 55 -9 ENSAPOT00000017559 ENSAPOE00000000177 290637 294901 290637 294901 166.0 303.0 56 101 -10 ENSAPOT00000017559 ENSAPOE00000000280 290637 294901 290637 294901 304.0 380.0 102 127 -11 ENSAPOT00000017559 ENSAPOE00000120530 290637 294901 290637 294901 381.0 396.0 127 132 -12 ENSAPOT00000017555 ENSAPOE00000000178 310862 317808 310862 317808 499.0 646.0 167 216 -13 ENSAPOT00000017555 ENSAPOE00000000181 310862 317808 310862 317808 647.0 819.0 216 273 -14 ENSAPOT00000017555 ENSAPOE00000000198 310862 317808 310862 317808 161.0 347.0 54 116 -15 ENSAPOT00000017555 ENSAPOE00000000201 310862 317808 310862 317808 348.0 498.0 116 166 -16 ENSAPOT00000017555 ENSAPOE00000000406 310862 317808 310862 317808 958.0 1085.0 320 362 \ No newline at end of file +,Transcript stable ID,Exon stable ID,Gene start (bp),Gene end (bp),Transcript start (bp),Transcript end (bp),CDS start,CDS end,aa_start,aa_stop +0,ENSAPOT00000017612,ENSAPOE00000120411,288439,298458,288439,298458,1.0,105.0,1,35 +1,ENSAPOT00000017612,ENSAPOE00000120412,288439,298458,288439,298458,106.0,254.0,36,85 +2,ENSAPOT00000017612,ENSAPOE00000120413,288439,298458,288439,298458,255.0,314.0,85,105 +3,ENSAPOT00000017612,ENSAPOE00000120414,288439,298458,288439,298458,315.0,360.0,105,120 +4,ENSAPOT00000017612,ENSAPOE00000120415,288439,298458,288439,298458,361.0,410.0,121,137 +5,ENSAPOT00000017612,ENSAPOE00000120416,288439,298458,288439,298458,411.0,513.0,137,171 +6,ENSAPOT00000017559,ENSAPOE00000000170,290637,294901,290637,294901,1.0,47.0,1,16 +7,ENSAPOT00000017559,ENSAPOE00000000171,290637,294901,290637,294901,48.0,103.0,16,35 +8,ENSAPOT00000017559,ENSAPOE00000000174,290637,294901,290637,294901,104.0,165.0,35,55 +9,ENSAPOT00000017559,ENSAPOE00000000177,290637,294901,290637,294901,166.0,303.0,56,101 +10,ENSAPOT00000017559,ENSAPOE00000000280,290637,294901,290637,294901,304.0,380.0,102,127 +11,ENSAPOT00000017559,ENSAPOE00000120530,290637,294901,290637,294901,381.0,396.0,127,132 +12,ENSAPOT00000017555,ENSAPOE00000000178,310862,317808,310862,317808,499.0,646.0,167,216 +13,ENSAPOT00000017555,ENSAPOE00000000181,310862,317808,310862,317808,647.0,819.0,216,273 +14,ENSAPOT00000017555,ENSAPOE00000000198,310862,317808,310862,317808,161.0,347.0,54,116 +15,ENSAPOT00000017555,ENSAPOE00000000201,310862,317808,310862,317808,348.0,498.0,116,166 +16,ENSAPOT00000017555,ENSAPOE00000000406,310862,317808,310862,317808,958.0,1085.0,320,362 \ No newline at end of file From 65e5d5cd1c37ca8b2ed1f013ea1dfc39cc838415 Mon Sep 17 00:00:00 2001 From: Preeti Date: Mon, 6 Jun 2022 13:02:36 -0400 Subject: [PATCH 11/16] add subparser for protein coordinates and features --- altanalyze3/components/annotation/main.py | 31 ++++++++--------------- 1 file changed, 10 insertions(+), 21 
deletions(-)

diff --git a/altanalyze3/components/annotation/main.py b/altanalyze3/components/annotation/main.py
index 17604df..9a9ab81 100644
--- a/altanalyze3/components/annotation/main.py
+++ b/altanalyze3/components/annotation/main.py
@@ -10,9 +10,10 @@
 from io import StringIO
 from xml.etree.ElementTree import fromstring as xml_from_string
 import math
-# from altanalyze3.utilities.helpers import (
-#     TimeIt
-# )
+import logging
+from altanalyze3.utilities.helpers import (
+    TimeIt
+)
 
 DEFAULT_HOST = 'http://www.biomart.org'
 DEFAULT_PATH = '/biomart/martservice'
@@ -368,13 +368,10 @@ def query(self,
                 lambda x: math.ceil((x) / 3))
             result["aa_stop"] = cds_stop.apply(
                 lambda x: math.ceil((x) / 3))
-            # aa_start = self.calculate_aa_positions(cds_start).astype(float)
-            # aa_stop = self.calculate_aa_positions(cds_stop)
-            # aa_start = result["aa_start"]
 
             if (datatype == "protein_coordinates"):
                 result.to_csv(
-                    'Hs_ProteinCoordinates_build_100_38.csv', sep='\t')
+                    'Hs_ProteinCoordinates_build_100_38.csv')
             elif(datatype == "protein_feature"):
                 result.to_csv(
                     'Hs_ProteinFeatures_build_100_38.csv', sep='\t')
@@ -511,17 +508,9 @@ def __repr__(self):
             .format(self.name, self.type))
 
 
-# def protein_coordinates(args):
-#     with TimeIt():
-#         dataset = Dataset(name={args.name}, host={args.host})
-#         logging.info(
-#             f"""Getting Data from {args.host} for given species {args.name}""")
-#         dataset.query(attributes=args.attributes)
-
-
-dataset = Dataset(name='apolyacanthus_gene_ensembl',
-                  host='http://www.ensembl.org')
-
-# Protein Coordinates
-dataset.query(attributes=["ensembl_transcript_id", "ensembl_exon_id", "start_position",
-                          "end_position", "transcript_start", "transcript_end", "cds_start", "cds_end"], datatype='protein_coordinates')
+def protein_coordinates(args):
+    with TimeIt():
+        dataset = Dataset(name=args.name, host=args.host)
+        logging.info(
+            f"""Getting Data from {args.host} for given species {args.name}""")
+        dataset.query(attributes=args.attributes)

From 618551b11a4a587bfe982fe77522c715f1ffb5f5 Mon Sep 17 00:00:00 2001
From: Preeti
Date: Wed, 8 Jun 2022 14:37:47 -0400
Subject: [PATCH 12/16] rename

---
 altanalyze3/utilities/parser.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/altanalyze3/utilities/parser.py b/altanalyze3/utilities/parser.py
index 6bdc762..2f92501 100644
--- a/altanalyze3/utilities/parser.py
+++ b/altanalyze3/utilities/parser.py
@@ -156,7 +156,7 @@ def get_parser(self):
             parents=[parent_parser],
             help="Get Protein to Domain annotations"
         )
-        protein_coordinates_parser.set_defaults(func=get_protein_coordinates)
+        protein_coordinates_parser.set_defaults(func=protein_coordinates)
         protein_coordinates_parser.add_argument(
             "--name",
             help="name of species eg. apolyacanthus_gene_ensembl",

From 053534747ea97a7bf9cc6f1b565e64098f18583f Mon Sep 17 00:00:00 2001
From: Preeti
Date: Mon, 13 Jun 2022 14:46:34 -0400
Subject: [PATCH 13/16] add lambda functions to get aa start and stop columns

---
 altanalyze3/components/annotation/main.py | 18 +++++-------------
 1 file changed, 5 insertions(+), 13 deletions(-)

diff --git a/altanalyze3/components/annotation/main.py b/altanalyze3/components/annotation/main.py
index 9a9ab81..ee5d075 100644
--- a/altanalyze3/components/annotation/main.py
+++ b/altanalyze3/components/annotation/main.py
@@ -357,24 +357,16 @@ def query(self,
         try:
             result = pd.read_csv(StringIO(response.text),
                                  sep='\t', dtype=dtypes)
-
-            # calculate the aa_nt_start and end positions
-
             result = result.dropna(subset=['CDS start'])
             result = result.dropna(subset=['CDS end'])
             cds_start = result['CDS start'].astype(int)
             cds_stop = result['CDS end'].astype(int)
-            result["aa_start"] = cds_start.apply(
-                lambda x: math.ceil((x) / 3))
-            result["aa_stop"] = cds_stop.apply(
-                lambda x: math.ceil((x) / 3))
-
-            if (datatype == "protein_coordinates"):
-                result.to_csv(
-                    'Hs_ProteinCoordinates_build_100_38.csv')
-            elif(datatype == "protein_feature"):
-                result.to_csv(
-                    'Hs_ProteinFeatures_build_100_38.csv', sep='\t')
+            result["aa_start"] = cds_start.apply(lambda x: math.ceil((x) / 3))
+            result["aa_stop"] = cds_stop.apply(lambda x: math.ceil((x) / 3))
+
+            result.to_csv('Hs_ProteinCoordinates_build_100_38.csv', sep='\t')
+            result.to_csv('Hs_ProteinFeatures_build_100_38.csv', sep='\t')
 
             # Type error is raised of a data type is not understood by pandas
         except TypeError as err:
             raise ValueError("Non valid data type is used in dtypes")

From 414bf7a61d038c145473a12c107833beb46c33d7 Mon Sep 17 00:00:00 2001
From: Preeti
Date: Thu, 16 Jun 2022 11:38:33 -0400
Subject: [PATCH 14/16] revised changes - args

---
 altanalyze3/bin/altanalyze3               |  2 +-
 altanalyze3/components/annotation/main.py | 30 ++++++----------
 2 files changed, 9 insertions(+), 23 deletions(-)

diff --git a/altanalyze3/bin/altanalyze3 b/altanalyze3/bin/altanalyze3
index c054719..97d762b 100644
--- a/altanalyze3/bin/altanalyze3
+++ b/altanalyze3/bin/altanalyze3
@@ -12,4 +12,4 @@ def main(args=None):
 
 
 if __name__ == "__main__":
-    sys.exit(main(sys.argv[1:]))
\ No newline at end of file
+    sys.exit(main(sys.argv[1:]))

diff --git a/altanalyze3/components/annotation/main.py b/altanalyze3/components/annotation/main.py
index ee5d075..e25f61c 100644
--- a/altanalyze3/components/annotation/main.py
+++ b/altanalyze3/components/annotation/main.py
@@ -2,7 +2,6 @@
 This is a generalized python module for getting data from Ensemble using Biomart server.
 """
 
-
 import requests
 from future.utils import native_str
 from xml.etree import ElementTree
@@ -27,7 +26,6 @@ class ServerBase(object):
         path (str): Path to the biomart service on the host.
         port (str): Port to connect to on the host.
         url (str): Url used to connect to the biomart service.
-        use_cache (bool): Whether to cache requests to biomart.
     """
 
     def __init__(self, host=None, path=None, port=None):
@@ -121,22 +119,7 @@ class Dataset(ServerBase):
         host (str): Url of host to connect to.
         path (str): Path on the host to access to the biomart service.
         port (int): Port to use for the connection.
-        use_cache (bool): Whether to cache requests.
         virtual_schema (str): The virtual schema of the dataset.
-    Examples:
-        Directly connecting to a dataset:
-            >>> dataset = Dataset(name='hsapiens_gene_ensembl',
-            >>>                   host='http://www.ensembl.org')
-        Querying the dataset:
-            >>> dataset.query(attributes=['ensembl_gene_id',
-            >>>                           'external_gene_name'],
-            >>>               filters={'chromosome_name': ['1','2']})
-        Listing available attributes:
-            >>> dataset.attributes
-            >>> dataset.list_attributes()
-        Listing available filters:
-            >>> dataset.filters
-            >>> dataset.list_filters()
     """
 
     def __init__(self,
                  name,
                  display_name='',
                  host=None,
                  path=None,
                  port=None,
-                 virtual_schema=DEFAULT_SCHEMA):
+                 virtual_schema=DEFAULT_SCHEMA, location=None):
         super().__init__(host=host, path=path, port=port)
 
         self._name = name
         self._display_name = display_name
         self._virtual_schema = virtual_schema
         self._filters = None
         self._attributes = None
         self._default_attributes = None
         self._datatype = None
+        self.location = location
 
     @property
     def name(self):
@@ -271,7 +255,7 @@ def query(self,
                 only_unique=True,
                 use_attr_names=False,
                 dtypes=None,
-                datatype=None
+                datatype=None,
                 ):
         """Queries the dataset to retrieve the contained data.
         Args:
@@ -321,6 +305,8 @@
         dataset.set('name', self.name)
         dataset.set('interface', 'default')
 
+        csv_location = self.location.with_suffix(".csv")
+        logging.info(f"""Save protein coordinates reads to {csv_location}""")
         # Default to default attributes if none requested.
         if attributes is None:
             attributes = list(self.default_attributes.keys())
@@ -364,10 +350,10 @@ def query(self,
             cds_stop = result['CDS end'].astype(int)
             result["aa_start"] = cds_start.apply(lambda x: math.ceil((x) / 3))
             result["aa_stop"] = cds_stop.apply(lambda x: math.ceil((x) / 3))
-            result.to_csv('Hs_ProteinCoordinates_build_100_38.csv', sep='\t')
-            result.to_csv('Hs_ProteinFeatures_build_100_38.csv', sep='\t')
+            with csv_location.open("w") as out_handler:
+                out_handler.write(result.to_csv(sep='\t'))
 
-            # Type error is raised of a data type is not understood by pandas
+            # Type error is raised if a data type is not understood by Pandas
         except TypeError as err:
             raise ValueError("Non valid data type is used in dtypes")

From ec88fd170abd6ec111f93adad18c0668aa426926 Mon Sep 17 00:00:00 2001
From: Preeti Singh
Date: Fri, 28 Oct 2022 06:45:44 -0400
Subject: [PATCH 15/16] remove dead code

---
 altanalyze3/utilities/parser.py | 59 +++++++++++++--------------------
 1 file changed, 23 insertions(+), 36 deletions(-)

diff --git a/altanalyze3/utilities/parser.py b/altanalyze3/utilities/parser.py
index 6bcc27d..1e555b8 100644
--- a/altanalyze3/utilities/parser.py
+++ b/altanalyze3/utilities/parser.py
@@ -4,11 +4,8 @@ import argparse
 from altanalyze3.utilities.helpers import get_version
 from altanalyze3.components.intron_count.main import count_introns
-<<<<<<< HEAD
-from altanalyze3.components.annotation.main import protein_coordinates
-=======
 from altanalyze3.components.junction_count.main import count_junctions
->>>>>>> master
+# from altanalyze3.components.annotation.main import protein_coordinates
 from altanalyze3.utilities.io import get_all_bam_chr
 from altanalyze3.utilities.constants import (
     IntRetCat,
@@ -31,19 +28,11 @@ def set_args_as_attributes(self):
 
     def add_common_arguments(self, parser):
         self.common_arguments = [
-<<<<<<< HEAD
-            ("--loglevel", "Logging level. Default: info", str,
-             "info", ["fatal", "error", "warning", "info", "debug"]),
-            ("--threads", "Number of threads to run in parallel where applicable", int, 1, None),
-            ("--cpus", "Number of processes to run in parallel where applicable", int, 1, None),
-            ("--output", "Output prefix", str, "results", None)
-=======
             ("--loglevel", "Logging level. Default: info", str, "info", ["fatal", "error", "warning", "info", "debug"]),
             ("--threads", "Number of threads to run in parallel where applicable. Default: 1", int, 1, None),
             ("--cpus", "Number of processes to run in parallel where applicable. Default: 1", int, 1, None),
             ("--tmp", "Temporary files location. Default: tmp", str, "tmp", None),
             ("--output", "Output prefix. Default: results", str, "results", None)
->>>>>>> master
         ]
         for param in self.common_arguments:
             parser.add_argument(
@@ -162,7 +151,6 @@ def get_parser(self):
             help="Export processed reads into the BAM file. Default: False",
             action="store_true"
         )
-<<<<<<< HEAD
         self.add_common_arguments(intron_parser)
 
         # Protein Domain Annotation parser
         protein_coordinates_parser = subparsers.add_parser(
             "proteincoordinates",
             parents=[parent_parser],
             help="Get Protein to Domain annotations"
         )
-        protein_coordinates_parser.set_defaults(func=protein_coordinates)
-        protein_coordinates_parser.add_argument(
-            "--name",
-            help="name of species eg. apolyacanthus_gene_ensembl",
-            type=str,
-            required=True,
-        )
-        protein_coordinates_parser.add_argument(
-            "--host",
-            help="Select the host from where you want to import data",
-            type=str,
-            default="https://www.ensembl.org"
-        )
-        protein_coordinates_parser.add_argument(
-            "--attributes",
-            help="Export certain coordinates or features from Ensembl",
-            nargs="*",
-            default=["ensembl_transcript_id", "ensembl_exon_id", "ensembl_peptide_id", "start_position",
-                     "end_position", "transcript_start", "transcript_end", "cds_start", "cds_end"]
-        )
-        self.add_common_arguments(protein_coordinates_parser)
+        # TODO
+        # protein_coordinates_parser.set_defaults(func=protein_coordinates)
+        # protein_coordinates_parser.add_argument(
+        #     "--name",
+        #     help="name of species eg. apolyacanthus_gene_ensembl",
+        #     type=str,
+        #     required=True,
+        # )
+        # protein_coordinates_parser.add_argument(
+        #     "--host",
+        #     help="Select the host from where you want to import data",
+        #     type=str,
+        #     default="https://www.ensembl.org"
+        # )
+        # protein_coordinates_parser.add_argument(
+        #     "--attributes",
+        #     help="Export certain coordinates or features from Ensembl",
+        #     nargs="*",
+        #     default=["ensembl_transcript_id", "ensembl_exon_id", "ensembl_peptide_id", "start_position",
+        #              "end_position", "transcript_start", "transcript_end", "cds_start", "cds_end"]
+        # )
+        # self.add_common_arguments(protein_coordinates_parser)
 
-=======
         self.add_common_arguments(junction_parser)
->>>>>>> master
         return general_parser

From 733cf47c060e2e7d89a3d7a428730f3a2e9b1206 Mon Sep 17 00:00:00 2001
From: Preeti Singh
Date: Fri, 28 Oct 2022 07:07:18 -0400
Subject: [PATCH 16/16] update gitignore to have .vscode

---
 .gitignore | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index e3f4f6e..ddb8b28 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,5 @@
 **/.DS_Store
-
+.vscode/
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
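For reference, the host normalization that ServerBase performs across this
series, condensed into a standalone sketch (the function name is illustrative,
not part of the module). Two Python details matter here: str.startswith
accepts a tuple of prefixes, which covers both schemes in a single test, and
str.strip/str.rstrip return a new string, so the result must be assigned or
returned rather than called for its side effect.

    def normalize_host(url, prefix='http://'):
        # add a scheme if none is present, then drop any trailing slash
        if not url.startswith(('http://', 'https://')):
            url = prefix + url
        return url.rstrip('/')

    assert normalize_host('www.ensembl.org/') == 'http://www.ensembl.org'
    assert normalize_host('https://www.ensembl.org') == 'https://www.ensembl.org'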