Merge pull request #37 from BiomedSciAI/uniprot-domain-task
Add new tasks based on UniProt keywords
yoavkt authored Aug 21, 2024
2 parents 7cea557 + f3cca1a commit 0d2e7e4
Showing 6 changed files with 198 additions and 43 deletions.
1 change: 1 addition & 0 deletions .github/workflows/python-package.yml
@@ -37,6 +37,7 @@ jobs:
python scripts/tasks_retrieval/HPA_tasks_creation.py --allow-downloads True
python scripts/tasks_retrieval/humantfs_task_creation.py --allow-downloads True
python scripts/tasks_retrieval/Reactome_tasks_creation.py --allow-downloads True
python scripts/tasks_retrieval/uniprot_keyword_tasks_creation.py --allow-downloads True
- name: Test with pytest
run: |
24 changes: 24 additions & 0 deletions gene_benchmark/task_retrieval.py
@@ -342,3 +342,27 @@ def tag_list_to_multi_label(
outcome_df.loc[index, true_cat] = 1
entities = pd.Series(outcome_df.index, name=entities_name)
return entities, outcome_df


def get_symbol_list(
url: str = "https://g-a8b222.dd271.03c0.data.globus.org/pub/databases/genenames/hgnc/json/hgnc_complete_set.json",
) -> list[str]:
"""
Retrieves the symbol list from a HGNC json like file.
Args:
----
url (str, optional): url for the json file download. Defaults to "https://g-a8b222.dd271.03c0.data.globus.org/pub/databases/genenames/hgnc/json/hgnc_complete_set.json".
Returns:
-------
list[str]: list of symbols
"""
with requests.get(url) as response:
response.raise_for_status()
reactome_res = response.json()
return [v["symbol"] for v in reactome_res["response"]["docs"]]


GENE_SYMBOL_URL = "https://g-a8b222.dd271.03c0.data.globus.org/pub/databases/genenames/hgnc/json/hgnc_complete_set.json"
23 changes: 1 addition & 22 deletions scripts/encodings_retrieval/extract_bag_of_words_encodings.py
@@ -3,32 +3,11 @@

import click
import pandas as pd
import requests
import yaml
from sklearn.feature_extraction.text import CountVectorizer

from gene_benchmark.descriptor import NCBIDescriptor

GENE_SYMBOL_URL = "https://g-a8b222.dd271.03c0.data.globus.org/pub/databases/genenames/hgnc/json/hgnc_complete_set.json"


def get_symbol_list(url: str = GENE_SYMBOL_URL):
"""
Get gene symbol list, default from : "https://g-a8b222.dd271.03c0.data.globus.org/pub/databases/genenames/hgnc/json/hgnc_complete_set.json".
Args:
----
url (str): the url
Returns:
-------
A list og gene names [str]
"""
with requests.get(url) as response:
response.raise_for_status()
reactome_res = response.json()
return [v["symbol"] for v in reactome_res["response"]["docs"]]
from gene_benchmark.task_retrieval import GENE_SYMBOL_URL, get_symbol_list


def get_descriptions(gene_symbols: list):
22 changes: 1 addition & 21 deletions scripts/tasks_retrieval/Reactome_tasks_creation.py
@@ -3,6 +3,7 @@
import requests

from gene_benchmark.task_retrieval import (
get_symbol_list,
list_form_to_onehot_form,
verify_source_of_data,
)
@@ -28,27 +29,6 @@ def get_token_link_for_symbols(symbols: list[str]) -> str:
return f"https://reactome.org/AnalysisService/download/{token}/pathways/TOTAL/result.csv"


def get_symbol_list(
url: str = "https://g-a8b222.dd271.03c0.data.globus.org/pub/databases/genenames/hgnc/json/hgnc_complete_set.json",
) -> list[str]:
"""
Retrieves the symbol list from a HGNC json like file.
Args:
----
url (str, optional): url for the json file download. Defaults to "https://g-a8b222.dd271.03c0.data.globus.org/pub/databases/genenames/hgnc/json/hgnc_complete_set.json".
Returns:
-------
list[str]: list of symbols
"""
with requests.get(url) as response:
response.raise_for_status()
reactome_res = response.json()
return [v["symbol"] for v in reactome_res["response"]["docs"]]


def get_token(
identifiers: list[str],
projection_url: str = "https://reactome.org/AnalysisService/identifiers/projection",
171 changes: 171 additions & 0 deletions scripts/tasks_retrieval/uniprot_keyword_tasks_creation.py
@@ -0,0 +1,171 @@
import gzip
import json
from collections import defaultdict
from io import BytesIO
from pathlib import Path

import click
import pandas as pd
import requests

from gene_benchmark.tasks import dump_task_definitions


def get_gene_protein_keyword_dfs(gene_proteins: list[dict]) -> dict[str, pd.DataFrame]:
"""
Create dict of DataFrames for all UniProt keywords.
Args:
----
gene_proteins (list[dict]): list of proteins that have gene names in their metadata
Returns:
-------
dict[str, pd.DataFrame]: dict with keys according to keyword category and values
are DataFrames with gene symbol index, keyword value columns and binary values
representing whether the gene symbol has the keyword value.
"""
category_gene_kw_map = create_category_gene_kw_map(gene_proteins)

return {c: make_gene_kw_df(gkm) for c, gkm in category_gene_kw_map.items()}
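
# Illustrative sketch (hypothetical genes and keyword values) of one returned DataFrame,
# e.g. for the "Ligand" category: gene symbols as the index, one binary column per keyword:
#
#        ATP  Zinc
# Gene
# CA2      0     1
# HK1      1     0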


def create_category_gene_kw_map(gene_proteins: list[dict]) -> dict[str, dict[str, set]]:
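    """Map each UniProt keyword category to a {gene symbol: set of keyword names} dict."""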
category_dict = defaultdict(lambda: defaultdict(set))

for gene_protein in gene_proteins:
gene_symbol = gene_protein["genes"][0]["geneName"]["value"]

for kw in gene_protein.get("keywords", []):
category = kw["category"]
name = kw["name"]
category_dict[category][gene_symbol].add(name)
return category_dict


def make_gene_kw_df(gene_kw_map: dict[str, set]) -> pd.DataFrame:
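    """Turn a {gene symbol: set of keyword names} map into a binary DataFrame indexed by gene symbol."""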
all_keywords = sorted({kw for keywords in gene_kw_map.values() for kw in keywords})

rows = []
for gene, keywords in gene_kw_map.items():
row = [1 if kw in keywords else 0 for kw in all_keywords]
rows.append([gene] + row)

return pd.DataFrame(rows, columns=["Gene"] + all_keywords).set_index("Gene")


def download_and_load_json_gz(url: str) -> dict:
"""
Download and gunzip a json from a url.
Args:
----
url (str): url to download
Returns:
-------
dict: contents of json.gz as a dict
"""
response = requests.get(url)
response.raise_for_status()
compressed_data = BytesIO(response.content)
with gzip.GzipFile(fileobj=compressed_data) as gz:
json_data = json.load(gz)

return json_data


def get_uniprot_human_protein_features(
    file: str | None = None, allow_downloads: bool = False
) -> dict:
    """Get the UniProt human protein data from a local JSON file or from the UniProt server."""
    if file and Path(file).exists():
        with open(file) as f:
            return json.load(f)
    if not allow_downloads:
        raise ValueError(
            "No local UniProt file was found; rerun with --allow-downloads True to fetch the data."
        )
    url = "https://rest.uniprot.org/uniprotkb/stream?compressed=true&download=true&format=json&query=%28*%29+AND+%28model_organism%3A9606%29+AND+%28reviewed%3Atrue%29"
    uniprot_features = download_and_load_json_gz(url)
    if file:
        with open(file, "w") as f:
            json.dump(uniprot_features, f)

    return uniprot_features
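
# Example usage (a sketch; "uniprot_human.json" is a hypothetical cache path, and the call
# needs either that file to exist or allow_downloads=True with network access):
# proteins = get_uniprot_human_protein_features(file="uniprot_human.json", allow_downloads=True)
# len(proteins["results"])  # number of reviewed human UniProt entries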


@click.command()
@click.option(
"--allow-downloads",
"-l",
type=click.BOOL,
help=f"download files directly from {GENE_SYMBOL_URL}, use this option only if you trust the URL.",
default=False,
)
@click.option(
"--input-file",
type=click.STRING,
help="The path to the yaml file with the gene names. If omitted, `allow-downloads` must be True",
default=None,
)
@click.option(
"--main-task-directory",
"-m",
type=click.STRING,
help="The task root directory. Will not be created.",
default="./tasks",
)
@click.option(
"--task-name",
"-n",
type=click.STRING,
multiple=True,
help="Name for the task based on UniProt keyword category. Must be from this list:"
" ['Biological process', 'Cellular component', 'Coding sequence diversity', 'Disease', "
" 'Domain', 'Ligand', 'Molecular function', 'PTM', 'Technical term']"
"Can be multiply defined. Defaults to creating all of the possible keyword tasks.",
default=[
"Biological process",
"Cellular component",
"Coding sequence diversity",
"Disease",
"Domain",
"Ligand",
"Molecular function",
"PTM",
"Technical term",
],
)
def main(
allow_downloads: bool, input_file: str, main_task_directory: str, task_name: str
):
"""
Create gene protein structural domain task.
This is a multilabel task based on the presence or absence of protein keywords as
compiled by UniProt.
This task is only defined for genes that code for proteins. Each protein can have
multiple keyword values in different locations of its sequence. This task does not count
copies, but only lists the presence or absence of structural domains on the protein
products of the gene symbol.
"""
proteins = get_uniprot_human_protein_features(input_file, allow_downloads)
# restrict proteins to those with associated named genes
gene_proteins = [
i for i in proteins["results"] if "genes" in i and "geneName" in i["genes"][0]
]
gene_keyword_df_dict = get_gene_protein_keyword_dfs(gene_proteins)

    for task in task_name:  # task_name may be given multiple times, so click passes a tuple
dump_task_definitions(
entities=pd.Series(gene_keyword_df_dict[task].index).rename("symbol"),
outcomes=gene_keyword_df_dict[task],
main_task_directory=main_task_directory,
task_name="UniProt keyword " + task,
)


if __name__ == "__main__":
main()
Binary file modified tasks/task_descriptions.xlsx
