Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reactome full set #29

Merged
merged 9 commits into from
Aug 12, 2024
Merged
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
371 changes: 371 additions & 0 deletions scripts/tasks_retrieval/Reactome_tasks_creation_full_set.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,371 @@
import io
import warnings
import zipfile
from dataclasses import dataclass

import click
import pandas as pd
import requests

from gene_benchmark.task_retrieval import (
list_form_to_onehot_form,
verify_source_of_data,
)
from gene_benchmark.tasks import dump_task_definitions

HIERARCHIES_URL = "https://reactome.org/download/current/ReactomePathwaysRelation.txt"
BOTTOM_PATHWAY_TO_GENE_LIST = (
"https://reactome.org/download/current/ReactomePathways.gmt.zip"
)
PATHWAY_DES = "https://reactome.org/download/current/ReactomePathways.txt"


def get_pathway_symbols_dict(
url: str, gmt_filename: str, allow_downloads: bool
) -> dict[str, list[str]]:
"""
Downloads a ZIP file from the given URL, extracts the specified GMT file,
and returns its contents as a dictionary.

Args:
----
url (str): URL of path for a zip file
gmt_filename (str): The name of the GMT file within the ZIP archive.
allow_downloads (bool): indicator if the file is a url or not

Raises:
------
Exception: _description_

Returns:
-------
dict[str,list[str]]: A dictionary where keys are pathway names and values are lists of genes.

"""
if allow_downloads:
response = requests.get(url)
if response.status_code != 200:
raise Exception(f"Failed to download ZIP file: {response.status_code}")
fstream = io.BytesIO(response.content)
else:
fstream = url
with zipfile.ZipFile(fstream) as zip_ref:
gmt_content = get_gmt_from_zip(zip_ref, gmt_filename)
return gmt_file_to_dictionary(gmt_content)


def get_gmt_from_zip(zip_ref, gmt_filename: str):
"""
extract the GMT file from a zip archive.

Args:
----
zip_ref (zipfile.ZipFile): Zip archive containing a gmt file
gmt_filename (str): The archived GMT file name

Raises:
------
Exception: raised if the zip archive doesn't contain the file

Returns:
-------
str: the gmt_file file content

"""
if gmt_filename not in zip_ref.namelist():
raise Exception(f"{gmt_filename} not found in the ZIP archive")
with zip_ref.open(gmt_filename) as gmt_file:
gmt_content = gmt_file.read().decode("utf-8")
return gmt_content


def gmt_file_to_dictionary(gmt_content: str) -> dict[str, list[str]]:
r"""
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

remove this r

Turns a reactom gmt file into a dictionary.

Args:
----
gmt_content (str): A file where each line is separated by \n and values by \t

Returns:
-------
dict[str,list[str]]: A dictionary where keys are pathway names and values are lists of genes.

"""
gmt_dict = {}
for line in gmt_content.strip().split("\n"):
parts = line.strip().split("\t")
description = parts[1]
genes = parts[2:]
gmt_dict[description] = genes
return gmt_dict


def get_hierarchy_data(url: str) -> [dict[str, str], pd.DataFrame]:
"""
loads and returns the pathways hierarchies.

Args:
----
url (str): the location of the file

Returns:
-------
[dict[str,str],pd.DataFrame]: a tuple with dictionary with parent pathways as keys and children as values and in a data frame format

"""
hierarchy_df = pd.read_csv(
url, header=None, delimiter="\t", names=["parent", "child"]
)
hierarchy_df = hierarchy_df.set_index("parent")

hierarchies = (
hierarchy_df.groupby(hierarchy_df.index)["child"].apply(list).to_dict()
)
return hierarchies, hierarchy_df


def pathways_2_one_hot(pathways: list[str], path_2_gene) -> pd.DataFrame:
"""
create a one hot data frame with the genes in the rows and pathways as columns.

Args:
----
pathways (list[str]): A list of pathways to analyze
path_2_gene (PathwaySeeks): PathwaySeeks object with the corresponding gene to list of genes

Returns:
-------
pd.DataFrame: a one hot data frame

"""
task_df = pd.DataFrame(columns=["genes"], index=pathways)
task_df["genes"] = [";".join(path_2_gene.get_genes(path)) for path in task_df.index]
return list_form_to_onehot_form(
task_df, participant_col_name="genes", delimiter=";"
)


def pathway_set_to_task(
pathways: list[str],
path_2_gene,
pathway_des: pd.DataFrame,
pathway_identifier: str,
main_task_directory: str,
verbose: bool,
):
"""
create a task from a set of pathways.

Args:
----
pathways (list[str]): the list of pathways to be collected into the task
path_2_gene (PathwaySeeks): PathwaySeeks object with the corresponding gene to list of genes
pathway_des (pd.DataFrame): Dataframe with pathways identifier as index and a name column
pathway_identifier (str): the pathways identifier
verbose (bool): if true will do some prints

"""
outcomes = pathways_2_one_hot(pathways, path_2_gene)
outcomes.rename(pathway_des["name"].to_dict())
symbols = pd.Series(outcomes.index, name="symbol")
dump_task_definitions(symbols, outcomes, main_task_directory, pathway_identifier)
if verbose:
print(
f"{pathway_identifier}, was created at {main_task_directory} shaped {outcomes.shape}"
)


def get_gene_descriptions(url: str) -> pd.DataFrame:
"""
retrieves the data frame description.

Args:
----
url (str): the file location

Returns:
-------
pd.DataFrame: data frame with pathways descriptions

"""
return pd.read_csv(
url,
on_bad_lines="skip",
header=None,
delimiter="\t",
index_col=0,
names=["name", "species"],
)


def get_top_level_dict(
hierarchy_df: pd.DataFrame, pathway_des: pd.DataFrame
) -> dict[str, list[str]]:
"""
returns a dictionary with the top pathways of each species.

Args:
----
hierarchy_df (pd.DataFrame): A data frame containing the hierarchy of pathways
pathway_des (pd.DataFrame): A data frame containing the

Returns:
-------
dict[str,list[str]]: _description_

"""
child_set = set(hierarchy_df["child"].values)
top_level = list(filter(lambda x: not x in child_set, hierarchy_df.index))
pathway_des["top_level"] = [path in top_level for path in pathway_des.index]
pathway_des["idx"] = pathway_des.index
return (
pathway_des.loc[pathway_des["top_level"], :]
.groupby(["species"])["idx"]
.apply(list)
.to_dict()
)


@dataclass
class PathwaySeeks:
"""
The data from reactom contains the gene list for bottom level pathways only to prevent redundant calculations
this class saves the gene list of each pathway when it encounter its.
"""

pathway_to_gene: dict[str, list[str]]
hierarchies: dict[str, list[str]]

def get_genes(self, pathway: list[str], verbose=False) -> list[str]:
"""
retrives the gene list of a pathway by merging all of it's descendants genes.

Args:
----
pathway (list[str]): A list of pathways
verbose (bool, optional): If true warnings will be printed. Defaults to False.

Returns:
-------
list[str]: a list of gene symbols

"""
sub_genes = set()
if pathway in self.pathway_to_gene:
return self.pathway_to_gene[pathway]
elif not pathway in self.hierarchies:
if verbose:
warnings.warn(
f"Pathway {pathway} has no sub pathways and no genes defined"
)
else:
for sub_pathways in self.hierarchies[pathway]:
sub_genes.update(self.get_genes(sub_pathways))
self.pathway_to_gene[pathway] = list(sub_genes)
return list(sub_genes)


@click.command()
@click.option(
"--main-task-directory",
"-m",
type=click.STRING,
help="The task root directory. Will not be created.",
default="./tasks",
)
@click.option(
"--allow-downloads",
type=click.BOOL,
help="If false data files will be downloaded directly from HGNC and reactome, set to true only if you trust the urls above",
default=False,
)
@click.option(
"--pathway-identifier",
"-p",
type=click.STRING,
help="Pathway identifier from which we want to create multilabel task.",
multiple=True,
default=None,
)
@click.option(
"--hierarchy-file",
type=click.STRING,
help="The location of the ReactomePathwaysRelation file available at https://reactome.org/download-data",
default=None,
)
@click.option(
"--pathway-description",
type=click.STRING,
help="The location of the ReactomePathways.txt file available at https://reactome.org/download-data",
default=None,
)
@click.option(
"--pathway-to-gene-file",
"-m",
type=click.STRING,
help="A file with the bottom pathways and the genes involved in the pathways available at https://reactome.org/download-data",
default=None,
)
@click.option(
"--verbose/--quite",
"-v/-q",
is_flag=True,
default=True,
)
@click.option(
"--add-top-levels",
help="If true will create a task per all of the top level human pathways and per each top level pathway individually for Homo sapiens",
type=click.BOOL,
default=True,
)
def main(
main_task_directory,
allow_downloads,
pathway_identifier,
hierarchy_file,
pathway_description,
pathway_to_gene_file,
verbose,
add_top_levels,
):

hierarchies_file_url = verify_source_of_data(
hierarchy_file, url=HIERARCHIES_URL, allow_downloads=allow_downloads
)

pathway_to_gene_url = verify_source_of_data(
pathway_to_gene_file,
url=BOTTOM_PATHWAY_TO_GENE_LIST,
allow_downloads=allow_downloads,
)
pathway_des_url = verify_source_of_data(
pathway_description, url=PATHWAY_DES, allow_downloads=allow_downloads
)
yoavkt marked this conversation as resolved.
Show resolved Hide resolved
pathway_des = get_gene_descriptions(pathway_des_url)
hire_dict, hierarchy_df = get_hierarchy_data(hierarchies_file_url)
path_2_gene = PathwaySeeks(
get_pathway_symbols_dict(pathway_to_gene_url, "ReactomePathways.gmt"), hire_dict
)
if add_top_levels:
top_dict = get_top_level_dict(hierarchy_df, pathway_des)
pathway_set_to_task(
top_dict["Homo sapiens"],
path_2_gene,
pathway_des,
"Pathways top level homo sapiens",
main_task_directory,
verbose,
)
pathway_identifier = pathway_identifier + tuple(top_dict["Homo sapiens"])

for pathway_idx in pathway_identifier:
pathways = hire_dict[pathway_idx]
path_name = f"pathway {pathway_des.loc[pathway_idx, 'name']}"
pathway_set_to_task(
pathways, path_2_gene, pathway_des, path_name, main_task_directory, verbose
)


if __name__ == "__main__":
main()
Loading