diff --git a/scripts/tasks_retrieval/Reactome_tasks_creation_full_set.py b/scripts/tasks_retrieval/Reactome_tasks_creation_full_set.py new file mode 100644 index 0000000..d9f7ead --- /dev/null +++ b/scripts/tasks_retrieval/Reactome_tasks_creation_full_set.py @@ -0,0 +1,371 @@ +import io +import warnings +import zipfile +from dataclasses import dataclass + +import click +import pandas as pd +import requests + +from gene_benchmark.task_retrieval import ( + list_form_to_onehot_form, + verify_source_of_data, +) +from gene_benchmark.tasks import dump_task_definitions + +HIERARCHIES_URL = "https://reactome.org/download/current/ReactomePathwaysRelation.txt" +BOTTOM_PATHWAY_TO_GENE_LIST = ( + "https://reactome.org/download/current/ReactomePathways.gmt.zip" +) +PATHWAY_DES = "https://reactome.org/download/current/ReactomePathways.txt" + + +def get_pathway_symbols_dict( + url: str, gmt_filename: str, allow_downloads: bool +) -> dict[str, list[str]]: + """ + Downloads a ZIP file from the given URL, extracts the specified GMT file, + and returns its contents as a dictionary. + + Args: + ---- + url (str): URL of path for a zip file + gmt_filename (str): The name of the GMT file within the ZIP archive. + allow_downloads (bool): indicator if the file is a url or not + + Raises: + ------ + Exception: _description_ + + Returns: + ------- + dict[str,list[str]]: A dictionary where keys are pathway names and values are lists of genes. + + """ + if allow_downloads: + response = requests.get(url) + if response.status_code != 200: + raise Exception(f"Failed to download ZIP file: {response.status_code}") + fstream = io.BytesIO(response.content) + else: + fstream = url + with zipfile.ZipFile(fstream) as zip_ref: + gmt_content = get_gmt_from_zip(zip_ref, gmt_filename) + return gmt_file_to_dictionary(gmt_content) + + +def get_gmt_from_zip(zip_ref, gmt_filename: str): + """ + extract the GMT file from a zip archive. + + Args: + ---- + zip_ref (zipfile.ZipFile): Zip archive containing a gmt file + gmt_filename (str): The archived GMT file name + + Raises: + ------ + Exception: raised if the zip archive doesn't contain the file + + Returns: + ------- + str: the gmt_file file content + + """ + if gmt_filename not in zip_ref.namelist(): + raise Exception(f"{gmt_filename} not found in the ZIP archive") + with zip_ref.open(gmt_filename) as gmt_file: + gmt_content = gmt_file.read().decode("utf-8") + return gmt_content + + +def gmt_file_to_dictionary(gmt_content: str) -> dict[str, list[str]]: + r""" + Turns a reactom gmt file into a dictionary. + + Args: + ---- + gmt_content (str): A file where each line is separated by \n and values by \t + + Returns: + ------- + dict[str,list[str]]: A dictionary where keys are pathway names and values are lists of genes. + + """ + gmt_dict = {} + for line in gmt_content.strip().split("\n"): + parts = line.strip().split("\t") + description = parts[1] + genes = parts[2:] + gmt_dict[description] = genes + return gmt_dict + + +def get_hierarchy_data(url: str) -> [dict[str, str], pd.DataFrame]: + """ + loads and returns the pathways hierarchies. + + Args: + ---- + url (str): the location of the file + + Returns: + ------- + [dict[str,str],pd.DataFrame]: a tuple with dictionary with parent pathways as keys and children as values and in a data frame format + + """ + hierarchy_df = pd.read_csv( + url, header=None, delimiter="\t", names=["parent", "child"] + ) + hierarchy_df = hierarchy_df.set_index("parent") + + hierarchies = ( + hierarchy_df.groupby(hierarchy_df.index)["child"].apply(list).to_dict() + ) + return hierarchies, hierarchy_df + + +def pathways_2_one_hot(pathways: list[str], path_2_gene) -> pd.DataFrame: + """ + create a one hot data frame with the genes in the rows and pathways as columns. + + Args: + ---- + pathways (list[str]): A list of pathways to analyze + path_2_gene (PathwaySeeks): PathwaySeeks object with the corresponding gene to list of genes + + Returns: + ------- + pd.DataFrame: a one hot data frame + + """ + task_df = pd.DataFrame(columns=["genes"], index=pathways) + task_df["genes"] = [";".join(path_2_gene.get_genes(path)) for path in task_df.index] + return list_form_to_onehot_form( + task_df, participant_col_name="genes", delimiter=";" + ) + + +def pathway_set_to_task( + pathways: list[str], + path_2_gene, + pathway_des: pd.DataFrame, + pathway_identifier: str, + main_task_directory: str, + verbose: bool, +): + """ + create a task from a set of pathways. + + Args: + ---- + pathways (list[str]): the list of pathways to be collected into the task + path_2_gene (PathwaySeeks): PathwaySeeks object with the corresponding gene to list of genes + pathway_des (pd.DataFrame): Dataframe with pathways identifier as index and a name column + pathway_identifier (str): the pathways identifier + verbose (bool): if true will do some prints + + """ + outcomes = pathways_2_one_hot(pathways, path_2_gene) + outcomes.rename(pathway_des["name"].to_dict()) + symbols = pd.Series(outcomes.index, name="symbol") + dump_task_definitions(symbols, outcomes, main_task_directory, pathway_identifier) + if verbose: + print( + f"{pathway_identifier}, was created at {main_task_directory} shaped {outcomes.shape}" + ) + + +def get_gene_descriptions(url: str) -> pd.DataFrame: + """ + retrieves the data frame description. + + Args: + ---- + url (str): the file location + + Returns: + ------- + pd.DataFrame: data frame with pathways descriptions + + """ + return pd.read_csv( + url, + on_bad_lines="skip", + header=None, + delimiter="\t", + index_col=0, + names=["name", "species"], + ) + + +def get_top_level_dict( + hierarchy_df: pd.DataFrame, pathway_des: pd.DataFrame +) -> dict[str, list[str]]: + """ + returns a dictionary with the top pathways of each species. + + Args: + ---- + hierarchy_df (pd.DataFrame): A data frame containing the hierarchy of pathways + pathway_des (pd.DataFrame): A data frame containing the + + Returns: + ------- + dict[str,list[str]]: _description_ + + """ + child_set = set(hierarchy_df["child"].values) + top_level = list(filter(lambda x: not x in child_set, hierarchy_df.index)) + pathway_des["top_level"] = [path in top_level for path in pathway_des.index] + pathway_des["idx"] = pathway_des.index + return ( + pathway_des.loc[pathway_des["top_level"], :] + .groupby(["species"])["idx"] + .apply(list) + .to_dict() + ) + + +@dataclass +class PathwaySeeks: + """ + The data from reactom contains the gene list for bottom level pathways only to prevent redundant calculations + this class saves the gene list of each pathway when it encounter its. + """ + + pathway_to_gene: dict[str, list[str]] + hierarchies: dict[str, list[str]] + + def get_genes(self, pathway: list[str], verbose=False) -> list[str]: + """ + retrives the gene list of a pathway by merging all of it's descendants genes. + + Args: + ---- + pathway (list[str]): A list of pathways + verbose (bool, optional): If true warnings will be printed. Defaults to False. + + Returns: + ------- + list[str]: a list of gene symbols + + """ + sub_genes = set() + if pathway in self.pathway_to_gene: + return self.pathway_to_gene[pathway] + elif not pathway in self.hierarchies: + if verbose: + warnings.warn( + f"Pathway {pathway} has no sub pathways and no genes defined" + ) + else: + for sub_pathways in self.hierarchies[pathway]: + sub_genes.update(self.get_genes(sub_pathways)) + self.pathway_to_gene[pathway] = list(sub_genes) + return list(sub_genes) + + +@click.command() +@click.option( + "--main-task-directory", + "-m", + type=click.STRING, + help="The task root directory. Will not be created.", + default="./tasks", +) +@click.option( + "--allow-downloads", + type=click.BOOL, + help="If false data files will be downloaded directly from HGNC and reactome, set to true only if you trust the urls above", + default=False, +) +@click.option( + "--pathway-identifier", + "-p", + type=click.STRING, + help="Pathway identifier from which we want to create multilabel task.", + multiple=True, + default=None, +) +@click.option( + "--hierarchy-file", + type=click.STRING, + help="The location of the ReactomePathwaysRelation file available at https://reactome.org/download-data", + default=None, +) +@click.option( + "--pathway-description", + type=click.STRING, + help="The location of the ReactomePathways.txt file available at https://reactome.org/download-data", + default=None, +) +@click.option( + "--pathway-to-gene-file", + "-m", + type=click.STRING, + help="A file with the bottom pathways and the genes involved in the pathways available at https://reactome.org/download-data", + default=None, +) +@click.option( + "--verbose/--quite", + "-v/-q", + is_flag=True, + default=True, +) +@click.option( + "--add-top-levels", + help="If true will create a task per all of the top level human pathways and per each top level pathway individually for Homo sapiens", + type=click.BOOL, + default=True, +) +def main( + main_task_directory, + allow_downloads, + pathway_identifier, + hierarchy_file, + pathway_description, + pathway_to_gene_file, + verbose, + add_top_levels, +): + urls_dict = {} + for nme, file, url in zip( + ["hierarchies", "pathway_to_gene", "pathway_to_des"], + [hierarchy_file, pathway_to_gene_file, pathway_description], + [HIERARCHIES_URL, BOTTOM_PATHWAY_TO_GENE_LIST, PATHWAY_DES], + ): + urls_dict[nme] = verify_source_of_data( + file, url=url, allow_downloads=allow_downloads + ) + + pathway_des = get_gene_descriptions(urls_dict["pathway_to_des"]) + hire_dict, hierarchy_df = get_hierarchy_data(urls_dict["hierarchies"]) + path_2_gene = PathwaySeeks( + get_pathway_symbols_dict( + urls_dict["pathway_to_gene"], "ReactomePathways.gmt", allow_downloads + ), + hire_dict, + ) + if add_top_levels: + top_dict = get_top_level_dict(hierarchy_df, pathway_des) + pathway_set_to_task( + top_dict["Homo sapiens"], + path_2_gene, + pathway_des, + "Pathways top level homo sapiens", + main_task_directory, + verbose, + ) + pathway_identifier = pathway_identifier + tuple(top_dict["Homo sapiens"]) + + for pathway_idx in pathway_identifier: + pathways = hire_dict[pathway_idx] + path_name = f"pathway {pathway_des.loc[pathway_idx, 'name']}" + pathway_set_to_task( + pathways, path_2_gene, pathway_des, path_name, main_task_directory, verbose + ) + + +if __name__ == "__main__": + main()