-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Reactome full set #29
Merged
Merged
Changes from 6 commits
Commits
Show all changes
9 commits
Select commit
Hold shift + click to select a range
8599835
add pre commit fix
yoavkt c2d1962
modify the script
yoavkt 6b6b5ca
working script
yoavkt 487ed55
Easy top level task creation
yoavkt a32e2a3
Fill documentation
yoavkt b1e0a3d
Merge branch 'main' into reactome_full_set
yoavkt a351ec4
remove code duplications
yoavkt f0726e8
Method correction
yoavkt c0e5912
Merge branch 'main' into reactome_full_set
yoavkt File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
371 changes: 371 additions & 0 deletions
371
scripts/tasks_retrieval/Reactome_tasks_creation_full_set.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,371 @@ | ||
import io | ||
import warnings | ||
import zipfile | ||
from dataclasses import dataclass | ||
|
||
import click | ||
import pandas as pd | ||
import requests | ||
|
||
from gene_benchmark.task_retrieval import ( | ||
list_form_to_onehot_form, | ||
verify_source_of_data, | ||
) | ||
from gene_benchmark.tasks import dump_task_definitions | ||
|
||
HIERARCHIES_URL = "https://reactome.org/download/current/ReactomePathwaysRelation.txt" | ||
BOTTOM_PATHWAY_TO_GENE_LIST = ( | ||
"https://reactome.org/download/current/ReactomePathways.gmt.zip" | ||
) | ||
PATHWAY_DES = "https://reactome.org/download/current/ReactomePathways.txt" | ||
|
||
|
||
def get_pathway_symbols_dict( | ||
url: str, gmt_filename: str, allow_downloads: bool | ||
) -> dict[str, list[str]]: | ||
""" | ||
Downloads a ZIP file from the given URL, extracts the specified GMT file, | ||
and returns its contents as a dictionary. | ||
|
||
Args: | ||
---- | ||
url (str): URL of path for a zip file | ||
gmt_filename (str): The name of the GMT file within the ZIP archive. | ||
allow_downloads (bool): indicator if the file is a url or not | ||
|
||
Raises: | ||
------ | ||
Exception: _description_ | ||
|
||
Returns: | ||
------- | ||
dict[str,list[str]]: A dictionary where keys are pathway names and values are lists of genes. | ||
|
||
""" | ||
if allow_downloads: | ||
response = requests.get(url) | ||
if response.status_code != 200: | ||
raise Exception(f"Failed to download ZIP file: {response.status_code}") | ||
fstream = io.BytesIO(response.content) | ||
else: | ||
fstream = url | ||
with zipfile.ZipFile(fstream) as zip_ref: | ||
gmt_content = get_gmt_from_zip(zip_ref, gmt_filename) | ||
return gmt_file_to_dictionary(gmt_content) | ||
|
||
|
||
def get_gmt_from_zip(zip_ref, gmt_filename: str): | ||
""" | ||
extract the GMT file from a zip archive. | ||
|
||
Args: | ||
---- | ||
zip_ref (zipfile.ZipFile): Zip archive containing a gmt file | ||
gmt_filename (str): The archived GMT file name | ||
|
||
Raises: | ||
------ | ||
Exception: raised if the zip archive doesn't contain the file | ||
|
||
Returns: | ||
------- | ||
str: the gmt_file file content | ||
|
||
""" | ||
if gmt_filename not in zip_ref.namelist(): | ||
raise Exception(f"{gmt_filename} not found in the ZIP archive") | ||
with zip_ref.open(gmt_filename) as gmt_file: | ||
gmt_content = gmt_file.read().decode("utf-8") | ||
return gmt_content | ||
|
||
|
||
def gmt_file_to_dictionary(gmt_content: str) -> dict[str, list[str]]: | ||
r""" | ||
Turns a reactom gmt file into a dictionary. | ||
|
||
Args: | ||
---- | ||
gmt_content (str): A file where each line is separated by \n and values by \t | ||
|
||
Returns: | ||
------- | ||
dict[str,list[str]]: A dictionary where keys are pathway names and values are lists of genes. | ||
|
||
""" | ||
gmt_dict = {} | ||
for line in gmt_content.strip().split("\n"): | ||
parts = line.strip().split("\t") | ||
description = parts[1] | ||
genes = parts[2:] | ||
gmt_dict[description] = genes | ||
return gmt_dict | ||
|
||
|
||
def get_hierarchy_data(url: str) -> [dict[str, str], pd.DataFrame]: | ||
""" | ||
loads and returns the pathways hierarchies. | ||
|
||
Args: | ||
---- | ||
url (str): the location of the file | ||
|
||
Returns: | ||
------- | ||
[dict[str,str],pd.DataFrame]: a tuple with dictionary with parent pathways as keys and children as values and in a data frame format | ||
|
||
""" | ||
hierarchy_df = pd.read_csv( | ||
url, header=None, delimiter="\t", names=["parent", "child"] | ||
) | ||
hierarchy_df = hierarchy_df.set_index("parent") | ||
|
||
hierarchies = ( | ||
hierarchy_df.groupby(hierarchy_df.index)["child"].apply(list).to_dict() | ||
) | ||
return hierarchies, hierarchy_df | ||
|
||
|
||
def pathways_2_one_hot(pathways: list[str], path_2_gene) -> pd.DataFrame: | ||
""" | ||
create a one hot data frame with the genes in the rows and pathways as columns. | ||
|
||
Args: | ||
---- | ||
pathways (list[str]): A list of pathways to analyze | ||
path_2_gene (PathwaySeeks): PathwaySeeks object with the corresponding gene to list of genes | ||
|
||
Returns: | ||
------- | ||
pd.DataFrame: a one hot data frame | ||
|
||
""" | ||
task_df = pd.DataFrame(columns=["genes"], index=pathways) | ||
task_df["genes"] = [";".join(path_2_gene.get_genes(path)) for path in task_df.index] | ||
return list_form_to_onehot_form( | ||
task_df, participant_col_name="genes", delimiter=";" | ||
) | ||
|
||
|
||
def pathway_set_to_task( | ||
pathways: list[str], | ||
path_2_gene, | ||
pathway_des: pd.DataFrame, | ||
pathway_identifier: str, | ||
main_task_directory: str, | ||
verbose: bool, | ||
): | ||
""" | ||
create a task from a set of pathways. | ||
|
||
Args: | ||
---- | ||
pathways (list[str]): the list of pathways to be collected into the task | ||
path_2_gene (PathwaySeeks): PathwaySeeks object with the corresponding gene to list of genes | ||
pathway_des (pd.DataFrame): Dataframe with pathways identifier as index and a name column | ||
pathway_identifier (str): the pathways identifier | ||
verbose (bool): if true will do some prints | ||
|
||
""" | ||
outcomes = pathways_2_one_hot(pathways, path_2_gene) | ||
outcomes.rename(pathway_des["name"].to_dict()) | ||
symbols = pd.Series(outcomes.index, name="symbol") | ||
dump_task_definitions(symbols, outcomes, main_task_directory, pathway_identifier) | ||
if verbose: | ||
print( | ||
f"{pathway_identifier}, was created at {main_task_directory} shaped {outcomes.shape}" | ||
) | ||
|
||
|
||
def get_gene_descriptions(url: str) -> pd.DataFrame: | ||
""" | ||
retrieves the data frame description. | ||
|
||
Args: | ||
---- | ||
url (str): the file location | ||
|
||
Returns: | ||
------- | ||
pd.DataFrame: data frame with pathways descriptions | ||
|
||
""" | ||
return pd.read_csv( | ||
url, | ||
on_bad_lines="skip", | ||
header=None, | ||
delimiter="\t", | ||
index_col=0, | ||
names=["name", "species"], | ||
) | ||
|
||
|
||
def get_top_level_dict( | ||
hierarchy_df: pd.DataFrame, pathway_des: pd.DataFrame | ||
) -> dict[str, list[str]]: | ||
""" | ||
returns a dictionary with the top pathways of each species. | ||
|
||
Args: | ||
---- | ||
hierarchy_df (pd.DataFrame): A data frame containing the hierarchy of pathways | ||
pathway_des (pd.DataFrame): A data frame containing the | ||
|
||
Returns: | ||
------- | ||
dict[str,list[str]]: _description_ | ||
|
||
""" | ||
child_set = set(hierarchy_df["child"].values) | ||
top_level = list(filter(lambda x: not x in child_set, hierarchy_df.index)) | ||
pathway_des["top_level"] = [path in top_level for path in pathway_des.index] | ||
pathway_des["idx"] = pathway_des.index | ||
return ( | ||
pathway_des.loc[pathway_des["top_level"], :] | ||
.groupby(["species"])["idx"] | ||
.apply(list) | ||
.to_dict() | ||
) | ||
|
||
|
||
@dataclass | ||
class PathwaySeeks: | ||
""" | ||
The data from reactom contains the gene list for bottom level pathways only to prevent redundant calculations | ||
this class saves the gene list of each pathway when it encounter its. | ||
""" | ||
|
||
pathway_to_gene: dict[str, list[str]] | ||
hierarchies: dict[str, list[str]] | ||
|
||
def get_genes(self, pathway: list[str], verbose=False) -> list[str]: | ||
""" | ||
retrives the gene list of a pathway by merging all of it's descendants genes. | ||
|
||
Args: | ||
---- | ||
pathway (list[str]): A list of pathways | ||
verbose (bool, optional): If true warnings will be printed. Defaults to False. | ||
|
||
Returns: | ||
------- | ||
list[str]: a list of gene symbols | ||
|
||
""" | ||
sub_genes = set() | ||
if pathway in self.pathway_to_gene: | ||
return self.pathway_to_gene[pathway] | ||
elif not pathway in self.hierarchies: | ||
if verbose: | ||
warnings.warn( | ||
f"Pathway {pathway} has no sub pathways and no genes defined" | ||
) | ||
else: | ||
for sub_pathways in self.hierarchies[pathway]: | ||
sub_genes.update(self.get_genes(sub_pathways)) | ||
self.pathway_to_gene[pathway] = list(sub_genes) | ||
return list(sub_genes) | ||
|
||
|
||
@click.command() | ||
@click.option( | ||
"--main-task-directory", | ||
"-m", | ||
type=click.STRING, | ||
help="The task root directory. Will not be created.", | ||
default="./tasks", | ||
) | ||
@click.option( | ||
"--allow-downloads", | ||
type=click.BOOL, | ||
help="If false data files will be downloaded directly from HGNC and reactome, set to true only if you trust the urls above", | ||
default=False, | ||
) | ||
@click.option( | ||
"--pathway-identifier", | ||
"-p", | ||
type=click.STRING, | ||
help="Pathway identifier from which we want to create multilabel task.", | ||
multiple=True, | ||
default=None, | ||
) | ||
@click.option( | ||
"--hierarchy-file", | ||
type=click.STRING, | ||
help="The location of the ReactomePathwaysRelation file available at https://reactome.org/download-data", | ||
default=None, | ||
) | ||
@click.option( | ||
"--pathway-description", | ||
type=click.STRING, | ||
help="The location of the ReactomePathways.txt file available at https://reactome.org/download-data", | ||
default=None, | ||
) | ||
@click.option( | ||
"--pathway-to-gene-file", | ||
"-m", | ||
type=click.STRING, | ||
help="A file with the bottom pathways and the genes involved in the pathways available at https://reactome.org/download-data", | ||
default=None, | ||
) | ||
@click.option( | ||
"--verbose/--quite", | ||
"-v/-q", | ||
is_flag=True, | ||
default=True, | ||
) | ||
@click.option( | ||
"--add-top-levels", | ||
help="If true will create a task per all of the top level human pathways and per each top level pathway individually for Homo sapiens", | ||
type=click.BOOL, | ||
default=True, | ||
) | ||
def main( | ||
main_task_directory, | ||
allow_downloads, | ||
pathway_identifier, | ||
hierarchy_file, | ||
pathway_description, | ||
pathway_to_gene_file, | ||
verbose, | ||
add_top_levels, | ||
): | ||
|
||
hierarchies_file_url = verify_source_of_data( | ||
hierarchy_file, url=HIERARCHIES_URL, allow_downloads=allow_downloads | ||
) | ||
|
||
pathway_to_gene_url = verify_source_of_data( | ||
pathway_to_gene_file, | ||
url=BOTTOM_PATHWAY_TO_GENE_LIST, | ||
allow_downloads=allow_downloads, | ||
) | ||
pathway_des_url = verify_source_of_data( | ||
pathway_description, url=PATHWAY_DES, allow_downloads=allow_downloads | ||
) | ||
yoavkt marked this conversation as resolved.
Show resolved
Hide resolved
|
||
pathway_des = get_gene_descriptions(pathway_des_url) | ||
hire_dict, hierarchy_df = get_hierarchy_data(hierarchies_file_url) | ||
path_2_gene = PathwaySeeks( | ||
get_pathway_symbols_dict(pathway_to_gene_url, "ReactomePathways.gmt"), hire_dict | ||
) | ||
if add_top_levels: | ||
top_dict = get_top_level_dict(hierarchy_df, pathway_des) | ||
pathway_set_to_task( | ||
top_dict["Homo sapiens"], | ||
path_2_gene, | ||
pathway_des, | ||
"Pathways top level homo sapiens", | ||
main_task_directory, | ||
verbose, | ||
) | ||
pathway_identifier = pathway_identifier + tuple(top_dict["Homo sapiens"]) | ||
|
||
for pathway_idx in pathway_identifier: | ||
pathways = hire_dict[pathway_idx] | ||
path_name = f"pathway {pathway_des.loc[pathway_idx, 'name']}" | ||
pathway_set_to_task( | ||
pathways, path_2_gene, pathway_des, path_name, main_task_directory, verbose | ||
) | ||
|
||
|
||
if __name__ == "__main__": | ||
main() |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
remove this r