Merge pull request #70 from Sage-Bionetworks/bwmac/ag-838/independent_transforms

[AG-838] Support independent transforms
Showing 17 changed files with 952 additions and 924 deletions.
@@ -0,0 +1,31 @@
"""Submodule for Agora Data Tools Transformations""" | ||
|
||
from agoradatatools.etl.transform.distribution_data import ( | ||
transform_distribution_data, | ||
) | ||
from agoradatatools.etl.transform.gene_info import transform_gene_info | ||
from agoradatatools.etl.transform.genes_biodomains import ( | ||
transform_genes_biodomains, | ||
) | ||
from agoradatatools.etl.transform.overall_scores import ( | ||
transform_overall_scores, | ||
) | ||
from agoradatatools.etl.transform.proteomics_distribution import ( | ||
create_proteomics_distribution_data, | ||
) | ||
from agoradatatools.etl.transform.rna_distribution import ( | ||
transform_rna_distribution_data, | ||
transform_rna_seq_data, | ||
) | ||
from agoradatatools.etl.transform.team_info import transform_team_info | ||
|
||
__all__ = [ | ||
"transform_distribution_data", | ||
"transform_gene_info", | ||
"transform_genes_biodomains", | ||
"transform_overall_scores", | ||
"create_proteomics_distribution_data", | ||
"transform_rna_distribution_data", | ||
"transform_rna_seq_data", | ||
"transform_team_info", | ||
] |
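
With these re-exports in place, each transform can be imported straight from the package root instead of its submodule. A minimal sketch (either line below resolves to the same function):

from agoradatatools.etl.transform import transform_gene_info
# equivalently: from agoradatatools.etl.transform.gene_info import transform_gene_info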
@@ -0,0 +1,110 @@
import pandas as pd
import numpy as np


def calculate_distribution(df: pd.DataFrame, col: str, is_scored, upper_bound) -> dict:
    if is_scored:
        df = df[df[is_scored] == "Y"]
    else:
        # The "overall" column has no matching isscored_* flag, so keep any
        # row that is scored in at least one category
        df = df[df.isin(["Y"]).any(axis=1)]

    if df[col].dtype == object:
        df = df.copy()  # Necessary to prevent SettingWithCopy warning
        df[col] = df[col].astype(float)

    obj = {}

    # In order to smooth out the bins and make sure the entire range from 0
    # to the theoretical maximum value has been found, we create a copy of the
    # column with both 0 and that maximum value added to it. We use the copy
    # to calculate distributions and bins, and subtract the values at the end.

    distribution = pd.concat([df[col], pd.Series([0, upper_bound])], ignore_index=True)

    obj["distribution"] = list(
        pd.cut(
            distribution, bins=10, precision=3, include_lowest=True, right=True
        ).value_counts(sort=False)
    )
    # Remove the counts contributed by the artificial 0 and upper_bound values
    obj["distribution"][0] -= 1
    obj["distribution"][-1] -= 1

    discard, obj["bins"] = list(
        pd.cut(distribution, bins=10, precision=3, retbins=True)
    )
    obj["bins"] = np.around(obj["bins"].tolist()[1:], 2)
    base = [0, *obj["bins"][:-1]]
    obj["bins"] = zip(base, obj["bins"])
    obj["bins"] = list(obj["bins"])

    obj["min"] = np.around(df[col].min(), 4)
    obj["max"] = np.around(df[col].max(), 4)
    obj["mean"] = np.around(df[col].mean(), 4)
    obj["first_quartile"] = np.around(
        df[col].quantile(q=0.25, interpolation="midpoint")
    )
    obj["third_quartile"] = np.around(
        df[col].quantile(q=0.75, interpolation="midpoint")
    )

    return obj
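
# Illustration (not part of the committed file): calling calculate_distribution
# on a small hand-built frame shows the shape of the returned dict. The column
# names mirror the overall_scores columns used below; the values are invented.
#
#     scores = pd.DataFrame(
#         {"geneticsscore": [0.8, 1.2, 2.5], "isscored_genetics": ["Y", "Y", "N"]}
#     )
#     result = calculate_distribution(scores, "geneticsscore", "isscored_genetics", 3)
#     result["min"], result["max"], result["mean"]  # (0.8, 1.2, 1.0)
#     result["bins"][0]  # first (lower, upper) bin pair: (0, 0.3)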

def transform_distribution_data(
    datasets: dict,
    overall_max_score,
    genetics_max_score,
    omics_max_score,
    lit_max_score,
):
    overall_scores = datasets["overall_scores"]
    interesting_columns = [
        "ensg",
        "overall",
        "geneticsscore",
        "omicsscore",
        "literaturescore",
    ]

    # Create a mapping to deal with missing values, since they take a
    # different shape across the fields
    scored = ["isscored_genetics", "isscored_omics", "isscored_lit"]
    mapping = dict(zip(interesting_columns[2:], scored))
    mapping["overall"] = None

    # Create a mapping for the max score values from the config
    max_score = dict(
        zip(
            interesting_columns[1:],
            [overall_max_score, genetics_max_score, omics_max_score, lit_max_score],
        )
    )

    overall_scores = overall_scores[interesting_columns + scored]

    neo_matrix = {}
    for col in interesting_columns[1:]:  # excludes the ENSG column
        neo_matrix[col] = calculate_distribution(
            overall_scores, col, mapping[col], max_score[col]
        )

    neo_matrix["target_risk_score"] = neo_matrix.pop("overall")
    neo_matrix["genetics_score"] = neo_matrix.pop("geneticsscore")
    neo_matrix["multi_omics_score"] = neo_matrix.pop("omicsscore")
    neo_matrix["literature_score"] = neo_matrix.pop("literaturescore")

    additional_data = [
        {"name": "Target Risk Score", "syn_id": "syn25913473", "wiki_id": "621071"},
        {"name": "Genetic Risk Score", "syn_id": "syn25913473", "wiki_id": "621069"},
        {"name": "Multi-omic Risk Score", "syn_id": "syn25913473", "wiki_id": "621070"},
        {"name": "Literature Score", "syn_id": "syn25913473", "wiki_id": "613105"},
    ]
    for col, additional in zip(neo_matrix.keys(), additional_data):
        neo_matrix[col]["name"] = additional["name"]
        neo_matrix[col]["syn_id"] = additional["syn_id"]
        neo_matrix[col]["wiki_id"] = additional["wiki_id"]

    return neo_matrix
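
For reference, here is a hedged sketch of how transform_distribution_data is presumably wired up by the ETL driver. The overall_scores_df variable and the max-score values are placeholders; in the real pipeline the scores frame is loaded from Synapse and the max scores come from the config file.

neo_matrix = transform_distribution_data(
    datasets={"overall_scores": overall_scores_df},
    overall_max_score=5,
    genetics_max_score=3,
    omics_max_score=2,
    lit_max_score=2,
)
sorted(neo_matrix)
# ['genetics_score', 'literature_score', 'multi_omics_score', 'target_risk_score']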
@@ -0,0 +1,151 @@
import pandas as pd
import numpy as np

from agoradatatools.etl.utils import nest_fields


def transform_gene_info(
    datasets: dict, adjusted_p_value_threshold, protein_level_threshold
):
    """
    Performs transformations and incrementally builds a dataset called gene_info.
    Each input dataset is merged onto gene_info on ensembl_gene_id, starting
    with gene_metadata.
    """
    gene_metadata = datasets["gene_metadata"]
    igap = datasets["igap"]
    eqtl = datasets["eqtl"]
    proteomics = datasets["proteomics"]
    rna_change = datasets["rna_expression_change"]
    proteomics_tmt = datasets["agora_proteomics_tmt"]
    target_list = datasets["target_list"]
    median_expression = datasets["median_expression"]
    druggability = datasets["druggability"]

    # Modify the data before merging

    # All genes in this list should have 'is_igap' = True when added to gene_info.
    # Creating the column here automatically adds the column in to gene_info
    # during merge, with True values correctly populated.
    igap["is_igap"] = True

    # Get the smallest adj_p_val for each gene, to determine significance
    rna_change = (
        rna_change.groupby("ensembl_gene_id")["adj_p_val"].agg("min").reset_index()
    )

    # Get the smallest cor_pval for each protein, to determine significance
    proteomics_concat = pd.concat([proteomics, proteomics_tmt])
    proteomics_concat = proteomics_concat.dropna(
        subset=["log2_fc", "cor_pval", "ci_lwr", "ci_upr"]
    )
    proteomics_concat = (
        proteomics_concat.groupby("ensembl_gene_id")["cor_pval"]
        .agg("min")
        .reset_index()
    )

    # These are the interesting columns of the druggability dataset
    useful_columns = [
        "geneid",
        "sm_druggability_bucket",
        "safety_bucket",
        "abability_bucket",
        "pharos_class",
        "classification",
        "safety_bucket_definition",
        "abability_bucket_definition",
    ]
    druggability = druggability[useful_columns]

    target_list = nest_fields(
        df=target_list, grouping="ensembl_gene_id", new_column="nominated_target"
    )

    median_expression = nest_fields(
        df=median_expression, grouping="ensembl_gene_id", new_column="median_expression"
    )

    druggability = nest_fields(
        df=druggability, grouping="geneid", new_column="druggability"
    )
    druggability.rename(columns={"geneid": "ensembl_gene_id"}, inplace=True)

    # Merge all the datasets

    gene_info = gene_metadata

    for dataset in [
        igap,
        eqtl,
        rna_change,
        proteomics_concat,
        target_list,
        median_expression,
        druggability,
    ]:
        gene_info = pd.merge(
            left=gene_info,
            right=dataset,
            on="ensembl_gene_id",
            how="outer",
            validate="one_to_one",
        )

    # Populate values for rows that didn't exist in the individual datasets
    gene_info.fillna(
        {"is_igap": False, "has_eqtl": False, "adj_p_val": -1, "cor_pval": -1},
        inplace=True,
    )

    # fillna doesn't work for creating an empty array, need this function instead
    gene_info["alias"] = gene_info.apply(
        lambda row: row["alias"]
        if isinstance(row["alias"], np.ndarray)
        else np.ndarray(0, dtype=object),
        axis=1,
    )

    gene_info["rna_brain_change_studied"] = gene_info["adj_p_val"] != -1
    gene_info["rna_in_ad_brain_change"] = (
        gene_info["adj_p_val"] <= adjusted_p_value_threshold
    ) & gene_info["rna_brain_change_studied"]

    gene_info["protein_brain_change_studied"] = gene_info["cor_pval"] != -1
    gene_info["protein_in_ad_brain_change"] = (
        gene_info["cor_pval"] <= protein_level_threshold
    ) & gene_info["protein_brain_change_studied"]

    # Create the 'nominations' field
    gene_info["nominations"] = gene_info.apply(
        lambda row: len(row["nominated_target"])
        if isinstance(row["nominated_target"], list)
        else np.NaN,
        axis=1,
    )

    # Remove some extra columns that got added during merges
    gene_info = gene_info[
        [
            "ensembl_gene_id",
            "name",
            "summary",
            "symbol",
            "alias",
            "is_igap",
            "has_eqtl",
            "rna_in_ad_brain_change",
            "rna_brain_change_studied",
            "protein_in_ad_brain_change",
            "protein_brain_change_studied",
            "nominated_target",
            "median_expression",
            "druggability",
            "nominations",
        ]
    ]

    # Make sure there are no N/A Ensembl IDs
    gene_info = gene_info.dropna(subset=["ensembl_gene_id"])

    return gene_info
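
Finally, a sketch of the expected call for transform_gene_info. The datasets dict must carry all nine input frames under the keys read at the top of the function; the two thresholds shown are illustrative stand-ins for the configured values.

gene_info = transform_gene_info(
    datasets=datasets,
    adjusted_p_value_threshold=0.05,  # illustrative; real value comes from config
    protein_level_threshold=0.05,  # illustrative; real value comes from config
)
list(gene_info.columns)[:5]
# ['ensembl_gene_id', 'name', 'summary', 'symbol', 'alias']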