Skip to content

Commit

Permalink
Merge branch 'dev' into deep-learning-spectrum-prediction
Browse files Browse the repository at this point in the history
# Conflicts:
#	ui/runs/forms/data_analysis.py
#	ui/runs/views.py
  • Loading branch information
henninggaertner committed Nov 14, 2024
2 parents 3fc6390 + a2d0bf6 commit 87dd67e
Show file tree
Hide file tree
Showing 29 changed files with 1,034 additions and 256 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
.DS_Store
.idea/
user_data/runs/*
user_data/workflows/*
user_data/external_data/*
user_data/debug/*
ui/static/admin/*
Expand Down Expand Up @@ -145,4 +146,4 @@ dmypy.json

#custom file everyone has
Miniconda3*.sh
Miniconda3*.exe
Miniconda3*.exe
7 changes: 7 additions & 0 deletions protzilla/data_analysis/differential_expression.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,17 @@
from .differential_expression_anova import anova
from .differential_expression_linear_model import linear_model
from .differential_expression_t_test import t_test
from .differential_expression_mann_whitney import mann_whitney_test_on_intensity_data, mann_whitney_test_on_ptm_data
from .differential_expression_kruskal_wallis import kruskal_wallis_test_on_intensity_data, kruskal_wallis_test_on_ptm_data



# Reference the imported functions so that the pre-commit hook does not remove their imports as unused.
def unused():
    """
    References every imported differential expression function by calling it
    with no arguments, so that the imports of this module count as used.
    """
    referenced_functions = (
        anova,
        t_test,
        linear_model,
        mann_whitney_test_on_ptm_data,
        mann_whitney_test_on_intensity_data,
        kruskal_wallis_test_on_intensity_data,
        kruskal_wallis_test_on_ptm_data,
    )
    for referenced_function in referenced_functions:
        referenced_function(**{})
27 changes: 2 additions & 25 deletions protzilla/data_analysis/differential_expression_anova.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from .differential_expression_helper import (
INVALID_PROTEINGROUP_DATA_MSG,
_map_log_base,
apply_multiple_testing_correction,
apply_multiple_testing_correction, preprocess_grouping,
)


Expand Down Expand Up @@ -57,30 +57,7 @@ def anova(
- a df filtered_proteins, containing the filtered out proteins (due to missing values or identical values),
"""

assert grouping in metadata_df.columns, f"{grouping} not found in metadata_df"
messages = []
# Check if the selected groups are present in the metadata_df

# Select all groups if none or less than two were selected
if not selected_groups or isinstance(selected_groups, str):
selected_groups = metadata_df[grouping].unique()
selected_groups_str = "".join([" " + str(group) for group in selected_groups])
messages.append(
{
"level": logging.INFO,
"msg": f"Auto-selected the groups {selected_groups_str} for comparison because none or only one group was selected.",
}
)
elif len(selected_groups) >= 2:
for group in selected_groups:
if group not in metadata_df[grouping].unique():
messages.append(
{
"level": logging.ERROR,
"msg": f"Group {group} not found in metadata_df.",
}
)
return {"messages": messages}
selected_groups, messages = preprocess_grouping(metadata_df, grouping, selected_groups)

# Merge the intensity and metadata dataframes in order to assign to each Sample
# their corresponding group
Expand Down
97 changes: 94 additions & 3 deletions protzilla/data_analysis/differential_expression_helper.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
import logging
import math

import numpy as np
import pandas as pd

from statsmodels.stats.multitest import multipletests


def apply_multiple_testing_correction(
p_values: list, method: str, alpha: float
p_values: list, method: str, alpha: float
) -> tuple:
"""
Applies a multiple testing correction method to a list of p-values
Expand Down Expand Up @@ -46,8 +49,96 @@ def _map_log_base(log_base: str) -> int | None:
return log_base_mapping.get(log_base, None)


def preprocess_grouping(
metadata_df: pd.DataFrame, grouping: str, selected_groups: list | str
) -> tuple[list, list[dict]]:
"""
Preprocesses the grouping column in the metadata_df and checks if the selected groups are present.
:param metadata_df: the metadata dataframe
:param grouping: the column name in the metadata_df that contains the grouping information
:param selected_groups: the groups that should be compared
:return: a tuple containing the selected groups and a list of messages
"""
assert grouping in metadata_df.columns, f"{grouping} not found in metadata_df"
messages = []

# Check if the selected groups are present in the metadata_df
removed_groups = []
for group in selected_groups:
if group not in metadata_df[grouping].unique():
selected_groups.remove(group)
removed_groups.append(group)
if removed_groups:
messages.append(
{
"level": logging.WARNING,
"msg": f"Group{'s' if len(removed_groups) > 1 else ''} "
f"{str(removed_groups)[1:-1]} were not found in metadata_df and thus removed.",
}
)

# Select all groups if none or less than two were selected
if not selected_groups or isinstance(selected_groups, str) or len(selected_groups) < 2:
selected_groups = metadata_df[grouping].unique()
selected_groups_str = "".join(["\'" + str(group) + "\', " for group in selected_groups])[0:-2]
messages.append(
{
"level": logging.WARNING,
"msg": f"Auto-selected the groups {selected_groups_str} for comparison, "
f"because none or only one {'valid ' if removed_groups else ''}group was selected.",
}
)

return selected_groups, messages


def calculate_log2_fold_change(
    group1_data: pd.Series, group2_data: pd.Series, log_base: int
) -> float:
    """
    Calculates the log2 fold change of the mean of group 2 relative to the mean of group 1.

    :param group1_data: the values of the first group (denominator of the ratio)
    :param group2_data: the values of the second group (numerator of the ratio)
    :param log_base: if truthy, every value is replaced by log_base raised to that value
        before the means are taken (presumably to undo an earlier log transformation);
        if falsy, the values are used as they are
    :return: log2 of the ratio mean(group 2) / mean(group 1)
    """
    if log_base:
        group2_mean = np.power(log_base, group2_data).mean()
        group1_mean = np.power(log_base, group1_data).mean()
    else:
        group2_mean = group2_data.mean()
        group1_mean = group1_data.mean()

    fold_change = group2_mean / group1_mean
    return np.log2(fold_change)


def merge_differential_expression_and_significant_df(
    intensity_df: pd.DataFrame, diff_exp_df: pd.DataFrame, sig_df: pd.DataFrame
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Attaches the test results to the intensity data, once for all tested proteins
    and once for the significant proteins only.

    :param intensity_df: the intensity dataframe, must contain a "Protein ID" column
    :param diff_exp_df: the per-protein results, must contain a "Protein ID" column
    :param sig_df: the per-protein results of the significant proteins, must contain
        a "Protein ID" column
    :return: a tuple of two dataframes: the rows of intensity_df whose "Protein ID" occurs
        in diff_exp_df, and those whose "Protein ID" occurs in sig_df, each extended by
        the columns of the respective results dataframe
    """

    def _attach_results(results_df: pd.DataFrame) -> pd.DataFrame:
        # Left-merge the results onto the intensities, then keep only the rows
        # whose protein actually has an entry in the results.
        merged_df = pd.merge(intensity_df, results_df, on="Protein ID", how="left")
        has_result = merged_df["Protein ID"].isin(results_df["Protein ID"])
        return merged_df.loc[has_result]

    return _attach_results(diff_exp_df), _attach_results(sig_df)


def normalize_ptm_df(ptm_df: pd.DataFrame) -> pd.DataFrame:
    """
    Normalizes the PTM data frame by dividing the PTM values of every row
    by that row's total amount of peptides.

    :param ptm_df: the PTM data frame, must contain the columns "Sample" and
        "Total Amount of Peptides"; all remaining columns are divided
    :return: a new data frame with "Sample" as first column followed by the normalized
        columns; the "Total Amount of Peptides" column is not part of the result
    """
    peptide_totals = ptm_df["Total Amount of Peptides"]
    ptm_values = ptm_df.drop(columns=["Sample", "Total Amount of Peptides"])

    # axis=0 aligns the totals with the rows, so each row is divided by its own total.
    normalized_ptm_df = ptm_values.div(peptide_totals, axis=0)
    normalized_ptm_df.insert(0, "Sample", ptm_df["Sample"])

    return normalized_ptm_df


# Message dict (with a "level" and a "msg" entry) warning that the p-values of some
# protein groups could not be calculated and that these groups were omitted.
# The "msg" key is defined exactly once; a repeated key in a dict literal is
# silently overridden by its last occurrence.
INVALID_PROTEINGROUP_DATA_MSG = {
    "level": logging.WARNING,
    "msg": "Due to missing or identical values, the p-values for some protein groups could not be calculated. "
    "These groups were omitted from the analysis. "
    "To prevent this, please add filtering and imputation steps to your workflow before running the analysis.",
}
Loading

0 comments on commit 87dd67e

Please sign in to comment.