Skip to content

Commit

Permalink
merge
Browse files Browse the repository at this point in the history
  • Loading branch information
lucatreide committed Jul 11, 2024
2 parents f845ae1 + 7545bf2 commit 3da419f
Show file tree
Hide file tree
Showing 30 changed files with 26,236 additions and 25,914 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,4 @@ def linear_model(
corrected_alpha=corrected_alpha,
filtered_proteins=filtered_proteins,
messages=messages,
group1=group1,
group2=group2,
)
164 changes: 164 additions & 0 deletions protzilla/data_analysis/differential_expression_mann_whitney.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
import logging

import numpy as np
import pandas as pd
from scipy import stats

from protzilla.data_analysis.differential_expression_helper import _map_log_base, apply_multiple_testing_correction
from protzilla.utilities.transform_dfs import long_to_wide


def mann_whitney_test_on_intensity_data(
    intensity_df: pd.DataFrame,
    metadata_df: pd.DataFrame,
    grouping: str,
    group1: str,
    group2: str,
    log_base: str = None,
    alpha=0.05,
    multiple_testing_correction_method: str = "",
) -> dict:
    """
    Run the Mann-Whitney U test per protein on an intensity data frame.

    The intensity data frame is converted with ``long_to_wide`` and handed to
    ``mann_whitney_test_on_columns`` (with ``columns_name="Protein ID"``). The
    per-protein results are then merged back onto the rows of ``intensity_df``.

    :param intensity_df: the intensity data frame; must contain a "Protein ID" column
        (presumably in long format, since it is passed to ``long_to_wide`` -- verify
        against the caller)
    :param metadata_df: the metadata data frame, forwarded unchanged to the column-wise test
    :param grouping: forwarded unchanged to the column-wise test
    :param group1: forwarded unchanged to the column-wise test
    :param group2: forwarded unchanged to the column-wise test
    :param log_base: forwarded unchanged to the column-wise test
    :param alpha: forwarded unchanged to the column-wise test
    :param multiple_testing_correction_method: forwarded unchanged to the column-wise test
    :return: a dict containing
        - differentially_expressed_proteins_df: the rows of ``intensity_df`` whose protein
          appears in the test's ``differential_expressed_columns_df``, extended by its columns
        - significant_proteins_df: the same, restricted to the test's ``significant_columns_df``
        - corrected_p_values_df, u_statistic_df, log2_fold_change_df, corrected_alpha, messages:
          passed through from the column-wise test
    """

    def _rows_with_results(per_protein_df: pd.DataFrame) -> pd.DataFrame:
        # Left-merge the per-protein results onto the intensities, then keep only
        # the rows of proteins that actually have a result.
        annotated = pd.merge(intensity_df, per_protein_df, on="Protein ID", how="left")
        has_result = annotated["Protein ID"].isin(per_protein_df["Protein ID"])
        return annotated.loc[has_result]

    test_results = mann_whitney_test_on_columns(
        df=long_to_wide(intensity_df),
        metadata_df=metadata_df,
        grouping=grouping,
        group1=group1,
        group2=group2,
        log_base=log_base,
        alpha=alpha,
        multiple_testing_correction_method=multiple_testing_correction_method,
        columns_name="Protein ID",
    )

    tested_rows = _rows_with_results(test_results["differential_expressed_columns_df"])
    significant_rows = _rows_with_results(test_results["significant_columns_df"])

    return {
        "differentially_expressed_proteins_df": tested_rows,
        "significant_proteins_df": significant_rows,
        "corrected_p_values_df": test_results["corrected_p_values_df"],
        "u_statistic_df": test_results["u_statistic_df"],
        "log2_fold_change_df": test_results["log2_fold_change_df"],
        "corrected_alpha": test_results["corrected_alpha"],
        "messages": test_results["messages"],
    }

def mann_whitney_test_on_columns(
    df: pd.DataFrame,
    metadata_df: pd.DataFrame,
    grouping: str,
    group1: str,
    group2: str,
    log_base: str = None,
    alpha=0.05,
    multiple_testing_correction_method: str = "",
    columns_name: str = "Protein ID",
) -> dict:
    """
    Perform a two-sided Mann-Whitney U test on all data columns of the data frame.

    :param df: The data frame containing the data in columns and a
        "Sample" column that can be mapped to the metadata, to assign the groups.
    :param metadata_df: The metadata data frame containing a "Sample" column and the
        grouping information.
    :param grouping: The column name in the metadata data frame that contains the grouping
        information that should be used.
    :param group1: The name of the first group for the Mann-Whitney U test.
    :param group2: The name of the second group for the Mann-Whitney U test.
    :param log_base: The base of the logarithm that was used to transform the data.
    :param alpha: The significance level for the test.
    :param multiple_testing_correction_method: The method for multiple testing correction.
    :param columns_name: The name of the identifier column in the returned data frames;
        it holds the names of the tested columns of ``df``.
    :return: a dict containing
        - a df differential_expressed_columns_df with one row per valid column, containing the
          corrected p-value, the log2 fold change and the U statistic,
        - a df significant_columns_df, containing the rows of differential_expressed_columns_df
          whose corrected p-value is at most corrected_alpha,
        - a df corrected_p_values_df, containing the p-values after application of multiple
          testing correction,
        - a df u_statistic_df, containing the U statistic per column,
        - a df log2_fold_change_df, containing the log2 fold changes (group2 over group1)
          per column,
        - a float corrected_alpha, containing the alpha value after application of multiple
          testing correction (depending on the selected multiple testing correction method
          corrected_alpha may be equal to alpha),
        - a list messages, containing messages for the user
    """

    df_with_groups = pd.merge(
        left=df,
        right=metadata_df[["Sample", grouping]],
        on="Sample",
        copy=False,
    )
    log_base = _map_log_base(log_base)  # now log_base in [2, 10, None]

    # The group membership does not depend on the tested column, so select the
    # rows of both groups once instead of once per column.
    group1_df = df_with_groups[df_with_groups[grouping] == group1]
    group2_df = df_with_groups[df_with_groups[grouping] == group2]

    valid_columns = []
    p_values = []
    log2_fold_changes = []
    u_statistics = []
    invalid_columns = []
    data_columns = df.columns[~df.columns.isin(["Sample", grouping])]

    for column in data_columns:
        group1_data = group1_df[column]
        group2_data = group2_df[column]
        u_statistic, p_value = stats.mannwhitneyu(
            group1_data, group2_data, alternative="two-sided"
        )

        # A NaN p-value means the test could not be evaluated for this column;
        # such columns are reported to the user instead of being corrected.
        if np.isnan(p_value):
            invalid_columns.append(column)
            continue

        if log_base:
            # Undo the log transformation so the fold change is computed on
            # the original scale.
            log2_fold_change = np.log2(
                np.power(log_base, group2_data).mean()
                / np.power(log_base, group1_data).mean()
            )
        else:
            log2_fold_change = np.log2(group2_data.mean() / group1_data.mean())

        valid_columns.append(column)
        p_values.append(p_value)
        u_statistics.append(u_statistic)
        log2_fold_changes.append(log2_fold_change)

    corrected_p_values, corrected_alpha = apply_multiple_testing_correction(
        p_values=p_values,
        method=multiple_testing_correction_method,
        alpha=alpha,
    )

    corrected_p_values_df = pd.DataFrame(
        list(zip(valid_columns, corrected_p_values)),
        columns=[columns_name, "corrected_p_value"],
    )
    log2_fold_change_df = pd.DataFrame(
        list(zip(valid_columns, log2_fold_changes)),
        columns=[columns_name, "log2_fold_change"],
    )
    # The values are Mann-Whitney U statistics; label them like the matching
    # column of the combined data frame below (previously mislabelled "t_statistic").
    u_statistic_df = pd.DataFrame(
        list(zip(valid_columns, u_statistics)),
        columns=[columns_name, "u_statistic"],
    )

    combined_df = pd.DataFrame(
        list(zip(valid_columns, corrected_p_values, log2_fold_changes, u_statistics)),
        columns=[columns_name, "corrected_p_value", "log2_fold_change", "u_statistic"],
    )

    significant_columns_df = combined_df[
        combined_df["corrected_p_value"] <= corrected_alpha
    ]

    messages = (
        [dict(level=logging.INFO, msg=f"Invalid columns: {invalid_columns}")]
        if invalid_columns
        else []
    )

    return dict(
        differential_expressed_columns_df=combined_df,
        significant_columns_df=significant_columns_df,
        corrected_p_values_df=corrected_p_values_df,
        u_statistic_df=u_statistic_df,
        log2_fold_change_df=log2_fold_change_df,
        corrected_alpha=corrected_alpha,
        messages=messages,
    )
8 changes: 1 addition & 7 deletions protzilla/data_analysis/differential_expression_t_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,12 +155,8 @@ def t_test(
for df in dataframes:
intensity_df = pd.merge(intensity_df, df, on="Protein ID", how="left")

differentially_expressed_proteins = [
protein for protein, p in zip(valid_protein_groups, corrected_p_values)
]

differentially_expressed_proteins_df = intensity_df.loc[
intensity_df["Protein ID"].isin(differentially_expressed_proteins)
intensity_df["Protein ID"].isin(valid_protein_groups)
]

significant_proteins_df = differentially_expressed_proteins_df[
Expand All @@ -178,6 +174,4 @@ def t_test(
corrected_alpha=corrected_alpha,
# filtered_proteins=filtered_proteins,
messages=messages,
group1=group1,
group2=group2,
)
98 changes: 97 additions & 1 deletion protzilla/data_analysis/ptm_analysis.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
import logging
from math import log

import numpy as np
import pandas as pd
import re

from protzilla.utilities.transform_dfs import long_to_wide


def filter_peptides_of_protein(
Expand All @@ -27,4 +32,95 @@ def filter_peptides_of_protein(
"level": logging.INFO if len(filtered_peptides) > 0 else logging.WARNING,
"msg": f"Selected {len(filtered_peptides)} entry's from the peptide dataframe."
}],
)
)


def ptms_per_sample(peptide_df: pd.DataFrame) -> dict:
    """
    This function calculates the amount of every PTM per sample.

    :param peptide_df: the pandas dataframe containing the peptide information; the columns
        "Sample" and "Modifications" are used, where "Modifications" holds comma-separated
        modification entries
    :return: dict containing a dataframe with one row per sample and one column per PTM that
        occurs in the peptide_df, with the cells containing the amount of the PTM in the sample
    """

    # One 0/1 indicator column per distinct entry of the comma-separated strings.
    modification_df = pd.concat(
        [
            peptide_df["Sample"],
            peptide_df["Modifications"].str.get_dummies(sep=","),
        ],
        axis=1,
    )

    # `DataFrame.iteritems` was removed in pandas 2.0. Iterate over a snapshot of
    # the labels, because the frame is re-bound by `rename` inside the loop.
    for column in list(modification_df.columns):
        amount, name = from_string(column)
        if amount > 1:
            # An entry with a count prefix stands for `amount` occurrences.
            modification_df[column] = modification_df[column].multiply(amount)
        modification_df = modification_df.rename(columns={column: name})

    modification_df = modification_df.groupby(["Sample"]).sum()

    # Removing the count prefix can yield several columns with the same PTM name;
    # add them up. Grouping the transposed frame replaces `groupby(..., axis=1)`,
    # which is deprecated since pandas 2.1.
    modification_df = modification_df.T.groupby(level=0).sum().T

    modification_df = modification_df.reset_index()

    return dict(ptm_df=modification_df)


def ptms_per_protein_and_sample(peptide_df: pd.DataFrame) -> dict:
    """
    This function calculates the amount of every PTM per sample and protein.

    :param peptide_df: the pandas dataframe containing the peptide information; the columns
        "Sample", "Protein ID" and "Modifications" are used, where "Modifications" holds
        comma-separated modification entries
    :return: dict containing a dataframe with one row per sample and one column per protein,
        with the cells containing a list of PTMs that occur in the peptide_df for the protein
        and sample and their amount in the protein and sample
    """

    # One 0/1 indicator column per distinct entry of the comma-separated strings.
    modification_df = peptide_df[["Sample", "Protein ID"]].join(
        peptide_df["Modifications"].str.get_dummies(sep=",")
    )

    # `DataFrame.iteritems` was removed in pandas 2.0. Iterate over a snapshot of
    # the labels, because the frame is re-bound by `rename` inside the loop.
    for column in list(modification_df.columns):
        amount, name = from_string(column)
        if amount > 1:
            # An entry with a count prefix stands for `amount` occurrences.
            modification_df[column] = modification_df[column].multiply(amount)
        modification_df = modification_df.rename(columns={column: name})

    modification_df = modification_df.groupby(["Sample", "Protein ID"]).sum()

    # Removing the count prefix can yield several columns with the same PTM name;
    # add them up. Grouping the transposed frame replaces `groupby(..., axis=1)`,
    # which is deprecated since pandas 2.1.
    modification_df = modification_df.T.groupby(level=0).sum().T

    modification_df = modification_df.reset_index()

    # Render every count as "(<amount>) <PTM name>, " ...
    modi = modification_df.drop(["Sample", "Protein ID"], axis=1).apply(
        lambda x: "(" + x.astype(str) + ") " + x.name + ", "
    )

    # ... and blank out the PTMs that do not occur for a sample/protein pair.
    for column in modi.columns:
        modi[column] = np.where(modification_df[column] > 0, modi[column], "")

    modification_df["Modifications"] = modi.apply("".join, axis=1)
    modification_df = modification_df[["Sample", "Protein ID", "Modifications"]]

    modification_df = (
        long_to_wide(modification_df, "Modifications").fillna("").reset_index()
    )

    return dict(ptm_df=modification_df)


def from_string(mod_string: str) -> tuple[int, str]:
    """
    This function extracts the amount and name of a modification from its listing in the
    evidence file.

    A listing is either just the modification name (amount 1) or the name prefixed by an
    integer count and whitespace, e.g. "2 Oxidation (M)".

    :param mod_string: a string containing the amount and name of the modification
    :return: tuple containing the amount and name of the modification
    """

    # Only a leading integer that is separated from the name by whitespace is a
    # count. Digits inside the name itself (e.g. "Dimethyl:2H(4)") must stay part
    # of the name instead of being read as the amount and truncating the name.
    counted = re.match(r"\s*(\d+)\s+(.+)", mod_string)
    if counted:
        return int(counted.group(1)), counted.group(2)

    return 1, mod_string.lstrip()
Loading

0 comments on commit 3da419f

Please sign in to comment.