Merge branch 'dev' into deep-learning-spectrum-prediction

cschlaffner · Nov 14, 2024 · 3fc6390 · 3fc6390
2 parents 7d07942 + 22292af
commit 3fc6390
Show file tree

Hide file tree

Showing 8 changed files with 293 additions and 54 deletions.
diff --git a/README.md b/README.md
@@ -26,7 +26,7 @@ Once the script has done most of its work, something along the lines of `Startin
 
 ## Start-Guide - a little more technical
 > [!NOTE]
-> For further information on how to contribute on PROTzilla read our [dev-guide](./docs/dev-guide.md).
+> For further information on how to contribute on PROTzilla read our [dev-guide](https://github.com/cschlaffner/PROTzilla2/wiki).
 
 PROTzilla2 uses Python 3.11 and conda to manage the environment and pip for installing packages.
 

diff --git a/protzilla/data_analysis/differential_expression_mann_whitney.py b/protzilla/data_analysis/differential_expression_mann_whitney.py
@@ -16,8 +16,34 @@ def mann_whitney_test_on_intensity_data(
         group2: str,
         log_base: str = None,
         alpha=0.05,
-        multiple_testing_correction_method: str = "",
+        multiple_testing_correction_method: str = "Benjamini-Hochberg",
+        p_value_calculation_method: str = "auto"
 ) -> dict:
+    """
+    Perform Mann-Whitney U test on all proteins in the given intensity data frame.
+
+    :param intensity_df: A protein dataframe in typical PROTzilla long format.
+    :param metadata_df: The metadata data frame containing the grouping information.
+    :param grouping: The column name in the metadata data frame that contains the grouping information,
+        that should be used.
+    :param group1: The name of the first group for the Mann-Whitney U test.
+    :param group2: The name of the second group for the Mann-Whitney U test.
+    :param log_base: The base of the logarithm that was used to transform the data.
+    :param alpha: The significance level for the test.
+    :param multiple_testing_correction_method: The method for multiple testing correction.
+    :param p_value_calculation_method: The method for p-value calculation.
+
+    :return: a dict containing
+        - a df differentially_expressed_proteins_df in long format containing all test results
+        - a df significant_proteins_df, containing the proteins of differentially_expressed_proteins_df,
+            that are significant after multiple testing correction
+        - a df corrected_p_values, containing the p_values after application of multiple testing correction
+        - a df log2_fold_change, containing the log2 fold changes per protein
+        - a df u_statistic_df, containing the u-statistic per protein
+        - a float corrected_alpha, containing the alpha value after application of multiple testing correction
+            (depending on the selected multiple testing correction method corrected_alpha may be equal to alpha)
+        - a list messages (optional), containing messages for the user
+    """
     wide_df = long_to_wide(intensity_df)
 
     outputs = mann_whitney_test_on_columns(
@@ -30,6 +56,7 @@ def mann_whitney_test_on_intensity_data(
         alpha=alpha,
         multiple_testing_correction_method=multiple_testing_correction_method,
         columns_name="Protein ID",
+        p_value_calculation_method=p_value_calculation_method
     )
     differentially_expressed_proteins_df = pd.merge(intensity_df, outputs["differential_expressed_columns_df"], on="Protein ID", how="left")
     differentially_expressed_proteins_df = differentially_expressed_proteins_df.loc[
@@ -50,6 +77,65 @@ def mann_whitney_test_on_intensity_data(
         messages=outputs["messages"],
     )
 
+def mann_whitney_test_on_ptm_data(
+        ptm_df: pd.DataFrame,
+        metadata_df: pd.DataFrame,
+        grouping: str,
+        group1: str,
+        group2: str,
+        alpha=0.05,
+        multiple_testing_correction_method: str = "Benjamini-Hochberg",
+        p_value_calculation_method: str = "auto"
+) -> dict:
+    """
+    Perform Mann-Whitney U test on all PTMs in the given PTM data frame.
+
+    :param ptm_df: The data frame containing the PTM data in columns and a
+        "Sample" column that can be mapped to the metadata, to assign the groups.
+    :param metadata_df: The metadata data frame containing the grouping information.
+    :param grouping: The column name in the metadata data frame that contains the grouping information,
+        that should be used.
+    :param group1: The name of the first group for the Mann-Whitney U test.
+    :param group2: The name of the second group for the Mann-Whitney U test.
+    :param log_base: The base of the logarithm that was used to transform the data.
+    :param alpha: The significance level for the test.
+    :param multiple_testing_correction_method: The method for multiple testing correction.
+    :param p_value_calculation_method: The method for p-value calculation.
+
+    :return: a dict containing
+        - a df differentially_expressed_ptm_df in wide format containing all test results
+        - a df significant_ptm_df, containing the PTMs of differentially_expressed_ptm_df,
+            that are significant after multiple testing correction
+        - a df corrected_p_values_df, containing the p_values after application of multiple testing correction,
+        - a df log2_fold_change_df, containing the log2 fold changes per PTM,
+        - a df u_statistic_df, containing the u-statistic per PTM,
+        - a float corrected_alpha, containing the alpha value after application of multiple testing correction (depending on the selected multiple testing correction method corrected_alpha may be equal to alpha),
+        - a list messages, containing messages for the user
+    """
+    output = mann_whitney_test_on_columns(
+        df=ptm_df,
+        metadata_df=metadata_df,
+        grouping=grouping,
+        group1=group1,
+        group2=group2,
+        log_base=None,
+        alpha=alpha,
+        multiple_testing_correction_method=multiple_testing_correction_method,
+        columns_name="PTM",
+        p_value_calculation_method=p_value_calculation_method
+    )
+
+    return dict(
+        differentially_expressed_ptm_df=output["differential_expressed_columns_df"],
+        significant_ptm_df=output["significant_columns_df"],
+        corrected_p_values_df=output["corrected_p_values_df"],
+        u_statistic_df=output["u_statistic_df"],
+        log2_fold_change_df=output["log2_fold_change_df"],
+        corrected_alpha=output["corrected_alpha"],
+        messages=output["messages"],
+    )
+
+
 def mann_whitney_test_on_columns(
         df: pd.DataFrame,
         metadata_df: pd.DataFrame,
@@ -58,25 +144,28 @@ def mann_whitney_test_on_columns(
         group2: str,
         log_base: str = None,
         alpha=0.05,
-        multiple_testing_correction_method: str = "",
+        multiple_testing_correction_method: str = "Benjamini-Hochberg",
         columns_name: str = "Protein ID",
+        p_value_calculation_method: str = "auto"
 ) -> dict:
     """
     Perform Mann-Whitney U test on all columns of the data frame.
 
-    @param df: The data frame containing the data in columns and a
+    :param df: The data frame containing the data in columns and a
     "Sample" column that can be mapped to the metadata, to assign the groups.
-    @param metadata_df: The metadata data frame containing the grouping information.
-    @param grouping: The column name in the metadata data frame that contains the grouping information,
+    :param metadata_df: The metadata data frame containing the grouping information.
+    :param grouping: The column name in the metadata data frame that contains the grouping information,
     that should be used.
-    @param group1: The name of the first group for the Mann-Whitney U test.
-    @param group2: The name of the second group for the Mann-Whitney U test.
-    @param log_base: The base of the logarithm that was used to transform the data.
-    @param alpha: The significance level for the test.
-    @param multiple_testing_correction_method: The method for multiple testing correction.
+    :param group1: The name of the first group for the Mann-Whitney U test.
+    :param group2: The name of the second group for the Mann-Whitney U test.
+    :param log_base: The base of the logarithm that was used to transform the data.
+    :param alpha: The significance level for the test.
+    :param multiple_testing_correction_method: The method for multiple testing correction.
+    :param columns_name: The semantics of the column names. This is used to name the columns in the output data frames.
+    :param p_value_calculation_method: The method for p-value calculation.
 
     :return: a dict containing
-        - a df differentially_expressed_column_df in wide format containing the t-test results
+        - a df differentially_expressed_column_df in wide format containing the test results
         - a df significant_columns_df, containing the columns of differentially_expressed_column_df,
             that are significant after multiple testing correction
         - a df corrected_p_values, containing the p_values after application of multiple testing correction,
@@ -104,7 +193,7 @@ def mann_whitney_test_on_columns(
     for column in data_columns:
         group1_data = df_with_groups[df_with_groups[grouping] == group1][column]
         group2_data = df_with_groups[df_with_groups[grouping] == group2][column]
-        u_statistic, p_value = stats.mannwhitneyu(group1_data, group2_data, alternative="two-sided")
+        u_statistic, p_value = stats.mannwhitneyu(group1_data, group2_data, alternative="two-sided", method=p_value_calculation_method)
 
         if not np.isnan(p_value):
             log2_fold_change = (
@@ -139,7 +228,7 @@ def mann_whitney_test_on_columns(
     )
     u_statistic_df = pd.DataFrame(
         list(zip(valid_columns, u_statistics)),
-        columns=[columns_name, "t_statistic"],
+        columns=[columns_name, "u_statistic"],
     )
 
     combined_df = pd.DataFrame(

diff --git a/protzilla/importing/metadata_import.py b/protzilla/importing/metadata_import.py
@@ -68,7 +68,7 @@ def metadata_import_method(
         messages.append(
             {
                 "level": logging.INFO,
-                "msg": "The imported dataframe indicates an incorrent orientation. Consider viewing the table to ensure the orientation is correct.",
+                "msg": "The imported dataframe indicates an incorrect orientation. Consider viewing the table to ensure the orientation is correct.",
             }
         )
 

diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
@@ -10,9 +10,7 @@
 from protzilla.data_analysis.differential_expression_anova import anova
 from protzilla.data_analysis.differential_expression_linear_model import linear_model
 from protzilla.data_analysis.differential_expression_mann_whitney import (
-    mann_whitney_test_on_columns,
-    mann_whitney_test_on_intensity_data,
-)
+    mann_whitney_test_on_intensity_data, mann_whitney_test_on_ptm_data)
 from protzilla.data_analysis.differential_expression_t_test import t_test
 from protzilla.data_analysis.dimension_reduction import t_sne, umap
 from protzilla.data_analysis.model_evaluation import evaluate_classification_model
@@ -182,11 +180,13 @@ class DifferentialExpressionMannWhitneyOnIntensity(DataAnalysisStep):
         "group2",
         "alpha",
         "multiple_testing_correction_method",
+        "p_value_calculation_method",
     ]
     output_keys = [
         "differentially_expressed_proteins_df",
         "significant_proteins_df",
         "corrected_p_values_df",
+        "u_statistic_df",
         "log2_fold_change_df",
         "corrected_alpha",
     ]
@@ -216,40 +216,32 @@ class DifferentialExpressionMannWhitneyOnPTM(DataAnalysisStep):
     )
 
     input_keys = [
-        "df",
+        "ptm_df",
         "metadata_df",
         "grouping",
         "group1",
         "group2",
         "alpha",
         "multiple_testing_correction_method",
-        "columns_name",
+        "p_value_calculation_method",
     ]
     output_keys = [
         "differentially_expressed_ptm_df",
         "significant_ptm_df",
         "corrected_p_values_df",
+        "u_statistic_df",
         "log2_fold_change_df",
         "corrected_alpha",
     ]
 
     def method(self, inputs: dict) -> dict:
-        return mann_whitney_test_on_columns(**inputs)
+        return mann_whitney_test_on_ptm_data(**inputs)
 
     def insert_dataframes(self, steps: StepManager, inputs) -> dict:
-        inputs["df"] = steps.get_step_output(Step, "ptm_df", inputs["ptm_df"])
-        inputs["columns_name"] = "PTM"
+        inputs["ptm_df"] = steps.get_step_output(Step, "ptm_df", inputs["ptm_df"])
         inputs["metadata_df"] = steps.metadata_df
-        inputs["log_base"] = steps.get_step_input(TransformationLog, "log_base")
         return inputs
-
-    def handle_outputs(self, outputs: dict) -> None:
-        outputs["differentially_expressed_ptm_df"] = outputs.pop(
-            "differential_expressed_columns_df", None
-        )
-        outputs["significant_ptm_df"] = outputs.pop("significant_columns_df", None)
-        super().handle_outputs(outputs)
-
+
 
 class PlotVolcano(PlotStep):
     display_name = "Volcano Plot"

diff --git a/protzilla/utilities/transform_dfs.py b/protzilla/utilities/transform_dfs.py
@@ -3,7 +3,7 @@
 from protzilla.utilities import default_intensity_column
 
 
-def long_to_wide(intensity_df: pd.DataFrame, value_name: str = None):
+def long_to_wide(intensity_df: pd.DataFrame, value_name: str | None = None):
     """
     This function transforms the dataframe to a wide format that
     can be more easily handled by packages such as sklearn.