From e6c29e256817780e61e5cc8803812670039c4e6d Mon Sep 17 00:00:00 2001 From: JanniRoebbecke <50395935+JanniRoebbecke@users.noreply.github.com> Date: Wed, 14 Aug 2024 20:57:05 +0200 Subject: [PATCH] Mann whitney test (#518) * evidence import * tests * filter proteins * filter sampels * Select Peptides by group or sample for coverage visualisation (#410) * untested working, needs cleanup * linter * remove dupliacte method (merge screw-up) * add doc strings, fix first few tests (miss. args) * metadata df now optional param + adjusted asserts, fixed test * formatting * conditional metadata input for protein graphs * import logging levels, formatting * add grouping tests for peptides, add error handling * spelling, formatting * remove unused import, format * fix error msgs in tests * fix line length * Convert has_metadata to property * Use logger instead of print statement * An attempt at consistent text-casing * formatting --------- Co-authored-by: henninggaertner * tests for filter_proteins with peptide file * tests for filter_proteins with peptide file, better mock dataframe * output types * added testing that only filtered Proteins are removed from peptides_df * tests for filter_samples with peptide file * extract test that peptide filtering matches protein filtering into Method * refactor: shortened tests by integrating extracted method and mock data * exclude peptides * tests * minor fixes in test * minor fixes in test * fix test that peptide filtering matches protein filtering Method * fix test that peptide filtering matches protein filtering Method * fix test that peptide filtering matches protein filtering Method * fix test that peptide filtering matches protein filtering Method * match mock protein dataframe to mock peptide dataframe * extract test that peptide filtering matches protein filtering into Method * implementation * tests * tests improve error logging * tests git testing * peptide import added missing column * implements form, form_mapping, method and the actual functionality (untested) * merge peptide filtering and outlier_detection * merge peptide transformation * implement passing of peptide data between preprocessing steps * complete implement passing of peptide data between preprocessing steps * implement overhead for new step evidence import * enable selecting multiple proteins of interest and implements tests * Fix EvidenceImport class (now the right method is called) * implement option to choose protein from list of significant proteins * implement option to let the system choose the most expressed protein * include messages for peptide filtering * remove selecting multiple proteins and add substring search * add cleaning of protein groups to evidence import * implement suggested changes: remove default values and correct grammar mistake * make preprocessing work without peptide_df * clean up * add cleaning of protein groups to evidence import * rename * clean Protein IDs for peptide_import and name intensity column Intensity * implement requested changes: make peptide_df input optional for all preprocessing steps, remove peptide_df input in preprocessing steps, where it is not used * implement requested changes: make peptide_df input optional for all preprocessing steps, remove peptide_df input in preprocessing steps, where it is not used * unify into one method * cleanup old names * cleanup names * cleanup names * setup form and method * small refactor: utilize fill_helper better * implemented method * cleanup rename and docstring * tests and named sample column from "0" to "Sample" * implemented slow version * enable selecting multiple proteins * complete merge * rename file and fix selecting multiple proteins * improve efficiency of methods ptms_per_sample and ptms_per_protein_and_sample * added test for ptms_per_protein_and_sample * complete merge (fix testes after merge) * cleanup * implement differential expression with mann-whitney-test on ptm data * implement differential expression with mann-whitney-test on protein data * refactor input and output handling for Mann-Whitney-test on ptm data * add additional parameter form user and docstrings * fix typo * add test for mann whitney on intensity data * add test for mann whitney on ptm data * add mann whitney to volcano plot form * add missing value to output_keys of mann whitney test * Implement requested changes --------- Co-authored-by: janni.roebbecke --- .../differential_expression_mann_whitney.py | 115 ++++++++++-- protzilla/methods/data_analysis.py | 26 +-- protzilla/utilities/transform_dfs.py | 2 +- .../test_differential_expression.py | 165 ++++++++++++++++-- ui/runs/forms/data_analysis.py | 15 +- 5 files changed, 277 insertions(+), 46 deletions(-) diff --git a/protzilla/data_analysis/differential_expression_mann_whitney.py b/protzilla/data_analysis/differential_expression_mann_whitney.py index 89041261..3ef4f663 100644 --- a/protzilla/data_analysis/differential_expression_mann_whitney.py +++ b/protzilla/data_analysis/differential_expression_mann_whitney.py @@ -16,8 +16,34 @@ def mann_whitney_test_on_intensity_data( group2: str, log_base: str = None, alpha=0.05, - multiple_testing_correction_method: str = "", + multiple_testing_correction_method: str = "Benjamini-Hochberg", + p_value_calculation_method: str = "auto" ) -> dict: + """ + Perform Mann-Whitney U test on all proteins in the given intensity data frame. + + :param intensity_df: A protein dataframe in typical PROTzilla long format. + :param metadata_df: The metadata data frame containing the grouping information. + :param grouping: The column name in the metadata data frame that contains the grouping information, + that should be used. + :param group1: The name of the first group for the Mann-Whitney U test. + :param group2: The name of the second group for the Mann-Whitney U test. + :param log_base: The base of the logarithm that was used to transform the data. + :param alpha: The significance level for the test. + :param multiple_testing_correction_method: The method for multiple testing correction. + :param p_value_calculation_method: The method for p-value calculation. + + :return: a dict containing + - a df differentially_expressed_proteins_df in long format containing all test results + - a df significant_proteins_df, containing the proteins of differentially_expressed_column_df, + that are significant after multiple testing correction + - a df corrected_p_values, containing the p_values after application of multiple testing correction + - a df log2_fold_change, containing the log2 fold changes per protein + - a df u_statistic_df, containing the u-statistic per protein + - a float corrected_alpha, containing the alpha value after application of multiple testing correction + (depending on the selected multiple testing correction method corrected_alpha may be equal to alpha) + - a list messages (optional), containing messages for the user + """ wide_df = long_to_wide(intensity_df) outputs = mann_whitney_test_on_columns( @@ -30,6 +56,7 @@ def mann_whitney_test_on_intensity_data( alpha=alpha, multiple_testing_correction_method=multiple_testing_correction_method, columns_name="Protein ID", + p_value_calculation_method=p_value_calculation_method ) differentially_expressed_proteins_df = pd.merge(intensity_df, outputs["differential_expressed_columns_df"], on="Protein ID", how="left") differentially_expressed_proteins_df = differentially_expressed_proteins_df.loc[ @@ -50,6 +77,65 @@ def mann_whitney_test_on_intensity_data( messages=outputs["messages"], ) +def mann_whitney_test_on_ptm_data( + ptm_df: pd.DataFrame, + metadata_df: pd.DataFrame, + grouping: str, + group1: str, + group2: str, + alpha=0.05, + multiple_testing_correction_method: str = "Benjamini-Hochberg", + p_value_calculation_method: str = "auto" +) -> dict: + """ + Perform Mann-Whitney U test on all PTMs in the given PTM data frame. + + :param ptm_df: The data frame containing the PTM data in columns and a + "Sample" column that can be mapped to the metadata, to assign the groups. + :param metadata_df: The metadata data frame containing the grouping information. + :param grouping: The column name in the metadata data frame that contains the grouping information, + that should be used. + :param group1: The name of the first group for the Mann-Whitney U test. + :param group2: The name of the second group for the Mann-Whitney U test. + :param log_base: The base of the logarithm that was used to transform the data. + :param alpha: The significance level for the test. + :param multiple_testing_correction_method: The method for multiple testing correction. + :param p_value_calculation_method: The method for p-value calculation. + + :return: a dict containing + - a df differentially_expressed_ptm_df in wide format containing all test results + - a df significant_ptm_df, containing the ptm of differentially_expressed_column_df, + that are significant after multiple testing correction + - a df corrected_p_values, containing the p_values after application of multiple testing correction, + - a df log2_fold_change, containing the log2 fold changes per column, + - a df t_statistic_df, containing the t-statistic per protein, + - a float corrected_alpha, containing the alpha value after application of multiple testing correction (depending on the selected multiple testing correction method corrected_alpha may be equal to alpha), + - a list messages, containing messages for the user + """ + output = mann_whitney_test_on_columns( + df=ptm_df, + metadata_df=metadata_df, + grouping=grouping, + group1=group1, + group2=group2, + log_base=None, + alpha=alpha, + multiple_testing_correction_method=multiple_testing_correction_method, + columns_name="PTM", + p_value_calculation_method=p_value_calculation_method + ) + + return dict( + differentially_expressed_ptm_df=output["differential_expressed_columns_df"], + significant_ptm_df=output["significant_columns_df"], + corrected_p_values_df=output["corrected_p_values_df"], + u_statistic_df=output["u_statistic_df"], + log2_fold_change_df=output["log2_fold_change_df"], + corrected_alpha=output["corrected_alpha"], + messages=output["messages"], + ) + + def mann_whitney_test_on_columns( df: pd.DataFrame, metadata_df: pd.DataFrame, @@ -58,25 +144,28 @@ def mann_whitney_test_on_columns( group2: str, log_base: str = None, alpha=0.05, - multiple_testing_correction_method: str = "", + multiple_testing_correction_method: str = "Benjamini-Hochberg", columns_name: str = "Protein ID", + p_value_calculation_method: str = "auto" ) -> dict: """ Perform Mann-Whitney U test on all columns of the data frame. - @param df: The data frame containing the data in columns and a + :param df: The data frame containing the data in columns and a "Sample" column that can be mapped to the metadata, to assign the groups. - @param metadata_df: The metadata data frame containing the grouping information. - @param grouping: The column name in the metadata data frame that contains the grouping information, + :param metadata_df: The metadata data frame containing the grouping information. + :param grouping: The column name in the metadata data frame that contains the grouping information, that should be used. - @param group1: The name of the first group for the Mann-Whitney U test. - @param group2: The name of the second group for the Mann-Whitney U test. - @param log_base: The base of the logarithm that was used to transform the data. - @param alpha: The significance level for the test. - @param multiple_testing_correction_method: The method for multiple testing correction. + :param group1: The name of the first group for the Mann-Whitney U test. + :param group2: The name of the second group for the Mann-Whitney U test. + :param log_base: The base of the logarithm that was used to transform the data. + :param alpha: The significance level for the test. + :param multiple_testing_correction_method: The method for multiple testing correction. + :param columns_name: The semantics of the column names. This is used to name the columns in the output data frames. + :param p_value_calculation_method: The method for p-value calculation. :return: a dict containing - - a df differentially_expressed_column_df in wide format containing the t-test results + - a df differentially_expressed_column_df in wide format containing the test results - a df significant_columns_df, containing the columns of differentially_expressed_column_df, that are significant after multiple testing correction - a df corrected_p_values, containing the p_values after application of multiple testing correction, @@ -104,7 +193,7 @@ def mann_whitney_test_on_columns( for column in data_columns: group1_data = df_with_groups[df_with_groups[grouping] == group1][column] group2_data = df_with_groups[df_with_groups[grouping] == group2][column] - u_statistic, p_value = stats.mannwhitneyu(group1_data, group2_data, alternative="two-sided") + u_statistic, p_value = stats.mannwhitneyu(group1_data, group2_data, alternative="two-sided", method=p_value_calculation_method) if not np.isnan(p_value): log2_fold_change = ( @@ -139,7 +228,7 @@ def mann_whitney_test_on_columns( ) u_statistic_df = pd.DataFrame( list(zip(valid_columns, u_statistics)), - columns=[columns_name, "t_statistic"], + columns=[columns_name, "u_statistic"], ) combined_df = pd.DataFrame( diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index 001a2fca..7ad45ddd 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -9,9 +9,7 @@ from protzilla.data_analysis.differential_expression_anova import anova from protzilla.data_analysis.differential_expression_linear_model import linear_model from protzilla.data_analysis.differential_expression_mann_whitney import ( - mann_whitney_test_on_columns, - mann_whitney_test_on_intensity_data, -) + mann_whitney_test_on_intensity_data, mann_whitney_test_on_ptm_data) from protzilla.data_analysis.differential_expression_t_test import t_test from protzilla.data_analysis.dimension_reduction import t_sne, umap from protzilla.data_analysis.model_evaluation import evaluate_classification_model @@ -175,11 +173,13 @@ class DifferentialExpressionMannWhitneyOnIntensity(DataAnalysisStep): "group2", "alpha", "multiple_testing_correction_method", + "p_value_calculation_method", ] output_keys = [ "differentially_expressed_proteins_df", "significant_proteins_df", "corrected_p_values_df", + "u_statistic_df", "log2_fold_change_df", "corrected_alpha", ] @@ -209,40 +209,32 @@ class DifferentialExpressionMannWhitneyOnPTM(DataAnalysisStep): ) input_keys = [ - "df", + "ptm_df", "metadata_df", "grouping", "group1", "group2", "alpha", "multiple_testing_correction_method", - "columns_name", + "p_value_calculation_method", ] output_keys = [ "differentially_expressed_ptm_df", "significant_ptm_df", "corrected_p_values_df", + "u_statistic_df", "log2_fold_change_df", "corrected_alpha", ] def method(self, inputs: dict) -> dict: - return mann_whitney_test_on_columns(**inputs) + return mann_whitney_test_on_ptm_data(**inputs) def insert_dataframes(self, steps: StepManager, inputs) -> dict: - inputs["df"] = steps.get_step_output(Step, "ptm_df", inputs["ptm_df"]) - inputs["columns_name"] = "PTM" + inputs["ptm_df"] = steps.get_step_output(Step, "ptm_df", inputs["ptm_df"]) inputs["metadata_df"] = steps.metadata_df - inputs["log_base"] = steps.get_step_input(TransformationLog, "log_base") return inputs - - def handle_outputs(self, outputs: dict) -> None: - outputs["differentially_expressed_ptm_df"] = outputs.pop( - "differential_expressed_columns_df", None - ) - outputs["significant_ptm_df"] = outputs.pop("significant_columns_df", None) - super().handle_outputs(outputs) - + class PlotVolcano(PlotStep): display_name = "Volcano Plot" diff --git a/protzilla/utilities/transform_dfs.py b/protzilla/utilities/transform_dfs.py index f3605b08..59a83259 100644 --- a/protzilla/utilities/transform_dfs.py +++ b/protzilla/utilities/transform_dfs.py @@ -3,7 +3,7 @@ from protzilla.utilities import default_intensity_column -def long_to_wide(intensity_df: pd.DataFrame, value_name: str = None): +def long_to_wide(intensity_df: pd.DataFrame, value_name: str | None = None): """ This function transforms the dataframe to a wide format that can be more easily handled by packages such as sklearn. diff --git a/tests/protzilla/data_analysis/test_differential_expression.py b/tests/protzilla/data_analysis/test_differential_expression.py index 43ad6021..15a000e4 100644 --- a/tests/protzilla/data_analysis/test_differential_expression.py +++ b/tests/protzilla/data_analysis/test_differential_expression.py @@ -3,6 +3,8 @@ import pytest from protzilla.data_analysis.differential_expression import anova, linear_model, t_test +from protzilla.data_analysis.differential_expression_mann_whitney import mann_whitney_test_on_intensity_data, \ + mann_whitney_test_on_ptm_data from protzilla.data_analysis.plots import create_volcano_plot @@ -62,8 +64,8 @@ def diff_expr_test_data(): def test_differential_expression_linear_model( - diff_expr_test_data, - show_figures, + diff_expr_test_data, + show_figures, ): test_intensity_df, test_metadata_df = diff_expr_test_data test_alpha = 0.05 @@ -107,8 +109,8 @@ def test_differential_expression_linear_model( assert p_values_rounded == corrected_p_values assert log2fc_rounded == log2_fc assert ( - list(current_out["differentially_expressed_proteins_df"]["Protein ID"].unique()) - == differentially_expressed_proteins + list(current_out["differentially_expressed_proteins_df"]["Protein ID"].unique()) + == differentially_expressed_proteins ) assert current_out["corrected_alpha"] == test_alpha @@ -160,13 +162,13 @@ def test_differential_expression_student_t_test(diff_expr_test_data, show_figure assert p_values_rounded == corrected_p_values assert ( - list(current_out["differentially_expressed_proteins_df"]["Protein ID"].unique()) - == differentially_expressed_proteins + list(current_out["differentially_expressed_proteins_df"]["Protein ID"].unique()) + == differentially_expressed_proteins ) assert current_out["corrected_alpha"] == test_alpha assert ( - list(current_out["significant_proteins_df"]["Protein ID"].unique()) - == significant_proteins + list(current_out["significant_proteins_df"]["Protein ID"].unique()) + == significant_proteins ) @@ -217,13 +219,13 @@ def test_differential_expression_welch_t_test(diff_expr_test_data, show_figures) assert p_values_rounded == corrected_p_values assert ( - list(current_out["differentially_expressed_proteins_df"]["Protein ID"].unique()) - == differentially_expressed_proteins + list(current_out["differentially_expressed_proteins_df"]["Protein ID"].unique()) + == differentially_expressed_proteins ) assert current_out["corrected_alpha"] == test_alpha assert ( - list(current_out["significant_proteins_df"]["Protein ID"].unique()) - == significant_proteins + list(current_out["significant_proteins_df"]["Protein ID"].unique()) + == significant_proteins ) @@ -386,4 +388,141 @@ def test_differential_expression_anova(show_figures): 1.0000, ] - assert assertion_p_values == p_values_rounded \ No newline at end of file + assert assertion_p_values == p_values_rounded + + +def test_differential_expression_mann_whitney_on_intensity( + diff_expr_test_data, + show_figures, +): + test_intensity_df, test_metadata_df = diff_expr_test_data + test_alpha = 0.05 + + current_input = dict( + intensity_df=test_intensity_df, + metadata_df=test_metadata_df, + grouping="Group", + group1="Group1", + group2="Group2", + multiple_testing_correction_method="Benjamini-Hochberg", + alpha=test_alpha, + log_base="log2", + p_value_calculation_method="auto", + ) + current_out = mann_whitney_test_on_intensity_data(**current_input) + + fig = create_volcano_plot( + p_values=current_out["corrected_p_values_df"], + log2_fc=current_out["log2_fold_change_df"], + alpha=current_out["corrected_alpha"], + group1=current_input["group1"], + group2=current_input["group2"], + fc_threshold=0, + ) + + if show_figures: + fig.show() + + expected_corrected_p_values = [0.2, 0.4916, 1.0, 0.2] + expected_u_statistics = [9.0, 7.0, 4.5, 9.0] + expected_log2_fc = [-10.1926, -1.0, 0.0, -5.0] + expected_differentially_expressed_proteins = ["Protein1", "Protein2", "Protein3", "Protein4"] + + p_values_rounded = [ + round(x, 4) for x in current_out["corrected_p_values_df"]["corrected_p_value"] + ] + u_statistics = current_out["u_statistic_df"]["u_statistic"] + log2fc_rounded = [ + round(x, 4) for x in current_out["log2_fold_change_df"]["log2_fold_change"] + ] + + assert p_values_rounded == expected_corrected_p_values + assert all(u_statistics == expected_u_statistics) + assert log2fc_rounded == expected_log2_fc + assert ( + list(current_out["differentially_expressed_proteins_df"]["Protein ID"].unique()) + == expected_differentially_expressed_proteins + ) + assert current_out["corrected_alpha"] == test_alpha + + +@pytest.fixture +def ptm_test_data(): + test_amount_list = ( + ["Sample1", 1, 1, 10, 1, 100], + ["Sample2", 2, 2, 10, 1, 100], + ["Sample3", 3, 3, 10, 1, 100], + ["Sample4", 4, 4, 10, 1, 100], + ["Sample5", 5, 5, 10, 1, 100], + ["Sample6", 6, 3, 11, 111, 100], + ["Sample7", 7, 4, 12, 222, 100], + ["Sample8", 8, 5, 13, 333, 100], + ["Sample9", 9, 6, 14, 444, 100], + ["Sample10", 10, 7, 15, 555, 100], + ) + test_amount_df = pd.DataFrame( + data=test_amount_list, + columns=["Sample", "Oxidation", "Acetyl", "GlyGly", "Phospho", "Unmodified"], + ) + + test_metadata_list = ( + ["Sample1", "Group1"], + ["Sample2", "Group1"], + ["Sample3", "Group1"], + ["Sample4", "Group1"], + ["Sample5", "Group1"], + ["Sample6", "Group2"], + ["Sample7", "Group2"], + ["Sample8", "Group2"], + ["Sample9", "Group2"], + ["Sample10", "Group2"], + ) + test_metadata_df = pd.DataFrame( + data=test_metadata_list, + columns=["Sample", "Group"], + ) + + return test_amount_df, test_metadata_df + + +def test_differential_expression_mann_whitney_on_ptm( + ptm_test_data, + show_figures, +): + test_amount_df, test_metadata_df = ptm_test_data + test_alpha = 0.05 + + current_input = dict( + ptm_df=test_amount_df, + metadata_df=test_metadata_df, + grouping="Group", + group1="Group1", + group2="Group2", + multiple_testing_correction_method="Benjamini-Hochberg", + alpha=test_alpha, + p_value_calculation_method="auto", + ) + current_out = mann_whitney_test_on_ptm_data(**current_input) + + expected_corrected_p_values = [0.0132, 0.1423, 0.0132, 0.0132, 1.00000] + expected_u_statistics = [0.0, 4.5, 0.0, 0.0, 12.5] + expected_log2_fc = [1.415, 0.737, 0.3785, 8.3794, 0.0] + + expected_significant_ptms = ["Oxidation", "GlyGly", "Phospho"] + + p_values_rounded = [ + round(x, 4) for x in current_out["corrected_p_values_df"]["corrected_p_value"] + ] + u_statistics = current_out["u_statistic_df"]["u_statistic"] + log2_fc_rounded = [ + round(x, 4) for x in current_out["log2_fold_change_df"]["log2_fold_change"] + ] + + assert p_values_rounded == expected_corrected_p_values + assert all(u_statistics == expected_u_statistics) + assert log2_fc_rounded == expected_log2_fc + assert ( + list(current_out["significant_ptm_df"]["PTM"].unique()) + == expected_significant_ptms + ) + assert current_out["corrected_alpha"] == test_alpha \ No newline at end of file diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py index 1eb6441b..2363832d 100644 --- a/ui/runs/forms/data_analysis.py +++ b/ui/runs/forms/data_analysis.py @@ -38,6 +38,12 @@ class MultipleTestingCorrectionMethod(Enum): bonferroni = "Bonferroni" +class PValueCalculationMethod(Enum): + auto = "Auto" + exact = "Exact" + asymptotic = "Asymptotic" + + class YesNo(Enum): yes = "Yes" no = "No" @@ -341,12 +347,17 @@ class DifferentialExpressionMannWhitneyOnPTMForm(MethodForm): alpha = CustomFloatField( label="Error rate (alpha)", min_value=0, max_value=1, initial=0.05 ) + p_value_calculation_method = CustomChoiceField( + choices=PValueCalculationMethod, + label="P-value calculation method", + initial=PValueCalculationMethod.auto, + ) grouping = CustomChoiceField(choices=[], label="Grouping from metadata") group1 = CustomChoiceField(choices=[], label="Group 1") group2 = CustomChoiceField(choices=[], label="Group 2") def fill_form(self, run: Run) -> None: - self.fields["df"].choices = fill_helper.to_choices( + self.fields["ptm_df"].choices = fill_helper.to_choices( run.steps.get_instance_identifiers(PTMsPerSample, "ptm_df") ) @@ -395,7 +406,7 @@ class PlotVolcanoForm(MethodForm): def fill_form(self, run: Run) -> None: self.fields["input_dict"].choices = fill_helper.to_choices( run.steps.get_instance_identifiers( - DifferentialExpressionTTest | DifferentialExpressionLinearModel, + DifferentialExpressionTTest | DifferentialExpressionLinearModel | DifferentialExpressionMannWhitneyOnIntensityForm, "differentially_expressed_proteins_df", ) )