Enable volcano plot with ptm data (#530)

* tests for filter_proteins with peptide file * tests for filter_proteins with peptide file, better mock dataframe * output types * added testing that only filtered Proteins are removed from peptides_df * tests for filter_samples with peptide file * extract test that peptide filtering matches protein filtering into Method * refactor: shortened tests by integrating extracted method and mock data * exclude peptides * tests * minor fixes in test * minor fixes in test * fix test that peptide filtering matches protein filtering Method * fix test that peptide filtering matches protein filtering Method * fix test that peptide filtering matches protein filtering Method * fix test that peptide filtering matches protein filtering Method * match mock protein dataframe to mock peptide dataframe * extract test that peptide filtering matches protein filtering into Method * implementation * tests * tests improve error logging * tests git testing * peptide import added missing column * implements form, form_mapping, method and the actual functionality (untested) * merge peptide filtering and outlier_detection * merge peptide transformation * implement passing of peptide data between preprocessing steps * complete implement passing of peptide data between preprocessing steps * implement overhead for new step evidence import * enable selecting multiple proteins of interest and implements tests * Fix EvidenceImport class (now the right method is called) * implement option to choose protein from list of significant proteins * implement option to let the system choose the most expressed protein * include messages for peptide filtering * remove selecting multiple proteins and add substring search * add cleaning of protein groups to evidence import * implement suggested changes: remove default values and correct grammar mistake * make preprocessing work without peptide_df * clean up * add cleaning of protein groups to evidence import * rename * clean Protein IDs for peptide_import and 
name intensity column Intensity * implement requested changes: make peptide_df input optional for all preprocessing steps, remove peptide_df input in preprocessing steps, where it is not used * implement requested changes: make peptide_df input optional for all preprocessing steps, remove peptide_df input in preprocessing steps, where it is not used * unify into one method * cleanup old names * cleanup names * cleanup names * setup form and method * small refactor: utilize fill_helper better * implemented method * cleanup rename and docstring * tests and named sample column from "0" to "Sample" * implemented slow version * enable selecting multiple proteins * complete merge * rename file and fix selecting multiple proteins * improve efficiency of methods ptms_per_sample and ptms_per_protein_and_sample * added test for ptms_per_protein_and_sample * complete merge (fix tests after merge) * cleanup * implement differential expression with mann-whitney-test on ptm data * implement differential expression with mann-whitney-test on protein data * refactor input and output handling for Mann-Whitney-test on ptm data * add additional parameter from user and docstrings * fix typo * add test for mann whitney on intensity data * add test for mann whitney on ptm data * add mann whitney to volcano plot form * implement kruskal wallis test on ptm data * implement kruskal wallis test on protein data * adds tests for kruskal wallis test * implement normalization * adapt tests * add unit test for ptm_df normalization * change mock files as to not upset other tests * enable volcano plot with ptm data * make names of select peptides for Protein Step consistent * simplify form and make names more broad * Names of import section * Names of import section * Add description of Volcano Plot * complete merge --------- Co-authored-by: janni.roebbecke <[email protected]>
cschlaffner · Nov 11, 2024 · c4da33e · c4da33e
1 parent 1924c0c
commit c4da33e
Show file tree

Hide file tree

Showing 7 changed files with 99 additions and 41 deletions.
diff --git a/protzilla/data_analysis/plots.py b/protzilla/data_analysis/plots.py
@@ -93,7 +93,8 @@ def create_volcano_plot(
     alpha: float,
     group1: str,
     group2: str,
-    proteins_of_interest: list | None = None,
+    item_type: str = "Protein ID",
+    items_of_interest: list | None = None,
 ) -> dict:
     """
     Function to create a volcano plot from p values and log2 fold change with the
@@ -105,12 +106,13 @@ def create_volcano_plot(
     :param alpha: the alpha value for the significance line
     :param group1: the name of the first group
     :param group2: the name of the second group
-    :param proteins_of_interest: the proteins that should be annotated in the plot
+    :param item_type: in ["Protein", "PTM"] the type of the items in the data
+    :param items_of_interest: the items that should be annotated in the plot
 
     :return: returns a dictionary containing a list with a plotly figure and/or a list of messages
     """
 
-    plot_df = p_values.join(log2_fc.set_index("Protein ID"), on="Protein ID")
+    plot_df = p_values.join(log2_fc.set_index(item_type), on=item_type)
     fig = dashbio.VolcanoPlot(
         dataframe=plot_df,
         effect_size="log2_fold_change",
@@ -122,30 +124,30 @@ def create_volcano_plot(
         xlabel=f"log2(fc) ({group2} / {group1})",
         ylabel="-log10(p)",
         title="Volcano Plot",
-        annotation="Protein ID",
+        annotation=item_type,
         plot_bgcolor=colors["plot_bgcolor"],
         xaxis_gridcolor=colors["gridcolor"],
         yaxis_gridcolor=colors["gridcolor"],
     )
-    if proteins_of_interest is None:
-        proteins_of_interest = []
-    elif not isinstance(proteins_of_interest, list):
-        proteins_of_interest = [proteins_of_interest]
+    if items_of_interest is None:
+        items_of_interest = []
+    elif not isinstance(items_of_interest, list):
+        items_of_interest = [items_of_interest]
 
-    # annotate the proteins of interest permanently in the plot
-    for protein in proteins_of_interest:
+    # annotate the items of interest permanently in the plot
+    for item in items_of_interest:
         fig.add_annotation(
             x=plot_df.loc[
-                plot_df["Protein ID"] == protein,
+                plot_df[item_type] == item,
                 "log2_fold_change",
             ].values[0],
             y=-np.log10(
                 plot_df.loc[
-                    plot_df["Protein ID"] == protein,
+                    plot_df[item_type] == item,
                     "corrected_p_value",
                 ].values[0]
             ),
-            text=protein,
+            text=item,
             showarrow=True,
             arrowhead=1,
             font=dict(color=colors["annotation_text_color"]),
@@ -158,8 +160,8 @@ def create_volcano_plot(
         )
 
     new_names = {
-        "Point(s) of interest": "Significant Proteins",
-        "Dataset": "Not Significant Proteins",
+        "Point(s) of interest": f"Significant {item_type}s",
+        "Dataset": f"Not Significant {item_type}s",
     }
 
     fig.for_each_trace(
@@ -170,11 +172,11 @@ def create_volcano_plot(
     )
     fig.update_traces(
         marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[2]),
-        selector=dict(name="Significant Proteins"),
+        selector=dict(name=f"Significant {item_type}s"),
     )
     fig.update_traces(
         marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0]),
-        selector=dict(name="Not Significant Proteins"),
+        selector=dict(name=f"Not Significant {item_type}s"),
     )
 
     return dict(plots=[fig])

diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
@@ -14,6 +14,8 @@
     mann_whitney_test_on_intensity_data, mann_whitney_test_on_ptm_data)
 from protzilla.data_analysis.differential_expression_t_test import t_test
 from protzilla.data_analysis.dimension_reduction import t_sne, umap
+from protzilla.data_analysis.ptm_analysis import ptms_per_sample, \
+    ptms_per_protein_and_sample, select_peptides_of_protein
 from protzilla.data_analysis.model_evaluation import evaluate_classification_model
 from protzilla.data_analysis.plots import (
     clustergram_plot,
@@ -229,6 +231,37 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
         return inputs
 
 
+class DifferentialExpressionKruskalWallisOnIntensity(DataAnalysisStep):
+    display_name = "Kruskal-Wallis Test"
+    operation = "differential_expression"
+    method_description = ("A function to conduct a Kruskal-Wallis test between groups defined in the clinical data."
+                          "The p-values are corrected for multiple testing.")
+
+    input_keys = [
+        "protein_df",
+        "metadata_df",
+        "grouping",
+        "selected_groups",
+        "alpha",
+        "log_base",
+        "multiple_testing_correction_method",
+    ]
+    output_keys = [
+        "differentially_expressed_proteins_df",
+        "significant_proteins_df",
+        "corrected_p_values_df",
+        "corrected_alpha",
+    ]
+
+    def method(self, inputs: dict) -> dict:
+        return kruskal_wallis_test_on_intensity_data(**inputs)
+
+    def insert_dataframes(self, steps: StepManager, inputs) -> dict:
+        inputs["ptm_df"] = steps.get_step_output(Step, "ptm_df", inputs["ptm_df"])
+        inputs["metadata_df"] = steps.metadata_df
+        return inputs
+
+
 class DifferentialExpressionKruskalWallisOnIntensity(DataAnalysisStep):
     display_name = "Kruskal-Wallis Test"
     operation = "differential_expression"
@@ -294,13 +327,17 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
 class PlotVolcano(PlotStep):
     display_name = "Volcano Plot"
     operation = "plot"
+    method_description = ("Plots the results of a differential expression analysis in a volcano plot. The x-axis shows "
+                          "the log2 fold change and the y-axis shows the -log10 of the corrected p-values. The user "
+                          "can define a fold change threshold and an alpha level to highlight significant items.")
     input_keys = [
         "p_values",
         "fc_threshold",
         "alpha",
         "group1",
         "group2",
-        "proteins_of_interest",
+        "item_type",
+        "items_of_interest",
         "log2_fc",
     ]
     output_keys = []
@@ -323,6 +360,11 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
             Step, "log2_fold_change_df", inputs["input_dict"]
         )
 
+        if step.operation == "differential_expression":
+            inputs["item_type"] = "Protein ID"
+        elif step.operation == "Peptide analysis":
+            inputs["item_type"] = "PTM"
+
         return inputs
 
 

diff --git a/protzilla/methods/importing.py b/protzilla/methods/importing.py
@@ -25,9 +25,9 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
 
 
 class MaxQuantImport(ImportingStep):
-    display_name = "MaxQuant"
-    operation = "msdataimport"
-    method_description = "Import MaxQuant data"
+    display_name = "MaxQuant Protein Groups Import"
+    operation = "Protein Data Import"
+    method_description = "Import the protein groups file form output of MaxQuant"
 
     input_keys = ["file_path", "map_to_uniprot", "intensity_name", "aggregation_method"]
     output_keys = ["protein_df"]
@@ -37,8 +37,8 @@ def method(self, inputs):
 
 
 class DiannImport(ImportingStep):
-    display_name = "DIA-NN"
-    operation = "msdataimport"
+    display_name = "DIA-NN Import"
+    operation = "Protein Data Import"
     method_description = "DIA-NN data import"
 
     input_keys = ["file_path", "map_to_uniprot", "aggregation_method"]
@@ -49,9 +49,9 @@ def method(self, inputs):
 
 
 class MsFraggerImport(ImportingStep):
-    display_name = "MS Fragger"
-    operation = "msdataimport"
-    method_description = "MS Fragger data import"
+    display_name = "MS Fragger Combined Protein Import"
+    operation = "Protein Data Import"
+    method_description = "Import the combined_protein.tsv file form output of MS Fragger"
 
     input_keys = ["file_path", "intensity_name", "map_to_uniprot", "aggregation_method"]
     output_keys = ["protein_df"]
@@ -119,7 +119,7 @@ def insert_dataframes(self, steps: StepManager, inputs: dict) -> dict:
 
 
 class PeptideImport(ImportingStep):
-    display_name = "Peptide import"
+    display_name = "MaxQuant Peptide Import"
     operation = "peptide_import"
     method_description = "Import peptide data"
 
@@ -131,7 +131,7 @@ def method(self, inputs):
 
 
 class EvidenceImport(ImportingStep):
-    display_name = "Evidence import"
+    display_name = "MaxQuant Evidence Import"
     operation = "peptide_import"
     method_description = "Import an evidence file"
 

diff --git a/protzilla/runner.py b/protzilla/runner.py
@@ -96,7 +96,7 @@ def compute_workflow(self):
         self.run._run_write()
 
     def _insert_commandline_inputs(self, step):
-        if step.operation == "msdataimport":
+        if step.operation == "Protein Data Import":
             step.form_inputs["file_path"] = self.ms_data_path
 
         elif step.operation == "metadataimport":

diff --git a/protzilla/steps.py b/protzilla/steps.py
@@ -353,17 +353,20 @@ def all_steps(self) -> list[Step]:
         )
 
     def get_instance_identifiers(
-        self, step_type: type[Step], output_key: str = None
+        self, step_type: type[Step], output_key: str | list[str] = None
     ) -> list[str]:
+        if isinstance(output_key, str):
+            output_key = [output_key]
+
         instance_identifiers = [
             step.instance_identifier
             for step in self.all_steps
             if isinstance(step, step_type)
-            and (output_key is None or output_key in step.output)
+            and (output_key is None or all(k in step.output for k in output_key))
         ]
         if not instance_identifiers:
             logging.warning(
-                f"No instance identifiers found for {step_type} and output_key {output_key}"
+                f"No instance identifiers found with step type {step_type} and output_key{'s' if len(output_key) > 1 else ''} {output_key}"
             )
         return instance_identifiers
 

diff --git a/tests/protzilla/data_analysis/test_analysis_plots.py b/tests/protzilla/data_analysis/test_analysis_plots.py
@@ -89,7 +89,7 @@ def test_plots_volcano_plot_multiple_annotations(ttest_input, ttest_output, show
         log2_fc=ttest_output["log2_fold_change_df"],
         fc_threshold=0,
         alpha=0,
-        proteins_of_interest=["Protein1", "Protein2"],
+        items_of_interest=["Protein1", "Protein2"],
         group1=ttest_input["group1"],
         group2=ttest_input["group2"],
     )

diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py
@@ -7,8 +7,10 @@
     DifferentialExpressionLinearModel,
     DifferentialExpressionTTest,
     DimensionReductionUMAP,
+    DataAnalysisStep,
     PTMsPerSample,
     SelectPeptidesForProtein,
+    DifferentialExpressionMannWhitneyOnPTM,
 )
 from protzilla.methods.data_preprocessing import DataPreprocessingStep
 from protzilla.run import Run
@@ -456,28 +458,35 @@ class PlotVolcanoForm(MethodForm):
     fc_threshold = CustomNumberField(
         label="Log2 fold change threshold", min_value=0, initial=0
     )
-    proteins_of_interest = CustomMultipleChoiceField(
+    items_of_interest = CustomMultipleChoiceField(
         choices=[],
-        label="Proteins of interest (will be highlighted)",
+        label="Items of interest (will be highlighted)",
     )
 
     def fill_form(self, run: Run) -> None:
         self.fields["input_dict"].choices = fill_helper.to_choices(
             run.steps.get_instance_identifiers(
-                DifferentialExpressionTTest | DifferentialExpressionLinearModel | DifferentialExpressionMannWhitneyOnIntensityForm,
-                "differentially_expressed_proteins_df",
+                Step, ["corrected_p_values_df", "log2_fold_change_df"],
             )
         )
 
         input_dict_instance_id = self.data.get(
             "input_dict", self.fields["input_dict"].choices[0][0]
         )
 
-        proteins = run.steps.get_step_output(
+        items_of_interest = []
+        step_output = run.steps.get_step_output(
             Step, "differentially_expressed_proteins_df", input_dict_instance_id
-        )["Protein ID"].unique()
+        )
+        if step_output is not None:
+            items_of_interest = step_output["Protein ID"].unique()
+        step_output = run.steps.get_step_output(
+            Step, "differentially_expressed_ptm_df", input_dict_instance_id
+        )
+        if step_output is not None:
+            items_of_interest = step_output["PTM"].unique()
 
-        self.fields["proteins_of_interest"].choices = fill_helper.to_choices(proteins)
+        self.fields["items_of_interest"].choices = fill_helper.to_choices(items_of_interest)
 
 
 class PlotScatterPlotForm(MethodForm):
@@ -1179,7 +1188,9 @@ def fill_form(self, run: Run) -> None:
         )
         self.fields["peptide_df"].choices = fill_helper.to_choices(single_protein_peptides)
 
-        self.fields["peptide_df"].choices = fill_helper.get_choices(run, "peptide_df")[::-1]
+        self.fields["peptide_df"].choices = fill_helper.get_choices(
+            run, "peptide_df"
+        )[::-1]
 
         single_protein_peptides = run.steps.get_instance_identifiers(
             SelectPeptidesForProtein, "peptide_df"