From c4da33e77ed1bdce36f97488cd8c0dd80daf3b72 Mon Sep 17 00:00:00 2001 From: JanniRoebbecke <50395935+JanniRoebbecke@users.noreply.github.com> Date: Mon, 11 Nov 2024 16:55:02 +0100 Subject: [PATCH] Enable volcano plot with ptm data (#530) * tests for filter_proteins with peptide file * tests for filter_proteins with peptide file, better mock dataframe * output types * added testing that only filtered Proteins are removed from peptides_df * tests for filter_samples with peptide file * extract test that peptide filtering matches protein filtering into Method * refactor: shortened tests by integrating extracted method and mock data * exclude peptides * tests * minor fixes in test * minor fixes in test * fix test that peptide filtering matches protein filtering Method * fix test that peptide filtering matches protein filtering Method * fix test that peptide filtering matches protein filtering Method * fix test that peptide filtering matches protein filtering Method * match mock protein dataframe to mock peptide dataframe * extract test that peptide filtering matches protein filtering into Method * implementation * tests * tests improve error logging * tests git testing * peptide import added missing column * implements form, form_mapping, method and the actual functionality (untested) * merge peptide filtering and outlier_detection * merge peptide transformation * implement passing of peptide data between preprocessing steps * complete implement passing of peptide data between preprocessing steps * implement overhead for new step evidence import * enable selecting multiple proteins of interest and implements tests * Fix EvidenceImport class (now the right method is called) * implement option to choose protein from list of significant proteins * implement option to let the system choose the most expressed protein * include messages for peptide filtering * remove selecting multiple proteins and add substring search * add cleaning of protein groups to evidence import * 
implement suggested changes: remove default values and correct grammar mistake * make preprocessing work without peptide_df * clean up * add cleaning of protein groups to evidence import * rename * clean Protein IDs for peptide_import and name intensity column Intensity * implement requested changes: make peptide_df input optional for all preprocessing steps, remove peptide_df input in preprocessing steps, where it is not used * implement requested changes: make peptide_df input optional for all preprocessing steps, remove peptide_df input in preprocessing steps, where it is not used * unify into one method * cleanup old names * cleanup names * cleanup names * setup form and method * small refactor: utilize fill_helper better * implemented method * cleanup rename and docstring * tests and named sample column from "0" to "Sample" * implemented slow version * enable selecting multiple proteins * complete merge * rename file and fix selecting multiple proteins * improve efficiency of methods ptms_per_sample and ptms_per_protein_and_sample * added test for ptms_per_protein_and_sample * complete merge (fix tests after merge) * cleanup * implement differential expression with mann-whitney-test on ptm data * implement differential expression with mann-whitney-test on protein data * refactor input and output handling for Mann-Whitney-test on ptm data * add additional parameter from user and docstrings * fix typo * add test for mann whitney on intensity data * add test for mann whitney on ptm data * add mann whitney to volcano plot form * implement kruskal wallis test on ptm data * implement kruskal wallis test on protein data * adds tests for kruskal wallis test * implement normalization * adapt tests * add unit test for ptm_df normalization * change mock files so as not to upset other tests * enable volcano plot with ptm data * make names of select peptides for Protein Step consistent * simplify form and make names more broad * Names of import section * Names of import 
section * Add description of Volcano Plot * complete merge --------- Co-authored-by: janni.roebbecke --- protzilla/data_analysis/plots.py | 36 ++++++++------- protzilla/methods/data_analysis.py | 44 ++++++++++++++++++- protzilla/methods/importing.py | 20 ++++----- protzilla/runner.py | 2 +- protzilla/steps.py | 9 ++-- .../data_analysis/test_analysis_plots.py | 2 +- ui/runs/forms/data_analysis.py | 27 ++++++++---- 7 files changed, 99 insertions(+), 41 deletions(-) diff --git a/protzilla/data_analysis/plots.py b/protzilla/data_analysis/plots.py index 439d458f..84ae2b59 100644 --- a/protzilla/data_analysis/plots.py +++ b/protzilla/data_analysis/plots.py @@ -93,7 +93,8 @@ def create_volcano_plot( alpha: float, group1: str, group2: str, - proteins_of_interest: list | None = None, + item_type: str = "Protein ID", + items_of_interest: list | None = None, ) -> dict: """ Function to create a volcano plot from p values and log2 fold change with the @@ -105,12 +106,13 @@ def create_volcano_plot( :param alpha: the alpha value for the significance line :param group1: the name of the first group :param group2: the name of the second group - :param proteins_of_interest: the proteins that should be annotated in the plot + :param item_type: in ["Protein", "PTM"] the type of the items in the data + :param items_of_interest: the items that should be annotated in the plot :return: returns a dictionary containing a list with a plotly figure and/or a list of messages """ - plot_df = p_values.join(log2_fc.set_index("Protein ID"), on="Protein ID") + plot_df = p_values.join(log2_fc.set_index(item_type), on=item_type) fig = dashbio.VolcanoPlot( dataframe=plot_df, effect_size="log2_fold_change", @@ -122,30 +124,30 @@ def create_volcano_plot( xlabel=f"log2(fc) ({group2} / {group1})", ylabel="-log10(p)", title="Volcano Plot", - annotation="Protein ID", + annotation=item_type, plot_bgcolor=colors["plot_bgcolor"], xaxis_gridcolor=colors["gridcolor"], yaxis_gridcolor=colors["gridcolor"], ) - if 
proteins_of_interest is None: - proteins_of_interest = [] - elif not isinstance(proteins_of_interest, list): - proteins_of_interest = [proteins_of_interest] + if items_of_interest is None: + items_of_interest = [] + elif not isinstance(items_of_interest, list): + items_of_interest = [items_of_interest] - # annotate the proteins of interest permanently in the plot - for protein in proteins_of_interest: + # annotate the items of interest permanently in the plot + for item in items_of_interest: fig.add_annotation( x=plot_df.loc[ - plot_df["Protein ID"] == protein, + plot_df[item_type] == item, "log2_fold_change", ].values[0], y=-np.log10( plot_df.loc[ - plot_df["Protein ID"] == protein, + plot_df[item_type] == item, "corrected_p_value", ].values[0] ), - text=protein, + text=item, showarrow=True, arrowhead=1, font=dict(color=colors["annotation_text_color"]), @@ -158,8 +160,8 @@ def create_volcano_plot( ) new_names = { - "Point(s) of interest": "Significant Proteins", - "Dataset": "Not Significant Proteins", + "Point(s) of interest": f"Significant {item_type}s", + "Dataset": f"Not Significant {item_type}s", } fig.for_each_trace( @@ -170,11 +172,11 @@ def create_volcano_plot( ) fig.update_traces( marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[2]), - selector=dict(name="Significant Proteins"), + selector=dict(name=f"Significant {item_type}s"), ) fig.update_traces( marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0]), - selector=dict(name="Not Significant Proteins"), + selector=dict(name=f"Not Significant {item_type}s"), ) return dict(plots=[fig]) diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index 0d10a78b..511629f5 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -14,6 +14,8 @@ mann_whitney_test_on_intensity_data, mann_whitney_test_on_ptm_data) from protzilla.data_analysis.differential_expression_t_test import t_test from protzilla.data_analysis.dimension_reduction import t_sne, 
umap +from protzilla.data_analysis.ptm_analysis import ptms_per_sample, \ + ptms_per_protein_and_sample, select_peptides_of_protein from protzilla.data_analysis.model_evaluation import evaluate_classification_model from protzilla.data_analysis.plots import ( clustergram_plot, @@ -229,6 +231,37 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: return inputs +class DifferentialExpressionKruskalWallisOnIntensity(DataAnalysisStep): + display_name = "Kruskal-Wallis Test" + operation = "differential_expression" + method_description = ("A function to conduct a Kruskal-Wallis test between groups defined in the clinical data." + "The p-values are corrected for multiple testing.") + + input_keys = [ + "protein_df", + "metadata_df", + "grouping", + "selected_groups", + "alpha", + "log_base", + "multiple_testing_correction_method", + ] + output_keys = [ + "differentially_expressed_proteins_df", + "significant_proteins_df", + "corrected_p_values_df", + "corrected_alpha", + ] + + def method(self, inputs: dict) -> dict: + return kruskal_wallis_test_on_intensity_data(**inputs) + + def insert_dataframes(self, steps: StepManager, inputs) -> dict: + inputs["ptm_df"] = steps.get_step_output(Step, "ptm_df", inputs["ptm_df"]) + inputs["metadata_df"] = steps.metadata_df + return inputs + + class DifferentialExpressionKruskalWallisOnIntensity(DataAnalysisStep): display_name = "Kruskal-Wallis Test" operation = "differential_expression" @@ -294,13 +327,17 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: class PlotVolcano(PlotStep): display_name = "Volcano Plot" operation = "plot" + method_description = ("Plots the results of a differential expression analysis in a volcano plot. The x-axis shows " + "the log2 fold change and the y-axis shows the -log10 of the corrected p-values. 
The user " + "can define a fold change threshold and an alpha level to highlight significant items.") input_keys = [ "p_values", "fc_threshold", "alpha", "group1", "group2", - "proteins_of_interest", + "item_type", + "items_of_interest", "log2_fc", ] output_keys = [] @@ -323,6 +360,11 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: Step, "log2_fold_change_df", inputs["input_dict"] ) + if step.operation == "differential_expression": + inputs["item_type"] = "Protein ID" + elif step.operation == "Peptide analysis": + inputs["item_type"] = "PTM" + return inputs diff --git a/protzilla/methods/importing.py b/protzilla/methods/importing.py index 7cde1ba0..c91b40fc 100644 --- a/protzilla/methods/importing.py +++ b/protzilla/methods/importing.py @@ -25,9 +25,9 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: class MaxQuantImport(ImportingStep): - display_name = "MaxQuant" - operation = "msdataimport" - method_description = "Import MaxQuant data" + display_name = "MaxQuant Protein Groups Import" + operation = "Protein Data Import" + method_description = "Import the protein groups file form output of MaxQuant" input_keys = ["file_path", "map_to_uniprot", "intensity_name", "aggregation_method"] output_keys = ["protein_df"] @@ -37,8 +37,8 @@ def method(self, inputs): class DiannImport(ImportingStep): - display_name = "DIA-NN" - operation = "msdataimport" + display_name = "DIA-NN Import" + operation = "Protein Data Import" method_description = "DIA-NN data import" input_keys = ["file_path", "map_to_uniprot", "aggregation_method"] @@ -49,9 +49,9 @@ def method(self, inputs): class MsFraggerImport(ImportingStep): - display_name = "MS Fragger" - operation = "msdataimport" - method_description = "MS Fragger data import" + display_name = "MS Fragger Combined Protein Import" + operation = "Protein Data Import" + method_description = "Import the combined_protein.tsv file form output of MS Fragger" input_keys = ["file_path", "intensity_name", 
"map_to_uniprot", "aggregation_method"] output_keys = ["protein_df"] @@ -119,7 +119,7 @@ def insert_dataframes(self, steps: StepManager, inputs: dict) -> dict: class PeptideImport(ImportingStep): - display_name = "Peptide import" + display_name = "MaxQuant Peptide Import" operation = "peptide_import" method_description = "Import peptide data" @@ -131,7 +131,7 @@ def method(self, inputs): class EvidenceImport(ImportingStep): - display_name = "Evidence import" + display_name = "MaxQuant Evidence Import" operation = "peptide_import" method_description = "Import an evidence file" diff --git a/protzilla/runner.py b/protzilla/runner.py index 50cbf127..fa206a19 100644 --- a/protzilla/runner.py +++ b/protzilla/runner.py @@ -96,7 +96,7 @@ def compute_workflow(self): self.run._run_write() def _insert_commandline_inputs(self, step): - if step.operation == "msdataimport": + if step.operation == "Protein Data Import": step.form_inputs["file_path"] = self.ms_data_path elif step.operation == "metadataimport": diff --git a/protzilla/steps.py b/protzilla/steps.py index d5fb124e..eecf0d04 100644 --- a/protzilla/steps.py +++ b/protzilla/steps.py @@ -353,17 +353,20 @@ def all_steps(self) -> list[Step]: ) def get_instance_identifiers( - self, step_type: type[Step], output_key: str = None + self, step_type: type[Step], output_key: str | list[str] = None ) -> list[str]: + if isinstance(output_key, str): + output_key = [output_key] + instance_identifiers = [ step.instance_identifier for step in self.all_steps if isinstance(step, step_type) - and (output_key is None or output_key in step.output) + and (output_key is None or all(k in step.output for k in output_key)) ] if not instance_identifiers: logging.warning( - f"No instance identifiers found for {step_type} and output_key {output_key}" + f"No instance identifiers found with step type {step_type} and output_key{'s' if len(output_key) > 1 else ''} {output_key}" ) return instance_identifiers diff --git 
a/tests/protzilla/data_analysis/test_analysis_plots.py b/tests/protzilla/data_analysis/test_analysis_plots.py index 3b665b0b..eca37a85 100644 --- a/tests/protzilla/data_analysis/test_analysis_plots.py +++ b/tests/protzilla/data_analysis/test_analysis_plots.py @@ -89,7 +89,7 @@ def test_plots_volcano_plot_multiple_annotations(ttest_input, ttest_output, show log2_fc=ttest_output["log2_fold_change_df"], fc_threshold=0, alpha=0, - proteins_of_interest=["Protein1", "Protein2"], + items_of_interest=["Protein1", "Protein2"], group1=ttest_input["group1"], group2=ttest_input["group2"], ) diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py index b890dd5b..d16145b4 100644 --- a/ui/runs/forms/data_analysis.py +++ b/ui/runs/forms/data_analysis.py @@ -7,8 +7,10 @@ DifferentialExpressionLinearModel, DifferentialExpressionTTest, DimensionReductionUMAP, + DataAnalysisStep, PTMsPerSample, SelectPeptidesForProtein, + DifferentialExpressionMannWhitneyOnPTM, ) from protzilla.methods.data_preprocessing import DataPreprocessingStep from protzilla.run import Run @@ -456,16 +458,15 @@ class PlotVolcanoForm(MethodForm): fc_threshold = CustomNumberField( label="Log2 fold change threshold", min_value=0, initial=0 ) - proteins_of_interest = CustomMultipleChoiceField( + items_of_interest = CustomMultipleChoiceField( choices=[], - label="Proteins of interest (will be highlighted)", + label="Items of interest (will be highlighted)", ) def fill_form(self, run: Run) -> None: self.fields["input_dict"].choices = fill_helper.to_choices( run.steps.get_instance_identifiers( - DifferentialExpressionTTest | DifferentialExpressionLinearModel | DifferentialExpressionMannWhitneyOnIntensityForm, - "differentially_expressed_proteins_df", + Step, ["corrected_p_values_df", "log2_fold_change_df"], ) ) @@ -473,11 +474,19 @@ def fill_form(self, run: Run) -> None: "input_dict", self.fields["input_dict"].choices[0][0] ) - proteins = run.steps.get_step_output( + items_of_interest = [] + 
step_output = run.steps.get_step_output( Step, "differentially_expressed_proteins_df", input_dict_instance_id - )["Protein ID"].unique() + ) + if step_output is not None: + items_of_interest = step_output["Protein ID"].unique() + step_output = run.steps.get_step_output( + Step, "differentially_expressed_ptm_df", input_dict_instance_id + ) + if step_output is not None: + items_of_interest = step_output["PTM"].unique() - self.fields["proteins_of_interest"].choices = fill_helper.to_choices(proteins) + self.fields["items_of_interest"].choices = fill_helper.to_choices(items_of_interest) class PlotScatterPlotForm(MethodForm): @@ -1179,7 +1188,9 @@ def fill_form(self, run: Run) -> None: ) self.fields["peptide_df"].choices = fill_helper.to_choices(single_protein_peptides) - self.fields["peptide_df"].choices = fill_helper.get_choices(run, "peptide_df")[::-1] + self.fields["peptide_df"].choices = fill_helper.get_choices( + run, "peptide_df" + )[::-1] single_protein_peptides = run.steps.get_instance_identifiers( SelectPeptidesForProtein, "peptide_df"