Skip to content

Commit

Permalink
Enable volcano plot with ptm data (#530)
Browse files Browse the repository at this point in the history
* tests for filter_proteins with peptide file

* tests for filter_proteins with peptide file, better mock dataframe

* output types

* added testing that only filtered Proteins are removed from peptides_df

* tests for filter_samples with peptide file

* extract test that peptide filtering matches protein filtering into Method

* refactor: shortened tests by integrating extracted method and mock data

* exclude peptides

* tests

* minor fixes in test

* minor fixes in test

* fix test that peptide filtering matches protein filtering Method

* fix test that peptide filtering matches protein filtering Method

* fix test that peptide filtering matches protein filtering Method

* fix test that peptide filtering matches protein filtering Method

* match mock protein dataframe to mock peptide dataframe

* extract test that peptide filtering matches protein filtering into Method

* implementation

* tests

* tests improve error logging

* tests git testing

* peptide import added missing column

* implements form, form_mapping, method and the actual functionality (untested)

* merge peptide filtering and outlier_detection

* merge peptide transformation

* implement passing of peptide data between preprocessing steps

* complete implementation of passing peptide data between preprocessing steps

* implement overhead for new step evidence import

* enable selecting multiple proteins of interest and implements tests

* Fix EvidenceImport class (now the right method is called)

* implement option to choose protein from list of significant proteins

* implement option to let the system choose the most expressed protein

* include messages for peptide filtering

* remove selecting multiple proteins and add substring search

* add cleaning of protein groups to evidence import

* implement suggested changes: remove default values and correct grammar mistake

* make preprocessing work without peptide_df

* clean up

* add cleaning of protein groups to evidence import

* rename

* clean Protein IDs for peptide_import and name intensity column Intensity

* implement requested changes: make peptide_df input optional for all preprocessing steps, remove peptide_df input in preprocessing steps, where it is not used

* implement requested changes: make peptide_df input optional for all preprocessing steps, remove peptide_df input in preprocessing steps, where it is not used

* unify into one method

* cleanup old names

* cleanup names

* cleanup names

* setup form and method

* small refactor: utilize fill_helper better

* implemented method

* cleanup rename and docstring

* tests and named sample column from "0" to "Sample"

* implemented slow version

* enable selecting multiple proteins

* complete merge

* rename file and fix selecting multiple proteins

* improve efficiency of methods ptms_per_sample and ptms_per_protein_and_sample

* added test for ptms_per_protein_and_sample

* complete merge (fix tests after merge)

* cleanup

* implement differential expression with mann-whitney-test on ptm data

* implement differential expression with mann-whitney-test on protein data

* refactor input and output handling for Mann-Whitney-test on ptm data

* add additional parameter from user and docstrings

* fix typo

* add test for mann whitney on intensity data

* add test for mann whitney on ptm data

* add mann whitney to volcano plot form

* implement kruskal wallis test on ptm data

* implement kruskal wallis test on protein data

* adds tests for kruskal wallis test

* implement normalization

* adapt tests

* add unit test for ptm_df normalization

* change mock files as to not upset other tests

* enable volcano plot with ptm data

* make names of select peptides for Protein Step consistent

* simplify form and make names more broad

* Names of import section

* Names of import section

* Add description of Volcano Plot

* complete merge

---------

Co-authored-by: janni.roebbecke <[email protected]>
  • Loading branch information
JanniRoebbecke and janni.roebbecke authored Nov 11, 2024
1 parent 1924c0c commit c4da33e
Show file tree
Hide file tree
Showing 7 changed files with 99 additions and 41 deletions.
36 changes: 19 additions & 17 deletions protzilla/data_analysis/plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,8 @@ def create_volcano_plot(
alpha: float,
group1: str,
group2: str,
proteins_of_interest: list | None = None,
item_type: str = "Protein ID",
items_of_interest: list | None = None,
) -> dict:
"""
Function to create a volcano plot from p values and log2 fold change with the
Expand All @@ -105,12 +106,13 @@ def create_volcano_plot(
:param alpha: the alpha value for the significance line
:param group1: the name of the first group
:param group2: the name of the second group
:param proteins_of_interest: the proteins that should be annotated in the plot
:param item_type: in ["Protein", "PTM"] the type of the items in the data
:param items_of_interest: the items that should be annotated in the plot
:return: returns a dictionary containing a list with a plotly figure and/or a list of messages
"""

plot_df = p_values.join(log2_fc.set_index("Protein ID"), on="Protein ID")
plot_df = p_values.join(log2_fc.set_index(item_type), on=item_type)
fig = dashbio.VolcanoPlot(
dataframe=plot_df,
effect_size="log2_fold_change",
Expand All @@ -122,30 +124,30 @@ def create_volcano_plot(
xlabel=f"log2(fc) ({group2} / {group1})",
ylabel="-log10(p)",
title="Volcano Plot",
annotation="Protein ID",
annotation=item_type,
plot_bgcolor=colors["plot_bgcolor"],
xaxis_gridcolor=colors["gridcolor"],
yaxis_gridcolor=colors["gridcolor"],
)
if proteins_of_interest is None:
proteins_of_interest = []
elif not isinstance(proteins_of_interest, list):
proteins_of_interest = [proteins_of_interest]
if items_of_interest is None:
items_of_interest = []
elif not isinstance(items_of_interest, list):
items_of_interest = [items_of_interest]

# annotate the proteins of interest permanently in the plot
for protein in proteins_of_interest:
# annotate the items of interest permanently in the plot
for item in items_of_interest:
fig.add_annotation(
x=plot_df.loc[
plot_df["Protein ID"] == protein,
plot_df[item_type] == item,
"log2_fold_change",
].values[0],
y=-np.log10(
plot_df.loc[
plot_df["Protein ID"] == protein,
plot_df[item_type] == item,
"corrected_p_value",
].values[0]
),
text=protein,
text=item,
showarrow=True,
arrowhead=1,
font=dict(color=colors["annotation_text_color"]),
Expand All @@ -158,8 +160,8 @@ def create_volcano_plot(
)

new_names = {
"Point(s) of interest": "Significant Proteins",
"Dataset": "Not Significant Proteins",
"Point(s) of interest": f"Significant {item_type}s",
"Dataset": f"Not Significant {item_type}s",
}

fig.for_each_trace(
Expand All @@ -170,11 +172,11 @@ def create_volcano_plot(
)
fig.update_traces(
marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[2]),
selector=dict(name="Significant Proteins"),
selector=dict(name=f"Significant {item_type}s"),
)
fig.update_traces(
marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0]),
selector=dict(name="Not Significant Proteins"),
selector=dict(name=f"Not Significant {item_type}s"),
)

return dict(plots=[fig])
Expand Down
44 changes: 43 additions & 1 deletion protzilla/methods/data_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
mann_whitney_test_on_intensity_data, mann_whitney_test_on_ptm_data)
from protzilla.data_analysis.differential_expression_t_test import t_test
from protzilla.data_analysis.dimension_reduction import t_sne, umap
from protzilla.data_analysis.ptm_analysis import ptms_per_sample, \
ptms_per_protein_and_sample, select_peptides_of_protein
from protzilla.data_analysis.model_evaluation import evaluate_classification_model
from protzilla.data_analysis.plots import (
clustergram_plot,
Expand Down Expand Up @@ -229,6 +231,37 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
return inputs


class DifferentialExpressionKruskalWallisOnIntensity(DataAnalysisStep):
display_name = "Kruskal-Wallis Test"
operation = "differential_expression"
method_description = ("A function to conduct a Kruskal-Wallis test between groups defined in the clinical data."
"The p-values are corrected for multiple testing.")

input_keys = [
"protein_df",
"metadata_df",
"grouping",
"selected_groups",
"alpha",
"log_base",
"multiple_testing_correction_method",
]
output_keys = [
"differentially_expressed_proteins_df",
"significant_proteins_df",
"corrected_p_values_df",
"corrected_alpha",
]

def method(self, inputs: dict) -> dict:
return kruskal_wallis_test_on_intensity_data(**inputs)

def insert_dataframes(self, steps: StepManager, inputs) -> dict:
inputs["ptm_df"] = steps.get_step_output(Step, "ptm_df", inputs["ptm_df"])
inputs["metadata_df"] = steps.metadata_df
return inputs


class DifferentialExpressionKruskalWallisOnIntensity(DataAnalysisStep):
display_name = "Kruskal-Wallis Test"
operation = "differential_expression"
Expand Down Expand Up @@ -294,13 +327,17 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
class PlotVolcano(PlotStep):
display_name = "Volcano Plot"
operation = "plot"
method_description = ("Plots the results of a differential expression analysis in a volcano plot. The x-axis shows "
"the log2 fold change and the y-axis shows the -log10 of the corrected p-values. The user "
"can define a fold change threshold and an alpha level to highlight significant items.")
input_keys = [
"p_values",
"fc_threshold",
"alpha",
"group1",
"group2",
"proteins_of_interest",
"item_type",
"items_of_interest",
"log2_fc",
]
output_keys = []
Expand All @@ -323,6 +360,11 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
Step, "log2_fold_change_df", inputs["input_dict"]
)

if step.operation == "differential_expression":
inputs["item_type"] = "Protein ID"
elif step.operation == "Peptide analysis":
inputs["item_type"] = "PTM"

return inputs


Expand Down
20 changes: 10 additions & 10 deletions protzilla/methods/importing.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,9 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:


class MaxQuantImport(ImportingStep):
display_name = "MaxQuant"
operation = "msdataimport"
method_description = "Import MaxQuant data"
display_name = "MaxQuant Protein Groups Import"
operation = "Protein Data Import"
method_description = "Import the protein groups file form output of MaxQuant"

input_keys = ["file_path", "map_to_uniprot", "intensity_name", "aggregation_method"]
output_keys = ["protein_df"]
Expand All @@ -37,8 +37,8 @@ def method(self, inputs):


class DiannImport(ImportingStep):
display_name = "DIA-NN"
operation = "msdataimport"
display_name = "DIA-NN Import"
operation = "Protein Data Import"
method_description = "DIA-NN data import"

input_keys = ["file_path", "map_to_uniprot", "aggregation_method"]
Expand All @@ -49,9 +49,9 @@ def method(self, inputs):


class MsFraggerImport(ImportingStep):
display_name = "MS Fragger"
operation = "msdataimport"
method_description = "MS Fragger data import"
display_name = "MS Fragger Combined Protein Import"
operation = "Protein Data Import"
method_description = "Import the combined_protein.tsv file form output of MS Fragger"

input_keys = ["file_path", "intensity_name", "map_to_uniprot", "aggregation_method"]
output_keys = ["protein_df"]
Expand Down Expand Up @@ -119,7 +119,7 @@ def insert_dataframes(self, steps: StepManager, inputs: dict) -> dict:


class PeptideImport(ImportingStep):
display_name = "Peptide import"
display_name = "MaxQuant Peptide Import"
operation = "peptide_import"
method_description = "Import peptide data"

Expand All @@ -131,7 +131,7 @@ def method(self, inputs):


class EvidenceImport(ImportingStep):
display_name = "Evidence import"
display_name = "MaxQuant Evidence Import"
operation = "peptide_import"
method_description = "Import an evidence file"

Expand Down
2 changes: 1 addition & 1 deletion protzilla/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def compute_workflow(self):
self.run._run_write()

def _insert_commandline_inputs(self, step):
if step.operation == "msdataimport":
if step.operation == "Protein Data Import":
step.form_inputs["file_path"] = self.ms_data_path

elif step.operation == "metadataimport":
Expand Down
9 changes: 6 additions & 3 deletions protzilla/steps.py
Original file line number Diff line number Diff line change
Expand Up @@ -353,17 +353,20 @@ def all_steps(self) -> list[Step]:
)

def get_instance_identifiers(
self, step_type: type[Step], output_key: str = None
self, step_type: type[Step], output_key: str | list[str] = None
) -> list[str]:
if isinstance(output_key, str):
output_key = [output_key]

instance_identifiers = [
step.instance_identifier
for step in self.all_steps
if isinstance(step, step_type)
and (output_key is None or output_key in step.output)
and (output_key is None or all(k in step.output for k in output_key))
]
if not instance_identifiers:
logging.warning(
f"No instance identifiers found for {step_type} and output_key {output_key}"
f"No instance identifiers found with step type {step_type} and output_key{'s' if len(output_key) > 1 else ''} {output_key}"
)
return instance_identifiers

Expand Down
2 changes: 1 addition & 1 deletion tests/protzilla/data_analysis/test_analysis_plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def test_plots_volcano_plot_multiple_annotations(ttest_input, ttest_output, show
log2_fc=ttest_output["log2_fold_change_df"],
fc_threshold=0,
alpha=0,
proteins_of_interest=["Protein1", "Protein2"],
items_of_interest=["Protein1", "Protein2"],
group1=ttest_input["group1"],
group2=ttest_input["group2"],
)
Expand Down
27 changes: 19 additions & 8 deletions ui/runs/forms/data_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,10 @@
DifferentialExpressionLinearModel,
DifferentialExpressionTTest,
DimensionReductionUMAP,
DataAnalysisStep,
PTMsPerSample,
SelectPeptidesForProtein,
DifferentialExpressionMannWhitneyOnPTM,
)
from protzilla.methods.data_preprocessing import DataPreprocessingStep
from protzilla.run import Run
Expand Down Expand Up @@ -456,28 +458,35 @@ class PlotVolcanoForm(MethodForm):
fc_threshold = CustomNumberField(
label="Log2 fold change threshold", min_value=0, initial=0
)
proteins_of_interest = CustomMultipleChoiceField(
items_of_interest = CustomMultipleChoiceField(
choices=[],
label="Proteins of interest (will be highlighted)",
label="Items of interest (will be highlighted)",
)

def fill_form(self, run: Run) -> None:
self.fields["input_dict"].choices = fill_helper.to_choices(
run.steps.get_instance_identifiers(
DifferentialExpressionTTest | DifferentialExpressionLinearModel | DifferentialExpressionMannWhitneyOnIntensityForm,
"differentially_expressed_proteins_df",
Step, ["corrected_p_values_df", "log2_fold_change_df"],
)
)

input_dict_instance_id = self.data.get(
"input_dict", self.fields["input_dict"].choices[0][0]
)

proteins = run.steps.get_step_output(
items_of_interest = []
step_output = run.steps.get_step_output(
Step, "differentially_expressed_proteins_df", input_dict_instance_id
)["Protein ID"].unique()
)
if step_output is not None:
items_of_interest = step_output["Protein ID"].unique()
step_output = run.steps.get_step_output(
Step, "differentially_expressed_ptm_df", input_dict_instance_id
)
if step_output is not None:
items_of_interest = step_output["PTM"].unique()

self.fields["proteins_of_interest"].choices = fill_helper.to_choices(proteins)
self.fields["items_of_interest"].choices = fill_helper.to_choices(items_of_interest)


class PlotScatterPlotForm(MethodForm):
Expand Down Expand Up @@ -1179,7 +1188,9 @@ def fill_form(self, run: Run) -> None:
)
self.fields["peptide_df"].choices = fill_helper.to_choices(single_protein_peptides)

self.fields["peptide_df"].choices = fill_helper.get_choices(run, "peptide_df")[::-1]
self.fields["peptide_df"].choices = fill_helper.get_choices(
run, "peptide_df"
)[::-1]

single_protein_peptides = run.steps.get_instance_identifiers(
SelectPeptidesForProtein, "peptide_df"
Expand Down

0 comments on commit c4da33e

Please sign in to comment.