Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

298 histogram of imputed and original values #305

Merged
merged 25 commits into from
Nov 28, 2023
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
ec0fcbd
made histograms start at zero
gritlm Nov 14, 2023
464f24d
Added overlay of Histograms
Nov 14, 2023
49d86de
added Logarithmic scale for Histograms
Nov 15, 2023
c0b7fe4
Added visual_transformation dropdown menu in graphspecification
Nov 15, 2023
b91766c
Histogram in imputation step shows only the original and the imputed …
Nov 17, 2023
ccc1401
Tests
Nov 17, 2023
798ff49
fixes Bug with bin-sizes
Nov 21, 2023
2e641d5
fixes Bug with bin-sizes
Nov 21, 2023
eeb4bd1
Merge remote-tracking branch 'origin/main' into 298-histogram-of-impu…
Nov 21, 2023
8b1e063
branch conflict
Nov 21, 2023
c4f8d2d
fix linting issues
selenabr Nov 22, 2023
d7f72bd
reformatting
selenabr Nov 22, 2023
224c226
Revert "fix linting issues"
selenabr Nov 22, 2023
5fc9478
Revert "reformatting"
selenabr Nov 22, 2023
d472ebd
Revert "Revert "fix linting issues""
selenabr Nov 22, 2023
8497496
pre-commit formatting
selenabr Nov 22, 2023
2a212f7
fixes bug with calculation of max when some values are nan on some de…
Nov 22, 2023
3b12fb2
Merge remote-tracking branch 'origin/298-histogram-of-imputed-and-ori…
Nov 22, 2023
735e34d
implemented suggested style changes including addition of docstrings
Nov 23, 2023
c8109cf
fix test error
selenabr Nov 24, 2023
f410094
Revert "fix test error"
selenabr Nov 24, 2023
ae09aae
Revert "implemented suggested style changes including addition of doc…
selenabr Nov 24, 2023
5b025d2
Revert "Revert "implemented suggested style changes including additio…
selenabr Nov 24, 2023
2808a63
Revert "Revert "fix test error""
selenabr Nov 24, 2023
40c52c7
Merge branch 'main' into 298-histogram-of-imputed-and-original-values
JanniRoebbecke Nov 28, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions protzilla/constants/workflow_meta.json
Original file line number Diff line number Diff line change
Expand Up @@ -498,6 +498,15 @@
"Protein ID"
],
"default": "None"
},
"visual_transformation": {
"name": "Visual Transformation:",
"type": "categorical",
"categories": [
"linear",
"log10"
],
"default": "linear"
}
},
{
Expand Down Expand Up @@ -545,6 +554,15 @@
"Protein ID"
],
"default": "None"
},
"visual_transformation": {
"name": "Visual Transformation:",
"type": "categorical",
"categories": [
"linear",
"log10"
],
"default": "linear"
}
},
{
Expand Down Expand Up @@ -592,6 +610,15 @@
"Protein ID"
],
"default": "None"
},
"visual_transformation": {
"name": "Visual Transformation:",
"type": "categorical",
"categories": [
"linear",
"log10"
],
"default": "linear"
}
},
{
Expand Down Expand Up @@ -642,6 +669,15 @@
"Protein ID"
],
"default": "None"
},
"visual_transformation": {
"name": "Visual Transformation:",
"type": "categorical",
"categories": [
"linear",
"log10"
],
"default": "linear"
}
},
{
Expand Down Expand Up @@ -689,6 +725,15 @@
"Protein ID"
],
"default": "None"
},
"visual_transformation": {
"name": "Visual Transformation:",
"type": "categorical",
"categories": [
"linear",
"log10"
],
"default": "linear"
}
},
{
Expand Down
123 changes: 102 additions & 21 deletions protzilla/data_preprocessing/imputation.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,6 @@ def by_normal_distribution_sampling(
transformed_df = long_to_wide(intensity_df)
# iterate over all protein groups
for protein_grp in transformed_df.columns:

number_of_nans = transformed_df[protein_grp].isnull().sum()

# don't impute values if there not enough values (> 1) to sample from
Expand Down Expand Up @@ -327,50 +326,116 @@ def by_normal_distribution_sampling(


def by_knn_plot(
df, result_df, current_out, graph_type, graph_type_quantities, group_by
df,
result_df,
current_out,
graph_type,
graph_type_quantities,
group_by,
visual_transformation,
):
return _build_box_hist_plot(
df, result_df, graph_type, graph_type_quantities, group_by
df,
result_df,
graph_type,
graph_type_quantities,
group_by,
visual_transformation,
)


def by_normal_distribution_sampling_plot(
df, result_df, current_out, graph_type, graph_type_quantities, group_by
df,
result_df,
current_out,
graph_type,
graph_type_quantities,
group_by,
visual_transformation,
):
return _build_box_hist_plot(
df, result_df, graph_type, graph_type_quantities, group_by
df,
result_df,
graph_type,
graph_type_quantities,
group_by,
visual_transformation,
)


def by_simple_imputer_plot(
df, result_df, current_out, graph_type, graph_type_quantities, group_by
df,
result_df,
current_out,
graph_type,
graph_type_quantities,
group_by,
visual_transformation,
):
return _build_box_hist_plot(
df, result_df, graph_type, graph_type_quantities, group_by
df,
result_df,
graph_type,
graph_type_quantities,
group_by,
visual_transformation,
)


def by_min_per_sample_plot(
df, result_df, current_out, graph_type, graph_type_quantities, group_by
df,
result_df,
current_out,
graph_type,
graph_type_quantities,
group_by,
visual_transformation,
):
return _build_box_hist_plot(
df, result_df, graph_type, graph_type_quantities, group_by
df,
result_df,
graph_type,
graph_type_quantities,
group_by,
visual_transformation,
)


def by_min_per_protein_plot(
df, result_df, current_out, graph_type, graph_type_quantities, group_by
df,
result_df,
current_out,
graph_type,
graph_type_quantities,
group_by,
visual_transformation,
):
return _build_box_hist_plot(
df, result_df, graph_type, graph_type_quantities, group_by
df,
result_df,
graph_type,
graph_type_quantities,
group_by,
visual_transformation,
)


def by_min_per_dataset_plot(
df, result_df, current_out, graph_type, graph_type_quantities, group_by
df,
result_df,
current_out,
graph_type,
graph_type_quantities,
group_by,
visual_transformation,
):
return _build_box_hist_plot(
df, result_df, graph_type, graph_type_quantities, group_by
df,
result_df,
graph_type,
graph_type_quantities,
group_by,
visual_transformation,
)


Expand All @@ -379,7 +444,7 @@ def number_of_imputed_values(input_df, result_df):


def _build_box_hist_plot(
df, result_df, graph_type, graph_type_quantities, group_by
df, result_df, graph_type, graph_type_quantities, group_by, visual_transformation
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not consistent with the previous format (e.g. lines 433-438).

) -> list[Figure]:
"""
This function creates two visualisations:
Expand All @@ -391,24 +456,40 @@ def _build_box_hist_plot(

2. a graph summarising the amount of
filtered proteins.

"""

intensity_name_df = df.columns[3]
intensity_name_result_df = result_df.columns[3]

imputed_df = result_df.copy()

imputed_df[intensity_name_result_df] = list(
map(
lambda x, y: y if np.isnan(x) else np.nan,
df[intensity_name_df],
result_df[intensity_name_result_df],
)
)

if graph_type == "Boxplot":
fig1 = create_box_plots(
dataframe_a=df,
dataframe_b=result_df,
name_a="Before Imputation",
name_b="After Imputation",
dataframe_b=imputed_df,
name_a="Original Values",
name_b="Imputed Values",
heading="Distribution of Protein Intensities",
group_by=group_by,
visual_transformation=visual_transformation,
)
elif graph_type == "Histogram":
fig1 = create_histograms(
dataframe_a=df,
dataframe_b=result_df,
name_a="Before Imputation",
name_b="After Imputation",
dataframe_b=imputed_df,
name_a="Original Values",
name_b="Imputed Values",
heading="Distribution of Protein Intensities",
visual_transformation=visual_transformation,
overlay=True,
)

values_of_sectors = [
Expand Down
67 changes: 61 additions & 6 deletions protzilla/data_preprocessing/plots.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.graph_objects import Figure
from plotly.subplots import make_subplots

from protzilla.data_preprocessing.plots_helper import generate_log_tics

from ..constants.colors import (
PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE,
PROTZILLA_DISCRETE_COLOR_SEQUENCE,
Expand Down Expand Up @@ -116,6 +119,7 @@ def create_box_plots(
y_title="",
x_title="",
group_by: str = "None",
visual_transformation="linear",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please specify the types for the parameters for consistency.
(e.g. visual_transformation: str = "linear")

) -> Figure:
"""
A function to create a boxplot for visualisation
Expand Down Expand Up @@ -204,6 +208,8 @@ def create_box_plots(
"yanchor": "top",
},
)
if visual_transformation == "log10":
fig.update_yaxes(type="log")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is there an inconsistency between "log10" and "log"?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"log" is the name Plotly uses for this axis type, because base 10 is the only possible (intended) logarithm in Plotly. We decided to use "log10" to make explicit which base is used.

fig.update_yaxes(rangemode="tozero")
return fig

Expand All @@ -216,6 +222,8 @@ def create_histograms(
heading="",
y_title="",
x_title="",
visual_transformation="linear",
overlay=False,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as above

) -> Figure:
"""
A function to create a histogram for visualisation
Expand All @@ -239,25 +247,71 @@ def create_histograms(
:type y_title: str
:param x_title: Optional x axis title for graphs.
:type x_title: str
:param overlay: Specifies whether to draw one Histogram with overlay or two separate histograms
:type overlay: bool
:param visual_transformation: Visual transformation of the y-axis data.
:type visual_transformation: str
:return: returns a pie or bar chart of the data
:rtype: Figure (plotly object)
"""
if visual_transformation not in {"linear", "log10"}:
raise ValueError(
f"""visual_transformation parameter must be "linear" or
"log10" but is {visual_transformation}"""
)

intensity_name_a = dataframe_a.columns[3]
intensity_name_b = dataframe_b.columns[3]
fig = make_subplots(rows=1, cols=2)

intensities_a = dataframe_a[intensity_name_a]
intensities_b = dataframe_b[intensity_name_b]

if visual_transformation == "log10":
intensities_a = intensities_a.apply(np.log10)
intensities_b = intensities_b.apply(np.log10)

min_value = min(min(intensities_a), min(intensities_b))
max_value = max(max(intensities_a), max(intensities_b))

binsize_factor = 0.0005 if visual_transformation == "linear" else 0.02

trace0 = go.Histogram(
x=dataframe_a[intensity_name_a],
x=intensities_a,
marker_color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0],
name=name_a,
xbins=dict(
start=min_value,
end=max_value,
size=(max_value - min_value) * binsize_factor,
),
)
trace1 = go.Histogram(
x=dataframe_b[intensity_name_b],
x=intensities_b,
marker_color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[1],
name=name_b,
xbins=dict(
start=min_value,
end=max_value,
size=(max_value - min_value) * binsize_factor,
),
)
fig.add_trace(trace0, 1, 1)
fig.add_trace(trace1, 1, 2)
fig.update_layout(bargap=0.2)
if not overlay:
fig = make_subplots(rows=1, cols=2)
fig.add_trace(trace0, 1, 1)
fig.add_trace(trace1, 1, 2)
if visual_transformation == "log10":
fig.update_layout(
xaxis=generate_log_tics(0, max_value),
xaxis2=generate_log_tics(0, max_value),
)
else:
fig = go.Figure()
fig.add_trace(trace0)
fig.add_trace(trace1)
fig.update_layout(barmode="overlay")
fig.update_traces(opacity=0.75)
if visual_transformation == "log10":
fig.update_layout(xaxis=generate_log_tics(0, max_value))

fig.update_layout(
xaxis_title=x_title,
Expand All @@ -276,6 +330,7 @@ def create_histograms(
},
)
fig.update_yaxes(rangemode="tozero")

return fig


Expand Down
Loading
Loading