Skip to content

Commit

Permalink
updated transform_dfs.py so that it supports peptide DFs
Browse files Browse the repository at this point in the history
  • Loading branch information
RogerAK committed Jul 10, 2024
1 parent dc9fc74 commit 286023d
Showing 1 changed file with 194 additions and 0 deletions.
194 changes: 194 additions & 0 deletions protzilla/data_analysis/prot_quant_plot_peptide.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
import pandas as pd
import plotly.graph_objects as go
from scipy import stats
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

from protzilla.utilities.transform_dfs import is_long_format, long_to_wide_retention_time

# Colour constants shared by the plot below.
# Discrete palette: traces pick their line colour from this list by index.
PROTZILLA_DISCRETE_COLOR_SEQUENCE = [
    "#636EFA",
    "#EF553B",
    "#00CC96",
    "#AB63FA",
    "#19D3F3",
    "#E763FA",
    "#FECB52",
    "#FFA15A",
    "#FF6692",
    "#B6E880",
]

# Named colours for the figure layout (background, grid, axis lines) and
# for annotations.
colors = dict(
    plot_bgcolor="white",
    gridcolor="#F1F1F1",
    linecolor="#F1F1F1",
    annotation_text_color="#ffffff",
    annotation_proteins_of_interest="#4A536A",
)

def _truncate_label(label: str, max_length: int = 15) -> str:
    """Shorten *label* for the legend, marking the cut with an ellipsis."""
    return label[:max_length] + "..." if len(label) > max_length else label


def _intensity_envelope(wide_df: pd.DataFrame) -> tuple[list, list]:
    """Return the x/y coordinates of the closed min/max intensity polygon."""
    retention_times = wide_df["Retention time"].tolist()
    # The retention time is the x axis; it must not take part in the
    # per-row intensity minimum/maximum.
    intensities = wide_df.drop(columns="Retention time")
    upper = intensities.max(axis=1).tolist()
    lower = intensities.min(axis=1).tolist()

    # Start at the first minimum, walk the maxima left to right, then the
    # minima right to left so that "toself" fills the band in between.
    polygon_x = retention_times[:1] + retention_times + retention_times[::-1]
    polygon_y = lower[:1] + upper + lower[::-1]
    return polygon_x, polygon_y


def _find_similar_groups(
    wide_df: pd.DataFrame,
    protein_group: str,
    similarity: float,
    similarity_measure: str,
) -> list:
    """Return the columns whose z-scored profile is similar to *protein_group*."""
    use_euclidean = similarity_measure == "euclidean distance"

    def _zscored_row(column):
        # Feeding a plain float array into zscore guarantees an ndarray
        # result regardless of how SciPy treats pandas objects.
        values = wide_df[column].to_numpy(dtype=float)
        return stats.zscore(values).reshape(1, -1)

    # Loop invariant: the reference profile is the same for every candidate.
    reference = _zscored_row(protein_group)

    similar_groups = []
    for candidate in wide_df.columns:
        if candidate in ("Retention time", protein_group):
            continue
        profile = _zscored_row(candidate)
        if use_euclidean:
            # Smaller distance means more similar.
            is_similar = euclidean_distances(reference, profile)[0][0] <= similarity
        else:
            # Any other measure is treated as cosine similarity, where a
            # larger value means more similar.
            is_similar = cosine_similarity(reference, profile)[0][0] >= similarity
        if is_similar:
            similar_groups.append(candidate)
    return similar_groups


def prot_quant_plot_peptide(
    input_df: pd.DataFrame,
    protein_group: str,
    similarity: float = 1.0,
    similarity_measure: str = "euclidean distance",
) -> dict:
    """
    Plot the intensity of one protein group over retention time as a line diagram.

    The selected protein group is drawn as its own line. Every other column
    whose z-scored profile is similar enough to the selected one (according to
    the chosen similarity measure and threshold) is drawn in a second colour.
    The per-row minimum and maximum over all intensity columns is drawn as a
    silver filled band in the background.

    :param input_df: A dataframe containing a 'Retention time' column. If
        ``is_long_format`` reports long format, it is converted with
        ``long_to_wide_retention_time`` first; afterwards every column other
        than 'Retention time' is treated as one intensity profile.
    :param protein_group: Column header of the protein group to highlight.
    :param similarity: Threshold for the chosen similarity measure. For
        "euclidean distance" groups with a distance <= similarity are shown,
        for "cosine similarity" groups with a similarity >= similarity.
    :param similarity_measure: Method to compare the chosen protein group with
        all others: "euclidean distance" or "cosine similarity".
    :return: A dictionary with the key "plots" holding a list with the figure.
    :raises ValueError: If 'Retention time' is missing, if the protein group is
        not a column of the wide dataframe, or if the similarity threshold is
        outside the valid range of the chosen measure.
    """
    if "Retention time" not in input_df.columns:
        raise ValueError("The input dataframe must include a 'Retention time' column.")

    # Convert first, interpolate second: in wide format each column is one
    # profile over retention time, so filling gaps along axis 0 interpolates
    # within that profile instead of across unrelated long-format rows.
    wide_df = (
        long_to_wide_retention_time(input_df)
        if is_long_format(input_df)
        else input_df
    )
    wide_df = wide_df.interpolate(method="linear", axis=0)

    if protein_group not in wide_df.columns:
        raise ValueError("Please select a valid protein group.")
    elif similarity_measure == "euclidean distance" and similarity < 0:
        raise ValueError(
            "Similarity for euclidean distance should be greater than or equal to 0."
        )
    elif similarity_measure == "cosine similarity" and (
        similarity < -1 or similarity > 1
    ):
        raise ValueError("Similarity for cosine similarity should be between -1 and 1")

    fig = go.Figure()

    # Background band spanning the intensity range of all columns.
    polygon_x, polygon_y = _intensity_envelope(wide_df)
    fig.add_trace(
        go.Scatter(
            x=polygon_x,
            y=polygon_y,
            fill="toself",
            name="Intensity Range",
            line=dict(color="silver"),
        )
    )

    similar_groups = _find_similar_groups(
        wide_df, protein_group, similarity, similarity_measure
    )
    # With many similar groups the individual legend entries are hidden and
    # replaced by one summary entry to keep the legend readable.
    show_individual_legend = len(similar_groups) <= 7
    for group in similar_groups:
        fig.add_trace(
            go.Scatter(
                x=wide_df["Retention time"],
                y=wide_df[group],
                mode="lines",
                name=_truncate_label(group),
                line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[1]),
                showlegend=show_individual_legend,
            )
        )

    if not show_individual_legend:
        # Empty trace that only contributes the summary legend entry.
        fig.add_trace(
            go.Scatter(
                x=[None],
                y=[None],
                mode="lines",
                line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[1]),
                name="Similar Protein Groups",
            )
        )

    # The selected protein group is added last so it is drawn on top.
    formatted_protein_name = _truncate_label(protein_group)
    fig.add_trace(
        go.Scatter(
            x=wide_df["Retention time"],
            y=wide_df[protein_group],
            mode="lines",
            name=formatted_protein_name,
            line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[2]),
        )
    )

    # NOTE: the x axis holds retention times, not sample names, so there is
    # no per-tick sample group to colour; the axis uses plotly's default
    # ticks and no experimental/control group legend entries are added.
    fig.update_layout(
        title=f"Intensity of {formatted_protein_name} across retention time",
        plot_bgcolor=colors["plot_bgcolor"],
        xaxis_gridcolor=colors["gridcolor"],
        yaxis_gridcolor=colors["gridcolor"],
        xaxis_linecolor=colors["linecolor"],
        yaxis_linecolor=colors["linecolor"],
        xaxis_title="Retention Time",
        yaxis_title="Intensity",
        legend_title="Legend",
        autosize=True,
        margin=dict(l=100, r=300, t=100, b=100),
        legend=dict(
            x=1.05,
            y=1,
            bgcolor="rgba(255, 255, 255, 0.5)",
            orientation="v",
        ),
    )

    return dict(plots=[fig])

0 comments on commit 286023d

Please sign in to comment.