updated transform_dfs.py so that it supports peptide DFs

cschlaffner · Jul 10, 2024 · 286023d · 286023d
1 parent dc9fc74
commit 286023d
Showing 1 changed file with 194 additions and 0 deletions.
diff --git a/protzilla/data_analysis/prot_quant_plot_peptide.py b/protzilla/data_analysis/prot_quant_plot_peptide.py
@@ -0,0 +1,194 @@
+import pandas as pd
+import plotly.graph_objects as go
+from scipy import stats
+from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
+
+from protzilla.utilities.transform_dfs import is_long_format, long_to_wide_retention_time
+
+# Colour constants shared by every trace and by the figure layout.
+# The sequence is indexed positionally by the plotting function below.
+PROTZILLA_DISCRETE_COLOR_SEQUENCE = [
+    "#636EFA",
+    "#EF553B",
+    "#00CC96",
+    "#AB63FA",
+    "#19D3F3",
+    "#E763FA",
+    "#FECB52",
+    "#FFA15A",
+    "#FF6692",
+    "#B6E880",
+]
+# Named colours for the figure background, grid, axis lines and annotations.
+colors = dict(
+    plot_bgcolor="white",
+    gridcolor="#F1F1F1",
+    linecolor="#F1F1F1",
+    annotation_text_color="#ffffff",
+    annotation_proteins_of_interest="#4A536A",
+)
+
+def _truncate_label(label: str, max_length: int = 15) -> str:
+    """Shorten ``label`` to ``max_length`` characters plus "..." for legend/title use."""
+    return label[:max_length] + "..." if len(label) > max_length else label
+
+
+def prot_quant_plot_peptide(
+    input_df: pd.DataFrame,
+    protein_group: str,
+    similarity: float = 1.0,
+    similarity_measure: str = "euclidean distance",
+) -> dict:
+    """
+    Plot the intensity of one protein group across retention time as a line diagram.
+
+    The selected protein group is drawn as its own line. Every other column whose
+    z-scored intensity profile is similar enough to the selected one (according to
+    ``similarity_measure`` and ``similarity``) is drawn as an additional line in a
+    second color. The per-row minimum and maximum intensity over all columns except
+    'Retention time' are drawn as a filled silver polygon in the background.
+
+    :param input_df: A dataframe that contains a 'Retention time' column. If
+        ``is_long_format`` reports it as long format, it is converted with
+        ``long_to_wide_retention_time``; otherwise it is used as is, with one column
+        per protein group. Missing values are linearly interpolated along the rows
+        of the wide dataframe.
+    :param protein_group: Column header of the protein group to highlight.
+    :param similarity: Threshold for the chosen similarity measure. For
+        "euclidean distance" a column is similar if its distance is less than or
+        equal to this value (must be >= 0); for "cosine similarity" a column is
+        similar if its similarity is greater than or equal to this value (must be
+        between -1 and 1).
+    :param similarity_measure: "euclidean distance" or "cosine similarity". Any
+        value other than "euclidean distance" is evaluated as cosine similarity.
+
+    :return: A dictionary with the key "plots" holding a list with one plotly figure.
+    :raises ValueError: If the 'Retention time' column is missing, the similarity
+        threshold is out of range for the chosen measure, the dataframe has no
+        rows, or ``protein_group`` is not a column of the wide dataframe.
+    """
+    # Ensure the dataframe includes retention time
+    if 'Retention time' not in input_df.columns:
+        raise ValueError("The input dataframe must include a 'Retention time' column.")
+
+    # Validate the scalar arguments before doing any work on the dataframe.
+    if similarity_measure == "euclidean distance" and similarity < 0:
+        raise ValueError(
+            "Similarity for euclidean distance should be greater than or equal to 0."
+        )
+    if similarity_measure == "cosine similarity" and not -1 <= similarity <= 1:
+        raise ValueError("Similarity for cosine similarity should be between -1 and 1")
+
+    # Convert first, interpolate afterwards: interpolating along the rows only makes
+    # sense once each column holds the values of a single protein group.
+    wide_df = (
+        long_to_wide_retention_time(input_df) if is_long_format(input_df) else input_df
+    )
+    wide_df = wide_df.interpolate(method='linear', axis=0)
+
+    if wide_df.empty:
+        raise ValueError("The input dataframe must contain at least one row.")
+    if protein_group not in wide_df.columns:
+        raise ValueError("Please select a valid protein group.")
+
+    fig = go.Figure()
+
+    color_mapping = {
+        "A": PROTZILLA_DISCRETE_COLOR_SEQUENCE[0],
+        "C": PROTZILLA_DISCRETE_COLOR_SEQUENCE[1],
+    }
+
+    retention_time = wide_df['Retention time']
+
+    # Background polygon: walk the upper envelope left to right, then the lower
+    # envelope right to left. The retention time column is excluded so that the
+    # x-axis values do not distort the intensity range.
+    intensities = wide_df.drop(columns='Retention time')
+    upper_bound = intensities.max(axis=1)
+    lower_bound = intensities.min(axis=1)
+    lower_upper_x = (
+        [retention_time.iloc[0]] + list(retention_time) + list(retention_time[::-1])
+    )
+    lower_upper_y = (
+        [lower_bound.iloc[0]] + list(upper_bound) + list(lower_bound[::-1])
+    )
+
+    fig.add_trace(
+        go.Scatter(
+            x=lower_upper_x,
+            y=lower_upper_y,
+            fill="toself",
+            name="Intensity Range",
+            line=dict(color="silver"),
+        )
+    )
+
+    # The z-scored profile of the selected group is loop-invariant, compute it once.
+    # Passing a plain array keeps the result an array regardless of scipy version.
+    use_euclidean = similarity_measure == "euclidean distance"
+    reference_profile = stats.zscore(wide_df[protein_group].to_numpy()).reshape(1, -1)
+
+    similar_groups = []
+    for group_to_compare in wide_df.columns:
+        if group_to_compare in ('Retention time', protein_group):
+            continue
+        candidate_profile = stats.zscore(
+            wide_df[group_to_compare].to_numpy()
+        ).reshape(1, -1)
+        if use_euclidean:
+            # Smaller distance means more similar.
+            is_similar = (
+                euclidean_distances(reference_profile, candidate_profile)[0][0]
+                <= similarity
+            )
+        else:
+            # Larger cosine similarity means more similar.
+            is_similar = (
+                cosine_similarity(reference_profile, candidate_profile)[0][0]
+                >= similarity
+            )
+        if is_similar:
+            similar_groups.append(group_to_compare)
+
+    # With many similar groups the individual legend entries are hidden and a
+    # single summary entry is shown instead.
+    show_individual_legend = len(similar_groups) <= 7
+    for group in similar_groups:
+        fig.add_trace(
+            go.Scatter(
+                x=retention_time,
+                y=wide_df[group],
+                mode="lines",
+                name=_truncate_label(group),
+                line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[1]),
+                showlegend=show_individual_legend,
+            )
+        )
+
+    if not show_individual_legend:
+        fig.add_trace(
+            go.Scatter(
+                x=[None],
+                y=[None],
+                mode="lines",
+                line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[1]),
+                name="Similar Protein Groups",
+            )
+        )
+
+    formatted_protein_name = _truncate_label(protein_group)
+    fig.add_trace(
+        go.Scatter(
+            x=retention_time,
+            y=wide_df[protein_group],
+            mode="lines",
+            name=formatted_protein_name,
+            line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[2]),
+        )
+    )
+
+    # Legend-only entries (no data points) for the tick marker colors.
+    # NOTE(review): these group markers key on the first character of the tick
+    # label ("A"/"C"); confirm they are still wanted on a retention time axis.
+    fig.add_trace(
+        go.Scatter(
+            x=[None],
+            y=[None],
+            mode="markers",
+            marker=dict(color=color_mapping.get("A")),
+            name="Experimental Group",
+        )
+    )
+
+    fig.add_trace(
+        go.Scatter(
+            x=[None],
+            y=[None],
+            mode="markers",
+            marker=dict(color=color_mapping.get("C")),
+            name="Control Group",
+        )
+    )
+
+    # Tick values and tick texts are built from the same sorted sequence so they
+    # stay aligned; str() keeps numeric retention times from raising on indexing.
+    sorted_retention_times = sorted(retention_time)
+    fig.update_layout(
+        title=f"Intensity of {formatted_protein_name} across retention time",
+        plot_bgcolor=colors["plot_bgcolor"],
+        xaxis_gridcolor=colors["gridcolor"],
+        yaxis_gridcolor=colors["gridcolor"],
+        xaxis_linecolor=colors["linecolor"],
+        yaxis_linecolor=colors["linecolor"],
+        xaxis_title="Retention Time",
+        yaxis_title="Intensity",
+        legend_title="Legend",
+        xaxis=dict(
+            tickmode="array",
+            tickangle=0,
+            tickvals=sorted_retention_times,
+            ticktext=[
+                f"<span style='font-size: 10px; color:{color_mapping.get(str(label)[:1], 'black')}'><b>•</b></span>"
+                for label in sorted_retention_times
+            ],
+        ),
+        autosize=True,
+        margin=dict(l=100, r=300, t=100, b=100),
+        legend=dict(
+            x=1.05,
+            y=1,
+            bgcolor="rgba(255, 255, 255, 0.5)",
+            orientation="v",
+        ),
+    )
+
+    return dict(plots=[fig])