From 4b940f0f7631fa69ddc69d1ea976b1cb43078f10 Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Wed, 10 Jul 2024 17:55:46 +0200
Subject: [PATCH] Implemented Protquantplot with retention time instead of
 Intensities

---
 .../data_analysis/prot_quant_plot_peptide.py  | 33 +++++++++----------
 protzilla/utilities/transform_dfs.py          | 25 ++++++++++++++
 2 files changed, 40 insertions(+), 18 deletions(-)

diff --git a/protzilla/data_analysis/prot_quant_plot_peptide.py b/protzilla/data_analysis/prot_quant_plot_peptide.py
index eeefbcfc..7df80807 100644
--- a/protzilla/data_analysis/prot_quant_plot_peptide.py
+++ b/protzilla/data_analysis/prot_quant_plot_peptide.py
@@ -23,7 +23,7 @@ def prot_quant_plot_peptide(
 ) -> dict:
     """
     A function to create a graph visualising protein quantifications across all samples
-    as a line diagram using retention time and intensity. It's possible to select one proteingroup
+    as a line diagram using retention time. It's possible to select one proteingroup
     that will be displayed in orange and choose a similarity measurement with a similarity score
     to get all proteingroups that are similar displayed in another color in this line diagram.
     All other proteingroups are displayed in the background as a grey polygon.
@@ -37,13 +37,10 @@ def prot_quant_plot_peptide(
 
     :return: returns a dictionary containing a list with a plotly figure and/or a list of messages
     """
-    # Ensure the dataframe includes retention time
-    if 'Retention time' not in input_df.columns:
-        raise ValueError("The input dataframe must include a 'Retention time' column.")
-
     wide_df = input_df.interpolate(method='linear', axis=0)
     wide_df = long_to_wide_retention_time(wide_df) if is_long_format(wide_df) else  wide_df
 
+
     if protein_group not in wide_df.columns:
         raise ValueError("Please select a valid protein group.")
     elif similarity_measure == "euclidean distance" and similarity < 0:
@@ -65,15 +62,15 @@ def prot_quant_plot_peptide(
     lower_upper_x = []
     lower_upper_y = []
 
-    lower_upper_x.append(wide_df['Retention time'].iloc[0])
+    lower_upper_x.append(wide_df.index[0])
     lower_upper_y.append(wide_df.iloc[0].min())
 
     for index, row in wide_df.iterrows():
-        lower_upper_x.append(row['Retention time'])
+        lower_upper_x.append(index)
         lower_upper_y.append(row.max())
 
     for index, row in reversed(list(wide_df.iterrows())):
-        lower_upper_x.append(row['Retention time'])
+        lower_upper_x.append(index)
         lower_upper_y.append(row.min())
 
     fig.add_trace(
@@ -81,14 +78,14 @@ def prot_quant_plot_peptide(
             x=lower_upper_x,
             y=lower_upper_y,
             fill="toself",
-            name="Intensity Range",
+            name="Retention time of all protein groups",
             line=dict(color="silver"),
         )
     )
 
     similar_groups = []
     for group_to_compare in wide_df.columns:
-        if group_to_compare not in ['Retention time', protein_group]:
+        if group_to_compare != protein_group:
             if similarity_measure == "euclidean distance":
                 distance = euclidean_distances(
                     stats.zscore(wide_df[protein_group]).values.reshape(1, -1),
@@ -109,7 +106,7 @@ def prot_quant_plot_peptide(
     for group in similar_groups:
         fig.add_trace(
             go.Scatter(
-                x=wide_df['Retention time'],
+                x=wide_df.index,
                 y=wide_df[group],
                 mode="lines",
                 name=group[:15] + "..." if len(group) > 15 else group,
@@ -134,7 +131,7 @@ def prot_quant_plot_peptide(
     )
     fig.add_trace(
         go.Scatter(
-            x=wide_df['Retention time'],
+            x=wide_df.index,
             y=wide_df[protein_group],
             mode="lines",
             name=formatted_protein_name,
@@ -163,22 +160,22 @@ def prot_quant_plot_peptide(
     )
 
     fig.update_layout(
-        title=f"Intensity of {formatted_protein_name} across retention time",
+        title=f"Retention time of {formatted_protein_name} in all samples",
         plot_bgcolor=colors["plot_bgcolor"],
         xaxis_gridcolor=colors["gridcolor"],
         yaxis_gridcolor=colors["gridcolor"],
         xaxis_linecolor=colors["linecolor"],
         yaxis_linecolor=colors["linecolor"],
-        xaxis_title="Retention Time",
-        yaxis_title="Intensity",
+        xaxis_title="Sample",
+        yaxis_title="Retention time",
         legend_title="Legend",
         xaxis=dict(
             tickmode="array",
             tickangle=0,
-            tickvals=sorted(wide_df['Retention time']),
+            tickvals=wide_df.index,
             ticktext=[
                 f"<span style='font-size: 10px; color:{color_mapping.get(label[0], 'black')}'><b>•</b></span>"
-                for label in wide_df['Retention time']
+                for label in wide_df.index
             ],
         ),
         autosize=True,
@@ -191,4 +188,4 @@ def prot_quant_plot_peptide(
         ),
     )
 
-    return dict(plots=[fig])
+    return dict(plots=[fig])
\ No newline at end of file
diff --git a/protzilla/utilities/transform_dfs.py b/protzilla/utilities/transform_dfs.py
index aa2bc6c0..0b1e7976 100644
--- a/protzilla/utilities/transform_dfs.py
+++ b/protzilla/utilities/transform_dfs.py
@@ -29,6 +29,31 @@ def long_to_wide(intensity_df: pd.DataFrame, value_name: str = None):
     intensity_df = intensity_df.fillna(intensity_df.mean())
     return intensity_df
 
+def long_to_wide_retention_time(intensity_df: pd.DataFrame, value_name: str = None):
+    """
+    Pivot a long-format dataframe to wide format: one row per "Sample",
+    one column per "Protein ID", cells holding 'Retention time'. Duplicate
+    pairs are averaged first; remaining NaNs get the column mean.
+
+    :param intensity_df: the long-format dataframe to transform
+    :type intensity_df: pd.DataFrame
+    :param value_name: currently unused; 'Retention time' is always pivoted
+
+    :return: returns dataframe in wide format, indexed by "Sample"
+        with one column per "Protein ID"
+    :rtype: pd.DataFrame
+    """
+
+    if intensity_df.duplicated(subset=["Sample", "Protein ID"]).any():
+        intensity_df = intensity_df.groupby(["Sample", "Protein ID"]).mean().reset_index()
+        intensity_df = intensity_df.dropna()  # only reached when duplicates were averaged
+
+    values_name = 'Retention time'  # NOTE(review): the value_name argument is ignored here
+    intensity_df = pd.pivot(
+        intensity_df, index="Sample", columns="Protein ID", values=values_name
+    )
+    intensity_df = intensity_df.fillna(intensity_df.mean())  # fill gaps with per-column mean
+    return intensity_df
 
 def wide_to_long(wide_df: pd.DataFrame, original_long_df: pd.DataFrame):
     """