Resolved some comments from Hendrik

cschlaffner · Sep 14, 2024 · 25c37d6 · 25c37d6
1 parent aaeed09
commit 25c37d6
Show file tree

Hide file tree

Showing 4 changed files with 59 additions and 72 deletions.
diff --git a/protzilla/data_analysis/time_series_helper.py b/protzilla/data_analysis/time_series_helper.py
@@ -6,7 +6,10 @@ def convert_time_to_hours(time_str):
     :param time_str: The time string to convert in format '%H:%M:%S'
 
     :return: Number of hours since midnight as a float
+    """
+
     """
     time_obj = datetime.strptime(time_str, '%H:%M:%S')
     hours_since_midnight = time_obj.hour + time_obj.minute / 60 + time_obj.second / 3600
-    return hours_since_midnight
+    """
+    return time_str
diff --git a/protzilla/data_analysis/time_series_regression_analysis.py b/protzilla/data_analysis/time_series_regression_analysis.py
@@ -4,7 +4,7 @@
 import pandas as pd
 import plotly.graph_objects as go
 
-from protzilla.data_analysis.time_series_helper import convert_time_to_hours
+#from protzilla.data_analysis.time_series_helper import convert_time_to_hours
 from protzilla.utilities import default_intensity_column
 from protzilla.constants.colors import PROTZILLA_DISCRETE_COLOR_SEQUENCE
 
@@ -29,10 +29,10 @@ def time_series_linear_regression(
         intensity_df: pd.DataFrame,
         metadata_df: pd.DataFrame,
         time_column_name: str,
-        protein_group: str,
         train_size: float,
+        protein_group: str,
+        grouping: str,
         grouping_column_name: str,
-        grouping: str = None,
 ):
     """
     Perform linear regression on the time series data for a given protein group.
@@ -59,8 +59,6 @@ def time_series_linear_regression(
         copy=False,
     )
 
-    intensity_df[time_column_name] = intensity_df[str(time_column_name)].apply(convert_time_to_hours)
-
     intensity_df = intensity_df.sample(frac=1, random_state = 42).reset_index(drop=True)
 
     X = intensity_df[[time_column_name]]
@@ -222,8 +220,8 @@ def time_series_ransac_regression(
         stop_probability: float,
         loss: str,
         train_size: float,
-        grouping_column_name: str,
         grouping: str,
+        grouping_column_name: str,
 ):
     """
     Perform RANSAC regression on the time series data for a given protein group.
@@ -255,8 +253,6 @@ def time_series_ransac_regression(
         copy=False,
     )
 
-    intensity_df[time_column_name] = intensity_df[str(time_column_name)].apply(convert_time_to_hours)
-
     intensity_df = intensity_df.sample(frac=1, random_state = 42).reset_index(drop=True)
 
     X = intensity_df[[time_column_name]]
@@ -298,7 +294,7 @@ def time_series_ransac_regression(
                 x=plot_df[time_column_name],
                 y=plot_df['Intensity'],
                 mode='markers',
-                name=f'Actual Intensity ({group})',
+                name=f'Inliers ({group})',
                 marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index])
             ), row=1, col=1)
 
@@ -354,7 +350,7 @@ def time_series_ransac_regression(
             x=plot_df[time_column_name],
             y=plot_df['Intensity'],
             mode='markers',
-            name='Actual Intensity',
+            name='Inliers',
             marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0])
         ), row=1, col=1)
 
@@ -507,8 +503,8 @@ def time_series_auto_arima(
     seasonal: str,
     m: int,
     train_size: float,
-    grouping_column_name: str,
     grouping: str,
+    grouping_column_name: str,
 ) -> dict:
     """
     Perform an automatic ARIMA model selection on the time series data for a given protein group.
@@ -526,6 +522,7 @@ def time_series_auto_arima(
     """
 
     color_index = 0
+    messages = []
 
     if train_size < 0 or train_size > 1:
         raise ValueError("Train size should be between 0 and 1")
@@ -553,8 +550,6 @@ def time_series_auto_arima(
         for group in groups:
             group_df = intensity_df[intensity_df[grouping_column_name] == group]
 
-            group_df[time_column_name] = group_df[str(time_column_name)].apply(convert_time_to_hours)
-
             train_df_size = int(len(group_df) * train_size)
             train_df, test_df = group_df[:train_df_size], group_df[train_df_size:]
 
@@ -575,23 +570,6 @@ def time_series_auto_arima(
             # Forecast the test set
             forecast = model.predict(n_periods=test_df.shape[0])
             parameters = model.get_params()
-            aa_order = parameters['order']
-            aa_seasonal_order = parameters['seasonal_order']
-            messages = []
-
-            messages.append(
-                {
-                    "level": logging.INFO,
-                    "msg": f"Auto Arima Order (p,d,q): {aa_order}.",
-                }
-            )
-            if seasonal:
-                messages.append(
-                    {
-                        "level": logging.INFO,
-                        "msg": f"Auto Arima Seasonal Order (P,D,Q,s): {aa_seasonal_order}.",
-                    }
-                )
 
             test_rmse = np.sqrt(mean_squared_error(test_df, forecast))
             test_r2 = r2_score(test_df, forecast)
@@ -635,10 +613,24 @@ def time_series_auto_arima(
                 'train_r2_score': train_r2,
                 'test_r2_score': test_r2,
             })
+        aa_order = parameters['order']
+        aa_seasonal_order = parameters['seasonal_order']
 
-    else:
-        intensity_df[time_column_name] = intensity_df[str(time_column_name)].apply(convert_time_to_hours)
+        messages.append(
+            {
+                "level": logging.INFO,
+                "msg": f"Auto Arima Order (p,d,q): {aa_order}.",
+            }
+        )
+        if seasonal:
+            messages.append(
+                {
+                    "level": logging.INFO,
+                    "msg": f"Auto Arima Seasonal Order (P,D,Q,s): {aa_seasonal_order}.",
+                }
+            )
 
+    else:
         train_size = int(len(intensity_df) * train_size)
         train_df, test_df = intensity_df[:train_size], intensity_df[train_size:]
 
@@ -662,7 +654,6 @@ def time_series_auto_arima(
 
         aa_order = parameters['order']
         aa_seasonal_order = parameters['seasonal_order']
-        messages = []
 
         messages.append(
             {
@@ -764,6 +755,7 @@ def time_series_auto_arima(
     return dict(
         scores=scores,
         plots=[fig],
+        messages=messages,
     )
 
 
@@ -781,8 +773,8 @@ def time_series_arima(
     Q: int,
     s: int,
     train_size: float,
-    grouping_column_name: str,
     grouping: str,
+    grouping_column_name: str,
 ) -> dict:
 
     """
@@ -825,8 +817,6 @@ def time_series_arima(
         for group in groups:
             group_df = intensity_df[intensity_df[grouping_column_name] == group]
 
-            group_df[time_column_name] = group_df[str(time_column_name)].apply(convert_time_to_hours)
-
             train_df_size = int(len(group_df) * train_size)
             train_df, test_df = group_df[:train_df_size], group_df[train_df_size:]
 
@@ -893,8 +883,6 @@ def time_series_arima(
             })
 
     else:
-        intensity_df[time_column_name] = intensity_df[str(time_column_name)].apply(convert_time_to_hours)
-
         train_size = int(len(intensity_df) * train_size)
         train_df, test_df = intensity_df[:train_size], intensity_df[train_size:]
 

diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
@@ -815,8 +815,8 @@ class TimeSeriesLinearRegression(PlotStep):
         "time_column_name",
         "protein_group",
         "train_size",
-        "grouping_column_name",
         "grouping",
+        "grouping_column_name",
     ]
     output_keys = [
         "scores",
@@ -845,8 +845,8 @@ class TimeSeriesRANSACRegression(PlotStep):
         "stop_probability",
         "loss",
         "train_size",
-        "grouping_column_name",
         "grouping",
+        "grouping_column_name",
     ]
     output_keys = [
         "scores",
@@ -863,7 +863,15 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
 class TimeSeriesADFullerTest(DataAnalysisStep):
     display_name = "Augmented Dickey-Fuller Test"
     operation = "Time series analysis"
-    method_description = "Perform Augmented Dickey-Fuller test on the time series data for a given protein group."
+    method_description = (
+        "The Augmented Dickey-Fuller test is a type of statistical test called a unit root test. The test "
+        "determines how strongly a time series is defined by a trend. The null hypothesis of the test is that the "
+         "time series can be represented by a unit root, which implies that the time series is not stationary. "
+         "The alternative hypothesis is that the time series is stationary. If the p-value is less than the "
+          "significance level, the null hypothesis can be rejected and the time series is considered stationary."
+          "Dickey, D. & Fuller, Wayne. (1979). Distribution of the Estimators for Autoregressive Time Series With a Unit Root. "
+          "JASA. Journal of the American Statistical Association. 74. 10.2307/2286348. "
+    )
 
     input_keys = [
         "intensity_df",
@@ -902,8 +910,8 @@ class TimeSeriesAutoARIMA(PlotStep):
         "seasonal",
         "m",
         "train_size",
-        "grouping_column_name",
         "grouping",
+        "grouping_column_name",
     ]
     output_keys = [
         "scores",
@@ -939,8 +947,8 @@ class TimeSeriesARIMA(PlotStep):
         "Q",
         "s",
         "train_size",
-        "grouping_column_name",
         "grouping",
+        "grouping_column_name",
     ]
     output_keys = [
         "scores",

diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py
@@ -1258,12 +1258,12 @@ class TimeSeriesLinearRegressionForm(MethodForm):
         step_size=0.1,
         initial=0.8
     )
-    grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping")
     grouping = CustomChoiceField(
         choices= TimeSeriesGrouping,
         label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups",
         initial=TimeSeriesGrouping.with_grouping
     )
+    grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping")
 
 
     def fill_form(self, run: Run) -> None:
@@ -1288,6 +1288,9 @@ def fill_form(self, run: Run) -> None:
                 instance_identifier=input_df_instance_id,
             )["Protein ID"].unique()
         )
+        grouping = self.data.get("grouping")
+        if grouping == "Without Grouping":
+            self.toggle_visibility("grouping_column_name", False)
 
 
 class TimeSeriesRANSACRegressionForm(MethodForm):
@@ -1326,12 +1329,12 @@ class TimeSeriesRANSACRegressionForm(MethodForm):
         step_size=0.1,
         initial=0.8
     )
-    grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping")
     grouping = CustomChoiceField(
         choices= TimeSeriesGrouping,
         label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups",
         initial=TimeSeriesGrouping.with_grouping
     )
+    grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping")
 
 
     def fill_form(self, run: Run) -> None:
@@ -1356,22 +1359,13 @@ def fill_form(self, run: Run) -> None:
                 instance_identifier=input_df_instance_id,
             )["Protein ID"].unique()
         )
+        grouping = self.data.get("grouping")
+        if grouping == "Without Grouping":
+            self.toggle_visibility("grouping_column_name", False)
 
 
 class TimeSeriesADFullerTestForm(MethodForm):
     is_dynamic = True
-    test_info = TextDisplayField(
-        label="Information about the Augmented Dickey-Fuller test",
-        text=(
-            "The Augmented Dickey-Fuller test is a type of statistical test called a unit root test. The test "
-             "determines how strongly a time series is defined by a trend. The null hypothesis of the test is that the "
-             "time series can be represented by a unit root, which implies that the time series is not stationary. "
-             "The alternative hypothesis is that the time series is stationary. If the p-value is less than the "
-             "significance level, the null hypothesis can be rejected and the time series is considered stationary.<br>"
-             "Dickey, D. & Fuller, Wayne. (1979). Distribution of the Estimators for Autoregressive Time Series With a Unit Root."
-             "JASA. Journal of the American Statistical Association. 74. 10.2307/2286348. "
-        ),
-    )
     intensity_df = CustomChoiceField(
         choices=[],
         label="Intensity dataframe",
@@ -1404,11 +1398,6 @@ def fill_form(self, run: Run) -> None:
 
 class TimeSeriesAutoARIMAForm(MethodForm):
     is_dynamic = True
-    model_info = TextDisplayField(
-        label="Citation for AutoARIMA model",
-        text=(
-        ),
-    )
     intensity_df = CustomChoiceField(
         choices=[],
         label="Intensity dataframe",
@@ -1436,12 +1425,12 @@ class TimeSeriesAutoARIMAForm(MethodForm):
         step_size=0.1,
         initial=0.8,
     )
-    grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping")
     grouping = CustomChoiceField(
         choices= TimeSeriesGrouping,
         label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups",
         initial=TimeSeriesGrouping.with_grouping
     )
+    grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping")
 
 
     def fill_form(self, run: Run) -> None:
@@ -1466,17 +1455,13 @@ def fill_form(self, run: Run) -> None:
                 instance_identifier=input_df_instance_id,
             )["Protein ID"].unique()
         )
+        grouping = self.data.get("grouping")
+        if grouping == "Without Grouping":
+            self.toggle_visibility("grouping_column_name", False)
 
 
 class TimeSeriesARIMAForm(MethodForm):
     is_dynamic = True
-    """
-    model_info = TextDisplayField(
-        label="Citation for ARIMA model",
-        text=(
-        ),
-    )
-    """
     intensity_df = CustomChoiceField(
         choices=[],
         label="Intensity dataframe",
@@ -1544,12 +1529,12 @@ class TimeSeriesARIMAForm(MethodForm):
         step_size=0.1,
         initial=0.8,
     )
-    grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping")
     grouping = CustomChoiceField(
         choices= TimeSeriesGrouping,
         label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups",
         initial=TimeSeriesGrouping.with_grouping
     )
+    grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping")
 
 
     def fill_form(self, run: Run) -> None:
@@ -1574,6 +1559,9 @@ def fill_form(self, run: Run) -> None:
                 instance_identifier=input_df_instance_id,
             )["Protein ID"].unique()
         )
+        grouping = self.data.get("grouping")
+        if grouping == "Without Grouping":
+            self.toggle_visibility("grouping_column_name", False)
         seasonal = self.data.get("seasonal")
         if seasonal == "No":
             self.toggle_visibility("P", False)