Skip to content

Commit

Permalink
Resolved some comments from Hendrik
Browse files Browse the repository at this point in the history
  • Loading branch information
RogerAK committed Sep 14, 2024
1 parent aaeed09 commit 25c37d6
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 72 deletions.
5 changes: 4 additions & 1 deletion protzilla/data_analysis/time_series_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,10 @@ def convert_time_to_hours(time_str):
:param time_str: The time string to convert in format '%H:%M:%S'
:return: Number of hours since midnight as a float
"""

"""
time_obj = datetime.strptime(time_str, '%H:%M:%S')
hours_since_midnight = time_obj.hour + time_obj.minute / 60 + time_obj.second / 3600
return hours_since_midnight
"""
return time_str
64 changes: 26 additions & 38 deletions protzilla/data_analysis/time_series_regression_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pandas as pd
import plotly.graph_objects as go

from protzilla.data_analysis.time_series_helper import convert_time_to_hours
#from protzilla.data_analysis.time_series_helper import convert_time_to_hours
from protzilla.utilities import default_intensity_column
from protzilla.constants.colors import PROTZILLA_DISCRETE_COLOR_SEQUENCE

Expand All @@ -29,10 +29,10 @@ def time_series_linear_regression(
intensity_df: pd.DataFrame,
metadata_df: pd.DataFrame,
time_column_name: str,
protein_group: str,
train_size: float,
protein_group: str,
grouping: str,
grouping_column_name: str,
grouping: str = None,
):
"""
Perform linear regression on the time series data for a given protein group.
Expand All @@ -59,8 +59,6 @@ def time_series_linear_regression(
copy=False,
)

intensity_df[time_column_name] = intensity_df[str(time_column_name)].apply(convert_time_to_hours)

intensity_df = intensity_df.sample(frac=1, random_state = 42).reset_index(drop=True)

X = intensity_df[[time_column_name]]
Expand Down Expand Up @@ -222,8 +220,8 @@ def time_series_ransac_regression(
stop_probability: float,
loss: str,
train_size: float,
grouping_column_name: str,
grouping: str,
grouping_column_name: str,
):
"""
Perform RANSAC regression on the time series data for a given protein group.
Expand Down Expand Up @@ -255,8 +253,6 @@ def time_series_ransac_regression(
copy=False,
)

intensity_df[time_column_name] = intensity_df[str(time_column_name)].apply(convert_time_to_hours)

intensity_df = intensity_df.sample(frac=1, random_state = 42).reset_index(drop=True)

X = intensity_df[[time_column_name]]
Expand Down Expand Up @@ -298,7 +294,7 @@ def time_series_ransac_regression(
x=plot_df[time_column_name],
y=plot_df['Intensity'],
mode='markers',
name=f'Actual Intensity ({group})',
name=f'Inliers ({group})',
marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index])
), row=1, col=1)

Expand Down Expand Up @@ -354,7 +350,7 @@ def time_series_ransac_regression(
x=plot_df[time_column_name],
y=plot_df['Intensity'],
mode='markers',
name='Actual Intensity',
name='Inliers',
marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0])
), row=1, col=1)

Expand Down Expand Up @@ -507,8 +503,8 @@ def time_series_auto_arima(
seasonal: str,
m: int,
train_size: float,
grouping_column_name: str,
grouping: str,
grouping_column_name: str,
) -> dict:
"""
Perform an automatic ARIMA model selection on the time series data for a given protein group.
Expand All @@ -526,6 +522,7 @@ def time_series_auto_arima(
"""

color_index = 0
messages = []

if train_size < 0 or train_size > 1:
raise ValueError("Train size should be between 0 and 1")
Expand Down Expand Up @@ -553,8 +550,6 @@ def time_series_auto_arima(
for group in groups:
group_df = intensity_df[intensity_df[grouping_column_name] == group]

group_df[time_column_name] = group_df[str(time_column_name)].apply(convert_time_to_hours)

train_df_size = int(len(group_df) * train_size)
train_df, test_df = group_df[:train_df_size], group_df[train_df_size:]

Expand All @@ -575,23 +570,6 @@ def time_series_auto_arima(
# Forecast the test set
forecast = model.predict(n_periods=test_df.shape[0])
parameters = model.get_params()
aa_order = parameters['order']
aa_seasonal_order = parameters['seasonal_order']
messages = []

messages.append(
{
"level": logging.INFO,
"msg": f"Auto Arima Order (p,d,q): {aa_order}.",
}
)
if seasonal:
messages.append(
{
"level": logging.INFO,
"msg": f"Auto Arima Seasonal Order (P,D,Q,s): {aa_seasonal_order}.",
}
)

test_rmse = np.sqrt(mean_squared_error(test_df, forecast))
test_r2 = r2_score(test_df, forecast)
Expand Down Expand Up @@ -635,10 +613,24 @@ def time_series_auto_arima(
'train_r2_score': train_r2,
'test_r2_score': test_r2,
})
aa_order = parameters['order']
aa_seasonal_order = parameters['seasonal_order']

else:
intensity_df[time_column_name] = intensity_df[str(time_column_name)].apply(convert_time_to_hours)
messages.append(
{
"level": logging.INFO,
"msg": f"Auto Arima Order (p,d,q): {aa_order}.",
}
)
if seasonal:
messages.append(
{
"level": logging.INFO,
"msg": f"Auto Arima Seasonal Order (P,D,Q,s): {aa_seasonal_order}.",
}
)

else:
train_size = int(len(intensity_df) * train_size)
train_df, test_df = intensity_df[:train_size], intensity_df[train_size:]

Expand All @@ -662,7 +654,6 @@ def time_series_auto_arima(

aa_order = parameters['order']
aa_seasonal_order = parameters['seasonal_order']
messages = []

messages.append(
{
Expand Down Expand Up @@ -764,6 +755,7 @@ def time_series_auto_arima(
return dict(
scores=scores,
plots=[fig],
messages=messages,
)


Expand All @@ -781,8 +773,8 @@ def time_series_arima(
Q: int,
s: int,
train_size: float,
grouping_column_name: str,
grouping: str,
grouping_column_name: str,
) -> dict:

"""
Expand Down Expand Up @@ -825,8 +817,6 @@ def time_series_arima(
for group in groups:
group_df = intensity_df[intensity_df[grouping_column_name] == group]

group_df[time_column_name] = group_df[str(time_column_name)].apply(convert_time_to_hours)

train_df_size = int(len(group_df) * train_size)
train_df, test_df = group_df[:train_df_size], group_df[train_df_size:]

Expand Down Expand Up @@ -893,8 +883,6 @@ def time_series_arima(
})

else:
intensity_df[time_column_name] = intensity_df[str(time_column_name)].apply(convert_time_to_hours)

train_size = int(len(intensity_df) * train_size)
train_df, test_df = intensity_df[:train_size], intensity_df[train_size:]

Expand Down
18 changes: 13 additions & 5 deletions protzilla/methods/data_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -815,8 +815,8 @@ class TimeSeriesLinearRegression(PlotStep):
"time_column_name",
"protein_group",
"train_size",
"grouping_column_name",
"grouping",
"grouping_column_name",
]
output_keys = [
"scores",
Expand Down Expand Up @@ -845,8 +845,8 @@ class TimeSeriesRANSACRegression(PlotStep):
"stop_probability",
"loss",
"train_size",
"grouping_column_name",
"grouping",
"grouping_column_name",
]
output_keys = [
"scores",
Expand All @@ -863,7 +863,15 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
class TimeSeriesADFullerTest(DataAnalysisStep):
display_name = "Augmented Dickey-Fuller Test"
operation = "Time series analysis"
method_description = "Perform Augmented Dickey-Fuller test on the time series data for a given protein group."
method_description = (
"The Augmented Dickey-Fuller test is a type of statistical test called a unit root test. The test "
"determines how strongly a time series is defined by a trend. The null hypothesis of the test is that the "
"time series can be represented by a unit root, which implies that the time series is not stationary. "
"The alternative hypothesis is that the time series is stationary. If the p-value is less than the "
"significance level, the null hypothesis can be rejected and the time series is considered stationary."
"Dickey, D. & Fuller, Wayne. (1979). Distribution of the Estimators for Autoregressive Time Series With a Unit Root. "
"JASA. Journal of the American Statistical Association. 74. 10.2307/2286348. "
)

input_keys = [
"intensity_df",
Expand Down Expand Up @@ -902,8 +910,8 @@ class TimeSeriesAutoARIMA(PlotStep):
"seasonal",
"m",
"train_size",
"grouping_column_name",
"grouping",
"grouping_column_name",
]
output_keys = [
"scores",
Expand Down Expand Up @@ -939,8 +947,8 @@ class TimeSeriesARIMA(PlotStep):
"Q",
"s",
"train_size",
"grouping_column_name",
"grouping",
"grouping_column_name",
]
output_keys = [
"scores",
Expand Down
44 changes: 16 additions & 28 deletions ui/runs/forms/data_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -1258,12 +1258,12 @@ class TimeSeriesLinearRegressionForm(MethodForm):
step_size=0.1,
initial=0.8
)
grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping")
grouping = CustomChoiceField(
choices= TimeSeriesGrouping,
label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups",
initial=TimeSeriesGrouping.with_grouping
)
grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping")


def fill_form(self, run: Run) -> None:
Expand All @@ -1288,6 +1288,9 @@ def fill_form(self, run: Run) -> None:
instance_identifier=input_df_instance_id,
)["Protein ID"].unique()
)
grouping = self.data.get("grouping")
if grouping == "Without Grouping":
self.toggle_visibility("grouping_column_name", False)


class TimeSeriesRANSACRegressionForm(MethodForm):
Expand Down Expand Up @@ -1326,12 +1329,12 @@ class TimeSeriesRANSACRegressionForm(MethodForm):
step_size=0.1,
initial=0.8
)
grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping")
grouping = CustomChoiceField(
choices= TimeSeriesGrouping,
label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups",
initial=TimeSeriesGrouping.with_grouping
)
grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping")


def fill_form(self, run: Run) -> None:
Expand All @@ -1356,22 +1359,13 @@ def fill_form(self, run: Run) -> None:
instance_identifier=input_df_instance_id,
)["Protein ID"].unique()
)
grouping = self.data.get("grouping")
if grouping == "Without Grouping":
self.toggle_visibility("grouping_column_name", False)


class TimeSeriesADFullerTestForm(MethodForm):
is_dynamic = True
test_info = TextDisplayField(
label="Information about the Augmented Dickey-Fuller test",
text=(
"The Augmented Dickey-Fuller test is a type of statistical test called a unit root test. The test "
"determines how strongly a time series is defined by a trend. The null hypothesis of the test is that the "
"time series can be represented by a unit root, which implies that the time series is not stationary. "
"The alternative hypothesis is that the time series is stationary. If the p-value is less than the "
"significance level, the null hypothesis can be rejected and the time series is considered stationary.<br>"
"Dickey, D. & Fuller, Wayne. (1979). Distribution of the Estimators for Autoregressive Time Series With a Unit Root."
"JASA. Journal of the American Statistical Association. 74. 10.2307/2286348. "
),
)
intensity_df = CustomChoiceField(
choices=[],
label="Intensity dataframe",
Expand Down Expand Up @@ -1404,11 +1398,6 @@ def fill_form(self, run: Run) -> None:

class TimeSeriesAutoARIMAForm(MethodForm):
is_dynamic = True
model_info = TextDisplayField(
label="Citation for AutoARIMA model",
text=(
),
)
intensity_df = CustomChoiceField(
choices=[],
label="Intensity dataframe",
Expand Down Expand Up @@ -1436,12 +1425,12 @@ class TimeSeriesAutoARIMAForm(MethodForm):
step_size=0.1,
initial=0.8,
)
grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping")
grouping = CustomChoiceField(
choices= TimeSeriesGrouping,
label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups",
initial=TimeSeriesGrouping.with_grouping
)
grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping")


def fill_form(self, run: Run) -> None:
Expand All @@ -1466,17 +1455,13 @@ def fill_form(self, run: Run) -> None:
instance_identifier=input_df_instance_id,
)["Protein ID"].unique()
)
grouping = self.data.get("grouping")
if grouping == "Without Grouping":
self.toggle_visibility("grouping_column_name", False)


class TimeSeriesARIMAForm(MethodForm):
is_dynamic = True
"""
model_info = TextDisplayField(
label="Citation for ARIMA model",
text=(
),
)
"""
intensity_df = CustomChoiceField(
choices=[],
label="Intensity dataframe",
Expand Down Expand Up @@ -1544,12 +1529,12 @@ class TimeSeriesARIMAForm(MethodForm):
step_size=0.1,
initial=0.8,
)
grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping")
grouping = CustomChoiceField(
choices= TimeSeriesGrouping,
label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups",
initial=TimeSeriesGrouping.with_grouping
)
grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping")


def fill_form(self, run: Run) -> None:
Expand All @@ -1574,6 +1559,9 @@ def fill_form(self, run: Run) -> None:
instance_identifier=input_df_instance_id,
)["Protein ID"].unique()
)
grouping = self.data.get("grouping")
if grouping == "Without Grouping":
self.toggle_visibility("grouping_column_name", False)
seasonal = self.data.get("seasonal")
if seasonal == "No":
self.toggle_visibility("P", False)
Expand Down

0 comments on commit 25c37d6

Please sign in to comment.