Skip to content

Commit

Permalink
Added an option for the user to select the Time and Grouping column names
Browse files Browse the repository at this point in the history
  • Loading branch information
RogerAK committed Sep 14, 2024
1 parent 7762995 commit aaeed09
Show file tree
Hide file tree
Showing 3 changed files with 171 additions and 126 deletions.
99 changes: 66 additions & 33 deletions protzilla/data_analysis/time_series_regression_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,6 @@ def time_series_linear_regression(
)

intensity_df[time_column_name] = intensity_df[str(time_column_name)].apply(convert_time_to_hours)
intensity_df = intensity_df.interpolate(method='linear', axis=0)

intensity_df = intensity_df.sample(frac=1, random_state = 42).reset_index(drop=True)

Expand Down Expand Up @@ -90,8 +89,8 @@ def time_series_linear_regression(
train_r2 = r2_score(y_train, y_pred_train)
test_r2 = r2_score(y_test, y_pred_test)

train_df = pd.DataFrame({'Time': X_train[time_column_name], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'})
test_df = pd.DataFrame({'Time': X_test[time_column_name], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'})
train_df = pd.DataFrame({time_column_name: X_train[time_column_name], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'})
test_df = pd.DataFrame({time_column_name: X_test[time_column_name], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'})
plot_df = pd.concat([train_df, test_df])

color = PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index % len(PROTZILLA_DISCRETE_COLOR_SEQUENCE)]
Expand Down Expand Up @@ -134,8 +133,11 @@ def time_series_linear_regression(
train_r2 = r2_score(y_train, y_pred_train)
test_r2 = r2_score(y_test, y_pred_test)

train_df = pd.DataFrame({'Time': X_train[time_column_name], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'})
test_df = pd.DataFrame({'Time': X_test[time_column_name], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'})
train_df = pd.DataFrame(
{time_column_name: X_train[time_column_name], 'Intensity': y_train, 'Predicted': y_pred_train,
'Type': 'Train'})
test_df = pd.DataFrame(
{time_column_name: X_test[time_column_name], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'})
plot_df = pd.concat([train_df, test_df])

fig.add_trace(go.Scatter(
Expand Down Expand Up @@ -186,7 +188,7 @@ def time_series_linear_regression(
yaxis_gridcolor=colors["gridcolor"],
xaxis_linecolor=colors["linecolor"],
yaxis_linecolor=colors["linecolor"],
xaxis_title="Time",
xaxis_title=time_column_name,
yaxis_title="Intensity",
legend_title="Legend",
autosize=True,
Expand Down Expand Up @@ -254,7 +256,6 @@ def time_series_ransac_regression(
)

intensity_df[time_column_name] = intensity_df[str(time_column_name)].apply(convert_time_to_hours)
intensity_df = intensity_df.interpolate(method='linear', axis=0)

intensity_df = intensity_df.sample(frac=1, random_state = 42).reset_index(drop=True)

Expand Down Expand Up @@ -286,31 +287,31 @@ def time_series_ransac_regression(
train_r2 = r2_score(y_train[inlier_mask], y_pred_train[inlier_mask])
test_r2 = r2_score(y_test, y_pred_test)

train_df = pd.DataFrame({'Time': X_train[time_column_name], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'})
test_df = pd.DataFrame({'Time': X_test[time_column_name], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'})
train_df = pd.DataFrame({time_column_name: X_train[time_column_name], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'})
test_df = pd.DataFrame({time_column_name: X_test[time_column_name], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'})
train_df['Inlier'] = inlier_mask
test_df['Inlier'] = False
plot_df = pd.concat([train_df, test_df])

# Add main plot traces
fig.add_trace(go.Scatter(
x=plot_df['Time'],
x=plot_df[time_column_name],
y=plot_df['Intensity'],
mode='markers',
name='Actual Intensity',
name=f'Actual Intensity ({group})',
marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index])
), row=1, col=1)

fig.add_trace(go.Scatter(
x=plot_df['Time'],
x=plot_df[time_column_name],
y=plot_df['Predicted'],
mode='lines',
name='Predicted Intensity',
name=f'Predicted Intensity ({group})',
line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 2])
), row=1, col=1)

fig.add_trace(go.Scatter(
x=plot_df[plot_df['Inlier'] == False]['Time'],
x=plot_df[plot_df['Inlier'] == False][time_column_name],
y=plot_df[plot_df['Inlier'] == False]['Intensity'],
mode='markers',
name='Outliers',
Expand Down Expand Up @@ -342,31 +343,31 @@ def time_series_ransac_regression(
train_r2 = r2_score(y_train[inlier_mask], y_pred_train[inlier_mask])
test_r2 = r2_score(y_test, y_pred_test)

train_df = pd.DataFrame({'Time': X_train[time_column_name], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'})
test_df = pd.DataFrame({'Time': X_test[time_column_name], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'})
train_df = pd.DataFrame({time_column_name: X_train[time_column_name], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'})
test_df = pd.DataFrame({time_column_name: X_test[time_column_name], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'})
train_df['Inlier'] = inlier_mask
test_df['Inlier'] = False
plot_df = pd.concat([train_df, test_df])

# Add main plot traces
fig.add_trace(go.Scatter(
x=plot_df['Time'],
x=plot_df[time_column_name],
y=plot_df['Intensity'],
mode='markers',
name='Actual Intensity',
marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0])
), row=1, col=1)

fig.add_trace(go.Scatter(
x=plot_df['Time'],
x=plot_df[time_column_name],
y=plot_df['Predicted'],
mode='lines',
name='Predicted Intensity',
line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0])
), row=1, col=1)

fig.add_trace(go.Scatter(
x=plot_df[plot_df['Inlier'] == False]['Time'],
x=plot_df[plot_df['Inlier'] == False][time_column_name],
y=plot_df[plot_df['Inlier'] == False]['Intensity'],
mode='markers',
name='Outliers',
Expand Down Expand Up @@ -405,7 +406,7 @@ def time_series_ransac_regression(
yaxis_gridcolor=colors["gridcolor"],
xaxis_linecolor=colors["linecolor"],
yaxis_linecolor=colors["linecolor"],
xaxis_title="Time",
xaxis_title=time_column_name,
yaxis_title="Intensity",
legend_title="Legend",
autosize=True,
Expand Down Expand Up @@ -553,7 +554,6 @@ def time_series_auto_arima(
group_df = intensity_df[intensity_df[grouping_column_name] == group]

group_df[time_column_name] = group_df[str(time_column_name)].apply(convert_time_to_hours)
group_df = group_df.interpolate(method='linear', axis=0)

train_df_size = int(len(group_df) * train_size)
train_df, test_df = group_df[:train_df_size], group_df[train_df_size:]
Expand All @@ -574,6 +574,24 @@ def time_series_auto_arima(

# Forecast the test set
forecast = model.predict(n_periods=test_df.shape[0])
parameters = model.get_params()
aa_order = parameters['order']
aa_seasonal_order = parameters['seasonal_order']
messages = []

messages.append(
{
"level": logging.INFO,
"msg": f"Auto Arima Order (p,d,q): {aa_order}.",
}
)
if seasonal:
messages.append(
{
"level": logging.INFO,
"msg": f"Auto Arima Seasonal Order (P,D,Q,s): {aa_seasonal_order}.",
}
)

test_rmse = np.sqrt(mean_squared_error(test_df, forecast))
test_r2 = r2_score(test_df, forecast)
Expand All @@ -588,23 +606,23 @@ def time_series_auto_arima(
x=test_df.index,
y=test_df,
mode='markers',
name='Actual Intensity',
name=f'Actual Intensity ({group})',
marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index])
), row=1, col=1)

fig.add_trace(go.Scatter(
x=test_df.index,
y=forecast,
mode='markers',
name='Predicted Intensity',
name=f'Predicted Intensity ({group})',
line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 3])
), row=1, col=1)

fig.add_trace(go.Scatter(
x = forecast_plot.index,
y = forecast_plot,
mode = 'lines',
name = 'Mean Predicted Intensity',
name = f'Mean Predicted Intensity ({group})',
line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 3])
), row=1, col=1)

Expand All @@ -620,7 +638,6 @@ def time_series_auto_arima(

else:
intensity_df[time_column_name] = intensity_df[str(time_column_name)].apply(convert_time_to_hours)
intensity_df = intensity_df.interpolate(method='linear', axis=0)

train_size = int(len(intensity_df) * train_size)
train_df, test_df = intensity_df[:train_size], intensity_df[train_size:]
Expand All @@ -641,6 +658,25 @@ def time_series_auto_arima(

# Forecast the test set
forecast = model.predict(n_periods=test_df.shape[0])
parameters = model.get_params()

aa_order = parameters['order']
aa_seasonal_order = parameters['seasonal_order']
messages = []

messages.append(
{
"level": logging.INFO,
"msg": f"Auto Arima Order (p,d,q): {aa_order}.",
}
)
if seasonal:
messages.append(
{
"level": logging.INFO,
"msg": f"Auto Arima Seasonal Order (P,D,Q,s): {aa_seasonal_order}.",
}
)

test_rmse = np.sqrt(mean_squared_error(test_df, forecast))
test_r2 = r2_score(test_df, forecast)
Expand Down Expand Up @@ -707,7 +743,7 @@ def time_series_auto_arima(
yaxis_gridcolor=colors["gridcolor"],
xaxis_linecolor=colors["linecolor"],
yaxis_linecolor=colors["linecolor"],
xaxis_title="Time",
xaxis_title=time_column_name,
yaxis_title="Intensity",
legend_title="Legend",
autosize=True,
Expand All @@ -725,7 +761,6 @@ def time_series_auto_arima(

fig.update_annotations(font_size=12)


return dict(
scores=scores,
plots=[fig],
Expand Down Expand Up @@ -791,7 +826,6 @@ def time_series_arima(
group_df = intensity_df[intensity_df[grouping_column_name] == group]

group_df[time_column_name] = group_df[str(time_column_name)].apply(convert_time_to_hours)
group_df = group_df.interpolate(method='linear', axis=0)

train_df_size = int(len(group_df) * train_size)
train_df, test_df = group_df[:train_df_size], group_df[train_df_size:]
Expand Down Expand Up @@ -828,23 +862,23 @@ def time_series_arima(
x=test_df.index,
y=test_df,
mode='markers',
name='Actual Intensity',
name=f'Actual Intensity ({group})',
marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index])
), row=1, col=1)

fig.add_trace(go.Scatter(
x=forecast_plot.index,
y=forecast_plot,
mode='markers',
name='Predicted Intensity',
name= f'Predicted Intensity ({group})',
line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 2])
), row=1, col=1)

fig.add_trace(go.Scatter(
x = forecast_mean_plot.index,
y = forecast_mean_plot,
mode = 'lines',
name = 'Mean Predicted Intensity',
name = f'Mean Predicted Intensity ({group})',
line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 2])
), row=1, col=1)

Expand All @@ -860,7 +894,6 @@ def time_series_arima(

else:
intensity_df[time_column_name] = intensity_df[str(time_column_name)].apply(convert_time_to_hours)
intensity_df = intensity_df.interpolate(method='linear', axis=0)

train_size = int(len(intensity_df) * train_size)
train_df, test_df = intensity_df[:train_size], intensity_df[train_size:]
Expand Down Expand Up @@ -945,7 +978,7 @@ def time_series_arima(
yaxis_gridcolor=colors["gridcolor"],
xaxis_linecolor=colors["linecolor"],
yaxis_linecolor=colors["linecolor"],
xaxis_title="Time",
xaxis_title=time_column_name,
yaxis_title="Intensity",
legend_title="Legend",
autosize=True,
Expand Down
50 changes: 28 additions & 22 deletions protzilla/methods/data_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
prot_quant_plot,
scatter_plot,
)
from protzilla.data_analysis.time_series_plot_peptide import time_series_plot_peptide
from protzilla.data_analysis.time_series_plots import time_quant_plot
from protzilla.data_analysis.protein_graphs import peptides_to_isoform, variation_graph
from protzilla.data_analysis.ptm_analysis import (
filter_peptides_of_protein,
Expand Down Expand Up @@ -344,27 +344,6 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
)
return inputs

class PlotTimeSeriesPeptide(PlotStep):
    """Plot step that delegates to ``time_series_plot_peptide``.

    The step declares five input keys and produces no named outputs.
    """

    display_name = "Time Quantification Plot For Peptide"
    operation = "plot"
    method_description = "Creates a line chart for intensity across Time for protein groups"

    input_keys = [
        "input_df",
        "metadata_df",
        "protein_group",
        "similarity_measure",
        "similarity",
    ]
    output_keys = []

    def method(self, inputs: dict) -> dict:
        """Pass every entry of ``inputs`` as a keyword argument to the plot function."""
        plot_result = time_series_plot_peptide(**inputs)
        return plot_result

    def insert_dataframes(self, steps: StepManager, inputs) -> dict:
        """Resolve the dataframes this step needs and store them in ``inputs``.

        ``inputs["input_df"]`` is replaced by the ``peptide_df`` output that
        ``steps.get_step_output`` returns for the value originally stored under
        that key (presumably a step identifier chosen by the user; confirm
        against ``StepManager``). ``inputs["metadata_df"]`` is taken from
        ``steps.metadata_df``. The same ``inputs`` mapping is returned.
        """
        selected_source = inputs["input_df"]
        inputs["input_df"] = steps.get_step_output(Step, "peptide_df", selected_source)
        inputs["metadata_df"] = steps.metadata_df
        return inputs


class PlotPrecisionRecallCurve(PlotStep):
display_name = "Precision Recall"
Expand Down Expand Up @@ -796,6 +775,33 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
return inputs


class PlotTimeQuant(PlotStep):
    """Plot step that delegates to ``time_quant_plot``.

    Besides the protein and metadata dataframes, the step takes the name of
    the time column, so the caller decides which column holds the time
    values. It produces no named outputs.
    """

    display_name = "Time Quantification Plot For Protein"
    operation = "Time series analysis"
    method_description = "Creates a line chart for intensity across Time for protein groups"

    input_keys = [
        "intensity_df", "metadata_df", "time_column_name",
        "protein_group", "similarity_measure", "similarity",
    ]
    output_keys = []

    def method(self, inputs: dict) -> dict:
        """Pass every entry of ``inputs`` as a keyword argument to the plot function."""
        plot_result = time_quant_plot(**inputs)
        return plot_result

    def insert_dataframes(self, steps: StepManager, inputs) -> dict:
        """Store the run's protein and metadata dataframes in ``inputs``.

        ``intensity_df`` is read from ``steps.protein_df`` and ``metadata_df``
        from ``steps.metadata_df``. The same ``inputs`` mapping is returned.
        """
        protein_frame = steps.protein_df
        inputs["intensity_df"] = protein_frame
        metadata_frame = steps.metadata_df
        inputs["metadata_df"] = metadata_frame
        return inputs


class TimeSeriesLinearRegression(PlotStep):
display_name = "Linear Regression"
operation = "Time series analysis"
Expand Down
Loading

0 comments on commit aaeed09

Please sign in to comment.