From 1adb29274a3971da344d4393dd3e48988ff22caf Mon Sep 17 00:00:00 2001
From: healthonrails
Date: Tue, 29 Oct 2024 17:39:57 -0400
Subject: [PATCH] feat: Implement behavior evaluation module

This commit introduces a new module for evaluating behavior predictions
against manually labeled data. The module provides functionality to:

- Load and preprocess prediction and manual label data from CSV files.
- Create fixed time intervals for analysis.
- Align predicted and manual behaviors to these intervals, handling
  potential time offsets.
- Calculate overall accuracy and per-behavior precision, recall, and
  F1-score.
- Generate a confusion matrix visualization for detailed performance
  analysis.
---
 annolid/behavior/evaluation.py | 117 +++++++++++++++++++++++++++++++++
 1 file changed, 117 insertions(+)
 create mode 100644 annolid/behavior/evaluation.py

diff --git a/annolid/behavior/evaluation.py b/annolid/behavior/evaluation.py
new file mode 100644
index 0000000..5b96ae5
--- /dev/null
+++ b/annolid/behavior/evaluation.py
@@ -0,0 +1,117 @@
import pandas as pd
import numpy as np
from sklearn.metrics import (accuracy_score, confusion_matrix,
                             precision_recall_fscore_support)
import matplotlib.pyplot as plt
import seaborn as sns


def _load_data(predicted_segments_path, manual_labels_path):
    """Loads and preprocesses prediction and manual label data."""
    try:
        predicted_df = pd.read_csv(predicted_segments_path)
        manual_df = pd.read_csv(manual_labels_path)
    except FileNotFoundError as e:
        raise FileNotFoundError(
            f"Could not find input CSV file: {e.filename}") from e
    except pd.errors.ParserError as e:
        raise pd.errors.ParserError(
            f"Error parsing CSV file(s). Check the format: {e}") from e

    # Coerce 'Recording time' to numeric and drop rows whose timestamps
    # cannot be parsed.
    for df in (predicted_df, manual_df):
        df['Recording time'] = pd.to_numeric(
            df['Recording time'], errors='coerce')
        df.dropna(subset=['Recording time'], inplace=True)
    return predicted_df, manual_df


def _create_time_intervals(start_time, end_time, interval_duration):
    """Creates a DataFrame with fixed time intervals."""
    return pd.DataFrame({
        'Recording time': np.arange(
            start_time, end_time + interval_duration, interval_duration)
    })


def _align_behaviors_to_intervals(df, intervals_df, tolerance):
    """Aligns behavior labels to the nearest time interval."""
    merged_df = intervals_df.copy()
    merged_df['Behavior'] = 'none of the above'

    for _, row in df.iterrows():
        # Find the interval whose timestamp is closest to this label.
        offsets = (intervals_df['Recording time']
                   - row['Recording time']).abs()
        closest_time = intervals_df.loc[offsets.idxmin(), 'Recording time']
        if abs(closest_time - row['Recording time']) <= tolerance:
            # If several labels fall within tolerance of the same
            # interval, the last one processed wins.
            merged_df.loc[merged_df['Recording time'] == closest_time,
                          'Behavior'] = row['Behavior']
    return merged_df
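
# A vectorized alternative to the row-wise loop above, sketched here as an
# optional helper (the name is ours; it is not wired into
# evaluate_behavior_predictions). pd.merge_asof with direction='nearest'
# matches each interval to the closest labeled row within `tolerance` in one
# pass, instead of scanning all intervals per label with iterrows(). Note it
# matches from the interval side rather than the label side, so edge cases
# near interval boundaries can resolve slightly differently than the loop.
def _align_behaviors_to_intervals_asof(df, intervals_df, tolerance):
    """Vectorized variant of _align_behaviors_to_intervals (sketch)."""
    aligned = pd.merge_asof(
        intervals_df.sort_values('Recording time'),
        df.sort_values('Recording time')[['Recording time', 'Behavior']],
        on='Recording time',
        direction='nearest',
        tolerance=tolerance,
    )
    # Intervals with no label within tolerance get the default class.
    aligned['Behavior'] = aligned['Behavior'].fillna('none of the above')
    return aligned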


def _calculate_metrics(y_true, y_pred):
    """Calculates accuracy and per-behavior precision, recall, and F1-score."""
    accuracy = accuracy_score(y_true, y_pred)
    labels = np.unique(y_true)
    # zero_division=0 avoids warnings for behaviors that are never predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average=None, labels=labels, zero_division=0)
    results_df = pd.DataFrame({
        'Behavior': labels,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
    })
    return accuracy, results_df


def evaluate_behavior_predictions(predicted_segments_path, manual_labels_path,
                                  interval_duration=3):
    """Evaluates behavior predictions against manual labels using a
    fixed-interval approach."""
    predicted_df, manual_df = _load_data(
        predicted_segments_path, manual_labels_path)

    start_time = min(predicted_df['Recording time'].min(),
                     manual_df['Recording time'].min())
    end_time = max(predicted_df['Recording time'].max(),
                   manual_df['Recording time'].max())

    intervals_df = _create_time_intervals(
        start_time, end_time, interval_duration)

    # A label may be assigned to an interval if it lies within half an
    # interval of the interval's timestamp.
    tolerance = interval_duration / 2
    merged_manual = _align_behaviors_to_intervals(
        manual_df, intervals_df, tolerance)
    merged_predicted = _align_behaviors_to_intervals(
        predicted_df, intervals_df, tolerance).rename(
            columns={'Behavior': 'Behavior_predicted'})

    merged_df = pd.merge(merged_manual, merged_predicted,
                         on='Recording time', how='outer')

    y_true = merged_df['Behavior']
    y_pred = merged_df['Behavior_predicted']

    accuracy, results_df = _calculate_metrics(y_true, y_pred)
    return accuracy, results_df, merged_df


def plot_confusion_matrix(y_true, y_pred):
    """Plots the confusion matrix."""
    # Use the union of true and predicted labels so that behaviors appearing
    # only in the predictions are not silently dropped from the matrix.
    labels = np.unique(np.concatenate([y_true, y_pred]))
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=labels, yticklabels=labels)
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.title("Confusion Matrix")
    plt.show()


# Main execution block
if __name__ == "__main__":
    predicted_path = '/content/predictions.csv'
    manual_path = '/content/ground_truth_annotations.csv'
    try:
        accuracy, results, merged_df = evaluate_behavior_predictions(
            predicted_path, manual_path)
        print(f"Overall Accuracy: {accuracy:.3f}")
        print("\nPer-Behavior Metrics:")
        print(results)
        plot_confusion_matrix(
            merged_df['Behavior'], merged_df['Behavior_predicted'])
    except (FileNotFoundError, pd.errors.ParserError) as e:
        print(f"Error: {e}")
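
# A minimal smoke test, kept as a commented sketch so it does not run on
# import (paths and behavior names below are made up for illustration). The
# module only assumes both CSVs contain numeric 'Recording time' and string
# 'Behavior' columns; when predictions equal the manual labels, accuracy
# should be exactly 1.0:
#
#     import os
#     import tempfile
#
#     rows = "Recording time,Behavior\n0,rearing\n3,grooming\n6,rearing\n"
#     with tempfile.TemporaryDirectory() as d:
#         for name in ("pred.csv", "manual.csv"):
#             with open(os.path.join(d, name), "w") as f:
#                 f.write(rows)
#         acc, metrics, merged = evaluate_behavior_predictions(
#             os.path.join(d, "pred.csv"), os.path.join(d, "manual.csv"))
#         assert acc == 1.0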