From ddb8a86e72a1ce470227c0fa0dcbaa00e07c6fce Mon Sep 17 00:00:00 2001
From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com>
Date: Thu, 11 Jul 2024 12:43:23 -0400
Subject: [PATCH] Port over DLC conversion utils (#946)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 CHANGELOG.md                                  |   3 +-
 setup.py                                      |   1 -
 .../behavior/deeplabcut/_dlc_utils.py         | 288 ++++++++++++++++++
 .../deeplabcut/deeplabcutdatainterface.py     |  51 +---
 .../behavior/deeplabcut/requirements.txt      |   4 +-
 .../test_on_data/test_behavior_interfaces.py  |  17 ++
 6 files changed, 314 insertions(+), 50 deletions(-)
 create mode 100644 src/neuroconv/datainterfaces/behavior/deeplabcut/_dlc_utils.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 69d038f49..8b837f700 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -20,8 +20,7 @@
 * Make annotations from the raw format available on `IntanRecordingInterface`. [PR #934](https://github.com/catalystneuro/neuroconv/pull/943)
 * Add an option to suppress display the progress bar (tqdm) in `VideoContext`  [PR #937](https://github.com/catalystneuro/neuroconv/pull/937)
 * Automatic compression of data in the `LightnignPoseDataInterface` has been disabled - users should refer to the new `configure_backend` method for a general approach for setting compression. [PR #942](https://github.com/catalystneuro/neuroconv/pull/942)
-
-
+* Port over `dlc2nwb` utility functions for ease of maintenance. [PR #946](https://github.com/catalystneuro/neuroconv/pull/946)
 
 
 
diff --git a/setup.py b/setup.py
index 1af10b6c3..e06c90077 100644
--- a/setup.py
+++ b/setup.py
@@ -88,7 +88,6 @@
     license="BSD-3-Clause",
     classifiers=[
         "Intended Audience :: Science/Research",
-        "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
         "Programming Language :: Python :: 3.10",
         "Programming Language :: Python :: 3.11",
diff --git a/src/neuroconv/datainterfaces/behavior/deeplabcut/_dlc_utils.py b/src/neuroconv/datainterfaces/behavior/deeplabcut/_dlc_utils.py
new file mode 100644
index 000000000..72ddafe28
--- /dev/null
+++ b/src/neuroconv/datainterfaces/behavior/deeplabcut/_dlc_utils.py
@@ -0,0 +1,288 @@
+import importlib
+import os
+import pickle
+import warnings
+from pathlib import Path
+from typing import List, Optional, Union
+
+import numpy as np
+import pandas as pd
+import yaml
+from pynwb import NWBFile
+from ruamel.yaml import YAML
+
+from ....utils import FilePathType
+
+
+def _read_config(config_file_path):
+    """
+    Read a DeepLabCut project config file; an existing ``project_path`` is reset to the file's directory.
+
+    Raises FileNotFoundError if the file is missing. Load errors are re-raised, except the
+    ``!!python/tuple`` constructor error, for which loading is retried with ``yaml.SafeLoader``.
+    """
+    path = Path(config_file_path)
+    if not os.path.exists(path):
+        raise FileNotFoundError(
+            "Config file is not found. Please make sure that the file exists and/or that you passed the path of the config file correctly!"
+        )
+    tuple_tag_message = "could not determine a constructor for the tag '!!python/tuple'"
+    try:
+        with open(path, "r") as f:
+            cfg = YAML().load(f)
+    except Exception as err:
+        if len(err.args) <= 2 or err.args[2] != tuple_tag_message:
+            raise  # used to be swallowed when fewer than three args, leaving `cfg` unbound at the return
+        with open(path, "r") as ymlfile:
+            cfg = yaml.load(ymlfile, Loader=yaml.SafeLoader)
+    else:
+        curr_dir = os.path.dirname(config_file_path)
+        if isinstance(cfg, dict) and cfg.get("project_path", curr_dir) != curr_dir:
+            cfg["project_path"] = curr_dir
+    return cfg
+
+
+def _get_movie_timestamps(movie_file, VARIABILITYBOUND=1000, infer_timestamps=True):
+    """
+    Return numpy array of the timestamps (in seconds) for a video.
+
+    Parameters
+    ----------
+    movie_file : str
+        Path to movie_file
+    VARIABILITYBOUND : int, default: 1000
+        A warning is issued if the variance of the frame intervals is below 1 / (fps * VARIABILITYBOUND).
+    infer_timestamps : bool, default: True
+        Timestamps of 0 after the first frame become NaN; if True, they are then replaced by inferred values.
+    """
+    import cv2
+
+    reader = cv2.VideoCapture(movie_file)
+    try:
+        n_frames = int(reader.get(cv2.CAP_PROP_FRAME_COUNT))
+        fps = reader.get(cv2.CAP_PROP_FPS)
+        timestamps = []
+        for _ in range(n_frames):  # single pass; `cv2.VideoCapture` has no `len()`, so use the frame count
+            _ = reader.read()
+            timestamps.append(reader.get(cv2.CAP_PROP_POS_MSEC))
+    finally:
+        reader.release()  # always free the video handle
+    timestamps = np.array(timestamps) / 1000  # Convert to seconds
+
+    if np.nanvar(np.diff(timestamps)) < 1.0 / fps * 1.0 / VARIABILITYBOUND:
+        warnings.warn(
+            "Variability of timestamps suspiciously small. See: https://github.com/DeepLabCut/DLC2NWB/issues/1"
+        )
+
+    if any(timestamps[1:] == 0):
+        # Infers times when OpenCV provides 0s
+        warning_msg = "Removing"
+        timestamp_zero_count = np.count_nonzero(timestamps == 0)
+        timestamps[1:][timestamps[1:] == 0] = np.nan  # replace 0s with nan
+        if infer_timestamps:
+            warning_msg = "Replacing"
+            timestamps = _infer_nan_timestamps(timestamps)
+
+        warnings.warn(  # warns user of percent of 0 frames
+            "%s cv2 timestamps returned as 0: %f%%" % (warning_msg, (timestamp_zero_count / len(timestamps) * 100))
+        )
+
+    return timestamps
+
+
+def _infer_nan_timestamps(timestamps):
+    """Fill NaN entries of `timestamps` in place with index * mean frame interval, then return the array."""
+    is_nan = np.isnan(timestamps)
+
+    # Start/stop index pairs delimiting each consecutive run of valid (non-NaN) timestamps
+    padded_valid = np.hstack(([False], ~is_nan, [False]))
+    run_bounds = np.flatnonzero(np.diff(padded_valid)).reshape(-1, 2)
+
+    # Pool the frame-to-frame intervals measured inside every valid run
+    intervals = np.array([])
+    for start, stop in run_bounds:
+        intervals = np.append(intervals, np.diff(timestamps[start:stop]))
+    mean_interval = np.mean(intervals)
+
+    # A missing timestamp at position k is estimated as k * mean interval
+    nan_positions = np.argwhere(is_nan)[:, 0]
+    timestamps[is_nan] = nan_positions * mean_interval
+    return timestamps
+
+
+def _ensure_individuals_in_header(df, dummy_name):
+    """Return `df` with an "individuals" column level, added under `dummy_name` when it is absent."""
+    if "individuals" in df.columns.names:
+        return df  # the header already carries the level
+    # Single animal project: nest all columns under one individual, then order the header levels
+    with_individual = pd.concat({dummy_name: df}, names=["individuals"], axis=1)
+    return with_individual.reorder_levels(["scorer", "individuals", "bodyparts", "coords"], axis=1)
+
+
+def _get_pes_args(config_file, h5file, individual_name, infer_timestamps=True):
+    """
+    Gather scorer, pose dataframe, (video path, crop), PAF graph, timestamps and config for a DLC .h5 file.
+    `h5file` may be a str or path-like; raises IOError if it is not a DeepLabCut .h5 data file.
+    """
+    h5file = os.fspath(h5file)  # the string checks and splits below fail on pathlib.Path objects
+    if "DLC" not in h5file or not h5file.endswith(".h5"):
+        raise IOError("The file passed in is not a DeepLabCut h5 data file.")
+
+    cfg = _read_config(config_file)
+    vidname, scorer = os.path.split(h5file)[-1].split("DLC")
+    scorer = "DLC" + os.path.splitext(scorer)[0]
+    df = _ensure_individuals_in_header(pd.read_hdf(h5file), individual_name)
+
+    # Fetch the corresponding metadata pickle file: drop any trailing non-numeric suffix from the stem
+    paf_graph = []
+    filename, _ = os.path.splitext(h5file)
+    for i, c in enumerate(filename[::-1]):
+        if c.isnumeric():
+            break
+    if i > 0:
+        filename = filename[:-i]
+    metadata_file = filename + "_meta.pickle"
+    if os.path.isfile(metadata_file):
+        with open(metadata_file, "rb") as file:
+            metadata = pickle.load(file)  # NOTE: unpickling can execute code; only use trusted DLC outputs
+        test_cfg = metadata["data"]["DLC-model-config file"]
+        paf_graph = test_cfg.get("partaffinityfield_graph", [])
+        paf_inds = test_cfg.get("paf_best")
+        if paf_graph and paf_inds is not None:
+            paf_graph = [paf_graph[i] for i in paf_inds]
+    else:
+        warnings.warn("Metadata not found...")
+
+    video = None
+    for video_path, params in cfg["video_sets"].items():
+        if vidname in video_path:
+            video = video_path, params["crop"]
+            break
+    if video is None:
+        warnings.warn(f"The video file corresponding to {h5file} could not be found...")
+        video = "fake_path", "0, 0, 0, 0"
+        timestamps = df.index.tolist()  # setting timestamps to dummy TODO: extract timestamps in DLC?
+    else:
+        timestamps = _get_movie_timestamps(video[0], infer_timestamps=infer_timestamps)
+    return scorer, df, video, paf_graph, timestamps, cfg
+
+
+def _write_pes_to_nwbfile(
+    nwbfile,
+    animal,
+    df_animal,
+    scorer,
+    video,  # Expects this to be a tuple; first index is string path, second is the image shape as "0, width, 0, height"
+    paf_graph,
+    timestamps,
+    exclude_nans,
+    pose_estimation_container_kwargs: Optional[dict] = None,
+):
+    """
+    Add one PoseEstimation container for `animal` to the "behavior" processing module and return `nwbfile`.
+
+    One PoseEstimationSeries is built per bodypart; `pose_estimation_container_kwargs` override the defaults.
+    """
+    from importlib import metadata as importlib_metadata, util as importlib_util  # submodules need explicit import
+
+    from ndx_pose import PoseEstimation, PoseEstimationSeries
+
+    pose_estimation_container_kwargs = pose_estimation_container_kwargs or dict()
+    pose_estimation_series = []
+    for kpt, xyp in df_animal.groupby(level="bodyparts", axis=1, sort=False):
+        data = xyp.to_numpy()
+        timestamps_cleaned = timestamps
+        if exclude_nans:
+            # exclude_nans is inverse infer_timestamps. if not infer, there may be nans
+            data = data[~np.isnan(timestamps)]
+            timestamps_cleaned = timestamps[~np.isnan(timestamps)]
+
+        pes = PoseEstimationSeries(
+            name=f"{animal}_{kpt}",
+            description=f"Keypoint {kpt} from individual {animal}.",
+            data=data[:, :2],
+            unit="pixels",
+            reference_frame="(0,0) corresponds to the bottom left corner of the video.",
+            timestamps=timestamps_cleaned,
+            confidence=data[:, 2],
+            confidence_definition="Softmax output of the deep neural network.",
+        )
+        pose_estimation_series.append(pes)
+
+    deeplabcut_version = None
+    if importlib_util.find_spec(name="deeplabcut") is not None:
+        deeplabcut_version = importlib_metadata.version(distribution_name="deeplabcut")
+
+    pose_estimation_default_kwargs = dict(
+        pose_estimation_series=pose_estimation_series,
+        description="2D keypoint coordinates estimated using DeepLabCut.",
+        original_videos=[video[0]],
+        # TODO check if this is a mandatory arg in ndx-pose (can skip if video is not found)
+        dimensions=[list(map(int, video[1].split(",")))[1::2]],
+        scorer=scorer,
+        source_software="DeepLabCut",
+        source_software_version=deeplabcut_version,
+        nodes=[pes.name for pes in pose_estimation_series],
+        edges=paf_graph if paf_graph else None,
+    )
+    # Applied as an update (not unpacked into dict(...) above) so overriding a default key is not a TypeError
+    pose_estimation_default_kwargs.update(pose_estimation_container_kwargs)
+    pose_estimation_container = PoseEstimation(**pose_estimation_default_kwargs)
+
+    if "behavior" in nwbfile.processing:  # TODO: replace with get_module
+        behavior_module = nwbfile.processing["behavior"]
+    else:
+        behavior_module = nwbfile.create_processing_module(name="behavior", description="processed behavioral data")
+    behavior_module.add(pose_estimation_container)
+    return nwbfile
+
+
+def add_subject_to_nwbfile(
+    nwbfile: NWBFile,
+    h5file: FilePathType,
+    individual_name: str,
+    config_file: FilePathType,
+    timestamps: Optional[Union[List, np.ndarray]] = None,
+    pose_estimation_container_kwargs: Optional[dict] = None,
+) -> NWBFile:
+    """
+    Add the pose estimation of one subject from a DLC .h5 file to an in-memory NWBFile object.
+
+    Parameters
+    ----------
+    nwbfile : pynwb.NWBFile
+        In-memory NWBFile that receives the pose estimation series of the subject.
+    h5file : str or path
+        Location of the .h5 output file written by DeepLabCut.
+    individual_name : str
+        Subject whose pose was predicted. Single-animal DLC projects are labeled with this name;
+        for multi-animal projects it must match an individual named in the DLC project.
+    config_file : str or path
+        Location of the project's config.yaml file.
+    timestamps : list, np.ndarray or None, default: None
+        Timestamps to use for the pose estimation series.
+        If None, the timestamps determined while reading the DLC output are used instead.
+    pose_estimation_container_kwargs : dict, optional
+        Keyword arguments passed on to the PoseEstimation container.
+        For example, ``dict(name=...)`` renames the container.
+
+    Returns
+    -------
+    nwbfile : pynwb.NWBFile
+        The NWBFile passed in, with the pose estimation series written to its behavior module.
+    """
+    scorer, pose_df, video, paf_graph, dlc_timestamps, _ = _get_pes_args(config_file, h5file, individual_name)
+    subject_df = pose_df.groupby(level="individuals", axis=1).get_group(individual_name)
+    pose_timestamps = dlc_timestamps if timestamps is None else timestamps
+
+    return _write_pes_to_nwbfile(
+        nwbfile=nwbfile,
+        animal=individual_name,
+        df_animal=subject_df,
+        scorer=scorer,
+        video=video,
+        paf_graph=paf_graph,
+        timestamps=pose_timestamps,
+        exclude_nans=False,
+        pose_estimation_container_kwargs=pose_estimation_container_kwargs,
+    )
diff --git a/src/neuroconv/datainterfaces/behavior/deeplabcut/deeplabcutdatainterface.py b/src/neuroconv/datainterfaces/behavior/deeplabcut/deeplabcutdatainterface.py
index 8fa1d3cf1..e053e88fa 100644
--- a/src/neuroconv/datainterfaces/behavior/deeplabcut/deeplabcutdatainterface.py
+++ b/src/neuroconv/datainterfaces/behavior/deeplabcut/deeplabcutdatainterface.py
@@ -5,50 +5,9 @@
 from pynwb.file import NWBFile
 
 from ....basetemporalalignmentinterface import BaseTemporalAlignmentInterface
-from ....tools import get_package
 from ....utils import FilePathType
 
 
-def write_subject_to_nwb(
-    nwbfile: NWBFile,
-    h5file: FilePathType,
-    individual_name: str,
-    config_file: FilePathType,
-    timestamps: Optional[Union[List, np.ndarray]] = None,
-):
-    """
-    Given, subject name, write h5file to an existing nwbfile.
-
-    Parameters
-    ----------
-    nwbfile : pynwb.NWBFile
-        The in-memory nwbfile object to which the subject specific pose estimation series will be added.
-    h5file : str or path
-        Path to the DeepLabCut .h5 output file.
-    individual_name : str
-        Name of the subject (whose pose is predicted) for single-animal DLC project.
-        For multi-animal projects, the names from the DLC project will be used directly.
-    config_file : str or path
-        Path to a project config.yaml file
-    timestamps : list, np.ndarray or None, default: None
-        Alternative timestamps vector. If None, then use the inferred timestamps from DLC2NWB
-    Returns
-    -------
-    nwbfile : pynwb.NWBFile
-        nwbfile with pes written in the behavior module
-    """
-    dlc2nwb = get_package(package_name="dlc2nwb")
-
-    scorer, df, video, paf_graph, dlc_timestamps, _ = dlc2nwb.utils._get_pes_args(config_file, h5file, individual_name)
-    if timestamps is None:
-        timestamps = dlc_timestamps
-
-    df_animal = df.groupby(level="individuals", axis=1).get_group(individual_name)
-    return dlc2nwb.utils._write_pes_to_nwbfile(
-        nwbfile, individual_name, df_animal, scorer, video, paf_graph, timestamps, exclude_nans=False
-    )
-
-
 class DeepLabCutInterface(BaseTemporalAlignmentInterface):
     """Data interface for DeepLabCut datasets."""
 
@@ -87,13 +46,13 @@ def __init__(
         verbose: bool, default: True
             controls verbosity.
         """
-        dlc2nwb = get_package(package_name="dlc2nwb")
+        from ._dlc_utils import _read_config
 
         file_path = Path(file_path)
         if "DLC" not in file_path.stem or ".h5" not in file_path.suffixes:
             raise IOError("The file passed in is not a DeepLabCut h5 data file.")
 
-        self._config_file = dlc2nwb.utils.read_config(config_file_path)
+        self._config_file = _read_config(config_file_path=config_file_path)
         self.subject_name = subject_name
         self.verbose = verbose
         super().__init__(file_path=file_path, config_file_path=config_file_path)
@@ -126,13 +85,13 @@ def set_aligned_timestamps(self, aligned_timestamps: Union[List, np.ndarray]):
         aligned_timestamps : list, np.ndarray
             alternative timestamps vector.
         """
-
         self._timestamps = np.array(aligned_timestamps)
 
     def add_to_nwbfile(
         self,
         nwbfile: NWBFile,
         metadata: Optional[dict] = None,
+        container_name: str = "PoseEstimation",
     ):
         """
         Conversion from DLC output files to nwb. Derived from dlc2nwb library.
@@ -144,11 +103,13 @@ def add_to_nwbfile(
         metadata: dict
             metadata info for constructing the nwb file (optional).
         """
+        from ._dlc_utils import add_subject_to_nwbfile
 
-        write_subject_to_nwb(
+        add_subject_to_nwbfile(
             nwbfile=nwbfile,
             h5file=str(self.source_data["file_path"]),
             individual_name=self.subject_name,
             config_file=str(self.source_data["config_file_path"]),
             timestamps=self._timestamps,
+            pose_estimation_container_kwargs=dict(name=container_name),
         )
diff --git a/src/neuroconv/datainterfaces/behavior/deeplabcut/requirements.txt b/src/neuroconv/datainterfaces/behavior/deeplabcut/requirements.txt
index 17d8300fb..03e0ee0b0 100644
--- a/src/neuroconv/datainterfaces/behavior/deeplabcut/requirements.txt
+++ b/src/neuroconv/datainterfaces/behavior/deeplabcut/requirements.txt
@@ -1,4 +1,4 @@
-dlc2nwb>=0.3
-tables<3.9.0;python_version<'3.9'  # imported by package but not included in pip setup (is included in setup.cfg)
 tables<3.9.2;sys_platform=="darwin"
 tables;sys_platform=="linux" or sys_platform=="win32"
+ndx-pose==0.1.1
+neuroconv[video]
diff --git a/tests/test_on_data/test_behavior_interfaces.py b/tests/test_on_data/test_behavior_interfaces.py
index 155b32d72..36011da92 100644
--- a/tests/test_on_data/test_behavior_interfaces.py
+++ b/tests/test_on_data/test_behavior_interfaces.py
@@ -335,6 +335,7 @@ class TestDeepLabCutInterface(DeepLabCutInterfaceMixin, unittest.TestCase):
 
     def run_custom_checks(self):
         self.check_custom_timestamps(nwbfile_path=self.nwbfile_path)
+        self.check_renaming_instance(nwbfile_path=self.nwbfile_path)
 
     def check_custom_timestamps(self, nwbfile_path: str):
         # TODO: Peel out into separate test class and replace this part with check_read_nwb
@@ -361,6 +362,22 @@ def check_custom_timestamps(self, nwbfile_path: str):
                 pose_timestamps = pose_estimation.timestamps
                 np.testing.assert_array_equal(pose_timestamps, self._custom_timestamps_case_1)
 
+    def check_renaming_instance(self, nwbfile_path: str):
+        """Re-run the conversion with a custom container name and check it replaces the default name."""
+        renamed_container = "TestPoseEstimation"
+        metadata = self.interface.get_metadata()
+        metadata["NWBFile"].update(session_start_time=datetime.now().astimezone())
+        conversion_kwargs = dict(overwrite=True, metadata=metadata, container_name=renamed_container)
+        self.interface.run_conversion(nwbfile_path=nwbfile_path, **conversion_kwargs)
+
+        # Only the custom name may be present among the behavior module's data interfaces
+        with NWBHDF5IO(path=nwbfile_path, mode="r", load_namespaces=True) as io:
+            processing_modules = io.read().processing
+            assert "behavior" in processing_modules
+            written_containers = processing_modules["behavior"].data_interfaces
+            assert "PoseEstimation" not in written_containers
+            assert renamed_container in written_containers
+
     def check_read_nwb(self, nwbfile_path: str):
         # TODO: move this to the upstream mixin
         with NWBHDF5IO(path=nwbfile_path, mode="r", load_namespaces=True) as io: