support video annotation (#1124)

### Summary [CVS-116105](https://jira.devtools.intel.com/browse/CVS-116105) Support the video annotation type for 'datumaro', 'datumaro_binary' format  ### How to test  ### Checklist  - [x] I have added unit tests to cover my changes. - [x] I have added integration tests to cover my changes. - [x] I have added the description of my changes into [CHANGELOG](https://github.com/openvinotoolkit/datumaro/blob/develop/CHANGELOG.md). - [x] I have updated the [documentation](https://github.com/openvinotoolkit/datumaro/tree/develop/docs) accordingly ### License - [x] I submit _my code changes_ under the same [MIT License](https://github.com/openvinotoolkit/datumaro/blob/develop/LICENSE) that covers the project. Feel free to contact the maintainers if that's a concern. - [x] I have updated the license header for each file (see an example below). ```python # Copyright (C) 2023 Intel Corporation # # SPDX-License-Identifier: MIT ```
openvinotoolkit · Aug 17, 2023 · 4f848c9 · 4f848c9
1 parent 0d5311e
commit 4f848c9
Show file tree

Hide file tree

Showing 19 changed files with 280 additions and 16 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### New features
 - Add tabular data import/export
   (<https://github.com/openvinotoolkit/datumaro/pull/1089>)
+- Support video annotation import/export
+  (<https://github.com/openvinotoolkit/datumaro/pull/1124>)
 
 ### Enhancements
 - Remove xfail marks from the convert integration tests

diff --git a/docs/source/docs/data-formats/datumaro_format.md b/docs/source/docs/data-formats/datumaro_format.md
@@ -129,7 +129,7 @@ dataset/
 │   |    ├── img002.png
 │   |    └── ...
 │   └── ...
-│
+├── videos/  # directory to store video files
 └── annotations/
     ├── train.json  # annotation file with training data
     ├── val.json  # annotation file with validation data

diff --git a/docs/source/docs/data-formats/formats/datumaro.md b/docs/source/docs/data-formats/formats/datumaro.md
@@ -11,6 +11,7 @@ Supported media types:
 
 - `Image`
 - `PointCloud`
+- `VideoFrame`
 
 Supported annotation types:
 
@@ -56,6 +57,7 @@ A Datumaro dataset directory should have the following structure:
     │       ├── <image_name1.ext>
     │       ├── <image_name2.ext>
     │       └── ...
+    ├── videos/  # directory to store video files
     └── annotations/
         ├── <subset_name_1>.json
         ├── <subset_name_2>.json

diff --git a/docs/source/docs/data-formats/formats/datumaro_binary.md b/docs/source/docs/data-formats/formats/datumaro_binary.md
@@ -37,6 +37,7 @@ Dataset/
 │   └── val/
 │       ├── <image_name1.ext>
 │       └── ...
+├── videos/  # directory to store video files
 └── annotations/
     ├── instances_train2017.json
     └── instances_val2017.json
@@ -58,6 +59,7 @@ Supported media types:
 
 - `Image`
 - `PointCloud`
+- `VideoFrame`
 
 Supported annotation types:
 
@@ -103,6 +105,7 @@ A DatumaroBinary dataset directory should have the following structure:
     │       ├── <image_name1.ext>
     │       ├── <image_name2.ext>
     │       └── ...
+    ├── videos/  # directory to store video files
     └── annotations/
         ├── <subset_name_1>.datum
         ├── <subset_name_2>.datum

diff --git a/src/datumaro/components/annotation.py b/src/datumaro/components/annotation.py
@@ -53,6 +53,7 @@ class AnnotationType(IntEnum):
 COORDINATE_ROUNDING_DIGITS = 2
 CHECK_POLYGON_EQ_EPSILONE = 1e-7
 NO_GROUP = 0
+NO_OBJECT_ID = -1
 
 
 @attrs(slots=True, kw_only=True, order=False)
@@ -83,6 +84,11 @@ class Annotation:
     # single object. The value of 0 means there is no group.
     group: int = field(default=NO_GROUP, validator=default_if_none(int))
 
+    # Object identifier shared across multiple items,
+    # e.g. in a video, person 'A' could be annotated on multiple frame images;
+    #   the user could assign a value >= 0 as the id of person 'A'.
+    object_id: int = field(default=NO_OBJECT_ID, validator=default_if_none(int))
+
     _type = AnnotationType.unknown
 
     @property

diff --git a/src/datumaro/components/exporter.py b/src/datumaro/components/exporter.py
@@ -23,7 +23,7 @@
     DatumaroError,
     ItemExportError,
 )
-from datumaro.components.media import Image, PointCloud
+from datumaro.components.media import Image, PointCloud, VideoFrame
 from datumaro.components.progress_reporting import NullProgressReporter, ProgressReporter
 from datumaro.util.meta_file_util import save_hashkey_file, save_meta_file
 from datumaro.util.os_util import rmtree
@@ -323,6 +323,7 @@ def __init__(
         save_media: bool,
         images_dir: str,
         pcd_dir: str,
+        video_dir: str,
         crypter: Crypter = NULL_CRYPTER,
         image_ext: Optional[str] = None,
         default_image_ext: Optional[str] = None,
@@ -332,6 +333,7 @@ def __init__(
         self._save_media = save_media
         self._images_dir = images_dir
         self._pcd_dir = pcd_dir
+        self._video_dir = video_dir
         self._crypter = crypter
         self._image_ext = image_ext
         self._default_image_ext = default_image_ext
@@ -363,6 +365,14 @@ def make_pcd_extra_image_filename(self, item, idx, image, *, name=None, subdir=N
             item, name=name if name else f"{item.id}/extra_image_{idx}", subdir=subdir
         ) + self.find_image_ext(image)
 
+    def make_video_filename(self, item, *, name=None):
+        """Return the base name of the video file that backs a ``VideoFrame`` item."""
+        # A bare `assert "<msg>"` always passes, so reject other items explicitly.
+        if not (isinstance(item, DatasetItem) and isinstance(item.media, VideoFrame)):
+            raise TypeError("Video item type should be VideoFrame")
+
+        return osp.basename(item.media.video.path)
+
     def save_image(
         self,
         item: DatasetItem,
@@ -412,6 +422,26 @@ def helper(i, image):
 
         item.media.save(path, helper, crypter=NULL_CRYPTER)
 
+    def save_video(
+        self,
+        item: DatasetItem,
+        *,
+        basedir: Optional[str] = None,
+        fname: Optional[str] = None,
+    ):
+        """Save the video behind a ``VideoFrame`` item; skip other items with a warning.
+
+        ``basedir`` and ``fname`` override the default directory and file name.
+        """
+        if not item.media or not isinstance(item.media, VideoFrame):
+            log.warning("Item '%s' has no video", item.id)
+            return
+        target_dir = basedir if basedir is not None else self._video_dir
+        target_name = fname if fname is not None else self.make_video_filename(item)
+        path = osp.abspath(osp.join(target_dir, target_name))
+        os.makedirs(osp.dirname(path), exist_ok=True)
+        item.media.video.save(path, crypter=NULL_CRYPTER)
+
     @property
     def images_dir(self) -> str:
         return self._images_dir

diff --git a/src/datumaro/components/media.py b/src/datumaro/components/media.py
@@ -559,6 +559,10 @@ def index(self) -> int:
     def video(self) -> Video:
         return self._video
 
+    @property
+    def path(self) -> str:
+        return self._video.path  # delegates to the owning Video's path
+
 
 class _VideoFrameIterator(Iterator[VideoFrame]):
     """
@@ -808,6 +812,20 @@ def __hash__(self):
         # Required for caching
         return hash((self._path, self._step, self._start_frame, self._end_frame))
 
+    def save(
+        self,
+        fp: Union[str, io.IOBase],
+        crypter: Crypter = NULL_CRYPTER,
+    ):
+        """Copy the video to path ``fp`` or stream it into file object ``fp`` (unencrypted)."""
+        if isinstance(fp, str):
+            if fp != self.path:
+                os.makedirs(osp.dirname(osp.abspath(fp)), exist_ok=True)  # dirname may be ""
+                shutil.copyfile(self.path, fp)
+        elif isinstance(fp, io.IOBase):
+            with open(self.path, "rb") as f_video:
+                shutil.copyfileobj(f_video, fp)
+
     @property
     def path(self) -> str:
         """Path to the media file"""

diff --git a/src/datumaro/plugins/data_formats/arrow/exporter.py b/src/datumaro/plugins/data_formats/arrow/exporter.py
@@ -288,6 +288,7 @@ def create_writer(self, subset: str, ctx: ExportContext) -> _SubsetWriter:
             save_media=self._save_media,
             images_dir="",
             pcd_dir="",
+            video_dir="",
             crypter=NULL_CRYPTER,
             image_ext=self._image_ext,
             default_image_ext=self._default_image_ext,

diff --git a/src/datumaro/plugins/data_formats/datumaro/base.py b/src/datumaro/plugins/data_formats/datumaro/base.py
@@ -10,6 +10,7 @@
 from json_stream.base import StreamingJSONObject
 
 from datumaro.components.annotation import (
+    NO_OBJECT_ID,
     AnnotationType,
     Bbox,
     Caption,
@@ -28,7 +29,7 @@
 from datumaro.components.dataset_base import DatasetItem, SubsetBase
 from datumaro.components.errors import DatasetImportError, MediaTypeError
 from datumaro.components.importer import ImportContext
-from datumaro.components.media import Image, MediaElement, MediaType, PointCloud
+from datumaro.components.media import Image, MediaElement, MediaType, PointCloud, Video, VideoFrame
 from datumaro.util import parse_json_file, to_dict_from_streaming_json
 from datumaro.version import __version__
 
@@ -45,12 +46,15 @@ def __init__(
         rootpath: str,
         images_dir: str,
         pcd_dir: str,
+        video_dir: str,
         ctx: ImportContext,
     ) -> None:
         self._subset = subset
         self._rootpath = rootpath
         self._images_dir = images_dir
         self._pcd_dir = pcd_dir
+        self._video_dir = video_dir
+        self._videos = {}
         self._ctx = ctx
 
         self._reader = self._init_reader(path)
@@ -174,6 +178,19 @@ def _parse_item(self, item_desc: Dict) -> Optional[DatasetItem]:
                 if self.media_type == MediaElement:
                     self.media_type = PointCloud
 
+            video_frame_info = item_desc.get("video_frame")
+            if media and video_frame_info:
+                raise MediaTypeError("Dataset cannot contain multiple media types")
+            if video_frame_info:
+                video_path = osp.join(self._video_dir, video_frame_info.get("video_path"))
+                if video_path not in self._videos:
+                    self._videos[video_path] = Video(video_path)
+                video = self._videos[video_path]
+
+                frame_index = video_frame_info.get("frame_index")
+
+                media = VideoFrame(video, frame_index)
+
             media_desc = item_desc.get("media")
             if not media and media_desc and media_desc.get("path"):
                 media = MediaElement(path=media_desc.get("path"))
@@ -203,14 +220,21 @@ def _load_annotations(self, item: Dict):
                 ann_type = AnnotationType[ann["type"]]
                 attributes = ann.get("attributes")
                 group = ann.get("group")
+                object_id = ann.get("object_id", NO_OBJECT_ID)
 
                 label_id = ann.get("label_id")
                 z_order = ann.get("z_order")
                 points = ann.get("points")
 
                 if ann_type == AnnotationType.label:
                     loaded.append(
-                        Label(label=label_id, id=ann_id, attributes=attributes, group=group)
+                        Label(
+                            label=label_id,
+                            id=ann_id,
+                            attributes=attributes,
+                            group=group,
+                            object_id=object_id,
+                        )
                     )
 
                 elif ann_type == AnnotationType.mask:
@@ -223,6 +247,7 @@ def _load_annotations(self, item: Dict):
                             id=ann_id,
                             attributes=attributes,
                             group=group,
+                            object_id=object_id,
                             z_order=z_order,
                         )
                     )
@@ -235,6 +260,7 @@ def _load_annotations(self, item: Dict):
                             id=ann_id,
                             attributes=attributes,
                             group=group,
+                            object_id=object_id,
                             z_order=z_order,
                         )
                     )
@@ -247,6 +273,7 @@ def _load_annotations(self, item: Dict):
                             id=ann_id,
                             attributes=attributes,
                             group=group,
+                            object_id=object_id,
                             z_order=z_order,
                         )
                     )
@@ -263,6 +290,7 @@ def _load_annotations(self, item: Dict):
                             id=ann_id,
                             attributes=attributes,
                             group=group,
+                            object_id=object_id,
                             z_order=z_order,
                         )
                     )
@@ -275,6 +303,7 @@ def _load_annotations(self, item: Dict):
                             id=ann_id,
                             attributes=attributes,
                             group=group,
+                            object_id=object_id,
                             z_order=z_order,
                         )
                     )
@@ -293,6 +322,7 @@ def _load_annotations(self, item: Dict):
                             id=ann_id,
                             attributes=attributes,
                             group=group,
+                            object_id=object_id,
                         )
                     )
 
@@ -304,6 +334,7 @@ def _load_annotations(self, item: Dict):
                             id=ann_id,
                             attributes=attributes,
                             group=group,
+                            object_id=object_id,
                             z_order=z_order,
                         )
                     )
@@ -334,9 +365,10 @@ def __init__(
         rootpath: str,
         images_dir: str,
         pcd_dir: str,
+        video_dir: str,
         ctx: ImportContext,
     ) -> None:
-        super().__init__(path, subset, rootpath, images_dir, pcd_dir, ctx)
+        super().__init__(path, subset, rootpath, images_dir, pcd_dir, video_dir, ctx)
         self._length = None
 
     def __len__(self):
@@ -458,6 +490,11 @@ def _init_path(self, path: str):
             pcd_dir = osp.join(rootpath, DatumaroPath.PCD_DIR)
         self._pcd_dir = pcd_dir
 
+        video_dir = ""
+        if rootpath and osp.isdir(osp.join(rootpath, DatumaroPath.VIDEO_DIR)):
+            video_dir = osp.join(rootpath, DatumaroPath.VIDEO_DIR)
+        self._video_dir = video_dir
+
     @property
     def is_stream(self) -> bool:
         return self._stream
@@ -480,6 +517,7 @@ def _load_impl(self, path: str) -> None:
                 self._rootpath,
                 self._images_dir,
                 self._pcd_dir,
+                self._video_dir,
                 self._ctx,
             )
             if not self._stream
@@ -489,6 +527,7 @@ def _load_impl(self, path: str) -> None:
                 self._rootpath,
                 self._images_dir,
                 self._pcd_dir,
+                self._video_dir,
                 self._ctx,
             )
         )