asyml · Pushkar-Bhuse · Jun 30, 2022 · Jul 5, 2022 · Jul 6, 2022 · Jul 7, 2022
diff --git a/forte/common/constants.py b/forte/common/constants.py
@@ -11,6 +11,9 @@
 # The index storing entry type in the internal entry data of DataStore.
 ENTRY_TYPE_INDEX = 3
 
+# The index storing the payload ID in internal entry data of DataStore
+PAYLOAD_INDEX = 0
+
 # The index storing entry type (specific to Link and Group type). It is saved
 # in the `tid_idx_dict` in DataStore.
 ENTRY_DICT_TYPE_INDEX = 0

diff --git a/forte/data/entry_converter.py b/forte/data/entry_converter.py
@@ -32,6 +32,7 @@
     SinglePackEntries,
     MultiPackEntries,
 )
+from forte.data.ontology.top import BoundingBox, Box
 from forte.utils import get_class, get_full_module_name
 
 logger = logging.getLogger(__name__)
@@ -205,6 +206,8 @@ def get_entry_object(
         # the value can be arbitrary since they will all be routed to DataStore.
         if data_store_ref._is_annotation(type_name):
             entry = entry_class(pack=pack, begin=0, end=0)
+        elif any(entry_class == box_class for box_class in [BoundingBox, Box]):
+            entry = entry_class(pack=pack, height=1, width=1)
         elif any(
             data_store_ref._is_subclass(type_name, type_class)
             for type_class in SinglePackEntries + MultiPackEntries

diff --git a/forte/data/ontology/core.py b/forte/data/ontology/core.py
@@ -15,6 +15,7 @@
 Defines the basic data structures and interfaces for the Forte data
 representation system.
 """
+import math
 import uuid
 
 from abc import abstractmethod, ABC
@@ -23,6 +24,7 @@
 from typing import (
     Iterable,
     Optional,
+    Tuple,
     Type,
     Hashable,
     TypeVar,
@@ -635,5 +637,207 @@ def index_key(self) -> int:
         return self.tid
 
 
+class Grid:
+    """
+    Regular grid with a grid configuration dependent on the image size.
+    It is a data structure used to retrieve grid-related objects such as grid
+    cells from the image. Grid itself doesn't store any data.
+    Based the image size and the grid shape,
+    we compute the height and the width of grid cells.
+    For example, if the image size (image_height,image_width) is (640, 480)
+    and the grid shape (height, width) is (2, 3)
+    the size of grid cells (self.c_h, self.c_w) will be (320, 160).
+    However, when the image size is not divisible by the grid shape, we round
+    up the resulting size(floating number) to an integer.
+    In this way, as each grid
+    cell taking one more pixel, we make the last grid cell per column and row
+    size(height and width) to be the remainder of the image size divided by the
+    grid cell size which is smaller than other grid cell.
+    For example, if the image
+    size is (128, 128) and the grid shape is (13, 13), the first 11 grid cells
+    per column and row will have a size of (10, 10) since 128/13=9.85, so we
+    round up to 10. The last grid cell per column and row will have a size of
+    (8, 8) since 128%10=8.
+    We require each grid to be bounded/intialized with one image size since
+    the number of different image shapes are limited per computer vision task.
+    For example, we can only have one image size (640, 480) from a CV dataset,
+    and we could augment the dataset with few other image sizes
+    (320, 240), (480, 640). Then there are only three image sizes.
+    Therefore, it won't be troublesome to
+    have a grid for each image size, and we can check the image size during the
+    initialization of the grid.
+    By contrast, if the grid is totally "free-form"
+    that we don't initialize it with any
+    image size and pass the image size directly into the method/operation on
+    the fly, the API would be more complex and image size check would be
+    repeated everytime the method is called.
+    Args:
+        height: the number of grid cell per column, the unit is one grid cell.
+        width: the number of grid cell per row, the unit is one grid cell.
+        image_height: the number of pixels per column in the image.
+        image_width: the number of pixels per row in the image.
+    """
+
+    def __init__(
+        self,
+        height: int,
+        width: int,
+        image_height: int,
+        image_width: int,
+    ):
+        if image_height <= 0 or image_width <= 0:
+            raise ValueError(
+                "both image height and width must be positive"
+                f"but the image shape is {(image_height, image_width)}"
+                "please input a valid image shape"
+            )
+        if height <= 0 or width <= 0:
+            raise ValueError(
+                f"height({height}) and "
+                f"width({width}) both must be larger than 0"
+            )
+        if height >= image_height or width >= image_width:
+            raise ValueError(
+                "Grid height and width must be smaller than image height and width"
+            )
+
+        self._height = height
+        self._width = width
+
+        self._image_height = image_height
+        self._image_width = image_width
+
+        # if the resulting size of grid is not an integer, we round it up.
+        # The last grid cell per row and column might be out of the image size
+        # since we constrain the maximum pixel locations by the image size
+        self.c_h, self.c_w = (
+            math.ceil(image_height / self._height),
+            math.ceil(image_width / self._width),
+        )
+
+        if self.c_h <= 0 or self.c_w <= 0:
+            raise ValueError(
+                "cell height and width must be positive"
+                f"but the cell shape is {(self.c_h, self.c_w)}"
+                "please adjust image shape or grid shape accordingly"
+            )
+
+    def get_grid_cell(self, img_arr: np.ndarray, h_idx: int, w_idx: int):
+        """
+        Get the array data of a grid cell from image of the image payload index.
+        The array is a masked version of the original image, and it has
+        the same size as the original image. The array entries that are not
+        within the grid cell will masked as zeros. The image array entries that
+        are within the grid cell will kept.
+        Note: all indices are zero-based and counted from top left corner of
+        the image.
+        Args:
+            img_arr: image data represented as a numpy array.
+            h_idx: the zero-based height(row) index of the grid cell in the
+                grid, the unit is one grid cell.
+            w_idx: the zero-based width(column) index of the grid cell in the
+                grid, the unit is one grid cell.
+        Raises:
+            ValueError: ``h_idx`` is out of the range specified by ``height``.
+            ValueError: ``w_idx`` is out of the range specified by ``width``.
+        Returns:
+            numpy array that represents the grid cell.
+        """
+        if not 0 <= h_idx < self._height:
+            raise ValueError(
+                f"input parameter h_idx ({h_idx}) is"
+                "out of scope of h_idx range"
+                f" {(0, self._height)}"
+            )
+        if not 0 <= w_idx < self._width:
+            raise ValueError(
+                f"input parameter w_idx ({w_idx}) is"
+                "out of scope of w_idx range"
+                f" {(0, self._width)}"
+            )
+        # initialize a numpy zeros array
+        array = np.zeros((self._image_height, self._image_width))
+        # set grid cell entry values to the values of the original image array
+        # (entry values outside of grid cell remain zeros)
+        # An example of computing grid height index range is
+        # index * cell height : min((index + 1) * cell height, image_height).
+        # It's similar for computing cell width index range
+        # Plus, we constrain the maximum pixel locations by the image size as
+        # the last grid cell per row and column might be out of the image size
+        array[
+            h_idx * self.c_h : min((h_idx + 1) * self.c_h, self._image_height),
+            w_idx * self.c_w : min((w_idx + 1) * self.c_w, self._image_width),
+        ] = img_arr[
+            h_idx * self.c_h : min((h_idx + 1) * self.c_h, self._image_height),
+            w_idx * self.c_w : min((w_idx + 1) * self.c_w, self._image_width),
+        ]
+        return array
+
+    def get_grid_cell_center(self, h_idx: int, w_idx: int) -> Tuple[int, int]:
+        """
+        Get the center pixel position of the grid cell at the specific height
+        index and width index in the ``Grid``.
+        The computation of the center position of the grid cell is
+        dividing the grid cell height range and width range by 2 (round down)
+        Suppose an extreme case that a grid cell has a height range of (0, 3)
+        and a width range of (0, 3) the grid cell center would be (1, 1).
+        Since the grid cell size is usually very large,
+        the offset of the grid cell center is minor.
+        Note: all indices are zero-based and counted from top left corner of
+        the grid.
+        Args:
+            h_idx: the height(row) index of the grid cell in the grid,
+                the unit is one grid cell.
+            w_idx: the width(column) index of the grid cell in the
+                grid, the unit is one grid cell.
+        Returns:
+            A tuple of (y index, x index)
+        """
+
+        return (
+            (h_idx * self.c_h + min((h_idx + 1) * self.c_h, self._image_height))
+            // 2,
+            (w_idx * self.c_w + min((w_idx + 1) * self.c_w, self._image_width))
+            // 2,
+        )
+
+    @property
+    def num_grid_cells(self):
+        return self._height * self._width
+
+    @property
+    def height(self):
+        return self._height
+
+    @property
+    def width(self):
+        return self._width
+
+    def __repr__(self):
+        return str(
+            (self._height, self._width, self._image_height, self._image_width)
+        )
+
+    def __eq__(self, other):
+        if other is None:
+            return False
+        return (
+            self._height,
+            self._width,
+            self._image_height,
+            self._image_width,
+        ) == (
+            other._height,
+            other._width,
+            other.image_height,
+            other.image_width,
+        )
+
+    def __hash__(self):
+        return hash(
+            (self._height, self._width, self._image_height, self._image_width)
+        )
+
+
 GroupType = TypeVar("GroupType", bound=BaseGroup)
 LinkType = TypeVar("LinkType", bound=BaseLink)