dataset_states_to_obs.py with camera depths #99

Open · wants to merge 22 commits into base: master
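Overview: this PR threads a new camera_depths option through EnvRobosuite, converts the rendered depth buffers to metric depth, saves per-camera intrinsics/extrinsics alongside the usual observations, and adds a PointNet-based "spatial" encoder config for consuming the resulting point clouds. A minimal sketch of how the new kwarg might be exercised — the call below mirrors the create_for_data_processing() diff further down, but the concrete argument values (and the updated dataset_states_to_obs.py call site) are assumptions, since the script itself is not shown in this diff:

from robomimic.envs.env_robosuite import EnvRobosuite

# Hypothetical usage: "Lift" stands in for whatever env_name the source hdf5 specifies.
env = EnvRobosuite.create_for_data_processing(
    env_name="Lift",
    camera_names=["agentview"],
    camera_height=84,
    camera_width=84,
    reward_shaping=False,
    camera_depths=True,  # new kwarg, consumed via kwargs['camera_depths'] below
)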
10 changes: 10 additions & 0 deletions robomimic/config/base_config.py
@@ -272,6 +272,16 @@ def observation_config(self):
        self.observation.encoder.scan.core_kwargs = Config()  # See models/obs_core.py for important kwargs to set and defaults used
        self.observation.encoder.scan.core_kwargs.do_not_lock_keys()

        # =============== Spatial default encoder (pointnet) ===============
        self.observation.encoder.spatial = deepcopy(self.observation.encoder.rgb)

        # Spatial: modify the core class + kwargs; otherwise this is the same as the rgb encoder
        self.observation.encoder.spatial.core_class = "SpatialCore"  # Default core class (PointNet-based) used to process this modality
        # self.observation.encoder.spatial.core_class = "SpatialTest"  # alternative core class
        # self.observation.encoder.spatial.core_class = "SparseTransformer"  # alternative core class
        self.observation.encoder.spatial.core_kwargs = Config()  # See models/obs_core.py for important kwargs to set and defaults used
        self.observation.encoder.spatial.core_kwargs.do_not_lock_keys()

    def meta_config(self):
        """
        This function populates the `config.meta` attribute of the config. This portion of the config
20 changes: 19 additions & 1 deletion robomimic/envs/env_robosuite.py
@@ -7,7 +7,9 @@
import numpy as np
from copy import deepcopy

import mujoco
import robosuite
from robosuite.utils.camera_utils import get_real_depth_map, get_camera_extrinsic_matrix, get_camera_intrinsic_matrix
try:
    # this is needed for ensuring robosuite can find the additional mimicgen environments (see https://mimicgen.github.io)
    import mimicgen_envs
@@ -70,7 +72,7 @@ def __init__(
            ignore_done=True,
            use_object_obs=True,
            use_camera_obs=use_image_obs,
            camera_depths=kwargs.get('camera_depths', False),  # was hardcoded to False; .get keeps that default when the kwarg is absent
        )
        kwargs.update(update_kwargs)

@@ -201,9 +203,21 @@ def get_observation(self, di=None):
                ret[k] = di[k][::-1]
                if self.postprocess_visual_obs:
                    ret[k] = ObsUtils.process_obs(obs=ret[k], obs_key=k)
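            # depth observations: flip vertically like rgb, then convert the normalized z-buffer to metric depth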
            if (k in ObsUtils.OBS_KEYS_TO_MODALITIES) and ObsUtils.key_is_obs_modality(key=k, obs_modality="depth"):
                ret[k] = di[k][::-1]
                ret[k] = get_real_depth_map(self.env.sim, ret[k])
                if self.postprocess_visual_obs:
                    ret[k] = ObsUtils.process_obs(obs=ret[k], obs_key=k)

        # "object" key contains object information
        ret["object"] = np.array(di["object-state"])

        # save camera intrinsics (3x3 pinhole matrix) and extrinsics (4x4 camera-to-world pose)
        for cam_idx, camera_name in enumerate(self.env.camera_names):
            cam_height = self.env.camera_heights[cam_idx]
            cam_width = self.env.camera_widths[cam_idx]
            ret[f'{camera_name}_extrinsic'] = get_camera_extrinsic_matrix(self.env.sim, camera_name)
            ret[f'{camera_name}_intrinsic'] = get_camera_intrinsic_matrix(self.env.sim, camera_name, cam_height, cam_width)

        if self._is_v1:
            for robot in self.env.robots:
@@ -357,6 +371,8 @@ def create_for_data_processing(
        image_modalities = list(camera_names)
        depth_modalities = []  # stays empty unless depth rendering is enabled below
        if is_v1:
            image_modalities = ["{}_image".format(cn) for cn in camera_names]
            if kwargs.get('camera_depths', False):
                depth_modalities = ["{}_depth".format(cn) for cn in camera_names]
        elif has_camera:
            # v0.3 only had support for one image, and it was named "rgb"
            assert len(image_modalities) == 1
@@ -367,6 +383,8 @@
"rgb": image_modalities,
}
}
if kwargs['camera_depths']:
obs_modality_specs['obs']['depth'] = depth_modalities
ObsUtils.initialize_obs_utils_with_obs_specs(obs_modality_specs)

        # note that @postprocess_visual_obs is False since this env's images will be written to a dataset
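Since get_observation now emits {camera}_depth, {camera}_intrinsic, and {camera}_extrinsic per camera, each frame can be back-projected into a world-frame point cloud for the spatial encoder. A minimal numpy sketch — the helper below is illustrative, not part of this PR, and it assumes robosuite's convention that get_camera_extrinsic_matrix returns a camera-to-world homogeneous transform:

import numpy as np

def depth_to_world_points(depth, intrinsic, extrinsic):
    """Back-project a metric depth map (H, W, 1) into world-frame points (H*W, 3)."""
    H, W = depth.shape[:2]
    u, v = np.meshgrid(np.arange(W), np.arange(H))  # pixel grid
    z = depth.reshape(H, W)
    # pinhole model: recover camera-frame x/y from pixel coordinates and depth
    x = (u - intrinsic[0, 2]) * z / intrinsic[0, 0]
    y = (v - intrinsic[1, 2]) * z / intrinsic[1, 1]
    pts_cam = np.stack([x, y, z, np.ones_like(z)], axis=-1).reshape(-1, 4)
    return (pts_cam @ extrinsic.T)[:, :3]  # homogeneous transform into the world frame

# usage with the keys written by get_observation above:
# pts = depth_to_world_points(obs["agentview_depth"], obs["agentview_intrinsic"], obs["agentview_extrinsic"])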
233 changes: 233 additions & 0 deletions robomimic/models/base_nets.py
@@ -15,6 +15,9 @@
from torchvision import transforms
from torchvision import models as vision_models

import robomimic.utils.tensor_utils as TensorUtils


@@ -1109,3 +1112,233 @@ def forward(self, x):
            # weighted mean-pooling
            return torch.sum(x * self.agg_weight, dim=1)
        raise Exception("unexpected agg type: {}".format(self.agg_type))

class Conv1dBNReLU(Module):
    """Applies a 1D convolution over an input signal composed of several input planes,
    optionally followed by batch normalization and ReLU activation.
    """

    def __init__(
        self, in_channels, out_channels, kernel_size, relu=True, bn=True, **kwargs
    ):
        super().__init__()

        self.in_channels = in_channels
        self.out_channels = out_channels

        self.conv = nn.Conv1d(
            in_channels, out_channels, kernel_size, bias=(not bn), **kwargs
        )
        self.bn = nn.BatchNorm1d(out_channels) if bn else None
        self.relu = nn.ReLU(inplace=True) if relu else None

    def forward(self, x):
        x = self.conv(x)
        if self.bn is not None:
            x = self.bn(x)
        if self.relu is not None:
            x = self.relu(x)
        return x


class Conv2dBNReLU(Module):
    """Applies a 2D convolution (optionally with batch normalization and ReLU activation)
    over an input signal composed of several input planes.
    """

    def __init__(
        self, in_channels, out_channels, kernel_size, relu=True, bn=True, **kwargs
    ):
        super().__init__()

        self.in_channels = in_channels
        self.out_channels = out_channels

        self.conv = nn.Conv2d(
            in_channels, out_channels, kernel_size, bias=(not bn), **kwargs
        )
        self.bn = nn.BatchNorm2d(out_channels) if bn else None
        self.relu = nn.ReLU(inplace=True) if relu else None

    def forward(self, x):
        x = self.conv(x)
        if self.bn is not None:
            x = self.bn(x)
        if self.relu is not None:
            x = self.relu(x)
        return x


class LinearBNReLU(Module):
    """Applies a linear transformation to the incoming data,
    optionally followed by batch normalization and ReLU activation.
    """

    def __init__(self, in_channels, out_channels, relu=True, bn=True):
        super().__init__()

        self.in_channels = in_channels
        self.out_channels = out_channels

        self.fc = nn.Linear(in_channels, out_channels, bias=(not bn))
        self.bn = nn.BatchNorm1d(out_channels) if bn else None
        self.relu = nn.ReLU(inplace=True) if relu else None

    def forward(self, x):
        x = self.fc(x)
        if self.bn is not None:
            x = self.bn(x)
        if self.relu is not None:
            x = self.relu(x)
        return x

def mlp_bn_relu(in_channels, out_channels_list):
    """Stack of Linear + BatchNorm1d + ReLU blocks for flat (B, C) features."""
    c_in = in_channels
    layers = []
    for c_out in out_channels_list:
        layers.append(LinearBNReLU(c_in, c_out, relu=True, bn=True))
        c_in = c_out
    return nn.Sequential(*layers)


def mlp_relu(in_channels, out_channels_list):
    """Stack of Linear + ReLU blocks (no batch norm)."""
    c_in = in_channels
    layers = []
    for c_out in out_channels_list:
        layers.append(LinearBNReLU(c_in, c_out, relu=True, bn=False))
        c_in = c_out
    return nn.Sequential(*layers)


def mlp1d_bn_relu(in_channels, out_channels_list):
    """Stack of pointwise (kernel size 1) Conv1d + BatchNorm1d + ReLU blocks for (B, C, N) inputs."""
    c_in = in_channels
    layers = []
    for c_out in out_channels_list:
        layers.append(Conv1dBNReLU(c_in, c_out, 1, relu=True))
        c_in = c_out
    return nn.Sequential(*layers)


def mlp1d_relu(in_channels, out_channels_list):
    """Stack of pointwise Conv1d + ReLU blocks (no batch norm)."""
    c_in = in_channels
    layers = []
    for c_out in out_channels_list:
        layers.append(Conv1dBNReLU(c_in, c_out, 1, relu=True, bn=False))
        c_in = c_out
    return nn.Sequential(*layers)


def mlp2d_bn_relu(in_channels, out_channels_list):
    """Stack of pointwise (1x1) Conv2d + BatchNorm2d + ReLU blocks for (B, C, H, W) inputs."""
    c_in = in_channels
    layers = []
    for c_out in out_channels_list:
        layers.append(Conv2dBNReLU(c_in, c_out, 1, relu=True))
        c_in = c_out
    return nn.Sequential(*layers)


def mlp2d_relu(in_channels, out_channels_list):
    """Stack of pointwise 1x1 Conv2d + ReLU blocks (no batch norm)."""
    c_in = in_channels
    layers = []
    for c_out in out_channels_list:
        layers.append(Conv2dBNReLU(c_in, c_out, 1, relu=True, bn=False))
        c_in = c_out
    return nn.Sequential(*layers)
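A quick shape check for the factories above (illustrative usage, not part of the PR):

# mlp1d_* variants act pointwise on (B, C, N) tensors; mlp_* variants act on flat (B, C) features
mlp = mlp1d_bn_relu(3, (64, 128))
x = torch.rand(4, 3, 100)  # (B, C, N)
print(mlp(x).shape)        # torch.Size([4, 128, 100])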

class PointNet(Module):
    """PointNet for classification.
    Notes:
        1. The original implementation includes dropout for global MLPs.
        2. The original implementation decays the BN momentum.
    """

    def __init__(
        self,
        in_channels=3,
        local_channels=(64, 64, 64, 128, 1024),
        global_channels=(512, 256),
        use_bn=True,
    ):
        super().__init__()

        self.in_channels = in_channels
        self.out_channels = (local_channels + global_channels)[-1]
        self.use_bn = use_bn

        if use_bn:
            self.mlp_local = mlp1d_bn_relu(in_channels, local_channels)
            self.mlp_global = mlp_bn_relu(local_channels[-1], global_channels)
        else:
            self.mlp_local = mlp1d_relu(in_channels, local_channels)
            self.mlp_global = mlp_relu(local_channels[-1], global_channels)

        self.reset_parameters()

    def forward_internal(self, points, points_feature=None, points_mask=None) -> dict:
        # points: [B, 3, N]; points_feature: [B, C, N]; points_mask: [B, N]
        if points_feature is not None:
            input_feature = torch.cat([points, points_feature], dim=1)
        else:
            input_feature = points

        local_feature = self.mlp_local(input_feature)
        if points_mask is not None:
            local_feature = torch.where(
                points_mask.unsqueeze(1), local_feature, torch.zeros_like(local_feature)
            )
        # symmetric max-pool over the point dimension gives a permutation-invariant global feature
        global_feature, max_indices = torch.max(local_feature, 2)
        output_feature = self.mlp_global(global_feature)

        return {"feature": output_feature, "max_indices": max_indices}

    def forward(self, feats_points):
        # feats_points: [B, 3 + C, N] -- xyz channels first, then per-point features
        points = feats_points[:, :3, :]
        if feats_points.shape[1] > 3:
            points_feature = feats_points[:, 3:, :]
        else:
            points_feature = None
        return self.forward_internal(points, points_feature)['feature']

    def reset_parameters(self):
        for name, module in self.named_modules():
            if isinstance(module, (nn.Linear, nn.Conv1d, nn.Conv2d)):
                if module.bias is not None:
                    nn.init.zeros_(module.bias)
            if isinstance(module, (nn.BatchNorm1d, nn.BatchNorm2d)):
                module.momentum = 0.01

class PointMLP(Module):
    def __init__(
        self,
    ):
        super().__init__()

        self.model = nn.Sequential(
            nn.Linear(3 * 100 * 2, 512),  # input layer is hardcoded for exactly N = 200 points
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
        )
        print('WARNING: PointMLP is only for testing purposes')

    def forward(self, points):
        # points: [B, 3 + C, N]; only the xyz channels are used
        points = points[:, :3, :]
        B, _, N = points.shape
        assert N == 200, "PointMLP's input layer is hardcoded for N = 200 points"
        points = points.reshape(B, 3 * 100 * 2)
        return self.model(points)

def test_pointnet():
    feat_points = torch.rand(2, 3 + 1024, 1000)  # (B, 3 + C, N)
    point_net = PointNet(3 + 1024)  # in_channels must cover xyz plus the 1024 feature channels
    out = point_net(feat_points)
    print(out.shape)  # torch.Size([2, 256])

if __name__ == '__main__':
    test_pointnet()
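The config diff above points core_class at "SpatialCore", but robomimic/models/obs_core.py is not part of this diff. A minimal sketch of what such a core might look like, assuming robomimic's EncoderCore interface (subclasses self-register with ObsUtils); the class body here is an assumption, not the PR's actual implementation:

from robomimic.models.obs_core import EncoderCore
from robomimic.models.base_nets import PointNet

class SpatialCore(EncoderCore):
    """Hypothetical encoder core wrapping PointNet for point-cloud ("spatial") observations."""

    def __init__(self, input_shape, **pointnet_kwargs):
        super(SpatialCore, self).__init__(input_shape=input_shape)
        # input_shape is assumed to be (3 + C, N): xyz plus per-point feature channels
        self.nets = PointNet(in_channels=input_shape[0], **pointnet_kwargs)

    def output_shape(self, input_shape=None):
        return [self.nets.out_channels]

    def forward(self, inputs):
        return self.nets(inputs)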