Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Error when evaluation in parallel environments #36

Closed
MasterXiong opened this issue Sep 3, 2024 · 3 comments
Closed

Error when evaluation in parallel environments #36

MasterXiong opened this issue Sep 3, 2024 · 3 comments

Comments

@MasterXiong
Copy link

Hi,

I'm trying to evaluate an Octo-based policy in several SimplerEnv environments in parallel to speed up the evaluation process. I generally use Python's built-in multiprocessing module. A minimal code example is below:

import numpy as np
from multiprocessing import Process, Pipe
import simpler_env

def worker(remote, parent_remote, env_name):
    """Serve one environment over a pipe until told to close.

    Runs in a child process. Creates the environment named ``env_name`` and
    then answers ``(cmd, data)`` messages received on ``remote``:

    * ``('step', action)``  -> replies with the five values returned by
      ``env.step(action)``.
    * ``('reset', None)``   -> replies with ``(obs, reset_info)``.
    * ``('close', None)``   -> stops the loop; no reply is sent.

    Args:
        remote: This worker's end of the pipe.
        parent_remote: The parent's end of the same pipe; closed immediately
            because the worker never uses it.
        env_name: Name passed to ``simpler_env.make``.

    Raises:
        NotImplementedError: If an unknown command is received. Failing
            loudly closes the pipe, so a parent waiting on ``recv()`` sees
            EOF instead of blocking forever.
    """
    parent_remote.close()  # Close the parent end of the pipe
    env = simpler_env.make(env_name)
    try:
        while True:
            try:
                cmd, data = remote.recv()
            except EOFError:
                # The parent end was closed without a 'close' command
                # (e.g. the parent crashed); shut down cleanly.
                break
            if cmd == 'step':
                obs, reward, success, truncated, info = env.step(data)
                remote.send((obs, reward, success, truncated, info))
            elif cmd == 'reset':
                obs, reset_info = env.reset()
                remote.send((obs, reset_info))
            elif cmd == 'close':
                break
            else:
                raise NotImplementedError(f"unknown worker command: {cmd!r}")
    finally:
        # Always release the environment and the pipe, whichever way the
        # loop ended (close command, parent gone, or an exception).
        env.close()
        remote.close()

class ParallelEnvs:
    """Run ``num_envs`` copies of one environment, each in its own process.

    Every environment lives in a child process running ``worker`` and is
    driven through a dedicated pipe. ``step`` and ``reset`` send the command
    to all workers first and only then collect the replies, so the
    environments run concurrently.
    """

    def __init__(self, env_name, num_envs):
        """Start one worker process per environment.

        Args:
            env_name: Environment name forwarded to every worker.
            num_envs: Number of worker processes to start; must be >= 1.

        Raises:
            ValueError: If ``num_envs`` is less than 1.
        """
        if num_envs < 1:
            raise ValueError(f"num_envs must be at least 1, got {num_envs}")
        self.num_envs = num_envs
        self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(num_envs)])
        self.processes = [Process(target=worker, args=(work_remote, remote, env_name))
                          for (work_remote, remote) in zip(self.work_remotes, self.remotes)]
        for p in self.processes:
            p.start()
        for work_remote in self.work_remotes:
            work_remote.close()  # Close the worker end in the main process

    def step(self, actions):
        """Step every environment with its own action.

        Args:
            actions: One action per environment, in worker order.

        Returns:
            ``(obs, reward, success, truncated, info)``, where each element
            is a tuple holding that field for every environment.
        """
        for remote, action in zip(self.remotes, actions):
            remote.send(('step', action))
        results = [remote.recv() for remote in self.remotes]
        obs, reward, success, truncated, info = zip(*results)
        return obs, reward, success, truncated, info

    def reset(self):
        """Reset every environment.

        Returns:
            ``(obs, reset_info)``, where each element is a tuple holding
            that field for every environment.
        """
        for remote in self.remotes:
            remote.send(('reset', None))
        results = [remote.recv() for remote in self.remotes]
        # Each reply is an (obs, reset_info) pair; transpose the list of
        # pairs into a pair of tuples, the same way step() does.
        obs, reset_info = zip(*results)
        return obs, reset_info

    def close(self):
        """Ask every worker to shut down, wait for them, and close the pipes."""
        for remote in self.remotes:
            try:
                remote.send(('close', None))
            except OSError:
                # Best effort: a worker that already died has a broken pipe;
                # keep going so the remaining workers are still told to stop.
                pass
        for p in self.processes:
            p.join()
        for remote in self.remotes:
            remote.close()


if __name__ == "__main__":
    # Reproduction script: start several environments in worker processes
    # and drive them with randomly sampled actions.

    env_name = "google_robot_pick_coke_can"
    num_envs = 4  # Number of parallel environments
    num_steps = 100
    # Local environment used only to sample random actions from its action space.
    dummy_env = simpler_env.make(env_name)

    # Initialize parallel environments
    envs = ParallelEnvs(env_name, num_envs)

    try:
        # Reset all environments
        obs, reset_info = envs.reset()
        breakpoint()  # inspection point kept from the original report

        for step in range(num_steps):
            # Get actions from the policy
            actions = [dummy_env.action_space.sample() for _ in range(num_envs)]

            # Step all environments with the actions
            obs, reward, success, truncated, info = envs.step(actions)
            breakpoint()  # inspection point kept from the original report
    finally:
        # Close the environments even if reset/step raised, so the worker
        # processes are told to stop and are joined.
        envs.close()

But I got the following error when initializing multiple environments:

  File "/user/fine-tune/test.py", line 15, in worker
    env = simpler_env.make(env_name)
  File "/SimplerEnv/simpler_env/__init__.py", line 78, in make
    env = gym.make(env_name, obs_mode="rgbd", **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/gymnasium/envs/registration.py", line 802, in make
    env = env_creator(**env_spec_kwargs)
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/utils/registration.py", line 92, in make
    env = env_spec.make(**kwargs)
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/utils/registration.py", line 34, in make
    return self.cls(**_kwargs)
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/envs/custom_scenes/grasp_single_in_scene.py", line 630, in __init__
    super().__init__(**kwargs)
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/envs/custom_scenes/grasp_single_in_scene.py", line 540, in __init__
    super().__init__(**kwargs)
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/envs/custom_scenes/grasp_single_in_scene.py", line 64, in __init__
    super().__init__(**kwargs)
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/envs/custom_scenes/base_env.py", line 134, in __init__
    super().__init__(**kwargs)
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/envs/sapien_env.py", line 188, in __init__
    obs, _ = self.reset(seed=2022, options=dict(reconfigure=True))
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/envs/custom_scenes/grasp_single_in_scene.py", line 585, in reset
    obs, info = super().reset(seed=self._episode_seed, options=options)
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/envs/custom_scenes/grasp_single_in_scene.py", line 135, in reset
    obs, info = super().reset(seed=self._episode_seed, options=options)
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/envs/custom_scenes/base_env.py", line 228, in reset
    obs, info = super().reset(seed=seed, options=options)
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/envs/sapien_env.py", line 488, in reset
    return self.get_obs(), {}
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/envs/custom_scenes/base_env.py", line 350, in get_obs
    obs = super().get_obs()
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/envs/sapien_env.py", line 265, in get_obs
    return self._get_obs_images()
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/envs/sapien_env.py", line 312, in _get_obs_images
    self.take_picture()
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/envs/sapien_env.py", line 289, in take_picture
    cam.take_picture()
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/sensors/camera.py", line 187, in take_picture
    self.camera.take_picture()
RuntimeError: vk::Device::waitForFences: ErrorDeviceLost

Could you help take a look at what the issue is here? Or what is the right way to parallelize SimplerEnv environments? Thanks for your help!

P.S. This link may be relevant to my issue here.

@StoneT2000
Copy link
Collaborator

You can follow the progress here: #38

main blocker is making octo accept batched inputs atm

@MasterXiong
Copy link
Author

@StoneT2000 Thanks for your help! The GPU simulation currently only supports widowx tasks, right?

@StoneT2000
Copy link
Collaborator

Yes only the widowx robot. We don't have an implementation of a GPU parallelized version of the google robot's controller. I know it's possible but currently don't have time to tackle that problem just yet.

@github-staff github-staff deleted a comment from maher-nakesh Oct 1, 2024
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants