diff --git a/algorithms/minari/any_percent_bc.py b/algorithms/minari/any_percent_bc.py new file mode 100644 index 00000000..ff3d43c1 --- /dev/null +++ b/algorithms/minari/any_percent_bc.py @@ -0,0 +1,369 @@ +import contextlib +import os +import random +import uuid +from dataclasses import asdict, dataclass +from typing import Any, Dict, List, Optional, Tuple, Union + +import gymnasium as gym +import minari +import numpy as np +import pyrallis +import torch +import torch.nn as nn +import torch.nn.functional as F +import wandb +from tqdm.auto import trange + +TensorBatch = List[torch.Tensor] +DEVICE = "cuda" if torch.cuda.is_available() else "cpu" + + +@dataclass +class TrainConfig: + # wandb params + project: str = "CORL" + group: str = "BC-Minari" + name: str = "bc" + # model params + gamma: float = 0.99 # Discount factor + top_fraction: float = 0.1 # Best data fraction to use + # training params + dataset_id: str = "pen-human-v1" # Minari remote dataset name + update_steps: int = int(1e6) # Total training networks updates + buffer_size: int = 2_000_000 # Replay buffer size + batch_size: int = 256 # Batch size for all networks + normalize_state: bool = True # Normalize states + # evaluation params + eval_every: int = int(5e3) # How often (time steps) we evaluate + eval_episodes: int = 10 # How many episodes run during evaluation + # general params + train_seed: int = 0 + eval_seed: int = 0 + checkpoints_path: Optional[str] = None # Save path + + def __post_init__(self): + self.name = f"{self.name}-{self.dataset_id}-{str(uuid.uuid4())[:8]}" + if self.checkpoints_path is not None: + self.checkpoints_path = os.path.join(self.checkpoints_path, self.name) + + +def set_seed(seed: int, deterministic_torch: bool = False): + os.environ["PYTHONHASHSEED"] = str(seed) + np.random.seed(seed) + random.seed(seed) + torch.manual_seed(seed) + torch.use_deterministic_algorithms(deterministic_torch) + + +def compute_mean_std(states: np.ndarray, eps: float) -> Tuple[np.ndarray, np.ndarray]: + mean = states.mean(0) + std = states.std(0) + eps + return mean, std + + +def normalize_states(states: np.ndarray, mean: np.ndarray, std: np.ndarray): + return (states - mean) / std + + +def wrap_env( + env: gym.Env, + state_mean: Union[np.ndarray, float] = 0.0, + state_std: Union[np.ndarray, float] = 1.0, + reward_scale: float = 1.0, +) -> gym.Env: + # PEP 8: E731 do not assign a lambda expression, use a def + def normalize_state(state): + # epsilon should be already added in std. + return (state - state_mean) / state_std + + def scale_reward(reward): + # Please be careful, here reward is multiplied by scale! 
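+ # Note: the TransformReward wrapper below is applied only when reward_scale != 1.0.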
+ return reward_scale * reward + + env = gym.wrappers.TransformObservation(env, normalize_state) + if reward_scale != 1.0: + env = gym.wrappers.TransformReward(env, scale_reward) + return env + + +def discounted_return(x: np.ndarray, gamma: float) -> np.ndarray: + total_return = x[-1] + for t in reversed(range(x.shape[0] - 1)): + total_return = x[t] + gamma * total_return + return total_return + + +def best_trajectories_ids( + dataset: minari.MinariDataset, top_fraction: float, gamma: float +) -> List[int]: + ids_and_return = [ + (episode.id, discounted_return(episode.rewards, gamma)) for episode in dataset + ] + ids_and_returns = sorted(ids_and_return, key=lambda t: -t[1]) + + top_ids = [id for (id, r) in ids_and_returns] + top_ids = top_ids[: max(1, int(top_fraction * len(ids_and_returns)))] + assert len(top_ids) > 0 + return top_ids + + +# WARN: this will load full dataset in memory (which is OK for D4RL datasets) +def qlearning_dataset( + dataset: minari.MinariDataset, traj_ids: List[int] +) -> Dict[str, np.ndarray]: + obs, next_obs, actions, rewards, dones = [], [], [], [], [] + + for episode in dataset.iterate_episodes(episode_indices=traj_ids): + obs.append(episode.observations[:-1].astype(np.float32)) + next_obs.append(episode.observations[1:].astype(np.float32)) + actions.append(episode.actions.astype(np.float32)) + rewards.append(episode.rewards) + dones.append(episode.terminations) + + return { + "observations": np.concatenate(obs), + "actions": np.concatenate(actions), + "next_observations": np.concatenate(next_obs), + "rewards": np.concatenate(rewards), + "terminals": np.concatenate(dones), + } + + +class ReplayBuffer: + def __init__( + self, + state_dim: int, + action_dim: int, + buffer_size: int, + device: str = "cpu", + ): + self._buffer_size = buffer_size + self._pointer = 0 + self._size = 0 + + self._states = torch.zeros( + (buffer_size, state_dim), dtype=torch.float32, device=device + ) + self._actions = torch.zeros( + (buffer_size, action_dim), dtype=torch.float32, device=device + ) + self._rewards = torch.zeros((buffer_size, 1), dtype=torch.float32, device=device) + self._next_states = torch.zeros( + (buffer_size, state_dim), dtype=torch.float32, device=device + ) + self._dones = torch.zeros((buffer_size, 1), dtype=torch.float32, device=device) + self._device = device + + def _to_tensor(self, data: np.ndarray) -> torch.Tensor: + return torch.tensor(data, dtype=torch.float32, device=self._device) + + # Loads data in d4rl format, i.e. from Dict[str, np.array] after q_learning_dataset. + def load_dataset(self, data: Dict[str, np.ndarray]): + if self._size != 0: + raise ValueError("Trying to load data into non-empty replay buffer") + + n_transitions = data["observations"].shape[0] + if n_transitions > self._buffer_size: + raise ValueError( + "Replay buffer is smaller than the dataset you are trying to load!" 
+ ) + self._states[:n_transitions] = self._to_tensor(data["observations"]) + self._actions[:n_transitions] = self._to_tensor(data["actions"]) + self._rewards[:n_transitions] = self._to_tensor(data["rewards"][..., None]) + self._next_states[:n_transitions] = self._to_tensor(data["next_observations"]) + self._dones[:n_transitions] = self._to_tensor(data["terminals"][..., None]) + + self._size = self._pointer = n_transitions + print(f"Dataset size: {n_transitions}") + + def sample(self, batch_size: int) -> TensorBatch: + indices = np.random.randint(0, min(self._size, self._pointer), size=batch_size) + states = self._states[indices] + actions = self._actions[indices] + rewards = self._rewards[indices] + next_states = self._next_states[indices] + dones = self._dones[indices] + return [states, actions, rewards, next_states, dones] + + def add_transition(self): + # Use this method to add new data into the replay buffer during fine-tuning. + # I left it unimplemented since now we do not do fine-tuning. + raise NotImplementedError + + +class Actor(nn.Module): + def __init__(self, state_dim: int, action_dim: int, max_action: float): + super(Actor, self).__init__() + self.net = nn.Sequential( + nn.Linear(state_dim, 256), + nn.ReLU(), + nn.Linear(256, 256), + nn.ReLU(), + nn.Linear(256, action_dim), + nn.Tanh(), + ) + self.max_action = max_action + + def forward(self, state: torch.Tensor) -> torch.Tensor: + return self.max_action * self.net(state) + + @torch.no_grad() + def act(self, state: np.ndarray, device: str = "cpu") -> np.ndarray: + state = torch.tensor(state.reshape(1, -1), device=device, dtype=torch.float32) + return self(state).cpu().data.numpy().flatten() + + +class BC: + def __init__( + self, + max_action: float, + actor: nn.Module, + actor_optimizer: torch.optim.Optimizer, + device: str = "cpu", + ): + self.actor = actor + self.actor_optimizer = actor_optimizer + self.max_action = max_action + self.device = device + + def train(self, batch: TensorBatch) -> Dict[str, float]: + log_dict = {} + state, action, _, _, _ = batch + + # Compute actor loss + pi = self.actor(state) + actor_loss = F.mse_loss(pi, action) + log_dict["actor_loss"] = actor_loss.item() + # Optimize the actor + self.actor_optimizer.zero_grad() + actor_loss.backward() + self.actor_optimizer.step() + + return log_dict + + def state_dict(self) -> Dict[str, Any]: + return { + "actor": self.actor.state_dict(), + "actor_optimizer": self.actor_optimizer.state_dict(), + } + + def load_state_dict(self, state_dict: Dict[str, Any]): + self.actor.load_state_dict(state_dict["actor"]) + self.actor_optimizer.load_state_dict(state_dict["actor_optimizer"]) + + +@torch.no_grad() +def evaluate( + env: gym.Env, actor: nn.Module, num_episodes: int, seed: int, device: str +) -> np.ndarray: + actor.eval() + episode_rewards = [] + for i in range(num_episodes): + done = False + state, info = env.reset(seed=seed + i) + + episode_reward = 0.0 + while not done: + action = actor.act(state, device) + state, reward, terminated, truncated, info = env.step(action) + done = terminated or truncated + episode_reward += reward + episode_rewards.append(episode_reward) + + actor.train() + return np.asarray(episode_rewards) + + +@pyrallis.wrap() +def train(config: TrainConfig): + wandb.init( + config=asdict(config), + project=config.project, + group=config.group, + name=config.name, + id=str(uuid.uuid4()), + save_code=True, + ) + minari.download_dataset(config.dataset_id) + dataset = minari.load_dataset(config.dataset_id) + + eval_env = 
dataset.recover_environment() + state_dim = eval_env.observation_space.shape[0] + action_dim = eval_env.action_space.shape[0] + max_action = float(eval_env.action_space.high[0]) + + qdataset = qlearning_dataset( + dataset=dataset, + traj_ids=best_trajectories_ids(dataset, config.top_fraction, config.gamma), + ) + if config.normalize_state: + state_mean, state_std = compute_mean_std(qdataset["observations"], eps=1e-3) + else: + state_mean, state_std = 0, 1 + + qdataset["observations"] = normalize_states( + qdataset["observations"], state_mean, state_std + ) + qdataset["next_observations"] = normalize_states( + qdataset["next_observations"], state_mean, state_std + ) + eval_env = wrap_env(eval_env, state_mean=state_mean, state_std=state_std) + replay_buffer = ReplayBuffer( + state_dim, + action_dim, + config.buffer_size, + DEVICE, + ) + replay_buffer.load_dataset(qdataset) + + if config.checkpoints_path is not None: + print(f"Checkpoints path: {config.checkpoints_path}") + os.makedirs(config.checkpoints_path, exist_ok=True) + with open(os.path.join(config.checkpoints_path, "config.yaml"), "w") as f: + pyrallis.dump(config, f) + + # Set seed + set_seed(config.train_seed) + + actor = Actor(state_dim, action_dim, max_action).to(DEVICE) + actor_optimizer = torch.optim.Adam(actor.parameters(), lr=3e-4) + + trainer = BC( + max_action=max_action, + actor=actor, + actor_optimizer=actor_optimizer, + device=DEVICE, + ) + + for step in trange(config.update_steps): + batch = [b.to(DEVICE) for b in replay_buffer.sample(config.batch_size)] + log_dict = trainer.train(batch) + + wandb.log(log_dict, step=step) + + if (step + 1) % config.eval_every == 0: + eval_scores = evaluate( + env=eval_env, + actor=actor, + num_episodes=config.eval_episodes, + seed=config.eval_seed, + device=DEVICE, + ) + wandb.log({"evaluation_return": eval_scores.mean()}, step=step) + # optional normalized score logging, only if dataset has reference scores + with contextlib.suppress(ValueError): + normalized_score = ( + minari.get_normalized_score(dataset, eval_scores).mean() * 100 + ) + wandb.log({"normalized_score": normalized_score}, step=step) + + if config.checkpoints_path is not None: + torch.save( + trainer.state_dict(), + os.path.join(config.checkpoints_path, f"checkpoint_{step}.pt"), + ) + + +if __name__ == "__main__": + train() diff --git a/algorithms/minari/iql.py b/algorithms/minari/iql.py new file mode 100644 index 00000000..4f449622 --- /dev/null +++ b/algorithms/minari/iql.py @@ -0,0 +1,643 @@ +# source: https://github.com/gwthomas/IQL-PyTorch +# https://arxiv.org/pdf/2110.06169.pdf + +# Implementation TODOs: +# 1. iql_deterministic is true only for 2 datasets. Can we remove it? +# 2. MLP class introduced bugs in the past. We should remove it. +# 3.
Refactor IQL updating code to be more consistent in style +import contextlib +import copy +import os +import random +import uuid +from dataclasses import asdict, dataclass +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import gymnasium as gym +import minari +import numpy as np +import pyrallis +import torch +import torch.nn as nn +import torch.nn.functional as F +import wandb +from torch.distributions import Normal +from torch.optim.lr_scheduler import CosineAnnealingLR +from tqdm.auto import trange + +TensorBatch = List[torch.Tensor] + +DEVICE = "cuda" if torch.cuda.is_available() else "cpu" +EXP_ADV_MAX = 100.0 +LOG_STD_MIN = -20.0 +LOG_STD_MAX = 2.0 + + +@dataclass +class TrainConfig: + # wandb params + project: str = "CORL" + group: str = "IQL-Minari" + name: str = "iql" + # model params + gamma: float = 0.99 # Discount factor + tau: float = 0.005 # Target network update rate + beta: float = 3.0 # Inverse temperature. Small beta -> BC, big beta -> maximizing Q + iql_tau: float = 0.7 # Coefficient for asymmetric loss + iql_deterministic: bool = False # Use deterministic actor + vf_lr: float = 3e-4 # V function learning rate + qf_lr: float = 3e-4 # Critic learning rate + actor_lr: float = 3e-4 # Actor learning rate + actor_dropout: Optional[float] = None # Adroit uses dropout for policy network + # training params + dataset_id: str = "pen-human-v1" # Minari remote dataset name + update_steps: int = int(1e6) # Total training networks updates + buffer_size: int = 2_000_000 # Replay buffer size + batch_size: int = 256 # Batch size for all networks + normalize_state: bool = True # Normalize states + normalize_reward: bool = False # Normalize reward + # evaluation params + eval_every: int = int(5e3) # How often (time steps) we evaluate + eval_episodes: int = 10 # How many episodes run during evaluation + # general params + train_seed: int = 0 + eval_seed: int = 0 + checkpoints_path: Optional[str] = None # Save path + + def __post_init__(self): + self.name = f"{self.name}-{self.dataset_id}-{str(uuid.uuid4())[:8]}" + if self.checkpoints_path is not None: + self.checkpoints_path = os.path.join(self.checkpoints_path, self.name) + + +def set_seed(seed: int, deterministic_torch: bool = False): + os.environ["PYTHONHASHSEED"] = str(seed) + np.random.seed(seed) + random.seed(seed) + torch.manual_seed(seed) + torch.use_deterministic_algorithms(deterministic_torch) + + +def soft_update(target: nn.Module, source: nn.Module, tau: float): + for target_param, source_param in zip(target.parameters(), source.parameters()): + target_param.data.copy_((1 - tau) * target_param.data + tau * source_param.data) + + +def compute_mean_std(states: np.ndarray, eps: float) -> Tuple[np.ndarray, np.ndarray]: + mean = states.mean(0) + std = states.std(0) + eps + return mean, std + + +def normalize_states(states: np.ndarray, mean: np.ndarray, std: np.ndarray): + return (states - mean) / std + + +def wrap_env( + env: gym.Env, + state_mean: Union[np.ndarray, float] = 0.0, + state_std: Union[np.ndarray, float] = 1.0, + reward_scale: float = 1.0, +) -> gym.Env: + # PEP 8: E731 do not assign a lambda expression, use a def + def normalize_state(state): + # epsilon should be already added in std. + return (state - state_mean) / state_std + + def scale_reward(reward): + # Please be careful, here reward is multiplied by scale! 
+ return reward_scale * reward + + env = gym.wrappers.TransformObservation(env, normalize_state) + if reward_scale != 1.0: + env = gym.wrappers.TransformReward(env, scale_reward) + return env + + +# This is how reward normalization among all datasets is done in original IQL +def return_reward_range( + dataset: Dict[str, np.ndarray], max_episode_steps: int +) -> Tuple[float, float]: + returns, lengths = [], [] + ep_ret, ep_len = 0.0, 0 + for r, d in zip(dataset["rewards"], dataset["terminals"]): + ep_ret += float(r) + ep_len += 1 + if d or ep_len == max_episode_steps: + returns.append(ep_ret) + lengths.append(ep_len) + ep_ret, ep_len = 0.0, 0 + lengths.append(ep_len) # but still keep track of number of steps + assert sum(lengths) == len(dataset["rewards"]) + return min(returns), max(returns) + + +def modify_reward( + dataset: Dict[str, np.ndarray], env_name: str, max_episode_steps: int = 1000 +): + if any(s in env_name for s in ("halfcheetah", "hopper", "walker2d")): + min_ret, max_ret = return_reward_range(dataset, max_episode_steps) + dataset["rewards"] /= max_ret - min_ret + dataset["rewards"] *= max_episode_steps + elif "antmaze" in env_name: + dataset["rewards"] -= 1.0 + + +# WARN: this will load full dataset in memory (which is OK for D4RL datasets) +def qlearning_dataset(dataset: minari.MinariDataset) -> Dict[str, np.ndarray]: + obs, next_obs, actions, rewards, dones = [], [], [], [], [] + + for episode in dataset: + obs.append(episode.observations[:-1].astype(np.float32)) + next_obs.append(episode.observations[1:].astype(np.float32)) + actions.append(episode.actions.astype(np.float32)) + rewards.append(episode.rewards) + dones.append(episode.terminations) + + return { + "observations": np.concatenate(obs), + "actions": np.concatenate(actions), + "next_observations": np.concatenate(next_obs), + "rewards": np.concatenate(rewards), + "terminals": np.concatenate(dones), + } + + +class ReplayBuffer: + def __init__( + self, + state_dim: int, + action_dim: int, + buffer_size: int, + device: str = "cpu", + ): + self._buffer_size = buffer_size + self._pointer = 0 + self._size = 0 + + self._states = torch.zeros( + (buffer_size, state_dim), dtype=torch.float32, device=device + ) + self._actions = torch.zeros( + (buffer_size, action_dim), dtype=torch.float32, device=device + ) + self._rewards = torch.zeros((buffer_size, 1), dtype=torch.float32, device=device) + self._next_states = torch.zeros( + (buffer_size, state_dim), dtype=torch.float32, device=device + ) + self._dones = torch.zeros((buffer_size, 1), dtype=torch.float32, device=device) + self._device = device + + def _to_tensor(self, data: np.ndarray) -> torch.Tensor: + return torch.tensor(data, dtype=torch.float32, device=self._device) + + # Loads data in d4rl format, i.e. from Dict[str, np.array] after q_learning_dataset. + def load_dataset(self, data: Dict[str, np.ndarray]): + if self._size != 0: + raise ValueError("Trying to load data into non-empty replay buffer") + + n_transitions = data["observations"].shape[0] + if n_transitions > self._buffer_size: + raise ValueError( + "Replay buffer is smaller than the dataset you are trying to load!" 
+ ) + self._states[:n_transitions] = self._to_tensor(data["observations"]) + self._actions[:n_transitions] = self._to_tensor(data["actions"]) + self._rewards[:n_transitions] = self._to_tensor(data["rewards"][..., None]) + self._next_states[:n_transitions] = self._to_tensor(data["next_observations"]) + self._dones[:n_transitions] = self._to_tensor(data["terminals"][..., None]) + + self._size = self._pointer = n_transitions + print(f"Dataset size: {n_transitions}") + + def sample(self, batch_size: int) -> TensorBatch: + indices = np.random.randint(0, min(self._size, self._pointer), size=batch_size) + states = self._states[indices] + actions = self._actions[indices] + rewards = self._rewards[indices] + next_states = self._next_states[indices] + dones = self._dones[indices] + return [states, actions, rewards, next_states, dones] + + def add_transition(self): + # Use this method to add new data into the replay buffer during fine-tuning. + # I left it unimplemented since now we do not do fine-tuning. + raise NotImplementedError + + +def asymmetric_l2_loss(u: torch.Tensor, tau: float) -> torch.Tensor: + return torch.mean(torch.abs(tau - (u < 0).float()) * u**2) + + +class Squeeze(nn.Module): + def __init__(self, dim=-1): + super().__init__() + self.dim = dim + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return x.squeeze(dim=self.dim) + + +class MLP(nn.Module): + def __init__( + self, + dims, + activation_fn: Callable[[], nn.Module] = nn.ReLU, + output_activation_fn: Callable[[], nn.Module] = None, + squeeze_output: bool = False, + dropout: Optional[float] = None, + ): + super().__init__() + n_dims = len(dims) + if n_dims < 2: + raise ValueError("MLP requires at least two dims (input and output)") + + layers = [] + for i in range(n_dims - 2): + layers.append(nn.Linear(dims[i], dims[i + 1])) + layers.append(activation_fn()) + + if dropout is not None: + layers.append(nn.Dropout(dropout)) + + layers.append(nn.Linear(dims[-2], dims[-1])) + if output_activation_fn is not None: + layers.append(output_activation_fn()) + if squeeze_output: + if dims[-1] != 1: + raise ValueError("Last dim must be 1 when squeezing") + layers.append(Squeeze(-1)) + self.net = nn.Sequential(*layers) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.net(x) + + +class GaussianPolicy(nn.Module): + def __init__( + self, + state_dim: int, + act_dim: int, + max_action: float, + hidden_dim: int = 256, + n_hidden: int = 2, + dropout: Optional[float] = None, + ): + super().__init__() + self.net = MLP( + [state_dim, *([hidden_dim] * n_hidden), act_dim], + output_activation_fn=nn.Tanh, + dropout=dropout, + ) + self.log_std = nn.Parameter(torch.zeros(act_dim, dtype=torch.float32)) + self.max_action = max_action + + def forward(self, obs: torch.Tensor) -> Normal: + mean = self.net(obs) + std = torch.exp(self.log_std.clamp(LOG_STD_MIN, LOG_STD_MAX)) + return Normal(mean, std) + + @torch.no_grad() + def act(self, state: np.ndarray, device: str = "cpu"): + state = torch.tensor(state.reshape(1, -1), device=device, dtype=torch.float32) + dist = self(state) + action = dist.mean if not self.training else dist.sample() + action = torch.clamp(self.max_action * action, -self.max_action, self.max_action) + return action.cpu().data.numpy().flatten() + + +class DeterministicPolicy(nn.Module): + def __init__( + self, + state_dim: int, + act_dim: int, + max_action: float, + hidden_dim: int = 256, + n_hidden: int = 2, + dropout: Optional[float] = None, + ): + super().__init__() + self.net = MLP( + [state_dim, 
*([hidden_dim] * n_hidden), act_dim], + output_activation_fn=nn.Tanh, + dropout=dropout, + ) + self.max_action = max_action + + def forward(self, obs: torch.Tensor) -> torch.Tensor: + return self.net(obs) + + @torch.no_grad() + def act(self, state: np.ndarray, device: str = "cpu"): + state = torch.tensor(state.reshape(1, -1), device=device, dtype=torch.float32) + return ( + torch.clamp(self(state) * self.max_action, -self.max_action, self.max_action) + .cpu() + .data.numpy() + .flatten() + ) + + +class TwinQ(nn.Module): + def __init__( + self, state_dim: int, action_dim: int, hidden_dim: int = 256, n_hidden: int = 2 + ): + super().__init__() + dims = [state_dim + action_dim, *([hidden_dim] * n_hidden), 1] + self.q1 = MLP(dims, squeeze_output=True) + self.q2 = MLP(dims, squeeze_output=True) + + def both( + self, state: torch.Tensor, action: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + sa = torch.cat([state, action], 1) + return self.q1(sa), self.q2(sa) + + def forward(self, state: torch.Tensor, action: torch.Tensor) -> torch.Tensor: + return torch.min(*self.both(state, action)) + + +class ValueFunction(nn.Module): + def __init__(self, state_dim: int, hidden_dim: int = 256, n_hidden: int = 2): + super().__init__() + dims = [state_dim, *([hidden_dim] * n_hidden), 1] + self.v = MLP(dims, squeeze_output=True) + + def forward(self, state: torch.Tensor) -> torch.Tensor: + return self.v(state) + + +class ImplicitQLearning: + def __init__( + self, + max_action: float, + actor: nn.Module, + actor_optimizer: torch.optim.Optimizer, + actor_lr_scheduler: torch.optim.lr_scheduler.LRScheduler, + q_network: nn.Module, + q_optimizer: torch.optim.Optimizer, + v_network: nn.Module, + v_optimizer: torch.optim.Optimizer, + iql_tau: float = 0.7, + beta: float = 3.0, + gamma: float = 0.99, + tau: float = 0.005, + device: str = "cpu", + ): + self.max_action = max_action + self.qf = q_network + self.q_target = copy.deepcopy(self.qf).requires_grad_(False).to(device) + self.vf = v_network + self.actor = actor + self.v_optimizer = v_optimizer + self.q_optimizer = q_optimizer + self.actor_optimizer = actor_optimizer + self.actor_lr_scheduler = actor_lr_scheduler + self.iql_tau = iql_tau + self.beta = beta + self.gamma = gamma + self.tau = tau + self.device = device + + def _update_v(self, observations, actions, log_dict) -> torch.Tensor: + # Update value function + with torch.no_grad(): + target_q = self.q_target(observations, actions) + + v = self.vf(observations) + adv = target_q - v + v_loss = asymmetric_l2_loss(adv, self.iql_tau) + log_dict["value_loss"] = v_loss.item() + self.v_optimizer.zero_grad() + v_loss.backward() + self.v_optimizer.step() + return adv + + def _update_q( + self, + next_v: torch.Tensor, + observations: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + terminals: torch.Tensor, + log_dict: Dict, + ): + targets = rewards + (1.0 - terminals.float()) * self.gamma * next_v.detach() + qs = self.qf.both(observations, actions) + q_loss = sum(F.mse_loss(q, targets) for q in qs) / len(qs) + log_dict["q_loss"] = q_loss.item() + self.q_optimizer.zero_grad() + q_loss.backward() + self.q_optimizer.step() + + # Update target Q network + soft_update(self.q_target, self.qf, self.tau) + + def _update_policy( + self, + adv: torch.Tensor, + observations: torch.Tensor, + actions: torch.Tensor, + log_dict: Dict, + ): + exp_adv = torch.exp(self.beta * adv.detach()).clamp(max=EXP_ADV_MAX) + policy_out = self.actor(observations) + if isinstance(policy_out, torch.distributions.Distribution): 
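+ # Stochastic (Gaussian) policy: advantage-weighted behavioral cloning, i.e. negative log-likelihood of the dataset actions, weighted by exp_adv below.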
+ bc_losses = -policy_out.log_prob(actions).sum(-1, keepdim=False) + elif torch.is_tensor(policy_out): + if policy_out.shape != actions.shape: + raise RuntimeError("Actions shape mismatch") + bc_losses = torch.sum((policy_out - actions) ** 2, dim=1) + else: + raise NotImplementedError + policy_loss = torch.mean(exp_adv * bc_losses) + log_dict["actor_loss"] = policy_loss.item() + self.actor_optimizer.zero_grad() + policy_loss.backward() + self.actor_optimizer.step() + self.actor_lr_scheduler.step() + + def train(self, batch: TensorBatch) -> Dict[str, float]: + ( + observations, + actions, + rewards, + next_observations, + dones, + ) = batch + log_dict = {} + + with torch.no_grad(): + next_v = self.vf(next_observations) + # Update value function + adv = self._update_v(observations, actions, log_dict) + rewards = rewards.squeeze(dim=-1) + dones = dones.squeeze(dim=-1) + # Update Q function + self._update_q(next_v, observations, actions, rewards, dones, log_dict) + # Update actor + self._update_policy(adv, observations, actions, log_dict) + + return log_dict + + def state_dict(self) -> Dict[str, Any]: + return { + "qf": self.qf.state_dict(), + "q_optimizer": self.q_optimizer.state_dict(), + "vf": self.vf.state_dict(), + "v_optimizer": self.v_optimizer.state_dict(), + "actor": self.actor.state_dict(), + "actor_optimizer": self.actor_optimizer.state_dict(), + "actor_lr_scheduler": self.actor_lr_scheduler.state_dict(), + } + + def load_state_dict(self, state_dict: Dict[str, Any]): + self.qf.load_state_dict(state_dict["qf"]) + self.q_optimizer.load_state_dict(state_dict["q_optimizer"]) + self.q_target = copy.deepcopy(self.qf) + + self.vf.load_state_dict(state_dict["vf"]) + self.v_optimizer.load_state_dict(state_dict["v_optimizer"]) + + self.actor.load_state_dict(state_dict["actor"]) + self.actor_optimizer.load_state_dict(state_dict["actor_optimizer"]) + self.actor_lr_scheduler.load_state_dict(state_dict["actor_lr_scheduler"]) + + +@torch.no_grad() +def evaluate( + env: gym.Env, actor: nn.Module, num_episodes: int, seed: int, device: str +) -> np.ndarray: + actor.eval() + episode_rewards = [] + for i in range(num_episodes): + done = False + state, info = env.reset(seed=seed + i) + + episode_reward = 0.0 + while not done: + action = actor.act(state, device) + state, reward, terminated, truncated, info = env.step(action) + done = terminated or truncated + episode_reward += reward + episode_rewards.append(episode_reward) + + actor.train() + return np.asarray(episode_rewards) + + +@pyrallis.wrap() +def train(config: TrainConfig): + wandb.init( + config=asdict(config), + project=config.project, + group=config.group, + name=config.name, + id=str(uuid.uuid4()), + save_code=True, + ) + minari.download_dataset(config.dataset_id) + dataset = minari.load_dataset(config.dataset_id) + + eval_env = dataset.recover_environment() + state_dim = eval_env.observation_space.shape[0] + action_dim = eval_env.action_space.shape[0] + max_action = float(eval_env.action_space.high[0]) + + qdataset = qlearning_dataset(dataset) + if config.normalize_reward: + modify_reward(qdataset, config.dataset_id) + + if config.normalize_state: + state_mean, state_std = compute_mean_std(qdataset["observations"], eps=1e-3) + else: + state_mean, state_std = 0, 1 + + qdataset["observations"] = normalize_states( + qdataset["observations"], state_mean, state_std + ) + qdataset["next_observations"] = normalize_states( + qdataset["next_observations"], state_mean, state_std + ) + + eval_env = wrap_env(eval_env, state_mean=state_mean,
state_std=state_std) + replay_buffer = ReplayBuffer( + state_dim, + action_dim, + config.buffer_size, + DEVICE, + ) + replay_buffer.load_dataset(qdataset) + + if config.checkpoints_path is not None: + print(f"Checkpoints path: {config.checkpoints_path}") + os.makedirs(config.checkpoints_path, exist_ok=True) + with open(os.path.join(config.checkpoints_path, "config.yaml"), "w") as f: + pyrallis.dump(config, f) + + # Set seeds + set_seed(config.train_seed) + + q_network = TwinQ(state_dim, action_dim).to(DEVICE) + v_network = ValueFunction(state_dim).to(DEVICE) + if config.iql_deterministic: + actor = DeterministicPolicy( + state_dim, action_dim, max_action, dropout=config.actor_dropout + ).to(DEVICE) + else: + actor = GaussianPolicy( + state_dim, action_dim, max_action, dropout=config.actor_dropout + ).to(DEVICE) + + v_optimizer = torch.optim.Adam(v_network.parameters(), lr=config.vf_lr) + q_optimizer = torch.optim.Adam(q_network.parameters(), lr=config.qf_lr) + actor_optimizer = torch.optim.Adam(actor.parameters(), lr=config.actor_lr) + actor_lr_scheduler = CosineAnnealingLR(actor_optimizer, config.update_steps) + + trainer = ImplicitQLearning( + max_action=max_action, + actor=actor, + actor_optimizer=actor_optimizer, + actor_lr_scheduler=actor_lr_scheduler, + q_network=q_network, + q_optimizer=q_optimizer, + v_network=v_network, + v_optimizer=v_optimizer, + iql_tau=config.iql_tau, + beta=config.beta, + gamma=config.gamma, + tau=config.tau, + device=DEVICE, + ) + + for step in trange(config.update_steps): + batch = [b.to(DEVICE) for b in replay_buffer.sample(config.batch_size)] + log_dict = trainer.train(batch) + + wandb.log(log_dict, step=step) + + if (step + 1) % config.eval_every == 0: + eval_scores = evaluate( + env=eval_env, + actor=actor, + num_episodes=config.eval_episodes, + seed=config.eval_seed, + device=DEVICE, + ) + wandb.log({"evaluation_return": eval_scores.mean()}, step=step) + # optional normalized score logging, only if dataset has reference scores + with contextlib.suppress(ValueError): + normalized_score = ( + minari.get_normalized_score(dataset, eval_scores).mean() * 100 + ) + wandb.log({"normalized_score": normalized_score}, step=step) + + if config.checkpoints_path is not None: + torch.save( + trainer.state_dict(), + os.path.join(config.checkpoints_path, f"checkpoint_{step}.pt"), + ) + + +if __name__ == "__main__": + train() diff --git a/configs/minari/offline/bc/door/cloned_v1.yaml b/configs/minari/offline/bc/door/cloned_v1.yaml new file mode 100644 index 00000000..c1d7118c --- /dev/null +++ b/configs/minari/offline/bc/door/cloned_v1.yaml @@ -0,0 +1,15 @@ +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: door-cloned-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 1.0 +group: bc-door-cloned-v1-multiseed-v0 +name: bc +normalize_state: true +project: CORL +top_fraction: 1.0 +train_seed: 0 +update_steps: 1000000 diff --git a/configs/minari/offline/bc/door/expert_v1.yaml b/configs/minari/offline/bc/door/expert_v1.yaml new file mode 100644 index 00000000..b4d0d4be --- /dev/null +++ b/configs/minari/offline/bc/door/expert_v1.yaml @@ -0,0 +1,15 @@ +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: door-expert-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 1.0 +group: bc-door-expert-v1-multiseed-v0 +name: bc +normalize_state: true +project: CORL +top_fraction: 1.0 +train_seed: 0 +update_steps: 1000000 \ No newline at end of file diff --git a/configs/minari/offline/bc/door/human_v1.yaml 
b/configs/minari/offline/bc/door/human_v1.yaml new file mode 100644 index 00000000..4b0024a0 --- /dev/null +++ b/configs/minari/offline/bc/door/human_v1.yaml @@ -0,0 +1,15 @@ +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: door-human-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 1.0 +group: bc-door-human-v1-multiseed-v0 +name: bc +normalize_state: true +project: CORL +top_fraction: 1.0 +train_seed: 0 +update_steps: 1000000 \ No newline at end of file diff --git a/configs/minari/offline/bc/hammer/cloned_v1.yaml b/configs/minari/offline/bc/hammer/cloned_v1.yaml new file mode 100644 index 00000000..67131721 --- /dev/null +++ b/configs/minari/offline/bc/hammer/cloned_v1.yaml @@ -0,0 +1,15 @@ +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: hammer-cloned-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 1.0 +group: bc-hammer-cloned-v1-multiseed-v0 +name: bc +normalize_state: true +project: CORL +top_fraction: 1.0 +train_seed: 0 +update_steps: 1000000 \ No newline at end of file diff --git a/configs/minari/offline/bc/hammer/expert_v1.yaml b/configs/minari/offline/bc/hammer/expert_v1.yaml new file mode 100644 index 00000000..ecdbb2c6 --- /dev/null +++ b/configs/minari/offline/bc/hammer/expert_v1.yaml @@ -0,0 +1,15 @@ +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: hammer-expert-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 1.0 +group: bc-hammer-expert-v1-multiseed-v0 +name: bc +normalize_state: true +project: CORL +top_fraction: 1.0 +train_seed: 0 +update_steps: 1000000 \ No newline at end of file diff --git a/configs/minari/offline/bc/hammer/human_v1.yaml b/configs/minari/offline/bc/hammer/human_v1.yaml new file mode 100644 index 00000000..9e170b20 --- /dev/null +++ b/configs/minari/offline/bc/hammer/human_v1.yaml @@ -0,0 +1,15 @@ +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: hammer-human-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 1.0 +group: bc-hammer-human-v1-multiseed-v0 +name: bc +normalize_state: true +project: CORL +top_fraction: 1.0 +train_seed: 0 +update_steps: 1000000 \ No newline at end of file diff --git a/configs/minari/offline/bc/pen/cloned_v1.yaml b/configs/minari/offline/bc/pen/cloned_v1.yaml new file mode 100644 index 00000000..e0365282 --- /dev/null +++ b/configs/minari/offline/bc/pen/cloned_v1.yaml @@ -0,0 +1,15 @@ +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: pen-cloned-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 1.0 +group: bc-pen-cloned-v1-multiseed-v0 +name: bc +normalize_state: true +project: CORL +top_fraction: 1.0 +train_seed: 0 +update_steps: 1000000 diff --git a/configs/minari/offline/bc/pen/expert_v1.yaml b/configs/minari/offline/bc/pen/expert_v1.yaml new file mode 100644 index 00000000..deee7c50 --- /dev/null +++ b/configs/minari/offline/bc/pen/expert_v1.yaml @@ -0,0 +1,15 @@ +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: pen-expert-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 1.0 +group: bc-pen-expert-v1-multiseed-v0 +name: bc +normalize_state: true +project: CORL +top_fraction: 1.0 +train_seed: 0 +update_steps: 1000000 diff --git a/configs/minari/offline/bc/pen/human_v1.yaml b/configs/minari/offline/bc/pen/human_v1.yaml new file mode 100644 index 00000000..3e416a58 --- /dev/null +++ b/configs/minari/offline/bc/pen/human_v1.yaml @@ -0,0 +1,15 @@ +batch_size: 256 +buffer_size: 1000000 
+checkpoints_path: null +dataset_id: pen-human-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 1.0 +group: bc-pen-human-v1-multiseed-v0 +name: bc +normalize_state: true +project: CORL +top_fraction: 1.0 +train_seed: 0 +update_steps: 1000000 \ No newline at end of file diff --git a/configs/minari/offline/bc/relocate/cloned_v1.yaml b/configs/minari/offline/bc/relocate/cloned_v1.yaml new file mode 100644 index 00000000..e8abc4b7 --- /dev/null +++ b/configs/minari/offline/bc/relocate/cloned_v1.yaml @@ -0,0 +1,15 @@ +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: relocate-cloned-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 1.0 +group: bc-relocate-cloned-v1-multiseed-v0 +name: bc +normalize_state: true +project: CORL +top_fraction: 1.0 +train_seed: 0 +update_steps: 1000000 diff --git a/configs/minari/offline/bc/relocate/expert_v1.yaml b/configs/minari/offline/bc/relocate/expert_v1.yaml new file mode 100644 index 00000000..4566ba23 --- /dev/null +++ b/configs/minari/offline/bc/relocate/expert_v1.yaml @@ -0,0 +1,15 @@ +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: relocate-expert-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 1.0 +group: bc-relocate-expert-v1-multiseed-v0 +name: bc +normalize_state: true +project: CORL +top_fraction: 1.0 +train_seed: 0 +update_steps: 1000000 diff --git a/configs/minari/offline/bc/relocate/human_v1.yaml b/configs/minari/offline/bc/relocate/human_v1.yaml new file mode 100644 index 00000000..083ea2cf --- /dev/null +++ b/configs/minari/offline/bc/relocate/human_v1.yaml @@ -0,0 +1,16 @@ +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: relocate-human-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 1.0 +group: bc-relocate-human-v1-multiseed-v0 +name: bc +normalize_state: true +project: CORL +top_fraction: 1.0 +train_seed: 0 +update_steps: 1000000 + diff --git a/configs/minari/offline/bc_10/door/cloned_v1.yaml b/configs/minari/offline/bc_10/door/cloned_v1.yaml new file mode 100644 index 00000000..7fc0ea4e --- /dev/null +++ b/configs/minari/offline/bc_10/door/cloned_v1.yaml @@ -0,0 +1,15 @@ +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: door-cloned-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 1.0 +group: bc-10-door-cloned-v1-multiseed-v0 +name: bc-10 +normalize_state: true +project: CORL +top_fraction: 0.1 +train_seed: 0 +update_steps: 1000000 diff --git a/configs/minari/offline/bc_10/door/expert_v1.yaml b/configs/minari/offline/bc_10/door/expert_v1.yaml new file mode 100644 index 00000000..0c6384b8 --- /dev/null +++ b/configs/minari/offline/bc_10/door/expert_v1.yaml @@ -0,0 +1,15 @@ +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: door-expert-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 1.0 +group: bc-10-door-expert-v1-multiseed-v0 +name: bc-10 +normalize_state: true +project: CORL +top_fraction: 0.1 +train_seed: 0 +update_steps: 1000000 \ No newline at end of file diff --git a/configs/minari/offline/bc_10/door/human_v1.yaml b/configs/minari/offline/bc_10/door/human_v1.yaml new file mode 100644 index 00000000..8b976bf9 --- /dev/null +++ b/configs/minari/offline/bc_10/door/human_v1.yaml @@ -0,0 +1,15 @@ +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: door-human-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 1.0 +group: bc-10-door-human-v1-multiseed-v0 +name: bc-10 +normalize_state: true +project: CORL 
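+# top_fraction: 0.1 keeps only the top 10% of trajectories, ranked by episode return (gamma: 1.0, i.e. undiscounted) in best_trajectories_ids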
+top_fraction: 0.1 +train_seed: 0 +update_steps: 1000000 \ No newline at end of file diff --git a/configs/minari/offline/bc_10/hammer/cloned_v1.yaml b/configs/minari/offline/bc_10/hammer/cloned_v1.yaml new file mode 100644 index 00000000..dfbd9583 --- /dev/null +++ b/configs/minari/offline/bc_10/hammer/cloned_v1.yaml @@ -0,0 +1,15 @@ +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: hammer-cloned-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 1.0 +group: bc-10-hammer-cloned-v1-multiseed-v0 +name: bc-10 +normalize_state: true +project: CORL +top_fraction: 0.1 +train_seed: 0 +update_steps: 1000000 \ No newline at end of file diff --git a/configs/minari/offline/bc_10/hammer/expert_v1.yaml b/configs/minari/offline/bc_10/hammer/expert_v1.yaml new file mode 100644 index 00000000..be2c8183 --- /dev/null +++ b/configs/minari/offline/bc_10/hammer/expert_v1.yaml @@ -0,0 +1,15 @@ +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: hammer-expert-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 1.0 +group: bc-10-hammer-expert-v1-multiseed-v0 +name: bc-10 +normalize_state: true +project: CORL +top_fraction: 0.1 +train_seed: 0 +update_steps: 1000000 \ No newline at end of file diff --git a/configs/minari/offline/bc_10/hammer/human_v1.yaml b/configs/minari/offline/bc_10/hammer/human_v1.yaml new file mode 100644 index 00000000..aba9df8f --- /dev/null +++ b/configs/minari/offline/bc_10/hammer/human_v1.yaml @@ -0,0 +1,15 @@ +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: hammer-human-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 1.0 +group: bc-10-hammer-human-v1-multiseed-v0 +name: bc-10 +normalize_state: true +project: CORL +top_fraction: 0.1 +train_seed: 0 +update_steps: 1000000 \ No newline at end of file diff --git a/configs/minari/offline/bc_10/pen/cloned_v1.yaml b/configs/minari/offline/bc_10/pen/cloned_v1.yaml new file mode 100644 index 00000000..fd4b66bb --- /dev/null +++ b/configs/minari/offline/bc_10/pen/cloned_v1.yaml @@ -0,0 +1,15 @@ +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: pen-cloned-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 1.0 +group: bc-10-pen-cloned-v1-multiseed-v0 +name: bc-10 +normalize_state: true +project: CORL +top_fraction: 0.1 +train_seed: 0 +update_steps: 1000000 diff --git a/configs/minari/offline/bc_10/pen/expert_v1.yaml b/configs/minari/offline/bc_10/pen/expert_v1.yaml new file mode 100644 index 00000000..f7145cae --- /dev/null +++ b/configs/minari/offline/bc_10/pen/expert_v1.yaml @@ -0,0 +1,15 @@ +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: pen-expert-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 1.0 +group: bc-10-pen-expert-v1-multiseed-v0 +name: bc-10 +normalize_state: true +project: CORL +top_fraction: 0.1 +train_seed: 0 +update_steps: 1000000 diff --git a/configs/minari/offline/bc_10/pen/human_v1.yaml b/configs/minari/offline/bc_10/pen/human_v1.yaml new file mode 100644 index 00000000..ac3e3625 --- /dev/null +++ b/configs/minari/offline/bc_10/pen/human_v1.yaml @@ -0,0 +1,15 @@ +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: pen-human-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 1.0 +group: bc-10-pen-human-v1-multiseed-v0 +name: bc-10 +normalize_state: true +project: CORL +top_fraction: 0.1 +train_seed: 0 +update_steps: 1000000 \ No newline at end of file diff --git 
a/configs/minari/offline/bc_10/relocate/cloned_v1.yaml b/configs/minari/offline/bc_10/relocate/cloned_v1.yaml new file mode 100644 index 00000000..55755ab8 --- /dev/null +++ b/configs/minari/offline/bc_10/relocate/cloned_v1.yaml @@ -0,0 +1,15 @@ +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: relocate-cloned-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 1.0 +group: bc-10-relocate-cloned-v1-multiseed-v0 +name: bc-10 +normalize_state: true +project: CORL +top_fraction: 0.1 +train_seed: 0 +update_steps: 1000000 diff --git a/configs/minari/offline/bc_10/relocate/expert_v1.yaml b/configs/minari/offline/bc_10/relocate/expert_v1.yaml new file mode 100644 index 00000000..808885ac --- /dev/null +++ b/configs/minari/offline/bc_10/relocate/expert_v1.yaml @@ -0,0 +1,15 @@ +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: relocate-expert-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 1.0 +group: bc-10-relocate-expert-v1-multiseed-v0 +name: bc-10 +normalize_state: true +project: CORL +top_fraction: 0.1 +train_seed: 0 +update_steps: 1000000 diff --git a/configs/minari/offline/bc_10/relocate/human_v1.yaml b/configs/minari/offline/bc_10/relocate/human_v1.yaml new file mode 100644 index 00000000..88f9d42b --- /dev/null +++ b/configs/minari/offline/bc_10/relocate/human_v1.yaml @@ -0,0 +1,16 @@ +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: relocate-human-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 1.0 +group: bc-10-relocate-human-v1-multiseed-v0 +name: bc-10 +normalize_state: true +project: CORL +top_fraction: 0.1 +train_seed: 0 +update_steps: 1000000 + diff --git a/configs/minari/offline/iql/door/cloned_v1.yaml b/configs/minari/offline/iql/door/cloned_v1.yaml new file mode 100644 index 00000000..e48f5312 --- /dev/null +++ b/configs/minari/offline/iql/door/cloned_v1.yaml @@ -0,0 +1,23 @@ +actor_dropout: 0.1 +actor_lr: 0.0003 +batch_size: 256 +beta: 3.0 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: door-cloned-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 0.99 +group: iql-door-cloned-v1-multiseed-v0 +iql_deterministic: false +iql_tau: 0.8 +name: iql +normalize_reward: false +normalize_state: true +project: CORL +qf_lr: 0.0003 +tau: 0.005 +train_seed: 0 +update_steps: 1000000 +vf_lr: 0.0003 diff --git a/configs/minari/offline/iql/door/expert_v1.yaml b/configs/minari/offline/iql/door/expert_v1.yaml new file mode 100644 index 00000000..b63686d3 --- /dev/null +++ b/configs/minari/offline/iql/door/expert_v1.yaml @@ -0,0 +1,23 @@ +actor_dropout: 0.1 +actor_lr: 0.0003 +batch_size: 256 +beta: 3.0 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: door-expert-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 0.99 +group: iql-door-expert-v1-multiseed-v0 +iql_deterministic: false +iql_tau: 0.8 +name: iql +normalize_reward: false +normalize_state: true +project: CORL +qf_lr: 0.0003 +tau: 0.005 +train_seed: 0 +update_steps: 1000000 +vf_lr: 0.0003 diff --git a/configs/minari/offline/iql/door/human_v1.yaml b/configs/minari/offline/iql/door/human_v1.yaml new file mode 100644 index 00000000..eb402b94 --- /dev/null +++ b/configs/minari/offline/iql/door/human_v1.yaml @@ -0,0 +1,23 @@ +actor_dropout: 0.1 +actor_lr: 0.0003 +batch_size: 256 +beta: 3.0 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: door-human-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 0.99 +group: iql-door-human-v1-multiseed-v0 +iql_deterministic: false 
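+# iql_tau is the expectile of the asymmetric value loss: 0.5 is plain L2, values above 0.5 weight positive advantages more heavily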
+iql_tau: 0.8 +name: iql +normalize_reward: false +normalize_state: true +project: CORL +qf_lr: 0.0003 +tau: 0.005 +train_seed: 0 +update_steps: 1000000 +vf_lr: 0.0003 diff --git a/configs/minari/offline/iql/hammer/cloned_v1.yaml b/configs/minari/offline/iql/hammer/cloned_v1.yaml new file mode 100644 index 00000000..099e8f55 --- /dev/null +++ b/configs/minari/offline/iql/hammer/cloned_v1.yaml @@ -0,0 +1,23 @@ +actor_dropout: 0.1 +actor_lr: 0.0003 +batch_size: 256 +beta: 3.0 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: hammer-cloned-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 0.99 +group: iql-hammer-cloned-v1-multiseed-v0 +iql_deterministic: false +iql_tau: 0.8 +name: iql +normalize_reward: false +normalize_state: true +project: CORL +qf_lr: 0.0003 +tau: 0.005 +train_seed: 0 +update_steps: 1000000 +vf_lr: 0.0003 diff --git a/configs/minari/offline/iql/hammer/expert_v1.yaml b/configs/minari/offline/iql/hammer/expert_v1.yaml new file mode 100644 index 00000000..0c2b968e --- /dev/null +++ b/configs/minari/offline/iql/hammer/expert_v1.yaml @@ -0,0 +1,23 @@ +actor_dropout: 0.1 +actor_lr: 0.0003 +batch_size: 256 +beta: 3.0 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: hammer-expert-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 0.99 +group: iql-hammer-expert-v1-multiseed-v0 +iql_deterministic: false +iql_tau: 0.8 +name: iql +normalize_reward: false +normalize_state: true +project: CORL +qf_lr: 0.0003 +tau: 0.005 +train_seed: 0 +update_steps: 1000000 +vf_lr: 0.0003 \ No newline at end of file diff --git a/configs/minari/offline/iql/hammer/human_v1.yaml b/configs/minari/offline/iql/hammer/human_v1.yaml new file mode 100644 index 00000000..08883e95 --- /dev/null +++ b/configs/minari/offline/iql/hammer/human_v1.yaml @@ -0,0 +1,23 @@ +actor_dropout: 0.1 +actor_lr: 0.0003 +batch_size: 256 +beta: 3.0 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: hammer-human-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 0.99 +group: iql-hammer-human-v1-multiseed-v0 +iql_deterministic: false +iql_tau: 0.8 +name: iql +normalize_reward: false +normalize_state: true +project: CORL +qf_lr: 0.0003 +tau: 0.005 +train_seed: 0 +update_steps: 1000000 +vf_lr: 0.0003 \ No newline at end of file diff --git a/configs/minari/offline/iql/pen/cloned_v1.yaml b/configs/minari/offline/iql/pen/cloned_v1.yaml new file mode 100644 index 00000000..8dc39339 --- /dev/null +++ b/configs/minari/offline/iql/pen/cloned_v1.yaml @@ -0,0 +1,23 @@ +actor_dropout: 0.1 +actor_lr: 0.0003 +batch_size: 256 +beta: 3.0 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: pen-cloned-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 0.99 +group: iql-pen-cloned-v1-multiseed-v0 +iql_deterministic: false +iql_tau: 0.8 +name: iql +normalize_reward: false +normalize_state: true +project: CORL +qf_lr: 0.0003 +tau: 0.005 +train_seed: 0 +update_steps: 1000000 +vf_lr: 0.0003 \ No newline at end of file diff --git a/configs/minari/offline/iql/pen/expert_v1.yaml b/configs/minari/offline/iql/pen/expert_v1.yaml new file mode 100644 index 00000000..56b1db79 --- /dev/null +++ b/configs/minari/offline/iql/pen/expert_v1.yaml @@ -0,0 +1,23 @@ +actor_dropout: 0.1 +actor_lr: 0.0003 +batch_size: 256 +beta: 3.0 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: pen-expert-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 0.99 +group: iql-pen-expert-v1-multiseed-v0 +iql_deterministic: false +iql_tau: 0.8 +name: iql +normalize_reward: false +normalize_state: 
true +project: CORL +qf_lr: 0.0003 +tau: 0.005 +train_seed: 0 +update_steps: 1000000 +vf_lr: 0.0003 \ No newline at end of file diff --git a/configs/minari/offline/iql/pen/human_v1.yaml b/configs/minari/offline/iql/pen/human_v1.yaml new file mode 100644 index 00000000..199e24a3 --- /dev/null +++ b/configs/minari/offline/iql/pen/human_v1.yaml @@ -0,0 +1,23 @@ +actor_dropout: 0.1 +actor_lr: 0.0003 +batch_size: 256 +beta: 3.0 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: pen-human-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 0.99 +group: iql-pen-human-v1-multiseed-v0 +iql_deterministic: false +iql_tau: 0.8 +name: iql +normalize_reward: false +normalize_state: true +project: CORL +qf_lr: 0.0003 +tau: 0.005 +train_seed: 0 +update_steps: 1000000 +vf_lr: 0.0003 diff --git a/configs/minari/offline/iql/relocate/cloned_v1.yaml b/configs/minari/offline/iql/relocate/cloned_v1.yaml new file mode 100644 index 00000000..1acd1fa6 --- /dev/null +++ b/configs/minari/offline/iql/relocate/cloned_v1.yaml @@ -0,0 +1,23 @@ +actor_dropout: 0.1 +actor_lr: 0.0003 +batch_size: 256 +beta: 3.0 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: relocate-cloned-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 0.99 +group: iql-relocate-cloned-v1-multiseed-v0 +iql_deterministic: false +iql_tau: 0.8 +name: iql +normalize_reward: false +normalize_state: true +project: CORL +qf_lr: 0.0003 +tau: 0.005 +train_seed: 0 +update_steps: 1000000 +vf_lr: 0.0003 diff --git a/configs/minari/offline/iql/relocate/expert_v1.yaml b/configs/minari/offline/iql/relocate/expert_v1.yaml new file mode 100644 index 00000000..012815e7 --- /dev/null +++ b/configs/minari/offline/iql/relocate/expert_v1.yaml @@ -0,0 +1,23 @@ +actor_dropout: 0.1 +actor_lr: 0.0003 +batch_size: 256 +beta: 3.0 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: relocate-expert-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 0.99 +group: iql-relocate-expert-v1-multiseed-v0 +iql_deterministic: false +iql_tau: 0.8 +name: iql +normalize_reward: false +normalize_state: true +project: CORL +qf_lr: 0.0003 +tau: 0.005 +train_seed: 0 +update_steps: 1000000 +vf_lr: 0.0003 diff --git a/configs/minari/offline/iql/relocate/human_v1.yaml b/configs/minari/offline/iql/relocate/human_v1.yaml new file mode 100644 index 00000000..d0416acf --- /dev/null +++ b/configs/minari/offline/iql/relocate/human_v1.yaml @@ -0,0 +1,23 @@ +actor_dropout: 0.1 +actor_lr: 0.0003 +batch_size: 256 +beta: 3.0 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: relocate-human-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 0.99 +group: iql-relocate-human-v1-multiseed-v0 +iql_deterministic: false +iql_tau: 0.8 +name: iql +normalize_reward: false +normalize_state: true +project: CORL +qf_lr: 0.0003 +tau: 0.005 +train_seed: 0 +update_steps: 1000000 +vf_lr: 0.0003 diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 9034fc87..8d5d38cc 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -1,6 +1,6 @@ # Main dependencies git+https://github.com/tinkoff-ai/d4rl@master#egg=d4rl -tqdm==4.64.0 +tqdm==4.65.0 wandb==0.12.21 mujoco-py==2.1.2.14 numpy==1.23.1 @@ -8,3 +8,6 @@ gym[mujoco_py,classic_control]==0.23.0 --extra-index-url https://download.pytorch.org/whl/cu113 torch==1.11.0+cu113 pyrallis==0.3.1 +# experimental, thus without a specific version for now +git+https://github.com/Farama-Foundation/Minari.git +gymnasium==0.28.1 \ No newline at end of file diff --git 
a/requirements/requirements_dev.txt b/requirements/requirements_dev.txt index 14947414..878763db 100644 --- a/requirements/requirements_dev.txt +++ b/requirements/requirements_dev.txt @@ -1,6 +1,6 @@ # Main dependencies git+https://github.com/tinkoff-ai/d4rl@master#egg=d4rl -tqdm==4.64.0 +tqdm==4.65.0 wandb==0.12.21 mujoco-py==2.1.2.14 numpy==1.23.1 @@ -9,4 +9,7 @@ gym[mujoco_py,classic_control]==0.23.0 torch==1.11.0+cu113 pyrallis==0.3.1 pre-commit==3.3.3 -ruff==0.0.278 \ No newline at end of file +ruff==0.0.278 +# experimental, thus without a specific version for now +git+https://github.com/Farama-Foundation/Minari.git +gymnasium==0.28.1
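For reference, both scripts are wrapped with pyrallis, so a run can be launched by pointing the standard pyrallis --config_path flag at one of the new configs (assuming the default pyrallis CLI behaviour; paths are relative to the repo root):

    python algorithms/minari/any_percent_bc.py --config_path=configs/minari/offline/bc/pen/human_v1.yaml
    python algorithms/minari/iql.py --config_path=configs/minari/offline/iql/pen/human_v1.yaml

Individual TrainConfig fields can be overridden from the command line in the same way, e.g. --train_seed=1.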