Unverified Commit 82b24dd4 authored by Clemens Schwarke's avatar Clemens Schwarke Committed by GitHub

Adds changes for rsl_rl 3.0.1 (#2962)

# Description

This PR adds the necessary changes to work with the new version of RSL
RL.

## Type of change

- New feature (non-breaking change which adds functionality)

## Checklist

- [x] I have run the [`pre-commit` checks](https://pre-commit.com/) with
`./isaaclab.sh --format`
- [ ] I have made corresponding changes to the documentation
- [x] My changes generate no new warnings
- [ ] I have added tests that prove my fix is effective or that my
feature works
- [ ] I have updated the changelog and the corresponding version in the
extension's `config/extension.toml` file
- [ ] I have added my name to the `CONTRIBUTORS.md` or my name already
exists there

---------
Signed-off-by: 's avatarClemens Schwarke <96480707+ClemensSchwarke@users.noreply.github.com>
Signed-off-by: 's avatarKelly Guo <kellyg@nvidia.com>
Co-authored-by: 's avatarPascal Roth <57946385+pascal-roth@users.noreply.github.com>
Co-authored-by: 's avatarKelly Guo <kellyg@nvidia.com>
Co-authored-by: 's avatarOcti Zhang <zhengyuz@nvidia.com>
parent c91a125c
......@@ -390,5 +390,10 @@
"package": "ml_dtypes",
"license" : "UNKNOWN",
"comment": "Apache 2.0"
},
{
"package": "zipp",
"license" : "UNKNOWN",
"comment": "MIT"
}
]
......@@ -377,7 +377,6 @@ if "%arg%"=="-i" (
rem install the python packages in source directory
echo [INFO] Installing extensions inside the Isaac Lab repository...
call :extract_python_exe
rem check if pytorch is installed and its version
rem install pytorch with cuda 12.8 for blackwell support
call !python_exe! -m pip list | findstr /C:"torch" >nul
......
......@@ -10,7 +10,7 @@ import random
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from isaaclab_rl.rsl_rl import RslRlOnPolicyRunnerCfg
from isaaclab_rl.rsl_rl import RslRlBaseRunnerCfg
def add_rsl_rl_args(parser: argparse.ArgumentParser):
......@@ -39,7 +39,7 @@ def add_rsl_rl_args(parser: argparse.ArgumentParser):
)
def parse_rsl_rl_cfg(task_name: str, args_cli: argparse.Namespace) -> RslRlOnPolicyRunnerCfg:
def parse_rsl_rl_cfg(task_name: str, args_cli: argparse.Namespace) -> RslRlBaseRunnerCfg:
"""Parse configuration for RSL-RL agent based on inputs.
Args:
......@@ -52,12 +52,12 @@ def parse_rsl_rl_cfg(task_name: str, args_cli: argparse.Namespace) -> RslRlOnPol
from isaaclab_tasks.utils.parse_cfg import load_cfg_from_registry
# load the default configuration
rslrl_cfg: RslRlOnPolicyRunnerCfg = load_cfg_from_registry(task_name, "rsl_rl_cfg_entry_point")
rslrl_cfg: RslRlBaseRunnerCfg = load_cfg_from_registry(task_name, "rsl_rl_cfg_entry_point")
rslrl_cfg = update_rsl_rl_cfg(rslrl_cfg, args_cli)
return rslrl_cfg
def update_rsl_rl_cfg(agent_cfg: RslRlOnPolicyRunnerCfg, args_cli: argparse.Namespace):
def update_rsl_rl_cfg(agent_cfg: RslRlBaseRunnerCfg, args_cli: argparse.Namespace):
"""Update configuration for RSL-RL agent based on inputs.
Args:
......
......@@ -58,7 +58,7 @@ import os
import time
import torch
from rsl_rl.runners import OnPolicyRunner
from rsl_rl.runners import DistillationRunner, OnPolicyRunner
from isaaclab.envs import (
DirectMARLEnv,
......@@ -71,7 +71,7 @@ from isaaclab.utils.assets import retrieve_file_path
from isaaclab.utils.dict import print_dict
from isaaclab.utils.pretrained_checkpoint import get_published_pretrained_checkpoint
from isaaclab_rl.rsl_rl import RslRlOnPolicyRunnerCfg, RslRlVecEnvWrapper, export_policy_as_jit, export_policy_as_onnx
from isaaclab_rl.rsl_rl import RslRlBaseRunnerCfg, RslRlVecEnvWrapper, export_policy_as_jit, export_policy_as_onnx
import isaaclab_tasks # noqa: F401
from isaaclab_tasks.utils import get_checkpoint_path
......@@ -81,14 +81,14 @@ from isaaclab_tasks.utils.hydra import hydra_task_config
@hydra_task_config(args_cli.task, args_cli.agent)
def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agent_cfg: RslRlOnPolicyRunnerCfg):
def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agent_cfg: RslRlBaseRunnerCfg):
"""Play with RSL-RL agent."""
# grab task name for checkpoint path
task_name = args_cli.task.split(":")[-1]
train_task_name = task_name.replace("-Play", "")
# override configurations with non-hydra CLI arguments
agent_cfg = cli_args.update_rsl_rl_cfg(agent_cfg, args_cli)
agent_cfg: RslRlBaseRunnerCfg = cli_args.update_rsl_rl_cfg(agent_cfg, args_cli)
env_cfg.scene.num_envs = args_cli.num_envs if args_cli.num_envs is not None else env_cfg.scene.num_envs
# set the environment seed
......@@ -136,32 +136,43 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen
print(f"[INFO]: Loading model checkpoint from: {resume_path}")
# load previously trained model
ppo_runner = OnPolicyRunner(env, agent_cfg.to_dict(), log_dir=None, device=agent_cfg.device)
ppo_runner.load(resume_path)
if agent_cfg.class_name == "OnPolicyRunner":
runner = OnPolicyRunner(env, agent_cfg.to_dict(), log_dir=None, device=agent_cfg.device)
elif agent_cfg.class_name == "DistillationRunner":
runner = DistillationRunner(env, agent_cfg.to_dict(), log_dir=None, device=agent_cfg.device)
else:
raise ValueError(f"Unsupported runner class: {agent_cfg.class_name}")
runner.load(resume_path)
# obtain the trained policy for inference
policy = ppo_runner.get_inference_policy(device=env.unwrapped.device)
policy = runner.get_inference_policy(device=env.unwrapped.device)
# extract the neural network module
# we do this in a try-except to maintain backwards compatibility.
try:
# version 2.3 onwards
policy_nn = ppo_runner.alg.policy
policy_nn = runner.alg.policy
except AttributeError:
# version 2.2 and below
policy_nn = ppo_runner.alg.actor_critic
policy_nn = runner.alg.actor_critic
# extract the normalizer
if hasattr(policy_nn, "actor_obs_normalizer"):
normalizer = policy_nn.actor_obs_normalizer
elif hasattr(policy_nn, "student_obs_normalizer"):
normalizer = policy_nn.student_obs_normalizer
else:
normalizer = None
# export policy to onnx/jit
export_model_dir = os.path.join(os.path.dirname(resume_path), "exported")
export_policy_as_jit(policy_nn, ppo_runner.obs_normalizer, path=export_model_dir, filename="policy.pt")
export_policy_as_onnx(
policy_nn, normalizer=ppo_runner.obs_normalizer, path=export_model_dir, filename="policy.onnx"
)
export_policy_as_jit(policy_nn, normalizer=normalizer, path=export_model_dir, filename="policy.pt")
export_policy_as_onnx(policy_nn, normalizer=normalizer, path=export_model_dir, filename="policy.onnx")
dt = env.unwrapped.step_dt
# reset environment
obs, _ = env.get_observations()
obs = env.get_observations()
timestep = 0
# simulate environment
while simulation_app.is_running():
......
......@@ -15,7 +15,6 @@ from isaaclab.app import AppLauncher
# local imports
import cli_args # isort: skip
# add argparse arguments
parser = argparse.ArgumentParser(description="Train an RL agent with RSL-RL.")
parser.add_argument("--video", action="store_true", default=False, help="Record videos during training.")
......@@ -56,10 +55,10 @@ import platform
from packaging import version
# for distributed training, check minimum supported rsl-rl version
RSL_RL_VERSION = "2.3.1"
# check minimum supported rsl-rl version
RSL_RL_VERSION = "3.0.1"
installed_version = metadata.version("rsl-rl-lib")
if args_cli.distributed and version.parse(installed_version) < version.parse(RSL_RL_VERSION):
if version.parse(installed_version) < version.parse(RSL_RL_VERSION):
if platform.system() == "Windows":
cmd = [r".\isaaclab.bat", "-p", "-m", "pip", "install", f"rsl-rl-lib=={RSL_RL_VERSION}"]
else:
......@@ -79,7 +78,7 @@ import torch
from datetime import datetime
import omni
from rsl_rl.runners import OnPolicyRunner
from rsl_rl.runners import DistillationRunner, OnPolicyRunner
from isaaclab.envs import (
DirectMARLEnv,
......@@ -91,7 +90,7 @@ from isaaclab.envs import (
from isaaclab.utils.dict import print_dict
from isaaclab.utils.io import dump_pickle, dump_yaml
from isaaclab_rl.rsl_rl import RslRlOnPolicyRunnerCfg, RslRlVecEnvWrapper
from isaaclab_rl.rsl_rl import RslRlBaseRunnerCfg, RslRlVecEnvWrapper
import isaaclab_tasks # noqa: F401
from isaaclab_tasks.utils import get_checkpoint_path
......@@ -106,7 +105,7 @@ torch.backends.cudnn.benchmark = False
@hydra_task_config(args_cli.task, args_cli.agent)
def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agent_cfg: RslRlOnPolicyRunnerCfg):
def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agent_cfg: RslRlBaseRunnerCfg):
"""Train with RSL-RL agent."""
# override configurations with non-hydra CLI arguments
agent_cfg = cli_args.update_rsl_rl_cfg(agent_cfg, args_cli)
......@@ -178,7 +177,12 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen
env = RslRlVecEnvWrapper(env, clip_actions=agent_cfg.clip_actions)
# create runner from rsl-rl
runner = OnPolicyRunner(env, agent_cfg.to_dict(), log_dir=log_dir, device=agent_cfg.device)
if agent_cfg.class_name == "OnPolicyRunner":
runner = OnPolicyRunner(env, agent_cfg.to_dict(), log_dir=log_dir, device=agent_cfg.device)
elif agent_cfg.class_name == "DistillationRunner":
runner = DistillationRunner(env, agent_cfg.to_dict(), log_dir=log_dir, device=agent_cfg.device)
else:
raise ValueError(f"Unsupported runner class: {agent_cfg.class_name}")
# write git state to logs
runner.add_git_repo_to_log(__file__)
# load the checkpoint
......
......@@ -28,6 +28,12 @@ class RslRlDistillationStudentTeacherCfg:
noise_std_type: Literal["scalar", "log"] = "scalar"
"""The type of noise standard deviation for the policy. Default is scalar."""
student_obs_normalization: bool = MISSING
"""Whether to normalize the observation for the student network."""
teacher_obs_normalization: bool = MISSING
"""Whether to normalize the observation for the teacher network."""
student_hidden_dims: list[int] = MISSING
"""The hidden dimensions of the student network."""
......@@ -81,3 +87,9 @@ class RslRlDistillationAlgorithmCfg:
max_grad_norm: None | float = None
"""The maximum norm the gradient is clipped to."""
optimizer: Literal["adam", "adamw", "sgd", "rmsprop"] = "adam"
"""The optimizer to use for the student policy."""
loss_type: Literal["mse", "huber"] = "mse"
"""The loss type to use for the student policy."""
......@@ -32,6 +32,12 @@ class RslRlPpoActorCriticCfg:
noise_std_type: Literal["scalar", "log"] = "scalar"
"""The type of noise standard deviation for the policy. Default is scalar."""
actor_obs_normalization: bool = MISSING
"""Whether to normalize the observation for the actor network."""
critic_obs_normalization: bool = MISSING
"""Whether to normalize the observation for the critic network."""
actor_hidden_dims: list[int] = MISSING
"""The hidden dimensions of the actor network."""
......@@ -114,14 +120,12 @@ class RslRlPpoAlgorithmCfg:
Otherwise, the advantage is normalized over the entire collected trajectories.
"""
rnd_cfg: RslRlRndCfg | None = None
"""The RND configuration. Default is None, in which case RND is not used."""
symmetry_cfg: RslRlSymmetryCfg | None = None
"""The symmetry configuration. Default is None, in which case symmetry is not used."""
rnd_cfg: RslRlRndCfg | None = None
"""The configuration for the Random Network Distillation (RND) module. Default is None,
in which case RND is not used.
"""
#########################
# Runner configurations #
......@@ -129,8 +133,8 @@ class RslRlPpoAlgorithmCfg:
@configclass
class RslRlOnPolicyRunnerCfg:
"""Configuration of the runner for on-policy algorithms."""
class RslRlBaseRunnerCfg:
"""Base configuration of the runner."""
seed: int = 42
"""The seed for the experiment. Default is 42."""
......@@ -144,17 +148,36 @@ class RslRlOnPolicyRunnerCfg:
max_iterations: int = MISSING
"""The maximum number of iterations."""
empirical_normalization: bool = MISSING
"""Whether to use empirical normalization."""
empirical_normalization: bool | None = None
"""This parameter is deprecated and will be removed in the future.
policy: RslRlPpoActorCriticCfg | RslRlDistillationStudentTeacherCfg = MISSING
"""The policy configuration."""
Use `actor_obs_normalization` and `critic_obs_normalization` instead.
"""
algorithm: RslRlPpoAlgorithmCfg | RslRlDistillationAlgorithmCfg = MISSING
"""The algorithm configuration."""
obs_groups: dict[str, list[str]] = MISSING
"""A mapping from observation groups to observation sets.
The keys of the dictionary are predefined observation sets used by the underlying algorithm
and values are lists of observation groups provided by the environment.
For instance, if the environment provides a dictionary of observations with groups "policy", "images",
and "privileged", these can be mapped to algorithmic observation sets as follows:
.. code-block:: python
obs_groups = {
"policy": ["policy", "images"],
"critic": ["policy", "privileged"],
}
This way, the policy will receive the "policy" and "images" observations, and the critic will
receive the "policy" and "privileged" observations.
For more details, please check ``vec_env.py`` in the rsl_rl library.
"""
clip_actions: float | None = None
"""The clipping value for actions. If ``None``, then no clipping is done.
"""The clipping value for actions. If None, then no clipping is done. Defaults to None.
.. note::
This clipping is performed inside the :class:`RslRlVecEnvWrapper` wrapper.
......@@ -184,7 +207,10 @@ class RslRlOnPolicyRunnerCfg:
"""The wandb project name. Default is "isaaclab"."""
resume: bool = False
"""Whether to resume. Default is False."""
"""Whether to resume a previous training. Default is False.
This flag will be ignored for distillation.
"""
load_run: str = ".*"
"""The run directory to load. Default is ".*" (all).
......@@ -197,3 +223,31 @@ class RslRlOnPolicyRunnerCfg:
If regex expression, the latest (alphabetical order) matching file will be loaded.
"""
@configclass
class RslRlOnPolicyRunnerCfg(RslRlBaseRunnerCfg):
"""Configuration of the runner for on-policy algorithms."""
class_name: str = "OnPolicyRunner"
"""The runner class name. Default is OnPolicyRunner."""
policy: RslRlPpoActorCriticCfg = MISSING
"""The policy configuration."""
algorithm: RslRlPpoAlgorithmCfg = MISSING
"""The algorithm configuration."""
@configclass
class RslRlDistillationRunnerCfg(RslRlBaseRunnerCfg):
"""Configuration of the runner for distillation algorithms."""
class_name: str = "DistillationRunner"
"""The runner class name. Default is DistillationRunner."""
policy: RslRlDistillationStudentTeacherCfg = MISSING
"""The policy configuration."""
algorithm: RslRlDistillationAlgorithmCfg = MISSING
"""The algorithm configuration."""
......@@ -39,13 +39,11 @@ class RslRlSymmetryCfg:
Args:
env (VecEnv): The environment object. This is used to access the environment's properties.
obs (torch.Tensor | None): The observation tensor. If None, the observation is not used.
obs (tensordict.TensorDict | None): The observation tensor dictionary. If None, the observation is not used.
action (torch.Tensor | None): The action tensor. If None, the action is not used.
obs_type (str): The name of the observation type. Defaults to "policy".
This is useful when handling augmentation for different observation groups.
Returns:
A tuple containing the augmented observation and action tensors. The tensors can be None,
A tuple containing the augmented observation dictionary and action tensors. The tensors can be None,
if their respective inputs are None.
"""
......
......@@ -5,6 +5,7 @@
import gymnasium as gym
import torch
from tensordict import TensorDict
from rsl_rl.env import VecEnv
......@@ -12,16 +13,9 @@ from isaaclab.envs import DirectRLEnv, ManagerBasedRLEnv
class RslRlVecEnvWrapper(VecEnv):
"""Wraps around Isaac Lab environment for RSL-RL library
To use asymmetric actor-critic, the environment instance must have the attributes :attr:`num_privileged_obs` (int).
This is used by the learning agent to allocate buffers in the trajectory memory. Additionally, the returned
observations should have the key "critic" which corresponds to the privileged observations. Since this is
optional for some environments, the wrapper checks if these attributes exist. If they don't then the wrapper
defaults to zero as number of privileged observations.
"""Wraps around Isaac Lab environment for the RSL-RL library
.. caution::
This class must be the last wrapper in the wrapper chain. This is because the wrapper does not follow
the :class:`gym.Wrapper` interface. Any subsequent wrappers will need to be modified to work with this
wrapper.
......@@ -43,12 +37,14 @@ class RslRlVecEnvWrapper(VecEnv):
Raises:
ValueError: When the environment is not an instance of :class:`ManagerBasedRLEnv` or :class:`DirectRLEnv`.
"""
# check that input is valid
if not isinstance(env.unwrapped, ManagerBasedRLEnv) and not isinstance(env.unwrapped, DirectRLEnv):
raise ValueError(
"The environment must be inherited from ManagerBasedRLEnv or DirectRLEnv. Environment type:"
f" {type(env)}"
)
# initialize the wrapper
self.env = env
self.clip_actions = clip_actions
......@@ -63,20 +59,6 @@ class RslRlVecEnvWrapper(VecEnv):
self.num_actions = self.unwrapped.action_manager.total_action_dim
else:
self.num_actions = gym.spaces.flatdim(self.unwrapped.single_action_space)
if hasattr(self.unwrapped, "observation_manager"):
self.num_obs = self.unwrapped.observation_manager.group_obs_dim["policy"][0]
else:
self.num_obs = gym.spaces.flatdim(self.unwrapped.single_observation_space["policy"])
# -- privileged observations
if (
hasattr(self.unwrapped, "observation_manager")
and "critic" in self.unwrapped.observation_manager.group_obs_dim
):
self.num_privileged_obs = self.unwrapped.observation_manager.group_obs_dim["critic"][0]
elif hasattr(self.unwrapped, "num_states") and "critic" in self.unwrapped.single_observation_space:
self.num_privileged_obs = gym.spaces.flatdim(self.unwrapped.single_observation_space["critic"])
else:
self.num_privileged_obs = 0
# modify the action space to the clip range
self._modify_action_space()
......@@ -133,14 +115,6 @@ class RslRlVecEnvWrapper(VecEnv):
Properties
"""
def get_observations(self) -> tuple[torch.Tensor, dict]:
"""Returns the current observations of the environment."""
if hasattr(self.unwrapped, "observation_manager"):
obs_dict = self.unwrapped.observation_manager.compute()
else:
obs_dict = self.unwrapped._get_observations()
return obs_dict["policy"], {"observations": obs_dict}
@property
def episode_length_buf(self) -> torch.Tensor:
"""The episode length buffer."""
......@@ -162,13 +136,20 @@ class RslRlVecEnvWrapper(VecEnv):
def seed(self, seed: int = -1) -> int: # noqa: D102
return self.unwrapped.seed(seed)
def reset(self) -> tuple[torch.Tensor, dict]: # noqa: D102
def reset(self) -> tuple[TensorDict, dict]: # noqa: D102
# reset the environment
obs_dict, _ = self.env.reset()
# return observations
return obs_dict["policy"], {"observations": obs_dict}
obs_dict, extras = self.env.reset()
return TensorDict(obs_dict, batch_size=[self.num_envs]), extras
def get_observations(self) -> TensorDict:
"""Returns the current observations of the environment."""
if hasattr(self.unwrapped, "observation_manager"):
obs_dict = self.unwrapped.observation_manager.compute()
else:
obs_dict = self.unwrapped._get_observations()
return TensorDict(obs_dict, batch_size=[self.num_envs])
def step(self, actions: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, dict]:
def step(self, actions: torch.Tensor) -> tuple[TensorDict, torch.Tensor, torch.Tensor, dict]:
# clip actions
if self.clip_actions is not None:
actions = torch.clamp(actions, -self.clip_actions, self.clip_actions)
......@@ -176,16 +157,12 @@ class RslRlVecEnvWrapper(VecEnv):
obs_dict, rew, terminated, truncated, extras = self.env.step(actions)
# compute dones for compatibility with RSL-RL
dones = (terminated | truncated).to(dtype=torch.long)
# move extra observations to the extras dict
obs = obs_dict["policy"]
extras["observations"] = obs_dict
# move time out information to the extras dict
# this is only needed for infinite horizon tasks
if not self.unwrapped.cfg.is_finite_horizon:
extras["time_outs"] = truncated
# return the step information
return obs, rew, dones, extras
return TensorDict(obs_dict, batch_size=[self.num_envs]), rew, dones, extras
def close(self): # noqa: D102
return self.env.close()
......@@ -200,7 +177,8 @@ class RslRlVecEnvWrapper(VecEnv):
return
# modify the action space to the clip range
# note: this is only possible for the box action space. we need to change it in the future for other action spaces.
# note: this is only possible for the box action space. we need to change it in the future for other
# action spaces.
self.env.unwrapped.single_action_space = gym.spaces.Box(
low=-self.clip_actions, high=self.clip_actions, shape=(self.num_actions,)
)
......
......@@ -33,6 +33,7 @@ INSTALL_REQUIRES = [
"moviepy",
# make sure this is consistent with isaac sim version
"pillow==11.2.1",
"packaging<24",
]
PYTORCH_INDEX_URL = ["https://download.pytorch.org/whl/cu128"]
......@@ -45,7 +46,7 @@ EXTRAS_REQUIRE = {
"rl-games @ git+https://github.com/isaac-sim/rl_games.git@python3.11",
"gym",
], # rl-games still needs gym :(
"rsl-rl": ["rsl-rl-lib==2.3.3"],
"rsl-rl": ["rsl-rl-lib==3.0.1"],
}
# Add the names with hyphens as aliases for convenience
EXTRAS_REQUIRE["rl_games"] = EXTRAS_REQUIRE["rl-games"]
......
......@@ -16,6 +16,7 @@ simulation_app = app_launcher.app
import gymnasium as gym
import torch
from tensordict import TensorDict
import carb
import omni.usd
......@@ -161,6 +162,8 @@ def _check_valid_tensor(data: torch.Tensor | dict) -> bool:
"""
if isinstance(data, torch.Tensor):
return not torch.any(torch.isnan(data))
elif isinstance(data, TensorDict):
return not data.isnan().any()
elif isinstance(data, dict):
valid_tensor = True
for value in data.values():
......
......@@ -14,9 +14,10 @@ class AllegroHandPPORunnerCfg(RslRlOnPolicyRunnerCfg):
max_iterations = 10000
save_interval = 250
experiment_name = "allegro_hand"
empirical_normalization = True
policy = RslRlPpoActorCriticCfg(
init_noise_std=1.0,
actor_obs_normalization=True,
critic_obs_normalization=True,
actor_hidden_dims=[1024, 512, 256, 128],
critic_hidden_dims=[1024, 512, 256, 128],
activation="elu",
......
......@@ -14,9 +14,10 @@ class AntPPORunnerCfg(RslRlOnPolicyRunnerCfg):
max_iterations = 1000
save_interval = 50
experiment_name = "ant_direct"
empirical_normalization = False
policy = RslRlPpoActorCriticCfg(
init_noise_std=1.0,
actor_obs_normalization=False,
critic_obs_normalization=False,
actor_hidden_dims=[400, 200, 100],
critic_hidden_dims=[400, 200, 100],
activation="elu",
......
......@@ -14,9 +14,10 @@ class AnymalCFlatPPORunnerCfg(RslRlOnPolicyRunnerCfg):
max_iterations = 500
save_interval = 50
experiment_name = "anymal_c_flat_direct"
empirical_normalization = False
policy = RslRlPpoActorCriticCfg(
init_noise_std=1.0,
actor_obs_normalization=False,
critic_obs_normalization=False,
actor_hidden_dims=[128, 128, 128],
critic_hidden_dims=[128, 128, 128],
activation="elu",
......@@ -43,9 +44,10 @@ class AnymalCRoughPPORunnerCfg(RslRlOnPolicyRunnerCfg):
max_iterations = 1500
save_interval = 50
experiment_name = "anymal_c_rough_direct"
empirical_normalization = False
policy = RslRlPpoActorCriticCfg(
init_noise_std=1.0,
actor_obs_normalization=False,
critic_obs_normalization=False,
actor_hidden_dims=[512, 256, 128],
critic_hidden_dims=[512, 256, 128],
activation="elu",
......
......@@ -14,9 +14,10 @@ class CartpolePPORunnerCfg(RslRlOnPolicyRunnerCfg):
max_iterations = 150
save_interval = 50
experiment_name = "cartpole_direct"
empirical_normalization = False
policy = RslRlPpoActorCriticCfg(
init_noise_std=1.0,
actor_obs_normalization=False,
critic_obs_normalization=False,
actor_hidden_dims=[32, 32],
critic_hidden_dims=[32, 32],
activation="elu",
......
......@@ -14,9 +14,10 @@ class FrankaCabinetPPORunnerCfg(RslRlOnPolicyRunnerCfg):
max_iterations = 1500
save_interval = 50
experiment_name = "franka_cabinet_direct"
empirical_normalization = True
policy = RslRlPpoActorCriticCfg(
init_noise_std=1.0,
actor_obs_normalization=True,
critic_obs_normalization=True,
actor_hidden_dims=[256, 128, 64],
critic_hidden_dims=[256, 128, 64],
activation="elu",
......
......@@ -14,9 +14,10 @@ class HumanoidPPORunnerCfg(RslRlOnPolicyRunnerCfg):
max_iterations = 1000
save_interval = 50
experiment_name = "humanoid_direct"
empirical_normalization = True
policy = RslRlPpoActorCriticCfg(
init_noise_std=1.0,
actor_obs_normalization=True,
critic_obs_normalization=True,
actor_hidden_dims=[400, 200, 100],
critic_hidden_dims=[400, 200, 100],
activation="elu",
......
......@@ -14,9 +14,10 @@ class QuadcopterPPORunnerCfg(RslRlOnPolicyRunnerCfg):
max_iterations = 200
save_interval = 50
experiment_name = "quadcopter_direct"
empirical_normalization = False
policy = RslRlPpoActorCriticCfg(
init_noise_std=1.0,
actor_obs_normalization=False,
critic_obs_normalization=False,
actor_hidden_dims=[64, 64],
critic_hidden_dims=[64, 64],
activation="elu",
......
......@@ -14,9 +14,10 @@ class ShadowHandPPORunnerCfg(RslRlOnPolicyRunnerCfg):
max_iterations = 10000
save_interval = 250
experiment_name = "shadow_hand"
empirical_normalization = True
policy = RslRlPpoActorCriticCfg(
init_noise_std=1.0,
actor_obs_normalization=True,
critic_obs_normalization=True,
actor_hidden_dims=[512, 512, 256, 128],
critic_hidden_dims=[512, 512, 256, 128],
activation="elu",
......@@ -43,9 +44,10 @@ class ShadowHandAsymFFPPORunnerCfg(RslRlOnPolicyRunnerCfg):
max_iterations = 10000
save_interval = 250
experiment_name = "shadow_hand_openai_ff"
empirical_normalization = True
policy = RslRlPpoActorCriticCfg(
init_noise_std=1.0,
actor_obs_normalization=True,
critic_obs_normalization=True,
actor_hidden_dims=[400, 400, 200, 100],
critic_hidden_dims=[512, 512, 256, 128],
activation="elu",
......@@ -72,9 +74,10 @@ class ShadowHandVisionFFPPORunnerCfg(RslRlOnPolicyRunnerCfg):
max_iterations = 50000
save_interval = 250
experiment_name = "shadow_hand_vision"
empirical_normalization = True
policy = RslRlPpoActorCriticCfg(
init_noise_std=1.0,
actor_obs_normalization=True,
critic_obs_normalization=True,
actor_hidden_dims=[1024, 512, 512, 256, 128],
critic_hidden_dims=[1024, 512, 512, 256, 128],
activation="elu",
......
......@@ -14,9 +14,10 @@ class AntPPORunnerCfg(RslRlOnPolicyRunnerCfg):
max_iterations = 1000
save_interval = 50
experiment_name = "ant"
empirical_normalization = False
policy = RslRlPpoActorCriticCfg(
init_noise_std=1.0,
actor_obs_normalization=False,
critic_obs_normalization=False,
actor_hidden_dims=[400, 200, 100],
critic_hidden_dims=[400, 200, 100],
activation="elu",
......
......@@ -16,9 +16,10 @@ class CartpolePPORunnerCfg(RslRlOnPolicyRunnerCfg):
max_iterations = 150
save_interval = 50
experiment_name = "cartpole"
empirical_normalization = False
policy = RslRlPpoActorCriticCfg(
init_noise_std=1.0,
actor_obs_normalization=False,
critic_obs_normalization=False,
actor_hidden_dims=[32, 32],
critic_hidden_dims=[32, 32],
activation="elu",
......
......@@ -8,6 +8,7 @@
from __future__ import annotations
import torch
from tensordict import TensorDict
from typing import TYPE_CHECKING
if TYPE_CHECKING:
......@@ -20,9 +21,8 @@ __all__ = ["compute_symmetric_states"]
@torch.no_grad()
def compute_symmetric_states(
env: ManagerBasedRLEnv,
obs: torch.Tensor | None = None,
obs: TensorDict | None = None,
actions: torch.Tensor | None = None,
obs_type: str = "policy",
):
"""Augments the given observations and actions by applying symmetry transformations.
......@@ -33,9 +33,8 @@ def compute_symmetric_states(
Args:
env: The environment instance.
obs: The original observation tensor. Defaults to None.
obs: The original observation tensor dictionary. Defaults to None.
actions: The original actions tensor. Defaults to None.
obs_type: The type of observation to augment. Defaults to "policy".
Returns:
Augmented observations and actions tensors, or None if the respective input was None.
......@@ -43,25 +42,25 @@ def compute_symmetric_states(
# observations
if obs is not None:
num_envs = obs.shape[0]
batch_size = obs.batch_size[0]
# since we have 2 different symmetries, we need to augment the batch size by 2
obs_aug = torch.zeros(num_envs * 2, obs.shape[1], device=obs.device)
obs_aug = obs.repeat(2)
# -- original
obs_aug[:num_envs] = obs[:]
obs_aug["policy"][:batch_size] = obs["policy"][:]
# -- left-right
obs_aug[num_envs : 2 * num_envs] = -obs
obs_aug["policy"][batch_size : 2 * batch_size] = -obs["policy"]
else:
obs_aug = None
# actions
if actions is not None:
num_envs = actions.shape[0]
batch_size = actions.shape[0]
# since we have 4 different symmetries, we need to augment the batch size by 4
actions_aug = torch.zeros(num_envs * 2, actions.shape[1], device=actions.device)
actions_aug = torch.zeros(batch_size * 2, actions.shape[1], device=actions.device)
# -- original
actions_aug[:num_envs] = actions[:]
actions_aug[:batch_size] = actions[:]
# -- left-right
actions_aug[num_envs : 2 * num_envs] = -actions
actions_aug[batch_size : 2 * batch_size] = -actions
else:
actions_aug = None
......
......@@ -14,9 +14,10 @@ class HumanoidPPORunnerCfg(RslRlOnPolicyRunnerCfg):
max_iterations = 1000
save_interval = 50
experiment_name = "humanoid"
empirical_normalization = False
policy = RslRlPpoActorCriticCfg(
init_noise_std=1.0,
actor_obs_normalization=False,
critic_obs_normalization=False,
actor_hidden_dims=[400, 200, 100],
critic_hidden_dims=[400, 200, 100],
activation="elu",
......
......@@ -14,9 +14,10 @@ class DigitLocoManipPPORunnerCfg(RslRlOnPolicyRunnerCfg):
max_iterations = 2000
save_interval = 50
experiment_name = "digit_loco_manip"
empirical_normalization = False
policy = RslRlPpoActorCriticCfg(
init_noise_std=1.0,
actor_obs_normalization=False,
critic_obs_normalization=False,
actor_hidden_dims=[256, 128, 128],
critic_hidden_dims=[256, 128, 128],
activation="elu",
......
......@@ -14,9 +14,10 @@ class UnitreeA1RoughPPORunnerCfg(RslRlOnPolicyRunnerCfg):
max_iterations = 1500
save_interval = 50
experiment_name = "unitree_a1_rough"
empirical_normalization = False
policy = RslRlPpoActorCriticCfg(
init_noise_std=1.0,
actor_obs_normalization=False,
critic_obs_normalization=False,
actor_hidden_dims=[512, 256, 128],
critic_hidden_dims=[512, 256, 128],
activation="elu",
......
......@@ -16,9 +16,10 @@ class AnymalBRoughPPORunnerCfg(RslRlOnPolicyRunnerCfg):
max_iterations = 1500
save_interval = 50
experiment_name = "anymal_b_rough"
empirical_normalization = False
policy = RslRlPpoActorCriticCfg(
init_noise_std=1.0,
actor_obs_normalization=False,
critic_obs_normalization=False,
actor_hidden_dims=[512, 256, 128],
critic_hidden_dims=[512, 256, 128],
activation="elu",
......
......@@ -16,9 +16,10 @@ class AnymalCRoughPPORunnerCfg(RslRlOnPolicyRunnerCfg):
max_iterations = 1500
save_interval = 50
experiment_name = "anymal_c_rough"
empirical_normalization = False
policy = RslRlPpoActorCriticCfg(
init_noise_std=1.0,
actor_obs_normalization=False,
critic_obs_normalization=False,
actor_hidden_dims=[512, 256, 128],
critic_hidden_dims=[512, 256, 128],
activation="elu",
......
......@@ -16,9 +16,10 @@ class AnymalDRoughPPORunnerCfg(RslRlOnPolicyRunnerCfg):
max_iterations = 1500
save_interval = 50
experiment_name = "anymal_d_rough"
empirical_normalization = False
policy = RslRlPpoActorCriticCfg(
init_noise_std=1.0,
actor_obs_normalization=False,
critic_obs_normalization=False,
actor_hidden_dims=[512, 256, 128],
critic_hidden_dims=[512, 256, 128],
activation="elu",
......
......@@ -14,9 +14,10 @@ class CassieRoughPPORunnerCfg(RslRlOnPolicyRunnerCfg):
max_iterations = 1500
save_interval = 50
experiment_name = "cassie_rough"
empirical_normalization = False
policy = RslRlPpoActorCriticCfg(
init_noise_std=1.0,
actor_obs_normalization=False,
critic_obs_normalization=False,
actor_hidden_dims=[512, 256, 128],
critic_hidden_dims=[512, 256, 128],
activation="elu",
......
......@@ -14,9 +14,10 @@ class DigitRoughPPORunnerCfg(RslRlOnPolicyRunnerCfg):
max_iterations = 3000
save_interval = 50
experiment_name = "digit_rough"
empirical_normalization = False
policy = RslRlPpoActorCriticCfg(
init_noise_std=1.0,
actor_obs_normalization=False,
critic_obs_normalization=False,
actor_hidden_dims=[512, 256, 128],
critic_hidden_dims=[512, 256, 128],
activation="elu",
......
......@@ -14,9 +14,10 @@ class G1RoughPPORunnerCfg(RslRlOnPolicyRunnerCfg):
max_iterations = 3000
save_interval = 50
experiment_name = "g1_rough"
empirical_normalization = False
policy = RslRlPpoActorCriticCfg(
init_noise_std=1.0,
actor_obs_normalization=False,
critic_obs_normalization=False,
actor_hidden_dims=[512, 256, 128],
critic_hidden_dims=[512, 256, 128],
activation="elu",
......
......@@ -14,9 +14,10 @@ class UnitreeGo1RoughPPORunnerCfg(RslRlOnPolicyRunnerCfg):
max_iterations = 1500
save_interval = 50
experiment_name = "unitree_go1_rough"
empirical_normalization = False
policy = RslRlPpoActorCriticCfg(
init_noise_std=1.0,
actor_obs_normalization=False,
critic_obs_normalization=False,
actor_hidden_dims=[512, 256, 128],
critic_hidden_dims=[512, 256, 128],
activation="elu",
......
......@@ -14,9 +14,10 @@ class UnitreeGo2RoughPPORunnerCfg(RslRlOnPolicyRunnerCfg):
max_iterations = 1500
save_interval = 50
experiment_name = "unitree_go2_rough"
empirical_normalization = False
policy = RslRlPpoActorCriticCfg(
init_noise_std=1.0,
actor_obs_normalization=False,
critic_obs_normalization=False,
actor_hidden_dims=[512, 256, 128],
critic_hidden_dims=[512, 256, 128],
activation="elu",
......
......@@ -14,9 +14,10 @@ class H1RoughPPORunnerCfg(RslRlOnPolicyRunnerCfg):
max_iterations = 3000
save_interval = 50
experiment_name = "h1_rough"
empirical_normalization = False
policy = RslRlPpoActorCriticCfg(
init_noise_std=1.0,
actor_obs_normalization=False,
critic_obs_normalization=False,
actor_hidden_dims=[512, 256, 128],
critic_hidden_dims=[512, 256, 128],
activation="elu",
......
......@@ -14,10 +14,11 @@ class SpotFlatPPORunnerCfg(RslRlOnPolicyRunnerCfg):
max_iterations = 20000
save_interval = 50
experiment_name = "spot_flat"
empirical_normalization = False
store_code_state = False
policy = RslRlPpoActorCriticCfg(
init_noise_std=1.0,
actor_obs_normalization=False,
critic_obs_normalization=False,
actor_hidden_dims=[512, 256, 128],
critic_hidden_dims=[512, 256, 128],
activation="elu",
......
......@@ -9,6 +9,7 @@
from __future__ import annotations
import torch
from tensordict import TensorDict
from typing import TYPE_CHECKING
if TYPE_CHECKING:
......@@ -21,9 +22,8 @@ __all__ = ["compute_symmetric_states"]
@torch.no_grad()
def compute_symmetric_states(
env: ManagerBasedRLEnv,
obs: torch.Tensor | None = None,
obs: TensorDict | None = None,
actions: torch.Tensor | None = None,
obs_type: str = "policy",
):
"""Augments the given observations and actions by applying symmetry transformations.
......@@ -34,9 +34,8 @@ def compute_symmetric_states(
Args:
env: The environment instance.
obs: The original observation tensor. Defaults to None.
obs: The original observation tensor dictionary. Defaults to None.
actions: The original actions tensor. Defaults to None.
obs_type: The type of observation to augment. Defaults to "policy".
Returns:
Augmented observations and actions tensors, or None if the respective input was None.
......@@ -44,33 +43,39 @@ def compute_symmetric_states(
# observations
if obs is not None:
num_envs = obs.shape[0]
batch_size = obs.batch_size[0]
# since we have 4 different symmetries, we need to augment the batch size by 4
obs_aug = torch.zeros(num_envs * 4, obs.shape[1], device=obs.device)
obs_aug = obs.repeat(4)
# policy observation group
# -- original
obs_aug[:num_envs] = obs[:]
obs_aug["policy"][:batch_size] = obs["policy"][:]
# -- left-right
obs_aug[num_envs : 2 * num_envs] = _transform_obs_left_right(env.unwrapped, obs, obs_type)
obs_aug["policy"][batch_size : 2 * batch_size] = _transform_policy_obs_left_right(env.unwrapped, obs["policy"])
# -- front-back
obs_aug[2 * num_envs : 3 * num_envs] = _transform_obs_front_back(env.unwrapped, obs, obs_type)
obs_aug["policy"][2 * batch_size : 3 * batch_size] = _transform_policy_obs_front_back(
env.unwrapped, obs["policy"]
)
# -- diagonal
obs_aug[3 * num_envs :] = _transform_obs_front_back(env.unwrapped, obs_aug[num_envs : 2 * num_envs])
obs_aug["policy"][3 * batch_size :] = _transform_policy_obs_front_back(
env.unwrapped, obs_aug["policy"][batch_size : 2 * batch_size]
)
else:
obs_aug = None
# actions
if actions is not None:
num_envs = actions.shape[0]
batch_size = actions.shape[0]
# since we have 4 different symmetries, we need to augment the batch size by 4
actions_aug = torch.zeros(num_envs * 4, actions.shape[1], device=actions.device)
actions_aug = torch.zeros(batch_size * 4, actions.shape[1], device=actions.device)
# -- original
actions_aug[:num_envs] = actions[:]
actions_aug[:batch_size] = actions[:]
# -- left-right
actions_aug[num_envs : 2 * num_envs] = _transform_actions_left_right(actions)
actions_aug[batch_size : 2 * batch_size] = _transform_actions_left_right(actions)
# -- front-back
actions_aug[2 * num_envs : 3 * num_envs] = _transform_actions_front_back(actions)
actions_aug[2 * batch_size : 3 * batch_size] = _transform_actions_front_back(actions)
# -- diagonal
actions_aug[3 * num_envs :] = _transform_actions_front_back(actions_aug[num_envs : 2 * num_envs])
actions_aug[3 * batch_size :] = _transform_actions_front_back(actions_aug[batch_size : 2 * batch_size])
else:
actions_aug = None
......@@ -82,7 +87,7 @@ Symmetry functions for observations.
"""
def _transform_obs_left_right(env: ManagerBasedRLEnv, obs: torch.Tensor, obs_type: str = "policy") -> torch.Tensor:
def _transform_policy_obs_left_right(env: ManagerBasedRLEnv, obs: torch.Tensor) -> torch.Tensor:
"""Apply a left-right symmetry transformation to the observation tensor.
This function modifies the given observation tensor by applying transformations
......@@ -95,7 +100,6 @@ def _transform_obs_left_right(env: ManagerBasedRLEnv, obs: torch.Tensor, obs_typ
Args:
env: The environment instance from which the observation is obtained.
obs: The observation tensor to be transformed.
obs_type: The type of observation to augment. Defaults to "policy".
Returns:
The transformed observation tensor with left-right symmetry applied.
......@@ -118,21 +122,14 @@ def _transform_obs_left_right(env: ManagerBasedRLEnv, obs: torch.Tensor, obs_typ
# last actions
obs[:, 36:48] = _switch_anymal_joints_left_right(obs[:, 36:48])
# height-scan
if obs_type == "critic":
# handle asymmetric actor-critic formulation
group_name = "critic" if "critic" in env.observation_manager.active_terms else "policy"
else:
group_name = "policy"
# note: this is hard-coded for grid-pattern of ordering "xy" and size (1.6, 1.0)
if "height_scan" in env.observation_manager.active_terms[group_name]:
if "height_scan" in env.observation_manager.active_terms["policy"]:
obs[:, 48:235] = obs[:, 48:235].view(-1, 11, 17).flip(dims=[1]).view(-1, 11 * 17)
return obs
def _transform_obs_front_back(env: ManagerBasedRLEnv, obs: torch.Tensor, obs_type: str = "policy") -> torch.Tensor:
def _transform_policy_obs_front_back(env: ManagerBasedRLEnv, obs: torch.Tensor) -> torch.Tensor:
"""Applies a front-back symmetry transformation to the observation tensor.
This function modifies the given observation tensor by applying transformations
......@@ -144,7 +141,6 @@ def _transform_obs_front_back(env: ManagerBasedRLEnv, obs: torch.Tensor, obs_typ
Args:
env: The environment instance from which the observation is obtained.
obs: The observation tensor to be transformed.
obs_type: The type of observation to augment. Defaults to "policy".
Returns:
The transformed observation tensor with front-back symmetry applied.
......@@ -167,15 +163,8 @@ def _transform_obs_front_back(env: ManagerBasedRLEnv, obs: torch.Tensor, obs_typ
# last actions
obs[:, 36:48] = _switch_anymal_joints_front_back(obs[:, 36:48])
# height-scan
if obs_type == "critic":
# handle asymmetric actor-critic formulation
group_name = "critic" if "critic" in env.observation_manager.active_terms else "policy"
else:
group_name = "policy"
# note: this is hard-coded for grid-pattern of ordering "xy" and size (1.6, 1.0)
if "height_scan" in env.observation_manager.active_terms[group_name]:
if "height_scan" in env.observation_manager.active_terms["policy"]:
obs[:, 48:235] = obs[:, 48:235].view(-1, 11, 17).flip(dims=[2]).view(-1, 11 * 17)
return obs
......
......@@ -14,9 +14,10 @@ class CabinetPPORunnerCfg(RslRlOnPolicyRunnerCfg):
max_iterations = 400
save_interval = 50
experiment_name = "franka_open_drawer"
empirical_normalization = False
policy = RslRlPpoActorCriticCfg(
init_noise_std=1.0,
actor_obs_normalization=False,
critic_obs_normalization=False,
actor_hidden_dims=[256, 128, 64],
critic_hidden_dims=[256, 128, 64],
activation="elu",
......
......@@ -14,9 +14,10 @@ class AllegroCubePPORunnerCfg(RslRlOnPolicyRunnerCfg):
max_iterations = 5000
save_interval = 50
experiment_name = "allegro_cube"
empirical_normalization = True
policy = RslRlPpoActorCriticCfg(
init_noise_std=1.0,
actor_obs_normalization=True,
critic_obs_normalization=True,
actor_hidden_dims=[512, 256, 128],
critic_hidden_dims=[512, 256, 128],
activation="elu",
......
......@@ -14,9 +14,10 @@ class LiftCubePPORunnerCfg(RslRlOnPolicyRunnerCfg):
max_iterations = 1500
save_interval = 50
experiment_name = "franka_lift"
empirical_normalization = False
policy = RslRlPpoActorCriticCfg(
init_noise_std=1.0,
actor_obs_normalization=False,
critic_obs_normalization=False,
actor_hidden_dims=[256, 128, 64],
critic_hidden_dims=[256, 128, 64],
activation="elu",
......
......@@ -15,10 +15,10 @@ class FrankaReachPPORunnerCfg(RslRlOnPolicyRunnerCfg):
save_interval = 50
experiment_name = "franka_reach"
run_name = ""
resume = False
empirical_normalization = False
policy = RslRlPpoActorCriticCfg(
init_noise_std=1.0,
actor_obs_normalization=False,
critic_obs_normalization=False,
actor_hidden_dims=[64, 64],
critic_hidden_dims=[64, 64],
activation="elu",
......
......@@ -15,10 +15,10 @@ class UR10ReachPPORunnerCfg(RslRlOnPolicyRunnerCfg):
save_interval = 50
experiment_name = "reach_ur10"
run_name = ""
resume = False
empirical_normalization = False
policy = RslRlPpoActorCriticCfg(
init_noise_std=1.0,
actor_obs_normalization=False,
critic_obs_normalization=False,
actor_hidden_dims=[64, 64],
critic_hidden_dims=[64, 64],
activation="elu",
......
......@@ -14,9 +14,10 @@ class NavigationEnvPPORunnerCfg(RslRlOnPolicyRunnerCfg):
max_iterations = 1500
save_interval = 50
experiment_name = "anymal_c_navigation"
empirical_normalization = False
policy = RslRlPpoActorCriticCfg(
init_noise_std=0.5,
actor_obs_normalization=False,
critic_obs_normalization=False,
actor_hidden_dims=[128, 128],
critic_hidden_dims=[128, 128],
activation="elu",
......
......@@ -14,9 +14,10 @@ class PPORunnerCfg(RslRlOnPolicyRunnerCfg):
max_iterations = 150
save_interval = 50
experiment_name = "cartpole_direct"
empirical_normalization = False
policy = RslRlPpoActorCriticCfg(
init_noise_std=1.0,
actor_obs_normalization=False,
critic_obs_normalization=False,
actor_hidden_dims=[32, 32],
critic_hidden_dims=[32, 32],
activation="elu",
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment