Adds action clipping to rsl-rl wrapper (#2019)

# Description Currently, the actions from the policy are directly applied to the environment and also often fed back to the policy using the last action as observation. Doing this can lead to instability during training since applying a large action can introduce a negative feedback loop. More specifically, applying a very large action leads to a large last_action observations, which often results in a large error in the critic, which can lead to even larger actions being sampled in the future. This PR aims to fix this for RSL-RL library, by clipping the actions to (large) hard limits before applying them to the environment. This prohibits the actions from growing continuously and greatly improves training stability. Fixes #984, #1732, #1999 ## Type of change - Bug fix (non-breaking change which fixes an issue) - New feature (non-breaking change which adds functionality) ## Checklist - [x] I have run the [`pre-commit` checks](https://pre-commit.com/) with `./isaaclab.sh --format` - [x] I have made corresponding changes to the documentation - [x] My changes generate no new warnings - [ ] I have added tests that prove my fix is effective or that my feature works - [x] I have updated the changelog and the corresponding version in the extension's `config/extension.toml` file - [x] I have added my name to the `CONTRIBUTORS.md` or my name already exists there

Adds action clipping to rsl-rl wrapper (#2019)
# Description Currently, the actions from the policy are directly applied to the environment and also often fed back to the policy using the last action as observation. Doing this can lead to instability during training since applying a large action can introduce a negative feedback loop. More specifically, applying a very large action leads to a large last_action observations, which often results in a large error in the critic, which can lead to even larger actions being sampled in the future. This PR aims to fix this for RSL-RL library, by clipping the actions to (large) hard limits before applying them to the environment. This prohibits the actions from growing continuously and greatly improves training stability. Fixes #984, #1732, #1999 ## Type of change - Bug fix (non-breaking change which fixes an issue) - New feature (non-breaking change which adds functionality) ## Checklist - [x] I have run the [`pre-commit` checks](https://pre-commit.com/) with `./isaaclab.sh --format` - [x] I have made corresponding changes to the documentation - [x] My changes generate no new warnings - [ ] I have added tests that prove my fix is effective or that my feature works - [x] I have updated the changelog and the corresponding version in the extension's `config/extension.toml` file - [x] I have added my name to the `CONTRIBUTORS.md` or my name already exists there
f774425b · Mayank Mittal · GitHub · e6f63e21 · f774425b · f774425b
Unverified Commit f774425b authored Mar 13, 2025 by Mayank Mittal Committed by GitHub Mar 13, 2025
6 changed files
--- a/scripts/reinforcement_learning/rsl_rl/play.py
+++ b/scripts/reinforcement_learning/rsl_rl/play.py
@@ -106,7 +106,7 @@ def main():
        env = gym.wrappers.RecordVideo(env, **video_kwargs)

    # wrap around environment for rsl-rl
-    env = RslRlVecEnvWrapper(env)
+    env = RslRlVecEnvWrapper(env, clip_actions=agent_cfg.clip_actions)

    print(f"[INFO]: Loading model checkpoint from: {resume_path}")
    # load previously trained model

--- a/scripts/reinforcement_learning/rsl_rl/train.py
+++ b/scripts/reinforcement_learning/rsl_rl/train.py
@@ -124,7 +124,7 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen
        env = gym.wrappers.RecordVideo(env, **video_kwargs)

    # wrap around environment for rsl-rl
-    env = RslRlVecEnvWrapper(env)
+    env = RslRlVecEnvWrapper(env, clip_actions=agent_cfg.clip_actions)

    # create runner from rsl-rl
    runner = OnPolicyRunner(env, agent_cfg.to_dict(), log_dir=log_dir, device=agent_cfg.device)

--- a/source/isaaclab_rl/config/extension.toml
+++ b/source/isaaclab_rl/config/extension.toml
 [package]

 # Note: Semantic Versioning is used: https://semver.org/
-version = "0.1.0"
+version = "0.1.1"

 # Description
 title = "Isaac Lab RL"

--- a/source/isaaclab_rl/docs/CHANGELOG.rst
+++ b/source/isaaclab_rl/docs/CHANGELOG.rst
 Changelog
 ---------

+0.1.1 (2025-03-10)
+~~~~~~~~~~~~~~~~~~
+
+Added
+^^^^^
+
+* Added a parameter to clip the actions in the action space inside the RSL-RL wrapper.
+  This parameter is set to None by default, which is the same as not clipping the actions.
+* Added attribute :attr:`isaaclab_rl.rsl_rl.RslRlOnPolicyRunnerCfg.clip_actions` to set
+  the clipping range for the actions in the RSL-RL on-policy runner.
+
+
 0.1.0 (2024-12-27)
 ~~~~~~~~~~~~~~~~~~


--- a/source/isaaclab_rl/isaaclab_rl/rsl_rl/rl_cfg.py
+++ b/source/isaaclab_rl/isaaclab_rl/rsl_rl/rl_cfg.py
@@ -98,6 +98,9 @@ class RslRlOnPolicyRunnerCfg:
    algorithm: RslRlPpoAlgorithmCfg = MISSING
    """The algorithm configuration."""

+    clip_actions: float | None = None
+    """The clipping value for actions. If ``None``, then no clipping is done."""
+
    ##
    # Checkpointing parameters
    ##

--- a/source/isaaclab_rl/isaaclab_rl/rsl_rl/vecenv_wrapper.py
+++ b/source/isaaclab_rl/isaaclab_rl/rsl_rl/vecenv_wrapper.py
@@ -30,7 +30,7 @@ class RslRlVecEnvWrapper(VecEnv):
        https://github.com/leggedrobotics/rsl_rl/blob/master/rsl_rl/env/vec_env.py
    """

-    def __init__(self, env: ManagerBasedRLEnv | DirectRLEnv):
+    def __init__(self, env: ManagerBasedRLEnv | DirectRLEnv, clip_actions: float | None = None):
        """Initializes the wrapper.

        Note:
@@ -38,6 +38,7 @@ class RslRlVecEnvWrapper(VecEnv):

        Args:
            env: The environment to wrap around.
+            clip_actions: The clipping value for actions. If ``None``, then no clipping is done.

        Raises:
            ValueError: When the environment is not an instance of :class:`ManagerBasedRLEnv` or :class:`DirectRLEnv`.
@@ -50,10 +51,17 @@ class RslRlVecEnvWrapper(VecEnv):
            )
        # initialize the wrapper
        self.env = env
+        self.clip_actions = clip_actions
+
        # store information required by wrapper
        self.num_envs = self.unwrapped.num_envs
        self.device = self.unwrapped.device
        self.max_episode_length = self.unwrapped.max_episode_length
+
+        # modify the action space to the clip range
+        self._modify_action_space()
+
+        # obtain dimensions of the environment
        if hasattr(self.unwrapped, "action_manager"):
            self.num_actions = self.unwrapped.action_manager.total_action_dim
        else:
@@ -72,6 +80,7 @@ class RslRlVecEnvWrapper(VecEnv):
            self.num_privileged_obs = gym.spaces.flatdim(self.unwrapped.single_observation_space["critic"])
        else:
            self.num_privileged_obs = 0
+
        # reset at the start since the RSL-RL runner does not call reset
        self.env.reset()

@@ -160,6 +169,9 @@ class RslRlVecEnvWrapper(VecEnv):
        return obs_dict["policy"], {"observations": obs_dict}

    def step(self, actions: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, dict]:
+        # clip actions
+        if self.clip_actions is not None:
+            actions = torch.clamp(actions, -self.clip_actions, self.clip_actions)
        # record step information
        obs_dict, rew, terminated, truncated, extras = self.env.step(actions)
        # compute dones for compatibility with RSL-RL
@@ -177,3 +189,21 @@ class RslRlVecEnvWrapper(VecEnv):

    def close(self):  # noqa: D102
        return self.env.close()
+
+    """
+    Helper functions
+    """
+
+    def _modify_action_space(self):
+        """Modifies the action space to the clip range."""
+        if self.clip_actions is None:
+            return
+
+        # modify the action space to the clip range
+        # note: this is only possible for the box action space. we need to change it in the future for other action spaces.
+        self.env.unwrapped.single_action_space = gym.spaces.Box(
+            low=-self.clip_actions, high=self.clip_actions, shape=(self.num_actions,)
+        )
+        self.env.unwrapped.action_space = gym.vector.utils.batch_space(
+            self.env.unwrapped.single_action_space, self.num_envs
+        )