Add reach environment for Franka and UR10 (#244)

# Description This MR fixes the reach environment for end-effector pose tracking to the new structure. It adds support for two robots: Franka-Panda and UR10. The training works with RSL-RL and RL-Games. ## Type of change - New feature (non-breaking change which adds functionality) ## Checklist - [x] I have run the [`pre-commit` checks](https://pre-commit.com/) with `./orbit.sh --format` - [ ] I have made corresponding changes to the documentation - [x] My changes generate no new warnings - [ ] I have added tests that prove my fix is effective or that my feature works - [ ] I have updated the changelog and the corresponding version in the extension's `config/extension.toml` file - [x] I have added my name to the `CONTRIBUTORS.md` or my name already exists there --------- Co-authored-by: Mayank Mittal <mittalma@leggedrobotics.com>

Add reach environment for Franka and UR10 (#244)
# Description This MR fixes the reach environment for end-effector pose tracking to the new structure. It adds support for two robots: Franka-Panda and UR10. The training works with RSL-RL and RL-Games. ## Type of change - New feature (non-breaking change which adds functionality) ## Checklist - [x] I have run the [`pre-commit` checks](https://pre-commit.com/) with `./orbit.sh --format` - [ ] I have made corresponding changes to the documentation - [x] My changes generate no new warnings - [ ] I have added tests that prove my fix is effective or that my feature works - [ ] I have updated the changelog and the corresponding version in the extension's `config/extension.toml` file - [x] I have added my name to the `CONTRIBUTORS.md` or my name already exists there --------- Co-authored-by: Mayank Mittal <mittalma@leggedrobotics.com>
0bde6fb2 · David Hoeller · GitHub · 39f4e96e · 0bde6fb2 · 0bde6fb2
Unverified Commit 0bde6fb2 authored Nov 24, 2023 by David Hoeller Committed by GitHub Nov 24, 2023
25 changed files
--- a/source/extensions/omni.isaac.orbit/config/extension.toml
+++ b/source/extensions/omni.isaac.orbit/config/extension.toml
 [package]

 # Note: Semantic Versioning is used: https://semver.org/
-version = "0.9.44"
+version = "0.9.45"

 # Description
 title = "ORBIT framework for Robot Learning"

--- a/source/extensions/omni.isaac.orbit/docs/CHANGELOG.rst
+++ b/source/extensions/omni.isaac.orbit/docs/CHANGELOG.rst
 Changelog
 ---------

+0.9.45 (2023-11-24)
+~~~~~~~~~~~~~~~~~~~
+
+Added
+^^^^^
+
+* Added :class:`omni.isaac.orbit.command_generators.UniformPoseCommandGenerator` to generate
+  poses in the asset's root frame by uniformly sampling from a given range.
+
+
 0.9.44 (2023-11-16)
 ~~~~~~~~~~~~~~~~~~~


--- a/source/extensions/omni.isaac.orbit/omni/isaac/orbit/command_generators/__init__.py
+++ b/source/extensions/omni.isaac.orbit/omni/isaac/orbit/command_generators/__init__.py
@@ -23,15 +23,20 @@ from .command_generator_cfg import (
    NormalVelocityCommandGeneratorCfg,
    NullCommandGeneratorCfg,
    TerrainBasedPositionCommandGeneratorCfg,
+    UniformPoseCommandGeneratorCfg,
    UniformVelocityCommandGeneratorCfg,
 )
 from .null_command_generator import NullCommandGenerator
+from .pose_command_generator import UniformPoseCommandGenerator
 from .position_command_generator import TerrainBasedPositionCommandGenerator
 from .velocity_command_generator import NormalVelocityCommandGenerator, UniformVelocityCommandGenerator

 __all__ = [
    "CommandGeneratorBase",
    "CommandGeneratorBaseCfg",
+    # pose command generators
+    "UniformPoseCommandGenerator",
+    "UniformPoseCommandGeneratorCfg",
    # null command generator
    "NullCommandGenerator",
    "NullCommandGeneratorCfg",

--- a/source/extensions/omni.isaac.orbit/omni/isaac/orbit/command_generators/command_generator_cfg.py
+++ b/source/extensions/omni.isaac.orbit/omni/isaac/orbit/command_generators/command_generator_cfg.py
@@ -12,6 +12,7 @@ from omni.isaac.orbit.utils import configclass

 from .command_generator_base import CommandGeneratorBase
 from .null_command_generator import NullCommandGenerator
+from .pose_command_generator import UniformPoseCommandGenerator
 from .position_command_generator import TerrainBasedPositionCommandGenerator
 from .velocity_command_generator import NormalVelocityCommandGenerator, UniformVelocityCommandGenerator

@@ -123,6 +124,32 @@ class NormalVelocityCommandGeneratorCfg(UniformVelocityCommandGeneratorCfg):
    """Distribution ranges for the velocity commands."""


+@configclass
+class UniformPoseCommandGeneratorCfg(CommandGeneratorBaseCfg):
+    """Configuration for uniform pose command generator."""
+
+    class_type: type = UniformPoseCommandGenerator
+
+    asset_name: str = MISSING
+    """Name of the asset in the environment for which the commands are generated."""
+    body_name: str = MISSING
+    """Name of the body in the asset for which the commands are generated."""
+
+    @configclass
+    class Ranges:
+        """Uniform distribution ranges for the pose commands."""
+
+        pos_x: tuple[float, float] = MISSING  # min max [m]
+        pos_y: tuple[float, float] = MISSING  # min max [m]
+        pos_z: tuple[float, float] = MISSING  # min max [m]
+        roll: tuple[float, float] = MISSING  # min max [rad]
+        pitch: tuple[float, float] = MISSING  # min max [rad]
+        yaw: tuple[float, float] = MISSING  # min max [rad]
+
+    ranges: Ranges = MISSING
+    """Ranges for the commands."""
+
+
 @configclass
 class TerrainBasedPositionCommandGeneratorCfg(CommandGeneratorBaseCfg):
    """Configuration for the terrain-based position command generator."""

--- a/source/extensions/omni.isaac.orbit/omni/isaac/orbit/command_generators/pose_command_generator.py
+++ b/source/extensions/omni.isaac.orbit/omni/isaac/orbit/command_generators/pose_command_generator.py
+# Copyright (c) 2022-2023, The ORBIT Project Developers.
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+"""Sub-module containing command generators for pose tracking."""
+
+from __future__ import annotations
+
+import torch
+from typing import TYPE_CHECKING, Sequence
+
+from omni.isaac.orbit.assets import Articulation
+from omni.isaac.orbit.markers import VisualizationMarkers
+from omni.isaac.orbit.markers.config import FRAME_MARKER_CFG
+from omni.isaac.orbit.utils.math import combine_frame_transforms, compute_pose_error, quat_from_euler_xyz
+
+from .command_generator_base import CommandGeneratorBase
+
+if TYPE_CHECKING:
+    from omni.isaac.orbit.envs import BaseEnv
+
+    from .command_generator_cfg import UniformPoseCommandGeneratorCfg
+
+
+class UniformPoseCommandGenerator(CommandGeneratorBase):
+    """Command generator for generating pose commands uniformly.
+
+    The command generator generates poses by sampling positions uniformly within specified
+    regions in cartesian space. For orientation, it samples uniformly the euler angles
+    (roll-pitch-yaw) and converts them into quaternion representation (w, x, y, z).
+
+    The position and orientation commands are generated in the base frame of the robot, and not the
+    simulation world frame. This means that users need to handle the transformation from the
+    base frame to the simulation world frame themselves.
+
+    .. caution::
+
+        Sampling orientations uniformly is not strictly the same as sampling euler angles uniformly.
+        This is because rotations are defined by 3D non-Euclidean space, and the mapping
+        from euler angles to rotations is not one-to-one.
+
+    """
+
+    cfg: UniformPoseCommandGeneratorCfg
+    """Configuration for the command generator."""
+
+    def __init__(self, cfg: UniformPoseCommandGeneratorCfg, env: BaseEnv):
+        """Initialize the command generator class.
+
+        Args:
+            cfg: The configuration parameters for the command generator.
+            env: The environment object.
+        """
+        # initialize the base class
+        super().__init__(cfg, env)
+
+        # extract the robot and body index for which the command is generated
+        self.robot: Articulation = env.scene[cfg.asset_name]
+        self.body_idx = self.robot.find_bodies(cfg.body_name)[0][0]
+
+        # create buffers
+        # -- commands: (x, y, z, qw, qx, qy, qz) in root frame
+        self.pose_command_b = torch.zeros(self.num_envs, 7, device=self.device)
+        self.pose_command_b[:, 3] = 1.0
+        self.pose_command_w = torch.zeros_like(self.pose_command_b)
+        # -- metrics
+        self.metrics["position_error"] = torch.zeros(self.num_envs, device=self.device)
+        self.metrics["orientation_error"] = torch.zeros(self.num_envs, device=self.device)
+
+    def __str__(self) -> str:
+        msg = "PoseCommandGenerator:\n"
+        msg += f"\tCommand dimension: {tuple(self.command.shape[1:])}\n"
+        msg += f"\tResampling time range: {self.cfg.resampling_time_range}\n"
+        return msg
+
+    """
+    Properties
+    """
+
+    @property
+    def command(self) -> torch.Tensor:
+        """The desired pose command. Shape is (num_envs, 7).
+
+        The first three elements correspond to the position, followed by the quaternion orientation in (w, x, y, z).
+        """
+        return self.pose_command_b
+
+    """
+    Implementation specific functions.
+    """
+
+    def _resample_command(self, env_ids: Sequence[int]):
+        # sample new pose targets
+        # -- position
+        r = torch.empty(len(env_ids), device=self.device)
+        self.pose_command_b[env_ids, 0] = r.uniform_(*self.cfg.ranges.pos_x)
+        self.pose_command_b[env_ids, 1] = r.uniform_(*self.cfg.ranges.pos_y)
+        self.pose_command_b[env_ids, 2] = r.uniform_(*self.cfg.ranges.pos_z)
+        # -- orientation
+        euler_angles = torch.zeros_like(self.pose_command_b[env_ids, :3])
+        euler_angles[:, 0].uniform_(*self.cfg.ranges.roll)
+        euler_angles[:, 1].uniform_(*self.cfg.ranges.pitch)
+        euler_angles[:, 2].uniform_(*self.cfg.ranges.yaw)
+        self.pose_command_b[env_ids, 3:] = quat_from_euler_xyz(
+            euler_angles[:, 0], euler_angles[:, 1], euler_angles[:, 2]
+        )
+
+    def _update_command(self):
+        pass
+
+    def _update_metrics(self):
+        # transform command from base frame to simulation world frame
+        self.pose_command_w[:, :3], self.pose_command_w[:, 3:] = combine_frame_transforms(
+            self.robot.data.root_pos_w,
+            self.robot.data.root_quat_w,
+            self.pose_command_b[:, :3],
+            self.pose_command_b[:, 3:],
+        )
+        # compute the error
+        pos_error, rot_error = compute_pose_error(
+            self.pose_command_w[:, :3],
+            self.pose_command_w[:, 3:],
+            self.robot.data.body_state_w[:, self.body_idx, :3],
+            self.robot.data.body_state_w[:, self.body_idx, 3:7],
+        )
+        self.metrics["position_error"] = torch.norm(pos_error, dim=-1)
+        self.metrics["orientation_error"] = torch.norm(rot_error, dim=-1)
+
+    def _set_debug_vis_impl(self, debug_vis: bool):
+        # create markers if necessary for the first tome
+        if debug_vis:
+            if not hasattr(self, "goal_pose_visualizer"):
+                marker_cfg = FRAME_MARKER_CFG.copy()
+                marker_cfg.markers["frame"].scale = (0.1, 0.1, 0.1)
+                # -- goal pose
+                marker_cfg.prim_path = "/Visuals/Command/goal_pose"
+                self.goal_pose_visualizer = VisualizationMarkers(marker_cfg)
+                # -- current body pose
+                marker_cfg.prim_path = "/Visuals/Command/body_pose"
+                self.body_pose_visualizer = VisualizationMarkers(marker_cfg)
+            # set their visibility to true
+            self.goal_pose_visualizer.set_visibility(True)
+            self.body_pose_visualizer.set_visibility(True)
+        else:
+            if hasattr(self, "goal_pose_visualizer"):
+                self.goal_pose_visualizer.set_visibility(False)
+                self.body_pose_visualizer.set_visibility(False)
+
+    def _debug_vis_callback(self, event):
+        # update the markers
+        # -- goal pose
+        self.goal_pose_visualizer.visualize(self.pose_command_w[:, :3], self.pose_command_w[:, 3:])
+        # -- current body pose
+        body_pose_w = self.robot.data.body_state_w[:, self.body_idx]
+        self.body_pose_visualizer.visualize(body_pose_w[:, :3], body_pose_w[:, 3:7])
--- a/source/extensions/omni.isaac.orbit/omni/isaac/orbit/managers/action_manager.py
+++ b/source/extensions/omni.isaac.orbit/omni/isaac/orbit/managers/action_manager.py
@@ -251,6 +251,9 @@ class ActionManager(ManagerBase):
        else:
            cfg_items = self.cfg.__dict__.items()
        for term_name, term_cfg in cfg_items:
+            # check if term config is None
+            if term_cfg is None:
+                continue
            # check valid type
            if not isinstance(term_cfg, ActionTermCfg):
                raise TypeError(

--- a/source/extensions/omni.isaac.orbit/omni/isaac/orbit/utils/math.py
+++ b/source/extensions/omni.isaac.orbit/omni/isaac/orbit/utils/math.py
@@ -42,6 +42,7 @@ __all__ = [
    "quat_from_matrix",
    "quat_apply_yaw",
    "quat_box_minus",
+    "quat_error_magnitude",
    "yaw_quat",
    "euler_xyz_from_quat",
    "axis_angle_from_quat",
@@ -496,9 +497,8 @@ def axis_angle_from_quat(quat: torch.Tensor, eps: float = 1.0e-6) -> torch.Tenso
        eps: The tolerance for Taylor approximation. Defaults to 1.0e-6.

    Returns:
-        Rotations given as a vector in axis angle form, as a tensor
-            of shape (..., 3), where the magnitude is the angle turned
-            anti-clockwise in radians around the vector's direction.
+        Rotations given as a vector in axis angle form, as a tensor of shape (..., 3),
+        where the magnitude is the angle turned anti-clockwise in radians around the vector's direction.

    Reference:
        Based on PyTorch3D (https://github.com/facebookresearch/pytorch3d/blob/main/pytorch3d/transforms/rotation_conversions.py#L526-L554)
@@ -520,6 +520,21 @@ def axis_angle_from_quat(quat: torch.Tensor, eps: float = 1.0e-6) -> torch.Tenso
    return quat[..., 1:4] / sin_half_angles_over_angles.unsqueeze(-1)


+@torch.jit.script
+def quat_error_magnitude(q1: torch.Tensor, q2: torch.Tensor) -> torch.Tensor:
+    """Computes the rotation difference between two quaternions.
+
+    Args:
+        q1: A (N, 4) tensor for quaternion (w, x, y, z).
+        q2: A (N, 4) tensor for quaternion (w, x, y, z).
+
+    Returns:
+        Angular error between input quaternions in radians.
+    """
+    quat_diff = quat_mul(q1, quat_conjugate(q2))
+    return torch.norm(axis_angle_from_quat(quat_diff), dim=1)
+
+
 """
 Transformations
 """
@@ -626,6 +641,9 @@ def compute_pose_error(
    Returns:
        A tuple containing position and orientation error.

+        - If :attr:`rot_error_type` is "quat", the orientation error is returned as a quaternion.
+        - If :attr:`rot_error_type` is "axis_angle", the orientation error is returned as an axis-angle vector.
+
    Raises:
        ValueError: Invalid rotation error type.
    """

--- a/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/__init__.py
+++ b/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/__init__.py
@@ -49,6 +49,6 @@ __version__ = ORBIT_TASKS_METADATA["package"]["version"]
 from .utils import import_packages

 # The blacklist is used to prevent importing configs from sub-packages
-_BLACKLIST_PKGS = ["locomotion.velocity.config.anymal_d", "manipulation", "utils", "classic.ant", "classic.humanoid"]
+_BLACKLIST_PKGS = ["locomotion.velocity.config.anymal_d", "utils", "classic.ant", "classic.humanoid"]
 # Import all configs in this package
 import_packages(__name__, _BLACKLIST_PKGS)
--- a/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/manipulation/__init__.py
+++ b/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/manipulation/__init__.py
@@ -4,3 +4,5 @@
 # SPDX-License-Identifier: BSD-3-Clause

 """Manipulation environments for fixed-arm robots."""
+
+from .reach import *  # noqa
--- a/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/manipulation/reach/__init__.py
+++ b/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/manipulation/reach/__init__.py
@@ -3,24 +3,4 @@
 #
 # SPDX-License-Identifier: BSD-3-Clause

-"""Environment for end-effector pose tracking task for fixed-arm robots."""
-
-import gymnasium as gym
-
-from . import agents
-
-##
-# Register Gym environments.
-##
-
-gym.register(
-    id="Isaac-Reach-Franka-v0",
-    entry_point="omni.isaac.orbit.envs:RLTaskEnv",
-    disable_env_checker=True,
-    kwargs={
-        "env_cfg_entry_point": f"{__name__}.reach_env_cfg:ReachEnvCfg",
-        "rl_games_cfg_entry_point": f"{agents.__name__}:rl_games_ppo_cfg.yaml",
-        "rsl_rl_cfg_entry_point": f"{agents.__name__}.rsl_rl_ppo_cfg:LIFT_RSL_RL_PPO_CFG",
-        "skrl_cfg_entry_point": f"{agents.__name__}:skrl_ppo_cfg.yaml",
-    },
-)
+"""Fixed-arm environments with end-effector pose tracking commands."""
--- a/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/manipulation/reach/agents/skrl_ppo_cfg.yaml
+++ b/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/manipulation/reach/agents/skrl_ppo_cfg.yaml
-seed: 42
-
-# Models are instantiated using skrl's model instantiator utility
-# https://skrl.readthedocs.io/en/develop/modules/skrl.utils.model_instantiators.html
-models:
-  separate: False
-  policy:  # see skrl.utils.model_instantiators.gaussian_model for parameter details
-    clip_actions: False
-    clip_log_std: True
-    min_log_std: -20.0
-    max_log_std: 2.0
-    input_shape: "Shape.STATES"
-    hiddens: [256, 128, 64]
-    hidden_activation: ["elu", "elu", "elu"]
-    output_shape: "Shape.ACTIONS"
-    output_activation: "tanh"
-    output_scale: 1.0
-  value:  # see skrl.utils.model_instantiators.deterministic_model for parameter details
-    clip_actions: False
-    input_shape: "Shape.STATES"
-    hiddens: [256, 128, 64]
-    hidden_activation: ["elu", "elu", "elu"]
-    output_shape: "Shape.ONE"
-    output_activation: ""
-    output_scale: 1.0
-
-
-# PPO agent configuration (field names are from PPO_DEFAULT_CONFIG)
-# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html
-agent:
-  rollouts: 16
-  learning_epochs: 8
-  mini_batches: 8
-  discount_factor: 0.99
-  lambda: 0.95
-  learning_rate: 3.e-4
-  learning_rate_scheduler: "KLAdaptiveRL"
-  learning_rate_scheduler_kwargs:
-    kl_threshold: 0.01
-  state_preprocessor: "RunningStandardScaler"
-  state_preprocessor_kwargs: null
-  value_preprocessor: "RunningStandardScaler"
-  value_preprocessor_kwargs: null
-  random_timesteps: 0
-  learning_starts: 0
-  grad_norm_clip: 1.0
-  ratio_clip: 0.2
-  value_clip: 0.2
-  clip_predicted_values: True
-  entropy_loss_scale: 0.0
-  value_loss_scale: 2.0
-  kl_threshold: 0
-  rewards_shaper_scale: 1.0
-  # logging and checkpoint
-  experiment:
-    directory: "reach"
-    experiment_name: ""
-    write_interval: 40
-    checkpoint_interval: 400
-
-
-# Sequential trainer
-# https://skrl.readthedocs.io/en/latest/modules/skrl.trainers.sequential.html
-trainer:
-  timesteps: 16000
--- a/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/manipulation/reach/config/__init__.py
+++ b/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/manipulation/reach/config/__init__.py
+# Copyright (c) 2022-2023, The ORBIT Project Developers.
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+"""Configurations for arm-based reach-tracking environments."""
+
+# We leave this file empty since we don't want to expose any configs in this package directly.
+# We still need this file to import the "config" module in the parent package.
--- a/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/manipulation/reach/config/franka/__init__.py
+++ b/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/manipulation/reach/config/franka/__init__.py
+# Copyright (c) 2022-2023, The ORBIT Project Developers.
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+import gymnasium as gym
+
+from . import agents, joint_env_cfg
+
+##
+# Register Gym environments.
+##
+
+gym.register(
+    id="Isaac-Reach-Franka-v0",
+    entry_point="omni.isaac.orbit.envs:RLTaskEnv",
+    disable_env_checker=True,
+    kwargs={
+        "env_cfg_entry_point": joint_env_cfg.FrankaReachEnvCfg,
+        "rl_games_cfg_entry_point": f"{agents.__name__}:rl_games_ppo_cfg.yaml",
+        "rsl_rl_cfg_entry_point": f"{agents.__name__}.rsl_rl_ppo_cfg:FrankaReachPPORunnerCfg",
+    },
+)
+
+gym.register(
+    id="Isaac-Reach-Franka-Play-v0",
+    entry_point="omni.isaac.orbit.envs:RLTaskEnv",
+    disable_env_checker=True,
+    kwargs={
+        "env_cfg_entry_point": joint_env_cfg.FrankaReachEnvCfg_PLAY,
+        "rl_games_cfg_entry_point": f"{agents.__name__}:rl_games_ppo_cfg.yaml",
+        "rsl_rl_cfg_entry_point": f"{agents.__name__}.rsl_rl_ppo_cfg:FrankaReachPPORunnerCfg",
+    },
+)
--- a/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/manipulation/reach/config/franka/agents/rl_games_ppo_cfg.yaml
+++ b/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/manipulation/reach/config/franka/agents/rl_games_ppo_cfg.yaml
+params:
+  seed: 42
+
+  # environment wrapper clipping
+  env:
+    clip_observations: 100.0
+    clip_actions: 100.0
+
+  algo:
+    name: a2c_continuous
+
+  model:
+    name: continuous_a2c_logstd
+
+  network:
+    name: actor_critic
+    separate: False
+    space:
+      continuous:
+        mu_activation: None
+        sigma_activation: None
+
+        mu_init:
+          name: default
+        sigma_init:
+          name: const_initializer
+          val: 0
+        fixed_sigma: True
+    mlp:
+      units: [64, 64]
+      activation: elu
+      d2rl: False
+
+      initializer:
+        name: default
+      regularizer:
+        name: None
+
+  load_checkpoint: False # flag which sets whether to load the checkpoint
+  load_path: '' # path to the checkpoint to load
+
+  config:
+    name: reach_franka
+    env_name: rlgpu
+    device: 'cuda:0'
+    device_name: 'cuda:0'
+    multi_gpu: False
+    ppo: True
+    mixed_precision: False
+    normalize_input: True
+    normalize_value: True
+    value_bootstrap: True
+    num_actors: -1
+    reward_shaper:
+      scale_value: 1.0
+    normalize_advantage: True
+    gamma: 0.99
+    tau: 0.95
+    learning_rate: 1e-3
+    lr_schedule: adaptive
+    schedule_type: legacy
+    kl_threshold: 0.01
+    score_to_win: 10000
+    max_epochs: 1000
+    save_best_after: 200
+    save_frequency: 100
+    print_stats: True
+    grad_norm: 1.0
+    entropy_coef: 0.01
+    truncate_grads: True
+    e_clip: 0.2
+    horizon_length: 24
+    minibatch_size: 24576
+    mini_epochs: 5
+    critic_coef: 2
+    clip_value: True
+    clip_actions: False
+    bounds_loss_coef: 0.0001
--- a/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/manipulation/reach/config/franka/agents/rsl_rl_ppo_cfg.py
+++ b/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/manipulation/reach/config/franka/agents/rsl_rl_ppo_cfg.py
+# Copyright (c) 2022-2023, The ORBIT Project Developers.
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+from omni.isaac.orbit.utils import configclass
+
+from omni.isaac.orbit_tasks.utils.wrappers.rsl_rl import (
+    RslRlOnPolicyRunnerCfg,
+    RslRlPpoActorCriticCfg,
+    RslRlPpoAlgorithmCfg,
+)
+
+
+@configclass
+class FrankaReachPPORunnerCfg(RslRlOnPolicyRunnerCfg):
+    num_steps_per_env = 24
+    max_iterations = 1000
+    save_interval = 50
+    experiment_name = "reach_franka"
+    run_name = ""
+    resume = False
+    empirical_normalization = False
+    policy = RslRlPpoActorCriticCfg(
+        init_noise_std=1.0,
+        actor_hidden_dims=[64, 64],
+        critic_hidden_dims=[64, 64],
+        activation="elu",
+    )
+    algorithm = RslRlPpoAlgorithmCfg(
+        value_loss_coef=1.0,
+        use_clipped_value_loss=True,
+        clip_param=0.2,
+        entropy_coef=0.01,
+        num_learning_epochs=8,
+        num_mini_batches=4,
+        learning_rate=1.0e-3,
+        schedule="adaptive",
+        gamma=0.99,
+        lam=0.95,
+        desired_kl=0.01,
+        max_grad_norm=1.0,
+    )
--- a/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/manipulation/reach/config/franka/joint_env_cfg.py
+++ b/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/manipulation/reach/config/franka/joint_env_cfg.py
+# Copyright (c) 2022-2023, The ORBIT Project Developers.
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+from __future__ import annotations
+
+import math
+
+from omni.isaac.orbit.utils import configclass
+
+import omni.isaac.orbit_tasks.manipulation.reach.mdp as mdp
+from omni.isaac.orbit_tasks.manipulation.reach.reach_env_cfg import ReachEnvCfg
+
+##
+# Pre-defined configs
+##
+# isort: off
+from omni.isaac.orbit.assets.config import FRANKA_PANDA_ARM_WITH_PANDA_HAND_CFG
+
+
+##
+# Environment configuration
+##
+
+
+@configclass
+class FrankaReachEnvCfg(ReachEnvCfg):
+    def __post_init__(self):
+        # post init of parent
+        super().__post_init__()
+
+        # switch robot to franka
+        self.scene.robot = FRANKA_PANDA_ARM_WITH_PANDA_HAND_CFG.replace(prim_path="{ENV_REGEX_NS}/Robot")
+        # override rewards
+        self.rewards.end_effector_position_tracking.params["asset_cfg"].body_names = ["panda_hand"]
+        self.rewards.end_effector_orientation_tracking.params["asset_cfg"].body_names = ["panda_hand"]
+        # override actions
+        self.actions.arm_action = mdp.JointPositionActionCfg(
+            asset_name="robot", joint_names=["panda_joint.*"], scale=0.5, use_default_offset=True
+        )
+        # override command generator body
+        # end-effector is along z-direction
+        self.commands.body_name = "panda_hand"
+        self.commands.ranges.pitch = (math.pi, math.pi)
+
+
+@configclass
+class FrankaReachEnvCfg_PLAY(FrankaReachEnvCfg):
+    def __post_init__(self):
+        # post init of parent
+        super().__post_init__()
+        # make a smaller scene for play
+        self.scene.num_envs = 50
+        self.scene.env_spacing = 2.5
+        # disable randomization for play
+        self.observations.policy.enable_corruption = False
--- a/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/manipulation/reach/config/ur_10/__init__.py
+++ b/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/manipulation/reach/config/ur_10/__init__.py
+# Copyright (c) 2022-2023, The ORBIT Project Developers.
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+import gymnasium as gym
+
+from . import agents, joint_env_cfg
+
+##
+# Register Gym environments.
+##
+
+gym.register(
+    id="Isaac-Reach-UR10-v0",
+    entry_point="omni.isaac.orbit.envs:RLTaskEnv",
+    disable_env_checker=True,
+    kwargs={
+        "env_cfg_entry_point": joint_env_cfg.UR10ReachEnvCfg,
+        "rl_games_cfg_entry_point": f"{agents.__name__}:rl_games_ppo_cfg.yaml",
+        "rsl_rl_cfg_entry_point": f"{agents.__name__}.rsl_rl_ppo_cfg:UR10ReachPPORunnerCfg",
+    },
+)
+
+gym.register(
+    id="Isaac-Reach-UR10-Play-v0",
+    entry_point="omni.isaac.orbit.envs:RLTaskEnv",
+    disable_env_checker=True,
+    kwargs={
+        "env_cfg_entry_point": joint_env_cfg.UR10ReachEnvCfg_PLAY,
+        "rl_games_cfg_entry_point": f"{agents.__name__}:rl_games_ppo_cfg.yaml",
+        "rsl_rl_cfg_entry_point": f"{agents.__name__}.rsl_rl_ppo_cfg:UR10ReachPPORunnerCfg",
+    },
+)
--- a/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/manipulation/reach/agents/rl_games_ppo_cfg.yaml
+++ b/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/manipulation/reach/agents/rl_games_ppo_cfg.yaml
@@ -3,8 +3,8 @@ params:

  # environment wrapper clipping
  env:
-    clip_observations: 5.0
-    clip_actions: 1.0
+    clip_observations: 100.0
+    clip_actions: 100.0

  algo:
    name: a2c_continuous
@@ -27,7 +27,7 @@ params:
          val: 0
        fixed_sigma: True
    mlp:
-      units: [256, 128, 64]
+      units: [64, 64]
      activation: elu
      d2rl: False

@@ -40,7 +40,7 @@ params:
  load_path: '' # path to the checkpoint to load

  config:
-    name: lift
+    name: reach_ur10
    env_name: rlgpu
    device: 'cuda:0'
    device_name: 'cuda:0'
@@ -56,23 +56,23 @@ params:
    normalize_advantage: True
    gamma: 0.99
    tau: 0.95
-    learning_rate: 5e-4
+    learning_rate: 1e-3
    lr_schedule: adaptive
    schedule_type: legacy
-    kl_threshold: 0.008
+    kl_threshold: 0.01
    score_to_win: 10000
-    max_epochs: 5000
+    max_epochs: 1000
    save_best_after: 200
    save_frequency: 100
    print_stats: True
    grad_norm: 1.0
-    entropy_coef: 0.0
+    entropy_coef: 0.01
    truncate_grads: True
    e_clip: 0.2
-    horizon_length: 32
-    minibatch_size: 2048
+    horizon_length: 24
+    minibatch_size: 24576
    mini_epochs: 5
-    critic_coef: 4
+    critic_coef: 2
    clip_value: True
-    seq_len: 4
+    clip_actions: False
    bounds_loss_coef: 0.0001
--- a/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/manipulation/reach/agents/rsl_rl_ppo_cfg.py
+++ b/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/manipulation/reach/agents/rsl_rl_ppo_cfg.py
@@ -3,40 +3,41 @@
 #
 # SPDX-License-Identifier: BSD-3-Clause

+from omni.isaac.orbit.utils import configclass
+
 from omni.isaac.orbit_tasks.utils.wrappers.rsl_rl import (
    RslRlOnPolicyRunnerCfg,
    RslRlPpoActorCriticCfg,
    RslRlPpoAlgorithmCfg,
 )

-REACH_RSL_RL_PPO_CFG = RslRlOnPolicyRunnerCfg(
-    num_steps_per_env=64,
-    max_iterations=250,
-    save_interval=50,
-    experiment_name="reach",
-    run_name="",
-    resume=False,
-    load_run=-1,
-    load_checkpoint=-1,
-    empirical_normalization=False,
-    policy=RslRlPpoActorCriticCfg(
+
+@configclass
+class UR10ReachPPORunnerCfg(RslRlOnPolicyRunnerCfg):
+    num_steps_per_env = 24
+    max_iterations = 1000
+    save_interval = 50
+    experiment_name = "reach_ur10"
+    run_name = ""
+    resume = False
+    empirical_normalization = False
+    policy = RslRlPpoActorCriticCfg(
        init_noise_std=1.0,
-        actor_hidden_dims=[256, 128, 64],
-        critic_hidden_dims=[256, 128, 64],
+        actor_hidden_dims=[64, 64],
+        critic_hidden_dims=[64, 64],
        activation="elu",
-    ),
-    algorithm=RslRlPpoAlgorithmCfg(
+    )
+    algorithm = RslRlPpoAlgorithmCfg(
        value_loss_coef=1.0,
        use_clipped_value_loss=True,
        clip_param=0.2,
        entropy_coef=0.01,
        num_learning_epochs=8,
-        num_mini_batches=64,
-        learning_rate=3.0e-4,
+        num_mini_batches=4,
+        learning_rate=1.0e-3,
        schedule="adaptive",
        gamma=0.99,
        lam=0.95,
        desired_kl=0.01,
        max_grad_norm=1.0,
-    ),
-)
+    )
--- a/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/manipulation/reach/config/ur_10/joint_env_cfg.py
+++ b/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/manipulation/reach/config/ur_10/joint_env_cfg.py
+# Copyright (c) 2022-2023, The ORBIT Project Developers.
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+from __future__ import annotations
+
+import math
+
+from omni.isaac.orbit.utils import configclass
+
+import omni.isaac.orbit_tasks.manipulation.reach.mdp as mdp
+from omni.isaac.orbit_tasks.manipulation.reach.reach_env_cfg import ReachEnvCfg
+
+##
+# Pre-defined configs
+##
+# isort: off
+from omni.isaac.orbit.assets.config import UR10_CFG
+
+
+##
+# Environment configuration
+##
+
+
+@configclass
+class UR10ReachEnvCfg(ReachEnvCfg):
+    def __post_init__(self):
+        # post init of parent
+        super().__post_init__()
+
+        # switch robot to ur10
+        self.scene.robot = UR10_CFG.replace(prim_path="{ENV_REGEX_NS}/Robot")
+        # override randomization
+        self.randomization.reset_robot_joints.params["position_range"] = (0.75, 1.25)
+        # override rewards
+        self.rewards.end_effector_position_tracking.params["asset_cfg"].body_names = ["ee_link"]
+        self.rewards.end_effector_orientation_tracking.params["asset_cfg"].body_names = ["ee_link"]
+        # override actions
+        self.actions.arm_action = mdp.JointPositionActionCfg(
+            asset_name="robot", joint_names=[".*"], scale=0.5, use_default_offset=True
+        )
+        # override command generator body
+        # end-effector is along x-direction
+        self.commands.body_name = "ee_link"
+        self.commands.ranges.pitch = (math.pi / 2, math.pi / 2)
+
+
+@configclass
+class UR10ReachEnvCfg_PLAY(UR10ReachEnvCfg):
+    def __post_init__(self):
+        # post init of parent
+        super().__post_init__()
+        # make a smaller scene for play
+        self.scene.num_envs = 50
+        self.scene.env_spacing = 2.5
+        # disable randomization for play
+        self.observations.policy.enable_corruption = False
--- a/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/manipulation/reach/mdp/__init__.py
+++ b/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/manipulation/reach/mdp/__init__.py
+# Copyright (c) 2022-2023, The ORBIT Project Developers.
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+"""This sub-module contains the functions that are specific to the locomotion environments."""
+
+from omni.isaac.orbit.envs.mdp import *  # noqa: F401, F403
+
+from .rewards import *  # noqa: F401, F403
--- a/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/manipulation/reach/mdp/rewards.py
+++ b/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/manipulation/reach/mdp/rewards.py
+# Copyright (c) 2022-2023, The ORBIT Project Developers.
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+from __future__ import annotations
+
+import torch
+from typing import TYPE_CHECKING
+
+from omni.isaac.orbit.assets import RigidObject
+from omni.isaac.orbit.command_generators import UniformPoseCommandGenerator
+from omni.isaac.orbit.managers import SceneEntityCfg
+from omni.isaac.orbit.utils.math import combine_frame_transforms, quat_error_magnitude, quat_mul
+
+if TYPE_CHECKING:
+    from omni.isaac.orbit.envs import RLTaskEnv
+
+
+def position_command_error(env: RLTaskEnv, asset_cfg: SceneEntityCfg) -> torch.Tensor:
+    """Penalize tracking of the position error using L2-norm.
+
+    The function computes the position error between the desired position (from the command) and the
+    current position of the asset's body (in world frame). The position error is computed as the L2-norm
+    of the difference between the desired and current positions.
+    """
+    # extract the asset (to enable type hinting)
+    asset: RigidObject = env.scene[asset_cfg.name]
+    command_manager: UniformPoseCommandGenerator = env.command_manager
+    # obtain the desired and current positions
+    des_pos_b = command_manager.command[:, :3]
+    des_pos_w, _ = combine_frame_transforms(asset.data.root_state_w[:, :3], asset.data.root_state_w[:, 3:7], des_pos_b)
+    curr_pos_w = asset.data.body_state_w[:, asset_cfg.body_ids[0], :3]  # type: ignore
+    return torch.norm(curr_pos_w - des_pos_w, dim=1)
+
+
+def orientation_command_error(env: RLTaskEnv, asset_cfg: SceneEntityCfg) -> torch.Tensor:
+    """Penalize tracking orientation error using shortest path.
+
+    The function computes the orientation error between the desired orientation (from the command) and the
+    current orientation of the asset's body (in world frame). The orientation error is computed as the shortest
+    path between the desired and current orientations.
+    """
+    # extract the asset (to enable type hinting)
+    asset: RigidObject = env.scene[asset_cfg.name]
+    command_manager: UniformPoseCommandGenerator = env.command_manager
+    # obtain the desired and current orientations
+    des_quat_b = command_manager.command[:, 3:7]
+    des_quat_w = quat_mul(asset.data.root_state_w[:, 3:7], des_quat_b)
+    curr_quat_w = asset.data.body_state_w[:, asset_cfg.body_ids[0], 3:7]  # type: ignore
+    return quat_error_magnitude(curr_quat_w, des_quat_w)
--- a/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/manipulation/reach/reach_cfg.py
+++ b/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/manipulation/reach/reach_cfg.py
-# Copyright (c) 2022-2023, The ORBIT Project Developers.
-# All rights reserved.
-#
-# SPDX-License-Identifier: BSD-3-Clause
-
-from __future__ import annotations
-
-from omni.isaac.orbit.controllers.differential_inverse_kinematics import DifferentialInverseKinematicsCfg
-from omni.isaac.orbit.robots.config.franka import FRANKA_PANDA_ARM_WITH_PANDA_HAND_CFG
-from omni.isaac.orbit.robots.single_arm import SingleArmManipulatorCfg
-from omni.isaac.orbit.utils import configclass
-from omni.isaac.orbit.utils.assets import ISAAC_NUCLEUS_DIR
-
-from omni.isaac.orbit_tasks.isaac_env_cfg import EnvCfg, IsaacEnvCfg, SimCfg, ViewerCfg
-
-##
-# Scene settings
-##
-
-
-@configclass
-class TableCfg:
-    """Properties for the table."""
-
-    # note: we use instanceable asset since it consumes less memory
-    usd_path = f"{ISAAC_NUCLEUS_DIR}/Props/Mounts/SeattleLabTable/table_instanceable.usd"
-
-
-@configclass
-class MarkerCfg:
-    """Properties for visualization marker."""
-
-    # usd file to import
-    usd_path = f"{ISAAC_NUCLEUS_DIR}/Props/UIElements/frame_prim.usd"
-    # scale of the asset at import
-    scale = [0.1, 0.1, 0.1]  # x,y,z
-
-
-##
-# MDP settings
-##
-
-
-@configclass
-class RandomizationCfg:
-    """Randomization of scene at reset."""
-
-    @configclass
-    class EndEffectorDesiredPoseCfg:
-        """Randomization of end-effector pose command."""
-
-        # category
-        position_cat: str = "default"  # randomize position: "default", "uniform"
-        orientation_cat: str = "default"  # randomize position: "default", "uniform"
-        # randomize position
-        position_default = [0.5, 0.0, 0.5]  # position default (x,y,z)
-        position_uniform_min = [0.25, -0.25, 0.25]  # position (x,y,z)
-        position_uniform_max = [0.5, 0.25, 0.5]  # position (x,y,z)
-        # randomize orientation
-        orientation_default = [1.0, 0.0, 0.0, 0.0]  # orientation default
-
-    # initialize
-    ee_desired_pose: EndEffectorDesiredPoseCfg = EndEffectorDesiredPoseCfg()
-
-
-@configclass
-class ObservationsCfg:
-    """Observation specifications for the MDP."""
-
-    @configclass
-    class PolicyCfg:
-        """Observations for policy group."""
-
-        # global group settings
-        enable_corruption: bool = True
-        # observation terms
-        arm_dof_pos_normalized = {"scale": 1.0, "noise": {"name": "uniform", "min": -0.01, "max": 0.01}}
-        arm_dof_vel = {"scale": 0.5, "noise": {"name": "uniform", "min": -0.1, "max": 0.1}}
-        ee_position = {}
-        ee_position_command = {}
-        actions = {}
-
-    # global observation settings
-    return_dict_obs_in_group = False
-    """Whether to return observations as dictionary or flattened vector within groups."""
-    # observation groups
-    policy: PolicyCfg = PolicyCfg()
-
-
-@configclass
-class RewardsCfg:
-    """Reward terms for the MDP."""
-
-    tracking_robot_position_l2 = {"weight": 0.0}
-    tracking_robot_position_exp = {"weight": 2.5, "sigma": 0.05}  # 0.25
-    penalizing_robot_dof_velocity_l2 = {"weight": -0.02}  # -1e-4
-    penalizing_robot_dof_acceleration_l2 = {"weight": -1e-5}
-    penalizing_action_rate_l2 = {"weight": -0.1}
-
-
-@configclass
-class TerminationsCfg:
-    """Termination terms for the MDP."""
-
-    episode_timeout = True  # reset when episode length ended
-
-
-@configclass
-class ControlCfg:
-    """Processing of MDP actions."""
-
-    # action space
-    control_type = "default"  # "default", "inverse_kinematics"
-    # decimation: Number of control action updates @ sim dt per policy dt
-    decimation = 2
-
-    # configuration loaded when control_type == "inverse_kinematics"
-    inverse_kinematics: DifferentialInverseKinematicsCfg = DifferentialInverseKinematicsCfg(
-        command_type="pose_rel",
-        ik_method="dls",
-        position_command_scale=(0.1, 0.1, 0.1),
-        rotation_command_scale=(0.1, 0.1, 0.1),
-    )
-
-
-##
-# Environment configuration
-##
-
-
-@configclass
-class ReachEnvCfg(IsaacEnvCfg):
-    """Configuration for the reach environment."""
-
-    # General Settings
-    env: EnvCfg = EnvCfg(num_envs=2048, env_spacing=2.5, episode_length_s=4.0)
-    viewer: ViewerCfg = ViewerCfg(debug_vis=True, eye=(7.5, 7.5, 7.5), lookat=(0.0, 0.0, 0.0))
-    # Physics settings
-    sim: SimCfg = SimCfg(dt=1.0 / 60.0, substeps=1)
-
-    # Scene Settings
-    robot: SingleArmManipulatorCfg = FRANKA_PANDA_ARM_WITH_PANDA_HAND_CFG
-    table: TableCfg = TableCfg()
-    marker: MarkerCfg = MarkerCfg()
-
-    # MDP settings
-    randomization: RandomizationCfg = RandomizationCfg()
-    observations: ObservationsCfg = ObservationsCfg()
-    rewards: RewardsCfg = RewardsCfg()
-    terminations: TerminationsCfg = TerminationsCfg()
-
-    # Controller settings
-    control: ControlCfg = ControlCfg()
--- a/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/manipulation/reach/reach_env.py
+++ b/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/manipulation/reach/reach_env.py
-# Copyright (c) 2022-2023, The ORBIT Project Developers.
-# All rights reserved.
-#
-# SPDX-License-Identifier: BSD-3-Clause
-
-from __future__ import annotations
-
-import gymnasium as gym
-import math
-import torch
-
-import omni.isaac.core.utils.prims as prim_utils
-
-import omni.isaac.orbit.compat.utils.kit as kit_utils
-from omni.isaac.orbit.compat.markers import PointMarker, StaticMarker
-from omni.isaac.orbit.compat.utils.mdp import ObservationManager, RewardManager
-from omni.isaac.orbit.controllers.differential_inverse_kinematics import DifferentialInverseKinematics
-from omni.isaac.orbit.robots.single_arm import SingleArmManipulator
-from omni.isaac.orbit.utils.dict import class_to_dict
-from omni.isaac.orbit.utils.math import random_orientation, sample_uniform, scale_transform
-
-from omni.isaac.orbit_tasks.isaac_env import IsaacEnv, VecEnvIndices, VecEnvObs
-
-from .reach_cfg import RandomizationCfg, ReachEnvCfg
-
-
-class ReachEnv(IsaacEnv):
-    """Environment for reaching to desired pose for a single-arm manipulator."""
-
-    def __init__(self, cfg: ReachEnvCfg = None, **kwargs):
-        # copy configuration
-        self.cfg = cfg
-        # parse the configuration for controller configuration
-        # note: controller decides the robot control mode
-        self._pre_process_cfg()
-        # create classes (these are called by the function :meth:`_design_scene`
-        self.robot = SingleArmManipulator(cfg=self.cfg.robot)
-
-        # initialize the base class to setup the scene.
-        super().__init__(self.cfg, **kwargs)
-        # parse the configuration for information
-        self._process_cfg()
-        # initialize views for the cloned scenes
-        self._initialize_views()
-
-        # prepare the observation manager
-        self._observation_manager = ReachObservationManager(class_to_dict(self.cfg.observations), self, self.device)
-        # prepare the reward manager
-        self._reward_manager = ReachRewardManager(
-            class_to_dict(self.cfg.rewards), self, self.num_envs, self.dt, self.device
-        )
-        # print information about MDP
-        print("[INFO] Observation Manager:", self._observation_manager)
-        print("[INFO] Reward Manager: ", self._reward_manager)
-
-        # compute the observation space
-        num_obs = self._observation_manager._group_obs_dim["policy"][0]
-        self.observation_space = gym.spaces.Box(low=-math.inf, high=math.inf, shape=(num_obs,))
-        # compute the action space
-        self.action_space = gym.spaces.Box(low=-1.0, high=1.0, shape=(self.num_actions,))
-        print("[INFO]: Completed setting up the environment...")
-        # Take an initial step to initialize the scene.
-        self.sim.step()
-        # -- fill up buffers
-        self.robot.update_buffers(self.dt)
-
-    """
-    Implementation specifics.
-    """
-
-    def _design_scene(self):
-        # ground plane
-        kit_utils.create_ground_plane("/World/defaultGroundPlane", z_position=-1.05)
-        # table
-        prim_utils.create_prim(self.template_env_ns + "/Table", usd_path=self.cfg.table.usd_path)
-        # robot
-        self.robot.spawn(self.template_env_ns + "/Robot")
-
-        # setup debug visualization
-        if self.cfg.viewer.debug_vis and self.enable_render:
-            # create point instancer to visualize the goal points
-            self._goal_markers = PointMarker("/Visuals/ee_goal", self.num_envs, radius=0.025)
-            # create marker for viewing end-effector pose
-            self._ee_markers = StaticMarker(
-                "/Visuals/ee_current", self.num_envs, usd_path=self.cfg.marker.usd_path, scale=self.cfg.marker.scale
-            )
-            # create marker for viewing command (if task-space controller is used)
-            if self.cfg.control.control_type == "inverse_kinematics":
-                self._cmd_markers = StaticMarker(
-                    "/Visuals/ik_command", self.num_envs, usd_path=self.cfg.marker.usd_path, scale=self.cfg.marker.scale
-                )
-        # return list of global prims
-        return ["/World/defaultGroundPlane"]
-
-    def _reset_idx(self, env_ids: VecEnvIndices):
-        # randomize the MDP
-        # -- robot DOF state
-        dof_pos, dof_vel = self.robot.get_default_dof_state(env_ids=env_ids)
-        self.robot.set_dof_state(dof_pos, dof_vel, env_ids=env_ids)
-        # -- desired end-effector pose
-        self._randomize_ee_desired_pose(env_ids, cfg=self.cfg.randomization.ee_desired_pose)
-
-        # -- Reward logging
-        # fill extras with episode information
-        self.extras["episode"] = dict()
-        # reset
-        # -- rewards manager: fills the sums for terminated episodes
-        self._reward_manager.reset_idx(env_ids, self.extras["episode"])
-        # -- obs manager
-        self._observation_manager.reset_idx(env_ids)
-        # -- reset history
-        self.previous_actions[env_ids] = 0
-        # -- MDP reset
-        self.reset_buf[env_ids] = 0
-        self.episode_length_buf[env_ids] = 0
-        # controller reset
-        if self.cfg.control.control_type == "inverse_kinematics":
-            self._ik_controller.reset_idx(env_ids)
-
-    def _step_impl(self, actions: torch.Tensor):
-        # reset environments that terminated
-        reset_env_ids = self.reset_buf.nonzero(as_tuple=False).squeeze(-1)
-        if len(reset_env_ids) > 0:
-            self._reset_idx(reset_env_ids)
-        # increment the number of steps
-        self.episode_length_buf += 1
-
-        # pre-step: set actions into buffer
-        self.actions = actions.clone().to(device=self.device)
-        # transform actions based on controller
-        if self.cfg.control.control_type == "inverse_kinematics":
-            # set the controller commands
-            self._ik_controller.set_command(self.actions)
-            # compute the joint commands
-            self.robot_actions[:, : self.robot.arm_num_dof] = self._ik_controller.compute(
-                self.robot.data.ee_state_w[:, 0:3] - self.envs_positions,
-                self.robot.data.ee_state_w[:, 3:7],
-                self.robot.data.ee_jacobian,
-                self.robot.data.arm_dof_pos,
-            )
-            # offset actuator command with position offsets
-            self.robot_actions[:, : self.robot.arm_num_dof] -= self.robot.data.actuator_pos_offset[
-                :, : self.robot.arm_num_dof
-            ]
-        elif self.cfg.control.control_type == "default":
-            self.robot_actions[:, : self.robot.arm_num_dof] = self.actions
-        # perform physics stepping
-        for _ in range(self.cfg.control.decimation):
-            # set actions into buffers
-            self.robot.apply_action(self.robot_actions)
-            # simulate
-            self.sim.step(render=self.enable_render)
-            # check that simulation is playing
-            if self.sim.is_stopped():
-                return
-        # post-step:
-        # -- compute common buffers
-        self.robot.update_buffers(self.dt)
-        # -- compute MDP signals
-        # reward
-        self.reward_buf = self._reward_manager.compute()
-        # terminations
-        self._check_termination()
-        # -- store history
-        self.previous_actions = self.actions.clone()
-
-        # -- add information to extra if timeout occurred due to episode length
-        # Note: this is used by algorithms like PPO where time-outs are handled differently
-        self.extras["time_outs"] = self.episode_length_buf >= self.max_episode_length
-        # -- update USD visualization
-        if self.cfg.viewer.debug_vis and self.enable_render:
-            self._debug_vis()
-
-    def _get_observations(self) -> VecEnvObs:
-        # compute observations
-        return self._observation_manager.compute()
-
-    """
-    Helper functions - Scene handling.
-    """
-
-    def _pre_process_cfg(self) -> None:
-        """Pre processing of configuration parameters."""
-        # set configuration for task-space controller
-        if self.cfg.control.control_type == "inverse_kinematics":
-            print("Using inverse kinematics controller...")
-            # enable jacobian computation
-            self.cfg.robot.data_info.enable_jacobian = True
-            # enable gravity compensation
-            self.cfg.robot.rigid_props.disable_gravity = True
-            # set the end-effector offsets
-            self.cfg.control.inverse_kinematics.position_offset = self.cfg.robot.ee_info.pos_offset
-            self.cfg.control.inverse_kinematics.rotation_offset = self.cfg.robot.ee_info.rot_offset
-        else:
-            print("Using default joint controller...")
-
-    def _process_cfg(self) -> None:
-        """Post processing of configuration parameters."""
-        # compute constants for environment
-        self.dt = self.cfg.control.decimation * self.physics_dt  # control-dt
-        self.max_episode_length = math.ceil(self.cfg.env.episode_length_s / self.dt)
-
-        # convert configuration parameters to torch
-        # randomization
-        # -- desired pose
-        config = self.cfg.randomization.ee_desired_pose
-        for attr in ["position_uniform_min", "position_uniform_max", "position_default", "orientation_default"]:
-            setattr(config, attr, torch.tensor(getattr(config, attr), device=self.device, requires_grad=False))
-
-    def _initialize_views(self) -> None:
-        """Creates views and extract useful quantities from them."""
-        # play the simulator to activate physics handles
-        # note: this activates the physics simulation view that exposes TensorAPIs
-        self.sim.reset()
-
-        # define views over instances
-        self.robot.initialize(self.env_ns + "/.*/Robot")
-
-        # create controller
-        if self.cfg.control.control_type == "inverse_kinematics":
-            self._ik_controller = DifferentialInverseKinematics(
-                self.cfg.control.inverse_kinematics, self.robot.count, self.device
-            )
-            # note: we exclude gripper from actions in this env
-            self.num_actions = self._ik_controller.num_actions
-        elif self.cfg.control.control_type == "default":
-            # note: we exclude gripper from actions in this env
-            self.num_actions = self.robot.arm_num_dof
-
-        # history
-        self.actions = torch.zeros((self.num_envs, self.num_actions), device=self.device)
-        self.previous_actions = torch.zeros((self.num_envs, self.num_actions), device=self.device)
-        # robot joint actions
-        self.robot_actions = torch.zeros((self.num_envs, self.robot.num_actions), device=self.device)
-        # commands
-        self.ee_des_pose_w = torch.zeros((self.num_envs, 7), device=self.device)
-
-    def _debug_vis(self):
-        # compute error between end-effector and command
-        error = torch.sum(torch.square(self.ee_des_pose_w[:, :3] - self.robot.data.ee_state_w[:, 0:3]), dim=1)
-        # set indices of the prim based on error threshold
-        goal_indices = torch.where(error < 0.002, 1, 0)
-        # apply to instance manager
-        # -- goal
-        self._goal_markers.set_world_poses(self.ee_des_pose_w[:, :3], self.ee_des_pose_w[:, 3:7])
-        self._goal_markers.set_status(goal_indices)
-        # -- end-effector
-        self._ee_markers.set_world_poses(self.robot.data.ee_state_w[:, 0:3], self.robot.data.ee_state_w[:, 3:7])
-        # -- task-space commands
-        if self.cfg.control.control_type == "inverse_kinematics":
-            # convert to world frame
-            ee_positions = self._ik_controller.desired_ee_pos + self.envs_positions
-            ee_orientations = self._ik_controller.desired_ee_rot
-            # set poses
-            self._cmd_markers.set_world_poses(ee_positions, ee_orientations)
-
-    """
-    Helper functions - MDP.
-    """
-
-    def _check_termination(self) -> None:
-        # extract values from buffer
-        # compute resets
-        self.reset_buf[:] = 0
-        # -- episode length
-        if self.cfg.terminations.episode_timeout:
-            self.reset_buf = torch.where(self.episode_length_buf >= self.max_episode_length, 1, self.reset_buf)
-
-    def _randomize_ee_desired_pose(self, env_ids: torch.Tensor, cfg: RandomizationCfg.EndEffectorDesiredPoseCfg):
-        """Randomize the desired pose of the end-effector."""
-        # -- desired object root position
-        if cfg.position_cat == "default":
-            # constant command for position
-            self.ee_des_pose_w[env_ids, 0:3] = cfg.position_default
-        elif cfg.position_cat == "uniform":
-            # sample uniformly from box
-            # note: this should be within in the workspace of the robot
-            self.ee_des_pose_w[env_ids, 0:3] = sample_uniform(
-                cfg.position_uniform_min, cfg.position_uniform_max, (len(env_ids), 3), device=self.device
-            )
-        else:
-            raise ValueError(f"Invalid category for randomizing the desired object positions '{cfg.position_cat}'.")
-        # -- desired object root orientation
-        if cfg.orientation_cat == "default":
-            # constant position of the object
-            self.ee_des_pose_w[env_ids, 3:7] = cfg.orientation_default
-        elif cfg.orientation_cat == "uniform":
-            self.ee_des_pose_w[env_ids, 3:7] = random_orientation(len(env_ids), self.device)
-        else:
-            raise ValueError(
-                f"Invalid category for randomizing the desired object orientation '{cfg.orientation_cat}'."
-            )
-        # transform command from local env to world
-        self.ee_des_pose_w[env_ids, 0:3] += self.envs_positions[env_ids]
-
-
-class ReachObservationManager(ObservationManager):
-    """Reward manager for single-arm reaching environment."""
-
-    def arm_dof_pos_normalized(self, env: ReachEnv):
-        """DOF positions for the arm normalized to its max and min ranges."""
-        return scale_transform(
-            env.robot.data.arm_dof_pos,
-            env.robot.data.soft_dof_pos_limits[:, :7, 0],
-            env.robot.data.soft_dof_pos_limits[:, :7, 1],
-        )
-
-    def arm_dof_vel(self, env: ReachEnv):
-        """DOF velocity of the arm."""
-        return env.robot.data.arm_dof_vel
-
-    def ee_position(self, env: ReachEnv):
-        """Current end-effector position of the arm."""
-        return env.robot.data.ee_state_w[:, :3] - env.envs_positions
-
-    def ee_position_command(self, env: ReachEnv):
-        """Desired end-effector position of the arm."""
-        return env.ee_des_pose_w[:, :3] - env.envs_positions
-
-    def actions(self, env: ReachEnv):
-        """Last actions provided to env."""
-        return env.actions
-
-
-class ReachRewardManager(RewardManager):
-    """Reward manager for single-arm reaching environment."""
-
-    def tracking_robot_position_l2(self, env: ReachEnv):
-        """Penalize tracking position error using L2-kernel."""
-        # compute error
-        return torch.sum(torch.square(env.ee_des_pose_w[:, :3] - env.robot.data.ee_state_w[:, 0:3]), dim=1)
-
-    def tracking_robot_position_exp(self, env: ReachEnv, sigma: float):
-        """Penalize tracking position error using exp-kernel."""
-        # compute error
-        error = torch.sum(torch.square(env.ee_des_pose_w[:, :3] - env.robot.data.ee_state_w[:, 0:3]), dim=1)
-        return torch.exp(-error / sigma)
-
-    def penalizing_robot_dof_velocity_l2(self, env: ReachEnv):
-        """Penalize large movements of the robot arm."""
-        return torch.sum(torch.square(env.robot.data.arm_dof_vel), dim=1)
-
-    def penalizing_robot_dof_acceleration_l2(self, env: ReachEnv):
-        """Penalize fast movements of the robot arm."""
-        return torch.sum(torch.square(env.robot.data.dof_acc), dim=1)
-
-    def penalizing_action_rate_l2(self, env: ReachEnv):
-        """Penalize large variations in action commands."""
-        return torch.sum(torch.square(env.actions - env.previous_actions), dim=1)
--- a/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/manipulation/reach/reach_env_cfg.py
+++ b/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/manipulation/reach/reach_env_cfg.py
+# Copyright (c) 2022-2023, The ORBIT Project Developers.
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+from __future__ import annotations
+
+from dataclasses import MISSING
+
+import omni.isaac.orbit.sim as sim_utils
+from omni.isaac.orbit.assets import ArticulationCfg, AssetBaseCfg
+from omni.isaac.orbit.command_generators import UniformPoseCommandGeneratorCfg
+from omni.isaac.orbit.envs import RLTaskEnvCfg
+from omni.isaac.orbit.managers import ActionTermCfg as ActionTerm
+from omni.isaac.orbit.managers import CurriculumTermCfg as CurrTerm
+from omni.isaac.orbit.managers import ObservationGroupCfg as ObsGroup
+from omni.isaac.orbit.managers import ObservationTermCfg as ObsTerm
+from omni.isaac.orbit.managers import RandomizationTermCfg as RandTerm
+from omni.isaac.orbit.managers import RewardTermCfg as RewTerm
+from omni.isaac.orbit.managers import SceneEntityCfg
+from omni.isaac.orbit.managers import TerminationTermCfg as DoneTerm
+from omni.isaac.orbit.scene import InteractiveSceneCfg
+from omni.isaac.orbit.utils import configclass
+from omni.isaac.orbit.utils.assets import ISAAC_NUCLEUS_DIR
+from omni.isaac.orbit.utils.noise import AdditiveUniformNoiseCfg as Unoise
+
+import omni.isaac.orbit_tasks.manipulation.reach.mdp as mdp
+
+##
+# Scene definition
+##
+
+
+@configclass
+class ReachSceneCfg(InteractiveSceneCfg):
+    """Configuration for the scene with a robotic arm."""
+
+    # world
+    ground = AssetBaseCfg(
+        prim_path="/World/ground",
+        spawn=sim_utils.GroundPlaneCfg(),
+        init_state=AssetBaseCfg.InitialStateCfg(pos=(0.0, 0.0, -1.05)),
+    )
+
+    table = AssetBaseCfg(
+        prim_path="{ENV_REGEX_NS}/Table",
+        spawn=sim_utils.UsdFileCfg(
+            usd_path=f"{ISAAC_NUCLEUS_DIR}/Props/Mounts/SeattleLabTable/table_instanceable.usd",
+        ),
+        init_state=AssetBaseCfg.InitialStateCfg(pos=(0.55, 0.0, 0.0), rot=(0.70711, 0.0, 0.0, 0.70711)),
+    )
+
+    # robots
+    robot: ArticulationCfg = MISSING
+
+    # lights
+    light = AssetBaseCfg(
+        prim_path="/World/light",
+        spawn=sim_utils.DomeLightCfg(color=(0.75, 0.75, 0.75), intensity=2500.0),
+    )
+
+
+##
+# MDP settings
+##
+
+
+@configclass
+class ActionsCfg:
+    """Action specifications for the MDP."""
+
+    arm_action: ActionTerm = MISSING
+    gripper_action: ActionTerm | None = None
+
+
+@configclass
+class ObservationsCfg:
+    """Observation specifications for the MDP."""
+
+    @configclass
+    class PolicyCfg(ObsGroup):
+        """Observations for policy group."""
+
+        # observation terms (order preserved)
+        joint_pos = ObsTerm(func=mdp.joint_pos_rel, noise=Unoise(n_min=-0.01, n_max=0.01))
+        joint_vel = ObsTerm(func=mdp.joint_vel_rel, noise=Unoise(n_min=-0.01, n_max=0.01))
+        pose_command = ObsTerm(func=mdp.generated_commands)
+        actions = ObsTerm(func=mdp.last_action)
+
+        def __post_init__(self):
+            self.enable_corruption = True
+            self.concatenate_terms = True
+
+    # observation groups
+    policy: PolicyCfg = PolicyCfg()
+
+
+@configclass
+class RandomizationCfg:
+    """Configuration for randomization."""
+
+    reset_robot_joints = RandTerm(
+        func=mdp.reset_joints_by_scale,
+        mode="reset",
+        params={
+            "position_range": (0.5, 1.5),
+            "velocity_range": (0.0, 0.0),
+        },
+    )
+
+
+@configclass
+class RewardsCfg:
+    """Reward terms for the MDP."""
+
+    # task terms
+    end_effector_position_tracking = RewTerm(
+        func=mdp.position_command_error,
+        weight=-0.2,
+        params={"asset_cfg": SceneEntityCfg("robot", body_names=MISSING)},
+    )
+    end_effector_orientation_tracking = RewTerm(
+        func=mdp.orientation_command_error,
+        weight=-0.05,
+        params={"asset_cfg": SceneEntityCfg("robot", body_names=MISSING)},
+    )
+
+    # action penalty
+    action_rate = RewTerm(func=mdp.action_rate_l2, weight=-0.005)
+    joint_vel = RewTerm(
+        func=mdp.joint_vel_l2,
+        weight=-0.0001,
+        params={"asset_cfg": SceneEntityCfg("robot")},
+    )
+
+
+@configclass
+class TerminationsCfg:
+    """Termination terms for the MDP."""
+
+    time_out = DoneTerm(func=mdp.time_out, time_out=True)
+
+
+@configclass
+class CurriculumCfg:
+    """Curriculum terms for the MDP."""
+
+    action_rate = CurrTerm(
+        func=mdp.modify_reward_weight, params={"term_name": "action_rate", "weight": -0.01, "num_steps": 4500}
+    )
+
+    joint_vel = CurrTerm(
+        func=mdp.modify_reward_weight, params={"term_name": "joint_vel", "weight": -0.0001, "num_steps": 4500}
+    )
+
+
+##
+# Environment configuration
+##
+
+
+@configclass
+class ReachEnvCfg(RLTaskEnvCfg):
+    """Configuration for the reach end-effector pose tracking environment."""
+
+    # Scene settings
+    scene: ReachSceneCfg = ReachSceneCfg(num_envs=4096, env_spacing=2.5, replicate_physics=False)
+    # Basic settings
+    observations: ObservationsCfg = ObservationsCfg()
+    actions: ActionsCfg = ActionsCfg()
+    commands: UniformPoseCommandGeneratorCfg = UniformPoseCommandGeneratorCfg(
+        asset_name="robot",
+        body_name=MISSING,
+        resampling_time_range=(4.0, 4.0),
+        debug_vis=False,
+        ranges=UniformPoseCommandGeneratorCfg.Ranges(
+            pos_x=(0.35, 0.65),
+            pos_y=(-0.2, 0.2),
+            pos_z=(0.15, 0.5),
+            roll=(0.0, 0.0),
+            pitch=MISSING,  # depends on end-effector axis
+            yaw=(-3.14, 3.14),
+        ),
+    )
+    # MDP settings
+    rewards: RewardsCfg = RewardsCfg()
+    terminations: TerminationsCfg = TerminationsCfg()
+    randomization: RandomizationCfg = RandomizationCfg()
+    curriculum: CurriculumCfg = CurriculumCfg()
+
+    def __post_init__(self):
+        """Post initialization."""
+        # general settings
+        self.decimation = 2
+        self.episode_length_s = 12.0
+        self.viewer.eye = (3.5, 3.5, 3.5)
+        # simulation settings
+        self.sim.dt = 1.0 / 60.0