Fixes distributed setup in benchmarking scripts (#2194)

# Description

Previously, the benchmark scripts stopped the benchmark outside of the global rank check, which occasionally caused issues on processes with a global rank > 0. This change moves the call inside the `if` statement so that it is only called on the rank 0 process.

## Type of change

- Bug fix (non-breaking change which fixes an issue)

## Screenshots

Please attach before and after screenshots of the change if applicable.

## Checklist

- [x] I have run the [`pre-commit` checks](https://pre-commit.com/) with `./isaaclab.sh --format`
- [x] I have made corresponding changes to the documentation
- [x] My changes generate no new warnings
- [ ] I have added tests that prove my fix is effective or that my feature works
- [ ] I have updated the changelog and the corresponding version in the extension's `config/extension.toml` file
- [ ] I have added my name to the `CONTRIBUTORS.md` or my name already exists there

Fixes distributed setup in benchmarking scripts (#2194)
# Description

Previously, the benchmark scripts stopped the benchmark outside of the global rank check, which occasionally caused issues on processes with a global rank > 0. This change moves the call inside the `if` statement so that it is only called on the rank 0 process.

## Type of change

- Bug fix (non-breaking change which fixes an issue)

## Screenshots

Please attach before and after screenshots of the change if applicable.

## Checklist

- [x] I have run the [`pre-commit` checks](https://pre-commit.com/) with `./isaaclab.sh --format`
- [x] I have made corresponding changes to the documentation
- [x] My changes generate no new warnings
- [ ] I have added tests that prove my fix is effective or that my feature works
- [ ] I have updated the changelog and the corresponding version in the extension's `config/extension.toml` file
- [ ] I have added my name to the `CONTRIBUTORS.md` or my name already exists there
bc7c9f5c · Kelly Guo · GitHub · d41c5a98 · bc7c9f5c · bc7c9f5c
Unverified Commit bc7c9f5c authored Mar 31, 2025 by Kelly Guo Committed by GitHub Mar 31, 2025
Showing with 32 additions and 29 deletions

benchmark_non_rl.py scripts/benchmarks/benchmark_non_rl.py +1 -1

benchmark_rlgames.py scripts/benchmarks/benchmark_rlgames.py +1 -1

benchmark_rsl_rl.py scripts/benchmarks/benchmark_rsl_rl.py +30 -27

No files found.
--- a/scripts/benchmarks/benchmark_non_rl.py
+++ b/scripts/benchmarks/benchmark_non_rl.py
@@ -193,7 +193,7 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen
        log_total_start_time(benchmark, (task_startup_time_end - app_start_time_begin) / 1e6)
        log_runtime_step_times(benchmark, environment_step_times, compute_stats=True)

-    benchmark.stop()
+        benchmark.stop()

    # close the simulator
    env.close()

--- a/scripts/benchmarks/benchmark_rlgames.py
+++ b/scripts/benchmarks/benchmark_rlgames.py
@@ -248,7 +248,7 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen
        log_rl_policy_rewards(benchmark, log_data["rewards/iter"])
        log_rl_policy_episode_lengths(benchmark, log_data["episode_lengths/iter"])

-    benchmark.stop()
+        benchmark.stop()

    # close the simulator
    env.close()

--- a/scripts/benchmarks/benchmark_rsl_rl.py
+++ b/scripts/benchmarks/benchmark_rsl_rl.py
@@ -142,6 +142,7 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen
    env_cfg.sim.device = args_cli.device if args_cli.device is not None else env_cfg.sim.device

    # multi-gpu training configuration
+    world_rank = 0
    if args_cli.distributed:
        env_cfg.sim.device = f"cuda:{app_launcher.local_rank}"
        agent_cfg.device = f"cuda:{app_launcher.local_rank}"
@@ -150,6 +151,7 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen
        seed = agent_cfg.seed + app_launcher.local_rank
        env_cfg.seed = seed
        agent_cfg.seed = seed
+        world_rank = app_launcher.global_rank

    # specify directory for logging experiments
    log_root_path = os.path.join("logs", "rsl_rl", agent_cfg.experiment_name)
@@ -211,34 +213,35 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen
    # run training
    runner.learn(num_learning_iterations=agent_cfg.max_iterations, init_at_random_ep_len=True)

-    benchmark.store_measurements()
-
-    # parse tensorboard file stats
-    log_data = parse_tf_logs(log_dir)
+    if world_rank == 0:
+        benchmark.store_measurements()
+
+        # parse tensorboard file stats
+        log_data = parse_tf_logs(log_dir)
+
+        # prepare RL timing dict
+        collection_fps = (
+            1 / (np.array(log_data["Perf/collection time"])) * env.unwrapped.num_envs * agent_cfg.num_steps_per_env
+        )
+        rl_training_times = {
+            "Collection Time": (np.array(log_data["Perf/collection time"]) / 1000).tolist(),
+            "Learning Time": (np.array(log_data["Perf/learning_time"]) / 1000).tolist(),
+            "Collection FPS": collection_fps.tolist(),
+            "Total FPS": log_data["Perf/total_fps"],
+        }

-    # prepare RL timing dict
-    collection_fps = (
-        1 / (np.array(log_data["Perf/collection time"])) * env.unwrapped.num_envs * agent_cfg.num_steps_per_env
-    )
-    rl_training_times = {
-        "Collection Time": (np.array(log_data["Perf/collection time"]) / 1000).tolist(),
-        "Learning Time": (np.array(log_data["Perf/learning_time"]) / 1000).tolist(),
-        "Collection FPS": collection_fps.tolist(),
-        "Total FPS": log_data["Perf/total_fps"],
-    }
-
-    # log additional metrics to benchmark services
-    log_app_start_time(benchmark, (app_start_time_end - app_start_time_begin) / 1e6)
-    log_python_imports_time(benchmark, (imports_time_end - imports_time_begin) / 1e6)
-    log_task_start_time(benchmark, (task_startup_time_end - task_startup_time_begin) / 1e6)
-    log_scene_creation_time(benchmark, Timer.get_timer_info("scene_creation") * 1000)
-    log_simulation_start_time(benchmark, Timer.get_timer_info("simulation_start") * 1000)
-    log_total_start_time(benchmark, (task_startup_time_end - app_start_time_begin) / 1e6)
-    log_runtime_step_times(benchmark, rl_training_times, compute_stats=True)
-    log_rl_policy_rewards(benchmark, log_data["Train/mean_reward"])
-    log_rl_policy_episode_lengths(benchmark, log_data["Train/mean_episode_length"])
-
-    benchmark.stop()
+        # log additional metrics to benchmark services
+        log_app_start_time(benchmark, (app_start_time_end - app_start_time_begin) / 1e6)
+        log_python_imports_time(benchmark, (imports_time_end - imports_time_begin) / 1e6)
+        log_task_start_time(benchmark, (task_startup_time_end - task_startup_time_begin) / 1e6)
+        log_scene_creation_time(benchmark, Timer.get_timer_info("scene_creation") * 1000)
+        log_simulation_start_time(benchmark, Timer.get_timer_info("simulation_start") * 1000)
+        log_total_start_time(benchmark, (task_startup_time_end - app_start_time_begin) / 1e6)
+        log_runtime_step_times(benchmark, rl_training_times, compute_stats=True)
+        log_rl_policy_rewards(benchmark, log_data["Train/mean_reward"])
+        log_rl_policy_episode_lengths(benchmark, log_data["Train/mean_episode_length"])
+
+        benchmark.stop()

    # close the simulator
    env.close()