Commit bc7c9f5c (unverified), authored by Kelly Guo, committed by GitHub

Fixes distributed setup in benchmarking scripts (#2194)

# Description

Previously, the benchmark scripts stopped the benchmark outside of the
global rank check, which occasionally caused issues on processes with a
global rank > 0. This change moves the call inside the `if` statement so
that it is only executed on the rank 0 process.


## Type of change

<!-- As you go through the list, delete the ones that are not
applicable. -->

- Bug fix (non-breaking change which fixes an issue)


## Screenshots

Please attach before and after screenshots of the change if applicable.

<!--
Example:

| Before | After |
| ------ | ----- |
| _gif/png before_ | _gif/png after_ |

To upload images to a PR -- simply drag and drop an image while in edit
mode and it should upload the image directly. You can then paste that
source into the above before/after sections.
-->

## Checklist

- [x] I have run the [`pre-commit` checks](https://pre-commit.com/) with
`./isaaclab.sh --format`
- [x] I have made corresponding changes to the documentation
- [x] My changes generate no new warnings
- [ ] I have added tests that prove my fix is effective or that my
feature works
- [ ] I have updated the changelog and the corresponding version in the
extension's `config/extension.toml` file
- [ ] I have added my name to the `CONTRIBUTORS.md` or my name already
exists there

<!--
As you go through the checklist above, you can mark something as done by
putting an x character in it

For example,
- [x] I have done this task
- [ ] I have not done this task
-->
parent d41c5a98
......@@ -193,7 +193,7 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen
log_total_start_time(benchmark, (task_startup_time_end - app_start_time_begin) / 1e6)
log_runtime_step_times(benchmark, environment_step_times, compute_stats=True)
benchmark.stop()
benchmark.stop()
# close the simulator
env.close()
......
......@@ -248,7 +248,7 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen
log_rl_policy_rewards(benchmark, log_data["rewards/iter"])
log_rl_policy_episode_lengths(benchmark, log_data["episode_lengths/iter"])
benchmark.stop()
benchmark.stop()
# close the simulator
env.close()
......
......@@ -142,6 +142,7 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen
env_cfg.sim.device = args_cli.device if args_cli.device is not None else env_cfg.sim.device
# multi-gpu training configuration
world_rank = 0
if args_cli.distributed:
env_cfg.sim.device = f"cuda:{app_launcher.local_rank}"
agent_cfg.device = f"cuda:{app_launcher.local_rank}"
......@@ -150,6 +151,7 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen
seed = agent_cfg.seed + app_launcher.local_rank
env_cfg.seed = seed
agent_cfg.seed = seed
world_rank = app_launcher.global_rank
# specify directory for logging experiments
log_root_path = os.path.join("logs", "rsl_rl", agent_cfg.experiment_name)
......@@ -211,34 +213,35 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen
# run training
runner.learn(num_learning_iterations=agent_cfg.max_iterations, init_at_random_ep_len=True)
benchmark.store_measurements()
# parse tensorboard file stats
log_data = parse_tf_logs(log_dir)
if world_rank == 0:
benchmark.store_measurements()
# parse tensorboard file stats
log_data = parse_tf_logs(log_dir)
# prepare RL timing dict
collection_fps = (
1 / (np.array(log_data["Perf/collection time"])) * env.unwrapped.num_envs * agent_cfg.num_steps_per_env
)
rl_training_times = {
"Collection Time": (np.array(log_data["Perf/collection time"]) / 1000).tolist(),
"Learning Time": (np.array(log_data["Perf/learning_time"]) / 1000).tolist(),
"Collection FPS": collection_fps.tolist(),
"Total FPS": log_data["Perf/total_fps"],
}
# prepare RL timing dict
collection_fps = (
1 / (np.array(log_data["Perf/collection time"])) * env.unwrapped.num_envs * agent_cfg.num_steps_per_env
)
rl_training_times = {
"Collection Time": (np.array(log_data["Perf/collection time"]) / 1000).tolist(),
"Learning Time": (np.array(log_data["Perf/learning_time"]) / 1000).tolist(),
"Collection FPS": collection_fps.tolist(),
"Total FPS": log_data["Perf/total_fps"],
}
# log additional metrics to benchmark services
log_app_start_time(benchmark, (app_start_time_end - app_start_time_begin) / 1e6)
log_python_imports_time(benchmark, (imports_time_end - imports_time_begin) / 1e6)
log_task_start_time(benchmark, (task_startup_time_end - task_startup_time_begin) / 1e6)
log_scene_creation_time(benchmark, Timer.get_timer_info("scene_creation") * 1000)
log_simulation_start_time(benchmark, Timer.get_timer_info("simulation_start") * 1000)
log_total_start_time(benchmark, (task_startup_time_end - app_start_time_begin) / 1e6)
log_runtime_step_times(benchmark, rl_training_times, compute_stats=True)
log_rl_policy_rewards(benchmark, log_data["Train/mean_reward"])
log_rl_policy_episode_lengths(benchmark, log_data["Train/mean_episode_length"])
benchmark.stop()
# log additional metrics to benchmark services
log_app_start_time(benchmark, (app_start_time_end - app_start_time_begin) / 1e6)
log_python_imports_time(benchmark, (imports_time_end - imports_time_begin) / 1e6)
log_task_start_time(benchmark, (task_startup_time_end - task_startup_time_begin) / 1e6)
log_scene_creation_time(benchmark, Timer.get_timer_info("scene_creation") * 1000)
log_simulation_start_time(benchmark, Timer.get_timer_info("simulation_start") * 1000)
log_total_start_time(benchmark, (task_startup_time_end - app_start_time_begin) / 1e6)
log_runtime_step_times(benchmark, rl_training_times, compute_stats=True)
log_rl_policy_rewards(benchmark, log_data["Train/mean_reward"])
log_rl_policy_episode_lengths(benchmark, log_data["Train/mean_episode_length"])
benchmark.stop()
# close the simulator
env.close()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment