Fixes ray lazy metric reporting and hanging processes (#2346)

# Description

The `step()` function in `ray/tuner.py` has several issues that prevent an uninterrupted Ray hyperparameter tuning session. Please refer to #2328 for details.

Fixes #2328.

## Type of change

- Bug fix (non-breaking change which fixes an issue)

## Checklist

- [x] I have run the [`pre-commit` checks](https://pre-commit.com/) with `./isaaclab.sh --format`
- [ ] I have made corresponding changes to the documentation
- [ ] My changes generate no new warnings
- [ ] I have added tests that prove my fix is effective or that my feature works
- [ ] I have updated the changelog and the corresponding version in the extension's `config/extension.toml` file
- [x] I have added my name to the `CONTRIBUTORS.md` or my name already exists there

Fixes ray lazy metric reporting and hanging processes (#2346)
# Description

The `step()` function in `ray/tuner.py` has several issues that prevent an uninterrupted Ray hyperparameter tuning session. Please refer to #2328 for details.

Fixes #2328.

## Type of change

- Bug fix (non-breaking change which fixes an issue)

## Checklist

- [x] I have run the [`pre-commit` checks](https://pre-commit.com/) with `./isaaclab.sh --format`
- [ ] I have made corresponding changes to the documentation
- [ ] My changes generate no new warnings
- [ ] I have added tests that prove my fix is effective or that my feature works
- [ ] I have updated the changelog and the corresponding version in the extension's `config/extension.toml` file
- [x] I have added my name to the `CONTRIBUTORS.md` or my name already exists there
c9997d6a · Özhan Özen · GitHub · 82cb3206 · c9997d6a · c9997d6a
Unverified Commit c9997d6a authored May 08, 2025 by Özhan Özen Committed by GitHub May 08, 2025
5 changed files
--- a/scripts/reinforcement_learning/ray/tuner.py
+++ b/scripts/reinforcement_learning/ray/tuner.py
@@ -5,8 +5,9 @@
 import argparse
 import importlib.util
 import os
+import subprocess
 import sys
-from time import sleep
+from time import sleep, time

 import ray
 import util
@@ -57,6 +58,9 @@ BASE_DIR = os.path.expanduser("~")
 PYTHON_EXEC = "./isaaclab.sh -p"
 WORKFLOW = "scripts/reinforcement_learning/rl_games/train.py"
 NUM_WORKERS_PER_NODE = 1  # needed for local parallelism
+PROCESS_RESPONSE_TIMEOUT = 200.0  # seconds to wait before killing the process when it stops responding
+MAX_LINES_TO_SEARCH_EXPERIMENT_LOGS = 1000  # maximum number of lines to read from the training process logs
+MAX_LOG_EXTRACTION_ERRORS = 2  # maximum allowed LogExtractionErrors before we abort the whole training


 class IsaacLabTuneTrainable(tune.Trainable):
@@ -70,6 +74,7 @@ class IsaacLabTuneTrainable(tune.Trainable):
    def setup(self, config: dict) -> None:
        """Get the invocation command, return quick for easy scheduling."""
        self.data = None
+        self.time_since_last_proc_response = 0.0
        self.invoke_cmd = util.get_invocation_command_from_cfg(cfg=config, python_cmd=PYTHON_EXEC, workflow=WORKFLOW)
        print(f"[INFO]: Recovered invocation with {self.invoke_cmd}")
        self.experiment = None
@@ -84,12 +89,21 @@ class IsaacLabTuneTrainable(tune.Trainable):
            # When including this as first step instead of setup, experiments get scheduled faster
            # Don't want to block the scheduler while the experiment spins up
            print(f"[INFO]: Invoking experiment as first step with {self.invoke_cmd}...")
-            experiment = util.execute_job(
-                self.invoke_cmd,
-                identifier_string="",
-                extract_experiment=True,
-                persistent_dir=BASE_DIR,
-            )
+            try:
+                experiment = util.execute_job(
+                    self.invoke_cmd,
+                    identifier_string="",
+                    extract_experiment=True,  # Keep this as True to return a valid dictionary
+                    persistent_dir=BASE_DIR,
+                    max_lines_to_search_logs=MAX_LINES_TO_SEARCH_EXPERIMENT_LOGS,
+                    max_time_to_search_logs=PROCESS_RESPONSE_TIMEOUT,
+                )
+            except util.LogExtractionError:
+                self.data = {
+                    "LOG_EXTRACTION_ERROR_STOPPER_FLAG": True,
+                    "done": True,
+                }
+                return self.data
            self.experiment = experiment
            print(f"[INFO]: Tuner recovered experiment info {experiment}")
            self.proc = experiment["proc"]
@@ -109,11 +123,35 @@ class IsaacLabTuneTrainable(tune.Trainable):

            while data is None:
                data = util.load_tensorboard_logs(self.tensorboard_logdir)
+                proc_status = self.proc.poll()
+                if proc_status is not None:
+                    break
                sleep(2)  # Lazy report metrics to avoid performance overhead

            if self.data is not None:
-                while util._dicts_equal(data, self.data):
+                data_ = {k: v for k, v in data.items() if k != "done"}
+                self_data_ = {k: v for k, v in self.data.items() if k != "done"}
+                unresponsiveness_start_time = time()
+                while util._dicts_equal(data_, self_data_):
+                    self.time_since_last_proc_response = time() - unresponsiveness_start_time
                    data = util.load_tensorboard_logs(self.tensorboard_logdir)
+                    data_ = {k: v for k, v in data.items() if k != "done"}
+                    proc_status = self.proc.poll()
+                    if proc_status is not None:
+                        break
+                    if self.time_since_last_proc_response > PROCESS_RESPONSE_TIMEOUT:
+                        self.time_since_last_proc_response = 0.0
+                        print("[WARNING]: Training workflow process is not responding, terminating...")
+                        self.proc.terminate()
+                        try:
+                            self.proc.wait(timeout=20)
+                        except subprocess.TimeoutExpired:
+                            print("[ERROR]: The process did not terminate within timeout duration.")
+                            self.proc.kill()
+                            self.proc.wait()
+                        self.data = data
+                        self.data["done"] = True
+                        return self.data
                    sleep(2)  # Lazy report metrics to avoid performance overhead

            self.data = data
@@ -132,6 +170,39 @@ class IsaacLabTuneTrainable(tune.Trainable):
        )


+class LogExtractionErrorStopper(tune.Stopper):
+    """Stopper that aborts the whole tuning run once too many trials report a LogExtractionError.
+
+    Args:
+        max_errors: Number of LogExtractionErrors tolerated; the run is aborted once the count exceeds this value.
+    """
+
+    def __init__(self, max_errors: int):
+        self.max_errors = max_errors
+        self.error_count = 0
+
+    def __call__(self, trial_id, result):
+        """Count a LogExtractionError when the result has a truthy ``LOG_EXTRACTION_ERROR_STOPPER_FLAG`` entry.
+
+        Never stops an individual trial from here: the return value is always False.
+        """
+        if result.get("LOG_EXTRACTION_ERROR_STOPPER_FLAG", False):
+            self.error_count += 1
+            print(
+                f"[ERROR]: Encountered LogExtractionError {self.error_count} times. "
+                f"Maximum allowed is {self.max_errors}."
+            )
+        return False
+
+    def stop_all(self):
+        """Return True, aborting the entire tuning run, once the error count exceeds ``max_errors``; else False."""
+        if self.error_count > self.max_errors:  # strictly greater: max_errors failures are still tolerated
+            print("[FATAL]: Encountered LogExtractionError more than allowed, aborting entire tuning run... ")
+            return True
+        else:
+            return False
+
+
 def invoke_tuning_run(cfg: dict, args: argparse.Namespace) -> None:
    """Invoke an Isaac-Ray tuning run.

@@ -175,6 +246,7 @@ def invoke_tuning_run(cfg: dict, args: argparse.Namespace) -> None:
                checkpoint_frequency=0,  # Disable periodic checkpointing
                checkpoint_at_end=False,  # Disable final checkpoint
            ),
+            stop=LogExtractionErrorStopper(max_errors=MAX_LOG_EXTRACTION_ERRORS),
        )

    elif args.run_mode == "remote":  # MLFlow, to MLFlow server
@@ -190,6 +262,7 @@ def invoke_tuning_run(cfg: dict, args: argparse.Namespace) -> None:
            storage_path="/tmp/ray",
            callbacks=[mlflow_callback],
            checkpoint_config=ray.train.CheckpointConfig(checkpoint_frequency=0, checkpoint_at_end=False),
+            stop=LogExtractionErrorStopper(max_errors=MAX_LOG_EXTRACTION_ERRORS),
        )
    else:
        raise ValueError("Unrecognized run mode.")
@@ -199,6 +272,8 @@ def invoke_tuning_run(cfg: dict, args: argparse.Namespace) -> None:
        IsaacLabTuneTrainable,
        param_space=cfg,
        tune_config=tune.TuneConfig(
+            metric=args.metric,
+            mode=args.mode,
            search_alg=repeat_search,
            num_samples=args.num_samples,
            reuse_actors=True,
@@ -306,8 +381,39 @@ if __name__ == "__main__":
        default=3,
        help="How many times to repeat each hyperparameter config.",
    )
+    parser.add_argument(
+        "--process_response_timeout",
+        type=float,
+        default=PROCESS_RESPONSE_TIMEOUT,
+        help="Training workflow process response timeout.",
+    )
+    parser.add_argument(
+        "--max_lines_to_search_experiment_logs",
+        type=float,
+        default=MAX_LINES_TO_SEARCH_EXPERIMENT_LOGS,
+        help="Max number of lines to search for experiment logs before terminating the training workflow process.",
+    )
+    parser.add_argument(
+        "--max_log_extraction_errors",
+        type=float,
+        default=MAX_LOG_EXTRACTION_ERRORS,
+        help="Max number number of LogExtractionError failures before we abort the whole tuning run.",
+    )

    args = parser.parse_args()
+    PROCESS_RESPONSE_TIMEOUT = args.process_response_timeout
+    MAX_LINES_TO_SEARCH_EXPERIMENT_LOGS = int(args.max_lines_to_search_experiment_logs)
+    print(
+        "[INFO]: The max number of lines to search for experiment logs before (early) terminating the training "
+        f"workflow process is set to {MAX_LINES_TO_SEARCH_EXPERIMENT_LOGS}.\n"
+        "[INFO]: The process response timeout, used while updating tensorboard scalars and searching for "
+        f"experiment logs, is set to {PROCESS_RESPONSE_TIMEOUT} seconds."
+    )
+    MAX_LOG_EXTRACTION_ERRORS = int(args.max_log_extraction_errors)
+    print(
+        "[INFO]: Max number of LogExtractionError failures before we abort the whole tuning run is "
+        f"set to {MAX_LOG_EXTRACTION_ERRORS}.\n"
+    )
    NUM_WORKERS_PER_NODE = args.num_workers_per_node
    print(f"[INFO]: Using {NUM_WORKERS_PER_NODE} workers per node.")
    if args.run_mode == "remote":

--- a/scripts/reinforcement_learning/ray/util.py
+++ b/scripts/reinforcement_learning/ray/util.py
@@ -5,10 +5,12 @@
 import argparse
 import os
 import re
+import select
 import subprocess
 import threading
 from datetime import datetime
 from math import isclose
+from time import time

 import ray
 from tensorboard.backend.event_processing.directory_watcher import DirectoryDeletedError
@@ -26,6 +28,12 @@ def load_tensorboard_logs(directory: str) -> dict:
        The latest available scalar values.
    """

+    # replace chars other than alphanumerics, "_", "." and "/" with "_", collapse runs of "_", strip edge "_"
+    def replace_invalid_chars(t):
+        t2 = re.sub(r"[^0-9A-Za-z_./]", "_", t)
+        t2 = re.sub(r"_+", "_", t2)
+        return t2.strip("_")
+
    # Initialize the event accumulator with a size guidance for only the latest entry
    def get_latest_scalars(path: str) -> dict:
        event_acc = EventAccumulator(path, size_guidance={"scalars": 1})
@@ -33,7 +41,7 @@ def load_tensorboard_logs(directory: str) -> dict:
            event_acc.Reload()
            if event_acc.Tags()["scalars"]:
                return {
-                    tag: event_acc.Scalars(tag)[-1].value
+                    replace_invalid_chars(tag): event_acc.Scalars(tag)[-1].value
                    for tag in event_acc.Tags()["scalars"]
                    if event_acc.Scalars(tag)
                }
@@ -98,6 +106,12 @@ def remote_execute_job(
    )


+class LogExtractionError(Exception):
+    """Raised by ``execute_job`` when experiment_name/logdir cannot be extracted from the trainer output."""
+
+    pass
+
+
 def execute_job(
    job_cmd: str,
    identifier_string: str = "job 0",
@@ -105,6 +119,8 @@ def execute_job(
    extract_experiment: bool = False,
    persistent_dir: str | None = None,
    log_all_output: bool = False,
+    max_lines_to_search_logs: int = 1000,
+    max_time_to_search_logs: float = 200.0,
 ) -> str | dict:
    """Issue a job (shell command).

@@ -117,6 +133,8 @@ def execute_job(
        persistent_dir: When supplied, change to run the directory in a persistent
            directory. Can be used to avoid losing logs in the /tmp directory. Defaults to None.
        log_all_output: When true, print all output to the console. Defaults to False.
+        max_lines_to_search_logs: Maximum number of lines to search for experiment info. Defaults to 1000.
+        max_time_to_search_logs: Maximum time to wait for experiment info before giving up. Defaults to 200.0 seconds.
    Raises:
        ValueError: If the job is unable to start, or throws an error. Most likely to happen
            due to running out of memory.
@@ -190,6 +208,8 @@ def execute_job(
        process = subprocess.Popen(
            job_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, bufsize=1
        )
+        process_file_descriptor = process.stdout.fileno()
+
        if persistent_dir:
            os.chdir(og_dir)
        experiment_name = None
@@ -205,48 +225,80 @@ def execute_job(
                if log_all_output:
                    print(f"{identifier_string}: {line}")

-        # Read stdout until we find experiment info
+        # Read stdout until we find exp. info, up to max_lines_to_search_logs lines, max_time_to_search_logs, or EOF.
        # Do some careful handling prevent overflowing the pipe reading buffer with error 141
-        for line in iter(process.stdout.readline, ""):
-            line = line.strip()
-            result_details.append(f"{identifier_string}: {line} \n")
-            if log_all_output:
-                print(f"{identifier_string}: {line}")
-
-            if extract_experiment:
-                exp_match = experiment_info_pattern.search(line)
-                log_match = logdir_pattern.search(line)
-                err_match = err_pattern.search(line)
-
-                if err_match:
-                    raise ValueError(f"Encountered an error during trial run. {' '.join(result_details)}")
-
-                if exp_match:
-                    experiment_name = exp_match.group(1)
-                if log_match:
-                    logdir = log_match.group(1)
-
-                if experiment_name and logdir:
-                    # Start stderr reader after finding experiment info
-                    stderr_thread = threading.Thread(
-                        target=stream_reader, args=(process.stderr, identifier_string, result_details)
-                    )
-                    stderr_thread.daemon = True
-                    stderr_thread.start()
-
-                    # Start stdout reader to continue reading to flush buffer
-                    stdout_thread = threading.Thread(
-                        target=stream_reader, args=(process.stdout, identifier_string, result_details)
-                    )
-                    stdout_thread.daemon = True
-                    stdout_thread.start()
-
-                    return {
-                        "experiment_name": experiment_name,
-                        "logdir": logdir,
-                        "proc": process,
-                        "result": " ".join(result_details),
-                    }
+        lines_read = 0
+        search_duration = 0.0
+        search_start_time = time()
+        while True:
+            new_line_ready, _, _ = select.select([process_file_descriptor], [], [], 1.0)  # Wait up to 1s for stdout
+            if new_line_ready:
+                line = process.stdout.readline()
+                if not line:  # EOF
+                    break
+
+                lines_read += 1
+                line = line.strip()
+                result_details.append(f"{identifier_string}: {line} \n")
+
+                if log_all_output:
+                    print(f"{identifier_string}: {line}")
+
+                if extract_experiment:
+                    exp_match = experiment_info_pattern.search(line)
+                    log_match = logdir_pattern.search(line)
+                    err_match = err_pattern.search(line)
+
+                    if err_match:
+                        raise ValueError(f"Encountered an error during trial run. {' '.join(result_details)}")
+
+                    if exp_match:
+                        experiment_name = exp_match.group(1)
+                    if log_match:
+                        logdir = log_match.group(1)
+
+                    if experiment_name and logdir:
+                        # Start stderr reader after finding experiment info
+                        stderr_thread = threading.Thread(
+                            target=stream_reader, args=(process.stderr, identifier_string, result_details)
+                        )
+                        stderr_thread.daemon = True
+                        stderr_thread.start()
+
+                        # Start stdout reader to continue reading to flush buffer
+                        stdout_thread = threading.Thread(
+                            target=stream_reader, args=(process.stdout, identifier_string, result_details)
+                        )
+                        stdout_thread.daemon = True
+                        stdout_thread.start()
+
+                        return {
+                            "experiment_name": experiment_name,
+                            "logdir": logdir,
+                            "proc": process,
+                            "result": " ".join(result_details),
+                        }
+
+            if extract_experiment:  # if we are looking for experiment info, check for timeouts and line limits
+                search_duration = time() - search_start_time
+                if search_duration > max_time_to_search_logs:
+                    print(f"[ERROR]: Could not find experiment logs within {max_time_to_search_logs} seconds.")
+                    break
+                if lines_read >= max_lines_to_search_logs:
+                    print(f"[ERROR]: Could not find experiment logs within first {max_lines_to_search_logs} lines.")
+                    break
+
+        # If we reach here, we didn't find experiment info in the output
+        if extract_experiment and not (experiment_name and logdir):
+            error_msg = (
+                "Could not extract experiment_name/logdir from trainer output "
+                f"(experiment_name={experiment_name!r}, logdir={logdir!r}).\n"
+                "\tMake sure your training script prints the following correctly:\n"
+                "\t\tExact experiment name requested from command line: <name>\n"
+                "\t\t[INFO] Logging experiment in directory: <logdir>\n\n"
+            )
+            print(f"[ERROR]: {error_msg}")
+            raise LogExtractionError("Could not extract experiment_name/logdir from training workflow output.")
        process.wait()
        now = datetime.now().strftime("%H:%M:%S.%f")
        completion_info = f"\n[INFO]: {identifier_string}: Job Started at {start_time}, completed at {now}\n"

--- a/scripts/reinforcement_learning/rsl_rl/train.py
+++ b/scripts/reinforcement_learning/rsl_rl/train.py
@@ -131,7 +131,7 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen
    print(f"[INFO] Logging experiment in directory: {log_root_path}")
    # specify directory for logging runs: {time-stamp}_{run_name}
    log_dir = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
-    # This way, the Ray Tune workflow can extract experiment name.
+    # The Ray Tune workflow extracts the experiment name from the line printed below, so do not change its format (see PR #2346, comment-2819298849)
    print(f"Exact experiment name requested from command line: {log_dir}")
    if agent_cfg.run_name:
        log_dir += f"_{agent_cfg.run_name}"

--- a/scripts/reinforcement_learning/sb3/train.py
+++ b/scripts/reinforcement_learning/sb3/train.py
@@ -95,6 +95,7 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen
    run_info = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    log_root_path = os.path.abspath(os.path.join("logs", "sb3", args_cli.task))
    print(f"[INFO] Logging experiment in directory: {log_root_path}")
+    # The Ray Tune workflow extracts the experiment name from the line printed below, so do not change its format (see PR #2346, comment-2819298849)
    print(f"Exact experiment name requested from command line: {run_info}")
    log_dir = os.path.join(log_root_path, run_info)
    # dump the configuration into log-directory

--- a/scripts/reinforcement_learning/skrl/train.py
+++ b/scripts/reinforcement_learning/skrl/train.py
@@ -140,7 +140,8 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen
    print(f"[INFO] Logging experiment in directory: {log_root_path}")
    # specify directory for logging runs: {time-stamp}_{run_name}
    log_dir = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + f"_{algorithm}_{args_cli.ml_framework}"
-    print(f"Exact experiment name requested from command line {log_dir}")
+    # The Ray Tune workflow extracts the experiment name from the line printed below, so do not change its format (see PR #2346, comment-2819298849)
+    print(f"Exact experiment name requested from command line: {log_dir}")
    if agent_cfg["agent"]["experiment"]["experiment_name"]:
        log_dir += f'_{agent_cfg["agent"]["experiment"]["experiment_name"]}'
    # set directory into agent config