NVIDIA-NeMo · rlratzel · May 11, 2026
@@ -133,6 +133,10 @@ delete_scratch: true
 # for "ray start" to determine object store size.
 object_store_size: 536870912000  # 500GB
 
+# Percentage of GPU memory permitted to be in use prior to running a
+# benchmark before a warning is printed.
+max_allowed_gpu_mem_use_warning_threshold: 0.02
+
 # Global ray settings inherited by all entries. Per-entry ray sections override these values.
 ray:
   num_cpus: 64

@@ -205,7 +205,11 @@ def run_entry(
         ray_cluster_data = get_ray_cluster_data()
         gpu_stats_before = get_gpu_stats()
         logger.info("\tGPU stats (before):")
-        log_gpu_stats(gpu_stats_before, warn_if_in_use=True)
+        warnings = log_gpu_stats(
+            gpu_stats_before,
+            warn_if_in_use=True,
+            warning_threshold=entry.max_allowed_gpu_mem_use_warning_threshold,
+        )
         logger.info(f"\tRunning command {' '.join(cmd) if isinstance(cmd, list) else cmd}")
         started_exec = time.time()
         run_data = run_command_with_timeout(
@@ -217,7 +221,7 @@ def run_entry(
         )
         ended_exec = time.time()
         logger.info("\tGPU stats (after):")
-        log_gpu_stats(get_gpu_stats())
+        warnings += log_gpu_stats(get_gpu_stats(), warn_if_in_use=True)
         duration = ended_exec - started_exec
 
         # Update result_data
@@ -231,6 +235,7 @@ def run_entry(
                 "logs_dir": logs_path,
                 "ray_cluster_data": ray_cluster_data,
                 "gpu_stats": gpu_stats_before,
+                "warnings": warnings,
             }
         )
         # script_persisted_data is a dictionary with keys "params" and "metrics"

@@ -47,6 +47,8 @@ class Entry:
     object_store_size: int | float | str | None = None
     # If set, overrides the session-level delete_scratch setting for this entry
     delete_scratch: bool | None = None
+    # If set, overrides the session-level max_allowed_gpu_mem_use_warning_threshold for this entry
+    max_allowed_gpu_mem_use_warning_threshold: float | None = None
 
     def __post_init__(self) -> None:  # noqa: C901, PLR0912
         """Post-initialization checks and updates for dataclass."""

@@ -46,12 +46,15 @@ class Session:
     object_store_size: int | float | str | None = 0.5
     # Whether to delete the entry's scratch directory after completion by default
     delete_scratch: bool = True
+    # Fraction of total GPU memory (0.0-1.0) above which a pre-run warning is emitted.
+    # If None, any usage > 0 triggers a warning. Entries can override this value.
+    max_allowed_gpu_mem_use_warning_threshold: float | None = None
     # Global ray settings inherited by all entries; per-entry ray sections override these values.
     ray: dict = field(default_factory=dict)
     path_resolver: PathResolver = None
     dataset_resolver: DatasetResolver = None
 
-    def __post_init__(self) -> None:
+    def __post_init__(self) -> None:  # noqa: C901
         """Post-initialization checks and updates for dataclass."""
         names = [entry.name for entry in self.entries]
         if len(names) != len(set(names)):
@@ -78,6 +81,11 @@ def __post_init__(self) -> None:
             if entry.object_store_size is None:
                 entry.object_store_size = self.object_store_size
 
+        # Update max_allowed_gpu_mem_use_warning_threshold for each entry that has not been set.
+        for entry in self.entries:
+            if entry.max_allowed_gpu_mem_use_warning_threshold is None:
+                entry.max_allowed_gpu_mem_use_warning_threshold = self.max_allowed_gpu_mem_use_warning_threshold
+
         # Apply global ray defaults to each entry, with per-entry ray values taking precedence.
         for entry in self.entries:
             entry.ray = {**self.ray, **entry.ray}

@@ -575,15 +575,7 @@ def register_benchmark_entry_finished(self, result_dict: dict[str, Any], benchma
         pings = [] if result_dict["success"] else sink_data.get("ping_on_failure", [])
         status_text = "✅ success" if result_dict["success"] else "❌ FAILED"
 
-        # Warn if any GPU had memory in use before the benchmark started.
-        # TODO: This could be made into a more generic check that can be configured in the sink config.
-        warnings = []
-        for gpu_id, stats in result_dict.get("gpu_stats", {}).items():
-            if stats.get("memory_used", 0) > 0:
-                pct_used = stats["memory_used"] / stats["memory_total"] * 100
-                warnings.append(
-                    f"GPU {gpu_id} had {stats['memory_used']} MiB ({pct_used:.1f}% of total) used before benchmark started"
-                )
+        warnings = result_dict.get("warnings", [])
 
         # Create a new message for the entry to post in the thread.
         msg = self._create_benchmark_entry_message(

@@ -218,20 +218,32 @@ def get_gpu_stats() -> dict:
     return query_data
 
 
-def log_gpu_stats(gpu_stats: dict, warn_if_in_use: bool = False) -> None:
+def log_gpu_stats(gpu_stats: dict, warn_if_in_use: bool = False, warning_threshold: float | None = None) -> list[str]:
     """Log GPU memory usage for each GPU as a percentage of total memory.
 
     Args:
         gpu_stats: Dictionary as returned by get_gpu_stats().
-        warn_if_in_use: If True, emit a warning for any GPU with memory_used > 0.
+        warn_if_in_use: If True, emit a warning for any GPU that exceeds the usage threshold.
+        warning_threshold: Fraction of total GPU memory (0.0-1.0) above which a warning is
+            emitted. If None and warn_if_in_use is True, any usage > 0 triggers a warning.
+
+    Returns:
+        List of warning strings for any GPUs that triggered a warning.
     """
+    warnings = []
     for gpu_id, stats in gpu_stats.items():
         pct_used = stats["memory_used"] / stats["memory_total"] * 100
         logger.info(f"GPU {gpu_id} : {pct_used:.1f}%")
-        if warn_if_in_use and stats["memory_used"] > 0:
-            logger.warning(
-                f"GPU {gpu_id} has {stats['memory_used']} MiB ({pct_used:.1f}% of total) used before benchmark started"
+        if warn_if_in_use:
+            fraction_used = stats["memory_used"] / stats["memory_total"]
+            threshold_exceeded = (
+                fraction_used > warning_threshold if warning_threshold is not None else stats["memory_used"] > 0
             )
+            if threshold_exceeded:
+                msg = f"GPU {gpu_id} has {stats['memory_used']} MiB ({pct_used:.1f}% of total) used before benchmark started"
+                logger.warning(msg)
+                warnings.append(msg)
+    return warnings
 
 
 _LEGACY_PATH_FIELDS = ["results_path", "datasets_path", "model_weights_path"]