diff --git a/benchmarking/nightly-benchmark.yaml b/benchmarking/nightly-benchmark.yaml index 210363192c..f961c53a7e 100644 --- a/benchmarking/nightly-benchmark.yaml +++ b/benchmarking/nightly-benchmark.yaml @@ -133,6 +133,10 @@ delete_scratch: true # for "ray start" to determine object store size. object_store_size: 536870912000 # 500GB +# Percentage of GPU memory permitted to be in use prior to running a +# benchmark before a warning is printed. +max_allowed_gpu_mem_use_warning_threshold: 0.02 + # Global ray settings inherited by all entries. Per-entry ray sections override these values. ray: num_cpus: 64 diff --git a/benchmarking/run.py b/benchmarking/run.py index 2784442baf..3c32180981 100755 --- a/benchmarking/run.py +++ b/benchmarking/run.py @@ -205,7 +205,11 @@ def run_entry( ray_cluster_data = get_ray_cluster_data() gpu_stats_before = get_gpu_stats() logger.info("\tGPU stats (before):") - log_gpu_stats(gpu_stats_before, warn_if_in_use=True) + warnings = log_gpu_stats( + gpu_stats_before, + warn_if_in_use=True, + warning_threshold=entry.max_allowed_gpu_mem_use_warning_threshold, + ) logger.info(f"\tRunning command {' '.join(cmd) if isinstance(cmd, list) else cmd}") started_exec = time.time() run_data = run_command_with_timeout( @@ -217,7 +221,7 @@ def run_entry( ) ended_exec = time.time() logger.info("\tGPU stats (after):") - log_gpu_stats(get_gpu_stats()) + warnings += log_gpu_stats(get_gpu_stats(), warn_if_in_use=True) duration = ended_exec - started_exec # Update result_data @@ -231,6 +235,7 @@ def run_entry( "logs_dir": logs_path, "ray_cluster_data": ray_cluster_data, "gpu_stats": gpu_stats_before, + "warnings": warnings, } ) # script_persisted_data is a dictionary with keys "params" and "metrics" diff --git a/benchmarking/runner/entry.py b/benchmarking/runner/entry.py index 525aca05d5..26ae495d9d 100644 --- a/benchmarking/runner/entry.py +++ b/benchmarking/runner/entry.py @@ -47,6 +47,8 @@ class Entry: object_store_size: int | float | str | None = None # If set, overrides the session-level delete_scratch setting for this entry delete_scratch: bool | None = None + # If set, overrides the session-level max_allowed_gpu_mem_use_warning_threshold for this entry + max_allowed_gpu_mem_use_warning_threshold: float | None = None def __post_init__(self) -> None: # noqa: C901, PLR0912 """Post-initialization checks and updates for dataclass.""" diff --git a/benchmarking/runner/session.py b/benchmarking/runner/session.py index 79bcd41fd6..77842c55c1 100644 --- a/benchmarking/runner/session.py +++ b/benchmarking/runner/session.py @@ -46,12 +46,15 @@ class Session: object_store_size: int | float | str | None = 0.5 # Whether to delete the entry's scratch directory after completion by default delete_scratch: bool = True + # Fraction of total GPU memory (0.0-1.0) above which a pre-run warning is emitted. + # If None, any usage > 0 triggers a warning. Entries can override this value. + max_allowed_gpu_mem_use_warning_threshold: float | None = None # Global ray settings inherited by all entries; per-entry ray sections override these values. ray: dict = field(default_factory=dict) path_resolver: PathResolver = None dataset_resolver: DatasetResolver = None - def __post_init__(self) -> None: + def __post_init__(self) -> None: # noqa: C901 """Post-initialization checks and updates for dataclass.""" names = [entry.name for entry in self.entries] if len(names) != len(set(names)): @@ -78,6 +81,11 @@ def __post_init__(self) -> None: if entry.object_store_size is None: entry.object_store_size = self.object_store_size + # Update max_allowed_gpu_mem_use_warning_threshold for each entry that has not been set. + for entry in self.entries: + if entry.max_allowed_gpu_mem_use_warning_threshold is None: + entry.max_allowed_gpu_mem_use_warning_threshold = self.max_allowed_gpu_mem_use_warning_threshold + # Apply global ray defaults to each entry, with per-entry ray values taking precedence. for entry in self.entries: entry.ray = {**self.ray, **entry.ray} diff --git a/benchmarking/runner/sinks/slack_sink.py b/benchmarking/runner/sinks/slack_sink.py index fb09a27788..47bfce18d4 100644 --- a/benchmarking/runner/sinks/slack_sink.py +++ b/benchmarking/runner/sinks/slack_sink.py @@ -575,15 +575,7 @@ def register_benchmark_entry_finished(self, result_dict: dict[str, Any], benchma pings = [] if result_dict["success"] else sink_data.get("ping_on_failure", []) status_text = "✅ success" if result_dict["success"] else "❌ FAILED" - # Warn if any GPU had memory in use before the benchmark started. - # TODO: This could be made into a more generic check that can be configured in the sink config. - warnings = [] - for gpu_id, stats in result_dict.get("gpu_stats", {}).items(): - if stats.get("memory_used", 0) > 0: - pct_used = stats["memory_used"] / stats["memory_total"] * 100 - warnings.append( - f"GPU {gpu_id} had {stats['memory_used']} MiB ({pct_used:.1f}% of total) used before benchmark started" - ) + warnings = result_dict.get("warnings", []) # Create a new message for the entry to post in the thread. msg = self._create_benchmark_entry_message( diff --git a/benchmarking/runner/utils.py b/benchmarking/runner/utils.py index 45e4bef2c6..ecbc37c7cb 100644 --- a/benchmarking/runner/utils.py +++ b/benchmarking/runner/utils.py @@ -218,20 +218,32 @@ def get_gpu_stats() -> dict: return query_data -def log_gpu_stats(gpu_stats: dict, warn_if_in_use: bool = False) -> None: +def log_gpu_stats(gpu_stats: dict, warn_if_in_use: bool = False, warning_threshold: float | None = None) -> list[str]: """Log GPU memory usage for each GPU as a percentage of total memory. Args: gpu_stats: Dictionary as returned by get_gpu_stats(). - warn_if_in_use: If True, emit a warning for any GPU with memory_used > 0. + warn_if_in_use: If True, emit a warning for any GPU that exceeds the usage threshold. + warning_threshold: Fraction of total GPU memory (0.0-1.0) above which a warning is + emitted. If None and warn_if_in_use is True, any usage > 0 triggers a warning. + + Returns: + List of warning strings for any GPUs that triggered a warning. """ + warnings = [] for gpu_id, stats in gpu_stats.items(): pct_used = stats["memory_used"] / stats["memory_total"] * 100 logger.info(f"GPU {gpu_id} : {pct_used:.1f}%") - if warn_if_in_use and stats["memory_used"] > 0: - logger.warning( - f"GPU {gpu_id} has {stats['memory_used']} MiB ({pct_used:.1f}% of total) used before benchmark started" + if warn_if_in_use: + fraction_used = stats["memory_used"] / stats["memory_total"] + threshold_exceeded = ( + fraction_used > warning_threshold if warning_threshold is not None else stats["memory_used"] > 0 ) + if threshold_exceeded: + msg = f"GPU {gpu_id} has {stats['memory_used']} MiB ({pct_used:.1f}% of total) used before benchmark started" + logger.warning(msg) + warnings.append(msg) + return warnings _LEGACY_PATH_FIELDS = ["results_path", "datasets_path", "model_weights_path"]