Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions benchmarking/nightly-benchmark.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,10 @@ delete_scratch: true
# for "ray start" to determine object store size.
object_store_size: 536870912000 # 500GB

# Percentage of GPU memory permitted to be in use prior to running a
# benchmark before a warning is printed.
max_allowed_gpu_mem_use_warning_threshold: 0.02

# Global ray settings inherited by all entries. Per-entry ray sections override these values.
ray:
num_cpus: 64
Expand Down
9 changes: 7 additions & 2 deletions benchmarking/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,11 @@ def run_entry(
ray_cluster_data = get_ray_cluster_data()
gpu_stats_before = get_gpu_stats()
logger.info("\tGPU stats (before):")
log_gpu_stats(gpu_stats_before, warn_if_in_use=True)
warnings = log_gpu_stats(
gpu_stats_before,
warn_if_in_use=True,
warning_threshold=entry.max_allowed_gpu_mem_use_warning_threshold,
)
logger.info(f"\tRunning command {' '.join(cmd) if isinstance(cmd, list) else cmd}")
started_exec = time.time()
run_data = run_command_with_timeout(
Expand All @@ -217,7 +221,7 @@ def run_entry(
)
ended_exec = time.time()
logger.info("\tGPU stats (after):")
log_gpu_stats(get_gpu_stats())
warnings += log_gpu_stats(get_gpu_stats(), warn_if_in_use=True)
duration = ended_exec - started_exec

# Update result_data
Expand All @@ -231,6 +235,7 @@ def run_entry(
"logs_dir": logs_path,
"ray_cluster_data": ray_cluster_data,
"gpu_stats": gpu_stats_before,
"warnings": warnings,
}
)
# script_persisted_data is a dictionary with keys "params" and "metrics"
Expand Down
2 changes: 2 additions & 0 deletions benchmarking/runner/entry.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ class Entry:
object_store_size: int | float | str | None = None
# If set, overrides the session-level delete_scratch setting for this entry
delete_scratch: bool | None = None
# If set, overrides the session-level max_allowed_gpu_mem_use_warning_threshold for this entry
max_allowed_gpu_mem_use_warning_threshold: float | None = None

def __post_init__(self) -> None: # noqa: C901, PLR0912
"""Post-initialization checks and updates for dataclass."""
Expand Down
10 changes: 9 additions & 1 deletion benchmarking/runner/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,15 @@ class Session:
object_store_size: int | float | str | None = 0.5
# Whether to delete the entry's scratch directory after completion by default
delete_scratch: bool = True
# Fraction of total GPU memory (0.0-1.0) above which a pre-run warning is emitted.
# If None, any usage > 0 triggers a warning. Entries can override this value.
max_allowed_gpu_mem_use_warning_threshold: float | None = None
# Global ray settings inherited by all entries; per-entry ray sections override these values.
ray: dict = field(default_factory=dict)
path_resolver: PathResolver = None
dataset_resolver: DatasetResolver = None

def __post_init__(self) -> None:
def __post_init__(self) -> None: # noqa: C901
"""Post-initialization checks and updates for dataclass."""
names = [entry.name for entry in self.entries]
if len(names) != len(set(names)):
Expand All @@ -78,6 +81,11 @@ def __post_init__(self) -> None:
if entry.object_store_size is None:
entry.object_store_size = self.object_store_size

# Update max_allowed_gpu_mem_use_warning_threshold for each entry that has not been set.
for entry in self.entries:
if entry.max_allowed_gpu_mem_use_warning_threshold is None:
entry.max_allowed_gpu_mem_use_warning_threshold = self.max_allowed_gpu_mem_use_warning_threshold

# Apply global ray defaults to each entry, with per-entry ray values taking precedence.
for entry in self.entries:
entry.ray = {**self.ray, **entry.ray}
Expand Down
10 changes: 1 addition & 9 deletions benchmarking/runner/sinks/slack_sink.py
Original file line number Diff line number Diff line change
Expand Up @@ -575,15 +575,7 @@ def register_benchmark_entry_finished(self, result_dict: dict[str, Any], benchma
pings = [] if result_dict["success"] else sink_data.get("ping_on_failure", [])
status_text = "✅ success" if result_dict["success"] else "❌ FAILED"

# Warn if any GPU had memory in use before the benchmark started.
# TODO: This could be made into a more generic check that can be configured in the sink config.
warnings = []
for gpu_id, stats in result_dict.get("gpu_stats", {}).items():
if stats.get("memory_used", 0) > 0:
pct_used = stats["memory_used"] / stats["memory_total"] * 100
warnings.append(
f"GPU {gpu_id} had {stats['memory_used']} MiB ({pct_used:.1f}% of total) used before benchmark started"
)
warnings = result_dict.get("warnings", [])

# Create a new message for the entry to post in the thread.
msg = self._create_benchmark_entry_message(
Expand Down
22 changes: 17 additions & 5 deletions benchmarking/runner/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,20 +218,32 @@ def get_gpu_stats() -> dict:
return query_data


def log_gpu_stats(gpu_stats: dict, warn_if_in_use: bool = False) -> None:
def log_gpu_stats(gpu_stats: dict, warn_if_in_use: bool = False, warning_threshold: float | None = None) -> list[str]:
"""Log GPU memory usage for each GPU as a percentage of total memory.

Args:
gpu_stats: Dictionary as returned by get_gpu_stats().
warn_if_in_use: If True, emit a warning for any GPU with memory_used > 0.
warn_if_in_use: If True, emit a warning for any GPU that exceeds the usage threshold.
warning_threshold: Fraction of total GPU memory (0.0-1.0) above which a warning is
emitted. If None and warn_if_in_use is True, any usage > 0 triggers a warning.

Returns:
List of warning strings for any GPUs that triggered a warning.
"""
warnings = []
for gpu_id, stats in gpu_stats.items():
pct_used = stats["memory_used"] / stats["memory_total"] * 100
logger.info(f"GPU {gpu_id} : {pct_used:.1f}%")
if warn_if_in_use and stats["memory_used"] > 0:
logger.warning(
f"GPU {gpu_id} has {stats['memory_used']} MiB ({pct_used:.1f}% of total) used before benchmark started"
if warn_if_in_use:
fraction_used = stats["memory_used"] / stats["memory_total"]
threshold_exceeded = (
fraction_used > warning_threshold if warning_threshold is not None else stats["memory_used"] > 0
)
if threshold_exceeded:
msg = f"GPU {gpu_id} has {stats['memory_used']} MiB ({pct_used:.1f}% of total) used before benchmark started"
logger.warning(msg)
warnings.append(msg)
return warnings


_LEGACY_PATH_FIELDS = ["results_path", "datasets_path", "model_weights_path"]
Expand Down
Loading