From efbd43bbb5fae47cc0b2212b02d63ec262b00ce9 Mon Sep 17 00:00:00 2001
From: Swarit Pandey
Date: Thu, 25 Jun 2026 12:22:21 +0530
Subject: [PATCH] feat(telemetry): log and report lock-acquisition contention at info level

Surface lock-acquisition failures (another instance already running) at
info level instead of Debug so the contention is visible in agent.log,
and report the failed run immediately at the failure site so the backend
records that a second invocation contended for the lock.

reportFailedOnce is idempotent, so the deferred handler firing on the
error return is a no-op.
---
 internal/telemetry/telemetry.go | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/internal/telemetry/telemetry.go b/internal/telemetry/telemetry.go
index 4291612..3436f3a 100644
--- a/internal/telemetry/telemetry.go
+++ b/internal/telemetry/telemetry.go
@@ -363,7 +363,14 @@ func Run(exec executor.Executor, log *progress.Logger, cfg *cli.Config) (err err
 	// Acquire lock
 	lk, err := lock.Acquire(exec)
 	if err != nil {
-		log.Debug("lock acquisition failed: %v", err)
+		// Another instance already holds the lock. Surface at info level (not
+		// Debug) so the contention is visible in agent.log, and report the
+		// failed run right here — don't wait for the deferred handler — so the
+		// backend records that this invocation contended for the lock while one
+		// was already in flight. reportFailedOnce is idempotent, so the deferred
+		// handler that also fires on the error return is a no-op.
+		log.Progress("Lock acquisition failed (PID %d): %v — another instance is already running, exiting", os.Getpid(), err)
+		reportFailedOnce(fmt.Sprintf("lock acquisition failed: %v", err))
 		return fmt.Errorf("acquiring lock: %w", err)
 	}
 	log.Debug("lock acquired (pid=%d)", os.Getpid())