Skip to content

Commit f42a92f

Browse files
sbryngelson and claude committed
Fix parallel Phase 2: co-locate .gcno with .gcda; cap Phase 1 at 32 workers
The temp-directory approach for Phase 2 caused gcov to fail silently (zero coverage for all tests) because gcov could not resolve source paths when run from a temporary directory.

Fix: copy .gcno files directly alongside the .gcda files in each test's isolated GCOV_PREFIX directory, run gcov from root_dir, then remove the .gcno copies. Each test has its own directory, so parallel execution is still safe.

Also cap Phase 1 workers at 32 (previously uncapped, e.g. n_jobs=64) to prevent OOM kills on large nodes where each MPI test process uses ~500MB.

Add diagnostic output between Phase 1 and Phase 2 that shows the .gcda file count for a sample test, for easier debugging of future issues.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent f1ed539 commit f42a92f

File tree

1 file changed

+65
-45
lines changed

1 file changed

+65
-45
lines changed

toolchain/mfc/test/coverage.py

Lines changed: 65 additions & 45 deletions
Original file line number | Diff line number | Diff line change
@@ -213,54 +213,56 @@ def _collect_single_test_coverage( # pylint: disable=too-many-locals
213213
"""
214214
Collect file-level coverage for a single test, fully self-contained.
215215
216-
Creates a temp directory with copies of .gcda files and their matching
217-
.gcno files, then runs a single batched gcov call. This avoids touching
218-
the shared build tree, making it safe to call concurrently.
216+
Copies .gcno files from the real build tree into the test's isolated
217+
.gcda directory (alongside the .gcda files), runs a batched gcov call,
218+
then removes the .gcno copies. Each test has its own directory, so
219+
this is safe to call concurrently without touching the shared build tree.
219220
"""
220221
build_subdir = os.path.join(test_gcda, "build")
221222
if not os.path.isdir(build_subdir):
222223
return uuid, []
223224

224-
with tempfile.TemporaryDirectory() as tmpdir:
225-
matching_gcno = []
226-
227-
for dirpath, _, filenames in os.walk(build_subdir):
228-
for fname in filenames:
229-
if not fname.endswith(".gcda"):
230-
continue
231-
gcda_src = os.path.join(dirpath, fname)
232-
rel = os.path.relpath(gcda_src, test_gcda)
233-
234-
# Copy .gcda into temp dir
235-
gcda_dst = os.path.join(tmpdir, rel)
236-
os.makedirs(os.path.dirname(gcda_dst), exist_ok=True)
237-
shutil.copy2(gcda_src, gcda_dst)
238-
239-
# Copy matching .gcno from real build tree
240-
gcno_rel = rel[:-5] + ".gcno"
241-
gcno_src = os.path.join(root_dir, gcno_rel)
242-
if os.path.isfile(gcno_src):
243-
gcno_dst = os.path.join(tmpdir, gcno_rel)
244-
shutil.copy2(gcno_src, gcno_dst)
245-
matching_gcno.append(gcno_dst)
246-
247-
if not matching_gcno:
248-
return uuid, []
249-
250-
# Batch: single gcov call for all .gcno files in this test.
251-
cmd = [gcov_bin, "--json-format", "--stdout"] + matching_gcno
252-
try:
253-
proc = subprocess.run(
254-
cmd, capture_output=True, cwd=tmpdir, timeout=120, check=False
255-
)
256-
except (subprocess.TimeoutExpired, subprocess.SubprocessError, OSError):
257-
return uuid, []
225+
gcno_copies = []
258226

259-
if proc.returncode != 0 or not proc.stdout:
260-
return uuid, []
227+
for dirpath, _, filenames in os.walk(build_subdir):
228+
for fname in filenames:
229+
if not fname.endswith(".gcda"):
230+
continue
231+
# Derive matching .gcno path in the real build tree
232+
gcda_path = os.path.join(dirpath, fname)
233+
rel = os.path.relpath(gcda_path, test_gcda)
234+
gcno_rel = rel[:-5] + ".gcno"
235+
gcno_src = os.path.join(root_dir, gcno_rel)
236+
if os.path.isfile(gcno_src):
237+
# Copy .gcno alongside .gcda in the test's isolated dir
238+
gcno_dst = os.path.join(dirpath, fname[:-5] + ".gcno")
239+
shutil.copy2(gcno_src, gcno_dst)
240+
gcno_copies.append(gcno_dst)
241+
242+
if not gcno_copies:
243+
return uuid, []
261244

262-
coverage = _parse_gcov_json_output(proc.stdout, root_dir)
263-
return uuid, sorted(coverage)
245+
# Batch: single gcov call for all .gcno files in this test.
246+
# Run from root_dir so source path resolution works correctly.
247+
cmd = [gcov_bin, "--json-format", "--stdout"] + gcno_copies
248+
try:
249+
proc = subprocess.run(
250+
cmd, capture_output=True, cwd=root_dir, timeout=120, check=False
251+
)
252+
except (subprocess.TimeoutExpired, subprocess.SubprocessError, OSError):
253+
return uuid, []
254+
finally:
255+
for g in gcno_copies:
256+
try:
257+
os.remove(g)
258+
except OSError:
259+
pass
260+
261+
if proc.returncode != 0 or not proc.stdout:
262+
return uuid, []
263+
264+
coverage = _parse_gcov_json_output(proc.stdout, root_dir)
265+
return uuid, sorted(coverage)
264266

265267

266268
def _run_single_test_direct(test_info: dict, gcda_dir: str, strip: str) -> tuple: # pylint: disable=too-many-locals
@@ -390,8 +392,11 @@ def build_coverage_cache( # pylint: disable=unused-argument,too-many-locals,too
390392

391393
if n_jobs is None:
392394
n_jobs = max(os.cpu_count() or 1, 1)
395+
# Cap Phase 1 parallelism: each test spawns MPI processes (~500MB each),
396+
# so too many concurrent tests cause OOM on large nodes.
397+
phase1_jobs = min(n_jobs, 32)
393398
cons.print(f"[bold]Building coverage cache for {len(cases)} tests "
394-
f"({n_jobs} parallel)...[/bold]")
399+
f"({phase1_jobs} test workers, {n_jobs} gcov workers)...[/bold]")
395400
cons.print(f"[dim]Using gcov binary: {gcov_bin}[/dim]")
396401
cons.print(f"[dim]Found {len(gcno_files)} .gcno files[/dim]")
397402
cons.print(f"[dim]GCOV_PREFIX_STRIP={strip}[/dim]")
@@ -412,7 +417,7 @@ def build_coverage_cache( # pylint: disable=unused-argument,too-many-locals,too
412417
cons.print("[bold]Phase 1/2: Running tests...[/bold]")
413418
test_results: dict = {}
414419
all_failures: dict = {}
415-
with ThreadPoolExecutor(max_workers=n_jobs) as pool:
420+
with ThreadPoolExecutor(max_workers=phase1_jobs) as pool:
416421
futures = {
417422
pool.submit(_run_single_test_direct, info, gcda_dir, strip): info
418423
for info in test_infos
@@ -432,9 +437,24 @@ def build_coverage_cache( # pylint: disable=unused-argument,too-many-locals,too
432437
fail_str = ", ".join(f"{t}={rc}" for t, rc in fails)
433438
cons.print(f" [yellow]{uuid}[/yellow]: {fail_str}")
434439

440+
# Diagnostic: verify .gcda files exist for at least one test.
441+
sample_uuid = next(iter(test_results), None)
442+
if sample_uuid:
443+
sample_gcda = test_results[sample_uuid]
444+
sample_build = os.path.join(sample_gcda, "build")
445+
if os.path.isdir(sample_build):
446+
gcda_count = sum(
447+
1 for _, _, fns in os.walk(sample_build)
448+
for f in fns if f.endswith(".gcda")
449+
)
450+
cons.print(f"[dim]Sample test {sample_uuid}: "
451+
f"{gcda_count} .gcda files in {sample_build}[/dim]")
452+
else:
453+
cons.print(f"[yellow]Sample test {sample_uuid}: "
454+
f"no build/ dir in {sample_gcda}[/yellow]")
455+
435456
# Phase 2: Collect gcov coverage from each test's isolated .gcda directory.
436-
# Each test is processed in its own temp dir (copied .gcda + .gcno files)
437-
# with a single batched gcov call, so tests can run in parallel.
457+
# .gcno files are temporarily copied alongside .gcda files, then removed.
438458
cons.print()
439459
cons.print("[bold]Phase 2/2: Collecting coverage...[/bold]")
440460
cache: dict = {}

0 commit comments

Comments
 (0)