diff --git a/.github/workflows/ci-health-sweep.yml b/.github/workflows/ci-health-sweep.yml
new file mode 100644
index 00000000..64f8e208
--- /dev/null
+++ b/.github/workflows/ci-health-sweep.yml
@@ -0,0 +1,63 @@
+# SPDX-License-Identifier: MPL-2.0
+# Owner: Jonathan D.A. Jewell
+# CI-Health Sweep — estate-wide detection + auto-remediation of the
+# infrastructure CI failure classes diagnosed 2026-06-13 (see
+# scripts/ci-health/README.adoc):
+#   A-BILLING      account spending-limit/payment wall      -> REPORT (owner-only)
+#   B-ALLOWLIST    selected + no hyperpolymath/* pattern    -> AUTO-FIX (allow-list)
+#   B-STARTUPFAIL  observed startup_failure runs            -> REPORT
+#   D-BURN         bare [push, pull_request] double-trigger -> AUTO-FIX (PR)
+#
+# Complements hypatia-remediation-sweep.yml (Dependabot) — same fleet-sweep
+# shape, staggered cron to avoid token-rate-limit collisions.
+name: CI-Health Sweep
+
+on:
+  schedule:
+    - cron: '47 3 * * *'  # daily 03:47 UTC (staggered off remediation-sweep 03:17)
+  workflow_dispatch:
+    inputs:
+      dry_run:
+        description: 'Detect + report only (no allow-list PUTs, no PRs)'
+        required: false
+        default: 'true'
+        type: choice
+        options: ['true', 'false']
+      owner:
+        description: 'Owner to sweep'
+        required: false
+        default: 'hyperpolymath'
+        type: string
+
+permissions:
+  contents: read
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  # Queue instead of cancel: the sweep writes to other repos (branch, commits,
+  # then PR). Aborting it mid-run can leave a branch with no PR behind.
+  cancel-in-progress: false
+
+jobs:
+  sweep:
+    name: Detect + remediate estate CI health
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+      - name: Run CI-health sweep
+        env:
+          # PAT needs classic `repo` + `workflow` scope (repo admin for the
+          # allow-list PUT, PR create, and issue upsert). Reused from the
+          # existing remediation pipeline.
+          GH_TOKEN: ${{ secrets.HYPATIA_DISPATCH_PAT }}
+          OWNER: ${{ inputs.owner || 'hyperpolymath' }}
+          # Scheduled runs remediate live (owner opted into auto-remediation);
+          # manual dispatch defaults to dry-run for safety.
+          DRY_RUN: ${{ github.event_name == 'schedule' && 'false' || inputs.dry_run }}
+          MAX_BURN_PRS: '15'
+          ISSUE_REPO: hypatia
+          CI_HEALTH_DENYLIST: '007' # ARR-special / cross-owner repos to never touch
+        run: |
+          chmod +x scripts/ci-health/*.sh
+          ./scripts/ci-health/sweep.sh
diff --git a/scripts/ci-health/README.adoc b/scripts/ci-health/README.adoc
new file mode 100644
index 00000000..a40beb45
--- /dev/null
+++ b/scripts/ci-health/README.adoc
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: MPL-2.0
+// SPDX-FileCopyrightText: 2026 Jonathan D.A. Jewell (hyperpolymath)
+// Owner: Jonathan D.A. Jewell
+= CI-Health Sweep — estate failure-class detector + auto-remediator
+:toc: macro
+:icons: font
+
+Detects and auto-remediates the *infrastructure* CI failure classes that
+repeatedly reddened estate CI, diagnosed 2026-06-13 from `tma-mark2#52`. These
+are **not** workflow-logic bugs — which is why earlier rounds of per-workflow
+"surface fixes" had zero effect. The cause lived in account billing and repo
+Actions settings.
+
+toc::[]
+
+== Failure classes
+
+[cols="1,3,1",options="header"]
+|===
+| Class | Signature | Action
+
+| `A-BILLING`
+| A `failure` run whose job annotation reads _"the job was not started because
+recent account payments have failed or your spending limit needs to be
+increased"_. Blocks **all billable jobs** (private repos). Manifests on
+code-scanning gates as _"waiting for results from CodeQL"_ (no result is ever
+uploaded because the job never starts).
+| **Report only** — owner clears it in GitHub → Settings → Billing & plans.
+
+| `B-ALLOWLIST`
+| `allowed_actions: selected` with `patterns_allowed` missing `hyperpolymath/*`.
+Only GitHub-owned + verified-marketplace actions resolve, so every +`hyperpolymath/*` reusable + every non-verified third-party action is blocked +at parse → `startup_failure` (zero jobs, no logs). Hits public repos too. +| **Auto-fix** — PUT the curated superset (`action-superset.txt`), keep +`sha_pinning_required`. + +| `B-STARTUPFAIL` +| Observed `startup_failure` runs. +| **Report** — the API never exposes the reason; the **web-UI red banner does** +(it names the blocked action). Usually a missing allow-list pattern. + +| `D-BURN` +| A workflow on bare `on: [push, pull_request]` — runs **twice** per PR commit +(branch push + PR synchronize). +| **Auto-fix** — open a PR scoping `push` to the default branch + adding +`concurrency: cancel-in-progress` (safe: these are read-only PR checks). +|=== + +== Components + +* `detect.sh <repo>` — API-only classifier; emits TSV `repo CLASS SEV detail`. +* `remediate.sh <repo> <CLASS> [dry_run]` — applies B-ALLOWLIST (allow-list PUT) + and D-BURN (idempotent signed PR). Guardrails: own repos only (skips + forks/archived), `CI_HEALTH_DENYLIST`, idempotent, dry-run honoured. +* `sweep.sh` — estate driver: enumerate → detect → remediate (cap + `MAX_BURN_PRS`) → upsert one rolling tracking issue. +* `action-superset.txt` — curated non-GitHub actions the estate uses, applied as + `owner/repo@*` alongside `hyperpolymath/*`. +* `../../.github/workflows/ci-health-sweep.yml` — daily cron (03:47 UTC) + + `workflow_dispatch` (manual defaults to dry-run). + +== Operating notes / off-switch + +* Needs `secrets.HYPATIA_DISPATCH_PAT` (classic `repo` + `workflow` scope — repo + admin for the allow-list PUT, PR create, issue upsert). +* Scheduled runs remediate **live**; manual `workflow_dispatch` defaults to + **dry-run**. To pause auto-remediation, set the schedule run to dry-run or + disable the workflow. +* Sweep gotchas baked in: drive off the API owner (never local dir names — some + estate clones' `origin` ≠ dir name, e.g.
`mirror-007` → `hyperpolymath/007`); + PR-create always passes an explicit `base`; forks/archived/deleted remotes are + skipped, not failed. +* New shared third-party action? Add it to `action-superset.txt`. diff --git a/scripts/ci-health/action-superset.txt b/scripts/ci-health/action-superset.txt new file mode 100644 index 00000000..fd69bd6d --- /dev/null +++ b/scripts/ci-health/action-superset.txt @@ -0,0 +1,75 @@ +8398a7/action-slack +actions-rust-lang/setup-rust-toolchain +ad-m/github-push-action +alire-project/setup-alire +anchore/scan-action +anthropics/claude-code-action +aquasecurity/trivy-action +astral-sh/setup-uv +awalsh128/cache-apt-pkgs-action +azure/webapps-deploy +benchmark-action/github-action-benchmark +cachix/install-nix-action +cbrgm/cleanup-stale-branches-action +codecov/codecov-action +cometkim/rclone-actions +DavidAnson/markdownlint-cli2-action +dawidd6/action-download-artifact +DeLaGuardo/setup-clojure +denoland/setup-deno +dependabot/fetch-metadata +dependency-check/Dependency-Check_Action +devcontainers/ci +dlang-community/setup-dlang +docker/build-push-action +docker/login-action +docker/metadata-action +docker/setup-buildx-action +docker/setup-qemu-action +dtolnay/rust-action +dtolnay/rust-toolchain +editorconfig-checker/action-editorconfig-checker +EnricoMi/publish-unit-test-result-action +erlef/setup-beam +extractions/setup-just +gitleaks/gitleaks-action +google/clusterfuzzlite +goto-bus-stop/setup-zig +hadolint/hadolint-action +hashicorp/setup-terraform +haskell-actions/hlint-run +haskell-actions/hlint-setup +haskell-actions/setup +ibiqlik/action-yamllint +ionos-deploy-now/deploy-to-ionos-action +ionos-deploy-now/retrieve-project-info-action +ionos-deploy-now/template-renderer-action +jetli/wasm-pack-action +julia-actions/cache +julia-actions/julia-processcoverage +julia-actions/setup-julia +KSXGitHub/github-actions-deploy-aur +ludeeus/action-shellcheck +lycheeverse/lychee-action +mlugg/setup-zig +ocaml/setup-ocaml 
+orhun/git-cliff-action +ossf/scorecard-action +peaceiris/actions-gh-pages +peaceiris/actions-hugo +peter-evans/create-pull-request +peter-evans/repository-dispatch +pnpm/action-setup +pypa/gh-action-pypi-publish +returntocorp/semgrep-action +r-lib/actions +ruby/setup-ruby +rustsec/audit-check +shivammathur/setup-php +snyk/actions +softprops/action-gh-release +SonarSource/sonarcloud-github-action +Swatinem/rust-cache +taiki-e/install-action +trufflesecurity/trufflehog +webfactory/ssh-agent diff --git a/scripts/ci-health/detect.sh b/scripts/ci-health/detect.sh new file mode 100755 index 00000000..d359eb23 --- /dev/null +++ b/scripts/ci-health/detect.sh @@ -0,0 +1,55 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: MPL-2.0 +# SPDX-FileCopyrightText: 2026 Jonathan D.A. Jewell (hyperpolymath) +# Owner: Jonathan D.A. Jewell +# +# detect.sh — classify the infrastructure CI failure modes that repeatedly +# redden estate CI (diagnosed 2026-06-13). API-only; safe to run in CI with +# no local checkout. Emits TSV: <repo>\t<CLASS>\t<SEV>\t<detail> +# +# Classes: +# A-BILLING account Actions spending-limit/payment wall (OWNER-ONLY fix) +# B-ALLOWLIST allowed_actions=selected + no hyperpolymath/* → reusables/ +# non-verified actions startup_failure (auto-remediable) +# B-STARTUPFAIL observed startup_failure runs (symptom; check allow-list) +# D-BURN workflow(s) on bare [push,pull_request] = 2x runs/PR +# (auto-remediable: scope push + concurrency-cancel) +# +# The API NEVER exposes the startup_failure reason; the GitHub web-UI red +# banner does (it names the blocked action). That is the human diagnostic.
+set -euo pipefail
+O="${OWNER:-hyperpolymath}"; R="$1"   # R = repo name (required positional argument)
+emit(){ printf '%s\t%s\t%s\t%s\n' "$R" "$1" "$2" "$3"; }   # one TSV row: repo, class, severity, detail
+
+# --- A: billing wall — samples only the newest failed run, its first job and that job's first annotation
+fail_id=$(gh api "repos/$O/$R/actions/runs?status=failure&per_page=5" --jq '.workflow_runs[0].id // empty' 2>/dev/null || true)
+if [ -n "${fail_id:-}" ]; then
+  job_id=$(gh api "repos/$O/$R/actions/runs/$fail_id/jobs" --jq '.jobs[0].id // empty' 2>/dev/null || true)
+  if [ -n "${job_id:-}" ]; then
+    msg=$(gh api "repos/$O/$R/check-runs/$job_id/annotations" --jq '.[0].message // empty' 2>/dev/null || true)
+    if grep -qiE 'payments have failed|spending limit' <<<"$msg"; then
+      emit A-BILLING CRITICAL "Actions billing/spending-limit wall blocks all billable jobs → OWNER: GitHub Settings -> Billing & plans"
+    fi
+  fi
+fi
+
+# --- B: allow-list misconfig (the root cause of estate startup_failure); a failed read is UNKNOWN, never MISSING, so an API error cannot trigger a PUT
+aa=$(gh api "repos/$O/$R/actions/permissions" --jq '.allowed_actions // empty' 2>/dev/null || true)
+if [ "$aa" = "selected" ]; then
+  has=$(gh api "repos/$O/$R/actions/permissions/selected-actions" --jq '(.patterns_allowed // [])|index("hyperpolymath/*") // "MISSING"' 2>/dev/null || echo UNKNOWN)
+  if [ "$has" = "MISSING" ]; then
+    emit B-ALLOWLIST HIGH "selected + no hyperpolymath/* pattern → reusables & non-verified actions startup_failure → apply curated superset"
+  fi
+fi
+
+# --- B: observed startup_failure runs (symptom) — counted over the 30 most recent runs
+sf=$(gh api "repos/$O/$R/actions/runs?per_page=30" --jq '[.workflow_runs[]|select(.conclusion=="startup_failure")]|length' 2>/dev/null || echo 0)
+[ "${sf:-0}" -gt 0 ] && emit B-STARTUPFAIL HIGH "$sf recent startup_failure run(s) → web-UI banner names the blocked action; populate allow-list patterns"
+
+# --- D: burn anti-pattern (bare [push, pull_request] double-trigger), via API; grep drains its input (no -q) so it cannot SIGPIPE base64 under pipefail
+for path in $(gh api "repos/$O/$R/contents/.github/workflows" --jq '.[]?|select(.name|test("\\.ya?ml$"))|.path' 2>/dev/null || true); do
+  if gh api "repos/$O/$R/contents/$path" --jq '.content' 2>/dev/null | base64 -d 2>/dev/null \
+     | grep -E '^on:[[:space:]]*\[[[:space:]]*push[[:space:]]*,[[:space:]]*pull_request[[:space:]]*\]' >/dev/null; then
+    emit D-BURN MEDIUM "$path on bare [push,pull_request] (2x runs/PR) → scope push to default branch + concurrency-cancel"
+  fi
+done
diff --git a/scripts/ci-health/remediate.sh b/scripts/ci-health/remediate.sh
new file mode 100755
index 00000000..4e5c3afe
--- /dev/null
+++ b/scripts/ci-health/remediate.sh
@@ -0,0 +1,74 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: MPL-2.0
+# SPDX-FileCopyrightText: 2026 Jonathan D.A. Jewell (hyperpolymath)
+# Owner: Jonathan D.A. Jewell
+#
+# remediate.sh — apply the SAFE auto-remediations for the CI failure classes.
+#   B-ALLOWLIST -> PUT curated allow-list superset (hyperpolymath/* + pinned
+#                  third-party), keep github-owned/verified/sha-pinning.
+#   D-BURN      -> open an idempotent, signed burn-cut PR (scope push to the
+#                  default branch + add concurrency-cancel) via the API.
+#   A-BILLING   -> NEVER auto-fixed (account-level, owner-only); the driver
+#                  aggregates these into the tracking issue.
+#
+# Guardrails (lessons from the 2026-06-13 sweep):
+#   * own repos only: skip forks + archived.
+#   * DENYLIST: skip ARR-special / cross-owner repos (e.g. 007).
+#   * idempotent: skip if the allow-list already has hyperpolymath/* or the
+#     burn-cut branch/PR already exists.
+#   * dry-run honoured.
+# Usage: remediate.sh <repo> <CLASS> [dry_run]   (dry_run defaults to true)
+set -euo pipefail
+O="${OWNER:-hyperpolymath}"; R="$1"; CLASS="$2"; DRY="${3:-true}"
+BR="ci/ci-health-auto-remediation"
+HERE="$(cd "$(dirname "$0")" && pwd)"
+DENYLIST="${CI_HEALTH_DENYLIST:-007}" # space-separated repo names to never touch
+
+for d in $DENYLIST; do [ "$R" = "$d" ] && { echo "SKIP $R/$CLASS denylisted"; exit 0; }; done
+meta=$(gh api "repos/$O/$R" --jq '"\(.fork) \(.archived)"' 2>/dev/null) || { echo "SKIP $R/$CLASS repo-unreadable"; exit 0; }  # fail closed: never remediate a repo whose metadata cannot be read
+read -r isfork isarch <<<"$meta"
+{ [ "$isfork" = "true" ] || [ "$isarch" = "true" ]; } && { echo "SKIP $R/$CLASS fork-or-archived"; exit 0; }
+
+case "$CLASS" in
+  B-ALLOWLIST)
+    # Build body: MERGE the repo's current patterns with hyperpolymath/* + each
+    # superset action as owner/repo@*. The PUT replaces the whole list, so the
+    # existing entries must be sent back. A failed read aborts before any write.
+    body=$(gh api "repos/$O/$R/actions/permissions/selected-actions" | python3 -c 'import json,sys
+old=json.load(sys.stdin).get("patterns_allowed") or []
+new=["hyperpolymath/*"]+[l.strip()+"@*" for l in open(sys.argv[1]) if l.strip()]
+print(json.dumps({"github_owned_allowed":True,"verified_allowed":True,"patterns_allowed":old+[p for p in new if p not in old]}))' "$HERE/action-superset.txt")
+    if [ "$DRY" = "true" ]; then echo "DRYRUN $R/B-ALLOWLIST would PUT $(printf '%s' "$body"|python3 -c 'import json,sys;print(len(json.load(sys.stdin)["patterns_allowed"]))') patterns"; exit 0; fi
+    printf '%s' "$body" | gh api -X PUT "repos/$O/$R/actions/permissions/selected-actions" --input - >/dev/null
+    n=$(gh api "repos/$O/$R/actions/permissions/selected-actions" --jq '.patterns_allowed|length')
+    echo "FIXED $R/B-ALLOWLIST -> $n patterns (sha-pinning unchanged)"
+    ;;
+  D-BURN)
+    if gh api "repos/$O/$R/branches/$BR" --jq '.name' >/dev/null 2>&1; then echo "SKIP $R/D-BURN branch-exists"; exit 0; fi
+    def=$(gh api "repos/$O/$R" --jq '.default_branch'); sha=$(gh api "repos/$O/$R/git/ref/heads/$def" --jq '.object.sha')
+    targets=(); for p in $(gh api "repos/$O/$R/contents/.github/workflows" --jq '.[]?|select(.name|test("\\.ya?ml$"))|.path' 2>/dev/null); do
+      gh api "repos/$O/$R/contents/$p?ref=$def" --jq '.content' 2>/dev/null | base64 -d 2>/dev/null \
+        | grep -E '^on:[[:space:]]*\[[[:space:]]*push[[:space:]]*,[[:space:]]*pull_request[[:space:]]*\][[:space:]]*$' >/dev/null && targets+=("$p")
+    done  # grep is as strict as the Python pattern below, so every target really gets rewritten (no no-op commits)
+    [ "${#targets[@]}" -eq 0 ] && { echo "SKIP $R/D-BURN no-targets"; exit 0; }
+    if [ "$DRY" = "true" ]; then echo "DRYRUN $R/D-BURN would patch ${#targets[@]} file(s) + open PR"; exit 0; fi
+    gh api -X POST "repos/$O/$R/git/refs" -f ref="refs/heads/$BR" -f sha="$sha" >/dev/null
+    for p in "${targets[@]}"; do
+      cur=$(gh api "repos/$O/$R/contents/$p?ref=$BR")
+      # Scope push to the repo's real default branch ($def, JSON-quoted); the replacement is a callable so re.sub applies no escape processing.
+      newc=$(printf '%s' "$cur" | DEF="$def" python3 -c 'import json,os,sys,re,base64
+d=json.load(sys.stdin); s=base64.b64decode(d["content"]).decode("utf-8")
+pat=re.compile(r"(?m)^on:[ \t]*\[[ \t]*push[ \t]*,[ \t]*pull_request[ \t]*\][ \t]*$")
+blk="on:\n  push:\n    branches: ["+json.dumps(os.environ["DEF"])+"]\n  pull_request:\n"
+if not re.search(r"(?m)^concurrency:",s):
+    blk+="\n# Estate guardrail: scope push to default branches (PR fires once, not\n# push+PR) and cancel superseded runs. Safe — read-only PR check.\nconcurrency:\n  group: ${{ github.workflow }}-${{ github.ref }}\n  cancel-in-progress: true\n"
+print(base64.b64encode(pat.sub(lambda m: blk.rstrip(chr(10)),s,count=1).encode()).decode())')
+      csha=$(printf '%s' "$cur" | python3 -c 'import json,sys;print(json.load(sys.stdin)["sha"])')
+      gh api -X PUT "repos/$O/$R/contents/$p" -f message="ci: cut Actions burn in $p (scope push + concurrency-cancel)" -f content="$newc" -f sha="$csha" -f branch="$BR" >/dev/null
+    done
+    url=$(gh api "repos/$O/$R/pulls" -X POST -f title="ci: cut Actions burn — scope push triggers + concurrency-cancel" \
+      -f head="$BR" -f base="$def" -f body="Automated by hypatia ci-health-sweep. Scopes \`push\` to the default branch (kills push+PR double-runs) and adds \`concurrency: cancel-in-progress\` to read-only PR checks. No SPDX/logic changes." --jq '.html_url')
+    echo "FIXED $R/D-BURN -> $url (${#targets[@]} file(s))"
+    ;;
+  *) echo "SKIP $R/$CLASS no-auto-remediation" ;;
+esac
diff --git a/scripts/ci-health/sweep.sh b/scripts/ci-health/sweep.sh
new file mode 100755
index 00000000..f894fefc
--- /dev/null
+++ b/scripts/ci-health/sweep.sh
@@ -0,0 +1,81 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: MPL-2.0
+# SPDX-FileCopyrightText: 2026 Jonathan D.A. Jewell (hyperpolymath)
+# Owner: Jonathan D.A. Jewell
+#
+# sweep.sh — estate driver: detect + auto-remediate (B,D) + report (A).
+# Enumerates the owner's own (non-fork, non-archived) repos, classifies each
+# with detect.sh, applies remediate.sh for the safe classes (unless dry-run),
+# and upserts a single rolling tracking issue with the findings.
+#
+# Env: OWNER (default hyperpolymath), DRY_RUN (true|false), MAX_BURN_PRS
+# (default 15), ISSUE_REPO (where the tracking issue lives, default hypatia),
+# CI_HEALTH_DENYLIST (passed through to remediate.sh).
+set -euo pipefail
+O="${OWNER:-hyperpolymath}"; DRY="${DRY_RUN:-true}"; MAXPR="${MAX_BURN_PRS:-15}"
+IREPO="${ISSUE_REPO:-hypatia}"; HERE="$(cd "$(dirname "$0")" && pwd)"
+TITLE="🩺 CI-health: estate failure-class report"
+findings=$(mktemp); rep=$(mktemp); burned=0
+trap 'rm -f "$findings" "$rep"' EXIT   # scratch files never outlive the run
+declare -A burn_seen=()                # repos already handed to remediate.sh for D-BURN
+
+echo "::group::Enumerate owner repos (own, non-archived)"
+mapfile -t REPOS < <(gh repo list "$O" --source --no-archived --limit 1000 --json name --jq '.[].name' | sort)
+echo "repos to scan: ${#REPOS[@]} (dry_run=$DRY)"
+echo "::endgroup::"
+# An empty list means enumeration itself failed (token, rate limit, bad owner).
+# Abort instead of overwriting the tracking issue with a false all-clear.
+if [ "${#REPOS[@]}" -eq 0 ]; then echo "::error::no repos enumerated for $O — aborting sweep"; exit 1; fi
+
+for r in "${REPOS[@]}"; do
+  OWNER="$O" "$HERE/detect.sh" "$r" 2>/dev/null >>"$findings" || true
+done
+
+# Remediate
+while IFS=$'\t' read -r repo cls sev det; do
+  case "$cls" in
+    B-ALLOWLIST) OWNER="$O" "$HERE/remediate.sh" "$repo" "$cls" "$DRY" || true ;;
+    D-BURN)
+      # detect.sh emits one D-BURN row per workflow file, but remediate.sh patches
+      # all of a repo's matching files in one PR — hand each repo over only once.
+      if [ -n "${burn_seen[$repo]:-}" ]; then continue; fi
+      burn_seen[$repo]=1
+      if [ "$burned" -lt "$MAXPR" ]; then
+        out=$(OWNER="$O" "$HERE/remediate.sh" "$repo" "$cls" "$DRY" || true); echo "$out"
+        if grep -q '^FIXED' <<<"$out"; then burned=$((burned+1)); fi
+      else echo "CAP $repo/D-BURN max-burn-prs($MAXPR) reached — deferred"; fi ;;
+  esac
+done < <(sort -u "$findings")
+
+# Build report
+{
+  echo "## $TITLE"
+  echo "_Generated $(date -u +%Y-%m-%dT%H:%MZ) · owner: $O · dry_run: $DRY · scanned ${#REPOS[@]} repos_"
+  echo ""
+  echo "### 🔴 A-BILLING — OWNER action required (account spending-limit/payment wall)"
+  grep -P '\tA-BILLING\t' "$findings" | awk -F'\t' '{print "- **"$1"** — "$4}' || true
+  grep -qP '\tA-BILLING\t' "$findings" || echo "- _none_"
+  echo ""
+  echo "### 🟠 B — allow-list / startup_failure"
+  grep -P '\tB-(ALLOWLIST|STARTUPFAIL)\t' "$findings" | awk -F'\t' '{print "- "$1" ("$2"): "$4}' || true
+  grep -qP '\tB-' "$findings" || echo "- _none_"
+  echo ""
+  echo "### 🟡 D-BURN — push/PR double-trigger"
+  grep -P '\tD-BURN\t' "$findings" | awk -F'\t' '{print "- "$1": "$4}' || true
+  grep -qP '\tD-BURN\t' "$findings" || echo "- _none_"
+  echo ""
+  # The footer states what this run actually did; a dry run changes nothing.
+  if [ "$DRY" = "true" ]; then
+    echo "> Dry run: nothing was changed. A live run applies B-ALLOWLIST in place and opens D-BURN PRs (cap $MAXPR/run); A-BILLING is owner-only. See \`scripts/ci-health/README.adoc\`."
+  else
+    echo "> Auto-remediation: B-ALLOWLIST applied in place; D-BURN opened as PRs (cap $MAXPR/run); A-BILLING is owner-only. See \`scripts/ci-health/README.adoc\`."
+  fi
+} >"$rep"
+if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then cat "$rep" >>"$GITHUB_STEP_SUMMARY"; fi
+
+# Upsert the rolling tracking issue. The search is fuzzy, so keep only an exact
+# title match — otherwise an unrelated issue could have its body overwritten.
+num=$(gh issue list --repo "$O/$IREPO" --state open --search "$TITLE in:title" --json number,title \
+  --jq "[.[] | select(.title == \"$TITLE\")][0].number // empty" 2>/dev/null || true)
+if [ -n "${num:-}" ]; then gh issue edit "$num" --repo "$O/$IREPO" --body-file "$rep" >/dev/null && echo "updated issue #$num"
+else gh issue create --repo "$O/$IREPO" --title "$TITLE" --body-file "$rep" >/dev/null && echo "opened tracking issue"; fi