diff --git a/.github/workflows/otlp-data-quality-validator.lock.yml b/.github/workflows/otlp-data-quality-validator.lock.yml
index 6d4286d5a00..46b54f639e5 100644
--- a/.github/workflows/otlp-data-quality-validator.lock.yml
+++ b/.github/workflows/otlp-data-quality-validator.lock.yml
@@ -1,4 +1,4 @@
-# gh-aw-metadata: {"schema_version":"v3","frontmatter_hash":"17dcabe392f10a701b05312a2a2a544024a389a44bbf590159964c1892c52074","strict":true,"agent_id":"copilot"}
+# gh-aw-metadata: {"schema_version":"v3","frontmatter_hash":"759e69cd162496de334aa3b7220316b6485908c3c63e4436e2a2963728bf6146","strict":true,"agent_id":"copilot"}
# gh-aw-manifest: {"version":1,"secrets":["COPILOT_GITHUB_TOKEN","GH_AW_GITHUB_MCP_SERVER_TOKEN","GH_AW_GITHUB_TOKEN","GH_AW_OTEL_GRAFANA_AUTHORIZATION","GH_AW_OTEL_GRAFANA_ENDPOINT","GH_AW_OTEL_SENTRY_AUTHORIZATION","GH_AW_OTEL_SENTRY_ENDPOINT","GITHUB_TOKEN"],"actions":[{"repo":"actions/checkout","sha":"de0fac2e4500dabe0009e67214ff5f5447ce83dd","version":"v6.0.2"},{"repo":"actions/download-artifact","sha":"3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c","version":"v8.0.1"},{"repo":"actions/github-script","sha":"3a2844b7e9c422d3c10d287c895573f7108da1b3","version":"v9.0.0"},{"repo":"actions/setup-node","sha":"48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e","version":"v6.4.0"},{"repo":"actions/upload-artifact","sha":"043fb46d1a93c77aae656e7c1c64a875d1fc6a0a","version":"v7.0.1"}],"containers":[{"image":"ghcr.io/github/gh-aw-firewall/agent:0.25.49"},{"image":"ghcr.io/github/gh-aw-firewall/api-proxy:0.25.49"},{"image":"ghcr.io/github/gh-aw-firewall/cli-proxy:0.25.49"},{"image":"ghcr.io/github/gh-aw-firewall/squid:0.25.49"},{"image":"ghcr.io/github/gh-aw-mcpg:v0.3.9","digest":"sha256:64828b42a4482f58fab16509d7f8f495a6d97c972a98a68aff20543531ac0388","pinned_image":"ghcr.io/github/gh-aw-mcpg:v0.3.9@sha256:64828b42a4482f58fab16509d7f8f495a6d97c972a98a68aff20543531ac0388"},{"image":"ghcr.io/github/github-mcp-server:v1.0.4"},{"image":"node:lts-alpine","digest":"sha256:d1b3b4da11eefd5941e7f0b9cf17783fc99d9c6fc34884a665f40a06dbdfc94f","pinned_image":"node:lts-alpine@sha256:d1b3b4da11eefd5941e7f0b9cf17783fc99d9c6fc34884a665f40a06dbdfc94f"}]}
# ___ _ _
# / _ \ | | (_)
@@ -22,7 +22,7 @@
#
# For more information: https://github.github.com/gh-aw/introduction/overview/
#
-# Validates OTLP trace, metric, and log data quality across app emission, Collector processing, and backend visibility
+# Validates gh-aw OTLP trace data quality across local JSONL mirror, direct vendor export, and backend visibility
#
# Resolved workflow manifest:
# Imports:
@@ -202,20 +202,20 @@ jobs:
run: |
bash "${RUNNER_TEMP}/gh-aw/actions/create_prompt_first.sh"
{
- cat << 'GH_AW_PROMPT_7de7fa5e3739b47b_EOF'
+ cat << 'GH_AW_PROMPT_bc29e1568146c495_EOF'
- GH_AW_PROMPT_7de7fa5e3739b47b_EOF
+ GH_AW_PROMPT_bc29e1568146c495_EOF
cat "${RUNNER_TEMP}/gh-aw/prompts/xpia.md"
cat "${RUNNER_TEMP}/gh-aw/prompts/temp_folder_prompt.md"
cat "${RUNNER_TEMP}/gh-aw/prompts/markdown.md"
cat "${RUNNER_TEMP}/gh-aw/prompts/safe_outputs_prompt.md"
- cat << 'GH_AW_PROMPT_7de7fa5e3739b47b_EOF'
+ cat << 'GH_AW_PROMPT_bc29e1568146c495_EOF'
Tools: create_issue, missing_tool, missing_data, noop
- GH_AW_PROMPT_7de7fa5e3739b47b_EOF
+ GH_AW_PROMPT_bc29e1568146c495_EOF
cat "${RUNNER_TEMP}/gh-aw/prompts/mcp_cli_tools_prompt.md"
- cat << 'GH_AW_PROMPT_7de7fa5e3739b47b_EOF'
+ cat << 'GH_AW_PROMPT_bc29e1568146c495_EOF'
The following GitHub context information is available for this workflow:
{{#if github.actor}}
@@ -244,14 +244,14 @@ jobs:
{{/if}}
- GH_AW_PROMPT_7de7fa5e3739b47b_EOF
+ GH_AW_PROMPT_bc29e1568146c495_EOF
cat "${RUNNER_TEMP}/gh-aw/prompts/cli_proxy_with_safeoutputs_prompt.md"
- cat << 'GH_AW_PROMPT_7de7fa5e3739b47b_EOF'
+ cat << 'GH_AW_PROMPT_bc29e1568146c495_EOF'
{{#runtime-import .github/workflows/shared/otlp.md}}
{{#runtime-import .github/workflows/shared/otel-queries.md}}
{{#runtime-import .github/workflows/otlp-data-quality-validator.md}}
- GH_AW_PROMPT_7de7fa5e3739b47b_EOF
+ GH_AW_PROMPT_bc29e1568146c495_EOF
} > "$GH_AW_PROMPT"
- name: Interpolate variables and render templates
uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
@@ -466,9 +466,9 @@ jobs:
mkdir -p "${RUNNER_TEMP}/gh-aw/safeoutputs"
mkdir -p /tmp/gh-aw/safeoutputs
mkdir -p /tmp/gh-aw/mcp-logs/safeoutputs
- cat > "${RUNNER_TEMP}/gh-aw/safeoutputs/config.json" << 'GH_AW_SAFE_OUTPUTS_CONFIG_4c77f8b71cbb283e_EOF'
+ cat > "${RUNNER_TEMP}/gh-aw/safeoutputs/config.json" << 'GH_AW_SAFE_OUTPUTS_CONFIG_53860d35baa3701f_EOF'
{"create_issue":{"close_older_issues":true,"expires":168,"labels":["observability","telemetry","report"],"max":1,"title_prefix":"[OTLP Validation] "},"create_report_incomplete_issue":{},"max_bot_mentions":1,"mentions":{"enabled":false},"missing_data":{},"missing_tool":{},"noop":{"max":1,"report-as-issue":"true"},"report_incomplete":{}}
- GH_AW_SAFE_OUTPUTS_CONFIG_4c77f8b71cbb283e_EOF
+ GH_AW_SAFE_OUTPUTS_CONFIG_53860d35baa3701f_EOF
- name: Generate Safe Outputs Tools
env:
GH_AW_TOOLS_META_JSON: |
@@ -673,7 +673,7 @@ jobs:
mkdir -p /home/runner/.copilot
GH_AW_NODE=$(which node 2>/dev/null || command -v node 2>/dev/null || echo node)
- cat << GH_AW_MCP_CONFIG_14247717b4285c48_EOF | "$GH_AW_NODE" "${RUNNER_TEMP}/gh-aw/actions/start_mcp_gateway.cjs"
+ cat << GH_AW_MCP_CONFIG_2c6df0af9284b001_EOF | "$GH_AW_NODE" "${RUNNER_TEMP}/gh-aw/actions/start_mcp_gateway.cjs"
{
"mcpServers": {
"safeoutputs": {
@@ -703,7 +703,7 @@ jobs:
}
}
}
- GH_AW_MCP_CONFIG_14247717b4285c48_EOF
+ GH_AW_MCP_CONFIG_2c6df0af9284b001_EOF
- name: Mount MCP servers as CLIs
id: mount-mcp-clis
continue-on-error: true
@@ -1214,7 +1214,7 @@ jobs:
uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
env:
WORKFLOW_NAME: "OTLP Data Quality Validator"
- WORKFLOW_DESCRIPTION: "Validates OTLP trace, metric, and log data quality across app emission, Collector processing, and backend visibility"
+ WORKFLOW_DESCRIPTION: "Validates gh-aw OTLP trace data quality across local JSONL mirror, direct vendor export, and backend visibility"
HAS_PATCH: ${{ needs.agent.outputs.has_patch }}
with:
script: |
diff --git a/.github/workflows/otlp-data-quality-validator.md b/.github/workflows/otlp-data-quality-validator.md
index e910b595237..aba848a7090 100644
--- a/.github/workflows/otlp-data-quality-validator.md
+++ b/.github/workflows/otlp-data-quality-validator.md
@@ -1,7 +1,7 @@
---
emoji: "🧭"
name: OTLP Data Quality Validator
-description: Validates OTLP trace, metric, and log data quality across app emission, Collector processing, and backend visibility
+description: Validates gh-aw OTLP trace data quality across local JSONL mirror, direct vendor export, and backend visibility
on:
schedule: daily on weekdays
workflow_dispatch:
@@ -35,30 +35,30 @@ imports:
# OTLP Data Quality Validator
-You are an OpenTelemetry/OTLP data quality validation agent.
+You are an OpenTelemetry/OTLP data quality validation agent for GitHub Agentic Workflows (`gh-aw`).
-Your goal is to determine whether telemetry data is complete, deduplicated, correctly shaped, and reliably flowing from source applications through the Collector to the observability backend.
+Your goal is to determine whether gh-aw trace data is complete, deduplicated, correctly shaped, and reliably flowing from the workflow runtime to configured OTLP vendor endpoints.
-Signal scope:
-- traces
-- metrics
-- logs
+## Architecture
-Pipeline scope:
-- SDK/app emission
-- Collector receiver
-- Collector processors
-- Collector exporters
-- backend ingestion and query-visible layer
+gh-aw emits **traces only** (no metrics or logs). It sends OTLP spans **directly to vendor endpoints** — there is no OpenTelemetry Collector in the pipeline.
+
+```text
+gh-aw workflow runtime (actions/setup/js/send_otlp_span.cjs)
+ → local JSONL mirror (/tmp/gh-aw/otel.jsonl)
+ → OTLP/HTTP POST to vendor endpoints (concurrent fan-out)
+ → vendor backends (Sentry, Grafana Tempo, Datadog, etc.)
+```
+
+Normative specification: `specs/otel-observability-spec.md`
Use the cheapest trustworthy source first:
-1. local files/artifacts and mirrors (for example `/tmp/gh-aw/otel.jsonl`)
-2. Collector/internal telemetry artifacts
-3. backend queries
+1. local JSONL mirror (`/tmp/gh-aw/otel.jsonl`) and export error logs (`/tmp/gh-aw/otlp-export-errors.jsonl`)
+2. backend queries via MCP tools (when available)
Always distinguish:
-- emitted vs ingested vs query-visible
-- true loss vs expected sampling or visibility delay
+- emitted (in JSONL mirror) vs exported (HTTP response) vs query-visible (backend)
+- true loss vs expected visibility delay
- suspected cause vs proven cause
If required evidence is unavailable, continue and mark confidence/uncertainty explicitly.
@@ -69,156 +69,150 @@ If required evidence is unavailable, continue and mark confidence/uncertainty ex
Define and report:
- validation time window (start/end)
-- expected services, environments, namespaces, and signal types
-
-When synthetic fields exist, prefer exact matching using:
-- `validation.run_id`
-- `validation.sequence_id`
-- `validation.expected_count`
+- expected `service.name` values (format: `gh-aw.<suffix>`)
+- expected job names and span operations (setup, conclusion, agent)
-If synthetic fields do not exist, infer expectations from:
-- source-side counters
-- Collector receiver counts
-- backend ingestion/query counts
+Infer expectations from:
+- local JSONL mirror span count
+- `github.run_id` from resource attributes
+- export error count from `/tmp/gh-aw/otlp-export-errors.count`
### Step 2: Validate trace completeness and integrity
-Compute and report:
-- unique `trace_id` count
-- unique span identity count using `trace_id + span_id`
-- duplicate spans with same `trace_id + span_id`
+From the local JSONL mirror (`/tmp/gh-aw/otel.jsonl`), compute and report:
+- unique `traceId` count (expect 1 per workflow run)
+- unique span identity count using `traceId + spanId`
+- duplicate spans with same `traceId + spanId`
-When expected per-trace span counts exist, compare expected vs observed.
-
-Validate structure:
-- every non-root span must reference an existing `parent_span_id` in the same trace
-- root spans must not have `parent_span_id`
+Validate the expected span hierarchy per the spec (§9.3):
+- all setup spans share a single global `parentSpanId`
+- each conclusion span parents under its job's setup span
+- agent spans parent under the conclusion span
+- root setup parent has no parent
Validate required fields per span:
-- `trace_id`
-- `span_id`
-- `name`
-- `kind`
-- `start_time`
-- `end_time`
-- `service.name`
-- resource attributes
+- `traceId` (32-char hex)
+- `spanId` (16-char hex)
+- `name` (must match pattern `gh-aw.<job>.<operation>`)
+- `kind` (INTERNAL=1 for setup/conclusion, CLIENT=3 for agent)
+- `startTimeUnixNano`
+- `endTimeUnixNano`
Flag timestamp issues:
- `start_time > end_time`
- far-future timestamps
- timestamps far outside the validation window
-### Step 3: Validate metric completeness and quality
-
-Report:
-- observed metric names
-- diff between observed names and expected metric inventory
-
-Count metric points by:
-- metric name
-- resource identity
-- scope/instrumentation library
-- datapoint attributes
-- timestamp
-
-Detect duplicate datapoints using:
-`resource identity + scope + metric name + datapoint attributes + timestamp`
-
-Validate temporality:
-- cumulative counters should not reset unexpectedly
-- delta counters must not be interpreted as cumulative
-
-Flag suspicious behavior:
-- missing datapoints
-- counter decreases without reset evidence
-- unexpected zero values
-- cardinality spikes
-- missing required dimensions
-
-### Step 4: Validate log completeness and correlation
-
-Report total log records in the validation window.
-
-Detect duplicates using stable fingerprint:
-`timestamp + observed timestamp + body hash + severity + trace_id + span_id + resource identity`
-
-If `validation.sequence_id` exists:
-- identify missing sequence IDs
-- identify duplicate sequence IDs
-
-Validate required fields:
-- `timestamp`
-- `body`
-- `severity` or `severity_text`
-- `service.name`
-- resource attributes
-
-Check trace correlation:
-- logs emitted inside traces should contain both `trace_id` and `span_id`
-
-### Step 5: Check Collector health
-
-Inspect and report Collector internal telemetry. Use actual metric names when version-specific names differ.
-
-Cover:
-- accepted records by receiver
-- refused records by receiver
-- dropped records by processor
-- sent records by exporter
-- failed sends by exporter
-- retry counts
-- queue size/capacity
-- memory limiter drops
-- batch behavior
-- timeout/rate-limit exporter errors
-
-Pay special attention to metrics such as:
-- `otelcol_receiver_accepted_spans`
-- `otelcol_receiver_refused_spans`
-- `otelcol_processor_dropped_spans`
-- `otelcol_exporter_sent_spans`
-- `otelcol_exporter_send_failed_spans`
-- `otelcol_receiver_accepted_metric_points`
-- `otelcol_processor_dropped_metric_points`
-- `otelcol_exporter_sent_metric_points`
-- `otelcol_receiver_accepted_log_records`
-- `otelcol_processor_dropped_log_records`
-- `otelcol_exporter_sent_log_records`
-
-### Step 6: Reconcile pipeline stages
-
-For traces, metrics, and logs independently, reconcile:
-
-app emitted
-โ Collector received
-โ Collector processed
-โ Collector exported
-โ backend ingested
-โ backend query-visible
-
-For each mismatch, identify the most likely stage of loss, duplication, or transformation.
-
-Do not claim data loss unless cross-stage evidence supports it.
+```bash
+# Example: Extract span summary from JSONL mirror
+jq -c '.resourceSpans[].scopeSpans[].spans[] | {name, traceId, spanId, parentSpanId, kind, status}' /tmp/gh-aw/otel.jsonl
+```
+
+### Step 3: Validate span attribute contract
+
+Check setup spans for required attributes (spec §10.1):
+- `gh-aw.job.name`
+- `gh-aw.workflow.name`
+- `gh-aw.run.id`
+- `gh-aw.run.attempt`
+- `gh-aw.run.actor`
+- `gh-aw.repository`
+- `gh-aw.staged`
+
+Check conclusion spans for required attributes (spec §10.2):
+- `gh-aw.run.status` (must be `success`, `failure`, `timeout`, or `cancelled`)
+- `gh-aw.error_count`
+- `gh-aw.warning_count`
+- `gh-aw.action_minutes`
+- `gh-aw.output.item_count`
+- `gh-aw.otlp.export_errors`
+
+Check agent spans for GenAI semantic conventions (spec §10.3):
+- `gen_ai.system`
+- `gen_ai.request.model`
+- `gen_ai.operation.name` (must be `"chat"`)
+- `gen_ai.usage.input_tokens`
+- `gen_ai.usage.output_tokens`
+
+```bash
+# Example: Check required attributes on setup spans
+jq -c '.resourceSpans[].scopeSpans[].spans[] | select(.name | endswith(".setup")) | {name, attrs: [.attributes[]? | {(.key): .value}] | add}' /tmp/gh-aw/otel.jsonl
+```
+
+### Step 4: Validate resource attributes
+
+Check all spans for required resource attributes (spec §11.1):
+- `service.name` (format: `gh-aw.<suffix>` or `gh-aw`)
+- `service.version`
+- `github.repository`
+- `github.run_id`
+- `github.run_attempt`
+- `github.actions.run_url`
+
+Check instrumentation scope:
+- `scope.name` must be `gh-aw`
+- `scope.version` should match `service.version`
+
+```bash
+# Example: Extract resource attributes
+jq -c '.resourceSpans[].resource.attributes[] | {(.key): .value}' /tmp/gh-aw/otel.jsonl | sort -u
+```
+
+### Step 5: Validate trace ID propagation
+
+Verify trace ID consistency across jobs (spec §12):
+- all spans in a single workflow run share the same `traceId`
+- setup spans across different jobs share the same global `parentSpanId`
+- the JSONL mirror `traceId` matches the value in `GITHUB_AW_OTEL_TRACE_ID`
+
+If export errors exist, check `/tmp/gh-aw/otlp-export-errors.jsonl`:
+- which endpoints failed
+- HTTP status codes
+- whether failures are transient (retryable) or permanent
+
+```bash
+# Example: Check trace ID consistency
+jq -r '.resourceSpans[].scopeSpans[].spans[].traceId' /tmp/gh-aw/otel.jsonl | sort -u | wc -l
+# Expected: 1 (single trace ID per run)
+
+# Example: Check export errors
+cat /tmp/gh-aw/otlp-export-errors.jsonl 2>/dev/null || echo "No export errors"
+cat /tmp/gh-aw/otlp-export-errors.count 2>/dev/null || echo "0"
+```
+
+### Step 6: Reconcile local mirror vs backend visibility
+
+For each configured OTLP endpoint, reconcile:
+
+```text
+local JSONL mirror (emitted)
+ → OTLP/HTTP export (sent)
+ → vendor backend (query-visible)
+```
+
+Check:
+- span count in JSONL mirror vs backend
+- whether all span names from the mirror appear in the backend
+- whether resource attributes survived backend ingestion
+- whether `trace_id` is searchable in the backend
+
+For multi-endpoint fan-out, validate each endpoint independently. Failure on one endpoint SHOULD NOT affect others.
+
+Do not claim data loss unless cross-stage evidence supports it. Distinguish ingestion delay from actual loss.
### Step 7: Root-cause hypotheses
-Evaluate likely causes, including:
-- SDK not flushing on shutdown
-- sampling misconfiguration
-- duplicate exporters in app config
-- duplicate flow through both agent and gateway
-- multiple Collectors scraping same source
-- retry behavior causing duplicate ingestion
-- filelog receiver offset rereads
-- batch timeout/size effects
-- memory limiter drops
-- exporter queue overflow
-- backend rate limits
-- resource attribute mutation/overwrite
-- OTLP gRPC/HTTP protocol mismatch
-- wrong endpoint/path
-- metrics temporality mismatch
+Evaluate likely causes for any issues found, including:
+- OTLP endpoint misconfiguration (wrong URL, missing `/v1/traces` suffix)
+- authentication failures (expired API key, wrong header name)
+- Sentry header rewrite not applied (`Authorization` should become `x-sentry-auth`)
+- network allowlist missing vendor hostname
+- `if-missing: error` blocking gateway OTLP when secrets are unresolved
+- retry exhaustion (3 attempts with exponential backoff)
+- OTLP/HTTP JSON vs OTLP/HTTP protobuf mismatch
+- vendor rate limits or ingestion delays
+- span attribute redaction removing useful diagnostic data
+- proxy configuration interfering with `fetch`-based export
Rank hypotheses by evidence strength and include alternatives.
@@ -231,31 +225,32 @@ Create exactly one issue with these sections in order:
- main risks
- most likely root cause (if any)
-### B. Completeness results
-Per signal (traces/metrics/logs):
-- expected count
-- observed count
-- missing count
-- duplicate count
+### B. Trace completeness
+- expected span count (from JSONL mirror)
+- observed span count (in backend)
+- missing spans
+- duplicate spans
+- trace ID consistency (single trace per run)
- confidence level
-### C. Duplicate analysis
-- duplicate keys
-- affected services
-- affected windows
-- sample duplicate records
-
-### D. Schema and quality issues
-- missing fields
-- invalid timestamps
-- missing resource attributes
-- cardinality problems
-- trace/log correlation gaps
-
-### E. Pipeline health
-- Collector receiver/processor/exporter counters
-- dropped/refused/failed signals
-- queue/retry indicators
+### C. Span hierarchy validation
+- setup spans share global parent: pass/fail
+- conclusion spans parent under setup: pass/fail
+- agent spans parent under conclusion: pass/fail
+- span naming pattern `gh-aw.<job>.<operation>`: pass/fail
+
+### D. Attribute contract validation
+- setup span required attributes: present/missing list
+- conclusion span required attributes: present/missing list
+- agent span GenAI attributes: present/missing list
+- resource attributes: present/missing list
+- instrumentation scope: correct/incorrect
+
+### E. Export and fan-out health
+- per-endpoint export status (success/fail/partial)
+- export error count and details
+- JSONL mirror write status
+- multi-endpoint fan-out independence
### F. Root-cause hypothesis
- likely cause
@@ -263,18 +258,17 @@ Per signal (traces/metrics/logs):
- alternative explanations
### G. Recommended fixes (prioritized)
-1. stop data loss
-2. stop duplication
-3. fix schema/resource attributes
-4. improve observability and alerts
+1. fix data loss or export failures
+2. fix missing required attributes
+3. fix span hierarchy or naming issues
+4. improve diagnostic coverage
### H. Validation queries or commands
-Provide concrete queries/commands/pseudocode used.
+Provide concrete jq/bash commands used against the JSONL mirror and backend.
Rules:
- Never assume missing equals lost without cross-stage evidence.
- Always distinguish ingestion completeness from query visibility.
-- Treat sampled traces as intentionally incomplete only when sampling config is verified.
-- Do not flag legitimate metric resets as errors when reset metadata or restart evidence exists.
-- Prefer exact validation keyed by `validation.run_id` and `validation.sequence_id` when available.
+- Do not flag visibility delays under 5 minutes as data loss.
- Be explicit about uncertainty.
+- Reference the normative spec (`specs/otel-observability-spec.md`) section numbers when reporting violations.
diff --git a/.github/workflows/outcome-collector.md b/.github/workflows/outcome-collector.md
index 0a69a87a464..c33c757f2e0 100644
--- a/.github/workflows/outcome-collector.md
+++ b/.github/workflows/outcome-collector.md
@@ -86,44 +86,69 @@ Use h3 (`###`) or lower for all headers in your report. Never use h1 (`#`) or h2
Wrap long sections in `<details><summary>Section Name</summary>` tags to improve readability and reduce scrolling. Keep critical summaries and key metrics always visible.
Suggested structure:
-- Brief summary (always visible)
-- Key metrics or highlights (always visible)
-- Detailed analysis (in `` tags)
-- Recommendations (always visible)
+- Scorecard with economics metrics (always visible)
+- Actionable recommendations with specific next steps (always visible)
+- Per-workflow breakdown (in `<details>` tags)
+- Detailed per-run data (in `<details>` tags)
```markdown
-## Safe Output Outcomes โ {date}
+### Outcome Scorecard — {date}
-### Fleet Summary
+| Metric | Value | Status |
+|--------|-------|--------|
+| **Acceptance rate** | **{acceptance_rate}%** | 🟢 >80% / 🟡 60-80% / 🔴 <60% |
+| **Zero-touch rate** | **{zero_touch_rate}%** | 🟢 >50% / 🟡 25-50% / 🔴 <25% |
+| **Waste rate** | {waste_rate}% | 🟢 <10% / 🟡 10-25% / 🔴 >25% |
+| **Median time to resolution** | {median_resolution} | — |
+| Accepted | {accepted} / {total_outcomes} | — |
+| Rejected | {rejected} | — |
+| Zero-touch | {zero_touch} / {accepted} | — |
+| Pending | {pending} | — |
+| Runs checked | {runs_checked} | — |
-| Metric | Value |
-|--------|-------|
-| Runs checked | {runs_checked} |
-| Total outcomes | {total_outcomes} |
-| Accepted | {accepted} |
-| Rejected | {rejected} |
-| Ignored | {ignored} |
-| Pending | {pending} |
-| **Acceptance rate** | **{acceptance_rate}%** |
-| Waste rate | {waste_rate}% |
+### 🔴 Action Items
+
+List concrete actions the team should take based on the data:
+
+1. **Highest-waste workflows** — Name the top 2-3 workflows by waste rate. If waste rate >25%, recommend reviewing the prompt or safe-output configuration.
+2. **Stuck pending items** — List any items pending >48 hours. These need human review or the workflow needs a timeout.
+3. **Low zero-touch workflows** — Workflows where accepted items always need human edits indicate the agent's output quality needs improvement.
+4. **Negative reactions** — Items with negative reactions (👎, confused) signal user dissatisfaction even on "accepted" items.
### Per-Workflow Breakdown
-For each workflow with outcomes, show:
-- Workflow name
-- Outcomes: accepted / rejected / ignored
-- Acceptance rate
+For each workflow with outcomes, show a mini-scorecard:
+
+| Workflow | Accepted | Rejected | Pending | Acceptance | Zero-touch | Reactions 👍/👎 |
+|----------|----------|----------|---------|------------|------------|----------------|
+
+Sort by waste rate descending (worst first).
+
+### Reaction Summary
+
+If any items have reactions, summarize:
+- Items with positive reactions (👍 heart rocket hooray): these workflows are producing valued output
+- Items with negative reactions (👎 confused): these need prompt or quality improvements
+- Items with zero reactions: no signal yet
+
+### Trend Signal
-### Key Observations
+Compare today's acceptance rate and zero-touch rate against the previous report in cache-memory (if available). Flag:
+- ⬆️ Improving: acceptance rate up >5pp or zero-touch rate up >10pp
+- ⬇️ Regressing: acceptance rate down >5pp or waste rate up >5pp
+- ➡️ Stable: within 5pp of previous
-- Which workflows have the highest acceptance rate?
-- Which workflows have the highest waste rate?
-- Any workflows with all outcomes ignored (noise signal)?
+If no previous data exists, skip this section.
```
## Guidelines
- Keep the report factual — numbers only, no speculation
- Do not re-evaluate outcomes — use the pre-computed data
+- Sort workflows by waste rate descending so the worst performers are at the top
+- Flag any workflow with acceptance rate <60% as needing attention
+- Flag any item pending >48 hours
+- If reactions data is available, include it in the per-workflow breakdown
+- Save this report's key metrics to cache-memory for trend comparison in the next run
- If no outcomes exist, use `noop`
- Stop immediately after creating the issue
diff --git a/actions/setup/js/emit_outcome_spans.cjs b/actions/setup/js/emit_outcome_spans.cjs
index c857223e899..2d7eccaf95e 100644
--- a/actions/setup/js/emit_outcome_spans.cjs
+++ b/actions/setup/js/emit_outcome_spans.cjs
@@ -148,6 +148,11 @@ async function main() {
const changedFiles = typeof eval_.changed_files === "number" ? eval_.changed_files : null;
const additions = typeof eval_.additions === "number" ? eval_.additions : null;
const deletions = typeof eval_.deletions === "number" ? eval_.deletions : null;
+ const reactionsTotal = typeof eval_.reactions_total === "number" ? eval_.reactions_total : null;
+ const reactionsPositive = typeof eval_.reactions_positive === "number" ? eval_.reactions_positive : null;
+ const reactionsNegative = typeof eval_.reactions_negative === "number" ? eval_.reactions_negative : null;
+ const comments = typeof eval_.comments === "number" ? eval_.comments : null;
+ const zeroTouch = eval_.zero_touch === true;
const attributes = [
buildAttr("gh-aw.exporter.name", "outcome-collector"),
@@ -168,6 +173,11 @@ async function main() {
if (changedFiles !== null) attributes.push(buildAttr("gh-aw.outcome.changed_files", changedFiles));
if (additions !== null) attributes.push(buildAttr("gh-aw.outcome.additions", additions));
if (deletions !== null) attributes.push(buildAttr("gh-aw.outcome.deletions", deletions));
+ if (reactionsTotal !== null) attributes.push(buildAttr("gh-aw.outcome.reactions_total", reactionsTotal));
+ if (reactionsPositive !== null) attributes.push(buildAttr("gh-aw.outcome.reactions_positive", reactionsPositive));
+ if (reactionsNegative !== null) attributes.push(buildAttr("gh-aw.outcome.reactions_negative", reactionsNegative));
+ if (comments !== null) attributes.push(buildAttr("gh-aw.outcome.comments", comments));
+ if (zeroTouch) attributes.push(buildAttr("gh-aw.outcome.zero_touch", true));
// Map result to OTLP status: accepted=OK, rejected=ERROR, noop=UNSET, pending/ignored=UNSET
const statusCode = result === "rejected" ? 2 : result === "accepted" ? 1 : 0;
@@ -205,6 +215,8 @@ async function main() {
buildAttr("gh-aw.outcome.acceptance_rate", getSummaryNumber("acceptance_rate", 0)),
buildAttr("gh-aw.outcome.waste_rate", getSummaryNumber("waste_rate", 0)),
buildAttr("gh-aw.outcome.noop_rate", getSummaryNumber("noop_rate", 0)),
+ buildAttr("gh-aw.outcome.zero_touch_count", getSummaryNumber("zero_touch", 0)),
+ buildAttr("gh-aw.outcome.zero_touch_rate", getSummaryNumber("zero_touch_rate", 0)),
buildAttr("gh-aw.outcome.item_count", evaluations.length),
];
@@ -212,15 +224,20 @@ async function main() {
summaryAttributes.push(buildAttr("gh-aw.outcome.date", summary.date));
}
- // Median time-to-resolution for resolved items
- const resolutionTimes = evaluations
- .filter(e => typeof e.resolution_sec === "number" && e.resolution_sec > 0)
- .map(e => e.resolution_sec)
- .sort((a, b) => a - b);
- if (resolutionTimes.length > 0) {
- const mid = Math.floor(resolutionTimes.length / 2);
- const median = resolutionTimes.length % 2 !== 0 ? resolutionTimes[mid] : Math.round((resolutionTimes[mid - 1] + resolutionTimes[mid]) / 2);
- summaryAttributes.push(buildAttr("gh-aw.outcome.median_resolution_sec", median));
+ // Median time-to-resolution: prefer summary value, fall back to local computation
+ const summaryMedian = summary && typeof summary.median_resolution_sec === "number" ? summary.median_resolution_sec : null;
+ if (summaryMedian !== null) {
+ summaryAttributes.push(buildAttr("gh-aw.outcome.median_resolution_sec", summaryMedian));
+ } else {
+ const resolutionTimes = evaluations
+ .filter(e => typeof e.resolution_sec === "number" && e.resolution_sec > 0)
+ .map(e => e.resolution_sec)
+ .sort((a, b) => a - b);
+ if (resolutionTimes.length > 0) {
+ const mid = Math.floor(resolutionTimes.length / 2);
+ const median = resolutionTimes.length % 2 !== 0 ? resolutionTimes[mid] : Math.round((resolutionTimes[mid - 1] + resolutionTimes[mid]) / 2);
+ summaryAttributes.push(buildAttr("gh-aw.outcome.median_resolution_sec", median));
+ }
}
// Trigger type distribution
diff --git a/actions/setup/js/emit_outcome_spans.test.cjs b/actions/setup/js/emit_outcome_spans.test.cjs
index a590f91f948..f13bb36f64e 100644
--- a/actions/setup/js/emit_outcome_spans.test.cjs
+++ b/actions/setup/js/emit_outcome_spans.test.cjs
@@ -182,6 +182,11 @@ describe("emit_outcome_spans.cjs", () => {
rejected: 1,
ignored: 0,
pending: 0,
+ noop: 0,
+ noop_rate: 0,
+ zero_touch: 1,
+ zero_touch_rate: 1,
+ median_resolution_sec: 42,
acceptance_rate: 0.5,
waste_rate: 0.5,
date: "2026-05-13",
@@ -198,6 +203,15 @@ describe("emit_outcome_spans.cjs", () => {
url: "https://github.com/github/gh-aw/issues/1",
repo: "github/gh-aw",
timestamp: "2026-05-13T09:00:00Z",
+ review_comments: 0,
+ changed_files: 3,
+ additions: 10,
+ deletions: 2,
+ reactions_total: 5,
+ reactions_positive: 4,
+ reactions_negative: 1,
+ comments: 0,
+ zero_touch: true,
}),
JSON.stringify({
type: "comment",
@@ -263,10 +277,29 @@ describe("emit_outcome_spans.cjs", () => {
expect(summarySpan.attributes).toContainEqual({ key: "gh-aw.exporter.name", value: "outcome-collector" });
expect(summarySpan.attributes).toContainEqual({ key: "gh-aw.outcome.date", value: "2026-05-13" });
+ expect(summarySpan.attributes).toContainEqual({ key: "gh-aw.outcome.zero_touch_count", value: 1 });
expect(spans[1].attributes).toContainEqual({ key: "gh-aw.exporter.name", value: "outcome-collector" });
expect(spans[1].attributes).toContainEqual({ key: "gh-aw.outcome.url", value: "https://github.com/github/gh-aw/issues/1" });
expect(spans[1].attributes).toContainEqual({ key: "gh-aw.outcome.detail", value: "created item" });
expect(spans[1].attributes).toContainEqual({ key: "gh-aw.outcome.created_at", value: "2026-05-13T09:00:00Z" });
+ expect(spans[1].attributes).toContainEqual({ key: "gh-aw.outcome.review_comments", value: 0 });
+ expect(spans[1].attributes).toContainEqual({ key: "gh-aw.outcome.changed_files", value: 3 });
+ expect(spans[1].attributes).toContainEqual({ key: "gh-aw.outcome.additions", value: 10 });
+ expect(spans[1].attributes).toContainEqual({ key: "gh-aw.outcome.deletions", value: 2 });
+ expect(spans[1].attributes).toContainEqual({ key: "gh-aw.outcome.reactions_total", value: 5 });
+ expect(spans[1].attributes).toContainEqual({ key: "gh-aw.outcome.reactions_positive", value: 4 });
+ expect(spans[1].attributes).toContainEqual({ key: "gh-aw.outcome.reactions_negative", value: 1 });
+ expect(spans[1].attributes).toContainEqual({ key: "gh-aw.outcome.comments", value: 0 });
+ expect(spans[1].attributes).toContainEqual({ key: "gh-aw.outcome.zero_touch", value: true });
+ expect(spans[2].attributes.find(attr => attr.key === "gh-aw.outcome.review_comments")).toBeUndefined();
+ expect(spans[2].attributes.find(attr => attr.key === "gh-aw.outcome.changed_files")).toBeUndefined();
+ expect(spans[2].attributes.find(attr => attr.key === "gh-aw.outcome.additions")).toBeUndefined();
+ expect(spans[2].attributes.find(attr => attr.key === "gh-aw.outcome.deletions")).toBeUndefined();
+ expect(spans[2].attributes.find(attr => attr.key === "gh-aw.outcome.reactions_total")).toBeUndefined();
+ expect(spans[2].attributes.find(attr => attr.key === "gh-aw.outcome.reactions_positive")).toBeUndefined();
+ expect(spans[2].attributes.find(attr => attr.key === "gh-aw.outcome.reactions_negative")).toBeUndefined();
+ expect(spans[2].attributes.find(attr => attr.key === "gh-aw.outcome.comments")).toBeUndefined();
+ expect(spans[2].attributes.find(attr => attr.key === "gh-aw.outcome.zero_touch")).toBeUndefined();
expect(mockAppendToOTLPJSONL).toHaveBeenCalledOnce();
expect(mockSendOTLPToAllEndpoints).not.toHaveBeenCalled();
diff --git a/actions/setup/js/evaluate_outcomes.cjs b/actions/setup/js/evaluate_outcomes.cjs
index eef8a218129..349d34b9e0e 100644
--- a/actions/setup/js/evaluate_outcomes.cjs
+++ b/actions/setup/js/evaluate_outcomes.cjs
@@ -163,6 +163,11 @@ function secondsBetween(from, to) {
* @property {number | null} changed_files
* @property {number | null} additions
* @property {number | null} deletions
+ * @property {number | null} reactions_total
+ * @property {number | null} reactions_positive
+ * @property {number | null} reactions_negative
+ * @property {number | null} comments
+ * @property {boolean} zero_touch
*/
/**
@@ -186,6 +191,11 @@ function evaluateItem(item, defaultRepo) {
changed_files: null,
additions: null,
deletions: null,
+ reactions_total: null,
+ reactions_positive: null,
+ reactions_negative: null,
+ comments: null,
+ zero_touch: false,
};
if (!url) {
@@ -206,6 +216,18 @@ function evaluateItem(item, defaultRepo) {
}
out.result = "accepted";
out.detail = data.state;
+ out.comments = typeof data.comments === "number" ? data.comments : null;
+
+ // Reactions on issues
+ if (data.reactions && typeof data.reactions === "object") {
+ const r = data.reactions;
+ const positive = (r["+1"] || 0) + (r.heart || 0) + (r.hooray || 0) + (r.rocket || 0);
+ const negative = (r["-1"] || 0) + (r.confused || 0);
+ out.reactions_total = (r.total_count != null) ? r.total_count : positive + negative + (r.laugh || 0) + (r.eyes || 0);
+ out.reactions_positive = positive;
+ out.reactions_negative = negative;
+ }
+
if (data.state === "closed" && data.created_at && data.closed_at) {
out.resolution_sec = secondsBetween(data.created_at, data.closed_at);
}
@@ -228,6 +250,22 @@ function evaluateItem(item, defaultRepo) {
out.changed_files = typeof data.changed_files === "number" ? data.changed_files : null;
out.additions = typeof data.additions === "number" ? data.additions : null;
out.deletions = typeof data.deletions === "number" ? data.deletions : null;
+ out.comments = typeof data.comments === "number" ? data.comments : null;
+
+ // Reactions
+ if (data.reactions && typeof data.reactions === "object") {
+ const r = data.reactions;
+ const positive = (r["+1"] || 0) + (r.heart || 0) + (r.hooray || 0) + (r.rocket || 0);
+ const negative = (r["-1"] || 0) + (r.confused || 0);
+ out.reactions_total = (r.total_count != null) ? r.total_count : positive + negative + (r.laugh || 0) + (r.eyes || 0);
+ out.reactions_positive = positive;
+ out.reactions_negative = negative;
+ }
+
+ // Zero-touch: merged with no human review comments and no issue-level comments
+ if (data.merged === true && out.review_comments === 0 && out.comments === 0) {
+ out.zero_touch = true;
+ }
if (data.merged === true) {
out.result = "accepted";
@@ -315,6 +353,9 @@ function main() {
let pending = 0;
let total = 0;
let noop = 0;
+ let zeroTouchCount = 0;
+ /** @type {number[]} */
+ const resolutionTimes = [];
// Clear the evaluations file
fs.writeFileSync(EVAL_JSONL, "");
@@ -393,6 +434,9 @@ function main() {
switch (evalResult.result) {
case "accepted":
accepted++;
+ if (evalResult.zero_touch === true) {
+ zeroTouchCount++;
+ }
break;
case "rejected":
rejected++;
@@ -401,6 +445,9 @@ function main() {
pending++;
break;
}
+ if (typeof evalResult.resolution_sec === "number" && evalResult.resolution_sec > 0) {
+ resolutionTimes.push(evalResult.resolution_sec);
+ }
fs.appendFileSync(
EVAL_JSONL,
@@ -420,6 +467,11 @@ function main() {
changed_files: evalResult.changed_files,
additions: evalResult.additions,
deletions: evalResult.deletions,
+ reactions_total: evalResult.reactions_total,
+ reactions_positive: evalResult.reactions_positive,
+ reactions_negative: evalResult.reactions_negative,
+ comments: evalResult.comments,
+ zero_touch: evalResult.zero_touch || false,
}) + "\n"
);
}
@@ -442,6 +494,15 @@ function main() {
const wasteRate = total > 0 ? rejected / total : 0;
const noopRate = total + noop > 0 ? noop / (total + noop) : 0;
+ // Economics: zero-touch rate and median time-to-outcome
+ const zeroTouchRate = accepted > 0 ? zeroTouchCount / accepted : 0;
+ resolutionTimes.sort((a, b) => a - b);
+ let medianResolutionSec = null;
+ if (resolutionTimes.length > 0) {
+ const mid = Math.floor(resolutionTimes.length / 2);
+ medianResolutionSec = resolutionTimes.length % 2 !== 0 ? resolutionTimes[mid] : Math.round((resolutionTimes[mid - 1] + resolutionTimes[mid]) / 2);
+ }
+
writeJSONAtomic(SUMMARY_PATH, {
runs_checked: checked,
total_outcomes: total,
@@ -453,6 +514,9 @@ function main() {
acceptance_rate: Math.round(acceptanceRate * 10000) / 10000,
waste_rate: Math.round(wasteRate * 10000) / 10000,
noop_rate: Math.round(noopRate * 10000) / 10000,
+ zero_touch: zeroTouchCount,
+ zero_touch_rate: Math.round(zeroTouchRate * 10000) / 10000,
+ median_resolution_sec: medianResolutionSec,
date: new Date().toISOString().slice(0, 10),
});
diff --git a/specs/otel-observability-spec.md b/specs/otel-observability-spec.md
index 375aa0e9797..d0a2989f31b 100644
--- a/specs/otel-observability-spec.md
+++ b/specs/otel-observability-spec.md
@@ -1,9 +1,9 @@
---
title: OTel Observability Specification
-version: 0.1.0
+version: 0.2.0
status: Working Draft
date: 2026-05-19
-last_updated: 2026-05-19
+last_updated: 2026-05-21
editors:
- GitHub gh-aw Team
---
@@ -38,10 +38,14 @@ Changes to `observability.otlp`, OTLP environment injection, MCP gateway tracing
6. [Export and Gateway Integration](#6-export-and-gateway-integration)
7. [Local Mirrors and Artifacts](#7-local-mirrors-and-artifacts)
8. [Security and Privacy Requirements](#8-security-and-privacy-requirements)
-9. [Implementation Mapping](#9-implementation-mapping)
-10. [Compliance Testing](#10-compliance-testing)
-11. [References](#11-references)
-12. [Change Log](#12-change-log)
+9. [Trace Model](#9-trace-model)
+10. [Span Attribute Contract](#10-span-attribute-contract)
+11. [Resource Attributes](#11-resource-attributes)
+12. [Trace ID Propagation and Lookup](#12-trace-id-propagation-and-lookup)
+13. [Implementation Mapping](#13-implementation-mapping)
+14. [Compliance Testing](#14-compliance-testing)
+15. [References](#15-references)
+16. [Change Log](#16-change-log)
---
@@ -85,7 +89,7 @@ The following documents are informative companions and do not override this spec
## 2. Conformance
-An implementation conforms to this specification if it satisfies all MUST and MUST NOT requirements in Sections 4 through 10.
+An implementation conforms to this specification if it satisfies all MUST and MUST NOT requirements in Sections 4 through 14.
The key words **MUST**, **MUST NOT**, **SHOULD**, **SHOULD NOT**, and **MAY** are to be interpreted as described in [RFC 2119](https://www.rfc-editor.org/rfc/rfc2119).
@@ -97,7 +101,7 @@ This specification defines three conformance levels:
|---|---|
| **Level 1 - Config** | Correct parsing and normalization of `observability.otlp` and workflow environment injection as defined in Sections 4 and 5. |
| **Level 2 - Runtime** | Level 1 plus MCP gateway integration and degraded-mode export behavior from Section 6. |
-| **Level 3 - Complete** | Level 2 plus local mirror, artifact, implementation-mapping, and compliance obligations in Sections 7 through 10. |
+| **Level 3 - Complete** | Level 2 plus local mirror, artifact, trace model, span attribute contract, resource attributes, trace ID propagation, implementation-mapping, and compliance obligations in Sections 7 through 14. |
---
@@ -267,7 +271,378 @@ The JavaScript OTLP helper layer SHOULD remain non-fatal:
---
-## 9. Implementation Mapping
+## 9. Trace Model
+
+### 9.1 Overview
+
+gh-aw emits OpenTelemetry trace spans directly to configured OTLP-compatible vendor endpoints. gh-aw does **not** require or run an OpenTelemetry Collector. All transformation, batching, retry, endpoint selection, and authentication happens in-process before sending to the vendor OTLP endpoint.
+
+Tracing is best-effort. Export failures MUST NOT fail the workflow.
+
+### 9.2 Span Naming Convention
+
+All gh-aw span names MUST follow the pattern: `gh-aw.<job-name>.<phase>`.
+
+When no job name is available, the fallback `job` MUST be used, yielding names such as `gh-aw.job.setup`.
+
+### 9.3 Span Hierarchy
+
+A single trace ID is shared across all jobs in a workflow run. All setup spans share a global parent span ID so they render as siblings in OTLP backends.
+
+```text
+Single Trace: trace_id (32-char hex, shared across all jobs in a run)
+├── Root Setup Parent: parent_span_id (global, shared across all jobs)
+│
+├── Activation Job
+│   ├── gh-aw.activation.setup (parent: root setup parent)
+│   └── gh-aw.activation.conclusion (parent: activation setup span)
+│
+├── Agent Job
+│   ├── gh-aw.agent.setup (parent: root setup parent)
+│   ├── gh-aw.agent.conclusion (parent: agent setup span)
+│   │   └── gh-aw.agent.agent (parent: agent conclusion span)
+│   │       [dedicated AI latency measurement]
+│   │
+│
+└── Other Jobs
+    ├── gh-aw.<job-name>.setup (parent: root setup parent)
+    └── gh-aw.<job-name>.conclusion (parent: job setup span)
+```
+
+### 9.4 Span Kinds
+
+Span kind assignments MUST follow these rules:
+
+| Span | OTLP `kind` | Rationale |
+|---|---|---|
+| `gh-aw.*.setup` | `SPAN_KIND_INTERNAL` (1) | Internal job lifecycle |
+| `gh-aw.*.conclusion` | `SPAN_KIND_INTERNAL` (1) | Internal job lifecycle |
+| `gh-aw.*.agent` | `SPAN_KIND_CLIENT` (3) | Outbound AI model request |
+
+### 9.5 Span Status
+
+Conclusion spans MUST set `status.code` based on the job outcome:
+
+| Outcome | `status.code` |
+|---|---|
+| `success` | `OK` (1) |
+| `failure`, `timeout`, `cancelled` | `ERROR` (2) |
+
+### 9.6 Exception Events
+
+When errors are present in `agent_output.json`, the conclusion span MUST emit OTel exception events:
+
+```json
+{
+ "timeUnixNano": "...",
+ "name": "exception",
+ "attributes": [
+ {"key": "exception.type", "value": {"stringValue": "gh-aw.<type>"}},
+ {"key": "exception.message", "value": {"stringValue": "Error description"}}
+ ]
+}
+```
+
+Exception type resolution:
+
+1. If the error message matches the format `type:message`, use `gh-aw.<type>` as the exception type.
+2. Otherwise, derive the type from the run status: `gh-aw.AgentError`, `gh-aw.AgentFailed`, `gh-aw.AgentTimedOut`, or `gh-aw.AgentCancelled`.
+
+---
+
+## 10. Span Attribute Contract
+
+This section defines the attributes each span type MUST or MAY carry.
+
+### 10.1 Setup Span Attributes
+
+**Required attributes** (MUST be present on every setup span):
+
+| Attribute | Type | Description |
+|---|---|---|
+| `gh-aw.job.name` | string | Job name from action input |
+| `gh-aw.workflow.name` | string | Workflow name or ID |
+| `gh-aw.run.id` | string | GitHub Actions run ID |
+| `gh-aw.run.attempt` | string | Run attempt number |
+| `gh-aw.run.actor` | string | User or bot initiating the run |
+| `gh-aw.repository` | string | `owner/repo` |
+| `gh-aw.staged` | boolean | Whether this is a staging deployment |
+
+**Conditional attributes** (MUST be present when the value is available):
+
+| Attribute | Type | Description |
+|---|---|---|
+| `gen_ai.system` | string | Mapped AI system name (e.g., `github_models`, `anthropic`, `openai`) |
+| `gh-aw.engine.id` | string | Raw engine identifier (`copilot`, `claude`, `codex`, `gemini`, custom) |
+| `gh-aw.event_name` | string | GitHub event type |
+| `gh-aw.trigger.item_type` | string | Triggering item (`issue`, `pull_request`, `discussion`, etc.) |
+| `gh-aw.trigger.item_number` | string | Triggering item ID/number |
+| `gh-aw.trigger.label` | string | Label on triggering item |
+| `gh-aw.trigger.comment_id` | string | Comment ID on triggering item |
+| `gh-aw.episode.id` | string | Episode/session ID for cross-run correlation |
+| `gh-aw.episode.kind` | string | `run` or `workflow_call` |
+| `gh-aw.hop.id` | string | Current workflow invocation ID |
+| `gh-aw.hop.parent_id` | string | Parent workflow invocation ID |
+| `gh-aw.origin.event` | string | Origin event type |
+| `gh-aw.root.repo` | string | Root repository (for dispatched workflows) |
+| `gh-aw.root.workflow_id` | string | Root workflow ID |
+| `gh-aw.frontmatter.source` | string | Frontmatter source type |
+| `gh-aw.frontmatter.emoji` | string | Frontmatter emoji |
+| `gh-aw.frontmatter.body_modified` | boolean | Whether body was edited |
+| `gh-aw.experiment.<name>` | string | Per-experiment variant assignment |
+| `gh-aw.experiments` | string | Compact JSON of all experiment assignments |
+| `gh-aw.deployment.state` | string | Deployment status |
+| `gh-aw.workflow_run.conclusion` | string | Workflow-level outcome |
+
+### 10.2 Conclusion Span Attributes
+
+**Required attributes** (MUST be present on every conclusion span):
+
+| Attribute | Type | Description |
+|---|---|---|
+| `gh-aw.workflow.name` | string | Workflow name |
+| `gh-aw.run.id` | string | Run ID |
+| `gh-aw.run.attempt` | string | Attempt number |
+| `gh-aw.run.actor` | string | Actor |
+| `gh-aw.repository` | string | Repository |
+| `gh-aw.run.status` | string | Run outcome (`success`, `failure`, `timeout`, `cancelled`) |
+| `gh-aw.error_count` | int | Number of errors |
+| `gh-aw.warning_count` | int | Number of warnings |
+| `gh-aw.action_minutes` | double | Duration in minutes |
+| `gh-aw.output.item_count` | int | Safe output items produced |
+| `gh-aw.otlp.export_errors` | int | Count of OTLP export failures during this run |
+
+**Conditional attributes** (MUST be present when the value is available):
+
+| Attribute | Type | Description |
+|---|---|---|
+| `gh-aw.job.name` | string | Job name |
+| `gen_ai.system` | string | AI system |
+| `gh-aw.engine.id` | string | Engine ID |
+| `gen_ai.request.model` | string | Requested model name |
+| `gh-aw.tracker.id` | string | Tracker identifier |
+| `gh-aw.event_name` | string | Event type |
+| `gh-aw.staged` | boolean | Staging flag |
+| `gh-aw.trigger.*` | string | Trigger context (same fields as setup span) |
+| `gh-aw.frontmatter.*` | string | Frontmatter metadata (same fields as setup span) |
+| `gh-aw.effective_tokens` | int | Effective token count |
+| `gh-aw.turns` | int | Number of agent turns |
+| `gh-aw.estimated_cost_usd` | double | Estimated cost |
+| `gh-aw.agent.conclusion` | string | Agent job outcome |
+| `gh-aw.detection.conclusion` | string | Threat detection outcome |
+| `gh-aw.detection.reason` | string | Detection reasoning |
+| `gh-aw.otlp.export_error_details` | string | Export failure details |
+| `gh-aw.error.count` | int | Output error count |
+| `gh-aw.error.messages` | string | Error messages joined by ` \| ` |
+| `gh-aw.output.item_types` | string | Comma-separated types of safe output items |
+| `gh-aw.github.rate_limit.remaining` | int | API rate limit remaining |
+| `gh-aw.github.rate_limit.limit` | int | API rate limit total |
+| `gh-aw.github.rate_limit.used` | int | API rate limit used |
+| `gh-aw.github.rate_limit.resource` | string | Rate limit resource category |
+| `gh-aw.github.rate_limit.reset` | string | ISO 8601 rate limit reset time |
+| `gh-aw.outcome.total` | int | Total outcomes |
+| `gh-aw.outcome.accepted` | int | Accepted outcomes |
+| `gh-aw.outcome.rejected` | int | Rejected outcomes |
+| `gh-aw.outcome.pending` | int | Pending outcomes |
+| `gh-aw.outcome.ignored` | int | Ignored outcomes |
+| `gh-aw.outcome.acceptance_rate` | double | Acceptance rate |
+| `gh-aw.outcome.waste_rate` | double | Waste rate |
+
+### 10.3 Agent Span Attributes
+
+The dedicated agent span (`gh-aw.*.agent`) follows OpenTelemetry [GenAI semantic conventions](https://opentelemetry.io/docs/specs/semconv/gen-ai/).
+
+**Required attributes** (MUST be present when available from the AI engine):
+
+| Attribute | Type | Description |
+|---|---|---|
+| `gen_ai.system` | string | Mapped AI system name |
+| `gen_ai.request.model` | string | Requested model |
+| `gen_ai.response.model` | string | Resolved runtime model |
+| `gen_ai.operation.name` | string | Always `"chat"` |
+| `gen_ai.workflow.name` | string | Workflow name |
+| `gen_ai.usage.input_tokens` | int | Input tokens consumed |
+| `gen_ai.usage.output_tokens` | int | Output tokens generated |
+| `gen_ai.usage.total_tokens` | int | Total tokens (input + output, excluding cache) |
+| `gen_ai.response.finish_reasons` | string[] | Stop reasons (e.g., `["stop"]`, `["length"]`, `["timeout"]`) |
+
+**Optional attributes** (MAY be present):
+
+| Attribute | Type | Description |
+|---|---|---|
+| `gen_ai.usage.cache_read.input_tokens` | int | Cache read tokens |
+| `gen_ai.usage.cache_creation.input_tokens` | int | Cache write tokens |
+
+### 10.4 Outcome Evaluation Span Attributes
+
+Per-item outcome evaluation spans (`gh-aw.outcome.evaluation`) are emitted by the outcome-collector workflow. Each span represents one safe output item evaluated against the GitHub API.
+
+| Attribute | Type | Condition | Description |
+|---|---|---|---|
+| `gh-aw.outcome.type` | string | Required | Safe output type (e.g., `create_pull_request`, `create_issue`) |
+| `gh-aw.outcome.result` | string | Required | `accepted`, `rejected`, `pending`, `ignored`, `noop` |
+| `gh-aw.outcome.workflow` | string | Required | Source workflow name |
+| `gh-aw.outcome.run_id` | int | Required | Source run ID |
+| `gh-aw.outcome.repo` | string | Required | Repository |
+| `gh-aw.outcome.url` | string | When available | URL to the created object |
+| `gh-aw.outcome.detail` | string | When available | Result detail (e.g., `merged`, `closed`, `open`) |
+| `gh-aw.outcome.created_at` | string | When available | Item creation timestamp |
+| `gh-aw.outcome.event` | string | When available | Triggering event type |
+| `gh-aw.outcome.resolution_sec` | int | When resolved | Seconds from creation to resolution |
+| `gh-aw.outcome.pending_age_sec` | int | When pending | Seconds since creation |
+| `gh-aw.outcome.review_comments` | int | PRs only | Number of review comments |
+| `gh-aw.outcome.comments` | int | When available | Number of issue-level comments |
+| `gh-aw.outcome.changed_files` | int | PRs only | Files changed |
+| `gh-aw.outcome.additions` | int | PRs only | Lines added |
+| `gh-aw.outcome.deletions` | int | PRs only | Lines deleted |
+| `gh-aw.outcome.reactions_total` | int | When available | Total reaction count |
+| `gh-aw.outcome.reactions_positive` | int | When available | Positive reactions (+1, heart, hooray, rocket) |
+| `gh-aw.outcome.reactions_negative` | int | When available | Negative reactions (-1, confused) |
+| `gh-aw.outcome.zero_touch` | boolean | When true | Accepted with no human review comments or issue comments |
+
+### 10.5 Outcome Summary Span Attributes
+
+The fleet summary span (`gh-aw.outcome.summary`) aggregates all evaluated outcomes into a single span with economics metrics.
+
+| Attribute | Type | Description |
+|---|---|---|
+| `gh-aw.outcome.runs_checked` | int | Number of runs evaluated |
+| `gh-aw.outcome.total` | int | Total actionable outcomes |
+| `gh-aw.outcome.accepted` | int | Accepted outcomes |
+| `gh-aw.outcome.rejected` | int | Rejected outcomes |
+| `gh-aw.outcome.ignored` | int | Ignored outcomes |
+| `gh-aw.outcome.pending` | int | Pending outcomes |
+| `gh-aw.outcome.noop` | int | Noop outcomes |
+| `gh-aw.outcome.acceptance_rate` | double | Accepted / (accepted + rejected) |
+| `gh-aw.outcome.waste_rate` | double | Rejected / total |
+| `gh-aw.outcome.noop_rate` | double | Noop / (total + noop) |
+| `gh-aw.outcome.zero_touch_count` | int | Count of zero-touch accepted outcomes |
+| `gh-aw.outcome.zero_touch_rate` | double | Zero-touch / accepted |
+| `gh-aw.outcome.median_resolution_sec` | int | Median seconds from creation to resolution |
+| `gh-aw.outcome.item_count` | int | Number of per-item spans emitted |
+| `gh-aw.outcome.date` | string | Evaluation date (YYYY-MM-DD) |
+| `gh-aw.outcome.events` | string | Comma-separated distinct trigger events |
+| `gh-aw.outcome.workflows` | string | Comma-separated distinct workflow names |
+| `gh-aw.outcome.types` | string | Comma-separated distinct outcome types |
+
+---
+
+## 11. Resource Attributes
+
+Resource attributes are applied to all OTLP spans and describe the service and execution environment.
+
+### 11.1 Required Resource Attributes
+
+A conforming implementation MUST include these resource attributes on every exported span:
+
+| Attribute | Type | Description | Example |
+|---|---|---|---|
+| `service.name` | string | `gh-aw.<workflow-name>` or `gh-aw` | `gh-aw.daily-report` |
+| `service.version` | string | gh-aw CLI version or commit SHA | `v0.23.4` |
+| `github.repository` | string | `owner/repo` | `github/gh-aw` |
+| `github.run_id` | string | GitHub Actions run ID | `12345678` |
+| `github.run_attempt` | string | Run attempt number | `1` |
+| `github.actions.run_url` | string | URL to the run | `https://github.com/owner/repo/actions/runs/123` |
+
+### 11.2 Conditional Resource Attributes
+
+These resource attributes MUST be included when the corresponding value is available:
+
+| Attribute | Type | Description |
+|---|---|---|
+| `github.event_name` | string | Event type (e.g., `push`, `pull_request`) |
+| `github.ref` | string | Git ref (branch/tag) |
+| `github.ref_name` | string | Ref name |
+| `github.head_ref` | string | Head ref (for PRs) |
+| `github.sha` | string | Commit SHA |
+| `github.job` | string | Job name |
+| `github.workflow_ref` | string | Workflow ref |
+| `github.actor_id` | string | Actor ID |
+| `runner.os` | string | Runner OS (`Linux`, `Windows`, `macOS`) |
+| `runner.arch` | string | Runner architecture (`X64`, `ARM64`) |
+| `runner.name` | string | Runner name/label |
+| `runner.environment` | string | Runner environment |
+| `gh-aw.awf.version` | string | Agent Workflow Firewall (AWF) version |
+| `gh-aw.awmg.version` | string | MCP gateway (`awmg`) version |
+| `deployment.environment` | string | `staging` or `production` |
+
+### 11.3 Instrumentation Scope
+
+All gh-aw spans MUST be emitted under an instrumentation scope with:
+
+| Field | Value |
+|---|---|
+| `scope.name` | `gh-aw` |
+| `scope.version` | The gh-aw CLI version |
+
+---
+
+## 12. Trace ID Propagation and Lookup
+
+### 12.1 Trace ID Format
+
+The OTLP trace ID is a 32-character lowercase hexadecimal string (16 random bytes). The span ID is a 16-character lowercase hexadecimal string (8 random bytes).
+
+Do **not** confuse the OTLP trace ID with `workflow_call_id`, which is derived from the GitHub run ID and attempt number. The OTLP trace ID is the value to search for in vendor backends (Sentry, Honeycomb, Datadog, Grafana Tempo, etc.).
+
+### 12.2 Trace ID Resolution Order
+
+The setup span MUST resolve the trace ID using the following priority order:
+
+1. **Explicit option** — `options.traceId` passed to the setup function (used for activation job reuse).
+2. **Action input** — `INPUT_TRACE_ID` environment variable (from `trace-id` action input, used for cross-job propagation).
+3. **Parent context** — `aw_info.context.otel_trace_id` (propagated from parent workflow via `aw_context`).
+4. **Generate new** — 32-character random hex string via `randomBytes(16).toString("hex")`.
+
+The conclusion span MUST resolve the trace ID using:
+
+1. **Job environment** — `GITHUB_AW_OTEL_TRACE_ID` (set by this job's setup step).
+2. **Parent context** — `aw_info.context.otel_trace_id` (inherited from parent).
+3. **Legacy fallback** — `aw_info.context.workflow_call_id` (converted to hex).
+4. **Generate new** — 32-character random hex string.
+
+### 12.3 Trace ID Storage
+
+After generating or resolving a trace ID, the setup step MUST:
+
+1. **Write to `$GITHUB_OUTPUT`** so downstream jobs can access:
+ - `trace-id` — 32-char hex trace ID
+ - `span-id` — 16-char hex setup span ID
+ - `parent-span-id` — 16-char hex global parent span ID
+
+2. **Write to `$GITHUB_ENV`** so downstream steps in the same job can access:
+ - `GITHUB_AW_OTEL_TRACE_ID` — Trace ID
+ - `GITHUB_AW_OTEL_PARENT_SPAN_ID` — Setup span ID (parent for conclusion span)
+ - `GITHUB_AW_OTEL_JOB_START_MS` — Epoch milliseconds when setup completed
+
+### 12.4 Cross-Job Propagation
+
+The compiler MUST wire setup outputs through the job dependency graph so all jobs in a run share a single trace ID. Downstream jobs receive `needs.<job>.outputs.trace-id` and `needs.<job>.outputs.parent-span-id` as action inputs.
+
+### 12.5 Dispatch and Composite Action Propagation
+
+When a workflow dispatches a child workflow or composite action, parent trace context MUST be passed via `aw_context`:
+
+- `aw_context.otel_trace_id` — child inherits parent trace ID
+- `aw_context.otel_parent_span_id` — child setup span parents under parent's setup span
+
+This context is written to `/tmp/gh-aw/aw_info.json` and propagated through action inputs.
+
+### 12.6 Trace ID Lookup
+
+To find a trace in an OTLP backend:
+
+1. Locate the OTLP trace ID from the GitHub Actions job summary or the `trace-id` output.
+2. Search the backend by trace ID (32-char hex string).
+3. For local debugging, query the JSONL mirror:
+
+```bash
+jq '.resourceSpans[].scopeSpans[].spans[] | {name, traceId, spanId, status}' /tmp/gh-aw/otel.jsonl
+```
+
+---
+
+## 13. Implementation Mapping
This section maps the normative behavior in this specification to the current `gh-aw` implementation. These mappings MUST be kept in sync when behavior changes.
@@ -280,12 +655,16 @@ This section maps the normative behavior in this specification to the current `g
| §6.5 | Trace Context Variables | `actions/setup/js/action_setup_otlp.cjs`, `actions/setup/js/aw_context.cjs` |
| §7 | Local Mirrors and Artifacts | `actions/setup/js/send_otlp_span.cjs`, `actions/setup/js/constants.cjs`, `actions/setup/post.js` |
| §8 | Security and Privacy Requirements | `pkg/workflow/observability_otlp.go`, `pkg/workflow/mcp_renderer.go`, `pkg/workflow/mcp_setup_generator.go`, `actions/setup/js/send_otlp_span.cjs` |
+| §9 | Trace Model | `actions/setup/js/send_otlp_span.cjs`, `actions/setup/js/action_setup_otlp.cjs`, `actions/setup/js/action_conclusion_otlp.cjs` |
+| §10 | Span Attribute Contract | `actions/setup/js/action_setup_otlp.cjs`, `actions/setup/js/action_conclusion_otlp.cjs`, `actions/setup/js/send_otlp_span.cjs`, `actions/setup/js/evaluate_outcomes.cjs`, `actions/setup/js/emit_outcome_spans.cjs` |
+| §11 | Resource Attributes | `actions/setup/js/action_setup_otlp.cjs`, `actions/setup/js/send_otlp_span.cjs` |
+| §12 | Trace ID Propagation | `actions/setup/js/action_setup_otlp.cjs`, `actions/setup/js/aw_context.cjs`, `pkg/workflow/compiler_yaml.go` |
When behavior changes in any mapped file, this table SHOULD be updated in the same change set.
---
-## 10. Compliance Testing
+## 14. Compliance Testing
A conforming implementation MUST include automated coverage for the following behaviors.
@@ -301,12 +680,28 @@ A conforming implementation MUST include automated coverage for the following be
| `T-OTEL-OBS-008` | Local mirror persistence | Helper emission writes `/tmp/gh-aw/otel.jsonl` even when OTLP export fails or is absent. | `actions/setup/js/send_otlp_span.test.cjs` |
| `T-OTEL-OBS-009` | Trace context propagation | Setup writes valid trace and parent span IDs into runtime environment. | `actions/setup/js/action_setup_otlp.test.cjs`, `actions/setup/js/otlp.test.cjs` |
| `T-OTEL-OBS-010` | Artifact inclusion | Observability artifacts include the OTEL JSONL mirror when artifact collection is enabled. | `pkg/workflow/compiled_lock_files_test.go` |
+| `T-OTEL-OBS-011` | Span naming convention | All emitted span names follow `gh-aw.<job-name>.<phase>` pattern. | `actions/setup/js/send_otlp_span.test.cjs` |
+| `T-OTEL-OBS-012` | Span hierarchy | Setup spans share a global parent span ID; conclusion spans parent under the setup span. | `actions/setup/js/action_setup_otlp.test.cjs`, `actions/setup/js/action_conclusion_otlp.test.cjs` |
+| `T-OTEL-OBS-013` | Span attribute contract | Setup and conclusion spans contain all required attributes from §10. | `actions/setup/js/action_setup_otlp.test.cjs`, `actions/setup/js/action_conclusion_otlp.test.cjs` |
+| `T-OTEL-OBS-014` | Resource attributes | All exported spans include required resource attributes from §11. | `actions/setup/js/send_otlp_span.test.cjs` |
+| `T-OTEL-OBS-015` | Trace ID resolution order | Trace ID follows the priority chain: explicit option → action input → parent context → generate new. | `actions/setup/js/action_setup_otlp.test.cjs` |
Additional tests SHOULD be added when new helper APIs, new OTLP normalization rules, or new runtime sinks become normative.
+### 14.1 Runtime Conformance Workflows
+
+The following agentic workflows provide runtime conformance validation:
+
+| Workflow | Purpose | Coverage |
+|---|---|---|
+| [`smoke-otel-backends.md`](../.github/workflows/smoke-otel-backends.md) | End-to-end OTLP smoke test | Local mirror + Sentry/Grafana/Datadog visibility |
+| [`daily-otel-instrumentation-advisor.md`](../.github/workflows/daily-otel-instrumentation-advisor.md) | Daily code review + live data validation | Sentry + Grafana backend data |
+| [`daily-grafana-otel-instrumentation-advisor.md`](../.github/workflows/daily-grafana-otel-instrumentation-advisor.md) | Grafana-only variant | Grafana Tempo data |
+| [`otlp-data-quality-validator.md`](../.github/workflows/otlp-data-quality-validator.md) | OTLP data quality validation | JSONL + vendor traces + attribute contract |
+
---
-## 11. References
+## 15. References
### Normative References
@@ -321,12 +716,32 @@ Additional tests SHOULD be added when new helper APIs, new OTLP normalization ru
- [specs/aw-harness.md](./aw-harness.md)
- [specs/safe-output-outcome-evaluation.md](./safe-output-outcome-evaluation.md)
+### Runtime Conformance Workflows
+
+- [.github/workflows/smoke-otel-backends.md](../.github/workflows/smoke-otel-backends.md) — End-to-end OTLP smoke test
+- [.github/workflows/daily-otel-instrumentation-advisor.md](../.github/workflows/daily-otel-instrumentation-advisor.md) — Daily code review + live data validation
+- [.github/workflows/daily-grafana-otel-instrumentation-advisor.md](../.github/workflows/daily-grafana-otel-instrumentation-advisor.md) — Grafana-only variant
+- [.github/workflows/otlp-data-quality-validator.md](../.github/workflows/otlp-data-quality-validator.md) — OTLP data quality validation
+
---
-## 12. Change Log
+## 16. Change Log
+
+### Version 0.2.0 (Working Draft)
+
+- Added §9 Trace Model: span naming, hierarchy, kinds, status, exception events
+- Added §10 Span Attribute Contract: required and conditional attributes for setup, conclusion, and agent spans
+- Added §10.4 Outcome Evaluation Span Attributes: reactions, zero-touch, comments
+- Added §10.5 Outcome Summary Span Attributes: zero-touch rate, median resolution, economics metrics
+- Added §11 Resource Attributes: required and conditional resource attributes, instrumentation scope
+- Added §12 Trace ID Propagation and Lookup: resolution order, storage, cross-job and dispatch propagation
+- Added §14.1 Runtime Conformance Workflows
+- Added compliance tests T-OTEL-OBS-011 through T-OTEL-OBS-015
+- Updated implementation mapping table with §9–§12 entries
+- Renumbered former §9–§12 to §13–§16
### Version 0.1.0 (Working Draft)
- Initial repository-level OTel observability specification
- Defined the normative `observability.otlp` contract for compiler and runtime behavior
-- Added gateway-integration, local-mirror, implementation-mapping, and conformance-test sections
\ No newline at end of file
+- Added gateway-integration, local-mirror, implementation-mapping, and conformance-test sections