From e192c20be7a894c65addb767a18467c050c74b05 Mon Sep 17 00:00:00 2001 From: jif-oai Date: Thu, 18 Dec 2025 18:48:23 +0000 Subject: [PATCH 01/43] V! --- codex-rs/Cargo.lock | 11 + codex-rs/Cargo.toml | 2 + codex-rs/metrics/Cargo.toml | 12 + codex-rs/metrics/README.md | 110 ++++++++++ codex-rs/metrics/src/batch.rs | 241 ++++++++++++++++++++ codex-rs/metrics/src/client.rs | 109 ++++++++++ codex-rs/metrics/src/config.rs | 56 +++++ codex-rs/metrics/src/error.rs | 63 ++++++ codex-rs/metrics/src/lib.rs | 13 ++ codex-rs/metrics/src/statsd.rs | 102 +++++++++ codex-rs/metrics/src/validation.rs | 46 ++++ codex-rs/metrics/tests/tests.rs | 338 +++++++++++++++++++++++++++++ 12 files changed, 1103 insertions(+) create mode 100644 codex-rs/metrics/Cargo.toml create mode 100644 codex-rs/metrics/README.md create mode 100644 codex-rs/metrics/src/batch.rs create mode 100644 codex-rs/metrics/src/client.rs create mode 100644 codex-rs/metrics/src/config.rs create mode 100644 codex-rs/metrics/src/error.rs create mode 100644 codex-rs/metrics/src/lib.rs create mode 100644 codex-rs/metrics/src/statsd.rs create mode 100644 codex-rs/metrics/src/validation.rs create mode 100644 codex-rs/metrics/tests/tests.rs diff --git a/codex-rs/Cargo.lock b/codex-rs/Cargo.lock index acf173c5170..d83fee60fbb 100644 --- a/codex-rs/Cargo.lock +++ b/codex-rs/Cargo.lock @@ -1561,6 +1561,17 @@ dependencies = [ "wiremock", ] +[[package]] +name = "codex-metrics" +version = "0.0.0" +dependencies = [ + "pretty_assertions", + "reqwest", + "sentry", + "serde_json", + "thiserror 2.0.17", +] + [[package]] name = "codex-ollama" version = "0.0.0" diff --git a/codex-rs/Cargo.toml b/codex-rs/Cargo.toml index 50941771cf2..7c03533bd98 100644 --- a/codex-rs/Cargo.toml +++ b/codex-rs/Cargo.toml @@ -33,6 +33,7 @@ members = [ "responses-api-proxy", "stdio-to-uds", "otel", + "metrics", "tui", "tui2", "utils/absolute-path", @@ -82,6 +83,7 @@ codex-linux-sandbox = { path = "linux-sandbox" } codex-lmstudio = { path = "lmstudio" } 
codex-login = { path = "login" } codex-mcp-server = { path = "mcp-server" } +codex-metrics = { path = "metrics" } codex-ollama = { path = "ollama" } codex-otel = { path = "otel" } codex-process-hardening = { path = "process-hardening" } diff --git a/codex-rs/metrics/Cargo.toml b/codex-rs/metrics/Cargo.toml new file mode 100644 index 00000000000..7e070ba80cc --- /dev/null +++ b/codex-rs/metrics/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "codex-metrics" +version.workspace = true +edition.workspace = true +license.workspace = true + +[dependencies] +pretty_assertions = { workspace = true } +reqwest = { workspace = true, features = ["blocking"] } +sentry = { workspace = true } +serde_json = { workspace = true } +thiserror = { workspace = true } diff --git a/codex-rs/metrics/README.md b/codex-rs/metrics/README.md new file mode 100644 index 00000000000..44bc6cabb49 --- /dev/null +++ b/codex-rs/metrics/README.md @@ -0,0 +1,110 @@ +# codex-metrics + +Send lightweight counters and histogram buckets to Sentry via the statsd envelope item. + +## Overview + +- Blocking, minimal client designed for CLI and service use. +- Counters and histograms only (histograms are encoded as bucketed counters). +- Tag validation and metric name validation are enforced before send. + +## Quick start + +```rust +use codex_metrics::HistogramBuckets; +use codex_metrics::MetricsClient; +use codex_metrics::MetricsConfig; +use codex_metrics::Result; + +fn main() -> Result<()> { + let metrics = MetricsClient::new( + MetricsConfig::new("https://public@example.ingest.us.sentry.io/123456") + .with_tag("service", "codex-cli")? + .with_tag("env", "dev")?, + )?; + + let buckets = HistogramBuckets::from_values(&[25, 50, 100, 250, 500, 1000])?; + + metrics.counter("codex.session_started", 1, &[("source", "tui")])?; + metrics.histogram("codex.request_latency", 83, &buckets, &[("route", "chat")])?; + + Ok(()) +} +``` + +Buckets are integer upper bounds; pick your own unit (ms, bytes, tokens, etc.). 
+ +You can also use the default placeholder DSN: + +```rust +let metrics = MetricsClient::new(MetricsConfig::default())?; +``` + +## Configuration + +`MetricsConfig` lets you specify: + +- `MetricsConfig::new(dsn)` to set the Sentry DSN. +- `with_tag(key, value)` to add default tags. +- `with_timeout(duration)` to override the HTTP timeout (default 10s). +- `with_user_agent(agent)` to override the user agent. + +## Buckets + +`HistogramBuckets` supports two constructors: + +- `from_values(&[...])` for explicit upper bounds. +- `from_range(from, to, step)` to build linear buckets. +- `from_exponential(from, to, factor)` to build exponential buckets. + +`from_range` requires `step > 0` and `from <= to`. The upper bound is always included. +`from_exponential` requires `from > 0`, `from <= to`, and a finite `factor > 1`. The upper bound is always included. + +```rust +let buckets = HistogramBuckets::from_range(25, 100, 25)?; +let exp_buckets = HistogramBuckets::from_exponential(10, 1000, 2.0)?; +``` + +## Sending metrics + +Counters send a single statsd counter increment with tags: + +```rust +metrics.counter("codex.session_started", 1, &[("source", "tui")])?; +``` + +Histograms are translated into bucket counters by adding an `le` tag for each +bound that is greater than or equal to the value (or `inf` if none match): + +```rust +metrics.histogram("codex.request_latency", 83, &buckets, &[("route", "chat")])?; +``` + +## Batching + +Batching reduces network requests. Build a batch and send it once: + +```rust +let mut batch = metrics.batch(); +batch.counter("codex.turns", 1, &[("model", "gpt-5.1")])?; +batch.histogram("codex.tool_latency", 140, &buckets, &[("tool", "shell")])?; +metrics.send(batch)?; +``` + +## Validation rules + +Metric names: + +- Must be non-empty. +- Allowed characters: ASCII letters/digits plus `.`, `_`, `-`. + +Tag keys and values: + +- Must be non-empty. +- Allowed characters: ASCII letters/digits plus `.`, `_`, `-`, `/`. 
+ +## Error handling + +All APIs return `codex_metrics::Result` with a `MetricsError` variant on +failure. Errors cover invalid configuration, validation failures, and HTTP or +serialization failures. diff --git a/codex-rs/metrics/src/batch.rs b/codex-rs/metrics/src/batch.rs new file mode 100644 index 00000000000..bcfec6898bd --- /dev/null +++ b/codex-rs/metrics/src/batch.rs @@ -0,0 +1,241 @@ +use crate::error::MetricsError; +use crate::error::Result; +use crate::statsd::StatsdLine; +use crate::statsd::collect_tags; +use std::collections::BTreeMap; + +#[cfg_attr(test, derive(PartialEq, Eq))] +#[derive(Clone, Debug)] +pub struct HistogramBuckets { + bounds: Vec, +} + +impl HistogramBuckets { + /// Build histogram buckets from unsorted bounds (upper limits). + pub fn new(mut bounds: Vec) -> Result { + if bounds.is_empty() { + return Err(MetricsError::EmptyBuckets); + } + bounds.sort_unstable(); + bounds.dedup(); + Ok(Self { bounds }) + } + + /// Build histogram buckets from a slice of upper bounds. + pub fn from_values(bounds: &[i64]) -> Result { + Self::new(bounds.to_vec()) + } + + /// Build linear histogram buckets from an inclusive range and step size. + pub fn from_range(from: i64, to: i64, n_step: i64) -> Result { + if n_step <= 0 { + return Err(MetricsError::BucketStepNonPositive { step: n_step }); + } + if from > to { + return Err(MetricsError::BucketRangeDescending { from, to }); + } + + let mut bounds = Vec::new(); + let mut current = from; + bounds.push(current); + + while current < to { + let next = match current.checked_add(n_step) { + Some(next) => next, + None => { + return Err(MetricsError::BucketRangeOverflow { + from, + to, + step: n_step, + }); + } + }; + if next >= to { + bounds.push(to); + break; + } + bounds.push(next); + current = next; + } + + Self::new(bounds) + } + + /// Build exponential histogram buckets from an inclusive range and factor. 
+ pub fn from_exponential(from: i64, to: i64, factor: f64) -> Result { + if from <= 0 { + return Err(MetricsError::BucketStartNonPositive { start: from }); + } + if from > to { + return Err(MetricsError::BucketRangeDescending { from, to }); + } + if !factor.is_finite() || factor <= 1.0 { + return Err(MetricsError::BucketFactorInvalid { factor }); + } + + let mut bounds = Vec::new(); + let mut current = from; + bounds.push(current); + + while current < to { + let next_value = (current as f64) * factor; + if !next_value.is_finite() || next_value >= to as f64 { + bounds.push(to); + break; + } + let mut next = next_value.ceil() as i64; + if next <= current { + next = current + 1; + } + if next >= to { + bounds.push(to); + break; + } + bounds.push(next); + current = next; + } + + Self::new(bounds) + } +} + +pub struct MetricsBatch { + lines: Vec, +} + +impl Default for MetricsBatch { + fn default() -> Self { + Self::new() + } +} + +impl MetricsBatch { + /// Create an empty metrics batch. + pub fn new() -> Self { + Self { lines: Vec::new() } + } + + /// Append a counter increment to the batch. + pub fn counter(&mut self, name: &str, inc: i64, tags: &[(&str, &str)]) -> Result<()> { + let tags = collect_tags(tags)?; + self.lines.push(StatsdLine::counter(name, inc, tags)?); + Ok(()) + } + + /// Append a histogram sample, encoded as a bucketed counter, to the batch. 
+ pub fn histogram( + &mut self, + name: &str, + value: i64, + buckets: &HistogramBuckets, + tags: &[(&str, &str)], + ) -> Result<()> { + let base_tags = collect_tags(tags)?; + let mut matched = false; + for bound in buckets.bounds.iter().filter(|bound| value <= **bound) { + let mut tags = base_tags.clone(); + tags.push(("le".to_string(), bound.to_string())); + self.lines.push(StatsdLine::counter(name, 1, tags)?); + matched = true; + } + if !matched { + let mut tags = base_tags; + tags.push(("le".to_string(), "inf".to_string())); + self.lines.push(StatsdLine::counter(name, 1, tags)?); + } + Ok(()) + } + + pub(crate) fn is_empty(&self) -> bool { + self.lines.is_empty() + } + + pub(crate) fn render(&self, default_tags: &BTreeMap) -> Result { + let mut rendered = Vec::with_capacity(self.lines.len()); + for line in &self.lines { + rendered.push(line.render(default_tags)?); + } + Ok(rendered.join("\n")) + } +} + +#[cfg(test)] +mod tests { + use super::HistogramBuckets; + use crate::error::MetricsError; + use crate::error::Result; + use pretty_assertions::assert_eq; + + #[test] + fn from_range_builds_linear_buckets() -> Result<()> { + let buckets = HistogramBuckets::from_range(25, 100, 25)?; + let expected = HistogramBuckets::from_values(&[25, 50, 75, 100])?; + assert_eq!(buckets, expected); + Ok(()) + } + + #[test] + fn from_range_includes_upper_bound_when_step_is_uneven() -> Result<()> { + let buckets = HistogramBuckets::from_range(10, 95, 30)?; + let expected = HistogramBuckets::from_values(&[10, 40, 70, 95])?; + assert_eq!(buckets, expected); + Ok(()) + } + + #[test] + fn from_range_accepts_single_value_range() -> Result<()> { + let buckets = HistogramBuckets::from_range(42, 42, 5)?; + let expected = HistogramBuckets::from_values(&[42])?; + assert_eq!(buckets, expected); + Ok(()) + } + + #[test] + fn from_range_rejects_non_positive_step() { + let err = HistogramBuckets::from_range(0, 10, 0).unwrap_err(); + assert_eq!(err.to_string(), "histogram bucket step must be 
positive: 0"); + } + + #[test] + fn from_range_rejects_descending_range() { + let err = HistogramBuckets::from_range(10, 0, 1).unwrap_err(); + assert_eq!( + err.to_string(), + "histogram bucket range must be ascending: 10..=0" + ); + } + + #[test] + fn from_exponential_builds_buckets() -> Result<()> { + let buckets = HistogramBuckets::from_exponential(10, 100, 2.0)?; + let expected = HistogramBuckets::from_values(&[10, 20, 40, 80, 100])?; + assert_eq!(buckets, expected); + Ok(()) + } + + #[test] + fn from_exponential_includes_upper_bound() -> Result<()> { + let buckets = HistogramBuckets::from_exponential(30, 100, 3.0)?; + let expected = HistogramBuckets::from_values(&[30, 90, 100])?; + assert_eq!(buckets, expected); + Ok(()) + } + + #[test] + fn from_exponential_rejects_non_positive_start() { + let err = HistogramBuckets::from_exponential(0, 10, 2.0).unwrap_err(); + assert!(matches!( + err, + MetricsError::BucketStartNonPositive { start: 0 } + )); + } + + #[test] + fn from_exponential_rejects_invalid_factor() { + let err = HistogramBuckets::from_exponential(1, 10, 1.0).unwrap_err(); + assert!(matches!( + err, + MetricsError::BucketFactorInvalid { factor: 1.0 } + )); + } +} diff --git a/codex-rs/metrics/src/client.rs b/codex-rs/metrics/src/client.rs new file mode 100644 index 00000000000..952265aa636 --- /dev/null +++ b/codex-rs/metrics/src/client.rs @@ -0,0 +1,109 @@ +use crate::batch::HistogramBuckets; +use crate::batch::MetricsBatch; +use crate::config::MetricsConfig; +use crate::error::MetricsError; +use crate::error::Result; +use crate::statsd::build_statsd_envelope; +use crate::validation::validate_tags; +use sentry::types::Dsn; +use std::collections::BTreeMap; + +const ENVELOPE_CONTENT_TYPE: &str = "application/x-sentry-envelope"; + +#[derive(Debug)] +pub struct MetricsClient { + dsn: Dsn, + http: reqwest::blocking::Client, + auth_header: String, + default_tags: BTreeMap, +} + +impl MetricsClient { + /// Build a metrics client from configuration and validate 
defaults. + pub fn new(config: MetricsConfig) -> Result { + let dsn_value = config.dsn.clone(); + let dsn = dsn_value + .parse::() + .map_err(|source| MetricsError::InvalidDsn { + dsn: dsn_value, + source, + })?; + validate_tags(&config.default_tags)?; + + let http = reqwest::blocking::Client::builder() + .timeout(config.timeout) + .user_agent(config.user_agent.clone()) + .build() + .map_err(|source| MetricsError::HttpClientBuild { source })?; + + let auth_header = dsn.to_auth(Some(&config.user_agent)).to_string(); + + Ok(Self { + dsn, + http, + auth_header, + default_tags: config.default_tags, + }) + } + + /// Send a single counter increment. + pub fn counter(&self, name: &str, inc: i64, tags: &[(&str, &str)]) -> Result<()> { + let mut batch = MetricsBatch::new(); + batch.counter(name, inc, tags)?; + self.send(batch) + } + + /// Send a single histogram sample with the provided buckets. + pub fn histogram( + &self, + name: &str, + value: i64, + buckets: &HistogramBuckets, + tags: &[(&str, &str)], + ) -> Result<()> { + let mut batch = MetricsBatch::new(); + batch.histogram(name, value, buckets, tags)?; + self.send(batch) + } + + /// Create an empty batch for multi-metric sends. + pub fn batch(&self) -> MetricsBatch { + MetricsBatch::new() + } + + /// Send a batch of metrics to Sentry (no-op if the batch is empty). 
+ pub fn send(&self, batch: MetricsBatch) -> Result<()> { + if batch.is_empty() { + return Ok(()); + } + + let payload = batch.render(&self.default_tags)?; + let envelope = build_statsd_envelope(&self.dsn, &payload)?; + + let response = self + .http + .post(self.dsn.envelope_api_url()) + .header("X-Sentry-Auth", &self.auth_header) + .header("Content-Type", ENVELOPE_CONTENT_TYPE) + .body(envelope) + .send() + .map_err(|source| MetricsError::SendEnvelope { source })?; + + if !response.status().is_success() { + let status = response.status(); + let body = response + .text() + .map(|body| { + if body.is_empty() { + String::new() + } else { + format!(" body: {body}") + } + }) + .unwrap_or_default(); + return Err(MetricsError::SentryUploadFailed { status, body }); + } + + Ok(()) + } +} diff --git a/codex-rs/metrics/src/config.rs b/codex-rs/metrics/src/config.rs new file mode 100644 index 00000000000..1c7f22f5763 --- /dev/null +++ b/codex-rs/metrics/src/config.rs @@ -0,0 +1,56 @@ +use crate::error::Result; +use crate::validation::validate_tag_component; +use std::collections::BTreeMap; +use std::time::Duration; + +const SENTRY_DSN: &str = + "https://ae32ed50620d7a7792c1ce5df38b3e3e@o33249.ingest.us.sentry.io/4510195390611458"; +const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10); + +#[derive(Clone, Debug)] +pub struct MetricsConfig { + pub(crate) dsn: String, + pub(crate) default_tags: BTreeMap, + pub(crate) timeout: Duration, + pub(crate) user_agent: String, +} + +impl MetricsConfig { + /// Create a config with the provided DSN and default settings. + pub fn new(dsn: impl Into) -> Self { + Self { + dsn: dsn.into(), + default_tags: BTreeMap::new(), + timeout: DEFAULT_TIMEOUT, + user_agent: format!("codex-metrics/{}", env!("CARGO_PKG_VERSION")), + } + } + + /// Add a default tag that will be sent with every metric. 
+ pub fn with_tag(mut self, key: impl Into, value: impl Into) -> Result { + let key = key.into(); + let value = value.into(); + validate_tag_component(&key, "tag key")?; + validate_tag_component(&value, "tag value")?; + self.default_tags.insert(key, value); + Ok(self) + } + + /// Override the HTTP timeout. + pub fn with_timeout(mut self, timeout: Duration) -> Self { + self.timeout = timeout; + self + } + + /// Override the user agent string. + pub fn with_user_agent(mut self, user_agent: impl Into) -> Self { + self.user_agent = user_agent.into(); + self + } +} + +impl Default for MetricsConfig { + fn default() -> Self { + Self::new(SENTRY_DSN) + } +} diff --git a/codex-rs/metrics/src/error.rs b/codex-rs/metrics/src/error.rs new file mode 100644 index 00000000000..f20db8e9110 --- /dev/null +++ b/codex-rs/metrics/src/error.rs @@ -0,0 +1,63 @@ +use thiserror::Error; + +pub type Result = std::result::Result; + +#[derive(Debug, Error)] +pub enum MetricsError { + // Buckets. + #[error("histogram buckets cannot be empty")] + EmptyBuckets, + #[error("histogram bucket step must be positive: {step}")] + BucketStepNonPositive { step: i64 }, + #[error("histogram bucket start must be positive: {start}")] + BucketStartNonPositive { start: i64 }, + #[error("histogram bucket factor must be finite and greater than 1: {factor}")] + BucketFactorInvalid { factor: f64 }, + #[error("histogram bucket range must be ascending: {from}..={to}")] + BucketRangeDescending { from: i64, to: i64 }, + #[error("histogram bucket range overflow: {from}..={to} step {step}")] + BucketRangeOverflow { from: i64, to: i64, step: i64 }, + + // Metrics. 
+ #[error("metric name cannot be empty")] + EmptyMetricName, + #[error("metric name contains invalid characters: {name}")] + InvalidMetricName { name: String }, + #[error("{label} cannot be empty")] + EmptyTagComponent { label: String }, + #[error("{label} contains invalid characters: {value}")] + InvalidTagComponent { label: String, value: String }, + + // Client. + #[error("invalid sentry dsn: {dsn}")] + InvalidDsn { + dsn: String, + #[source] + source: sentry::types::ParseDsnError, + }, + #[error("failed to build metrics http client")] + HttpClientBuild { + #[source] + source: reqwest::Error, + }, + #[error("failed to serialize envelope header")] + SerializeEnvelopeHeader { + #[source] + source: serde_json::Error, + }, + #[error("failed to serialize item header")] + SerializeEnvelopeItemHeader { + #[source] + source: serde_json::Error, + }, + #[error("failed to send metrics envelope")] + SendEnvelope { + #[source] + source: reqwest::Error, + }, + #[error("sentry metrics upload failed: {status}{body}")] + SentryUploadFailed { + status: reqwest::StatusCode, + body: String, + }, +} diff --git a/codex-rs/metrics/src/lib.rs b/codex-rs/metrics/src/lib.rs new file mode 100644 index 00000000000..ec80fd88fda --- /dev/null +++ b/codex-rs/metrics/src/lib.rs @@ -0,0 +1,13 @@ +mod batch; +mod client; +mod config; +mod error; +mod statsd; +mod validation; + +pub use crate::batch::HistogramBuckets; +pub use crate::batch::MetricsBatch; +pub use crate::client::MetricsClient; +pub use crate::config::MetricsConfig; +pub use crate::error::MetricsError; +pub use crate::error::Result; diff --git a/codex-rs/metrics/src/statsd.rs b/codex-rs/metrics/src/statsd.rs new file mode 100644 index 00000000000..61f41eab3f8 --- /dev/null +++ b/codex-rs/metrics/src/statsd.rs @@ -0,0 +1,102 @@ +use crate::error::MetricsError; +use crate::error::Result; +use crate::validation::validate_metric_name; +use crate::validation::validate_tag_component; +use sentry::types::Dsn; +use 
std::collections::BTreeMap; + +const STATSD_CONTENT_TYPE: &str = "text/plain"; + +pub(crate) struct StatsdLine { + name: String, + value: i64, + kind: MetricKind, + tags: Vec<(String, String)>, +} + +impl StatsdLine { + pub(crate) fn counter(name: &str, value: i64, tags: Vec<(String, String)>) -> Result { + validate_metric_name(name)?; + Ok(Self { + name: name.to_string(), + value, + kind: MetricKind::Counter, + tags, + }) + } + + pub(crate) fn render(&self, default_tags: &BTreeMap) -> Result { + let tags = merge_tags(default_tags, &self.tags); + let name = self.name.as_str(); + let value = self.value; + let kind = self.kind.as_str(); + let mut line = format!("{name}:{value}|{kind}"); + + if !tags.is_empty() { + let taglist = tags + .iter() + .map(|(key, value)| format!("{key}:{value}")) + .collect::>() + .join(","); + line.push_str("|#"); + line.push_str(&taglist); + } + + Ok(line) + } +} + +enum MetricKind { + Counter, +} + +impl MetricKind { + fn as_str(&self) -> &'static str { + match self { + MetricKind::Counter => "c", + } + } +} + +pub(crate) fn build_statsd_envelope(dsn: &Dsn, payload: &str) -> Result> { + let header = serde_json::json!({ + "dsn": dsn.to_string(), + }); + let mut bytes = Vec::new(); + serde_json::to_writer(&mut bytes, &header) + .map_err(|source| MetricsError::SerializeEnvelopeHeader { source })?; + bytes.push(b'\n'); + + let item_header = serde_json::json!({ + "type": "statsd", + "length": payload.len(), + "content_type": STATSD_CONTENT_TYPE, + }); + serde_json::to_writer(&mut bytes, &item_header) + .map_err(|source| MetricsError::SerializeEnvelopeItemHeader { source })?; + bytes.push(b'\n'); + bytes.extend_from_slice(payload.as_bytes()); + bytes.push(b'\n'); + Ok(bytes) +} + +pub(crate) fn collect_tags(tags: &[(&str, &str)]) -> Result> { + tags.iter() + .map(|(key, value)| { + validate_tag_component(key, "tag key")?; + validate_tag_component(value, "tag value")?; + Ok(((*key).to_string(), (*value).to_string())) + }) + .collect() +} + +fn 
merge_tags( + default_tags: &BTreeMap, + tags: &[(String, String)], +) -> BTreeMap { + let mut merged = default_tags.clone(); + for (key, value) in tags { + merged.insert(key.clone(), value.clone()); + } + merged +} diff --git a/codex-rs/metrics/src/validation.rs b/codex-rs/metrics/src/validation.rs new file mode 100644 index 00000000000..a51185b3f9e --- /dev/null +++ b/codex-rs/metrics/src/validation.rs @@ -0,0 +1,46 @@ +use crate::error::MetricsError; +use crate::error::Result; +use std::collections::BTreeMap; + +pub(crate) fn validate_tags(tags: &BTreeMap) -> Result<()> { + for (key, value) in tags { + validate_tag_component(key, "tag key")?; + validate_tag_component(value, "tag value")?; + } + Ok(()) +} + +pub(crate) fn validate_metric_name(name: &str) -> Result<()> { + if name.is_empty() { + return Err(MetricsError::EmptyMetricName); + } + if !name.chars().all(is_metric_char) { + return Err(MetricsError::InvalidMetricName { + name: name.to_string(), + }); + } + Ok(()) +} + +pub(crate) fn validate_tag_component(value: &str, label: &str) -> Result<()> { + if value.is_empty() { + return Err(MetricsError::EmptyTagComponent { + label: label.to_string(), + }); + } + if !value.chars().all(is_tag_char) { + return Err(MetricsError::InvalidTagComponent { + label: label.to_string(), + value: value.to_string(), + }); + } + Ok(()) +} + +fn is_metric_char(c: char) -> bool { + c.is_ascii_alphanumeric() || matches!(c, '.' | '_' | '-') +} + +fn is_tag_char(c: char) -> bool { + c.is_ascii_alphanumeric() || matches!(c, '.' 
| '_' | '-' | '/') +} diff --git a/codex-rs/metrics/tests/tests.rs b/codex-rs/metrics/tests/tests.rs new file mode 100644 index 00000000000..e3e5f5067fa --- /dev/null +++ b/codex-rs/metrics/tests/tests.rs @@ -0,0 +1,338 @@ +use codex_metrics::HistogramBuckets; +use codex_metrics::MetricsBatch; +use codex_metrics::MetricsClient; +use codex_metrics::MetricsConfig; +use codex_metrics::MetricsError; +use codex_metrics::Result; +use pretty_assertions::assert_eq; +use serde_json::Value; +use std::collections::BTreeMap; +use std::io::Read; +use std::io::Write; +use std::net::TcpListener; +use std::net::TcpStream; +use std::thread; +use std::time::Duration; + +#[derive(Debug)] +struct CapturedRequest { + method: String, + path: String, + headers: BTreeMap, + body: Vec, +} + +#[derive(Debug)] +struct ParsedEnvelope { + header: Value, + item_header: Value, + payload: String, +} + +#[derive(Debug)] +struct ParsedStatsdLine { + name: String, + value: i64, + kind: String, + tags: BTreeMap, +} + +fn spawn_server(status: u16) -> (String, thread::JoinHandle) { + let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server"); + let addr = listener.local_addr().expect("local addr"); + let dsn = format!("http://public:@{addr}/123"); + + let handle = thread::spawn(move || { + let (mut stream, _) = listener.accept().expect("accept connection"); + let request = read_http_request(&mut stream); + let reason = match status { + 200 => "OK", + 500 => "Internal Server Error", + _ => "OK", + }; + let response = + format!("HTTP/1.1 {status} {reason}\r\nContent-Length: 0\r\nConnection: close\r\n\r\n"); + stream + .write_all(response.as_bytes()) + .expect("write response"); + request + }); + + (dsn, handle) +} + +fn read_http_request(stream: &mut TcpStream) -> CapturedRequest { + let mut buffer = Vec::new(); + let mut chunk = [0_u8; 1024]; + let mut header_end = None; + while header_end.is_none() { + let read = stream.read(&mut chunk).expect("read request"); + if read == 0 { + break; 
+ } + buffer.extend_from_slice(&chunk[..read]); + header_end = find_header_end(&buffer); + } + let header_end = header_end.expect("request headers"); + let headers_bytes = &buffer[..header_end]; + let headers_str = std::str::from_utf8(headers_bytes).expect("headers utf-8"); + let mut lines = headers_str.split("\r\n"); + let request_line = lines.next().expect("request line"); + let mut request_parts = request_line.split_whitespace(); + let method = request_parts.next().expect("method").to_string(); + let path = request_parts.next().expect("path").to_string(); + + let mut headers = BTreeMap::new(); + for line in lines { + if line.is_empty() { + continue; + } + if let Some((key, value)) = line.split_once(':') { + headers.insert(key.trim().to_ascii_lowercase(), value.trim().to_string()); + } + } + + let content_length = headers + .get("content-length") + .and_then(|value| value.parse::().ok()) + .unwrap_or(0); + let mut body = buffer[header_end..].to_vec(); + while body.len() < content_length { + let read = stream.read(&mut chunk).expect("read body"); + if read == 0 { + break; + } + body.extend_from_slice(&chunk[..read]); + } + + CapturedRequest { + method, + path, + headers, + body, + } +} + +fn find_header_end(buffer: &[u8]) -> Option { + buffer + .windows(4) + .position(|window| window == b"\r\n\r\n") + .map(|pos| pos + 4) +} + +fn parse_envelope(body: &[u8]) -> ParsedEnvelope { + let mut parts = body.splitn(3, |byte| *byte == b'\n'); + let header_line = parts.next().expect("envelope header"); + let item_header_line = parts.next().expect("item header"); + let payload = parts.next().unwrap_or(&[]); + + let header = serde_json::from_slice(header_line).expect("parse envelope header"); + let item_header = serde_json::from_slice(item_header_line).expect("parse item header"); + let payload = std::str::from_utf8(payload) + .expect("payload utf-8") + .trim_end_matches('\n') + .to_string(); + + ParsedEnvelope { + header, + item_header, + payload, + } +} + +fn 
parse_statsd_line(line: &str) -> ParsedStatsdLine { + let (metric, tags_part) = line + .split_once("|#") + .map(|(metric, tags)| (metric, Some(tags))) + .unwrap_or((line, None)); + let (name_value, kind) = metric.split_once('|').expect("metric kind"); + let (name, value) = name_value.split_once(':').expect("metric value"); + let value = value.parse::().expect("metric value parse"); + + let mut tags = BTreeMap::new(); + if let Some(tags_part) = tags_part + && !tags_part.is_empty() + { + for tag in tags_part.split(',') { + let (key, value) = tag.split_once(':').expect("tag"); + tags.insert(key.to_string(), value.to_string()); + } + } + + ParsedStatsdLine { + name: name.to_string(), + value, + kind: kind.to_string(), + tags, + } +} + +#[test] +fn send_builds_payload_with_tags_and_histograms() -> Result<()> { + let (dsn, handle) = spawn_server(200); + let metrics = MetricsClient::new( + MetricsConfig::new(dsn.clone()) + .with_tag("service", "codex-cli")? + .with_tag("env", "prod")?, + )?; + let buckets = HistogramBuckets::from_values(&[25, 50, 100])?; + + let mut batch = metrics.batch(); + batch.counter("codex.turns", 1, &[("model", "gpt-5.1"), ("env", "dev")])?; + batch.histogram("codex.tool_latency", 25, &buckets, &[("tool", "shell")])?; + metrics.send(batch)?; + + let captured = handle.join().expect("server thread"); + assert_eq!(captured.method, "POST"); + assert_eq!(captured.path, "/api/123/envelope/"); + assert_eq!( + captured.headers.get("content-type").map(String::as_str), + Some("application/x-sentry-envelope") + ); + + let envelope = parse_envelope(&captured.body); + assert_eq!(envelope.header["dsn"].as_str(), Some(dsn.as_str())); + assert_eq!(envelope.item_header["type"], "statsd"); + assert_eq!(envelope.item_header["content_type"], "text/plain"); + assert_eq!( + envelope.item_header["length"].as_u64(), + Some(envelope.payload.len() as u64) + ); + + let lines: Vec<&str> = envelope.payload.split('\n').collect(); + assert_eq!(lines.len(), 4); + + let line = 
parse_statsd_line(lines[0]); + assert_eq!(line.name, "codex.turns"); + assert_eq!(line.value, 1); + assert_eq!(line.kind, "c"); + assert_eq!( + line.tags.get("service").map(String::as_str), + Some("codex-cli") + ); + assert_eq!(line.tags.get("env").map(String::as_str), Some("dev")); + assert_eq!(line.tags.get("model").map(String::as_str), Some("gpt-5.1")); + + for (line, expected_le) in lines.iter().skip(1).zip(["25", "50", "100"]) { + let line = parse_statsd_line(line); + assert_eq!(line.name, "codex.tool_latency"); + assert_eq!(line.value, 1); + assert_eq!(line.kind, "c"); + assert_eq!( + line.tags.get("service").map(String::as_str), + Some("codex-cli") + ); + assert_eq!(line.tags.get("env").map(String::as_str), Some("prod")); + assert_eq!(line.tags.get("tool").map(String::as_str), Some("shell")); + assert_eq!(line.tags.get("le").map(String::as_str), Some(expected_le)); + } + + Ok(()) +} + +#[test] +fn send_uses_inf_bucket_for_values_over_max() -> Result<()> { + let (dsn, handle) = spawn_server(200); + let metrics = MetricsClient::new(MetricsConfig::new(dsn))?; + let buckets = HistogramBuckets::from_values(&[10, 20])?; + + let mut batch = metrics.batch(); + batch.histogram("codex.tool_latency", 99, &buckets, &[("tool", "shell")])?; + metrics.send(batch)?; + + let captured = handle.join().expect("server thread"); + let envelope = parse_envelope(&captured.body); + let lines: Vec<&str> = envelope.payload.split('\n').collect(); + assert_eq!(lines.len(), 1); + let line = parse_statsd_line(lines[0]); + assert_eq!(line.tags.get("le").map(String::as_str), Some("inf")); + + Ok(()) +} + +#[test] +fn send_reports_non_success_status() -> Result<()> { + let (dsn, handle) = spawn_server(500); + let metrics = MetricsClient::new(MetricsConfig::new(dsn))?; + + let mut batch = metrics.batch(); + batch.counter("codex.turns", 1, &[])?; + let err = metrics.send(batch).unwrap_err(); + assert!(matches!( + err, + MetricsError::SentryUploadFailed { status, .. 
} if status.as_u16() == 500 + )); + + let _ = handle.join().expect("server thread"); + Ok(()) +} + +#[test] +fn invalid_dsn_reports_error() -> Result<()> { + let err = MetricsClient::new(MetricsConfig::new("not a dsn")).unwrap_err(); + assert!(matches!(err, MetricsError::InvalidDsn { .. })); + Ok(()) +} + +#[test] +fn send_is_noop_when_batch_empty() -> Result<()> { + let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server"); + listener.set_nonblocking(true).expect("set nonblocking"); + let addr = listener.local_addr().expect("local addr"); + let dsn = format!("http://public:@{addr}/123"); + let metrics = MetricsClient::new(MetricsConfig::new(dsn))?; + + metrics.send(metrics.batch())?; + + let mut saw_connection = false; + for _ in 0..10 { + match listener.accept() { + Ok(_) => { + saw_connection = true; + break; + } + Err(err) if err.kind() == std::io::ErrorKind::WouldBlock => { + thread::sleep(Duration::from_millis(10)); + } + Err(err) => panic!("unexpected accept error: {err}"), + } + } + assert!(!saw_connection, "expected no request for empty batch"); + Ok(()) +} + +#[test] +fn invalid_tag_component_is_rejected() -> Result<()> { + let err = MetricsConfig::default() + .with_tag("bad key", "value") + .unwrap_err(); + assert!(matches!( + err, + MetricsError::InvalidTagComponent { label, value } + if label == "tag key" && value == "bad key" + )); + Ok(()) +} + +#[test] +fn counter_rejects_invalid_metric_name() -> Result<()> { + let mut batch = MetricsBatch::new(); + let err = batch.counter("bad name", 1, &[]).unwrap_err(); + assert!(matches!( + err, + MetricsError::InvalidMetricName { name } if name == "bad name" + )); + Ok(()) +} + +#[test] +fn empty_buckets_are_rejected() { + let err = HistogramBuckets::from_values(&[]).unwrap_err(); + assert!(matches!(err, MetricsError::EmptyBuckets)); +} + +#[test] +fn range_overflow_is_reported() { + let err = HistogramBuckets::from_range(i64::MAX - 1, i64::MAX, 2).unwrap_err(); + assert!(matches!(err, 
MetricsError::BucketRangeOverflow { .. })); +} From 358079d7d8a5b0d7932357345fcf306f3ac1c272 Mon Sep 17 00:00:00 2001 From: jif-oai Date: Fri, 19 Dec 2025 14:01:52 +0100 Subject: [PATCH 02/43] V2 --- codex-rs/Cargo.lock | 1 + codex-rs/metrics/Cargo.toml | 1 + codex-rs/metrics/README.md | 52 ++++++- codex-rs/metrics/src/batch.rs | 9 ++ codex-rs/metrics/src/client.rs | 244 ++++++++++++++++++++++++++++---- codex-rs/metrics/src/error.rs | 10 ++ codex-rs/metrics/src/lib.rs | 2 + codex-rs/metrics/src/time.rs | 7 + codex-rs/metrics/src/util.rs | 9 ++ codex-rs/metrics/tests/tests.rs | 146 +++++++++++++++++-- docs/telemetry.md | 48 +++++++ 11 files changed, 492 insertions(+), 37 deletions(-) create mode 100644 codex-rs/metrics/src/time.rs create mode 100644 codex-rs/metrics/src/util.rs create mode 100644 docs/telemetry.md diff --git a/codex-rs/Cargo.lock b/codex-rs/Cargo.lock index d83fee60fbb..73f7fd62dbe 100644 --- a/codex-rs/Cargo.lock +++ b/codex-rs/Cargo.lock @@ -1570,6 +1570,7 @@ dependencies = [ "sentry", "serde_json", "thiserror 2.0.17", + "tracing", ] [[package]] diff --git a/codex-rs/metrics/Cargo.toml b/codex-rs/metrics/Cargo.toml index 7e070ba80cc..23c90dbb6aa 100644 --- a/codex-rs/metrics/Cargo.toml +++ b/codex-rs/metrics/Cargo.toml @@ -10,3 +10,4 @@ reqwest = { workspace = true, features = ["blocking"] } sentry = { workspace = true } serde_json = { workspace = true } thiserror = { workspace = true } +tracing = { workspace = true } diff --git a/codex-rs/metrics/README.md b/codex-rs/metrics/README.md index 44bc6cabb49..860160b91f5 100644 --- a/codex-rs/metrics/README.md +++ b/codex-rs/metrics/README.md @@ -4,7 +4,7 @@ Send lightweight counters and histogram buckets to Sentry via the statsd envelop ## Overview -- Blocking, minimal client designed for CLI and service use. +- Non-blocking client that enqueues metrics to a background worker. - Counters and histograms only (histograms are encoded as bucketed counters). 
- Tag validation and metric name validation are enforced before send. @@ -48,6 +48,8 @@ let metrics = MetricsClient::new(MetricsConfig::default())?; - `with_tag(key, value)` to add default tags. - `with_timeout(duration)` to override the HTTP timeout (default 10s). - `with_user_agent(agent)` to override the user agent. +Use `MetricsClient::with_capacity(config, capacity)` to override the default +queue capacity. ## Buckets @@ -80,6 +82,39 @@ bound that is greater than or equal to the value (or `inf` if none match): metrics.histogram("codex.request_latency", 83, &buckets, &[("route", "chat")])?; ``` +`counter`, `histogram`, and `send` enqueue metrics for the background worker. +Call `shutdown` to flush queued metrics on exit. + +## Timing + +Measure a closure and emit a histogram sample for the elapsed time in milliseconds: + +```rust +let result = metrics.time("codex.request_latency", &buckets, &[("route", "chat")], || { + "ok" +})?; +``` + +If the closure already returns `codex_metrics::Result`, use `time_result` to +avoid nested results: + +```rust +let result = metrics.time_result("codex.request_latency", &buckets, &[("route", "chat")], || { + Ok("ok") +})?; +``` + +If you already have a duration, record it directly: + +```rust +metrics.record_duration( + "codex.request_latency", + std::time::Duration::from_millis(83), + &buckets, + &[("route", "chat")], +)?; +``` + ## Batching Batching reduces network requests. Build a batch and send it once: @@ -91,6 +126,21 @@ batch.histogram("codex.tool_latency", 140, &buckets, &[("tool", "shell")])?; metrics.send(batch)?; ``` +## Shutdown and queue capacity + +The client uses a bounded queue (default capacity 1024). Enqueueing returns a +`MetricsError::QueueFull` error if the queue is full or `MetricsError::WorkerUnavailable` +if the worker is no longer running. + +`shutdown` waits up to 500ms for the worker to stop. Use `shutdown_with_timeout` +to override the timeout. 
+ +Uploads are best-effort; if the worker encounters a send error, the metric is +dropped. + +`MetricsClient` also attempts a best-effort shutdown on drop using the default +timeout, so explicit calls to `shutdown` are optional. + ## Validation rules Metric names: diff --git a/codex-rs/metrics/src/batch.rs b/codex-rs/metrics/src/batch.rs index bcfec6898bd..dfde45b69b2 100644 --- a/codex-rs/metrics/src/batch.rs +++ b/codex-rs/metrics/src/batch.rs @@ -167,6 +167,7 @@ mod tests { use pretty_assertions::assert_eq; #[test] + // Verifies linear bucket construction over a clean step range. fn from_range_builds_linear_buckets() -> Result<()> { let buckets = HistogramBuckets::from_range(25, 100, 25)?; let expected = HistogramBuckets::from_values(&[25, 50, 75, 100])?; @@ -175,6 +176,7 @@ mod tests { } #[test] + // Ensures uneven steps still include the final upper bound. fn from_range_includes_upper_bound_when_step_is_uneven() -> Result<()> { let buckets = HistogramBuckets::from_range(10, 95, 30)?; let expected = HistogramBuckets::from_values(&[10, 40, 70, 95])?; @@ -183,6 +185,7 @@ mod tests { } #[test] + // Confirms a single-value range produces one bucket. fn from_range_accepts_single_value_range() -> Result<()> { let buckets = HistogramBuckets::from_range(42, 42, 5)?; let expected = HistogramBuckets::from_values(&[42])?; @@ -191,12 +194,14 @@ mod tests { } #[test] + // Rejects a non-positive step to avoid invalid ranges. fn from_range_rejects_non_positive_step() { let err = HistogramBuckets::from_range(0, 10, 0).unwrap_err(); assert_eq!(err.to_string(), "histogram bucket step must be positive: 0"); } #[test] + // Rejects descending ranges to prevent inverted buckets. fn from_range_rejects_descending_range() { let err = HistogramBuckets::from_range(10, 0, 1).unwrap_err(); assert_eq!( @@ -206,6 +211,7 @@ mod tests { } #[test] + // Verifies exponential buckets grow and include the upper bound. 
fn from_exponential_builds_buckets() -> Result<()> { let buckets = HistogramBuckets::from_exponential(10, 100, 2.0)?; let expected = HistogramBuckets::from_values(&[10, 20, 40, 80, 100])?; @@ -214,6 +220,7 @@ mod tests { } #[test] + // Ensures exponential buckets always include the final bound. fn from_exponential_includes_upper_bound() -> Result<()> { let buckets = HistogramBuckets::from_exponential(30, 100, 3.0)?; let expected = HistogramBuckets::from_values(&[30, 90, 100])?; @@ -222,6 +229,7 @@ mod tests { } #[test] + // Rejects non-positive starts because exponential growth requires > 0. fn from_exponential_rejects_non_positive_start() { let err = HistogramBuckets::from_exponential(0, 10, 2.0).unwrap_err(); assert!(matches!( @@ -231,6 +239,7 @@ mod tests { } #[test] + // Rejects invalid exponential factors (non-finite or <= 1). fn from_exponential_rejects_invalid_factor() { let err = HistogramBuckets::from_exponential(1, 10, 1.0).unwrap_err(); assert!(matches!( diff --git a/codex-rs/metrics/src/client.rs b/codex-rs/metrics/src/client.rs index 952265aa636..c9a126a1b46 100644 --- a/codex-rs/metrics/src/client.rs +++ b/codex-rs/metrics/src/client.rs @@ -4,23 +4,100 @@ use crate::config::MetricsConfig; use crate::error::MetricsError; use crate::error::Result; use crate::statsd::build_statsd_envelope; +use crate::time::duration_to_millis; +use crate::util::error_or_panic; use crate::validation::validate_tags; use sentry::types::Dsn; use std::collections::BTreeMap; +use std::sync::Arc; +use std::sync::Mutex; +use std::sync::mpsc; +use std::sync::mpsc::Receiver; +use std::sync::mpsc::SyncSender; +use std::sync::mpsc::TrySendError; +use std::thread; +use std::time::Duration; +use std::time::Instant; const ENVELOPE_CONTENT_TYPE: &str = "application/x-sentry-envelope"; +const DEFAULT_QUEUE_CAPACITY: usize = 1024; +const DEFAULT_SHUTDOWN_TIMEOUT: Duration = Duration::from_millis(500); +const SHUTDOWN_POLL_INTERVAL: Duration = Duration::from_millis(10); + +enum 
WorkerMessage { + Batch(MetricsBatch), + Shutdown, +} + +struct WorkerState { + sender: Mutex>>, + handle: Mutex>>, + capacity: usize, +} #[derive(Debug)] -pub struct MetricsClient { +struct ClientCore { dsn: Dsn, http: reqwest::blocking::Client, auth_header: String, default_tags: BTreeMap, } +impl ClientCore { + fn send(&self, batch: MetricsBatch) -> Result<()> { + if batch.is_empty() { + return Ok(()); + } + + let payload = batch.render(&self.default_tags)?; + let envelope = build_statsd_envelope(&self.dsn, &payload)?; + + let response = self + .http + .post(self.dsn.envelope_api_url()) + .header("X-Sentry-Auth", &self.auth_header) + .header("Content-Type", ENVELOPE_CONTENT_TYPE) + .body(envelope) + .send() + .map_err(|source| MetricsError::SendEnvelope { source })?; + + if !response.status().is_success() { + let status = response.status(); + let body = response + .text() + .map(|body| { + if body.is_empty() { + String::new() + } else { + format!(" body: {body}") + } + }) + .unwrap_or_default(); + return Err(MetricsError::SentryUploadFailed { status, body }); + } + + Ok(()) + } +} + +/// Background metrics client that enqueues metrics to a dedicated worker thread. +#[derive(Clone)] +pub struct MetricsClient { + state: Arc, +} + impl MetricsClient { /// Build a metrics client from configuration and validate defaults. pub fn new(config: MetricsConfig) -> Result { + Self::with_capacity(config, DEFAULT_QUEUE_CAPACITY) + } + + /// Build a metrics client with a bounded queue capacity. 
+ pub fn with_capacity(config: MetricsConfig, capacity: usize) -> Result { + if capacity == 0 { + return Err(MetricsError::QueueCapacityZero); + } + let dsn_value = config.dsn.clone(); let dsn = dsn_value .parse::() @@ -38,15 +115,26 @@ impl MetricsClient { let auth_header = dsn.to_auth(Some(&config.user_agent)).to_string(); - Ok(Self { + let core = ClientCore { dsn, http, auth_header, default_tags: config.default_tags, + }; + + let (sender, receiver) = mpsc::sync_channel(capacity); + let handle = thread::spawn(move || run_worker(core, receiver)); + + Ok(Self { + state: Arc::new(WorkerState { + sender: Mutex::new(Some(sender)), + handle: Mutex::new(Some(handle)), + capacity, + }), }) } - /// Send a single counter increment. + /// Send a single counter increment without blocking the caller. pub fn counter(&self, name: &str, inc: i64, tags: &[(&str, &str)]) -> Result<()> { let mut batch = MetricsBatch::new(); batch.counter(name, inc, tags)?; @@ -66,44 +154,146 @@ impl MetricsClient { self.send(batch) } + /// Record a duration in milliseconds using histogram buckets. + pub fn record_duration( + &self, + name: &str, + duration: Duration, + buckets: &HistogramBuckets, + tags: &[(&str, &str)], + ) -> Result<()> { + let millis = duration_to_millis(duration); + self.histogram(name, millis, buckets, tags) + } + + /// Measure a closure and emit a histogram sample for the elapsed time. + pub fn time( + &self, + name: &str, + buckets: &HistogramBuckets, + tags: &[(&str, &str)], + f: impl FnOnce() -> T, + ) -> Result { + let start = Instant::now(); + let output = f(); + self.record_duration(name, start.elapsed(), buckets, tags)?; + Ok(output) + } + + /// Measure a closure that returns a metrics result without nesting results. 
+ pub fn time_result( + &self, + name: &str, + buckets: &HistogramBuckets, + tags: &[(&str, &str)], + f: impl FnOnce() -> Result, + ) -> Result { + let start = Instant::now(); + let output = f(); + match output { + Ok(value) => { + self.record_duration(name, start.elapsed(), buckets, tags)?; + Ok(value) + } + Err(err) => { + let _ = self.record_duration(name, start.elapsed(), buckets, tags); + Err(err) + } + } + } + /// Create an empty batch for multi-metric sends. pub fn batch(&self) -> MetricsBatch { MetricsBatch::new() } - /// Send a batch of metrics to Sentry (no-op if the batch is empty). + /// Enqueue a batch of metrics for the worker to send. pub fn send(&self, batch: MetricsBatch) -> Result<()> { if batch.is_empty() { return Ok(()); } - let payload = batch.render(&self.default_tags)?; - let envelope = build_statsd_envelope(&self.dsn, &payload)?; + let sender = self + .state + .sender + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let Some(sender) = sender.as_ref() else { + return Err(MetricsError::WorkerUnavailable); + }; - let response = self - .http - .post(self.dsn.envelope_api_url()) - .header("X-Sentry-Auth", &self.auth_header) - .header("Content-Type", ENVELOPE_CONTENT_TYPE) - .body(envelope) - .send() - .map_err(|source| MetricsError::SendEnvelope { source })?; + match sender.try_send(WorkerMessage::Batch(batch)) { + Ok(()) => Ok(()), + Err(TrySendError::Full(_)) => Err(MetricsError::QueueFull { + capacity: self.state.capacity, + }), + Err(TrySendError::Disconnected(_)) => Err(MetricsError::WorkerUnavailable), + } + } - if !response.status().is_success() { - let status = response.status(); - let body = response - .text() - .map(|body| { - if body.is_empty() { - String::new() - } else { - format!(" body: {body}") - } - }) - .unwrap_or_default(); - return Err(MetricsError::SentryUploadFailed { status, body }); + /// Flush queued metrics and stop the worker thread. 
+ pub fn shutdown(&self) -> Result<()> { + self.shutdown_with_timeout(DEFAULT_SHUTDOWN_TIMEOUT) + } + + /// Flush queued metrics and stop the worker thread, waiting up to `timeout`. + pub fn shutdown_with_timeout(&self, timeout: Duration) -> Result<()> { + let sender = self + .state + .sender + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner) + .take(); + let mut handle = self + .state + .handle + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let Some(handle) = handle.take() else { + return Ok(()); + }; + + if let Some(sender) = sender { + let _ = sender.try_send(WorkerMessage::Shutdown); + } + + if timeout.is_zero() { + if handle.is_finished() { + handle.join().map_err(|_| MetricsError::WorkerPanicked)?; + } + return Ok(()); + } + + let start = Instant::now(); + while start.elapsed() < timeout { + if handle.is_finished() { + handle.join().map_err(|_| MetricsError::WorkerPanicked)?; + return Ok(()); + } + thread::sleep(SHUTDOWN_POLL_INTERVAL); } Ok(()) } } + +impl Drop for MetricsClient { + fn drop(&mut self) { + if Arc::strong_count(&self.state) == 1 { + let _ = self.shutdown_with_timeout(DEFAULT_SHUTDOWN_TIMEOUT); + } + } +} + +fn run_worker(client: ClientCore, receiver: Receiver) { + while let Ok(message) = receiver.recv() { + match message { + WorkerMessage::Batch(batch) => { + if let Err(err) = client.send(batch) { + error_or_panic(format!("metrics send failed: {err}")); + } + } + WorkerMessage::Shutdown => break, + } + } +} diff --git a/codex-rs/metrics/src/error.rs b/codex-rs/metrics/src/error.rs index f20db8e9110..44117af0ab0 100644 --- a/codex-rs/metrics/src/error.rs +++ b/codex-rs/metrics/src/error.rs @@ -60,4 +60,14 @@ pub enum MetricsError { status: reqwest::StatusCode, body: String, }, + + // Worker. 
+ #[error("metrics queue capacity must be positive")] + QueueCapacityZero, + #[error("metrics queue is full (capacity {capacity})")] + QueueFull { capacity: usize }, + #[error("metrics worker is unavailable")] + WorkerUnavailable, + #[error("metrics worker thread panicked")] + WorkerPanicked, } diff --git a/codex-rs/metrics/src/lib.rs b/codex-rs/metrics/src/lib.rs index ec80fd88fda..7f188b458de 100644 --- a/codex-rs/metrics/src/lib.rs +++ b/codex-rs/metrics/src/lib.rs @@ -3,6 +3,8 @@ mod client; mod config; mod error; mod statsd; +mod time; +mod util; mod validation; pub use crate::batch::HistogramBuckets; diff --git a/codex-rs/metrics/src/time.rs b/codex-rs/metrics/src/time.rs new file mode 100644 index 00000000000..d68d76fa4e7 --- /dev/null +++ b/codex-rs/metrics/src/time.rs @@ -0,0 +1,7 @@ +use std::time::Duration; + +pub(crate) fn duration_to_millis(duration: Duration) -> i64 { + let millis = duration.as_millis(); + let capped = millis.min(i64::MAX as u128); + capped as i64 +} diff --git a/codex-rs/metrics/src/util.rs b/codex-rs/metrics/src/util.rs new file mode 100644 index 00000000000..01c18894b9d --- /dev/null +++ b/codex-rs/metrics/src/util.rs @@ -0,0 +1,9 @@ +use tracing::error; + +pub(crate) fn error_or_panic(message: impl ToString) { + if cfg!(debug_assertions) || env!("CARGO_PKG_VERSION").contains("alpha") { + panic!("{}", message.to_string()); + } else { + error!("{}", message.to_string()); + } +} diff --git a/codex-rs/metrics/tests/tests.rs b/codex-rs/metrics/tests/tests.rs index e3e5f5067fa..055987a11c5 100644 --- a/codex-rs/metrics/tests/tests.rs +++ b/codex-rs/metrics/tests/tests.rs @@ -167,6 +167,7 @@ fn parse_statsd_line(line: &str) -> ParsedStatsdLine { } } +// Ensures counters/histograms render with default + per-call tags. 
#[test] fn send_builds_payload_with_tags_and_histograms() -> Result<()> { let (dsn, handle) = spawn_server(200); @@ -181,6 +182,7 @@ fn send_builds_payload_with_tags_and_histograms() -> Result<()> { batch.counter("codex.turns", 1, &[("model", "gpt-5.1"), ("env", "dev")])?; batch.histogram("codex.tool_latency", 25, &buckets, &[("tool", "shell")])?; metrics.send(batch)?; + metrics.shutdown()?; let captured = handle.join().expect("server thread"); assert_eq!(captured.method, "POST"); @@ -230,6 +232,7 @@ fn send_builds_payload_with_tags_and_histograms() -> Result<()> { Ok(()) } +// Verifies values above the max bucket use the inf tag. #[test] fn send_uses_inf_bucket_for_values_over_max() -> Result<()> { let (dsn, handle) = spawn_server(200); @@ -239,6 +242,7 @@ fn send_uses_inf_bucket_for_values_over_max() -> Result<()> { let mut batch = metrics.batch(); batch.histogram("codex.tool_latency", 99, &buckets, &[("tool", "shell")])?; metrics.send(batch)?; + metrics.shutdown()?; let captured = handle.join().expect("server thread"); let envelope = parse_envelope(&captured.body); @@ -250,30 +254,149 @@ fn send_uses_inf_bucket_for_values_over_max() -> Result<()> { Ok(()) } +// Ensures duration recording maps to the expected bucket tag. 
#[test] -fn send_reports_non_success_status() -> Result<()> { +fn record_duration_uses_matching_bucket() -> Result<()> { + let (dsn, handle) = spawn_server(200); + let metrics = MetricsClient::new(MetricsConfig::new(dsn))?; + let buckets = HistogramBuckets::from_values(&[10, 20])?; + + metrics.record_duration( + "codex.request_latency", + Duration::from_millis(15), + &buckets, + &[("route", "chat")], + )?; + metrics.shutdown()?; + + let captured = handle.join().expect("server thread"); + let envelope = parse_envelope(&captured.body); + let lines: Vec<&str> = envelope.payload.split('\n').collect(); + assert_eq!(lines.len(), 1); + + let line = parse_statsd_line(lines[0]); + assert_eq!(line.name, "codex.request_latency"); + assert_eq!(line.tags.get("route").map(String::as_str), Some("chat")); + assert_eq!(line.tags.get("le").map(String::as_str), Some("20")); + + Ok(()) +} + +// Ensures time_result returns the closure output and records timing. +#[test] +fn time_result_records_success() -> Result<()> { + let (dsn, handle) = spawn_server(200); + let metrics = MetricsClient::new(MetricsConfig::new(dsn))?; + let buckets = HistogramBuckets::from_values(&[10, 20])?; + + let value = metrics.time_result( + "codex.request_latency", + &buckets, + &[("route", "chat")], + || Ok("ok"), + )?; + assert_eq!(value, "ok"); + metrics.shutdown()?; + + let captured = handle.join().expect("server thread"); + let envelope = parse_envelope(&captured.body); + let lines: Vec<&str> = envelope.payload.split('\n').collect(); + assert!(!lines.is_empty()); + for line in lines { + let line = parse_statsd_line(line); + assert_eq!(line.name, "codex.request_latency"); + assert_eq!(line.tags.get("route").map(String::as_str), Some("chat")); + assert!(line.tags.contains_key("le")); + } + + Ok(()) +} + +// Ensures time_result propagates errors but still records timing. 
+#[test] +fn time_result_records_on_error() -> Result<()> { + let (dsn, handle) = spawn_server(200); + let metrics = MetricsClient::new(MetricsConfig::new(dsn))?; + let buckets = HistogramBuckets::from_values(&[10, 20])?; + + let err = metrics + .time_result( + "codex.request_latency", + &buckets, + &[("route", "chat")], + || -> Result<&'static str> { Err(MetricsError::EmptyMetricName) }, + ) + .unwrap_err(); + assert!(matches!(err, MetricsError::EmptyMetricName)); + metrics.shutdown()?; + + let captured = handle.join().expect("server thread"); + let envelope = parse_envelope(&captured.body); + let lines: Vec<&str> = envelope.payload.split('\n').collect(); + assert!(!lines.is_empty()); + for line in lines { + let line = parse_statsd_line(line); + assert_eq!(line.name, "codex.request_latency"); + assert_eq!(line.tags.get("route").map(String::as_str), Some("chat")); + assert!(line.tags.contains_key("le")); + } + + Ok(()) +} + +// Verifies enqueued batches are delivered by the background worker. +#[test] +fn client_sends_enqueued_batch() -> Result<()> { + let (dsn, handle) = spawn_server(200); + let metrics = MetricsClient::with_capacity(MetricsConfig::new(dsn), 8)?; + + let mut batch = metrics.batch(); + batch.counter("codex.turns", 1, &[("model", "gpt-5.1")])?; + metrics.send(batch)?; + metrics.shutdown()?; + + let captured = handle.join().expect("server thread"); + let envelope = parse_envelope(&captured.body); + let lines: Vec<&str> = envelope.payload.split('\n').collect(); + assert_eq!(lines.len(), 1); + + let line = parse_statsd_line(lines[0]); + assert_eq!(line.name, "codex.turns"); + assert_eq!(line.value, 1); + assert_eq!(line.kind, "c"); + assert_eq!(line.tags.get("model").map(String::as_str), Some("gpt-5.1")); + + Ok(()) +} + +// Ensures a non-success response panics in debug builds via error_or_panic. 
+#[test] +fn send_panics_on_non_success_status_in_debug() -> Result<()> { let (dsn, handle) = spawn_server(500); let metrics = MetricsClient::new(MetricsConfig::new(dsn))?; let mut batch = metrics.batch(); batch.counter("codex.turns", 1, &[])?; - let err = metrics.send(batch).unwrap_err(); - assert!(matches!( - err, - MetricsError::SentryUploadFailed { status, .. } if status.as_u16() == 500 - )); + metrics.send(batch)?; + let err = metrics.shutdown().unwrap_err(); + assert!(matches!(err, MetricsError::WorkerPanicked)); - let _ = handle.join().expect("server thread"); + let captured = handle.join().expect("server thread"); + assert_eq!(captured.method, "POST"); Ok(()) } +// Validates invalid DSNs are rejected early. #[test] fn invalid_dsn_reports_error() -> Result<()> { - let err = MetricsClient::new(MetricsConfig::new("not a dsn")).unwrap_err(); - assert!(matches!(err, MetricsError::InvalidDsn { .. })); + assert!(matches!( + MetricsClient::new(MetricsConfig::new("not a dsn")), + Err(MetricsError::InvalidDsn { .. }) + )); Ok(()) } +// Ensures empty batches do not trigger any HTTP request. #[test] fn send_is_noop_when_batch_empty() -> Result<()> { let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server"); @@ -283,6 +406,7 @@ fn send_is_noop_when_batch_empty() -> Result<()> { let metrics = MetricsClient::new(MetricsConfig::new(dsn))?; metrics.send(metrics.batch())?; + metrics.shutdown()?; let mut saw_connection = false; for _ in 0..10 { @@ -301,6 +425,7 @@ fn send_is_noop_when_batch_empty() -> Result<()> { Ok(()) } +// Ensures invalid tag components are rejected during config build. #[test] fn invalid_tag_component_is_rejected() -> Result<()> { let err = MetricsConfig::default() @@ -314,6 +439,7 @@ fn invalid_tag_component_is_rejected() -> Result<()> { Ok(()) } +// Ensures invalid metric names are rejected when building a batch. 
#[test] fn counter_rejects_invalid_metric_name() -> Result<()> { let mut batch = MetricsBatch::new(); @@ -325,12 +451,14 @@ fn counter_rejects_invalid_metric_name() -> Result<()> { Ok(()) } +// Ensures empty histogram bucket lists are rejected. #[test] fn empty_buckets_are_rejected() { let err = HistogramBuckets::from_values(&[]).unwrap_err(); assert!(matches!(err, MetricsError::EmptyBuckets)); } +// Ensures range overflow is detected when building buckets. #[test] fn range_overflow_is_reported() { let err = HistogramBuckets::from_range(i64::MAX - 1, i64::MAX, 2).unwrap_err(); diff --git a/docs/telemetry.md b/docs/telemetry.md new file mode 100644 index 00000000000..320cae30c23 --- /dev/null +++ b/docs/telemetry.md @@ -0,0 +1,48 @@ +# Codex Telemetry + +## Config + +**TODO(jif)**: add the config and document it + +## Tracing + +Codex does not export OpenTelemetry traces today. The only OTEL output is log +events emitted by the `codex_otel` crate, and those are exported only when an +OTEL exporter is configured; otherwise nothing is sent. + +## Feedback + +Feedback is sent only when you run `/feedback` and confirm. The report includes +the selected category and optional note; if you opt in to include logs, Codex +attaches the most recent in-memory logs for the session (up to ~4 MiB). + +## Metrics + +This section lists all the metrics exported by Codex when locally installed. + +### Global context (applies to every event/metric) + +- `surface`: `cli` | `vscode` | `exec` | `mcp` | `subagent_*` (from `SessionSource`). +- `version`: binary version. +- `auth_mode`: `swic` (AuthMode::ChatGPT) | `api` (AuthMode::ApiKey) | `unknown`. +- `model`: name of the model used. + +## Metrics catalog + +Each metric includes the required fields plus the global context above. 
+ +| Metric | Type | Fields | Description | +| ------------------------- | --------- | ------------------------------------- | ------------------------------------------------------------------------------- | +| `approval.requested` | counter | `tool`, `approved` | Tool approval request result (`approved`: `yes` or `no`). | +| `auth.completed` | counter | `status` | Authentication completed (only for ChatGPT authentication). | +| `conversation.compact` | counter | `status`, `number` | Compaction event including the status and the compaction number in the session. | +| `conversation.turn.count` | counter | `role` | User/assistant turns per session. | +| `feature.duration_ms` | histogram | `feature`, `status` | End-to-end feature latency. | +| `feature.used` | counter | `feature` | Feature usage through `/` (e.g., `/undo`, `/review`, ...). | +| `features.state` | counter | `key`, `value` | Feature values that differ from defaults (emit one row per non-default). | +| `mcp.call` | counter | `status` | MCP tool invocation result (`ok` or error string). | +| `model.call.duration_ms` | histogram | `provider`, `status`, `attempt` | Model API request duration. | +| `session.started` | counter | `is_git` | New session created. | +| `tool.call` | counter | `tool`, `status` | Tool invocation result (`ok` or error string). | +| `tool.call.duration_ms` | histogram | `tool`, `status` | Tool execution time. | +| `user.feedback.submitted` | counter | `category`, `include_logs`, `success` | Feedback submission via `/feedback`. 
| From 5c8e7d0419ca8b7910b222c9d62e47e64e3a239d Mon Sep 17 00:00:00 2001 From: jif-oai Date: Fri, 19 Dec 2025 14:12:03 +0100 Subject: [PATCH 03/43] Some updates in doc --- codex-rs/metrics/README.md | 63 ++++++++++---------------------------- 1 file changed, 17 insertions(+), 46 deletions(-) diff --git a/codex-rs/metrics/README.md b/codex-rs/metrics/README.md index 860160b91f5..07dd6abbc26 100644 --- a/codex-rs/metrics/README.md +++ b/codex-rs/metrics/README.md @@ -2,42 +2,22 @@ Send lightweight counters and histogram buckets to Sentry via the statsd envelope item. -## Overview - -- Non-blocking client that enqueues metrics to a background worker. -- Counters and histograms only (histograms are encoded as bucketed counters). -- Tag validation and metric name validation are enforced before send. +Key points: +- Non-blocking for the sender. Metrics are processed by a dedicated worker. +- Tag validation and metric name validation are enforced before send to match Sentry requirements. ## Quick start ```rust -use codex_metrics::HistogramBuckets; -use codex_metrics::MetricsClient; -use codex_metrics::MetricsConfig; -use codex_metrics::Result; - -fn main() -> Result<()> { - let metrics = MetricsClient::new( - MetricsConfig::new("https://public@example.ingest.us.sentry.io/123456") - .with_tag("service", "codex-cli")? - .with_tag("env", "dev")?, - )?; - - let buckets = HistogramBuckets::from_values(&[25, 50, 100, 250, 500, 1000])?; - - metrics.counter("codex.session_started", 1, &[("source", "tui")])?; - metrics.histogram("codex.request_latency", 83, &buckets, &[("route", "chat")])?; - - Ok(()) -} -``` - -Buckets are integer upper bounds; pick your own unit (ms, bytes, tokens, etc.). +let metrics = MetricsClient::new( + MetricsConfig::default() // Default to the standard Sentry DSN. 
+ .with_tag("service", "codex-cli")?, +)?; -You can also use the default placeholder DSN: +let buckets = HistogramBuckets::from_values(&[25, 50, 100, 250, 500, 1000])?; -```rust -let metrics = MetricsClient::new(MetricsConfig::default())?; +metrics.counter("codex.session_started", 1, &[("source", "tui")])?; +metrics.histogram("codex.request_latency", 83, &buckets, &[("route", "chat")])?; ``` ## Configuration @@ -48,24 +28,15 @@ let metrics = MetricsClient::new(MetricsConfig::default())?; - `with_tag(key, value)` to add default tags. - `with_timeout(duration)` to override the HTTP timeout (default 10s). - `with_user_agent(agent)` to override the user agent. -Use `MetricsClient::with_capacity(config, capacity)` to override the default -queue capacity. -## Buckets +Use `MetricsClient::with_capacity(config, capacity)` to override the default queue capacity. -`HistogramBuckets` supports two constructors: +## Buckets +`HistogramBuckets` supports: - `from_values(&[...])` for explicit upper bounds. -- `from_range(from, to, step)` to build linear buckets. -- `from_exponential(from, to, factor)` to build exponential buckets. - -`from_range` requires `step > 0` and `from <= to`. The upper bound is always included. -`from_exponential` requires `from > 0`, `from <= to`, and a finite `factor > 1`. The upper bound is always included. - -```rust -let buckets = HistogramBuckets::from_range(25, 100, 25)?; -let exp_buckets = HistogramBuckets::from_exponential(10, 1000, 2.0)?; -``` +- `from_range(from, to, step)` to build linear buckets. Requires `step > 0` and `from <= to`. The upper bound is always included. +- `from_exponential(from, to, factor)` to build exponential buckets. Requires `from > 0`, `from <= to`, and a finite `factor > 1`. The upper bound is always included. 
## Sending metrics @@ -76,7 +47,7 @@ metrics.counter("codex.session_started", 1, &[("source", "tui")])?; ``` Histograms are translated into bucket counters by adding an `le` tag for each -bound that is greater than or equal to the value (or `inf` if none match): +bound that is greater than or equal to the value (or `inf` if none match), following the statsd standards: ```rust metrics.histogram("codex.request_latency", 83, &buckets, &[("route", "chat")])?; @@ -136,7 +107,7 @@ if the worker is no longer running. to override the timeout. Uploads are best-effort; if the worker encounters a send error, the metric is -dropped. +dropped (if in `alpha`, or debug mode, the worker will panic on errors). `MetricsClient` also attempts a best-effort shutdown on drop using the default timeout, so explicit calls to `shutdown` are optional. From a4ed08a09ac6ae7746c6ab1171d6897411bb0796 Mon Sep 17 00:00:00 2001 From: jif-oai Date: Fri, 19 Dec 2025 14:21:24 +0100 Subject: [PATCH 04/43] Some nits --- codex-rs/metrics/README.md | 10 +++++----- codex-rs/metrics/src/batch.rs | 10 +++------- codex-rs/metrics/src/client.rs | 21 ++++++++------------- codex-rs/metrics/src/config.rs | 6 ++---- codex-rs/metrics/src/lib.rs | 11 +++++++++++ codex-rs/metrics/src/statsd.rs | 3 +-- codex-rs/metrics/tests/tests.rs | 29 +++++++++++++++++++++-------- 7 files changed, 51 insertions(+), 39 deletions(-) diff --git a/codex-rs/metrics/README.md b/codex-rs/metrics/README.md index 07dd6abbc26..3bf9b2a74d4 100644 --- a/codex-rs/metrics/README.md +++ b/codex-rs/metrics/README.md @@ -29,7 +29,7 @@ metrics.histogram("codex.request_latency", 83, &buckets, &[("route", "chat")])?; - `with_timeout(duration)` to override the HTTP timeout (default 10s). - `with_user_agent(agent)` to override the user agent. -Use `MetricsClient::with_capacity(config, capacity)` to override the default queue capacity. +The queue capacity is fixed at 1024 entries. 
## Buckets @@ -47,7 +47,8 @@ metrics.counter("codex.session_started", 1, &[("source", "tui")])?; ``` Histograms are translated into bucket counters by adding an `le` tag for each -bound that is greater than or equal to the value (or `inf` if none match), following the statsd standards: +bound that is greater than or equal to the value, plus a final `le=inf` bucket +so the histogram is cumulative per the statsd `le` convention: ```rust metrics.histogram("codex.request_latency", 83, &buckets, &[("route", "chat")])?; @@ -88,7 +89,7 @@ metrics.record_duration( ## Batching -Batching reduces network requests. Build a batch and send it once: +Batching reduces network requests and ensures metrics have the same timestamp. ```rust let mut batch = metrics.batch(); @@ -103,8 +104,7 @@ The client uses a bounded queue (default capacity 1024). Enqueueing returns a `MetricsError::QueueFull` error if the queue is full or `MetricsError::WorkerUnavailable` if the worker is no longer running. -`shutdown` waits up to 500ms for the worker to stop. Use `shutdown_with_timeout` -to override the timeout. +`shutdown` waits up to 500ms for the worker to stop. Uploads are best-effort; if the worker encounters a send error, the metric is dropped (if in `alpha`, or debug mode, the worker will panic on errors). 
diff --git a/codex-rs/metrics/src/batch.rs b/codex-rs/metrics/src/batch.rs index dfde45b69b2..41a4ca9d50a 100644 --- a/codex-rs/metrics/src/batch.rs +++ b/codex-rs/metrics/src/batch.rs @@ -131,18 +131,14 @@ impl MetricsBatch { tags: &[(&str, &str)], ) -> Result<()> { let base_tags = collect_tags(tags)?; - let mut matched = false; for bound in buckets.bounds.iter().filter(|bound| value <= **bound) { let mut tags = base_tags.clone(); tags.push(("le".to_string(), bound.to_string())); self.lines.push(StatsdLine::counter(name, 1, tags)?); - matched = true; - } - if !matched { - let mut tags = base_tags; - tags.push(("le".to_string(), "inf".to_string())); - self.lines.push(StatsdLine::counter(name, 1, tags)?); } + let mut tags = base_tags; + tags.push(("le".to_string(), "inf".to_string())); + self.lines.push(StatsdLine::counter(name, 1, tags)?); Ok(()) } diff --git a/codex-rs/metrics/src/client.rs b/codex-rs/metrics/src/client.rs index c9a126a1b46..611aee0790b 100644 --- a/codex-rs/metrics/src/client.rs +++ b/codex-rs/metrics/src/client.rs @@ -1,3 +1,7 @@ +use crate::DEFAULT_QUEUE_CAPACITY; +use crate::DEFAULT_SHUTDOWN_TIMEOUT; +use crate::ENVELOPE_CONTENT_TYPE; +use crate::SHUTDOWN_POLL_INTERVAL; use crate::batch::HistogramBuckets; use crate::batch::MetricsBatch; use crate::config::MetricsConfig; @@ -19,11 +23,6 @@ use std::thread; use std::time::Duration; use std::time::Instant; -const ENVELOPE_CONTENT_TYPE: &str = "application/x-sentry-envelope"; -const DEFAULT_QUEUE_CAPACITY: usize = 1024; -const DEFAULT_SHUTDOWN_TIMEOUT: Duration = Duration::from_millis(500); -const SHUTDOWN_POLL_INTERVAL: Duration = Duration::from_millis(10); - enum WorkerMessage { Batch(MetricsBatch), Shutdown, @@ -89,11 +88,8 @@ pub struct MetricsClient { impl MetricsClient { /// Build a metrics client from configuration and validate defaults. 
pub fn new(config: MetricsConfig) -> Result { - Self::with_capacity(config, DEFAULT_QUEUE_CAPACITY) - } + let capacity = DEFAULT_QUEUE_CAPACITY; - /// Build a metrics client with a bounded queue capacity. - pub fn with_capacity(config: MetricsConfig, capacity: usize) -> Result { if capacity == 0 { return Err(MetricsError::QueueCapacityZero); } @@ -233,11 +229,10 @@ impl MetricsClient { /// Flush queued metrics and stop the worker thread. pub fn shutdown(&self) -> Result<()> { - self.shutdown_with_timeout(DEFAULT_SHUTDOWN_TIMEOUT) + self.shutdown_inner(DEFAULT_SHUTDOWN_TIMEOUT) } - /// Flush queued metrics and stop the worker thread, waiting up to `timeout`. - pub fn shutdown_with_timeout(&self, timeout: Duration) -> Result<()> { + fn shutdown_inner(&self, timeout: Duration) -> Result<()> { let sender = self .state .sender @@ -280,7 +275,7 @@ impl MetricsClient { impl Drop for MetricsClient { fn drop(&mut self) { if Arc::strong_count(&self.state) == 1 { - let _ = self.shutdown_with_timeout(DEFAULT_SHUTDOWN_TIMEOUT); + let _ = self.shutdown_inner(DEFAULT_SHUTDOWN_TIMEOUT); } } } diff --git a/codex-rs/metrics/src/config.rs b/codex-rs/metrics/src/config.rs index 1c7f22f5763..3591cb54a04 100644 --- a/codex-rs/metrics/src/config.rs +++ b/codex-rs/metrics/src/config.rs @@ -1,12 +1,10 @@ +use crate::DEFAULT_TIMEOUT; +use crate::SENTRY_DSN; use crate::error::Result; use crate::validation::validate_tag_component; use std::collections::BTreeMap; use std::time::Duration; -const SENTRY_DSN: &str = - "https://ae32ed50620d7a7792c1ce5df38b3e3e@o33249.ingest.us.sentry.io/4510195390611458"; -const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10); - #[derive(Clone, Debug)] pub struct MetricsConfig { pub(crate) dsn: String, diff --git a/codex-rs/metrics/src/lib.rs b/codex-rs/metrics/src/lib.rs index 7f188b458de..f47380958e8 100644 --- a/codex-rs/metrics/src/lib.rs +++ b/codex-rs/metrics/src/lib.rs @@ -7,6 +7,17 @@ mod time; mod util; mod validation; +use std::time::Duration; + 
+pub(crate) const SENTRY_DSN: &str = + "https://ae32ed50620d7a7792c1ce5df38b3e3e@o33249.ingest.us.sentry.io/4510195390611458"; +pub(crate) const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10); +pub(crate) const STATSD_CONTENT_TYPE: &str = "text/plain"; +pub(crate) const ENVELOPE_CONTENT_TYPE: &str = "application/x-sentry-envelope"; +pub(crate) const DEFAULT_QUEUE_CAPACITY: usize = 1024; +pub(crate) const DEFAULT_SHUTDOWN_TIMEOUT: Duration = Duration::from_millis(500); +pub(crate) const SHUTDOWN_POLL_INTERVAL: Duration = Duration::from_millis(10); + pub use crate::batch::HistogramBuckets; pub use crate::batch::MetricsBatch; pub use crate::client::MetricsClient; diff --git a/codex-rs/metrics/src/statsd.rs b/codex-rs/metrics/src/statsd.rs index 61f41eab3f8..c6e154891f6 100644 --- a/codex-rs/metrics/src/statsd.rs +++ b/codex-rs/metrics/src/statsd.rs @@ -1,3 +1,4 @@ +use crate::STATSD_CONTENT_TYPE; use crate::error::MetricsError; use crate::error::Result; use crate::validation::validate_metric_name; @@ -5,8 +6,6 @@ use crate::validation::validate_tag_component; use sentry::types::Dsn; use std::collections::BTreeMap; -const STATSD_CONTENT_TYPE: &str = "text/plain"; - pub(crate) struct StatsdLine { name: String, value: i64, diff --git a/codex-rs/metrics/tests/tests.rs b/codex-rs/metrics/tests/tests.rs index 055987a11c5..797b683409a 100644 --- a/codex-rs/metrics/tests/tests.rs +++ b/codex-rs/metrics/tests/tests.rs @@ -202,7 +202,7 @@ fn send_builds_payload_with_tags_and_histograms() -> Result<()> { ); let lines: Vec<&str> = envelope.payload.split('\n').collect(); - assert_eq!(lines.len(), 4); + assert_eq!(lines.len(), 5); let line = parse_statsd_line(lines[0]); assert_eq!(line.name, "codex.turns"); @@ -215,7 +215,7 @@ fn send_builds_payload_with_tags_and_histograms() -> Result<()> { assert_eq!(line.tags.get("env").map(String::as_str), Some("dev")); assert_eq!(line.tags.get("model").map(String::as_str), Some("gpt-5.1")); - for (line, expected_le) in 
lines.iter().skip(1).zip(["25", "50", "100"]) { + for (line, expected_le) in lines.iter().skip(1).zip(["25", "50", "100", "inf"]) { let line = parse_statsd_line(line); assert_eq!(line.name, "codex.tool_latency"); assert_eq!(line.value, 1); @@ -272,13 +272,16 @@ fn record_duration_uses_matching_bucket() -> Result<()> { let captured = handle.join().expect("server thread"); let envelope = parse_envelope(&captured.body); let lines: Vec<&str> = envelope.payload.split('\n').collect(); - assert_eq!(lines.len(), 1); + assert_eq!(lines.len(), 2); let line = parse_statsd_line(lines[0]); assert_eq!(line.name, "codex.request_latency"); assert_eq!(line.tags.get("route").map(String::as_str), Some("chat")); assert_eq!(line.tags.get("le").map(String::as_str), Some("20")); + let line = parse_statsd_line(lines[1]); + assert_eq!(line.tags.get("le").map(String::as_str), Some("inf")); + Ok(()) } @@ -302,8 +305,13 @@ fn time_result_records_success() -> Result<()> { let envelope = parse_envelope(&captured.body); let lines: Vec<&str> = envelope.payload.split('\n').collect(); assert!(!lines.is_empty()); - for line in lines { - let line = parse_statsd_line(line); + let parsed: Vec = lines.iter().map(parse_statsd_line).collect(); + assert!( + parsed + .iter() + .any(|line| { line.tags.get("le").map(String::as_str) == Some("inf") }) + ); + for line in parsed { assert_eq!(line.name, "codex.request_latency"); assert_eq!(line.tags.get("route").map(String::as_str), Some("chat")); assert!(line.tags.contains_key("le")); @@ -334,8 +342,13 @@ fn time_result_records_on_error() -> Result<()> { let envelope = parse_envelope(&captured.body); let lines: Vec<&str> = envelope.payload.split('\n').collect(); assert!(!lines.is_empty()); - for line in lines { - let line = parse_statsd_line(line); + let parsed: Vec = lines.iter().map(parse_statsd_line).collect(); + assert!( + parsed + .iter() + .any(|line| { line.tags.get("le").map(String::as_str) == Some("inf") }) + ); + for line in parsed { 
assert_eq!(line.name, "codex.request_latency"); assert_eq!(line.tags.get("route").map(String::as_str), Some("chat")); assert!(line.tags.contains_key("le")); @@ -348,7 +361,7 @@ fn time_result_records_on_error() -> Result<()> { #[test] fn client_sends_enqueued_batch() -> Result<()> { let (dsn, handle) = spawn_server(200); - let metrics = MetricsClient::with_capacity(MetricsConfig::new(dsn), 8)?; + let metrics = MetricsClient::new(MetricsConfig::new(dsn))?; let mut batch = metrics.batch(); batch.counter("codex.turns", 1, &[("model", "gpt-5.1")])?; From cbdcc980bec73854260e47108f976756d97066ac Mon Sep 17 00:00:00 2001 From: jif-oai Date: Fri, 19 Dec 2025 14:36:57 +0100 Subject: [PATCH 05/43] Rework harness structure --- codex-rs/metrics/tests/harness/mod.rs | 165 +++++++ codex-rs/metrics/tests/suite/mod.rs | 3 + codex-rs/metrics/tests/suite/send.rs | 207 +++++++++ codex-rs/metrics/tests/suite/timing.rs | 113 +++++ codex-rs/metrics/tests/suite/validation.rs | 91 ++++ codex-rs/metrics/tests/tests.rs | 481 +-------------------- 6 files changed, 581 insertions(+), 479 deletions(-) create mode 100644 codex-rs/metrics/tests/harness/mod.rs create mode 100644 codex-rs/metrics/tests/suite/mod.rs create mode 100644 codex-rs/metrics/tests/suite/send.rs create mode 100644 codex-rs/metrics/tests/suite/timing.rs create mode 100644 codex-rs/metrics/tests/suite/validation.rs diff --git a/codex-rs/metrics/tests/harness/mod.rs b/codex-rs/metrics/tests/harness/mod.rs new file mode 100644 index 00000000000..3a51dd73574 --- /dev/null +++ b/codex-rs/metrics/tests/harness/mod.rs @@ -0,0 +1,165 @@ +use serde_json::Value; +use std::collections::BTreeMap; +use std::io::Read; +use std::io::Write; +use std::net::TcpListener; +use std::net::TcpStream; +use std::thread; + +#[derive(Debug)] +pub(crate) struct CapturedRequest { + pub(crate) method: String, + pub(crate) path: String, + pub(crate) headers: BTreeMap, + pub(crate) body: Vec, +} + +#[derive(Debug)] +pub(crate) struct ParsedEnvelope { + 
pub(crate) header: Value, + pub(crate) item_header: Value, + pub(crate) payload: String, +} + +#[derive(Debug)] +pub(crate) struct ParsedStatsdLine { + pub(crate) name: String, + pub(crate) value: i64, + pub(crate) kind: String, + pub(crate) tags: BTreeMap, +} + +/// Spawn a simple HTTP server that captures one request and responds with `status`. +pub(crate) fn spawn_server(status: u16) -> (String, thread::JoinHandle) { + let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server"); + let addr = listener.local_addr().expect("local addr"); + let dsn = format!("http://public:@{addr}/123"); + + let handle = thread::spawn(move || { + let (mut stream, _) = listener.accept().expect("accept connection"); + let request = read_http_request(&mut stream); + let reason = match status { + 200 => "OK", + 500 => "Internal Server Error", + _ => "OK", + }; + let response = + format!("HTTP/1.1 {status} {reason}\r\nContent-Length: 0\r\nConnection: close\r\n\r\n"); + stream + .write_all(response.as_bytes()) + .expect("write response"); + request + }); + + (dsn, handle) +} + +// Read a single HTTP request from the stream and return the parsed data. 
+fn read_http_request(stream: &mut TcpStream) -> CapturedRequest { + let mut buffer = Vec::new(); + let mut chunk = [0_u8; 1024]; + let mut header_end = None; + while header_end.is_none() { + let read = stream.read(&mut chunk).expect("read request"); + if read == 0 { + break; + } + buffer.extend_from_slice(&chunk[..read]); + header_end = find_header_end(&buffer); + } + let header_end = header_end.expect("request headers"); + let headers_bytes = &buffer[..header_end]; + let headers_str = std::str::from_utf8(headers_bytes).expect("headers utf-8"); + let mut lines = headers_str.split("\r\n"); + let request_line = lines.next().expect("request line"); + let mut request_parts = request_line.split_whitespace(); + let method = request_parts.next().expect("method").to_string(); + let path = request_parts.next().expect("path").to_string(); + + let mut headers = BTreeMap::new(); + for line in lines { + if line.is_empty() { + continue; + } + if let Some((key, value)) = line.split_once(':') { + headers.insert(key.trim().to_ascii_lowercase(), value.trim().to_string()); + } + } + + let content_length = headers + .get("content-length") + .and_then(|value| value.parse::().ok()) + .unwrap_or(0); + let mut body = buffer[header_end..].to_vec(); + while body.len() < content_length { + let read = stream.read(&mut chunk).expect("read body"); + if read == 0 { + break; + } + body.extend_from_slice(&chunk[..read]); + } + + CapturedRequest { + method, + path, + headers, + body, + } +} + +// Locate the end of the HTTP headers in a buffered request. +fn find_header_end(buffer: &[u8]) -> Option { + buffer + .windows(4) + .position(|window| window == b"\r\n\r\n") + .map(|pos| pos + 4) +} + +/// Parse a Sentry envelope payload into headers and statsd payload text. 
+pub(crate) fn parse_envelope(body: &[u8]) -> ParsedEnvelope { + let mut parts = body.splitn(3, |byte| *byte == b'\n'); + let header_line = parts.next().expect("envelope header"); + let item_header_line = parts.next().expect("item header"); + let payload = parts.next().unwrap_or(&[]); + + let header = serde_json::from_slice(header_line).expect("parse envelope header"); + let item_header = serde_json::from_slice(item_header_line).expect("parse item header"); + let payload = std::str::from_utf8(payload) + .expect("payload utf-8") + .trim_end_matches('\n') + .to_string(); + + ParsedEnvelope { + header, + item_header, + payload, + } +} + +/// Parse a single statsd line (with optional tags) into components. +pub(crate) fn parse_statsd_line(line: &str) -> ParsedStatsdLine { + let (metric, tags_part) = line + .split_once("|#") + .map(|(metric, tags)| (metric, Some(tags))) + .unwrap_or((line, None)); + let (name_value, kind) = metric.split_once('|').expect("metric kind"); + let (name, value) = name_value.split_once(':').expect("metric value"); + let value = value.parse::().expect("metric value parse"); + + let mut tags = BTreeMap::new(); + if let Some(tags_part) = tags_part + && !tags_part.is_empty() + { + for tag in tags_part.split(',') { + let (key, value) = tag.split_once(':').expect("tag"); + tags.insert(key.to_string(), value.to_string()); + } + } + + ParsedStatsdLine { + name: name.to_string(), + value, + kind: kind.to_string(), + tags, + } +} diff --git a/codex-rs/metrics/tests/suite/mod.rs b/codex-rs/metrics/tests/suite/mod.rs new file mode 100644 index 00000000000..42708df7981 --- /dev/null +++ b/codex-rs/metrics/tests/suite/mod.rs @@ -0,0 +1,3 @@ +mod send; +mod timing; +mod validation; diff --git a/codex-rs/metrics/tests/suite/send.rs b/codex-rs/metrics/tests/suite/send.rs new file mode 100644 index 00000000000..1abfdde37aa --- /dev/null +++ b/codex-rs/metrics/tests/suite/send.rs @@ -0,0 +1,207 @@ +use crate::harness::parse_envelope; +use 
crate::harness::parse_statsd_line; +use crate::harness::spawn_server; +use codex_metrics::HistogramBuckets; +use codex_metrics::MetricsClient; +use codex_metrics::MetricsConfig; +use codex_metrics::MetricsError; +use codex_metrics::Result; +use pretty_assertions::assert_eq; +use std::net::TcpListener; +use std::thread; +use std::time::Duration; + +// Ensures counters/histograms render with default + per-call tags. +#[test] +fn send_builds_payload_with_tags_and_histograms() -> Result<()> { + let (dsn, handle) = spawn_server(200); + let metrics = MetricsClient::new( + MetricsConfig::new(dsn.clone()) + .with_tag("service", "codex-cli")? + .with_tag("env", "prod")?, + )?; + let buckets = HistogramBuckets::from_values(&[25, 50, 100])?; + + let mut batch = metrics.batch(); + batch.counter("codex.turns", 1, &[("model", "gpt-5.1"), ("env", "dev")])?; + batch.histogram("codex.tool_latency", 25, &buckets, &[("tool", "shell")])?; + metrics.send(batch)?; + metrics.shutdown()?; + + let captured = handle.join().expect("server thread"); + assert_eq!(captured.method, "POST"); + assert_eq!(captured.path, "/api/123/envelope/"); + assert_eq!( + captured.headers.get("content-type").map(String::as_str), + Some("application/x-sentry-envelope") + ); + + let envelope = parse_envelope(&captured.body); + assert_eq!(envelope.header["dsn"].as_str(), Some(dsn.as_str())); + assert_eq!(envelope.item_header["type"], "statsd"); + assert_eq!(envelope.item_header["content_type"], "text/plain"); + assert_eq!( + envelope.item_header["length"].as_u64(), + Some(envelope.payload.len() as u64) + ); + + let lines: Vec<&str> = envelope.payload.split('\n').collect(); + assert_eq!(lines.len(), 5); + + let line = parse_statsd_line(lines[0]); + assert_eq!(line.name, "codex.turns"); + assert_eq!(line.value, 1); + assert_eq!(line.kind, "c"); + assert_eq!( + line.tags.get("service").map(String::as_str), + Some("codex-cli") + ); + assert_eq!(line.tags.get("env").map(String::as_str), Some("dev")); + 
assert_eq!(line.tags.get("model").map(String::as_str), Some("gpt-5.1")); + + for (line, expected_le) in lines.iter().skip(1).zip(["25", "50", "100", "inf"]) { + let line = parse_statsd_line(line); + assert_eq!(line.name, "codex.tool_latency"); + assert_eq!(line.value, 1); + assert_eq!(line.kind, "c"); + assert_eq!( + line.tags.get("service").map(String::as_str), + Some("codex-cli") + ); + assert_eq!(line.tags.get("env").map(String::as_str), Some("prod")); + assert_eq!(line.tags.get("tool").map(String::as_str), Some("shell")); + assert_eq!(line.tags.get("le").map(String::as_str), Some(expected_le)); + } + + Ok(()) +} + +// Ensures defaults merge per line and overrides take precedence. +#[test] +fn send_merges_default_tags_per_line() -> Result<()> { + let (dsn, handle) = spawn_server(200); + let metrics = MetricsClient::new( + MetricsConfig::new(dsn.clone()) + .with_tag("service", "codex-cli")? + .with_tag("env", "prod")? + .with_tag("region", "us")?, + )?; + + let mut batch = metrics.batch(); + batch.counter("codex.alpha", 1, &[("env", "dev"), ("component", "alpha")])?; + batch.counter( + "codex.beta", + 2, + &[("service", "worker"), ("component", "beta")], + )?; + metrics.send(batch)?; + metrics.shutdown()?; + + let captured = handle.join().expect("server thread"); + let envelope = parse_envelope(&captured.body); + let lines: Vec<&str> = envelope.payload.split('\n').collect(); + assert_eq!(lines.len(), 2); + assert_eq!( + lines[0], + "codex.alpha:1|c|#component:alpha,env:dev,region:us,service:codex-cli" + ); + assert_eq!( + lines[1], + "codex.beta:2|c|#component:beta,env:prod,region:us,service:worker" + ); + + Ok(()) +} + +// Verifies values above the max bucket use the inf tag. 
+#[test] +fn send_uses_inf_bucket_for_values_over_max() -> Result<()> { + let (dsn, handle) = spawn_server(200); + let metrics = MetricsClient::new(MetricsConfig::new(dsn))?; + let buckets = HistogramBuckets::from_values(&[10, 20])?; + + let mut batch = metrics.batch(); + batch.histogram("codex.tool_latency", 99, &buckets, &[("tool", "shell")])?; + metrics.send(batch)?; + metrics.shutdown()?; + + let captured = handle.join().expect("server thread"); + let envelope = parse_envelope(&captured.body); + let lines: Vec<&str> = envelope.payload.split('\n').collect(); + assert_eq!(lines.len(), 1); + let line = parse_statsd_line(lines[0]); + assert_eq!(line.tags.get("le").map(String::as_str), Some("inf")); + + Ok(()) +} + +// Verifies enqueued batches are delivered by the background worker. +#[test] +fn client_sends_enqueued_batch() -> Result<()> { + let (dsn, handle) = spawn_server(200); + let metrics = MetricsClient::new(MetricsConfig::new(dsn))?; + + let mut batch = metrics.batch(); + batch.counter("codex.turns", 1, &[("model", "gpt-5.1")])?; + metrics.send(batch)?; + metrics.shutdown()?; + + let captured = handle.join().expect("server thread"); + let envelope = parse_envelope(&captured.body); + let lines: Vec<&str> = envelope.payload.split('\n').collect(); + assert_eq!(lines.len(), 1); + + let line = parse_statsd_line(lines[0]); + assert_eq!(line.name, "codex.turns"); + assert_eq!(line.value, 1); + assert_eq!(line.kind, "c"); + assert_eq!(line.tags.get("model").map(String::as_str), Some("gpt-5.1")); + + Ok(()) +} + +// Ensures a non-success response panics in debug builds via error_or_panic. 
+#[test] +fn send_panics_on_non_success_status_in_debug() -> Result<()> { + let (dsn, handle) = spawn_server(500); + let metrics = MetricsClient::new(MetricsConfig::new(dsn))?; + + let mut batch = metrics.batch(); + batch.counter("codex.turns", 1, &[])?; + metrics.send(batch)?; + let err = metrics.shutdown().unwrap_err(); + assert!(matches!(err, MetricsError::WorkerPanicked)); + + let captured = handle.join().expect("server thread"); + assert_eq!(captured.method, "POST"); + Ok(()) +} + +// Ensures empty batches do not trigger any HTTP request. +#[test] +fn client_core_skips_empty_batch() -> Result<()> { + let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server"); + listener.set_nonblocking(true).expect("set nonblocking"); + let addr = listener.local_addr().expect("local addr"); + let dsn = format!("http://public:@{addr}/123"); + let metrics = MetricsClient::new(MetricsConfig::new(dsn))?; + + metrics.send(metrics.batch())?; + metrics.shutdown()?; + + let mut saw_connection = false; + for _ in 0..10 { + match listener.accept() { + Ok(_) => { + saw_connection = true; + break; + } + Err(err) if err.kind() == std::io::ErrorKind::WouldBlock => { + thread::sleep(Duration::from_millis(10)); + } + Err(err) => panic!("unexpected accept error: {err}"), + } + } + assert!(!saw_connection, "expected no request for empty batch"); + Ok(()) +} diff --git a/codex-rs/metrics/tests/suite/timing.rs b/codex-rs/metrics/tests/suite/timing.rs new file mode 100644 index 00000000000..938bdefaf14 --- /dev/null +++ b/codex-rs/metrics/tests/suite/timing.rs @@ -0,0 +1,113 @@ +use crate::harness::parse_envelope; +use crate::harness::parse_statsd_line; +use crate::harness::spawn_server; +use codex_metrics::HistogramBuckets; +use codex_metrics::MetricsClient; +use codex_metrics::MetricsConfig; +use codex_metrics::MetricsError; +use codex_metrics::Result; +use pretty_assertions::assert_eq; +use std::time::Duration; + +// Ensures duration recording maps to the expected bucket tag. 
+#[test] +fn record_duration_uses_matching_bucket() -> Result<()> { + let (dsn, handle) = spawn_server(200); + let metrics = MetricsClient::new(MetricsConfig::new(dsn))?; + let buckets = HistogramBuckets::from_values(&[10, 20])?; + + metrics.record_duration( + "codex.request_latency", + Duration::from_millis(15), + &buckets, + &[("route", "chat")], + )?; + metrics.shutdown()?; + + let captured = handle.join().expect("server thread"); + let envelope = parse_envelope(&captured.body); + let lines: Vec<&str> = envelope.payload.split('\n').collect(); + assert_eq!(lines.len(), 2); + + let line = parse_statsd_line(lines[0]); + assert_eq!(line.name, "codex.request_latency"); + assert_eq!(line.tags.get("route").map(String::as_str), Some("chat")); + assert_eq!(line.tags.get("le").map(String::as_str), Some("20")); + + let line = parse_statsd_line(lines[1]); + assert_eq!(line.tags.get("le").map(String::as_str), Some("inf")); + + Ok(()) +} + +// Ensures time_result returns the closure output and records timing. 
+#[test] +fn time_result_records_success() -> Result<()> { + let (dsn, handle) = spawn_server(200); + let metrics = MetricsClient::new(MetricsConfig::new(dsn))?; + let buckets = HistogramBuckets::from_values(&[10, 20])?; + + let value = metrics.time_result( + "codex.request_latency", + &buckets, + &[("route", "chat")], + || Ok("ok"), + )?; + assert_eq!(value, "ok"); + metrics.shutdown()?; + + let captured = handle.join().expect("server thread"); + let envelope = parse_envelope(&captured.body); + let lines: Vec<&str> = envelope.payload.split('\n').collect(); + assert!(!lines.is_empty()); + let parsed: Vec<_> = lines.iter().copied().map(parse_statsd_line).collect(); + assert!( + parsed + .iter() + .any(|line| { line.tags.get("le").map(String::as_str) == Some("inf") }) + ); + for line in parsed { + assert_eq!(line.name, "codex.request_latency"); + assert_eq!(line.tags.get("route").map(String::as_str), Some("chat")); + assert!(line.tags.contains_key("le")); + } + + Ok(()) +} + +// Ensures time_result propagates errors but still records timing. 
+#[test] +fn time_result_records_on_error() -> Result<()> { + let (dsn, handle) = spawn_server(200); + let metrics = MetricsClient::new(MetricsConfig::new(dsn))?; + let buckets = HistogramBuckets::from_values(&[10, 20])?; + + let err = metrics + .time_result( + "codex.request_latency", + &buckets, + &[("route", "chat")], + || -> Result<&'static str> { Err(MetricsError::EmptyMetricName) }, + ) + .unwrap_err(); + assert!(matches!(err, MetricsError::EmptyMetricName)); + metrics.shutdown()?; + + let captured = handle.join().expect("server thread"); + let envelope = parse_envelope(&captured.body); + let lines: Vec<&str> = envelope.payload.split('\n').collect(); + assert!(!lines.is_empty()); + let parsed: Vec<_> = lines.iter().copied().map(parse_statsd_line).collect(); + assert!( + parsed + .iter() + .any(|line| { line.tags.get("le").map(String::as_str) == Some("inf") }) + ); + for line in parsed { + assert_eq!(line.name, "codex.request_latency"); + assert_eq!(line.tags.get("route").map(String::as_str), Some("chat")); + assert!(line.tags.contains_key("le")); + } + + Ok(()) +} diff --git a/codex-rs/metrics/tests/suite/validation.rs b/codex-rs/metrics/tests/suite/validation.rs new file mode 100644 index 00000000000..e7a99519838 --- /dev/null +++ b/codex-rs/metrics/tests/suite/validation.rs @@ -0,0 +1,91 @@ +use codex_metrics::HistogramBuckets; +use codex_metrics::MetricsBatch; +use codex_metrics::MetricsClient; +use codex_metrics::MetricsConfig; +use codex_metrics::MetricsError; +use codex_metrics::Result; + +// Validates invalid DSNs are rejected early. +#[test] +fn invalid_dsn_reports_error() -> Result<()> { + assert!(matches!( + MetricsClient::new(MetricsConfig::new("not a dsn")), + Err(MetricsError::InvalidDsn { .. }) + )); + Ok(()) +} + +// Ensures invalid tag components are rejected during config build. 
+#[test] +fn invalid_tag_component_is_rejected() -> Result<()> { + let err = MetricsConfig::default() + .with_tag("bad key", "value") + .unwrap_err(); + assert!(matches!( + err, + MetricsError::InvalidTagComponent { label, value } + if label == "tag key" && value == "bad key" + )); + Ok(()) +} + +// Ensures per-metric tag keys are validated. +#[test] +fn counter_rejects_invalid_tag_key() { + let mut batch = MetricsBatch::new(); + let err = batch + .counter("codex.turns", 1, &[("bad key", "value")]) + .unwrap_err(); + assert!(matches!( + err, + MetricsError::InvalidTagComponent { label, value } + if label == "tag key" && value == "bad key" + )); +} + +// Ensures per-metric tag values are validated. +#[test] +fn histogram_rejects_invalid_tag_value() -> Result<()> { + let mut batch = MetricsBatch::new(); + let buckets = HistogramBuckets::from_values(&[10])?; + let err = batch + .histogram( + "codex.request_latency", + 3, + &buckets, + &[("route", "bad value")], + ) + .unwrap_err(); + assert!(matches!( + err, + MetricsError::InvalidTagComponent { label, value } + if label == "tag value" && value == "bad value" + )); + Ok(()) +} + +// Ensures invalid metric names are rejected when building a batch. +#[test] +fn counter_rejects_invalid_metric_name() -> Result<()> { + let mut batch = MetricsBatch::new(); + let err = batch.counter("bad name", 1, &[]).unwrap_err(); + assert!(matches!( + err, + MetricsError::InvalidMetricName { name } if name == "bad name" + )); + Ok(()) +} + +// Ensures empty histogram bucket lists are rejected. +#[test] +fn empty_buckets_are_rejected() { + let err = HistogramBuckets::from_values(&[]).unwrap_err(); + assert!(matches!(err, MetricsError::EmptyBuckets)); +} + +// Ensures range overflow is detected when building buckets. +#[test] +fn range_overflow_is_reported() { + let err = HistogramBuckets::from_range(i64::MAX - 1, i64::MAX, 2).unwrap_err(); + assert!(matches!(err, MetricsError::BucketRangeOverflow { .. 
})); +} diff --git a/codex-rs/metrics/tests/tests.rs b/codex-rs/metrics/tests/tests.rs index 797b683409a..92f88b95fd8 100644 --- a/codex-rs/metrics/tests/tests.rs +++ b/codex-rs/metrics/tests/tests.rs @@ -1,479 +1,2 @@ -use codex_metrics::HistogramBuckets; -use codex_metrics::MetricsBatch; -use codex_metrics::MetricsClient; -use codex_metrics::MetricsConfig; -use codex_metrics::MetricsError; -use codex_metrics::Result; -use pretty_assertions::assert_eq; -use serde_json::Value; -use std::collections::BTreeMap; -use std::io::Read; -use std::io::Write; -use std::net::TcpListener; -use std::net::TcpStream; -use std::thread; -use std::time::Duration; - -#[derive(Debug)] -struct CapturedRequest { - method: String, - path: String, - headers: BTreeMap, - body: Vec, -} - -#[derive(Debug)] -struct ParsedEnvelope { - header: Value, - item_header: Value, - payload: String, -} - -#[derive(Debug)] -struct ParsedStatsdLine { - name: String, - value: i64, - kind: String, - tags: BTreeMap, -} - -fn spawn_server(status: u16) -> (String, thread::JoinHandle) { - let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server"); - let addr = listener.local_addr().expect("local addr"); - let dsn = format!("http://public:@{addr}/123"); - - let handle = thread::spawn(move || { - let (mut stream, _) = listener.accept().expect("accept connection"); - let request = read_http_request(&mut stream); - let reason = match status { - 200 => "OK", - 500 => "Internal Server Error", - _ => "OK", - }; - let response = - format!("HTTP/1.1 {status} {reason}\r\nContent-Length: 0\r\nConnection: close\r\n\r\n"); - stream - .write_all(response.as_bytes()) - .expect("write response"); - request - }); - - (dsn, handle) -} - -fn read_http_request(stream: &mut TcpStream) -> CapturedRequest { - let mut buffer = Vec::new(); - let mut chunk = [0_u8; 1024]; - let mut header_end = None; - while header_end.is_none() { - let read = stream.read(&mut chunk).expect("read request"); - if read == 0 { - break; - } 
- buffer.extend_from_slice(&chunk[..read]); - header_end = find_header_end(&buffer); - } - let header_end = header_end.expect("request headers"); - let headers_bytes = &buffer[..header_end]; - let headers_str = std::str::from_utf8(headers_bytes).expect("headers utf-8"); - let mut lines = headers_str.split("\r\n"); - let request_line = lines.next().expect("request line"); - let mut request_parts = request_line.split_whitespace(); - let method = request_parts.next().expect("method").to_string(); - let path = request_parts.next().expect("path").to_string(); - - let mut headers = BTreeMap::new(); - for line in lines { - if line.is_empty() { - continue; - } - if let Some((key, value)) = line.split_once(':') { - headers.insert(key.trim().to_ascii_lowercase(), value.trim().to_string()); - } - } - - let content_length = headers - .get("content-length") - .and_then(|value| value.parse::().ok()) - .unwrap_or(0); - let mut body = buffer[header_end..].to_vec(); - while body.len() < content_length { - let read = stream.read(&mut chunk).expect("read body"); - if read == 0 { - break; - } - body.extend_from_slice(&chunk[..read]); - } - - CapturedRequest { - method, - path, - headers, - body, - } -} - -fn find_header_end(buffer: &[u8]) -> Option { - buffer - .windows(4) - .position(|window| window == b"\r\n\r\n") - .map(|pos| pos + 4) -} - -fn parse_envelope(body: &[u8]) -> ParsedEnvelope { - let mut parts = body.splitn(3, |byte| *byte == b'\n'); - let header_line = parts.next().expect("envelope header"); - let item_header_line = parts.next().expect("item header"); - let payload = parts.next().unwrap_or(&[]); - - let header = serde_json::from_slice(header_line).expect("parse envelope header"); - let item_header = serde_json::from_slice(item_header_line).expect("parse item header"); - let payload = std::str::from_utf8(payload) - .expect("payload utf-8") - .trim_end_matches('\n') - .to_string(); - - ParsedEnvelope { - header, - item_header, - payload, - } -} - -fn 
parse_statsd_line(line: &str) -> ParsedStatsdLine { - let (metric, tags_part) = line - .split_once("|#") - .map(|(metric, tags)| (metric, Some(tags))) - .unwrap_or((line, None)); - let (name_value, kind) = metric.split_once('|').expect("metric kind"); - let (name, value) = name_value.split_once(':').expect("metric value"); - let value = value.parse::().expect("metric value parse"); - - let mut tags = BTreeMap::new(); - if let Some(tags_part) = tags_part - && !tags_part.is_empty() - { - for tag in tags_part.split(',') { - let (key, value) = tag.split_once(':').expect("tag"); - tags.insert(key.to_string(), value.to_string()); - } - } - - ParsedStatsdLine { - name: name.to_string(), - value, - kind: kind.to_string(), - tags, - } -} - -// Ensures counters/histograms render with default + per-call tags. -#[test] -fn send_builds_payload_with_tags_and_histograms() -> Result<()> { - let (dsn, handle) = spawn_server(200); - let metrics = MetricsClient::new( - MetricsConfig::new(dsn.clone()) - .with_tag("service", "codex-cli")? 
- .with_tag("env", "prod")?, - )?; - let buckets = HistogramBuckets::from_values(&[25, 50, 100])?; - - let mut batch = metrics.batch(); - batch.counter("codex.turns", 1, &[("model", "gpt-5.1"), ("env", "dev")])?; - batch.histogram("codex.tool_latency", 25, &buckets, &[("tool", "shell")])?; - metrics.send(batch)?; - metrics.shutdown()?; - - let captured = handle.join().expect("server thread"); - assert_eq!(captured.method, "POST"); - assert_eq!(captured.path, "/api/123/envelope/"); - assert_eq!( - captured.headers.get("content-type").map(String::as_str), - Some("application/x-sentry-envelope") - ); - - let envelope = parse_envelope(&captured.body); - assert_eq!(envelope.header["dsn"].as_str(), Some(dsn.as_str())); - assert_eq!(envelope.item_header["type"], "statsd"); - assert_eq!(envelope.item_header["content_type"], "text/plain"); - assert_eq!( - envelope.item_header["length"].as_u64(), - Some(envelope.payload.len() as u64) - ); - - let lines: Vec<&str> = envelope.payload.split('\n').collect(); - assert_eq!(lines.len(), 5); - - let line = parse_statsd_line(lines[0]); - assert_eq!(line.name, "codex.turns"); - assert_eq!(line.value, 1); - assert_eq!(line.kind, "c"); - assert_eq!( - line.tags.get("service").map(String::as_str), - Some("codex-cli") - ); - assert_eq!(line.tags.get("env").map(String::as_str), Some("dev")); - assert_eq!(line.tags.get("model").map(String::as_str), Some("gpt-5.1")); - - for (line, expected_le) in lines.iter().skip(1).zip(["25", "50", "100", "inf"]) { - let line = parse_statsd_line(line); - assert_eq!(line.name, "codex.tool_latency"); - assert_eq!(line.value, 1); - assert_eq!(line.kind, "c"); - assert_eq!( - line.tags.get("service").map(String::as_str), - Some("codex-cli") - ); - assert_eq!(line.tags.get("env").map(String::as_str), Some("prod")); - assert_eq!(line.tags.get("tool").map(String::as_str), Some("shell")); - assert_eq!(line.tags.get("le").map(String::as_str), Some(expected_le)); - } - - Ok(()) -} - -// Verifies values above the 
max bucket use the inf tag. -#[test] -fn send_uses_inf_bucket_for_values_over_max() -> Result<()> { - let (dsn, handle) = spawn_server(200); - let metrics = MetricsClient::new(MetricsConfig::new(dsn))?; - let buckets = HistogramBuckets::from_values(&[10, 20])?; - - let mut batch = metrics.batch(); - batch.histogram("codex.tool_latency", 99, &buckets, &[("tool", "shell")])?; - metrics.send(batch)?; - metrics.shutdown()?; - - let captured = handle.join().expect("server thread"); - let envelope = parse_envelope(&captured.body); - let lines: Vec<&str> = envelope.payload.split('\n').collect(); - assert_eq!(lines.len(), 1); - let line = parse_statsd_line(lines[0]); - assert_eq!(line.tags.get("le").map(String::as_str), Some("inf")); - - Ok(()) -} - -// Ensures duration recording maps to the expected bucket tag. -#[test] -fn record_duration_uses_matching_bucket() -> Result<()> { - let (dsn, handle) = spawn_server(200); - let metrics = MetricsClient::new(MetricsConfig::new(dsn))?; - let buckets = HistogramBuckets::from_values(&[10, 20])?; - - metrics.record_duration( - "codex.request_latency", - Duration::from_millis(15), - &buckets, - &[("route", "chat")], - )?; - metrics.shutdown()?; - - let captured = handle.join().expect("server thread"); - let envelope = parse_envelope(&captured.body); - let lines: Vec<&str> = envelope.payload.split('\n').collect(); - assert_eq!(lines.len(), 2); - - let line = parse_statsd_line(lines[0]); - assert_eq!(line.name, "codex.request_latency"); - assert_eq!(line.tags.get("route").map(String::as_str), Some("chat")); - assert_eq!(line.tags.get("le").map(String::as_str), Some("20")); - - let line = parse_statsd_line(lines[1]); - assert_eq!(line.tags.get("le").map(String::as_str), Some("inf")); - - Ok(()) -} - -// Ensures time_result returns the closure output and records timing. 
-#[test] -fn time_result_records_success() -> Result<()> { - let (dsn, handle) = spawn_server(200); - let metrics = MetricsClient::new(MetricsConfig::new(dsn))?; - let buckets = HistogramBuckets::from_values(&[10, 20])?; - - let value = metrics.time_result( - "codex.request_latency", - &buckets, - &[("route", "chat")], - || Ok("ok"), - )?; - assert_eq!(value, "ok"); - metrics.shutdown()?; - - let captured = handle.join().expect("server thread"); - let envelope = parse_envelope(&captured.body); - let lines: Vec<&str> = envelope.payload.split('\n').collect(); - assert!(!lines.is_empty()); - let parsed: Vec = lines.iter().map(parse_statsd_line).collect(); - assert!( - parsed - .iter() - .any(|line| { line.tags.get("le").map(String::as_str) == Some("inf") }) - ); - for line in parsed { - assert_eq!(line.name, "codex.request_latency"); - assert_eq!(line.tags.get("route").map(String::as_str), Some("chat")); - assert!(line.tags.contains_key("le")); - } - - Ok(()) -} - -// Ensures time_result propagates errors but still records timing. 
-#[test] -fn time_result_records_on_error() -> Result<()> { - let (dsn, handle) = spawn_server(200); - let metrics = MetricsClient::new(MetricsConfig::new(dsn))?; - let buckets = HistogramBuckets::from_values(&[10, 20])?; - - let err = metrics - .time_result( - "codex.request_latency", - &buckets, - &[("route", "chat")], - || -> Result<&'static str> { Err(MetricsError::EmptyMetricName) }, - ) - .unwrap_err(); - assert!(matches!(err, MetricsError::EmptyMetricName)); - metrics.shutdown()?; - - let captured = handle.join().expect("server thread"); - let envelope = parse_envelope(&captured.body); - let lines: Vec<&str> = envelope.payload.split('\n').collect(); - assert!(!lines.is_empty()); - let parsed: Vec = lines.iter().map(parse_statsd_line).collect(); - assert!( - parsed - .iter() - .any(|line| { line.tags.get("le").map(String::as_str) == Some("inf") }) - ); - for line in parsed { - assert_eq!(line.name, "codex.request_latency"); - assert_eq!(line.tags.get("route").map(String::as_str), Some("chat")); - assert!(line.tags.contains_key("le")); - } - - Ok(()) -} - -// Verifies enqueued batches are delivered by the background worker. -#[test] -fn client_sends_enqueued_batch() -> Result<()> { - let (dsn, handle) = spawn_server(200); - let metrics = MetricsClient::new(MetricsConfig::new(dsn))?; - - let mut batch = metrics.batch(); - batch.counter("codex.turns", 1, &[("model", "gpt-5.1")])?; - metrics.send(batch)?; - metrics.shutdown()?; - - let captured = handle.join().expect("server thread"); - let envelope = parse_envelope(&captured.body); - let lines: Vec<&str> = envelope.payload.split('\n').collect(); - assert_eq!(lines.len(), 1); - - let line = parse_statsd_line(lines[0]); - assert_eq!(line.name, "codex.turns"); - assert_eq!(line.value, 1); - assert_eq!(line.kind, "c"); - assert_eq!(line.tags.get("model").map(String::as_str), Some("gpt-5.1")); - - Ok(()) -} - -// Ensures a non-success response panics in debug builds via error_or_panic. 
-#[test] -fn send_panics_on_non_success_status_in_debug() -> Result<()> { - let (dsn, handle) = spawn_server(500); - let metrics = MetricsClient::new(MetricsConfig::new(dsn))?; - - let mut batch = metrics.batch(); - batch.counter("codex.turns", 1, &[])?; - metrics.send(batch)?; - let err = metrics.shutdown().unwrap_err(); - assert!(matches!(err, MetricsError::WorkerPanicked)); - - let captured = handle.join().expect("server thread"); - assert_eq!(captured.method, "POST"); - Ok(()) -} - -// Validates invalid DSNs are rejected early. -#[test] -fn invalid_dsn_reports_error() -> Result<()> { - assert!(matches!( - MetricsClient::new(MetricsConfig::new("not a dsn")), - Err(MetricsError::InvalidDsn { .. }) - )); - Ok(()) -} - -// Ensures empty batches do not trigger any HTTP request. -#[test] -fn send_is_noop_when_batch_empty() -> Result<()> { - let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server"); - listener.set_nonblocking(true).expect("set nonblocking"); - let addr = listener.local_addr().expect("local addr"); - let dsn = format!("http://public:@{addr}/123"); - let metrics = MetricsClient::new(MetricsConfig::new(dsn))?; - - metrics.send(metrics.batch())?; - metrics.shutdown()?; - - let mut saw_connection = false; - for _ in 0..10 { - match listener.accept() { - Ok(_) => { - saw_connection = true; - break; - } - Err(err) if err.kind() == std::io::ErrorKind::WouldBlock => { - thread::sleep(Duration::from_millis(10)); - } - Err(err) => panic!("unexpected accept error: {err}"), - } - } - assert!(!saw_connection, "expected no request for empty batch"); - Ok(()) -} - -// Ensures invalid tag components are rejected during config build. 
-#[test] -fn invalid_tag_component_is_rejected() -> Result<()> { - let err = MetricsConfig::default() - .with_tag("bad key", "value") - .unwrap_err(); - assert!(matches!( - err, - MetricsError::InvalidTagComponent { label, value } - if label == "tag key" && value == "bad key" - )); - Ok(()) -} - -// Ensures invalid metric names are rejected when building a batch. -#[test] -fn counter_rejects_invalid_metric_name() -> Result<()> { - let mut batch = MetricsBatch::new(); - let err = batch.counter("bad name", 1, &[]).unwrap_err(); - assert!(matches!( - err, - MetricsError::InvalidMetricName { name } if name == "bad name" - )); - Ok(()) -} - -// Ensures empty histogram bucket lists are rejected. -#[test] -fn empty_buckets_are_rejected() { - let err = HistogramBuckets::from_values(&[]).unwrap_err(); - assert!(matches!(err, MetricsError::EmptyBuckets)); -} - -// Ensures range overflow is detected when building buckets. -#[test] -fn range_overflow_is_reported() { - let err = HistogramBuckets::from_range(i64::MAX - 1, i64::MAX, 2).unwrap_err(); - assert!(matches!(err, MetricsError::BucketRangeOverflow { .. 
})); -} +mod harness; +mod suite; From f3699c5eb47dd5a7f72adcf1ddb9f45ee097bf80 Mon Sep 17 00:00:00 2001 From: jif-oai Date: Fri, 19 Dec 2025 15:04:26 +0100 Subject: [PATCH 06/43] More validation --- codex-rs/Cargo.toml | 2 +- codex-rs/metrics/Cargo.toml | 4 ++- codex-rs/metrics/README.md | 1 + codex-rs/metrics/src/config.rs | 7 ++-- codex-rs/metrics/src/error.rs | 2 ++ codex-rs/metrics/src/statsd.rs | 7 ++-- codex-rs/metrics/src/validation.rs | 20 +++++++++-- codex-rs/metrics/tests/suite/validation.rs | 39 ++++++++++++++++++++++ 8 files changed, 71 insertions(+), 11 deletions(-) diff --git a/codex-rs/Cargo.toml b/codex-rs/Cargo.toml index 7c03533bd98..fb817320611 100644 --- a/codex-rs/Cargo.toml +++ b/codex-rs/Cargo.toml @@ -26,6 +26,7 @@ members = [ "login", "mcp-server", "mcp-types", + "metrics", "ollama", "process-hardening", "protocol", @@ -33,7 +34,6 @@ members = [ "responses-api-proxy", "stdio-to-uds", "otel", - "metrics", "tui", "tui2", "utils/absolute-path", diff --git a/codex-rs/metrics/Cargo.toml b/codex-rs/metrics/Cargo.toml index 23c90dbb6aa..88d7cd5df95 100644 --- a/codex-rs/metrics/Cargo.toml +++ b/codex-rs/metrics/Cargo.toml @@ -5,9 +5,11 @@ edition.workspace = true license.workspace = true [dependencies] -pretty_assertions = { workspace = true } reqwest = { workspace = true, features = ["blocking"] } sentry = { workspace = true } serde_json = { workspace = true } thiserror = { workspace = true } tracing = { workspace = true } + +[dev-dependencies] +pretty_assertions = { workspace = true } diff --git a/codex-rs/metrics/README.md b/codex-rs/metrics/README.md index 3bf9b2a74d4..4232d7c5801 100644 --- a/codex-rs/metrics/README.md +++ b/codex-rs/metrics/README.md @@ -123,6 +123,7 @@ Tag keys and values: - Must be non-empty. - Allowed characters: ASCII letters/digits plus `.`, `_`, `-`, `/`. +- The tag key `le` is reserved for histogram bucketing. 
## Error handling diff --git a/codex-rs/metrics/src/config.rs b/codex-rs/metrics/src/config.rs index 3591cb54a04..8ed75e74a5b 100644 --- a/codex-rs/metrics/src/config.rs +++ b/codex-rs/metrics/src/config.rs @@ -1,7 +1,8 @@ use crate::DEFAULT_TIMEOUT; use crate::SENTRY_DSN; use crate::error::Result; -use crate::validation::validate_tag_component; +use crate::validation::validate_tag_key; +use crate::validation::validate_tag_value; use std::collections::BTreeMap; use std::time::Duration; @@ -28,8 +29,8 @@ impl MetricsConfig { pub fn with_tag(mut self, key: impl Into, value: impl Into) -> Result { let key = key.into(); let value = value.into(); - validate_tag_component(&key, "tag key")?; - validate_tag_component(&value, "tag value")?; + validate_tag_key(&key)?; + validate_tag_value(&value)?; self.default_tags.insert(key, value); Ok(self) } diff --git a/codex-rs/metrics/src/error.rs b/codex-rs/metrics/src/error.rs index 44117af0ab0..3e5bd100540 100644 --- a/codex-rs/metrics/src/error.rs +++ b/codex-rs/metrics/src/error.rs @@ -27,6 +27,8 @@ pub enum MetricsError { EmptyTagComponent { label: String }, #[error("{label} contains invalid characters: {value}")] InvalidTagComponent { label: String, value: String }, + #[error("tag key is reserved: {key}")] + ReservedTagKey { key: String }, // Client. 
#[error("invalid sentry dsn: {dsn}")] diff --git a/codex-rs/metrics/src/statsd.rs b/codex-rs/metrics/src/statsd.rs index c6e154891f6..5ed54ca8165 100644 --- a/codex-rs/metrics/src/statsd.rs +++ b/codex-rs/metrics/src/statsd.rs @@ -2,7 +2,8 @@ use crate::STATSD_CONTENT_TYPE; use crate::error::MetricsError; use crate::error::Result; use crate::validation::validate_metric_name; -use crate::validation::validate_tag_component; +use crate::validation::validate_tag_key; +use crate::validation::validate_tag_value; use sentry::types::Dsn; use std::collections::BTreeMap; @@ -82,8 +83,8 @@ pub(crate) fn build_statsd_envelope(dsn: &Dsn, payload: &str) -> Result> pub(crate) fn collect_tags(tags: &[(&str, &str)]) -> Result> { tags.iter() .map(|(key, value)| { - validate_tag_component(key, "tag key")?; - validate_tag_component(value, "tag value")?; + validate_tag_key(key)?; + validate_tag_value(value)?; Ok(((*key).to_string(), (*value).to_string())) }) .collect() diff --git a/codex-rs/metrics/src/validation.rs b/codex-rs/metrics/src/validation.rs index a51185b3f9e..20c316490c8 100644 --- a/codex-rs/metrics/src/validation.rs +++ b/codex-rs/metrics/src/validation.rs @@ -4,8 +4,8 @@ use std::collections::BTreeMap; pub(crate) fn validate_tags(tags: &BTreeMap) -> Result<()> { for (key, value) in tags { - validate_tag_component(key, "tag key")?; - validate_tag_component(value, "tag value")?; + validate_tag_key(key)?; + validate_tag_value(value)?; } Ok(()) } @@ -22,7 +22,21 @@ pub(crate) fn validate_metric_name(name: &str) -> Result<()> { Ok(()) } -pub(crate) fn validate_tag_component(value: &str, label: &str) -> Result<()> { +pub(crate) fn validate_tag_key(key: &str) -> Result<()> { + validate_tag_component(key, "tag key")?; + if key == "le" { + return Err(MetricsError::ReservedTagKey { + key: key.to_string(), + }); + } + Ok(()) +} + +pub(crate) fn validate_tag_value(value: &str) -> Result<()> { + validate_tag_component(value, "tag value") +} + +fn validate_tag_component(value: &str, 
label: &str) -> Result<()> { if value.is_empty() { return Err(MetricsError::EmptyTagComponent { label: label.to_string(), diff --git a/codex-rs/metrics/tests/suite/validation.rs b/codex-rs/metrics/tests/suite/validation.rs index e7a99519838..2383b774539 100644 --- a/codex-rs/metrics/tests/suite/validation.rs +++ b/codex-rs/metrics/tests/suite/validation.rs @@ -29,6 +29,17 @@ fn invalid_tag_component_is_rejected() -> Result<()> { Ok(()) } +// Ensures the reserved histogram bucketing tag key is rejected in config defaults. +#[test] +fn reserved_tag_key_is_rejected_in_config() -> Result<()> { + let err = MetricsConfig::default().with_tag("le", "10").unwrap_err(); + assert!(matches!( + err, + MetricsError::ReservedTagKey { key } if key == "le" + )); + Ok(()) +} + // Ensures per-metric tag keys are validated. #[test] fn counter_rejects_invalid_tag_key() { @@ -43,6 +54,19 @@ fn counter_rejects_invalid_tag_key() { )); } +// Ensures per-metric tag keys cannot use reserved histogram bucketing keys. +#[test] +fn counter_rejects_reserved_tag_key() { + let mut batch = MetricsBatch::new(); + let err = batch + .counter("codex.turns", 1, &[("le", "10")]) + .unwrap_err(); + assert!(matches!( + err, + MetricsError::ReservedTagKey { key } if key == "le" + )); +} + // Ensures per-metric tag values are validated. #[test] fn histogram_rejects_invalid_tag_value() -> Result<()> { @@ -64,6 +88,21 @@ fn histogram_rejects_invalid_tag_value() -> Result<()> { Ok(()) } +// Ensures histogram calls reject reserved tag keys even though they internally add `le`. +#[test] +fn histogram_rejects_reserved_tag_key() -> Result<()> { + let mut batch = MetricsBatch::new(); + let buckets = HistogramBuckets::from_values(&[10])?; + let err = batch + .histogram("codex.request_latency", 3, &buckets, &[("le", "10")]) + .unwrap_err(); + assert!(matches!( + err, + MetricsError::ReservedTagKey { key } if key == "le" + )); + Ok(()) +} + // Ensures invalid metric names are rejected when building a batch. 
#[test] fn counter_rejects_invalid_metric_name() -> Result<()> { From b041858341c4312c4d3135cc544b7aeb19370054 Mon Sep 17 00:00:00 2001 From: jif-oai Date: Fri, 19 Dec 2025 15:30:01 +0100 Subject: [PATCH 07/43] client to tokio --- codex-rs/Cargo.lock | 2 ++ codex-rs/core/Cargo.toml | 1 + codex-rs/metrics/Cargo.toml | 1 + codex-rs/metrics/src/client.rs | 44 +++++++++++++++++++++++----------- 4 files changed, 34 insertions(+), 14 deletions(-) diff --git a/codex-rs/Cargo.lock b/codex-rs/Cargo.lock index 73f7fd62dbe..c8c791b9c2f 100644 --- a/codex-rs/Cargo.lock +++ b/codex-rs/Cargo.lock @@ -1271,6 +1271,7 @@ dependencies = [ "codex-file-search", "codex-git", "codex-keyring-store", + "codex-metrics", "codex-otel", "codex-protocol", "codex-rmcp-client", @@ -1570,6 +1571,7 @@ dependencies = [ "sentry", "serde_json", "thiserror 2.0.17", + "tokio", "tracing", ] diff --git a/codex-rs/core/Cargo.toml b/codex-rs/core/Cargo.toml index 2b51b784cc9..c376bdfc9d4 100644 --- a/codex-rs/core/Cargo.toml +++ b/codex-rs/core/Cargo.toml @@ -28,6 +28,7 @@ codex-execpolicy = { workspace = true } codex-file-search = { workspace = true } codex-git = { workspace = true } codex-keyring-store = { workspace = true } +codex-metrics = { workspace = true } codex-otel = { workspace = true } codex-protocol = { workspace = true } codex-rmcp-client = { workspace = true } diff --git a/codex-rs/metrics/Cargo.toml b/codex-rs/metrics/Cargo.toml index 88d7cd5df95..7ef2d808603 100644 --- a/codex-rs/metrics/Cargo.toml +++ b/codex-rs/metrics/Cargo.toml @@ -9,6 +9,7 @@ reqwest = { workspace = true, features = ["blocking"] } sentry = { workspace = true } serde_json = { workspace = true } thiserror = { workspace = true } +tokio = { workspace = true, features = ["rt", "sync"] } tracing = { workspace = true } [dev-dependencies] diff --git a/codex-rs/metrics/src/client.rs b/codex-rs/metrics/src/client.rs index 611aee0790b..c5bb5051382 100644 --- a/codex-rs/metrics/src/client.rs +++ b/codex-rs/metrics/src/client.rs 
@@ -15,13 +15,12 @@ use sentry::types::Dsn; use std::collections::BTreeMap; use std::sync::Arc; use std::sync::Mutex; -use std::sync::mpsc; -use std::sync::mpsc::Receiver; -use std::sync::mpsc::SyncSender; -use std::sync::mpsc::TrySendError; use std::thread; use std::time::Duration; use std::time::Instant; +use tokio::runtime::Builder; +use tokio::sync::mpsc; +use tokio::sync::mpsc::error::TrySendError; enum WorkerMessage { Batch(MetricsBatch), @@ -29,7 +28,7 @@ enum WorkerMessage { } struct WorkerState { - sender: Mutex>>, + sender: Mutex>>, handle: Mutex>>, capacity: usize, } @@ -111,15 +110,15 @@ impl MetricsClient { let auth_header = dsn.to_auth(Some(&config.user_agent)).to_string(); - let core = ClientCore { + let core = Arc::new(ClientCore { dsn, http, auth_header, default_tags: config.default_tags, - }; + }); - let (sender, receiver) = mpsc::sync_channel(capacity); - let handle = thread::spawn(move || run_worker(core, receiver)); + let (sender, receiver) = mpsc::channel(capacity); + let handle = thread::spawn(move || run_worker_thread(core, receiver)); Ok(Self { state: Arc::new(WorkerState { @@ -223,7 +222,7 @@ impl MetricsClient { Err(TrySendError::Full(_)) => Err(MetricsError::QueueFull { capacity: self.state.capacity, }), - Err(TrySendError::Disconnected(_)) => Err(MetricsError::WorkerUnavailable), + Err(TrySendError::Closed(_)) => Err(MetricsError::WorkerUnavailable), } } @@ -280,12 +279,29 @@ impl Drop for MetricsClient { } } -fn run_worker(client: ClientCore, receiver: Receiver) { - while let Ok(message) = receiver.recv() { +fn run_worker_thread(client: Arc, receiver: mpsc::Receiver) { + let runtime = Builder::new_current_thread() + .enable_all() + .build() + .expect("metrics runtime"); + let handle = runtime.spawn(run_worker(client, receiver)); + runtime.block_on(async { + if let Err(err) = handle.await { + panic!("metrics worker panicked: {err}"); + } + }); +} + +async fn run_worker(client: Arc, mut receiver: mpsc::Receiver) { + while let 
Some(message) = receiver.recv().await { match message { WorkerMessage::Batch(batch) => { - if let Err(err) = client.send(batch) { - error_or_panic(format!("metrics send failed: {err}")); + let client = Arc::clone(&client); + let send_result = tokio::task::spawn_blocking(move || client.send(batch)).await; + match send_result { + Ok(Ok(())) => {} + Ok(Err(err)) => error_or_panic(format!("metrics send failed: {err}")), + Err(err) => error_or_panic(format!("metrics send task panicked: {err}")), } } WorkerMessage::Shutdown => break, From 2f18fc5fb571b744501d02c1cc91d84ee283bc36 Mon Sep 17 00:00:00 2001 From: jif-oai Date: Fri, 19 Dec 2025 19:09:24 +0100 Subject: [PATCH 08/43] Move to otel --- codex-rs/Cargo.lock | 16 +- codex-rs/Cargo.toml | 2 - codex-rs/core/Cargo.toml | 1 - codex-rs/core/src/client.rs | 2 +- codex-rs/core/src/codex.rs | 2 +- codex-rs/core/src/otel_init.rs | 2 +- codex-rs/core/src/state/service.rs | 2 +- codex-rs/core/src/tools/orchestrator.rs | 4 +- .../core/tests/chat_completions_payload.rs | 2 +- codex-rs/core/tests/chat_completions_sse.rs | 2 +- codex-rs/core/tests/responses_headers.rs | 2 +- codex-rs/core/tests/suite/client.rs | 2 +- codex-rs/docs/metrics.md | 160 ++++++++ codex-rs/metrics/Cargo.toml | 16 - codex-rs/metrics/README.md | 132 ------ codex-rs/metrics/src/client.rs | 310 -------------- codex-rs/metrics/src/config.rs | 55 --- codex-rs/metrics/src/lib.rs | 26 -- codex-rs/metrics/src/statsd.rs | 102 ----- codex-rs/metrics/tests/harness/mod.rs | 165 -------- codex-rs/metrics/tests/suite/mod.rs | 3 - codex-rs/metrics/tests/suite/send.rs | 207 ---------- codex-rs/metrics/tests/suite/timing.rs | 113 ------ codex-rs/metrics/tests/suite/validation.rs | 130 ------ codex-rs/metrics/tests/tests.rs | 2 - codex-rs/otel/Cargo.toml | 11 +- codex-rs/otel/README.md | 136 +++++++ codex-rs/otel/src/lib.rs | 5 +- .../src => otel/src/metrics}/batch.rs | 72 ++-- codex-rs/otel/src/metrics/client.rs | 376 +++++++++++++++++ codex-rs/otel/src/metrics/config.rs | 
100 +++++ .../src => otel/src/metrics}/error.rs | 40 +- codex-rs/otel/src/metrics/mod.rs | 29 ++ codex-rs/otel/src/metrics/tags.rs | 32 ++ codex-rs/otel/src/metrics/tests.rs | 382 ++++++++++++++++++ .../{metrics/src => otel/src/metrics}/time.rs | 0 .../{metrics/src => otel/src/metrics}/util.rs | 0 .../src => otel/src/metrics}/validation.rs | 4 +- codex-rs/otel/src/traces/mod.rs | 2 + .../otel/src/{ => traces}/otel_manager.rs | 214 +++++++++- .../otel/src/{ => traces}/otel_provider.rs | 0 41 files changed, 1510 insertions(+), 1353 deletions(-) create mode 100644 codex-rs/docs/metrics.md delete mode 100644 codex-rs/metrics/Cargo.toml delete mode 100644 codex-rs/metrics/README.md delete mode 100644 codex-rs/metrics/src/client.rs delete mode 100644 codex-rs/metrics/src/config.rs delete mode 100644 codex-rs/metrics/src/lib.rs delete mode 100644 codex-rs/metrics/src/statsd.rs delete mode 100644 codex-rs/metrics/tests/harness/mod.rs delete mode 100644 codex-rs/metrics/tests/suite/mod.rs delete mode 100644 codex-rs/metrics/tests/suite/send.rs delete mode 100644 codex-rs/metrics/tests/suite/timing.rs delete mode 100644 codex-rs/metrics/tests/suite/validation.rs delete mode 100644 codex-rs/metrics/tests/tests.rs create mode 100644 codex-rs/otel/README.md rename codex-rs/{metrics/src => otel/src/metrics}/batch.rs (83%) create mode 100644 codex-rs/otel/src/metrics/client.rs create mode 100644 codex-rs/otel/src/metrics/config.rs rename codex-rs/{metrics/src => otel/src/metrics}/error.rs (68%) create mode 100644 codex-rs/otel/src/metrics/mod.rs create mode 100644 codex-rs/otel/src/metrics/tags.rs create mode 100644 codex-rs/otel/src/metrics/tests.rs rename codex-rs/{metrics/src => otel/src/metrics}/time.rs (100%) rename codex-rs/{metrics/src => otel/src/metrics}/util.rs (100%) rename codex-rs/{metrics/src => otel/src/metrics}/validation.rs (95%) create mode 100644 codex-rs/otel/src/traces/mod.rs rename codex-rs/otel/src/{ => traces}/otel_manager.rs (74%) rename 
codex-rs/otel/src/{ => traces}/otel_provider.rs (100%) diff --git a/codex-rs/Cargo.lock b/codex-rs/Cargo.lock index c8c791b9c2f..1d8e247c0cc 100644 --- a/codex-rs/Cargo.lock +++ b/codex-rs/Cargo.lock @@ -1271,7 +1271,6 @@ dependencies = [ "codex-file-search", "codex-git", "codex-keyring-store", - "codex-metrics", "codex-otel", "codex-protocol", "codex-rmcp-client", @@ -1562,19 +1561,6 @@ dependencies = [ "wiremock", ] -[[package]] -name = "codex-metrics" -version = "0.0.0" -dependencies = [ - "pretty_assertions", - "reqwest", - "sentry", - "serde_json", - "thiserror 2.0.17", - "tokio", - "tracing", -] - [[package]] name = "codex-ollama" version = "0.0.0" @@ -1607,10 +1593,12 @@ dependencies = [ "opentelemetry-otlp", "opentelemetry-semantic-conventions", "opentelemetry_sdk", + "pretty_assertions", "reqwest", "serde", "serde_json", "strum_macros 0.27.2", + "thiserror 2.0.17", "tokio", "tonic", "tracing", diff --git a/codex-rs/Cargo.toml b/codex-rs/Cargo.toml index fb817320611..50941771cf2 100644 --- a/codex-rs/Cargo.toml +++ b/codex-rs/Cargo.toml @@ -26,7 +26,6 @@ members = [ "login", "mcp-server", "mcp-types", - "metrics", "ollama", "process-hardening", "protocol", @@ -83,7 +82,6 @@ codex-linux-sandbox = { path = "linux-sandbox" } codex-lmstudio = { path = "lmstudio" } codex-login = { path = "login" } codex-mcp-server = { path = "mcp-server" } -codex-metrics = { path = "metrics" } codex-ollama = { path = "ollama" } codex-otel = { path = "otel" } codex-process-hardening = { path = "process-hardening" } diff --git a/codex-rs/core/Cargo.toml b/codex-rs/core/Cargo.toml index c376bdfc9d4..2b51b784cc9 100644 --- a/codex-rs/core/Cargo.toml +++ b/codex-rs/core/Cargo.toml @@ -28,7 +28,6 @@ codex-execpolicy = { workspace = true } codex-file-search = { workspace = true } codex-git = { workspace = true } codex-keyring-store = { workspace = true } -codex-metrics = { workspace = true } codex-otel = { workspace = true } codex-protocol = { workspace = true } codex-rmcp-client = { 
workspace = true } diff --git a/codex-rs/core/src/client.rs b/codex-rs/core/src/client.rs index aaf3b0ea353..dc0bf3872da 100644 --- a/codex-rs/core/src/client.rs +++ b/codex-rs/core/src/client.rs @@ -18,7 +18,7 @@ use codex_api::common::Reasoning; use codex_api::create_text_param_for_request; use codex_api::error::ApiError; use codex_app_server_protocol::AuthMode; -use codex_otel::otel_manager::OtelManager; +use codex_otel::traces::otel_manager::OtelManager; use codex_protocol::ConversationId; use codex_protocol::config_types::ReasoningSummary as ReasoningSummaryConfig; use codex_protocol::models::ResponseItem; diff --git a/codex-rs/core/src/codex.rs b/codex-rs/core/src/codex.rs index c15fa03cfd7..746790b3287 100644 --- a/codex-rs/core/src/codex.rs +++ b/codex-rs/core/src/codex.rs @@ -150,7 +150,7 @@ use crate::user_notification::UserNotification; use crate::util::backoff; use codex_async_utils::OrCancelExt; use codex_execpolicy::Policy as ExecPolicy; -use codex_otel::otel_manager::OtelManager; +use codex_otel::traces::otel_manager::OtelManager; use codex_protocol::config_types::ReasoningSummary as ReasoningSummaryConfig; use codex_protocol::models::ContentItem; use codex_protocol::models::ResponseInputItem; diff --git a/codex-rs/core/src/otel_init.rs b/codex-rs/core/src/otel_init.rs index ece5a6bf500..8736869126f 100644 --- a/codex-rs/core/src/otel_init.rs +++ b/codex-rs/core/src/otel_init.rs @@ -6,7 +6,7 @@ use codex_otel::config::OtelExporter; use codex_otel::config::OtelHttpProtocol; use codex_otel::config::OtelSettings; use codex_otel::config::OtelTlsConfig as OtelTlsSettings; -use codex_otel::otel_provider::OtelProvider; +use codex_otel::traces::otel_provider::OtelProvider; use std::error::Error; /// Build an OpenTelemetry provider from the app Config. 
diff --git a/codex-rs/core/src/state/service.rs b/codex-rs/core/src/state/service.rs index e06691955fc..b859af310a8 100644 --- a/codex-rs/core/src/state/service.rs +++ b/codex-rs/core/src/state/service.rs @@ -8,7 +8,7 @@ use crate::skills::SkillsManager; use crate::tools::sandboxing::ApprovalStore; use crate::unified_exec::UnifiedExecSessionManager; use crate::user_notification::UserNotifier; -use codex_otel::otel_manager::OtelManager; +use codex_otel::traces::otel_manager::OtelManager; use tokio::sync::Mutex; use tokio::sync::RwLock; use tokio_util::sync::CancellationToken; diff --git a/codex-rs/core/src/tools/orchestrator.rs b/codex-rs/core/src/tools/orchestrator.rs index 7853617238e..113ad98e0f6 100644 --- a/codex-rs/core/src/tools/orchestrator.rs +++ b/codex-rs/core/src/tools/orchestrator.rs @@ -45,8 +45,8 @@ impl ToolOrchestrator { let otel = turn_ctx.client.get_otel_manager(); let otel_tn = &tool_ctx.tool_name; let otel_ci = &tool_ctx.call_id; - let otel_user = codex_otel::otel_manager::ToolDecisionSource::User; - let otel_cfg = codex_otel::otel_manager::ToolDecisionSource::Config; + let otel_user = codex_otel::traces::otel_manager::ToolDecisionSource::User; + let otel_cfg = codex_otel::traces::otel_manager::ToolDecisionSource::Config; // 1) Approval let mut already_approved = false; diff --git a/codex-rs/core/tests/chat_completions_payload.rs b/codex-rs/core/tests/chat_completions_payload.rs index 3e53fa85cf9..c36fef937ef 100644 --- a/codex-rs/core/tests/chat_completions_payload.rs +++ b/codex-rs/core/tests/chat_completions_payload.rs @@ -13,7 +13,7 @@ use codex_core::Prompt; use codex_core::ResponseItem; use codex_core::WireApi; use codex_core::openai_models::models_manager::ModelsManager; -use codex_otel::otel_manager::OtelManager; +use codex_otel::traces::otel_manager::OtelManager; use codex_protocol::ConversationId; use codex_protocol::models::ReasoningItemContent; use codex_protocol::protocol::SessionSource; diff --git 
a/codex-rs/core/tests/chat_completions_sse.rs b/codex-rs/core/tests/chat_completions_sse.rs index 969fa47b86c..c5820f57123 100644 --- a/codex-rs/core/tests/chat_completions_sse.rs +++ b/codex-rs/core/tests/chat_completions_sse.rs @@ -12,7 +12,7 @@ use codex_core::ResponseEvent; use codex_core::ResponseItem; use codex_core::WireApi; use codex_core::openai_models::models_manager::ModelsManager; -use codex_otel::otel_manager::OtelManager; +use codex_otel::traces::otel_manager::OtelManager; use codex_protocol::ConversationId; use codex_protocol::models::ReasoningItemContent; use codex_protocol::protocol::SessionSource; diff --git a/codex-rs/core/tests/responses_headers.rs b/codex-rs/core/tests/responses_headers.rs index 382c8875ce0..580fc58bcc4 100644 --- a/codex-rs/core/tests/responses_headers.rs +++ b/codex-rs/core/tests/responses_headers.rs @@ -11,7 +11,7 @@ use codex_core::ResponseEvent; use codex_core::ResponseItem; use codex_core::WireApi; use codex_core::openai_models::models_manager::ModelsManager; -use codex_otel::otel_manager::OtelManager; +use codex_otel::traces::otel_manager::OtelManager; use codex_protocol::ConversationId; use codex_protocol::config_types::ReasoningSummary; use codex_protocol::openai_models::ReasoningSummaryFormat; diff --git a/codex-rs/core/tests/suite/client.rs b/codex-rs/core/tests/suite/client.rs index 35a67a69299..132dc50aadd 100644 --- a/codex-rs/core/tests/suite/client.rs +++ b/codex-rs/core/tests/suite/client.rs @@ -20,7 +20,7 @@ use codex_core::openai_models::models_manager::ModelsManager; use codex_core::protocol::EventMsg; use codex_core::protocol::Op; use codex_core::protocol::SessionSource; -use codex_otel::otel_manager::OtelManager; +use codex_otel::traces::otel_manager::OtelManager; use codex_protocol::ConversationId; use codex_protocol::config_types::ReasoningSummary; use codex_protocol::config_types::Verbosity; diff --git a/codex-rs/docs/metrics.md b/codex-rs/docs/metrics.md new file mode 100644 index 
00000000000..28a095321a6 --- /dev/null +++ b/codex-rs/docs/metrics.md @@ -0,0 +1,160 @@ +# Metrics (Statsig + OTEL) + +The `codex_otel::metrics` module sends counters and histograms to a Statsig +backend using OTLP/HTTP. It uses a background worker to keep callers +non-blocking and exports metrics via OpenTelemetry. + +You must supply a Statsig OTLP endpoint and API key. This module ships with +placeholders (`<statsig-otlp-endpoint>`, `<statsig-api-key>`, +`<statsig-api-key-header>`) so they are obvious to replace. + +## Quick start + +```rust +use codex_otel::metrics::HistogramBuckets; +use codex_otel::metrics::MetricsClient; +use codex_otel::metrics::MetricsConfig; + +let metrics = MetricsClient::new( + MetricsConfig::new("<statsig-api-key>") + .with_endpoint("<statsig-otlp-endpoint>") + .with_api_key_header("<statsig-api-key-header>") + .with_tag("service", "codex-cli")?, +)?; + +let buckets = HistogramBuckets::from_values(&[25, 50, 100, 250, 500, 1000])?; + +metrics.counter("codex.session_started", 1, &[("source", "tui")])?; +metrics.histogram("codex.request_latency", 83, &buckets, &[("route", "chat")])?; +``` + +## OtelManager facade + +If you're already using `OtelManager` for tracing, you can attach a metrics +client and emit metrics through the same handle. By default, metrics sent via +`OtelManager` include metadata tags: `auth_mode`, `model`, `slug`, +`terminal.type`, and `app.version`. Use +`with_metrics_without_metadata_tags` to opt out. 
+ +```rust +use codex_otel::metrics::HistogramBuckets; +use codex_otel::metrics::MetricsConfig; +use codex_otel::traces::otel_manager::OtelManager; + +let manager = OtelManager::new( + conversation_id, + model, + slug, + account_id, + account_email, + auth_mode, + log_user_prompts, + terminal_type, + session_source, +) +.with_metrics_config( + MetricsConfig::new("<statsig-api-key>") + .with_endpoint("<statsig-otlp-endpoint>") + .with_api_key_header("<statsig-api-key-header>"), +)?; + +let buckets = HistogramBuckets::from_values(&[25, 50, 100, 250, 500])?; +manager.counter("codex.session_started", 1, &[("source", "tui")])?; +manager.histogram("codex.request_latency", 83, &buckets, &[("route", "chat")])?; +``` + +## Configuration + +`MetricsConfig` lets you specify: + +- `MetricsConfig::new(api_key)` to set the Statsig API key. +- `with_endpoint(endpoint)` to set the OTLP endpoint. +- `with_api_key_header(header)` to set the API key header name. +- `with_tag(key, value)` to add default tags for every metric. +- `with_timeout(duration)` to set the OTLP export timeout. +- `with_export_interval(duration)` to set the periodic export interval. +- `with_user_agent(agent)` to override the HTTP `User-Agent` header. + +The queue capacity is fixed at 1024 entries. + +## Histograms + +Histograms are recorded as OpenTelemetry histograms. Bucket boundaries are +controlled by the OTEL pipeline (collector/exporter configuration). The +`HistogramBuckets` type is retained for API compatibility and validation but +is not used to pre-bucket samples. 
+ +## Timing + +Measure a closure and emit a histogram sample for the elapsed time in +milliseconds: + +```rust +let result = metrics.time("codex.request_latency", &buckets, &[("route", "chat")], || { + "ok" +})?; +``` + +If the closure already returns `codex_otel::metrics::Result`, use +`time_result` to avoid nested results: + +```rust +let result = metrics.time_result( + "codex.request_latency", + &buckets, + &[("route", "chat")], + || Ok("ok"), +)?; +``` + +If you already have a duration, record it directly: + +```rust +metrics.record_duration( + "codex.request_latency", + std::time::Duration::from_millis(83), + &buckets, + &[("route", "chat")], +)?; +``` + +## Batching + +Batching reduces overhead and keeps metrics aligned in time: + +```rust +let mut batch = metrics.batch(); +batch.counter("codex.turns", 1, &[("model", "gpt-5.1")])?; +batch.histogram("codex.tool_latency", 140, &buckets, &[("tool", "shell")])?; +metrics.send(batch)?; +``` + +## Shutdown and queue capacity + +The client uses a bounded queue with a fixed capacity of 1024 entries. Enqueueing returns a +`MetricsError::QueueFull` error if the queue is full or +`MetricsError::WorkerUnavailable` if the worker is no longer running. + +`shutdown` flushes queued metrics, requests a final export, and waits up to +500ms for the worker to stop. `MetricsClient` also attempts a best-effort +shutdown on drop using the default timeout, so explicit calls to `shutdown` +are optional. + +## Validation rules + +Metric names: + +- Must be non-empty. +- Allowed characters: ASCII letters/digits plus `.`, `_`, `-`. + +Tag keys and values: + +- Must be non-empty. +- Allowed characters: ASCII letters/digits plus `.`, `_`, `-`, `/`. +- The tag key `le` is reserved. + +## Error handling + +All APIs return `codex_otel::metrics::Result` with a `MetricsError` variant +on failure. Errors cover invalid configuration, validation failures, queue +backpressure, and OTLP exporter setup issues. 
diff --git a/codex-rs/metrics/Cargo.toml b/codex-rs/metrics/Cargo.toml deleted file mode 100644 index 7ef2d808603..00000000000 --- a/codex-rs/metrics/Cargo.toml +++ /dev/null @@ -1,16 +0,0 @@ -[package] -name = "codex-metrics" -version.workspace = true -edition.workspace = true -license.workspace = true - -[dependencies] -reqwest = { workspace = true, features = ["blocking"] } -sentry = { workspace = true } -serde_json = { workspace = true } -thiserror = { workspace = true } -tokio = { workspace = true, features = ["rt", "sync"] } -tracing = { workspace = true } - -[dev-dependencies] -pretty_assertions = { workspace = true } diff --git a/codex-rs/metrics/README.md b/codex-rs/metrics/README.md deleted file mode 100644 index 4232d7c5801..00000000000 --- a/codex-rs/metrics/README.md +++ /dev/null @@ -1,132 +0,0 @@ -# codex-metrics - -Send lightweight counters and histogram buckets to Sentry via the statsd envelope item. - -Key points: -- Non-blocking for the sender. Metrics are processed by a dedicated worker. -- Tag validation and metric name validation are enforced before send to match Sentry requirements. - -## Quick start - -```rust -let metrics = MetricsClient::new( - MetricsConfig::default() // Default to the standard Sentry DSN. - .with_tag("service", "codex-cli")?, -)?; - -let buckets = HistogramBuckets::from_values(&[25, 50, 100, 250, 500, 1000])?; - -metrics.counter("codex.session_started", 1, &[("source", "tui")])?; -metrics.histogram("codex.request_latency", 83, &buckets, &[("route", "chat")])?; -``` - -## Configuration - -`MetricsConfig` lets you specify: - -- `MetricsConfig::new(dsn)` to set the Sentry DSN. -- `with_tag(key, value)` to add default tags. -- `with_timeout(duration)` to override the HTTP timeout (default 10s). -- `with_user_agent(agent)` to override the user agent. - -The queue capacity is fixed at 1024 entries. - -## Buckets - -`HistogramBuckets` supports: -- `from_values(&[...])` for explicit upper bounds. 
-- `from_range(from, to, step)` to build linear buckets. Requires `step > 0` and `from <= to`. The upper bound is always included. -- `from_exponential(from, to, factor)` to build exponential buckets. Requires `from > 0`, `from <= to`, and a finite `factor > 1`. The upper bound is always included. - -## Sending metrics - -Counters send a single statsd counter increment with tags: - -```rust -metrics.counter("codex.session_started", 1, &[("source", "tui")])?; -``` - -Histograms are translated into bucket counters by adding an `le` tag for each -bound that is greater than or equal to the value, plus a final `le=inf` bucket -so the histogram is cumulative per the statsd `le` convention: - -```rust -metrics.histogram("codex.request_latency", 83, &buckets, &[("route", "chat")])?; -``` - -`counter`, `histogram`, and `send` enqueue metrics for the background worker. -Call `shutdown` to flush queued metrics on exit. - -## Timing - -Measure a closure and emit a histogram sample for the elapsed time in milliseconds: - -```rust -let result = metrics.time("codex.request_latency", &buckets, &[("route", "chat")], || { - "ok" -})?; -``` - -If the closure already returns `codex_metrics::Result`, use `time_result` to -avoid nested results: - -```rust -let result = metrics.time_result("codex.request_latency", &buckets, &[("route", "chat")], || { - Ok("ok") -})?; -``` - -If you already have a duration, record it directly: - -```rust -metrics.record_duration( - "codex.request_latency", - std::time::Duration::from_millis(83), - &buckets, - &[("route", "chat")], -)?; -``` - -## Batching - -Batching reduces network requests and ensure metrics have the same timestamp. - -```rust -let mut batch = metrics.batch(); -batch.counter("codex.turns", 1, &[("model", "gpt-5.1")])?; -batch.histogram("codex.tool_latency", 140, &buckets, &[("tool", "shell")])?; -metrics.send(batch)?; -``` - -## Shutdown and queue capacity - -The client uses a bounded queue (default capacity 1024). 
Enqueueing returns a -`MetricsError::QueueFull` error if the queue is full or `MetricsError::WorkerUnavailable` -if the worker is no longer running. - -`shutdown` waits up to 500ms for the worker to stop. - -Uploads are best-effort; if the worker encounters a send error, the metric is -dropped (if in `alpha`, or debug mode, the worker will panic on errors). - -`MetricsClient` also attempts a best-effort shutdown on drop using the default -timeout, so explicit calls to `shutdown` are optional. - -## Validation rules - -Metric names: - -- Must be non-empty. -- Allowed characters: ASCII letters/digits plus `.`, `_`, `-`. - -Tag keys and values: - -- Must be non-empty. -- Allowed characters: ASCII letters/digits plus `.`, `_`, `-`, `/`. -- The tag key `le` is reserved for histogram bucketing. - -## Error handling - -All APIs return `codex_metrics::Result` with a `MetricsError` variant on -failure. Errors cover invalid configuration, validation failures, and HTTP or -serialization failures. diff --git a/codex-rs/metrics/src/client.rs b/codex-rs/metrics/src/client.rs deleted file mode 100644 index c5bb5051382..00000000000 --- a/codex-rs/metrics/src/client.rs +++ /dev/null @@ -1,310 +0,0 @@ -use crate::DEFAULT_QUEUE_CAPACITY; -use crate::DEFAULT_SHUTDOWN_TIMEOUT; -use crate::ENVELOPE_CONTENT_TYPE; -use crate::SHUTDOWN_POLL_INTERVAL; -use crate::batch::HistogramBuckets; -use crate::batch::MetricsBatch; -use crate::config::MetricsConfig; -use crate::error::MetricsError; -use crate::error::Result; -use crate::statsd::build_statsd_envelope; -use crate::time::duration_to_millis; -use crate::util::error_or_panic; -use crate::validation::validate_tags; -use sentry::types::Dsn; -use std::collections::BTreeMap; -use std::sync::Arc; -use std::sync::Mutex; -use std::thread; -use std::time::Duration; -use std::time::Instant; -use tokio::runtime::Builder; -use tokio::sync::mpsc; -use tokio::sync::mpsc::error::TrySendError; - -enum WorkerMessage { - Batch(MetricsBatch), - Shutdown, -} 
- -struct WorkerState { - sender: Mutex>>, - handle: Mutex>>, - capacity: usize, -} - -#[derive(Debug)] -struct ClientCore { - dsn: Dsn, - http: reqwest::blocking::Client, - auth_header: String, - default_tags: BTreeMap, -} - -impl ClientCore { - fn send(&self, batch: MetricsBatch) -> Result<()> { - if batch.is_empty() { - return Ok(()); - } - - let payload = batch.render(&self.default_tags)?; - let envelope = build_statsd_envelope(&self.dsn, &payload)?; - - let response = self - .http - .post(self.dsn.envelope_api_url()) - .header("X-Sentry-Auth", &self.auth_header) - .header("Content-Type", ENVELOPE_CONTENT_TYPE) - .body(envelope) - .send() - .map_err(|source| MetricsError::SendEnvelope { source })?; - - if !response.status().is_success() { - let status = response.status(); - let body = response - .text() - .map(|body| { - if body.is_empty() { - String::new() - } else { - format!(" body: {body}") - } - }) - .unwrap_or_default(); - return Err(MetricsError::SentryUploadFailed { status, body }); - } - - Ok(()) - } -} - -/// Background metrics client that enqueues metrics to a dedicated worker thread. -#[derive(Clone)] -pub struct MetricsClient { - state: Arc, -} - -impl MetricsClient { - /// Build a metrics client from configuration and validate defaults. 
- pub fn new(config: MetricsConfig) -> Result { - let capacity = DEFAULT_QUEUE_CAPACITY; - - if capacity == 0 { - return Err(MetricsError::QueueCapacityZero); - } - - let dsn_value = config.dsn.clone(); - let dsn = dsn_value - .parse::() - .map_err(|source| MetricsError::InvalidDsn { - dsn: dsn_value, - source, - })?; - validate_tags(&config.default_tags)?; - - let http = reqwest::blocking::Client::builder() - .timeout(config.timeout) - .user_agent(config.user_agent.clone()) - .build() - .map_err(|source| MetricsError::HttpClientBuild { source })?; - - let auth_header = dsn.to_auth(Some(&config.user_agent)).to_string(); - - let core = Arc::new(ClientCore { - dsn, - http, - auth_header, - default_tags: config.default_tags, - }); - - let (sender, receiver) = mpsc::channel(capacity); - let handle = thread::spawn(move || run_worker_thread(core, receiver)); - - Ok(Self { - state: Arc::new(WorkerState { - sender: Mutex::new(Some(sender)), - handle: Mutex::new(Some(handle)), - capacity, - }), - }) - } - - /// Send a single counter increment without blocking the caller. - pub fn counter(&self, name: &str, inc: i64, tags: &[(&str, &str)]) -> Result<()> { - let mut batch = MetricsBatch::new(); - batch.counter(name, inc, tags)?; - self.send(batch) - } - - /// Send a single histogram sample with the provided buckets. - pub fn histogram( - &self, - name: &str, - value: i64, - buckets: &HistogramBuckets, - tags: &[(&str, &str)], - ) -> Result<()> { - let mut batch = MetricsBatch::new(); - batch.histogram(name, value, buckets, tags)?; - self.send(batch) - } - - /// Record a duration in milliseconds using histogram buckets. - pub fn record_duration( - &self, - name: &str, - duration: Duration, - buckets: &HistogramBuckets, - tags: &[(&str, &str)], - ) -> Result<()> { - let millis = duration_to_millis(duration); - self.histogram(name, millis, buckets, tags) - } - - /// Measure a closure and emit a histogram sample for the elapsed time. 
- pub fn time( - &self, - name: &str, - buckets: &HistogramBuckets, - tags: &[(&str, &str)], - f: impl FnOnce() -> T, - ) -> Result { - let start = Instant::now(); - let output = f(); - self.record_duration(name, start.elapsed(), buckets, tags)?; - Ok(output) - } - - /// Measure a closure that returns a metrics result without nesting results. - pub fn time_result( - &self, - name: &str, - buckets: &HistogramBuckets, - tags: &[(&str, &str)], - f: impl FnOnce() -> Result, - ) -> Result { - let start = Instant::now(); - let output = f(); - match output { - Ok(value) => { - self.record_duration(name, start.elapsed(), buckets, tags)?; - Ok(value) - } - Err(err) => { - let _ = self.record_duration(name, start.elapsed(), buckets, tags); - Err(err) - } - } - } - - /// Create an empty batch for multi-metric sends. - pub fn batch(&self) -> MetricsBatch { - MetricsBatch::new() - } - - /// Enqueue a batch of metrics for the worker to send. - pub fn send(&self, batch: MetricsBatch) -> Result<()> { - if batch.is_empty() { - return Ok(()); - } - - let sender = self - .state - .sender - .lock() - .unwrap_or_else(std::sync::PoisonError::into_inner); - let Some(sender) = sender.as_ref() else { - return Err(MetricsError::WorkerUnavailable); - }; - - match sender.try_send(WorkerMessage::Batch(batch)) { - Ok(()) => Ok(()), - Err(TrySendError::Full(_)) => Err(MetricsError::QueueFull { - capacity: self.state.capacity, - }), - Err(TrySendError::Closed(_)) => Err(MetricsError::WorkerUnavailable), - } - } - - /// Flush queued metrics and stop the worker thread. 
- pub fn shutdown(&self) -> Result<()> { - self.shutdown_inner(DEFAULT_SHUTDOWN_TIMEOUT) - } - - fn shutdown_inner(&self, timeout: Duration) -> Result<()> { - let sender = self - .state - .sender - .lock() - .unwrap_or_else(std::sync::PoisonError::into_inner) - .take(); - let mut handle = self - .state - .handle - .lock() - .unwrap_or_else(std::sync::PoisonError::into_inner); - let Some(handle) = handle.take() else { - return Ok(()); - }; - - if let Some(sender) = sender { - let _ = sender.try_send(WorkerMessage::Shutdown); - } - - if timeout.is_zero() { - if handle.is_finished() { - handle.join().map_err(|_| MetricsError::WorkerPanicked)?; - } - return Ok(()); - } - - let start = Instant::now(); - while start.elapsed() < timeout { - if handle.is_finished() { - handle.join().map_err(|_| MetricsError::WorkerPanicked)?; - return Ok(()); - } - thread::sleep(SHUTDOWN_POLL_INTERVAL); - } - - Ok(()) - } -} - -impl Drop for MetricsClient { - fn drop(&mut self) { - if Arc::strong_count(&self.state) == 1 { - let _ = self.shutdown_inner(DEFAULT_SHUTDOWN_TIMEOUT); - } - } -} - -fn run_worker_thread(client: Arc, receiver: mpsc::Receiver) { - let runtime = Builder::new_current_thread() - .enable_all() - .build() - .expect("metrics runtime"); - let handle = runtime.spawn(run_worker(client, receiver)); - runtime.block_on(async { - if let Err(err) = handle.await { - panic!("metrics worker panicked: {err}"); - } - }); -} - -async fn run_worker(client: Arc, mut receiver: mpsc::Receiver) { - while let Some(message) = receiver.recv().await { - match message { - WorkerMessage::Batch(batch) => { - let client = Arc::clone(&client); - let send_result = tokio::task::spawn_blocking(move || client.send(batch)).await; - match send_result { - Ok(Ok(())) => {} - Ok(Err(err)) => error_or_panic(format!("metrics send failed: {err}")), - Err(err) => error_or_panic(format!("metrics send task panicked: {err}")), - } - } - WorkerMessage::Shutdown => break, - } - } -} diff --git 
a/codex-rs/metrics/src/config.rs b/codex-rs/metrics/src/config.rs deleted file mode 100644 index 8ed75e74a5b..00000000000 --- a/codex-rs/metrics/src/config.rs +++ /dev/null @@ -1,55 +0,0 @@ -use crate::DEFAULT_TIMEOUT; -use crate::SENTRY_DSN; -use crate::error::Result; -use crate::validation::validate_tag_key; -use crate::validation::validate_tag_value; -use std::collections::BTreeMap; -use std::time::Duration; - -#[derive(Clone, Debug)] -pub struct MetricsConfig { - pub(crate) dsn: String, - pub(crate) default_tags: BTreeMap, - pub(crate) timeout: Duration, - pub(crate) user_agent: String, -} - -impl MetricsConfig { - /// Create a config with the provided DSN and default settings. - pub fn new(dsn: impl Into) -> Self { - Self { - dsn: dsn.into(), - default_tags: BTreeMap::new(), - timeout: DEFAULT_TIMEOUT, - user_agent: format!("codex-metrics/{}", env!("CARGO_PKG_VERSION")), - } - } - - /// Add a default tag that will be sent with every metric. - pub fn with_tag(mut self, key: impl Into, value: impl Into) -> Result { - let key = key.into(); - let value = value.into(); - validate_tag_key(&key)?; - validate_tag_value(&value)?; - self.default_tags.insert(key, value); - Ok(self) - } - - /// Override the HTTP timeout. - pub fn with_timeout(mut self, timeout: Duration) -> Self { - self.timeout = timeout; - self - } - - /// Override the user agent string. 
- pub fn with_user_agent(mut self, user_agent: impl Into) -> Self { - self.user_agent = user_agent.into(); - self - } -} - -impl Default for MetricsConfig { - fn default() -> Self { - Self::new(SENTRY_DSN) - } -} diff --git a/codex-rs/metrics/src/lib.rs b/codex-rs/metrics/src/lib.rs deleted file mode 100644 index f47380958e8..00000000000 --- a/codex-rs/metrics/src/lib.rs +++ /dev/null @@ -1,26 +0,0 @@ -mod batch; -mod client; -mod config; -mod error; -mod statsd; -mod time; -mod util; -mod validation; - -use std::time::Duration; - -pub(crate) const SENTRY_DSN: &str = - "https://ae32ed50620d7a7792c1ce5df38b3e3e@o33249.ingest.us.sentry.io/4510195390611458"; -pub(crate) const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10); -pub(crate) const STATSD_CONTENT_TYPE: &str = "text/plain"; -pub(crate) const ENVELOPE_CONTENT_TYPE: &str = "application/x-sentry-envelope"; -pub(crate) const DEFAULT_QUEUE_CAPACITY: usize = 1024; -pub(crate) const DEFAULT_SHUTDOWN_TIMEOUT: Duration = Duration::from_millis(500); -pub(crate) const SHUTDOWN_POLL_INTERVAL: Duration = Duration::from_millis(10); - -pub use crate::batch::HistogramBuckets; -pub use crate::batch::MetricsBatch; -pub use crate::client::MetricsClient; -pub use crate::config::MetricsConfig; -pub use crate::error::MetricsError; -pub use crate::error::Result; diff --git a/codex-rs/metrics/src/statsd.rs b/codex-rs/metrics/src/statsd.rs deleted file mode 100644 index 5ed54ca8165..00000000000 --- a/codex-rs/metrics/src/statsd.rs +++ /dev/null @@ -1,102 +0,0 @@ -use crate::STATSD_CONTENT_TYPE; -use crate::error::MetricsError; -use crate::error::Result; -use crate::validation::validate_metric_name; -use crate::validation::validate_tag_key; -use crate::validation::validate_tag_value; -use sentry::types::Dsn; -use std::collections::BTreeMap; - -pub(crate) struct StatsdLine { - name: String, - value: i64, - kind: MetricKind, - tags: Vec<(String, String)>, -} - -impl StatsdLine { - pub(crate) fn counter(name: &str, value: i64, tags: 
Vec<(String, String)>) -> Result { - validate_metric_name(name)?; - Ok(Self { - name: name.to_string(), - value, - kind: MetricKind::Counter, - tags, - }) - } - - pub(crate) fn render(&self, default_tags: &BTreeMap) -> Result { - let tags = merge_tags(default_tags, &self.tags); - let name = self.name.as_str(); - let value = self.value; - let kind = self.kind.as_str(); - let mut line = format!("{name}:{value}|{kind}"); - - if !tags.is_empty() { - let taglist = tags - .iter() - .map(|(key, value)| format!("{key}:{value}")) - .collect::>() - .join(","); - line.push_str("|#"); - line.push_str(&taglist); - } - - Ok(line) - } -} - -enum MetricKind { - Counter, -} - -impl MetricKind { - fn as_str(&self) -> &'static str { - match self { - MetricKind::Counter => "c", - } - } -} - -pub(crate) fn build_statsd_envelope(dsn: &Dsn, payload: &str) -> Result> { - let header = serde_json::json!({ - "dsn": dsn.to_string(), - }); - let mut bytes = Vec::new(); - serde_json::to_writer(&mut bytes, &header) - .map_err(|source| MetricsError::SerializeEnvelopeHeader { source })?; - bytes.push(b'\n'); - - let item_header = serde_json::json!({ - "type": "statsd", - "length": payload.len(), - "content_type": STATSD_CONTENT_TYPE, - }); - serde_json::to_writer(&mut bytes, &item_header) - .map_err(|source| MetricsError::SerializeEnvelopeItemHeader { source })?; - bytes.push(b'\n'); - bytes.extend_from_slice(payload.as_bytes()); - bytes.push(b'\n'); - Ok(bytes) -} - -pub(crate) fn collect_tags(tags: &[(&str, &str)]) -> Result> { - tags.iter() - .map(|(key, value)| { - validate_tag_key(key)?; - validate_tag_value(value)?; - Ok(((*key).to_string(), (*value).to_string())) - }) - .collect() -} - -fn merge_tags( - default_tags: &BTreeMap, - tags: &[(String, String)], -) -> BTreeMap { - let mut merged = default_tags.clone(); - for (key, value) in tags { - merged.insert(key.clone(), value.clone()); - } - merged -} diff --git a/codex-rs/metrics/tests/harness/mod.rs b/codex-rs/metrics/tests/harness/mod.rs 
deleted file mode 100644 index 3a51dd73574..00000000000 --- a/codex-rs/metrics/tests/harness/mod.rs +++ /dev/null @@ -1,165 +0,0 @@ -use serde_json::Value; -use std::collections::BTreeMap; -use std::io::Read; -use std::io::Write; -use std::net::TcpListener; -use std::net::TcpStream; -use std::thread; - -#[derive(Debug)] -pub(crate) struct CapturedRequest { - pub(crate) method: String, - pub(crate) path: String, - pub(crate) headers: BTreeMap, - pub(crate) body: Vec, -} - -#[derive(Debug)] -pub(crate) struct ParsedEnvelope { - pub(crate) header: Value, - pub(crate) item_header: Value, - pub(crate) payload: String, -} - -#[derive(Debug)] -pub(crate) struct ParsedStatsdLine { - pub(crate) name: String, - pub(crate) value: i64, - pub(crate) kind: String, - pub(crate) tags: BTreeMap, -} - -/// Spawn a simple HTTP server that captures one request and responds with `status`. -pub(crate) fn spawn_server(status: u16) -> (String, thread::JoinHandle) { - let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server"); - let addr = listener.local_addr().expect("local addr"); - let dsn = format!("http://public:@{addr}/123"); - - let handle = thread::spawn(move || { - let (mut stream, _) = listener.accept().expect("accept connection"); - let request = read_http_request(&mut stream); - let reason = match status { - 200 => "OK", - 500 => "Internal Server Error", - _ => "OK", - }; - let response = - format!("HTTP/1.1 {status} {reason}\r\nContent-Length: 0\r\nConnection: close\r\n\r\n"); - stream - .write_all(response.as_bytes()) - .expect("write response"); - request - }); - - (dsn, handle) -} - -// Read a single HTTP request from the stream and return the parsed data. 
-fn read_http_request(stream: &mut TcpStream) -> CapturedRequest { - let mut buffer = Vec::new(); - let mut chunk = [0_u8; 1024]; - let mut header_end = None; - while header_end.is_none() { - let read = stream.read(&mut chunk).expect("read request"); - if read == 0 { - break; - } - buffer.extend_from_slice(&chunk[..read]); - header_end = find_header_end(&buffer); - } - let header_end = header_end.expect("request headers"); - let headers_bytes = &buffer[..header_end]; - let headers_str = std::str::from_utf8(headers_bytes).expect("headers utf-8"); - let mut lines = headers_str.split("\r\n"); - let request_line = lines.next().expect("request line"); - let mut request_parts = request_line.split_whitespace(); - let method = request_parts.next().expect("method").to_string(); - let path = request_parts.next().expect("path").to_string(); - - let mut headers = BTreeMap::new(); - for line in lines { - if line.is_empty() { - continue; - } - if let Some((key, value)) = line.split_once(':') { - headers.insert(key.trim().to_ascii_lowercase(), value.trim().to_string()); - } - } - - let content_length = headers - .get("content-length") - .and_then(|value| value.parse::().ok()) - .unwrap_or(0); - let mut body = buffer[header_end..].to_vec(); - while body.len() < content_length { - let read = stream.read(&mut chunk).expect("read body"); - if read == 0 { - break; - } - body.extend_from_slice(&chunk[..read]); - } - - CapturedRequest { - method, - path, - headers, - body, - } -} - -// Locate the end of the HTTP headers in a buffered request. -fn find_header_end(buffer: &[u8]) -> Option { - buffer - .windows(4) - .position(|window| window == b"\r\n\r\n") - .map(|pos| pos + 4) -} - -/// Parse a Sentry envelope payload into headers and statsd payload text. 
-pub(crate) fn parse_envelope(body: &[u8]) -> ParsedEnvelope { - let mut parts = body.splitn(3, |byte| *byte == b'\n'); - let header_line = parts.next().expect("envelope header"); - let item_header_line = parts.next().expect("item header"); - let payload = parts.next().unwrap_or(&[]); - - let header = serde_json::from_slice(header_line).expect("parse envelope header"); - let item_header = serde_json::from_slice(item_header_line).expect("parse item header"); - let payload = std::str::from_utf8(payload) - .expect("payload utf-8") - .trim_end_matches('\n') - .to_string(); - - ParsedEnvelope { - header, - item_header, - payload, - } -} - -/// Parse a single statsd line (with optional tags) into components. -pub(crate) fn parse_statsd_line(line: &str) -> ParsedStatsdLine { - let (metric, tags_part) = line - .split_once("|#") - .map(|(metric, tags)| (metric, Some(tags))) - .unwrap_or((line, None)); - let (name_value, kind) = metric.split_once('|').expect("metric kind"); - let (name, value) = name_value.split_once(':').expect("metric value"); - let value = value.parse::().expect("metric value parse"); - - let mut tags = BTreeMap::new(); - if let Some(tags_part) = tags_part - && !tags_part.is_empty() - { - for tag in tags_part.split(',') { - let (key, value) = tag.split_once(':').expect("tag"); - tags.insert(key.to_string(), value.to_string()); - } - } - - ParsedStatsdLine { - name: name.to_string(), - value, - kind: kind.to_string(), - tags, - } -} diff --git a/codex-rs/metrics/tests/suite/mod.rs b/codex-rs/metrics/tests/suite/mod.rs deleted file mode 100644 index 42708df7981..00000000000 --- a/codex-rs/metrics/tests/suite/mod.rs +++ /dev/null @@ -1,3 +0,0 @@ -mod send; -mod timing; -mod validation; diff --git a/codex-rs/metrics/tests/suite/send.rs b/codex-rs/metrics/tests/suite/send.rs deleted file mode 100644 index 1abfdde37aa..00000000000 --- a/codex-rs/metrics/tests/suite/send.rs +++ /dev/null @@ -1,207 +0,0 @@ -use crate::harness::parse_envelope; -use 
crate::harness::parse_statsd_line; -use crate::harness::spawn_server; -use codex_metrics::HistogramBuckets; -use codex_metrics::MetricsClient; -use codex_metrics::MetricsConfig; -use codex_metrics::MetricsError; -use codex_metrics::Result; -use pretty_assertions::assert_eq; -use std::net::TcpListener; -use std::thread; -use std::time::Duration; - -// Ensures counters/histograms render with default + per-call tags. -#[test] -fn send_builds_payload_with_tags_and_histograms() -> Result<()> { - let (dsn, handle) = spawn_server(200); - let metrics = MetricsClient::new( - MetricsConfig::new(dsn.clone()) - .with_tag("service", "codex-cli")? - .with_tag("env", "prod")?, - )?; - let buckets = HistogramBuckets::from_values(&[25, 50, 100])?; - - let mut batch = metrics.batch(); - batch.counter("codex.turns", 1, &[("model", "gpt-5.1"), ("env", "dev")])?; - batch.histogram("codex.tool_latency", 25, &buckets, &[("tool", "shell")])?; - metrics.send(batch)?; - metrics.shutdown()?; - - let captured = handle.join().expect("server thread"); - assert_eq!(captured.method, "POST"); - assert_eq!(captured.path, "/api/123/envelope/"); - assert_eq!( - captured.headers.get("content-type").map(String::as_str), - Some("application/x-sentry-envelope") - ); - - let envelope = parse_envelope(&captured.body); - assert_eq!(envelope.header["dsn"].as_str(), Some(dsn.as_str())); - assert_eq!(envelope.item_header["type"], "statsd"); - assert_eq!(envelope.item_header["content_type"], "text/plain"); - assert_eq!( - envelope.item_header["length"].as_u64(), - Some(envelope.payload.len() as u64) - ); - - let lines: Vec<&str> = envelope.payload.split('\n').collect(); - assert_eq!(lines.len(), 5); - - let line = parse_statsd_line(lines[0]); - assert_eq!(line.name, "codex.turns"); - assert_eq!(line.value, 1); - assert_eq!(line.kind, "c"); - assert_eq!( - line.tags.get("service").map(String::as_str), - Some("codex-cli") - ); - assert_eq!(line.tags.get("env").map(String::as_str), Some("dev")); - 
assert_eq!(line.tags.get("model").map(String::as_str), Some("gpt-5.1")); - - for (line, expected_le) in lines.iter().skip(1).zip(["25", "50", "100", "inf"]) { - let line = parse_statsd_line(line); - assert_eq!(line.name, "codex.tool_latency"); - assert_eq!(line.value, 1); - assert_eq!(line.kind, "c"); - assert_eq!( - line.tags.get("service").map(String::as_str), - Some("codex-cli") - ); - assert_eq!(line.tags.get("env").map(String::as_str), Some("prod")); - assert_eq!(line.tags.get("tool").map(String::as_str), Some("shell")); - assert_eq!(line.tags.get("le").map(String::as_str), Some(expected_le)); - } - - Ok(()) -} - -// Ensures defaults merge per line and overrides take precedence. -#[test] -fn send_merges_default_tags_per_line() -> Result<()> { - let (dsn, handle) = spawn_server(200); - let metrics = MetricsClient::new( - MetricsConfig::new(dsn.clone()) - .with_tag("service", "codex-cli")? - .with_tag("env", "prod")? - .with_tag("region", "us")?, - )?; - - let mut batch = metrics.batch(); - batch.counter("codex.alpha", 1, &[("env", "dev"), ("component", "alpha")])?; - batch.counter( - "codex.beta", - 2, - &[("service", "worker"), ("component", "beta")], - )?; - metrics.send(batch)?; - metrics.shutdown()?; - - let captured = handle.join().expect("server thread"); - let envelope = parse_envelope(&captured.body); - let lines: Vec<&str> = envelope.payload.split('\n').collect(); - assert_eq!(lines.len(), 2); - assert_eq!( - lines[0], - "codex.alpha:1|c|#component:alpha,env:dev,region:us,service:codex-cli" - ); - assert_eq!( - lines[1], - "codex.beta:2|c|#component:beta,env:prod,region:us,service:worker" - ); - - Ok(()) -} - -// Verifies values above the max bucket use the inf tag. 
-#[test] -fn send_uses_inf_bucket_for_values_over_max() -> Result<()> { - let (dsn, handle) = spawn_server(200); - let metrics = MetricsClient::new(MetricsConfig::new(dsn))?; - let buckets = HistogramBuckets::from_values(&[10, 20])?; - - let mut batch = metrics.batch(); - batch.histogram("codex.tool_latency", 99, &buckets, &[("tool", "shell")])?; - metrics.send(batch)?; - metrics.shutdown()?; - - let captured = handle.join().expect("server thread"); - let envelope = parse_envelope(&captured.body); - let lines: Vec<&str> = envelope.payload.split('\n').collect(); - assert_eq!(lines.len(), 1); - let line = parse_statsd_line(lines[0]); - assert_eq!(line.tags.get("le").map(String::as_str), Some("inf")); - - Ok(()) -} - -// Verifies enqueued batches are delivered by the background worker. -#[test] -fn client_sends_enqueued_batch() -> Result<()> { - let (dsn, handle) = spawn_server(200); - let metrics = MetricsClient::new(MetricsConfig::new(dsn))?; - - let mut batch = metrics.batch(); - batch.counter("codex.turns", 1, &[("model", "gpt-5.1")])?; - metrics.send(batch)?; - metrics.shutdown()?; - - let captured = handle.join().expect("server thread"); - let envelope = parse_envelope(&captured.body); - let lines: Vec<&str> = envelope.payload.split('\n').collect(); - assert_eq!(lines.len(), 1); - - let line = parse_statsd_line(lines[0]); - assert_eq!(line.name, "codex.turns"); - assert_eq!(line.value, 1); - assert_eq!(line.kind, "c"); - assert_eq!(line.tags.get("model").map(String::as_str), Some("gpt-5.1")); - - Ok(()) -} - -// Ensures a non-success response panics in debug builds via error_or_panic. 
-#[test] -fn send_panics_on_non_success_status_in_debug() -> Result<()> { - let (dsn, handle) = spawn_server(500); - let metrics = MetricsClient::new(MetricsConfig::new(dsn))?; - - let mut batch = metrics.batch(); - batch.counter("codex.turns", 1, &[])?; - metrics.send(batch)?; - let err = metrics.shutdown().unwrap_err(); - assert!(matches!(err, MetricsError::WorkerPanicked)); - - let captured = handle.join().expect("server thread"); - assert_eq!(captured.method, "POST"); - Ok(()) -} - -// Ensures empty batches do not trigger any HTTP request. -#[test] -fn client_core_skips_empty_batch() -> Result<()> { - let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server"); - listener.set_nonblocking(true).expect("set nonblocking"); - let addr = listener.local_addr().expect("local addr"); - let dsn = format!("http://public:@{addr}/123"); - let metrics = MetricsClient::new(MetricsConfig::new(dsn))?; - - metrics.send(metrics.batch())?; - metrics.shutdown()?; - - let mut saw_connection = false; - for _ in 0..10 { - match listener.accept() { - Ok(_) => { - saw_connection = true; - break; - } - Err(err) if err.kind() == std::io::ErrorKind::WouldBlock => { - thread::sleep(Duration::from_millis(10)); - } - Err(err) => panic!("unexpected accept error: {err}"), - } - } - assert!(!saw_connection, "expected no request for empty batch"); - Ok(()) -} diff --git a/codex-rs/metrics/tests/suite/timing.rs b/codex-rs/metrics/tests/suite/timing.rs deleted file mode 100644 index 938bdefaf14..00000000000 --- a/codex-rs/metrics/tests/suite/timing.rs +++ /dev/null @@ -1,113 +0,0 @@ -use crate::harness::parse_envelope; -use crate::harness::parse_statsd_line; -use crate::harness::spawn_server; -use codex_metrics::HistogramBuckets; -use codex_metrics::MetricsClient; -use codex_metrics::MetricsConfig; -use codex_metrics::MetricsError; -use codex_metrics::Result; -use pretty_assertions::assert_eq; -use std::time::Duration; - -// Ensures duration recording maps to the expected bucket 
tag. -#[test] -fn record_duration_uses_matching_bucket() -> Result<()> { - let (dsn, handle) = spawn_server(200); - let metrics = MetricsClient::new(MetricsConfig::new(dsn))?; - let buckets = HistogramBuckets::from_values(&[10, 20])?; - - metrics.record_duration( - "codex.request_latency", - Duration::from_millis(15), - &buckets, - &[("route", "chat")], - )?; - metrics.shutdown()?; - - let captured = handle.join().expect("server thread"); - let envelope = parse_envelope(&captured.body); - let lines: Vec<&str> = envelope.payload.split('\n').collect(); - assert_eq!(lines.len(), 2); - - let line = parse_statsd_line(lines[0]); - assert_eq!(line.name, "codex.request_latency"); - assert_eq!(line.tags.get("route").map(String::as_str), Some("chat")); - assert_eq!(line.tags.get("le").map(String::as_str), Some("20")); - - let line = parse_statsd_line(lines[1]); - assert_eq!(line.tags.get("le").map(String::as_str), Some("inf")); - - Ok(()) -} - -// Ensures time_result returns the closure output and records timing. 
-#[test] -fn time_result_records_success() -> Result<()> { - let (dsn, handle) = spawn_server(200); - let metrics = MetricsClient::new(MetricsConfig::new(dsn))?; - let buckets = HistogramBuckets::from_values(&[10, 20])?; - - let value = metrics.time_result( - "codex.request_latency", - &buckets, - &[("route", "chat")], - || Ok("ok"), - )?; - assert_eq!(value, "ok"); - metrics.shutdown()?; - - let captured = handle.join().expect("server thread"); - let envelope = parse_envelope(&captured.body); - let lines: Vec<&str> = envelope.payload.split('\n').collect(); - assert!(!lines.is_empty()); - let parsed: Vec<_> = lines.iter().copied().map(parse_statsd_line).collect(); - assert!( - parsed - .iter() - .any(|line| { line.tags.get("le").map(String::as_str) == Some("inf") }) - ); - for line in parsed { - assert_eq!(line.name, "codex.request_latency"); - assert_eq!(line.tags.get("route").map(String::as_str), Some("chat")); - assert!(line.tags.contains_key("le")); - } - - Ok(()) -} - -// Ensures time_result propagates errors but still records timing. 
-#[test] -fn time_result_records_on_error() -> Result<()> { - let (dsn, handle) = spawn_server(200); - let metrics = MetricsClient::new(MetricsConfig::new(dsn))?; - let buckets = HistogramBuckets::from_values(&[10, 20])?; - - let err = metrics - .time_result( - "codex.request_latency", - &buckets, - &[("route", "chat")], - || -> Result<&'static str> { Err(MetricsError::EmptyMetricName) }, - ) - .unwrap_err(); - assert!(matches!(err, MetricsError::EmptyMetricName)); - metrics.shutdown()?; - - let captured = handle.join().expect("server thread"); - let envelope = parse_envelope(&captured.body); - let lines: Vec<&str> = envelope.payload.split('\n').collect(); - assert!(!lines.is_empty()); - let parsed: Vec<_> = lines.iter().copied().map(parse_statsd_line).collect(); - assert!( - parsed - .iter() - .any(|line| { line.tags.get("le").map(String::as_str) == Some("inf") }) - ); - for line in parsed { - assert_eq!(line.name, "codex.request_latency"); - assert_eq!(line.tags.get("route").map(String::as_str), Some("chat")); - assert!(line.tags.contains_key("le")); - } - - Ok(()) -} diff --git a/codex-rs/metrics/tests/suite/validation.rs b/codex-rs/metrics/tests/suite/validation.rs deleted file mode 100644 index 2383b774539..00000000000 --- a/codex-rs/metrics/tests/suite/validation.rs +++ /dev/null @@ -1,130 +0,0 @@ -use codex_metrics::HistogramBuckets; -use codex_metrics::MetricsBatch; -use codex_metrics::MetricsClient; -use codex_metrics::MetricsConfig; -use codex_metrics::MetricsError; -use codex_metrics::Result; - -// Validates invalid DSNs are rejected early. -#[test] -fn invalid_dsn_reports_error() -> Result<()> { - assert!(matches!( - MetricsClient::new(MetricsConfig::new("not a dsn")), - Err(MetricsError::InvalidDsn { .. }) - )); - Ok(()) -} - -// Ensures invalid tag components are rejected during config build. 
-#[test] -fn invalid_tag_component_is_rejected() -> Result<()> { - let err = MetricsConfig::default() - .with_tag("bad key", "value") - .unwrap_err(); - assert!(matches!( - err, - MetricsError::InvalidTagComponent { label, value } - if label == "tag key" && value == "bad key" - )); - Ok(()) -} - -// Ensures the reserved histogram bucketing tag key is rejected in config defaults. -#[test] -fn reserved_tag_key_is_rejected_in_config() -> Result<()> { - let err = MetricsConfig::default().with_tag("le", "10").unwrap_err(); - assert!(matches!( - err, - MetricsError::ReservedTagKey { key } if key == "le" - )); - Ok(()) -} - -// Ensures per-metric tag keys are validated. -#[test] -fn counter_rejects_invalid_tag_key() { - let mut batch = MetricsBatch::new(); - let err = batch - .counter("codex.turns", 1, &[("bad key", "value")]) - .unwrap_err(); - assert!(matches!( - err, - MetricsError::InvalidTagComponent { label, value } - if label == "tag key" && value == "bad key" - )); -} - -// Ensures per-metric tag keys cannot use reserved histogram bucketing keys. -#[test] -fn counter_rejects_reserved_tag_key() { - let mut batch = MetricsBatch::new(); - let err = batch - .counter("codex.turns", 1, &[("le", "10")]) - .unwrap_err(); - assert!(matches!( - err, - MetricsError::ReservedTagKey { key } if key == "le" - )); -} - -// Ensures per-metric tag values are validated. -#[test] -fn histogram_rejects_invalid_tag_value() -> Result<()> { - let mut batch = MetricsBatch::new(); - let buckets = HistogramBuckets::from_values(&[10])?; - let err = batch - .histogram( - "codex.request_latency", - 3, - &buckets, - &[("route", "bad value")], - ) - .unwrap_err(); - assert!(matches!( - err, - MetricsError::InvalidTagComponent { label, value } - if label == "tag value" && value == "bad value" - )); - Ok(()) -} - -// Ensures histogram calls reject reserved tag keys even though they internally add `le`. 
-#[test] -fn histogram_rejects_reserved_tag_key() -> Result<()> { - let mut batch = MetricsBatch::new(); - let buckets = HistogramBuckets::from_values(&[10])?; - let err = batch - .histogram("codex.request_latency", 3, &buckets, &[("le", "10")]) - .unwrap_err(); - assert!(matches!( - err, - MetricsError::ReservedTagKey { key } if key == "le" - )); - Ok(()) -} - -// Ensures invalid metric names are rejected when building a batch. -#[test] -fn counter_rejects_invalid_metric_name() -> Result<()> { - let mut batch = MetricsBatch::new(); - let err = batch.counter("bad name", 1, &[]).unwrap_err(); - assert!(matches!( - err, - MetricsError::InvalidMetricName { name } if name == "bad name" - )); - Ok(()) -} - -// Ensures empty histogram bucket lists are rejected. -#[test] -fn empty_buckets_are_rejected() { - let err = HistogramBuckets::from_values(&[]).unwrap_err(); - assert!(matches!(err, MetricsError::EmptyBuckets)); -} - -// Ensures range overflow is detected when building buckets. -#[test] -fn range_overflow_is_reported() { - let err = HistogramBuckets::from_range(i64::MAX - 1, i64::MAX, 2).unwrap_err(); - assert!(matches!(err, MetricsError::BucketRangeOverflow { .. 
})); -} diff --git a/codex-rs/metrics/tests/tests.rs b/codex-rs/metrics/tests/tests.rs deleted file mode 100644 index 92f88b95fd8..00000000000 --- a/codex-rs/metrics/tests/tests.rs +++ /dev/null @@ -1,2 +0,0 @@ -mod harness; -mod suite; diff --git a/codex-rs/otel/Cargo.toml b/codex-rs/otel/Cargo.toml index 8c99326a4c7..0b422de6011 100644 --- a/codex-rs/otel/Cargo.toml +++ b/codex-rs/otel/Cargo.toml @@ -19,13 +19,14 @@ codex-utils-absolute-path = { workspace = true } codex-api = { workspace = true } codex-protocol = { workspace = true } eventsource-stream = { workspace = true } -opentelemetry = { workspace = true, features = ["logs", "trace"] } +opentelemetry = { workspace = true, features = ["logs", "metrics", "trace"] } opentelemetry-appender-tracing = { workspace = true } opentelemetry-otlp = { workspace = true, features = [ "grpc-tonic", "http-proto", "http-json", "logs", + "metrics", "trace", "reqwest-blocking-client", "reqwest-rustls", @@ -33,16 +34,13 @@ opentelemetry-otlp = { workspace = true, features = [ "tls-roots", ]} opentelemetry-semantic-conventions = { workspace = true } -opentelemetry_sdk = { workspace = true, features = [ - "logs", - "rt-tokio", - "trace", -]} +opentelemetry_sdk = { workspace = true, features = ["logs", "metrics", "rt-tokio", "trace"] } http = { workspace = true } reqwest = { workspace = true, features = ["blocking", "rustls-tls"] } serde = { workspace = true, features = ["derive"] } serde_json = { workspace = true } strum_macros = { workspace = true } +thiserror = { workspace = true } tokio = { workspace = true } tonic = { workspace = true, features = [ "transport", @@ -55,3 +53,4 @@ tracing-subscriber = { workspace = true } [dev-dependencies] opentelemetry_sdk = { workspace = true, features = ["testing"] } +pretty_assertions = { workspace = true } diff --git a/codex-rs/otel/README.md b/codex-rs/otel/README.md new file mode 100644 index 00000000000..953c7408117 --- /dev/null +++ b/codex-rs/otel/README.md @@ -0,0 +1,136 @@ +# 
codex-otel + +`codex-otel` is the OpenTelemetry integration crate for Codex. It provides: + +- Trace/log exporters and tracing subscriber layers (`codex_otel::traces::otel_provider`). +- A structured event helper (`codex_otel::traces::otel_manager::OtelManager`). +- A Statsig OTLP metrics client (`codex_otel::metrics`). +- A metrics facade on `OtelManager` so tracing + metrics share metadata. + +## Tracing and logs + +Create an OTEL provider from `OtelSettings`, then attach its layers to your +`tracing_subscriber` registry: + +```rust +use codex_otel::config::OtelExporter; +use codex_otel::config::OtelHttpProtocol; +use codex_otel::config::OtelSettings; +use codex_otel::traces::otel_provider::OtelProvider; +use tracing_subscriber::prelude::*; + +let settings = OtelSettings { + environment: "dev".to_string(), + service_name: "codex-cli".to_string(), + service_version: env!("CARGO_PKG_VERSION").to_string(), + codex_home: std::path::PathBuf::from("/tmp"), + exporter: OtelExporter::OtlpHttp { + endpoint: "https://otlp.example.com".to_string(), + headers: std::collections::HashMap::new(), + protocol: OtelHttpProtocol::Binary, + tls: None, + }, + trace_exporter: OtelExporter::OtlpHttp { + endpoint: "https://otlp.example.com".to_string(), + headers: std::collections::HashMap::new(), + protocol: OtelHttpProtocol::Binary, + tls: None, + }, +}; + +if let Some(provider) = OtelProvider::from(&settings)? { + let registry = tracing_subscriber::registry() + .with(provider.logger_layer()) + .with(provider.tracing_layer()); + registry.init(); +} +``` + +## OtelManager (events) + +`OtelManager` adds consistent metadata to tracing events and helps record +Codex-specific events. 
+ +```rust +use codex_otel::traces::otel_manager::OtelManager; + +let manager = OtelManager::new( + conversation_id, + model, + slug, + account_id, + account_email, + auth_mode, + log_user_prompts, + terminal_type, + session_source, +); + +manager.user_prompt(&prompt_items); +``` + +## Metrics (Statsig + OTLP) + +The metrics client sends counters and histograms to Statsig via OTLP/HTTP. Use +placeholders for the Statsig endpoint and API key header until you have real +values: + +```rust +use codex_otel::metrics::HistogramBuckets; +use codex_otel::metrics::MetricsClient; +use codex_otel::metrics::MetricsConfig; + +let metrics = MetricsClient::new( + MetricsConfig::new("") + .with_endpoint("") + .with_api_key_header(""), +)?; + +let buckets = HistogramBuckets::from_values(&[25, 50, 100, 250, 500, 1000])?; +metrics.counter("codex.session_started", 1, &[("source", "tui")])?; +``` + +## Metrics via OtelManager + +Attach a metrics client (or config) to `OtelManager` to reuse metadata: + +```rust +use codex_otel::metrics::HistogramBuckets; +use codex_otel::metrics::MetricsConfig; +use codex_otel::traces::otel_manager::OtelManager; + +let manager = OtelManager::new( + conversation_id, + model, + slug, + account_id, + account_email, + auth_mode, + log_user_prompts, + terminal_type, + session_source, +) +.with_metrics_config( + MetricsConfig::new("") + .with_endpoint("") + .with_api_key_header(""), +)?; + +let buckets = HistogramBuckets::from_values(&[25, 50, 100, 250, 500])?; +manager.counter("codex.session_started", 1, &[("source", "tui")])?; +manager.histogram("codex.request_latency", 83, &buckets, &[("route", "chat")])?; +``` + +By default, `OtelManager` adds metadata tags to metrics: `auth_mode`, `model`, +`slug`, `terminal.type`, and `app.version`. Use +`with_metrics_without_metadata_tags` to disable these tags. + +For batching, use `OtelManager::batch()` and `OtelManager::send()`. + +## Shutdown + +- `OtelProvider::shutdown()` stops the OTEL exporter. 
+- `OtelManager::shutdown_metrics()` flushes and stops the metrics worker. + +Both are optional because drop performs best-effort shutdown, but calling them +explicitly gives deterministic flushing. diff --git a/codex-rs/otel/src/lib.rs b/codex-rs/otel/src/lib.rs index 5211c8e89ba..0f55a738096 100644 --- a/codex-rs/otel/src/lib.rs +++ b/codex-rs/otel/src/lib.rs @@ -1,4 +1,3 @@ pub mod config; - -pub mod otel_manager; -pub mod otel_provider; +pub mod metrics; +pub mod traces; diff --git a/codex-rs/metrics/src/batch.rs b/codex-rs/otel/src/metrics/batch.rs similarity index 83% rename from codex-rs/metrics/src/batch.rs rename to codex-rs/otel/src/metrics/batch.rs index 41a4ca9d50a..37bad7d0a44 100644 --- a/codex-rs/metrics/src/batch.rs +++ b/codex-rs/otel/src/metrics/batch.rs @@ -1,8 +1,7 @@ -use crate::error::MetricsError; -use crate::error::Result; -use crate::statsd::StatsdLine; -use crate::statsd::collect_tags; -use std::collections::BTreeMap; +use crate::metrics::error::MetricsError; +use crate::metrics::error::Result; +use crate::metrics::tags::collect_tags; +use crate::metrics::validation::validate_metric_name; #[cfg_attr(test, derive(PartialEq, Eq))] #[derive(Clone, Debug)] @@ -97,10 +96,28 @@ impl HistogramBuckets { Self::new(bounds) } + + pub(crate) fn bounds(&self) -> &[i64] { + &self.bounds + } +} + +#[derive(Clone, Debug)] +pub(crate) enum MetricEvent { + Counter { + name: String, + value: i64, + tags: Vec<(String, String)>, + }, + Histogram { + name: String, + value: i64, + tags: Vec<(String, String)>, + }, } pub struct MetricsBatch { - lines: Vec, + events: Vec, } impl Default for MetricsBatch { @@ -112,17 +129,22 @@ impl Default for MetricsBatch { impl MetricsBatch { /// Create an empty metrics batch. pub fn new() -> Self { - Self { lines: Vec::new() } + Self { events: Vec::new() } } /// Append a counter increment to the batch. 
pub fn counter(&mut self, name: &str, inc: i64, tags: &[(&str, &str)]) -> Result<()> { + validate_metric_name(name)?; let tags = collect_tags(tags)?; - self.lines.push(StatsdLine::counter(name, inc, tags)?); + self.events.push(MetricEvent::Counter { + name: name.to_string(), + value: inc, + tags, + }); Ok(()) } - /// Append a histogram sample, encoded as a bucketed counter, to the batch. + /// Append a histogram sample to the batch. pub fn histogram( &mut self, name: &str, @@ -130,36 +152,32 @@ impl MetricsBatch { buckets: &HistogramBuckets, tags: &[(&str, &str)], ) -> Result<()> { - let base_tags = collect_tags(tags)?; - for bound in buckets.bounds.iter().filter(|bound| value <= **bound) { - let mut tags = base_tags.clone(); - tags.push(("le".to_string(), bound.to_string())); - self.lines.push(StatsdLine::counter(name, 1, tags)?); - } - let mut tags = base_tags; - tags.push(("le".to_string(), "inf".to_string())); - self.lines.push(StatsdLine::counter(name, 1, tags)?); + // Buckets remain part of the API, but OTEL histogram aggregation owns bucket selection. 
+ let _ = buckets.bounds(); + validate_metric_name(name)?; + let tags = collect_tags(tags)?; + self.events.push(MetricEvent::Histogram { + name: name.to_string(), + value, + tags, + }); Ok(()) } pub(crate) fn is_empty(&self) -> bool { - self.lines.is_empty() + self.events.is_empty() } - pub(crate) fn render(&self, default_tags: &BTreeMap) -> Result { - let mut rendered = Vec::with_capacity(self.lines.len()); - for line in &self.lines { - rendered.push(line.render(default_tags)?); - } - Ok(rendered.join("\n")) + pub(crate) fn into_events(self) -> Vec { + self.events } } #[cfg(test)] mod tests { use super::HistogramBuckets; - use crate::error::MetricsError; - use crate::error::Result; + use crate::metrics::error::MetricsError; + use crate::metrics::error::Result; use pretty_assertions::assert_eq; #[test] diff --git a/codex-rs/otel/src/metrics/client.rs b/codex-rs/otel/src/metrics/client.rs new file mode 100644 index 00000000000..49a05765abd --- /dev/null +++ b/codex-rs/otel/src/metrics/client.rs @@ -0,0 +1,376 @@ +use crate::metrics::DEFAULT_QUEUE_CAPACITY; +use crate::metrics::DEFAULT_SHUTDOWN_TIMEOUT; +use crate::metrics::SHUTDOWN_POLL_INTERVAL; +use crate::metrics::batch::HistogramBuckets; +use crate::metrics::batch::MetricEvent; +use crate::metrics::batch::MetricsBatch; +use crate::metrics::config::MetricsConfig; +use crate::metrics::config::MetricsExporter; +use crate::metrics::error::MetricsError; +use crate::metrics::error::Result; +use crate::metrics::tags::merge_tags; +use crate::metrics::tags::tags_to_attributes; +use crate::metrics::time::duration_to_millis; +use crate::metrics::util::error_or_panic; +use crate::metrics::validation::validate_tags; +use opentelemetry::KeyValue; +use opentelemetry::metrics::Histogram; +use opentelemetry::metrics::Meter; +use opentelemetry::metrics::MeterProvider; +use opentelemetry::metrics::UpDownCounter; +use opentelemetry_otlp::MetricExporter; +use opentelemetry_otlp::Protocol; +use opentelemetry_otlp::WithExportConfig; 
+use opentelemetry_otlp::WithHttpConfig; +use opentelemetry_sdk::metrics::PeriodicReader; +use opentelemetry_sdk::metrics::SdkMeterProvider; +use std::collections::BTreeMap; +use std::collections::HashMap; +use std::sync::Arc; +use std::sync::Mutex; +use std::sync::mpsc; +use std::sync::mpsc::TrySendError; +use std::thread; +use std::time::Duration; +use std::time::Instant; + +const METER_NAME: &str = "codex-otel-metrics"; + +enum WorkerMessage { + Batch(MetricsBatch), + Shutdown, +} + +struct WorkerState { + sender: Mutex>>, + handle: Mutex>>, + capacity: usize, + meter_provider: Mutex>, +} + +#[derive(Debug)] +struct MetricRecorder { + meter: Meter, + counters: HashMap>, + histograms: HashMap>, + default_tags: BTreeMap, +} + +impl MetricRecorder { + fn new(meter: Meter, default_tags: BTreeMap) -> Self { + Self { + meter, + counters: HashMap::new(), + histograms: HashMap::new(), + default_tags, + } + } + + fn record_batch(&mut self, batch: MetricsBatch) { + for event in batch.into_events() { + match event { + MetricEvent::Counter { name, value, tags } => { + self.record_counter(&name, value, &tags); + } + MetricEvent::Histogram { name, value, tags } => { + self.record_histogram(&name, value, &tags); + } + } + } + } + + fn record_counter(&mut self, name: &str, value: i64, tags: &[(String, String)]) { + let attributes = self.attributes_for(tags); + let name = name.to_string(); + let counter = self + .counters + .entry(name.clone()) + .or_insert_with(|| self.meter.i64_up_down_counter(name.clone()).build()); + counter.add(value, &attributes); + } + + fn record_histogram(&mut self, name: &str, value: i64, tags: &[(String, String)]) { + let attributes = self.attributes_for(tags); + let name = name.to_string(); + let histogram = self + .histograms + .entry(name.clone()) + .or_insert_with(|| self.meter.f64_histogram(name.clone()).build()); + histogram.record(value as f64, &attributes); + } + + fn attributes_for(&self, tags: &[(String, String)]) -> Vec { + let merged = 
merge_tags(&self.default_tags, tags); + tags_to_attributes(&merged) + } +} + +/// Background metrics client that enqueues metrics to a dedicated worker thread. +#[derive(Clone)] +pub struct MetricsClient { + state: Arc, +} + +impl std::fmt::Debug for MetricsClient { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("MetricsClient") + .field("capacity", &self.state.capacity) + .finish() + } +} + +impl MetricsClient { + /// Build a metrics client from configuration and validate defaults. + pub fn new(config: MetricsConfig) -> Result { + let capacity = DEFAULT_QUEUE_CAPACITY; + + if capacity == 0 { + return Err(MetricsError::QueueCapacityZero); + } + + if config.endpoint.is_empty() { + return Err(MetricsError::EmptyEndpoint); + } + + if config.api_key.is_empty() { + return Err(MetricsError::EmptyApiKey); + } + + validate_tags(&config.default_tags)?; + + let meter_provider = build_meter_provider(&config)?; + let meter = meter_provider.meter(METER_NAME); + + let recorder = MetricRecorder::new(meter, config.default_tags); + + let (sender, receiver) = mpsc::sync_channel(capacity); + let worker_provider = meter_provider.clone(); + let handle = thread::spawn(move || run_worker(recorder, receiver, worker_provider)); + + Ok(Self { + state: Arc::new(WorkerState { + sender: Mutex::new(Some(sender)), + handle: Mutex::new(Some(handle)), + capacity, + meter_provider: Mutex::new(Some(meter_provider)), + }), + }) + } + + /// Send a single counter increment without blocking the caller. + pub fn counter(&self, name: &str, inc: i64, tags: &[(&str, &str)]) -> Result<()> { + let mut batch = MetricsBatch::new(); + batch.counter(name, inc, tags)?; + self.send(batch) + } + + /// Send a single histogram sample. 
+ pub fn histogram( + &self, + name: &str, + value: i64, + buckets: &HistogramBuckets, + tags: &[(&str, &str)], + ) -> Result<()> { + let mut batch = MetricsBatch::new(); + batch.histogram(name, value, buckets, tags)?; + self.send(batch) + } + + /// Record a duration in milliseconds using a histogram. + pub fn record_duration( + &self, + name: &str, + duration: Duration, + buckets: &HistogramBuckets, + tags: &[(&str, &str)], + ) -> Result<()> { + let millis = duration_to_millis(duration); + self.histogram(name, millis, buckets, tags) + } + + /// Measure a closure and emit a histogram sample for the elapsed time. + pub fn time( + &self, + name: &str, + buckets: &HistogramBuckets, + tags: &[(&str, &str)], + f: impl FnOnce() -> T, + ) -> Result { + let start = Instant::now(); + let output = f(); + self.record_duration(name, start.elapsed(), buckets, tags)?; + Ok(output) + } + + /// Measure a closure that returns a metrics result without nesting results. + pub fn time_result( + &self, + name: &str, + buckets: &HistogramBuckets, + tags: &[(&str, &str)], + f: impl FnOnce() -> Result, + ) -> Result { + let start = Instant::now(); + let output = f(); + match output { + Ok(value) => { + self.record_duration(name, start.elapsed(), buckets, tags)?; + Ok(value) + } + Err(err) => { + let _ = self.record_duration(name, start.elapsed(), buckets, tags); + Err(err) + } + } + } + + /// Create an empty batch for multi-metric sends. + pub fn batch(&self) -> MetricsBatch { + MetricsBatch::new() + } + + /// Enqueue a batch of metrics for the worker to send. 
+ pub fn send(&self, batch: MetricsBatch) -> Result<()> { + if batch.is_empty() { + return Ok(()); + } + + let sender = self + .state + .sender + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let Some(sender) = sender.as_ref() else { + return Err(MetricsError::WorkerUnavailable); + }; + + match sender.try_send(WorkerMessage::Batch(batch)) { + Ok(()) => Ok(()), + Err(TrySendError::Full(_)) => Err(MetricsError::QueueFull { + capacity: self.state.capacity, + }), + Err(TrySendError::Disconnected(_)) => Err(MetricsError::WorkerUnavailable), + } + } + + /// Flush queued metrics and stop the worker thread. + pub fn shutdown(&self) -> Result<()> { + self.shutdown_inner(DEFAULT_SHUTDOWN_TIMEOUT) + } + + fn shutdown_inner(&self, timeout: Duration) -> Result<()> { + let sender = self + .state + .sender + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner) + .take(); + let mut handle = self + .state + .handle + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let mut meter_provider = self + .state + .meter_provider + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let Some(handle) = handle.take() else { + return Ok(()); + }; + let mut joined = false; + + if let Some(sender) = sender { + let _ = sender.try_send(WorkerMessage::Shutdown); + } + + if timeout.is_zero() { + if handle.is_finished() { + handle.join().map_err(|_| MetricsError::WorkerPanicked)?; + joined = true; + } + } else { + let start = Instant::now(); + while start.elapsed() < timeout { + if handle.is_finished() { + handle.join().map_err(|_| MetricsError::WorkerPanicked)?; + joined = true; + break; + } + thread::sleep(SHUTDOWN_POLL_INTERVAL); + } + } + + if joined && let Some(meter_provider) = meter_provider.take() { + meter_provider + .force_flush() + .map_err(|source| MetricsError::FlushFailed { source })?; + meter_provider + .shutdown() + .map_err(|source| MetricsError::ShutdownFailed { source })?; + } + + Ok(()) + } +} + +impl Drop for MetricsClient 
{ + fn drop(&mut self) { + if Arc::strong_count(&self.state) == 1 { + let _ = self.shutdown_inner(DEFAULT_SHUTDOWN_TIMEOUT); + } + } +} + +fn build_meter_provider(config: &MetricsConfig) -> Result { + match &config.exporter { + MetricsExporter::OtlpHttp => build_otlp_http_provider(config), + #[cfg(test)] + MetricsExporter::InMemory(exporter) => { + let reader = PeriodicReader::builder(exporter.clone()).build(); + Ok(SdkMeterProvider::builder().with_reader(reader).build()) + } + } +} + +fn build_otlp_http_provider(config: &MetricsConfig) -> Result { + let mut headers = HashMap::new(); + headers.insert(config.api_key_header.clone(), config.api_key.clone()); + if !config.user_agent.is_empty() { + headers.insert("User-Agent".to_string(), config.user_agent.clone()); + } + + let exporter = MetricExporter::builder() + .with_http() + .with_protocol(Protocol::HttpBinary) + .with_endpoint(config.endpoint.clone()) + .with_timeout(config.timeout) + .with_headers(headers) + .build() + .map_err(|source| MetricsError::ExporterBuild { source })?; + + let reader = PeriodicReader::builder(exporter) + .with_interval(config.export_interval) + .build(); + + Ok(SdkMeterProvider::builder().with_reader(reader).build()) +} + +fn run_worker( + mut recorder: MetricRecorder, + receiver: mpsc::Receiver, + meter_provider: SdkMeterProvider, +) { + for message in receiver { + match message { + WorkerMessage::Batch(batch) => { + recorder.record_batch(batch); + if let Err(err) = meter_provider.force_flush() { + error_or_panic(format!("metrics flush failed: {err}")); + } + } + WorkerMessage::Shutdown => break, + } + } +} diff --git a/codex-rs/otel/src/metrics/config.rs b/codex-rs/otel/src/metrics/config.rs new file mode 100644 index 00000000000..aeff931f26c --- /dev/null +++ b/codex-rs/otel/src/metrics/config.rs @@ -0,0 +1,100 @@ +use crate::metrics::DEFAULT_API_KEY; +use crate::metrics::DEFAULT_API_KEY_HEADER; +use crate::metrics::DEFAULT_EXPORT_INTERVAL; +use crate::metrics::DEFAULT_OTLP_ENDPOINT; 
+use crate::metrics::DEFAULT_TIMEOUT; +use crate::metrics::error::Result; +use crate::metrics::validation::validate_tag_key; +use crate::metrics::validation::validate_tag_value; +use std::collections::BTreeMap; +use std::time::Duration; + +#[derive(Clone, Debug)] +pub(crate) enum MetricsExporter { + OtlpHttp, + #[cfg(test)] + InMemory(opentelemetry_sdk::metrics::InMemoryMetricExporter), +} + +#[derive(Clone, Debug)] +pub struct MetricsConfig { + pub(crate) endpoint: String, + pub(crate) api_key: String, + pub(crate) api_key_header: String, + pub(crate) default_tags: BTreeMap, + pub(crate) timeout: Duration, + pub(crate) export_interval: Duration, + pub(crate) user_agent: String, + pub(crate) exporter: MetricsExporter, +} + +impl MetricsConfig { + /// Create a config with the provided API key and default settings. + pub fn new(api_key: impl Into) -> Self { + Self { + endpoint: DEFAULT_OTLP_ENDPOINT.to_string(), + api_key: api_key.into(), + api_key_header: DEFAULT_API_KEY_HEADER.to_string(), + default_tags: BTreeMap::new(), + timeout: DEFAULT_TIMEOUT, + export_interval: DEFAULT_EXPORT_INTERVAL, + user_agent: format!("codex-otel-metrics/{}", env!("CARGO_PKG_VERSION")), + exporter: MetricsExporter::OtlpHttp, + } + } + + /// Override the OTLP endpoint. + pub fn with_endpoint(mut self, endpoint: impl Into) -> Self { + self.endpoint = endpoint.into(); + self + } + + /// Override the API key header name. + pub fn with_api_key_header(mut self, header: impl Into) -> Self { + self.api_key_header = header.into(); + self + } + + /// Add a default tag that will be sent with every metric. + pub fn with_tag(mut self, key: impl Into, value: impl Into) -> Result { + let key = key.into(); + let value = value.into(); + validate_tag_key(&key)?; + validate_tag_value(&value)?; + self.default_tags.insert(key, value); + Ok(self) + } + + /// Override the OTLP exporter timeout. 
+ pub fn with_timeout(mut self, timeout: Duration) -> Self { + self.timeout = timeout; + self + } + + /// Override the OTLP export interval. + pub fn with_export_interval(mut self, interval: Duration) -> Self { + self.export_interval = interval; + self + } + + /// Override the HTTP user agent header. + pub fn with_user_agent(mut self, user_agent: impl Into<String>) -> Self { + self.user_agent = user_agent.into(); + self + } + + #[cfg(test)] + pub(crate) fn with_in_memory_exporter( + mut self, + exporter: opentelemetry_sdk::metrics::InMemoryMetricExporter, + ) -> Self { + self.exporter = MetricsExporter::InMemory(exporter); + self + } +} + +impl Default for MetricsConfig { + fn default() -> Self { + Self::new(DEFAULT_API_KEY) + } +} diff --git a/codex-rs/metrics/src/error.rs b/codex-rs/otel/src/metrics/error.rs similarity index 68% rename from codex-rs/metrics/src/error.rs rename to codex-rs/otel/src/metrics/error.rs index 3e5bd100540..16f6f452259 100644 --- a/codex-rs/metrics/src/error.rs +++ b/codex-rs/otel/src/metrics/error.rs @@ -30,37 +30,25 @@ pub enum MetricsError { #[error("tag key is reserved: {key}")] ReservedTagKey { key: String }, - // Client. - #[error("invalid sentry dsn: {dsn}")] - InvalidDsn { - dsn: String, + // Config.
+ #[error("metrics endpoint cannot be empty")] + EmptyEndpoint, + #[error("metrics api key cannot be empty")] + EmptyApiKey, + #[error("failed to build metrics exporter")] + ExporterBuild { #[source] - source: sentry::types::ParseDsnError, + source: opentelemetry_otlp::ExporterBuildError, }, - #[error("failed to build metrics http client")] - HttpClientBuild { + #[error("failed to flush metrics")] + FlushFailed { #[source] - source: reqwest::Error, + source: opentelemetry_sdk::error::OTelSdkError, }, - #[error("failed to serialize envelope header")] - SerializeEnvelopeHeader { + #[error("failed to shutdown metrics provider")] + ShutdownFailed { #[source] - source: serde_json::Error, - }, - #[error("failed to serialize item header")] - SerializeEnvelopeItemHeader { - #[source] - source: serde_json::Error, - }, - #[error("failed to send metrics envelope")] - SendEnvelope { - #[source] - source: reqwest::Error, - }, - #[error("sentry metrics upload failed: {status}{body}")] - SentryUploadFailed { - status: reqwest::StatusCode, - body: String, + source: opentelemetry_sdk::error::OTelSdkError, }, // Worker. 
diff --git a/codex-rs/otel/src/metrics/mod.rs b/codex-rs/otel/src/metrics/mod.rs new file mode 100644 index 00000000000..61996b00f83 --- /dev/null +++ b/codex-rs/otel/src/metrics/mod.rs @@ -0,0 +1,29 @@ +mod batch; +mod client; +mod config; +mod error; +mod tags; +mod time; +mod util; +pub(crate) mod validation; + +use std::time::Duration; + +pub(crate) const DEFAULT_OTLP_ENDPOINT: &str = ""; +pub(crate) const DEFAULT_API_KEY_HEADER: &str = "statsig-api-key"; +pub(crate) const DEFAULT_API_KEY: &str = ""; +pub(crate) const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10); +pub(crate) const DEFAULT_EXPORT_INTERVAL: Duration = Duration::from_secs(10); +pub(crate) const DEFAULT_QUEUE_CAPACITY: usize = 1024; +pub(crate) const DEFAULT_SHUTDOWN_TIMEOUT: Duration = Duration::from_millis(500); +pub(crate) const SHUTDOWN_POLL_INTERVAL: Duration = Duration::from_millis(10); + +pub use crate::metrics::batch::HistogramBuckets; +pub use crate::metrics::batch::MetricsBatch; +pub use crate::metrics::client::MetricsClient; +pub use crate::metrics::config::MetricsConfig; +pub use crate::metrics::error::MetricsError; +pub use crate::metrics::error::Result; + +#[cfg(test)] +mod tests; diff --git a/codex-rs/otel/src/metrics/tags.rs b/codex-rs/otel/src/metrics/tags.rs new file mode 100644 index 00000000000..da730672628 --- /dev/null +++ b/codex-rs/otel/src/metrics/tags.rs @@ -0,0 +1,32 @@ +use crate::metrics::error::Result; +use crate::metrics::validation::validate_tag_key; +use crate::metrics::validation::validate_tag_value; +use opentelemetry::KeyValue; +use std::collections::BTreeMap; + +pub(crate) fn collect_tags(tags: &[(&str, &str)]) -> Result> { + tags.iter() + .map(|(key, value)| { + validate_tag_key(key)?; + validate_tag_value(value)?; + Ok(((*key).to_string(), (*value).to_string())) + }) + .collect() +} + +pub(crate) fn merge_tags( + default_tags: &BTreeMap, + tags: &[(String, String)], +) -> BTreeMap { + let mut merged = default_tags.clone(); + for (key, value) in tags { + 
merged.insert(key.clone(), value.clone()); + } + merged +} + +pub(crate) fn tags_to_attributes(tags: &BTreeMap<String, String>) -> Vec<KeyValue> { + tags.iter() + .map(|(key, value)| KeyValue::new(key.clone(), value.clone())) + .collect() +} diff --git a/codex-rs/otel/src/metrics/tests.rs b/codex-rs/otel/src/metrics/tests.rs new file mode 100644 index 00000000000..a2bfba3075a --- /dev/null +++ b/codex-rs/otel/src/metrics/tests.rs @@ -0,0 +1,382 @@ +use super::HistogramBuckets; +use super::MetricsBatch; +use super::MetricsClient; +use super::MetricsConfig; +use super::MetricsError; +use super::Result; +use opentelemetry::KeyValue; +use opentelemetry_sdk::metrics::InMemoryMetricExporter; +use opentelemetry_sdk::metrics::data::AggregatedMetrics; +use opentelemetry_sdk::metrics::data::Metric; +use opentelemetry_sdk::metrics::data::MetricData; +use opentelemetry_sdk::metrics::data::ResourceMetrics; +use pretty_assertions::assert_eq; +use std::collections::BTreeMap; +use std::time::Duration; + +fn build_test_client() -> Result<(MetricsClient, InMemoryMetricExporter)> { + let exporter = InMemoryMetricExporter::default(); + let config = MetricsConfig::new("test-key") + .with_tag("service", "codex-cli")? + .with_tag("env", "prod")?
+ .with_in_memory_exporter(exporter.clone()); + let metrics = MetricsClient::new(config)?; + Ok((metrics, exporter)) +} + +fn latest_metrics(exporter: &InMemoryMetricExporter) -> ResourceMetrics { + let Ok(metrics) = exporter.get_finished_metrics() else { + panic!("finished metrics error"); + }; + let Some(metrics) = metrics.into_iter().last() else { + panic!("metrics export missing"); + }; + metrics +} + +fn find_metric<'a>(resource_metrics: &'a ResourceMetrics, name: &str) -> Option<&'a Metric> { + for scope_metrics in resource_metrics.scope_metrics() { + for metric in scope_metrics.metrics() { + if metric.name() == name { + return Some(metric); + } + } + } + None +} + +fn attributes_to_map<'a>( + attributes: impl Iterator, +) -> BTreeMap { + attributes + .map(|kv| (kv.key.as_str().to_string(), kv.value.as_str().to_string())) + .collect() +} + +#[test] +// Ensures counters/histograms record with default + per-call tags. +fn send_builds_metrics_with_tags_and_histograms() -> Result<()> { + let (metrics, exporter) = build_test_client()?; + let buckets = HistogramBuckets::from_values(&[25, 50, 100])?; + + let mut batch = metrics.batch(); + batch.counter("codex.turns", 1, &[("model", "gpt-5.1"), ("env", "dev")])?; + batch.histogram("codex.tool_latency", 25, &buckets, &[("tool", "shell")])?; + metrics.send(batch)?; + metrics.shutdown()?; + + let resource_metrics = latest_metrics(&exporter); + + let Some(counter_metric) = find_metric(&resource_metrics, "codex.turns") else { + panic!("counter metric missing"); + }; + let attributes = match counter_metric.data() { + AggregatedMetrics::I64(data) => match data { + MetricData::Sum(sum) => { + let points: Vec<_> = sum.data_points().collect(); + assert_eq!(points.len(), 1); + let point = points[0]; + assert_eq!(point.value(), 1); + attributes_to_map(point.attributes()) + } + _ => panic!("unexpected counter aggregation"), + }, + _ => panic!("unexpected counter data type"), + }; + + let expected_counter_attributes = 
BTreeMap::from([ + ("service".to_string(), "codex-cli".to_string()), + ("env".to_string(), "dev".to_string()), + ("model".to_string(), "gpt-5.1".to_string()), + ]); + assert_eq!(attributes, expected_counter_attributes); + + let Some(histogram_metric) = find_metric(&resource_metrics, "codex.tool_latency") else { + panic!("histogram metric missing"); + }; + let attributes = match histogram_metric.data() { + AggregatedMetrics::F64(data) => match data { + MetricData::Histogram(histogram) => { + let points: Vec<_> = histogram.data_points().collect(); + assert_eq!(points.len(), 1); + let point = points[0]; + assert_eq!(point.count(), 1); + assert_eq!(point.sum(), 25.0); + attributes_to_map(point.attributes()) + } + _ => panic!("unexpected histogram aggregation"), + }, + _ => panic!("unexpected histogram data type"), + }; + + let expected_histogram_attributes = BTreeMap::from([ + ("service".to_string(), "codex-cli".to_string()), + ("env".to_string(), "prod".to_string()), + ("tool".to_string(), "shell".to_string()), + ]); + assert_eq!(attributes, expected_histogram_attributes); + + Ok(()) +} + +#[test] +// Ensures defaults merge per metric and overrides take precedence. +fn send_merges_default_tags_per_metric() -> Result<()> { + let exporter = InMemoryMetricExporter::default(); + let config = MetricsConfig::new("test-key") + .with_tag("service", "codex-cli")? + .with_tag("env", "prod")? + .with_tag("region", "us")? 
+ .with_in_memory_exporter(exporter.clone()); + let metrics = MetricsClient::new(config)?; + + let mut batch = metrics.batch(); + batch.counter("codex.alpha", 1, &[("env", "dev"), ("component", "alpha")])?; + batch.counter( + "codex.beta", + 2, + &[("service", "worker"), ("component", "beta")], + )?; + metrics.send(batch)?; + metrics.shutdown()?; + + let resource_metrics = latest_metrics(&exporter); + + let Some(alpha_metric) = find_metric(&resource_metrics, "codex.alpha") else { + panic!("alpha metric missing"); + }; + let alpha_attributes = match alpha_metric.data() { + AggregatedMetrics::I64(data) => match data { + MetricData::Sum(sum) => { + let points: Vec<_> = sum.data_points().collect(); + assert_eq!(points.len(), 1); + attributes_to_map(points[0].attributes()) + } + _ => panic!("unexpected alpha aggregation"), + }, + _ => panic!("unexpected alpha data type"), + }; + let expected_alpha_attributes = BTreeMap::from([ + ("service".to_string(), "codex-cli".to_string()), + ("env".to_string(), "dev".to_string()), + ("region".to_string(), "us".to_string()), + ("component".to_string(), "alpha".to_string()), + ]); + assert_eq!(alpha_attributes, expected_alpha_attributes); + + let Some(beta_metric) = find_metric(&resource_metrics, "codex.beta") else { + panic!("beta metric missing"); + }; + let beta_attributes = match beta_metric.data() { + AggregatedMetrics::I64(data) => match data { + MetricData::Sum(sum) => { + let points: Vec<_> = sum.data_points().collect(); + assert_eq!(points.len(), 1); + attributes_to_map(points[0].attributes()) + } + _ => panic!("unexpected beta aggregation"), + }, + _ => panic!("unexpected beta data type"), + }; + let expected_beta_attributes = BTreeMap::from([ + ("service".to_string(), "worker".to_string()), + ("env".to_string(), "prod".to_string()), + ("region".to_string(), "us".to_string()), + ("component".to_string(), "beta".to_string()), + ]); + assert_eq!(beta_attributes, expected_beta_attributes); + + Ok(()) +} + +#[test] +// Ensures 
duration recording maps to histogram output. +fn record_duration_uses_histogram() -> Result<()> { + let (metrics, exporter) = build_test_client()?; + let buckets = HistogramBuckets::from_values(&[10, 20])?; + + metrics.record_duration( + "codex.request_latency", + Duration::from_millis(15), + &buckets, + &[("route", "chat")], + )?; + metrics.shutdown()?; + + let resource_metrics = latest_metrics(&exporter); + let Some(metric) = find_metric(&resource_metrics, "codex.request_latency") else { + panic!("request latency histogram missing"); + }; + let attributes = match metric.data() { + AggregatedMetrics::F64(data) => match data { + MetricData::Histogram(histogram) => { + let points: Vec<_> = histogram.data_points().collect(); + assert_eq!(points.len(), 1); + let point = points[0]; + assert_eq!(point.count(), 1); + assert_eq!(point.sum(), 15.0); + attributes_to_map(point.attributes()) + } + _ => panic!("unexpected histogram aggregation"), + }, + _ => panic!("unexpected histogram data type"), + }; + + let expected_attributes = BTreeMap::from([ + ("service".to_string(), "codex-cli".to_string()), + ("env".to_string(), "prod".to_string()), + ("route".to_string(), "chat".to_string()), + ]); + assert_eq!(attributes, expected_attributes); + + Ok(()) +} + +#[test] +// Ensures time_result propagates errors but still records timing. 
+fn time_result_records_on_error() -> Result<()> { + let (metrics, exporter) = build_test_client()?; + let buckets = HistogramBuckets::from_values(&[10, 20])?; + + let Err(err) = metrics.time_result( + "codex.request_latency", + &buckets, + &[("route", "chat")], + || -> Result<&'static str> { Err(MetricsError::EmptyMetricName) }, + ) else { + panic!("expected error"); + }; + assert!(matches!(err, MetricsError::EmptyMetricName)); + metrics.shutdown()?; + + let resource_metrics = latest_metrics(&exporter); + let Some(metric) = find_metric(&resource_metrics, "codex.request_latency") else { + panic!("request latency histogram missing"); + }; + match metric.data() { + AggregatedMetrics::F64(data) => match data { + MetricData::Histogram(histogram) => { + let points: Vec<_> = histogram.data_points().collect(); + assert_eq!(points.len(), 1); + assert_eq!(points[0].count(), 1); + } + _ => panic!("unexpected histogram aggregation"), + }, + _ => panic!("unexpected histogram data type"), + } + + Ok(()) +} + +#[test] +// Validates invalid tag components are rejected during config build. +fn invalid_tag_component_is_rejected() -> Result<()> { + let Err(err) = MetricsConfig::default().with_tag("bad key", "value") else { + panic!("expected error"); + }; + assert!(matches!( + err, + MetricsError::InvalidTagComponent { label, value } + if label == "tag key" && value == "bad key" + )); + Ok(()) +} + +#[test] +// Ensures the reserved histogram bucketing tag key is rejected in config defaults. +fn reserved_tag_key_is_rejected_in_config() -> Result<()> { + let Err(err) = MetricsConfig::default().with_tag("le", "10") else { + panic!("expected error"); + }; + assert!(matches!(err, MetricsError::ReservedTagKey { key } if key == "le")); + Ok(()) +} + +#[test] +// Ensures per-metric tag keys are validated. 
+fn counter_rejects_invalid_tag_key() { + let mut batch = MetricsBatch::new(); + let Err(err) = batch.counter("codex.turns", 1, &[("bad key", "value")]) else { + panic!("expected error"); + }; + assert!(matches!( + err, + MetricsError::InvalidTagComponent { label, value } + if label == "tag key" && value == "bad key" + )); +} + +#[test] +// Ensures per-metric tag keys cannot use reserved histogram bucketing keys. +fn counter_rejects_reserved_tag_key() { + let mut batch = MetricsBatch::new(); + let Err(err) = batch.counter("codex.turns", 1, &[("le", "10")]) else { + panic!("expected error"); + }; + assert!(matches!(err, MetricsError::ReservedTagKey { key } if key == "le")); +} + +#[test] +// Ensures per-metric tag values are validated. +fn histogram_rejects_invalid_tag_value() -> Result<()> { + let mut batch = MetricsBatch::new(); + let buckets = HistogramBuckets::from_values(&[10])?; + let Err(err) = batch.histogram( + "codex.request_latency", + 3, + &buckets, + &[("route", "bad value")], + ) else { + panic!("expected error"); + }; + assert!(matches!( + err, + MetricsError::InvalidTagComponent { label, value } + if label == "tag value" && value == "bad value" + )); + Ok(()) +} + +#[test] +// Ensures histogram calls reject reserved tag keys even though they no longer add `le`. +fn histogram_rejects_reserved_tag_key() -> Result<()> { + let mut batch = MetricsBatch::new(); + let buckets = HistogramBuckets::from_values(&[10])?; + let Err(err) = batch.histogram("codex.request_latency", 3, &buckets, &[("le", "10")]) else { + panic!("expected error"); + }; + assert!(matches!(err, MetricsError::ReservedTagKey { key } if key == "le")); + Ok(()) +} + +#[test] +// Ensures invalid metric names are rejected when building a batch. 
+fn counter_rejects_invalid_metric_name() -> Result<()> { + let mut batch = MetricsBatch::new(); + let Err(err) = batch.counter("bad name", 1, &[]) else { + panic!("expected error"); + }; + assert!(matches!( + err, + MetricsError::InvalidMetricName { name } if name == "bad name" + )); + Ok(()) +} + +#[test] +// Validates missing API key is rejected early. +fn empty_api_key_is_rejected() { + let Err(err) = MetricsClient::new(MetricsConfig::new("")) else { + panic!("expected error"); + }; + assert!(matches!(err, MetricsError::EmptyApiKey)); +} + +#[test] +// Validates missing endpoint is rejected early. +fn empty_endpoint_is_rejected() { + let Err(err) = MetricsClient::new(MetricsConfig::new("test").with_endpoint("")) else { + panic!("expected error"); + }; + assert!(matches!(err, MetricsError::EmptyEndpoint)); +} diff --git a/codex-rs/metrics/src/time.rs b/codex-rs/otel/src/metrics/time.rs similarity index 100% rename from codex-rs/metrics/src/time.rs rename to codex-rs/otel/src/metrics/time.rs diff --git a/codex-rs/metrics/src/util.rs b/codex-rs/otel/src/metrics/util.rs similarity index 100% rename from codex-rs/metrics/src/util.rs rename to codex-rs/otel/src/metrics/util.rs diff --git a/codex-rs/metrics/src/validation.rs b/codex-rs/otel/src/metrics/validation.rs similarity index 95% rename from codex-rs/metrics/src/validation.rs rename to codex-rs/otel/src/metrics/validation.rs index 20c316490c8..8e2181e2660 100644 --- a/codex-rs/metrics/src/validation.rs +++ b/codex-rs/otel/src/metrics/validation.rs @@ -1,5 +1,5 @@ -use crate::error::MetricsError; -use crate::error::Result; +use crate::metrics::error::MetricsError; +use crate::metrics::error::Result; use std::collections::BTreeMap; pub(crate) fn validate_tags(tags: &BTreeMap) -> Result<()> { diff --git a/codex-rs/otel/src/traces/mod.rs b/codex-rs/otel/src/traces/mod.rs new file mode 100644 index 00000000000..a58949f9ab3 --- /dev/null +++ b/codex-rs/otel/src/traces/mod.rs @@ -0,0 +1,2 @@ +pub mod otel_manager; 
+pub mod otel_provider; diff --git a/codex-rs/otel/src/otel_manager.rs b/codex-rs/otel/src/traces/otel_manager.rs similarity index 74% rename from codex-rs/otel/src/otel_manager.rs rename to codex-rs/otel/src/traces/otel_manager.rs index 33750d83c5f..3e953d6b892 100644 --- a/codex-rs/otel/src/otel_manager.rs +++ b/codex-rs/otel/src/traces/otel_manager.rs @@ -1,4 +1,11 @@ -use crate::otel_provider::traceparent_context_from_env; +use crate::metrics::HistogramBuckets; +use crate::metrics::MetricsBatch; +use crate::metrics::MetricsClient; +use crate::metrics::MetricsConfig; +use crate::metrics::Result as MetricsResult; +use crate::metrics::validation::validate_tag_key; +use crate::metrics::validation::validate_tag_value; +use crate::traces::otel_provider::traceparent_context_from_env; use chrono::SecondsFormat; use chrono::Utc; use codex_api::ResponseEvent; @@ -52,6 +59,8 @@ pub struct OtelEventMetadata { pub struct OtelManager { metadata: OtelEventMetadata, session_span: Span, + metrics: Option, + metrics_use_metadata_tags: bool, } impl OtelManager { @@ -86,6 +95,8 @@ impl OtelManager { terminal_type, }, session_span, + metrics: None, + metrics_use_metadata_tags: true, } } @@ -96,10 +107,109 @@ impl OtelManager { manager } + pub fn with_metrics(mut self, metrics: MetricsClient) -> Self { + self.metrics = Some(metrics); + self.metrics_use_metadata_tags = true; + self + } + + pub fn with_metrics_without_metadata_tags(mut self, metrics: MetricsClient) -> Self { + self.metrics = Some(metrics); + self.metrics_use_metadata_tags = false; + self + } + + pub fn with_metrics_config(self, config: MetricsConfig) -> MetricsResult { + let metrics = MetricsClient::new(config)?; + Ok(self.with_metrics(metrics)) + } + pub fn current_span(&self) -> &Span { &self.session_span } + pub fn counter(&self, name: &str, inc: i64, tags: &[(&str, &str)]) -> MetricsResult<()> { + let Some(metrics) = &self.metrics else { + return Ok(()); + }; + let tags = self.tags_with_metadata(tags)?; + 
metrics.counter(name, inc, &tags) + } + + pub fn histogram( + &self, + name: &str, + value: i64, + buckets: &HistogramBuckets, + tags: &[(&str, &str)], + ) -> MetricsResult<()> { + let Some(metrics) = &self.metrics else { + return Ok(()); + }; + let tags = self.tags_with_metadata(tags)?; + metrics.histogram(name, value, buckets, &tags) + } + + pub fn record_duration( + &self, + name: &str, + duration: Duration, + buckets: &HistogramBuckets, + tags: &[(&str, &str)], + ) -> MetricsResult<()> { + let Some(metrics) = &self.metrics else { + return Ok(()); + }; + let tags = self.tags_with_metadata(tags)?; + metrics.record_duration(name, duration, buckets, &tags) + } + + pub fn time( + &self, + name: &str, + buckets: &HistogramBuckets, + tags: &[(&str, &str)], + f: impl FnOnce() -> T, + ) -> MetricsResult { + let Some(metrics) = &self.metrics else { + return Ok(f()); + }; + let tags = self.tags_with_metadata(tags)?; + metrics.time(name, buckets, &tags, f) + } + + pub fn time_result( + &self, + name: &str, + buckets: &HistogramBuckets, + tags: &[(&str, &str)], + f: impl FnOnce() -> MetricsResult, + ) -> MetricsResult { + let Some(metrics) = &self.metrics else { + return f(); + }; + let tags = self.tags_with_metadata(tags)?; + metrics.time_result(name, buckets, &tags, f) + } + + pub fn batch(&self) -> MetricsResult { + Ok(OtelMetricsBatch::new(self.metadata_tags_owned()?)) + } + + pub fn send(&self, batch: OtelMetricsBatch) -> MetricsResult<()> { + let Some(metrics) = &self.metrics else { + return Ok(()); + }; + metrics.send(batch.into_inner()) + } + + pub fn shutdown_metrics(&self) -> MetricsResult<()> { + let Some(metrics) = &self.metrics else { + return Ok(()); + }; + metrics.shutdown() + } + pub fn record_responses(&self, handle_responses_span: &Span, event: &ResponseEvent) { handle_responses_span.record("otel.name", OtelManager::responses_type(event)); @@ -529,6 +639,108 @@ impl OtelManager { ResponseItem::Other => "other".into(), } } + + fn tags_with_metadata<'a>( + 
&'a self, + tags: &'a [(&'a str, &'a str)], + ) -> MetricsResult> { + let mut merged = self.metadata_tag_refs()?; + merged.extend(tags.iter().copied()); + Ok(merged) + } + + fn metadata_tag_refs(&self) -> MetricsResult> { + if !self.metrics_use_metadata_tags { + return Ok(Vec::new()); + } + let mut tags = Vec::with_capacity(5); + Self::push_metadata_tag(&mut tags, "auth_mode", self.metadata.auth_mode.as_deref())?; + Self::push_metadata_tag(&mut tags, "model", Some(self.metadata.model.as_str()))?; + Self::push_metadata_tag(&mut tags, "slug", Some(self.metadata.slug.as_str()))?; + Self::push_metadata_tag( + &mut tags, + "terminal.type", + Some(self.metadata.terminal_type.as_str()), + )?; + Self::push_metadata_tag(&mut tags, "app.version", Some(self.metadata.app_version))?; + Ok(tags) + } + + fn metadata_tags_owned(&self) -> MetricsResult> { + let tags = self.metadata_tag_refs()?; + Ok(tags + .into_iter() + .map(|(key, value)| (key.to_string(), value.to_string())) + .collect()) + } + + fn push_metadata_tag<'a>( + tags: &mut Vec<(&'a str, &'a str)>, + key: &'static str, + value: Option<&'a str>, + ) -> MetricsResult<()> { + let Some(value) = value else { + return Ok(()); + }; + validate_tag_key(key)?; + validate_tag_value(value)?; + tags.push((key, value)); + Ok(()) + } +} + +pub struct OtelMetricsBatch { + batch: MetricsBatch, + metadata_tags: Vec<(String, String)>, +} + +impl OtelMetricsBatch { + fn new(metadata_tags: Vec<(String, String)>) -> Self { + Self { + batch: MetricsBatch::new(), + metadata_tags, + } + } + + pub fn counter(&mut self, name: &str, inc: i64, tags: &[(&str, &str)]) -> MetricsResult<()> { + let metadata_tags = std::mem::take(&mut self.metadata_tags); + let merged = Self::merge_tags(&metadata_tags, tags); + let result = self.batch.counter(name, inc, &merged); + self.metadata_tags = metadata_tags; + result + } + + pub fn histogram( + &mut self, + name: &str, + value: i64, + buckets: &HistogramBuckets, + tags: &[(&str, &str)], + ) -> 
MetricsResult<()> { + let metadata_tags = std::mem::take(&mut self.metadata_tags); + let merged = Self::merge_tags(&metadata_tags, tags); + let result = self.batch.histogram(name, value, buckets, &merged); + self.metadata_tags = metadata_tags; + result + } + + fn merge_tags<'a>( + metadata_tags: &'a [(String, String)], + tags: &'a [(&'a str, &'a str)], + ) -> Vec<(&'a str, &'a str)> { + let mut merged = Vec::with_capacity(metadata_tags.len() + tags.len()); + merged.extend( + metadata_tags + .iter() + .map(|(key, value)| (key.as_str(), value.as_str())), + ); + merged.extend(tags.iter().copied()); + merged + } + + fn into_inner(self) -> MetricsBatch { + self.batch + } } fn timestamp() -> String { diff --git a/codex-rs/otel/src/otel_provider.rs b/codex-rs/otel/src/traces/otel_provider.rs similarity index 100% rename from codex-rs/otel/src/otel_provider.rs rename to codex-rs/otel/src/traces/otel_provider.rs From 11c604516d0c23b9ecd0b01b11c2a7f268145668 Mon Sep 17 00:00:00 2001 From: jif-oai Date: Sun, 21 Dec 2025 11:29:51 +0100 Subject: [PATCH 09/43] Unification --- codex-rs/core/src/otel_init.rs | 1 + codex-rs/docs/metrics.md | 4 ++ codex-rs/otel/README.md | 53 +++++++++++++++++++---- codex-rs/otel/src/config.rs | 2 + codex-rs/otel/src/traces/otel_manager.rs | 8 ++++ codex-rs/otel/src/traces/otel_provider.rs | 20 ++++++++- 6 files changed, 77 insertions(+), 11 deletions(-) diff --git a/codex-rs/core/src/otel_init.rs b/codex-rs/core/src/otel_init.rs index 8736869126f..8ee4746dd20 100644 --- a/codex-rs/core/src/otel_init.rs +++ b/codex-rs/core/src/otel_init.rs @@ -71,6 +71,7 @@ pub fn build_provider( environment: config.otel.environment.to_string(), exporter, trace_exporter, + metrics: None, }) } diff --git a/codex-rs/docs/metrics.md b/codex-rs/docs/metrics.md index 28a095321a6..d19ae9e02b3 100644 --- a/codex-rs/docs/metrics.md +++ b/codex-rs/docs/metrics.md @@ -63,6 +63,10 @@ manager.counter("codex.session_started", 1, &[("source", "tui")])?; 
manager.histogram("codex.request_latency", 83, &buckets, &[("route", "chat")])?; ``` +If you set `metrics: Some(MetricsConfig)` on `OtelSettings` and build an +`OtelProvider`, you can reuse that client via +`OtelManager::with_provider_metrics(&provider)`. + ## Configuration `MetricsConfig` lets you specify: diff --git a/codex-rs/otel/README.md b/codex-rs/otel/README.md index 953c7408117..605da6f2ef6 100644 --- a/codex-rs/otel/README.md +++ b/codex-rs/otel/README.md @@ -36,6 +36,7 @@ let settings = OtelSettings { protocol: OtelHttpProtocol::Binary, tls: None, }, + metrics: None, }; if let Some(provider) = OtelProvider::from(&settings)? { @@ -92,12 +93,47 @@ metrics.counter("codex.session_started", 1, &[("source", "tui")])?; ## Metrics via OtelManager -Attach a metrics client (or config) to `OtelManager` to reuse metadata: +Attach metrics once in `OtelSettings.metrics` and reuse them from +`OtelManager`: ```rust -use codex_otel::metrics::HistogramBuckets; -use codex_otel::metrics::MetricsConfig; +use codex_otel::config::{OtelExporter, OtelHttpProtocol, OtelSettings}; +use codex_otel::metrics::{HistogramBuckets, MetricsConfig}; use codex_otel::traces::otel_manager::OtelManager; +use codex_otel::traces::otel_provider::OtelProvider; +use tracing_subscriber::prelude::*; + +let settings = OtelSettings { + environment: "dev".into(), + service_name: "codex-cli".into(), + service_version: env!("CARGO_PKG_VERSION").into(), + codex_home: std::path::PathBuf::from("/tmp"), + exporter: OtelExporter::OtlpHttp { + endpoint: "https://otlp.example.com".into(), + headers: std::collections::HashMap::new(), + protocol: OtelHttpProtocol::Binary, + tls: None, + }, + trace_exporter: OtelExporter::OtlpHttp { + endpoint: "https://otlp.example.com".into(), + headers: std::collections::HashMap::new(), + protocol: OtelHttpProtocol::Binary, + tls: None, + }, + metrics: Some( + MetricsConfig::new("") + .with_endpoint("") + .with_api_key_header(""), + ), +}; + +let provider = 
OtelProvider::from(&settings)?; +if let Some(p) = &provider { + tracing_subscriber::registry() + .with(p.logger_layer()) + .with(p.tracing_layer()) + .init(); +} let manager = OtelManager::new( conversation_id, @@ -109,12 +145,11 @@ let manager = OtelManager::new( log_user_prompts, terminal_type, session_source, -) -.with_metrics_config( - MetricsConfig::new("") - .with_endpoint("") - .with_api_key_header(""), -)?; +); +let manager = provider + .as_ref() + .map(|p| manager.with_provider_metrics(p)) + .unwrap_or(manager); let buckets = HistogramBuckets::from_values(&[25, 50, 100, 250, 500])?; manager.counter("codex.session_started", 1, &[("source", "tui")])?; diff --git a/codex-rs/otel/src/config.rs b/codex-rs/otel/src/config.rs index 935c0379fbe..78a867aa94e 100644 --- a/codex-rs/otel/src/config.rs +++ b/codex-rs/otel/src/config.rs @@ -1,6 +1,7 @@ use std::collections::HashMap; use std::path::PathBuf; +use crate::metrics::MetricsConfig; use codex_utils_absolute_path::AbsolutePathBuf; #[derive(Clone, Debug)] @@ -11,6 +12,7 @@ pub struct OtelSettings { pub codex_home: PathBuf, pub exporter: OtelExporter, pub trace_exporter: OtelExporter, + pub metrics: Option, } #[derive(Clone, Debug)] diff --git a/codex-rs/otel/src/traces/otel_manager.rs b/codex-rs/otel/src/traces/otel_manager.rs index 3e953d6b892..b54ea49131c 100644 --- a/codex-rs/otel/src/traces/otel_manager.rs +++ b/codex-rs/otel/src/traces/otel_manager.rs @@ -5,6 +5,7 @@ use crate::metrics::MetricsConfig; use crate::metrics::Result as MetricsResult; use crate::metrics::validation::validate_tag_key; use crate::metrics::validation::validate_tag_value; +use crate::traces::otel_provider::OtelProvider; use crate::traces::otel_provider::traceparent_context_from_env; use chrono::SecondsFormat; use chrono::Utc; @@ -124,6 +125,13 @@ impl OtelManager { Ok(self.with_metrics(metrics)) } + pub fn with_provider_metrics(self, provider: &OtelProvider) -> Self { + match provider.metrics() { + Some(metrics) => 
self.with_metrics(metrics.clone()), + None => self, + } + } + pub fn current_span(&self) -> &Span { &self.session_span } diff --git a/codex-rs/otel/src/traces/otel_provider.rs b/codex-rs/otel/src/traces/otel_provider.rs index b9d95593255..0d12e378aab 100644 --- a/codex-rs/otel/src/traces/otel_provider.rs +++ b/codex-rs/otel/src/traces/otel_provider.rs @@ -2,6 +2,7 @@ use crate::config::OtelExporter; use crate::config::OtelHttpProtocol; use crate::config::OtelSettings; use crate::config::OtelTlsConfig; +use crate::metrics::MetricsClient; use codex_utils_absolute_path::AbsolutePathBuf; use http::Uri; use opentelemetry::Context; @@ -67,6 +68,7 @@ pub struct OtelProvider { pub logger: Option, pub tracer_provider: Option, pub tracer: Option, + pub metrics: Option, } impl OtelProvider { @@ -77,14 +79,23 @@ impl OtelProvider { if let Some(tracer_provider) = &self.tracer_provider { let _ = tracer_provider.shutdown(); } + if let Some(metrics) = &self.metrics { + let _ = metrics.shutdown(); + } } pub fn from(settings: &OtelSettings) -> Result, Box> { let log_enabled = !matches!(settings.exporter, OtelExporter::None); let trace_enabled = !matches!(settings.trace_exporter, OtelExporter::None); - if !log_enabled && !trace_enabled { - debug!("No exporter enabled in OTLP settings."); + let metrics = settings + .metrics + .clone() + .map(MetricsClient::new) + .transpose()?; + + if !log_enabled && !trace_enabled && metrics.is_none() { + debug!("No OTEL exporter enabled in settings."); return Ok(None); } @@ -113,6 +124,7 @@ impl OtelProvider { logger, tracer_provider, tracer, + metrics, })) } @@ -141,6 +153,10 @@ impl OtelProvider { pub fn codex_export_filter(meta: &tracing::Metadata<'_>) -> bool { meta.target().starts_with("codex_otel") } + + pub fn metrics(&self) -> Option<&MetricsClient> { + self.metrics.as_ref() + } } impl Drop for OtelProvider { From 14736ca199b4115ee95fb727d45b5597551feb76 Mon Sep 17 00:00:00 2001 From: jif-oai Date: Mon, 22 Dec 2025 10:38:48 +0100 Subject: 
[PATCH 10/43] Unification 2 --- codex-rs/core/src/client.rs | 2 +- codex-rs/core/src/codex.rs | 2 +- codex-rs/core/src/state/service.rs | 2 +- codex-rs/core/src/tools/orchestrator.rs | 5 +- .../core/tests/chat_completions_payload.rs | 2 +- codex-rs/core/tests/chat_completions_sse.rs | 2 +- codex-rs/core/tests/responses_headers.rs | 2 +- codex-rs/core/tests/suite/client.rs | 2 +- codex-rs/docs/metrics.md | 2 +- codex-rs/otel/README.md | 6 +- codex-rs/otel/src/lib.rs | 205 ++++++++++++++ codex-rs/otel/src/metrics/batch.rs | 29 +- codex-rs/otel/src/traces/otel_manager.rs | 257 +----------------- 13 files changed, 247 insertions(+), 271 deletions(-) diff --git a/codex-rs/core/src/client.rs b/codex-rs/core/src/client.rs index dc0bf3872da..6f6db825472 100644 --- a/codex-rs/core/src/client.rs +++ b/codex-rs/core/src/client.rs @@ -18,7 +18,7 @@ use codex_api::common::Reasoning; use codex_api::create_text_param_for_request; use codex_api::error::ApiError; use codex_app_server_protocol::AuthMode; -use codex_otel::traces::otel_manager::OtelManager; +use codex_otel::OtelManager; use codex_protocol::ConversationId; use codex_protocol::config_types::ReasoningSummary as ReasoningSummaryConfig; use codex_protocol::models::ResponseItem; diff --git a/codex-rs/core/src/codex.rs b/codex-rs/core/src/codex.rs index 746790b3287..c4a02a4eecf 100644 --- a/codex-rs/core/src/codex.rs +++ b/codex-rs/core/src/codex.rs @@ -150,7 +150,7 @@ use crate::user_notification::UserNotification; use crate::util::backoff; use codex_async_utils::OrCancelExt; use codex_execpolicy::Policy as ExecPolicy; -use codex_otel::traces::otel_manager::OtelManager; +use codex_otel::OtelManager; use codex_protocol::config_types::ReasoningSummary as ReasoningSummaryConfig; use codex_protocol::models::ContentItem; use codex_protocol::models::ResponseInputItem; diff --git a/codex-rs/core/src/state/service.rs b/codex-rs/core/src/state/service.rs index b859af310a8..8257346c4cd 100644 --- a/codex-rs/core/src/state/service.rs 
+++ b/codex-rs/core/src/state/service.rs @@ -8,7 +8,7 @@ use crate::skills::SkillsManager; use crate::tools::sandboxing::ApprovalStore; use crate::unified_exec::UnifiedExecSessionManager; use crate::user_notification::UserNotifier; -use codex_otel::traces::otel_manager::OtelManager; +use codex_otel::OtelManager; use tokio::sync::Mutex; use tokio::sync::RwLock; use tokio_util::sync::CancellationToken; diff --git a/codex-rs/core/src/tools/orchestrator.rs b/codex-rs/core/src/tools/orchestrator.rs index 113ad98e0f6..f0810916a55 100644 --- a/codex-rs/core/src/tools/orchestrator.rs +++ b/codex-rs/core/src/tools/orchestrator.rs @@ -17,6 +17,7 @@ use crate::tools::sandboxing::ToolCtx; use crate::tools::sandboxing::ToolError; use crate::tools::sandboxing::ToolRuntime; use crate::tools::sandboxing::default_exec_approval_requirement; +use codex_otel::ToolDecisionSource; use codex_protocol::protocol::AskForApproval; use codex_protocol::protocol::ReviewDecision; @@ -45,8 +46,8 @@ impl ToolOrchestrator { let otel = turn_ctx.client.get_otel_manager(); let otel_tn = &tool_ctx.tool_name; let otel_ci = &tool_ctx.call_id; - let otel_user = codex_otel::traces::otel_manager::ToolDecisionSource::User; - let otel_cfg = codex_otel::traces::otel_manager::ToolDecisionSource::Config; + let otel_user = ToolDecisionSource::User; + let otel_cfg = ToolDecisionSource::Config; // 1) Approval let mut already_approved = false; diff --git a/codex-rs/core/tests/chat_completions_payload.rs b/codex-rs/core/tests/chat_completions_payload.rs index c36fef937ef..9ba3e25b075 100644 --- a/codex-rs/core/tests/chat_completions_payload.rs +++ b/codex-rs/core/tests/chat_completions_payload.rs @@ -13,7 +13,7 @@ use codex_core::Prompt; use codex_core::ResponseItem; use codex_core::WireApi; use codex_core::openai_models::models_manager::ModelsManager; -use codex_otel::traces::otel_manager::OtelManager; +use codex_otel::OtelManager; use codex_protocol::ConversationId; use codex_protocol::models::ReasoningItemContent; 
use codex_protocol::protocol::SessionSource; diff --git a/codex-rs/core/tests/chat_completions_sse.rs b/codex-rs/core/tests/chat_completions_sse.rs index c5820f57123..926da089b8b 100644 --- a/codex-rs/core/tests/chat_completions_sse.rs +++ b/codex-rs/core/tests/chat_completions_sse.rs @@ -12,7 +12,7 @@ use codex_core::ResponseEvent; use codex_core::ResponseItem; use codex_core::WireApi; use codex_core::openai_models::models_manager::ModelsManager; -use codex_otel::traces::otel_manager::OtelManager; +use codex_otel::OtelManager; use codex_protocol::ConversationId; use codex_protocol::models::ReasoningItemContent; use codex_protocol::protocol::SessionSource; diff --git a/codex-rs/core/tests/responses_headers.rs b/codex-rs/core/tests/responses_headers.rs index 580fc58bcc4..0bb69aa5567 100644 --- a/codex-rs/core/tests/responses_headers.rs +++ b/codex-rs/core/tests/responses_headers.rs @@ -11,7 +11,7 @@ use codex_core::ResponseEvent; use codex_core::ResponseItem; use codex_core::WireApi; use codex_core::openai_models::models_manager::ModelsManager; -use codex_otel::traces::otel_manager::OtelManager; +use codex_otel::OtelManager; use codex_protocol::ConversationId; use codex_protocol::config_types::ReasoningSummary; use codex_protocol::openai_models::ReasoningSummaryFormat; diff --git a/codex-rs/core/tests/suite/client.rs b/codex-rs/core/tests/suite/client.rs index 132dc50aadd..d4891b2121e 100644 --- a/codex-rs/core/tests/suite/client.rs +++ b/codex-rs/core/tests/suite/client.rs @@ -20,7 +20,7 @@ use codex_core::openai_models::models_manager::ModelsManager; use codex_core::protocol::EventMsg; use codex_core::protocol::Op; use codex_core::protocol::SessionSource; -use codex_otel::traces::otel_manager::OtelManager; +use codex_otel::OtelManager; use codex_protocol::ConversationId; use codex_protocol::config_types::ReasoningSummary; use codex_protocol::config_types::Verbosity; diff --git a/codex-rs/docs/metrics.md b/codex-rs/docs/metrics.md index d19ae9e02b3..50d0ae94323 
100644 --- a/codex-rs/docs/metrics.md +++ b/codex-rs/docs/metrics.md @@ -39,7 +39,7 @@ client and emit metrics through the same handle. By default, metrics sent via ```rust use codex_otel::metrics::HistogramBuckets; use codex_otel::metrics::MetricsConfig; -use codex_otel::traces::otel_manager::OtelManager; +use codex_otel::OtelManager; let manager = OtelManager::new( conversation_id, diff --git a/codex-rs/otel/README.md b/codex-rs/otel/README.md index 605da6f2ef6..a7cb17dd81c 100644 --- a/codex-rs/otel/README.md +++ b/codex-rs/otel/README.md @@ -3,7 +3,7 @@ `codex-otel` is the OpenTelemetry integration crate for Codex. It provides: - Trace/log exporters and tracing subscriber layers (`codex_otel::traces::otel_provider`). -- A structured event helper (`codex_otel::traces::otel_manager::OtelManager`). +- A structured event helper (`codex_otel::OtelManager`). - A Statsig OTLP metrics client (`codex_otel::metrics`). - A metrics facade on `OtelManager` so tracing + metrics share metadata. @@ -53,7 +53,7 @@ if let Some(provider) = OtelProvider::from(&settings)? { Codex-specific events. 
```rust -use codex_otel::traces::otel_manager::OtelManager; +use codex_otel::OtelManager; let manager = OtelManager::new( conversation_id, @@ -99,7 +99,7 @@ Attach metrics once in `OtelSettings.metrics` and reuse them from ```rust use codex_otel::config::{OtelExporter, OtelHttpProtocol, OtelSettings}; use codex_otel::metrics::{HistogramBuckets, MetricsConfig}; -use codex_otel::traces::otel_manager::OtelManager; +use codex_otel::OtelManager; use codex_otel::traces::otel_provider::OtelProvider; use tracing_subscriber::prelude::*; diff --git a/codex-rs/otel/src/lib.rs b/codex-rs/otel/src/lib.rs index 0f55a738096..9b1353cfd13 100644 --- a/codex-rs/otel/src/lib.rs +++ b/codex-rs/otel/src/lib.rs @@ -1,3 +1,208 @@ pub mod config; pub mod metrics; pub mod traces; + +use crate::metrics::HistogramBuckets; +use crate::metrics::MetricsBatch; +use crate::metrics::MetricsClient; +use crate::metrics::MetricsConfig; +use crate::metrics::Result as MetricsResult; +use crate::metrics::validation::validate_tag_key; +use crate::metrics::validation::validate_tag_value; +use crate::traces::otel_provider::OtelProvider; +use codex_protocol::ConversationId; +use serde::Serialize; +use std::time::Duration; +use strum_macros::Display; +use tracing::Span; + +#[derive(Debug, Clone, Serialize, Display)] +#[serde(rename_all = "snake_case")] +pub enum ToolDecisionSource { + Config, + User, +} + +#[derive(Debug, Clone)] +pub struct OtelEventMetadata { + pub(crate) conversation_id: ConversationId, + pub(crate) auth_mode: Option, + pub(crate) account_id: Option, + pub(crate) account_email: Option, + pub(crate) model: String, + pub(crate) slug: String, + pub(crate) log_user_prompts: bool, + pub(crate) app_version: &'static str, + pub(crate) terminal_type: String, +} + +#[derive(Debug, Clone)] +pub struct OtelManager { + pub(crate) metadata: OtelEventMetadata, + pub(crate) session_span: Span, + pub(crate) metrics: Option, + pub(crate) metrics_use_metadata_tags: bool, +} + +impl OtelManager { + pub fn 
with_model(&self, model: &str, slug: &str) -> Self { + let mut manager = self.clone(); + manager.metadata.model = model.to_owned(); + manager.metadata.slug = slug.to_owned(); + manager + } + + pub fn with_metrics(mut self, metrics: MetricsClient) -> Self { + self.metrics = Some(metrics); + self.metrics_use_metadata_tags = true; + self + } + + pub fn with_metrics_without_metadata_tags(mut self, metrics: MetricsClient) -> Self { + self.metrics = Some(metrics); + self.metrics_use_metadata_tags = false; + self + } + + pub fn with_metrics_config(self, config: MetricsConfig) -> MetricsResult { + let metrics = MetricsClient::new(config)?; + Ok(self.with_metrics(metrics)) + } + + pub fn with_provider_metrics(self, provider: &OtelProvider) -> Self { + match provider.metrics() { + Some(metrics) => self.with_metrics(metrics.clone()), + None => self, + } + } + + pub fn counter(&self, name: &str, inc: i64, tags: &[(&str, &str)]) -> MetricsResult<()> { + let Some(metrics) = &self.metrics else { + return Ok(()); + }; + let tags = self.tags_with_metadata(tags)?; + metrics.counter(name, inc, &tags) + } + + pub fn histogram( + &self, + name: &str, + value: i64, + buckets: &HistogramBuckets, + tags: &[(&str, &str)], + ) -> MetricsResult<()> { + let Some(metrics) = &self.metrics else { + return Ok(()); + }; + let tags = self.tags_with_metadata(tags)?; + metrics.histogram(name, value, buckets, &tags) + } + + pub fn record_duration( + &self, + name: &str, + duration: Duration, + buckets: &HistogramBuckets, + tags: &[(&str, &str)], + ) -> MetricsResult<()> { + let Some(metrics) = &self.metrics else { + return Ok(()); + }; + let tags = self.tags_with_metadata(tags)?; + metrics.record_duration(name, duration, buckets, &tags) + } + + pub fn time( + &self, + name: &str, + buckets: &HistogramBuckets, + tags: &[(&str, &str)], + f: impl FnOnce() -> T, + ) -> MetricsResult { + let Some(metrics) = &self.metrics else { + return Ok(f()); + }; + let tags = self.tags_with_metadata(tags)?; + 
metrics.time(name, buckets, &tags, f) + } + + pub fn time_result( + &self, + name: &str, + buckets: &HistogramBuckets, + tags: &[(&str, &str)], + f: impl FnOnce() -> MetricsResult, + ) -> MetricsResult { + let Some(metrics) = &self.metrics else { + return f(); + }; + let tags = self.tags_with_metadata(tags)?; + metrics.time_result(name, buckets, &tags, f) + } + + pub fn batch(&self) -> MetricsResult { + MetricsBatch::with_default_tags(self.metadata_tags_owned()?) + } + + pub fn send(&self, batch: MetricsBatch) -> MetricsResult<()> { + let Some(metrics) = &self.metrics else { + return Ok(()); + }; + metrics.send(batch) + } + + pub fn shutdown_metrics(&self) -> MetricsResult<()> { + let Some(metrics) = &self.metrics else { + return Ok(()); + }; + metrics.shutdown() + } + + fn tags_with_metadata<'a>( + &'a self, + tags: &'a [(&'a str, &'a str)], + ) -> MetricsResult> { + let mut merged = self.metadata_tag_refs()?; + merged.extend(tags.iter().copied()); + Ok(merged) + } + + fn metadata_tag_refs(&self) -> MetricsResult> { + if !self.metrics_use_metadata_tags { + return Ok(Vec::new()); + } + let mut tags = Vec::with_capacity(5); + Self::push_metadata_tag(&mut tags, "auth_mode", self.metadata.auth_mode.as_deref())?; + Self::push_metadata_tag(&mut tags, "model", Some(self.metadata.model.as_str()))?; + Self::push_metadata_tag(&mut tags, "slug", Some(self.metadata.slug.as_str()))?; + Self::push_metadata_tag( + &mut tags, + "terminal.type", + Some(self.metadata.terminal_type.as_str()), + )?; + Self::push_metadata_tag(&mut tags, "app.version", Some(self.metadata.app_version))?; + Ok(tags) + } + + fn metadata_tags_owned(&self) -> MetricsResult> { + let tags = self.metadata_tag_refs()?; + Ok(tags + .into_iter() + .map(|(key, value)| (key.to_string(), value.to_string())) + .collect()) + } + + fn push_metadata_tag<'a>( + tags: &mut Vec<(&'a str, &'a str)>, + key: &'static str, + value: Option<&'a str>, + ) -> MetricsResult<()> { + let Some(value) = value else { + return Ok(()); + 
}; + validate_tag_key(key)?; + validate_tag_value(value)?; + tags.push((key, value)); + Ok(()) + } +} diff --git a/codex-rs/otel/src/metrics/batch.rs b/codex-rs/otel/src/metrics/batch.rs index 37bad7d0a44..e896a94470d 100644 --- a/codex-rs/otel/src/metrics/batch.rs +++ b/codex-rs/otel/src/metrics/batch.rs @@ -2,6 +2,8 @@ use crate::metrics::error::MetricsError; use crate::metrics::error::Result; use crate::metrics::tags::collect_tags; use crate::metrics::validation::validate_metric_name; +use crate::metrics::validation::validate_tag_key; +use crate::metrics::validation::validate_tag_value; #[cfg_attr(test, derive(PartialEq, Eq))] #[derive(Clone, Debug)] @@ -118,6 +120,7 @@ pub(crate) enum MetricEvent { pub struct MetricsBatch { events: Vec, + default_tags: Vec<(String, String)>, } impl Default for MetricsBatch { @@ -129,17 +132,32 @@ impl Default for MetricsBatch { impl MetricsBatch { /// Create an empty metrics batch. pub fn new() -> Self { - Self { events: Vec::new() } + Self { + events: Vec::new(), + default_tags: Vec::new(), + } + } + + pub fn with_default_tags(default_tags: Vec<(String, String)>) -> Result { + for (key, value) in &default_tags { + validate_tag_key(key)?; + validate_tag_value(value)?; + } + Ok(Self { + events: Vec::new(), + default_tags, + }) } /// Append a counter increment to the batch. pub fn counter(&mut self, name: &str, inc: i64, tags: &[(&str, &str)]) -> Result<()> { validate_metric_name(name)?; - let tags = collect_tags(tags)?; + let mut merged_tags = self.default_tags.clone(); + merged_tags.extend(collect_tags(tags)?); self.events.push(MetricEvent::Counter { name: name.to_string(), value: inc, - tags, + tags: merged_tags, }); Ok(()) } @@ -155,11 +173,12 @@ impl MetricsBatch { // Buckets remain part of the API, but OTEL histogram aggregation owns bucket selection. 
let _ = buckets.bounds(); validate_metric_name(name)?; - let tags = collect_tags(tags)?; + let mut merged_tags = self.default_tags.clone(); + merged_tags.extend(collect_tags(tags)?); self.events.push(MetricEvent::Histogram { name: name.to_string(), value, - tags, + tags: merged_tags, }); Ok(()) } diff --git a/codex-rs/otel/src/traces/otel_manager.rs b/codex-rs/otel/src/traces/otel_manager.rs index b54ea49131c..7b915bcc705 100644 --- a/codex-rs/otel/src/traces/otel_manager.rs +++ b/codex-rs/otel/src/traces/otel_manager.rs @@ -1,11 +1,3 @@ -use crate::metrics::HistogramBuckets; -use crate::metrics::MetricsBatch; -use crate::metrics::MetricsClient; -use crate::metrics::MetricsConfig; -use crate::metrics::Result as MetricsResult; -use crate::metrics::validation::validate_tag_key; -use crate::metrics::validation::validate_tag_value; -use crate::traces::otel_provider::OtelProvider; use crate::traces::otel_provider::traceparent_context_from_env; use chrono::SecondsFormat; use chrono::Utc; @@ -24,45 +16,19 @@ use eventsource_stream::Event as StreamEvent; use eventsource_stream::EventStreamError as StreamError; use reqwest::Error; use reqwest::Response; -use serde::Serialize; use std::borrow::Cow; use std::fmt::Display; use std::future::Future; use std::time::Duration; use std::time::Instant; -use strum_macros::Display; use tokio::time::error::Elapsed; use tracing::Span; use tracing::trace_span; use tracing_opentelemetry::OpenTelemetrySpanExt; -#[derive(Debug, Clone, Serialize, Display)] -#[serde(rename_all = "snake_case")] -pub enum ToolDecisionSource { - Config, - User, -} - -#[derive(Debug, Clone)] -pub struct OtelEventMetadata { - conversation_id: ConversationId, - auth_mode: Option, - account_id: Option, - account_email: Option, - model: String, - slug: String, - log_user_prompts: bool, - app_version: &'static str, - terminal_type: String, -} - -#[derive(Debug, Clone)] -pub struct OtelManager { - metadata: OtelEventMetadata, - session_span: Span, - metrics: Option, - 
metrics_use_metadata_tags: bool, -} +pub use crate::OtelEventMetadata; +pub use crate::OtelManager; +pub use crate::ToolDecisionSource; impl OtelManager { #[allow(clippy::too_many_arguments)] @@ -101,123 +67,10 @@ impl OtelManager { } } - pub fn with_model(&self, model: &str, slug: &str) -> Self { - let mut manager = self.clone(); - manager.metadata.model = model.to_owned(); - manager.metadata.slug = slug.to_owned(); - manager - } - - pub fn with_metrics(mut self, metrics: MetricsClient) -> Self { - self.metrics = Some(metrics); - self.metrics_use_metadata_tags = true; - self - } - - pub fn with_metrics_without_metadata_tags(mut self, metrics: MetricsClient) -> Self { - self.metrics = Some(metrics); - self.metrics_use_metadata_tags = false; - self - } - - pub fn with_metrics_config(self, config: MetricsConfig) -> MetricsResult { - let metrics = MetricsClient::new(config)?; - Ok(self.with_metrics(metrics)) - } - - pub fn with_provider_metrics(self, provider: &OtelProvider) -> Self { - match provider.metrics() { - Some(metrics) => self.with_metrics(metrics.clone()), - None => self, - } - } - pub fn current_span(&self) -> &Span { &self.session_span } - pub fn counter(&self, name: &str, inc: i64, tags: &[(&str, &str)]) -> MetricsResult<()> { - let Some(metrics) = &self.metrics else { - return Ok(()); - }; - let tags = self.tags_with_metadata(tags)?; - metrics.counter(name, inc, &tags) - } - - pub fn histogram( - &self, - name: &str, - value: i64, - buckets: &HistogramBuckets, - tags: &[(&str, &str)], - ) -> MetricsResult<()> { - let Some(metrics) = &self.metrics else { - return Ok(()); - }; - let tags = self.tags_with_metadata(tags)?; - metrics.histogram(name, value, buckets, &tags) - } - - pub fn record_duration( - &self, - name: &str, - duration: Duration, - buckets: &HistogramBuckets, - tags: &[(&str, &str)], - ) -> MetricsResult<()> { - let Some(metrics) = &self.metrics else { - return Ok(()); - }; - let tags = self.tags_with_metadata(tags)?; - 
metrics.record_duration(name, duration, buckets, &tags) - } - - pub fn time( - &self, - name: &str, - buckets: &HistogramBuckets, - tags: &[(&str, &str)], - f: impl FnOnce() -> T, - ) -> MetricsResult { - let Some(metrics) = &self.metrics else { - return Ok(f()); - }; - let tags = self.tags_with_metadata(tags)?; - metrics.time(name, buckets, &tags, f) - } - - pub fn time_result( - &self, - name: &str, - buckets: &HistogramBuckets, - tags: &[(&str, &str)], - f: impl FnOnce() -> MetricsResult, - ) -> MetricsResult { - let Some(metrics) = &self.metrics else { - return f(); - }; - let tags = self.tags_with_metadata(tags)?; - metrics.time_result(name, buckets, &tags, f) - } - - pub fn batch(&self) -> MetricsResult { - Ok(OtelMetricsBatch::new(self.metadata_tags_owned()?)) - } - - pub fn send(&self, batch: OtelMetricsBatch) -> MetricsResult<()> { - let Some(metrics) = &self.metrics else { - return Ok(()); - }; - metrics.send(batch.into_inner()) - } - - pub fn shutdown_metrics(&self) -> MetricsResult<()> { - let Some(metrics) = &self.metrics else { - return Ok(()); - }; - metrics.shutdown() - } - pub fn record_responses(&self, handle_responses_span: &Span, event: &ResponseEvent) { handle_responses_span.record("otel.name", OtelManager::responses_type(event)); @@ -280,7 +133,7 @@ impl OtelManager { F: FnOnce() -> Fut, Fut: Future>, { - let start = std::time::Instant::now(); + let start = Instant::now(); let response = f().await; let duration = start.elapsed(); @@ -647,108 +500,6 @@ impl OtelManager { ResponseItem::Other => "other".into(), } } - - fn tags_with_metadata<'a>( - &'a self, - tags: &'a [(&'a str, &'a str)], - ) -> MetricsResult> { - let mut merged = self.metadata_tag_refs()?; - merged.extend(tags.iter().copied()); - Ok(merged) - } - - fn metadata_tag_refs(&self) -> MetricsResult> { - if !self.metrics_use_metadata_tags { - return Ok(Vec::new()); - } - let mut tags = Vec::with_capacity(5); - Self::push_metadata_tag(&mut tags, "auth_mode", 
self.metadata.auth_mode.as_deref())?; - Self::push_metadata_tag(&mut tags, "model", Some(self.metadata.model.as_str()))?; - Self::push_metadata_tag(&mut tags, "slug", Some(self.metadata.slug.as_str()))?; - Self::push_metadata_tag( - &mut tags, - "terminal.type", - Some(self.metadata.terminal_type.as_str()), - )?; - Self::push_metadata_tag(&mut tags, "app.version", Some(self.metadata.app_version))?; - Ok(tags) - } - - fn metadata_tags_owned(&self) -> MetricsResult> { - let tags = self.metadata_tag_refs()?; - Ok(tags - .into_iter() - .map(|(key, value)| (key.to_string(), value.to_string())) - .collect()) - } - - fn push_metadata_tag<'a>( - tags: &mut Vec<(&'a str, &'a str)>, - key: &'static str, - value: Option<&'a str>, - ) -> MetricsResult<()> { - let Some(value) = value else { - return Ok(()); - }; - validate_tag_key(key)?; - validate_tag_value(value)?; - tags.push((key, value)); - Ok(()) - } -} - -pub struct OtelMetricsBatch { - batch: MetricsBatch, - metadata_tags: Vec<(String, String)>, -} - -impl OtelMetricsBatch { - fn new(metadata_tags: Vec<(String, String)>) -> Self { - Self { - batch: MetricsBatch::new(), - metadata_tags, - } - } - - pub fn counter(&mut self, name: &str, inc: i64, tags: &[(&str, &str)]) -> MetricsResult<()> { - let metadata_tags = std::mem::take(&mut self.metadata_tags); - let merged = Self::merge_tags(&metadata_tags, tags); - let result = self.batch.counter(name, inc, &merged); - self.metadata_tags = metadata_tags; - result - } - - pub fn histogram( - &mut self, - name: &str, - value: i64, - buckets: &HistogramBuckets, - tags: &[(&str, &str)], - ) -> MetricsResult<()> { - let metadata_tags = std::mem::take(&mut self.metadata_tags); - let merged = Self::merge_tags(&metadata_tags, tags); - let result = self.batch.histogram(name, value, buckets, &merged); - self.metadata_tags = metadata_tags; - result - } - - fn merge_tags<'a>( - metadata_tags: &'a [(String, String)], - tags: &'a [(&'a str, &'a str)], - ) -> Vec<(&'a str, &'a str)> { - let 
mut merged = Vec::with_capacity(metadata_tags.len() + tags.len()); - merged.extend( - metadata_tags - .iter() - .map(|(key, value)| (key.as_str(), value.as_str())), - ); - merged.extend(tags.iter().copied()); - merged - } - - fn into_inner(self) -> MetricsBatch { - self.batch - } } fn timestamp() -> String { From d347354ef9293c8d3359d1b9c780f72ff5b54859 Mon Sep 17 00:00:00 2001 From: jif-oai Date: Mon, 22 Dec 2025 10:39:14 +0100 Subject: [PATCH 11/43] Test base --- codex-rs/otel/tests/harness/mod.rs | 165 +++++++++++++++++++ codex-rs/otel/tests/suite/mod.rs | 3 + codex-rs/otel/tests/suite/send.rs | 207 ++++++++++++++++++++++++ codex-rs/otel/tests/suite/timing.rs | 113 +++++++++++++ codex-rs/otel/tests/suite/validation.rs | 130 +++++++++++++++ codex-rs/otel/tests/tests.rs | 2 + 6 files changed, 620 insertions(+) create mode 100644 codex-rs/otel/tests/harness/mod.rs create mode 100644 codex-rs/otel/tests/suite/mod.rs create mode 100644 codex-rs/otel/tests/suite/send.rs create mode 100644 codex-rs/otel/tests/suite/timing.rs create mode 100644 codex-rs/otel/tests/suite/validation.rs create mode 100644 codex-rs/otel/tests/tests.rs diff --git a/codex-rs/otel/tests/harness/mod.rs b/codex-rs/otel/tests/harness/mod.rs new file mode 100644 index 00000000000..3a51dd73574 --- /dev/null +++ b/codex-rs/otel/tests/harness/mod.rs @@ -0,0 +1,165 @@ +use serde_json::Value; +use std::collections::BTreeMap; +use std::io::Read; +use std::io::Write; +use std::net::TcpListener; +use std::net::TcpStream; +use std::thread; + +#[derive(Debug)] +pub(crate) struct CapturedRequest { + pub(crate) method: String, + pub(crate) path: String, + pub(crate) headers: BTreeMap, + pub(crate) body: Vec, +} + +#[derive(Debug)] +pub(crate) struct ParsedEnvelope { + pub(crate) header: Value, + pub(crate) item_header: Value, + pub(crate) payload: String, +} + +#[derive(Debug)] +pub(crate) struct ParsedStatsdLine { + pub(crate) name: String, + pub(crate) value: i64, + pub(crate) kind: String, + pub(crate) 
tags: BTreeMap, +} + +/// Spawn a simple HTTP server that captures one request and responds with `status`. +pub(crate) fn spawn_server(status: u16) -> (String, thread::JoinHandle) { + let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server"); + let addr = listener.local_addr().expect("local addr"); + let dsn = format!("http://public:@{addr}/123"); + + let handle = thread::spawn(move || { + let (mut stream, _) = listener.accept().expect("accept connection"); + let request = read_http_request(&mut stream); + let reason = match status { + 200 => "OK", + 500 => "Internal Server Error", + _ => "OK", + }; + let response = + format!("HTTP/1.1 {status} {reason}\r\nContent-Length: 0\r\nConnection: close\r\n\r\n"); + stream + .write_all(response.as_bytes()) + .expect("write response"); + request + }); + + (dsn, handle) +} + +// Read a single HTTP request from the stream and return the parsed data. +fn read_http_request(stream: &mut TcpStream) -> CapturedRequest { + let mut buffer = Vec::new(); + let mut chunk = [0_u8; 1024]; + let mut header_end = None; + while header_end.is_none() { + let read = stream.read(&mut chunk).expect("read request"); + if read == 0 { + break; + } + buffer.extend_from_slice(&chunk[..read]); + header_end = find_header_end(&buffer); + } + let header_end = header_end.expect("request headers"); + let headers_bytes = &buffer[..header_end]; + let headers_str = std::str::from_utf8(headers_bytes).expect("headers utf-8"); + let mut lines = headers_str.split("\r\n"); + let request_line = lines.next().expect("request line"); + let mut request_parts = request_line.split_whitespace(); + let method = request_parts.next().expect("method").to_string(); + let path = request_parts.next().expect("path").to_string(); + + let mut headers = BTreeMap::new(); + for line in lines { + if line.is_empty() { + continue; + } + if let Some((key, value)) = line.split_once(':') { + headers.insert(key.trim().to_ascii_lowercase(), value.trim().to_string()); + } + } + 
+ let content_length = headers + .get("content-length") + .and_then(|value| value.parse::().ok()) + .unwrap_or(0); + let mut body = buffer[header_end..].to_vec(); + while body.len() < content_length { + let read = stream.read(&mut chunk).expect("read body"); + if read == 0 { + break; + } + body.extend_from_slice(&chunk[..read]); + } + + CapturedRequest { + method, + path, + headers, + body, + } +} + +// Locate the end of the HTTP headers in a buffered request. +fn find_header_end(buffer: &[u8]) -> Option { + buffer + .windows(4) + .position(|window| window == b"\r\n\r\n") + .map(|pos| pos + 4) +} + +/// Parse a Sentry envelope payload into headers and statsd payload text. +pub(crate) fn parse_envelope(body: &[u8]) -> ParsedEnvelope { + let mut parts = body.splitn(3, |byte| *byte == b'\n'); + let header_line = parts.next().expect("envelope header"); + let item_header_line = parts.next().expect("item header"); + let payload = parts.next().unwrap_or(&[]); + + let header = serde_json::from_slice(header_line).expect("parse envelope header"); + let item_header = serde_json::from_slice(item_header_line).expect("parse item header"); + let payload = std::str::from_utf8(payload) + .expect("payload utf-8") + .trim_end_matches('\n') + .to_string(); + + ParsedEnvelope { + header, + item_header, + payload, + } +} + +/// Parse a single statsd line (with optional tags) into components. 
+pub(crate) fn parse_statsd_line(line: &str) -> ParsedStatsdLine { + let (metric, tags_part) = line + .split_once("|#") + .map(|(metric, tags)| (metric, Some(tags))) + .unwrap_or((line, None)); + let (name_value, kind) = metric.split_once('|').expect("metric kind"); + let (name, value) = name_value.split_once(':').expect("metric value"); + let value = value.parse::().expect("metric value parse"); + + let mut tags = BTreeMap::new(); + if let Some(tags_part) = tags_part + && !tags_part.is_empty() + { + for tag in tags_part.split(',') { + let (key, value) = tag.split_once(':').expect("tag"); + tags.insert(key.to_string(), value.to_string()); + } + } + + ParsedStatsdLine { + name: name.to_string(), + value, + kind: kind.to_string(), + tags, + } +} diff --git a/codex-rs/otel/tests/suite/mod.rs b/codex-rs/otel/tests/suite/mod.rs new file mode 100644 index 00000000000..42708df7981 --- /dev/null +++ b/codex-rs/otel/tests/suite/mod.rs @@ -0,0 +1,3 @@ +mod send; +mod timing; +mod validation; diff --git a/codex-rs/otel/tests/suite/send.rs b/codex-rs/otel/tests/suite/send.rs new file mode 100644 index 00000000000..1abfdde37aa --- /dev/null +++ b/codex-rs/otel/tests/suite/send.rs @@ -0,0 +1,207 @@ +use crate::harness::parse_envelope; +use crate::harness::parse_statsd_line; +use crate::harness::spawn_server; +use codex_metrics::HistogramBuckets; +use codex_metrics::MetricsClient; +use codex_metrics::MetricsConfig; +use codex_metrics::MetricsError; +use codex_metrics::Result; +use pretty_assertions::assert_eq; +use std::net::TcpListener; +use std::thread; +use std::time::Duration; + +// Ensures counters/histograms render with default + per-call tags. +#[test] +fn send_builds_payload_with_tags_and_histograms() -> Result<()> { + let (dsn, handle) = spawn_server(200); + let metrics = MetricsClient::new( + MetricsConfig::new(dsn.clone()) + .with_tag("service", "codex-cli")? 
+ .with_tag("env", "prod")?, + )?; + let buckets = HistogramBuckets::from_values(&[25, 50, 100])?; + + let mut batch = metrics.batch(); + batch.counter("codex.turns", 1, &[("model", "gpt-5.1"), ("env", "dev")])?; + batch.histogram("codex.tool_latency", 25, &buckets, &[("tool", "shell")])?; + metrics.send(batch)?; + metrics.shutdown()?; + + let captured = handle.join().expect("server thread"); + assert_eq!(captured.method, "POST"); + assert_eq!(captured.path, "/api/123/envelope/"); + assert_eq!( + captured.headers.get("content-type").map(String::as_str), + Some("application/x-sentry-envelope") + ); + + let envelope = parse_envelope(&captured.body); + assert_eq!(envelope.header["dsn"].as_str(), Some(dsn.as_str())); + assert_eq!(envelope.item_header["type"], "statsd"); + assert_eq!(envelope.item_header["content_type"], "text/plain"); + assert_eq!( + envelope.item_header["length"].as_u64(), + Some(envelope.payload.len() as u64) + ); + + let lines: Vec<&str> = envelope.payload.split('\n').collect(); + assert_eq!(lines.len(), 5); + + let line = parse_statsd_line(lines[0]); + assert_eq!(line.name, "codex.turns"); + assert_eq!(line.value, 1); + assert_eq!(line.kind, "c"); + assert_eq!( + line.tags.get("service").map(String::as_str), + Some("codex-cli") + ); + assert_eq!(line.tags.get("env").map(String::as_str), Some("dev")); + assert_eq!(line.tags.get("model").map(String::as_str), Some("gpt-5.1")); + + for (line, expected_le) in lines.iter().skip(1).zip(["25", "50", "100", "inf"]) { + let line = parse_statsd_line(line); + assert_eq!(line.name, "codex.tool_latency"); + assert_eq!(line.value, 1); + assert_eq!(line.kind, "c"); + assert_eq!( + line.tags.get("service").map(String::as_str), + Some("codex-cli") + ); + assert_eq!(line.tags.get("env").map(String::as_str), Some("prod")); + assert_eq!(line.tags.get("tool").map(String::as_str), Some("shell")); + assert_eq!(line.tags.get("le").map(String::as_str), Some(expected_le)); + } + + Ok(()) +} + +// Ensures defaults merge per 
line and overrides take precedence. +#[test] +fn send_merges_default_tags_per_line() -> Result<()> { + let (dsn, handle) = spawn_server(200); + let metrics = MetricsClient::new( + MetricsConfig::new(dsn.clone()) + .with_tag("service", "codex-cli")? + .with_tag("env", "prod")? + .with_tag("region", "us")?, + )?; + + let mut batch = metrics.batch(); + batch.counter("codex.alpha", 1, &[("env", "dev"), ("component", "alpha")])?; + batch.counter( + "codex.beta", + 2, + &[("service", "worker"), ("component", "beta")], + )?; + metrics.send(batch)?; + metrics.shutdown()?; + + let captured = handle.join().expect("server thread"); + let envelope = parse_envelope(&captured.body); + let lines: Vec<&str> = envelope.payload.split('\n').collect(); + assert_eq!(lines.len(), 2); + assert_eq!( + lines[0], + "codex.alpha:1|c|#component:alpha,env:dev,region:us,service:codex-cli" + ); + assert_eq!( + lines[1], + "codex.beta:2|c|#component:beta,env:prod,region:us,service:worker" + ); + + Ok(()) +} + +// Verifies values above the max bucket use the inf tag. +#[test] +fn send_uses_inf_bucket_for_values_over_max() -> Result<()> { + let (dsn, handle) = spawn_server(200); + let metrics = MetricsClient::new(MetricsConfig::new(dsn))?; + let buckets = HistogramBuckets::from_values(&[10, 20])?; + + let mut batch = metrics.batch(); + batch.histogram("codex.tool_latency", 99, &buckets, &[("tool", "shell")])?; + metrics.send(batch)?; + metrics.shutdown()?; + + let captured = handle.join().expect("server thread"); + let envelope = parse_envelope(&captured.body); + let lines: Vec<&str> = envelope.payload.split('\n').collect(); + assert_eq!(lines.len(), 1); + let line = parse_statsd_line(lines[0]); + assert_eq!(line.tags.get("le").map(String::as_str), Some("inf")); + + Ok(()) +} + +// Verifies enqueued batches are delivered by the background worker. 
+#[test] +fn client_sends_enqueued_batch() -> Result<()> { + let (dsn, handle) = spawn_server(200); + let metrics = MetricsClient::new(MetricsConfig::new(dsn))?; + + let mut batch = metrics.batch(); + batch.counter("codex.turns", 1, &[("model", "gpt-5.1")])?; + metrics.send(batch)?; + metrics.shutdown()?; + + let captured = handle.join().expect("server thread"); + let envelope = parse_envelope(&captured.body); + let lines: Vec<&str> = envelope.payload.split('\n').collect(); + assert_eq!(lines.len(), 1); + + let line = parse_statsd_line(lines[0]); + assert_eq!(line.name, "codex.turns"); + assert_eq!(line.value, 1); + assert_eq!(line.kind, "c"); + assert_eq!(line.tags.get("model").map(String::as_str), Some("gpt-5.1")); + + Ok(()) +} + +// Ensures a non-success response panics in debug builds via error_or_panic. +#[test] +fn send_panics_on_non_success_status_in_debug() -> Result<()> { + let (dsn, handle) = spawn_server(500); + let metrics = MetricsClient::new(MetricsConfig::new(dsn))?; + + let mut batch = metrics.batch(); + batch.counter("codex.turns", 1, &[])?; + metrics.send(batch)?; + let err = metrics.shutdown().unwrap_err(); + assert!(matches!(err, MetricsError::WorkerPanicked)); + + let captured = handle.join().expect("server thread"); + assert_eq!(captured.method, "POST"); + Ok(()) +} + +// Ensures empty batches do not trigger any HTTP request. 
+#[test] +fn client_core_skips_empty_batch() -> Result<()> { + let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server"); + listener.set_nonblocking(true).expect("set nonblocking"); + let addr = listener.local_addr().expect("local addr"); + let dsn = format!("http://public:@{addr}/123"); + let metrics = MetricsClient::new(MetricsConfig::new(dsn))?; + + metrics.send(metrics.batch())?; + metrics.shutdown()?; + + let mut saw_connection = false; + for _ in 0..10 { + match listener.accept() { + Ok(_) => { + saw_connection = true; + break; + } + Err(err) if err.kind() == std::io::ErrorKind::WouldBlock => { + thread::sleep(Duration::from_millis(10)); + } + Err(err) => panic!("unexpected accept error: {err}"), + } + } + assert!(!saw_connection, "expected no request for empty batch"); + Ok(()) +} diff --git a/codex-rs/otel/tests/suite/timing.rs b/codex-rs/otel/tests/suite/timing.rs new file mode 100644 index 00000000000..938bdefaf14 --- /dev/null +++ b/codex-rs/otel/tests/suite/timing.rs @@ -0,0 +1,113 @@ +use crate::harness::parse_envelope; +use crate::harness::parse_statsd_line; +use crate::harness::spawn_server; +use codex_metrics::HistogramBuckets; +use codex_metrics::MetricsClient; +use codex_metrics::MetricsConfig; +use codex_metrics::MetricsError; +use codex_metrics::Result; +use pretty_assertions::assert_eq; +use std::time::Duration; + +// Ensures duration recording maps to the expected bucket tag. 
+#[test] +fn record_duration_uses_matching_bucket() -> Result<()> { + let (dsn, handle) = spawn_server(200); + let metrics = MetricsClient::new(MetricsConfig::new(dsn))?; + let buckets = HistogramBuckets::from_values(&[10, 20])?; + + metrics.record_duration( + "codex.request_latency", + Duration::from_millis(15), + &buckets, + &[("route", "chat")], + )?; + metrics.shutdown()?; + + let captured = handle.join().expect("server thread"); + let envelope = parse_envelope(&captured.body); + let lines: Vec<&str> = envelope.payload.split('\n').collect(); + assert_eq!(lines.len(), 2); + + let line = parse_statsd_line(lines[0]); + assert_eq!(line.name, "codex.request_latency"); + assert_eq!(line.tags.get("route").map(String::as_str), Some("chat")); + assert_eq!(line.tags.get("le").map(String::as_str), Some("20")); + + let line = parse_statsd_line(lines[1]); + assert_eq!(line.tags.get("le").map(String::as_str), Some("inf")); + + Ok(()) +} + +// Ensures time_result returns the closure output and records timing. 
+#[test] +fn time_result_records_success() -> Result<()> { + let (dsn, handle) = spawn_server(200); + let metrics = MetricsClient::new(MetricsConfig::new(dsn))?; + let buckets = HistogramBuckets::from_values(&[10, 20])?; + + let value = metrics.time_result( + "codex.request_latency", + &buckets, + &[("route", "chat")], + || Ok("ok"), + )?; + assert_eq!(value, "ok"); + metrics.shutdown()?; + + let captured = handle.join().expect("server thread"); + let envelope = parse_envelope(&captured.body); + let lines: Vec<&str> = envelope.payload.split('\n').collect(); + assert!(!lines.is_empty()); + let parsed: Vec<_> = lines.iter().copied().map(parse_statsd_line).collect(); + assert!( + parsed + .iter() + .any(|line| { line.tags.get("le").map(String::as_str) == Some("inf") }) + ); + for line in parsed { + assert_eq!(line.name, "codex.request_latency"); + assert_eq!(line.tags.get("route").map(String::as_str), Some("chat")); + assert!(line.tags.contains_key("le")); + } + + Ok(()) +} + +// Ensures time_result propagates errors but still records timing. 
+#[test] +fn time_result_records_on_error() -> Result<()> { + let (dsn, handle) = spawn_server(200); + let metrics = MetricsClient::new(MetricsConfig::new(dsn))?; + let buckets = HistogramBuckets::from_values(&[10, 20])?; + + let err = metrics + .time_result( + "codex.request_latency", + &buckets, + &[("route", "chat")], + || -> Result<&'static str> { Err(MetricsError::EmptyMetricName) }, + ) + .unwrap_err(); + assert!(matches!(err, MetricsError::EmptyMetricName)); + metrics.shutdown()?; + + let captured = handle.join().expect("server thread"); + let envelope = parse_envelope(&captured.body); + let lines: Vec<&str> = envelope.payload.split('\n').collect(); + assert!(!lines.is_empty()); + let parsed: Vec<_> = lines.iter().copied().map(parse_statsd_line).collect(); + assert!( + parsed + .iter() + .any(|line| { line.tags.get("le").map(String::as_str) == Some("inf") }) + ); + for line in parsed { + assert_eq!(line.name, "codex.request_latency"); + assert_eq!(line.tags.get("route").map(String::as_str), Some("chat")); + assert!(line.tags.contains_key("le")); + } + + Ok(()) +} diff --git a/codex-rs/otel/tests/suite/validation.rs b/codex-rs/otel/tests/suite/validation.rs new file mode 100644 index 00000000000..2383b774539 --- /dev/null +++ b/codex-rs/otel/tests/suite/validation.rs @@ -0,0 +1,130 @@ +use codex_metrics::HistogramBuckets; +use codex_metrics::MetricsBatch; +use codex_metrics::MetricsClient; +use codex_metrics::MetricsConfig; +use codex_metrics::MetricsError; +use codex_metrics::Result; + +// Validates invalid DSNs are rejected early. +#[test] +fn invalid_dsn_reports_error() -> Result<()> { + assert!(matches!( + MetricsClient::new(MetricsConfig::new("not a dsn")), + Err(MetricsError::InvalidDsn { .. }) + )); + Ok(()) +} + +// Ensures invalid tag components are rejected during config build. 
+#[test] +fn invalid_tag_component_is_rejected() -> Result<()> { + let err = MetricsConfig::default() + .with_tag("bad key", "value") + .unwrap_err(); + assert!(matches!( + err, + MetricsError::InvalidTagComponent { label, value } + if label == "tag key" && value == "bad key" + )); + Ok(()) +} + +// Ensures the reserved histogram bucketing tag key is rejected in config defaults. +#[test] +fn reserved_tag_key_is_rejected_in_config() -> Result<()> { + let err = MetricsConfig::default().with_tag("le", "10").unwrap_err(); + assert!(matches!( + err, + MetricsError::ReservedTagKey { key } if key == "le" + )); + Ok(()) +} + +// Ensures per-metric tag keys are validated. +#[test] +fn counter_rejects_invalid_tag_key() { + let mut batch = MetricsBatch::new(); + let err = batch + .counter("codex.turns", 1, &[("bad key", "value")]) + .unwrap_err(); + assert!(matches!( + err, + MetricsError::InvalidTagComponent { label, value } + if label == "tag key" && value == "bad key" + )); +} + +// Ensures per-metric tag keys cannot use reserved histogram bucketing keys. +#[test] +fn counter_rejects_reserved_tag_key() { + let mut batch = MetricsBatch::new(); + let err = batch + .counter("codex.turns", 1, &[("le", "10")]) + .unwrap_err(); + assert!(matches!( + err, + MetricsError::ReservedTagKey { key } if key == "le" + )); +} + +// Ensures per-metric tag values are validated. +#[test] +fn histogram_rejects_invalid_tag_value() -> Result<()> { + let mut batch = MetricsBatch::new(); + let buckets = HistogramBuckets::from_values(&[10])?; + let err = batch + .histogram( + "codex.request_latency", + 3, + &buckets, + &[("route", "bad value")], + ) + .unwrap_err(); + assert!(matches!( + err, + MetricsError::InvalidTagComponent { label, value } + if label == "tag value" && value == "bad value" + )); + Ok(()) +} + +// Ensures histogram calls reject reserved tag keys even though they internally add `le`. 
+#[test] +fn histogram_rejects_reserved_tag_key() -> Result<()> { + let mut batch = MetricsBatch::new(); + let buckets = HistogramBuckets::from_values(&[10])?; + let err = batch + .histogram("codex.request_latency", 3, &buckets, &[("le", "10")]) + .unwrap_err(); + assert!(matches!( + err, + MetricsError::ReservedTagKey { key } if key == "le" + )); + Ok(()) +} + +// Ensures invalid metric names are rejected when building a batch. +#[test] +fn counter_rejects_invalid_metric_name() -> Result<()> { + let mut batch = MetricsBatch::new(); + let err = batch.counter("bad name", 1, &[]).unwrap_err(); + assert!(matches!( + err, + MetricsError::InvalidMetricName { name } if name == "bad name" + )); + Ok(()) +} + +// Ensures empty histogram bucket lists are rejected. +#[test] +fn empty_buckets_are_rejected() { + let err = HistogramBuckets::from_values(&[]).unwrap_err(); + assert!(matches!(err, MetricsError::EmptyBuckets)); +} + +// Ensures range overflow is detected when building buckets. +#[test] +fn range_overflow_is_reported() { + let err = HistogramBuckets::from_range(i64::MAX - 1, i64::MAX, 2).unwrap_err(); + assert!(matches!(err, MetricsError::BucketRangeOverflow { .. 
})); +} diff --git a/codex-rs/otel/tests/tests.rs b/codex-rs/otel/tests/tests.rs new file mode 100644 index 00000000000..92f88b95fd8 --- /dev/null +++ b/codex-rs/otel/tests/tests.rs @@ -0,0 +1,2 @@ +mod harness; +mod suite; From 3d5ee777cc584669c5fbfc7755c9312de0d3f803 Mon Sep 17 00:00:00 2001 From: jif-oai Date: Mon, 22 Dec 2025 11:07:10 +0100 Subject: [PATCH 12/43] Test base 1 --- codex-rs/otel/src/metrics/client.rs | 1 - codex-rs/otel/src/metrics/config.rs | 4 +- codex-rs/otel/tests/harness/mod.rs | 213 ++++++------------ codex-rs/otel/tests/suite/send.rs | 285 +++++++++++++----------- codex-rs/otel/tests/suite/timing.rs | 114 +++++----- codex-rs/otel/tests/suite/validation.rs | 16 +- 6 files changed, 289 insertions(+), 344 deletions(-) diff --git a/codex-rs/otel/src/metrics/client.rs b/codex-rs/otel/src/metrics/client.rs index 49a05765abd..b7edf20ca16 100644 --- a/codex-rs/otel/src/metrics/client.rs +++ b/codex-rs/otel/src/metrics/client.rs @@ -326,7 +326,6 @@ impl Drop for MetricsClient { fn build_meter_provider(config: &MetricsConfig) -> Result { match &config.exporter { MetricsExporter::OtlpHttp => build_otlp_http_provider(config), - #[cfg(test)] MetricsExporter::InMemory(exporter) => { let reader = PeriodicReader::builder(exporter.clone()).build(); Ok(SdkMeterProvider::builder().with_reader(reader).build()) diff --git a/codex-rs/otel/src/metrics/config.rs b/codex-rs/otel/src/metrics/config.rs index aeff931f26c..60dd0e4ab45 100644 --- a/codex-rs/otel/src/metrics/config.rs +++ b/codex-rs/otel/src/metrics/config.rs @@ -12,7 +12,6 @@ use std::time::Duration; #[derive(Clone, Debug)] pub(crate) enum MetricsExporter { OtlpHttp, - #[cfg(test)] InMemory(opentelemetry_sdk::metrics::InMemoryMetricExporter), } @@ -83,8 +82,7 @@ impl MetricsConfig { self } - #[cfg(test)] - pub(crate) fn with_in_memory_exporter( + pub fn with_in_memory_exporter( mut self, exporter: opentelemetry_sdk::metrics::InMemoryMetricExporter, ) -> Self { diff --git 
a/codex-rs/otel/tests/harness/mod.rs b/codex-rs/otel/tests/harness/mod.rs index 3a51dd73574..6129e2d42b5 100644 --- a/codex-rs/otel/tests/harness/mod.rs +++ b/codex-rs/otel/tests/harness/mod.rs @@ -1,165 +1,76 @@ -use serde_json::Value; +use codex_otel::metrics::MetricsClient; +use codex_otel::metrics::MetricsConfig; +use codex_otel::metrics::Result; +use opentelemetry::KeyValue; +use opentelemetry_sdk::metrics::InMemoryMetricExporter; +use opentelemetry_sdk::metrics::data::AggregatedMetrics; +use opentelemetry_sdk::metrics::data::Metric; +use opentelemetry_sdk::metrics::data::MetricData; +use opentelemetry_sdk::metrics::data::ResourceMetrics; use std::collections::BTreeMap; -use std::io::Read; -use std::io::Write; -use std::net::TcpListener; -use std::net::TcpStream; -use std::thread; -#[derive(Debug)] -pub(crate) struct CapturedRequest { - pub(crate) method: String, - pub(crate) path: String, - pub(crate) headers: BTreeMap, - pub(crate) body: Vec, -} - -#[derive(Debug)] -pub(crate) struct ParsedEnvelope { - pub(crate) header: Value, - pub(crate) item_header: Value, - pub(crate) payload: String, -} - -#[derive(Debug)] -pub(crate) struct ParsedStatsdLine { - pub(crate) name: String, - pub(crate) value: i64, - pub(crate) kind: String, - pub(crate) tags: BTreeMap, +pub(crate) fn build_metrics_with_defaults( + default_tags: &[(&str, &str)], +) -> Result<(MetricsClient, InMemoryMetricExporter)> { + let exporter = InMemoryMetricExporter::default(); + let mut config = MetricsConfig::new("test-key").with_in_memory_exporter(exporter.clone()); + for (key, value) in default_tags { + config = config.with_tag(*key, *value)?; + } + let metrics = MetricsClient::new(config)?; + Ok((metrics, exporter)) } -/// Spawn a simple HTTP server that captures one request and responds with `status`. 
-pub(crate) fn spawn_server(status: u16) -> (String, thread::JoinHandle) { - let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server"); - let addr = listener.local_addr().expect("local addr"); - let dsn = format!("http://public:@{addr}/123"); - - let handle = thread::spawn(move || { - let (mut stream, _) = listener.accept().expect("accept connection"); - let request = read_http_request(&mut stream); - let reason = match status { - 200 => "OK", - 500 => "Internal Server Error", - _ => "OK", - }; - let response = - format!("HTTP/1.1 {status} {reason}\r\nContent-Length: 0\r\nConnection: close\r\n\r\n"); - stream - .write_all(response.as_bytes()) - .expect("write response"); - request - }); - - (dsn, handle) +pub(crate) fn latest_metrics(exporter: &InMemoryMetricExporter) -> ResourceMetrics { + let Ok(metrics) = exporter.get_finished_metrics() else { + panic!("finished metrics error"); + }; + let Some(metrics) = metrics.into_iter().last() else { + panic!("metrics export missing"); + }; + metrics } -// Read a single HTTP request from the stream and return the parsed data. 
-fn read_http_request(stream: &mut TcpStream) -> CapturedRequest { - let mut buffer = Vec::new(); - let mut chunk = [0_u8; 1024]; - let mut header_end = None; - while header_end.is_none() { - let read = stream.read(&mut chunk).expect("read request"); - if read == 0 { - break; +pub(crate) fn find_metric<'a>( + resource_metrics: &'a ResourceMetrics, + name: &str, +) -> Option<&'a Metric> { + for scope_metrics in resource_metrics.scope_metrics() { + for metric in scope_metrics.metrics() { + if metric.name() == name { + return Some(metric); + } } - buffer.extend_from_slice(&chunk[..read]); - header_end = find_header_end(&buffer); - } - let header_end = header_end.expect("request headers"); - let headers_bytes = &buffer[..header_end]; - let headers_str = std::str::from_utf8(headers_bytes).expect("headers utf-8"); - let mut lines = headers_str.split("\r\n"); - let request_line = lines.next().expect("request line"); - let mut request_parts = request_line.split_whitespace(); - let method = request_parts.next().expect("method").to_string(); - let path = request_parts.next().expect("path").to_string(); - - let mut headers = BTreeMap::new(); - for line in lines { - if line.is_empty() { - continue; - } - if let Some((key, value)) = line.split_once(':') { - headers.insert(key.trim().to_ascii_lowercase(), value.trim().to_string()); - } - } - - let content_length = headers - .get("content-length") - .and_then(|value| value.parse::().ok()) - .unwrap_or(0); - let mut body = buffer[header_end..].to_vec(); - while body.len() < content_length { - let read = stream.read(&mut chunk).expect("read body"); - if read == 0 { - break; - } - body.extend_from_slice(&chunk[..read]); - } - - CapturedRequest { - method, - path, - headers, - body, } + None } -// Locate the end of the HTTP headers in a buffered request. 
-fn find_header_end(buffer: &[u8]) -> Option { - buffer - .windows(4) - .position(|window| window == b"\r\n\r\n") - .map(|pos| pos + 4) -} - -/// Parse a Sentry envelope payload into headers and statsd payload text. -pub(crate) fn parse_envelope(body: &[u8]) -> ParsedEnvelope { - let mut parts = body.splitn(3, |byte| *byte == b'\n'); - let header_line = parts.next().expect("envelope header"); - let item_header_line = parts.next().expect("item header"); - let payload = parts.next().unwrap_or(&[]); - - let header = serde_json::from_slice(header_line).expect("parse envelope header"); - let item_header = serde_json::from_slice(item_header_line).expect("parse item header"); - let payload = std::str::from_utf8(payload) - .expect("payload utf-8") - .trim_end_matches('\n') - .to_string(); - - ParsedEnvelope { - header, - item_header, - payload, - } +pub(crate) fn attributes_to_map<'a>( + attributes: impl Iterator, +) -> BTreeMap { + attributes + .map(|kv| (kv.key.as_str().to_string(), kv.value.as_str().to_string())) + .collect() } -/// Parse a single statsd line (with optional tags) into components. 
-pub(crate) fn parse_statsd_line(line: &str) -> ParsedStatsdLine { - let (metric, tags_part) = line - .split_once("|#") - .map(|(metric, tags)| (metric, Some(tags))) - .unwrap_or((line, None)); - let (name_value, kind) = metric.split_once('|').expect("metric kind"); - let (name, value) = name_value.split_once(':').expect("metric value"); - let value = value.parse::().expect("metric value parse"); - - let mut tags = BTreeMap::new(); - if let Some(tags_part) = tags_part - && !tags_part.is_empty() - { - for tag in tags_part.split(',') { - let (key, value) = tag.split_once(':').expect("tag"); - tags.insert(key.to_string(), value.to_string()); - } - } - - ParsedStatsdLine { - name: name.to_string(), - value, - kind: kind.to_string(), - tags, +pub(crate) fn histogram_data( + resource_metrics: &ResourceMetrics, + name: &str, +) -> (Vec, Vec, f64, u64) { + let metric = + find_metric(resource_metrics, name).unwrap_or_else(|| panic!("metric {name} missing")); + match metric.data() { + AggregatedMetrics::F64(data) => match data { + MetricData::Histogram(histogram) => { + let points: Vec<_> = histogram.data_points().collect(); + assert_eq!(points.len(), 1); + let point = points[0]; + let bounds = point.bounds().collect(); + let bucket_counts = point.bucket_counts().collect(); + (bounds, bucket_counts, point.sum(), point.count()) + } + _ => panic!("unexpected histogram aggregation"), + }, + _ => panic!("unexpected metric data type"), } } diff --git a/codex-rs/otel/tests/suite/send.rs b/codex-rs/otel/tests/suite/send.rs index 1abfdde37aa..52e58c1aff5 100644 --- a/codex-rs/otel/tests/suite/send.rs +++ b/codex-rs/otel/tests/suite/send.rs @@ -1,25 +1,19 @@ -use crate::harness::parse_envelope; -use crate::harness::parse_statsd_line; -use crate::harness::spawn_server; -use codex_metrics::HistogramBuckets; -use codex_metrics::MetricsClient; -use codex_metrics::MetricsConfig; -use codex_metrics::MetricsError; -use codex_metrics::Result; +use crate::harness::attributes_to_map; +use 
crate::harness::build_metrics_with_defaults; +use crate::harness::find_metric; +use crate::harness::histogram_data; +use crate::harness::latest_metrics; +use codex_otel::metrics::HistogramBuckets; +use codex_otel::metrics::MetricsBatch; +use codex_otel::metrics::Result; use pretty_assertions::assert_eq; -use std::net::TcpListener; -use std::thread; -use std::time::Duration; +use std::collections::BTreeMap; // Ensures counters/histograms render with default + per-call tags. #[test] fn send_builds_payload_with_tags_and_histograms() -> Result<()> { - let (dsn, handle) = spawn_server(200); - let metrics = MetricsClient::new( - MetricsConfig::new(dsn.clone()) - .with_tag("service", "codex-cli")? - .with_tag("env", "prod")?, - )?; + let (metrics, exporter) = + build_metrics_with_defaults(&[("service", "codex-cli"), ("env", "prod")])?; let buckets = HistogramBuckets::from_values(&[25, 50, 100])?; let mut batch = metrics.batch(); @@ -28,50 +22,58 @@ fn send_builds_payload_with_tags_and_histograms() -> Result<()> { metrics.send(batch)?; metrics.shutdown()?; - let captured = handle.join().expect("server thread"); - assert_eq!(captured.method, "POST"); - assert_eq!(captured.path, "/api/123/envelope/"); - assert_eq!( - captured.headers.get("content-type").map(String::as_str), - Some("application/x-sentry-envelope") - ); - - let envelope = parse_envelope(&captured.body); - assert_eq!(envelope.header["dsn"].as_str(), Some(dsn.as_str())); - assert_eq!(envelope.item_header["type"], "statsd"); - assert_eq!(envelope.item_header["content_type"], "text/plain"); - assert_eq!( - envelope.item_header["length"].as_u64(), - Some(envelope.payload.len() as u64) - ); - - let lines: Vec<&str> = envelope.payload.split('\n').collect(); - assert_eq!(lines.len(), 5); + let resource_metrics = latest_metrics(&exporter); - let line = parse_statsd_line(lines[0]); - assert_eq!(line.name, "codex.turns"); - assert_eq!(line.value, 1); - assert_eq!(line.kind, "c"); - assert_eq!( - 
line.tags.get("service").map(String::as_str), - Some("codex-cli") + let counter = find_metric(&resource_metrics, "codex.turns").expect("counter metric missing"); + let counter_attributes = match counter.data() { + opentelemetry_sdk::metrics::data::AggregatedMetrics::I64(data) => match data { + opentelemetry_sdk::metrics::data::MetricData::Sum(sum) => { + let points: Vec<_> = sum.data_points().collect(); + assert_eq!(points.len(), 1); + assert_eq!(points[0].value(), 1); + attributes_to_map(points[0].attributes()) + } + _ => panic!("unexpected counter aggregation"), + }, + _ => panic!("unexpected counter data type"), + }; + + let expected_counter_attributes = BTreeMap::from([ + ("service".to_string(), "codex-cli".to_string()), + ("env".to_string(), "dev".to_string()), + ("model".to_string(), "gpt-5.1".to_string()), + ]); + assert_eq!(counter_attributes, expected_counter_attributes); + + let (bounds, bucket_counts, sum, count) = + histogram_data(&resource_metrics, "codex.tool_latency"); + assert!(!bounds.is_empty()); + assert_eq!(bucket_counts.iter().sum::(), 1); + assert_eq!(sum, 25.0); + assert_eq!(count, 1); + + let histogram_attrs = attributes_to_map( + match find_metric(&resource_metrics, "codex.tool_latency").and_then(|metric| { + match metric.data() { + opentelemetry_sdk::metrics::data::AggregatedMetrics::F64(data) => match data { + opentelemetry_sdk::metrics::data::MetricData::Histogram(histogram) => { + histogram.data_points().next().map(|p| p.attributes()) + } + _ => None, + }, + _ => None, + } + }) { + Some(attrs) => attrs, + None => panic!("histogram attributes missing"), + }, ); - assert_eq!(line.tags.get("env").map(String::as_str), Some("dev")); - assert_eq!(line.tags.get("model").map(String::as_str), Some("gpt-5.1")); - - for (line, expected_le) in lines.iter().skip(1).zip(["25", "50", "100", "inf"]) { - let line = parse_statsd_line(line); - assert_eq!(line.name, "codex.tool_latency"); - assert_eq!(line.value, 1); - assert_eq!(line.kind, "c"); - 
assert_eq!( - line.tags.get("service").map(String::as_str), - Some("codex-cli") - ); - assert_eq!(line.tags.get("env").map(String::as_str), Some("prod")); - assert_eq!(line.tags.get("tool").map(String::as_str), Some("shell")); - assert_eq!(line.tags.get("le").map(String::as_str), Some(expected_le)); - } + let expected_histogram_attributes = BTreeMap::from([ + ("service".to_string(), "codex-cli".to_string()), + ("env".to_string(), "prod".to_string()), + ("tool".to_string(), "shell".to_string()), + ]); + assert_eq!(histogram_attrs, expected_histogram_attributes); Ok(()) } @@ -79,13 +81,11 @@ fn send_builds_payload_with_tags_and_histograms() -> Result<()> { // Ensures defaults merge per line and overrides take precedence. #[test] fn send_merges_default_tags_per_line() -> Result<()> { - let (dsn, handle) = spawn_server(200); - let metrics = MetricsClient::new( - MetricsConfig::new(dsn.clone()) - .with_tag("service", "codex-cli")? - .with_tag("env", "prod")? - .with_tag("region", "us")?, - )?; + let (metrics, exporter) = build_metrics_with_defaults(&[ + ("service", "codex-cli"), + ("env", "prod"), + ("region", "us"), + ])?; let mut batch = metrics.batch(); batch.counter("codex.alpha", 1, &[("env", "dev"), ("component", "alpha")])?; @@ -97,27 +97,60 @@ fn send_merges_default_tags_per_line() -> Result<()> { metrics.send(batch)?; metrics.shutdown()?; - let captured = handle.join().expect("server thread"); - let envelope = parse_envelope(&captured.body); - let lines: Vec<&str> = envelope.payload.split('\n').collect(); - assert_eq!(lines.len(), 2); - assert_eq!( - lines[0], - "codex.alpha:1|c|#component:alpha,env:dev,region:us,service:codex-cli" - ); - assert_eq!( - lines[1], - "codex.beta:2|c|#component:beta,env:prod,region:us,service:worker" - ); + let resource_metrics = latest_metrics(&exporter); + let alpha_metric = + find_metric(&resource_metrics, "codex.alpha").expect("codex.alpha metric missing"); + let alpha_point = match alpha_metric.data() { + 
opentelemetry_sdk::metrics::data::AggregatedMetrics::I64(data) => match data { + opentelemetry_sdk::metrics::data::MetricData::Sum(sum) => { + let points: Vec<_> = sum.data_points().collect(); + assert_eq!(points.len(), 1); + points[0] + } + _ => panic!("unexpected counter aggregation"), + }, + _ => panic!("unexpected counter data type"), + }; + assert_eq!(alpha_point.value(), 1); + let alpha_attrs = attributes_to_map(alpha_point.attributes()); + let expected_alpha_attrs = BTreeMap::from([ + ("component".to_string(), "alpha".to_string()), + ("env".to_string(), "dev".to_string()), + ("region".to_string(), "us".to_string()), + ("service".to_string(), "codex-cli".to_string()), + ]); + assert_eq!(alpha_attrs, expected_alpha_attrs); + + let beta_metric = + find_metric(&resource_metrics, "codex.beta").expect("codex.beta metric missing"); + let beta_point = match beta_metric.data() { + opentelemetry_sdk::metrics::data::AggregatedMetrics::I64(data) => match data { + opentelemetry_sdk::metrics::data::MetricData::Sum(sum) => { + let points: Vec<_> = sum.data_points().collect(); + assert_eq!(points.len(), 1); + points[0] + } + _ => panic!("unexpected counter aggregation"), + }, + _ => panic!("unexpected counter data type"), + }; + assert_eq!(beta_point.value(), 2); + let beta_attrs = attributes_to_map(beta_point.attributes()); + let expected_beta_attrs = BTreeMap::from([ + ("component".to_string(), "beta".to_string()), + ("env".to_string(), "prod".to_string()), + ("region".to_string(), "us".to_string()), + ("service".to_string(), "worker".to_string()), + ]); + assert_eq!(beta_attrs, expected_beta_attrs); Ok(()) } -// Verifies values above the max bucket use the inf tag. +// Verifies values above the max bucket use the inf bucket. 
#[test] fn send_uses_inf_bucket_for_values_over_max() -> Result<()> { - let (dsn, handle) = spawn_server(200); - let metrics = MetricsClient::new(MetricsConfig::new(dsn))?; + let (metrics, exporter) = build_metrics_with_defaults(&[])?; let buckets = HistogramBuckets::from_values(&[10, 20])?; let mut batch = metrics.batch(); @@ -125,12 +158,12 @@ fn send_uses_inf_bucket_for_values_over_max() -> Result<()> { metrics.send(batch)?; metrics.shutdown()?; - let captured = handle.join().expect("server thread"); - let envelope = parse_envelope(&captured.body); - let lines: Vec<&str> = envelope.payload.split('\n').collect(); - assert_eq!(lines.len(), 1); - let line = parse_statsd_line(lines[0]); - assert_eq!(line.tags.get("le").map(String::as_str), Some("inf")); + let (bounds, bucket_counts, sum, count) = + histogram_data(&latest_metrics(&exporter), "codex.tool_latency"); + assert!(!bounds.is_empty()); + assert_eq!(bucket_counts.iter().sum::(), 1); + assert_eq!(sum, 99.0); + assert_eq!(count, 1); Ok(()) } @@ -138,70 +171,68 @@ fn send_uses_inf_bucket_for_values_over_max() -> Result<()> { // Verifies enqueued batches are delivered by the background worker. 
#[test] fn client_sends_enqueued_batch() -> Result<()> { - let (dsn, handle) = spawn_server(200); - let metrics = MetricsClient::new(MetricsConfig::new(dsn))?; + let (metrics, exporter) = build_metrics_with_defaults(&[])?; let mut batch = metrics.batch(); batch.counter("codex.turns", 1, &[("model", "gpt-5.1")])?; metrics.send(batch)?; metrics.shutdown()?; - let captured = handle.join().expect("server thread"); - let envelope = parse_envelope(&captured.body); - let lines: Vec<&str> = envelope.payload.split('\n').collect(); - assert_eq!(lines.len(), 1); - - let line = parse_statsd_line(lines[0]); - assert_eq!(line.name, "codex.turns"); - assert_eq!(line.value, 1); - assert_eq!(line.kind, "c"); - assert_eq!(line.tags.get("model").map(String::as_str), Some("gpt-5.1")); + let resource_metrics = latest_metrics(&exporter); + let counter = find_metric(&resource_metrics, "codex.turns").expect("counter metric missing"); + let points = match counter.data() { + opentelemetry_sdk::metrics::data::AggregatedMetrics::I64(data) => match data { + opentelemetry_sdk::metrics::data::MetricData::Sum(sum) => { + sum.data_points().collect::>() + } + _ => panic!("unexpected counter aggregation"), + }, + _ => panic!("unexpected counter data type"), + }; + assert_eq!(points.len(), 1); + let point = points[0]; + assert_eq!(point.value(), 1); + let attrs = attributes_to_map(point.attributes()); + assert_eq!(attrs.get("model").map(String::as_str), Some("gpt-5.1")); Ok(()) } -// Ensures a non-success response panics in debug builds via error_or_panic. +// Ensures shutdown flushes successfully with in-memory exporters. 
#[test] fn send_panics_on_non_success_status_in_debug() -> Result<()> { - let (dsn, handle) = spawn_server(500); - let metrics = MetricsClient::new(MetricsConfig::new(dsn))?; + let (metrics, exporter) = build_metrics_with_defaults(&[])?; let mut batch = metrics.batch(); batch.counter("codex.turns", 1, &[])?; metrics.send(batch)?; - let err = metrics.shutdown().unwrap_err(); - assert!(matches!(err, MetricsError::WorkerPanicked)); + metrics.shutdown()?; + + let resource_metrics = latest_metrics(&exporter); + let counter = find_metric(&resource_metrics, "codex.turns").expect("counter metric missing"); + let points = match counter.data() { + opentelemetry_sdk::metrics::data::AggregatedMetrics::I64(data) => match data { + opentelemetry_sdk::metrics::data::MetricData::Sum(sum) => { + sum.data_points().collect::>() + } + _ => panic!("unexpected counter aggregation"), + }, + _ => panic!("unexpected counter data type"), + }; + assert_eq!(points.len(), 1); - let captured = handle.join().expect("server thread"); - assert_eq!(captured.method, "POST"); Ok(()) } -// Ensures empty batches do not trigger any HTTP request. +// Ensures empty batches do not trigger any export. 
#[test] fn client_core_skips_empty_batch() -> Result<()> { - let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server"); - listener.set_nonblocking(true).expect("set nonblocking"); - let addr = listener.local_addr().expect("local addr"); - let dsn = format!("http://public:@{addr}/123"); - let metrics = MetricsClient::new(MetricsConfig::new(dsn))?; + let (metrics, exporter) = build_metrics_with_defaults(&[])?; - metrics.send(metrics.batch())?; + metrics.send(MetricsBatch::new())?; metrics.shutdown()?; - let mut saw_connection = false; - for _ in 0..10 { - match listener.accept() { - Ok(_) => { - saw_connection = true; - break; - } - Err(err) if err.kind() == std::io::ErrorKind::WouldBlock => { - thread::sleep(Duration::from_millis(10)); - } - Err(err) => panic!("unexpected accept error: {err}"), - } - } - assert!(!saw_connection, "expected no request for empty batch"); + let finished = exporter.get_finished_metrics().unwrap(); + assert!(finished.is_empty(), "expected no metrics exported"); Ok(()) } diff --git a/codex-rs/otel/tests/suite/timing.rs b/codex-rs/otel/tests/suite/timing.rs index 938bdefaf14..3b6a74232e4 100644 --- a/codex-rs/otel/tests/suite/timing.rs +++ b/codex-rs/otel/tests/suite/timing.rs @@ -1,19 +1,17 @@ -use crate::harness::parse_envelope; -use crate::harness::parse_statsd_line; -use crate::harness::spawn_server; -use codex_metrics::HistogramBuckets; -use codex_metrics::MetricsClient; -use codex_metrics::MetricsConfig; -use codex_metrics::MetricsError; -use codex_metrics::Result; +use crate::harness::attributes_to_map; +use crate::harness::build_metrics_with_defaults; +use crate::harness::histogram_data; +use crate::harness::latest_metrics; +use codex_otel::metrics::HistogramBuckets; +use codex_otel::metrics::MetricsError; +use codex_otel::metrics::Result; use pretty_assertions::assert_eq; use std::time::Duration; // Ensures duration recording maps to the expected bucket tag. 
#[test] fn record_duration_uses_matching_bucket() -> Result<()> { - let (dsn, handle) = spawn_server(200); - let metrics = MetricsClient::new(MetricsConfig::new(dsn))?; + let (metrics, exporter) = build_metrics_with_defaults(&[])?; let buckets = HistogramBuckets::from_values(&[10, 20])?; metrics.record_duration( @@ -24,18 +22,12 @@ fn record_duration_uses_matching_bucket() -> Result<()> { )?; metrics.shutdown()?; - let captured = handle.join().expect("server thread"); - let envelope = parse_envelope(&captured.body); - let lines: Vec<&str> = envelope.payload.split('\n').collect(); - assert_eq!(lines.len(), 2); - - let line = parse_statsd_line(lines[0]); - assert_eq!(line.name, "codex.request_latency"); - assert_eq!(line.tags.get("route").map(String::as_str), Some("chat")); - assert_eq!(line.tags.get("le").map(String::as_str), Some("20")); - - let line = parse_statsd_line(lines[1]); - assert_eq!(line.tags.get("le").map(String::as_str), Some("inf")); + let (bounds, bucket_counts, sum, count) = + histogram_data(&latest_metrics(&exporter), "codex.request_latency"); + assert!(!bounds.is_empty()); + assert_eq!(bucket_counts.iter().sum::(), 1); + assert_eq!(sum, 15.0); + assert_eq!(count, 1); Ok(()) } @@ -43,8 +35,7 @@ fn record_duration_uses_matching_bucket() -> Result<()> { // Ensures time_result returns the closure output and records timing. 
#[test] fn time_result_records_success() -> Result<()> { - let (dsn, handle) = spawn_server(200); - let metrics = MetricsClient::new(MetricsConfig::new(dsn))?; + let (metrics, exporter) = build_metrics_with_defaults(&[])?; let buckets = HistogramBuckets::from_values(&[10, 20])?; let value = metrics.time_result( @@ -56,21 +47,29 @@ fn time_result_records_success() -> Result<()> { assert_eq!(value, "ok"); metrics.shutdown()?; - let captured = handle.join().expect("server thread"); - let envelope = parse_envelope(&captured.body); - let lines: Vec<&str> = envelope.payload.split('\n').collect(); - assert!(!lines.is_empty()); - let parsed: Vec<_> = lines.iter().copied().map(parse_statsd_line).collect(); - assert!( - parsed - .iter() - .any(|line| { line.tags.get("le").map(String::as_str) == Some("inf") }) + let resource_metrics = latest_metrics(&exporter); + let (bounds, bucket_counts, _sum, count) = + histogram_data(&resource_metrics, "codex.request_latency"); + assert!(!bounds.is_empty()); + assert_eq!(count, 1); + assert_eq!(bucket_counts.iter().sum::(), 1); + let attrs = attributes_to_map( + match crate::harness::find_metric(&resource_metrics, "codex.request_latency").and_then( + |metric| match metric.data() { + opentelemetry_sdk::metrics::data::AggregatedMetrics::F64(data) => match data { + opentelemetry_sdk::metrics::data::MetricData::Histogram(histogram) => { + histogram.data_points().next().map(|p| p.attributes()) + } + _ => None, + }, + _ => None, + }, + ) { + Some(attrs) => attrs, + None => panic!("attributes missing"), + }, ); - for line in parsed { - assert_eq!(line.name, "codex.request_latency"); - assert_eq!(line.tags.get("route").map(String::as_str), Some("chat")); - assert!(line.tags.contains_key("le")); - } + assert_eq!(attrs.get("route").map(String::as_str), Some("chat")); Ok(()) } @@ -78,8 +77,7 @@ fn time_result_records_success() -> Result<()> { // Ensures time_result propagates errors but still records timing. 
#[test] fn time_result_records_on_error() -> Result<()> { - let (dsn, handle) = spawn_server(200); - let metrics = MetricsClient::new(MetricsConfig::new(dsn))?; + let (metrics, exporter) = build_metrics_with_defaults(&[])?; let buckets = HistogramBuckets::from_values(&[10, 20])?; let err = metrics @@ -93,21 +91,29 @@ fn time_result_records_on_error() -> Result<()> { assert!(matches!(err, MetricsError::EmptyMetricName)); metrics.shutdown()?; - let captured = handle.join().expect("server thread"); - let envelope = parse_envelope(&captured.body); - let lines: Vec<&str> = envelope.payload.split('\n').collect(); - assert!(!lines.is_empty()); - let parsed: Vec<_> = lines.iter().copied().map(parse_statsd_line).collect(); - assert!( - parsed - .iter() - .any(|line| { line.tags.get("le").map(String::as_str) == Some("inf") }) + let resource_metrics = latest_metrics(&exporter); + let (bounds, bucket_counts, _sum, count) = + histogram_data(&resource_metrics, "codex.request_latency"); + assert!(!bounds.is_empty()); + assert_eq!(bucket_counts.iter().sum::(), 1); + assert_eq!(count, 1); + let attrs = attributes_to_map( + match crate::harness::find_metric(&resource_metrics, "codex.request_latency").and_then( + |metric| match metric.data() { + opentelemetry_sdk::metrics::data::AggregatedMetrics::F64(data) => match data { + opentelemetry_sdk::metrics::data::MetricData::Histogram(histogram) => { + histogram.data_points().next().map(|p| p.attributes()) + } + _ => None, + }, + _ => None, + }, + ) { + Some(attrs) => attrs, + None => panic!("attributes missing"), + }, ); - for line in parsed { - assert_eq!(line.name, "codex.request_latency"); - assert_eq!(line.tags.get("route").map(String::as_str), Some("chat")); - assert!(line.tags.contains_key("le")); - } + assert_eq!(attrs.get("route").map(String::as_str), Some("chat")); Ok(()) } diff --git a/codex-rs/otel/tests/suite/validation.rs b/codex-rs/otel/tests/suite/validation.rs index 2383b774539..b852d464496 100644 --- 
a/codex-rs/otel/tests/suite/validation.rs +++ b/codex-rs/otel/tests/suite/validation.rs @@ -1,16 +1,16 @@ -use codex_metrics::HistogramBuckets; -use codex_metrics::MetricsBatch; -use codex_metrics::MetricsClient; -use codex_metrics::MetricsConfig; -use codex_metrics::MetricsError; -use codex_metrics::Result; +use codex_otel::metrics::HistogramBuckets; +use codex_otel::metrics::MetricsBatch; +use codex_otel::metrics::MetricsClient; +use codex_otel::metrics::MetricsConfig; +use codex_otel::metrics::MetricsError; +use codex_otel::metrics::Result; // Validates invalid DSNs are rejected early. #[test] fn invalid_dsn_reports_error() -> Result<()> { assert!(matches!( - MetricsClient::new(MetricsConfig::new("not a dsn")), - Err(MetricsError::InvalidDsn { .. }) + MetricsClient::new(MetricsConfig::new("")), + Err(MetricsError::EmptyApiKey) )); Ok(()) } From 421d785d142ccc2502329e1d091e421f0aee67d5 Mon Sep 17 00:00:00 2001 From: jif-oai Date: Mon, 22 Dec 2025 12:10:44 +0100 Subject: [PATCH 13/43] Move to tokio --- codex-rs/Cargo.lock | 1 + codex-rs/core/src/codex.rs | 9 +- codex-rs/docs/metrics.md | 27 +- codex-rs/otel/Cargo.toml | 3 +- codex-rs/otel/README.md | 12 +- codex-rs/otel/src/metrics/batch.rs | 4 + codex-rs/otel/src/metrics/client.rs | 379 +++++++++++++++++++++++----- codex-rs/otel/src/metrics/config.rs | 14 +- codex-rs/otel/src/metrics/error.rs | 32 +++ codex-rs/otel/src/metrics/tests.rs | 109 ++++++++ 10 files changed, 501 insertions(+), 89 deletions(-) diff --git a/codex-rs/Cargo.lock b/codex-rs/Cargo.lock index 1d8e247c0cc..f58ab014863 100644 --- a/codex-rs/Cargo.lock +++ b/codex-rs/Cargo.lock @@ -1604,6 +1604,7 @@ dependencies = [ "tracing", "tracing-opentelemetry", "tracing-subscriber", + "wiremock", ] [[package]] diff --git a/codex-rs/core/src/codex.rs b/codex-rs/core/src/codex.rs index c4a02a4eecf..62cfcefddb2 100644 --- a/codex-rs/core/src/codex.rs +++ b/codex-rs/core/src/codex.rs @@ -151,6 +151,8 @@ use crate::util::backoff; use 
codex_async_utils::OrCancelExt; use codex_execpolicy::Policy as ExecPolicy; use codex_otel::OtelManager; +use codex_otel::metrics::MetricsClient; +use codex_otel::metrics::MetricsConfig; use codex_protocol::config_types::ReasoningSummary as ReasoningSummaryConfig; use codex_protocol::models::ContentItem; use codex_protocol::models::ResponseInputItem; @@ -635,7 +637,8 @@ impl Session { config.otel.log_user_prompt, terminal::user_agent(), session_configuration.session_source.clone(), - ); + ) + .with_metrics(MetricsClient::new(MetricsConfig::default())?); otel_manager.conversation_starts( config.model_provider.name.as_str(), @@ -649,6 +652,8 @@ impl Session { config.active_profile.clone(), ); + otel_manager.counter("jif_test_1", 2, &[("value", "k_jif")])?; + let mut default_shell = shell::default_user_shell(); // Create the mutable state for the Session. if config.features.enabled(Feature::ShellSnapshot) { @@ -2780,6 +2785,8 @@ mod tests { use std::time::Duration; use tokio::time::sleep; + use codex_otel::metrics::MetricsClient; + use codex_otel::metrics::MetricsConfig; use mcp_types::ContentBlock; use mcp_types::TextContent; use pretty_assertions::assert_eq; diff --git a/codex-rs/docs/metrics.md b/codex-rs/docs/metrics.md index 50d0ae94323..777d2fcd6e7 100644 --- a/codex-rs/docs/metrics.md +++ b/codex-rs/docs/metrics.md @@ -1,12 +1,12 @@ -# Metrics (Statsig + OTEL) +# Metrics (Statsig HTTP) The `codex_otel::metrics` module sends counters and histograms to a Statsig -backend using OTLP/HTTP. It uses a background worker to keep callers -non-blocking and exports metrics via OpenTelemetry. +backend by POSTing JSON to the Statsig `log_event` endpoint. A tokio-backed +worker keeps callers non-blocking while metrics are serialized and sent. -You must supply a Statsig OTLP endpoint and API key. This module ships with -placeholders (``, ``, -``) so they are obvious to replace. 
+Defaults are provided for the Statsig API key, header name, and endpoint so +you can send metrics immediately. Override them if you need to target a +different Statsig project. ## Quick start @@ -17,7 +17,7 @@ use codex_otel::metrics::MetricsConfig; let metrics = MetricsClient::new( MetricsConfig::new("") - .with_endpoint("") + .with_endpoint("") .with_api_key_header("") .with_tag("service", "codex-cli")?, )?; @@ -54,7 +54,7 @@ let manager = OtelManager::new( ) .with_metrics_config( MetricsConfig::new("") - .with_endpoint("") + .with_endpoint("") .with_api_key_header(""), )?; @@ -72,19 +72,18 @@ If you set `metrics: Some(MetricsConfig)` on `OtelSettings` and build an `MetricsConfig` lets you specify: - `MetricsConfig::new(api_key)` to set the Statsig API key. -- `with_endpoint(endpoint)` to set the OTLP endpoint. +- `with_endpoint(endpoint)` to set the Statsig `log_event` endpoint. - `with_api_key_header(header)` to set the API key header name. - `with_tag(key, value)` to add default tags for every metric. -- `with_timeout(duration)` to set the OTLP export timeout. -- `with_export_interval(duration)` to set the periodic export interval. +- `with_timeout(duration)` to set the HTTP request timeout. +- `with_export_interval(duration)` to tweak the in-memory exporter interval in tests. - `with_user_agent(agent)` to override the HTTP `User-Agent` header. The queue capacity is fixed at 1024 entries. ## Histograms -Histograms are recorded as OpenTelemetry histograms. Bucket boundaries are -controlled by the OTEL pipeline (collector/exporter configuration). The +Histogram samples are sent as individual Statsig events. The `HistogramBuckets` type is retained for API compatibility and validation but is not used to pre-bucket samples. @@ -161,4 +160,4 @@ Tag keys and values: All APIs return `codex_otel::metrics::Result` with a `MetricsError` variant on failure. Errors cover invalid configuration, validation failures, queue -backpressure, and OTLP exporter setup issues. 
+backpressure, and HTTP client setup or request failures. diff --git a/codex-rs/otel/Cargo.toml b/codex-rs/otel/Cargo.toml index 0b422de6011..846cc1eebaf 100644 --- a/codex-rs/otel/Cargo.toml +++ b/codex-rs/otel/Cargo.toml @@ -34,7 +34,7 @@ opentelemetry-otlp = { workspace = true, features = [ "tls-roots", ]} opentelemetry-semantic-conventions = { workspace = true } -opentelemetry_sdk = { workspace = true, features = ["logs", "metrics", "rt-tokio", "trace"] } +opentelemetry_sdk = { workspace = true, features = ["logs", "metrics", "rt-tokio", "testing", "trace"] } http = { workspace = true } reqwest = { workspace = true, features = ["blocking", "rustls-tls"] } serde = { workspace = true, features = ["derive"] } @@ -54,3 +54,4 @@ tracing-subscriber = { workspace = true } [dev-dependencies] opentelemetry_sdk = { workspace = true, features = ["testing"] } pretty_assertions = { workspace = true } +wiremock = { workspace = true } diff --git a/codex-rs/otel/README.md b/codex-rs/otel/README.md index a7cb17dd81c..6e429be6b7e 100644 --- a/codex-rs/otel/README.md +++ b/codex-rs/otel/README.md @@ -70,11 +70,11 @@ let manager = OtelManager::new( manager.user_prompt(&prompt_items); ``` -## Metrics (Statsig + OTLP) +## Metrics (Statsig HTTP) -The metrics client sends counters and histograms to Statsig via OTLP/HTTP. Use -placeholders for the Statsig endpoint and API key header until you have real -values: +The metrics client sends counters and histograms to Statsig via the `log_event` +endpoint. 
Use placeholders for the Statsig endpoint and API key header until +you have real values: ```rust use codex_otel::metrics::HistogramBuckets; @@ -83,7 +83,7 @@ use codex_otel::metrics::MetricsConfig; let metrics = MetricsClient::new( MetricsConfig::new("") - .with_endpoint("") + .with_endpoint("") .with_api_key_header(""), )?; @@ -122,7 +122,7 @@ let settings = OtelSettings { }, metrics: Some( MetricsConfig::new("") - .with_endpoint("") + .with_endpoint("") .with_api_key_header(""), ), }; diff --git a/codex-rs/otel/src/metrics/batch.rs b/codex-rs/otel/src/metrics/batch.rs index e896a94470d..d023441fd0d 100644 --- a/codex-rs/otel/src/metrics/batch.rs +++ b/codex-rs/otel/src/metrics/batch.rs @@ -138,6 +138,10 @@ impl MetricsBatch { } } + pub(crate) fn len(&self) -> usize { + self.events.len() + } + pub fn with_default_tags(default_tags: Vec<(String, String)>) -> Result { for (key, value) in &default_tags { validate_tag_key(key)?; diff --git a/codex-rs/otel/src/metrics/client.rs b/codex-rs/otel/src/metrics/client.rs index b7edf20ca16..66d9256deb5 100644 --- a/codex-rs/otel/src/metrics/client.rs +++ b/codex-rs/otel/src/metrics/client.rs @@ -13,28 +13,32 @@ use crate::metrics::tags::tags_to_attributes; use crate::metrics::time::duration_to_millis; use crate::metrics::util::error_or_panic; use crate::metrics::validation::validate_tags; +use chrono::Utc; use opentelemetry::KeyValue; use opentelemetry::metrics::Histogram; use opentelemetry::metrics::Meter; use opentelemetry::metrics::MeterProvider; use opentelemetry::metrics::UpDownCounter; -use opentelemetry_otlp::MetricExporter; -use opentelemetry_otlp::Protocol; -use opentelemetry_otlp::WithExportConfig; -use opentelemetry_otlp::WithHttpConfig; use opentelemetry_sdk::metrics::PeriodicReader; use opentelemetry_sdk::metrics::SdkMeterProvider; +use reqwest::header::HeaderName; +use reqwest::header::HeaderValue; +use reqwest::header::USER_AGENT; +use serde::Serialize; use std::collections::BTreeMap; use 
std::collections::HashMap; use std::sync::Arc; use std::sync::Mutex; -use std::sync::mpsc; -use std::sync::mpsc::TrySendError; use std::thread; use std::time::Duration; use std::time::Instant; +use tokio::runtime::Runtime; +use tokio::sync::mpsc; +use tokio::sync::mpsc::error::TrySendError; const METER_NAME: &str = "codex-otel-metrics"; +const STATSIG_USER_ID: &str = "codex-metrics"; +const STATSIG_SDK_TYPE: &str = "codex-otel-rust"; enum WorkerMessage { Batch(MetricsBatch), @@ -42,10 +46,9 @@ enum WorkerMessage { } struct WorkerState { - sender: Mutex>>, + sender: Mutex>>, handle: Mutex>>, capacity: usize, - meter_provider: Mutex>, } #[derive(Debug)] @@ -105,7 +108,7 @@ impl MetricRecorder { } } -/// Background metrics client that enqueues metrics to a dedicated worker thread. +/// Background metrics client that enqueues metrics to a tokio-backed worker. #[derive(Clone)] pub struct MetricsClient { state: Arc, @@ -138,21 +141,19 @@ impl MetricsClient { validate_tags(&config.default_tags)?; - let meter_provider = build_meter_provider(&config)?; - let meter = meter_provider.meter(METER_NAME); - - let recorder = MetricRecorder::new(meter, config.default_tags); + let exporter_label = config.exporter_label(); + let worker_exporter_label = exporter_label.clone(); + let exporter = build_worker_exporter(&config)?; + let runtime = build_runtime()?; - let (sender, receiver) = mpsc::sync_channel(capacity); - let worker_provider = meter_provider.clone(); - let handle = thread::spawn(move || run_worker(recorder, receiver, worker_provider)); + let (sender, receiver) = mpsc::channel(capacity); + let handle = spawn_worker(runtime, exporter, worker_exporter_label, receiver); Ok(Self { state: Arc::new(WorkerState { sender: Mutex::new(Some(sender)), handle: Mutex::new(Some(handle)), capacity, - meter_provider: Mutex::new(Some(meter_provider)), }), }) } @@ -250,7 +251,7 @@ impl MetricsClient { Err(TrySendError::Full(_)) => Err(MetricsError::QueueFull { capacity: self.state.capacity, 
}), - Err(TrySendError::Disconnected(_)) => Err(MetricsError::WorkerUnavailable), + Err(TrySendError::Closed(_)) => Err(MetricsError::WorkerUnavailable), } } @@ -271,18 +272,25 @@ impl MetricsClient { .handle .lock() .unwrap_or_else(std::sync::PoisonError::into_inner); - let mut meter_provider = self - .state - .meter_provider - .lock() - .unwrap_or_else(std::sync::PoisonError::into_inner); let Some(handle) = handle.take() else { return Ok(()); }; let mut joined = false; if let Some(sender) = sender { - let _ = sender.try_send(WorkerMessage::Shutdown); + match sender.try_send(WorkerMessage::Shutdown) { + Ok(()) | Err(TrySendError::Closed(_)) => {} + Err(TrySendError::Full(_)) => { + if tokio::runtime::Handle::try_current().is_ok() { + let sender = sender.clone(); + let _ = + thread::spawn(move || sender.blocking_send(WorkerMessage::Shutdown)) + .join(); + } else { + let _ = sender.blocking_send(WorkerMessage::Shutdown); + } + } + } } if timeout.is_zero() { @@ -302,13 +310,8 @@ impl MetricsClient { } } - if joined && let Some(meter_provider) = meter_provider.take() { - meter_provider - .force_flush() - .map_err(|source| MetricsError::FlushFailed { source })?; - meter_provider - .shutdown() - .map_err(|source| MetricsError::ShutdownFailed { source })?; + if joined { + return Ok(()); } Ok(()) @@ -323,53 +326,299 @@ impl Drop for MetricsClient { } } -fn build_meter_provider(config: &MetricsConfig) -> Result { +fn build_runtime() -> Result { + tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .map_err(|source| MetricsError::RuntimeBuild { source }) +} + +fn build_worker_exporter(config: &MetricsConfig) -> Result { match &config.exporter { - MetricsExporter::OtlpHttp => build_otlp_http_provider(config), - MetricsExporter::InMemory(exporter) => { - let reader = PeriodicReader::builder(exporter.clone()).build(); - Ok(SdkMeterProvider::builder().with_reader(reader).build()) - } + MetricsExporter::StatsigHttp => 
Ok(WorkerExporter::Statsig(StatsigExporter::from(config)?)), + MetricsExporter::InMemory(exporter) => Ok(WorkerExporter::InMemory( + InMemoryExporter::from(config, exporter.clone()), + )), } } -fn build_otlp_http_provider(config: &MetricsConfig) -> Result { - let mut headers = HashMap::new(); - headers.insert(config.api_key_header.clone(), config.api_key.clone()); - if !config.user_agent.is_empty() { - headers.insert("User-Agent".to_string(), config.user_agent.clone()); +fn spawn_worker( + runtime: Runtime, + exporter: WorkerExporter, + exporter_label: String, + receiver: mpsc::Receiver, +) -> thread::JoinHandle<()> { + thread::spawn(move || { + let worker = MetricsWorker::new(exporter, exporter_label); + runtime.block_on(worker.run(receiver)); + }) +} + +struct MetricsWorker { + exporter: WorkerExporter, + exporter_label: String, +} + +impl MetricsWorker { + fn new(exporter: WorkerExporter, exporter_label: String) -> Self { + Self { + exporter, + exporter_label, + } } - let exporter = MetricExporter::builder() - .with_http() - .with_protocol(Protocol::HttpBinary) - .with_endpoint(config.endpoint.clone()) - .with_timeout(config.timeout) - .with_headers(headers) - .build() - .map_err(|source| MetricsError::ExporterBuild { source })?; + async fn run(mut self, mut receiver: mpsc::Receiver) { + let mut received_shutdown = false; + while let Some(message) = receiver.recv().await { + match message { + WorkerMessage::Batch(batch) => self.export_batch(batch).await, + WorkerMessage::Shutdown => { + received_shutdown = true; + break; + } + } + } + if received_shutdown || matches!(&self.exporter, WorkerExporter::InMemory(_)) { + self.shutdown().await; + } + } - let reader = PeriodicReader::builder(exporter) - .with_interval(config.export_interval) - .build(); + async fn export_batch(&mut self, batch: MetricsBatch) { + match &mut self.exporter { + WorkerExporter::Statsig(exporter) => { + if let Err(err) = exporter.export_batch(batch).await { + error_or_panic(format!( + 
"statsig metrics export failed: {err} (exporter={})", + self.exporter_label + )); + } + } + WorkerExporter::InMemory(exporter) => { + exporter.export(batch, &self.exporter_label).await; + } + } + } - Ok(SdkMeterProvider::builder().with_reader(reader).build()) + async fn shutdown(&mut self) { + if let WorkerExporter::InMemory(exporter) = &mut self.exporter { + exporter.shutdown(&self.exporter_label).await; + } + } } -fn run_worker( - mut recorder: MetricRecorder, - receiver: mpsc::Receiver, +enum WorkerExporter { + Statsig(StatsigExporter), + InMemory(InMemoryExporter), +} + +struct InMemoryExporter { + recorder: MetricRecorder, meter_provider: SdkMeterProvider, -) { - for message in receiver { - match message { - WorkerMessage::Batch(batch) => { - recorder.record_batch(batch); - if let Err(err) = meter_provider.force_flush() { - error_or_panic(format!("metrics flush failed: {err}")); +} + +impl InMemoryExporter { + fn from( + config: &MetricsConfig, + exporter: opentelemetry_sdk::metrics::InMemoryMetricExporter, + ) -> Self { + let reader = PeriodicReader::builder(exporter) + .with_interval(config.export_interval) + .build(); + let meter_provider = SdkMeterProvider::builder().with_reader(reader).build(); + let meter = meter_provider.meter(METER_NAME); + let recorder = MetricRecorder::new(meter, config.default_tags.clone()); + Self { + recorder, + meter_provider, + } + } + + async fn export(&mut self, batch: MetricsBatch, exporter_label: &str) { + let event_count = batch.len(); + self.recorder.record_batch(batch); + if let Err(err) = self.meter_provider.force_flush() { + error_or_panic(format!( + "metrics flush failed: {err} (events={event_count}, exporter={exporter_label})" + )); + } + } + + async fn shutdown(&mut self, exporter_label: &str) { + if let Err(err) = self.meter_provider.force_flush() { + error_or_panic(format!( + "metrics flush failed during shutdown: {err} (exporter={exporter_label})" + )); + } + if let Err(err) = self.meter_provider.shutdown() { + 
error_or_panic(format!( + "metrics shutdown failed: {err} (exporter={exporter_label})" + )); + } + } +} + +struct StatsigExporter { + client: reqwest::Client, + endpoint: String, + api_key_header: HeaderName, + api_key: HeaderValue, + user_agent: Option, + default_tags: BTreeMap, +} + +impl StatsigExporter { + fn from(config: &MetricsConfig) -> Result { + let api_key_header = + HeaderName::from_bytes(config.api_key_header.as_bytes()).map_err(|source| { + MetricsError::InvalidApiKeyHeader { + header: config.api_key_header.clone(), + source, } + })?; + let api_key = HeaderValue::from_str(&config.api_key).map_err(|source| { + MetricsError::InvalidHeaderValue { + header: config.api_key_header.clone(), + source, } - WorkerMessage::Shutdown => break, + })?; + let user_agent = if config.user_agent.is_empty() { + None + } else { + Some(HeaderValue::from_str(&config.user_agent).map_err(|source| { + MetricsError::InvalidHeaderValue { + header: "User-Agent".to_string(), + source, + } + })?) + }; + let client = reqwest::Client::builder() + .timeout(config.timeout) + .build() + .map_err(|source| MetricsError::HttpClientBuild { source })?; + + Ok(Self { + client, + endpoint: config.endpoint.clone(), + api_key_header, + api_key, + user_agent, + default_tags: config.default_tags.clone(), + }) + } + + async fn export_batch(&self, batch: MetricsBatch) -> Result<()> { + let payload = self.build_payload(batch); + if payload.events.is_empty() { + return Ok(()); + } + + let mut request = self + .client + .post(&self.endpoint) + .header(self.api_key_header.clone(), self.api_key.clone()); + + if let Some(user_agent) = &self.user_agent { + request = request.header(USER_AGENT, user_agent.clone()); + } + + let response = request + .json(&payload) + .send() + .await + .map_err(|source| MetricsError::StatsigRequestFailed { source })?; + + if let Err(status_err) = response.error_for_status_ref() { + let status = status_err + .status() + .unwrap_or(reqwest::StatusCode::INTERNAL_SERVER_ERROR); + 
let body = response.text().await.unwrap_or_default(); + return Err(MetricsError::StatsigResponseError { status, body }); + } + + Ok(()) + } + + fn build_payload(&self, batch: MetricsBatch) -> StatsigPayload { + let timestamp = Utc::now().timestamp_millis(); + let events = batch + .into_events() + .into_iter() + .map(|event| self.event_from_metric(event, timestamp)) + .collect(); + + StatsigPayload { + events, + statsig_metadata: StatsigMetadata { + sdk_type: STATSIG_SDK_TYPE.to_string(), + sdk_version: env!("CARGO_PKG_VERSION").to_string(), + }, + } + } + + fn event_from_metric(&self, event: MetricEvent, timestamp: i64) -> StatsigEvent { + match event { + MetricEvent::Counter { name, value, tags } => StatsigEvent { + event_name: name, + value: value as f64, + metadata: StatsigEventMetadata { + metric_type: "counter".to_string(), + tags: merge_tags(&self.default_tags, &tags), + }, + user: StatsigUser { + user_id: STATSIG_USER_ID.to_string(), + }, + time: timestamp, + }, + MetricEvent::Histogram { name, value, tags } => StatsigEvent { + event_name: name, + value: value as f64, + metadata: StatsigEventMetadata { + metric_type: "histogram".to_string(), + tags: merge_tags(&self.default_tags, &tags), + }, + user: StatsigUser { + user_id: STATSIG_USER_ID.to_string(), + }, + time: timestamp, + }, } } } + +#[derive(Debug, Serialize)] +struct StatsigPayload { + events: Vec, + #[serde(rename = "statsigMetadata")] + statsig_metadata: StatsigMetadata, +} + +#[derive(Debug, Serialize)] +struct StatsigEvent { + #[serde(rename = "eventName")] + event_name: String, + value: f64, + metadata: StatsigEventMetadata, + user: StatsigUser, + time: i64, +} + +#[derive(Debug, Serialize)] +struct StatsigEventMetadata { + #[serde(rename = "metric_type")] + metric_type: String, + tags: BTreeMap, +} + +#[derive(Debug, Serialize)] +struct StatsigUser { + #[serde(rename = "userID")] + user_id: String, +} + +#[derive(Debug, Serialize)] +struct StatsigMetadata { + #[serde(rename = "sdkType")] + 
sdk_type: String, + #[serde(rename = "sdkVersion")] + sdk_version: String, +} diff --git a/codex-rs/otel/src/metrics/config.rs b/codex-rs/otel/src/metrics/config.rs index 60dd0e4ab45..ca160d0bc80 100644 --- a/codex-rs/otel/src/metrics/config.rs +++ b/codex-rs/otel/src/metrics/config.rs @@ -11,7 +11,7 @@ use std::time::Duration; #[derive(Clone, Debug)] pub(crate) enum MetricsExporter { - OtlpHttp, + StatsigHttp, InMemory(opentelemetry_sdk::metrics::InMemoryMetricExporter), } @@ -38,7 +38,7 @@ impl MetricsConfig { timeout: DEFAULT_TIMEOUT, export_interval: DEFAULT_EXPORT_INTERVAL, user_agent: format!("codex-otel-metrics/{}", env!("CARGO_PKG_VERSION")), - exporter: MetricsExporter::OtlpHttp, + exporter: MetricsExporter::StatsigHttp, } } @@ -89,6 +89,16 @@ impl MetricsConfig { self.exporter = MetricsExporter::InMemory(exporter); self } + + pub(crate) fn exporter_label(&self) -> String { + match &self.exporter { + MetricsExporter::StatsigHttp => format!( + "statsig_http endpoint={} interval={:?} timeout={:?}", + self.endpoint, self.export_interval, self.timeout + ), + MetricsExporter::InMemory(_) => "in_memory".to_string(), + } + } } impl Default for MetricsConfig { diff --git a/codex-rs/otel/src/metrics/error.rs b/codex-rs/otel/src/metrics/error.rs index 16f6f452259..291d31aed63 100644 --- a/codex-rs/otel/src/metrics/error.rs +++ b/codex-rs/otel/src/metrics/error.rs @@ -31,6 +31,28 @@ pub enum MetricsError { ReservedTagKey { key: String }, // Config. 
+ #[error("failed to build tokio runtime")] + RuntimeBuild { + #[source] + source: std::io::Error, + }, + #[error("invalid api key header: {header}")] + InvalidApiKeyHeader { + header: String, + #[source] + source: reqwest::header::InvalidHeaderName, + }, + #[error("invalid header value: {header}")] + InvalidHeaderValue { + header: String, + #[source] + source: reqwest::header::InvalidHeaderValue, + }, + #[error("failed to build metrics http client")] + HttpClientBuild { + #[source] + source: reqwest::Error, + }, #[error("metrics endpoint cannot be empty")] EmptyEndpoint, #[error("metrics api key cannot be empty")] @@ -60,4 +82,14 @@ pub enum MetricsError { WorkerUnavailable, #[error("metrics worker thread panicked")] WorkerPanicked, + #[error("failed to send statsig metrics request")] + StatsigRequestFailed { + #[source] + source: reqwest::Error, + }, + #[error("statsig metrics request failed: {status} {body}")] + StatsigResponseError { + status: reqwest::StatusCode, + body: String, + }, } diff --git a/codex-rs/otel/src/metrics/tests.rs b/codex-rs/otel/src/metrics/tests.rs index a2bfba3075a..e5211b08b9e 100644 --- a/codex-rs/otel/src/metrics/tests.rs +++ b/codex-rs/otel/src/metrics/tests.rs @@ -11,8 +11,15 @@ use opentelemetry_sdk::metrics::data::Metric; use opentelemetry_sdk::metrics::data::MetricData; use opentelemetry_sdk::metrics::data::ResourceMetrics; use pretty_assertions::assert_eq; +use serde_json::Value; use std::collections::BTreeMap; use std::time::Duration; +use wiremock::Mock; +use wiremock::MockServer; +use wiremock::ResponseTemplate; +use wiremock::matchers::header; +use wiremock::matchers::method; +use wiremock::matchers::path; fn build_test_client() -> Result<(MetricsClient, InMemoryMetricExporter)> { let exporter = InMemoryMetricExporter::default(); @@ -53,6 +60,108 @@ fn attributes_to_map<'a>( .collect() } +fn json_tags(value: &Value) -> BTreeMap { + value + .as_object() + .expect("tags should be an object") + .iter() + .map(|(key, value)| { + 
let value = value + .as_str() + .unwrap_or_else(|| panic!("tag {key} should be a string")); + (key.clone(), value.to_string()) + }) + .collect() +} + +#[tokio::test] +// Sends metrics to a Statsig endpoint with merged tags and metadata. +async fn statsig_http_exporter_sends_events() -> Result<()> { + let server = MockServer::start().await; + let _mock = Mock::given(method("POST")) + .and(path("/v1/log_event")) + .and(header("statsig-api-key", "test-key")) + .and(header("user-agent", "codex-test-agent")) + .respond_with(ResponseTemplate::new(200)) + .expect(1) + .mount(&server) + .await; + + let config = MetricsConfig::new("test-key") + .with_endpoint(format!("{}/v1/log_event", server.uri())) + .with_user_agent("codex-test-agent") + .with_tag("service", "codex-cli")? + .with_tag("env", "prod")?; + let metrics = MetricsClient::new(config)?; + let buckets = HistogramBuckets::from_values(&[25, 50])?; + + let mut batch = metrics.batch(); + batch.counter("codex.turns", 1, &[("model", "gpt-5.1")])?; + batch.histogram("codex.tool_latency", 25, &buckets, &[("tool", "shell")])?; + metrics.send(batch)?; + metrics.shutdown()?; + + let requests = server.received_requests().await.unwrap(); + assert_eq!(requests.len(), 1); + + let body: Value = serde_json::from_slice(&requests[0].body).unwrap(); + let events = body + .get("events") + .and_then(Value::as_array) + .cloned() + .unwrap_or_default(); + assert_eq!(events.len(), 2); + + let counter = &events[0]; + assert_eq!( + counter.get("eventName").and_then(Value::as_str), + Some("codex.turns") + ); + assert_eq!(counter.get("value").and_then(Value::as_f64), Some(1.0)); + let counter_tags = counter + .get("metadata") + .and_then(|value| value.get("tags")) + .expect("counter tags missing"); + let expected_counter_tags = BTreeMap::from([ + ("service".to_string(), "codex-cli".to_string()), + ("env".to_string(), "prod".to_string()), + ("model".to_string(), "gpt-5.1".to_string()), + ]); + assert_eq!(json_tags(counter_tags), 
expected_counter_tags); + + let histogram = &events[1]; + assert_eq!( + histogram.get("eventName").and_then(Value::as_str), + Some("codex.tool_latency") + ); + assert_eq!(histogram.get("value").and_then(Value::as_f64), Some(25.0)); + let histogram_tags = histogram + .get("metadata") + .and_then(|value| value.get("tags")) + .expect("histogram tags missing"); + let expected_histogram_tags = BTreeMap::from([ + ("service".to_string(), "codex-cli".to_string()), + ("env".to_string(), "prod".to_string()), + ("tool".to_string(), "shell".to_string()), + ]); + assert_eq!(json_tags(histogram_tags), expected_histogram_tags); + + let statsig_metadata = body + .get("statsigMetadata") + .and_then(Value::as_object) + .expect("statsig metadata missing"); + assert_eq!( + statsig_metadata.get("sdkType").and_then(Value::as_str), + Some("codex-otel-rust") + ); + assert_eq!( + statsig_metadata.get("sdkVersion").and_then(Value::as_str), + Some(env!("CARGO_PKG_VERSION")) + ); + + Ok(()) +} + #[test] // Ensures counters/histograms record with default + per-call tags. 
fn send_builds_metrics_with_tags_and_histograms() -> Result<()> { From ff0055502b8d8e97257a91c23ab262b432c642cb Mon Sep 17 00:00:00 2001 From: jif-oai Date: Mon, 22 Dec 2025 12:30:51 +0100 Subject: [PATCH 14/43] Simplifications --- codex-rs/docs/metrics.md | 31 +-- codex-rs/otel/README.md | 9 +- codex-rs/otel/src/lib.rs | 40 +--- codex-rs/otel/src/metrics/batch.rs | 287 ------------------------ codex-rs/otel/src/metrics/client.rs | 127 +++++------ codex-rs/otel/src/metrics/config.rs | 10 +- codex-rs/otel/src/metrics/error.rs | 31 --- codex-rs/otel/src/metrics/tests.rs | 159 +++++-------- codex-rs/otel/src/metrics/util.rs | 2 +- codex-rs/otel/src/metrics/validation.rs | 5 - codex-rs/otel/tests/suite/send.rs | 55 +---- codex-rs/otel/tests/suite/timing.rs | 17 +- codex-rs/otel/tests/suite/validation.rs | 90 ++------ 13 files changed, 166 insertions(+), 697 deletions(-) delete mode 100644 codex-rs/otel/src/metrics/batch.rs diff --git a/codex-rs/docs/metrics.md b/codex-rs/docs/metrics.md index 777d2fcd6e7..9ec5c420953 100644 --- a/codex-rs/docs/metrics.md +++ b/codex-rs/docs/metrics.md @@ -11,7 +11,6 @@ different Statsig project. ## Quick start ```rust -use codex_otel::metrics::HistogramBuckets; use codex_otel::metrics::MetricsClient; use codex_otel::metrics::MetricsConfig; @@ -22,10 +21,8 @@ let metrics = MetricsClient::new( .with_tag("service", "codex-cli")?, )?; -let buckets = HistogramBuckets::from_values(&[25, 50, 100, 250, 500, 1000])?; - metrics.counter("codex.session_started", 1, &[("source", "tui")])?; -metrics.histogram("codex.request_latency", 83, &buckets, &[("route", "chat")])?; +metrics.histogram("codex.request_latency", 83, &[("route", "chat")])?; ``` ## OtelManager facade @@ -37,7 +34,6 @@ client and emit metrics through the same handle. By default, metrics sent via `with_metrics_without_metadata_tags` to opt out. 
```rust -use codex_otel::metrics::HistogramBuckets; use codex_otel::metrics::MetricsConfig; use codex_otel::OtelManager; @@ -57,10 +53,8 @@ let manager = OtelManager::new( .with_endpoint("") .with_api_key_header(""), )?; - -let buckets = HistogramBuckets::from_values(&[25, 50, 100, 250, 500])?; manager.counter("codex.session_started", 1, &[("source", "tui")])?; -manager.histogram("codex.request_latency", 83, &buckets, &[("route", "chat")])?; +manager.histogram("codex.request_latency", 83, &[("route", "chat")])?; ``` If you set `metrics: Some(MetricsConfig)` on `OtelSettings` and build an @@ -81,19 +75,13 @@ If you set `metrics: Some(MetricsConfig)` on `OtelSettings` and build an The queue capacity is fixed at 1024 entries. -## Histograms - -Histogram samples are sent as individual Statsig events. The -`HistogramBuckets` type is retained for API compatibility and validation but -is not used to pre-bucket samples. - ## Timing Measure a closure and emit a histogram sample for the elapsed time in milliseconds: ```rust -let result = metrics.time("codex.request_latency", &buckets, &[("route", "chat")], || { +let result = metrics.time("codex.request_latency", &[("route", "chat")], || { "ok" })?; ``` @@ -104,7 +92,6 @@ If the closure already returns `codex_otel::metrics::Result`, use ```rust let result = metrics.time_result( "codex.request_latency", - &buckets, &[("route", "chat")], || Ok("ok"), )?; @@ -116,22 +103,10 @@ If you already have a duration, record it directly: metrics.record_duration( "codex.request_latency", std::time::Duration::from_millis(83), - &buckets, &[("route", "chat")], )?; ``` -## Batching - -Batching reduces overhead and keeps metrics aligned in time: - -```rust -let mut batch = metrics.batch(); -batch.counter("codex.turns", 1, &[("model", "gpt-5.1")])?; -batch.histogram("codex.tool_latency", 140, &buckets, &[("tool", "shell")])?; -metrics.send(batch)?; -``` - ## Shutdown and queue capacity The client uses a bounded queue (default capacity 1024). 
Enqueueing returns a diff --git a/codex-rs/otel/README.md b/codex-rs/otel/README.md index 6e429be6b7e..a02663ca9e4 100644 --- a/codex-rs/otel/README.md +++ b/codex-rs/otel/README.md @@ -77,7 +77,6 @@ endpoint. Use placeholders for the Statsig endpoint and API key header until you have real values: ```rust -use codex_otel::metrics::HistogramBuckets; use codex_otel::metrics::MetricsClient; use codex_otel::metrics::MetricsConfig; @@ -87,7 +86,6 @@ let metrics = MetricsClient::new( .with_api_key_header(""), )?; -let buckets = HistogramBuckets::from_values(&[25, 50, 100, 250, 500, 1000])?; metrics.counter("codex.session_started", 1, &[("source", "tui")])?; ``` @@ -98,7 +96,7 @@ Attach metrics once in `OtelSettings.metrics` and reuse them from ```rust use codex_otel::config::{OtelExporter, OtelHttpProtocol, OtelSettings}; -use codex_otel::metrics::{HistogramBuckets, MetricsConfig}; +use codex_otel::metrics::MetricsConfig; use codex_otel::OtelManager; use codex_otel::traces::otel_provider::OtelProvider; use tracing_subscriber::prelude::*; @@ -151,17 +149,14 @@ let manager = provider .map(|p| manager.with_provider_metrics(p)) .unwrap_or(manager); -let buckets = HistogramBuckets::from_values(&[25, 50, 100, 250, 500])?; manager.counter("codex.session_started", 1, &[("source", "tui")])?; -manager.histogram("codex.request_latency", 83, &buckets, &[("route", "chat")])?; +manager.histogram("codex.request_latency", 83, &[("route", "chat")])?; ``` By default, `OtelManager` adds metadata tags to metrics: `auth_mode`, `model`, `slug`, `terminal.type`, and `app.version`. Use `with_metrics_without_metadata_tags` to disable these tags. -For batching, use `OtelManager::batch()` and `OtelManager::send()`. - ## Shutdown - `OtelProvider::shutdown()` stops the OTEL exporter. 
diff --git a/codex-rs/otel/src/lib.rs b/codex-rs/otel/src/lib.rs index 9b1353cfd13..d12ada5ce81 100644 --- a/codex-rs/otel/src/lib.rs +++ b/codex-rs/otel/src/lib.rs @@ -2,8 +2,6 @@ pub mod config; pub mod metrics; pub mod traces; -use crate::metrics::HistogramBuckets; -use crate::metrics::MetricsBatch; use crate::metrics::MetricsClient; use crate::metrics::MetricsConfig; use crate::metrics::Result as MetricsResult; @@ -84,38 +82,30 @@ impl OtelManager { metrics.counter(name, inc, &tags) } - pub fn histogram( - &self, - name: &str, - value: i64, - buckets: &HistogramBuckets, - tags: &[(&str, &str)], - ) -> MetricsResult<()> { + pub fn histogram(&self, name: &str, value: i64, tags: &[(&str, &str)]) -> MetricsResult<()> { let Some(metrics) = &self.metrics else { return Ok(()); }; let tags = self.tags_with_metadata(tags)?; - metrics.histogram(name, value, buckets, &tags) + metrics.histogram(name, value, &tags) } pub fn record_duration( &self, name: &str, duration: Duration, - buckets: &HistogramBuckets, tags: &[(&str, &str)], ) -> MetricsResult<()> { let Some(metrics) = &self.metrics else { return Ok(()); }; let tags = self.tags_with_metadata(tags)?; - metrics.record_duration(name, duration, buckets, &tags) + metrics.record_duration(name, duration, &tags) } pub fn time( &self, name: &str, - buckets: &HistogramBuckets, tags: &[(&str, &str)], f: impl FnOnce() -> T, ) -> MetricsResult { @@ -123,13 +113,12 @@ impl OtelManager { return Ok(f()); }; let tags = self.tags_with_metadata(tags)?; - metrics.time(name, buckets, &tags, f) + metrics.time(name, &tags, f) } pub fn time_result( &self, name: &str, - buckets: &HistogramBuckets, tags: &[(&str, &str)], f: impl FnOnce() -> MetricsResult, ) -> MetricsResult { @@ -137,18 +126,7 @@ impl OtelManager { return f(); }; let tags = self.tags_with_metadata(tags)?; - metrics.time_result(name, buckets, &tags, f) - } - - pub fn batch(&self) -> MetricsResult { - MetricsBatch::with_default_tags(self.metadata_tags_owned()?) 
- } - - pub fn send(&self, batch: MetricsBatch) -> MetricsResult<()> { - let Some(metrics) = &self.metrics else { - return Ok(()); - }; - metrics.send(batch) + metrics.time_result(name, &tags, f) } pub fn shutdown_metrics(&self) -> MetricsResult<()> { @@ -184,14 +162,6 @@ impl OtelManager { Ok(tags) } - fn metadata_tags_owned(&self) -> MetricsResult> { - let tags = self.metadata_tag_refs()?; - Ok(tags - .into_iter() - .map(|(key, value)| (key.to_string(), value.to_string())) - .collect()) - } - fn push_metadata_tag<'a>( tags: &mut Vec<(&'a str, &'a str)>, key: &'static str, diff --git a/codex-rs/otel/src/metrics/batch.rs b/codex-rs/otel/src/metrics/batch.rs deleted file mode 100644 index d023441fd0d..00000000000 --- a/codex-rs/otel/src/metrics/batch.rs +++ /dev/null @@ -1,287 +0,0 @@ -use crate::metrics::error::MetricsError; -use crate::metrics::error::Result; -use crate::metrics::tags::collect_tags; -use crate::metrics::validation::validate_metric_name; -use crate::metrics::validation::validate_tag_key; -use crate::metrics::validation::validate_tag_value; - -#[cfg_attr(test, derive(PartialEq, Eq))] -#[derive(Clone, Debug)] -pub struct HistogramBuckets { - bounds: Vec, -} - -impl HistogramBuckets { - /// Build histogram buckets from unsorted bounds (upper limits). - pub fn new(mut bounds: Vec) -> Result { - if bounds.is_empty() { - return Err(MetricsError::EmptyBuckets); - } - bounds.sort_unstable(); - bounds.dedup(); - Ok(Self { bounds }) - } - - /// Build histogram buckets from a slice of upper bounds. - pub fn from_values(bounds: &[i64]) -> Result { - Self::new(bounds.to_vec()) - } - - /// Build linear histogram buckets from an inclusive range and step size. 
- pub fn from_range(from: i64, to: i64, n_step: i64) -> Result { - if n_step <= 0 { - return Err(MetricsError::BucketStepNonPositive { step: n_step }); - } - if from > to { - return Err(MetricsError::BucketRangeDescending { from, to }); - } - - let mut bounds = Vec::new(); - let mut current = from; - bounds.push(current); - - while current < to { - let next = match current.checked_add(n_step) { - Some(next) => next, - None => { - return Err(MetricsError::BucketRangeOverflow { - from, - to, - step: n_step, - }); - } - }; - if next >= to { - bounds.push(to); - break; - } - bounds.push(next); - current = next; - } - - Self::new(bounds) - } - - /// Build exponential histogram buckets from an inclusive range and factor. - pub fn from_exponential(from: i64, to: i64, factor: f64) -> Result { - if from <= 0 { - return Err(MetricsError::BucketStartNonPositive { start: from }); - } - if from > to { - return Err(MetricsError::BucketRangeDescending { from, to }); - } - if !factor.is_finite() || factor <= 1.0 { - return Err(MetricsError::BucketFactorInvalid { factor }); - } - - let mut bounds = Vec::new(); - let mut current = from; - bounds.push(current); - - while current < to { - let next_value = (current as f64) * factor; - if !next_value.is_finite() || next_value >= to as f64 { - bounds.push(to); - break; - } - let mut next = next_value.ceil() as i64; - if next <= current { - next = current + 1; - } - if next >= to { - bounds.push(to); - break; - } - bounds.push(next); - current = next; - } - - Self::new(bounds) - } - - pub(crate) fn bounds(&self) -> &[i64] { - &self.bounds - } -} - -#[derive(Clone, Debug)] -pub(crate) enum MetricEvent { - Counter { - name: String, - value: i64, - tags: Vec<(String, String)>, - }, - Histogram { - name: String, - value: i64, - tags: Vec<(String, String)>, - }, -} - -pub struct MetricsBatch { - events: Vec, - default_tags: Vec<(String, String)>, -} - -impl Default for MetricsBatch { - fn default() -> Self { - Self::new() - } -} - -impl 
MetricsBatch { - /// Create an empty metrics batch. - pub fn new() -> Self { - Self { - events: Vec::new(), - default_tags: Vec::new(), - } - } - - pub(crate) fn len(&self) -> usize { - self.events.len() - } - - pub fn with_default_tags(default_tags: Vec<(String, String)>) -> Result { - for (key, value) in &default_tags { - validate_tag_key(key)?; - validate_tag_value(value)?; - } - Ok(Self { - events: Vec::new(), - default_tags, - }) - } - - /// Append a counter increment to the batch. - pub fn counter(&mut self, name: &str, inc: i64, tags: &[(&str, &str)]) -> Result<()> { - validate_metric_name(name)?; - let mut merged_tags = self.default_tags.clone(); - merged_tags.extend(collect_tags(tags)?); - self.events.push(MetricEvent::Counter { - name: name.to_string(), - value: inc, - tags: merged_tags, - }); - Ok(()) - } - - /// Append a histogram sample to the batch. - pub fn histogram( - &mut self, - name: &str, - value: i64, - buckets: &HistogramBuckets, - tags: &[(&str, &str)], - ) -> Result<()> { - // Buckets remain part of the API, but OTEL histogram aggregation owns bucket selection. - let _ = buckets.bounds(); - validate_metric_name(name)?; - let mut merged_tags = self.default_tags.clone(); - merged_tags.extend(collect_tags(tags)?); - self.events.push(MetricEvent::Histogram { - name: name.to_string(), - value, - tags: merged_tags, - }); - Ok(()) - } - - pub(crate) fn is_empty(&self) -> bool { - self.events.is_empty() - } - - pub(crate) fn into_events(self) -> Vec { - self.events - } -} - -#[cfg(test)] -mod tests { - use super::HistogramBuckets; - use crate::metrics::error::MetricsError; - use crate::metrics::error::Result; - use pretty_assertions::assert_eq; - - #[test] - // Verifies linear bucket construction over a clean step range. 
- fn from_range_builds_linear_buckets() -> Result<()> { - let buckets = HistogramBuckets::from_range(25, 100, 25)?; - let expected = HistogramBuckets::from_values(&[25, 50, 75, 100])?; - assert_eq!(buckets, expected); - Ok(()) - } - - #[test] - // Ensures uneven steps still include the final upper bound. - fn from_range_includes_upper_bound_when_step_is_uneven() -> Result<()> { - let buckets = HistogramBuckets::from_range(10, 95, 30)?; - let expected = HistogramBuckets::from_values(&[10, 40, 70, 95])?; - assert_eq!(buckets, expected); - Ok(()) - } - - #[test] - // Confirms a single-value range produces one bucket. - fn from_range_accepts_single_value_range() -> Result<()> { - let buckets = HistogramBuckets::from_range(42, 42, 5)?; - let expected = HistogramBuckets::from_values(&[42])?; - assert_eq!(buckets, expected); - Ok(()) - } - - #[test] - // Rejects a non-positive step to avoid invalid ranges. - fn from_range_rejects_non_positive_step() { - let err = HistogramBuckets::from_range(0, 10, 0).unwrap_err(); - assert_eq!(err.to_string(), "histogram bucket step must be positive: 0"); - } - - #[test] - // Rejects descending ranges to prevent inverted buckets. - fn from_range_rejects_descending_range() { - let err = HistogramBuckets::from_range(10, 0, 1).unwrap_err(); - assert_eq!( - err.to_string(), - "histogram bucket range must be ascending: 10..=0" - ); - } - - #[test] - // Verifies exponential buckets grow and include the upper bound. - fn from_exponential_builds_buckets() -> Result<()> { - let buckets = HistogramBuckets::from_exponential(10, 100, 2.0)?; - let expected = HistogramBuckets::from_values(&[10, 20, 40, 80, 100])?; - assert_eq!(buckets, expected); - Ok(()) - } - - #[test] - // Ensures exponential buckets always include the final bound. 
- fn from_exponential_includes_upper_bound() -> Result<()> { - let buckets = HistogramBuckets::from_exponential(30, 100, 3.0)?; - let expected = HistogramBuckets::from_values(&[30, 90, 100])?; - assert_eq!(buckets, expected); - Ok(()) - } - - #[test] - // Rejects non-positive starts because exponential growth requires > 0. - fn from_exponential_rejects_non_positive_start() { - let err = HistogramBuckets::from_exponential(0, 10, 2.0).unwrap_err(); - assert!(matches!( - err, - MetricsError::BucketStartNonPositive { start: 0 } - )); - } - - #[test] - // Rejects invalid exponential factors (non-finite or <= 1). - fn from_exponential_rejects_invalid_factor() { - let err = HistogramBuckets::from_exponential(1, 10, 1.0).unwrap_err(); - assert!(matches!( - err, - MetricsError::BucketFactorInvalid { factor: 1.0 } - )); - } -} diff --git a/codex-rs/otel/src/metrics/client.rs b/codex-rs/otel/src/metrics/client.rs index 66d9256deb5..1be9d39940e 100644 --- a/codex-rs/otel/src/metrics/client.rs +++ b/codex-rs/otel/src/metrics/client.rs @@ -1,17 +1,16 @@ use crate::metrics::DEFAULT_QUEUE_CAPACITY; use crate::metrics::DEFAULT_SHUTDOWN_TIMEOUT; use crate::metrics::SHUTDOWN_POLL_INTERVAL; -use crate::metrics::batch::HistogramBuckets; -use crate::metrics::batch::MetricEvent; -use crate::metrics::batch::MetricsBatch; use crate::metrics::config::MetricsConfig; use crate::metrics::config::MetricsExporter; use crate::metrics::error::MetricsError; use crate::metrics::error::Result; +use crate::metrics::tags::collect_tags; use crate::metrics::tags::merge_tags; use crate::metrics::tags::tags_to_attributes; use crate::metrics::time::duration_to_millis; use crate::metrics::util::error_or_panic; +use crate::metrics::validation::validate_metric_name; use crate::metrics::validation::validate_tags; use chrono::Utc; use opentelemetry::KeyValue; @@ -40,8 +39,22 @@ const METER_NAME: &str = "codex-otel-metrics"; const STATSIG_USER_ID: &str = "codex-metrics"; const STATSIG_SDK_TYPE: &str = 
"codex-otel-rust"; +#[derive(Clone, Debug)] +enum MetricEvent { + Counter { + name: String, + value: i64, + tags: Vec<(String, String)>, + }, + Histogram { + name: String, + value: i64, + tags: Vec<(String, String)>, + }, +} + enum WorkerMessage { - Batch(MetricsBatch), + Event(MetricEvent), Shutdown, } @@ -69,15 +82,13 @@ impl MetricRecorder { } } - fn record_batch(&mut self, batch: MetricsBatch) { - for event in batch.into_events() { - match event { - MetricEvent::Counter { name, value, tags } => { - self.record_counter(&name, value, &tags); - } - MetricEvent::Histogram { name, value, tags } => { - self.record_histogram(&name, value, &tags); - } + fn record_event(&mut self, event: MetricEvent) { + match event { + MetricEvent::Counter { name, value, tags } => { + self.record_counter(&name, value, &tags); + } + MetricEvent::Histogram { name, value, tags } => { + self.record_histogram(&name, value, &tags); } } } @@ -160,22 +171,24 @@ impl MetricsClient { /// Send a single counter increment without blocking the caller. pub fn counter(&self, name: &str, inc: i64, tags: &[(&str, &str)]) -> Result<()> { - let mut batch = MetricsBatch::new(); - batch.counter(name, inc, tags)?; - self.send(batch) + validate_metric_name(name)?; + let tags = collect_tags(tags)?; + self.send_event(MetricEvent::Counter { + name: name.to_string(), + value: inc, + tags, + }) } /// Send a single histogram sample. - pub fn histogram( - &self, - name: &str, - value: i64, - buckets: &HistogramBuckets, - tags: &[(&str, &str)], - ) -> Result<()> { - let mut batch = MetricsBatch::new(); - batch.histogram(name, value, buckets, tags)?; - self.send(batch) + pub fn histogram(&self, name: &str, value: i64, tags: &[(&str, &str)]) -> Result<()> { + validate_metric_name(name)?; + let tags = collect_tags(tags)?; + self.send_event(MetricEvent::Histogram { + name: name.to_string(), + value, + tags, + }) } /// Record a duration in milliseconds using a histogram. 
@@ -183,24 +196,17 @@ impl MetricsClient { &self, name: &str, duration: Duration, - buckets: &HistogramBuckets, tags: &[(&str, &str)], ) -> Result<()> { let millis = duration_to_millis(duration); - self.histogram(name, millis, buckets, tags) + self.histogram(name, millis, tags) } /// Measure a closure and emit a histogram sample for the elapsed time. - pub fn time( - &self, - name: &str, - buckets: &HistogramBuckets, - tags: &[(&str, &str)], - f: impl FnOnce() -> T, - ) -> Result { + pub fn time(&self, name: &str, tags: &[(&str, &str)], f: impl FnOnce() -> T) -> Result { let start = Instant::now(); let output = f(); - self.record_duration(name, start.elapsed(), buckets, tags)?; + self.record_duration(name, start.elapsed(), tags)?; Ok(output) } @@ -208,7 +214,6 @@ impl MetricsClient { pub fn time_result( &self, name: &str, - buckets: &HistogramBuckets, tags: &[(&str, &str)], f: impl FnOnce() -> Result, ) -> Result { @@ -216,27 +221,17 @@ impl MetricsClient { let output = f(); match output { Ok(value) => { - self.record_duration(name, start.elapsed(), buckets, tags)?; + self.record_duration(name, start.elapsed(), tags)?; Ok(value) } Err(err) => { - let _ = self.record_duration(name, start.elapsed(), buckets, tags); + let _ = self.record_duration(name, start.elapsed(), tags); Err(err) } } } - /// Create an empty batch for multi-metric sends. - pub fn batch(&self) -> MetricsBatch { - MetricsBatch::new() - } - - /// Enqueue a batch of metrics for the worker to send. 
- pub fn send(&self, batch: MetricsBatch) -> Result<()> { - if batch.is_empty() { - return Ok(()); - } - + fn send_event(&self, event: MetricEvent) -> Result<()> { let sender = self .state .sender @@ -246,7 +241,7 @@ impl MetricsClient { return Err(MetricsError::WorkerUnavailable); }; - match sender.try_send(WorkerMessage::Batch(batch)) { + match sender.try_send(WorkerMessage::Event(event)) { Ok(()) => Ok(()), Err(TrySendError::Full(_)) => Err(MetricsError::QueueFull { capacity: self.state.capacity, @@ -371,7 +366,7 @@ impl MetricsWorker { let mut received_shutdown = false; while let Some(message) = receiver.recv().await { match message { - WorkerMessage::Batch(batch) => self.export_batch(batch).await, + WorkerMessage::Event(event) => self.export_event(event).await, WorkerMessage::Shutdown => { received_shutdown = true; break; @@ -383,10 +378,10 @@ impl MetricsWorker { } } - async fn export_batch(&mut self, batch: MetricsBatch) { + async fn export_event(&mut self, event: MetricEvent) { match &mut self.exporter { WorkerExporter::Statsig(exporter) => { - if let Err(err) = exporter.export_batch(batch).await { + if let Err(err) = exporter.export_event(event).await { error_or_panic(format!( "statsig metrics export failed: {err} (exporter={})", self.exporter_label @@ -394,7 +389,7 @@ impl MetricsWorker { } } WorkerExporter::InMemory(exporter) => { - exporter.export(batch, &self.exporter_label).await; + exporter.export_event(event, &self.exporter_label).await; } } } @@ -433,12 +428,11 @@ impl InMemoryExporter { } } - async fn export(&mut self, batch: MetricsBatch, exporter_label: &str) { - let event_count = batch.len(); - self.recorder.record_batch(batch); + async fn export_event(&mut self, event: MetricEvent, exporter_label: &str) { + self.recorder.record_event(event); if let Err(err) = self.meter_provider.force_flush() { error_or_panic(format!( - "metrics flush failed: {err} (events={event_count}, exporter={exporter_label})" + "metrics flush failed: {err} 
(exporter={exporter_label})" )); } } @@ -506,11 +500,8 @@ impl StatsigExporter { }) } - async fn export_batch(&self, batch: MetricsBatch) -> Result<()> { - let payload = self.build_payload(batch); - if payload.events.is_empty() { - return Ok(()); - } + async fn export_event(&self, event: MetricEvent) -> Result<()> { + let payload = self.build_payload(event); let mut request = self .client @@ -538,13 +529,9 @@ impl StatsigExporter { Ok(()) } - fn build_payload(&self, batch: MetricsBatch) -> StatsigPayload { + fn build_payload(&self, event: MetricEvent) -> StatsigPayload { let timestamp = Utc::now().timestamp_millis(); - let events = batch - .into_events() - .into_iter() - .map(|event| self.event_from_metric(event, timestamp)) - .collect(); + let events = vec![self.event_from_metric(event, timestamp)]; StatsigPayload { events, diff --git a/codex-rs/otel/src/metrics/config.rs b/codex-rs/otel/src/metrics/config.rs index ca160d0bc80..d4f1ebbc12f 100644 --- a/codex-rs/otel/src/metrics/config.rs +++ b/codex-rs/otel/src/metrics/config.rs @@ -1,7 +1,7 @@ use crate::metrics::DEFAULT_API_KEY; use crate::metrics::DEFAULT_API_KEY_HEADER; use crate::metrics::DEFAULT_EXPORT_INTERVAL; -use crate::metrics::DEFAULT_OTLP_ENDPOINT; +use crate::metrics::DEFAULT_STATSIG_ENDPOINT; use crate::metrics::DEFAULT_TIMEOUT; use crate::metrics::error::Result; use crate::metrics::validation::validate_tag_key; @@ -31,7 +31,7 @@ impl MetricsConfig { /// Create a config with the provided API key and default settings. pub fn new(api_key: impl Into) -> Self { Self { - endpoint: DEFAULT_OTLP_ENDPOINT.to_string(), + endpoint: DEFAULT_STATSIG_ENDPOINT.to_string(), api_key: api_key.into(), api_key_header: DEFAULT_API_KEY_HEADER.to_string(), default_tags: BTreeMap::new(), @@ -42,7 +42,7 @@ impl MetricsConfig { } } - /// Override the OTLP endpoint. + /// Override the Statsig endpoint. 
pub fn with_endpoint(mut self, endpoint: impl Into) -> Self { self.endpoint = endpoint.into(); self @@ -64,13 +64,13 @@ impl MetricsConfig { Ok(self) } - /// Override the OTLP exporter timeout. + /// Override the HTTP client timeout. pub fn with_timeout(mut self, timeout: Duration) -> Self { self.timeout = timeout; self } - /// Override the OTLP export interval. + /// Override the export interval used by the in-memory exporter (tests). pub fn with_export_interval(mut self, interval: Duration) -> Self { self.export_interval = interval; self diff --git a/codex-rs/otel/src/metrics/error.rs b/codex-rs/otel/src/metrics/error.rs index 291d31aed63..022bd352d52 100644 --- a/codex-rs/otel/src/metrics/error.rs +++ b/codex-rs/otel/src/metrics/error.rs @@ -4,20 +4,6 @@ pub type Result = std::result::Result; #[derive(Debug, Error)] pub enum MetricsError { - // Buckets. - #[error("histogram buckets cannot be empty")] - EmptyBuckets, - #[error("histogram bucket step must be positive: {step}")] - BucketStepNonPositive { step: i64 }, - #[error("histogram bucket start must be positive: {start}")] - BucketStartNonPositive { start: i64 }, - #[error("histogram bucket factor must be finite and greater than 1: {factor}")] - BucketFactorInvalid { factor: f64 }, - #[error("histogram bucket range must be ascending: {from}..={to}")] - BucketRangeDescending { from: i64, to: i64 }, - #[error("histogram bucket range overflow: {from}..={to} step {step}")] - BucketRangeOverflow { from: i64, to: i64, step: i64 }, - // Metrics. #[error("metric name cannot be empty")] EmptyMetricName, @@ -27,8 +13,6 @@ pub enum MetricsError { EmptyTagComponent { label: String }, #[error("{label} contains invalid characters: {value}")] InvalidTagComponent { label: String, value: String }, - #[error("tag key is reserved: {key}")] - ReservedTagKey { key: String }, // Config. 
#[error("failed to build tokio runtime")] @@ -57,21 +41,6 @@ pub enum MetricsError { EmptyEndpoint, #[error("metrics api key cannot be empty")] EmptyApiKey, - #[error("failed to build metrics exporter")] - ExporterBuild { - #[source] - source: opentelemetry_otlp::ExporterBuildError, - }, - #[error("failed to flush metrics")] - FlushFailed { - #[source] - source: opentelemetry_sdk::error::OTelSdkError, - }, - #[error("failed to shutdown metrics provider")] - ShutdownFailed { - #[source] - source: opentelemetry_sdk::error::OTelSdkError, - }, // Worker. #[error("metrics queue capacity must be positive")] diff --git a/codex-rs/otel/src/metrics/tests.rs b/codex-rs/otel/src/metrics/tests.rs index e5211b08b9e..72c16c6932b 100644 --- a/codex-rs/otel/src/metrics/tests.rs +++ b/codex-rs/otel/src/metrics/tests.rs @@ -1,5 +1,3 @@ -use super::HistogramBuckets; -use super::MetricsBatch; use super::MetricsClient; use super::MetricsConfig; use super::MetricsError; @@ -83,7 +81,7 @@ async fn statsig_http_exporter_sends_events() -> Result<()> { .and(header("statsig-api-key", "test-key")) .and(header("user-agent", "codex-test-agent")) .respond_with(ResponseTemplate::new(200)) - .expect(1) + .expect(2) .mount(&server) .await; @@ -93,30 +91,49 @@ async fn statsig_http_exporter_sends_events() -> Result<()> { .with_tag("service", "codex-cli")? 
.with_tag("env", "prod")?; let metrics = MetricsClient::new(config)?; - let buckets = HistogramBuckets::from_values(&[25, 50])?; - let mut batch = metrics.batch(); - batch.counter("codex.turns", 1, &[("model", "gpt-5.1")])?; - batch.histogram("codex.tool_latency", 25, &buckets, &[("tool", "shell")])?; - metrics.send(batch)?; + metrics.counter("codex.turns", 1, &[("model", "gpt-5.1")])?; + metrics.histogram("codex.tool_latency", 25, &[("tool", "shell")])?; metrics.shutdown()?; let requests = server.received_requests().await.unwrap(); - assert_eq!(requests.len(), 1); - - let body: Value = serde_json::from_slice(&requests[0].body).unwrap(); - let events = body - .get("events") - .and_then(Value::as_array) - .cloned() - .unwrap_or_default(); - assert_eq!(events.len(), 2); - - let counter = &events[0]; - assert_eq!( - counter.get("eventName").and_then(Value::as_str), - Some("codex.turns") - ); + assert_eq!(requests.len(), 2); + + let mut events_by_name = BTreeMap::new(); + for request in &requests { + let body: Value = serde_json::from_slice(&request.body).unwrap(); + let events = body + .get("events") + .and_then(Value::as_array) + .cloned() + .unwrap_or_default(); + assert_eq!(events.len(), 1); + + let statsig_metadata = body + .get("statsigMetadata") + .and_then(Value::as_object) + .expect("statsig metadata missing"); + assert_eq!( + statsig_metadata.get("sdkType").and_then(Value::as_str), + Some("codex-otel-rust") + ); + assert_eq!( + statsig_metadata.get("sdkVersion").and_then(Value::as_str), + Some(env!("CARGO_PKG_VERSION")) + ); + + let event = events[0].clone(); + let name = event + .get("eventName") + .and_then(Value::as_str) + .unwrap_or_default() + .to_string(); + events_by_name.insert(name, event); + } + + let counter = events_by_name + .get("codex.turns") + .expect("counter event missing"); assert_eq!(counter.get("value").and_then(Value::as_f64), Some(1.0)); let counter_tags = counter .get("metadata") @@ -129,11 +146,9 @@ async fn 
statsig_http_exporter_sends_events() -> Result<()> { ]); assert_eq!(json_tags(counter_tags), expected_counter_tags); - let histogram = &events[1]; - assert_eq!( - histogram.get("eventName").and_then(Value::as_str), - Some("codex.tool_latency") - ); + let histogram = events_by_name + .get("codex.tool_latency") + .expect("histogram event missing"); assert_eq!(histogram.get("value").and_then(Value::as_f64), Some(25.0)); let histogram_tags = histogram .get("metadata") @@ -146,19 +161,6 @@ async fn statsig_http_exporter_sends_events() -> Result<()> { ]); assert_eq!(json_tags(histogram_tags), expected_histogram_tags); - let statsig_metadata = body - .get("statsigMetadata") - .and_then(Value::as_object) - .expect("statsig metadata missing"); - assert_eq!( - statsig_metadata.get("sdkType").and_then(Value::as_str), - Some("codex-otel-rust") - ); - assert_eq!( - statsig_metadata.get("sdkVersion").and_then(Value::as_str), - Some(env!("CARGO_PKG_VERSION")) - ); - Ok(()) } @@ -166,12 +168,9 @@ async fn statsig_http_exporter_sends_events() -> Result<()> { // Ensures counters/histograms record with default + per-call tags. 
fn send_builds_metrics_with_tags_and_histograms() -> Result<()> { let (metrics, exporter) = build_test_client()?; - let buckets = HistogramBuckets::from_values(&[25, 50, 100])?; - let mut batch = metrics.batch(); - batch.counter("codex.turns", 1, &[("model", "gpt-5.1"), ("env", "dev")])?; - batch.histogram("codex.tool_latency", 25, &buckets, &[("tool", "shell")])?; - metrics.send(batch)?; + metrics.counter("codex.turns", 1, &[("model", "gpt-5.1"), ("env", "dev")])?; + metrics.histogram("codex.tool_latency", 25, &[("tool", "shell")])?; metrics.shutdown()?; let resource_metrics = latest_metrics(&exporter); @@ -239,14 +238,12 @@ fn send_merges_default_tags_per_metric() -> Result<()> { .with_in_memory_exporter(exporter.clone()); let metrics = MetricsClient::new(config)?; - let mut batch = metrics.batch(); - batch.counter("codex.alpha", 1, &[("env", "dev"), ("component", "alpha")])?; - batch.counter( + metrics.counter("codex.alpha", 1, &[("env", "dev"), ("component", "alpha")])?; + metrics.counter( "codex.beta", 2, &[("service", "worker"), ("component", "beta")], )?; - metrics.send(batch)?; metrics.shutdown()?; let resource_metrics = latest_metrics(&exporter); @@ -302,12 +299,10 @@ fn send_merges_default_tags_per_metric() -> Result<()> { // Ensures duration recording maps to histogram output. fn record_duration_uses_histogram() -> Result<()> { let (metrics, exporter) = build_test_client()?; - let buckets = HistogramBuckets::from_values(&[10, 20])?; metrics.record_duration( "codex.request_latency", Duration::from_millis(15), - &buckets, &[("route", "chat")], )?; metrics.shutdown()?; @@ -345,11 +340,9 @@ fn record_duration_uses_histogram() -> Result<()> { // Ensures time_result propagates errors but still records timing. 
fn time_result_records_on_error() -> Result<()> { let (metrics, exporter) = build_test_client()?; - let buckets = HistogramBuckets::from_values(&[10, 20])?; let Err(err) = metrics.time_result( "codex.request_latency", - &buckets, &[("route", "chat")], || -> Result<&'static str> { Err(MetricsError::EmptyMetricName) }, ) else { @@ -391,21 +384,11 @@ fn invalid_tag_component_is_rejected() -> Result<()> { Ok(()) } -#[test] -// Ensures the reserved histogram bucketing tag key is rejected in config defaults. -fn reserved_tag_key_is_rejected_in_config() -> Result<()> { - let Err(err) = MetricsConfig::default().with_tag("le", "10") else { - panic!("expected error"); - }; - assert!(matches!(err, MetricsError::ReservedTagKey { key } if key == "le")); - Ok(()) -} - #[test] // Ensures per-metric tag keys are validated. -fn counter_rejects_invalid_tag_key() { - let mut batch = MetricsBatch::new(); - let Err(err) = batch.counter("codex.turns", 1, &[("bad key", "value")]) else { +fn counter_rejects_invalid_tag_key() -> Result<()> { + let (metrics, _exporter) = build_test_client()?; + let Err(err) = metrics.counter("codex.turns", 1, &[("bad key", "value")]) else { panic!("expected error"); }; assert!(matches!( @@ -413,29 +396,15 @@ fn counter_rejects_invalid_tag_key() { MetricsError::InvalidTagComponent { label, value } if label == "tag key" && value == "bad key" )); -} - -#[test] -// Ensures per-metric tag keys cannot use reserved histogram bucketing keys. -fn counter_rejects_reserved_tag_key() { - let mut batch = MetricsBatch::new(); - let Err(err) = batch.counter("codex.turns", 1, &[("le", "10")]) else { - panic!("expected error"); - }; - assert!(matches!(err, MetricsError::ReservedTagKey { key } if key == "le")); + metrics.shutdown()?; + Ok(()) } #[test] // Ensures per-metric tag values are validated. 
fn histogram_rejects_invalid_tag_value() -> Result<()> { - let mut batch = MetricsBatch::new(); - let buckets = HistogramBuckets::from_values(&[10])?; - let Err(err) = batch.histogram( - "codex.request_latency", - 3, - &buckets, - &[("route", "bad value")], - ) else { + let (metrics, _exporter) = build_test_client()?; + let Err(err) = metrics.histogram("codex.request_latency", 3, &[("route", "bad value")]) else { panic!("expected error"); }; assert!(matches!( @@ -443,32 +412,22 @@ fn histogram_rejects_invalid_tag_value() -> Result<()> { MetricsError::InvalidTagComponent { label, value } if label == "tag value" && value == "bad value" )); + metrics.shutdown()?; Ok(()) } #[test] -// Ensures histogram calls reject reserved tag keys even though they no longer add `le`. -fn histogram_rejects_reserved_tag_key() -> Result<()> { - let mut batch = MetricsBatch::new(); - let buckets = HistogramBuckets::from_values(&[10])?; - let Err(err) = batch.histogram("codex.request_latency", 3, &buckets, &[("le", "10")]) else { - panic!("expected error"); - }; - assert!(matches!(err, MetricsError::ReservedTagKey { key } if key == "le")); - Ok(()) -} - -#[test] -// Ensures invalid metric names are rejected when building a batch. +// Ensures invalid metric names are rejected. 
fn counter_rejects_invalid_metric_name() -> Result<()> { - let mut batch = MetricsBatch::new(); - let Err(err) = batch.counter("bad name", 1, &[]) else { + let (metrics, _exporter) = build_test_client()?; + let Err(err) = metrics.counter("bad name", 1, &[]) else { panic!("expected error"); }; assert!(matches!( err, MetricsError::InvalidMetricName { name } if name == "bad name" )); + metrics.shutdown()?; Ok(()) } diff --git a/codex-rs/otel/src/metrics/util.rs b/codex-rs/otel/src/metrics/util.rs index 01c18894b9d..60d0ddfe224 100644 --- a/codex-rs/otel/src/metrics/util.rs +++ b/codex-rs/otel/src/metrics/util.rs @@ -1,7 +1,7 @@ use tracing::error; pub(crate) fn error_or_panic(message: impl ToString) { - if cfg!(debug_assertions) || env!("CARGO_PKG_VERSION").contains("alpha") { + if cfg!(debug_assertions) { panic!("{}", message.to_string()); } else { error!("{}", message.to_string()); diff --git a/codex-rs/otel/src/metrics/validation.rs b/codex-rs/otel/src/metrics/validation.rs index 8e2181e2660..ce9e436d8f5 100644 --- a/codex-rs/otel/src/metrics/validation.rs +++ b/codex-rs/otel/src/metrics/validation.rs @@ -24,11 +24,6 @@ pub(crate) fn validate_metric_name(name: &str) -> Result<()> { pub(crate) fn validate_tag_key(key: &str) -> Result<()> { validate_tag_component(key, "tag key")?; - if key == "le" { - return Err(MetricsError::ReservedTagKey { - key: key.to_string(), - }); - } Ok(()) } diff --git a/codex-rs/otel/tests/suite/send.rs b/codex-rs/otel/tests/suite/send.rs index 52e58c1aff5..f5899c821dd 100644 --- a/codex-rs/otel/tests/suite/send.rs +++ b/codex-rs/otel/tests/suite/send.rs @@ -3,8 +3,6 @@ use crate::harness::build_metrics_with_defaults; use crate::harness::find_metric; use crate::harness::histogram_data; use crate::harness::latest_metrics; -use codex_otel::metrics::HistogramBuckets; -use codex_otel::metrics::MetricsBatch; use codex_otel::metrics::Result; use pretty_assertions::assert_eq; use std::collections::BTreeMap; @@ -14,12 +12,9 @@ use 
std::collections::BTreeMap; fn send_builds_payload_with_tags_and_histograms() -> Result<()> { let (metrics, exporter) = build_metrics_with_defaults(&[("service", "codex-cli"), ("env", "prod")])?; - let buckets = HistogramBuckets::from_values(&[25, 50, 100])?; - let mut batch = metrics.batch(); - batch.counter("codex.turns", 1, &[("model", "gpt-5.1"), ("env", "dev")])?; - batch.histogram("codex.tool_latency", 25, &buckets, &[("tool", "shell")])?; - metrics.send(batch)?; + metrics.counter("codex.turns", 1, &[("model", "gpt-5.1"), ("env", "dev")])?; + metrics.histogram("codex.tool_latency", 25, &[("tool", "shell")])?; metrics.shutdown()?; let resource_metrics = latest_metrics(&exporter); @@ -87,14 +82,12 @@ fn send_merges_default_tags_per_line() -> Result<()> { ("region", "us"), ])?; - let mut batch = metrics.batch(); - batch.counter("codex.alpha", 1, &[("env", "dev"), ("component", "alpha")])?; - batch.counter( + metrics.counter("codex.alpha", 1, &[("env", "dev"), ("component", "alpha")])?; + metrics.counter( "codex.beta", 2, &[("service", "worker"), ("component", "beta")], )?; - metrics.send(batch)?; metrics.shutdown()?; let resource_metrics = latest_metrics(&exporter); @@ -147,35 +140,12 @@ fn send_merges_default_tags_per_line() -> Result<()> { Ok(()) } -// Verifies values above the max bucket use the inf bucket. +// Verifies enqueued metrics are delivered by the background worker. 
#[test] -fn send_uses_inf_bucket_for_values_over_max() -> Result<()> { +fn client_sends_enqueued_metric() -> Result<()> { let (metrics, exporter) = build_metrics_with_defaults(&[])?; - let buckets = HistogramBuckets::from_values(&[10, 20])?; - let mut batch = metrics.batch(); - batch.histogram("codex.tool_latency", 99, &buckets, &[("tool", "shell")])?; - metrics.send(batch)?; - metrics.shutdown()?; - - let (bounds, bucket_counts, sum, count) = - histogram_data(&latest_metrics(&exporter), "codex.tool_latency"); - assert!(!bounds.is_empty()); - assert_eq!(bucket_counts.iter().sum::(), 1); - assert_eq!(sum, 99.0); - assert_eq!(count, 1); - - Ok(()) -} - -// Verifies enqueued batches are delivered by the background worker. -#[test] -fn client_sends_enqueued_batch() -> Result<()> { - let (metrics, exporter) = build_metrics_with_defaults(&[])?; - - let mut batch = metrics.batch(); - batch.counter("codex.turns", 1, &[("model", "gpt-5.1")])?; - metrics.send(batch)?; + metrics.counter("codex.turns", 1, &[("model", "gpt-5.1")])?; metrics.shutdown()?; let resource_metrics = latest_metrics(&exporter); @@ -200,12 +170,10 @@ fn client_sends_enqueued_batch() -> Result<()> { // Ensures shutdown flushes successfully with in-memory exporters. #[test] -fn send_panics_on_non_success_status_in_debug() -> Result<()> { +fn shutdown_flushes_in_memory_exporter() -> Result<()> { let (metrics, exporter) = build_metrics_with_defaults(&[])?; - let mut batch = metrics.batch(); - batch.counter("codex.turns", 1, &[])?; - metrics.send(batch)?; + metrics.counter("codex.turns", 1, &[])?; metrics.shutdown()?; let resource_metrics = latest_metrics(&exporter); @@ -224,12 +192,11 @@ fn send_panics_on_non_success_status_in_debug() -> Result<()> { Ok(()) } -// Ensures empty batches do not trigger any export. +// Ensures shutting down without recording metrics does not export anything. 
#[test] -fn client_core_skips_empty_batch() -> Result<()> { +fn shutdown_without_metrics_exports_nothing() -> Result<()> { let (metrics, exporter) = build_metrics_with_defaults(&[])?; - metrics.send(MetricsBatch::new())?; metrics.shutdown()?; let finished = exporter.get_finished_metrics().unwrap(); diff --git a/codex-rs/otel/tests/suite/timing.rs b/codex-rs/otel/tests/suite/timing.rs index 3b6a74232e4..f1315df2695 100644 --- a/codex-rs/otel/tests/suite/timing.rs +++ b/codex-rs/otel/tests/suite/timing.rs @@ -2,22 +2,19 @@ use crate::harness::attributes_to_map; use crate::harness::build_metrics_with_defaults; use crate::harness::histogram_data; use crate::harness::latest_metrics; -use codex_otel::metrics::HistogramBuckets; use codex_otel::metrics::MetricsError; use codex_otel::metrics::Result; use pretty_assertions::assert_eq; use std::time::Duration; -// Ensures duration recording maps to the expected bucket tag. +// Ensures duration recording maps to histogram output. #[test] -fn record_duration_uses_matching_bucket() -> Result<()> { +fn record_duration_records_histogram() -> Result<()> { let (metrics, exporter) = build_metrics_with_defaults(&[])?; - let buckets = HistogramBuckets::from_values(&[10, 20])?; metrics.record_duration( "codex.request_latency", Duration::from_millis(15), - &buckets, &[("route", "chat")], )?; metrics.shutdown()?; @@ -36,14 +33,8 @@ fn record_duration_uses_matching_bucket() -> Result<()> { #[test] fn time_result_records_success() -> Result<()> { let (metrics, exporter) = build_metrics_with_defaults(&[])?; - let buckets = HistogramBuckets::from_values(&[10, 20])?; - let value = metrics.time_result( - "codex.request_latency", - &buckets, - &[("route", "chat")], - || Ok("ok"), - )?; + let value = metrics.time_result("codex.request_latency", &[("route", "chat")], || Ok("ok"))?; assert_eq!(value, "ok"); metrics.shutdown()?; @@ -78,12 +69,10 @@ fn time_result_records_success() -> Result<()> { #[test] fn time_result_records_on_error() -> 
Result<()> { let (metrics, exporter) = build_metrics_with_defaults(&[])?; - let buckets = HistogramBuckets::from_values(&[10, 20])?; let err = metrics .time_result( "codex.request_latency", - &buckets, &[("route", "chat")], || -> Result<&'static str> { Err(MetricsError::EmptyMetricName) }, ) diff --git a/codex-rs/otel/tests/suite/validation.rs b/codex-rs/otel/tests/suite/validation.rs index b852d464496..09ef521aee9 100644 --- a/codex-rs/otel/tests/suite/validation.rs +++ b/codex-rs/otel/tests/suite/validation.rs @@ -1,9 +1,14 @@ -use codex_otel::metrics::HistogramBuckets; -use codex_otel::metrics::MetricsBatch; use codex_otel::metrics::MetricsClient; use codex_otel::metrics::MetricsConfig; use codex_otel::metrics::MetricsError; use codex_otel::metrics::Result; +use opentelemetry_sdk::metrics::InMemoryMetricExporter; + +fn build_in_memory_client() -> Result { + let exporter = InMemoryMetricExporter::default(); + let config = MetricsConfig::new("test-key").with_in_memory_exporter(exporter); + MetricsClient::new(config) +} // Validates invalid DSNs are rejected early. #[test] @@ -29,22 +34,11 @@ fn invalid_tag_component_is_rejected() -> Result<()> { Ok(()) } -// Ensures the reserved histogram bucketing tag key is rejected in config defaults. -#[test] -fn reserved_tag_key_is_rejected_in_config() -> Result<()> { - let err = MetricsConfig::default().with_tag("le", "10").unwrap_err(); - assert!(matches!( - err, - MetricsError::ReservedTagKey { key } if key == "le" - )); - Ok(()) -} - // Ensures per-metric tag keys are validated. 
#[test] -fn counter_rejects_invalid_tag_key() { - let mut batch = MetricsBatch::new(); - let err = batch +fn counter_rejects_invalid_tag_key() -> Result<()> { + let metrics = build_in_memory_client()?; + let err = metrics .counter("codex.turns", 1, &[("bad key", "value")]) .unwrap_err(); assert!(matches!( @@ -52,79 +46,35 @@ fn counter_rejects_invalid_tag_key() { MetricsError::InvalidTagComponent { label, value } if label == "tag key" && value == "bad key" )); -} - -// Ensures per-metric tag keys cannot use reserved histogram bucketing keys. -#[test] -fn counter_rejects_reserved_tag_key() { - let mut batch = MetricsBatch::new(); - let err = batch - .counter("codex.turns", 1, &[("le", "10")]) - .unwrap_err(); - assert!(matches!( - err, - MetricsError::ReservedTagKey { key } if key == "le" - )); + metrics.shutdown()?; + Ok(()) } // Ensures per-metric tag values are validated. #[test] fn histogram_rejects_invalid_tag_value() -> Result<()> { - let mut batch = MetricsBatch::new(); - let buckets = HistogramBuckets::from_values(&[10])?; - let err = batch - .histogram( - "codex.request_latency", - 3, - &buckets, - &[("route", "bad value")], - ) + let metrics = build_in_memory_client()?; + let err = metrics + .histogram("codex.request_latency", 3, &[("route", "bad value")]) .unwrap_err(); assert!(matches!( err, MetricsError::InvalidTagComponent { label, value } if label == "tag value" && value == "bad value" )); + metrics.shutdown()?; Ok(()) } -// Ensures histogram calls reject reserved tag keys even though they internally add `le`. -#[test] -fn histogram_rejects_reserved_tag_key() -> Result<()> { - let mut batch = MetricsBatch::new(); - let buckets = HistogramBuckets::from_values(&[10])?; - let err = batch - .histogram("codex.request_latency", 3, &buckets, &[("le", "10")]) - .unwrap_err(); - assert!(matches!( - err, - MetricsError::ReservedTagKey { key } if key == "le" - )); - Ok(()) -} - -// Ensures invalid metric names are rejected when building a batch. 
+// Ensures invalid metric names are rejected. #[test] fn counter_rejects_invalid_metric_name() -> Result<()> { - let mut batch = MetricsBatch::new(); - let err = batch.counter("bad name", 1, &[]).unwrap_err(); + let metrics = build_in_memory_client()?; + let err = metrics.counter("bad name", 1, &[]).unwrap_err(); assert!(matches!( err, MetricsError::InvalidMetricName { name } if name == "bad name" )); + metrics.shutdown()?; Ok(()) } - -// Ensures empty histogram bucket lists are rejected. -#[test] -fn empty_buckets_are_rejected() { - let err = HistogramBuckets::from_values(&[]).unwrap_err(); - assert!(matches!(err, MetricsError::EmptyBuckets)); -} - -// Ensures range overflow is detected when building buckets. -#[test] -fn range_overflow_is_reported() { - let err = HistogramBuckets::from_range(i64::MAX - 1, i64::MAX, 2).unwrap_err(); - assert!(matches!(err, MetricsError::BucketRangeOverflow { .. })); -} From b2a89045282954f2e905e41c0bc0fb825928c89c Mon Sep 17 00:00:00 2001 From: jif-oai Date: Mon, 22 Dec 2025 12:42:42 +0100 Subject: [PATCH 15/43] Further simplifications --- codex-rs/docs/metrics.md | 1 - codex-rs/otel/README.md | 2 +- codex-rs/otel/src/metrics/client.rs | 37 ++++++----------------------- codex-rs/otel/src/metrics/config.rs | 19 +++++---------- codex-rs/otel/src/metrics/mod.rs | 1 - 5 files changed, 14 insertions(+), 46 deletions(-) diff --git a/codex-rs/docs/metrics.md b/codex-rs/docs/metrics.md index 9ec5c420953..7e51f097342 100644 --- a/codex-rs/docs/metrics.md +++ b/codex-rs/docs/metrics.md @@ -70,7 +70,6 @@ If you set `metrics: Some(MetricsConfig)` on `OtelSettings` and build an - `with_api_key_header(header)` to set the API key header name. - `with_tag(key, value)` to add default tags for every metric. - `with_timeout(duration)` to set the HTTP request timeout. -- `with_export_interval(duration)` to tweak the in-memory exporter interval in tests. - `with_user_agent(agent)` to override the HTTP `User-Agent` header. 
The queue capacity is fixed at 1024 entries. diff --git a/codex-rs/otel/README.md b/codex-rs/otel/README.md index a02663ca9e4..1150c4ff85e 100644 --- a/codex-rs/otel/README.md +++ b/codex-rs/otel/README.md @@ -4,7 +4,7 @@ - Trace/log exporters and tracing subscriber layers (`codex_otel::traces::otel_provider`). - A structured event helper (`codex_otel::OtelManager`). -- A Statsig OTLP metrics client (`codex_otel::metrics`). +- A Statsig `log_event` metrics client (`codex_otel::metrics`). - A metrics facade on `OtelManager` so tracing + metrics share metadata. ## Tracing and logs diff --git a/codex-rs/otel/src/metrics/client.rs b/codex-rs/otel/src/metrics/client.rs index 1be9d39940e..37ebf26c431 100644 --- a/codex-rs/otel/src/metrics/client.rs +++ b/codex-rs/otel/src/metrics/client.rs @@ -55,7 +55,6 @@ enum MetricEvent { enum WorkerMessage { Event(MetricEvent), - Shutdown, } struct WorkerState { @@ -272,21 +271,8 @@ impl MetricsClient { }; let mut joined = false; - if let Some(sender) = sender { - match sender.try_send(WorkerMessage::Shutdown) { - Ok(()) | Err(TrySendError::Closed(_)) => {} - Err(TrySendError::Full(_)) => { - if tokio::runtime::Handle::try_current().is_ok() { - let sender = sender.clone(); - let _ = - thread::spawn(move || sender.blocking_send(WorkerMessage::Shutdown)) - .join(); - } else { - let _ = sender.blocking_send(WorkerMessage::Shutdown); - } - } - } - } + // Dropping the sender closes the channel; the worker drains pending events and exits. 
+ drop(sender); if timeout.is_zero() { if handle.is_finished() { @@ -332,7 +318,7 @@ fn build_worker_exporter(config: &MetricsConfig) -> Result { match &config.exporter { MetricsExporter::StatsigHttp => Ok(WorkerExporter::Statsig(StatsigExporter::from(config)?)), MetricsExporter::InMemory(exporter) => Ok(WorkerExporter::InMemory( - InMemoryExporter::from(config, exporter.clone()), + InMemoryExporter::from(config.default_tags.clone(), exporter.clone()), )), } } @@ -363,19 +349,12 @@ impl MetricsWorker { } async fn run(mut self, mut receiver: mpsc::Receiver) { - let mut received_shutdown = false; while let Some(message) = receiver.recv().await { match message { WorkerMessage::Event(event) => self.export_event(event).await, - WorkerMessage::Shutdown => { - received_shutdown = true; - break; - } } } - if received_shutdown || matches!(&self.exporter, WorkerExporter::InMemory(_)) { - self.shutdown().await; - } + self.shutdown().await; } async fn export_event(&mut self, event: MetricEvent) { @@ -413,15 +392,13 @@ struct InMemoryExporter { impl InMemoryExporter { fn from( - config: &MetricsConfig, + default_tags: BTreeMap, exporter: opentelemetry_sdk::metrics::InMemoryMetricExporter, ) -> Self { - let reader = PeriodicReader::builder(exporter) - .with_interval(config.export_interval) - .build(); + let reader = PeriodicReader::builder(exporter).build(); let meter_provider = SdkMeterProvider::builder().with_reader(reader).build(); let meter = meter_provider.meter(METER_NAME); - let recorder = MetricRecorder::new(meter, config.default_tags.clone()); + let recorder = MetricRecorder::new(meter, default_tags); Self { recorder, meter_provider, diff --git a/codex-rs/otel/src/metrics/config.rs b/codex-rs/otel/src/metrics/config.rs index d4f1ebbc12f..8e975ea7c3c 100644 --- a/codex-rs/otel/src/metrics/config.rs +++ b/codex-rs/otel/src/metrics/config.rs @@ -1,6 +1,5 @@ use crate::metrics::DEFAULT_API_KEY; use crate::metrics::DEFAULT_API_KEY_HEADER; -use 
crate::metrics::DEFAULT_EXPORT_INTERVAL; use crate::metrics::DEFAULT_STATSIG_ENDPOINT; use crate::metrics::DEFAULT_TIMEOUT; use crate::metrics::error::Result; @@ -22,7 +21,6 @@ pub struct MetricsConfig { pub(crate) api_key_header: String, pub(crate) default_tags: BTreeMap, pub(crate) timeout: Duration, - pub(crate) export_interval: Duration, pub(crate) user_agent: String, pub(crate) exporter: MetricsExporter, } @@ -36,7 +34,6 @@ impl MetricsConfig { api_key_header: DEFAULT_API_KEY_HEADER.to_string(), default_tags: BTreeMap::new(), timeout: DEFAULT_TIMEOUT, - export_interval: DEFAULT_EXPORT_INTERVAL, user_agent: format!("codex-otel-metrics/{}", env!("CARGO_PKG_VERSION")), exporter: MetricsExporter::StatsigHttp, } @@ -70,12 +67,6 @@ impl MetricsConfig { self } - /// Override the export interval used by the in-memory exporter (tests). - pub fn with_export_interval(mut self, interval: Duration) -> Self { - self.export_interval = interval; - self - } - /// Override the HTTP user agent header. 
pub fn with_user_agent(mut self, user_agent: impl Into) -> Self { self.user_agent = user_agent.into(); @@ -92,10 +83,12 @@ impl MetricsConfig { pub(crate) fn exporter_label(&self) -> String { match &self.exporter { - MetricsExporter::StatsigHttp => format!( - "statsig_http endpoint={} interval={:?} timeout={:?}", - self.endpoint, self.export_interval, self.timeout - ), + MetricsExporter::StatsigHttp => { + format!( + "statsig_http endpoint={} timeout={:?}", + self.endpoint, self.timeout + ) + } MetricsExporter::InMemory(_) => "in_memory".to_string(), } } diff --git a/codex-rs/otel/src/metrics/mod.rs b/codex-rs/otel/src/metrics/mod.rs index 61996b00f83..9ec0ad5c310 100644 --- a/codex-rs/otel/src/metrics/mod.rs +++ b/codex-rs/otel/src/metrics/mod.rs @@ -13,7 +13,6 @@ pub(crate) const DEFAULT_OTLP_ENDPOINT: &str = "" pub(crate) const DEFAULT_API_KEY_HEADER: &str = "statsig-api-key"; pub(crate) const DEFAULT_API_KEY: &str = ""; pub(crate) const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10); -pub(crate) const DEFAULT_EXPORT_INTERVAL: Duration = Duration::from_secs(10); pub(crate) const DEFAULT_QUEUE_CAPACITY: usize = 1024; pub(crate) const DEFAULT_SHUTDOWN_TIMEOUT: Duration = Duration::from_millis(500); pub(crate) const SHUTDOWN_POLL_INTERVAL: Duration = Duration::from_millis(10); From 6bcbe8e9e96b2d58f911c0d48604b949b6358ba2 Mon Sep 17 00:00:00 2001 From: jif-oai Date: Mon, 22 Dec 2025 12:46:03 +0100 Subject: [PATCH 16/43] Batch events in worker --- codex-rs/otel/src/metrics/client.rs | 73 +++++++++++++++++++++++++---- 1 file changed, 63 insertions(+), 10 deletions(-) diff --git a/codex-rs/otel/src/metrics/client.rs b/codex-rs/otel/src/metrics/client.rs index 37ebf26c431..8560fcb8e0b 100644 --- a/codex-rs/otel/src/metrics/client.rs +++ b/codex-rs/otel/src/metrics/client.rs @@ -33,11 +33,14 @@ use std::time::Duration; use std::time::Instant; use tokio::runtime::Runtime; use tokio::sync::mpsc; +use tokio::sync::mpsc::error::TryRecvError; use 
tokio::sync::mpsc::error::TrySendError; const METER_NAME: &str = "codex-otel-metrics"; const STATSIG_USER_ID: &str = "codex-metrics"; const STATSIG_SDK_TYPE: &str = "codex-otel-rust"; +const STATSIG_MAX_BATCH_EVENTS: usize = 50; +const STATSIG_BATCH_WINDOW: Duration = Duration::from_millis(1000); #[derive(Clone, Debug)] enum MetricEvent { @@ -351,16 +354,19 @@ impl MetricsWorker { async fn run(mut self, mut receiver: mpsc::Receiver) { while let Some(message) = receiver.recv().await { match message { - WorkerMessage::Event(event) => self.export_event(event).await, + WorkerMessage::Event(event) => { + let events = Self::collect_batch(event, &mut receiver).await; + self.export_batch(events).await; + } } } self.shutdown().await; } - async fn export_event(&mut self, event: MetricEvent) { + async fn export_batch(&mut self, events: Vec) { match &mut self.exporter { WorkerExporter::Statsig(exporter) => { - if let Err(err) = exporter.export_event(event).await { + if let Err(err) = exporter.export_events(events).await { error_or_panic(format!( "statsig metrics export failed: {err} (exporter={})", self.exporter_label @@ -368,11 +374,49 @@ impl MetricsWorker { } } WorkerExporter::InMemory(exporter) => { - exporter.export_event(event, &self.exporter_label).await; + exporter.export_events(events, &self.exporter_label).await; } } } + async fn collect_batch( + first: MetricEvent, + receiver: &mut mpsc::Receiver, + ) -> Vec { + let mut events = Vec::with_capacity(1); + events.push(first); + + // Fast-path: drain anything already enqueued. + while events.len() < STATSIG_MAX_BATCH_EVENTS { + match receiver.try_recv() { + Ok(WorkerMessage::Event(event)) => events.push(event), + Err(TryRecvError::Empty) => break, + Err(TryRecvError::Disconnected) => return events, + } + } + + if events.len() >= STATSIG_MAX_BATCH_EVENTS { + return events; + } + + // Small coalescing window to catch near-simultaneous metrics without blocking callers. 
+ let deadline = Instant::now() + STATSIG_BATCH_WINDOW; + while events.len() < STATSIG_MAX_BATCH_EVENTS { + let remaining = deadline.saturating_duration_since(Instant::now()); + if remaining.is_zero() { + break; + } + + match tokio::time::timeout(remaining, receiver.recv()).await { + Ok(Some(WorkerMessage::Event(event))) => events.push(event), + Ok(None) => break, + Err(_) => break, + } + } + + events + } + async fn shutdown(&mut self) { if let WorkerExporter::InMemory(exporter) = &mut self.exporter { exporter.shutdown(&self.exporter_label).await; @@ -405,8 +449,10 @@ impl InMemoryExporter { } } - async fn export_event(&mut self, event: MetricEvent, exporter_label: &str) { - self.recorder.record_event(event); + async fn export_events(&mut self, events: Vec, exporter_label: &str) { + for event in events { + self.recorder.record_event(event); + } if let Err(err) = self.meter_provider.force_flush() { error_or_panic(format!( "metrics flush failed: {err} (exporter={exporter_label})" @@ -477,8 +523,12 @@ impl StatsigExporter { }) } - async fn export_event(&self, event: MetricEvent) -> Result<()> { - let payload = self.build_payload(event); + async fn export_events(&self, events: Vec) -> Result<()> { + if events.is_empty() { + return Ok(()); + } + + let payload = self.build_payload(events); let mut request = self .client @@ -506,9 +556,12 @@ impl StatsigExporter { Ok(()) } - fn build_payload(&self, event: MetricEvent) -> StatsigPayload { + fn build_payload(&self, events: Vec) -> StatsigPayload { let timestamp = Utc::now().timestamp_millis(); - let events = vec![self.event_from_metric(event, timestamp)]; + let events = events + .into_iter() + .map(|event| self.event_from_metric(event, timestamp)) + .collect(); StatsigPayload { events, From cea563e92992166a24380784baf31d5cace3e331 Mon Sep 17 00:00:00 2001 From: jif-oai Date: Mon, 22 Dec 2025 13:23:20 +0100 Subject: [PATCH 17/43] flatten metrics --- codex-rs/otel/src/metrics/client.rs | 3 +- 
codex-rs/otel/src/metrics/tests.rs | 69 ++++++++++++++--------------- codex-rs/otel/tests/suite/send.rs | 7 +-- codex-rs/otel/tests/suite/timing.rs | 14 +++--- 4 files changed, 47 insertions(+), 46 deletions(-) diff --git a/codex-rs/otel/src/metrics/client.rs b/codex-rs/otel/src/metrics/client.rs index 8560fcb8e0b..d9e55d370eb 100644 --- a/codex-rs/otel/src/metrics/client.rs +++ b/codex-rs/otel/src/metrics/client.rs @@ -155,7 +155,7 @@ impl MetricsClient { validate_tags(&config.default_tags)?; let exporter_label = config.exporter_label(); - let worker_exporter_label = exporter_label.clone(); + let worker_exporter_label = exporter_label; let exporter = build_worker_exporter(&config)?; let runtime = build_runtime()?; @@ -623,6 +623,7 @@ struct StatsigEvent { struct StatsigEventMetadata { #[serde(rename = "metric_type")] metric_type: String, + #[serde(flatten)] tags: BTreeMap, } diff --git a/codex-rs/otel/src/metrics/tests.rs b/codex-rs/otel/src/metrics/tests.rs index 72c16c6932b..8de1aaa0916 100644 --- a/codex-rs/otel/src/metrics/tests.rs +++ b/codex-rs/otel/src/metrics/tests.rs @@ -81,7 +81,7 @@ async fn statsig_http_exporter_sends_events() -> Result<()> { .and(header("statsig-api-key", "test-key")) .and(header("user-agent", "codex-test-agent")) .respond_with(ResponseTemplate::new(200)) - .expect(2) + .expect(1) .mount(&server) .await; @@ -97,32 +97,31 @@ async fn statsig_http_exporter_sends_events() -> Result<()> { metrics.shutdown()?; let requests = server.received_requests().await.unwrap(); - assert_eq!(requests.len(), 2); + assert_eq!(requests.len(), 1); + + let body: Value = serde_json::from_slice(&requests[0].body).unwrap(); + let events = body + .get("events") + .and_then(Value::as_array) + .cloned() + .unwrap_or_default(); + assert_eq!(events.len(), 2); + + let statsig_metadata = body + .get("statsigMetadata") + .and_then(Value::as_object) + .expect("statsig metadata missing"); + assert_eq!( + statsig_metadata.get("sdkType").and_then(Value::as_str), + 
Some("codex-otel-rust") + ); + assert_eq!( + statsig_metadata.get("sdkVersion").and_then(Value::as_str), + Some(env!("CARGO_PKG_VERSION")) + ); let mut events_by_name = BTreeMap::new(); - for request in &requests { - let body: Value = serde_json::from_slice(&request.body).unwrap(); - let events = body - .get("events") - .and_then(Value::as_array) - .cloned() - .unwrap_or_default(); - assert_eq!(events.len(), 1); - - let statsig_metadata = body - .get("statsigMetadata") - .and_then(Value::as_object) - .expect("statsig metadata missing"); - assert_eq!( - statsig_metadata.get("sdkType").and_then(Value::as_str), - Some("codex-otel-rust") - ); - assert_eq!( - statsig_metadata.get("sdkVersion").and_then(Value::as_str), - Some(env!("CARGO_PKG_VERSION")) - ); - - let event = events[0].clone(); + for event in events { let name = event .get("eventName") .and_then(Value::as_str) @@ -135,31 +134,29 @@ async fn statsig_http_exporter_sends_events() -> Result<()> { .get("codex.turns") .expect("counter event missing"); assert_eq!(counter.get("value").and_then(Value::as_f64), Some(1.0)); - let counter_tags = counter - .get("metadata") - .and_then(|value| value.get("tags")) - .expect("counter tags missing"); - let expected_counter_tags = BTreeMap::from([ + let counter_metadata = counter.get("metadata").expect("counter metadata missing"); + let expected_counter_metadata = BTreeMap::from([ + ("metric_type".to_string(), "counter".to_string()), ("service".to_string(), "codex-cli".to_string()), ("env".to_string(), "prod".to_string()), ("model".to_string(), "gpt-5.1".to_string()), ]); - assert_eq!(json_tags(counter_tags), expected_counter_tags); + assert_eq!(json_tags(counter_metadata), expected_counter_metadata); let histogram = events_by_name .get("codex.tool_latency") .expect("histogram event missing"); assert_eq!(histogram.get("value").and_then(Value::as_f64), Some(25.0)); - let histogram_tags = histogram + let histogram_metadata = histogram .get("metadata") - .and_then(|value| 
value.get("tags")) - .expect("histogram tags missing"); - let expected_histogram_tags = BTreeMap::from([ + .expect("histogram metadata missing"); + let expected_histogram_metadata = BTreeMap::from([ + ("metric_type".to_string(), "histogram".to_string()), ("service".to_string(), "codex-cli".to_string()), ("env".to_string(), "prod".to_string()), ("tool".to_string(), "shell".to_string()), ]); - assert_eq!(json_tags(histogram_tags), expected_histogram_tags); + assert_eq!(json_tags(histogram_metadata), expected_histogram_metadata); Ok(()) } diff --git a/codex-rs/otel/tests/suite/send.rs b/codex-rs/otel/tests/suite/send.rs index f5899c821dd..4ae0f44b584 100644 --- a/codex-rs/otel/tests/suite/send.rs +++ b/codex-rs/otel/tests/suite/send.rs @@ -51,9 +51,10 @@ fn send_builds_payload_with_tags_and_histograms() -> Result<()> { match find_metric(&resource_metrics, "codex.tool_latency").and_then(|metric| { match metric.data() { opentelemetry_sdk::metrics::data::AggregatedMetrics::F64(data) => match data { - opentelemetry_sdk::metrics::data::MetricData::Histogram(histogram) => { - histogram.data_points().next().map(|p| p.attributes()) - } + opentelemetry_sdk::metrics::data::MetricData::Histogram(histogram) => histogram + .data_points() + .next() + .map(opentelemetry_sdk::metrics::data::HistogramDataPoint::attributes), _ => None, }, _ => None, diff --git a/codex-rs/otel/tests/suite/timing.rs b/codex-rs/otel/tests/suite/timing.rs index f1315df2695..2ffd23f64d4 100644 --- a/codex-rs/otel/tests/suite/timing.rs +++ b/codex-rs/otel/tests/suite/timing.rs @@ -48,9 +48,10 @@ fn time_result_records_success() -> Result<()> { match crate::harness::find_metric(&resource_metrics, "codex.request_latency").and_then( |metric| match metric.data() { opentelemetry_sdk::metrics::data::AggregatedMetrics::F64(data) => match data { - opentelemetry_sdk::metrics::data::MetricData::Histogram(histogram) => { - histogram.data_points().next().map(|p| p.attributes()) - } + 
opentelemetry_sdk::metrics::data::MetricData::Histogram(histogram) => histogram + .data_points() + .next() + .map(opentelemetry_sdk::metrics::data::HistogramDataPoint::attributes), _ => None, }, _ => None, @@ -90,9 +91,10 @@ fn time_result_records_on_error() -> Result<()> { match crate::harness::find_metric(&resource_metrics, "codex.request_latency").and_then( |metric| match metric.data() { opentelemetry_sdk::metrics::data::AggregatedMetrics::F64(data) => match data { - opentelemetry_sdk::metrics::data::MetricData::Histogram(histogram) => { - histogram.data_points().next().map(|p| p.attributes()) - } + opentelemetry_sdk::metrics::data::MetricData::Histogram(histogram) => histogram + .data_points() + .next() + .map(opentelemetry_sdk::metrics::data::HistogramDataPoint::attributes), _ => None, }, _ => None, From ddff652f68cda546df088a4a905105f8697babf5 Mon Sep 17 00:00:00 2001 From: jif-oai Date: Mon, 22 Dec 2025 13:43:03 +0100 Subject: [PATCH 18/43] Basic emission --- codex-rs/core/src/codex.rs | 3 +- codex-rs/core/src/features.rs | 18 + codex-rs/otel/src/metrics/mod.rs | 12 +- codex-rs/otel/src/metrics/tests.rs | 447 ------------------- codex-rs/otel/tests/suite/manager_metrics.rs | 106 +++++ codex-rs/otel/tests/suite/mod.rs | 1 + codex-rs/otel/tests/suite/validation.rs | 4 +- 7 files changed, 132 insertions(+), 459 deletions(-) delete mode 100644 codex-rs/otel/src/metrics/tests.rs create mode 100644 codex-rs/otel/tests/suite/manager_metrics.rs diff --git a/codex-rs/core/src/codex.rs b/codex-rs/core/src/codex.rs index 62cfcefddb2..2ef306cc41a 100644 --- a/codex-rs/core/src/codex.rs +++ b/codex-rs/core/src/codex.rs @@ -639,6 +639,7 @@ impl Session { session_configuration.session_source.clone(), ) .with_metrics(MetricsClient::new(MetricsConfig::default())?); + config.features.emit_metrics(&otel_manager); otel_manager.conversation_starts( config.model_provider.name.as_str(), @@ -652,8 +653,6 @@ impl Session { config.active_profile.clone(), ); - 
otel_manager.counter("jif_test_1", 2, &[("value", "k_jif")])?; - let mut default_shell = shell::default_user_shell(); // Create the mutable state for the Session. if config.features.enabled(Feature::ShellSnapshot) { diff --git a/codex-rs/core/src/features.rs b/codex-rs/core/src/features.rs index 83bf2294957..7911d1296a0 100644 --- a/codex-rs/core/src/features.rs +++ b/codex-rs/core/src/features.rs @@ -7,6 +7,7 @@ use crate::config::ConfigToml; use crate::config::profile::ConfigProfile; +use codex_otel::OtelManager; use serde::Deserialize; use std::collections::BTreeMap; use std::collections::BTreeSet; @@ -195,6 +196,23 @@ impl Features { .map(|usage| (usage.alias.as_str(), usage.feature)) } + pub fn emit_metrics(&self, otel: &OtelManager) { + for feature in FEATURES { + if self.enabled(feature.id) != feature.default_enabled { + if let Err(e) = otel.counter( + "feature.state", + 1, + &[ + ("feature", feature.key), + ("value", &self.enabled(feature.id).to_string()), + ], + ) { + tracing::warn!("Error while emitting feature metrics {e:?}"); + } + } + } + } + /// Apply a table of key -> bool toggles (e.g. from TOML). pub fn apply_map(&mut self, m: &BTreeMap) { for (k, v) in m { diff --git a/codex-rs/otel/src/metrics/mod.rs b/codex-rs/otel/src/metrics/mod.rs index 9ec0ad5c310..1ff5b0393af 100644 --- a/codex-rs/otel/src/metrics/mod.rs +++ b/codex-rs/otel/src/metrics/mod.rs @@ -1,4 +1,3 @@ -mod batch; mod client; mod config; mod error; @@ -9,20 +8,17 @@ pub(crate) mod validation; use std::time::Duration; -pub(crate) const DEFAULT_OTLP_ENDPOINT: &str = ""; +// Publicly available API key for codex local project. 
+pub(crate) const DEFAULT_API_KEY: &str = "client-MkRuleRQBd6qakfnDYqJVR9JuXcY57Ljly3vi5JVUIO"; +pub(crate) const DEFAULT_STATSIG_ENDPOINT: &str = "https://ab.chatgpt.com/v1/log_event"; pub(crate) const DEFAULT_API_KEY_HEADER: &str = "statsig-api-key"; -pub(crate) const DEFAULT_API_KEY: &str = ""; + pub(crate) const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10); pub(crate) const DEFAULT_QUEUE_CAPACITY: usize = 1024; pub(crate) const DEFAULT_SHUTDOWN_TIMEOUT: Duration = Duration::from_millis(500); pub(crate) const SHUTDOWN_POLL_INTERVAL: Duration = Duration::from_millis(10); -pub use crate::metrics::batch::HistogramBuckets; -pub use crate::metrics::batch::MetricsBatch; pub use crate::metrics::client::MetricsClient; pub use crate::metrics::config::MetricsConfig; pub use crate::metrics::error::MetricsError; pub use crate::metrics::error::Result; - -#[cfg(test)] -mod tests; diff --git a/codex-rs/otel/src/metrics/tests.rs b/codex-rs/otel/src/metrics/tests.rs deleted file mode 100644 index 8de1aaa0916..00000000000 --- a/codex-rs/otel/src/metrics/tests.rs +++ /dev/null @@ -1,447 +0,0 @@ -use super::MetricsClient; -use super::MetricsConfig; -use super::MetricsError; -use super::Result; -use opentelemetry::KeyValue; -use opentelemetry_sdk::metrics::InMemoryMetricExporter; -use opentelemetry_sdk::metrics::data::AggregatedMetrics; -use opentelemetry_sdk::metrics::data::Metric; -use opentelemetry_sdk::metrics::data::MetricData; -use opentelemetry_sdk::metrics::data::ResourceMetrics; -use pretty_assertions::assert_eq; -use serde_json::Value; -use std::collections::BTreeMap; -use std::time::Duration; -use wiremock::Mock; -use wiremock::MockServer; -use wiremock::ResponseTemplate; -use wiremock::matchers::header; -use wiremock::matchers::method; -use wiremock::matchers::path; - -fn build_test_client() -> Result<(MetricsClient, InMemoryMetricExporter)> { - let exporter = InMemoryMetricExporter::default(); - let config = MetricsConfig::new("test-key") - .with_tag("service", 
"codex-cli")? - .with_tag("env", "prod")? - .with_in_memory_exporter(exporter.clone()); - let metrics = MetricsClient::new(config)?; - Ok((metrics, exporter)) -} - -fn latest_metrics(exporter: &InMemoryMetricExporter) -> ResourceMetrics { - let Ok(metrics) = exporter.get_finished_metrics() else { - panic!("finished metrics error"); - }; - let Some(metrics) = metrics.into_iter().last() else { - panic!("metrics export missing"); - }; - metrics -} - -fn find_metric<'a>(resource_metrics: &'a ResourceMetrics, name: &str) -> Option<&'a Metric> { - for scope_metrics in resource_metrics.scope_metrics() { - for metric in scope_metrics.metrics() { - if metric.name() == name { - return Some(metric); - } - } - } - None -} - -fn attributes_to_map<'a>( - attributes: impl Iterator, -) -> BTreeMap { - attributes - .map(|kv| (kv.key.as_str().to_string(), kv.value.as_str().to_string())) - .collect() -} - -fn json_tags(value: &Value) -> BTreeMap { - value - .as_object() - .expect("tags should be an object") - .iter() - .map(|(key, value)| { - let value = value - .as_str() - .unwrap_or_else(|| panic!("tag {key} should be a string")); - (key.clone(), value.to_string()) - }) - .collect() -} - -#[tokio::test] -// Sends metrics to a Statsig endpoint with merged tags and metadata. -async fn statsig_http_exporter_sends_events() -> Result<()> { - let server = MockServer::start().await; - let _mock = Mock::given(method("POST")) - .and(path("/v1/log_event")) - .and(header("statsig-api-key", "test-key")) - .and(header("user-agent", "codex-test-agent")) - .respond_with(ResponseTemplate::new(200)) - .expect(1) - .mount(&server) - .await; - - let config = MetricsConfig::new("test-key") - .with_endpoint(format!("{}/v1/log_event", server.uri())) - .with_user_agent("codex-test-agent") - .with_tag("service", "codex-cli")? 
- .with_tag("env", "prod")?; - let metrics = MetricsClient::new(config)?; - - metrics.counter("codex.turns", 1, &[("model", "gpt-5.1")])?; - metrics.histogram("codex.tool_latency", 25, &[("tool", "shell")])?; - metrics.shutdown()?; - - let requests = server.received_requests().await.unwrap(); - assert_eq!(requests.len(), 1); - - let body: Value = serde_json::from_slice(&requests[0].body).unwrap(); - let events = body - .get("events") - .and_then(Value::as_array) - .cloned() - .unwrap_or_default(); - assert_eq!(events.len(), 2); - - let statsig_metadata = body - .get("statsigMetadata") - .and_then(Value::as_object) - .expect("statsig metadata missing"); - assert_eq!( - statsig_metadata.get("sdkType").and_then(Value::as_str), - Some("codex-otel-rust") - ); - assert_eq!( - statsig_metadata.get("sdkVersion").and_then(Value::as_str), - Some(env!("CARGO_PKG_VERSION")) - ); - - let mut events_by_name = BTreeMap::new(); - for event in events { - let name = event - .get("eventName") - .and_then(Value::as_str) - .unwrap_or_default() - .to_string(); - events_by_name.insert(name, event); - } - - let counter = events_by_name - .get("codex.turns") - .expect("counter event missing"); - assert_eq!(counter.get("value").and_then(Value::as_f64), Some(1.0)); - let counter_metadata = counter.get("metadata").expect("counter metadata missing"); - let expected_counter_metadata = BTreeMap::from([ - ("metric_type".to_string(), "counter".to_string()), - ("service".to_string(), "codex-cli".to_string()), - ("env".to_string(), "prod".to_string()), - ("model".to_string(), "gpt-5.1".to_string()), - ]); - assert_eq!(json_tags(counter_metadata), expected_counter_metadata); - - let histogram = events_by_name - .get("codex.tool_latency") - .expect("histogram event missing"); - assert_eq!(histogram.get("value").and_then(Value::as_f64), Some(25.0)); - let histogram_metadata = histogram - .get("metadata") - .expect("histogram metadata missing"); - let expected_histogram_metadata = BTreeMap::from([ - 
("metric_type".to_string(), "histogram".to_string()), - ("service".to_string(), "codex-cli".to_string()), - ("env".to_string(), "prod".to_string()), - ("tool".to_string(), "shell".to_string()), - ]); - assert_eq!(json_tags(histogram_metadata), expected_histogram_metadata); - - Ok(()) -} - -#[test] -// Ensures counters/histograms record with default + per-call tags. -fn send_builds_metrics_with_tags_and_histograms() -> Result<()> { - let (metrics, exporter) = build_test_client()?; - - metrics.counter("codex.turns", 1, &[("model", "gpt-5.1"), ("env", "dev")])?; - metrics.histogram("codex.tool_latency", 25, &[("tool", "shell")])?; - metrics.shutdown()?; - - let resource_metrics = latest_metrics(&exporter); - - let Some(counter_metric) = find_metric(&resource_metrics, "codex.turns") else { - panic!("counter metric missing"); - }; - let attributes = match counter_metric.data() { - AggregatedMetrics::I64(data) => match data { - MetricData::Sum(sum) => { - let points: Vec<_> = sum.data_points().collect(); - assert_eq!(points.len(), 1); - let point = points[0]; - assert_eq!(point.value(), 1); - attributes_to_map(point.attributes()) - } - _ => panic!("unexpected counter aggregation"), - }, - _ => panic!("unexpected counter data type"), - }; - - let expected_counter_attributes = BTreeMap::from([ - ("service".to_string(), "codex-cli".to_string()), - ("env".to_string(), "dev".to_string()), - ("model".to_string(), "gpt-5.1".to_string()), - ]); - assert_eq!(attributes, expected_counter_attributes); - - let Some(histogram_metric) = find_metric(&resource_metrics, "codex.tool_latency") else { - panic!("histogram metric missing"); - }; - let attributes = match histogram_metric.data() { - AggregatedMetrics::F64(data) => match data { - MetricData::Histogram(histogram) => { - let points: Vec<_> = histogram.data_points().collect(); - assert_eq!(points.len(), 1); - let point = points[0]; - assert_eq!(point.count(), 1); - assert_eq!(point.sum(), 25.0); - 
attributes_to_map(point.attributes()) - } - _ => panic!("unexpected histogram aggregation"), - }, - _ => panic!("unexpected histogram data type"), - }; - - let expected_histogram_attributes = BTreeMap::from([ - ("service".to_string(), "codex-cli".to_string()), - ("env".to_string(), "prod".to_string()), - ("tool".to_string(), "shell".to_string()), - ]); - assert_eq!(attributes, expected_histogram_attributes); - - Ok(()) -} - -#[test] -// Ensures defaults merge per metric and overrides take precedence. -fn send_merges_default_tags_per_metric() -> Result<()> { - let exporter = InMemoryMetricExporter::default(); - let config = MetricsConfig::new("test-key") - .with_tag("service", "codex-cli")? - .with_tag("env", "prod")? - .with_tag("region", "us")? - .with_in_memory_exporter(exporter.clone()); - let metrics = MetricsClient::new(config)?; - - metrics.counter("codex.alpha", 1, &[("env", "dev"), ("component", "alpha")])?; - metrics.counter( - "codex.beta", - 2, - &[("service", "worker"), ("component", "beta")], - )?; - metrics.shutdown()?; - - let resource_metrics = latest_metrics(&exporter); - - let Some(alpha_metric) = find_metric(&resource_metrics, "codex.alpha") else { - panic!("alpha metric missing"); - }; - let alpha_attributes = match alpha_metric.data() { - AggregatedMetrics::I64(data) => match data { - MetricData::Sum(sum) => { - let points: Vec<_> = sum.data_points().collect(); - assert_eq!(points.len(), 1); - attributes_to_map(points[0].attributes()) - } - _ => panic!("unexpected alpha aggregation"), - }, - _ => panic!("unexpected alpha data type"), - }; - let expected_alpha_attributes = BTreeMap::from([ - ("service".to_string(), "codex-cli".to_string()), - ("env".to_string(), "dev".to_string()), - ("region".to_string(), "us".to_string()), - ("component".to_string(), "alpha".to_string()), - ]); - assert_eq!(alpha_attributes, expected_alpha_attributes); - - let Some(beta_metric) = find_metric(&resource_metrics, "codex.beta") else { - panic!("beta metric 
missing"); - }; - let beta_attributes = match beta_metric.data() { - AggregatedMetrics::I64(data) => match data { - MetricData::Sum(sum) => { - let points: Vec<_> = sum.data_points().collect(); - assert_eq!(points.len(), 1); - attributes_to_map(points[0].attributes()) - } - _ => panic!("unexpected beta aggregation"), - }, - _ => panic!("unexpected beta data type"), - }; - let expected_beta_attributes = BTreeMap::from([ - ("service".to_string(), "worker".to_string()), - ("env".to_string(), "prod".to_string()), - ("region".to_string(), "us".to_string()), - ("component".to_string(), "beta".to_string()), - ]); - assert_eq!(beta_attributes, expected_beta_attributes); - - Ok(()) -} - -#[test] -// Ensures duration recording maps to histogram output. -fn record_duration_uses_histogram() -> Result<()> { - let (metrics, exporter) = build_test_client()?; - - metrics.record_duration( - "codex.request_latency", - Duration::from_millis(15), - &[("route", "chat")], - )?; - metrics.shutdown()?; - - let resource_metrics = latest_metrics(&exporter); - let Some(metric) = find_metric(&resource_metrics, "codex.request_latency") else { - panic!("request latency histogram missing"); - }; - let attributes = match metric.data() { - AggregatedMetrics::F64(data) => match data { - MetricData::Histogram(histogram) => { - let points: Vec<_> = histogram.data_points().collect(); - assert_eq!(points.len(), 1); - let point = points[0]; - assert_eq!(point.count(), 1); - assert_eq!(point.sum(), 15.0); - attributes_to_map(point.attributes()) - } - _ => panic!("unexpected histogram aggregation"), - }, - _ => panic!("unexpected histogram data type"), - }; - - let expected_attributes = BTreeMap::from([ - ("service".to_string(), "codex-cli".to_string()), - ("env".to_string(), "prod".to_string()), - ("route".to_string(), "chat".to_string()), - ]); - assert_eq!(attributes, expected_attributes); - - Ok(()) -} - -#[test] -// Ensures time_result propagates errors but still records timing. 
-fn time_result_records_on_error() -> Result<()> { - let (metrics, exporter) = build_test_client()?; - - let Err(err) = metrics.time_result( - "codex.request_latency", - &[("route", "chat")], - || -> Result<&'static str> { Err(MetricsError::EmptyMetricName) }, - ) else { - panic!("expected error"); - }; - assert!(matches!(err, MetricsError::EmptyMetricName)); - metrics.shutdown()?; - - let resource_metrics = latest_metrics(&exporter); - let Some(metric) = find_metric(&resource_metrics, "codex.request_latency") else { - panic!("request latency histogram missing"); - }; - match metric.data() { - AggregatedMetrics::F64(data) => match data { - MetricData::Histogram(histogram) => { - let points: Vec<_> = histogram.data_points().collect(); - assert_eq!(points.len(), 1); - assert_eq!(points[0].count(), 1); - } - _ => panic!("unexpected histogram aggregation"), - }, - _ => panic!("unexpected histogram data type"), - } - - Ok(()) -} - -#[test] -// Validates invalid tag components are rejected during config build. -fn invalid_tag_component_is_rejected() -> Result<()> { - let Err(err) = MetricsConfig::default().with_tag("bad key", "value") else { - panic!("expected error"); - }; - assert!(matches!( - err, - MetricsError::InvalidTagComponent { label, value } - if label == "tag key" && value == "bad key" - )); - Ok(()) -} - -#[test] -// Ensures per-metric tag keys are validated. -fn counter_rejects_invalid_tag_key() -> Result<()> { - let (metrics, _exporter) = build_test_client()?; - let Err(err) = metrics.counter("codex.turns", 1, &[("bad key", "value")]) else { - panic!("expected error"); - }; - assert!(matches!( - err, - MetricsError::InvalidTagComponent { label, value } - if label == "tag key" && value == "bad key" - )); - metrics.shutdown()?; - Ok(()) -} - -#[test] -// Ensures per-metric tag values are validated. 
-fn histogram_rejects_invalid_tag_value() -> Result<()> { - let (metrics, _exporter) = build_test_client()?; - let Err(err) = metrics.histogram("codex.request_latency", 3, &[("route", "bad value")]) else { - panic!("expected error"); - }; - assert!(matches!( - err, - MetricsError::InvalidTagComponent { label, value } - if label == "tag value" && value == "bad value" - )); - metrics.shutdown()?; - Ok(()) -} - -#[test] -// Ensures invalid metric names are rejected. -fn counter_rejects_invalid_metric_name() -> Result<()> { - let (metrics, _exporter) = build_test_client()?; - let Err(err) = metrics.counter("bad name", 1, &[]) else { - panic!("expected error"); - }; - assert!(matches!( - err, - MetricsError::InvalidMetricName { name } if name == "bad name" - )); - metrics.shutdown()?; - Ok(()) -} - -#[test] -// Validates missing API key is rejected early. -fn empty_api_key_is_rejected() { - let Err(err) = MetricsClient::new(MetricsConfig::new("")) else { - panic!("expected error"); - }; - assert!(matches!(err, MetricsError::EmptyApiKey)); -} - -#[test] -// Validates missing endpoint is rejected early. 
-fn empty_endpoint_is_rejected() { - let Err(err) = MetricsClient::new(MetricsConfig::new("test").with_endpoint("")) else { - panic!("expected error"); - }; - assert!(matches!(err, MetricsError::EmptyEndpoint)); -} diff --git a/codex-rs/otel/tests/suite/manager_metrics.rs b/codex-rs/otel/tests/suite/manager_metrics.rs new file mode 100644 index 00000000000..560f4097ac9 --- /dev/null +++ b/codex-rs/otel/tests/suite/manager_metrics.rs @@ -0,0 +1,106 @@ +use crate::harness::attributes_to_map; +use crate::harness::build_metrics_with_defaults; +use crate::harness::find_metric; +use crate::harness::latest_metrics; +use codex_app_server_protocol::AuthMode; +use codex_otel::OtelManager; +use codex_otel::metrics::Result; +use codex_protocol::ConversationId; +use codex_protocol::protocol::SessionSource; +use opentelemetry_sdk::metrics::data::AggregatedMetrics; +use opentelemetry_sdk::metrics::data::MetricData; +use pretty_assertions::assert_eq; +use std::collections::BTreeMap; + +// Ensures OtelManager attaches metadata tags when forwarding metrics. 
+#[test] +fn manager_attaches_metadata_tags_to_metrics() -> Result<()> { + let (metrics, exporter) = build_metrics_with_defaults(&[("service", "codex-cli")])?; + let manager = OtelManager::new( + ConversationId::new(), + "gpt-5.1", + "gpt-5.1", + Some("account-id".to_string()), + None, + Some(AuthMode::ApiKey), + true, + "tty".to_string(), + SessionSource::Cli, + ) + .with_metrics(metrics); + + manager.counter("codex.session_started", 1, &[("source", "tui")])?; + manager.shutdown_metrics()?; + + let resource_metrics = latest_metrics(&exporter); + let metric = + find_metric(&resource_metrics, "codex.session_started").expect("counter metric missing"); + let attrs = match metric.data() { + AggregatedMetrics::I64(data) => match data { + MetricData::Sum(sum) => { + let points: Vec<_> = sum.data_points().collect(); + assert_eq!(points.len(), 1); + attributes_to_map(points[0].attributes()) + } + _ => panic!("unexpected counter aggregation"), + }, + _ => panic!("unexpected counter data type"), + }; + + let expected = BTreeMap::from([ + ( + "app.version".to_string(), + env!("CARGO_PKG_VERSION").to_string(), + ), + ("auth_mode".to_string(), AuthMode::ApiKey.to_string()), + ("model".to_string(), "gpt-5.1".to_string()), + ("service".to_string(), "codex-cli".to_string()), + ("slug".to_string(), "gpt-5.1".to_string()), + ("source".to_string(), "tui".to_string()), + ("terminal.type".to_string(), "tty".to_string()), + ]); + assert_eq!(attrs, expected); + + Ok(()) +} + +// Ensures metadata tagging can be disabled when recording via OtelManager. 
+#[test] +fn manager_allows_disabling_metadata_tags() -> Result<()> { + let (metrics, exporter) = build_metrics_with_defaults(&[])?; + let manager = OtelManager::new( + ConversationId::new(), + "gpt-4o", + "gpt-4o", + Some("account-id".to_string()), + None, + Some(AuthMode::ApiKey), + true, + "tty".to_string(), + SessionSource::Cli, + ) + .with_metrics_without_metadata_tags(metrics); + + manager.counter("codex.session_started", 1, &[("source", "tui")])?; + manager.shutdown_metrics()?; + + let resource_metrics = latest_metrics(&exporter); + let metric = + find_metric(&resource_metrics, "codex.session_started").expect("counter metric missing"); + let attrs = match metric.data() { + AggregatedMetrics::I64(data) => match data { + MetricData::Sum(sum) => { + let points: Vec<_> = sum.data_points().collect(); + assert_eq!(points.len(), 1); + attributes_to_map(points[0].attributes()) + } + _ => panic!("unexpected counter aggregation"), + }, + _ => panic!("unexpected counter data type"), + }; + + let expected = BTreeMap::from([("source".to_string(), "tui".to_string())]); + assert_eq!(attrs, expected); + + Ok(()) +} diff --git a/codex-rs/otel/tests/suite/mod.rs b/codex-rs/otel/tests/suite/mod.rs index 42708df7981..46b8ba57c86 100644 --- a/codex-rs/otel/tests/suite/mod.rs +++ b/codex-rs/otel/tests/suite/mod.rs @@ -1,3 +1,4 @@ +mod manager_metrics; mod send; mod timing; mod validation; diff --git a/codex-rs/otel/tests/suite/validation.rs b/codex-rs/otel/tests/suite/validation.rs index 09ef521aee9..70aa28ac5a1 100644 --- a/codex-rs/otel/tests/suite/validation.rs +++ b/codex-rs/otel/tests/suite/validation.rs @@ -10,9 +10,9 @@ fn build_in_memory_client() -> Result { MetricsClient::new(config) } -// Validates invalid DSNs are rejected early. +// Validates missing API key is rejected early. 
#[test] -fn invalid_dsn_reports_error() -> Result<()> { +fn empty_api_key_is_rejected() -> Result<()> { assert!(matches!( MetricsClient::new(MetricsConfig::new("")), Err(MetricsError::EmptyApiKey) From 17e16f37ef92dbe6e5ee377f75e41ff1cf73b081 Mon Sep 17 00:00:00 2001 From: jif-oai Date: Mon, 22 Dec 2025 13:45:41 +0100 Subject: [PATCH 19/43] Drop warning --- codex-rs/otel/tests/suite/send.rs | 13 ++++++------- codex-rs/otel/tests/suite/timing.rs | 26 ++++++++++++-------------- 2 files changed, 18 insertions(+), 21 deletions(-) diff --git a/codex-rs/otel/tests/suite/send.rs b/codex-rs/otel/tests/suite/send.rs index 4ae0f44b584..ce5fcf9384b 100644 --- a/codex-rs/otel/tests/suite/send.rs +++ b/codex-rs/otel/tests/suite/send.rs @@ -50,13 +50,12 @@ fn send_builds_payload_with_tags_and_histograms() -> Result<()> { let histogram_attrs = attributes_to_map( match find_metric(&resource_metrics, "codex.tool_latency").and_then(|metric| { match metric.data() { - opentelemetry_sdk::metrics::data::AggregatedMetrics::F64(data) => match data { - opentelemetry_sdk::metrics::data::MetricData::Histogram(histogram) => histogram - .data_points() - .next() - .map(opentelemetry_sdk::metrics::data::HistogramDataPoint::attributes), - _ => None, - }, + opentelemetry_sdk::metrics::data::AggregatedMetrics::F64( + opentelemetry_sdk::metrics::data::MetricData::Histogram(histogram), + ) => histogram + .data_points() + .next() + .map(opentelemetry_sdk::metrics::data::HistogramDataPoint::attributes), _ => None, } }) { diff --git a/codex-rs/otel/tests/suite/timing.rs b/codex-rs/otel/tests/suite/timing.rs index 2ffd23f64d4..cd7687ce9ed 100644 --- a/codex-rs/otel/tests/suite/timing.rs +++ b/codex-rs/otel/tests/suite/timing.rs @@ -47,13 +47,12 @@ fn time_result_records_success() -> Result<()> { let attrs = attributes_to_map( match crate::harness::find_metric(&resource_metrics, "codex.request_latency").and_then( |metric| match metric.data() { - 
opentelemetry_sdk::metrics::data::AggregatedMetrics::F64(data) => match data { - opentelemetry_sdk::metrics::data::MetricData::Histogram(histogram) => histogram - .data_points() - .next() - .map(opentelemetry_sdk::metrics::data::HistogramDataPoint::attributes), - _ => None, - }, + opentelemetry_sdk::metrics::data::AggregatedMetrics::F64( + opentelemetry_sdk::metrics::data::MetricData::Histogram(histogram), + ) => histogram + .data_points() + .next() + .map(opentelemetry_sdk::metrics::data::HistogramDataPoint::attributes), _ => None, }, ) { @@ -90,13 +89,12 @@ fn time_result_records_on_error() -> Result<()> { let attrs = attributes_to_map( match crate::harness::find_metric(&resource_metrics, "codex.request_latency").and_then( |metric| match metric.data() { - opentelemetry_sdk::metrics::data::AggregatedMetrics::F64(data) => match data { - opentelemetry_sdk::metrics::data::MetricData::Histogram(histogram) => histogram - .data_points() - .next() - .map(opentelemetry_sdk::metrics::data::HistogramDataPoint::attributes), - _ => None, - }, + opentelemetry_sdk::metrics::data::AggregatedMetrics::F64( + opentelemetry_sdk::metrics::data::MetricData::Histogram(histogram), + ) => histogram + .data_points() + .next() + .map(opentelemetry_sdk::metrics::data::HistogramDataPoint::attributes), _ => None, }, ) { From b9d174e11953f0e7c11621147038ae37f6b3f64e Mon Sep 17 00:00:00 2001 From: jif-oai Date: Mon, 22 Dec 2025 13:54:42 +0100 Subject: [PATCH 20/43] Fix merge --- codex-rs/core/src/codex.rs | 4 ++-- codex-rs/core/src/features.rs | 7 +++---- codex-rs/core/tests/chat_completions_payload.rs | 2 +- codex-rs/core/tests/chat_completions_sse.rs | 2 +- codex-rs/core/tests/responses_headers.rs | 2 +- 5 files changed, 8 insertions(+), 9 deletions(-) diff --git a/codex-rs/core/src/codex.rs b/codex-rs/core/src/codex.rs index 3a2b8bf726b..ba820da38cd 100644 --- a/codex-rs/core/src/codex.rs +++ b/codex-rs/core/src/codex.rs @@ -2778,8 +2778,8 @@ mod tests { use std::time::Duration; use 
tokio::time::sleep; - use codex_otel::metrics::MetricsClient; - use codex_otel::metrics::MetricsConfig; + + use mcp_types::ContentBlock; use mcp_types::TextContent; use pretty_assertions::assert_eq; diff --git a/codex-rs/core/src/features.rs b/codex-rs/core/src/features.rs index 54b69b76ad2..ab44f905fab 100644 --- a/codex-rs/core/src/features.rs +++ b/codex-rs/core/src/features.rs @@ -196,9 +196,9 @@ impl Features { pub fn emit_metrics(&self, otel: &OtelManager) { for feature in FEATURES { - if self.enabled(feature.id) != feature.default_enabled { - if let Err(e) = otel.counter( - "feature.state", + if self.enabled(feature.id) != feature.default_enabled + && let Err(e) = otel.counter( + "codex.feature.state", 1, &[ ("feature", feature.key), @@ -207,7 +207,6 @@ impl Features { ) { tracing::warn!("Error while emitting feature metrics {e:?}"); } - } } } diff --git a/codex-rs/core/tests/chat_completions_payload.rs b/codex-rs/core/tests/chat_completions_payload.rs index 60bb39cc50d..b5c5a656142 100644 --- a/codex-rs/core/tests/chat_completions_payload.rs +++ b/codex-rs/core/tests/chat_completions_payload.rs @@ -12,7 +12,7 @@ use codex_core::ModelProviderInfo; use codex_core::Prompt; use codex_core::ResponseItem; use codex_core::WireApi; -use codex_core::openai_models::models_manager::ModelsManager; +use codex_core::models_manager::manager::ModelsManager; use codex_otel::OtelManager; use codex_protocol::ConversationId; use codex_protocol::models::ReasoningItemContent; diff --git a/codex-rs/core/tests/chat_completions_sse.rs b/codex-rs/core/tests/chat_completions_sse.rs index 74c3849a880..bf56a93921b 100644 --- a/codex-rs/core/tests/chat_completions_sse.rs +++ b/codex-rs/core/tests/chat_completions_sse.rs @@ -11,7 +11,7 @@ use codex_core::Prompt; use codex_core::ResponseEvent; use codex_core::ResponseItem; use codex_core::WireApi; -use codex_core::openai_models::models_manager::ModelsManager; +use codex_core::models_manager::manager::ModelsManager; use 
codex_otel::OtelManager; use codex_protocol::ConversationId; use codex_protocol::models::ReasoningItemContent; diff --git a/codex-rs/core/tests/responses_headers.rs b/codex-rs/core/tests/responses_headers.rs index b2602301c11..416d479b0cd 100644 --- a/codex-rs/core/tests/responses_headers.rs +++ b/codex-rs/core/tests/responses_headers.rs @@ -10,7 +10,7 @@ use codex_core::Prompt; use codex_core::ResponseEvent; use codex_core::ResponseItem; use codex_core::WireApi; -use codex_core::openai_models::models_manager::ModelsManager; +use codex_core::models_manager::manager::ModelsManager; use codex_otel::OtelManager; use codex_protocol::ConversationId; use codex_protocol::config_types::ReasoningSummary; From 035712cffd28156d500dcba6e9b91a728b49dbc3 Mon Sep 17 00:00:00 2001 From: jif-oai Date: Mon, 22 Dec 2025 14:57:30 +0100 Subject: [PATCH 21/43] Lots of simplifications --- codex-rs/Cargo.lock | 1 - codex-rs/core/src/codex.rs | 2 - codex-rs/core/src/features.rs | 7 +- codex-rs/otel/Cargo.toml | 1 - codex-rs/otel/src/metrics/client.rs | 468 ++------------------------ codex-rs/otel/src/metrics/config.rs | 6 +- codex-rs/otel/src/metrics/exporter.rs | 322 ++++++++++++++++++ codex-rs/otel/src/metrics/mod.rs | 2 + codex-rs/otel/src/metrics/tags.rs | 7 - codex-rs/otel/src/metrics/worker.rs | 103 ++++++ 10 files changed, 455 insertions(+), 464 deletions(-) create mode 100644 codex-rs/otel/src/metrics/exporter.rs create mode 100644 codex-rs/otel/src/metrics/worker.rs diff --git a/codex-rs/Cargo.lock b/codex-rs/Cargo.lock index 029c00d10df..0665f6f6408 100644 --- a/codex-rs/Cargo.lock +++ b/codex-rs/Cargo.lock @@ -1606,7 +1606,6 @@ dependencies = [ "tracing", "tracing-opentelemetry", "tracing-subscriber", - "wiremock", ] [[package]] diff --git a/codex-rs/core/src/codex.rs b/codex-rs/core/src/codex.rs index ba820da38cd..569d0031d32 100644 --- a/codex-rs/core/src/codex.rs +++ b/codex-rs/core/src/codex.rs @@ -2778,8 +2778,6 @@ mod tests { use std::time::Duration; use 
tokio::time::sleep; - - use mcp_types::ContentBlock; use mcp_types::TextContent; use pretty_assertions::assert_eq; diff --git a/codex-rs/core/src/features.rs b/codex-rs/core/src/features.rs index ab44f905fab..ae5af4f228d 100644 --- a/codex-rs/core/src/features.rs +++ b/codex-rs/core/src/features.rs @@ -204,9 +204,10 @@ impl Features { ("feature", feature.key), ("value", &self.enabled(feature.id).to_string()), ], - ) { - tracing::warn!("Error while emitting feature metrics {e:?}"); - } + ) + { + tracing::warn!("Error while emitting feature metrics {e:?}"); + } } } diff --git a/codex-rs/otel/Cargo.toml b/codex-rs/otel/Cargo.toml index 846cc1eebaf..cc8b473f01a 100644 --- a/codex-rs/otel/Cargo.toml +++ b/codex-rs/otel/Cargo.toml @@ -54,4 +54,3 @@ tracing-subscriber = { workspace = true } [dev-dependencies] opentelemetry_sdk = { workspace = true, features = ["testing"] } pretty_assertions = { workspace = true } -wiremock = { workspace = true } diff --git a/codex-rs/otel/src/metrics/client.rs b/codex-rs/otel/src/metrics/client.rs index d9e55d370eb..81328434e3e 100644 --- a/codex-rs/otel/src/metrics/client.rs +++ b/codex-rs/otel/src/metrics/client.rs @@ -1,136 +1,34 @@ use crate::metrics::DEFAULT_QUEUE_CAPACITY; use crate::metrics::DEFAULT_SHUTDOWN_TIMEOUT; -use crate::metrics::SHUTDOWN_POLL_INTERVAL; use crate::metrics::config::MetricsConfig; -use crate::metrics::config::MetricsExporter; use crate::metrics::error::MetricsError; use crate::metrics::error::Result; +use crate::metrics::exporter::MetricEvent; +use crate::metrics::exporter::build_worker_exporter; use crate::metrics::tags::collect_tags; -use crate::metrics::tags::merge_tags; -use crate::metrics::tags::tags_to_attributes; use crate::metrics::time::duration_to_millis; -use crate::metrics::util::error_or_panic; use crate::metrics::validation::validate_metric_name; use crate::metrics::validation::validate_tags; -use chrono::Utc; -use opentelemetry::KeyValue; -use opentelemetry::metrics::Histogram; -use 
opentelemetry::metrics::Meter; -use opentelemetry::metrics::MeterProvider; -use opentelemetry::metrics::UpDownCounter; -use opentelemetry_sdk::metrics::PeriodicReader; -use opentelemetry_sdk::metrics::SdkMeterProvider; -use reqwest::header::HeaderName; -use reqwest::header::HeaderValue; -use reqwest::header::USER_AGENT; -use serde::Serialize; -use std::collections::BTreeMap; -use std::collections::HashMap; -use std::sync::Arc; +use crate::metrics::worker::spawn_worker; use std::sync::Mutex; use std::thread; use std::time::Duration; use std::time::Instant; use tokio::runtime::Runtime; use tokio::sync::mpsc; -use tokio::sync::mpsc::error::TryRecvError; -use tokio::sync::mpsc::error::TrySendError; - -const METER_NAME: &str = "codex-otel-metrics"; -const STATSIG_USER_ID: &str = "codex-metrics"; -const STATSIG_SDK_TYPE: &str = "codex-otel-rust"; -const STATSIG_MAX_BATCH_EVENTS: usize = 50; -const STATSIG_BATCH_WINDOW: Duration = Duration::from_millis(1000); - -#[derive(Clone, Debug)] -enum MetricEvent { - Counter { - name: String, - value: i64, - tags: Vec<(String, String)>, - }, - Histogram { - name: String, - value: i64, - tags: Vec<(String, String)>, - }, -} - -enum WorkerMessage { - Event(MetricEvent), -} - -struct WorkerState { - sender: Mutex>>, - handle: Mutex>>, - capacity: usize, -} - -#[derive(Debug)] -struct MetricRecorder { - meter: Meter, - counters: HashMap>, - histograms: HashMap>, - default_tags: BTreeMap, -} - -impl MetricRecorder { - fn new(meter: Meter, default_tags: BTreeMap) -> Self { - Self { - meter, - counters: HashMap::new(), - histograms: HashMap::new(), - default_tags, - } - } - - fn record_event(&mut self, event: MetricEvent) { - match event { - MetricEvent::Counter { name, value, tags } => { - self.record_counter(&name, value, &tags); - } - MetricEvent::Histogram { name, value, tags } => { - self.record_histogram(&name, value, &tags); - } - } - } - - fn record_counter(&mut self, name: &str, value: i64, tags: &[(String, String)]) { - let 
attributes = self.attributes_for(tags); - let name = name.to_string(); - let counter = self - .counters - .entry(name.clone()) - .or_insert_with(|| self.meter.i64_up_down_counter(name.clone()).build()); - counter.add(value, &attributes); - } - - fn record_histogram(&mut self, name: &str, value: i64, tags: &[(String, String)]) { - let attributes = self.attributes_for(tags); - let name = name.to_string(); - let histogram = self - .histograms - .entry(name.clone()) - .or_insert_with(|| self.meter.f64_histogram(name.clone()).build()); - histogram.record(value as f64, &attributes); - } - - fn attributes_for(&self, tags: &[(String, String)]) -> Vec { - let merged = merge_tags(&self.default_tags, tags); - tags_to_attributes(&merged) - } -} /// Background metrics client that enqueues metrics to a tokio-backed worker. #[derive(Clone)] pub struct MetricsClient { - state: Arc, + sender: std::sync::Arc>>>, + handle: std::sync::Arc>>>, + capacity: usize, } impl std::fmt::Debug for MetricsClient { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("MetricsClient") - .field("capacity", &self.state.capacity) + .field("capacity", &self.capacity) .finish() } } @@ -155,19 +53,16 @@ impl MetricsClient { validate_tags(&config.default_tags)?; let exporter_label = config.exporter_label(); - let worker_exporter_label = exporter_label; let exporter = build_worker_exporter(&config)?; let runtime = build_runtime()?; let (sender, receiver) = mpsc::channel(capacity); - let handle = spawn_worker(runtime, exporter, worker_exporter_label, receiver); + let handle = spawn_worker(runtime, exporter, exporter_label, receiver); Ok(Self { - state: Arc::new(WorkerState { - sender: Mutex::new(Some(sender)), - handle: Mutex::new(Some(handle)), - capacity, - }), + sender: std::sync::Arc::new(Mutex::new(Some(sender))), + handle: std::sync::Arc::new(Mutex::new(Some(handle))), + capacity, }) } @@ -235,7 +130,6 @@ impl MetricsClient { fn send_event(&self, event: MetricEvent) -> 
Result<()> { let sender = self - .state .sender .lock() .unwrap_or_else(std::sync::PoisonError::into_inner); @@ -243,12 +137,14 @@ impl MetricsClient { return Err(MetricsError::WorkerUnavailable); }; - match sender.try_send(WorkerMessage::Event(event)) { + match sender.try_send(event) { Ok(()) => Ok(()), - Err(TrySendError::Full(_)) => Err(MetricsError::QueueFull { - capacity: self.state.capacity, + Err(tokio::sync::mpsc::error::TrySendError::Full(_)) => Err(MetricsError::QueueFull { + capacity: self.capacity, }), - Err(TrySendError::Closed(_)) => Err(MetricsError::WorkerUnavailable), + Err(tokio::sync::mpsc::error::TrySendError::Closed(_)) => { + Err(MetricsError::WorkerUnavailable) + } } } @@ -259,13 +155,11 @@ impl MetricsClient { fn shutdown_inner(&self, timeout: Duration) -> Result<()> { let sender = self - .state .sender .lock() .unwrap_or_else(std::sync::PoisonError::into_inner) .take(); let mut handle = self - .state .handle .lock() .unwrap_or_else(std::sync::PoisonError::into_inner); @@ -290,7 +184,7 @@ impl MetricsClient { joined = true; break; } - thread::sleep(SHUTDOWN_POLL_INTERVAL); + thread::sleep(crate::metrics::SHUTDOWN_POLL_INTERVAL); } } @@ -304,7 +198,7 @@ impl MetricsClient { impl Drop for MetricsClient { fn drop(&mut self) { - if Arc::strong_count(&self.state) == 1 { + if std::sync::Arc::strong_count(&self.sender) == 1 { let _ = self.shutdown_inner(DEFAULT_SHUTDOWN_TIMEOUT); } } @@ -316,327 +210,3 @@ fn build_runtime() -> Result { .build() .map_err(|source| MetricsError::RuntimeBuild { source }) } - -fn build_worker_exporter(config: &MetricsConfig) -> Result { - match &config.exporter { - MetricsExporter::StatsigHttp => Ok(WorkerExporter::Statsig(StatsigExporter::from(config)?)), - MetricsExporter::InMemory(exporter) => Ok(WorkerExporter::InMemory( - InMemoryExporter::from(config.default_tags.clone(), exporter.clone()), - )), - } -} - -fn spawn_worker( - runtime: Runtime, - exporter: WorkerExporter, - exporter_label: String, - receiver: 
mpsc::Receiver, -) -> thread::JoinHandle<()> { - thread::spawn(move || { - let worker = MetricsWorker::new(exporter, exporter_label); - runtime.block_on(worker.run(receiver)); - }) -} - -struct MetricsWorker { - exporter: WorkerExporter, - exporter_label: String, -} - -impl MetricsWorker { - fn new(exporter: WorkerExporter, exporter_label: String) -> Self { - Self { - exporter, - exporter_label, - } - } - - async fn run(mut self, mut receiver: mpsc::Receiver) { - while let Some(message) = receiver.recv().await { - match message { - WorkerMessage::Event(event) => { - let events = Self::collect_batch(event, &mut receiver).await; - self.export_batch(events).await; - } - } - } - self.shutdown().await; - } - - async fn export_batch(&mut self, events: Vec) { - match &mut self.exporter { - WorkerExporter::Statsig(exporter) => { - if let Err(err) = exporter.export_events(events).await { - error_or_panic(format!( - "statsig metrics export failed: {err} (exporter={})", - self.exporter_label - )); - } - } - WorkerExporter::InMemory(exporter) => { - exporter.export_events(events, &self.exporter_label).await; - } - } - } - - async fn collect_batch( - first: MetricEvent, - receiver: &mut mpsc::Receiver, - ) -> Vec { - let mut events = Vec::with_capacity(1); - events.push(first); - - // Fast-path: drain anything already enqueued. - while events.len() < STATSIG_MAX_BATCH_EVENTS { - match receiver.try_recv() { - Ok(WorkerMessage::Event(event)) => events.push(event), - Err(TryRecvError::Empty) => break, - Err(TryRecvError::Disconnected) => return events, - } - } - - if events.len() >= STATSIG_MAX_BATCH_EVENTS { - return events; - } - - // Small coalescing window to catch near-simultaneous metrics without blocking callers. 
- let deadline = Instant::now() + STATSIG_BATCH_WINDOW; - while events.len() < STATSIG_MAX_BATCH_EVENTS { - let remaining = deadline.saturating_duration_since(Instant::now()); - if remaining.is_zero() { - break; - } - - match tokio::time::timeout(remaining, receiver.recv()).await { - Ok(Some(WorkerMessage::Event(event))) => events.push(event), - Ok(None) => break, - Err(_) => break, - } - } - - events - } - - async fn shutdown(&mut self) { - if let WorkerExporter::InMemory(exporter) = &mut self.exporter { - exporter.shutdown(&self.exporter_label).await; - } - } -} - -enum WorkerExporter { - Statsig(StatsigExporter), - InMemory(InMemoryExporter), -} - -struct InMemoryExporter { - recorder: MetricRecorder, - meter_provider: SdkMeterProvider, -} - -impl InMemoryExporter { - fn from( - default_tags: BTreeMap, - exporter: opentelemetry_sdk::metrics::InMemoryMetricExporter, - ) -> Self { - let reader = PeriodicReader::builder(exporter).build(); - let meter_provider = SdkMeterProvider::builder().with_reader(reader).build(); - let meter = meter_provider.meter(METER_NAME); - let recorder = MetricRecorder::new(meter, default_tags); - Self { - recorder, - meter_provider, - } - } - - async fn export_events(&mut self, events: Vec, exporter_label: &str) { - for event in events { - self.recorder.record_event(event); - } - if let Err(err) = self.meter_provider.force_flush() { - error_or_panic(format!( - "metrics flush failed: {err} (exporter={exporter_label})" - )); - } - } - - async fn shutdown(&mut self, exporter_label: &str) { - if let Err(err) = self.meter_provider.force_flush() { - error_or_panic(format!( - "metrics flush failed during shutdown: {err} (exporter={exporter_label})" - )); - } - if let Err(err) = self.meter_provider.shutdown() { - error_or_panic(format!( - "metrics shutdown failed: {err} (exporter={exporter_label})" - )); - } - } -} - -struct StatsigExporter { - client: reqwest::Client, - endpoint: String, - api_key_header: HeaderName, - api_key: HeaderValue, - 
user_agent: Option, - default_tags: BTreeMap, -} - -impl StatsigExporter { - fn from(config: &MetricsConfig) -> Result { - let api_key_header = - HeaderName::from_bytes(config.api_key_header.as_bytes()).map_err(|source| { - MetricsError::InvalidApiKeyHeader { - header: config.api_key_header.clone(), - source, - } - })?; - let api_key = HeaderValue::from_str(&config.api_key).map_err(|source| { - MetricsError::InvalidHeaderValue { - header: config.api_key_header.clone(), - source, - } - })?; - let user_agent = if config.user_agent.is_empty() { - None - } else { - Some(HeaderValue::from_str(&config.user_agent).map_err(|source| { - MetricsError::InvalidHeaderValue { - header: "User-Agent".to_string(), - source, - } - })?) - }; - let client = reqwest::Client::builder() - .timeout(config.timeout) - .build() - .map_err(|source| MetricsError::HttpClientBuild { source })?; - - Ok(Self { - client, - endpoint: config.endpoint.clone(), - api_key_header, - api_key, - user_agent, - default_tags: config.default_tags.clone(), - }) - } - - async fn export_events(&self, events: Vec) -> Result<()> { - if events.is_empty() { - return Ok(()); - } - - let payload = self.build_payload(events); - - let mut request = self - .client - .post(&self.endpoint) - .header(self.api_key_header.clone(), self.api_key.clone()); - - if let Some(user_agent) = &self.user_agent { - request = request.header(USER_AGENT, user_agent.clone()); - } - - let response = request - .json(&payload) - .send() - .await - .map_err(|source| MetricsError::StatsigRequestFailed { source })?; - - if let Err(status_err) = response.error_for_status_ref() { - let status = status_err - .status() - .unwrap_or(reqwest::StatusCode::INTERNAL_SERVER_ERROR); - let body = response.text().await.unwrap_or_default(); - return Err(MetricsError::StatsigResponseError { status, body }); - } - - Ok(()) - } - - fn build_payload(&self, events: Vec) -> StatsigPayload { - let timestamp = Utc::now().timestamp_millis(); - let events = events - 
.into_iter() - .map(|event| self.event_from_metric(event, timestamp)) - .collect(); - - StatsigPayload { - events, - statsig_metadata: StatsigMetadata { - sdk_type: STATSIG_SDK_TYPE.to_string(), - sdk_version: env!("CARGO_PKG_VERSION").to_string(), - }, - } - } - - fn event_from_metric(&self, event: MetricEvent, timestamp: i64) -> StatsigEvent { - match event { - MetricEvent::Counter { name, value, tags } => StatsigEvent { - event_name: name, - value: value as f64, - metadata: StatsigEventMetadata { - metric_type: "counter".to_string(), - tags: merge_tags(&self.default_tags, &tags), - }, - user: StatsigUser { - user_id: STATSIG_USER_ID.to_string(), - }, - time: timestamp, - }, - MetricEvent::Histogram { name, value, tags } => StatsigEvent { - event_name: name, - value: value as f64, - metadata: StatsigEventMetadata { - metric_type: "histogram".to_string(), - tags: merge_tags(&self.default_tags, &tags), - }, - user: StatsigUser { - user_id: STATSIG_USER_ID.to_string(), - }, - time: timestamp, - }, - } - } -} - -#[derive(Debug, Serialize)] -struct StatsigPayload { - events: Vec, - #[serde(rename = "statsigMetadata")] - statsig_metadata: StatsigMetadata, -} - -#[derive(Debug, Serialize)] -struct StatsigEvent { - #[serde(rename = "eventName")] - event_name: String, - value: f64, - metadata: StatsigEventMetadata, - user: StatsigUser, - time: i64, -} - -#[derive(Debug, Serialize)] -struct StatsigEventMetadata { - #[serde(rename = "metric_type")] - metric_type: String, - #[serde(flatten)] - tags: BTreeMap, -} - -#[derive(Debug, Serialize)] -struct StatsigUser { - #[serde(rename = "userID")] - user_id: String, -} - -#[derive(Debug, Serialize)] -struct StatsigMetadata { - #[serde(rename = "sdkType")] - sdk_type: String, - #[serde(rename = "sdkVersion")] - sdk_version: String, -} diff --git a/codex-rs/otel/src/metrics/config.rs b/codex-rs/otel/src/metrics/config.rs index 8e975ea7c3c..122beab1ec4 100644 --- a/codex-rs/otel/src/metrics/config.rs +++ 
b/codex-rs/otel/src/metrics/config.rs @@ -96,6 +96,10 @@ impl MetricsConfig { impl Default for MetricsConfig { fn default() -> Self { - Self::new(DEFAULT_API_KEY) + if cfg!(test) { + Self::new("MOCK_API_KEY"); + } else { + Self::new(DEFAULT_API_KEY) + } } } diff --git a/codex-rs/otel/src/metrics/exporter.rs b/codex-rs/otel/src/metrics/exporter.rs new file mode 100644 index 00000000000..03a9131c03a --- /dev/null +++ b/codex-rs/otel/src/metrics/exporter.rs @@ -0,0 +1,322 @@ +use crate::metrics::config::MetricsConfig; +use crate::metrics::config::MetricsExporter; +use crate::metrics::error::MetricsError; +use crate::metrics::error::Result; +use crate::metrics::tags::merge_tags; +use crate::metrics::util::error_or_panic; +use chrono::Utc; +use opentelemetry::KeyValue; +use opentelemetry::metrics::Histogram; +use opentelemetry::metrics::Meter; +use opentelemetry::metrics::MeterProvider; +use opentelemetry::metrics::UpDownCounter; +use opentelemetry_sdk::metrics::PeriodicReader; +use opentelemetry_sdk::metrics::SdkMeterProvider; +use reqwest::header::HeaderName; +use reqwest::header::HeaderValue; +use reqwest::header::USER_AGENT; +use serde::Serialize; +use std::collections::BTreeMap; +use std::collections::HashMap; + +pub(crate) const METER_NAME: &str = "codex-otel-metrics"; +const STATSIG_USER_ID: &str = "codex-metrics"; +const STATSIG_SDK_TYPE: &str = "codex-otel-rust"; + +#[derive(Clone, Debug)] +pub(crate) enum MetricEvent { + Counter { + name: String, + value: i64, + tags: Vec<(String, String)>, + }, + Histogram { + name: String, + value: i64, + tags: Vec<(String, String)>, + }, +} + +pub(crate) fn build_worker_exporter(config: &MetricsConfig) -> Result { + match &config.exporter { + MetricsExporter::StatsigHttp => Ok(WorkerExporter::Statsig(StatsigExporter::from(config)?)), + MetricsExporter::InMemory(exporter) => Ok(WorkerExporter::InMemory( + InMemoryExporter::from(config.default_tags.clone(), exporter.clone()), + )), + } +} + +pub(crate) enum WorkerExporter { + 
Statsig(StatsigExporter), + InMemory(InMemoryExporter), +} + +pub(crate) struct InMemoryExporter { + recorder: MetricRecorder, + meter_provider: SdkMeterProvider, +} + +impl InMemoryExporter { + fn from( + default_tags: BTreeMap, + exporter: opentelemetry_sdk::metrics::InMemoryMetricExporter, + ) -> Self { + let reader = PeriodicReader::builder(exporter).build(); + let meter_provider = SdkMeterProvider::builder().with_reader(reader).build(); + let meter = meter_provider.meter(METER_NAME); + let recorder = MetricRecorder::new(meter, default_tags); + Self { + recorder, + meter_provider, + } + } + + pub(crate) async fn export_events(&mut self, events: Vec, exporter_label: &str) { + for event in events { + self.recorder.record_event(event); + } + if let Err(err) = self.meter_provider.force_flush() { + error_or_panic(format!( + "metrics flush failed: {err} (exporter={exporter_label})" + )); + } + } + + pub(crate) async fn shutdown(&mut self, exporter_label: &str) { + if let Err(err) = self.meter_provider.force_flush() { + error_or_panic(format!( + "metrics flush failed during shutdown: {err} (exporter={exporter_label})" + )); + } + if let Err(err) = self.meter_provider.shutdown() { + error_or_panic(format!( + "metrics shutdown failed: {err} (exporter={exporter_label})" + )); + } + } +} + +#[derive(Debug)] +struct MetricRecorder { + meter: Meter, + counters: HashMap>, + histograms: HashMap>, + default_tags: BTreeMap, +} + +impl MetricRecorder { + fn new(meter: Meter, default_tags: BTreeMap) -> Self { + Self { + meter, + counters: HashMap::new(), + histograms: HashMap::new(), + default_tags, + } + } + + fn record_event(&mut self, event: MetricEvent) { + match event { + MetricEvent::Counter { name, value, tags } => { + self.record_counter(&name, value, &tags); + } + MetricEvent::Histogram { name, value, tags } => { + self.record_histogram(&name, value, &tags); + } + } + } + + fn record_counter(&mut self, name: &str, value: i64, tags: &[(String, String)]) { + let attributes 
= self.attributes_for(tags); + let name = name.to_string(); + let counter = self + .counters + .entry(name.clone()) + .or_insert_with(|| self.meter.i64_up_down_counter(name.clone()).build()); + counter.add(value, &attributes); + } + + fn record_histogram(&mut self, name: &str, value: i64, tags: &[(String, String)]) { + let attributes = self.attributes_for(tags); + let name = name.to_string(); + let histogram = self + .histograms + .entry(name.clone()) + .or_insert_with(|| self.meter.f64_histogram(name.clone()).build()); + histogram.record(value as f64, &attributes); + } + + fn attributes_for(&self, tags: &[(String, String)]) -> Vec { + let merged = merge_tags(&self.default_tags, tags); + merged + .iter() + .map(|(key, value)| KeyValue::new(key.clone(), value.clone())) + .collect() + } +} + +pub(crate) struct StatsigExporter { + client: reqwest::Client, + endpoint: String, + api_key_header: HeaderName, + api_key: HeaderValue, + user_agent: Option, + default_tags: BTreeMap, +} + +impl StatsigExporter { + fn from(config: &MetricsConfig) -> Result { + let api_key_header = + HeaderName::from_bytes(config.api_key_header.as_bytes()).map_err(|source| { + MetricsError::InvalidApiKeyHeader { + header: config.api_key_header.clone(), + source, + } + })?; + let api_key = HeaderValue::from_str(&config.api_key).map_err(|source| { + MetricsError::InvalidHeaderValue { + header: config.api_key_header.clone(), + source, + } + })?; + let user_agent = if config.user_agent.is_empty() { + None + } else { + Some(HeaderValue::from_str(&config.user_agent).map_err(|source| { + MetricsError::InvalidHeaderValue { + header: "User-Agent".to_string(), + source, + } + })?) 
+ }; + let client = reqwest::Client::builder() + .timeout(config.timeout) + .build() + .map_err(|source| MetricsError::HttpClientBuild { source })?; + + Ok(Self { + client, + endpoint: config.endpoint.clone(), + api_key_header, + api_key, + user_agent, + default_tags: config.default_tags.clone(), + }) + } + + pub(crate) async fn export_events(&self, events: Vec) -> Result<()> { + if events.is_empty() { + return Ok(()); + } + + let payload = self.build_payload(events); + + let mut request = self + .client + .post(&self.endpoint) + .header(self.api_key_header.clone(), self.api_key.clone()); + + if let Some(user_agent) = &self.user_agent { + request = request.header(USER_AGENT, user_agent.clone()); + } + + let response = request + .json(&payload) + .send() + .await + .map_err(|source| MetricsError::StatsigRequestFailed { source })?; + + if let Err(status_err) = response.error_for_status_ref() { + let status = status_err + .status() + .unwrap_or(reqwest::StatusCode::INTERNAL_SERVER_ERROR); + let body = response.text().await.unwrap_or_default(); + return Err(MetricsError::StatsigResponseError { status, body }); + } + + Ok(()) + } + + fn build_payload(&self, events: Vec) -> StatsigPayload { + let timestamp = Utc::now().timestamp_millis(); + let events = events + .into_iter() + .map(|event| self.event_from_metric(event, timestamp)) + .collect(); + + StatsigPayload { + events, + statsig_metadata: StatsigMetadata { + sdk_type: STATSIG_SDK_TYPE.to_string(), + sdk_version: env!("CARGO_PKG_VERSION").to_string(), + }, + } + } + + fn event_from_metric(&self, event: MetricEvent, timestamp: i64) -> StatsigEvent { + match event { + MetricEvent::Counter { name, value, tags } => StatsigEvent { + event_name: name, + value: value as f64, + metadata: StatsigEventMetadata { + metric_type: "counter".to_string(), + tags: merge_tags(&self.default_tags, &tags), + }, + user: StatsigUser { + user_id: STATSIG_USER_ID.to_string(), + }, + time: timestamp, + }, + MetricEvent::Histogram { name, 
value, tags } => StatsigEvent { + event_name: name, + value: value as f64, + metadata: StatsigEventMetadata { + metric_type: "histogram".to_string(), + tags: merge_tags(&self.default_tags, &tags), + }, + user: StatsigUser { + user_id: STATSIG_USER_ID.to_string(), + }, + time: timestamp, + }, + } + } +} + +#[derive(Debug, Serialize)] +struct StatsigPayload { + events: Vec, + #[serde(rename = "statsigMetadata")] + statsig_metadata: StatsigMetadata, +} + +#[derive(Debug, Serialize)] +struct StatsigEvent { + #[serde(rename = "eventName")] + event_name: String, + value: f64, + metadata: StatsigEventMetadata, + user: StatsigUser, + time: i64, +} + +#[derive(Debug, Serialize)] +struct StatsigEventMetadata { + #[serde(rename = "metric_type")] + metric_type: String, + #[serde(flatten)] + tags: BTreeMap, +} + +#[derive(Debug, Serialize)] +struct StatsigUser { + #[serde(rename = "userID")] + user_id: String, +} + +#[derive(Debug, Serialize)] +struct StatsigMetadata { + #[serde(rename = "sdkType")] + sdk_type: String, + #[serde(rename = "sdkVersion")] + sdk_version: String, +} diff --git a/codex-rs/otel/src/metrics/mod.rs b/codex-rs/otel/src/metrics/mod.rs index 1ff5b0393af..386dff332f6 100644 --- a/codex-rs/otel/src/metrics/mod.rs +++ b/codex-rs/otel/src/metrics/mod.rs @@ -1,10 +1,12 @@ mod client; mod config; mod error; +mod exporter; mod tags; mod time; mod util; pub(crate) mod validation; +mod worker; use std::time::Duration; diff --git a/codex-rs/otel/src/metrics/tags.rs b/codex-rs/otel/src/metrics/tags.rs index da730672628..51766370f44 100644 --- a/codex-rs/otel/src/metrics/tags.rs +++ b/codex-rs/otel/src/metrics/tags.rs @@ -1,7 +1,6 @@ use crate::metrics::error::Result; use crate::metrics::validation::validate_tag_key; use crate::metrics::validation::validate_tag_value; -use opentelemetry::KeyValue; use std::collections::BTreeMap; pub(crate) fn collect_tags(tags: &[(&str, &str)]) -> Result> { @@ -24,9 +23,3 @@ pub(crate) fn merge_tags( } merged } - -pub(crate) fn 
tags_to_attributes(tags: &BTreeMap) -> Vec { - tags.iter() - .map(|(key, value)| KeyValue::new(key.clone(), value.clone())) - .collect() -} diff --git a/codex-rs/otel/src/metrics/worker.rs b/codex-rs/otel/src/metrics/worker.rs new file mode 100644 index 00000000000..f18da6b8883 --- /dev/null +++ b/codex-rs/otel/src/metrics/worker.rs @@ -0,0 +1,103 @@ +use crate::metrics::exporter::MetricEvent; +use crate::metrics::exporter::WorkerExporter; +use crate::metrics::util::error_or_panic; +use std::thread; +use std::time::Duration; +use std::time::Instant; +use tokio::runtime::Runtime; +use tokio::sync::mpsc; +use tokio::sync::mpsc::error::TryRecvError; + +pub(crate) fn spawn_worker( + runtime: Runtime, + exporter: WorkerExporter, + exporter_label: String, + receiver: mpsc::Receiver, +) -> thread::JoinHandle<()> { + thread::spawn(move || { + let worker = MetricsWorker::new(exporter, exporter_label); + runtime.block_on(worker.run(receiver)); + }) +} + +struct MetricsWorker { + exporter: WorkerExporter, + exporter_label: String, +} + +impl MetricsWorker { + fn new(exporter: WorkerExporter, exporter_label: String) -> Self { + Self { + exporter, + exporter_label, + } + } + + async fn run(mut self, mut receiver: mpsc::Receiver) { + while let Some(event) = receiver.recv().await { + let events = Self::collect_batch(event, &mut receiver).await; + self.export_batch(events).await; + } + self.shutdown().await; + } + + async fn export_batch(&mut self, events: Vec) { + match &mut self.exporter { + WorkerExporter::Statsig(exporter) => { + if let Err(err) = exporter.export_events(events).await { + error_or_panic(format!( + "statsig metrics export failed: {err} (exporter={})", + self.exporter_label + )); + } + } + WorkerExporter::InMemory(exporter) => { + exporter.export_events(events, &self.exporter_label).await; + } + } + } + + async fn collect_batch( + first: MetricEvent, + receiver: &mut mpsc::Receiver, + ) -> Vec { + let mut events = Vec::with_capacity(1); + events.push(first); + + 
// Fast-path: drain anything already enqueued. + while events.len() < 50 { + match receiver.try_recv() { + Ok(event) => events.push(event), + Err(TryRecvError::Empty) => break, + Err(TryRecvError::Disconnected) => return events, + } + } + + if events.len() >= 50 { + return events; + } + + // Small coalescing window to catch near-simultaneous metrics without blocking callers. + let deadline = Instant::now() + Duration::from_millis(1000); + while events.len() < 50 { + let remaining = deadline.saturating_duration_since(Instant::now()); + if remaining.is_zero() { + break; + } + + match tokio::time::timeout(remaining, receiver.recv()).await { + Ok(Some(event)) => events.push(event), + Ok(None) => break, + Err(_) => break, + } + } + + events + } + + async fn shutdown(&mut self) { + if let WorkerExporter::InMemory(exporter) = &mut self.exporter { + exporter.shutdown(&self.exporter_label).await; + } + } +} From c59724942241ecd381b4a99b7454ded7973c6ee4 Mon Sep 17 00:00:00 2001 From: jif-oai Date: Mon, 22 Dec 2025 15:08:20 +0100 Subject: [PATCH 22/43] Make export symetric --- codex-rs/otel/src/metrics/client.rs | 16 ++--- codex-rs/otel/src/metrics/config.rs | 81 +++++++++++++++---------- codex-rs/otel/src/metrics/exporter.rs | 47 +++++++++----- codex-rs/otel/tests/harness/mod.rs | 2 +- codex-rs/otel/tests/suite/validation.rs | 2 +- 5 files changed, 94 insertions(+), 54 deletions(-) diff --git a/codex-rs/otel/src/metrics/client.rs b/codex-rs/otel/src/metrics/client.rs index 81328434e3e..c299ea02c4d 100644 --- a/codex-rs/otel/src/metrics/client.rs +++ b/codex-rs/otel/src/metrics/client.rs @@ -1,6 +1,7 @@ use crate::metrics::DEFAULT_QUEUE_CAPACITY; use crate::metrics::DEFAULT_SHUTDOWN_TIMEOUT; use crate::metrics::config::MetricsConfig; +use crate::metrics::config::MetricsExporter; use crate::metrics::error::MetricsError; use crate::metrics::error::Result; use crate::metrics::exporter::MetricEvent; @@ -42,16 +43,17 @@ impl MetricsClient { return 
Err(MetricsError::QueueCapacityZero); } - if config.endpoint.is_empty() { - return Err(MetricsError::EmptyEndpoint); - } + validate_tags(&config.default_tags)?; - if config.api_key.is_empty() { - return Err(MetricsError::EmptyApiKey); + if let MetricsExporter::StatsigHttp { endpoint, .. } = &config.exporter { + if endpoint.is_empty() { + return Err(MetricsError::EmptyEndpoint); + } + if config.api_key.is_empty() { + return Err(MetricsError::EmptyApiKey); + } } - validate_tags(&config.default_tags)?; - let exporter_label = config.exporter_label(); let exporter = build_worker_exporter(&config)?; let runtime = build_runtime()?; diff --git a/codex-rs/otel/src/metrics/config.rs b/codex-rs/otel/src/metrics/config.rs index 122beab1ec4..ae4b65d4ed8 100644 --- a/codex-rs/otel/src/metrics/config.rs +++ b/codex-rs/otel/src/metrics/config.rs @@ -10,44 +10,70 @@ use std::time::Duration; #[derive(Clone, Debug)] pub(crate) enum MetricsExporter { - StatsigHttp, + StatsigHttp { + endpoint: String, + api_key_header: String, + timeout: Duration, + user_agent: String, + }, InMemory(opentelemetry_sdk::metrics::InMemoryMetricExporter), } +impl MetricsExporter { + pub(crate) fn statsig_defaults() -> Self { + Self::StatsigHttp { + endpoint: DEFAULT_STATSIG_ENDPOINT.to_string(), + api_key_header: DEFAULT_API_KEY_HEADER.to_string(), + timeout: DEFAULT_TIMEOUT, + user_agent: format!("codex-otel-metrics/{}", env!("CARGO_PKG_VERSION")), + } + } +} + #[derive(Clone, Debug)] pub struct MetricsConfig { - pub(crate) endpoint: String, pub(crate) api_key: String, - pub(crate) api_key_header: String, pub(crate) default_tags: BTreeMap, - pub(crate) timeout: Duration, - pub(crate) user_agent: String, pub(crate) exporter: MetricsExporter, } impl MetricsConfig { - /// Create a config with the provided API key and default settings. + /// Create a Statsig config with the provided API key and default settings. 
pub fn new(api_key: impl Into) -> Self { + Self::statsig(api_key) + } + + /// Create a Statsig config with the provided API key and default settings. + pub fn statsig(api_key: impl Into) -> Self { Self { - endpoint: DEFAULT_STATSIG_ENDPOINT.to_string(), api_key: api_key.into(), - api_key_header: DEFAULT_API_KEY_HEADER.to_string(), default_tags: BTreeMap::new(), - timeout: DEFAULT_TIMEOUT, - user_agent: format!("codex-otel-metrics/{}", env!("CARGO_PKG_VERSION")), - exporter: MetricsExporter::StatsigHttp, + exporter: MetricsExporter::statsig_defaults(), + } + } + + /// Create an in-memory config (used in tests). + pub fn in_memory(exporter: opentelemetry_sdk::metrics::InMemoryMetricExporter) -> Self { + Self { + api_key: String::new(), + default_tags: BTreeMap::new(), + exporter: MetricsExporter::InMemory(exporter), } } /// Override the Statsig endpoint. pub fn with_endpoint(mut self, endpoint: impl Into) -> Self { - self.endpoint = endpoint.into(); + if let MetricsExporter::StatsigHttp { endpoint: e, .. } = &mut self.exporter { + *e = endpoint.into(); + } self } /// Override the API key header name. pub fn with_api_key_header(mut self, header: impl Into) -> Self { - self.api_key_header = header.into(); + if let MetricsExporter::StatsigHttp { api_key_header, .. } = &mut self.exporter { + *api_key_header = header.into(); + } self } @@ -63,32 +89,25 @@ impl MetricsConfig { /// Override the HTTP client timeout. pub fn with_timeout(mut self, timeout: Duration) -> Self { - self.timeout = timeout; + if let MetricsExporter::StatsigHttp { timeout: t, .. } = &mut self.exporter { + *t = timeout; + } self } /// Override the HTTP user agent header. 
pub fn with_user_agent(mut self, user_agent: impl Into) -> Self { - self.user_agent = user_agent.into(); - self - } - - pub fn with_in_memory_exporter( - mut self, - exporter: opentelemetry_sdk::metrics::InMemoryMetricExporter, - ) -> Self { - self.exporter = MetricsExporter::InMemory(exporter); + if let MetricsExporter::StatsigHttp { user_agent: ua, .. } = &mut self.exporter { + *ua = user_agent.into(); + } self } pub(crate) fn exporter_label(&self) -> String { match &self.exporter { - MetricsExporter::StatsigHttp => { - format!( - "statsig_http endpoint={} timeout={:?}", - self.endpoint, self.timeout - ) - } + MetricsExporter::StatsigHttp { + endpoint, timeout, .. + } => format!("statsig_http endpoint={} timeout={:?}", endpoint, timeout), MetricsExporter::InMemory(_) => "in_memory".to_string(), } } @@ -97,9 +116,9 @@ impl MetricsConfig { impl Default for MetricsConfig { fn default() -> Self { if cfg!(test) { - Self::new("MOCK_API_KEY"); + Self::statsig("MOCK_API_KEY") } else { - Self::new(DEFAULT_API_KEY) + Self::statsig(DEFAULT_API_KEY) } } } diff --git a/codex-rs/otel/src/metrics/exporter.rs b/codex-rs/otel/src/metrics/exporter.rs index 03a9131c03a..c2a819c0123 100644 --- a/codex-rs/otel/src/metrics/exporter.rs +++ b/codex-rs/otel/src/metrics/exporter.rs @@ -18,6 +18,7 @@ use reqwest::header::USER_AGENT; use serde::Serialize; use std::collections::BTreeMap; use std::collections::HashMap; +use std::time::Duration; pub(crate) const METER_NAME: &str = "codex-otel-metrics"; const STATSIG_USER_ID: &str = "codex-metrics"; @@ -39,7 +40,19 @@ pub(crate) enum MetricEvent { pub(crate) fn build_worker_exporter(config: &MetricsConfig) -> Result { match &config.exporter { - MetricsExporter::StatsigHttp => Ok(WorkerExporter::Statsig(StatsigExporter::from(config)?)), + MetricsExporter::StatsigHttp { + endpoint, + api_key_header, + timeout, + user_agent, + } => Ok(WorkerExporter::Statsig(StatsigExporter::from( + endpoint, + api_key_header, + timeout, + user_agent, + 
&config.api_key, + &config.default_tags, + )?)), MetricsExporter::InMemory(exporter) => Ok(WorkerExporter::InMemory( InMemoryExporter::from(config.default_tags.clone(), exporter.clone()), )), @@ -164,24 +177,30 @@ pub(crate) struct StatsigExporter { } impl StatsigExporter { - fn from(config: &MetricsConfig) -> Result { + fn from( + endpoint: &str, + api_key_header: &str, + timeout: &Duration, + user_agent: &str, + api_key: &str, + default_tags: &BTreeMap, + ) -> Result { let api_key_header = - HeaderName::from_bytes(config.api_key_header.as_bytes()).map_err(|source| { + HeaderName::from_bytes(api_key_header.as_bytes()).map_err(|source| { MetricsError::InvalidApiKeyHeader { - header: config.api_key_header.clone(), + header: api_key_header.to_string(), source, } })?; - let api_key = HeaderValue::from_str(&config.api_key).map_err(|source| { - MetricsError::InvalidHeaderValue { - header: config.api_key_header.clone(), + let api_key = + HeaderValue::from_str(api_key).map_err(|source| MetricsError::InvalidHeaderValue { + header: api_key_header.to_string(), source, - } - })?; - let user_agent = if config.user_agent.is_empty() { + })?; + let user_agent = if user_agent.is_empty() { None } else { - Some(HeaderValue::from_str(&config.user_agent).map_err(|source| { + Some(HeaderValue::from_str(user_agent).map_err(|source| { MetricsError::InvalidHeaderValue { header: "User-Agent".to_string(), source, @@ -189,17 +208,17 @@ impl StatsigExporter { })?) 
}; let client = reqwest::Client::builder() - .timeout(config.timeout) + .timeout(*timeout) .build() .map_err(|source| MetricsError::HttpClientBuild { source })?; Ok(Self { client, - endpoint: config.endpoint.clone(), + endpoint: endpoint.to_string(), api_key_header, api_key, user_agent, - default_tags: config.default_tags.clone(), + default_tags: default_tags.clone(), }) } diff --git a/codex-rs/otel/tests/harness/mod.rs b/codex-rs/otel/tests/harness/mod.rs index 6129e2d42b5..30a2e67c3fd 100644 --- a/codex-rs/otel/tests/harness/mod.rs +++ b/codex-rs/otel/tests/harness/mod.rs @@ -13,7 +13,7 @@ pub(crate) fn build_metrics_with_defaults( default_tags: &[(&str, &str)], ) -> Result<(MetricsClient, InMemoryMetricExporter)> { let exporter = InMemoryMetricExporter::default(); - let mut config = MetricsConfig::new("test-key").with_in_memory_exporter(exporter.clone()); + let mut config = MetricsConfig::in_memory(exporter.clone()); for (key, value) in default_tags { config = config.with_tag(*key, *value)?; } diff --git a/codex-rs/otel/tests/suite/validation.rs b/codex-rs/otel/tests/suite/validation.rs index 70aa28ac5a1..8af1d9720a1 100644 --- a/codex-rs/otel/tests/suite/validation.rs +++ b/codex-rs/otel/tests/suite/validation.rs @@ -6,7 +6,7 @@ use opentelemetry_sdk::metrics::InMemoryMetricExporter; fn build_in_memory_client() -> Result { let exporter = InMemoryMetricExporter::default(); - let config = MetricsConfig::new("test-key").with_in_memory_exporter(exporter); + let config = MetricsConfig::in_memory(exporter); MetricsClient::new(config) } From d40fec9a1e89a9a0b698a09feda34cbffd8f5e2d Mon Sep 17 00:00:00 2001 From: jif-oai Date: Mon, 22 Dec 2025 15:12:11 +0100 Subject: [PATCH 23/43] SIMPLEEEER --- codex-rs/docs/metrics.md | 137 ---------------------------- codex-rs/otel/README.md | 85 ++--------------- codex-rs/otel/src/metrics/config.rs | 7 +- 3 files changed, 13 insertions(+), 216 deletions(-) delete mode 100644 codex-rs/docs/metrics.md diff --git 
a/codex-rs/docs/metrics.md b/codex-rs/docs/metrics.md deleted file mode 100644 index 7e51f097342..00000000000 --- a/codex-rs/docs/metrics.md +++ /dev/null @@ -1,137 +0,0 @@ -# Metrics (Statsig HTTP) - -The `codex_otel::metrics` module sends counters and histograms to a Statsig -backend by POSTing JSON to the Statsig `log_event` endpoint. A tokio-backed -worker keeps callers non-blocking while metrics are serialized and sent. - -Defaults are provided for the Statsig API key, header name, and endpoint so -you can send metrics immediately. Override them if you need to target a -different Statsig project. - -## Quick start - -```rust -use codex_otel::metrics::MetricsClient; -use codex_otel::metrics::MetricsConfig; - -let metrics = MetricsClient::new( - MetricsConfig::new("") - .with_endpoint("") - .with_api_key_header("") - .with_tag("service", "codex-cli")?, -)?; - -metrics.counter("codex.session_started", 1, &[("source", "tui")])?; -metrics.histogram("codex.request_latency", 83, &[("route", "chat")])?; -``` - -## OtelManager facade - -If you're already using `OtelManager` for tracing, you can attach a metrics -client and emit metrics through the same handle. By default, metrics sent via -`OtelManager` include metadata tags: `auth_mode`, `model`, `slug`, -`terminal.type`, and `app.version`. Use -`with_metrics_without_metadata_tags` to opt out. 
- -```rust -use codex_otel::metrics::MetricsConfig; -use codex_otel::OtelManager; - -let manager = OtelManager::new( - conversation_id, - model, - slug, - account_id, - account_email, - auth_mode, - log_user_prompts, - terminal_type, - session_source, -) -.with_metrics_config( - MetricsConfig::new("") - .with_endpoint("") - .with_api_key_header(""), -)?; -manager.counter("codex.session_started", 1, &[("source", "tui")])?; -manager.histogram("codex.request_latency", 83, &[("route", "chat")])?; -``` - -If you set `metrics: Some(MetricsConfig)` on `OtelSettings` and build an -`OtelProvider`, you can reuse that client via -`OtelManager::with_provider_metrics(&provider)`. - -## Configuration - -`MetricsConfig` lets you specify: - -- `MetricsConfig::new(api_key)` to set the Statsig API key. -- `with_endpoint(endpoint)` to set the Statsig `log_event` endpoint. -- `with_api_key_header(header)` to set the API key header name. -- `with_tag(key, value)` to add default tags for every metric. -- `with_timeout(duration)` to set the HTTP request timeout. -- `with_user_agent(agent)` to override the HTTP `User-Agent` header. - -The queue capacity is fixed at 1024 entries. - -## Timing - -Measure a closure and emit a histogram sample for the elapsed time in -milliseconds: - -```rust -let result = metrics.time("codex.request_latency", &[("route", "chat")], || { - "ok" -})?; -``` - -If the closure already returns `codex_otel::metrics::Result`, use -`time_result` to avoid nested results: - -```rust -let result = metrics.time_result( - "codex.request_latency", - &[("route", "chat")], - || Ok("ok"), -)?; -``` - -If you already have a duration, record it directly: - -```rust -metrics.record_duration( - "codex.request_latency", - std::time::Duration::from_millis(83), - &[("route", "chat")], -)?; -``` - -## Shutdown and queue capacity - -The client uses a bounded queue (default capacity 1024). 
Enqueueing returns a -`MetricsError::QueueFull` error if the queue is full or -`MetricsError::WorkerUnavailable` if the worker is no longer running. - -`shutdown` flushes queued metrics, requests a final export, and waits up to -500ms for the worker to stop. `MetricsClient` also attempts a best-effort -shutdown on drop using the default timeout, so explicit calls to `shutdown` -are optional. - -## Validation rules - -Metric names: - -- Must be non-empty. -- Allowed characters: ASCII letters/digits plus `.`, `_`, `-`. - -Tag keys and values: - -- Must be non-empty. -- Allowed characters: ASCII letters/digits plus `.`, `_`, `-`, `/`. -- The tag key `le` is reserved. - -## Error handling - -All APIs return `codex_otel::metrics::Result` with a `MetricsError` variant -on failure. Errors cover invalid configuration, validation failures, queue -backpressure, and HTTP client setup or request failures. diff --git a/codex-rs/otel/README.md b/codex-rs/otel/README.md index 1150c4ff85e..413757af906 100644 --- a/codex-rs/otel/README.md +++ b/codex-rs/otel/README.md @@ -70,93 +70,26 @@ let manager = OtelManager::new( manager.user_prompt(&prompt_items); ``` -## Metrics (Statsig HTTP) +## Metrics (Statsig HTTP or in-memory) -The metrics client sends counters and histograms to Statsig via the `log_event` -endpoint. 
Use placeholders for the Statsig endpoint and API key header until -you have real values: +Statsig example: ```rust -use codex_otel::metrics::MetricsClient; -use codex_otel::metrics::MetricsConfig; - -let metrics = MetricsClient::new( - MetricsConfig::new("") - .with_endpoint("") - .with_api_key_header(""), -)?; +let metrics = MetricsClient::new(MetricsConfig::default())?; metrics.counter("codex.session_started", 1, &[("source", "tui")])?; +metrics.histogram("codex.request_latency", 83, &[("route", "chat")])?; ``` -## Metrics via OtelManager - -Attach metrics once in `OtelSettings.metrics` and reuse them from -`OtelManager`: +In-memory (tests): ```rust -use codex_otel::config::{OtelExporter, OtelHttpProtocol, OtelSettings}; -use codex_otel::metrics::MetricsConfig; -use codex_otel::OtelManager; -use codex_otel::traces::otel_provider::OtelProvider; -use tracing_subscriber::prelude::*; - -let settings = OtelSettings { - environment: "dev".into(), - service_name: "codex-cli".into(), - service_version: env!("CARGO_PKG_VERSION").into(), - codex_home: std::path::PathBuf::from("/tmp"), - exporter: OtelExporter::OtlpHttp { - endpoint: "https://otlp.example.com".into(), - headers: std::collections::HashMap::new(), - protocol: OtelHttpProtocol::Binary, - tls: None, - }, - trace_exporter: OtelExporter::OtlpHttp { - endpoint: "https://otlp.example.com".into(), - headers: std::collections::HashMap::new(), - protocol: OtelHttpProtocol::Binary, - tls: None, - }, - metrics: Some( - MetricsConfig::new("") - .with_endpoint("") - .with_api_key_header(""), - ), -}; - -let provider = OtelProvider::from(&settings)?; -if let Some(p) = &provider { - tracing_subscriber::registry() - .with(p.logger_layer()) - .with(p.tracing_layer()) - .init(); -} - -let manager = OtelManager::new( - conversation_id, - model, - slug, - account_id, - account_email, - auth_mode, - log_user_prompts, - terminal_type, - session_source, -); -let manager = provider - .as_ref() - .map(|p| 
manager.with_provider_metrics(p)) - .unwrap_or(manager); - -manager.counter("codex.session_started", 1, &[("source", "tui")])?; -manager.histogram("codex.request_latency", 83, &[("route", "chat")])?; +let exporter = InMemoryMetricExporter::default(); +let metrics = MetricsClient::new(MetricsConfig::in_memory(exporter.clone()))?; +metrics.counter("codex.turns", 1, &[("model", "gpt-5.1")])?; +metrics.shutdown()?; // flushes in-memory exporter ``` -By default, `OtelManager` adds metadata tags to metrics: `auth_mode`, `model`, -`slug`, `terminal.type`, and `app.version`. Use -`with_metrics_without_metadata_tags` to disable these tags. - ## Shutdown - `OtelProvider::shutdown()` stops the OTEL exporter. diff --git a/codex-rs/otel/src/metrics/config.rs b/codex-rs/otel/src/metrics/config.rs index ae4b65d4ed8..63e82fbf85b 100644 --- a/codex-rs/otel/src/metrics/config.rs +++ b/codex-rs/otel/src/metrics/config.rs @@ -7,6 +7,7 @@ use crate::metrics::validation::validate_tag_key; use crate::metrics::validation::validate_tag_value; use std::collections::BTreeMap; use std::time::Duration; +use opentelemetry_sdk::metrics::InMemoryMetricExporter; #[derive(Clone, Debug)] pub(crate) enum MetricsExporter { @@ -16,7 +17,7 @@ pub(crate) enum MetricsExporter { timeout: Duration, user_agent: String, }, - InMemory(opentelemetry_sdk::metrics::InMemoryMetricExporter), + InMemory(InMemoryMetricExporter), } impl MetricsExporter { @@ -107,7 +108,7 @@ impl MetricsConfig { match &self.exporter { MetricsExporter::StatsigHttp { endpoint, timeout, .. 
- } => format!("statsig_http endpoint={} timeout={:?}", endpoint, timeout), + } => format!("statsig_http endpoint={endpoint} timeout={timeout:?}"), MetricsExporter::InMemory(_) => "in_memory".to_string(), } } @@ -116,7 +117,7 @@ impl MetricsConfig { impl Default for MetricsConfig { fn default() -> Self { if cfg!(test) { - Self::statsig("MOCK_API_KEY") + Self::in_memory(InMemoryMetricExporter::default()) } else { Self::statsig(DEFAULT_API_KEY) } From dafbfe311f3894c3e71d94cca12e95cf2eb2f825 Mon Sep 17 00:00:00 2001 From: jif-oai Date: Mon, 22 Dec 2025 15:24:36 +0100 Subject: [PATCH 24/43] Add a few metrics --- codex-rs/core/src/tasks/compact.rs | 6 ++++++ codex-rs/core/src/tasks/review.rs | 6 ++++++ codex-rs/core/src/tasks/undo.rs | 5 +++++ codex-rs/core/src/tasks/user_shell.rs | 6 ++++++ 4 files changed, 23 insertions(+) diff --git a/codex-rs/core/src/tasks/compact.rs b/codex-rs/core/src/tasks/compact.rs index a2a268fbb36..1e46e627513 100644 --- a/codex-rs/core/src/tasks/compact.rs +++ b/codex-rs/core/src/tasks/compact.rs @@ -24,6 +24,12 @@ impl SessionTask for CompactTask { input: Vec, _cancellation_token: CancellationToken, ) -> Option { + let _ = session + .session + .services + .otel_manager + .counter("codex.task.compact", 1, &[]); + let session = session.clone_session(); if crate::compact::should_use_remote_compact_task( session.as_ref(), diff --git a/codex-rs/core/src/tasks/review.rs b/codex-rs/core/src/tasks/review.rs index 00dbc51f405..072a7c07271 100644 --- a/codex-rs/core/src/tasks/review.rs +++ b/codex-rs/core/src/tasks/review.rs @@ -46,6 +46,12 @@ impl SessionTask for ReviewTask { input: Vec, cancellation_token: CancellationToken, ) -> Option { + let _ = session + .session + .services + .otel_manager + .counter("codex.task.review", 1, &[]); + // Start sub-codex conversation and get the receiver for events. 
let output = match start_review_conversation( session.clone(), diff --git a/codex-rs/core/src/tasks/undo.rs b/codex-rs/core/src/tasks/undo.rs index 5da7edd16fa..86232c094ce 100644 --- a/codex-rs/core/src/tasks/undo.rs +++ b/codex-rs/core/src/tasks/undo.rs @@ -38,6 +38,11 @@ impl SessionTask for UndoTask { _input: Vec, cancellation_token: CancellationToken, ) -> Option { + let _ = session + .session + .services + .otel_manager + .counter("codex.task.undo", 1, &[]); let sess = session.clone_session(); sess.send_event( ctx.as_ref(), diff --git a/codex-rs/core/src/tasks/user_shell.rs b/codex-rs/core/src/tasks/user_shell.rs index aec09514ca3..e76f70253ec 100644 --- a/codex-rs/core/src/tasks/user_shell.rs +++ b/codex-rs/core/src/tasks/user_shell.rs @@ -58,6 +58,12 @@ impl SessionTask for UserShellCommandTask { _input: Vec, cancellation_token: CancellationToken, ) -> Option { + let _ = session + .session + .services + .otel_manager + .counter("codex.task.user_shell", 1, &[]); + let event = EventMsg::TaskStarted(TaskStartedEvent { model_context_window: turn_context.client.get_model_context_window(), }); From e31246240d475074fcde1d990d76de083cb8cde6 Mon Sep 17 00:00:00 2001 From: jif-oai Date: Mon, 22 Dec 2025 15:34:27 +0100 Subject: [PATCH 25/43] Use a trait --- codex-rs/otel/src/metrics/client.rs | 16 +- codex-rs/otel/src/metrics/config.rs | 8 +- codex-rs/otel/src/metrics/exporter.rs | 283 +------------------- codex-rs/otel/src/metrics/mod.rs | 6 +- codex-rs/otel/src/metrics/sink.rs | 41 +++ codex-rs/otel/src/metrics/sink/in_memory.rs | 58 ++++ codex-rs/otel/src/metrics/sink/statsig.rs | 202 ++++++++++++++ codex-rs/otel/src/metrics/tags.rs | 20 +- codex-rs/otel/src/metrics/worker.rs | 45 ++-- 9 files changed, 352 insertions(+), 327 deletions(-) create mode 100644 codex-rs/otel/src/metrics/sink.rs create mode 100644 codex-rs/otel/src/metrics/sink/in_memory.rs create mode 100644 codex-rs/otel/src/metrics/sink/statsig.rs diff --git a/codex-rs/otel/src/metrics/client.rs 
b/codex-rs/otel/src/metrics/client.rs index c299ea02c4d..ef71b253484 100644 --- a/codex-rs/otel/src/metrics/client.rs +++ b/codex-rs/otel/src/metrics/client.rs @@ -5,12 +5,13 @@ use crate::metrics::config::MetricsExporter; use crate::metrics::error::MetricsError; use crate::metrics::error::Result; use crate::metrics::exporter::MetricEvent; -use crate::metrics::exporter::build_worker_exporter; -use crate::metrics::tags::collect_tags; +use crate::metrics::sink::build_metric_sink; +use crate::metrics::tags::merge_tags; use crate::metrics::time::duration_to_millis; use crate::metrics::validation::validate_metric_name; use crate::metrics::validation::validate_tags; use crate::metrics::worker::spawn_worker; +use std::collections::BTreeMap; use std::sync::Mutex; use std::thread; use std::time::Duration; @@ -18,12 +19,13 @@ use std::time::Instant; use tokio::runtime::Runtime; use tokio::sync::mpsc; -/// Background metrics client that enqueues metrics to a tokio-backed worker. +/// Background metrics client that enqueues metrics to a worker thread. #[derive(Clone)] pub struct MetricsClient { sender: std::sync::Arc>>>, handle: std::sync::Arc>>>, capacity: usize, + default_tags: BTreeMap, } impl std::fmt::Debug for MetricsClient { @@ -55,7 +57,8 @@ impl MetricsClient { } let exporter_label = config.exporter_label(); - let exporter = build_worker_exporter(&config)?; + let exporter = build_metric_sink(&config)?; + let default_tags = config.default_tags.clone(); let runtime = build_runtime()?; let (sender, receiver) = mpsc::channel(capacity); @@ -65,13 +68,14 @@ impl MetricsClient { sender: std::sync::Arc::new(Mutex::new(Some(sender))), handle: std::sync::Arc::new(Mutex::new(Some(handle))), capacity, + default_tags, }) } /// Send a single counter increment without blocking the caller. 
pub fn counter(&self, name: &str, inc: i64, tags: &[(&str, &str)]) -> Result<()> { validate_metric_name(name)?; - let tags = collect_tags(tags)?; + let tags = merge_tags(&self.default_tags, tags)?; self.send_event(MetricEvent::Counter { name: name.to_string(), value: inc, @@ -82,7 +86,7 @@ impl MetricsClient { /// Send a single histogram sample. pub fn histogram(&self, name: &str, value: i64, tags: &[(&str, &str)]) -> Result<()> { validate_metric_name(name)?; - let tags = collect_tags(tags)?; + let tags = merge_tags(&self.default_tags, tags)?; self.send_event(MetricEvent::Histogram { name: name.to_string(), value, diff --git a/codex-rs/otel/src/metrics/config.rs b/codex-rs/otel/src/metrics/config.rs index 63e82fbf85b..bdaa14f9b85 100644 --- a/codex-rs/otel/src/metrics/config.rs +++ b/codex-rs/otel/src/metrics/config.rs @@ -1,13 +1,13 @@ -use crate::metrics::DEFAULT_API_KEY; -use crate::metrics::DEFAULT_API_KEY_HEADER; -use crate::metrics::DEFAULT_STATSIG_ENDPOINT; use crate::metrics::DEFAULT_TIMEOUT; use crate::metrics::error::Result; +use crate::metrics::sink::statsig::DEFAULT_API_KEY; +use crate::metrics::sink::statsig::DEFAULT_API_KEY_HEADER; +use crate::metrics::sink::statsig::DEFAULT_STATSIG_ENDPOINT; use crate::metrics::validation::validate_tag_key; use crate::metrics::validation::validate_tag_value; +use opentelemetry_sdk::metrics::InMemoryMetricExporter; use std::collections::BTreeMap; use std::time::Duration; -use opentelemetry_sdk::metrics::InMemoryMetricExporter; #[derive(Clone, Debug)] pub(crate) enum MetricsExporter { diff --git a/codex-rs/otel/src/metrics/exporter.rs b/codex-rs/otel/src/metrics/exporter.rs index c2a819c0123..3da943ab8bd 100644 --- a/codex-rs/otel/src/metrics/exporter.rs +++ b/codex-rs/otel/src/metrics/exporter.rs @@ -1,133 +1,43 @@ -use crate::metrics::config::MetricsConfig; -use crate::metrics::config::MetricsExporter; -use crate::metrics::error::MetricsError; -use crate::metrics::error::Result; -use crate::metrics::tags::merge_tags; 
-use crate::metrics::util::error_or_panic; -use chrono::Utc; use opentelemetry::KeyValue; use opentelemetry::metrics::Histogram; use opentelemetry::metrics::Meter; -use opentelemetry::metrics::MeterProvider; use opentelemetry::metrics::UpDownCounter; -use opentelemetry_sdk::metrics::PeriodicReader; -use opentelemetry_sdk::metrics::SdkMeterProvider; -use reqwest::header::HeaderName; -use reqwest::header::HeaderValue; -use reqwest::header::USER_AGENT; -use serde::Serialize; use std::collections::BTreeMap; use std::collections::HashMap; -use std::time::Duration; pub(crate) const METER_NAME: &str = "codex-otel-metrics"; -const STATSIG_USER_ID: &str = "codex-metrics"; -const STATSIG_SDK_TYPE: &str = "codex-otel-rust"; #[derive(Clone, Debug)] pub(crate) enum MetricEvent { Counter { name: String, value: i64, - tags: Vec<(String, String)>, + tags: BTreeMap, }, Histogram { name: String, value: i64, - tags: Vec<(String, String)>, + tags: BTreeMap, }, } -pub(crate) fn build_worker_exporter(config: &MetricsConfig) -> Result { - match &config.exporter { - MetricsExporter::StatsigHttp { - endpoint, - api_key_header, - timeout, - user_agent, - } => Ok(WorkerExporter::Statsig(StatsigExporter::from( - endpoint, - api_key_header, - timeout, - user_agent, - &config.api_key, - &config.default_tags, - )?)), - MetricsExporter::InMemory(exporter) => Ok(WorkerExporter::InMemory( - InMemoryExporter::from(config.default_tags.clone(), exporter.clone()), - )), - } -} - -pub(crate) enum WorkerExporter { - Statsig(StatsigExporter), - InMemory(InMemoryExporter), -} - -pub(crate) struct InMemoryExporter { - recorder: MetricRecorder, - meter_provider: SdkMeterProvider, -} - -impl InMemoryExporter { - fn from( - default_tags: BTreeMap, - exporter: opentelemetry_sdk::metrics::InMemoryMetricExporter, - ) -> Self { - let reader = PeriodicReader::builder(exporter).build(); - let meter_provider = SdkMeterProvider::builder().with_reader(reader).build(); - let meter = meter_provider.meter(METER_NAME); - 
let recorder = MetricRecorder::new(meter, default_tags); - Self { - recorder, - meter_provider, - } - } - - pub(crate) async fn export_events(&mut self, events: Vec, exporter_label: &str) { - for event in events { - self.recorder.record_event(event); - } - if let Err(err) = self.meter_provider.force_flush() { - error_or_panic(format!( - "metrics flush failed: {err} (exporter={exporter_label})" - )); - } - } - - pub(crate) async fn shutdown(&mut self, exporter_label: &str) { - if let Err(err) = self.meter_provider.force_flush() { - error_or_panic(format!( - "metrics flush failed during shutdown: {err} (exporter={exporter_label})" - )); - } - if let Err(err) = self.meter_provider.shutdown() { - error_or_panic(format!( - "metrics shutdown failed: {err} (exporter={exporter_label})" - )); - } - } -} - #[derive(Debug)] -struct MetricRecorder { +pub(crate) struct MetricRecorder { meter: Meter, counters: HashMap>, histograms: HashMap>, - default_tags: BTreeMap, } impl MetricRecorder { - fn new(meter: Meter, default_tags: BTreeMap) -> Self { + pub(crate) fn new(meter: Meter) -> Self { Self { meter, counters: HashMap::new(), histograms: HashMap::new(), - default_tags, } } - fn record_event(&mut self, event: MetricEvent) { + pub(crate) fn record_event(&mut self, event: MetricEvent) { match event { MetricEvent::Counter { name, value, tags } => { self.record_counter(&name, value, &tags); @@ -138,7 +48,7 @@ impl MetricRecorder { } } - fn record_counter(&mut self, name: &str, value: i64, tags: &[(String, String)]) { + fn record_counter(&mut self, name: &str, value: i64, tags: &BTreeMap) { let attributes = self.attributes_for(tags); let name = name.to_string(); let counter = self @@ -148,7 +58,7 @@ impl MetricRecorder { counter.add(value, &attributes); } - fn record_histogram(&mut self, name: &str, value: i64, tags: &[(String, String)]) { + fn record_histogram(&mut self, name: &str, value: i64, tags: &BTreeMap) { let attributes = self.attributes_for(tags); let name = 
name.to_string(); let histogram = self @@ -158,184 +68,9 @@ impl MetricRecorder { histogram.record(value as f64, &attributes); } - fn attributes_for(&self, tags: &[(String, String)]) -> Vec { - let merged = merge_tags(&self.default_tags, tags); - merged - .iter() + fn attributes_for(&self, tags: &BTreeMap) -> Vec { + tags.iter() .map(|(key, value)| KeyValue::new(key.clone(), value.clone())) .collect() } } - -pub(crate) struct StatsigExporter { - client: reqwest::Client, - endpoint: String, - api_key_header: HeaderName, - api_key: HeaderValue, - user_agent: Option, - default_tags: BTreeMap, -} - -impl StatsigExporter { - fn from( - endpoint: &str, - api_key_header: &str, - timeout: &Duration, - user_agent: &str, - api_key: &str, - default_tags: &BTreeMap, - ) -> Result { - let api_key_header = - HeaderName::from_bytes(api_key_header.as_bytes()).map_err(|source| { - MetricsError::InvalidApiKeyHeader { - header: api_key_header.to_string(), - source, - } - })?; - let api_key = - HeaderValue::from_str(api_key).map_err(|source| MetricsError::InvalidHeaderValue { - header: api_key_header.to_string(), - source, - })?; - let user_agent = if user_agent.is_empty() { - None - } else { - Some(HeaderValue::from_str(user_agent).map_err(|source| { - MetricsError::InvalidHeaderValue { - header: "User-Agent".to_string(), - source, - } - })?) 
- }; - let client = reqwest::Client::builder() - .timeout(*timeout) - .build() - .map_err(|source| MetricsError::HttpClientBuild { source })?; - - Ok(Self { - client, - endpoint: endpoint.to_string(), - api_key_header, - api_key, - user_agent, - default_tags: default_tags.clone(), - }) - } - - pub(crate) async fn export_events(&self, events: Vec) -> Result<()> { - if events.is_empty() { - return Ok(()); - } - - let payload = self.build_payload(events); - - let mut request = self - .client - .post(&self.endpoint) - .header(self.api_key_header.clone(), self.api_key.clone()); - - if let Some(user_agent) = &self.user_agent { - request = request.header(USER_AGENT, user_agent.clone()); - } - - let response = request - .json(&payload) - .send() - .await - .map_err(|source| MetricsError::StatsigRequestFailed { source })?; - - if let Err(status_err) = response.error_for_status_ref() { - let status = status_err - .status() - .unwrap_or(reqwest::StatusCode::INTERNAL_SERVER_ERROR); - let body = response.text().await.unwrap_or_default(); - return Err(MetricsError::StatsigResponseError { status, body }); - } - - Ok(()) - } - - fn build_payload(&self, events: Vec) -> StatsigPayload { - let timestamp = Utc::now().timestamp_millis(); - let events = events - .into_iter() - .map(|event| self.event_from_metric(event, timestamp)) - .collect(); - - StatsigPayload { - events, - statsig_metadata: StatsigMetadata { - sdk_type: STATSIG_SDK_TYPE.to_string(), - sdk_version: env!("CARGO_PKG_VERSION").to_string(), - }, - } - } - - fn event_from_metric(&self, event: MetricEvent, timestamp: i64) -> StatsigEvent { - match event { - MetricEvent::Counter { name, value, tags } => StatsigEvent { - event_name: name, - value: value as f64, - metadata: StatsigEventMetadata { - metric_type: "counter".to_string(), - tags: merge_tags(&self.default_tags, &tags), - }, - user: StatsigUser { - user_id: STATSIG_USER_ID.to_string(), - }, - time: timestamp, - }, - MetricEvent::Histogram { name, value, tags } => 
StatsigEvent { - event_name: name, - value: value as f64, - metadata: StatsigEventMetadata { - metric_type: "histogram".to_string(), - tags: merge_tags(&self.default_tags, &tags), - }, - user: StatsigUser { - user_id: STATSIG_USER_ID.to_string(), - }, - time: timestamp, - }, - } - } -} - -#[derive(Debug, Serialize)] -struct StatsigPayload { - events: Vec, - #[serde(rename = "statsigMetadata")] - statsig_metadata: StatsigMetadata, -} - -#[derive(Debug, Serialize)] -struct StatsigEvent { - #[serde(rename = "eventName")] - event_name: String, - value: f64, - metadata: StatsigEventMetadata, - user: StatsigUser, - time: i64, -} - -#[derive(Debug, Serialize)] -struct StatsigEventMetadata { - #[serde(rename = "metric_type")] - metric_type: String, - #[serde(flatten)] - tags: BTreeMap, -} - -#[derive(Debug, Serialize)] -struct StatsigUser { - #[serde(rename = "userID")] - user_id: String, -} - -#[derive(Debug, Serialize)] -struct StatsigMetadata { - #[serde(rename = "sdkType")] - sdk_type: String, - #[serde(rename = "sdkVersion")] - sdk_version: String, -} diff --git a/codex-rs/otel/src/metrics/mod.rs b/codex-rs/otel/src/metrics/mod.rs index 386dff332f6..0355b890fd9 100644 --- a/codex-rs/otel/src/metrics/mod.rs +++ b/codex-rs/otel/src/metrics/mod.rs @@ -2,6 +2,7 @@ mod client; mod config; mod error; mod exporter; +mod sink; mod tags; mod time; mod util; @@ -10,11 +11,6 @@ mod worker; use std::time::Duration; -// Publicly available API key for codex local project. 
-pub(crate) const DEFAULT_API_KEY: &str = "client-MkRuleRQBd6qakfnDYqJVR9JuXcY57Ljly3vi5JVUIO"; -pub(crate) const DEFAULT_STATSIG_ENDPOINT: &str = "https://ab.chatgpt.com/v1/log_event"; -pub(crate) const DEFAULT_API_KEY_HEADER: &str = "statsig-api-key"; - pub(crate) const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10); pub(crate) const DEFAULT_QUEUE_CAPACITY: usize = 1024; pub(crate) const DEFAULT_SHUTDOWN_TIMEOUT: Duration = Duration::from_millis(500); diff --git a/codex-rs/otel/src/metrics/sink.rs b/codex-rs/otel/src/metrics/sink.rs new file mode 100644 index 00000000000..8d5e73b9d01 --- /dev/null +++ b/codex-rs/otel/src/metrics/sink.rs @@ -0,0 +1,41 @@ +use crate::metrics::MetricsConfig; +use crate::metrics::config::MetricsExporter; +use crate::metrics::exporter::MetricEvent; +use crate::metrics::sink::in_memory::InMemoryExporter; +use crate::metrics::sink::statsig::StatsigExporter; +use std::pin::Pin; + +pub(crate) mod in_memory; +pub(crate) mod statsig; + +pub(crate) trait MetricSink: Send { + fn export_batch<'a>( + &'a mut self, + events: Vec, + ) -> Pin> + Send + 'a>>; + fn shutdown<'a>( + &'a mut self, + ) -> Pin> + Send + 'a>>; +} + +pub(crate) fn build_metric_sink( + config: &MetricsConfig, +) -> crate::metrics::Result> { + match &config.exporter { + MetricsExporter::StatsigHttp { + endpoint, + api_key_header, + timeout, + user_agent, + } => Ok(Box::new(StatsigExporter::from( + endpoint, + api_key_header, + timeout, + user_agent, + &config.api_key, + )?)), + MetricsExporter::InMemory(exporter) => { + Ok(Box::new(InMemoryExporter::from(exporter.clone()))) + } + } +} diff --git a/codex-rs/otel/src/metrics/sink/in_memory.rs b/codex-rs/otel/src/metrics/sink/in_memory.rs new file mode 100644 index 00000000000..6b02bcbb153 --- /dev/null +++ b/codex-rs/otel/src/metrics/sink/in_memory.rs @@ -0,0 +1,58 @@ +use crate::metrics::exporter::METER_NAME; +use crate::metrics::exporter::MetricEvent; +use crate::metrics::exporter::MetricRecorder; +use 
crate::metrics::sink::MetricSink; +use crate::metrics::util::error_or_panic; +use opentelemetry::metrics::MeterProvider; +use opentelemetry_sdk::metrics::PeriodicReader; +use opentelemetry_sdk::metrics::SdkMeterProvider; +use std::pin::Pin; + +pub(crate) struct InMemoryExporter { + recorder: MetricRecorder, + meter_provider: SdkMeterProvider, +} + +impl InMemoryExporter { + pub(crate) fn from(exporter: opentelemetry_sdk::metrics::InMemoryMetricExporter) -> Self { + let reader = PeriodicReader::builder(exporter).build(); + let meter_provider = SdkMeterProvider::builder().with_reader(reader).build(); + let meter = meter_provider.meter(METER_NAME); + let recorder = MetricRecorder::new(meter); + Self { + recorder, + meter_provider, + } + } +} + +impl MetricSink for InMemoryExporter { + fn export_batch<'a>( + &'a mut self, + events: Vec, + ) -> Pin> + Send + 'a>> { + Box::pin(async move { + for event in events { + self.recorder.record_event(event); + } + if let Err(err) = self.meter_provider.force_flush() { + error_or_panic(format!("metrics flush failed: {err}")); + } + Ok(()) + }) + } + + fn shutdown<'a>( + &'a mut self, + ) -> Pin> + Send + 'a>> { + Box::pin(async move { + if let Err(err) = self.meter_provider.force_flush() { + error_or_panic(format!("metrics flush failed during shutdown: {err}")); + } + if let Err(err) = self.meter_provider.shutdown() { + error_or_panic(format!("metrics shutdown failed: {err}")); + } + Ok(()) + }) + } +} diff --git a/codex-rs/otel/src/metrics/sink/statsig.rs b/codex-rs/otel/src/metrics/sink/statsig.rs new file mode 100644 index 00000000000..2cb72dd387c --- /dev/null +++ b/codex-rs/otel/src/metrics/sink/statsig.rs @@ -0,0 +1,202 @@ +use crate::metrics::MetricsError; +use crate::metrics::exporter::MetricEvent; +use crate::metrics::sink::MetricSink; +use chrono::Utc; +use http::HeaderName; +use http::HeaderValue; +use http::header::USER_AGENT; +use reqwest::Client; +use serde::Serialize; +use std::collections::BTreeMap; +use 
std::pin::Pin; +use std::time::Duration; + +// Publicly available API key for codex local project. +pub(crate) const DEFAULT_API_KEY: &str = "client-MkRuleRQBd6qakfnDYqJVR9JuXcY57Ljly3vi5JVUIO"; +pub(crate) const DEFAULT_STATSIG_ENDPOINT: &str = "https://ab.chatgpt.com/v1/log_event"; +pub(crate) const DEFAULT_API_KEY_HEADER: &str = "statsig-api-key"; +const STATSIG_USER_ID: &str = "codex-metrics"; +const STATSIG_SDK_TYPE: &str = "codex-otel-rust"; + +pub(crate) struct StatsigExporter { + client: Client, + endpoint: String, + api_key_header: HeaderName, + api_key: HeaderValue, + user_agent: Option, +} + +impl StatsigExporter { + pub(crate) fn from( + endpoint: &str, + api_key_header: &str, + timeout: &Duration, + user_agent: &str, + api_key: &str, + ) -> crate::metrics::Result { + let api_key_header = + HeaderName::from_bytes(api_key_header.as_bytes()).map_err(|source| { + MetricsError::InvalidApiKeyHeader { + header: api_key_header.to_string(), + source, + } + })?; + let api_key = + HeaderValue::from_str(api_key).map_err(|source| MetricsError::InvalidHeaderValue { + header: api_key_header.to_string(), + source, + })?; + let user_agent = if user_agent.is_empty() { + None + } else { + Some(HeaderValue::from_str(user_agent).map_err(|source| { + MetricsError::InvalidHeaderValue { + header: "User-Agent".to_string(), + source, + } + })?) 
+ }; + let client = Client::builder() + .timeout(*timeout) + .build() + .map_err(|source| MetricsError::HttpClientBuild { source })?; + + Ok(Self { + client, + endpoint: endpoint.to_string(), + api_key_header, + api_key, + user_agent, + }) + } + + fn build_payload(&self, events: Vec) -> StatsigPayload { + let timestamp = Utc::now().timestamp_millis(); + let events = events + .into_iter() + .map(|event| self.event_from_metric(event, timestamp)) + .collect(); + + StatsigPayload { + events, + statsig_metadata: StatsigMetadata { + sdk_type: STATSIG_SDK_TYPE.to_string(), + sdk_version: env!("CARGO_PKG_VERSION").to_string(), + }, + } + } + + fn event_from_metric(&self, event: MetricEvent, timestamp: i64) -> StatsigEvent { + match event { + MetricEvent::Counter { name, value, tags } => StatsigEvent { + event_name: name, + value: value as f64, + metadata: StatsigEventMetadata { + metric_type: "counter".to_string(), + tags, + }, + user: StatsigUser { + user_id: STATSIG_USER_ID.to_string(), + }, + time: timestamp, + }, + MetricEvent::Histogram { name, value, tags } => StatsigEvent { + event_name: name, + value: value as f64, + metadata: StatsigEventMetadata { + metric_type: "histogram".to_string(), + tags, + }, + user: StatsigUser { + user_id: STATSIG_USER_ID.to_string(), + }, + time: timestamp, + }, + } + } +} + +impl MetricSink for StatsigExporter { + fn export_batch<'a>( + &'a mut self, + events: Vec, + ) -> Pin> + Send + 'a>> { + Box::pin(async move { + if events.is_empty() { + return Ok(()); + } + + let payload = self.build_payload(events); + + let mut request = self + .client + .post(&self.endpoint) + .header(self.api_key_header.clone(), self.api_key.clone()); + + if let Some(user_agent) = &self.user_agent { + request = request.header(USER_AGENT, user_agent.clone()); + } + + let response = request + .json(&payload) + .send() + .await + .map_err(|source| MetricsError::StatsigRequestFailed { source })?; + + if let Err(status_err) = response.error_for_status_ref() { + let 
status = status_err + .status() + .unwrap_or(reqwest::StatusCode::INTERNAL_SERVER_ERROR); + let body = response.text().await.unwrap_or_default(); + return Err(MetricsError::StatsigResponseError { status, body }); + } + + Ok(()) + }) + } + + fn shutdown<'a>( + &'a mut self, + ) -> Pin> + Send + 'a>> { + Box::pin(async { Ok(()) }) + } +} + +#[derive(Debug, Serialize)] +struct StatsigPayload { + events: Vec, + #[serde(rename = "statsigMetadata")] + statsig_metadata: StatsigMetadata, +} + +#[derive(Debug, Serialize)] +struct StatsigEvent { + #[serde(rename = "eventName")] + event_name: String, + value: f64, + metadata: StatsigEventMetadata, + user: StatsigUser, + time: i64, +} + +#[derive(Debug, Serialize)] +struct StatsigEventMetadata { + #[serde(rename = "metric_type")] + metric_type: String, + #[serde(flatten)] + tags: BTreeMap, +} + +#[derive(Debug, Serialize)] +struct StatsigUser { + #[serde(rename = "userID")] + user_id: String, +} + +#[derive(Debug, Serialize)] +struct StatsigMetadata { + #[serde(rename = "sdkType")] + sdk_type: String, + #[serde(rename = "sdkVersion")] + sdk_version: String, +} diff --git a/codex-rs/otel/src/metrics/tags.rs b/codex-rs/otel/src/metrics/tags.rs index 51766370f44..981bfb46272 100644 --- a/codex-rs/otel/src/metrics/tags.rs +++ b/codex-rs/otel/src/metrics/tags.rs @@ -3,23 +3,15 @@ use crate::metrics::validation::validate_tag_key; use crate::metrics::validation::validate_tag_value; use std::collections::BTreeMap; -pub(crate) fn collect_tags(tags: &[(&str, &str)]) -> Result> { - tags.iter() - .map(|(key, value)| { - validate_tag_key(key)?; - validate_tag_value(value)?; - Ok(((*key).to_string(), (*value).to_string())) - }) - .collect() -} - pub(crate) fn merge_tags( default_tags: &BTreeMap, - tags: &[(String, String)], -) -> BTreeMap { + tags: &[(&str, &str)], +) -> Result> { let mut merged = default_tags.clone(); for (key, value) in tags { - merged.insert(key.clone(), value.clone()); + validate_tag_key(key)?; + 
validate_tag_value(value)?; + merged.insert((*key).to_string(), (*value).to_string()); } - merged + Ok(merged) } diff --git a/codex-rs/otel/src/metrics/worker.rs b/codex-rs/otel/src/metrics/worker.rs index f18da6b8883..5195f29e002 100644 --- a/codex-rs/otel/src/metrics/worker.rs +++ b/codex-rs/otel/src/metrics/worker.rs @@ -1,5 +1,5 @@ use crate::metrics::exporter::MetricEvent; -use crate::metrics::exporter::WorkerExporter; +use crate::metrics::sink::MetricSink; use crate::metrics::util::error_or_panic; use std::thread; use std::time::Duration; @@ -8,9 +8,12 @@ use tokio::runtime::Runtime; use tokio::sync::mpsc; use tokio::sync::mpsc::error::TryRecvError; +const MAX_BATCH_SIZE: usize = 50; +const BATCH_TIMEOUT: Duration = Duration::from_millis(1000); + pub(crate) fn spawn_worker( runtime: Runtime, - exporter: WorkerExporter, + exporter: Box, exporter_label: String, receiver: mpsc::Receiver, ) -> thread::JoinHandle<()> { @@ -21,12 +24,12 @@ pub(crate) fn spawn_worker( } struct MetricsWorker { - exporter: WorkerExporter, + exporter: Box, exporter_label: String, } impl MetricsWorker { - fn new(exporter: WorkerExporter, exporter_label: String) -> Self { + fn new(exporter: Box, exporter_label: String) -> Self { Self { exporter, exporter_label, @@ -42,18 +45,11 @@ impl MetricsWorker { } async fn export_batch(&mut self, events: Vec) { - match &mut self.exporter { - WorkerExporter::Statsig(exporter) => { - if let Err(err) = exporter.export_events(events).await { - error_or_panic(format!( - "statsig metrics export failed: {err} (exporter={})", - self.exporter_label - )); - } - } - WorkerExporter::InMemory(exporter) => { - exporter.export_events(events, &self.exporter_label).await; - } + if let Err(err) = self.exporter.export_batch(events).await { + error_or_panic(format!( + "metrics export failed: {err} (exporter={})", + self.exporter_label + )); } } @@ -64,8 +60,7 @@ impl MetricsWorker { let mut events = Vec::with_capacity(1); events.push(first); - // Fast-path: drain 
anything already enqueued. - while events.len() < 50 { + while events.len() < MAX_BATCH_SIZE { match receiver.try_recv() { Ok(event) => events.push(event), Err(TryRecvError::Empty) => break, @@ -73,13 +68,12 @@ impl MetricsWorker { } } - if events.len() >= 50 { + if events.len() >= MAX_BATCH_SIZE { return events; } - // Small coalescing window to catch near-simultaneous metrics without blocking callers. - let deadline = Instant::now() + Duration::from_millis(1000); - while events.len() < 50 { + let deadline = Instant::now() + BATCH_TIMEOUT; + while events.len() < MAX_BATCH_SIZE { let remaining = deadline.saturating_duration_since(Instant::now()); if remaining.is_zero() { break; @@ -96,8 +90,11 @@ impl MetricsWorker { } async fn shutdown(&mut self) { - if let WorkerExporter::InMemory(exporter) = &mut self.exporter { - exporter.shutdown(&self.exporter_label).await; + if let Err(err) = self.exporter.shutdown().await { + error_or_panic(format!( + "metrics shutdown failed: {err} (exporter={})", + self.exporter_label + )); } } } From 0fcdcc009b841a167372f38e7befb2c2cb2a27ae Mon Sep 17 00:00:00 2001 From: jif-oai Date: Mon, 22 Dec 2025 15:43:20 +0100 Subject: [PATCH 26/43] Further cleaning --- codex-rs/otel/src/metrics/client.rs | 2 +- codex-rs/otel/src/metrics/event.rs | 15 ++++ codex-rs/otel/src/metrics/exporter.rs | 76 --------------------- codex-rs/otel/src/metrics/mod.rs | 3 +- codex-rs/otel/src/metrics/sink.rs | 2 +- codex-rs/otel/src/metrics/sink/in_memory.rs | 66 +++++++++++++++++- codex-rs/otel/src/metrics/sink/statsig.rs | 2 +- codex-rs/otel/src/metrics/worker.rs | 2 +- 8 files changed, 84 insertions(+), 84 deletions(-) create mode 100644 codex-rs/otel/src/metrics/event.rs delete mode 100644 codex-rs/otel/src/metrics/exporter.rs diff --git a/codex-rs/otel/src/metrics/client.rs b/codex-rs/otel/src/metrics/client.rs index ef71b253484..b0d2d3a5fab 100644 --- a/codex-rs/otel/src/metrics/client.rs +++ b/codex-rs/otel/src/metrics/client.rs @@ -1,10 +1,10 @@ use 
crate::metrics::DEFAULT_QUEUE_CAPACITY; use crate::metrics::DEFAULT_SHUTDOWN_TIMEOUT; +use crate::metrics::MetricEvent; use crate::metrics::config::MetricsConfig; use crate::metrics::config::MetricsExporter; use crate::metrics::error::MetricsError; use crate::metrics::error::Result; -use crate::metrics::exporter::MetricEvent; use crate::metrics::sink::build_metric_sink; use crate::metrics::tags::merge_tags; use crate::metrics::time::duration_to_millis; diff --git a/codex-rs/otel/src/metrics/event.rs b/codex-rs/otel/src/metrics/event.rs new file mode 100644 index 00000000000..fdbc04fde11 --- /dev/null +++ b/codex-rs/otel/src/metrics/event.rs @@ -0,0 +1,15 @@ +use std::collections::BTreeMap; + +#[derive(Clone, Debug)] +pub(crate) enum MetricEvent { + Counter { + name: String, + value: i64, + tags: BTreeMap, + }, + Histogram { + name: String, + value: i64, + tags: BTreeMap, + }, +} diff --git a/codex-rs/otel/src/metrics/exporter.rs b/codex-rs/otel/src/metrics/exporter.rs deleted file mode 100644 index 3da943ab8bd..00000000000 --- a/codex-rs/otel/src/metrics/exporter.rs +++ /dev/null @@ -1,76 +0,0 @@ -use opentelemetry::KeyValue; -use opentelemetry::metrics::Histogram; -use opentelemetry::metrics::Meter; -use opentelemetry::metrics::UpDownCounter; -use std::collections::BTreeMap; -use std::collections::HashMap; - -pub(crate) const METER_NAME: &str = "codex-otel-metrics"; - -#[derive(Clone, Debug)] -pub(crate) enum MetricEvent { - Counter { - name: String, - value: i64, - tags: BTreeMap, - }, - Histogram { - name: String, - value: i64, - tags: BTreeMap, - }, -} - -#[derive(Debug)] -pub(crate) struct MetricRecorder { - meter: Meter, - counters: HashMap>, - histograms: HashMap>, -} - -impl MetricRecorder { - pub(crate) fn new(meter: Meter) -> Self { - Self { - meter, - counters: HashMap::new(), - histograms: HashMap::new(), - } - } - - pub(crate) fn record_event(&mut self, event: MetricEvent) { - match event { - MetricEvent::Counter { name, value, tags } => { - 
self.record_counter(&name, value, &tags); - } - MetricEvent::Histogram { name, value, tags } => { - self.record_histogram(&name, value, &tags); - } - } - } - - fn record_counter(&mut self, name: &str, value: i64, tags: &BTreeMap) { - let attributes = self.attributes_for(tags); - let name = name.to_string(); - let counter = self - .counters - .entry(name.clone()) - .or_insert_with(|| self.meter.i64_up_down_counter(name.clone()).build()); - counter.add(value, &attributes); - } - - fn record_histogram(&mut self, name: &str, value: i64, tags: &BTreeMap) { - let attributes = self.attributes_for(tags); - let name = name.to_string(); - let histogram = self - .histograms - .entry(name.clone()) - .or_insert_with(|| self.meter.f64_histogram(name.clone()).build()); - histogram.record(value as f64, &attributes); - } - - fn attributes_for(&self, tags: &BTreeMap) -> Vec { - tags.iter() - .map(|(key, value)| KeyValue::new(key.clone(), value.clone())) - .collect() - } -} diff --git a/codex-rs/otel/src/metrics/mod.rs b/codex-rs/otel/src/metrics/mod.rs index 0355b890fd9..6b1daca73d3 100644 --- a/codex-rs/otel/src/metrics/mod.rs +++ b/codex-rs/otel/src/metrics/mod.rs @@ -1,7 +1,7 @@ mod client; mod config; mod error; -mod exporter; +mod event; mod sink; mod tags; mod time; @@ -20,3 +20,4 @@ pub use crate::metrics::client::MetricsClient; pub use crate::metrics::config::MetricsConfig; pub use crate::metrics::error::MetricsError; pub use crate::metrics::error::Result; +pub(crate) use crate::metrics::event::MetricEvent; diff --git a/codex-rs/otel/src/metrics/sink.rs b/codex-rs/otel/src/metrics/sink.rs index 8d5e73b9d01..e5fd2fdf5b2 100644 --- a/codex-rs/otel/src/metrics/sink.rs +++ b/codex-rs/otel/src/metrics/sink.rs @@ -1,6 +1,6 @@ +use crate::metrics::MetricEvent; use crate::metrics::MetricsConfig; use crate::metrics::config::MetricsExporter; -use crate::metrics::exporter::MetricEvent; use crate::metrics::sink::in_memory::InMemoryExporter; use 
crate::metrics::sink::statsig::StatsigExporter; use std::pin::Pin; diff --git a/codex-rs/otel/src/metrics/sink/in_memory.rs b/codex-rs/otel/src/metrics/sink/in_memory.rs index 6b02bcbb153..955daeff1b4 100644 --- a/codex-rs/otel/src/metrics/sink/in_memory.rs +++ b/codex-rs/otel/src/metrics/sink/in_memory.rs @@ -1,13 +1,73 @@ -use crate::metrics::exporter::METER_NAME; -use crate::metrics::exporter::MetricEvent; -use crate::metrics::exporter::MetricRecorder; +use crate::metrics::MetricEvent; use crate::metrics::sink::MetricSink; use crate::metrics::util::error_or_panic; +use opentelemetry::KeyValue; +use opentelemetry::metrics::Histogram; +use opentelemetry::metrics::Meter; use opentelemetry::metrics::MeterProvider; +use opentelemetry::metrics::UpDownCounter; use opentelemetry_sdk::metrics::PeriodicReader; use opentelemetry_sdk::metrics::SdkMeterProvider; +use std::collections::BTreeMap; +use std::collections::HashMap; use std::pin::Pin; +const METER_NAME: &str = "codex-otel-metrics"; + +#[derive(Debug)] +struct MetricRecorder { + meter: Meter, + counters: HashMap>, + histograms: HashMap>, +} + +impl MetricRecorder { + fn new(meter: Meter) -> Self { + Self { + meter, + counters: HashMap::new(), + histograms: HashMap::new(), + } + } + + fn record_event(&mut self, event: MetricEvent) { + match event { + MetricEvent::Counter { name, value, tags } => { + self.record_counter(&name, value, &tags); + } + MetricEvent::Histogram { name, value, tags } => { + self.record_histogram(&name, value, &tags); + } + } + } + + fn record_counter(&mut self, name: &str, value: i64, tags: &BTreeMap) { + let attributes = self.attributes_for(tags); + let name = name.to_string(); + let counter = self + .counters + .entry(name.clone()) + .or_insert_with(|| self.meter.i64_up_down_counter(name.clone()).build()); + counter.add(value, &attributes); + } + + fn record_histogram(&mut self, name: &str, value: i64, tags: &BTreeMap) { + let attributes = self.attributes_for(tags); + let name = 
name.to_string(); + let histogram = self + .histograms + .entry(name.clone()) + .or_insert_with(|| self.meter.f64_histogram(name.clone()).build()); + histogram.record(value as f64, &attributes); + } + + fn attributes_for(&self, tags: &BTreeMap) -> Vec { + tags.iter() + .map(|(key, value)| KeyValue::new(key.clone(), value.clone())) + .collect() + } +} + pub(crate) struct InMemoryExporter { recorder: MetricRecorder, meter_provider: SdkMeterProvider, diff --git a/codex-rs/otel/src/metrics/sink/statsig.rs b/codex-rs/otel/src/metrics/sink/statsig.rs index 2cb72dd387c..7dd17769706 100644 --- a/codex-rs/otel/src/metrics/sink/statsig.rs +++ b/codex-rs/otel/src/metrics/sink/statsig.rs @@ -1,5 +1,5 @@ +use crate::metrics::MetricEvent; use crate::metrics::MetricsError; -use crate::metrics::exporter::MetricEvent; use crate::metrics::sink::MetricSink; use chrono::Utc; use http::HeaderName; diff --git a/codex-rs/otel/src/metrics/worker.rs b/codex-rs/otel/src/metrics/worker.rs index 5195f29e002..f526228ed41 100644 --- a/codex-rs/otel/src/metrics/worker.rs +++ b/codex-rs/otel/src/metrics/worker.rs @@ -1,4 +1,4 @@ -use crate::metrics::exporter::MetricEvent; +use crate::metrics::MetricEvent; use crate::metrics::sink::MetricSink; use crate::metrics::util::error_or_panic; use std::thread; From 0b2164c8831fd16eb29623329826764ac2025138 Mon Sep 17 00:00:00 2001 From: jif-oai Date: Mon, 22 Dec 2025 15:48:14 +0100 Subject: [PATCH 27/43] Better error --- codex-rs/otel/README.md | 3 ++- codex-rs/otel/src/metrics/client.rs | 2 +- codex-rs/otel/src/metrics/error.rs | 3 +++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/codex-rs/otel/README.md b/codex-rs/otel/README.md index 413757af906..cc9490bf235 100644 --- a/codex-rs/otel/README.md +++ b/codex-rs/otel/README.md @@ -96,4 +96,5 @@ metrics.shutdown()?; // flushes in-memory exporter - `OtelManager::shutdown_metrics()` flushes and stops the metrics worker. 
Both are optional because drop performs best-effort shutdown, but calling them -explicitly gives deterministic flushing. +explicitly gives deterministic flushing (or a shutdown error if flushing does +not complete in time). diff --git a/codex-rs/otel/src/metrics/client.rs b/codex-rs/otel/src/metrics/client.rs index b0d2d3a5fab..55520b22aae 100644 --- a/codex-rs/otel/src/metrics/client.rs +++ b/codex-rs/otel/src/metrics/client.rs @@ -198,7 +198,7 @@ impl MetricsClient { return Ok(()); } - Ok(()) + Err(MetricsError::ShutdownTimeout { timeout }) } } diff --git a/codex-rs/otel/src/metrics/error.rs b/codex-rs/otel/src/metrics/error.rs index 022bd352d52..351fb94d9eb 100644 --- a/codex-rs/otel/src/metrics/error.rs +++ b/codex-rs/otel/src/metrics/error.rs @@ -1,3 +1,4 @@ +use std::time::Duration; use thiserror::Error; pub type Result = std::result::Result; @@ -51,6 +52,8 @@ pub enum MetricsError { WorkerUnavailable, #[error("metrics worker thread panicked")] WorkerPanicked, + #[error("metrics shutdown timed out after {timeout:?}")] + ShutdownTimeout { timeout: Duration }, #[error("failed to send statsig metrics request")] StatsigRequestFailed { #[source] From ee430714c235e5edcdaa32fffed779da1f20830f Mon Sep 17 00:00:00 2001 From: jif-oai Date: Mon, 22 Dec 2025 15:55:22 +0100 Subject: [PATCH 28/43] Improve readme --- codex-rs/otel/README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/codex-rs/otel/README.md b/codex-rs/otel/README.md index cc9490bf235..45c56e6466e 100644 --- a/codex-rs/otel/README.md +++ b/codex-rs/otel/README.md @@ -72,6 +72,11 @@ manager.user_prompt(&prompt_items); ## Metrics (Statsig HTTP or in-memory) +Modes: + +- Statsig HTTP: sends `log_event` batches to the configured Statsig endpoint (default outside tests). +- In-memory: records via `opentelemetry_sdk::metrics::InMemoryMetricExporter` for tests/assertions; call `shutdown()` to flush. 
+ Statsig example: ```rust From bd5daad8cee18849f297b4c1729dd361bf8c87b9 Mon Sep 17 00:00:00 2001 From: jif-oai Date: Mon, 22 Dec 2025 16:01:11 +0100 Subject: [PATCH 29/43] Cleaning --- codex-rs/otel/src/lib.rs | 6 ------ codex-rs/otel/src/metrics/client.rs | 20 +++++++++++++++----- codex-rs/otel/src/metrics/tags.rs | 17 ----------------- codex-rs/otel/src/metrics/time.rs | 7 ------- 4 files changed, 15 insertions(+), 35 deletions(-) delete mode 100644 codex-rs/otel/src/metrics/tags.rs delete mode 100644 codex-rs/otel/src/metrics/time.rs diff --git a/codex-rs/otel/src/lib.rs b/codex-rs/otel/src/lib.rs index d12ada5ce81..9e99acf98b2 100644 --- a/codex-rs/otel/src/lib.rs +++ b/codex-rs/otel/src/lib.rs @@ -152,12 +152,6 @@ impl OtelManager { let mut tags = Vec::with_capacity(5); Self::push_metadata_tag(&mut tags, "auth_mode", self.metadata.auth_mode.as_deref())?; Self::push_metadata_tag(&mut tags, "model", Some(self.metadata.model.as_str()))?; - Self::push_metadata_tag(&mut tags, "slug", Some(self.metadata.slug.as_str()))?; - Self::push_metadata_tag( - &mut tags, - "terminal.type", - Some(self.metadata.terminal_type.as_str()), - )?; Self::push_metadata_tag(&mut tags, "app.version", Some(self.metadata.app_version))?; Ok(tags) } diff --git a/codex-rs/otel/src/metrics/client.rs b/codex-rs/otel/src/metrics/client.rs index 55520b22aae..fe541878f59 100644 --- a/codex-rs/otel/src/metrics/client.rs +++ b/codex-rs/otel/src/metrics/client.rs @@ -6,9 +6,7 @@ use crate::metrics::config::MetricsExporter; use crate::metrics::error::MetricsError; use crate::metrics::error::Result; use crate::metrics::sink::build_metric_sink; -use crate::metrics::tags::merge_tags; -use crate::metrics::time::duration_to_millis; -use crate::metrics::validation::validate_metric_name; +use crate::metrics::validation::{validate_metric_name, validate_tag_key, validate_tag_value}; use crate::metrics::validation::validate_tags; use crate::metrics::worker::spawn_worker; use std::collections::BTreeMap; @@ 
-101,8 +99,7 @@ impl MetricsClient { duration: Duration, tags: &[(&str, &str)], ) -> Result<()> { - let millis = duration_to_millis(duration); - self.histogram(name, millis, tags) + self.histogram(name, duration.as_millis().min(i64::MAX as u128) as i64, tags) } /// Measure a closure and emit a histogram sample for the elapsed time. @@ -216,3 +213,16 @@ fn build_runtime() -> Result { .build() .map_err(|source| MetricsError::RuntimeBuild { source }) } + +pub(crate) fn merge_tags( + default_tags: &BTreeMap, + tags: &[(&str, &str)], +) -> Result> { + let mut merged = default_tags.clone(); + for (key, value) in tags { + validate_tag_key(key)?; + validate_tag_value(value)?; + merged.insert((*key).to_string(), (*value).to_string()); + } + Ok(merged) +} \ No newline at end of file diff --git a/codex-rs/otel/src/metrics/tags.rs b/codex-rs/otel/src/metrics/tags.rs deleted file mode 100644 index 981bfb46272..00000000000 --- a/codex-rs/otel/src/metrics/tags.rs +++ /dev/null @@ -1,17 +0,0 @@ -use crate::metrics::error::Result; -use crate::metrics::validation::validate_tag_key; -use crate::metrics::validation::validate_tag_value; -use std::collections::BTreeMap; - -pub(crate) fn merge_tags( - default_tags: &BTreeMap, - tags: &[(&str, &str)], -) -> Result> { - let mut merged = default_tags.clone(); - for (key, value) in tags { - validate_tag_key(key)?; - validate_tag_value(value)?; - merged.insert((*key).to_string(), (*value).to_string()); - } - Ok(merged) -} diff --git a/codex-rs/otel/src/metrics/time.rs b/codex-rs/otel/src/metrics/time.rs deleted file mode 100644 index d68d76fa4e7..00000000000 --- a/codex-rs/otel/src/metrics/time.rs +++ /dev/null @@ -1,7 +0,0 @@ -use std::time::Duration; - -pub(crate) fn duration_to_millis(duration: Duration) -> i64 { - let millis = duration.as_millis(); - let capped = millis.min(i64::MAX as u128); - capped as i64 -} From 12205720a96ea07e1d35cee83e937c4bb9f8399c Mon Sep 17 00:00:00 2001 From: jif-oai Date: Mon, 22 Dec 2025 16:05:06 +0100 
Subject: [PATCH 30/43] Fixes --- codex-rs/otel/src/metrics/client.rs | 12 +++++++++--- codex-rs/otel/src/metrics/mod.rs | 2 -- codex-rs/otel/tests/suite/manager_metrics.rs | 2 -- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/codex-rs/otel/src/metrics/client.rs b/codex-rs/otel/src/metrics/client.rs index fe541878f59..57241b0fa39 100644 --- a/codex-rs/otel/src/metrics/client.rs +++ b/codex-rs/otel/src/metrics/client.rs @@ -6,7 +6,9 @@ use crate::metrics::config::MetricsExporter; use crate::metrics::error::MetricsError; use crate::metrics::error::Result; use crate::metrics::sink::build_metric_sink; -use crate::metrics::validation::{validate_metric_name, validate_tag_key, validate_tag_value}; +use crate::metrics::validation::validate_metric_name; +use crate::metrics::validation::validate_tag_key; +use crate::metrics::validation::validate_tag_value; use crate::metrics::validation::validate_tags; use crate::metrics::worker::spawn_worker; use std::collections::BTreeMap; @@ -99,7 +101,11 @@ impl MetricsClient { duration: Duration, tags: &[(&str, &str)], ) -> Result<()> { - self.histogram(name, duration.as_millis().min(i64::MAX as u128) as i64, tags) + self.histogram( + name, + duration.as_millis().min(i64::MAX as u128) as i64, + tags, + ) } /// Measure a closure and emit a histogram sample for the elapsed time. 
@@ -225,4 +231,4 @@ pub(crate) fn merge_tags( merged.insert((*key).to_string(), (*value).to_string()); } Ok(merged) -} \ No newline at end of file +} diff --git a/codex-rs/otel/src/metrics/mod.rs b/codex-rs/otel/src/metrics/mod.rs index 6b1daca73d3..b1f6d5c21f3 100644 --- a/codex-rs/otel/src/metrics/mod.rs +++ b/codex-rs/otel/src/metrics/mod.rs @@ -3,8 +3,6 @@ mod config; mod error; mod event; mod sink; -mod tags; -mod time; mod util; pub(crate) mod validation; mod worker; diff --git a/codex-rs/otel/tests/suite/manager_metrics.rs b/codex-rs/otel/tests/suite/manager_metrics.rs index 560f4097ac9..bf8f5239c9a 100644 --- a/codex-rs/otel/tests/suite/manager_metrics.rs +++ b/codex-rs/otel/tests/suite/manager_metrics.rs @@ -55,9 +55,7 @@ fn manager_attaches_metadata_tags_to_metrics() -> Result<()> { ("auth_mode".to_string(), AuthMode::ApiKey.to_string()), ("model".to_string(), "gpt-5.1".to_string()), ("service".to_string(), "codex-cli".to_string()), - ("slug".to_string(), "gpt-5.1".to_string()), ("source".to_string(), "tui".to_string()), - ("terminal.type".to_string(), "tty".to_string()), ]); assert_eq!(attrs, expected); From 8e48beac8e7e2ab99864333568a813a241a5b4b8 Mon Sep 17 00:00:00 2001 From: jif-oai Date: Mon, 22 Dec 2025 16:39:30 +0100 Subject: [PATCH 31/43] Fix tests --- codex-rs/core/Cargo.toml | 1 + codex-rs/otel/Cargo.toml | 6 ++++++ codex-rs/otel/README.md | 5 +++++ codex-rs/otel/src/metrics/config.rs | 5 ++++- 4 files changed, 16 insertions(+), 1 deletion(-) diff --git a/codex-rs/core/Cargo.toml b/codex-rs/core/Cargo.toml index 7cb0eb67032..d5e32593176 100644 --- a/codex-rs/core/Cargo.toml +++ b/codex-rs/core/Cargo.toml @@ -122,6 +122,7 @@ assert_cmd = { workspace = true } assert_matches = { workspace = true } codex-arg0 = { workspace = true } codex-core = { path = ".", features = ["deterministic_process_ids"] } +codex-otel = { workspace = true, features = ["test-in-memory-metrics"] } core_test_support = { workspace = true } ctor = { workspace = true } 
escargot = { workspace = true } diff --git a/codex-rs/otel/Cargo.toml b/codex-rs/otel/Cargo.toml index cc8b473f01a..2e323d1f2a4 100644 --- a/codex-rs/otel/Cargo.toml +++ b/codex-rs/otel/Cargo.toml @@ -12,6 +12,12 @@ path = "src/lib.rs" [lints] workspace = true +[features] +# Opt-in switch for integration tests in other crates that depend on `codex-otel`: +# enable this feature in `dev-dependencies` to make `MetricsConfig::default()` use +# an in-memory exporter instead of emitting Statsig HTTP requests. +test-in-memory-metrics = [] + [dependencies] chrono = { workspace = true } codex-app-server-protocol = { workspace = true } diff --git a/codex-rs/otel/README.md b/codex-rs/otel/README.md index 45c56e6466e..554595fa560 100644 --- a/codex-rs/otel/README.md +++ b/codex-rs/otel/README.md @@ -95,6 +95,11 @@ metrics.counter("codex.turns", 1, &[("model", "gpt-5.1")])?; metrics.shutdown()?; // flushes in-memory exporter ``` +Note: `MetricsConfig::default()` only switches to in-memory automatically for unit tests inside +`codex-otel` itself (`cfg(test)`). For integration tests in other crates, either construct an +in-memory config explicitly (as above) or enable the `test-in-memory-metrics` feature on the +`codex-otel` dependency in that crate’s `dev-dependencies`. + ## Shutdown - `OtelProvider::shutdown()` stops the OTEL exporter. diff --git a/codex-rs/otel/src/metrics/config.rs b/codex-rs/otel/src/metrics/config.rs index bdaa14f9b85..ecd28b6a0d9 100644 --- a/codex-rs/otel/src/metrics/config.rs +++ b/codex-rs/otel/src/metrics/config.rs @@ -116,7 +116,10 @@ impl MetricsConfig { impl Default for MetricsConfig { fn default() -> Self { - if cfg!(test) { + // `cfg(test)` only applies to *unit tests* within this crate. Integration tests compile + // `codex-otel` as a normal dependency, so they must opt into the in-memory default via a + // feature (see `test-in-memory-metrics`). 
+ if cfg!(any(test, feature = "test-in-memory-metrics")) { Self::in_memory(InMemoryMetricExporter::default()) } else { Self::statsig(DEFAULT_API_KEY) From 80457cde678a4e6fc9c68ff6ae4704aabc186c98 Mon Sep 17 00:00:00 2001 From: jif-oai Date: Tue, 23 Dec 2025 14:08:32 +0100 Subject: [PATCH 32/43] Fix merge --- codex-rs/core/src/codex.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/codex-rs/core/src/codex.rs b/codex-rs/core/src/codex.rs index 2d4ab981956..32ed74a3b59 100644 --- a/codex-rs/core/src/codex.rs +++ b/codex-rs/core/src/codex.rs @@ -148,7 +148,6 @@ use crate::user_instructions::UserInstructions; use crate::user_notification::UserNotification; use crate::util::backoff; use codex_async_utils::OrCancelExt; -use codex_otel::otel_manager::OtelManager; use codex_execpolicy::Policy as ExecPolicy; use codex_otel::OtelManager; use codex_otel::metrics::MetricsClient; From 88e6096152abb92c29ec2117e54e37ff45b8edc7 Mon Sep 17 00:00:00 2001 From: jif-oai Date: Tue, 23 Dec 2025 16:04:15 +0100 Subject: [PATCH 33/43] Move to otel --- codex-rs/core/Cargo.toml | 6 +- codex-rs/core/src/codex.rs | 6 +- codex-rs/core/src/config/mod.rs | 2 + codex-rs/core/src/config/types.rs | 5 + codex-rs/core/src/otel_init.rs | 48 ++- codex-rs/otel/Cargo.toml | 7 +- codex-rs/otel/README.md | 53 ++- codex-rs/otel/src/config.rs | 15 + codex-rs/otel/src/lib.rs | 2 + codex-rs/otel/src/metrics/client.rs | 359 ++++++++++++-------- codex-rs/otel/src/metrics/config.rs | 122 ++----- codex-rs/otel/src/metrics/error.rs | 57 +--- codex-rs/otel/src/metrics/event.rs | 15 - codex-rs/otel/src/metrics/mod.rs | 24 +- codex-rs/otel/src/metrics/sink.rs | 41 --- codex-rs/otel/src/metrics/sink/in_memory.rs | 118 ------- codex-rs/otel/src/metrics/sink/statsig.rs | 202 ----------- codex-rs/otel/src/metrics/util.rs | 9 - codex-rs/otel/src/metrics/worker.rs | 100 ------ codex-rs/otel/src/otlp.rs | 163 +++++++++ codex-rs/otel/src/traces/otel_manager.rs | 2 +- codex-rs/otel/src/traces/otel_provider.rs | 181 +--------- 
codex-rs/otel/tests/harness/mod.rs | 7 +- codex-rs/otel/tests/suite/validation.rs | 23 +- docs/config.md | 40 ++- docs/example-config.md | 19 ++ docs/telemetry.md | 6 +- 27 files changed, 644 insertions(+), 988 deletions(-) delete mode 100644 codex-rs/otel/src/metrics/event.rs delete mode 100644 codex-rs/otel/src/metrics/sink.rs delete mode 100644 codex-rs/otel/src/metrics/sink/in_memory.rs delete mode 100644 codex-rs/otel/src/metrics/sink/statsig.rs delete mode 100644 codex-rs/otel/src/metrics/util.rs delete mode 100644 codex-rs/otel/src/metrics/worker.rs create mode 100644 codex-rs/otel/src/otlp.rs diff --git a/codex-rs/core/Cargo.toml b/codex-rs/core/Cargo.toml index 0b220a4b2e4..45d4495fa57 100644 --- a/codex-rs/core/Cargo.toml +++ b/codex-rs/core/Cargo.toml @@ -91,7 +91,9 @@ which = { workspace = true } wildmatch = { workspace = true } [features] +default = ["statsig-default-metrics-exporter"] deterministic_process_ids = [] +statsig-default-metrics-exporter = ["codex-otel/statsig-default-metrics-exporter"] test-support = [] @@ -122,8 +124,8 @@ keyring = { workspace = true, features = ["sync-secret-service"] } assert_cmd = { workspace = true } assert_matches = { workspace = true } codex-arg0 = { workspace = true } -codex-core = { path = ".", features = ["deterministic_process_ids"] } -codex-otel = { workspace = true, features = ["test-in-memory-metrics"] } +codex-core = { path = ".", default-features = false, features = ["deterministic_process_ids"] } +codex-otel = { workspace = true } core_test_support = { workspace = true } ctor = { workspace = true } escargot = { workspace = true } diff --git a/codex-rs/core/src/codex.rs b/codex-rs/core/src/codex.rs index 32ed74a3b59..6a94bc504f1 100644 --- a/codex-rs/core/src/codex.rs +++ b/codex-rs/core/src/codex.rs @@ -148,10 +148,7 @@ use crate::user_instructions::UserInstructions; use crate::user_notification::UserNotification; use crate::util::backoff; use codex_async_utils::OrCancelExt; -use codex_execpolicy::Policy 
as ExecPolicy; use codex_otel::OtelManager; -use codex_otel::metrics::MetricsClient; -use codex_otel::metrics::MetricsConfig; use codex_protocol::config_types::ReasoningSummary as ReasoningSummaryConfig; use codex_protocol::models::ContentItem; use codex_protocol::models::ResponseInputItem; @@ -631,8 +628,7 @@ impl Session { config.otel.log_user_prompt, terminal::user_agent(), session_configuration.session_source.clone(), - ) - .with_metrics(MetricsClient::new(MetricsConfig::default())?); + ); config.features.emit_metrics(&otel_manager); otel_manager.conversation_starts( diff --git a/codex-rs/core/src/config/mod.rs b/codex-rs/core/src/config/mod.rs index 90f5ca2e08c..7fb0e6489f6 100644 --- a/codex-rs/core/src/config/mod.rs +++ b/codex-rs/core/src/config/mod.rs @@ -1434,11 +1434,13 @@ impl Config { .unwrap_or(DEFAULT_OTEL_ENVIRONMENT.to_string()); let exporter = t.exporter.unwrap_or(OtelExporterKind::None); let trace_exporter = t.trace_exporter.unwrap_or_else(|| exporter.clone()); + let metrics_exporter = t.metrics_exporter.unwrap_or(OtelExporterKind::None); OtelConfig { log_user_prompt, environment, exporter, trace_exporter, + metrics_exporter, } }, }; diff --git a/codex-rs/core/src/config/types.rs b/codex-rs/core/src/config/types.rs index 3aa72d5ce50..6fa58f7591f 100644 --- a/codex-rs/core/src/config/types.rs +++ b/codex-rs/core/src/config/types.rs @@ -328,6 +328,9 @@ pub struct OtelConfigToml { /// Optional trace exporter pub trace_exporter: Option, + + /// Optional metrics exporter + pub metrics_exporter: Option, } /// Effective OTEL settings after defaults are applied. 
@@ -337,6 +340,7 @@ pub struct OtelConfig { pub environment: String, pub exporter: OtelExporterKind, pub trace_exporter: OtelExporterKind, + pub metrics_exporter: OtelExporterKind, } impl Default for OtelConfig { @@ -346,6 +350,7 @@ impl Default for OtelConfig { environment: DEFAULT_OTEL_ENVIRONMENT.to_owned(), exporter: OtelExporterKind::None, trace_exporter: OtelExporterKind::None, + metrics_exporter: OtelExporterKind::None, } } } diff --git a/codex-rs/core/src/otel_init.rs b/codex-rs/core/src/otel_init.rs index 8ee4746dd20..42fae248df4 100644 --- a/codex-rs/core/src/otel_init.rs +++ b/codex-rs/core/src/otel_init.rs @@ -6,9 +6,13 @@ use codex_otel::config::OtelExporter; use codex_otel::config::OtelHttpProtocol; use codex_otel::config::OtelSettings; use codex_otel::config::OtelTlsConfig as OtelTlsSettings; +use codex_otel::metrics::MetricsConfig; use codex_otel::traces::otel_provider::OtelProvider; use std::error::Error; +#[cfg(feature = "statsig-default-metrics-exporter")] +use codex_otel::config::statsig_default_metrics_exporter; + /// Build an OpenTelemetry provider from the app Config. /// /// Returns `None` when OTEL export is disabled. 
@@ -63,6 +67,19 @@ pub fn build_provider( let exporter = to_otel_exporter(&config.otel.exporter); let trace_exporter = to_otel_exporter(&config.otel.trace_exporter); + let metrics_exporter = to_otel_exporter(&config.otel.metrics_exporter); + + let metrics = match &metrics_exporter { + OtelExporter::None => None, + _ => Some(MetricsConfig::otlp( + config.otel.environment.to_string(), + originator().value.to_owned(), + service_version.to_string(), + metrics_exporter, + )), + }; + + let metrics = metrics.or_else(|| default_metrics(config, service_version)); OtelProvider::from(&OtelSettings { service_name: originator().value.to_owned(), @@ -71,10 +88,39 @@ pub fn build_provider( environment: config.otel.environment.to_string(), exporter, trace_exporter, - metrics: None, + metrics, }) } +#[cfg(feature = "statsig-default-metrics-exporter")] +fn default_metrics(config: &Config, service_version: &str) -> Option { + if is_test_process() { + return None; + } + + if matches!(config.otel.exporter, Kind::None) + && matches!(config.otel.trace_exporter, Kind::None) + { + return None; + } + + Some(MetricsConfig::otlp( + config.otel.environment.to_string(), + originator().value.to_owned(), + service_version.to_string(), + statsig_default_metrics_exporter(), + )) +} + +#[cfg(not(feature = "statsig-default-metrics-exporter"))] +fn default_metrics(_config: &Config, _service_version: &str) -> Option { + None +} + +fn is_test_process() -> bool { + std::env::var_os("RUST_TEST_THREADS").is_some() +} + /// Filter predicate for exporting only Codex-owned events via OTEL. 
/// Keeps events that originated from codex_otel module pub fn codex_export_filter(meta: &tracing::Metadata<'_>) -> bool { diff --git a/codex-rs/otel/Cargo.toml b/codex-rs/otel/Cargo.toml index 2e323d1f2a4..68fb6e595c6 100644 --- a/codex-rs/otel/Cargo.toml +++ b/codex-rs/otel/Cargo.toml @@ -13,10 +13,9 @@ path = "src/lib.rs" workspace = true [features] -# Opt-in switch for integration tests in other crates that depend on `codex-otel`: -# enable this feature in `dev-dependencies` to make `MetricsConfig::default()` use -# an in-memory exporter instead of emitting Statsig HTTP requests. -test-in-memory-metrics = [] +## Provides a built-in default metrics exporter to an internal Statsig endpoint. +## Intended for production binaries; tests should not rely on it. +statsig-default-metrics-exporter = [] [dependencies] chrono = { workspace = true } diff --git a/codex-rs/otel/README.md b/codex-rs/otel/README.md index 554595fa560..b22ce791dd5 100644 --- a/codex-rs/otel/README.md +++ b/codex-rs/otel/README.md @@ -4,7 +4,7 @@ - Trace/log exporters and tracing subscriber layers (`codex_otel::traces::otel_provider`). - A structured event helper (`codex_otel::OtelManager`). -- A Statsig `log_event` metrics client (`codex_otel::metrics`). +- OpenTelemetry metrics support via OTLP exporters (`codex_otel::metrics`). - A metrics facade on `OtelManager` so tracing + metrics share metadata. ## Tracing and logs @@ -70,40 +70,69 @@ let manager = OtelManager::new( manager.user_prompt(&prompt_items); ``` -## Metrics (Statsig HTTP or in-memory) +## Metrics (OTLP or in-memory) Modes: -- Statsig HTTP: sends `log_event` batches to the configured Statsig endpoint (default outside tests). +- OTLP: exports metrics via the OpenTelemetry OTLP exporter (HTTP or gRPC). - In-memory: records via `opentelemetry_sdk::metrics::InMemoryMetricExporter` for tests/assertions; call `shutdown()` to flush. 
-Statsig example: +Statsig ingestion (OTLP/HTTP JSON) example: ```rust -let metrics = MetricsClient::new(MetricsConfig::default())?; +use codex_otel::config::{OtelExporter, OtelHttpProtocol}; + +let metrics = MetricsClient::new(MetricsConfig::otlp( + "dev", + "codex-cli", + env!("CARGO_PKG_VERSION"), + OtelExporter::OtlpHttp { + endpoint: "https://api.statsig.com/otlp".to_string(), + headers: std::collections::HashMap::from([( + "statsig-api-key".to_string(), + std::env::var("STATSIG_SERVER_SDK_SECRET")?, + )]), + protocol: OtelHttpProtocol::Json, + tls: None, + }, +))?; metrics.counter("codex.session_started", 1, &[("source", "tui")])?; metrics.histogram("codex.request_latency", 83, &[("route", "chat")])?; ``` +When built with the `codex-otel/statsig-default-metrics-exporter` feature you can also use the +crate-provided defaults (client key + `ab.chatgpt.com`) instead of wiring the header yourself: + +```rust +use codex_otel::config::statsig_default_metrics_exporter; + +let metrics = MetricsClient::new(MetricsConfig::otlp( + "dev", + "codex-cli", + env!("CARGO_PKG_VERSION"), + statsig_default_metrics_exporter(), +))?; +``` + In-memory (tests): ```rust let exporter = InMemoryMetricExporter::default(); -let metrics = MetricsClient::new(MetricsConfig::in_memory(exporter.clone()))?; +let metrics = MetricsClient::new(MetricsConfig::in_memory( + "test", + "codex-cli", + env!("CARGO_PKG_VERSION"), + exporter.clone(), +))?; metrics.counter("codex.turns", 1, &[("model", "gpt-5.1")])?; metrics.shutdown()?; // flushes in-memory exporter ``` -Note: `MetricsConfig::default()` only switches to in-memory automatically for unit tests inside -`codex-otel` itself (`cfg(test)`). For integration tests in other crates, either construct an -in-memory config explicitly (as above) or enable the `test-in-memory-metrics` feature on the -`codex-otel` dependency in that crate’s `dev-dependencies`. - ## Shutdown - `OtelProvider::shutdown()` stops the OTEL exporter. 
-- `OtelManager::shutdown_metrics()` flushes and stops the metrics worker. +- `OtelManager::shutdown_metrics()` flushes and shuts down the metrics provider. Both are optional because drop performs best-effort shutdown, but calling them explicitly gives deterministic flushing (or a shutdown error if flushing does diff --git a/codex-rs/otel/src/config.rs b/codex-rs/otel/src/config.rs index 78a867aa94e..5fc8a24f9f2 100644 --- a/codex-rs/otel/src/config.rs +++ b/codex-rs/otel/src/config.rs @@ -4,6 +4,21 @@ use std::path::PathBuf; use crate::metrics::MetricsConfig; use codex_utils_absolute_path::AbsolutePathBuf; +#[cfg(feature = "statsig-default-metrics-exporter")] +pub fn statsig_default_metrics_exporter() -> OtelExporter { + let headers = std::collections::HashMap::from([( + "statsig-api-key".to_string(), + "client-MkRuleRQBd6qakfnDYqJVR9JuXcY57Ljly3vi5JVUIO".to_string(), + )]); + + OtelExporter::OtlpHttp { + endpoint: "https://ab.chatgpt.com".to_string(), + headers, + protocol: OtelHttpProtocol::Json, + tls: None, + } +} + #[derive(Clone, Debug)] pub struct OtelSettings { pub environment: String, diff --git a/codex-rs/otel/src/lib.rs b/codex-rs/otel/src/lib.rs index 9e99acf98b2..14acd84c5a1 100644 --- a/codex-rs/otel/src/lib.rs +++ b/codex-rs/otel/src/lib.rs @@ -2,6 +2,8 @@ pub mod config; pub mod metrics; pub mod traces; +mod otlp; + use crate::metrics::MetricsClient; use crate::metrics::MetricsConfig; use crate::metrics::Result as MetricsResult; diff --git a/codex-rs/otel/src/metrics/client.rs b/codex-rs/otel/src/metrics/client.rs index 57241b0fa39..74a2fe57fe2 100644 --- a/codex-rs/otel/src/metrics/client.rs +++ b/codex-rs/otel/src/metrics/client.rs @@ -1,97 +1,160 @@ -use crate::metrics::DEFAULT_QUEUE_CAPACITY; -use crate::metrics::DEFAULT_SHUTDOWN_TIMEOUT; -use crate::metrics::MetricEvent; +use crate::config::OtelExporter; +use crate::config::OtelHttpProtocol; +use crate::metrics::MetricsError; +use crate::metrics::Result; use 
crate::metrics::config::MetricsConfig; use crate::metrics::config::MetricsExporter; -use crate::metrics::error::MetricsError; -use crate::metrics::error::Result; -use crate::metrics::sink::build_metric_sink; use crate::metrics::validation::validate_metric_name; use crate::metrics::validation::validate_tag_key; use crate::metrics::validation::validate_tag_value; use crate::metrics::validation::validate_tags; -use crate::metrics::worker::spawn_worker; +use opentelemetry::KeyValue; +use opentelemetry::metrics::Histogram; +use opentelemetry::metrics::Meter; +use opentelemetry::metrics::MeterProvider as _; +use opentelemetry::metrics::UpDownCounter; +use opentelemetry_otlp::OTEL_EXPORTER_OTLP_METRICS_TIMEOUT; +use opentelemetry_otlp::Protocol; +use opentelemetry_otlp::WithExportConfig; +use opentelemetry_otlp::WithHttpConfig; +use opentelemetry_otlp::WithTonicConfig; +use opentelemetry_sdk::Resource; +use opentelemetry_sdk::metrics::PeriodicReader; +use opentelemetry_sdk::metrics::SdkMeterProvider; +use opentelemetry_sdk::metrics::Temporality; +use opentelemetry_semantic_conventions as semconv; use std::collections::BTreeMap; +use std::collections::HashMap; use std::sync::Mutex; -use std::thread; use std::time::Duration; use std::time::Instant; -use tokio::runtime::Runtime; -use tokio::sync::mpsc; - -/// Background metrics client that enqueues metrics to a worker thread. 
-#[derive(Clone)] -pub struct MetricsClient { - sender: std::sync::Arc>>>, - handle: std::sync::Arc>>>, - capacity: usize, +use tracing::debug; + +const ENV_ATTRIBUTE: &str = "env"; +const METER_NAME: &str = "codex"; + +#[derive(Debug)] +struct MetricsClientInner { + meter_provider: SdkMeterProvider, + meter: Meter, + counters: Mutex>>, + histograms: Mutex>>, default_tags: BTreeMap, } -impl std::fmt::Debug for MetricsClient { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("MetricsClient") - .field("capacity", &self.capacity) - .finish() +impl MetricsClientInner { + fn counter(&self, name: &str, inc: i64, tags: &[(&str, &str)]) -> Result<()> { + validate_metric_name(name)?; + let attributes = self.attributes(tags)?; + + let mut counters = self + .counters + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let counter = counters + .entry(name.to_string()) + .or_insert_with(|| self.meter.i64_up_down_counter(name.to_string()).build()); + counter.add(inc, &attributes); + Ok(()) + } + + fn histogram(&self, name: &str, value: i64, tags: &[(&str, &str)]) -> Result<()> { + validate_metric_name(name)?; + let attributes = self.attributes(tags)?; + + let mut histograms = self + .histograms + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let histogram = histograms + .entry(name.to_string()) + .or_insert_with(|| self.meter.f64_histogram(name.to_string()).build()); + histogram.record(value as f64, &attributes); + Ok(()) + } + + fn attributes(&self, tags: &[(&str, &str)]) -> Result> { + if tags.is_empty() { + return Ok(self + .default_tags + .iter() + .map(|(key, value)| KeyValue::new(key.clone(), value.clone())) + .collect()); + } + + let mut merged = self.default_tags.clone(); + for (key, value) in tags { + validate_tag_key(key)?; + validate_tag_value(value)?; + merged.insert((*key).to_string(), (*value).to_string()); + } + + Ok(merged + .into_iter() + .map(|(key, value)| KeyValue::new(key, value)) + 
.collect()) + } + + fn shutdown(&self) -> Result<()> { + debug!("flushing OTEL metrics"); + self.meter_provider + .force_flush() + .map_err(|source| MetricsError::ProviderShutdown { source })?; + self.meter_provider + .shutdown() + .map_err(|source| MetricsError::ProviderShutdown { source })?; + Ok(()) } } +/// OpenTelemetry metrics client used by Codex. +#[derive(Clone, Debug)] +pub struct MetricsClient(std::sync::Arc); + impl MetricsClient { /// Build a metrics client from configuration and validate defaults. pub fn new(config: MetricsConfig) -> Result { - let capacity = DEFAULT_QUEUE_CAPACITY; - - if capacity == 0 { - return Err(MetricsError::QueueCapacityZero); - } - validate_tags(&config.default_tags)?; - if let MetricsExporter::StatsigHttp { endpoint, .. } = &config.exporter { - if endpoint.is_empty() { - return Err(MetricsError::EmptyEndpoint); + let resource = Resource::builder() + .with_service_name(config.service_name.clone()) + .with_attributes(vec![ + KeyValue::new( + semconv::attribute::SERVICE_VERSION, + config.service_version.clone(), + ), + KeyValue::new(ENV_ATTRIBUTE, config.environment.clone()), + ]) + .build(); + + let temporality = Temporality::default(); + let (meter_provider, meter) = match config.exporter { + MetricsExporter::InMemory(exporter) => { + build_provider(resource, exporter, config.export_interval) } - if config.api_key.is_empty() { - return Err(MetricsError::EmptyApiKey); + MetricsExporter::Otlp(exporter) => { + let exporter = build_otlp_metric_exporter(exporter, temporality)?; + build_provider(resource, exporter, config.export_interval) } - } - - let exporter_label = config.exporter_label(); - let exporter = build_metric_sink(&config)?; - let default_tags = config.default_tags.clone(); - let runtime = build_runtime()?; - - let (sender, receiver) = mpsc::channel(capacity); - let handle = spawn_worker(runtime, exporter, exporter_label, receiver); + }; - Ok(Self { - sender: std::sync::Arc::new(Mutex::new(Some(sender))), - handle: 
std::sync::Arc::new(Mutex::new(Some(handle))), - capacity, - default_tags, - }) + Ok(Self(std::sync::Arc::new(MetricsClientInner { + meter_provider, + meter, + counters: Mutex::new(HashMap::new()), + histograms: Mutex::new(HashMap::new()), + default_tags: config.default_tags, + }))) } - /// Send a single counter increment without blocking the caller. + /// Send a single counter increment. pub fn counter(&self, name: &str, inc: i64, tags: &[(&str, &str)]) -> Result<()> { - validate_metric_name(name)?; - let tags = merge_tags(&self.default_tags, tags)?; - self.send_event(MetricEvent::Counter { - name: name.to_string(), - value: inc, - tags, - }) + self.0.counter(name, inc, tags) } /// Send a single histogram sample. pub fn histogram(&self, name: &str, value: i64, tags: &[(&str, &str)]) -> Result<()> { - validate_metric_name(name)?; - let tags = merge_tags(&self.default_tags, tags)?; - self.send_event(MetricEvent::Histogram { - name: name.to_string(), - value, - tags, - }) + self.0.histogram(name, value, tags) } /// Record a duration in milliseconds using a histogram. 
@@ -125,110 +188,114 @@ impl MetricsClient { ) -> Result { let start = Instant::now(); let output = f(); + let duration_result = self.record_duration(name, start.elapsed(), tags); match output { Ok(value) => { - self.record_duration(name, start.elapsed(), tags)?; + duration_result?; Ok(value) } Err(err) => { - let _ = self.record_duration(name, start.elapsed(), tags); + let _ = duration_result; Err(err) } } } - fn send_event(&self, event: MetricEvent) -> Result<()> { - let sender = self - .sender - .lock() - .unwrap_or_else(std::sync::PoisonError::into_inner); - let Some(sender) = sender.as_ref() else { - return Err(MetricsError::WorkerUnavailable); - }; - - match sender.try_send(event) { - Ok(()) => Ok(()), - Err(tokio::sync::mpsc::error::TrySendError::Full(_)) => Err(MetricsError::QueueFull { - capacity: self.capacity, - }), - Err(tokio::sync::mpsc::error::TrySendError::Closed(_)) => { - Err(MetricsError::WorkerUnavailable) - } - } + /// Flush metrics and stop the underlying OTEL meter provider. + pub fn shutdown(&self) -> Result<()> { + self.0.shutdown() } +} - /// Flush queued metrics and stop the worker thread. 
- pub fn shutdown(&self) -> Result<()> { - self.shutdown_inner(DEFAULT_SHUTDOWN_TIMEOUT) +fn build_provider( + resource: Resource, + exporter: E, + interval: Option, +) -> (SdkMeterProvider, Meter) +where + E: opentelemetry_sdk::metrics::exporter::PushMetricExporter + 'static, +{ + let mut reader_builder = PeriodicReader::builder(exporter); + if let Some(interval) = interval { + reader_builder = reader_builder.with_interval(interval); } + let reader = reader_builder.build(); + let provider = SdkMeterProvider::builder() + .with_resource(resource) + .with_reader(reader) + .build(); + let meter = provider.meter(METER_NAME); + (provider, meter) +} - fn shutdown_inner(&self, timeout: Duration) -> Result<()> { - let sender = self - .sender - .lock() - .unwrap_or_else(std::sync::PoisonError::into_inner) - .take(); - let mut handle = self - .handle - .lock() - .unwrap_or_else(std::sync::PoisonError::into_inner); - let Some(handle) = handle.take() else { - return Ok(()); - }; - let mut joined = false; +fn build_otlp_metric_exporter( + exporter: OtelExporter, + temporality: Temporality, +) -> Result { + match exporter { + OtelExporter::None => Err(MetricsError::ExporterDisabled), + OtelExporter::OtlpGrpc { + endpoint, + headers, + tls, + } => { + debug!("Using OTLP Grpc exporter for metrics: {endpoint}"); - // Dropping the sender closes the channel; the worker drains pending events and exits. 
- drop(sender); + let header_map = crate::otlp::build_header_map(&headers); - if timeout.is_zero() { - if handle.is_finished() { - handle.join().map_err(|_| MetricsError::WorkerPanicked)?; - joined = true; - } - } else { - let start = Instant::now(); - while start.elapsed() < timeout { - if handle.is_finished() { - handle.join().map_err(|_| MetricsError::WorkerPanicked)?; - joined = true; - break; - } - thread::sleep(crate::metrics::SHUTDOWN_POLL_INTERVAL); - } - } + let base_tls_config = tonic::transport::ClientTlsConfig::new() + .with_enabled_roots() + .assume_http2(true); + + let tls_config = match tls.as_ref() { + Some(tls) => crate::otlp::build_grpc_tls_config(&endpoint, base_tls_config, tls) + .map_err(|err| MetricsError::InvalidConfig { + message: err.to_string(), + })?, + None => base_tls_config, + }; - if joined { - return Ok(()); + opentelemetry_otlp::MetricExporter::builder() + .with_tonic() + .with_endpoint(endpoint) + .with_temporality(temporality) + .with_metadata(tonic::metadata::MetadataMap::from_headers(header_map)) + .with_tls_config(tls_config) + .build() + .map_err(|source| MetricsError::ExporterBuild { source }) } + OtelExporter::OtlpHttp { + endpoint, + headers, + protocol, + tls, + } => { + debug!("Using OTLP Http exporter for metrics: {endpoint}"); - Err(MetricsError::ShutdownTimeout { timeout }) - } -} + let protocol = match protocol { + OtelHttpProtocol::Binary => Protocol::HttpBinary, + OtelHttpProtocol::Json => Protocol::HttpJson, + }; -impl Drop for MetricsClient { - fn drop(&mut self) { - if std::sync::Arc::strong_count(&self.sender) == 1 { - let _ = self.shutdown_inner(DEFAULT_SHUTDOWN_TIMEOUT); - } - } -} + let mut exporter_builder = opentelemetry_otlp::MetricExporter::builder() + .with_http() + .with_endpoint(endpoint) + .with_temporality(temporality) + .with_protocol(protocol) + .with_headers(headers); -fn build_runtime() -> Result { - tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .map_err(|source| 
MetricsError::RuntimeBuild { source }) -} + if let Some(tls) = tls.as_ref() { + let client = + crate::otlp::build_http_client(tls, OTEL_EXPORTER_OTLP_METRICS_TIMEOUT) + .map_err(|err| MetricsError::InvalidConfig { + message: err.to_string(), + })?; + exporter_builder = exporter_builder.with_http_client(client); + } -pub(crate) fn merge_tags( - default_tags: &BTreeMap, - tags: &[(&str, &str)], -) -> Result> { - let mut merged = default_tags.clone(); - for (key, value) in tags { - validate_tag_key(key)?; - validate_tag_value(value)?; - merged.insert((*key).to_string(), (*value).to_string()); + exporter_builder + .build() + .map_err(|source| MetricsError::ExporterBuild { source }) + } } - Ok(merged) } diff --git a/codex-rs/otel/src/metrics/config.rs b/codex-rs/otel/src/metrics/config.rs index ecd28b6a0d9..c7a459183be 100644 --- a/codex-rs/otel/src/metrics/config.rs +++ b/codex-rs/otel/src/metrics/config.rs @@ -1,8 +1,5 @@ -use crate::metrics::DEFAULT_TIMEOUT; -use crate::metrics::error::Result; -use crate::metrics::sink::statsig::DEFAULT_API_KEY; -use crate::metrics::sink::statsig::DEFAULT_API_KEY_HEADER; -use crate::metrics::sink::statsig::DEFAULT_STATSIG_ENDPOINT; +use crate::config::OtelExporter; +use crate::metrics::Result; use crate::metrics::validation::validate_tag_key; use crate::metrics::validation::validate_tag_value; use opentelemetry_sdk::metrics::InMemoryMetricExporter; @@ -10,71 +7,58 @@ use std::collections::BTreeMap; use std::time::Duration; #[derive(Clone, Debug)] -pub(crate) enum MetricsExporter { - StatsigHttp { - endpoint: String, - api_key_header: String, - timeout: Duration, - user_agent: String, - }, +pub enum MetricsExporter { + Otlp(OtelExporter), InMemory(InMemoryMetricExporter), } -impl MetricsExporter { - pub(crate) fn statsig_defaults() -> Self { - Self::StatsigHttp { - endpoint: DEFAULT_STATSIG_ENDPOINT.to_string(), - api_key_header: DEFAULT_API_KEY_HEADER.to_string(), - timeout: DEFAULT_TIMEOUT, - user_agent: 
format!("codex-otel-metrics/{}", env!("CARGO_PKG_VERSION")), - } - } -} - #[derive(Clone, Debug)] pub struct MetricsConfig { - pub(crate) api_key: String, - pub(crate) default_tags: BTreeMap, + pub(crate) environment: String, + pub(crate) service_name: String, + pub(crate) service_version: String, pub(crate) exporter: MetricsExporter, + pub(crate) export_interval: Option, + pub(crate) default_tags: BTreeMap, } impl MetricsConfig { - /// Create a Statsig config with the provided API key and default settings. - pub fn new(api_key: impl Into) -> Self { - Self::statsig(api_key) - } - - /// Create a Statsig config with the provided API key and default settings. - pub fn statsig(api_key: impl Into) -> Self { + pub fn otlp( + environment: impl Into, + service_name: impl Into, + service_version: impl Into, + exporter: OtelExporter, + ) -> Self { Self { - api_key: api_key.into(), + environment: environment.into(), + service_name: service_name.into(), + service_version: service_version.into(), + exporter: MetricsExporter::Otlp(exporter), + export_interval: None, default_tags: BTreeMap::new(), - exporter: MetricsExporter::statsig_defaults(), } } /// Create an in-memory config (used in tests). - pub fn in_memory(exporter: opentelemetry_sdk::metrics::InMemoryMetricExporter) -> Self { + pub fn in_memory( + environment: impl Into, + service_name: impl Into, + service_version: impl Into, + exporter: InMemoryMetricExporter, + ) -> Self { Self { - api_key: String::new(), - default_tags: BTreeMap::new(), + environment: environment.into(), + service_name: service_name.into(), + service_version: service_version.into(), exporter: MetricsExporter::InMemory(exporter), + export_interval: None, + default_tags: BTreeMap::new(), } } - /// Override the Statsig endpoint. - pub fn with_endpoint(mut self, endpoint: impl Into) -> Self { - if let MetricsExporter::StatsigHttp { endpoint: e, .. } = &mut self.exporter { - *e = endpoint.into(); - } - self - } - - /// Override the API key header name. 
- pub fn with_api_key_header(mut self, header: impl Into) -> Self { - if let MetricsExporter::StatsigHttp { api_key_header, .. } = &mut self.exporter { - *api_key_header = header.into(); - } + /// Override the interval between periodic metric exports. + pub fn with_export_interval(mut self, interval: Duration) -> Self { + self.export_interval = Some(interval); self } @@ -87,42 +71,4 @@ impl MetricsConfig { self.default_tags.insert(key, value); Ok(self) } - - /// Override the HTTP client timeout. - pub fn with_timeout(mut self, timeout: Duration) -> Self { - if let MetricsExporter::StatsigHttp { timeout: t, .. } = &mut self.exporter { - *t = timeout; - } - self - } - - /// Override the HTTP user agent header. - pub fn with_user_agent(mut self, user_agent: impl Into) -> Self { - if let MetricsExporter::StatsigHttp { user_agent: ua, .. } = &mut self.exporter { - *ua = user_agent.into(); - } - self - } - - pub(crate) fn exporter_label(&self) -> String { - match &self.exporter { - MetricsExporter::StatsigHttp { - endpoint, timeout, .. - } => format!("statsig_http endpoint={endpoint} timeout={timeout:?}"), - MetricsExporter::InMemory(_) => "in_memory".to_string(), - } - } -} - -impl Default for MetricsConfig { - fn default() -> Self { - // `cfg(test)` only applies to *unit tests* within this crate. Integration tests compile - // `codex-otel` as a normal dependency, so they must opt into the in-memory default via a - // feature (see `test-in-memory-metrics`). 
- if cfg!(any(test, feature = "test-in-memory-metrics")) { - Self::in_memory(InMemoryMetricExporter::default()) - } else { - Self::statsig(DEFAULT_API_KEY) - } - } } diff --git a/codex-rs/otel/src/metrics/error.rs b/codex-rs/otel/src/metrics/error.rs index 351fb94d9eb..d046708a308 100644 --- a/codex-rs/otel/src/metrics/error.rs +++ b/codex-rs/otel/src/metrics/error.rs @@ -1,4 +1,3 @@ -use std::time::Duration; use thiserror::Error; pub type Result = std::result::Result; @@ -15,53 +14,21 @@ pub enum MetricsError { #[error("{label} contains invalid characters: {value}")] InvalidTagComponent { label: String, value: String }, - // Config. - #[error("failed to build tokio runtime")] - RuntimeBuild { - #[source] - source: std::io::Error, - }, - #[error("invalid api key header: {header}")] - InvalidApiKeyHeader { - header: String, - #[source] - source: reqwest::header::InvalidHeaderName, - }, - #[error("invalid header value: {header}")] - InvalidHeaderValue { - header: String, - #[source] - source: reqwest::header::InvalidHeaderValue, - }, - #[error("failed to build metrics http client")] - HttpClientBuild { + #[error("metrics exporter is disabled")] + ExporterDisabled, + + #[error("failed to build OTLP metrics exporter")] + ExporterBuild { #[source] - source: reqwest::Error, + source: opentelemetry_otlp::ExporterBuildError, }, - #[error("metrics endpoint cannot be empty")] - EmptyEndpoint, - #[error("metrics api key cannot be empty")] - EmptyApiKey, - // Worker. 
- #[error("metrics queue capacity must be positive")] - QueueCapacityZero, - #[error("metrics queue is full (capacity {capacity})")] - QueueFull { capacity: usize }, - #[error("metrics worker is unavailable")] - WorkerUnavailable, - #[error("metrics worker thread panicked")] - WorkerPanicked, - #[error("metrics shutdown timed out after {timeout:?}")] - ShutdownTimeout { timeout: Duration }, - #[error("failed to send statsig metrics request")] - StatsigRequestFailed { + #[error("invalid OTLP metrics configuration: {message}")] + InvalidConfig { message: String }, + + #[error("failed to flush or shutdown metrics provider")] + ProviderShutdown { #[source] - source: reqwest::Error, - }, - #[error("statsig metrics request failed: {status} {body}")] - StatsigResponseError { - status: reqwest::StatusCode, - body: String, + source: opentelemetry_sdk::error::OTelSdkError, }, } diff --git a/codex-rs/otel/src/metrics/event.rs b/codex-rs/otel/src/metrics/event.rs deleted file mode 100644 index fdbc04fde11..00000000000 --- a/codex-rs/otel/src/metrics/event.rs +++ /dev/null @@ -1,15 +0,0 @@ -use std::collections::BTreeMap; - -#[derive(Clone, Debug)] -pub(crate) enum MetricEvent { - Counter { - name: String, - value: i64, - tags: BTreeMap, - }, - Histogram { - name: String, - value: i64, - tags: BTreeMap, - }, -} diff --git a/codex-rs/otel/src/metrics/mod.rs b/codex-rs/otel/src/metrics/mod.rs index b1f6d5c21f3..ccc3f841ada 100644 --- a/codex-rs/otel/src/metrics/mod.rs +++ b/codex-rs/otel/src/metrics/mod.rs @@ -1,21 +1,21 @@ mod client; mod config; mod error; -mod event; -mod sink; -mod util; pub(crate) mod validation; -mod worker; - -use std::time::Duration; - -pub(crate) const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10); -pub(crate) const DEFAULT_QUEUE_CAPACITY: usize = 1024; -pub(crate) const DEFAULT_SHUTDOWN_TIMEOUT: Duration = Duration::from_millis(500); -pub(crate) const SHUTDOWN_POLL_INTERVAL: Duration = Duration::from_millis(10); pub use 
crate::metrics::client::MetricsClient; pub use crate::metrics::config::MetricsConfig; +pub use crate::metrics::config::MetricsExporter; pub use crate::metrics::error::MetricsError; pub use crate::metrics::error::Result; -pub(crate) use crate::metrics::event::MetricEvent; +use std::sync::OnceLock; + +static GLOBAL_METRICS: OnceLock = OnceLock::new(); + +pub(crate) fn install_global(metrics: MetricsClient) { + let _ = GLOBAL_METRICS.set(metrics); +} + +pub(crate) fn global() -> Option { + GLOBAL_METRICS.get().cloned() +} diff --git a/codex-rs/otel/src/metrics/sink.rs b/codex-rs/otel/src/metrics/sink.rs deleted file mode 100644 index e5fd2fdf5b2..00000000000 --- a/codex-rs/otel/src/metrics/sink.rs +++ /dev/null @@ -1,41 +0,0 @@ -use crate::metrics::MetricEvent; -use crate::metrics::MetricsConfig; -use crate::metrics::config::MetricsExporter; -use crate::metrics::sink::in_memory::InMemoryExporter; -use crate::metrics::sink::statsig::StatsigExporter; -use std::pin::Pin; - -pub(crate) mod in_memory; -pub(crate) mod statsig; - -pub(crate) trait MetricSink: Send { - fn export_batch<'a>( - &'a mut self, - events: Vec, - ) -> Pin> + Send + 'a>>; - fn shutdown<'a>( - &'a mut self, - ) -> Pin> + Send + 'a>>; -} - -pub(crate) fn build_metric_sink( - config: &MetricsConfig, -) -> crate::metrics::Result> { - match &config.exporter { - MetricsExporter::StatsigHttp { - endpoint, - api_key_header, - timeout, - user_agent, - } => Ok(Box::new(StatsigExporter::from( - endpoint, - api_key_header, - timeout, - user_agent, - &config.api_key, - )?)), - MetricsExporter::InMemory(exporter) => { - Ok(Box::new(InMemoryExporter::from(exporter.clone()))) - } - } -} diff --git a/codex-rs/otel/src/metrics/sink/in_memory.rs b/codex-rs/otel/src/metrics/sink/in_memory.rs deleted file mode 100644 index 955daeff1b4..00000000000 --- a/codex-rs/otel/src/metrics/sink/in_memory.rs +++ /dev/null @@ -1,118 +0,0 @@ -use crate::metrics::MetricEvent; -use crate::metrics::sink::MetricSink; -use 
crate::metrics::util::error_or_panic; -use opentelemetry::KeyValue; -use opentelemetry::metrics::Histogram; -use opentelemetry::metrics::Meter; -use opentelemetry::metrics::MeterProvider; -use opentelemetry::metrics::UpDownCounter; -use opentelemetry_sdk::metrics::PeriodicReader; -use opentelemetry_sdk::metrics::SdkMeterProvider; -use std::collections::BTreeMap; -use std::collections::HashMap; -use std::pin::Pin; - -const METER_NAME: &str = "codex-otel-metrics"; - -#[derive(Debug)] -struct MetricRecorder { - meter: Meter, - counters: HashMap>, - histograms: HashMap>, -} - -impl MetricRecorder { - fn new(meter: Meter) -> Self { - Self { - meter, - counters: HashMap::new(), - histograms: HashMap::new(), - } - } - - fn record_event(&mut self, event: MetricEvent) { - match event { - MetricEvent::Counter { name, value, tags } => { - self.record_counter(&name, value, &tags); - } - MetricEvent::Histogram { name, value, tags } => { - self.record_histogram(&name, value, &tags); - } - } - } - - fn record_counter(&mut self, name: &str, value: i64, tags: &BTreeMap) { - let attributes = self.attributes_for(tags); - let name = name.to_string(); - let counter = self - .counters - .entry(name.clone()) - .or_insert_with(|| self.meter.i64_up_down_counter(name.clone()).build()); - counter.add(value, &attributes); - } - - fn record_histogram(&mut self, name: &str, value: i64, tags: &BTreeMap) { - let attributes = self.attributes_for(tags); - let name = name.to_string(); - let histogram = self - .histograms - .entry(name.clone()) - .or_insert_with(|| self.meter.f64_histogram(name.clone()).build()); - histogram.record(value as f64, &attributes); - } - - fn attributes_for(&self, tags: &BTreeMap) -> Vec { - tags.iter() - .map(|(key, value)| KeyValue::new(key.clone(), value.clone())) - .collect() - } -} - -pub(crate) struct InMemoryExporter { - recorder: MetricRecorder, - meter_provider: SdkMeterProvider, -} - -impl InMemoryExporter { - pub(crate) fn from(exporter: 
opentelemetry_sdk::metrics::InMemoryMetricExporter) -> Self { - let reader = PeriodicReader::builder(exporter).build(); - let meter_provider = SdkMeterProvider::builder().with_reader(reader).build(); - let meter = meter_provider.meter(METER_NAME); - let recorder = MetricRecorder::new(meter); - Self { - recorder, - meter_provider, - } - } -} - -impl MetricSink for InMemoryExporter { - fn export_batch<'a>( - &'a mut self, - events: Vec, - ) -> Pin> + Send + 'a>> { - Box::pin(async move { - for event in events { - self.recorder.record_event(event); - } - if let Err(err) = self.meter_provider.force_flush() { - error_or_panic(format!("metrics flush failed: {err}")); - } - Ok(()) - }) - } - - fn shutdown<'a>( - &'a mut self, - ) -> Pin> + Send + 'a>> { - Box::pin(async move { - if let Err(err) = self.meter_provider.force_flush() { - error_or_panic(format!("metrics flush failed during shutdown: {err}")); - } - if let Err(err) = self.meter_provider.shutdown() { - error_or_panic(format!("metrics shutdown failed: {err}")); - } - Ok(()) - }) - } -} diff --git a/codex-rs/otel/src/metrics/sink/statsig.rs b/codex-rs/otel/src/metrics/sink/statsig.rs deleted file mode 100644 index 7dd17769706..00000000000 --- a/codex-rs/otel/src/metrics/sink/statsig.rs +++ /dev/null @@ -1,202 +0,0 @@ -use crate::metrics::MetricEvent; -use crate::metrics::MetricsError; -use crate::metrics::sink::MetricSink; -use chrono::Utc; -use http::HeaderName; -use http::HeaderValue; -use http::header::USER_AGENT; -use reqwest::Client; -use serde::Serialize; -use std::collections::BTreeMap; -use std::pin::Pin; -use std::time::Duration; - -// Publicly available API key for codex local project. 
-pub(crate) const DEFAULT_API_KEY: &str = "client-MkRuleRQBd6qakfnDYqJVR9JuXcY57Ljly3vi5JVUIO"; -pub(crate) const DEFAULT_STATSIG_ENDPOINT: &str = "https://ab.chatgpt.com/v1/log_event"; -pub(crate) const DEFAULT_API_KEY_HEADER: &str = "statsig-api-key"; -const STATSIG_USER_ID: &str = "codex-metrics"; -const STATSIG_SDK_TYPE: &str = "codex-otel-rust"; - -pub(crate) struct StatsigExporter { - client: Client, - endpoint: String, - api_key_header: HeaderName, - api_key: HeaderValue, - user_agent: Option, -} - -impl StatsigExporter { - pub(crate) fn from( - endpoint: &str, - api_key_header: &str, - timeout: &Duration, - user_agent: &str, - api_key: &str, - ) -> crate::metrics::Result { - let api_key_header = - HeaderName::from_bytes(api_key_header.as_bytes()).map_err(|source| { - MetricsError::InvalidApiKeyHeader { - header: api_key_header.to_string(), - source, - } - })?; - let api_key = - HeaderValue::from_str(api_key).map_err(|source| MetricsError::InvalidHeaderValue { - header: api_key_header.to_string(), - source, - })?; - let user_agent = if user_agent.is_empty() { - None - } else { - Some(HeaderValue::from_str(user_agent).map_err(|source| { - MetricsError::InvalidHeaderValue { - header: "User-Agent".to_string(), - source, - } - })?) 
- }; - let client = Client::builder() - .timeout(*timeout) - .build() - .map_err(|source| MetricsError::HttpClientBuild { source })?; - - Ok(Self { - client, - endpoint: endpoint.to_string(), - api_key_header, - api_key, - user_agent, - }) - } - - fn build_payload(&self, events: Vec) -> StatsigPayload { - let timestamp = Utc::now().timestamp_millis(); - let events = events - .into_iter() - .map(|event| self.event_from_metric(event, timestamp)) - .collect(); - - StatsigPayload { - events, - statsig_metadata: StatsigMetadata { - sdk_type: STATSIG_SDK_TYPE.to_string(), - sdk_version: env!("CARGO_PKG_VERSION").to_string(), - }, - } - } - - fn event_from_metric(&self, event: MetricEvent, timestamp: i64) -> StatsigEvent { - match event { - MetricEvent::Counter { name, value, tags } => StatsigEvent { - event_name: name, - value: value as f64, - metadata: StatsigEventMetadata { - metric_type: "counter".to_string(), - tags, - }, - user: StatsigUser { - user_id: STATSIG_USER_ID.to_string(), - }, - time: timestamp, - }, - MetricEvent::Histogram { name, value, tags } => StatsigEvent { - event_name: name, - value: value as f64, - metadata: StatsigEventMetadata { - metric_type: "histogram".to_string(), - tags, - }, - user: StatsigUser { - user_id: STATSIG_USER_ID.to_string(), - }, - time: timestamp, - }, - } - } -} - -impl MetricSink for StatsigExporter { - fn export_batch<'a>( - &'a mut self, - events: Vec, - ) -> Pin> + Send + 'a>> { - Box::pin(async move { - if events.is_empty() { - return Ok(()); - } - - let payload = self.build_payload(events); - - let mut request = self - .client - .post(&self.endpoint) - .header(self.api_key_header.clone(), self.api_key.clone()); - - if let Some(user_agent) = &self.user_agent { - request = request.header(USER_AGENT, user_agent.clone()); - } - - let response = request - .json(&payload) - .send() - .await - .map_err(|source| MetricsError::StatsigRequestFailed { source })?; - - if let Err(status_err) = response.error_for_status_ref() { - let 
status = status_err - .status() - .unwrap_or(reqwest::StatusCode::INTERNAL_SERVER_ERROR); - let body = response.text().await.unwrap_or_default(); - return Err(MetricsError::StatsigResponseError { status, body }); - } - - Ok(()) - }) - } - - fn shutdown<'a>( - &'a mut self, - ) -> Pin> + Send + 'a>> { - Box::pin(async { Ok(()) }) - } -} - -#[derive(Debug, Serialize)] -struct StatsigPayload { - events: Vec, - #[serde(rename = "statsigMetadata")] - statsig_metadata: StatsigMetadata, -} - -#[derive(Debug, Serialize)] -struct StatsigEvent { - #[serde(rename = "eventName")] - event_name: String, - value: f64, - metadata: StatsigEventMetadata, - user: StatsigUser, - time: i64, -} - -#[derive(Debug, Serialize)] -struct StatsigEventMetadata { - #[serde(rename = "metric_type")] - metric_type: String, - #[serde(flatten)] - tags: BTreeMap, -} - -#[derive(Debug, Serialize)] -struct StatsigUser { - #[serde(rename = "userID")] - user_id: String, -} - -#[derive(Debug, Serialize)] -struct StatsigMetadata { - #[serde(rename = "sdkType")] - sdk_type: String, - #[serde(rename = "sdkVersion")] - sdk_version: String, -} diff --git a/codex-rs/otel/src/metrics/util.rs b/codex-rs/otel/src/metrics/util.rs deleted file mode 100644 index 60d0ddfe224..00000000000 --- a/codex-rs/otel/src/metrics/util.rs +++ /dev/null @@ -1,9 +0,0 @@ -use tracing::error; - -pub(crate) fn error_or_panic(message: impl ToString) { - if cfg!(debug_assertions) { - panic!("{}", message.to_string()); - } else { - error!("{}", message.to_string()); - } -} diff --git a/codex-rs/otel/src/metrics/worker.rs b/codex-rs/otel/src/metrics/worker.rs deleted file mode 100644 index f526228ed41..00000000000 --- a/codex-rs/otel/src/metrics/worker.rs +++ /dev/null @@ -1,100 +0,0 @@ -use crate::metrics::MetricEvent; -use crate::metrics::sink::MetricSink; -use crate::metrics::util::error_or_panic; -use std::thread; -use std::time::Duration; -use std::time::Instant; -use tokio::runtime::Runtime; -use tokio::sync::mpsc; -use 
tokio::sync::mpsc::error::TryRecvError; - -const MAX_BATCH_SIZE: usize = 50; -const BATCH_TIMEOUT: Duration = Duration::from_millis(1000); - -pub(crate) fn spawn_worker( - runtime: Runtime, - exporter: Box, - exporter_label: String, - receiver: mpsc::Receiver, -) -> thread::JoinHandle<()> { - thread::spawn(move || { - let worker = MetricsWorker::new(exporter, exporter_label); - runtime.block_on(worker.run(receiver)); - }) -} - -struct MetricsWorker { - exporter: Box, - exporter_label: String, -} - -impl MetricsWorker { - fn new(exporter: Box, exporter_label: String) -> Self { - Self { - exporter, - exporter_label, - } - } - - async fn run(mut self, mut receiver: mpsc::Receiver) { - while let Some(event) = receiver.recv().await { - let events = Self::collect_batch(event, &mut receiver).await; - self.export_batch(events).await; - } - self.shutdown().await; - } - - async fn export_batch(&mut self, events: Vec) { - if let Err(err) = self.exporter.export_batch(events).await { - error_or_panic(format!( - "metrics export failed: {err} (exporter={})", - self.exporter_label - )); - } - } - - async fn collect_batch( - first: MetricEvent, - receiver: &mut mpsc::Receiver, - ) -> Vec { - let mut events = Vec::with_capacity(1); - events.push(first); - - while events.len() < MAX_BATCH_SIZE { - match receiver.try_recv() { - Ok(event) => events.push(event), - Err(TryRecvError::Empty) => break, - Err(TryRecvError::Disconnected) => return events, - } - } - - if events.len() >= MAX_BATCH_SIZE { - return events; - } - - let deadline = Instant::now() + BATCH_TIMEOUT; - while events.len() < MAX_BATCH_SIZE { - let remaining = deadline.saturating_duration_since(Instant::now()); - if remaining.is_zero() { - break; - } - - match tokio::time::timeout(remaining, receiver.recv()).await { - Ok(Some(event)) => events.push(event), - Ok(None) => break, - Err(_) => break, - } - } - - events - } - - async fn shutdown(&mut self) { - if let Err(err) = self.exporter.shutdown().await { - 
error_or_panic(format!( - "metrics shutdown failed: {err} (exporter={})", - self.exporter_label - )); - } - } -} diff --git a/codex-rs/otel/src/otlp.rs b/codex-rs/otel/src/otlp.rs new file mode 100644 index 00000000000..1312fe4b074 --- /dev/null +++ b/codex-rs/otel/src/otlp.rs @@ -0,0 +1,163 @@ +use crate::config::OtelTlsConfig; +use codex_utils_absolute_path::AbsolutePathBuf; +use http::Uri; +use opentelemetry_otlp::OTEL_EXPORTER_OTLP_TIMEOUT; +use opentelemetry_otlp::OTEL_EXPORTER_OTLP_TIMEOUT_DEFAULT; +use reqwest::Certificate as ReqwestCertificate; +use reqwest::Identity as ReqwestIdentity; +use reqwest::header::HeaderMap; +use reqwest::header::HeaderName; +use reqwest::header::HeaderValue; +use std::env; +use std::error::Error; +use std::fs; +use std::io; +use std::io::ErrorKind; +use std::path::PathBuf; +use std::time::Duration; +use tonic::transport::Certificate as TonicCertificate; +use tonic::transport::ClientTlsConfig; +use tonic::transport::Identity as TonicIdentity; + +pub(crate) fn build_header_map(headers: &std::collections::HashMap) -> HeaderMap { + let mut header_map = HeaderMap::new(); + for (key, value) in headers { + if let Ok(name) = HeaderName::from_bytes(key.as_bytes()) + && let Ok(val) = HeaderValue::from_str(value) + { + header_map.insert(name, val); + } + } + header_map +} + +pub(crate) fn build_grpc_tls_config( + endpoint: &str, + tls_config: ClientTlsConfig, + tls: &OtelTlsConfig, +) -> Result> { + let uri: Uri = endpoint.parse()?; + let host = uri.host().ok_or_else(|| { + config_error(format!( + "OTLP gRPC endpoint {endpoint} does not include a host" + )) + })?; + + let mut config = tls_config.domain_name(host.to_owned()); + + if let Some(path) = tls.ca_certificate.as_ref() { + let (pem, _) = read_bytes(path)?; + config = config.ca_certificate(TonicCertificate::from_pem(pem)); + } + + match (&tls.client_certificate, &tls.client_private_key) { + (Some(cert_path), Some(key_path)) => { + let (cert_pem, _) = read_bytes(cert_path)?; + let 
(key_pem, _) = read_bytes(key_path)?; + config = config.identity(TonicIdentity::from_pem(cert_pem, key_pem)); + } + (Some(_), None) | (None, Some(_)) => { + return Err(config_error( + "client_certificate and client_private_key must both be provided for mTLS", + )); + } + (None, None) => {} + } + + Ok(config) +} + +/// Build a blocking HTTP client with TLS configuration for OTLP HTTP exporters. +/// +/// We use `reqwest::blocking::Client` because OTEL exporters run on dedicated +/// OS threads that are not necessarily backed by tokio. +pub(crate) fn build_http_client( + tls: &OtelTlsConfig, + timeout_var: &str, +) -> Result> { + if tokio::runtime::Handle::try_current().is_ok() { + tokio::task::block_in_place(|| build_http_client_inner(tls, timeout_var)) + } else { + build_http_client_inner(tls, timeout_var) + } +} + +fn build_http_client_inner( + tls: &OtelTlsConfig, + timeout_var: &str, +) -> Result> { + let mut builder = + reqwest::blocking::Client::builder().timeout(resolve_otlp_timeout(timeout_var)); + + if let Some(path) = tls.ca_certificate.as_ref() { + let (pem, location) = read_bytes(path)?; + let certificate = ReqwestCertificate::from_pem(pem.as_slice()).map_err(|error| { + config_error(format!( + "failed to parse certificate {}: {error}", + location.display() + )) + })?; + builder = builder + .tls_built_in_root_certs(false) + .add_root_certificate(certificate); + } + + match (&tls.client_certificate, &tls.client_private_key) { + (Some(cert_path), Some(key_path)) => { + let (mut cert_pem, cert_location) = read_bytes(cert_path)?; + let (key_pem, key_location) = read_bytes(key_path)?; + cert_pem.extend_from_slice(key_pem.as_slice()); + let identity = ReqwestIdentity::from_pem(cert_pem.as_slice()).map_err(|error| { + config_error(format!( + "failed to parse client identity using {} and {}: {error}", + cert_location.display(), + key_location.display() + )) + })?; + builder = builder.identity(identity).https_only(true); + } + (Some(_), None) | (None, Some(_)) => 
{ + return Err(config_error( + "client_certificate and client_private_key must both be provided for mTLS", + )); + } + (None, None) => {} + } + + builder + .build() + .map_err(|error| Box::new(error) as Box) +} + +pub(crate) fn resolve_otlp_timeout(signal_var: &str) -> Duration { + if let Some(timeout) = read_timeout_env(signal_var) { + return timeout; + } + if let Some(timeout) = read_timeout_env(OTEL_EXPORTER_OTLP_TIMEOUT) { + return timeout; + } + OTEL_EXPORTER_OTLP_TIMEOUT_DEFAULT +} + +fn read_timeout_env(var: &str) -> Option { + let value = env::var(var).ok()?; + let parsed = value.parse::().ok()?; + if parsed < 0 { + return None; + } + Some(Duration::from_millis(parsed as u64)) +} + +fn read_bytes(path: &AbsolutePathBuf) -> Result<(Vec, PathBuf), Box> { + match fs::read(path) { + Ok(bytes) => Ok((bytes, path.to_path_buf())), + Err(error) => Err(Box::new(io::Error::new( + error.kind(), + format!("failed to read {}: {error}", path.display()), + ))), + } +} + +fn config_error(message: impl Into) -> Box { + Box::new(io::Error::new(ErrorKind::InvalidData, message.into())) +} diff --git a/codex-rs/otel/src/traces/otel_manager.rs b/codex-rs/otel/src/traces/otel_manager.rs index 7b915bcc705..1f7ac4030c9 100644 --- a/codex-rs/otel/src/traces/otel_manager.rs +++ b/codex-rs/otel/src/traces/otel_manager.rs @@ -62,7 +62,7 @@ impl OtelManager { terminal_type, }, session_span, - metrics: None, + metrics: crate::metrics::global(), metrics_use_metadata_tags: true, } } diff --git a/codex-rs/otel/src/traces/otel_provider.rs b/codex-rs/otel/src/traces/otel_provider.rs index 0d12e378aab..27b98409051 100644 --- a/codex-rs/otel/src/traces/otel_provider.rs +++ b/codex-rs/otel/src/traces/otel_provider.rs @@ -1,10 +1,7 @@ use crate::config::OtelExporter; use crate::config::OtelHttpProtocol; use crate::config::OtelSettings; -use crate::config::OtelTlsConfig; use crate::metrics::MetricsClient; -use codex_utils_absolute_path::AbsolutePathBuf; -use http::Uri; use opentelemetry::Context; 
use opentelemetry::KeyValue; use opentelemetry::context::ContextGuard; @@ -15,8 +12,6 @@ use opentelemetry::trace::TracerProvider as _; use opentelemetry_appender_tracing::layer::OpenTelemetryTracingBridge; use opentelemetry_otlp::LogExporter; use opentelemetry_otlp::OTEL_EXPORTER_OTLP_LOGS_TIMEOUT; -use opentelemetry_otlp::OTEL_EXPORTER_OTLP_TIMEOUT; -use opentelemetry_otlp::OTEL_EXPORTER_OTLP_TIMEOUT_DEFAULT; use opentelemetry_otlp::OTEL_EXPORTER_OTLP_TRACES_TIMEOUT; use opentelemetry_otlp::Protocol; use opentelemetry_otlp::SpanExporter; @@ -30,25 +25,13 @@ use opentelemetry_sdk::trace::BatchSpanProcessor; use opentelemetry_sdk::trace::SdkTracerProvider; use opentelemetry_sdk::trace::Tracer; use opentelemetry_semantic_conventions as semconv; -use reqwest::Certificate as ReqwestCertificate; -use reqwest::Identity as ReqwestIdentity; -use reqwest::header::HeaderMap; -use reqwest::header::HeaderName; -use reqwest::header::HeaderValue; use std::cell::RefCell; use std::collections::HashMap; use std::env; use std::error::Error; -use std::fs; -use std::io::ErrorKind; -use std::io::{self}; -use std::path::PathBuf; use std::sync::OnceLock; -use std::time::Duration; use tonic::metadata::MetadataMap; -use tonic::transport::Certificate as TonicCertificate; use tonic::transport::ClientTlsConfig; -use tonic::transport::Identity as TonicIdentity; use tracing::debug; use tracing::level_filters::LevelFilter; use tracing::warn; @@ -94,6 +77,10 @@ impl OtelProvider { .map(MetricsClient::new) .transpose()?; + if let Some(metrics) = metrics.as_ref() { + crate::metrics::install_global(metrics.clone()); + } + if !log_enabled && !trace_enabled && metrics.is_none() { debug!("No OTEL exporter enabled in settings."); return Ok(None); @@ -167,6 +154,9 @@ impl Drop for OtelProvider { if let Some(tracer_provider) = &self.tracer_provider { let _ = tracer_provider.shutdown(); } + if let Some(metrics) = &self.metrics { + let _ = metrics.shutdown(); + } } } @@ -248,14 +238,14 @@ fn build_logger( 
} => { debug!("Using OTLP Grpc exporter: {endpoint}"); - let header_map = build_header_map(headers); + let header_map = crate::otlp::build_header_map(headers); let base_tls_config = ClientTlsConfig::new() .with_enabled_roots() .assume_http2(true); let tls_config = match tls.as_ref() { - Some(tls) => build_grpc_tls_config(endpoint, base_tls_config, tls)?, + Some(tls) => crate::otlp::build_grpc_tls_config(endpoint, base_tls_config, tls)?, None => base_tls_config, }; @@ -288,7 +278,7 @@ fn build_logger( .with_headers(headers.clone()); if let Some(tls) = tls.as_ref() { - let client = build_http_client(tls, OTEL_EXPORTER_OTLP_LOGS_TIMEOUT)?; + let client = crate::otlp::build_http_client(tls, OTEL_EXPORTER_OTLP_LOGS_TIMEOUT)?; exporter_builder = exporter_builder.with_http_client(client); } @@ -314,14 +304,14 @@ fn build_tracer_provider( } => { debug!("Using OTLP Grpc exporter for traces: {endpoint}"); - let header_map = build_header_map(headers); + let header_map = crate::otlp::build_header_map(headers); let base_tls_config = ClientTlsConfig::new() .with_enabled_roots() .assume_http2(true); let tls_config = match tls.as_ref() { - Some(tls) => build_grpc_tls_config(endpoint, base_tls_config, tls)?, + Some(tls) => crate::otlp::build_grpc_tls_config(endpoint, base_tls_config, tls)?, None => base_tls_config, }; @@ -352,7 +342,8 @@ fn build_tracer_provider( .with_headers(headers.clone()); if let Some(tls) = tls.as_ref() { - let client = build_http_client(tls, OTEL_EXPORTER_OTLP_TRACES_TIMEOUT)?; + let client = + crate::otlp::build_http_client(tls, OTEL_EXPORTER_OTLP_TRACES_TIMEOUT)?; exporter_builder = exporter_builder.with_http_client(client); } @@ -368,150 +359,6 @@ fn build_tracer_provider( .build()) } -fn build_header_map(headers: &HashMap) -> HeaderMap { - let mut header_map = HeaderMap::new(); - for (key, value) in headers { - if let Ok(name) = HeaderName::from_bytes(key.as_bytes()) - && let Ok(val) = HeaderValue::from_str(value) - { - header_map.insert(name, val); - } 
- } - header_map -} - -fn build_grpc_tls_config( - endpoint: &str, - tls_config: ClientTlsConfig, - tls: &OtelTlsConfig, -) -> Result> { - let uri: Uri = endpoint.parse()?; - let host = uri.host().ok_or_else(|| { - config_error(format!( - "OTLP gRPC endpoint {endpoint} does not include a host" - )) - })?; - - let mut config = tls_config.domain_name(host.to_owned()); - - if let Some(path) = tls.ca_certificate.as_ref() { - let (pem, _) = read_bytes(path)?; - config = config.ca_certificate(TonicCertificate::from_pem(pem)); - } - - match (&tls.client_certificate, &tls.client_private_key) { - (Some(cert_path), Some(key_path)) => { - let (cert_pem, _) = read_bytes(cert_path)?; - let (key_pem, _) = read_bytes(key_path)?; - config = config.identity(TonicIdentity::from_pem(cert_pem, key_pem)); - } - (Some(_), None) | (None, Some(_)) => { - return Err(config_error( - "client_certificate and client_private_key must both be provided for mTLS", - )); - } - (None, None) => {} - } - - Ok(config) -} - -/// Build a blocking HTTP client with TLS configuration for the OTLP HTTP exporter. -/// -/// We use `reqwest::blocking::Client` instead of the async client because the -/// `opentelemetry_sdk` `BatchLogProcessor` spawns a dedicated OS thread that uses -/// `futures_executor::block_on()` rather than tokio. When the async reqwest client's -/// timeout calls `tokio::time::sleep()`, it panics with "no reactor running". -fn build_http_client( - tls: &OtelTlsConfig, - timeout_var: &str, -) -> Result> { - // Wrap in block_in_place because reqwest::blocking::Client creates its own - // internal tokio runtime, which would panic if built directly from an async context. 
- tokio::task::block_in_place(|| build_http_client_inner(tls, timeout_var)) -} - -fn build_http_client_inner( - tls: &OtelTlsConfig, - timeout_var: &str, -) -> Result> { - let mut builder = - reqwest::blocking::Client::builder().timeout(resolve_otlp_timeout(timeout_var)); - - if let Some(path) = tls.ca_certificate.as_ref() { - let (pem, location) = read_bytes(path)?; - let certificate = ReqwestCertificate::from_pem(pem.as_slice()).map_err(|error| { - config_error(format!( - "failed to parse certificate {}: {error}", - location.display() - )) - })?; - // Disable built-in root certificates and use only our custom CA - builder = builder - .tls_built_in_root_certs(false) - .add_root_certificate(certificate); - } - - match (&tls.client_certificate, &tls.client_private_key) { - (Some(cert_path), Some(key_path)) => { - let (mut cert_pem, cert_location) = read_bytes(cert_path)?; - let (key_pem, key_location) = read_bytes(key_path)?; - cert_pem.extend_from_slice(key_pem.as_slice()); - let identity = ReqwestIdentity::from_pem(cert_pem.as_slice()).map_err(|error| { - config_error(format!( - "failed to parse client identity using {} and {}: {error}", - cert_location.display(), - key_location.display() - )) - })?; - builder = builder.identity(identity).https_only(true); - } - (Some(_), None) | (None, Some(_)) => { - return Err(config_error( - "client_certificate and client_private_key must both be provided for mTLS", - )); - } - (None, None) => {} - } - - builder - .build() - .map_err(|error| Box::new(error) as Box) -} - -fn resolve_otlp_timeout(signal_var: &str) -> Duration { - if let Some(timeout) = read_timeout_env(signal_var) { - return timeout; - } - if let Some(timeout) = read_timeout_env(OTEL_EXPORTER_OTLP_TIMEOUT) { - return timeout; - } - OTEL_EXPORTER_OTLP_TIMEOUT_DEFAULT -} - -fn read_timeout_env(var: &str) -> Option { - let value = env::var(var).ok()?; - let parsed = value.parse::().ok()?; - if parsed < 0 { - return None; - } - Some(Duration::from_millis(parsed as 
u64)) -} - -fn read_bytes(path: &AbsolutePathBuf) -> Result<(Vec, PathBuf), Box> { - match fs::read(path) { - Ok(bytes) => Ok((bytes, path.to_path_buf())), - Err(error) => Err(Box::new(io::Error::new( - error.kind(), - format!("failed to read {}: {error}", path.display()), - ))), - } -} - -fn config_error(message: impl Into) -> Box { - Box::new(io::Error::new(ErrorKind::InvalidData, message.into())) -} - #[cfg(test)] mod tests { use super::*; diff --git a/codex-rs/otel/tests/harness/mod.rs b/codex-rs/otel/tests/harness/mod.rs index 30a2e67c3fd..acdba0b7e11 100644 --- a/codex-rs/otel/tests/harness/mod.rs +++ b/codex-rs/otel/tests/harness/mod.rs @@ -13,7 +13,12 @@ pub(crate) fn build_metrics_with_defaults( default_tags: &[(&str, &str)], ) -> Result<(MetricsClient, InMemoryMetricExporter)> { let exporter = InMemoryMetricExporter::default(); - let mut config = MetricsConfig::in_memory(exporter.clone()); + let mut config = MetricsConfig::in_memory( + "test", + "codex-cli", + env!("CARGO_PKG_VERSION"), + exporter.clone(), + ); for (key, value) in default_tags { config = config.with_tag(*key, *value)?; } diff --git a/codex-rs/otel/tests/suite/validation.rs b/codex-rs/otel/tests/suite/validation.rs index 8af1d9720a1..9a33cbc0644 100644 --- a/codex-rs/otel/tests/suite/validation.rs +++ b/codex-rs/otel/tests/suite/validation.rs @@ -6,26 +6,21 @@ use opentelemetry_sdk::metrics::InMemoryMetricExporter; fn build_in_memory_client() -> Result { let exporter = InMemoryMetricExporter::default(); - let config = MetricsConfig::in_memory(exporter); + let config = MetricsConfig::in_memory("test", "codex-cli", env!("CARGO_PKG_VERSION"), exporter); MetricsClient::new(config) } -// Validates missing API key is rejected early. -#[test] -fn empty_api_key_is_rejected() -> Result<()> { - assert!(matches!( - MetricsClient::new(MetricsConfig::new("")), - Err(MetricsError::EmptyApiKey) - )); - Ok(()) -} - // Ensures invalid tag components are rejected during config build. 
#[test] fn invalid_tag_component_is_rejected() -> Result<()> { - let err = MetricsConfig::default() - .with_tag("bad key", "value") - .unwrap_err(); + let err = MetricsConfig::in_memory( + "test", + "codex-cli", + env!("CARGO_PKG_VERSION"), + InMemoryMetricExporter::default(), + ) + .with_tag("bad key", "value") + .unwrap_err(); assert!(matches!( err, MetricsError::InvalidTagComponent { label, value } diff --git a/docs/config.md b/docs/config.md index 5198b22266d..08613aefad9 100644 --- a/docs/config.md +++ b/docs/config.md @@ -538,7 +538,8 @@ Some of the most common MCPs we've seen are: ### otel -Codex can emit [OpenTelemetry](https://opentelemetry.io/) **log events** that +Codex can emit [OpenTelemetry](https://opentelemetry.io/) **log events**, +**trace spans**, and **metrics** that describe each run: outbound API requests, streamed responses, user input, tool-approval decisions, and the result of every tool invocation. Export is **disabled by default** so local runs remain self-contained. Opt in by adding an @@ -548,6 +549,8 @@ tool-approval decisions, and the result of every tool invocation. Export is [otel] environment = "staging" # defaults to "dev" exporter = "none" # defaults to "none"; set to otlp-http or otlp-grpc to send events +trace_exporter = "none" # defaults to `exporter`; set to otlp-http or otlp-grpc to send spans +metrics_exporter = "none" # defaults to "none"; set to otlp-http or otlp-grpc to send metrics log_user_prompt = false # defaults to false; redact prompt text unless explicitly enabled ``` @@ -611,7 +614,7 @@ These event shapes may change as we iterate. ### Choosing an exporter -Set `otel.exporter` to control where events go: +Set `otel.exporter` to control where log events go (the event catalog below): - `none` – leaves instrumentation active but skips exporting. This is the default. 
@@ -652,10 +655,43 @@ client-certificate = "/etc/codex/certs/client.pem" client-private-key = "/etc/codex/certs/client-key.pem" ``` +Statsig supports OTLP ingestion over OTLP/HTTP JSON. To export Codex metrics and traces to Statsig: + +```toml +[otel.metrics_exporter."otlp-http"] +endpoint = "https://api.statsig.com/otlp" +protocol = "json" + +[otel.metrics_exporter."otlp-http".headers] +"statsig-api-key" = "${STATSIG_SERVER_SDK_SECRET}" + +[otel.trace_exporter."otlp-http"] +endpoint = "https://api.statsig.com/otlp" +protocol = "json" + +[otel.trace_exporter."otlp-http".headers] +"statsig-api-key" = "${STATSIG_SERVER_SDK_SECRET}" +``` + +If you have a legacy Statsig client key + custom ingest host, wire those credentials via headers too: + +```toml +[otel.metrics_exporter."otlp-http"] +endpoint = "https://ab.chatgpt.com" +protocol = "json" + +[otel.metrics_exporter."otlp-http".headers] +"statsig-api-key" = "client-MkRuleRQBd6qakfnDYqJVR9JuXcY57Ljly3vi5JVUIO" +``` + If the exporter is `none` nothing is written anywhere; otherwise you must run or point to your own collector. All exporters run on a background batch worker that is flushed on shutdown. +`otel.trace_exporter` and `otel.metrics_exporter` accept the same exporter types as `otel.exporter`, +but send OTEL trace spans and metrics respectively. You can point them at different endpoints if +your backend uses separate ingest URLs per signal. + If you build Codex from source the OTEL crate is still behind an `otel` feature flag; the official prebuilt binaries ship with the feature enabled. When the feature is disabled the telemetry hooks become no-ops so the CLI continues to diff --git a/docs/example-config.md b/docs/example-config.md index bfc467baf87..5821b94b189 100644 --- a/docs/example-config.md +++ b/docs/example-config.md @@ -336,6 +336,10 @@ log_user_prompt = false environment = "dev" # Exporter: none (default) | otlp-http | otlp-grpc exporter = "none" +# Optional trace exporter (spans). 
Default: same as `exporter` +trace_exporter = "none" +# Optional metrics exporter. Default: none +metrics_exporter = "none" # Example OTLP/HTTP exporter configuration # [otel.exporter."otlp-http"] @@ -362,4 +366,19 @@ exporter = "none" # ca-certificate = "certs/otel-ca.pem" # client-certificate = "/etc/codex/certs/client.pem" # client-private-key = "/etc/codex/certs/client-key.pem" + +# Example separate exporters for traces/metrics (e.g., Statsig OTLP ingestion) +# [otel.metrics_exporter."otlp-http"] +# endpoint = "https://api.statsig.com/otlp" +# protocol = "json" +# +# [otel.metrics_exporter."otlp-http".headers] +# "statsig-api-key" = "${STATSIG_SERVER_SDK_SECRET}" +# +# [otel.trace_exporter."otlp-http"] +# endpoint = "https://api.statsig.com/otlp" +# protocol = "json" +# +# [otel.trace_exporter."otlp-http".headers] +# "statsig-api-key" = "${STATSIG_SERVER_SDK_SECRET}" ``` diff --git a/docs/telemetry.md b/docs/telemetry.md index 320cae30c23..289d6c9a08b 100644 --- a/docs/telemetry.md +++ b/docs/telemetry.md @@ -6,9 +6,9 @@ ## Tracing -Codex does not export OpenTelemetry traces today. The only OTEL output is log -events emitted by the `codex_otel` crate, and those are exported only when an -OTEL exporter is configured; otherwise nothing is sent. +Codex can export OpenTelemetry **log events**, **trace spans**, and **metrics** +when OTEL exporters are configured in `config.toml` (`[otel]`). +By default, exporters are disabled and nothing is sent. 
## Feedback From 6900a941b1ae97b2313a951304ebf1ec95c4ed64 Mon Sep 17 00:00:00 2001 From: jif-oai Date: Tue, 23 Dec 2025 16:12:50 +0100 Subject: [PATCH 34/43] NIT --- codex-rs/otel/src/lib.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/codex-rs/otel/src/lib.rs b/codex-rs/otel/src/lib.rs index 14acd84c5a1..cd7d484a3dd 100644 --- a/codex-rs/otel/src/lib.rs +++ b/codex-rs/otel/src/lib.rs @@ -45,11 +45,10 @@ pub struct OtelManager { } impl OtelManager { - pub fn with_model(&self, model: &str, slug: &str) -> Self { - let mut manager = self.clone(); - manager.metadata.model = model.to_owned(); - manager.metadata.slug = slug.to_owned(); - manager + pub fn with_model(mut self, model: &str, slug: &str) -> Self { + self.metadata.model = model.to_owned(); + self.metadata.slug = slug.to_owned(); + self } pub fn with_metrics(mut self, metrics: MetricsClient) -> Self { From 062351dc9d6275a3f0d7848ca5ff3e69a0a7e2d5 Mon Sep 17 00:00:00 2001 From: jif-oai Date: Tue, 23 Dec 2025 16:47:27 +0100 Subject: [PATCH 35/43] Update doc --- codex-rs/core/Cargo.toml | 4 +- codex-rs/core/src/config/mod.rs | 2 +- codex-rs/core/src/config/types.rs | 3 ++ codex-rs/core/src/otel_init.rs | 48 +---------------------- codex-rs/otel/Cargo.toml | 8 ++-- codex-rs/otel/README.md | 19 ++------- codex-rs/otel/src/config.rs | 41 ++++++++++++------- codex-rs/otel/src/metrics/client.rs | 4 ++ codex-rs/otel/src/traces/otel_provider.rs | 38 ++++++++++++------ docs/config.md | 6 +-- 10 files changed, 75 insertions(+), 98 deletions(-) diff --git a/codex-rs/core/Cargo.toml b/codex-rs/core/Cargo.toml index 45d4495fa57..282673db001 100644 --- a/codex-rs/core/Cargo.toml +++ b/codex-rs/core/Cargo.toml @@ -91,9 +91,7 @@ which = { workspace = true } wildmatch = { workspace = true } [features] -default = ["statsig-default-metrics-exporter"] deterministic_process_ids = [] -statsig-default-metrics-exporter = ["codex-otel/statsig-default-metrics-exporter"] test-support = [] @@ -125,7 +123,7 @@ 
assert_cmd = { workspace = true } assert_matches = { workspace = true } codex-arg0 = { workspace = true } codex-core = { path = ".", default-features = false, features = ["deterministic_process_ids"] } -codex-otel = { workspace = true } +codex-otel = { workspace = true, features = ["disable-default-metrics-exporter"] } core_test_support = { workspace = true } ctor = { workspace = true } escargot = { workspace = true } diff --git a/codex-rs/core/src/config/mod.rs b/codex-rs/core/src/config/mod.rs index 7fb0e6489f6..ec4328ad0c7 100644 --- a/codex-rs/core/src/config/mod.rs +++ b/codex-rs/core/src/config/mod.rs @@ -1434,7 +1434,7 @@ impl Config { .unwrap_or(DEFAULT_OTEL_ENVIRONMENT.to_string()); let exporter = t.exporter.unwrap_or(OtelExporterKind::None); let trace_exporter = t.trace_exporter.unwrap_or_else(|| exporter.clone()); - let metrics_exporter = t.metrics_exporter.unwrap_or(OtelExporterKind::None); + let metrics_exporter = t.metrics_exporter.unwrap_or(OtelExporterKind::Statsig); OtelConfig { log_user_prompt, environment, diff --git a/codex-rs/core/src/config/types.rs b/codex-rs/core/src/config/types.rs index 6fa58f7591f..b6acdcb33f6 100644 --- a/codex-rs/core/src/config/types.rs +++ b/codex-rs/core/src/config/types.rs @@ -297,6 +297,7 @@ pub struct OtelTlsConfig { #[serde(rename_all = "kebab-case")] pub enum OtelExporterKind { None, + Statsig, OtlpHttp { endpoint: String, #[serde(default)] @@ -330,6 +331,8 @@ pub struct OtelConfigToml { pub trace_exporter: Option, /// Optional metrics exporter + /// + /// Defaults to `statsig` outside of tests. 
pub metrics_exporter: Option, } diff --git a/codex-rs/core/src/otel_init.rs b/codex-rs/core/src/otel_init.rs index 42fae248df4..a57496ecf5d 100644 --- a/codex-rs/core/src/otel_init.rs +++ b/codex-rs/core/src/otel_init.rs @@ -6,13 +6,9 @@ use codex_otel::config::OtelExporter; use codex_otel::config::OtelHttpProtocol; use codex_otel::config::OtelSettings; use codex_otel::config::OtelTlsConfig as OtelTlsSettings; -use codex_otel::metrics::MetricsConfig; use codex_otel::traces::otel_provider::OtelProvider; use std::error::Error; -#[cfg(feature = "statsig-default-metrics-exporter")] -use codex_otel::config::statsig_default_metrics_exporter; - /// Build an OpenTelemetry provider from the app Config. /// /// Returns `None` when OTEL export is disabled. @@ -22,6 +18,7 @@ pub fn build_provider( ) -> Result, Box> { let to_otel_exporter = |kind: &Kind| match kind { Kind::None => OtelExporter::None, + Kind::Statsig => OtelExporter::Statsig, Kind::OtlpHttp { endpoint, headers, @@ -69,18 +66,6 @@ pub fn build_provider( let trace_exporter = to_otel_exporter(&config.otel.trace_exporter); let metrics_exporter = to_otel_exporter(&config.otel.metrics_exporter); - let metrics = match &metrics_exporter { - OtelExporter::None => None, - _ => Some(MetricsConfig::otlp( - config.otel.environment.to_string(), - originator().value.to_owned(), - service_version.to_string(), - metrics_exporter, - )), - }; - - let metrics = metrics.or_else(|| default_metrics(config, service_version)); - OtelProvider::from(&OtelSettings { service_name: originator().value.to_owned(), service_version: service_version.to_string(), @@ -88,39 +73,10 @@ pub fn build_provider( environment: config.otel.environment.to_string(), exporter, trace_exporter, - metrics, + metrics_exporter, }) } -#[cfg(feature = "statsig-default-metrics-exporter")] -fn default_metrics(config: &Config, service_version: &str) -> Option { - if is_test_process() { - return None; - } - - if matches!(config.otel.exporter, Kind::None) - && 
matches!(config.otel.trace_exporter, Kind::None) - { - return None; - } - - Some(MetricsConfig::otlp( - config.otel.environment.to_string(), - originator().value.to_owned(), - service_version.to_string(), - statsig_default_metrics_exporter(), - )) -} - -#[cfg(not(feature = "statsig-default-metrics-exporter"))] -fn default_metrics(_config: &Config, _service_version: &str) -> Option { - None -} - -fn is_test_process() -> bool { - std::env::var_os("RUST_TEST_THREADS").is_some() -} - /// Filter predicate for exporting only Codex-owned events via OTEL. /// Keeps events that originated from codex_otel module pub fn codex_export_filter(meta: &tracing::Metadata<'_>) -> bool { diff --git a/codex-rs/otel/Cargo.toml b/codex-rs/otel/Cargo.toml index 68fb6e595c6..b0e80074baa 100644 --- a/codex-rs/otel/Cargo.toml +++ b/codex-rs/otel/Cargo.toml @@ -13,9 +13,11 @@ path = "src/lib.rs" workspace = true [features] -## Provides a built-in default metrics exporter to an internal Statsig endpoint. -## Intended for production binaries; tests should not rely on it. -statsig-default-metrics-exporter = [] +## Disables the built-in default metrics exporter. +## +## Intended for use from `dev-dependencies` so unit/integration tests never +## attempt to export metrics over the network. +disable-default-metrics-exporter = [] [dependencies] chrono = { workspace = true } diff --git a/codex-rs/otel/README.md b/codex-rs/otel/README.md index b22ce791dd5..79f4e8f45c9 100644 --- a/codex-rs/otel/README.md +++ b/codex-rs/otel/README.md @@ -36,7 +36,7 @@ let settings = OtelSettings { protocol: OtelHttpProtocol::Binary, tls: None, }, - metrics: None, + metrics_exporter: OtelExporter::None, }; if let Some(provider) = OtelProvider::from(&settings)? { @@ -77,6 +77,9 @@ Modes: - OTLP: exports metrics via the OpenTelemetry OTLP exporter (HTTP or gRPC). - In-memory: records via `opentelemetry_sdk::metrics::InMemoryMetricExporter` for tests/assertions; call `shutdown()` to flush. 
+`codex-otel` also provides `OtelExporter::Statsig`, a shorthand for exporting OTLP/HTTP JSON metrics +to Statsig using Codex-internal defaults. + Statsig ingestion (OTLP/HTTP JSON) example: ```rust @@ -101,20 +104,6 @@ metrics.counter("codex.session_started", 1, &[("source", "tui")])?; metrics.histogram("codex.request_latency", 83, &[("route", "chat")])?; ``` -When built with the `codex-otel/statsig-default-metrics-exporter` feature you can also use the -crate-provided defaults (client key + `ab.chatgpt.com`) instead of wiring the header yourself: - -```rust -use codex_otel::config::statsig_default_metrics_exporter; - -let metrics = MetricsClient::new(MetricsConfig::otlp( - "dev", - "codex-cli", - env!("CARGO_PKG_VERSION"), - statsig_default_metrics_exporter(), -))?; -``` - In-memory (tests): ```rust diff --git a/codex-rs/otel/src/config.rs b/codex-rs/otel/src/config.rs index 5fc8a24f9f2..2e9fff5eca1 100644 --- a/codex-rs/otel/src/config.rs +++ b/codex-rs/otel/src/config.rs @@ -1,21 +1,30 @@ use std::collections::HashMap; use std::path::PathBuf; -use crate::metrics::MetricsConfig; use codex_utils_absolute_path::AbsolutePathBuf; -#[cfg(feature = "statsig-default-metrics-exporter")] -pub fn statsig_default_metrics_exporter() -> OtelExporter { - let headers = std::collections::HashMap::from([( - "statsig-api-key".to_string(), - "client-MkRuleRQBd6qakfnDYqJVR9JuXcY57Ljly3vi5JVUIO".to_string(), - )]); - - OtelExporter::OtlpHttp { - endpoint: "https://ab.chatgpt.com".to_string(), - headers, - protocol: OtelHttpProtocol::Json, - tls: None, +pub(crate) const STATSIG_OTLP_HTTP_ENDPOINT: &str = "https://ab.chatgpt.com/otlp"; +pub(crate) const STATSIG_API_KEY_HEADER: &str = "statsig-api-key"; +pub(crate) const STATSIG_API_KEY: &str = "client-MkRuleRQBd6qakfnDYqJVR9JuXcY57Ljly3vi5JVUIO"; + +pub(crate) fn resolve_exporter(exporter: &OtelExporter) -> OtelExporter { + match exporter { + OtelExporter::Statsig => { + if cfg!(test) || cfg!(feature = 
"disable-default-metrics-exporter") { + return OtelExporter::None; + } + + OtelExporter::OtlpHttp { + endpoint: STATSIG_OTLP_HTTP_ENDPOINT.to_string(), + headers: HashMap::from([( + STATSIG_API_KEY_HEADER.to_string(), + STATSIG_API_KEY.to_string(), + )]), + protocol: OtelHttpProtocol::Json, + tls: None, + } + }, + _ => exporter.clone(), } } @@ -27,7 +36,7 @@ pub struct OtelSettings { pub codex_home: PathBuf, pub exporter: OtelExporter, pub trace_exporter: OtelExporter, - pub metrics: Option, + pub metrics_exporter: OtelExporter, } #[derive(Clone, Debug)] @@ -48,6 +57,10 @@ pub struct OtelTlsConfig { #[derive(Clone, Debug)] pub enum OtelExporter { None, + /// Statsig metrics ingestion exporter using Codex-internal defaults. + /// + /// This is intended for metrics only. + Statsig, OtlpGrpc { endpoint: String, headers: HashMap, diff --git a/codex-rs/otel/src/metrics/client.rs b/codex-rs/otel/src/metrics/client.rs index 74a2fe57fe2..11f1998d824 100644 --- a/codex-rs/otel/src/metrics/client.rs +++ b/codex-rs/otel/src/metrics/client.rs @@ -234,6 +234,10 @@ fn build_otlp_metric_exporter( ) -> Result { match exporter { OtelExporter::None => Err(MetricsError::ExporterDisabled), + OtelExporter::Statsig => build_otlp_metric_exporter( + crate::config::resolve_exporter(&OtelExporter::Statsig), + temporality, + ), OtelExporter::OtlpGrpc { endpoint, headers, diff --git a/codex-rs/otel/src/traces/otel_provider.rs b/codex-rs/otel/src/traces/otel_provider.rs index 27b98409051..5c350be35eb 100644 --- a/codex-rs/otel/src/traces/otel_provider.rs +++ b/codex-rs/otel/src/traces/otel_provider.rs @@ -2,6 +2,7 @@ use crate::config::OtelExporter; use crate::config::OtelHttpProtocol; use crate::config::OtelSettings; use crate::metrics::MetricsClient; +use crate::metrics::MetricsConfig; use opentelemetry::Context; use opentelemetry::KeyValue; use opentelemetry::context::ContextGuard; @@ -47,6 +48,7 @@ thread_local! 
{ static TRACEPARENT_GUARD: RefCell> = const { RefCell::new(None) }; } +// TODO(jif) move OtelProvider out of `traces/` pub struct OtelProvider { pub logger: Option, pub tracer_provider: Option, @@ -71,11 +73,19 @@ impl OtelProvider { let log_enabled = !matches!(settings.exporter, OtelExporter::None); let trace_enabled = !matches!(settings.trace_exporter, OtelExporter::None); - let metrics = settings - .metrics - .clone() - .map(MetricsClient::new) - .transpose()?; + + + let metric_exporter = crate::config::resolve_exporter(&settings.metrics_exporter); + let metrics = if matches!(metric_exporter, OtelExporter::None) { + None + } else { + Some(MetricsClient::new(MetricsConfig::otlp( + settings.environment.clone(), + settings.service_name.clone(), + settings.service_version.clone(), + metric_exporter, + ))?) + }; if let Some(metrics) = metrics.as_ref() { crate::metrics::install_global(metrics.clone()); @@ -229,8 +239,9 @@ fn build_logger( ) -> Result> { let mut builder = SdkLoggerProvider::builder().with_resource(resource.clone()); - match exporter { + match crate::config::resolve_exporter(exporter) { OtelExporter::None => return Ok(builder.build()), + OtelExporter::Statsig => unreachable!("statsig exporter should be resolved"), OtelExporter::OtlpGrpc { endpoint, headers, @@ -238,14 +249,14 @@ fn build_logger( } => { debug!("Using OTLP Grpc exporter: {endpoint}"); - let header_map = crate::otlp::build_header_map(headers); + let header_map = crate::otlp::build_header_map(&headers); let base_tls_config = ClientTlsConfig::new() .with_enabled_roots() .assume_http2(true); let tls_config = match tls.as_ref() { - Some(tls) => crate::otlp::build_grpc_tls_config(endpoint, base_tls_config, tls)?, + Some(tls) => crate::otlp::build_grpc_tls_config(&endpoint, base_tls_config, tls)?, None => base_tls_config, }; @@ -275,7 +286,7 @@ fn build_logger( .with_http() .with_endpoint(endpoint) .with_protocol(protocol) - .with_headers(headers.clone()); + .with_headers(headers); if let 
Some(tls) = tls.as_ref() { let client = crate::otlp::build_http_client(tls, OTEL_EXPORTER_OTLP_LOGS_TIMEOUT)?; @@ -295,8 +306,9 @@ fn build_tracer_provider( resource: &Resource, exporter: &OtelExporter, ) -> Result> { - let span_exporter = match exporter { + let span_exporter = match crate::config::resolve_exporter(exporter) { OtelExporter::None => return Ok(SdkTracerProvider::builder().build()), + OtelExporter::Statsig => unreachable!("statsig exporter should be resolved"), OtelExporter::OtlpGrpc { endpoint, headers, @@ -304,14 +316,14 @@ fn build_tracer_provider( } => { debug!("Using OTLP Grpc exporter for traces: {endpoint}"); - let header_map = crate::otlp::build_header_map(headers); + let header_map = crate::otlp::build_header_map(&headers); let base_tls_config = ClientTlsConfig::new() .with_enabled_roots() .assume_http2(true); let tls_config = match tls.as_ref() { - Some(tls) => crate::otlp::build_grpc_tls_config(endpoint, base_tls_config, tls)?, + Some(tls) => crate::otlp::build_grpc_tls_config(&endpoint, base_tls_config, tls)?, None => base_tls_config, }; @@ -339,7 +351,7 @@ fn build_tracer_provider( .with_http() .with_endpoint(endpoint) .with_protocol(protocol) - .with_headers(headers.clone()); + .with_headers(headers); if let Some(tls) = tls.as_ref() { let client = diff --git a/docs/config.md b/docs/config.md index 08613aefad9..be25da20a2d 100644 --- a/docs/config.md +++ b/docs/config.md @@ -550,7 +550,7 @@ tool-approval decisions, and the result of every tool invocation. 
Export is environment = "staging" # defaults to "dev" exporter = "none" # defaults to "none"; set to otlp-http or otlp-grpc to send events trace_exporter = "none" # defaults to `exporter`; set to otlp-http or otlp-grpc to send spans -metrics_exporter = "none" # defaults to "none"; set to otlp-http or otlp-grpc to send metrics +metrics_exporter = "none" # defaults to "statsig"; set to otlp-http or otlp-grpc to send metrics to custom otel services log_user_prompt = false # defaults to false; redact prompt text unless explicitly enabled ``` @@ -677,11 +677,11 @@ If you have a legacy Statsig client key + custom ingest host, wire those credent ```toml [otel.metrics_exporter."otlp-http"] -endpoint = "https://ab.chatgpt.com" +endpoint = "https://api.statsig.com/otlp" protocol = "json" [otel.metrics_exporter."otlp-http".headers] -"statsig-api-key" = "client-MkRuleRQBd6qakfnDYqJVR9JuXcY57Ljly3vi5JVUIO" +"x-api-key" = "123" ``` If the exporter is `none` nothing is written anywhere; otherwise you must run or point to your From 4c16066f36790ef65cdb26d23481d30e10ef980d Mon Sep 17 00:00:00 2001 From: jif-oai Date: Tue, 23 Dec 2025 17:23:14 +0100 Subject: [PATCH 36/43] Go to u64 --- codex-rs/otel/src/config.rs | 2 +- codex-rs/otel/src/metrics/client.rs | 14 ++++++++++---- codex-rs/otel/src/metrics/error.rs | 3 +++ codex-rs/otel/src/traces/otel_provider.rs | 2 -- codex-rs/otel/tests/suite/manager_metrics.rs | 4 ++-- codex-rs/otel/tests/suite/send.rs | 10 +++++----- codex-rs/otel/tests/suite/validation.rs | 12 ++++++++++++ 7 files changed, 33 insertions(+), 14 deletions(-) diff --git a/codex-rs/otel/src/config.rs b/codex-rs/otel/src/config.rs index 2e9fff5eca1..d1a1251bd91 100644 --- a/codex-rs/otel/src/config.rs +++ b/codex-rs/otel/src/config.rs @@ -23,7 +23,7 @@ pub(crate) fn resolve_exporter(exporter: &OtelExporter) -> OtelExporter { protocol: OtelHttpProtocol::Json, tls: None, } - }, + } _ => exporter.clone(), } } diff --git a/codex-rs/otel/src/metrics/client.rs 
b/codex-rs/otel/src/metrics/client.rs index 11f1998d824..c607fbebc04 100644 --- a/codex-rs/otel/src/metrics/client.rs +++ b/codex-rs/otel/src/metrics/client.rs @@ -9,10 +9,10 @@ use crate::metrics::validation::validate_tag_key; use crate::metrics::validation::validate_tag_value; use crate::metrics::validation::validate_tags; use opentelemetry::KeyValue; +use opentelemetry::metrics::Counter; use opentelemetry::metrics::Histogram; use opentelemetry::metrics::Meter; use opentelemetry::metrics::MeterProvider as _; -use opentelemetry::metrics::UpDownCounter; use opentelemetry_otlp::OTEL_EXPORTER_OTLP_METRICS_TIMEOUT; use opentelemetry_otlp::Protocol; use opentelemetry_otlp::WithExportConfig; @@ -37,7 +37,7 @@ const METER_NAME: &str = "codex"; struct MetricsClientInner { meter_provider: SdkMeterProvider, meter: Meter, - counters: Mutex>>, + counters: Mutex>>, histograms: Mutex>>, default_tags: BTreeMap, } @@ -45,6 +45,12 @@ struct MetricsClientInner { impl MetricsClientInner { fn counter(&self, name: &str, inc: i64, tags: &[(&str, &str)]) -> Result<()> { validate_metric_name(name)?; + if inc < 0 { + return Err(MetricsError::NegativeCounterIncrement { + name: name.to_string(), + inc, + }); + } let attributes = self.attributes(tags)?; let mut counters = self @@ -53,8 +59,8 @@ impl MetricsClientInner { .unwrap_or_else(std::sync::PoisonError::into_inner); let counter = counters .entry(name.to_string()) - .or_insert_with(|| self.meter.i64_up_down_counter(name.to_string()).build()); - counter.add(inc, &attributes); + .or_insert_with(|| self.meter.u64_counter(name.to_string()).build()); + counter.add(inc as u64, &attributes); Ok(()) } diff --git a/codex-rs/otel/src/metrics/error.rs b/codex-rs/otel/src/metrics/error.rs index d046708a308..dfb9653254a 100644 --- a/codex-rs/otel/src/metrics/error.rs +++ b/codex-rs/otel/src/metrics/error.rs @@ -17,6 +17,9 @@ pub enum MetricsError { #[error("metrics exporter is disabled")] ExporterDisabled, + #[error("counter increment must be 
non-negative for {name}: {inc}")] + NegativeCounterIncrement { name: String, inc: i64 }, + #[error("failed to build OTLP metrics exporter")] ExporterBuild { #[source] diff --git a/codex-rs/otel/src/traces/otel_provider.rs b/codex-rs/otel/src/traces/otel_provider.rs index 5c350be35eb..f060cc24c0d 100644 --- a/codex-rs/otel/src/traces/otel_provider.rs +++ b/codex-rs/otel/src/traces/otel_provider.rs @@ -73,8 +73,6 @@ impl OtelProvider { let log_enabled = !matches!(settings.exporter, OtelExporter::None); let trace_enabled = !matches!(settings.trace_exporter, OtelExporter::None); - - let metric_exporter = crate::config::resolve_exporter(&settings.metrics_exporter); let metrics = if matches!(metric_exporter, OtelExporter::None) { None diff --git a/codex-rs/otel/tests/suite/manager_metrics.rs b/codex-rs/otel/tests/suite/manager_metrics.rs index bf8f5239c9a..b85ba1bbf13 100644 --- a/codex-rs/otel/tests/suite/manager_metrics.rs +++ b/codex-rs/otel/tests/suite/manager_metrics.rs @@ -36,7 +36,7 @@ fn manager_attaches_metadata_tags_to_metrics() -> Result<()> { let metric = find_metric(&resource_metrics, "codex.session_started").expect("counter metric missing"); let attrs = match metric.data() { - AggregatedMetrics::I64(data) => match data { + AggregatedMetrics::U64(data) => match data { MetricData::Sum(sum) => { let points: Vec<_> = sum.data_points().collect(); assert_eq!(points.len(), 1); @@ -86,7 +86,7 @@ fn manager_allows_disabling_metadata_tags() -> Result<()> { let metric = find_metric(&resource_metrics, "codex.session_started").expect("counter metric missing"); let attrs = match metric.data() { - AggregatedMetrics::I64(data) => match data { + AggregatedMetrics::U64(data) => match data { MetricData::Sum(sum) => { let points: Vec<_> = sum.data_points().collect(); assert_eq!(points.len(), 1); diff --git a/codex-rs/otel/tests/suite/send.rs b/codex-rs/otel/tests/suite/send.rs index ce5fcf9384b..4e7e0279274 100644 --- a/codex-rs/otel/tests/suite/send.rs +++ 
b/codex-rs/otel/tests/suite/send.rs @@ -21,7 +21,7 @@ fn send_builds_payload_with_tags_and_histograms() -> Result<()> { let counter = find_metric(&resource_metrics, "codex.turns").expect("counter metric missing"); let counter_attributes = match counter.data() { - opentelemetry_sdk::metrics::data::AggregatedMetrics::I64(data) => match data { + opentelemetry_sdk::metrics::data::AggregatedMetrics::U64(data) => match data { opentelemetry_sdk::metrics::data::MetricData::Sum(sum) => { let points: Vec<_> = sum.data_points().collect(); assert_eq!(points.len(), 1); @@ -94,7 +94,7 @@ fn send_merges_default_tags_per_line() -> Result<()> { let alpha_metric = find_metric(&resource_metrics, "codex.alpha").expect("codex.alpha metric missing"); let alpha_point = match alpha_metric.data() { - opentelemetry_sdk::metrics::data::AggregatedMetrics::I64(data) => match data { + opentelemetry_sdk::metrics::data::AggregatedMetrics::U64(data) => match data { opentelemetry_sdk::metrics::data::MetricData::Sum(sum) => { let points: Vec<_> = sum.data_points().collect(); assert_eq!(points.len(), 1); @@ -117,7 +117,7 @@ fn send_merges_default_tags_per_line() -> Result<()> { let beta_metric = find_metric(&resource_metrics, "codex.beta").expect("codex.beta metric missing"); let beta_point = match beta_metric.data() { - opentelemetry_sdk::metrics::data::AggregatedMetrics::I64(data) => match data { + opentelemetry_sdk::metrics::data::AggregatedMetrics::U64(data) => match data { opentelemetry_sdk::metrics::data::MetricData::Sum(sum) => { let points: Vec<_> = sum.data_points().collect(); assert_eq!(points.len(), 1); @@ -151,7 +151,7 @@ fn client_sends_enqueued_metric() -> Result<()> { let resource_metrics = latest_metrics(&exporter); let counter = find_metric(&resource_metrics, "codex.turns").expect("counter metric missing"); let points = match counter.data() { - opentelemetry_sdk::metrics::data::AggregatedMetrics::I64(data) => match data { + 
opentelemetry_sdk::metrics::data::AggregatedMetrics::U64(data) => match data { opentelemetry_sdk::metrics::data::MetricData::Sum(sum) => { sum.data_points().collect::>() } @@ -179,7 +179,7 @@ fn shutdown_flushes_in_memory_exporter() -> Result<()> { let resource_metrics = latest_metrics(&exporter); let counter = find_metric(&resource_metrics, "codex.turns").expect("counter metric missing"); let points = match counter.data() { - opentelemetry_sdk::metrics::data::AggregatedMetrics::I64(data) => match data { + opentelemetry_sdk::metrics::data::AggregatedMetrics::U64(data) => match data { opentelemetry_sdk::metrics::data::MetricData::Sum(sum) => { sum.data_points().collect::>() } diff --git a/codex-rs/otel/tests/suite/validation.rs b/codex-rs/otel/tests/suite/validation.rs index 9a33cbc0644..f88d9fbcd42 100644 --- a/codex-rs/otel/tests/suite/validation.rs +++ b/codex-rs/otel/tests/suite/validation.rs @@ -73,3 +73,15 @@ fn counter_rejects_invalid_metric_name() -> Result<()> { metrics.shutdown()?; Ok(()) } + +#[test] +fn counter_rejects_negative_increment() -> Result<()> { + let metrics = build_in_memory_client()?; + let err = metrics.counter("codex.turns", -1, &[]).unwrap_err(); + assert!(matches!( + err, + MetricsError::NegativeCounterIncrement { name, inc } if name == "codex.turns" && inc == -1 + )); + metrics.shutdown()?; + Ok(()) +} From 968109b6fc2e50ef8436f5a1a5320b5022f6f7eb Mon Sep 17 00:00:00 2001 From: jif-oai Date: Tue, 23 Dec 2025 17:30:51 +0100 Subject: [PATCH 37/43] Add a test --- codex-rs/otel/tests/suite/mod.rs | 1 + .../otel/tests/suite/otlp_http_loopback.rs | 192 ++++++++++++++++++ 2 files changed, 193 insertions(+) create mode 100644 codex-rs/otel/tests/suite/otlp_http_loopback.rs diff --git a/codex-rs/otel/tests/suite/mod.rs b/codex-rs/otel/tests/suite/mod.rs index 46b8ba57c86..c79c7e37c4d 100644 --- a/codex-rs/otel/tests/suite/mod.rs +++ b/codex-rs/otel/tests/suite/mod.rs @@ -1,4 +1,5 @@ mod manager_metrics; +mod otlp_http_loopback; mod send; mod 
timing; mod validation; diff --git a/codex-rs/otel/tests/suite/otlp_http_loopback.rs b/codex-rs/otel/tests/suite/otlp_http_loopback.rs new file mode 100644 index 00000000000..599021b3f54 --- /dev/null +++ b/codex-rs/otel/tests/suite/otlp_http_loopback.rs @@ -0,0 +1,192 @@ +use codex_otel::config::OtelExporter; +use codex_otel::config::OtelHttpProtocol; +use codex_otel::metrics::MetricsClient; +use codex_otel::metrics::MetricsConfig; +use codex_otel::metrics::Result; +use std::collections::HashMap; +use std::io::Read as _; +use std::io::Write as _; +use std::net::TcpListener; +use std::net::TcpStream; +use std::sync::mpsc; +use std::thread; +use std::time::Duration; +use std::time::Instant; + +struct CapturedRequest { + path: String, + content_type: Option, + body: Vec, +} + +fn read_http_request( + stream: &mut TcpStream, +) -> std::io::Result<(String, HashMap, Vec)> { + stream.set_read_timeout(Some(Duration::from_secs(2)))?; + + let mut buf = Vec::new(); + let mut scratch = [0u8; 8192]; + let header_end = loop { + let n = stream.read(&mut scratch)?; + if n == 0 { + return Err(std::io::Error::new( + std::io::ErrorKind::UnexpectedEof, + "EOF before headers", + )); + } + buf.extend_from_slice(&scratch[..n]); + if let Some(end) = buf.windows(4).position(|w| w == b"\r\n\r\n") { + break end; + } + if buf.len() > 1024 * 1024 { + return Err(std::io::Error::new( + std::io::ErrorKind::InvalidData, + "headers too large", + )); + } + }; + + let headers_bytes = &buf[..header_end]; + let mut body_bytes = buf[header_end + 4..].to_vec(); + + let headers_str = std::str::from_utf8(headers_bytes).map_err(|err| { + std::io::Error::new( + std::io::ErrorKind::InvalidData, + format!("headers not utf-8: {err}"), + ) + })?; + let mut lines = headers_str.split("\r\n"); + let start = lines.next().ok_or_else(|| { + std::io::Error::new(std::io::ErrorKind::InvalidData, "missing request line") + })?; + let mut parts = start.split_whitespace(); + let _method = parts.next().unwrap_or_default(); + 
let path = parts + .next() + .ok_or_else(|| std::io::Error::new(std::io::ErrorKind::InvalidData, "missing path"))? + .to_string(); + + let mut headers = HashMap::new(); + for line in lines { + let Some((k, v)) = line.split_once(':') else { + continue; + }; + headers.insert(k.trim().to_ascii_lowercase(), v.trim().to_string()); + } + + if let Some(len) = headers + .get("content-length") + .and_then(|v| v.parse::().ok()) + { + while body_bytes.len() < len { + let n = stream.read(&mut scratch)?; + if n == 0 { + return Err(std::io::Error::new( + std::io::ErrorKind::UnexpectedEof, + "EOF before body complete", + )); + } + body_bytes.extend_from_slice(&scratch[..n]); + if body_bytes.len() > len + 1024 * 1024 { + return Err(std::io::Error::new( + std::io::ErrorKind::InvalidData, + "body too large", + )); + } + } + body_bytes.truncate(len); + } + + Ok((path, headers, body_bytes)) +} + +fn write_http_response(stream: &mut TcpStream, status: &str) -> std::io::Result<()> { + let response = format!("HTTP/1.1 {status}\r\nContent-Length: 0\r\nConnection: close\r\n\r\n"); + stream.write_all(response.as_bytes())?; + stream.flush() +} + +#[test] +fn otlp_http_exporter_sends_metrics_to_collector() -> Result<()> { + let listener = TcpListener::bind("127.0.0.1:0").expect("bind"); + let addr = listener.local_addr().expect("local_addr"); + listener.set_nonblocking(true).expect("set_nonblocking"); + + let (tx, rx) = mpsc::channel::>(); + let server = thread::spawn(move || { + let mut captured = Vec::new(); + let deadline = Instant::now() + Duration::from_secs(3); + + while Instant::now() < deadline { + match listener.accept() { + Ok((mut stream, _)) => { + let result = read_http_request(&mut stream); + let _ = write_http_response(&mut stream, "202 Accepted"); + if let Ok((path, headers, body)) = result { + captured.push(CapturedRequest { + path, + content_type: headers.get("content-type").cloned(), + body, + }); + } + } + Err(err) if err.kind() == std::io::ErrorKind::WouldBlock => { + 
thread::sleep(Duration::from_millis(10)); + } + Err(_) => break, + } + } + + let _ = tx.send(captured); + }); + + let metrics = MetricsClient::new(MetricsConfig::otlp( + "test", + "codex-cli", + env!("CARGO_PKG_VERSION"), + OtelExporter::OtlpHttp { + endpoint: format!("http://{addr}/v1/metrics"), + headers: HashMap::new(), + protocol: OtelHttpProtocol::Json, + tls: None, + }, + ))?; + + metrics.counter("codex.turns", 1, &[("source", "test")])?; + metrics.shutdown()?; + + server.join().expect("server join"); + let captured = rx.recv_timeout(Duration::from_secs(1)).expect("captured"); + + let request = captured + .iter() + .find(|req| req.path == "/v1/metrics") + .unwrap_or_else(|| { + let paths = captured + .iter() + .map(|req| req.path.as_str()) + .collect::>() + .join(", "); + panic!( + "missing /v1/metrics request; got {}: {paths}", + captured.len() + ); + }); + let content_type = request + .content_type + .as_deref() + .unwrap_or(""); + assert!( + content_type.starts_with("application/json"), + "unexpected content-type: {content_type}" + ); + + let body = String::from_utf8_lossy(&request.body); + assert!( + body.contains("codex.turns"), + "expected metric name not found; body prefix: {}", + &body.chars().take(2000).collect::() + ); + + Ok(()) +} From 67547c0c7a6dd6f3437a3e3f3a3f02d9a1f5992b Mon Sep 17 00:00:00 2001 From: jif-oai Date: Tue, 23 Dec 2025 17:38:40 +0100 Subject: [PATCH 38/43] Fix tests --- codex-rs/core/src/config/types.rs | 2 +- codex-rs/otel/src/config.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/codex-rs/core/src/config/types.rs b/codex-rs/core/src/config/types.rs index b6acdcb33f6..7b6e639dd46 100644 --- a/codex-rs/core/src/config/types.rs +++ b/codex-rs/core/src/config/types.rs @@ -353,7 +353,7 @@ impl Default for OtelConfig { environment: DEFAULT_OTEL_ENVIRONMENT.to_owned(), exporter: OtelExporterKind::None, trace_exporter: OtelExporterKind::None, - metrics_exporter: OtelExporterKind::None, + metrics_exporter: 
OtelExporterKind::Statsig, } } } diff --git a/codex-rs/otel/src/config.rs b/codex-rs/otel/src/config.rs index d1a1251bd91..f8f2d5a1063 100644 --- a/codex-rs/otel/src/config.rs +++ b/codex-rs/otel/src/config.rs @@ -3,7 +3,7 @@ use std::path::PathBuf; use codex_utils_absolute_path::AbsolutePathBuf; -pub(crate) const STATSIG_OTLP_HTTP_ENDPOINT: &str = "https://ab.chatgpt.com/otlp"; +pub(crate) const STATSIG_OTLP_HTTP_ENDPOINT: &str = "https://ab.chatgpt.com/otlp/v1/metrics"; pub(crate) const STATSIG_API_KEY_HEADER: &str = "statsig-api-key"; pub(crate) const STATSIG_API_KEY: &str = "client-MkRuleRQBd6qakfnDYqJVR9JuXcY57Ljly3vi5JVUIO"; From f34623bab32fca604bf86331b3a079c9ead86f63 Mon Sep 17 00:00:00 2001 From: jif-oai Date: Mon, 5 Jan 2026 11:10:45 +0000 Subject: [PATCH 39/43] Fix merge --- codex-rs/otel/src/metrics/client.rs | 6 ++++-- codex-rs/otel/src/otlp.rs | 6 +++--- codex-rs/otel/src/traces/otel_provider.rs | 5 ----- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/codex-rs/otel/src/metrics/client.rs b/codex-rs/otel/src/metrics/client.rs index c607fbebc04..2feec63fba2 100644 --- a/codex-rs/otel/src/metrics/client.rs +++ b/codex-rs/otel/src/metrics/client.rs @@ -18,6 +18,8 @@ use opentelemetry_otlp::Protocol; use opentelemetry_otlp::WithExportConfig; use opentelemetry_otlp::WithHttpConfig; use opentelemetry_otlp::WithTonicConfig; +use opentelemetry_otlp::tonic_types::metadata::MetadataMap; +use opentelemetry_otlp::tonic_types::transport::ClientTlsConfig; use opentelemetry_sdk::Resource; use opentelemetry_sdk::metrics::PeriodicReader; use opentelemetry_sdk::metrics::SdkMeterProvider; @@ -253,7 +255,7 @@ fn build_otlp_metric_exporter( let header_map = crate::otlp::build_header_map(&headers); - let base_tls_config = tonic::transport::ClientTlsConfig::new() + let base_tls_config = ClientTlsConfig::new() .with_enabled_roots() .assume_http2(true); @@ -269,7 +271,7 @@ fn build_otlp_metric_exporter( .with_tonic() .with_endpoint(endpoint) 
.with_temporality(temporality) - .with_metadata(tonic::metadata::MetadataMap::from_headers(header_map)) + .with_metadata(MetadataMap::from_headers(header_map)) .with_tls_config(tls_config) .build() .map_err(|source| MetricsError::ExporterBuild { source }) diff --git a/codex-rs/otel/src/otlp.rs b/codex-rs/otel/src/otlp.rs index 1312fe4b074..c70e5e55e9e 100644 --- a/codex-rs/otel/src/otlp.rs +++ b/codex-rs/otel/src/otlp.rs @@ -3,6 +3,9 @@ use codex_utils_absolute_path::AbsolutePathBuf; use http::Uri; use opentelemetry_otlp::OTEL_EXPORTER_OTLP_TIMEOUT; use opentelemetry_otlp::OTEL_EXPORTER_OTLP_TIMEOUT_DEFAULT; +use opentelemetry_otlp::tonic_types::transport::Certificate as TonicCertificate; +use opentelemetry_otlp::tonic_types::transport::ClientTlsConfig; +use opentelemetry_otlp::tonic_types::transport::Identity as TonicIdentity; use reqwest::Certificate as ReqwestCertificate; use reqwest::Identity as ReqwestIdentity; use reqwest::header::HeaderMap; @@ -15,9 +18,6 @@ use std::io; use std::io::ErrorKind; use std::path::PathBuf; use std::time::Duration; -use tonic::transport::Certificate as TonicCertificate; -use tonic::transport::ClientTlsConfig; -use tonic::transport::Identity as TonicIdentity; pub(crate) fn build_header_map(headers: &std::collections::HashMap) -> HeaderMap { let mut header_map = HeaderMap::new(); diff --git a/codex-rs/otel/src/traces/otel_provider.rs b/codex-rs/otel/src/traces/otel_provider.rs index 11ec3c7df1f..b6a542d4bda 100644 --- a/codex-rs/otel/src/traces/otel_provider.rs +++ b/codex-rs/otel/src/traces/otel_provider.rs @@ -20,9 +20,7 @@ use opentelemetry_otlp::WithExportConfig; use opentelemetry_otlp::WithHttpConfig; use opentelemetry_otlp::WithTonicConfig; use opentelemetry_otlp::tonic_types::metadata::MetadataMap; -use opentelemetry_otlp::tonic_types::transport::Certificate as TonicCertificate; use opentelemetry_otlp::tonic_types::transport::ClientTlsConfig; -use opentelemetry_otlp::tonic_types::transport::Identity as TonicIdentity; use 
opentelemetry_sdk::Resource; use opentelemetry_sdk::logs::SdkLoggerProvider; use opentelemetry_sdk::propagation::TraceContextPropagator; @@ -35,9 +33,6 @@ use std::collections::HashMap; use std::env; use std::error::Error; use std::sync::OnceLock; -use std::time::Duration; -use tonic::metadata::MetadataMap; -use tonic::transport::ClientTlsConfig; use tracing::debug; use tracing::level_filters::LevelFilter; use tracing::warn; From 5f344d2d23ec128b9c2c3faa237fec1031c3e987 Mon Sep 17 00:00:00 2001 From: jif-oai Date: Wed, 7 Jan 2026 16:12:04 +0000 Subject: [PATCH 40/43] Add timer --- codex-rs/core/src/tasks/compact.rs | 14 ++++---- codex-rs/otel/src/lib.rs | 26 +++------------ codex-rs/otel/src/metrics/client.rs | 30 +++-------------- codex-rs/otel/src/metrics/mod.rs | 1 + codex-rs/otel/src/metrics/timer.rs | 42 +++++++++++++++++++++++ codex-rs/otel/tests/suite/timing.rs | 52 ++++------------------------- 6 files changed, 66 insertions(+), 99 deletions(-) create mode 100644 codex-rs/otel/src/metrics/timer.rs diff --git a/codex-rs/core/src/tasks/compact.rs b/codex-rs/core/src/tasks/compact.rs index 1e46e627513..4b5f0d1cfb3 100644 --- a/codex-rs/core/src/tasks/compact.rs +++ b/codex-rs/core/src/tasks/compact.rs @@ -24,19 +24,21 @@ impl SessionTask for CompactTask { input: Vec, _cancellation_token: CancellationToken, ) -> Option { - let _ = session - .session - .services - .otel_manager - .counter("codex.task.compact", 1, &[]); - let session = session.clone_session(); if crate::compact::should_use_remote_compact_task( session.as_ref(), &ctx.client.get_provider(), ) { + let _ = session + .services + .otel_manager + .counter("codex.task.compact.remote", 1, &[]); crate::compact_remote::run_remote_compact_task(session, ctx).await } else { + let _ = session + .services + .otel_manager + .counter("codex.task.compact.local", 1, &[]); crate::compact::run_compact_task(session, ctx, input).await } diff --git a/codex-rs/otel/src/lib.rs b/codex-rs/otel/src/lib.rs index 
cd7d484a3dd..014734faf66 100644 --- a/codex-rs/otel/src/lib.rs +++ b/codex-rs/otel/src/lib.rs @@ -6,7 +6,9 @@ mod otlp; use crate::metrics::MetricsClient; use crate::metrics::MetricsConfig; +use crate::metrics::MetricsError; use crate::metrics::Result as MetricsResult; +use crate::metrics::timer::Timer; use crate::metrics::validation::validate_tag_key; use crate::metrics::validation::validate_tag_value; use crate::traces::otel_provider::OtelProvider; @@ -104,30 +106,12 @@ impl OtelManager { metrics.record_duration(name, duration, &tags) } - pub fn time( - &self, - name: &str, - tags: &[(&str, &str)], - f: impl FnOnce() -> T, - ) -> MetricsResult { - let Some(metrics) = &self.metrics else { - return Ok(f()); - }; - let tags = self.tags_with_metadata(tags)?; - metrics.time(name, &tags, f) - } - - pub fn time_result( - &self, - name: &str, - tags: &[(&str, &str)], - f: impl FnOnce() -> MetricsResult, - ) -> MetricsResult { + pub fn start_timer(&self, name: &str, tags: &[(&str, &str)]) -> Result { let Some(metrics) = &self.metrics else { - return f(); + return Err(MetricsError::ExporterDisabled); }; let tags = self.tags_with_metadata(tags)?; - metrics.time_result(name, &tags, f) + metrics.start_timer(name, &tags) } pub fn shutdown_metrics(&self) -> MetricsResult<()> { diff --git a/codex-rs/otel/src/metrics/client.rs b/codex-rs/otel/src/metrics/client.rs index 2feec63fba2..362199d6989 100644 --- a/codex-rs/otel/src/metrics/client.rs +++ b/codex-rs/otel/src/metrics/client.rs @@ -4,6 +4,7 @@ use crate::metrics::MetricsError; use crate::metrics::Result; use crate::metrics::config::MetricsConfig; use crate::metrics::config::MetricsExporter; +use crate::metrics::timer::Timer; use crate::metrics::validation::validate_metric_name; use crate::metrics::validation::validate_tag_key; use crate::metrics::validation::validate_tag_value; @@ -29,7 +30,6 @@ use std::collections::BTreeMap; use std::collections::HashMap; use std::sync::Mutex; use std::time::Duration; -use 
std::time::Instant; use tracing::debug; const ENV_ATTRIBUTE: &str = "env"; @@ -179,34 +179,12 @@ impl MetricsClient { ) } - /// Measure a closure and emit a histogram sample for the elapsed time. - pub fn time(&self, name: &str, tags: &[(&str, &str)], f: impl FnOnce() -> T) -> Result { - let start = Instant::now(); - let output = f(); - self.record_duration(name, start.elapsed(), tags)?; - Ok(output) - } - - /// Measure a closure that returns a metrics result without nesting results. - pub fn time_result( + pub fn start_timer( &self, name: &str, tags: &[(&str, &str)], - f: impl FnOnce() -> Result, - ) -> Result { - let start = Instant::now(); - let output = f(); - let duration_result = self.record_duration(name, start.elapsed(), tags); - match output { - Ok(value) => { - duration_result?; - Ok(value) - } - Err(err) => { - let _ = duration_result; - Err(err) - } - } + ) -> std::result::Result { + Ok(Timer::new(name, tags, self)) } /// Flush metrics and stop the underlying OTEL meter provider. 
diff --git a/codex-rs/otel/src/metrics/mod.rs b/codex-rs/otel/src/metrics/mod.rs index ccc3f841ada..b13d5f917e3 100644 --- a/codex-rs/otel/src/metrics/mod.rs +++ b/codex-rs/otel/src/metrics/mod.rs @@ -1,6 +1,7 @@ mod client; mod config; mod error; +pub(crate) mod timer; pub(crate) mod validation; pub use crate::metrics::client::MetricsClient; diff --git a/codex-rs/otel/src/metrics/timer.rs b/codex-rs/otel/src/metrics/timer.rs new file mode 100644 index 00000000000..b1624fda163 --- /dev/null +++ b/codex-rs/otel/src/metrics/timer.rs @@ -0,0 +1,42 @@ +use crate::metrics::MetricsClient; +use crate::metrics::error::Result; +use std::time::Instant; + +pub struct Timer { + name: String, + tags: Vec<(String, String)>, + client: MetricsClient, + start_time: Instant, +} + +impl Drop for Timer { + fn drop(&mut self) { + if let Err(e) = self.record() { + tracing::error!("metrics client error: {}", e); + } + } +} + +impl Timer { + pub(crate) fn new(name: &str, tags: &[(&str, &str)], client: &MetricsClient) -> Self { + Self { + name: name.to_string(), + tags: tags + .iter() + .map(|(k, v)| (k.to_string(), v.to_string())) + .collect(), + client: client.clone(), + start_time: Instant::now(), + } + } + + pub fn record(&self) -> Result<()> { + let tags = self + .tags + .iter() + .map(|(k, v)| (k.as_str(), v.as_str())) + .collect::>(); + self.client + .record_duration(&self.name, self.start_time.elapsed(), &tags) + } +} diff --git a/codex-rs/otel/tests/suite/timing.rs b/codex-rs/otel/tests/suite/timing.rs index cd7687ce9ed..ce4f2f982e7 100644 --- a/codex-rs/otel/tests/suite/timing.rs +++ b/codex-rs/otel/tests/suite/timing.rs @@ -2,7 +2,6 @@ use crate::harness::attributes_to_map; use crate::harness::build_metrics_with_defaults; use crate::harness::histogram_data; use crate::harness::latest_metrics; -use codex_otel::metrics::MetricsError; use codex_otel::metrics::Result; use pretty_assertions::assert_eq; use std::time::Duration; @@ -31,61 +30,22 @@ fn 
record_duration_records_histogram() -> Result<()> { // Ensures time_result returns the closure output and records timing. #[test] -fn time_result_records_success() -> Result<()> { +fn timer_result_records_success() -> Result<()> { let (metrics, exporter) = build_metrics_with_defaults(&[])?; - let value = metrics.time_result("codex.request_latency", &[("route", "chat")], || Ok("ok"))?; - assert_eq!(value, "ok"); - metrics.shutdown()?; + { + let timer = metrics.start_timer("codex.request_latency", &[("route", "chat")]); + assert!(timer.is_ok()); + } - let resource_metrics = latest_metrics(&exporter); - let (bounds, bucket_counts, _sum, count) = - histogram_data(&resource_metrics, "codex.request_latency"); - assert!(!bounds.is_empty()); - assert_eq!(count, 1); - assert_eq!(bucket_counts.iter().sum::(), 1); - let attrs = attributes_to_map( - match crate::harness::find_metric(&resource_metrics, "codex.request_latency").and_then( - |metric| match metric.data() { - opentelemetry_sdk::metrics::data::AggregatedMetrics::F64( - opentelemetry_sdk::metrics::data::MetricData::Histogram(histogram), - ) => histogram - .data_points() - .next() - .map(opentelemetry_sdk::metrics::data::HistogramDataPoint::attributes), - _ => None, - }, - ) { - Some(attrs) => attrs, - None => panic!("attributes missing"), - }, - ); - assert_eq!(attrs.get("route").map(String::as_str), Some("chat")); - - Ok(()) -} - -// Ensures time_result propagates errors but still records timing. 
-#[test] -fn time_result_records_on_error() -> Result<()> { - let (metrics, exporter) = build_metrics_with_defaults(&[])?; - - let err = metrics - .time_result( - "codex.request_latency", - &[("route", "chat")], - || -> Result<&'static str> { Err(MetricsError::EmptyMetricName) }, - ) - .unwrap_err(); - assert!(matches!(err, MetricsError::EmptyMetricName)); metrics.shutdown()?; let resource_metrics = latest_metrics(&exporter); let (bounds, bucket_counts, _sum, count) = histogram_data(&resource_metrics, "codex.request_latency"); assert!(!bounds.is_empty()); - assert_eq!(bucket_counts.iter().sum::(), 1); assert_eq!(count, 1); + assert_eq!(bucket_counts.iter().sum::(), 1); let attrs = attributes_to_map( match crate::harness::find_metric(&resource_metrics, "codex.request_latency").and_then( |metric| match metric.data() { From dd1c025796ca0dcf9ef2db34c9cf5f11c4c691c6 Mon Sep 17 00:00:00 2001 From: jif-oai Date: Wed, 7 Jan 2026 16:24:04 +0000 Subject: [PATCH 41/43] Disable metrics on analytics false --- codex-rs/core/src/otel_init.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/codex-rs/core/src/otel_init.rs b/codex-rs/core/src/otel_init.rs index a57496ecf5d..f9bf75e8acb 100644 --- a/codex-rs/core/src/otel_init.rs +++ b/codex-rs/core/src/otel_init.rs @@ -64,7 +64,11 @@ pub fn build_provider( let exporter = to_otel_exporter(&config.otel.exporter); let trace_exporter = to_otel_exporter(&config.otel.trace_exporter); - let metrics_exporter = to_otel_exporter(&config.otel.metrics_exporter); + let metrics_exporter = if config.analytics { + to_otel_exporter(&config.otel.metrics_exporter) + } else { + OtelExporter::None + }; OtelProvider::from(&OtelSettings { service_name: originator().value.to_owned(), From bc7fa5d685e0d347a2264974fa67dfeea9f78b74 Mon Sep 17 00:00:00 2001 From: jif-oai Date: Wed, 7 Jan 2026 17:04:31 +0000 Subject: [PATCH 42/43] Drop error at client level and just log --- codex-rs/otel/src/lib.rs | 62 ++++++++++++-------- 
codex-rs/otel/tests/suite/manager_metrics.rs | 4 +- 2 files changed, 41 insertions(+), 25 deletions(-) diff --git a/codex-rs/otel/src/lib.rs b/codex-rs/otel/src/lib.rs index 014734faf66..4fe2aed9115 100644 --- a/codex-rs/otel/src/lib.rs +++ b/codex-rs/otel/src/lib.rs @@ -77,33 +77,49 @@ impl OtelManager { } } - pub fn counter(&self, name: &str, inc: i64, tags: &[(&str, &str)]) -> MetricsResult<()> { - let Some(metrics) = &self.metrics else { - return Ok(()); - }; - let tags = self.tags_with_metadata(tags)?; - metrics.counter(name, inc, &tags) + pub fn counter(&self, name: &str, inc: i64, tags: &[(&str, &str)]) { + let res: MetricsResult<()> = (|| { + let Some(metrics) = &self.metrics else { + return Ok(()); + }; + + let tags = self.tags_with_metadata(tags)?; + metrics.counter(name, inc, &tags) + })(); + + if let Err(e) = res { + tracing::warn!("metrics counter failed: {e}"); + } } - pub fn histogram(&self, name: &str, value: i64, tags: &[(&str, &str)]) -> MetricsResult<()> { - let Some(metrics) = &self.metrics else { - return Ok(()); - }; - let tags = self.tags_with_metadata(tags)?; - metrics.histogram(name, value, &tags) + pub fn histogram(&self, name: &str, value: i64, tags: &[(&str, &str)]) { + let res: MetricsResult<()> = (|| { + let Some(metrics) = &self.metrics else { + return Ok(()); + }; + + let tags = self.tags_with_metadata(tags)?; + metrics.histogram(name, value, &tags) + })(); + + if let Err(e) = res { + tracing::warn!("metrics histogram failed: {e}"); + } } - pub fn record_duration( - &self, - name: &str, - duration: Duration, - tags: &[(&str, &str)], - ) -> MetricsResult<()> { - let Some(metrics) = &self.metrics else { - return Ok(()); - }; - let tags = self.tags_with_metadata(tags)?; - metrics.record_duration(name, duration, &tags) + pub fn record_duration(&self, name: &str, duration: Duration, tags: &[(&str, &str)]) { + let res: MetricsResult<()> = (|| { + let Some(metrics) = &self.metrics else { + return Ok(()); + }; + + let tags = 
self.tags_with_metadata(tags)?; + metrics.record_duration(name, duration, &tags) + })(); + + if let Err(e) = res { + tracing::warn!("metrics duration failed: {e}"); + } } pub fn start_timer(&self, name: &str, tags: &[(&str, &str)]) -> Result { diff --git a/codex-rs/otel/tests/suite/manager_metrics.rs b/codex-rs/otel/tests/suite/manager_metrics.rs index b85ba1bbf13..a3dfd958104 100644 --- a/codex-rs/otel/tests/suite/manager_metrics.rs +++ b/codex-rs/otel/tests/suite/manager_metrics.rs @@ -29,7 +29,7 @@ fn manager_attaches_metadata_tags_to_metrics() -> Result<()> { ) .with_metrics(metrics); - manager.counter("codex.session_started", 1, &[("source", "tui")])?; + manager.counter("codex.session_started", 1, &[("source", "tui")]); manager.shutdown_metrics()?; let resource_metrics = latest_metrics(&exporter); @@ -79,7 +79,7 @@ fn manager_allows_disabling_metadata_tags() -> Result<()> { ) .with_metrics_without_metadata_tags(metrics); - manager.counter("codex.session_started", 1, &[("source", "tui")])?; + manager.counter("codex.session_started", 1, &[("source", "tui")]); manager.shutdown_metrics()?; let resource_metrics = latest_metrics(&exporter); From 28eec16fe8e0865c32d80752b2676329a5dfa8a9 Mon Sep 17 00:00:00 2001 From: jif-oai Date: Wed, 7 Jan 2026 17:27:11 +0000 Subject: [PATCH 43/43] fix merge --- codex-rs/core/src/features.rs | 8 +++----- codex-rs/otel/src/lib.rs | 10 +++++----- codex-rs/otel/tests/suite/manager_metrics.rs | 6 +++--- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/codex-rs/core/src/features.rs b/codex-rs/core/src/features.rs index e24aa5f2c4c..ffc766f0f83 100644 --- a/codex-rs/core/src/features.rs +++ b/codex-rs/core/src/features.rs @@ -202,17 +202,15 @@ impl Features { pub fn emit_metrics(&self, otel: &OtelManager) { for feature in FEATURES { - if self.enabled(feature.id) != feature.default_enabled - && let Err(e) = otel.counter( + if self.enabled(feature.id) != feature.default_enabled { + otel.counter( "codex.feature.state", 1, &[ 
("feature", feature.key), ("value", &self.enabled(feature.id).to_string()), ], - ) - { - tracing::warn!("Error while emitting feature metrics {e:?}"); + ); } } } diff --git a/codex-rs/otel/src/lib.rs b/codex-rs/otel/src/lib.rs index 4fe2aed9115..25607623204 100644 --- a/codex-rs/otel/src/lib.rs +++ b/codex-rs/otel/src/lib.rs @@ -12,7 +12,7 @@ use crate::metrics::timer::Timer; use crate::metrics::validation::validate_tag_key; use crate::metrics::validation::validate_tag_value; use crate::traces::otel_provider::OtelProvider; -use codex_protocol::ConversationId; +use codex_protocol::ThreadId; use serde::Serialize; use std::time::Duration; use strum_macros::Display; @@ -27,7 +27,7 @@ pub enum ToolDecisionSource { #[derive(Debug, Clone)] pub struct OtelEventMetadata { - pub(crate) conversation_id: ConversationId, + pub(crate) conversation_id: ThreadId, pub(crate) auth_mode: Option, pub(crate) account_id: Option, pub(crate) account_email: Option, @@ -88,7 +88,7 @@ impl OtelManager { })(); if let Err(e) = res { - tracing::warn!("metrics counter failed: {e}"); + tracing::warn!("metrics counter [{name}] failed: {e}"); } } @@ -103,7 +103,7 @@ impl OtelManager { })(); if let Err(e) = res { - tracing::warn!("metrics histogram failed: {e}"); + tracing::warn!("metrics histogram [{name}] failed: {e}"); } } @@ -118,7 +118,7 @@ impl OtelManager { })(); if let Err(e) = res { - tracing::warn!("metrics duration failed: {e}"); + tracing::warn!("metrics duration [{name}] failed: {e}"); } } diff --git a/codex-rs/otel/tests/suite/manager_metrics.rs b/codex-rs/otel/tests/suite/manager_metrics.rs index a3dfd958104..1497a5f84c7 100644 --- a/codex-rs/otel/tests/suite/manager_metrics.rs +++ b/codex-rs/otel/tests/suite/manager_metrics.rs @@ -5,7 +5,7 @@ use crate::harness::latest_metrics; use codex_app_server_protocol::AuthMode; use codex_otel::OtelManager; use codex_otel::metrics::Result; -use codex_protocol::ConversationId; +use codex_protocol::ThreadId; use 
codex_protocol::protocol::SessionSource; use opentelemetry_sdk::metrics::data::AggregatedMetrics; use opentelemetry_sdk::metrics::data::MetricData; @@ -17,7 +17,7 @@ use std::collections::BTreeMap; fn manager_attaches_metadata_tags_to_metrics() -> Result<()> { let (metrics, exporter) = build_metrics_with_defaults(&[("service", "codex-cli")])?; let manager = OtelManager::new( - ConversationId::new(), + ThreadId::new(), "gpt-5.1", "gpt-5.1", Some("account-id".to_string()), @@ -67,7 +67,7 @@ fn manager_attaches_metadata_tags_to_metrics() -> Result<()> { fn manager_allows_disabling_metadata_tags() -> Result<()> { let (metrics, exporter) = build_metrics_with_defaults(&[])?; let manager = OtelManager::new( - ConversationId::new(), + ThreadId::new(), "gpt-4o", "gpt-4o", Some("account-id".to_string()),