From c3736cff0a1fbaeecceb3fc0b97324b935c44a59 Mon Sep 17 00:00:00 2001 From: Owen Lin Date: Thu, 5 Mar 2026 16:30:53 -0800 Subject: [PATCH] feat(otel): safe tracing (#13626) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Motivation Today config.toml has three different OTEL knobs under `[otel]`: - `exporter` controls where OTEL logs go - `trace_exporter` controls where OTEL traces go - `metrics_exporter` controls where metrics go Those often (pretty much always?) serve different purposes. For example, for OpenAI internal usage, the **log exporter** is already being used for IT/security telemetry, and that use case is intentionally content-rich: tool calls, arguments, outputs, MCP payloads, and in some cases user content are all useful there. `log_user_prompt` is a good example of that distinction. When it’s enabled, we include raw prompt text in OTEL logs, which is acceptable for the security use case. The **trace exporter** is a different story. The goal there is to give OpenAI engineers visibility into latency and request behavior when they run Codex locally, without sending sensitive prompt or tool data as trace event data. In other words, traces should help answer “what was slow?” or “where did time go?”, not “what did the user say?” or “what did the tool return?” The complication is that Rust’s `tracing` crate does not make a hard distinction between “logs” and “trace events.” It gives us one instrumentation API for logs and trace events (via `tracing::event!`), and subscribers decide what gets treated as logs, trace events, or both. Before this change, our OTEL trace layer was effectively attached to the general tracing stream, which meant turning on `trace_exporter` could pick up content-rich events that were originally written with logging (and the `log_exporter`) in mind. That made it too easy for sensitive data to end up in exported traces by accident. ### Concrete example In `otel_manager.rs`, this `tracing::event!` call would be exported in both logs AND traces (as a trace event). ``` pub fn user_prompt(&self, items: &[UserInput]) { let prompt = items .iter() .flat_map(|item| match item { UserInput::Text { text, .. } => Some(text.as_str()), _ => None, }) .collect::(); let prompt_to_log = if self.metadata.log_user_prompts { prompt.as_str() } else { "[REDACTED]" }; tracing::event!( tracing::Level::INFO, event.name = "codex.user_prompt", event.timestamp = %timestamp(), // ... prompt = %prompt_to_log, ); } ``` Instead of `tracing::event!`, we should now be using `log_event!` and `trace_event!` instead to more clearly indicate which sink (logs vs. traces) that event should be exported to. ### What changed This PR makes the log and trace export distinct instead of treating them as two sinks for the same data. On the provider side, OTEL logs and traces now have separate routing/filtering policy. The log exporter keeps receiving the existing `codex_otel` events, while trace export is limited to spans and trace events. On the event side, `OtelManager` now emits two flavors of telemetry where needed: - a log-only event with the current rich payloads - a tracing-safe event with summaries only It also has a convenience `log_and_trace_event!` macro for emitting to both logs and traces when it's safe to do so, as well as log- and trace-specific fields. That means prompts, tool args, tool output, account email, MCP metadata, and similar content stay in the log lane, while traces get the pieces that are actually useful for performance work: durations, counts, sizes, status, token counts, tool origin, and normalized error classes. This preserves current IT/security logging behavior while making it safe to turn on trace export for employees. ### Full list of things removed from trace export - raw user prompt text from `codex.user_prompt` - raw tool arguments and output from `codex.tool_result` - MCP server metadata from `codex.tool_result` (mcp_server, mcp_server_origin) - account identity fields like `user.email` and `user.account_id` from trace-safe OTEL events - `host.name` from trace resources - generic `codex.tool_decision` events from traces - generic `codex.sse_event` events from traces - the full ToolCall debug payload from the `handle_tool_call` span What traces now keep instead is mostly: - spans - trace-safe OTEL events - counts, lengths, durations, status, token counts, and tool origin summaries --- codex-rs/core/src/tools/parallel.rs | 2 +- codex-rs/otel/src/lib.rs | 4 + codex-rs/otel/src/otel_provider.rs | 90 +++- codex-rs/otel/src/traces/otel_manager.rs | 413 ++++++++++-------- codex-rs/otel/tests/suite/mod.rs | 1 + .../tests/suite/otel_export_routing_policy.rs | 299 +++++++++++++ 6 files changed, 603 insertions(+), 206 deletions(-) create mode 100644 codex-rs/otel/tests/suite/otel_export_routing_policy.rs diff --git a/codex-rs/core/src/tools/parallel.rs b/codex-rs/core/src/tools/parallel.rs index 5f2fc89e5..a37c93db9 100644 --- a/codex-rs/core/src/tools/parallel.rs +++ b/codex-rs/core/src/tools/parallel.rs @@ -46,7 +46,7 @@ impl ToolCallRuntime { } } - #[instrument(level = "trace", skip_all, fields(call = ?call))] + #[instrument(level = "trace", skip_all)] pub(crate) fn handle_tool_call( self, call: ToolCall, diff --git a/codex-rs/otel/src/lib.rs b/codex-rs/otel/src/lib.rs index daae8ce03..4f36966b9 100644 --- a/codex-rs/otel/src/lib.rs +++ b/codex-rs/otel/src/lib.rs @@ -31,6 +31,10 @@ pub use crate::trace_context::current_span_w3c_trace_context; pub use crate::trace_context::set_parent_from_context; pub use crate::trace_context::set_parent_from_w3c_trace_context; +pub(crate) const OTEL_TARGET_PREFIX: &str = "codex_otel"; +pub(crate) const OTEL_LOG_ONLY_TARGET: &str = "codex_otel.log_only"; +pub(crate) const OTEL_TRACE_SAFE_TARGET: &str = "codex_otel.trace_safe"; + #[derive(Debug, Clone, Serialize, Display)] #[serde(rename_all = "snake_case")] pub enum ToolDecisionSource { diff --git a/codex-rs/otel/src/otel_provider.rs b/codex-rs/otel/src/otel_provider.rs index dcfb313b6..8145813d8 100644 --- a/codex-rs/otel/src/otel_provider.rs +++ b/codex-rs/otel/src/otel_provider.rs @@ -1,3 +1,5 @@ +use crate::OTEL_TARGET_PREFIX; +use crate::OTEL_TRACE_SAFE_TARGET; use crate::config::OtelExporter; use crate::config::OtelHttpProtocol; use crate::config::OtelSettings; @@ -31,7 +33,6 @@ use std::env; use std::error::Error; use std::sync::OnceLock; use tracing::debug; -use tracing::level_filters::LevelFilter; use tracing::warn; use tracing_subscriber::Layer; use tracing_subscriber::registry::LookupSpan; @@ -41,6 +42,13 @@ const HOST_NAME_ATTRIBUTE: &str = "host.name"; const TRACEPARENT_ENV_VAR: &str = "TRACEPARENT"; const TRACESTATE_ENV_VAR: &str = "TRACESTATE"; static TRACEPARENT_CONTEXT: OnceLock> = OnceLock::new(); + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum ResourceKind { + Logs, + Traces, +} + pub struct OtelProvider { pub logger: Option, pub tracer_provider: Option, @@ -90,13 +98,14 @@ impl OtelProvider { return Ok(None); } - let resource = make_resource(settings); + let log_resource = make_resource(settings, ResourceKind::Logs); + let trace_resource = make_resource(settings, ResourceKind::Traces); let logger = log_enabled - .then(|| build_logger(&resource, &settings.exporter)) + .then(|| build_logger(&log_resource, &settings.exporter)) .transpose()?; let tracer_provider = trace_enabled - .then(|| build_tracer_provider(&resource, &settings.trace_exporter)) + .then(|| build_tracer_provider(&trace_resource, &settings.trace_exporter)) .transpose()?; let tracer = tracer_provider @@ -121,7 +130,7 @@ impl OtelProvider { { self.logger.as_ref().map(|logger| { OpenTelemetryTracingBridge::new(logger).with_filter( - tracing_subscriber::filter::filter_fn(OtelProvider::codex_export_filter), + tracing_subscriber::filter::filter_fn(OtelProvider::log_export_filter), ) }) } @@ -133,12 +142,22 @@ impl OtelProvider { self.tracer.as_ref().map(|tracer| { tracing_opentelemetry::layer() .with_tracer(tracer.clone()) - .with_filter(LevelFilter::TRACE) + .with_filter(tracing_subscriber::filter::filter_fn( + OtelProvider::trace_export_filter, + )) }) } pub fn codex_export_filter(meta: &tracing::Metadata<'_>) -> bool { - meta.target().starts_with("codex_otel") + Self::log_export_filter(meta) + } + + pub fn log_export_filter(meta: &tracing::Metadata<'_>) -> bool { + is_log_export_target(meta.target()) + } + + pub fn trace_export_filter(meta: &tracing::Metadata<'_>) -> bool { + meta.is_span() || is_trace_safe_target(meta.target()) } pub fn metrics(&self) -> Option<&MetricsClient> { @@ -182,17 +201,22 @@ fn load_traceparent_context() -> Option { } } -fn make_resource(settings: &OtelSettings) -> Resource { +fn make_resource(settings: &OtelSettings, kind: ResourceKind) -> Resource { Resource::builder() .with_service_name(settings.service_name.clone()) .with_attributes(resource_attributes( settings, detected_host_name().as_deref(), + kind, )) .build() } -fn resource_attributes(settings: &OtelSettings, host_name: Option<&str>) -> Vec { +fn resource_attributes( + settings: &OtelSettings, + host_name: Option<&str>, + kind: ResourceKind, +) -> Vec { let mut attributes = vec![ KeyValue::new( semconv::attribute::SERVICE_VERSION, @@ -200,7 +224,9 @@ fn resource_attributes(settings: &OtelSettings, host_name: Option<&str>) -> Vec< ), KeyValue::new(ENV_ATTRIBUTE, settings.environment.clone()), ]; - if let Some(host_name) = host_name.and_then(normalize_host_name) { + if kind == ResourceKind::Logs + && let Some(host_name) = host_name.and_then(normalize_host_name) + { attributes.push(KeyValue::new(HOST_NAME_ATTRIBUTE, host_name)); } attributes @@ -216,6 +242,14 @@ fn normalize_host_name(host_name: &str) -> Option { (!host_name.is_empty()).then(|| host_name.to_owned()) } +fn is_log_export_target(target: &str) -> bool { + target.starts_with(OTEL_TARGET_PREFIX) && !is_trace_safe_target(target) +} + +fn is_trace_safe_target(target: &str) -> bool { + target.starts_with(OTEL_TRACE_SAFE_TARGET) +} + fn build_logger( resource: &Resource, exporter: &OtelExporter, @@ -387,7 +421,11 @@ mod tests { #[test] fn resource_attributes_include_host_name_when_present() { - let attrs = resource_attributes(&test_otel_settings(), Some("opentelemetry-test")); + let attrs = resource_attributes( + &test_otel_settings(), + Some("opentelemetry-test"), + ResourceKind::Logs, + ); let host_name = attrs .iter() @@ -399,8 +437,13 @@ mod tests { #[test] fn resource_attributes_omit_host_name_when_missing_or_empty() { - let missing = resource_attributes(&test_otel_settings(), None); - let empty = resource_attributes(&test_otel_settings(), Some(" ")); + let missing = resource_attributes(&test_otel_settings(), None, ResourceKind::Logs); + let empty = resource_attributes(&test_otel_settings(), Some(" "), ResourceKind::Logs); + let trace_attrs = resource_attributes( + &test_otel_settings(), + Some("opentelemetry-test"), + ResourceKind::Traces, + ); assert!( !missing @@ -412,6 +455,27 @@ mod tests { .iter() .any(|kv| kv.key.as_str() == HOST_NAME_ATTRIBUTE) ); + assert!( + !trace_attrs + .iter() + .any(|kv| kv.key.as_str() == HOST_NAME_ATTRIBUTE) + ); + } + + #[test] + fn log_export_target_excludes_trace_safe_events() { + assert!(is_log_export_target("codex_otel.log_only")); + assert!(is_log_export_target("codex_otel.network_proxy")); + assert!(!is_log_export_target("codex_otel.trace_safe")); + assert!(!is_log_export_target("codex_otel.trace_safe.debug")); + } + + #[test] + fn trace_export_target_only_includes_trace_safe_prefix() { + assert!(is_trace_safe_target("codex_otel.trace_safe")); + assert!(is_trace_safe_target("codex_otel.trace_safe.summary")); + assert!(!is_trace_safe_target("codex_otel.log_only")); + assert!(!is_trace_safe_target("codex_otel.network_proxy")); } fn test_otel_settings() -> OtelSettings { diff --git a/codex-rs/otel/src/traces/otel_manager.rs b/codex-rs/otel/src/traces/otel_manager.rs index aa11fce3b..ab0cc6ff0 100644 --- a/codex-rs/otel/src/traces/otel_manager.rs +++ b/codex-rs/otel/src/traces/otel_manager.rs @@ -1,3 +1,5 @@ +use crate::OTEL_LOG_ONLY_TARGET; +use crate::OTEL_TRACE_SAFE_TARGET; use crate::TelemetryAuthMode; use crate::metrics::names::API_CALL_COUNT_METRIC; use crate::metrics::names::API_CALL_DURATION_METRIC; @@ -56,6 +58,84 @@ const RESPONSES_API_ENGINE_SERVICE_TTFT_FIELD: &str = "engine_service_ttft_total const RESPONSES_API_ENGINE_IAPI_TBT_FIELD: &str = "engine_iapi_tbt_across_engine_calls_ms"; const RESPONSES_API_ENGINE_SERVICE_TBT_FIELD: &str = "engine_service_tbt_across_engine_calls_ms"; +macro_rules! log_event { + ($self:expr, $($fields:tt)*) => {{ + tracing::event!( + target: OTEL_LOG_ONLY_TARGET, + tracing::Level::INFO, + $($fields)* + event.timestamp = %timestamp(), + conversation.id = %$self.metadata.conversation_id, + app.version = %$self.metadata.app_version, + auth_mode = $self.metadata.auth_mode, + originator = %$self.metadata.originator, + user.account_id = $self.metadata.account_id, + user.email = $self.metadata.account_email, + terminal.type = %$self.metadata.terminal_type, + model = %$self.metadata.model, + slug = %$self.metadata.slug, + ); + }}; +} + +macro_rules! trace_event { + ($self:expr, $($fields:tt)*) => {{ + tracing::event!( + target: OTEL_TRACE_SAFE_TARGET, + tracing::Level::INFO, + $($fields)* + event.timestamp = %timestamp(), + conversation.id = %$self.metadata.conversation_id, + app.version = %$self.metadata.app_version, + auth_mode = $self.metadata.auth_mode, + originator = %$self.metadata.originator, + terminal.type = %$self.metadata.terminal_type, + model = %$self.metadata.model, + slug = %$self.metadata.slug, + ); + }}; +} + +macro_rules! log_and_trace_event { + ( + $self:expr, + common: { $($common:tt)* }, + log: { $($log:tt)* }, + trace: { $($trace:tt)* }, + ) => {{ + tracing::event!( + target: OTEL_LOG_ONLY_TARGET, + tracing::Level::INFO, + $($common)* + $($log)* + event.timestamp = %timestamp(), + conversation.id = %$self.metadata.conversation_id, + app.version = %$self.metadata.app_version, + auth_mode = $self.metadata.auth_mode, + originator = %$self.metadata.originator, + user.account_id = $self.metadata.account_id, + user.email = $self.metadata.account_email, + terminal.type = %$self.metadata.terminal_type, + model = %$self.metadata.model, + slug = %$self.metadata.slug, + ); + tracing::event!( + target: OTEL_TRACE_SAFE_TARGET, + tracing::Level::INFO, + $($common)* + $($trace)* + event.timestamp = %timestamp(), + conversation.id = %$self.metadata.conversation_id, + app.version = %$self.metadata.app_version, + auth_mode = $self.metadata.auth_mode, + originator = %$self.metadata.originator, + terminal.type = %$self.metadata.terminal_type, + model = %$self.metadata.model, + slug = %$self.metadata.slug, + ); + }}; +} + impl OtelManager { #[allow(clippy::too_many_arguments)] pub fn new( @@ -123,29 +203,27 @@ impl OtelManager { mcp_servers: Vec<&str>, active_profile: Option, ) { - tracing::event!( - tracing::Level::INFO, - event.name = "codex.conversation_starts", - event.timestamp = %timestamp(), - conversation.id = %self.metadata.conversation_id, - app.version = %self.metadata.app_version, - auth_mode = self.metadata.auth_mode, - originator = %self.metadata.originator, - user.account_id = self.metadata.account_id, - user.email = self.metadata.account_email, - terminal.type = %self.metadata.terminal_type, - model = %self.metadata.model, - slug = %self.metadata.slug, - provider_name = %provider_name, - reasoning_effort = reasoning_effort.map(|e| e.to_string()), - reasoning_summary = %reasoning_summary, - context_window = context_window, - auto_compact_token_limit = auto_compact_token_limit, - approval_policy = %approval_policy, - sandbox_policy = %sandbox_policy, - mcp_servers = mcp_servers.join(", "), - active_profile = active_profile, - ) + log_and_trace_event!( + self, + common: { + event.name = "codex.conversation_starts", + provider_name = %provider_name, + reasoning_effort = reasoning_effort.map(|e| e.to_string()), + reasoning_summary = %reasoning_summary, + context_window = context_window, + auto_compact_token_limit = auto_compact_token_limit, + approval_policy = %approval_policy, + sandbox_policy = %sandbox_policy, + }, + log: { + mcp_servers = mcp_servers.join(", "), + active_profile = active_profile, + }, + trace: { + mcp_server_count = mcp_servers.len() as i64, + active_profile_present = active_profile.is_some(), + }, + ); } pub async fn log_request(&self, attempt: u64, f: F) -> Result @@ -188,23 +266,17 @@ impl OtelManager { duration, &[("status", status_str.as_str()), ("success", success_str)], ); - tracing::event!( - tracing::Level::INFO, - event.name = "codex.api_request", - event.timestamp = %timestamp(), - conversation.id = %self.metadata.conversation_id, - app.version = %self.metadata.app_version, - auth_mode = self.metadata.auth_mode, - originator = %self.metadata.originator, - user.account_id = self.metadata.account_id, - user.email = self.metadata.account_email, - terminal.type = %self.metadata.terminal_type, - model = %self.metadata.model, - slug = %self.metadata.slug, - duration_ms = %duration.as_millis(), - http.response.status_code = status, - error.message = error, - attempt = attempt, + log_and_trace_event!( + self, + common: { + event.name = "codex.api_request", + duration_ms = %duration.as_millis(), + http.response.status_code = status, + error.message = error, + attempt = attempt, + }, + log: {}, + trace: {}, ); } @@ -220,22 +292,16 @@ impl OtelManager { duration, &[("success", success_str)], ); - tracing::event!( - tracing::Level::INFO, - event.name = "codex.websocket_request", - event.timestamp = %timestamp(), - conversation.id = %self.metadata.conversation_id, - app.version = %self.metadata.app_version, - auth_mode = self.metadata.auth_mode, - originator = %self.metadata.originator, - user.account_id = self.metadata.account_id, - user.email = self.metadata.account_email, - terminal.type = %self.metadata.terminal_type, - model = %self.metadata.model, - slug = %self.metadata.slug, - duration_ms = %duration.as_millis(), - success = success_str, - error.message = error, + log_and_trace_event!( + self, + common: { + event.name = "codex.websocket_request", + duration_ms = %duration.as_millis(), + success = success_str, + error.message = error, + }, + log: {}, + trace: {}, ); } @@ -321,23 +387,17 @@ impl OtelManager { let tags = [("kind", kind_str), ("success", success_str)]; self.counter(WEBSOCKET_EVENT_COUNT_METRIC, 1, &tags); self.record_duration(WEBSOCKET_EVENT_DURATION_METRIC, duration, &tags); - tracing::event!( - tracing::Level::INFO, - event.name = "codex.websocket_event", - event.timestamp = %timestamp(), - event.kind = %kind_str, - conversation.id = %self.metadata.conversation_id, - app.version = %self.metadata.app_version, - auth_mode = self.metadata.auth_mode, - originator = %self.metadata.originator, - user.account_id = self.metadata.account_id, - user.email = self.metadata.account_email, - terminal.type = %self.metadata.terminal_type, - model = %self.metadata.model, - slug = %self.metadata.slug, - duration_ms = %duration.as_millis(), - success = success_str, - error.message = error_message.as_deref(), + log_and_trace_event!( + self, + common: { + event.name = "codex.websocket_event", + event.kind = %kind_str, + duration_ms = %duration.as_millis(), + success = success_str, + error.message = error_message.as_deref(), + }, + log: {}, + trace: {}, ); } @@ -399,20 +459,10 @@ impl OtelManager { duration, &[("kind", kind), ("success", "true")], ); - tracing::event!( - tracing::Level::INFO, + log_event!( + self, event.name = "codex.sse_event", - event.timestamp = %timestamp(), event.kind = %kind, - conversation.id = %self.metadata.conversation_id, - app.version = %self.metadata.app_version, - auth_mode = self.metadata.auth_mode, - originator = %self.metadata.originator, - user.account_id = self.metadata.account_id, - user.email = self.metadata.account_email, - terminal.type = %self.metadata.terminal_type, - model = %self.metadata.model, - slug = %self.metadata.slug, duration_ms = %duration.as_millis(), ); } @@ -433,62 +483,43 @@ impl OtelManager { &[("kind", kind_str), ("success", "false")], ); match kind { - Some(kind) => tracing::event!( - tracing::Level::INFO, + Some(kind) => log_event!( + self, event.name = "codex.sse_event", - event.timestamp = %timestamp(), event.kind = %kind, - conversation.id = %self.metadata.conversation_id, - app.version = %self.metadata.app_version, - auth_mode = self.metadata.auth_mode, - originator = %self.metadata.originator, - user.account_id = self.metadata.account_id, - user.email = self.metadata.account_email, - terminal.type = %self.metadata.terminal_type, - model = %self.metadata.model, - slug = %self.metadata.slug, duration_ms = %duration.as_millis(), error.message = %error, ), - None => tracing::event!( - tracing::Level::INFO, + None => log_event!( + self, event.name = "codex.sse_event", - event.timestamp = %timestamp(), - conversation.id = %self.metadata.conversation_id, - app.version = %self.metadata.app_version, - auth_mode = self.metadata.auth_mode, - originator = %self.metadata.originator, - user.account_id = self.metadata.account_id, - user.email = self.metadata.account_email, - terminal.type = %self.metadata.terminal_type, - model = %self.metadata.model, - slug = %self.metadata.slug, duration_ms = %duration.as_millis(), error.message = %error, ), } + trace_event!( + self, + event.name = "codex.sse_event", + event.kind = %kind_str, + duration_ms = %duration.as_millis(), + error.message = %error, + ); } pub fn see_event_completed_failed(&self, error: &T) where T: Display, { - tracing::event!( - tracing::Level::INFO, - event.name = "codex.sse_event", - event.kind = %"response.completed", - event.timestamp = %timestamp(), - conversation.id = %self.metadata.conversation_id, - app.version = %self.metadata.app_version, - auth_mode = self.metadata.auth_mode, - originator = %self.metadata.originator, - user.account_id = self.metadata.account_id, - user.email = self.metadata.account_email, - terminal.type = %self.metadata.terminal_type, - model = %self.metadata.model, - slug = %self.metadata.slug, - error.message = %error, - ) + log_and_trace_event!( + self, + common: { + event.name = "codex.sse_event", + event.kind = %"response.completed", + error.message = %error, + }, + log: {}, + trace: {}, + ); } pub fn sse_event_completed( @@ -499,25 +530,19 @@ impl OtelManager { reasoning_token_count: Option, tool_token_count: i64, ) { - tracing::event!( - tracing::Level::INFO, - event.name = "codex.sse_event", - event.timestamp = %timestamp(), - event.kind = %"response.completed", - conversation.id = %self.metadata.conversation_id, - app.version = %self.metadata.app_version, - auth_mode = self.metadata.auth_mode, - originator = %self.metadata.originator, - user.account_id = self.metadata.account_id, - user.email = self.metadata.account_email, - terminal.type = %self.metadata.terminal_type, - model = %self.metadata.model, - slug = %self.metadata.slug, - input_token_count = %input_token_count, - output_token_count = %output_token_count, - cached_token_count = cached_token_count, - reasoning_token_count = reasoning_token_count, - tool_token_count = %tool_token_count, + log_and_trace_event!( + self, + common: { + event.name = "codex.sse_event", + event.kind = %"response.completed", + input_token_count = %input_token_count, + output_token_count = %output_token_count, + cached_token_count = cached_token_count, + reasoning_token_count = reasoning_token_count, + tool_token_count = %tool_token_count, + }, + log: {}, + trace: {}, ); } @@ -529,6 +554,18 @@ impl OtelManager { _ => None, }) .collect::(); + let text_input_count = items + .iter() + .filter(|item| matches!(item, UserInput::Text { .. })) + .count(); + let image_input_count = items + .iter() + .filter(|item| matches!(item, UserInput::Image { .. })) + .count(); + let local_image_input_count = items + .iter() + .filter(|item| matches!(item, UserInput::LocalImage { .. })) + .count(); let prompt_to_log = if self.metadata.log_user_prompts { prompt.as_str() @@ -536,22 +573,20 @@ impl OtelManager { "[REDACTED]" }; - tracing::event!( - tracing::Level::INFO, + log_event!( + self, event.name = "codex.user_prompt", - event.timestamp = %timestamp(), - conversation.id = %self.metadata.conversation_id, - app.version = %self.metadata.app_version, - auth_mode = self.metadata.auth_mode, - originator = %self.metadata.originator, - user.account_id = self.metadata.account_id, - user.email = self.metadata.account_email, - terminal.type = %self.metadata.terminal_type, - model = %self.metadata.model, - slug = %self.metadata.slug, prompt_length = %prompt.chars().count(), prompt = %prompt_to_log, ); + trace_event!( + self, + event.name = "codex.user_prompt", + prompt_length = %prompt.chars().count(), + text_input_count = text_input_count as i64, + image_input_count = image_input_count as i64, + local_image_input_count = local_image_input_count as i64, + ); } pub fn tool_decision( @@ -561,19 +596,9 @@ impl OtelManager { decision: &ReviewDecision, source: ToolDecisionSource, ) { - tracing::event!( - tracing::Level::INFO, + log_event!( + self, event.name = "codex.tool_decision", - event.timestamp = %timestamp(), - conversation.id = %self.metadata.conversation_id, - app.version = %self.metadata.app_version, - auth_mode = self.metadata.auth_mode, - originator = %self.metadata.originator, - user.account_id = self.metadata.account_id, - user.email = self.metadata.account_email, - terminal.type = %self.metadata.terminal_type, - model = %self.metadata.model, - slug = %self.metadata.slug, tool_name = %tool_name, call_id = %call_id, decision = %decision.clone().to_string().to_lowercase(), @@ -622,19 +647,9 @@ impl OtelManager { } pub fn log_tool_failed(&self, tool_name: &str, error: &str) { - tracing::event!( - tracing::Level::INFO, + log_event!( + self, event.name = "codex.tool_result", - event.timestamp = %timestamp(), - conversation.id = %self.metadata.conversation_id, - app.version = %self.metadata.app_version, - auth_mode = self.metadata.auth_mode, - originator = %self.metadata.originator, - user.account_id = self.metadata.account_id, - user.email = self.metadata.account_email, - terminal.type = %self.metadata.terminal_type, - model = %self.metadata.model, - slug = %self.metadata.slug, tool_name = %tool_name, duration_ms = %Duration::ZERO.as_millis(), success = %false, @@ -642,6 +657,17 @@ impl OtelManager { mcp_server = "", mcp_server_origin = "", ); + trace_event!( + self, + event.name = "codex.tool_result", + tool_name = %tool_name, + duration_ms = %Duration::ZERO.as_millis(), + success = %false, + output_length = error.len() as i64, + output_line_count = error.lines().count() as i64, + tool_origin = %"builtin", + error.message = %error, + ); } #[allow(clippy::too_many_arguments)] @@ -666,19 +692,9 @@ impl OtelManager { self.record_duration(TOOL_CALL_DURATION_METRIC, duration, &tags); let mcp_server = mcp_server.unwrap_or(""); let mcp_server_origin = mcp_server_origin.unwrap_or(""); - tracing::event!( - tracing::Level::INFO, + log_event!( + self, event.name = "codex.tool_result", - event.timestamp = %timestamp(), - conversation.id = %self.metadata.conversation_id, - app.version = %self.metadata.app_version, - auth_mode = self.metadata.auth_mode, - originator = %self.metadata.originator, - user.account_id = self.metadata.account_id, - user.email = self.metadata.account_email, - terminal.type = %self.metadata.terminal_type, - model = %self.metadata.model, - slug = %self.metadata.slug, tool_name = %tool_name, call_id = %call_id, arguments = %arguments, @@ -688,6 +704,19 @@ impl OtelManager { mcp_server = %mcp_server, mcp_server_origin = %mcp_server_origin, ); + trace_event!( + self, + event.name = "codex.tool_result", + tool_name = %tool_name, + call_id = %call_id, + duration_ms = %duration.as_millis(), + success = %success_str, + arguments_length = arguments.len() as i64, + output_length = output.len() as i64, + output_line_count = output.lines().count() as i64, + tool_origin = if mcp_server.is_empty() { "builtin" } else { "mcp" }, + mcp_tool = !mcp_server.is_empty(), + ); } fn record_responses_websocket_timing_metrics(&self, value: &serde_json::Value) { diff --git a/codex-rs/otel/tests/suite/mod.rs b/codex-rs/otel/tests/suite/mod.rs index 16aa0f494..79ee1e475 100644 --- a/codex-rs/otel/tests/suite/mod.rs +++ b/codex-rs/otel/tests/suite/mod.rs @@ -1,4 +1,5 @@ mod manager_metrics; +mod otel_export_routing_policy; mod otlp_http_loopback; mod runtime_summary; mod send; diff --git a/codex-rs/otel/tests/suite/otel_export_routing_policy.rs b/codex-rs/otel/tests/suite/otel_export_routing_policy.rs new file mode 100644 index 000000000..875bfd666 --- /dev/null +++ b/codex-rs/otel/tests/suite/otel_export_routing_policy.rs @@ -0,0 +1,299 @@ +use codex_otel::OtelManager; +use codex_otel::TelemetryAuthMode; +use codex_otel::otel_provider::OtelProvider; +use opentelemetry::KeyValue; +use opentelemetry::logs::AnyValue; +use opentelemetry::trace::TracerProvider as _; +use opentelemetry_sdk::logs::InMemoryLogExporter; +use opentelemetry_sdk::logs::SdkLogRecord; +use opentelemetry_sdk::logs::SdkLoggerProvider; +use opentelemetry_sdk::trace::InMemorySpanExporter; +use opentelemetry_sdk::trace::SdkTracerProvider; +use pretty_assertions::assert_eq; +use std::borrow::Cow; +use std::collections::BTreeMap; +use std::path::PathBuf; +use tracing_subscriber::Layer; +use tracing_subscriber::filter::filter_fn; +use tracing_subscriber::layer::SubscriberExt; + +use codex_protocol::ThreadId; +use codex_protocol::protocol::SessionSource; +use codex_protocol::user_input::UserInput; + +fn log_attributes(record: &SdkLogRecord) -> BTreeMap { + record + .attributes_iter() + .map(|(key, value)| (key.as_str().to_string(), any_value_to_string(value))) + .collect() +} + +fn span_event_attributes(event: &opentelemetry::trace::Event) -> BTreeMap { + event + .attributes + .iter() + .map(|KeyValue { key, value, .. }| (key.as_str().to_string(), value.to_string())) + .collect() +} + +fn any_value_to_string(value: &AnyValue) -> String { + match value { + AnyValue::Int(value) => value.to_string(), + AnyValue::Double(value) => value.to_string(), + AnyValue::String(value) => value.as_str().to_string(), + AnyValue::Boolean(value) => value.to_string(), + AnyValue::Bytes(value) => String::from_utf8_lossy(value).into_owned(), + AnyValue::ListAny(value) => format!("{value:?}"), + AnyValue::Map(value) => format!("{value:?}"), + _ => format!("{value:?}"), + } +} + +fn find_log_by_event_name<'a>( + logs: &'a [opentelemetry_sdk::logs::in_memory_exporter::LogDataWithResource], + event_name: &str, +) -> &'a opentelemetry_sdk::logs::in_memory_exporter::LogDataWithResource { + logs.iter() + .find(|log| { + log_attributes(&log.record) + .get("event.name") + .is_some_and(|value| value == event_name) + }) + .unwrap_or_else(|| panic!("missing log event: {event_name}")) +} + +fn find_span_event_by_name_attr<'a>( + events: &'a [opentelemetry::trace::Event], + event_name: &str, +) -> &'a opentelemetry::trace::Event { + events + .iter() + .find(|event| { + span_event_attributes(event) + .get("event.name") + .is_some_and(|value| value == event_name) + }) + .unwrap_or_else(|| panic!("missing span event: {event_name}")) +} + +#[test] +fn otel_export_routing_policy_routes_user_prompt_log_and_trace_events() { + let log_exporter = InMemoryLogExporter::default(); + let logger_provider = SdkLoggerProvider::builder() + .with_simple_exporter(log_exporter.clone()) + .build(); + let span_exporter = InMemorySpanExporter::default(); + let tracer_provider = SdkTracerProvider::builder() + .with_simple_exporter(span_exporter.clone()) + .build(); + let tracer = tracer_provider.tracer("sink-split-test"); + + let subscriber = tracing_subscriber::registry() + .with( + opentelemetry_appender_tracing::layer::OpenTelemetryTracingBridge::new( + &logger_provider, + ) + .with_filter(filter_fn(OtelProvider::log_export_filter)), + ) + .with( + tracing_opentelemetry::layer() + .with_tracer(tracer) + .with_filter(filter_fn(OtelProvider::trace_export_filter)), + ); + + tracing::subscriber::with_default(subscriber, || { + tracing::callsite::rebuild_interest_cache(); + let manager = OtelManager::new( + ThreadId::new(), + "gpt-5.1", + "gpt-5.1", + Some("account-id".to_string()), + Some("engineer@example.com".to_string()), + Some(TelemetryAuthMode::ApiKey), + "codex_exec".to_string(), + true, + "tty".to_string(), + SessionSource::Cli, + ); + let root_span = tracing::info_span!("root"); + let _root_guard = root_span.enter(); + manager.user_prompt(&[ + UserInput::Text { + text: "super secret prompt".to_string(), + text_elements: Vec::new(), + }, + UserInput::Image { + image_url: "https://example.com/image.png".to_string(), + }, + UserInput::LocalImage { + path: PathBuf::from("/tmp/secret.png"), + }, + ]); + }); + + logger_provider.force_flush().expect("flush logs"); + tracer_provider.force_flush().expect("flush traces"); + + let logs = log_exporter.get_emitted_logs().expect("log export"); + assert!( + logs.iter() + .all(|log| { log.record.target().map(Cow::as_ref) == Some("codex_otel.log_only") }) + ); + + let prompt_log = find_log_by_event_name(&logs, "codex.user_prompt"); + let prompt_log_attrs = log_attributes(&prompt_log.record); + assert_eq!( + prompt_log_attrs.get("prompt").map(String::as_str), + Some("super secret prompt") + ); + assert_eq!( + prompt_log_attrs.get("user.email").map(String::as_str), + Some("engineer@example.com") + ); + + let spans = span_exporter.get_finished_spans().expect("span export"); + assert_eq!(spans.len(), 1); + let span_events = &spans[0].events.events; + assert_eq!(span_events.len(), 1); + + let prompt_trace_event = find_span_event_by_name_attr(span_events, "codex.user_prompt"); + let prompt_trace_attrs = span_event_attributes(prompt_trace_event); + assert_eq!( + prompt_trace_attrs.get("prompt_length").map(String::as_str), + Some("19") + ); + assert_eq!( + prompt_trace_attrs + .get("text_input_count") + .map(String::as_str), + Some("1") + ); + assert_eq!( + prompt_trace_attrs + .get("image_input_count") + .map(String::as_str), + Some("1") + ); + assert_eq!( + prompt_trace_attrs + .get("local_image_input_count") + .map(String::as_str), + Some("1") + ); + assert!(!prompt_trace_attrs.contains_key("prompt")); + assert!(!prompt_trace_attrs.contains_key("user.email")); + assert!(!prompt_trace_attrs.contains_key("user.account_id")); +} + +#[test] +fn otel_export_routing_policy_routes_tool_result_log_and_trace_events() { + let log_exporter = InMemoryLogExporter::default(); + let logger_provider = SdkLoggerProvider::builder() + .with_simple_exporter(log_exporter.clone()) + .build(); + let span_exporter = InMemorySpanExporter::default(); + let tracer_provider = SdkTracerProvider::builder() + .with_simple_exporter(span_exporter.clone()) + .build(); + let tracer = tracer_provider.tracer("sink-split-test"); + + let subscriber = tracing_subscriber::registry() + .with( + opentelemetry_appender_tracing::layer::OpenTelemetryTracingBridge::new( + &logger_provider, + ) + .with_filter(filter_fn(OtelProvider::log_export_filter)), + ) + .with( + tracing_opentelemetry::layer() + .with_tracer(tracer) + .with_filter(filter_fn(OtelProvider::trace_export_filter)), + ); + + tracing::subscriber::with_default(subscriber, || { + tracing::callsite::rebuild_interest_cache(); + let manager = OtelManager::new( + ThreadId::new(), + "gpt-5.1", + "gpt-5.1", + Some("account-id".to_string()), + Some("engineer@example.com".to_string()), + Some(TelemetryAuthMode::ApiKey), + "codex_exec".to_string(), + true, + "tty".to_string(), + SessionSource::Cli, + ); + let root_span = tracing::info_span!("root"); + let _root_guard = root_span.enter(); + manager.tool_result_with_tags( + "shell", + "call-1", + "secret arguments", + std::time::Duration::from_millis(42), + true, + "secret output\nsecond line", + &[], + Some("internal-mcp"), + Some("stdio"), + ); + }); + + logger_provider.force_flush().expect("flush logs"); + tracer_provider.force_flush().expect("flush traces"); + + let logs = log_exporter.get_emitted_logs().expect("log export"); + assert!( + logs.iter() + .all(|log| { log.record.target().map(Cow::as_ref) == Some("codex_otel.log_only") }) + ); + + let tool_log = find_log_by_event_name(&logs, "codex.tool_result"); + let tool_log_attrs = log_attributes(&tool_log.record); + assert_eq!( + tool_log_attrs.get("arguments").map(String::as_str), + Some("secret arguments") + ); + assert_eq!( + tool_log_attrs.get("output").map(String::as_str), + Some("secret output\nsecond line") + ); + assert_eq!( + tool_log_attrs.get("mcp_server").map(String::as_str), + Some("internal-mcp") + ); + + let spans = span_exporter.get_finished_spans().expect("span export"); + assert_eq!(spans.len(), 1); + let span_events = &spans[0].events.events; + assert_eq!(span_events.len(), 1); + + let tool_trace_event = find_span_event_by_name_attr(span_events, "codex.tool_result"); + let tool_trace_attrs = span_event_attributes(tool_trace_event); + assert_eq!( + tool_trace_attrs.get("arguments_length").map(String::as_str), + Some("16") + ); + assert_eq!( + tool_trace_attrs.get("output_length").map(String::as_str), + Some("25") + ); + assert_eq!( + tool_trace_attrs + .get("output_line_count") + .map(String::as_str), + Some("2") + ); + assert_eq!( + tool_trace_attrs.get("tool_origin").map(String::as_str), + Some("mcp") + ); + assert_eq!( + tool_trace_attrs.get("mcp_tool").map(String::as_str), + Some("true") + ); + assert!(!tool_trace_attrs.contains_key("arguments")); + assert!(!tool_trace_attrs.contains_key("output")); + assert!(!tool_trace_attrs.contains_key("mcp_server")); + assert!(!tool_trace_attrs.contains_key("mcp_server_origin")); +}