From d6d03d42ea20dfb4ab2b84bbbff4755b455afac1 Mon Sep 17 00:00:00 2001 From: guinness-oai Date: Wed, 20 May 2026 16:03:51 -0700 Subject: [PATCH] [codex] Fix realtime v1 websocket compatibility (#23771) ## Why Realtime v1 websocket sessions now expect a slightly different boundary shape for text input, completed input transcripts, and connection headers. Codex was still using the older shape, so some v1 text appends could be rejected before the existing conversation flow could handle them. ## What changed - Send v1 user text items with `input_text` content - Accept v1 turn-marked input transcript events as completed transcripts - Add the v1 alpha header only for v1 realtime sessions - Cover the outbound text shape, transcript parsing, and versioned headers ## Test plan - `cargo test -p codex-api endpoint::realtime_websocket::methods::tests` - `cargo test -p codex-core quicksilver_alpha_header` --- .../endpoint/realtime_websocket/methods.rs | 20 ++++++++++++++ .../endpoint/realtime_websocket/methods_v1.rs | 2 +- .../endpoint/realtime_websocket/protocol.rs | 1 - .../realtime_websocket/protocol_v1.rs | 3 ++- codex-rs/core/src/realtime_conversation.rs | 7 +++++ .../core/src/realtime_conversation_tests.rs | 27 +++++++++++++++++++ 6 files changed, 57 insertions(+), 3 deletions(-) diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs index 9fcca1c3e..ad26549a2 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs @@ -993,6 +993,22 @@ mod tests { ); } + #[test] + fn parse_v1_input_transcript_turn_marked_event() { + let payload = json!({ + "type": "conversation.input_transcript.turn_marked", + "transcript": "hello realtime" + }) + .to_string(); + + assert_eq!( + parse_realtime_event(payload.as_str(), RealtimeEventParser::V1), + Some(RealtimeEvent::InputTranscriptDone(RealtimeTranscriptDone { + text: "hello realtime".to_string(), + })) + ); + } + #[test] fn parse_output_transcript_delta_event() { let payload = json!({ @@ -1581,6 +1597,10 @@ mod tests { .expect("text"); let third_json: Value = serde_json::from_str(&third).expect("json"); assert_eq!(third_json["type"], "conversation.item.create"); + assert_eq!( + third_json["item"]["content"][0]["type"], + Value::String("input_text".to_string()) + ); assert_eq!(third_json["item"]["content"][0]["text"], "hello agent"); let fourth = ws diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods_v1.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods_v1.rs index 19e4fa203..0f1a26908 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods_v1.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods_v1.rs @@ -21,7 +21,7 @@ pub(super) fn conversation_item_create_message(text: String) -> RealtimeOutbound r#type: ConversationItemType::Message, role: ConversationRole::User, content: vec![ConversationItemContent { - r#type: ConversationContentType::Text, + r#type: ConversationContentType::InputText, text, }], }), diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol.rs index d689f6ea9..5df4c0c50 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol.rs @@ -199,7 +199,6 @@ pub(super) struct ConversationItemContent { #[derive(Debug, Clone, Copy, Serialize)] #[serde(rename_all = "snake_case")] pub(super) enum ConversationContentType { - Text, InputText, } diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_v1.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_v1.rs index 3c1d25aed..a46485224 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_v1.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_v1.rs @@ -43,7 +43,8 @@ pub(super) fn parse_realtime_event_v1(payload: &str) -> Option { | "conversation.item.input_audio_transcription.delta" => { parse_transcript_delta_event(&parsed, "delta").map(RealtimeEvent::InputTranscriptDelta) } - "conversation.item.input_audio_transcription.completed" => { + "conversation.input_transcript.turn_marked" + | "conversation.item.input_audio_transcription.completed" => { parse_transcript_done_event(&parsed, "transcript") .map(RealtimeEvent::InputTranscriptDone) } diff --git a/codex-rs/core/src/realtime_conversation.rs b/codex-rs/core/src/realtime_conversation.rs index 249b3ae15..7f71142e1 100644 --- a/codex-rs/core/src/realtime_conversation.rs +++ b/codex-rs/core/src/realtime_conversation.rs @@ -641,12 +641,14 @@ async fn prepare_realtime_start( realtime_request_headers( requested_realtime_session_id.as_deref(), Some(realtime_api_key.as_str()), + version, )? } ConversationStartTransport::Webrtc { .. } => { realtime_request_headers( requested_realtime_session_id.as_deref(), /*api_key*/ None, + version, )? } }; @@ -973,9 +975,14 @@ fn realtime_api_key(auth: Option<&CodexAuth>, provider: &ModelProviderInfo) -> C fn realtime_request_headers( realtime_session_id: Option<&str>, api_key: Option<&str>, + version: RealtimeWsVersion, ) -> CodexResult> { let mut headers = HeaderMap::new(); + if version == RealtimeWsVersion::V1 { + headers.insert("openai-alpha", HeaderValue::from_static("quicksilver=v1")); + } + if let Some(realtime_session_id) = realtime_session_id && let Ok(realtime_session_id) = HeaderValue::from_str(realtime_session_id) { diff --git a/codex-rs/core/src/realtime_conversation_tests.rs b/codex-rs/core/src/realtime_conversation_tests.rs index a146c4386..b67205ef8 100644 --- a/codex-rs/core/src/realtime_conversation_tests.rs +++ b/codex-rs/core/src/realtime_conversation_tests.rs @@ -1,9 +1,11 @@ use super::RealtimeHandoffState; use super::RealtimeSessionKind; use super::realtime_delegation_from_handoff; +use super::realtime_request_headers; use super::realtime_text_from_handoff_request; use super::wrap_realtime_delegation_input; use async_channel::bounded; +use codex_config::config_toml::RealtimeWsVersion; use codex_protocol::protocol::RealtimeHandoffRequested; use codex_protocol::protocol::RealtimeTranscriptEntry; use pretty_assertions::assert_eq; @@ -137,3 +139,28 @@ async fn clears_active_handoff_explicitly() { *state.active_handoff.lock().await = None; assert_eq!(state.active_handoff.lock().await.clone(), None); } + +#[test] +fn uses_quicksilver_alpha_header_for_realtime_v1() { + let headers = + realtime_request_headers(Some("session_1"), Some("sk-test"), RealtimeWsVersion::V1) + .expect("headers") + .expect("headers"); + + assert_eq!( + headers + .get("openai-alpha") + .and_then(|value| value.to_str().ok()), + Some("quicksilver=v1") + ); +} + +#[test] +fn omits_quicksilver_alpha_header_for_realtime_v2() { + let headers = + realtime_request_headers(Some("session_1"), Some("sk-test"), RealtimeWsVersion::V2) + .expect("headers") + .expect("headers"); + + assert!(headers.get("openai-alpha").is_none()); +}