diff --git a/codex-rs/app-server-protocol/schema/json/ClientRequest.json b/codex-rs/app-server-protocol/schema/json/ClientRequest.json index e183c88b0..275d0fc13 100644 --- a/codex-rs/app-server-protocol/schema/json/ClientRequest.json +++ b/codex-rs/app-server-protocol/schema/json/ClientRequest.json @@ -2119,13 +2119,6 @@ ], "type": "object" }, - "RealtimeConversationArchitecture": { - "enum": [ - "realtimeapi", - "avas" - ], - "type": "string" - }, "RealtimeConversationVersion": { "enum": [ "v1", diff --git a/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.schemas.json b/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.schemas.json index 0a18c588f..cb9c7a622 100644 --- a/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.schemas.json +++ b/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.schemas.json @@ -14330,13 +14330,6 @@ "title": "RawResponseItemCompletedNotification", "type": "object" }, - "RealtimeConversationArchitecture": { - "enum": [ - "realtimeapi", - "avas" - ], - "type": "string" - }, "RealtimeConversationVersion": { "enum": [ "v1", diff --git a/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.v2.schemas.json b/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.v2.schemas.json index 0ddfdb62b..df16d6588 100644 --- a/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.v2.schemas.json +++ b/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.v2.schemas.json @@ -10754,13 +10754,6 @@ "title": "RawResponseItemCompletedNotification", "type": "object" }, - "RealtimeConversationArchitecture": { - "enum": [ - "realtimeapi", - "avas" - ], - "type": "string" - }, "RealtimeConversationVersion": { "enum": [ "v1", diff --git a/codex-rs/app-server-protocol/schema/typescript/RealtimeConversationArchitecture.ts b/codex-rs/app-server-protocol/schema/typescript/RealtimeConversationArchitecture.ts deleted file mode 100644 index 4467e4a0f..000000000 --- a/codex-rs/app-server-protocol/schema/typescript/RealtimeConversationArchitecture.ts +++ /dev/null @@ -1,5 +0,0 @@ -// GENERATED CODE! DO NOT MODIFY BY HAND! - -// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually. - -export type RealtimeConversationArchitecture = "realtimeapi" | "avas"; diff --git a/codex-rs/app-server-protocol/schema/typescript/index.ts b/codex-rs/app-server-protocol/schema/typescript/index.ts index 786d4c72a..f7db65d5d 100644 --- a/codex-rs/app-server-protocol/schema/typescript/index.ts +++ b/codex-rs/app-server-protocol/schema/typescript/index.ts @@ -54,7 +54,6 @@ export type { NetworkPolicyRuleAction } from "./NetworkPolicyRuleAction"; export type { ParsedCommand } from "./ParsedCommand"; export type { Personality } from "./Personality"; export type { PlanType } from "./PlanType"; -export type { RealtimeConversationArchitecture } from "./RealtimeConversationArchitecture"; export type { RealtimeConversationVersion } from "./RealtimeConversationVersion"; export type { RealtimeOutputModality } from "./RealtimeOutputModality"; export type { RealtimeVoice } from "./RealtimeVoice"; diff --git a/codex-rs/app-server-protocol/src/protocol/common.rs b/codex-rs/app-server-protocol/src/protocol/common.rs index 3336f74a6..4e9a78539 100644 --- a/codex-rs/app-server-protocol/src/protocol/common.rs +++ b/codex-rs/app-server-protocol/src/protocol/common.rs @@ -1702,7 +1702,6 @@ mod tests { use codex_protocol::account::PlanType; use codex_protocol::models::BUILT_IN_PERMISSION_PROFILE_READ_ONLY; use codex_protocol::parse_command::ParsedCommand; - use codex_protocol::protocol::RealtimeConversationArchitecture; use codex_protocol::protocol::RealtimeConversationVersion; use codex_protocol::protocol::RealtimeOutputModality; use codex_protocol::protocol::RealtimeVoice; @@ -3139,7 +3138,6 @@ mod tests { let request = ClientRequest::ThreadRealtimeStart { request_id: RequestId::Integer(9), params: v2::ThreadRealtimeStartParams { - architecture: Some(RealtimeConversationArchitecture::Avas), client_managed_handoffs: Some(true), codex_responses_as_items: None, codex_response_item_prefix: None, @@ -3160,7 +3158,6 @@ mod tests { "method": "thread/realtime/start", "id": 9, "params": { - "architecture": "avas", "threadId": "thr_123", "clientManagedHandoffs": true, "codexResponsesAsItems": null, @@ -3186,7 +3183,6 @@ mod tests { let default_prompt_request = ClientRequest::ThreadRealtimeStart { request_id: RequestId::Integer(9), params: v2::ThreadRealtimeStartParams { - architecture: None, client_managed_handoffs: None, codex_responses_as_items: None, codex_response_item_prefix: None, @@ -3207,7 +3203,6 @@ mod tests { "method": "thread/realtime/start", "id": 9, "params": { - "architecture": null, "threadId": "thr_123", "clientManagedHandoffs": null, "codexResponsesAsItems": null, @@ -3228,7 +3223,6 @@ mod tests { let null_prompt_request = ClientRequest::ThreadRealtimeStart { request_id: RequestId::Integer(9), params: v2::ThreadRealtimeStartParams { - architecture: None, client_managed_handoffs: None, codex_responses_as_items: None, codex_response_item_prefix: None, @@ -3249,7 +3243,6 @@ mod tests { "method": "thread/realtime/start", "id": 9, "params": { - "architecture": null, "threadId": "thr_123", "clientManagedHandoffs": null, "codexResponsesAsItems": null, @@ -3436,7 +3429,6 @@ mod tests { let request = ClientRequest::ThreadRealtimeStart { request_id: RequestId::Integer(1), params: v2::ThreadRealtimeStartParams { - architecture: None, client_managed_handoffs: None, codex_responses_as_items: None, codex_response_item_prefix: None, diff --git a/codex-rs/app-server-protocol/src/protocol/v2/realtime.rs b/codex-rs/app-server-protocol/src/protocol/v2/realtime.rs index b30ecd7b6..4b8b2056e 100644 --- a/codex-rs/app-server-protocol/src/protocol/v2/realtime.rs +++ b/codex-rs/app-server-protocol/src/protocol/v2/realtime.rs @@ -1,6 +1,5 @@ use codex_protocol::protocol::ConversationTextRole; use codex_protocol::protocol::RealtimeAudioFrame as CoreRealtimeAudioFrame; -use codex_protocol::protocol::RealtimeConversationArchitecture; use codex_protocol::protocol::RealtimeConversationVersion; use codex_protocol::protocol::RealtimeOutputModality; use codex_protocol::protocol::RealtimeVoice; @@ -67,9 +66,6 @@ impl From for CoreRealtimeAudioFrame { #[ts(export_to = "v2/")] pub struct ThreadRealtimeStartParams { pub thread_id: String, - /// Overrides the configured realtime architecture for this session only. - #[ts(optional = nullable)] - pub architecture: Option, /// Leaves Codex response handoffs to the client's explicit append calls instead of forwarding /// them automatically. Defaults to false. #[ts(optional = nullable)] diff --git a/codex-rs/app-server/README.md b/codex-rs/app-server/README.md index 871f9cee7..5035ebf71 100644 --- a/codex-rs/app-server/README.md +++ b/codex-rs/app-server/README.md @@ -172,7 +172,7 @@ Example with notification opt-out: - `thread/inject_items` — append raw Responses API items to a loaded thread’s model-visible history without starting a user turn; returns `{}` on success. - `turn/steer` — add user input to an already in-flight regular turn without starting a new turn; returns the active `turnId` that accepted the input. `clientUserMessageId` is optional; when supplied, the corresponding `userMessage` item echoes it as `clientId`. Review and manual compaction turns reject `turn/steer`. - `turn/interrupt` — request cancellation of an in-flight turn by `(thread_id, turn_id)`; success is an empty `{}` response and the turn finishes with `status: "interrupted"`. -- `thread/realtime/start` — start a thread-scoped realtime session (experimental); pass `outputModality: "text"` or `outputModality: "audio"` to choose model output, optionally pass `model` and `version` to override configured realtime selection for this session only, and pass `includeStartupContext: false` to omit Codex's generated startup context. By default, automatic Codex text follows the protocol's speakable output path. Pass `clientManagedHandoffs: true` to disable automatic Codex response delivery so only the client's explicit append calls produce handoffs. Pass `codexResponsesAsItems: true` to send automatic Codex responses as realtime conversation items instead, and optionally pass `codexResponseItemPrefix` to prepend experiment instructions to those items. For V1 sessions, pass `codexResponseHandoffPrefix` while item mode is disabled to route automatic Codex commentary through `conversation.handoff.append` with that prefix; final answers remain unprefixed. Returns `{}` and streams `thread/realtime/*` notifications. Omit `transport` for the websocket transport, or pass `{ "type": "webrtc", "sdp": "..." }` to create a WebRTC session from a browser-generated SDP offer; the remote answer SDP is emitted as `thread/realtime/sdp`. +- `thread/realtime/start` — start a thread-scoped realtime session (experimental); pass `outputModality: "text"` or `outputModality: "audio"` to choose model output, optionally pass `model` and, for websocket transport only, `version` to override configured realtime selection for this session only, and pass `includeStartupContext: false` to omit Codex's generated startup context. By default, automatic Codex text follows the protocol's speakable output path. Pass `clientManagedHandoffs: true` to disable automatic Codex response delivery so only the client's explicit append calls produce handoffs. Pass `codexResponsesAsItems: true` to send automatic Codex responses as realtime conversation items instead, and optionally pass `codexResponseItemPrefix` to prepend experiment instructions to those items. For V1 sessions, pass `codexResponseHandoffPrefix` while item mode is disabled to route automatic Codex commentary through `conversation.handoff.append` with that prefix; final answers remain unprefixed. Returns `{}` and streams `thread/realtime/*` notifications. Omit `transport` for the websocket transport, or pass `{ "type": "webrtc", "sdp": "..." }` to create an AVAS/v1 WebRTC session from a browser-generated SDP offer; the remote answer SDP is emitted as `thread/realtime/sdp`. Explicit `version: "v2"` requests are rejected for WebRTC. - `thread/realtime/appendAudio` — append an input audio chunk to the active realtime session (experimental); returns `{}`. - `thread/realtime/appendText` — append text input to the active realtime session with a required `role` of `user`, `developer`, or `assistant` (experimental); returns `{}`. Older clients that omit `role` default to `user`. - `thread/realtime/appendSpeech` — append text that the realtime model should speak to the user (experimental); returns `{}`. @@ -885,8 +885,12 @@ Then send `offer.sdp` to app-server. Core uses `experimental_realtime_ws_backend Omit `prompt` to use Codex's default realtime backend prompt. Send `prompt: null` or `prompt: ""` when the session should start without that default backend prompt. -Clients may also pass `model` and `version` on `thread/realtime/start` to select a +Clients may also pass `model` on `thread/realtime/start` to select a different realtime session configuration without changing thread or user config. +For websocket transport, clients may pass `version` to select the realtime +protocol version for this session only. WebRTC sessions always use AVAS with +realtime v1; omitting `version` uses v1, and explicitly passing `version: "v2"` +is rejected. Pass `includeStartupContext: false` to skip Codex's startup context for this session while still using the selected backend prompt. Pass `clientManagedHandoffs: true` to suppress automatic Codex response handoffs diff --git a/codex-rs/app-server/src/request_processors/turn_processor.rs b/codex-rs/app-server/src/request_processors/turn_processor.rs index bd7aaf1ba..b7408a8c3 100644 --- a/codex-rs/app-server/src/request_processors/turn_processor.rs +++ b/codex-rs/app-server/src/request_processors/turn_processor.rs @@ -941,7 +941,6 @@ impl TurnRequestProcessor { request_id, thread.as_ref(), Op::RealtimeConversationStart(ConversationStartParams { - architecture: params.architecture, client_managed_handoffs: params.client_managed_handoffs.unwrap_or(false), codex_responses_as_items: params.codex_responses_as_items.unwrap_or(false), codex_response_item_prefix: params.codex_response_item_prefix, diff --git a/codex-rs/app-server/tests/suite/v2/experimental_api.rs b/codex-rs/app-server/tests/suite/v2/experimental_api.rs index 62ad4c067..d9d346993 100644 --- a/codex-rs/app-server/tests/suite/v2/experimental_api.rs +++ b/codex-rs/app-server/tests/suite/v2/experimental_api.rs @@ -81,7 +81,6 @@ async fn realtime_conversation_start_requires_experimental_api_capability() -> R let request_id = mcp .send_thread_realtime_start_request(ThreadRealtimeStartParams { - architecture: None, client_managed_handoffs: None, codex_responses_as_items: None, codex_response_item_prefix: None, @@ -198,7 +197,6 @@ async fn realtime_webrtc_start_requires_experimental_api_capability() -> Result< let request_id = mcp .send_thread_realtime_start_request(ThreadRealtimeStartParams { - architecture: None, client_managed_handoffs: None, codex_responses_as_items: None, codex_response_item_prefix: None, diff --git a/codex-rs/app-server/tests/suite/v2/realtime_conversation.rs b/codex-rs/app-server/tests/suite/v2/realtime_conversation.rs index d9f7d6fd3..199e3a72d 100644 --- a/codex-rs/app-server/tests/suite/v2/realtime_conversation.rs +++ b/codex-rs/app-server/tests/suite/v2/realtime_conversation.rs @@ -78,6 +78,7 @@ use wiremock::matchers::path; use wiremock::matchers::path_regex; const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10); +const DELEGATED_SHELL_TURN_TIMEOUT: Duration = Duration::from_secs(30); const DELEGATED_SHELL_TOOL_TIMEOUT_MS: u64 = 30_000; const STARTUP_CONTEXT_HEADER: &str = "Startup context from Codex."; const V2_STEERING_ACKNOWLEDGEMENT: &str = @@ -347,7 +348,6 @@ impl RealtimeE2eHarness { let start_request_id = self .mcp .send_thread_realtime_start_request(ThreadRealtimeStartParams { - architecture: None, client_managed_handoffs, thread_id: self.thread_id.clone(), codex_response_item_prefix: codex_responses_as_items @@ -385,6 +385,58 @@ impl RealtimeE2eHarness { Ok(StartedWebrtcRealtime { started, sdp }) } + async fn start_websocket_realtime(&mut self) -> Result { + self.start_websocket_realtime_with_codex_responses_as_items( + /*codex_responses_as_items*/ None, + ) + .await + } + + async fn start_websocket_realtime_with_codex_response_items( + &mut self, + ) -> Result { + self.start_websocket_realtime_with_codex_responses_as_items( + /*codex_responses_as_items*/ Some(true), + ) + .await + } + + async fn start_websocket_realtime_with_codex_responses_as_items( + &mut self, + codex_responses_as_items: Option, + ) -> Result { + let start_request_id = self + .mcp + .send_thread_realtime_start_request(ThreadRealtimeStartParams { + thread_id: self.thread_id.clone(), + client_managed_handoffs: None, + codex_response_item_prefix: codex_responses_as_items + .unwrap_or(false) + .then(|| RESPONSE_ITEM_PREFIX.to_string()), + codex_response_handoff_prefix: None, + codex_responses_as_items, + model: None, + output_modality: RealtimeOutputModality::Audio, + include_startup_context: None, + prompt: Some(Some("backend prompt".to_string())), + realtime_session_id: None, + transport: None, + version: None, + voice: None, + }) + .await?; + let start_response: JSONRPCResponse = timeout( + DEFAULT_TIMEOUT, + self.mcp + .read_stream_until_response_message(RequestId::Integer(start_request_id)), + ) + .await??; + let _: ThreadRealtimeStartResponse = to_response(start_response)?; + + self.read_notification::("thread/realtime/started") + .await + } + async fn read_notification(&mut self, method: &str) -> Result { read_notification(&mut self.mcp, method).await } @@ -619,7 +671,6 @@ async fn realtime_conversation_streams_v2_notifications() -> Result<()> { let start_request_id = mcp .send_thread_realtime_start_request(ThreadRealtimeStartParams { - architecture: None, client_managed_handoffs: None, codex_responses_as_items: None, codex_response_item_prefix: None, @@ -911,7 +962,6 @@ async fn realtime_start_can_skip_startup_context() -> Result<()> { let start_request_id = mcp .send_thread_realtime_start_request(ThreadRealtimeStartParams { - architecture: None, client_managed_handoffs: None, codex_responses_as_items: None, codex_response_item_prefix: None, @@ -1010,7 +1060,6 @@ async fn realtime_text_output_modality_requests_text_output_and_final_transcript let start_request_id = mcp .send_thread_realtime_start_request(ThreadRealtimeStartParams { - architecture: None, client_managed_handoffs: None, codex_responses_as_items: None, codex_response_item_prefix: None, @@ -1192,7 +1241,6 @@ async fn realtime_conversation_stop_emits_closed_notification() -> Result<()> { let start_request_id = mcp .send_thread_realtime_start_request(ThreadRealtimeStartParams { - architecture: None, client_managed_handoffs: None, codex_responses_as_items: None, codex_response_item_prefix: None, @@ -1297,7 +1345,6 @@ async fn realtime_webrtc_start_emits_sdp_notification() -> Result<()> { let thread_id = thread_start.thread.id; let start_request_id = mcp .send_thread_realtime_start_request(ThreadRealtimeStartParams { - architecture: None, client_managed_handoffs: None, codex_responses_as_items: None, codex_response_item_prefix: None, @@ -1326,7 +1373,7 @@ async fn realtime_webrtc_start_emits_sdp_notification() -> Result<()> { read_notification::(&mut mcp, "thread/realtime/started") .await?; assert_eq!(started.thread_id, thread_id); - assert_eq!(started.version, RealtimeConversationVersion::V2); + assert_eq!(started.version, RealtimeConversationVersion::V1); let sdp_notification = read_notification::(&mut mcp, "thread/realtime/sdp").await?; @@ -1353,7 +1400,7 @@ async fn realtime_webrtc_start_emits_sdp_notification() -> Result<()> { ); assert_eq!( realtime_server.single_handshake().uri(), - "/v1/realtime?call_id=rtc_app_test" + "/v1/realtime?intent=quicksilver&call_id=rtc_app_test" ); let stop_request_id = mcp @@ -1382,7 +1429,10 @@ async fn realtime_webrtc_start_emits_sdp_notification() -> Result<()> { let request = call_capture.single_request(); assert_eq!(request.url.path(), "/v1/realtime/calls"); - assert_eq!(request.url.query(), None); + assert_eq!( + request.url.query(), + Some("intent=quicksilver&architecture=avas") + ); assert_eq!( request .headers @@ -1391,8 +1441,7 @@ async fn realtime_webrtc_start_emits_sdp_notification() -> Result<()> { Some("multipart/form-data; boundary=codex-realtime-call-boundary") ); let body = String::from_utf8(request.body).context("multipart body should be utf-8")?; - let session = r#"{"tool_choice":"auto","type":"realtime","model":"gpt-realtime-1.5","instructions":"backend prompt\n\nstartup context","output_modalities":["audio"],"audio":{"input":{"format":{"type":"audio/pcm","rate":24000},"noise_reduction":{"type":"near_field"},"transcription":{"model":"gpt-4o-mini-transcribe"},"turn_detection":{"type":"server_vad","interrupt_response":true,"create_response":true,"silence_duration_ms":500}},"output":{"format":{"type":"audio/pcm","rate":24000},"voice":"marin"}},"tools":[{"type":"function","name":"background_agent","description":"Send a user request to the background agent. Use this as the default action. Do not rephrase the user's ask or rewrite it in your own words; pass along the user's own words. If the background agent is idle, this starts a new task and returns the final result to the user. If the background agent is already working on a task, this sends the request as guidance to steer that previous task. If the user asks to do something next, later, after this, or once current work finishes, call this tool so the work is actually queued instead of merely promising to do it later.","parameters":{"type":"object","properties":{"prompt":{"type":"string","description":"The user request to delegate to the background agent."}},"required":["prompt"],"additionalProperties":false}},{"type":"function","name":"remain_silent","description":"Call this when the best response is to say nothing. Use it instead of speaking after hidden system/control messages, after background agent updates in silent modes, or whenever acknowledging aloud would be distracting. This tool has no user-visible effect.","parameters":{"type":"object","properties":{},"additionalProperties":false}}]}"#; - let session = normalized_json_string(session)?; + let session = normalized_json_string(v1_session_create_json())?; assert_eq!( body, format!( @@ -1785,9 +1834,9 @@ async fn realtime_automatic_standalone_output_is_item_and_append_speaks() -> Res .await?; let started = harness - .start_webrtc_realtime_with_codex_response_items("v=offer\r\n") + .start_websocket_realtime_with_codex_response_items() .await?; - assert_eq!(started.started.version, RealtimeConversationVersion::V2); + assert_eq!(started.version, RealtimeConversationVersion::V2); assert_eq!( harness.sideband_outbound_request(/*request_index*/ 0).await["type"].as_str(), Some("session.update") @@ -1868,9 +1917,9 @@ async fn realtime_automatic_handoff_output_is_item_and_append_speaks() -> Result .await?; let started = harness - .start_webrtc_realtime_with_codex_response_items("v=offer\r\n") + .start_websocket_realtime_with_codex_response_items() .await?; - assert_eq!(started.started.version, RealtimeConversationVersion::V2); + assert_eq!(started.version, RealtimeConversationVersion::V2); assert_eq!( harness.sideband_outbound_request(/*request_index*/ 0).await["type"].as_str(), Some("session.update") @@ -1920,7 +1969,7 @@ async fn realtime_automatic_handoff_output_is_item_and_append_speaks() -> Result } #[tokio::test] -async fn webrtc_v2_assistant_output_without_handoff_reaches_realtime_context() -> Result<()> { +async fn websocket_v2_assistant_output_without_handoff_reaches_realtime_context() -> Result<()> { skip_if_no_network!(Ok(())); let final_answer = "long output ".repeat(1_000); @@ -1951,9 +2000,9 @@ async fn webrtc_v2_assistant_output_without_handoff_reaches_realtime_context() - .await?; let started = harness - .start_webrtc_realtime_with_codex_response_items("v=offer\r\n") + .start_websocket_realtime_with_codex_response_items() .await?; - assert_eq!(started.started.version, RealtimeConversationVersion::V2); + assert_eq!(started.version, RealtimeConversationVersion::V2); let request_id = harness .mcp @@ -2000,10 +2049,10 @@ async fn webrtc_v2_assistant_output_without_handoff_reaches_realtime_context() - } #[tokio::test] -async fn webrtc_v2_forwards_audio_and_text_between_client_and_sideband() -> Result<()> { +async fn websocket_v2_forwards_audio_and_text_between_client_and_sideband() -> Result<()> { skip_if_no_network!(Ok(())); - // Phase 1: create a v2 WebRTC conversation whose sideband sends transcript + output audio + // Phase 1: create a v2 websocket conversation whose sideband sends transcript + output audio // after the client has had a chance to append input. let mut harness = RealtimeE2eHarness::new( RealtimeTestVersion::V2, @@ -2028,13 +2077,13 @@ async fn webrtc_v2_forwards_audio_and_text_between_client_and_sideband() -> Resu ) .await?; - let started = harness.start_webrtc_realtime("v=offer\r\n").await?; - assert_eq!(started.started.version, RealtimeConversationVersion::V2); + let started = harness.start_websocket_realtime().await?; + assert_eq!(started.version, RealtimeConversationVersion::V2); assert_v2_session_update(&harness.sideband_outbound_request(/*request_index*/ 0).await)?; // Phase 2: drive app-server as the client would: append audio, append text, then receive // transcript/audio notifications that came from the sideband socket. - let thread_id = started.started.thread_id.clone(); + let thread_id = started.thread_id.clone(); harness.append_audio(thread_id.clone()).await?; harness.append_text(thread_id, "hello").await?; @@ -2083,7 +2132,7 @@ async fn webrtc_v2_forwards_audio_and_text_between_client_and_sideband() -> Resu /// Text input is append-only, so app-server should send the user message without /// requesting a new realtime response. #[tokio::test] -async fn webrtc_v2_text_input_is_append_only_while_response_is_active() -> Result<()> { +async fn websocket_v2_text_input_is_append_only_while_response_is_active() -> Result<()> { skip_if_no_network!(Ok(())); // Phase 1: script a server-side response that becomes active after the first @@ -2112,8 +2161,8 @@ async fn webrtc_v2_text_input_is_append_only_while_response_is_active() -> Resul ) .await?; - let started = harness.start_webrtc_realtime("v=offer\r\n").await?; - assert_eq!(started.started.version, RealtimeConversationVersion::V2); + let started = harness.start_websocket_realtime().await?; + assert_eq!(started.version, RealtimeConversationVersion::V2); // From here on, `sideband_outbound_request(n)` reads outbound messages to // the fake Realtime API sideband websocket. These are not client-facing @@ -2122,7 +2171,7 @@ async fn webrtc_v2_text_input_is_append_only_while_response_is_active() -> Resul // Phase 2: send the first text turn. Text input is append-only, so this // sends only the user text item. - let thread_id = started.started.thread_id.clone(); + let thread_id = started.thread_id.clone(); harness.append_text(thread_id.clone(), "first").await?; assert_v2_user_text_item( &harness.sideband_outbound_request(/*request_index*/ 1).await, @@ -2157,7 +2206,7 @@ async fn webrtc_v2_text_input_is_append_only_while_response_is_active() -> Resul /// Regression coverage for append-only Realtime V2 text input when the active /// response is cancelled instead of completed. #[tokio::test] -async fn webrtc_v2_text_input_is_append_only_when_response_is_cancelled() -> Result<()> { +async fn websocket_v2_text_input_is_append_only_when_response_is_cancelled() -> Result<()> { skip_if_no_network!(Ok(())); // Phase 1: script a server-side response that becomes active after the first @@ -2180,13 +2229,13 @@ async fn webrtc_v2_text_input_is_append_only_when_response_is_cancelled() -> Res ) .await?; - let started = harness.start_webrtc_realtime("v=offer\r\n").await?; - assert_eq!(started.started.version, RealtimeConversationVersion::V2); + let started = harness.start_websocket_realtime().await?; + assert_eq!(started.version, RealtimeConversationVersion::V2); assert_v2_session_update(&harness.sideband_outbound_request(/*request_index*/ 0).await)?; // Phase 2: send the first text turn. Text input is append-only, so this // sends only the user text item. - let thread_id = started.started.thread_id.clone(); + let thread_id = started.thread_id.clone(); harness.append_text(thread_id.clone(), "first").await?; assert_v2_user_text_item( &harness.sideband_outbound_request(/*request_index*/ 1).await, @@ -2218,8 +2267,7 @@ async fn webrtc_v2_text_input_is_append_only_when_response_is_cancelled() -> Res /// output to realtime and then requests a new `response.create` so realtime can /// react to that final output. #[tokio::test] -async fn webrtc_v2_background_agent_tool_call_delegates_and_returns_function_output() -> Result<()> -{ +async fn websocket_v2_background_agent_returns_function_output() -> Result<()> { skip_if_no_network!(Ok(())); // Phase 1: script a v2 background agent function call and a delegated Responses turn that @@ -2268,8 +2316,8 @@ async fn webrtc_v2_background_agent_tool_call_delegates_and_returns_function_out ) .await?; - let started = harness.start_webrtc_realtime("v=offer\r\n").await?; - assert_eq!(started.started.version, RealtimeConversationVersion::V2); + let started = harness.start_websocket_realtime().await?; + assert_eq!(started.version, RealtimeConversationVersion::V2); // Phase 2: wait for the delegated turn lifecycle kicked off by the v2 function-call item. let turn_started = harness @@ -2316,7 +2364,7 @@ async fn webrtc_v2_background_agent_tool_call_delegates_and_returns_function_out /// task. App-server acknowledges that steering message to realtime and then /// emits `response.create` so realtime can speak that acknowledgement. #[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn webrtc_v2_background_agent_steering_ack_requests_response_create() -> Result<()> { +async fn websocket_v2_background_agent_steering_ack_requests_response_create() -> Result<()> { skip_if_no_network!(Ok(())); // Phase 1: gate the delegated Responses turn from the first tool call so @@ -2356,8 +2404,8 @@ async fn webrtc_v2_background_agent_steering_ack_requests_response_create() -> R ) .await?; - let started = harness.start_webrtc_realtime("v=offer\r\n").await?; - assert_eq!(started.started.version, RealtimeConversationVersion::V2); + let started = harness.start_websocket_realtime().await?; + assert_eq!(started.version, RealtimeConversationVersion::V2); assert_v2_session_update(&harness.sideband_outbound_request(/*request_index*/ 0).await)?; let turn_started = harness .read_notification::("turn/started") @@ -2399,7 +2447,7 @@ async fn webrtc_v2_background_agent_steering_ack_requests_response_create() -> R } #[tokio::test] -async fn webrtc_v2_background_agent_progress_is_sent_before_function_output() -> Result<()> { +async fn websocket_v2_background_agent_progress_is_sent_before_function_output() -> Result<()> { skip_if_no_network!(Ok(())); let mut harness = RealtimeE2eHarness::new( @@ -2418,8 +2466,8 @@ async fn webrtc_v2_background_agent_progress_is_sent_before_function_output() -> ) .await?; - let started = harness.start_webrtc_realtime("v=offer\r\n").await?; - assert_eq!(started.started.version, RealtimeConversationVersion::V2); + let started = harness.start_websocket_realtime().await?; + assert_eq!(started.version, RealtimeConversationVersion::V2); let turn_completed = harness .read_notification::("turn/completed") @@ -2441,7 +2489,7 @@ async fn webrtc_v2_background_agent_progress_is_sent_before_function_output() -> } #[tokio::test] -async fn webrtc_v2_tool_call_delegated_turn_can_execute_shell_tool() -> Result<()> { +async fn websocket_v2_tool_call_delegated_turn_can_execute_shell_tool() -> Result<()> { skip_if_no_network!(Ok(())); // Phase 1: keep the two mocked OpenAI conversations explicit. The realtime sideband only @@ -2475,7 +2523,7 @@ async fn webrtc_v2_tool_call_delegated_turn_can_execute_shell_tool() -> Result<( ) .await?; - let _ = harness.start_webrtc_realtime("v=offer\r\n").await?; + let _ = harness.start_websocket_realtime().await?; // Phase 2: observe the delegated background agent turn executing the requested shell command. let started_command = wait_for_started_command_execution(&mut harness.mcp).await?; @@ -2503,9 +2551,12 @@ async fn webrtc_v2_tool_call_delegated_turn_can_execute_shell_tool() -> Result<( // Phase 3: verify the shell output reached Responses and the final delegated answer returned // to realtime as a single function-call-output item. - let turn_completed = harness - .read_notification::("turn/completed") - .await?; + let turn_completed = read_notification_with_timeout::( + &mut harness.mcp, + "turn/completed", + DELEGATED_SHELL_TURN_TIMEOUT, + ) + .await?; assert_eq!(turn_completed.thread_id, harness.thread_id); let requests = harness.main_loop_responses_requests().await?; @@ -2531,7 +2582,7 @@ async fn webrtc_v2_tool_call_delegated_turn_can_execute_shell_tool() -> Result<( } #[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn webrtc_v2_tool_call_does_not_block_sideband_audio() -> Result<()> { +async fn websocket_v2_tool_call_does_not_block_sideband_audio() -> Result<()> { skip_if_no_network!(Ok(())); // Phase 1: gate the delegated Responses stream so the sideband can send audio while the tool @@ -2574,7 +2625,7 @@ async fn webrtc_v2_tool_call_does_not_block_sideband_audio() -> Result<()> { ) .await?; - let _ = harness.start_webrtc_realtime("v=offer\r\n").await?; + let _ = harness.start_websocket_realtime().await?; let _ = harness .read_notification::("turn/started") .await?; @@ -2649,7 +2700,6 @@ async fn realtime_webrtc_start_surfaces_backend_error() -> Result<()> { let start_request_id = mcp .send_thread_realtime_start_request(ThreadRealtimeStartParams { - architecture: None, client_managed_handoffs: None, codex_responses_as_items: None, codex_response_item_prefix: None, @@ -2716,7 +2766,6 @@ async fn realtime_conversation_requires_feature_flag() -> Result<()> { let start_request_id = mcp .send_thread_realtime_start_request(ThreadRealtimeStartParams { - architecture: None, client_managed_handoffs: None, codex_responses_as_items: None, codex_response_item_prefix: None, @@ -2752,9 +2801,17 @@ async fn realtime_conversation_requires_feature_flag() -> Result<()> { async fn read_notification( mcp: &mut TestAppServer, method: &str, +) -> Result { + read_notification_with_timeout(mcp, method, DEFAULT_TIMEOUT).await +} + +async fn read_notification_with_timeout( + mcp: &mut TestAppServer, + method: &str, + timeout_duration: Duration, ) -> Result { let notification = timeout( - DEFAULT_TIMEOUT, + timeout_duration, mcp.read_stream_until_notification_message(method), ) .await??; @@ -2970,7 +3027,10 @@ fn assert_call_create_multipart( session: &str, ) -> Result<()> { assert_eq!(request.url.path(), "/v1/realtime/calls"); - assert_eq!(request.url.query(), None); + assert_eq!( + request.url.query(), + Some("intent=quicksilver&architecture=avas") + ); assert_eq!( request .headers diff --git a/codex-rs/codex-api/src/endpoint/realtime_call.rs b/codex-rs/codex-api/src/endpoint/realtime_call.rs index e2ade0266..af7060913 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_call.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_call.rs @@ -1,4 +1,5 @@ use crate::auth::SharedAuthProvider; +use crate::endpoint::realtime_websocket::RealtimeEventParser; use crate::endpoint::realtime_websocket::RealtimeSessionConfig; use crate::endpoint::realtime_websocket::session_update_session_json; use crate::endpoint::session::EndpointSession; @@ -9,7 +10,6 @@ use codex_client::HttpTransport; use codex_client::Request; use codex_client::RequestBody; use codex_client::RequestTelemetry; -use codex_protocol::protocol::RealtimeConversationArchitecture; use http::HeaderMap; use http::HeaderValue; use http::Method; @@ -120,27 +120,12 @@ impl RealtimeCallClient { sdp: String, session_config: RealtimeSessionConfig, extra_headers: HeaderMap, - ) -> Result { - self.create_with_session_architecture_and_headers( - sdp, - session_config, - RealtimeConversationArchitecture::RealtimeApi, - extra_headers, - ) - .await - } - - pub async fn create_with_session_architecture_and_headers( - &self, - sdp: String, - session_config: RealtimeSessionConfig, - architecture: RealtimeConversationArchitecture, - extra_headers: HeaderMap, ) -> Result { trace!(target: "codex_api::realtime_websocket::wire", "realtime call request SDP: {sdp}"); // WebRTC can begin inference as soon as the peer connection comes up, so the initial // session payload is sent with call creation. The sideband WebSocket still sends its normal // session.update after it joins. + validate_avas_session_config(&session_config)?; let mut session = realtime_session_json(session_config)?; if let Some(session) = session.as_object_mut() { session.remove("id"); @@ -159,7 +144,7 @@ impl RealtimeCallClient { Self::path(), extra_headers, Some(body), - |req| configure_realtime_call_request(req, architecture), + configure_realtime_call_request, ) .await?; let sdp = decode_sdp_response(resp.body.as_ref())?; @@ -191,7 +176,7 @@ impl RealtimeCallClient { extra_headers, /*body*/ None, |req| { - configure_realtime_call_request(req, architecture); + configure_realtime_call_request(req); req.headers.insert( CONTENT_TYPE, HeaderValue::from_static(MULTIPART_CONTENT_TYPE), @@ -208,17 +193,18 @@ impl RealtimeCallClient { } } -fn configure_realtime_call_request( - request: &mut Request, - architecture: RealtimeConversationArchitecture, -) { - match architecture { - RealtimeConversationArchitecture::RealtimeApi => {} - RealtimeConversationArchitecture::Avas => { - append_query_pair(&mut request.url, "intent", "quicksilver"); - append_query_pair(&mut request.url, "architecture", "avas"); - } +fn configure_realtime_call_request(request: &mut Request) { + append_query_pair(&mut request.url, "intent", "quicksilver"); + append_query_pair(&mut request.url, "architecture", "avas"); +} + +fn validate_avas_session_config(session_config: &RealtimeSessionConfig) -> Result<(), ApiError> { + if session_config.event_parser != RealtimeEventParser::V1 { + return Err(ApiError::InvalidRequest { + message: "AVAS realtime calls require realtime v1".to_string(), + }); } + Ok(()) } fn append_query_pair(url: &mut String, key: &str, value: &str) { @@ -377,10 +363,18 @@ mod tests { instructions: "hi".to_string(), model: Some("gpt-realtime".to_string()), session_id: Some(session_id.to_string()), - event_parser: RealtimeEventParser::RealtimeV2, + event_parser: RealtimeEventParser::V1, session_mode: RealtimeSessionMode::Conversational, output_modality: RealtimeOutputModality::Audio, + voice: RealtimeVoice::Cove, + } + } + + fn realtime_v2_session_config(session_id: &str) -> RealtimeSessionConfig { + RealtimeSessionConfig { + event_parser: RealtimeEventParser::RealtimeV2, voice: RealtimeVoice::Marin, + ..realtime_session_config(session_id) } } @@ -488,7 +482,10 @@ mod tests { let request = transport.last_request.lock().unwrap().clone().unwrap(); assert_eq!(request.method, Method::POST); - assert_eq!(request.url, "https://api.openai.com/v1/realtime/calls"); + assert_eq!( + request.url, + "https://api.openai.com/v1/realtime/calls?intent=quicksilver&architecture=avas" + ); assert_eq!( request.headers.get(CONTENT_TYPE).unwrap(), HeaderValue::from_static(MULTIPART_CONTENT_TYPE) @@ -524,7 +521,7 @@ mod tests { } #[tokio::test] - async fn sends_avas_session_call_query_params() { + async fn sends_session_call_with_avas_query_params() { let transport = CapturingTransport::new(); let client = RealtimeCallClient::new( transport.clone(), @@ -533,10 +530,9 @@ mod tests { ); let response = client - .create_with_session_architecture_and_headers( + .create_with_session_and_headers( "v=offer\r\n".to_string(), realtime_session_config("sess-api"), - RealtimeConversationArchitecture::Avas, HeaderMap::new(), ) .await @@ -558,6 +554,30 @@ mod tests { ); } + #[tokio::test] + async fn rejects_v2_session_call_before_sending_request() { + let transport = CapturingTransport::new(); + let client = RealtimeCallClient::new( + transport.clone(), + provider("https://api.openai.com/v1"), + Arc::new(DummyAuth), + ); + + let err = client + .create_with_session( + "v=offer\r\n".to_string(), + realtime_v2_session_config("sess-api"), + ) + .await + .expect_err("v2 session config should be rejected"); + + assert_eq!( + err.to_string(), + "invalid request: AVAS realtime calls require realtime v1" + ); + assert!(transport.last_request.lock().unwrap().is_none()); + } + #[tokio::test] async fn sends_backend_session_call_as_json_body() { let transport = CapturingTransport::new(); @@ -587,7 +607,7 @@ mod tests { assert_eq!(request.method, Method::POST); assert_eq!( request.url, - "https://chatgpt.com/backend-api/codex/realtime/calls" + "https://chatgpt.com/backend-api/codex/realtime/calls?intent=quicksilver&architecture=avas" ); let mut expected_session = realtime_session_json(realtime_session_config("sess-backend")) .expect("session should encode"); diff --git a/codex-rs/config/src/config_toml.rs b/codex-rs/config/src/config_toml.rs index 7270c3f32..f575aa27d 100644 --- a/codex-rs/config/src/config_toml.rs +++ b/codex-rs/config/src/config_toml.rs @@ -587,14 +587,12 @@ pub enum RealtimeTransport { Websocket, } -pub use codex_protocol::protocol::RealtimeConversationArchitecture as RealtimeArchitecture; pub use codex_protocol::protocol::RealtimeConversationVersion as RealtimeWsVersion; pub use codex_protocol::protocol::RealtimeVoice; #[derive(Serialize, Deserialize, Debug, Clone, Default, PartialEq, Eq, JsonSchema)] #[schemars(deny_unknown_fields)] pub struct RealtimeConfig { - pub architecture: RealtimeArchitecture, pub version: RealtimeWsVersion, #[serde(rename = "type")] pub session_type: RealtimeWsMode, @@ -605,7 +603,6 @@ pub struct RealtimeConfig { #[derive(Serialize, Deserialize, Debug, Clone, Default, PartialEq, Eq, JsonSchema)] #[schemars(deny_unknown_fields)] pub struct RealtimeToml { - pub architecture: Option, pub version: Option, #[serde(rename = "type")] pub session_type: Option, diff --git a/codex-rs/core/config.schema.json b/codex-rs/core/config.schema.json index 21c797526..f80ec3b56 100644 --- a/codex-rs/core/config.schema.json +++ b/codex-rs/core/config.schema.json @@ -2524,13 +2524,6 @@ }, "type": "object" }, - "RealtimeConversationArchitecture": { - "enum": [ - "realtimeapi", - "avas" - ], - "type": "string" - }, "RealtimeConversationVersion": { "enum": [ "v1", @@ -2541,9 +2534,6 @@ "RealtimeToml": { "additionalProperties": false, "properties": { - "architecture": { - "$ref": "#/definitions/RealtimeConversationArchitecture" - }, "transport": { "$ref": "#/definitions/RealtimeTransport" }, diff --git a/codex-rs/core/src/client.rs b/codex-rs/core/src/client.rs index 4ea8c0ff1..773a15deb 100644 --- a/codex-rs/core/src/client.rs +++ b/codex-rs/core/src/client.rs @@ -77,7 +77,6 @@ use codex_protocol::models::ResponseItem; use codex_protocol::openai_models::ModelInfo; use codex_protocol::openai_models::ReasoningEffort as ReasoningEffortConfig; use codex_protocol::protocol::InternalSessionSource; -use codex_protocol::protocol::RealtimeConversationArchitecture; use codex_protocol::protocol::SessionSource; use codex_protocol::protocol::W3cTraceContext; use codex_rollout_trace::CompactionTraceContext; @@ -583,7 +582,6 @@ impl ModelClient { &self, sdp: String, session_config: ApiRealtimeSessionConfig, - architecture: RealtimeConversationArchitecture, mut extra_headers: ApiHeaderMap, api_provider_override: Option, ) -> Result { @@ -600,12 +598,7 @@ impl ModelClient { let transport = ReqwestTransport::new(build_reqwest_client()); let api_provider = api_provider_override.unwrap_or(client_setup.api_provider); let response = ApiRealtimeCallClient::new(transport, api_provider, client_setup.api_auth) - .create_with_session_architecture_and_headers( - sdp, - session_config, - architecture, - extra_headers, - ) + .create_with_session_and_headers(sdp, session_config, extra_headers) .await .map_err(map_api_error)?; Ok(RealtimeWebrtcCallStart { diff --git a/codex-rs/core/src/config/config_tests.rs b/codex-rs/core/src/config/config_tests.rs index b13c9575f..53a31433a 100644 --- a/codex-rs/core/src/config/config_tests.rs +++ b/codex-rs/core/src/config/config_tests.rs @@ -14,7 +14,6 @@ use codex_config::config_toml::AutoReviewToml; use codex_config::config_toml::ConfigToml; use codex_config::config_toml::ExperimentalRequestUserInput; use codex_config::config_toml::ProjectConfig; -use codex_config::config_toml::RealtimeArchitecture; use codex_config::config_toml::RealtimeConfig; use codex_config::config_toml::RealtimeToml; use codex_config::config_toml::RealtimeTransport; @@ -10896,7 +10895,6 @@ async fn realtime_loads_from_config_toml() -> std::io::Result<()> { let cfg: ConfigToml = toml::from_str( r#" [realtime] -architecture = "avas" version = "v2" type = "transcription" transport = "webrtc" @@ -10908,7 +10906,6 @@ voice = "cedar" assert_eq!( cfg.realtime, Some(RealtimeToml { - architecture: Some(RealtimeArchitecture::Avas), version: Some(RealtimeWsVersion::V2), session_type: Some(RealtimeWsMode::Transcription), transport: Some(RealtimeTransport::WebRtc), @@ -10927,7 +10924,6 @@ voice = "cedar" assert_eq!( config.realtime, RealtimeConfig { - architecture: RealtimeArchitecture::Avas, version: RealtimeWsVersion::V2, session_type: RealtimeWsMode::Transcription, transport: RealtimeTransport::WebRtc, diff --git a/codex-rs/core/src/config/mod.rs b/codex-rs/core/src/config/mod.rs index a1854067f..1089ba648 100644 --- a/codex-rs/core/src/config/mod.rs +++ b/codex-rs/core/src/config/mod.rs @@ -3792,7 +3792,6 @@ impl Config { .map_or_else(RealtimeConfig::default, |realtime| { let defaults = RealtimeConfig::default(); RealtimeConfig { - architecture: realtime.architecture.unwrap_or(defaults.architecture), version: realtime.version.unwrap_or(defaults.version), session_type: realtime.session_type.unwrap_or(defaults.session_type), transport: realtime.transport.unwrap_or(defaults.transport), diff --git a/codex-rs/core/src/realtime_conversation.rs b/codex-rs/core/src/realtime_conversation.rs index b68422846..667939f10 100644 --- a/codex-rs/core/src/realtime_conversation.rs +++ b/codex-rs/core/src/realtime_conversation.rs @@ -41,7 +41,6 @@ use codex_protocol::protocol::ConversationTextRole; use codex_protocol::protocol::ErrorEvent; use codex_protocol::protocol::Event; use codex_protocol::protocol::EventMsg; -use codex_protocol::protocol::RealtimeConversationArchitecture; use codex_protocol::protocol::RealtimeConversationClosedEvent; use codex_protocol::protocol::RealtimeConversationRealtimeEvent; use codex_protocol::protocol::RealtimeConversationSdpEvent; @@ -246,7 +245,6 @@ struct ConversationState { struct RealtimeStart { api_provider: ApiProvider, - architecture: RealtimeConversationArchitecture, extra_headers: Option, client_managed_handoffs: bool, codex_responses_as_items: bool, @@ -304,7 +302,6 @@ impl RealtimeConversationManager { async fn start_inner(&self, start: RealtimeStart) -> CodexResult { let RealtimeStart { api_provider, - architecture, extra_headers, client_managed_handoffs, codex_responses_as_items, @@ -351,7 +348,6 @@ impl RealtimeConversationManager { .create_realtime_call_with_headers( sdp, session_config.clone(), - architecture, extra_headers.unwrap_or_default(), realtime_call_api_provider, ) @@ -717,7 +713,6 @@ pub(crate) async fn handle_start( struct PreparedRealtimeConversationStart { api_provider: ApiProvider, - architecture: RealtimeConversationArchitecture, extra_headers: Option, client_managed_handoffs: bool, codex_responses_as_items: bool, @@ -730,6 +725,12 @@ struct PreparedRealtimeConversationStart { transport: ConversationStartTransport, } +#[derive(Clone, Copy)] +pub(crate) enum ConfiguredRealtimeVoice { + Use, + Ignore, +} + async fn prepare_realtime_start( sess: &Arc, params: ConversationStartParams, @@ -758,16 +759,21 @@ async fn prepare_realtime_start( } else { None }; - let version = params.version.unwrap_or(config.realtime.version); - // TODO(pbakkum): Remove the realtimeapi/AVAS branch once WebRTC realtime sessions always use AVAS. - let architecture = params.architecture.unwrap_or(config.realtime.architecture); - validate_realtime_architecture( - architecture, - version, - &transport, - config.realtime.session_type, - )?; - let session_config = build_realtime_session_config(sess, ¶ms, version).await?; + let version = params.version.unwrap_or(match &transport { + ConversationStartTransport::Websocket => config.realtime.version, + ConversationStartTransport::Webrtc { .. } => RealtimeWsVersion::V1, + }); + if matches!(transport, ConversationStartTransport::Webrtc { .. }) { + validate_avas_webrtc_start(version, config.realtime.session_type)?; + } + let configured_voice = match (&transport, params.version) { + (ConversationStartTransport::Webrtc { .. }, None) => ConfiguredRealtimeVoice::Ignore, + (ConversationStartTransport::Webrtc { .. } | ConversationStartTransport::Websocket, _) => { + ConfiguredRealtimeVoice::Use + } + }; + let session_config = + build_realtime_session_config(sess, ¶ms, version, configured_voice).await?; let requested_realtime_session_id = session_config.session_id.clone(); let extra_headers = match transport { ConversationStartTransport::Websocket => { @@ -788,7 +794,6 @@ async fn prepare_realtime_start( }; Ok(PreparedRealtimeConversationStart { api_provider, - architecture, extra_headers, client_managed_handoffs: params.client_managed_handoffs, codex_responses_as_items: params.codex_responses_as_items, @@ -802,28 +807,18 @@ async fn prepare_realtime_start( }) } -fn validate_realtime_architecture( - architecture: RealtimeConversationArchitecture, +fn validate_avas_webrtc_start( version: RealtimeWsVersion, - transport: &ConversationStartTransport, session_type: RealtimeWsMode, ) -> CodexResult<()> { - if architecture != RealtimeConversationArchitecture::Avas { - return Ok(()); - } if version != RealtimeWsVersion::V1 { return Err(CodexErr::InvalidRequest( - "AVAS realtime architecture requires realtime v1".to_string(), - )); - } - if !matches!(transport, ConversationStartTransport::Webrtc { .. }) { - return Err(CodexErr::InvalidRequest( - "AVAS realtime architecture requires WebRTC transport".to_string(), + "AVAS realtime calls require realtime v1".to_string(), )); } if session_type != RealtimeWsMode::Conversational { return Err(CodexErr::InvalidRequest( - "AVAS realtime architecture requires conversational realtime".to_string(), + "AVAS realtime calls require conversational realtime".to_string(), )); } Ok(()) @@ -833,6 +828,7 @@ pub(crate) async fn build_realtime_session_config( sess: &Arc, params: &ConversationStartParams, version: RealtimeWsVersion, + configured_voice: ConfiguredRealtimeVoice, ) -> CodexResult { let config = sess.get_config().await; let prompt = prepare_realtime_backend_prompt( @@ -879,9 +875,13 @@ pub(crate) async fn build_realtime_session_config( RealtimeWsMode::Conversational => RealtimeSessionMode::Conversational, RealtimeWsMode::Transcription => RealtimeSessionMode::Transcription, }; + let config_voice = match configured_voice { + ConfiguredRealtimeVoice::Use => config.realtime.voice, + ConfiguredRealtimeVoice::Ignore => None, + }; let voice = params .voice - .or(config.realtime.voice) + .or(config_voice) .unwrap_or_else(|| default_realtime_voice(version)); validate_realtime_voice(version, voice)?; Ok(RealtimeSessionConfig { @@ -960,7 +960,6 @@ async fn handle_start_inner( ) -> CodexResult<()> { let PreparedRealtimeConversationStart { api_provider, - architecture, extra_headers, client_managed_handoffs, codex_responses_as_items, @@ -979,7 +978,6 @@ async fn handle_start_inner( }; let start = RealtimeStart { api_provider, - architecture, extra_headers, client_managed_handoffs, codex_responses_as_items, diff --git a/codex-rs/core/tests/suite/compact_remote.rs b/codex-rs/core/tests/suite/compact_remote.rs index 40c01fa1b..77561093c 100644 --- a/codex-rs/core/tests/suite/compact_remote.rs +++ b/codex-rs/core/tests/suite/compact_remote.rs @@ -203,7 +203,6 @@ async fn start_remote_realtime_server() -> responses::WebSocketTestServer { async fn start_realtime_conversation(codex: &codex_core::CodexThread) -> Result<()> { codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - architecture: None, client_managed_handoffs: false, codex_responses_as_items: false, codex_response_item_prefix: None, diff --git a/codex-rs/core/tests/suite/realtime_conversation.rs b/codex-rs/core/tests/suite/realtime_conversation.rs index 078870995..dd97e5a5d 100644 --- a/codex-rs/core/tests/suite/realtime_conversation.rs +++ b/codex-rs/core/tests/suite/realtime_conversation.rs @@ -18,7 +18,6 @@ use codex_protocol::protocol::EventMsg; use codex_protocol::protocol::InitialHistory; use codex_protocol::protocol::Op; use codex_protocol::protocol::RealtimeAudioFrame; -use codex_protocol::protocol::RealtimeConversationArchitecture; use codex_protocol::protocol::RealtimeConversationRealtimeEvent; use codex_protocol::protocol::RealtimeConversationVersion; use codex_protocol::protocol::RealtimeEvent; @@ -285,7 +284,6 @@ async fn conversation_start_audio_text_close_round_trip() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - architecture: None, client_managed_handoffs: false, codex_responses_as_items: false, codex_response_item_prefix: None, @@ -432,7 +430,6 @@ async fn conversation_start_defaults_to_v2_and_gpt_realtime_1_5() -> Result<()> test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - architecture: None, client_managed_handoffs: false, codex_responses_as_items: false, codex_response_item_prefix: None, @@ -528,7 +525,6 @@ async fn conversation_webrtc_start_posts_generated_session() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - architecture: None, client_managed_handoffs: false, codex_responses_as_items: false, codex_response_item_prefix: None, @@ -585,7 +581,10 @@ async fn conversation_webrtc_start_posts_generated_session() -> Result<()> { // begin inference before the sideband WebSocket is ready. let request = capture.single_request(); assert_eq!(request.url.path(), "/v1/realtime/calls"); - assert_eq!(request.url.query(), None); + assert_eq!( + request.url.query(), + Some("intent=quicksilver&architecture=avas") + ); assert_eq!( request .headers @@ -674,7 +673,7 @@ async fn conversation_webrtc_start_posts_generated_session() -> Result<()> { } #[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn conversation_webrtc_start_uses_avas_architecture_query() -> Result<()> { +async fn conversation_webrtc_start_uses_avas_query() -> Result<()> { skip_if_no_network!(Ok(())); let server = start_mock_server().await; @@ -713,7 +712,6 @@ async fn conversation_webrtc_start_uses_avas_architecture_query() -> Result<()> test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - architecture: Some(RealtimeConversationArchitecture::Avas), client_managed_handoffs: false, codex_responses_as_items: false, codex_response_item_prefix: None, @@ -774,6 +772,132 @@ async fn conversation_webrtc_start_uses_avas_architecture_query() -> Result<()> Ok(()) } +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn conversation_webrtc_default_v1_ignores_configured_v2_voice() -> Result<()> { + skip_if_no_network!(Ok(())); + + let server = start_mock_server().await; + let capture = RealtimeCallRequestCapture::new(); + Mock::given(method("POST")) + .and(path_regex(".*/realtime/calls$")) + .and(capture.clone()) + .respond_with( + ResponseTemplate::new(200) + .insert_header("Location", "/v1/realtime/calls/calls/rtc_voice_default") + .set_body_string("v=answer\r\n"), + ) + .mount(&server) + .await; + let realtime_server = start_websocket_server_with_headers(vec![WebSocketConnectionConfig { + requests: vec![vec![json!({ + "type": "session.updated", + "session": { "id": "sess_webrtc_voice", "instructions": "backend prompt" } + })]], + response_headers: Vec::new(), + accept_delay: None, + close_after_requests: false, + }]) + .await; + + let realtime_ws_base_url = realtime_server.uri().to_string(); + let mut builder = test_codex().with_config(move |config| { + config.experimental_realtime_ws_backend_prompt = Some("backend prompt".to_string()); + config.experimental_realtime_ws_base_url = Some(realtime_ws_base_url); + config.realtime.version = RealtimeWsVersion::V2; + config.realtime.voice = Some(RealtimeVoice::Cedar); + }); + let test = builder.build(&server).await?; + + test.codex + .submit(Op::RealtimeConversationStart(ConversationStartParams { + client_managed_handoffs: false, + codex_responses_as_items: false, + codex_response_item_prefix: None, + codex_response_handoff_prefix: None, + model: None, + output_modality: RealtimeOutputModality::Audio, + include_startup_context: true, + prompt: Some(Some("backend prompt".to_string())), + realtime_session_id: None, + transport: Some(ConversationStartTransport::Webrtc { + sdp: "v=offer\r\n".to_string(), + }), + version: None, + voice: None, + })) + .await?; + + let created = wait_for_event_match(&test.codex, |msg| match msg { + EventMsg::RealtimeConversationSdp(created) => Some(Ok(created.clone())), + EventMsg::Error(err) => Some(Err(err.clone())), + _ => None, + }) + .await + .expect("conversation call create failed"); + assert_eq!(created.sdp, "v=answer\r\n"); + + let request = capture.single_request(); + let body = String::from_utf8(request.body).context("multipart body should be utf-8")?; + assert!(body.contains(r#""type":"quicksilver""#)); + assert!(body.contains(r#""voice":"cove""#)); + assert!(!body.contains(r#""voice":"cedar""#)); + + let session_update = wait_for_websocket_request( + &realtime_server, + /*connection_index*/ 0, + /*request_index*/ 0, + ) + .await?; + assert_eq!( + session_update.body_json()["session"]["audio"]["output"]["voice"], + "cove" + ); + + test.codex.submit(Op::RealtimeConversationClose).await?; + realtime_server.shutdown().await; + Ok(()) +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn conversation_webrtc_default_v1_rejects_explicit_v2_voice() -> Result<()> { + skip_if_no_network!(Ok(())); + + let api_server = start_mock_server().await; + let mut builder = test_codex().with_config(|config| { + config.realtime.version = RealtimeWsVersion::V2; + }); + let test = builder.build(&api_server).await?; + + test.codex + .submit(Op::RealtimeConversationStart(ConversationStartParams { + client_managed_handoffs: false, + codex_responses_as_items: false, + codex_response_item_prefix: None, + codex_response_handoff_prefix: None, + model: None, + output_modality: RealtimeOutputModality::Audio, + include_startup_context: true, + prompt: Some(Some("backend prompt".to_string())), + realtime_session_id: None, + transport: Some(ConversationStartTransport::Webrtc { + sdp: "v=offer\r\n".to_string(), + }), + version: None, + voice: Some(RealtimeVoice::Cedar), + })) + .await?; + + let error = wait_for_event_match(&test.codex, |msg| match msg { + EventMsg::RealtimeConversationRealtime(RealtimeConversationRealtimeEvent { + payload: RealtimeEvent::Error(message), + }) => Some(message.clone()), + _ => None, + }) + .await; + assert!(error.contains("realtime voice `cedar` is not supported for v1")); + Ok(()) +} + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn conversation_webrtc_start_uses_configured_call_base_url_for_avas() -> Result<()> { skip_if_no_network!(Ok(())); @@ -816,7 +940,6 @@ async fn conversation_webrtc_start_uses_configured_call_base_url_for_avas() -> R test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - architecture: Some(RealtimeConversationArchitecture::Avas), client_managed_handoffs: false, codex_responses_as_items: false, codex_response_item_prefix: None, @@ -911,7 +1034,6 @@ async fn conversation_webrtc_close_while_sideband_connecting_drops_pending_join( test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - architecture: None, client_managed_handoffs: false, codex_responses_as_items: false, codex_response_item_prefix: None, @@ -1003,7 +1125,6 @@ async fn conversation_webrtc_sideband_connect_failure_closes_with_error() -> Res test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - architecture: None, client_managed_handoffs: false, codex_responses_as_items: false, codex_response_item_prefix: None, @@ -1097,7 +1218,6 @@ async fn conversation_start_uses_openai_env_key_fallback_with_chatgpt_auth() -> test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - architecture: None, client_managed_handoffs: false, codex_responses_as_items: false, codex_response_item_prefix: None, @@ -1171,7 +1291,6 @@ async fn conversation_transport_close_emits_closed_event() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - architecture: None, client_managed_handoffs: false, codex_responses_as_items: false, codex_response_item_prefix: None, @@ -1269,7 +1388,6 @@ async fn conversation_start_preflight_failure_emits_realtime_error_only() -> Res test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - architecture: None, client_managed_handoffs: false, codex_responses_as_items: false, codex_response_item_prefix: None, @@ -1321,7 +1439,6 @@ async fn conversation_start_connect_failure_emits_realtime_error_only() -> Resul test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - architecture: None, client_managed_handoffs: false, codex_responses_as_items: false, codex_response_item_prefix: None, @@ -1421,7 +1538,6 @@ async fn conversation_second_start_replaces_runtime() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - architecture: None, client_managed_handoffs: false, codex_responses_as_items: false, codex_response_item_prefix: None, @@ -1452,7 +1568,6 @@ async fn conversation_second_start_replaces_runtime() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - architecture: None, client_managed_handoffs: false, codex_responses_as_items: false, codex_response_item_prefix: None, @@ -1554,7 +1669,6 @@ async fn conversation_uses_experimental_realtime_ws_base_url_override() -> Resul test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - architecture: None, client_managed_handoffs: false, codex_responses_as_items: false, codex_response_item_prefix: None, @@ -1624,7 +1738,6 @@ async fn conversation_uses_default_realtime_backend_prompt() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - architecture: None, client_managed_handoffs: false, codex_responses_as_items: false, codex_response_item_prefix: None, @@ -1702,7 +1815,6 @@ async fn conversation_uses_empty_instructions_for_null_or_empty_prompt() -> Resu ] { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - architecture: None, client_managed_handoffs: false, codex_responses_as_items: false, codex_response_item_prefix: None, @@ -1773,7 +1885,6 @@ async fn conversation_uses_explicit_start_voice() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - architecture: None, client_managed_handoffs: false, codex_responses_as_items: false, codex_response_item_prefix: None, @@ -1836,7 +1947,6 @@ async fn conversation_uses_configured_realtime_voice() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - architecture: None, client_managed_handoffs: false, codex_responses_as_items: false, codex_response_item_prefix: None, @@ -1887,7 +1997,6 @@ async fn conversation_rejects_voice_for_wrong_realtime_version() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - architecture: None, client_managed_handoffs: false, codex_responses_as_items: false, codex_response_item_prefix: None, @@ -1939,7 +2048,6 @@ async fn conversation_uses_experimental_realtime_ws_backend_prompt_override() -> test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - architecture: None, client_managed_handoffs: false, codex_responses_as_items: false, codex_response_item_prefix: None, @@ -2017,7 +2125,6 @@ async fn conversation_uses_experimental_realtime_ws_startup_context_override() - test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - architecture: None, client_managed_handoffs: false, codex_responses_as_items: false, codex_response_item_prefix: None, @@ -2089,7 +2196,6 @@ async fn conversation_disables_realtime_startup_context_with_empty_override() -> test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - architecture: None, client_managed_handoffs: false, codex_responses_as_items: false, codex_response_item_prefix: None, @@ -2154,7 +2260,6 @@ async fn conversation_start_injects_startup_context_from_thread_history() -> Res test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - architecture: None, client_managed_handoffs: false, codex_responses_as_items: false, codex_response_item_prefix: None, @@ -2274,7 +2379,6 @@ async fn conversation_startup_context_current_thread_selects_many_turns_by_budge codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - architecture: None, client_managed_handoffs: false, codex_responses_as_items: false, codex_response_item_prefix: None, @@ -2387,7 +2491,6 @@ async fn conversation_startup_context_falls_back_to_workspace_map() -> Result<() test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - architecture: None, client_managed_handoffs: false, codex_responses_as_items: false, codex_response_item_prefix: None, @@ -2452,7 +2555,6 @@ async fn conversation_startup_context_is_truncated_and_sent_once_per_start() -> test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - architecture: None, client_managed_handoffs: false, codex_responses_as_items: false, codex_response_item_prefix: None, @@ -2538,7 +2640,6 @@ async fn conversation_user_text_turn_is_not_sent_to_realtime() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - architecture: None, client_managed_handoffs: false, codex_responses_as_items: false, codex_response_item_prefix: None, @@ -2640,7 +2741,6 @@ async fn realtime_v2_noop_tool_call_returns_empty_function_output_without_respon test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - architecture: None, client_managed_handoffs: false, codex_responses_as_items: false, codex_response_item_prefix: None, @@ -2744,7 +2844,6 @@ async fn conversation_mirrors_assistant_message_text_to_realtime_handoff() -> Re test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - architecture: None, client_managed_handoffs: false, codex_responses_as_items: false, codex_response_item_prefix: None, @@ -2884,7 +2983,6 @@ async fn conversation_handoff_persists_across_item_done_until_turn_complete() -> test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - architecture: None, client_managed_handoffs: false, codex_responses_as_items: false, codex_response_item_prefix: None, @@ -3041,7 +3139,6 @@ async fn inbound_handoff_request_starts_turn() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - architecture: None, client_managed_handoffs: false, codex_responses_as_items: false, codex_response_item_prefix: None, @@ -3156,7 +3253,6 @@ async fn inbound_handoff_request_uses_active_transcript() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - architecture: None, client_managed_handoffs: false, codex_responses_as_items: false, codex_response_item_prefix: None, @@ -3264,7 +3360,6 @@ async fn inbound_handoff_request_sends_transcript_delta_after_each_handoff() -> test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - architecture: None, client_managed_handoffs: false, codex_responses_as_items: false, codex_response_item_prefix: None, @@ -3370,7 +3465,6 @@ async fn inbound_conversation_item_does_not_start_turn_and_still_forwards_audio( test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - architecture: None, client_managed_handoffs: false, codex_responses_as_items: false, codex_response_item_prefix: None, @@ -3498,7 +3592,6 @@ async fn delegated_turn_user_role_echo_does_not_redelegate_and_still_forwards_au test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - architecture: None, client_managed_handoffs: false, codex_responses_as_items: false, codex_response_item_prefix: None, @@ -3656,7 +3749,6 @@ async fn inbound_handoff_request_does_not_block_realtime_event_forwarding() -> R test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - architecture: None, client_managed_handoffs: false, codex_responses_as_items: false, codex_response_item_prefix: None, @@ -3803,7 +3895,6 @@ async fn inbound_handoff_request_steers_active_turn() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - architecture: None, client_managed_handoffs: false, codex_responses_as_items: false, codex_response_item_prefix: None, @@ -3961,7 +4052,6 @@ async fn inbound_handoff_request_starts_turn_and_does_not_block_realtime_audio() test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - architecture: None, client_managed_handoffs: false, codex_responses_as_items: false, codex_response_item_prefix: None, diff --git a/codex-rs/protocol/src/protocol.rs b/codex-rs/protocol/src/protocol.rs index 285114d8b..99aa0a0b6 100644 --- a/codex-rs/protocol/src/protocol.rs +++ b/codex-rs/protocol/src/protocol.rs @@ -180,8 +180,6 @@ pub struct McpServerRefreshConfig { #[derive(Debug, Clone, PartialEq)] pub struct ConversationStartParams { - /// Overrides the configured realtime architecture for this session only. - pub architecture: Option, /// Whether Codex response handoffs are managed through explicit client append calls. pub client_managed_handoffs: bool, /// Sends automatic Codex responses as realtime conversation items instead of handoff appends. @@ -1549,15 +1547,6 @@ pub enum RealtimeConversationVersion { V2, } -#[derive(Debug, Clone, Copy, Default, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)] -#[serde(rename_all = "snake_case")] -pub enum RealtimeConversationArchitecture { - #[default] - #[serde(rename = "realtimeapi")] - RealtimeApi, - Avas, -} - #[derive(Debug, Clone, Deserialize, Serialize, PartialEq, JsonSchema, TS)] pub struct RealtimeConversationStartedEvent { pub realtime_session_id: Option,