diff --git a/codex-rs/core/src/realtime_conversation.rs b/codex-rs/core/src/realtime_conversation.rs index e74973671..f81d6c094 100644 --- a/codex-rs/core/src/realtime_conversation.rs +++ b/codex-rs/core/src/realtime_conversation.rs @@ -805,7 +805,9 @@ async fn handle_start_inner( if let Some(text) = maybe_routed_text { debug!(text = %text, "[realtime-text] realtime conversation text output"); let sess_for_routed_text = Arc::clone(&sess_clone); - sess_for_routed_text.route_realtime_text_input(text).await; + sess_for_routed_text + .route_realtime_text_input(wrap_realtime_delegation_input(&text)) + .await; } if !fanout_realtime_active.load(Ordering::Relaxed) { break; @@ -867,6 +869,20 @@ fn realtime_text_from_handoff_request(handoff: &RealtimeHandoffRequested) -> Opt .or((!handoff.input_transcript.is_empty()).then_some(handoff.input_transcript.clone())) } +fn wrap_realtime_delegation_input(input: &str) -> String { + format!( + "\n {}\n", + escape_xml_text(input) + ) +} + +fn escape_xml_text(input: &str) -> String { + input + .replace('&', "&") + .replace('<', "<") + .replace('>', ">") +} + fn realtime_api_key(auth: Option<&CodexAuth>, provider: &ModelProviderInfo) -> CodexResult { if let Some(api_key) = provider.api_key()? { return Ok(api_key); diff --git a/codex-rs/core/src/realtime_conversation_tests.rs b/codex-rs/core/src/realtime_conversation_tests.rs index 0a32d063c..cda130e18 100644 --- a/codex-rs/core/src/realtime_conversation_tests.rs +++ b/codex-rs/core/src/realtime_conversation_tests.rs @@ -1,6 +1,7 @@ use super::RealtimeHandoffState; use super::RealtimeSessionKind; use super::realtime_text_from_handoff_request; +use super::wrap_realtime_delegation_input; use async_channel::bounded; use codex_protocol::protocol::RealtimeHandoffRequested; use codex_protocol::protocol::RealtimeTranscriptEntry; @@ -54,6 +55,22 @@ fn ignores_empty_handoff_request_input_transcript() { assert_eq!(realtime_text_from_handoff_request(&handoff), None); } +#[test] +fn wraps_realtime_delegation_input() { + assert_eq!( + wrap_realtime_delegation_input("hello"), + "\n hello\n" + ); +} + +#[test] +fn wraps_realtime_delegation_input_with_xml_escaping() { + assert_eq!( + wrap_realtime_delegation_input("use a < b && c > d"), + "\n use a < b && c > d\n" + ); +} + #[tokio::test] async fn clears_active_handoff_explicitly() { let (tx, _rx) = bounded(1); diff --git a/codex-rs/core/tests/suite/realtime_conversation.rs b/codex-rs/core/tests/suite/realtime_conversation.rs index 4759f5daf..add4065f7 100644 --- a/codex-rs/core/tests/suite/realtime_conversation.rs +++ b/codex-rs/core/tests/suite/realtime_conversation.rs @@ -2417,11 +2417,8 @@ async fn inbound_handoff_request_starts_turn() -> Result<()> { let request = response_mock.single_request(); let user_texts = request.message_input_texts("user"); - assert!( - user_texts - .iter() - .any(|text| text == "user: text from realtime") - ); + assert!(user_texts.iter().any(|text| text + == "\n user: text from realtime\n")); realtime_server.shutdown().await; Ok(()) @@ -2503,7 +2500,7 @@ async fn inbound_handoff_request_uses_active_transcript() -> Result<()> { let request = response_mock.single_request(); let user_texts = request.message_input_texts("user"); assert!(user_texts.iter().any(|text| text - == "assistant: assistant context\nuser: delegated query\nassistant: assist confirm")); + == "\n assistant: assistant context\nuser: delegated query\nassistant: assist confirm\n")); realtime_server.shutdown().await; Ok(()) @@ -2617,23 +2614,14 @@ async fn inbound_handoff_request_clears_active_transcript_after_each_handoff() - assert_eq!(requests.len(), 2); let first_user_texts = requests[0].message_input_texts("user"); - assert!( - first_user_texts - .iter() - .any(|text| text == "user: first question") - ); + assert!(first_user_texts.iter().any(|text| text + == "\n user: first question\n")); let second_user_texts = requests[1].message_input_texts("user"); - assert!( - second_user_texts - .iter() - .any(|text| text == "user: second question") - ); - assert!( - !second_user_texts - .iter() - .any(|text| text == "user: first question\nuser: second question") - ); + assert!(second_user_texts.iter().any(|text| text + == "\n user: second question\n")); + assert!(!second_user_texts.iter().any(|text| text + == "\n user: first question\nuser: second question\n")); realtime_server.shutdown().await; Ok(()) @@ -3156,14 +3144,12 @@ async fn inbound_handoff_request_steers_active_turn() -> Result<()> { assert!( !first_texts .iter() - .any(|text| text == "user: steer via realtime") + .any(|text| text + == "\n user: steer via realtime\n") ); assert!(second_texts.iter().any(|text| text == "first prompt")); - assert!( - second_texts - .iter() - .any(|text| text == "user: steer via realtime") - ); + assert!(second_texts.iter().any(|text| text + == "\n user: steer via realtime\n")); realtime_server.shutdown().await; api_server.shutdown().await; @@ -3278,7 +3264,9 @@ async fn inbound_handoff_request_starts_turn_and_does_not_block_realtime_audio() assert_eq!(requests.len(), 1); let first_body: Value = serde_json::from_slice(&requests[0]).expect("parse first request"); let first_texts = message_input_texts(&first_body, "user"); - let expected_text = format!("user: {delegated_text}"); + let expected_text = format!( + "\n user: {delegated_text}\n" + ); assert!(first_texts.iter().any(|text| text == &expected_text)); realtime_server.shutdown().await;