[codex] add roles to realtime append text (#27936)

## Summary

Add an explicit `user` or `developer` role to
`thread/realtime/appendText` and propagate it through the realtime input
queue into `conversation.item.create`. Requests from older JSON clients
that omit the field still default to `user`, so existing behavior is
unchanged.

This lets app-provided context such as memory retain developer authority
without bypassing app-server through a renderer-owned data channel. The
app-server schemas, API documentation, and focused protocol and
websocket tests are updated to match the new contract.

The Codex Apps consumer is tracked in
[openai/openai#1025261](https://github.com/openai/openai/pull/1025261).
This commit is contained in:
Alex Gamble
2026-06-12 15:05:37 -07:00
committed by GitHub
Unverified
parent 9915d34684
commit 216dee1189
18 changed files with 176 additions and 50 deletions
@@ -17,6 +17,7 @@ use crate::error::ApiError;
use crate::provider::Provider;
use codex_client::backoff;
use codex_client::maybe_build_rustls_client_config_with_custom_ca;
use codex_protocol::protocol::ConversationTextRole;
use codex_protocol::protocol::RealtimeTranscriptDelta;
use codex_utils_rustls_provider::ensure_rustls_crypto_provider;
use futures::SinkExt;
@@ -227,8 +228,12 @@ impl RealtimeWebsocketConnection {
self.writer.send_audio_frame(frame).await
}
pub async fn send_conversation_item_create(&self, text: String) -> Result<(), ApiError> {
self.writer.send_conversation_item_create(text).await
pub async fn send_conversation_item_create(
&self,
text: String,
role: ConversationTextRole,
) -> Result<(), ApiError> {
self.writer.send_conversation_item_create(text, role).await
}
pub async fn send_conversation_function_call_output(
@@ -286,9 +291,17 @@ impl RealtimeWebsocketWriter {
.await
}
pub async fn send_conversation_item_create(&self, text: String) -> Result<(), ApiError> {
self.send_json(&conversation_item_create_message(self.event_parser, text))
.await
pub async fn send_conversation_item_create(
&self,
text: String,
role: ConversationTextRole,
) -> Result<(), ApiError> {
self.send_json(&conversation_item_create_message(
self.event_parser,
text,
role,
))
.await
}
pub async fn send_conversation_handoff_append(
@@ -1609,6 +1622,7 @@ mod tests {
.expect("text");
let third_json: Value = serde_json::from_str(&third).expect("json");
assert_eq!(third_json["type"], "conversation.item.create");
assert_eq!(third_json["item"]["role"], "developer");
assert_eq!(
third_json["item"]["content"][0]["type"],
Value::String("input_text".to_string())
@@ -1746,7 +1760,10 @@ mod tests {
.await
.expect("send audio");
connection
.send_conversation_item_create("hello agent".to_string())
.send_conversation_item_create(
"hello agent".to_string(),
ConversationTextRole::Developer,
)
.await
.expect("send item");
connection
@@ -1948,6 +1965,7 @@ mod tests {
.expect("text");
let second_json: Value = serde_json::from_str(&second).expect("json");
assert_eq!(second_json["type"], "conversation.item.create");
assert_eq!(second_json["item"]["role"], "developer");
assert_eq!(
second_json["item"]["type"],
Value::String("message".to_string())
@@ -2030,7 +2048,10 @@ mod tests {
);
connection
.send_conversation_item_create("delegate this".to_string())
.send_conversation_item_create(
"delegate this".to_string(),
ConversationTextRole::Developer,
)
.await
.expect("send text item");
connection
@@ -13,6 +13,7 @@ use crate::endpoint::realtime_websocket::protocol::RealtimeSessionConfig;
use crate::endpoint::realtime_websocket::protocol::RealtimeSessionMode;
use crate::endpoint::realtime_websocket::protocol::RealtimeVoice;
use crate::endpoint::realtime_websocket::protocol::SessionUpdateSession;
use codex_protocol::protocol::ConversationTextRole;
use serde_json::Result as JsonResult;
use serde_json::Value;
use serde_json::to_value;
@@ -33,10 +34,11 @@ pub(super) fn normalized_session_mode(
pub(super) fn conversation_item_create_message(
event_parser: RealtimeEventParser,
text: String,
role: ConversationTextRole,
) -> RealtimeOutboundMessage {
match event_parser {
RealtimeEventParser::V1 => v1_conversation_item_create_message(text),
RealtimeEventParser::RealtimeV2 => v2_conversation_item_create_message(text),
RealtimeEventParser::V1 => v1_conversation_item_create_message(text, role),
RealtimeEventParser::RealtimeV2 => v2_conversation_item_create_message(text, role),
}
}
@@ -5,7 +5,6 @@ use crate::endpoint::realtime_websocket::protocol::ConversationItemContent;
use crate::endpoint::realtime_websocket::protocol::ConversationItemPayload;
use crate::endpoint::realtime_websocket::protocol::ConversationItemType;
use crate::endpoint::realtime_websocket::protocol::ConversationMessageItem;
use crate::endpoint::realtime_websocket::protocol::ConversationRole;
use crate::endpoint::realtime_websocket::protocol::RealtimeOutboundMessage;
use crate::endpoint::realtime_websocket::protocol::RealtimeVoice;
use crate::endpoint::realtime_websocket::protocol::SessionAudio;
@@ -14,12 +13,16 @@ use crate::endpoint::realtime_websocket::protocol::SessionAudioInput;
use crate::endpoint::realtime_websocket::protocol::SessionAudioOutput;
use crate::endpoint::realtime_websocket::protocol::SessionType;
use crate::endpoint::realtime_websocket::protocol::SessionUpdateSession;
use codex_protocol::protocol::ConversationTextRole;
pub(super) fn conversation_item_create_message(text: String) -> RealtimeOutboundMessage {
pub(super) fn conversation_item_create_message(
text: String,
role: ConversationTextRole,
) -> RealtimeOutboundMessage {
RealtimeOutboundMessage::ConversationItemCreate {
item: ConversationItemPayload::Message(ConversationMessageItem {
r#type: ConversationItemType::Message,
role: ConversationRole::User,
role,
content: vec![ConversationItemContent {
r#type: ConversationContentType::InputText,
text,
@@ -6,7 +6,6 @@ use crate::endpoint::realtime_websocket::protocol::ConversationItemContent;
use crate::endpoint::realtime_websocket::protocol::ConversationItemPayload;
use crate::endpoint::realtime_websocket::protocol::ConversationItemType;
use crate::endpoint::realtime_websocket::protocol::ConversationMessageItem;
use crate::endpoint::realtime_websocket::protocol::ConversationRole;
use crate::endpoint::realtime_websocket::protocol::NoiseReductionType;
use crate::endpoint::realtime_websocket::protocol::RealtimeOutboundMessage;
use crate::endpoint::realtime_websocket::protocol::RealtimeOutputModality;
@@ -25,6 +24,7 @@ use crate::endpoint::realtime_websocket::protocol::SessionTurnDetection;
use crate::endpoint::realtime_websocket::protocol::SessionType;
use crate::endpoint::realtime_websocket::protocol::SessionUpdateSession;
use crate::endpoint::realtime_websocket::protocol::TurnDetectionType;
use codex_protocol::protocol::ConversationTextRole;
use serde_json::json;
const REALTIME_V2_OUTPUT_MODALITY_AUDIO: &str = "audio";
@@ -36,11 +36,14 @@ const REALTIME_V2_SILENCE_TOOL_NAME: &str = "remain_silent";
const REALTIME_V2_SILENCE_TOOL_DESCRIPTION: &str = "Call this when the best response is to say nothing. Use it instead of speaking after hidden system/control messages, after background agent updates in silent modes, or whenever acknowledging aloud would be distracting. This tool has no user-visible effect.";
const REALTIME_V2_INPUT_TRANSCRIPTION_MODEL: &str = "gpt-4o-mini-transcribe";
pub(super) fn conversation_item_create_message(text: String) -> RealtimeOutboundMessage {
pub(super) fn conversation_item_create_message(
text: String,
role: ConversationTextRole,
) -> RealtimeOutboundMessage {
RealtimeOutboundMessage::ConversationItemCreate {
item: ConversationItemPayload::Message(ConversationMessageItem {
r#type: ConversationItemType::Message,
role: ConversationRole::User,
role,
content: vec![ConversationItemContent {
r#type: ConversationContentType::InputText,
text,
@@ -1,5 +1,6 @@
use crate::endpoint::realtime_websocket::protocol_v1::parse_realtime_event_v1;
use crate::endpoint::realtime_websocket::protocol_v2::parse_realtime_event_v2;
use codex_protocol::protocol::ConversationTextRole;
pub use codex_protocol::protocol::RealtimeAudioFrame;
pub use codex_protocol::protocol::RealtimeEvent;
pub use codex_protocol::protocol::RealtimeOutputModality;
@@ -157,7 +158,7 @@ pub(super) struct SessionAudioOutputFormat {
pub(super) struct ConversationMessageItem {
#[serde(rename = "type")]
pub(super) r#type: ConversationItemType,
pub(super) role: ConversationRole,
pub(super) role: ConversationTextRole,
pub(super) content: Vec<ConversationItemContent>,
}
@@ -168,12 +169,6 @@ pub(super) enum ConversationItemType {
FunctionCallOutput,
}
#[derive(Debug, Clone, Copy, Serialize)]
#[serde(rename_all = "snake_case")]
pub(super) enum ConversationRole {
User,
}
#[derive(Debug, Clone, Serialize)]
#[serde(untagged)]
pub(super) enum ConversationItemPayload {