[codex] Emit image view as core item (#20512)

## Why

Image-view results should be represented as a core-produced turn item
instead of being reconstructed by app-server. At the same time, existing
rollout/history paths still understand the legacy `ViewImageToolCall`
event, so this keeps that event as compatibility output generated from
the new item lifecycle.

## What changed

- Added `TurnItem::ImageView` to `codex-protocol`.
- Emitted image-view item start/completion directly from the core
`view_image` handler.
- Kept `ViewImageToolCall` as a legacy event and generate it from
completed `TurnItem::ImageView` items.
- Kept `thread_history.rs` on the legacy `ViewImageToolCall` replay
path, with `ImageView` item lifecycle events ignored there.
- Updated app-server protocol conversion, rollout persistence, and
affected exhaustive event matches for the new item plus legacy fan-out
shape.

## Verification

- `cargo test -p codex-protocol -p codex-app-server-protocol -p
codex-rollout -p codex-rollout-trace -p codex-mcp-server -p
codex-app-server --lib`
- `cargo test -p codex-core --test all
view_image_tool_attaches_local_image`
- `just fix -p codex-protocol -p codex-core -p codex-app-server-protocol
-p codex-app-server -p codex-rollout -p codex-rollout-trace -p
codex-mcp-server`
- `git diff --check`
This commit is contained in:
pakrym-oai
2026-05-01 11:28:30 -07:00
committed by GitHub
Unverified
parent 610eefb86b
commit aed74e5ee4
10 changed files with 83 additions and 46 deletions
@@ -356,6 +356,7 @@ impl ThreadHistoryBuilder {
| codex_protocol::items::TurnItem::AgentMessage(_)
| codex_protocol::items::TurnItem::Reasoning(_)
| codex_protocol::items::TurnItem::WebSearch(_)
| codex_protocol::items::TurnItem::ImageView(_)
| codex_protocol::items::TurnItem::ImageGeneration(_)
| codex_protocol::items::TurnItem::FileChange(_)
| codex_protocol::items::TurnItem::ContextCompaction(_) => {}
@@ -378,6 +379,7 @@ impl ThreadHistoryBuilder {
| codex_protocol::items::TurnItem::AgentMessage(_)
| codex_protocol::items::TurnItem::Reasoning(_)
| codex_protocol::items::TurnItem::WebSearch(_)
| codex_protocol::items::TurnItem::ImageView(_)
| codex_protocol::items::TurnItem::ImageGeneration(_)
| codex_protocol::items::TurnItem::FileChange(_)
| codex_protocol::items::TurnItem::ContextCompaction(_) => {}
@@ -6463,6 +6463,10 @@ impl From<CoreTurnItem> for ThreadItem {
query: search.query,
action: Some(WebSearchAction::from(search.action)),
},
CoreTurnItem::ImageView(image) => ThreadItem::ImageView {
id: image.id,
path: image.path,
},
CoreTurnItem::ImageGeneration(image) => ThreadItem::ImageGeneration {
id: image.id,
status: image.status,
@@ -8089,6 +8093,7 @@ mod tests {
use codex_protocol::items::AgentMessageContent;
use codex_protocol::items::AgentMessageItem;
use codex_protocol::items::FileChangeItem;
use codex_protocol::items::ImageViewItem;
use codex_protocol::items::ReasoningItem;
use codex_protocol::items::TurnItem;
use codex_protocol::items::UserMessageItem;
@@ -10370,6 +10375,19 @@ mod tests {
}
);
let image_view_item = TurnItem::ImageView(ImageViewItem {
id: "view-image-1".to_string(),
path: test_path_buf("/tmp/view-image.png").abs(),
});
assert_eq!(
ThreadItem::from(image_view_item),
ThreadItem::ImageView {
id: "view-image-1".to_string(),
path: test_path_buf("/tmp/view-image.png").abs(),
}
);
let file_change_item = TurnItem::FileChange(FileChangeItem {
id: "patch-1".to_string(),
changes: [(
@@ -954,28 +954,7 @@ pub(crate) async fn apply_bespoke_event_handling(
}))
.await;
}
EventMsg::ViewImageToolCall(view_image_event) => {
let item = ThreadItem::ImageView {
id: view_image_event.call_id.clone(),
path: view_image_event.path.clone(),
};
let started = ItemStartedNotification {
thread_id: conversation_id.to_string(),
turn_id: event_turn_id.clone(),
item: item.clone(),
};
outgoing
.send_server_notification(ServerNotification::ItemStarted(started))
.await;
let completed = ItemCompletedNotification {
thread_id: conversation_id.to_string(),
turn_id: event_turn_id.clone(),
item,
};
outgoing
.send_server_notification(ServerNotification::ItemCompleted(completed))
.await;
}
EventMsg::ViewImageToolCall(_) => {}
EventMsg::EnteredReviewMode(review_request) => {
let review = review_request
.user_facing_hint
+1 -1
View File
@@ -1470,9 +1470,9 @@ pub(super) fn realtime_text_for_event(msg: &EventMsg) -> Option<String> {
| EventMsg::PatchApplyBegin(_)
| EventMsg::PatchApplyUpdated(_)
| EventMsg::PatchApplyEnd(_)
| EventMsg::ViewImageToolCall(_)
| EventMsg::ImageGenerationBegin(_)
| EventMsg::ImageGenerationEnd(_)
| EventMsg::ViewImageToolCall(_)
| EventMsg::ExecApprovalRequest(_)
| EventMsg::RequestPermissions(_)
| EventMsg::RequestUserInput(_)
+8 -11
View File
@@ -1,3 +1,5 @@
use codex_protocol::items::ImageViewItem;
use codex_protocol::items::TurnItem;
use codex_protocol::models::DEFAULT_IMAGE_DETAIL;
use codex_protocol::models::FunctionCallOutputBody;
use codex_protocol::models::FunctionCallOutputContentItem;
@@ -17,8 +19,6 @@ use crate::tools::context::ToolPayload;
use crate::tools::handlers::parse_arguments;
use crate::tools::registry::ToolHandler;
use crate::tools::registry::ToolKind;
use codex_protocol::protocol::EventMsg;
use codex_protocol::protocol::ViewImageToolCallEvent;
pub struct ViewImageHandler;
@@ -152,15 +152,12 @@ impl ToolHandler for ViewImageHandler {
})?;
let image_url = image.into_data_url();
session
.send_event(
turn.as_ref(),
EventMsg::ViewImageToolCall(ViewImageToolCallEvent {
call_id,
path: event_path,
}),
)
.await;
let item = TurnItem::ImageView(ImageViewItem {
id: call_id,
path: event_path,
});
session.emit_turn_item_started(turn.as_ref(), &item).await;
session.emit_turn_item_completed(turn.as_ref(), item).await;
Ok(ViewImageOutput {
image_url,
+34 -9
View File
@@ -299,12 +299,26 @@ async fn view_image_tool_attaches_local_image() -> anyhow::Result<()> {
))
.await?;
let mut tool_event = None;
let mut item_started = None;
let mut item_completed = None;
let mut legacy_event = None;
wait_for_event_with_timeout(
codex,
|event| match event {
EventMsg::ViewImageToolCall(_) => {
tool_event = Some(event.clone());
EventMsg::ItemStarted(event) => {
if matches!(&event.item, codex_protocol::items::TurnItem::ImageView(_)) {
item_started = Some(event.item.clone());
}
false
}
EventMsg::ItemCompleted(event) => {
if matches!(&event.item, codex_protocol::items::TurnItem::ImageView(_)) {
item_completed = Some(event.item.clone());
}
false
}
EventMsg::ViewImageToolCall(event) => {
legacy_event = Some(event.clone());
false
}
EventMsg::TurnComplete(_) => true,
@@ -316,12 +330,23 @@ async fn view_image_tool_attaches_local_image() -> anyhow::Result<()> {
)
.await;
let tool_event = match tool_event.expect("view image tool event emitted") {
EventMsg::ViewImageToolCall(event) => event,
_ => unreachable!("stored event must be ViewImageToolCall"),
};
assert_eq!(tool_event.call_id, call_id);
assert_eq!(tool_event.path, abs_path);
match item_started.expect("view image item started event emitted") {
codex_protocol::items::TurnItem::ImageView(item) => {
assert_eq!(item.id, call_id);
assert_eq!(item.path, abs_path);
}
other => panic!("expected ImageView item, got {other:?}"),
}
match item_completed.expect("view image item completed event emitted") {
codex_protocol::items::TurnItem::ImageView(item) => {
assert_eq!(item.id, call_id);
assert_eq!(item.path, abs_path);
}
other => panic!("expected ImageView item, got {other:?}"),
}
let legacy_event = legacy_event.expect("legacy view image event emitted");
assert_eq!(legacy_event.call_id, call_id);
assert_eq!(legacy_event.path, abs_path);
let req = mock.single_request();
let body = req.body_json();
+1 -1
View File
@@ -356,9 +356,9 @@ async fn run_codex_tool_session_inner(
| EventMsg::TurnAborted(_)
| EventMsg::UserMessage(_)
| EventMsg::ShutdownComplete
| EventMsg::ViewImageToolCall(_)
| EventMsg::ImageGenerationBegin(_)
| EventMsg::ImageGenerationEnd(_)
| EventMsg::ViewImageToolCall(_)
| EventMsg::RawResponseItem(_)
| EventMsg::EnteredReviewMode(_)
| EventMsg::ItemStarted(_)
+15
View File
@@ -14,6 +14,7 @@ use crate::protocol::PatchApplyBeginEvent;
use crate::protocol::PatchApplyEndEvent;
use crate::protocol::PatchApplyStatus;
use crate::protocol::UserMessageEvent;
use crate::protocol::ViewImageToolCallEvent;
use crate::protocol::WebSearchEndEvent;
use crate::user_input::ByteRange;
use crate::user_input::TextElement;
@@ -38,6 +39,7 @@ pub enum TurnItem {
Plan(PlanItem),
Reasoning(ReasoningItem),
WebSearch(WebSearchItem),
ImageView(ImageViewItem),
ImageGeneration(ImageGenerationItem),
FileChange(FileChangeItem),
ContextCompaction(ContextCompactionItem),
@@ -121,6 +123,12 @@ pub struct WebSearchItem {
pub action: WebSearchAction,
}
#[derive(Debug, Clone, Deserialize, Serialize, TS, JsonSchema, PartialEq)]
pub struct ImageViewItem {
pub id: String,
pub path: AbsolutePathBuf,
}
#[derive(Debug, Clone, Deserialize, Serialize, TS, JsonSchema, PartialEq)]
pub struct ImageGenerationItem {
pub id: String,
@@ -439,6 +447,7 @@ impl TurnItem {
TurnItem::Plan(item) => item.id.clone(),
TurnItem::Reasoning(item) => item.id.clone(),
TurnItem::WebSearch(item) => item.id.clone(),
TurnItem::ImageView(item) => item.id.clone(),
TurnItem::ImageGeneration(item) => item.id.clone(),
TurnItem::FileChange(item) => item.id.clone(),
TurnItem::ContextCompaction(item) => item.id.clone(),
@@ -452,6 +461,12 @@ impl TurnItem {
TurnItem::AgentMessage(item) => item.as_legacy_events(),
TurnItem::Plan(_) => Vec::new(),
TurnItem::WebSearch(item) => vec![item.as_legacy_event()],
TurnItem::ImageView(item) => {
vec![EventMsg::ViewImageToolCall(ViewImageToolCallEvent {
call_id: item.id.clone(),
path: item.path.clone(),
})]
}
TurnItem::ImageGeneration(item) => vec![item.as_legacy_event()],
TurnItem::FileChange(item) => item
.as_legacy_end_event(String::new())
+1
View File
@@ -1836,6 +1836,7 @@ impl HasLegacyEvent for ItemStartedEvent {
TurnItem::WebSearch(item) => vec![EventMsg::WebSearchBegin(WebSearchBeginEvent {
call_id: item.id.clone(),
})],
TurnItem::ImageView(_) => Vec::new(),
TurnItem::ImageGeneration(item) => {
vec![EventMsg::ImageGenerationBegin(ImageGenerationBeginEvent {
call_id: item.id.clone(),
+2 -2
View File
@@ -243,11 +243,11 @@ pub(crate) fn tool_runtime_trace_event(event: &EventMsg) -> Option<ToolRuntimeTr
| EventMsg::WebSearchEnd(_)
| EventMsg::ImageGenerationBegin(_)
| EventMsg::ImageGenerationEnd(_)
| EventMsg::ViewImageToolCall(_)
| EventMsg::ExecCommandBegin(_)
| EventMsg::ExecCommandOutputDelta(_)
| EventMsg::TerminalInteraction(_)
| EventMsg::ExecCommandEnd(_)
| EventMsg::ViewImageToolCall(_)
| EventMsg::ExecApprovalRequest(_)
| EventMsg::RequestPermissions(_)
| EventMsg::RequestUserInput(_)
@@ -318,11 +318,11 @@ pub(crate) fn wrapped_protocol_event_type(event: &EventMsg) -> Option<&'static s
| EventMsg::WebSearchEnd(_)
| EventMsg::ImageGenerationBegin(_)
| EventMsg::ImageGenerationEnd(_)
| EventMsg::ViewImageToolCall(_)
| EventMsg::ExecCommandBegin(_)
| EventMsg::ExecCommandOutputDelta(_)
| EventMsg::TerminalInteraction(_)
| EventMsg::ExecCommandEnd(_)
| EventMsg::ViewImageToolCall(_)
| EventMsg::ExecApprovalRequest(_)
| EventMsg::RequestPermissions(_)
| EventMsg::RequestUserInput(_)