From a6f435ea94e8a8ff516cdaf8f5fe5a5bdafee449 Mon Sep 17 00:00:00 2001 From: Curtis 'Fjord' Hawthorne Date: Wed, 10 Jun 2026 19:21:24 -0700 Subject: [PATCH] core: resize all history images behind a feature flag (#27247) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary Adds complete client-side image preparation behind the default-off `resize_all_images` feature flag. When enabled, local image producers defer decoding and resizing. Images are prepared centrally before insertion into conversation history, covering user input, `view_image`, and structured tool-output images. ## Behavior - Processes base64 `data:` images in messages and function/custom tool outputs. - Leaves non-data URLs, including HTTP(S) URLs, unchanged. - Applies image-detail budgets: - `high` and omitted: 2048px maximum dimension and 2.5K 32px patches. - `original`: 6000px maximum dimension and 10K 32px patches. - `auto`: uses the same 2048px / 2.5K-patch budget as high. - `low`: unsupported and replaced with an actionable placeholder. - Preserves original image bytes when no resize or format conversion is needed. - Enforces the shared 1 GiB encoded and decoded data-URL sanity limits. - Replaces only an image that fails preparation, preserving sibling content and tool-output metadata. - Uses bounded placeholders distinguishing generic processing failures, oversized images, and unsupported `low` detail. - Prepares resumed and forked history before installing it as live history without modifying persisted rollouts. ## Flag-Off Behavior When `resize_all_images` is disabled: - Existing local user-input and `view_image` processing remains unchanged. - Existing decoding and error behavior remains unchanged. - Arbitrary tool-output images are not processed. - HTTP(S) image URLs continue to be forwarded unchanged. #### [git stack](https://github.com/magus/git-stack-cli) - ✅ `1` https://github.com/openai/codex/pull/27245 - 👉 `2` https://github.com/openai/codex/pull/27247 - ⏳ `3` https://github.com/openai/codex/pull/27246 - ⏳ `4` https://github.com/openai/codex/pull/27266 --- codex-rs/core/config.schema.json | 6 + codex-rs/core/src/image_preparation.rs | 122 +++++++++++ codex-rs/core/src/image_preparation_tests.rs | 193 ++++++++++++++++++ codex-rs/core/src/lib.rs | 1 + codex-rs/core/src/prompt_debug.rs | 4 +- codex-rs/core/src/session/mod.rs | 46 ++++- codex-rs/core/src/session/tests.rs | 110 ++++++++++ .../core/src/tools/handlers/view_image.rs | 45 ++-- codex-rs/core/tests/suite/code_mode.rs | 123 ++++++++++- codex-rs/core/tests/suite/rmcp_client.rs | 109 ++++++++++ codex-rs/core/tests/suite/view_image.rs | 102 ++++++++- codex-rs/features/src/lib.rs | 8 + codex-rs/protocol/src/models.rs | 81 +++++--- codex-rs/utils/image/src/image_tests.rs | 6 +- codex-rs/utils/image/src/lib.rs | 9 +- 15 files changed, 907 insertions(+), 58 deletions(-) create mode 100644 codex-rs/core/src/image_preparation.rs create mode 100644 codex-rs/core/src/image_preparation_tests.rs diff --git a/codex-rs/core/config.schema.json b/codex-rs/core/config.schema.json index 9ed02fdd1..f61d22b3c 100644 --- a/codex-rs/core/config.schema.json +++ b/codex-rs/core/config.schema.json @@ -576,6 +576,9 @@ "request_rule": { "type": "boolean" }, + "resize_all_images": { + "type": "boolean" + }, "responses_websockets": { "type": "boolean" }, @@ -4708,6 +4711,9 @@ "request_rule": { "type": "boolean" }, + "resize_all_images": { + "type": "boolean" + }, "responses_websockets": { "type": "boolean" }, diff --git a/codex-rs/core/src/image_preparation.rs b/codex-rs/core/src/image_preparation.rs new file mode 100644 index 000000000..d992adb80 --- /dev/null +++ b/codex-rs/core/src/image_preparation.rs @@ -0,0 +1,122 @@ +use codex_protocol::models::ContentItem; +use codex_protocol::models::FunctionCallOutputContentItem; +use codex_protocol::models::ImageDetail; +use codex_protocol::models::ResponseItem; +use codex_utils_image::ImageProcessingError; +use codex_utils_image::PromptImageMode; +use codex_utils_image::PromptImageResizeLimits; +use codex_utils_image::load_data_url_for_prompt; +use tracing::warn; + +pub(crate) const IMAGE_PROCESSING_ERROR_PLACEHOLDER: &str = + "image content omitted because it could not be processed"; +const IMAGE_TOO_LARGE_PLACEHOLDER: &str = + "image content omitted because it exceeded the supported size limit; use a smaller image"; +const UNSUPPORTED_LOW_DETAIL_PLACEHOLDER: &str = "image content omitted because detail 'low' is not supported; use 'high', 'original', or 'auto'"; + +const HIGH_DETAIL_LIMITS: PromptImageResizeLimits = PromptImageResizeLimits { + max_dimension: 2048, + max_patches: 2_500, +}; +const ORIGINAL_DETAIL_LIMITS: PromptImageResizeLimits = PromptImageResizeLimits { + max_dimension: 6000, + max_patches: 10_000, +}; +#[derive(Debug, thiserror::Error)] +enum ImagePreparationError { + #[error("image detail `low` is not supported")] + UnsupportedLowDetail, + #[error(transparent)] + Processing(#[from] ImageProcessingError), +} + +impl ImagePreparationError { + fn placeholder(&self) -> &'static str { + match self { + ImagePreparationError::UnsupportedLowDetail => UNSUPPORTED_LOW_DETAIL_PLACEHOLDER, + ImagePreparationError::Processing(ImageProcessingError::ImageTooLarge { .. }) => { + IMAGE_TOO_LARGE_PLACEHOLDER + } + ImagePreparationError::Processing(_) => IMAGE_PROCESSING_ERROR_PLACEHOLDER, + } + } +} + +pub(crate) fn prepare_response_items(items: &mut [ResponseItem]) { + for item in items { + match item { + ResponseItem::Message { content, .. } => prepare_message_content(content), + ResponseItem::FunctionCallOutput { output, .. } + | ResponseItem::CustomToolCallOutput { output, .. } => { + if let Some(content) = output.content_items_mut() { + prepare_tool_output_content(content); + } + } + ResponseItem::Reasoning { .. } + | ResponseItem::AgentMessage { .. } + | ResponseItem::LocalShellCall { .. } + | ResponseItem::FunctionCall { .. } + | ResponseItem::ToolSearchCall { .. } + | ResponseItem::CustomToolCall { .. } + | ResponseItem::ToolSearchOutput { .. } + | ResponseItem::WebSearchCall { .. } + | ResponseItem::ImageGenerationCall { .. } + | ResponseItem::Compaction { .. } + | ResponseItem::CompactionTrigger + | ResponseItem::ContextCompaction { .. } + | ResponseItem::Other => {} + } + } +} + +fn prepare_message_content(items: &mut [ContentItem]) { + for item in items { + if let ContentItem::InputImage { image_url, detail } = item + && is_data_url(image_url) + && let Err(error) = prepare_image(image_url, *detail) + { + warn!(%error, "failed to prepare message image"); + *item = ContentItem::InputText { + text: error.placeholder().to_string(), + }; + } + } +} + +fn prepare_tool_output_content(items: &mut [FunctionCallOutputContentItem]) { + for item in items { + if let FunctionCallOutputContentItem::InputImage { image_url, detail } = item + && is_data_url(image_url) + && let Err(error) = prepare_image(image_url, *detail) + { + warn!(%error, "failed to prepare tool output image"); + *item = FunctionCallOutputContentItem::InputText { + text: error.placeholder().to_string(), + }; + } + } +} + +fn is_data_url(image_url: &str) -> bool { + image_url + .get(.."data:".len()) + .is_some_and(|prefix| prefix.eq_ignore_ascii_case("data:")) +} + +fn prepare_image( + image_url: &mut String, + detail: Option, +) -> Result<(), ImagePreparationError> { + let limits = match detail { + None | Some(ImageDetail::Auto | ImageDetail::High) => HIGH_DETAIL_LIMITS, + Some(ImageDetail::Original) => ORIGINAL_DETAIL_LIMITS, + Some(ImageDetail::Low) => return Err(ImagePreparationError::UnsupportedLowDetail), + }; + let image = load_data_url_for_prompt(image_url, PromptImageMode::ResizeWithLimits(limits))?; + *image_url = image.into_data_url(); + Ok(()) +} + +#[cfg(test)] +#[path = "image_preparation_tests.rs"] +mod tests; diff --git a/codex-rs/core/src/image_preparation_tests.rs b/codex-rs/core/src/image_preparation_tests.rs new file mode 100644 index 000000000..1d54973cc --- /dev/null +++ b/codex-rs/core/src/image_preparation_tests.rs @@ -0,0 +1,193 @@ +use std::io::Cursor; + +use base64::Engine; +use base64::engine::general_purpose::STANDARD as BASE64_STANDARD; +use codex_protocol::models::FunctionCallOutputBody; +use codex_protocol::models::FunctionCallOutputPayload; +use codex_utils_image::data_url_from_bytes; +use image::DynamicImage; +use image::GenericImageView; +use image::ImageBuffer; +use image::ImageFormat; +use image::Rgba; +use pretty_assertions::assert_eq; + +use super::*; + +fn png_data_url(width: u32, height: u32) -> (String, Vec) { + let image = ImageBuffer::from_pixel(width, height, Rgba([10u8, 20, 30, 255])); + let mut encoded = Cursor::new(Vec::new()); + DynamicImage::ImageRgba8(image) + .write_to(&mut encoded, ImageFormat::Png) + .expect("encode PNG"); + let bytes = encoded.into_inner(); + (data_url_from_bytes("image/png", &bytes), bytes) +} + +fn decoded_image(image_url: &str) -> (Vec, DynamicImage) { + let (_, payload) = image_url.split_once(',').expect("data URL payload"); + let bytes = BASE64_STANDARD.decode(payload).expect("decode image URL"); + let image = image::load_from_memory(&bytes).expect("decode processed image"); + (bytes, image) +} + +#[test] +fn preparation_preserves_small_image_bytes_and_non_data_urls() { + let (data_url, original_bytes) = png_data_url(/*width*/ 64, /*height*/ 32); + let http_url = "https://example.com/image.png".to_string(); + let mut items = vec![ResponseItem::Message { + id: None, + role: "user".to_string(), + content: vec![ + ContentItem::InputImage { + image_url: data_url, + detail: Some(ImageDetail::High), + }, + ContentItem::InputImage { + image_url: http_url.clone(), + detail: Some(ImageDetail::Low), + }, + ], + phase: None, + }]; + + prepare_response_items(&mut items); + + let ResponseItem::Message { content, .. } = &items[0] else { + panic!("expected message"); + }; + let [ + ContentItem::InputImage { image_url, .. }, + ContentItem::InputImage { + image_url: preserved_http_url, + detail: Some(ImageDetail::Low), + }, + ] = content.as_slice() + else { + panic!("expected two images"); + }; + assert_eq!(decoded_image(image_url).0, original_bytes); + assert_eq!(preserved_http_url, &http_url); +} + +#[test] +fn detail_policies_apply_the_expected_budgets() { + for (detail, input_dimensions, expected_dimensions) in [ + (Some(ImageDetail::High), (2048, 2048), (1600, 1600)), + (Some(ImageDetail::Original), (6401, 100), (6000, 94)), + (Some(ImageDetail::Original), (3201, 3201), (3200, 3200)), + (Some(ImageDetail::Auto), (2048, 2048), (1600, 1600)), + (None, (2048, 2048), (1600, 1600)), + ] { + let (image_url, _) = png_data_url(input_dimensions.0, input_dimensions.1); + let mut items = vec![ResponseItem::Message { + id: None, + role: "user".to_string(), + content: vec![ContentItem::InputImage { image_url, detail }], + phase: None, + }]; + + prepare_response_items(&mut items); + + let ResponseItem::Message { content, .. } = &items[0] else { + panic!("expected message"); + }; + let [ContentItem::InputImage { image_url, .. }] = content.as_slice() else { + panic!("expected image"); + }; + assert_eq!(decoded_image(image_url).1.dimensions(), expected_dimensions); + } +} + +#[test] +fn preparation_replaces_only_failed_tool_images_and_preserves_metadata() { + let (valid_image_url, _) = png_data_url(/*width*/ 64, /*height*/ 32); + let expected_valid_image_url = valid_image_url.clone(); + let mut items = vec![ResponseItem::CustomToolCallOutput { + call_id: "call-1".to_string(), + name: None, + output: FunctionCallOutputPayload { + body: FunctionCallOutputBody::ContentItems(vec![ + FunctionCallOutputContentItem::InputText { + text: "before".to_string(), + }, + FunctionCallOutputContentItem::InputImage { + image_url: "data:image/png;base64,%%%".to_string(), + detail: Some(ImageDetail::High), + }, + FunctionCallOutputContentItem::InputImage { + image_url: data_url_from_bytes("image/png", b"not an image"), + detail: Some(ImageDetail::High), + }, + FunctionCallOutputContentItem::InputImage { + image_url: valid_image_url.clone(), + detail: Some(ImageDetail::Low), + }, + FunctionCallOutputContentItem::InputImage { + image_url: valid_image_url, + detail: Some(ImageDetail::High), + }, + ]), + success: Some(true), + }, + }]; + + prepare_response_items(&mut items); + + assert_eq!( + items, + vec![ResponseItem::CustomToolCallOutput { + call_id: "call-1".to_string(), + name: None, + output: FunctionCallOutputPayload { + body: FunctionCallOutputBody::ContentItems(vec![ + FunctionCallOutputContentItem::InputText { + text: "before".to_string(), + }, + FunctionCallOutputContentItem::InputText { + text: IMAGE_PROCESSING_ERROR_PLACEHOLDER.to_string(), + }, + FunctionCallOutputContentItem::InputText { + text: IMAGE_PROCESSING_ERROR_PLACEHOLDER.to_string(), + }, + FunctionCallOutputContentItem::InputText { + text: UNSUPPORTED_LOW_DETAIL_PLACEHOLDER.to_string(), + }, + FunctionCallOutputContentItem::InputImage { + image_url: expected_valid_image_url, + detail: Some(ImageDetail::High), + }, + ]), + success: Some(true), + }, + }] + ); +} + +#[test] +fn preparation_errors_use_bounded_actionable_placeholders() { + let cases = [ + ( + ImagePreparationError::UnsupportedLowDetail, + UNSUPPORTED_LOW_DETAIL_PLACEHOLDER, + ), + ( + ImagePreparationError::Processing(ImageProcessingError::ImageTooLarge { + representation: "decoded input", + size: 2, + max: 1, + }), + IMAGE_TOO_LARGE_PLACEHOLDER, + ), + ( + ImagePreparationError::Processing(ImageProcessingError::InvalidDataUrl { + reason: "details remain in logs".to_string(), + }), + IMAGE_PROCESSING_ERROR_PLACEHOLDER, + ), + ]; + + for (error, expected) in cases { + assert_eq!(error.placeholder(), expected); + } +} diff --git a/codex-rs/core/src/lib.rs b/codex-rs/core/src/lib.rs index a9aad7a1e..1ffd87fce 100644 --- a/codex-rs/core/src/lib.rs +++ b/codex-rs/core/src/lib.rs @@ -42,6 +42,7 @@ mod exec_policy; mod git_info_tests; mod guardian; mod hook_runtime; +mod image_preparation; mod installation_id; pub(crate) mod landlock; pub use landlock::spawn_command_under_linux_sandbox; diff --git a/codex-rs/core/src/prompt_debug.rs b/codex-rs/core/src/prompt_debug.rs index f6bf7d002..617521608 100644 --- a/codex-rs/core/src/prompt_debug.rs +++ b/codex-rs/core/src/prompt_debug.rs @@ -5,7 +5,6 @@ use codex_exec_server::ExecServerRuntimePaths; use codex_login::AuthManager; use codex_protocol::error::CodexErr; use codex_protocol::error::Result as CodexResult; -use codex_protocol::models::ResponseInputItem; use codex_protocol::models::ResponseItem; use codex_protocol::protocol::SessionSource; use codex_protocol::user_input::UserInput; @@ -78,8 +77,7 @@ pub(crate) async fn build_prompt_input_from_session( .await; if !input.is_empty() { - let input_item = ResponseInputItem::from(input); - let response_item = ResponseItem::from(input_item); + let response_item = sess.response_item_from_user_input(turn_context.as_ref(), input); sess.record_conversation_items(turn_context.as_ref(), std::slice::from_ref(&response_item)) .await; } diff --git a/codex-rs/core/src/session/mod.rs b/codex-rs/core/src/session/mod.rs index 8424ce216..68d2ceaeb 100644 --- a/codex-rs/core/src/session/mod.rs +++ b/codex-rs/core/src/session/mod.rs @@ -1,3 +1,4 @@ +use std::borrow::Cow; use std::collections::BTreeMap; use std::collections::HashMap; use std::collections::HashSet; @@ -31,6 +32,7 @@ use crate::context::PersonalitySpecInstructions; use crate::default_skill_metadata_budget; use crate::environment_selection::ResolvedTurnEnvironments; use crate::exec_policy::ExecPolicyManager; +use crate::image_preparation::prepare_response_items; use crate::parse_turn_item; use crate::realtime_conversation::RealtimeConversationManager; use crate::session_prefix::format_subagent_notification_message; @@ -326,6 +328,7 @@ use codex_protocol::config_types::CollaborationMode; use codex_protocol::config_types::Personality; use codex_protocol::config_types::ReasoningSummary as ReasoningSummaryConfig; use codex_protocol::config_types::WindowsSandboxLevel; +use codex_protocol::models::LocalImagePreparation; use codex_protocol::models::ResponseInputItem; use codex_protocol::models::ResponseItem; use codex_protocol::openai_models::ReasoningEffort as ReasoningEffortConfig; @@ -1300,13 +1303,20 @@ impl Session { rollout_items: &[RolloutItem], ) -> Option { let rollout_reconstruction::RolloutReconstruction { - history, + mut history, previous_turn_settings, reference_context_item, window_id, } = self .reconstruct_history_from_rollout(turn_context, rollout_items) .await; + if turn_context.features.enabled(Feature::ResizeAllImages) { + // Keep the recorded rollout unchanged. Prepare its reconstructed history before + // installing it, so legacy images are processed once for this resume or fork and + // will be processed again if the rollout is reconstructed in a future session. + // This meets image resizing requirements without modifying persisted rollouts. + prepare_response_items(&mut history); + } { let mut state = self.state.lock().await; state.replace_history(history, reference_context_item); @@ -2583,11 +2593,43 @@ impl Session { /// Records conversation items: append to history, persist to rollout, and /// notify clients observing raw response items. + pub(crate) fn prepare_conversation_items_for_history<'a>( + &self, + turn_context: &TurnContext, + items: &'a [ResponseItem], + ) -> Cow<'a, [ResponseItem]> { + if !turn_context.features.enabled(Feature::ResizeAllImages) { + return Cow::Borrowed(items); + } + + let mut prepared_items = items.to_vec(); + prepare_response_items(&mut prepared_items); + Cow::Owned(prepared_items) + } + + pub(crate) fn response_item_from_user_input( + &self, + turn_context: &TurnContext, + input: Vec, + ) -> ResponseItem { + let local_image_preparation = if turn_context.features.enabled(Feature::ResizeAllImages) { + LocalImagePreparation::Defer + } else { + LocalImagePreparation::Process + }; + ResponseItem::from(ResponseInputItem::from_user_input( + input, + local_image_preparation, + )) + } + pub(crate) async fn record_conversation_items( &self, turn_context: &TurnContext, items: &[ResponseItem], ) { + let items = self.prepare_conversation_items_for_history(turn_context, items); + let items = items.as_ref(); { let mut state = self.state.lock().await; state.record_items(items.iter(), turn_context.truncation_policy); @@ -3211,7 +3253,7 @@ impl Session { // Persist the user message to history, but emit the turn item from `UserInput` so // UI-only `text_elements` are preserved. `ResponseItem::Message` does not carry // those spans, and `record_response_item_and_emit_turn_item` would drop them. - let response_item = ResponseItem::from(ResponseInputItem::from(input.to_vec())); + let response_item = self.response_item_from_user_input(turn_context, input.to_vec()); self.record_conversation_items(turn_context, std::slice::from_ref(&response_item)) .await; let mut user_message_item = UserMessageItem::new(input); diff --git a/codex-rs/core/src/session/tests.rs b/codex-rs/core/src/session/tests.rs index 29de7ca66..af2bfaa4c 100644 --- a/codex-rs/core/src/session/tests.rs +++ b/codex-rs/core/src/session/tests.rs @@ -42,7 +42,9 @@ use codex_protocol::models::ActivePermissionProfile; use codex_protocol::models::BUILT_IN_PERMISSION_PROFILE_WORKSPACE; use codex_protocol::models::FileSystemPermissions; use codex_protocol::models::FunctionCallOutputBody; +use codex_protocol::models::FunctionCallOutputContentItem; use codex_protocol::models::FunctionCallOutputPayload; +use codex_protocol::models::ImageDetail; use codex_protocol::models::PermissionProfile; use codex_protocol::models::SandboxEnforcement; use codex_protocol::openai_models::ModelServiceTier; @@ -1641,6 +1643,114 @@ async fn record_initial_history_reconstructs_resumed_transcript() { assert_eq!(expected, history.raw_items()); } +#[tokio::test] +async fn resize_all_images_prepares_failures_before_history_insertion() { + let (session, turn_context, _rx) = make_session_and_context_with_auth_and_config_and_rx( + CodexAuth::from_api_key("Test API Key"), + Vec::new(), + |config| { + let _ = config.features.enable(Feature::ResizeAllImages); + }, + ) + .await; + let item = ResponseItem::FunctionCallOutput { + call_id: "call-1".to_string(), + output: FunctionCallOutputPayload { + body: FunctionCallOutputBody::ContentItems(vec![ + FunctionCallOutputContentItem::InputText { + text: "before".to_string(), + }, + FunctionCallOutputContentItem::InputImage { + image_url: "data:image/png;base64,%%%".to_string(), + detail: Some(ImageDetail::High), + }, + FunctionCallOutputContentItem::InputImage { + image_url: "https://example.com/image.png".to_string(), + detail: Some(ImageDetail::High), + }, + ]), + success: Some(true), + }, + }; + + session + .record_conversation_items(turn_context.as_ref(), std::slice::from_ref(&item)) + .await; + + assert_eq!( + session.state.lock().await.clone_history().raw_items(), + &[ResponseItem::FunctionCallOutput { + call_id: "call-1".to_string(), + output: FunctionCallOutputPayload { + body: FunctionCallOutputBody::ContentItems(vec![ + FunctionCallOutputContentItem::InputText { + text: "before".to_string(), + }, + FunctionCallOutputContentItem::InputText { + text: "image content omitted because it could not be processed".to_string(), + }, + FunctionCallOutputContentItem::InputImage { + image_url: "https://example.com/image.png".to_string(), + detail: Some(ImageDetail::High), + }, + ]), + success: Some(true), + }, + }] + ); +} + +#[tokio::test] +async fn resize_all_images_prepares_resumed_history_before_installing_it() { + let (session, _turn_context, _rx) = make_session_and_context_with_auth_and_config_and_rx( + CodexAuth::from_api_key("Test API Key"), + Vec::new(), + |config| { + let _ = config.features.enable(Feature::ResizeAllImages); + }, + ) + .await; + let resumed_item = ResponseItem::Message { + id: None, + role: "user".to_string(), + content: vec![ + ContentItem::InputImage { + image_url: "data:image/png;base64,%%%".to_string(), + detail: Some(ImageDetail::High), + }, + ContentItem::InputText { + text: "keep me".to_string(), + }, + ], + phase: None, + }; + + session + .record_initial_history(InitialHistory::Resumed(ResumedHistory { + conversation_id: ThreadId::default(), + history: vec![RolloutItem::ResponseItem(resumed_item)], + rollout_path: Some(PathBuf::from("/tmp/resume.jsonl")), + })) + .await; + + assert_eq!( + session.state.lock().await.clone_history().raw_items(), + &[ResponseItem::Message { + id: None, + role: "user".to_string(), + content: vec![ + ContentItem::InputText { + text: "image content omitted because it could not be processed".to_string(), + }, + ContentItem::InputText { + text: "keep me".to_string(), + }, + ], + phase: None, + }] + ); +} + #[test] fn resolve_multi_agent_version_handles_unset_and_legacy_history() { let thread_id = ThreadId::default(); diff --git a/codex-rs/core/src/tools/handlers/view_image.rs b/codex-rs/core/src/tools/handlers/view_image.rs index 0432ad338..c3023b421 100644 --- a/codex-rs/core/src/tools/handlers/view_image.rs +++ b/codex-rs/core/src/tools/handlers/view_image.rs @@ -1,3 +1,4 @@ +use codex_features::Feature; use codex_protocol::items::ImageViewItem; use codex_protocol::items::TurnItem; use codex_protocol::models::DEFAULT_IMAGE_DETAIL; @@ -8,6 +9,7 @@ use codex_protocol::models::ImageDetail; use codex_protocol::models::ResponseInputItem; use codex_protocol::openai_models::InputModality; use codex_utils_image::PromptImageMode; +use codex_utils_image::data_url_from_bytes; use codex_utils_image::load_for_prompt_bytes; use serde::Deserialize; @@ -175,25 +177,30 @@ impl ViewImageHandler { let can_request_original_detail = can_request_original_image_detail(&turn.model_info); let use_original_detail = can_request_original_detail && matches!(detail, Some(ViewImageDetail::Original)); - let image_mode = if use_original_detail { - PromptImageMode::Original - } else { - PromptImageMode::ResizeToFit - }; let image_detail = if use_original_detail { ImageDetail::Original } else { DEFAULT_IMAGE_DETAIL }; - let image = - load_for_prompt_bytes(abs_path.as_path(), file_bytes, image_mode).map_err(|error| { - FunctionCallError::RespondToModel(format!( - "unable to process image at `{}`: {error}", - abs_path.display() - )) - })?; - let image_url = image.into_data_url(); + let image_url = if turn.features.enabled(Feature::ResizeAllImages) { + // The history insertion path owns image decoding and resizing when this is enabled. + data_url_from_bytes("application/octet-stream", &file_bytes) + } else { + let image_mode = if use_original_detail { + PromptImageMode::Original + } else { + PromptImageMode::ResizeToFit + }; + load_for_prompt_bytes(abs_path.as_path(), file_bytes, image_mode) + .map_err(|error| { + FunctionCallError::RespondToModel(format!( + "unable to process image at `{}`: {error}", + abs_path.display() + )) + })? + .into_data_url() + }; let item = TurnItem::ImageView(ImageViewItem { id: call_id, @@ -218,7 +225,7 @@ pub struct ViewImageOutput { impl ToolOutput for ViewImageOutput { fn log_preview(&self) -> String { - self.image_url.clone() + format!("", self.image_url.len()) } fn success_for_logging(&self) -> bool { @@ -264,6 +271,16 @@ mod tests { use std::sync::Arc; use tokio::sync::Mutex; + #[test] + fn log_preview_omits_image_data() { + let output = ViewImageOutput { + image_url: "data:image/png;base64,AAA".to_string(), + image_detail: DEFAULT_IMAGE_DETAIL, + }; + + assert_eq!(output.log_preview(), ""); + } + #[test] fn code_mode_result_returns_image_url_object() { let output = ViewImageOutput { diff --git a/codex-rs/core/tests/suite/code_mode.rs b/codex-rs/core/tests/suite/code_mode.rs index e1b02292e..fdd87274a 100644 --- a/codex-rs/core/tests/suite/code_mode.rs +++ b/codex-rs/core/tests/suite/code_mode.rs @@ -42,11 +42,16 @@ use core_test_support::test_codex::turn_permission_fields; use core_test_support::wait_for_event; use core_test_support::wait_for_event_match; use core_test_support::wait_for_mcp_server; +use image::DynamicImage; +use image::GenericImageView; +use image::ImageBuffer; +use image::Rgba; use pretty_assertions::assert_eq; use serde_json::Value; use std::collections::HashMap; use std::collections::HashSet; use std::fs; +use std::io::Cursor; use std::path::Path; use std::sync::Arc; use std::time::Duration; @@ -168,12 +173,21 @@ async fn run_code_mode_turn_with_config( code: &str, configure: impl FnOnce(&mut Config) + Send + 'static, ) -> Result<(TestCodex, ResponseMock)> { - let mut builder = test_codex() - .with_model("test-gpt-5.1-codex") - .with_config(move |config| { - let _ = config.features.enable(Feature::CodeMode); - configure(config); - }); + run_code_mode_turn_with_model_and_config(server, prompt, code, "test-gpt-5.1-codex", configure) + .await +} + +async fn run_code_mode_turn_with_model_and_config( + server: &MockServer, + prompt: &str, + code: &str, + model: &'static str, + configure: impl FnOnce(&mut Config) + Send + 'static, +) -> Result<(TestCodex, ResponseMock)> { + let mut builder = test_codex().with_model(model).with_config(move |config| { + let _ = config.features.enable(Feature::CodeMode); + configure(config); + }); let test = builder.build(server).await?; responses::mount_sse_once( @@ -2536,6 +2550,103 @@ image("data:image/png;base64,AAA"); Ok(()) } +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn resize_all_images_replaces_malformed_code_mode_image_only() -> Result<()> { + skip_if_no_network!(Ok(())); + + let server = responses::start_mock_server().await; + let (_test, second_mock) = run_code_mode_turn_with_config( + &server, + "use exec to return images", + r#" +image("https://example.com/image.jpg"); +image("data:image/png;base64,AAA"); +"#, + |config| { + let _ = config.features.enable(Feature::ResizeAllImages); + }, + ) + .await?; + + let req = second_mock.single_request(); + let items = custom_tool_output_items(&req, "call-1"); + let (_, success) = custom_tool_output_body_and_success(&req, "call-1"); + assert_ne!(success, Some(false)); + assert_eq!(items.len(), 3); + assert_eq!( + items[1], + serde_json::json!({ + "type": "input_image", + "image_url": "https://example.com/image.jpg", + "detail": "high" + }) + ); + assert_eq!( + items[2], + serde_json::json!({ + "type": "input_text", + "text": "image content omitted because it could not be processed" + }) + ); + + Ok(()) +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn resize_all_images_resizes_explicit_original_code_mode_image() -> Result<()> { + skip_if_no_network!(Ok(())); + + let original_dimensions = (6401, 100); + let image = ImageBuffer::from_pixel( + original_dimensions.0, + original_dimensions.1, + Rgba([20, 40, 60, 255]), + ); + let mut encoded = Cursor::new(Vec::new()); + DynamicImage::ImageRgba8(image).write_to(&mut encoded, image::ImageFormat::Png)?; + let image_data_url = format!( + "data:image/png;base64,{}", + BASE64_STANDARD.encode(encoded.into_inner()) + ); + let code = format!( + "image({}, \"original\");", + serde_json::to_string(&image_data_url)? + ); + + let server = responses::start_mock_server().await; + let (_test, second_mock) = run_code_mode_turn_with_model_and_config( + &server, + "use exec to return a large original-detail image", + &code, + "gpt-5.3-codex", + |config| { + config + .features + .enable(Feature::ResizeAllImages) + .expect("resize_all_images should be enabled"); + }, + ) + .await?; + + let req = second_mock.single_request(); + let items = custom_tool_output_items(&req, "call-1"); + let (_, success) = custom_tool_output_body_and_success(&req, "call-1"); + assert_ne!(success, Some(false)); + let resized_url = items[1]["image_url"] + .as_str() + .expect("code mode image output should contain a data URL"); + assert_eq!(items[1]["detail"], "original"); + let (_, resized_base64) = resized_url + .split_once(',') + .expect("resized image should contain a data URL prefix"); + let resized_bytes = BASE64_STANDARD.decode(resized_base64)?; + let resized = image::load_from_memory(&resized_bytes)?; + let resized_dimensions = resized.dimensions(); + assert_eq!(resized_dimensions, (6000, 94)); + + Ok(()) +} + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn code_mode_can_use_view_image_result_with_image_helper() -> Result<()> { skip_if_no_network!(Ok(())); diff --git a/codex-rs/core/tests/suite/rmcp_client.rs b/codex-rs/core/tests/suite/rmcp_client.rs index c47aa0970..9f3fc16c3 100644 --- a/codex-rs/core/tests/suite/rmcp_client.rs +++ b/codex-rs/core/tests/suite/rmcp_client.rs @@ -2,6 +2,8 @@ use anyhow::Context as _; use anyhow::ensure; +use base64::Engine; +use base64::engine::general_purpose::STANDARD as BASE64_STANDARD; use core_test_support::test_codex::local_selections; use std::collections::HashMap; use std::ffi::OsStr; @@ -25,6 +27,7 @@ use codex_core::config::Config; use codex_exec_server::CreateDirectoryOptions; use codex_exec_server::Environment; use codex_exec_server::HttpRequestParams; +use codex_features::Feature; use codex_login::CodexAuth; use codex_mcp::MCP_SANDBOX_STATE_META_CAPABILITY; use codex_models_manager::manager::RefreshStrategy; @@ -57,11 +60,16 @@ use core_test_support::test_codex::test_codex; use core_test_support::test_codex::turn_permission_fields; use core_test_support::wait_for_event; use core_test_support::wait_for_mcp_server; +use image::DynamicImage; +use image::GenericImageView; +use image::ImageBuffer; +use image::Rgba; use reqwest::Client; use reqwest::StatusCode; use serde_json::Value; use serde_json::json; use serial_test::serial; +use std::io::Cursor; use tempfile::tempdir; use tokio::process::Child; use tokio::process::Command; @@ -1257,6 +1265,107 @@ async fn stdio_image_responses_round_trip() -> anyhow::Result<()> { Ok(()) } +#[tokio::test(flavor = "multi_thread", worker_threads = 1)] +#[serial(mcp_test_value)] +async fn stdio_image_responses_resize_large_image() -> anyhow::Result<()> { + skip_if_no_network!(Ok(())); + + let server = responses::start_mock_server().await; + let call_id = "img-resize-1"; + let server_name = "rmcp"; + let namespace = format!("mcp__{server_name}"); + + let original_dimensions = (3000, 2000); + let image = ImageBuffer::from_pixel( + original_dimensions.0, + original_dimensions.1, + Rgba([20, 40, 60, 255]), + ); + let mut encoded = Cursor::new(Vec::new()); + DynamicImage::ImageRgba8(image).write_to(&mut encoded, image::ImageFormat::Png)?; + let image_data_url = format!( + "data:image/png;base64,{}", + BASE64_STANDARD.encode(encoded.into_inner()) + ); + let tool_arguments = serde_json::to_string(&json!({ + "scenario": "image_only", + "data_url": image_data_url, + }))?; + + mount_sse_once( + &server, + responses::sse(vec![ + responses::ev_response_created("resp-1"), + responses::ev_function_call_with_namespace( + call_id, + &namespace, + "image_scenario", + &tool_arguments, + ), + responses::ev_completed("resp-1"), + ]), + ) + .await; + let final_mock = mount_sse_once( + &server, + responses::sse(vec![ + responses::ev_assistant_message("msg-1", "done"), + responses::ev_completed("resp-2"), + ]), + ) + .await; + + let rmcp_test_server_bin = remote_aware_stdio_server_bin()?; + let fixture = test_codex() + .with_config(move |config| { + config + .features + .enable(Feature::ResizeAllImages) + .expect("resize_all_images should be enabled"); + insert_mcp_server( + config, + server_name, + stdio_transport(rmcp_test_server_bin, /*env*/ None, Vec::new()), + TestMcpServerOptions { + environment_id: remote_aware_environment_id(), + ..Default::default() + }, + ); + }) + .build_with_remote_env(&server) + .await?; + wait_for_mcp_server(&fixture.codex, server_name).await?; + + fixture + .codex + .submit(read_only_user_turn( + &fixture, + "call the rmcp image_scenario tool", + )) + .await?; + wait_for_event(&fixture.codex, |ev| matches!(ev, EventMsg::TurnComplete(_))).await; + + let output_item = final_mock.single_request().function_call_output(call_id); + assert_eq!(output_item["call_id"], call_id); + let output = output_item["output"] + .as_array() + .expect("image MCP output should be content items"); + let resized_url = output[1]["image_url"] + .as_str() + .expect("MCP image output should contain a data URL"); + assert_eq!(output[1]["detail"], "high"); + let (_, resized_base64) = resized_url + .split_once(',') + .expect("resized image should contain a data URL prefix"); + let resized_bytes = BASE64_STANDARD.decode(resized_base64)?; + let resized = image::load_from_memory(&resized_bytes)?; + let resized_dimensions = resized.dimensions(); + assert_eq!(resized_dimensions, (1920, 1280)); + + server.verify().await; + Ok(()) +} + #[tokio::test(flavor = "multi_thread", worker_threads = 1)] #[serial(mcp_test_value)] async fn stdio_image_responses_preserve_original_detail_metadata() -> anyhow::Result<()> { diff --git a/codex-rs/core/tests/suite/view_image.rs b/codex-rs/core/tests/suite/view_image.rs index 582078856..02fa6984b 100644 --- a/codex-rs/core/tests/suite/view_image.rs +++ b/codex-rs/core/tests/suite/view_image.rs @@ -7,6 +7,7 @@ use codex_exec_server::CreateDirectoryOptions; use codex_exec_server::LOCAL_ENVIRONMENT_ID; use codex_exec_server::REMOTE_ENVIRONMENT_ID; use codex_exec_server::RemoveOptions; +use codex_features::Feature; use codex_login::CodexAuth; use codex_protocol::config_types::ReasoningSummary; use codex_protocol::models::PermissionProfile; @@ -177,10 +178,15 @@ async fn write_workspace_png( async fn assert_user_turn_local_image_resizes_to( original_dimensions: (u32, u32), expected_dimensions: (u32, u32), + resize_policy: TestImageResizePolicy, ) -> anyhow::Result<()> { let server = start_mock_server().await; - let mut builder = test_codex(); + let mut builder = test_codex().with_config(move |config| { + if resize_policy == TestImageResizePolicy::AllImages { + let _ = config.features.enable(Feature::ResizeAllImages); + } + }); let test = builder.build_with_remote_env(&server).await?; let TestCodex { codex, @@ -254,18 +260,42 @@ async fn assert_user_turn_local_image_resizes_to( Ok(()) } +#[derive(Clone, Copy, Eq, PartialEq)] +enum TestImageResizePolicy { + Legacy, + AllImages, +} + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn user_turn_with_local_image_attaches_image() -> anyhow::Result<()> { skip_if_no_network!(Ok(())); - assert_user_turn_local_image_resizes_to((2304, 864), (2048, 768)).await + assert_user_turn_local_image_resizes_to((2304, 864), (2048, 768), TestImageResizePolicy::Legacy) + .await } #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn user_turn_with_vertical_local_image_resizes_to_square_bounds() -> anyhow::Result<()> { skip_if_no_network!(Ok(())); - assert_user_turn_local_image_resizes_to((1024, 4096), (512, 2048)).await + assert_user_turn_local_image_resizes_to( + (1024, 4096), + (512, 2048), + TestImageResizePolicy::Legacy, + ) + .await +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn resize_all_images_applies_patch_budget_to_local_user_image() -> anyhow::Result<()> { + skip_if_no_network!(Ok(())); + + assert_user_turn_local_image_resizes_to( + (2048, 2048), + (1600, 1600), + TestImageResizePolicy::AllImages, + ) + .await } #[tokio::test(flavor = "multi_thread", worker_threads = 2)] @@ -1248,6 +1278,72 @@ async fn view_image_tool_errors_for_non_image_files() -> anyhow::Result<()> { Ok(()) } +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn resize_all_images_turns_invalid_view_image_into_placeholder() -> anyhow::Result<()> { + skip_if_no_network!(Ok(())); + + let server = start_mock_server().await; + let mut builder = test_codex().with_config(|config| { + let _ = config.features.enable(Feature::ResizeAllImages); + }); + let test = builder.build_with_remote_env(&server).await?; + let TestCodex { + codex, + session_configured, + .. + } = &test; + + let rel_path = "assets/invalid-image.json"; + write_workspace_file(&test, rel_path, br#"{ "message": "hello" }"#.to_vec()).await?; + let call_id = "view-image-invalid-placeholder"; + let arguments = serde_json::json!({ "path": rel_path }).to_string(); + + responses::mount_sse_once( + &server, + sse(vec![ + ev_response_created("resp-1"), + ev_function_call(call_id, "view_image", &arguments), + ev_completed("resp-1"), + ]), + ) + .await; + let second_mock = responses::mount_sse_once( + &server, + sse(vec![ + ev_assistant_message("msg-1", "done"), + ev_completed("resp-2"), + ]), + ) + .await; + + codex + .submit(disabled_user_turn( + &test, + vec![UserInput::Text { + text: "please inspect the image".into(), + text_elements: Vec::new(), + }], + session_configured.model.clone(), + )) + .await?; + wait_for_event_with_timeout( + codex, + |event| matches!(event, EventMsg::TurnComplete(_)), + VIEW_IMAGE_TURN_COMPLETE_TIMEOUT, + ) + .await; + + let request = second_mock.single_request(); + assert_eq!( + request.function_call_output(call_id).get("output"), + Some(&serde_json::json!([{ + "type": "input_text", + "text": "image content omitted because it could not be processed" + }])) + ); + Ok(()) +} + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn view_image_tool_errors_when_file_missing() -> anyhow::Result<()> { skip_if_no_network!(Ok(())); diff --git a/codex-rs/features/src/lib.rs b/codex-rs/features/src/lib.rs index b30c25516..3159d50a3 100644 --- a/codex-rs/features/src/lib.rs +++ b/codex-rs/features/src/lib.rs @@ -183,6 +183,8 @@ pub enum Feature { ImageGeneration, /// Replace hosted image generation with the standalone image-generation extension. ImageGenExt, + /// Resize all inline data-URL images before recording them in history. + ResizeAllImages, /// Allow prompting and installing missing MCP dependencies. SkillMcpDependencyInstall, /// Removed compatibility flag for deleted skill env var dependency prompting. @@ -1086,6 +1088,12 @@ pub const FEATURES: &[FeatureSpec] = &[ stage: Stage::UnderDevelopment, default_enabled: false, }, + FeatureSpec { + id: Feature::ResizeAllImages, + key: "resize_all_images", + stage: Stage::UnderDevelopment, + default_enabled: false, + }, FeatureSpec { id: Feature::SkillMcpDependencyInstall, key: "skill_mcp_dependency_install", diff --git a/codex-rs/protocol/src/models.rs b/codex-rs/protocol/src/models.rs index 8cece0d83..5df6d1e77 100644 --- a/codex-rs/protocol/src/models.rs +++ b/codex-rs/protocol/src/models.rs @@ -4,6 +4,7 @@ use std::num::NonZeroUsize; use std::path::Path; use codex_utils_image::PromptImageMode; +use codex_utils_image::data_url_from_bytes; use codex_utils_image::load_for_prompt_bytes; use serde::Deserialize; use serde::Deserializer; @@ -1088,24 +1089,7 @@ pub fn local_image_content_items_with_label_number( }; match load_for_prompt_bytes(path, file_bytes, mode) { - Ok(image) => { - let mut items = Vec::with_capacity(3); - if let Some(label_number) = label_number { - items.push(ContentItem::InputText { - text: local_image_open_tag_text_with_path(label_number, path), - }); - } - items.push(ContentItem::InputImage { - image_url: image.into_data_url(), - detail: Some(detail), - }); - if label_number.is_some() { - items.push(ContentItem::InputText { - text: LOCAL_IMAGE_CLOSE_TAG.to_string(), - }); - } - items - } + Ok(image) => local_image_content_items(path, image.into_data_url(), label_number, detail), Err(err) => match &err { ImageProcessingError::Read { .. } | ImageProcessingError::Encode { .. } @@ -1126,6 +1110,36 @@ pub fn local_image_content_items_with_label_number( } } +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum LocalImagePreparation { + Process, + Defer, +} + +fn local_image_content_items( + path: &std::path::Path, + image_url: String, + label_number: Option, + detail: ImageDetail, +) -> Vec { + let mut items = Vec::with_capacity(3); + if let Some(label_number) = label_number { + items.push(ContentItem::InputText { + text: local_image_open_tag_text_with_path(label_number, path), + }); + } + items.push(ContentItem::InputImage { + image_url, + detail: Some(detail), + }); + if label_number.is_some() { + items.push(ContentItem::InputText { + text: LOCAL_IMAGE_CLOSE_TAG.to_string(), + }); + } + items +} + impl From for ResponseItem { fn from(item: ResponseInputItem) -> Self { match item { @@ -1238,6 +1252,15 @@ pub enum ReasoningItemContent { impl From> for ResponseInputItem { fn from(items: Vec) -> Self { + Self::from_user_input(items, LocalImagePreparation::Process) + } +} + +impl ResponseInputItem { + pub fn from_user_input( + items: Vec, + local_image_preparation: LocalImagePreparation, + ) -> Self { let mut image_index = 0; Self::Message { role: "user".to_string(), @@ -1259,12 +1282,22 @@ impl From> for ResponseInputItem { image_index += 1; let detail = detail.unwrap_or(DEFAULT_IMAGE_DETAIL); match std::fs::read(&path) { - Ok(file_bytes) => local_image_content_items_with_label_number( - &path, - file_bytes, - Some(image_index), - detail, - ), + Ok(file_bytes) => match local_image_preparation { + LocalImagePreparation::Process => { + local_image_content_items_with_label_number( + &path, + file_bytes, + Some(image_index), + detail, + ) + } + LocalImagePreparation::Defer => local_image_content_items( + &path, + data_url_from_bytes("application/octet-stream", &file_bytes), + Some(image_index), + detail, + ), + }, Err(err) => vec![local_image_error_placeholder(&path, err)], } } diff --git a/codex-rs/utils/image/src/image_tests.rs b/codex-rs/utils/image/src/image_tests.rs index 7fdc72c16..e3205c95c 100644 --- a/codex-rs/utils/image/src/image_tests.rs +++ b/codex-rs/utils/image/src/image_tests.rs @@ -105,8 +105,7 @@ async fn preserves_large_image_in_original_mode() { async fn data_url_processing_preserves_supported_source_bytes() { let image = ImageBuffer::from_pixel(64, 32, Rgba([10u8, 20, 30, 255])); let original_bytes = image_bytes(&image, ImageFormat::Png); - let encoded = BASE64_STANDARD.encode(&original_bytes); - let image_url = format!("data:image/png;base64,{encoded}") + let image_url = data_url_from_bytes("image/png", &original_bytes) .replacen("data:", "DATA:", 1) .replacen(";base64,", ";BASE64,", 1); @@ -123,8 +122,7 @@ async fn data_url_processing_preserves_supported_source_bytes() { async fn data_url_processing_converts_gif_to_png() { let image = ImageBuffer::from_pixel(64, 32, Rgba([10u8, 20, 30, 255])); let gif_bytes = image_bytes(&image, ImageFormat::Gif); - let encoded = BASE64_STANDARD.encode(&gif_bytes); - let image_url = format!("data:image/gif;base64,{encoded}"); + let image_url = data_url_from_bytes("image/gif", &gif_bytes); let processed = load_data_url_for_prompt(&image_url, PromptImageMode::ResizeToFit) .expect("process GIF data URL"); diff --git a/codex-rs/utils/image/src/lib.rs b/codex-rs/utils/image/src/lib.rs index 5f9337725..a7430fe04 100644 --- a/codex-rs/utils/image/src/lib.rs +++ b/codex-rs/utils/image/src/lib.rs @@ -40,11 +40,16 @@ pub struct EncodedImage { impl EncodedImage { pub fn into_data_url(self) -> String { - let encoded = BASE64_STANDARD.encode(&self.bytes); - format!("data:{};base64,{encoded}", self.mime) + data_url_from_bytes(&self.mime, &self.bytes) } } +/// Wraps image bytes in a data URL without decoding or validating them. +pub fn data_url_from_bytes(mime: &str, bytes: &[u8]) -> String { + let encoded = BASE64_STANDARD.encode(bytes); + format!("data:{mime};base64,{encoded}") +} + #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum PromptImageMode { ResizeToFit,