diff --git a/codex-rs/Cargo.lock b/codex-rs/Cargo.lock index d9ee0f608..b0c1e3842 100644 --- a/codex-rs/Cargo.lock +++ b/codex-rs/Cargo.lock @@ -1820,8 +1820,10 @@ dependencies = [ "codex-state", "codex-test-macros", "codex-utils-absolute-path", + "codex-utils-cache", "codex-utils-cargo-bin", "codex-utils-home-dir", + "codex-utils-image", "codex-utils-pty", "codex-utils-readiness", "codex-utils-stream-parser", diff --git a/codex-rs/app-server-protocol/schema/json/ClientRequest.json b/codex-rs/app-server-protocol/schema/json/ClientRequest.json index 40745ce66..84ee3d5c1 100644 --- a/codex-rs/app-server-protocol/schema/json/ClientRequest.json +++ b/codex-rs/app-server-protocol/schema/json/ClientRequest.json @@ -514,6 +514,16 @@ }, { "properties": { + "detail": { + "anyOf": [ + { + "$ref": "#/definitions/ImageDetail" + }, + { + "type": "null" + } + ] + }, "image_url": { "type": "string" }, @@ -627,6 +637,15 @@ ], "type": "string" }, + "ImageDetail": { + "enum": [ + "auto", + "low", + "high", + "original" + ], + "type": "string" + }, "InitializeCapabilities": { "description": "Client-declared capabilities negotiated during initialize.", "properties": { diff --git a/codex-rs/app-server-protocol/schema/json/EventMsg.json b/codex-rs/app-server-protocol/schema/json/EventMsg.json index 4258bf7c2..85de9c0da 100644 --- a/codex-rs/app-server-protocol/schema/json/EventMsg.json +++ b/codex-rs/app-server-protocol/schema/json/EventMsg.json @@ -3478,6 +3478,16 @@ }, { "properties": { + "detail": { + "anyOf": [ + { + "$ref": "#/definitions/ImageDetail" + }, + { + "type": "null" + } + ] + }, "image_url": { "type": "string" }, @@ -3569,6 +3579,15 @@ ], "type": "object" }, + "ImageDetail": { + "enum": [ + "auto", + "low", + "high", + "original" + ], + "type": "string" + }, "LocalShellAction": { "oneOf": [ { diff --git a/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.schemas.json b/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.schemas.json index 4695d2a81..a60e6c4a4 100644 --- a/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.schemas.json +++ b/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.schemas.json @@ -9700,6 +9700,16 @@ }, { "properties": { + "detail": { + "anyOf": [ + { + "$ref": "#/definitions/v2/ImageDetail" + }, + { + "type": "null" + } + ] + }, "image_url": { "type": "string" }, @@ -9865,6 +9875,15 @@ ], "type": "string" }, + "ImageDetail": { + "enum": [ + "auto", + "low", + "high", + "original" + ], + "type": "string" + }, "InputModality": { "description": "Canonical user-input modality tags advertised by a model.", "oneOf": [ diff --git a/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.v2.schemas.json b/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.v2.schemas.json index e1f4c85d1..b4a5f9f6c 100644 --- a/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.v2.schemas.json +++ b/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.v2.schemas.json @@ -6542,6 +6542,16 @@ }, { "properties": { + "detail": { + "anyOf": [ + { + "$ref": "#/definitions/ImageDetail" + }, + { + "type": "null" + } + ] + }, "image_url": { "type": "string" }, @@ -6828,6 +6838,15 @@ ], "type": "object" }, + "ImageDetail": { + "enum": [ + "auto", + "low", + "high", + "original" + ], + "type": "string" + }, "InitializeCapabilities": { "description": "Client-declared capabilities negotiated during initialize.", "properties": { diff --git a/codex-rs/app-server-protocol/schema/json/v2/RawResponseItemCompletedNotification.json b/codex-rs/app-server-protocol/schema/json/v2/RawResponseItemCompletedNotification.json index 4717ff266..cedfdb19c 100644 --- a/codex-rs/app-server-protocol/schema/json/v2/RawResponseItemCompletedNotification.json +++ b/codex-rs/app-server-protocol/schema/json/v2/RawResponseItemCompletedNotification.json @@ -103,6 +103,16 @@ }, { "properties": { + "detail": { + "anyOf": [ + { + "$ref": "#/definitions/ImageDetail" + }, + { + "type": "null" + } + ] + }, "image_url": { "type": "string" }, @@ -173,6 +183,15 @@ ], "type": "object" }, + "ImageDetail": { + "enum": [ + "auto", + "low", + "high", + "original" + ], + "type": "string" + }, "LocalShellAction": { "oneOf": [ { diff --git a/codex-rs/app-server-protocol/schema/json/v2/ThreadResumeParams.json b/codex-rs/app-server-protocol/schema/json/v2/ThreadResumeParams.json index 353bda693..91850f60e 100644 --- a/codex-rs/app-server-protocol/schema/json/v2/ThreadResumeParams.json +++ b/codex-rs/app-server-protocol/schema/json/v2/ThreadResumeParams.json @@ -145,6 +145,16 @@ }, { "properties": { + "detail": { + "anyOf": [ + { + "$ref": "#/definitions/ImageDetail" + }, + { + "type": "null" + } + ] + }, "image_url": { "type": "string" }, @@ -215,6 +225,15 @@ ], "type": "object" }, + "ImageDetail": { + "enum": [ + "auto", + "low", + "high", + "original" + ], + "type": "string" + }, "LocalShellAction": { "oneOf": [ { diff --git a/codex-rs/app-server-protocol/schema/typescript/FunctionCallOutputContentItem.ts b/codex-rs/app-server-protocol/schema/typescript/FunctionCallOutputContentItem.ts index 8bfb6993d..fb2996f1e 100644 --- a/codex-rs/app-server-protocol/schema/typescript/FunctionCallOutputContentItem.ts +++ b/codex-rs/app-server-protocol/schema/typescript/FunctionCallOutputContentItem.ts @@ -1,9 +1,10 @@ // GENERATED CODE! DO NOT MODIFY BY HAND! // This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually. +import type { ImageDetail } from "./ImageDetail"; /** * Responses API compatible content items that can be returned by a tool call. * This is a subset of ContentItem with the types we support as function call outputs. */ -export type FunctionCallOutputContentItem = { "type": "input_text", text: string, } | { "type": "input_image", image_url: string, }; +export type FunctionCallOutputContentItem = { "type": "input_text", text: string, } | { "type": "input_image", image_url: string, detail?: ImageDetail, }; diff --git a/codex-rs/app-server-protocol/schema/typescript/ImageDetail.ts b/codex-rs/app-server-protocol/schema/typescript/ImageDetail.ts new file mode 100644 index 000000000..a48f07c08 --- /dev/null +++ b/codex-rs/app-server-protocol/schema/typescript/ImageDetail.ts @@ -0,0 +1,5 @@ +// GENERATED CODE! DO NOT MODIFY BY HAND! + +// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually. + +export type ImageDetail = "auto" | "low" | "high" | "original"; diff --git a/codex-rs/app-server-protocol/schema/typescript/index.ts b/codex-rs/app-server-protocol/schema/typescript/index.ts index adaeb4c26..228998cdd 100644 --- a/codex-rs/app-server-protocol/schema/typescript/index.ts +++ b/codex-rs/app-server-protocol/schema/typescript/index.ts @@ -83,6 +83,7 @@ export type { GitDiffToRemoteParams } from "./GitDiffToRemoteParams"; export type { GitDiffToRemoteResponse } from "./GitDiffToRemoteResponse"; export type { GitSha } from "./GitSha"; export type { HistoryEntry } from "./HistoryEntry"; +export type { ImageDetail } from "./ImageDetail"; export type { InitializeCapabilities } from "./InitializeCapabilities"; export type { InitializeParams } from "./InitializeParams"; export type { InitializeResponse } from "./InitializeResponse"; diff --git a/codex-rs/app-server/tests/common/models_cache.rs b/codex-rs/app-server/tests/common/models_cache.rs index 0de8fda5f..0c2a2e585 100644 --- a/codex-rs/app-server/tests/common/models_cache.rs +++ b/codex-rs/app-server/tests/common/models_cache.rs @@ -38,6 +38,7 @@ fn preset_to_info(preset: &ModelPreset, priority: i32) -> ModelInfo { apply_patch_tool_type: None, truncation_policy: TruncationPolicyConfig::bytes(10_000), supports_parallel_tool_calls: false, + supports_image_detail_original: false, context_window: Some(272_000), auto_compact_token_limit: None, effective_context_window_percent: 95, diff --git a/codex-rs/app-server/tests/suite/v2/dynamic_tools.rs b/codex-rs/app-server/tests/suite/v2/dynamic_tools.rs index 52a597f66..f3945c2b4 100644 --- a/codex-rs/app-server/tests/suite/v2/dynamic_tools.rs +++ b/codex-rs/app-server/tests/suite/v2/dynamic_tools.rs @@ -399,7 +399,10 @@ async fn dynamic_tool_call_round_trip_sends_content_items_to_model() -> Result<( FunctionCallOutputContentItem::InputText { text } } DynamicToolCallOutputContentItem::InputImage { image_url } => { - FunctionCallOutputContentItem::InputImage { image_url } + FunctionCallOutputContentItem::InputImage { + image_url, + detail: None, + } } }) .collect::>(); diff --git a/codex-rs/codex-api/src/endpoint/models.rs b/codex-rs/codex-api/src/endpoint/models.rs index 5d1c5fb12..ab90fc438 100644 --- a/codex-rs/codex-api/src/endpoint/models.rs +++ b/codex-rs/codex-api/src/endpoint/models.rs @@ -208,6 +208,7 @@ mod tests { "apply_patch_tool_type": null, "truncation_policy": {"mode": "bytes", "limit": 10_000}, "supports_parallel_tool_calls": false, + "supports_image_detail_original": false, "context_window": 272_000, "experimental_supported_tools": [], })) diff --git a/codex-rs/codex-api/tests/models_integration.rs b/codex-rs/codex-api/tests/models_integration.rs index 2b61f0de6..3d9133700 100644 --- a/codex-rs/codex-api/tests/models_integration.rs +++ b/codex-rs/codex-api/tests/models_integration.rs @@ -86,6 +86,7 @@ async fn models_client_hits_models_endpoint() { apply_patch_tool_type: None, truncation_policy: TruncationPolicyConfig::bytes(10_000), supports_parallel_tool_calls: false, + supports_image_detail_original: false, context_window: Some(272_000), auto_compact_token_limit: None, effective_context_window_percent: 95, diff --git a/codex-rs/core/Cargo.toml b/codex-rs/core/Cargo.toml index c3a37bb98..eafedc9d7 100644 --- a/codex-rs/core/Cargo.toml +++ b/codex-rs/core/Cargo.toml @@ -48,6 +48,8 @@ codex-protocol = { workspace = true } codex-rmcp-client = { workspace = true } codex-state = { workspace = true } codex-utils-absolute-path = { workspace = true } +codex-utils-cache = { workspace = true } +codex-utils-image = { workspace = true } codex-utils-home-dir = { workspace = true } codex-utils-pty = { workspace = true } codex-utils-readiness = { workspace = true } @@ -64,6 +66,7 @@ eventsource-stream = { workspace = true } futures = { workspace = true } http = { workspace = true } iana-time-zone = { workspace = true } +image = { workspace = true, features = ["jpeg", "png", "webp"] } indexmap = { workspace = true } keyring = { workspace = true, features = ["crypto-rust"] } libc = { workspace = true } @@ -88,7 +91,6 @@ sha2 = { workspace = true } shlex = { workspace = true } similar = { workspace = true } tempfile = { workspace = true } -test-case = "3.3.1" test-log = { workspace = true } thiserror = { workspace = true } time = { workspace = true, features = [ @@ -157,11 +159,11 @@ codex-test-macros = { workspace = true } codex-utils-cargo-bin = { workspace = true } core_test_support = { workspace = true } ctor = { workspace = true } -image = { workspace = true, features = ["jpeg", "png"] } insta = { workspace = true } maplit = { workspace = true } predicates = { workspace = true } pretty_assertions = { workspace = true } +test-case = "3.3.1" opentelemetry_sdk = { workspace = true, features = [ "experimental_metrics_custom_reader", "metrics", diff --git a/codex-rs/core/config.schema.json b/codex-rs/core/config.schema.json index 74af6d50c..749f6de23 100644 --- a/codex-rs/core/config.schema.json +++ b/codex-rs/core/config.schema.json @@ -355,6 +355,9 @@ "fast_mode": { "type": "boolean" }, + "image_detail_original": { + "type": "boolean" + }, "include_apply_patch_tool": { "type": "boolean" }, @@ -1739,6 +1742,9 @@ "fast_mode": { "type": "boolean" }, + "image_detail_original": { + "type": "boolean" + }, "include_apply_patch_tool": { "type": "boolean" }, diff --git a/codex-rs/core/models.json b/codex-rs/core/models.json index 2fd55b986..04660bf46 100644 --- a/codex-rs/core/models.json +++ b/codex-rs/core/models.json @@ -15,6 +15,7 @@ "limit": 10000 }, "supports_parallel_tool_calls": true, + "supports_image_detail_original": true, "context_window": 272000, "reasoning_summary_format": "experimental", "slug": "gpt-5.3-codex", @@ -83,6 +84,7 @@ "limit": 10000 }, "supports_parallel_tool_calls": true, + "supports_image_detail_original": false, "context_window": 272000, "reasoning_summary_format": "experimental", "slug": "gpt-5.2-codex", @@ -155,6 +157,7 @@ "limit": 10000 }, "supports_parallel_tool_calls": false, + "supports_image_detail_original": false, "context_window": 272000, "reasoning_summary_format": "experimental", "slug": "gpt-5.1-codex-max", @@ -220,6 +223,7 @@ "limit": 10000 }, "supports_parallel_tool_calls": false, + "supports_image_detail_original": false, "context_window": 272000, "reasoning_summary_format": "experimental", "slug": "gpt-5.1-codex", @@ -281,6 +285,7 @@ "limit": 10000 }, "supports_parallel_tool_calls": true, + "supports_image_detail_original": false, "context_window": 272000, "reasoning_summary_format": "none", "slug": "gpt-5.2", @@ -346,6 +351,7 @@ "limit": 10000 }, "supports_parallel_tool_calls": true, + "supports_image_detail_original": false, "context_window": 272000, "reasoning_summary_format": "none", "slug": "gpt-5.1", @@ -407,6 +413,7 @@ "limit": 10000 }, "supports_parallel_tool_calls": false, + "supports_image_detail_original": false, "context_window": 272000, "reasoning_summary_format": "experimental", "slug": "gpt-5-codex", @@ -468,6 +475,7 @@ "limit": 10000 }, "supports_parallel_tool_calls": false, + "supports_image_detail_original": false, "context_window": 272000, "reasoning_summary_format": "none", "slug": "gpt-5", @@ -532,6 +540,7 @@ "limit": 10000 }, "supports_parallel_tool_calls": false, + "supports_image_detail_original": false, "context_window": 128000, "reasoning_summary_format": "none", "slug": "gpt-oss-120b", @@ -589,6 +598,7 @@ "limit": 10000 }, "supports_parallel_tool_calls": false, + "supports_image_detail_original": false, "context_window": 128000, "reasoning_summary_format": "none", "slug": "gpt-oss-20b", @@ -647,6 +657,7 @@ "limit": 10000 }, "supports_parallel_tool_calls": false, + "supports_image_detail_original": false, "context_window": 272000, "reasoning_summary_format": "experimental", "slug": "gpt-5.1-codex-mini", @@ -704,6 +715,7 @@ "limit": 10000 }, "supports_parallel_tool_calls": false, + "supports_image_detail_original": false, "context_window": 272000, "reasoning_summary_format": "experimental", "slug": "gpt-5-codex-mini", diff --git a/codex-rs/core/src/client.rs b/codex-rs/core/src/client.rs index 9e5974b48..d81c10f39 100644 --- a/codex-rs/core/src/client.rs +++ b/codex-rs/core/src/client.rs @@ -1305,6 +1305,7 @@ mod tests { "apply_patch_tool_type": null, "truncation_policy": {"mode": "bytes", "limit": 10000}, "supports_parallel_tool_calls": false, + "supports_image_detail_original": false, "context_window": 272000, "auto_compact_token_limit": null, "experimental_supported_tools": [] diff --git a/codex-rs/core/src/context_manager/history.rs b/codex-rs/core/src/context_manager/history.rs index e4b7755ab..40e0f31e4 100644 --- a/codex-rs/core/src/context_manager/history.rs +++ b/codex-rs/core/src/context_manager/history.rs @@ -2,21 +2,29 @@ use crate::codex::TurnContext; use crate::context_manager::normalize; use crate::event_mapping::is_contextual_user_message_content; use crate::truncate::TruncationPolicy; +use crate::truncate::approx_bytes_for_tokens; use crate::truncate::approx_token_count; use crate::truncate::approx_tokens_from_byte_count_i64; use crate::truncate::truncate_function_output_items_with_policy; use crate::truncate::truncate_text; +use base64::Engine; +use base64::engine::general_purpose::STANDARD as BASE64_STANDARD; use codex_protocol::models::BaseInstructions; use codex_protocol::models::ContentItem; use codex_protocol::models::FunctionCallOutputBody; use codex_protocol::models::FunctionCallOutputContentItem; use codex_protocol::models::FunctionCallOutputPayload; +use codex_protocol::models::ImageDetail; use codex_protocol::models::ResponseItem; use codex_protocol::openai_models::InputModality; use codex_protocol::protocol::TokenUsage; use codex_protocol::protocol::TokenUsageInfo; use codex_protocol::protocol::TurnContextItem; +use codex_utils_cache::BlockingLruCache; +use codex_utils_cache::sha1_digest; +use std::num::NonZeroUsize; use std::ops::Deref; +use std::sync::LazyLock; /// Transcript of thread history #[derive(Debug, Clone, Default)] @@ -428,7 +436,19 @@ fn estimate_item_token_count(item: &ResponseItem) -> i64 { /// /// The estimator later converts bytes to tokens using a 4-bytes/token heuristic /// with ceiling division, so 7,373 bytes maps to approximately 1,844 tokens. -const IMAGE_BYTES_ESTIMATE: i64 = 7373; +const RESIZED_IMAGE_BYTES_ESTIMATE: i64 = 7373; +// See https://platform.openai.com/docs/guides/images-vision#calculating-costs. +// Use a direct 32px patch count only for `detail: "original"`; +// all other image inputs continue to use `RESIZED_IMAGE_BYTES_ESTIMATE`. +const ORIGINAL_IMAGE_PATCH_SIZE: u32 = 32; +const ORIGINAL_IMAGE_ESTIMATE_CACHE_SIZE: usize = 32; + +static ORIGINAL_IMAGE_ESTIMATE_CACHE: LazyLock>> = + LazyLock::new(|| { + BlockingLruCache::new( + NonZeroUsize::new(ORIGINAL_IMAGE_ESTIMATE_CACHE_SIZE).unwrap_or(NonZeroUsize::MIN), + ) + }); pub(crate) fn estimate_response_item_model_visible_bytes(item: &ResponseItem) -> i64 { match item { @@ -444,15 +464,15 @@ pub(crate) fn estimate_response_item_model_visible_bytes(item: &ResponseItem) -> let raw = serde_json::to_string(item) .map(|serialized| i64::try_from(serialized.len()).unwrap_or(i64::MAX)) .unwrap_or_default(); - let (payload_bytes, image_count) = image_data_url_estimate_adjustment(item); - if payload_bytes == 0 || image_count == 0 { + let (payload_bytes, replacement_bytes) = image_data_url_estimate_adjustment(item); + if payload_bytes == 0 || replacement_bytes == 0 { raw } else { - // Replace raw base64 payload bytes with a fixed per-image cost. - // We intentionally preserve the data URL prefix and JSON wrapper - // bytes already included in `raw`. + // Replace raw base64 payload bytes with a per-image estimate. + // We intentionally preserve the data URL prefix and JSON + // wrapper bytes already included in `raw`. raw.saturating_sub(payload_bytes) - .saturating_add(image_count.saturating_mul(IMAGE_BYTES_ESTIMATE)) + .saturating_add(replacement_bytes) } } } @@ -463,7 +483,7 @@ pub(crate) fn estimate_response_item_model_visible_bytes(item: &ResponseItem) -> /// /// We only discount payloads for `data:image/...;base64,...` URLs (case /// insensitive markers) and leave everything else at raw serialized size. -fn base64_data_url_payload_len(url: &str) -> Option { +fn parse_base64_image_data_url(url: &str) -> Option<&str> { if !url .get(.."data:".len()) .is_some_and(|prefix| prefix.eq_ignore_ascii_case("data:")) @@ -489,22 +509,62 @@ fn base64_data_url_payload_len(url: &str) -> Option { if !has_base64_marker { return None; } - Some(payload.len()) + Some(payload) +} + +fn estimate_original_image_bytes(image_url: &str) -> Option { + let key = sha1_digest(image_url.as_bytes()); + ORIGINAL_IMAGE_ESTIMATE_CACHE.get_or_insert_with(key, || { + let payload = match parse_base64_image_data_url(image_url) { + Some(payload) => payload, + None => { + tracing::trace!("skipping original-detail estimate for non-base64 image data URL"); + return None; + } + }; + let bytes = match BASE64_STANDARD.decode(payload) { + Ok(bytes) => bytes, + Err(error) => { + tracing::trace!("failed to decode original-detail image payload: {error}"); + return None; + } + }; + let dynamic = match image::load_from_memory(&bytes) { + Ok(dynamic) => dynamic, + Err(error) => { + tracing::trace!("failed to decode original-detail image bytes: {error}"); + return None; + } + }; + let width = i64::from(dynamic.width()); + let height = i64::from(dynamic.height()); + let patch_size = i64::from(ORIGINAL_IMAGE_PATCH_SIZE); + let patches_wide = width.saturating_add(patch_size.saturating_sub(1)) / patch_size; + let patches_high = height.saturating_add(patch_size.saturating_sub(1)) / patch_size; + let patch_count = patches_wide.saturating_mul(patches_high); + let patch_count = usize::try_from(patch_count).unwrap_or(usize::MAX); + Some(i64::try_from(approx_bytes_for_tokens(patch_count)).unwrap_or(i64::MAX)) + }) } /// Scans one response item for discount-eligible inline image data URLs and /// returns: /// - total base64 payload bytes to subtract from raw serialized size -/// - count of qualifying images to replace with `IMAGE_BYTES_ESTIMATE` +/// - total replacement byte estimate for those images fn image_data_url_estimate_adjustment(item: &ResponseItem) -> (i64, i64) { let mut payload_bytes = 0i64; - let mut image_count = 0i64; + let mut replacement_bytes = 0i64; - let mut accumulate = |image_url: &str| { - if let Some(payload_len) = base64_data_url_payload_len(image_url) { + let mut accumulate = |image_url: &str, detail: Option| { + if let Some(payload_len) = parse_base64_image_data_url(image_url).map(str::len) { payload_bytes = payload_bytes.saturating_add(i64::try_from(payload_len).unwrap_or(i64::MAX)); - image_count = image_count.saturating_add(1); + replacement_bytes = replacement_bytes.saturating_add(match detail { + Some(ImageDetail::Original) => { + estimate_original_image_bytes(image_url).unwrap_or(RESIZED_IMAGE_BYTES_ESTIMATE) + } + _ => RESIZED_IMAGE_BYTES_ESTIMATE, + }); } }; @@ -512,7 +572,7 @@ fn image_data_url_estimate_adjustment(item: &ResponseItem) -> (i64, i64) { ResponseItem::Message { content, .. } => { for content_item in content { if let ContentItem::InputImage { image_url } = content_item { - accumulate(image_url); + accumulate(image_url, None); } } } @@ -520,8 +580,10 @@ fn image_data_url_estimate_adjustment(item: &ResponseItem) -> (i64, i64) { | ResponseItem::CustomToolCallOutput { output, .. } => { if let FunctionCallOutputBody::ContentItems(items) = &output.body { for content_item in items { - if let FunctionCallOutputContentItem::InputImage { image_url } = content_item { - accumulate(image_url); + if let FunctionCallOutputContentItem::InputImage { image_url, detail } = + content_item + { + accumulate(image_url, *detail); } } } @@ -529,7 +591,7 @@ fn image_data_url_estimate_adjustment(item: &ResponseItem) -> (i64, i64) { _ => {} } - (payload_bytes, image_count) + (payload_bytes, replacement_bytes) } fn is_model_generated_item(item: &ResponseItem) -> bool { diff --git a/codex-rs/core/src/context_manager/history_tests.rs b/codex-rs/core/src/context_manager/history_tests.rs index 798abc767..46aa46232 100644 --- a/codex-rs/core/src/context_manager/history_tests.rs +++ b/codex-rs/core/src/context_manager/history_tests.rs @@ -1,12 +1,15 @@ use super::*; use crate::truncate; use crate::truncate::TruncationPolicy; +use base64::Engine; +use base64::engine::general_purpose::STANDARD as BASE64_STANDARD; use codex_git::GhostCommit; use codex_protocol::models::BaseInstructions; use codex_protocol::models::ContentItem; use codex_protocol::models::FunctionCallOutputBody; use codex_protocol::models::FunctionCallOutputContentItem; use codex_protocol::models::FunctionCallOutputPayload; +use codex_protocol::models::ImageDetail; use codex_protocol::models::LocalShellAction; use codex_protocol::models::LocalShellExecAction; use codex_protocol::models::LocalShellStatus; @@ -14,6 +17,9 @@ use codex_protocol::models::ReasoningItemContent; use codex_protocol::models::ReasoningItemReasoningSummary; use codex_protocol::openai_models::InputModality; use codex_protocol::openai_models::default_input_modalities; +use image::ImageBuffer; +use image::ImageFormat; +use image::Rgba; use pretty_assertions::assert_eq; use regex_lite::Regex; @@ -276,6 +282,7 @@ fn for_prompt_strips_images_when_model_does_not_support_images() { }, FunctionCallOutputContentItem::InputImage { image_url: "https://example.com/result.png".to_string(), + detail: None, }, ]), }, @@ -294,6 +301,7 @@ fn for_prompt_strips_images_when_model_does_not_support_images() { }, FunctionCallOutputContentItem::InputImage { image_url: "https://example.com/js-repl-result.png".to_string(), + detail: None, }, ]), }, @@ -489,6 +497,7 @@ fn replace_last_turn_images_replaces_tool_output_images() { body: FunctionCallOutputBody::ContentItems(vec![ FunctionCallOutputContentItem::InputImage { image_url: "data:image/png;base64,AAA".to_string(), + detail: None, }, ]), success: Some(true), @@ -1302,7 +1311,7 @@ fn image_data_url_payload_does_not_dominate_message_estimate() { let raw_len = serde_json::to_string(&image_item).unwrap().len() as i64; let estimated = estimate_response_item_model_visible_bytes(&image_item); - let expected = raw_len - payload.len() as i64 + IMAGE_BYTES_ESTIMATE; + let expected = raw_len - payload.len() as i64 + RESIZED_IMAGE_BYTES_ESTIMATE; let text_only_estimated = estimate_response_item_model_visible_bytes(&text_only_item); assert_eq!(estimated, expected); @@ -1320,13 +1329,16 @@ fn image_data_url_payload_does_not_dominate_function_call_output_estimate() { FunctionCallOutputContentItem::InputText { text: "Screenshot captured".to_string(), }, - FunctionCallOutputContentItem::InputImage { image_url }, + FunctionCallOutputContentItem::InputImage { + image_url, + detail: None, + }, ]), }; let raw_len = serde_json::to_string(&item).unwrap().len() as i64; let estimated = estimate_response_item_model_visible_bytes(&item); - let expected = raw_len - payload.len() as i64 + IMAGE_BYTES_ESTIMATE; + let expected = raw_len - payload.len() as i64 + RESIZED_IMAGE_BYTES_ESTIMATE; assert_eq!(estimated, expected); assert!(estimated < raw_len); @@ -1342,13 +1354,16 @@ fn image_data_url_payload_does_not_dominate_custom_tool_call_output_estimate() { FunctionCallOutputContentItem::InputText { text: "Screenshot captured".to_string(), }, - FunctionCallOutputContentItem::InputImage { image_url }, + FunctionCallOutputContentItem::InputImage { + image_url, + detail: None, + }, ]), }; let raw_len = serde_json::to_string(&item).unwrap().len() as i64; let estimated = estimate_response_item_model_visible_bytes(&item); - let expected = raw_len - payload.len() as i64 + IMAGE_BYTES_ESTIMATE; + let expected = raw_len - payload.len() as i64 + RESIZED_IMAGE_BYTES_ESTIMATE; assert_eq!(estimated, expected); assert!(estimated < raw_len); @@ -1370,6 +1385,7 @@ fn non_base64_image_urls_are_unchanged() { output: FunctionCallOutputPayload::from_content_items(vec![ FunctionCallOutputContentItem::InputImage { image_url: "file:///tmp/foo.png".to_string(), + detail: None, }, ]), }; @@ -1409,7 +1425,10 @@ fn non_image_base64_data_url_is_unchanged() { let item = ResponseItem::FunctionCallOutput { call_id: "call-octet".to_string(), output: FunctionCallOutputPayload::from_content_items(vec![ - FunctionCallOutputContentItem::InputImage { image_url }, + FunctionCallOutputContentItem::InputImage { + image_url, + detail: None, + }, ]), }; @@ -1433,7 +1452,7 @@ fn mixed_case_data_url_markers_are_adjusted() { let raw_len = serde_json::to_string(&item).unwrap().len() as i64; let estimated = estimate_response_item_model_visible_bytes(&item); - let expected = raw_len - payload.len() as i64 + IMAGE_BYTES_ESTIMATE; + let expected = raw_len - payload.len() as i64 + RESIZED_IMAGE_BYTES_ESTIMATE; assert_eq!(estimated, expected); } @@ -1465,7 +1484,70 @@ fn multiple_inline_images_apply_multiple_fixed_costs() { let raw_len = serde_json::to_string(&item).unwrap().len() as i64; let payload_sum = (payload_one.len() + payload_two.len()) as i64; let estimated = estimate_response_item_model_visible_bytes(&item); - let expected = raw_len - payload_sum + (2 * IMAGE_BYTES_ESTIMATE); + let expected = raw_len - payload_sum + (2 * RESIZED_IMAGE_BYTES_ESTIMATE); + + assert_eq!(estimated, expected); +} + +#[test] +fn original_detail_images_scale_with_dimensions() { + // 2304x864 at 32px patches yields 72 * 27 = 1,944 patches. + // The byte heuristic uses 4 bytes per token, so the replacement cost is 7,776 bytes. + const EXPECTED_ORIGINAL_DETAIL_IMAGE_BYTES: i64 = 7_776; + + let width = 2304; + let height = 864; + let image = ImageBuffer::from_pixel(width, height, Rgba([12u8, 34, 56, 255])); + let mut bytes = std::io::Cursor::new(Vec::new()); + image + .write_to(&mut bytes, ImageFormat::Png) + .expect("encode png"); + let payload = BASE64_STANDARD.encode(bytes.get_ref()); + let image_url = format!("data:image/png;base64,{payload}"); + let item = ResponseItem::FunctionCallOutput { + call_id: "call-original".to_string(), + output: FunctionCallOutputPayload::from_content_items(vec![ + FunctionCallOutputContentItem::InputImage { + image_url, + detail: Some(ImageDetail::Original), + }, + ]), + }; + + let raw_len = serde_json::to_string(&item).unwrap().len() as i64; + let estimated = estimate_response_item_model_visible_bytes(&item); + let expected = raw_len - payload.len() as i64 + EXPECTED_ORIGINAL_DETAIL_IMAGE_BYTES; + + assert_eq!(estimated, expected); +} + +#[test] +fn original_detail_webp_images_scale_with_dimensions() { + // Same dimensions as the PNG case above, so the patch-based replacement cost is the same. + const EXPECTED_ORIGINAL_DETAIL_IMAGE_BYTES: i64 = 7_776; + + let width = 2304; + let height = 864; + let image = ImageBuffer::from_pixel(width, height, Rgba([12u8, 34, 56, 255])); + let mut bytes = std::io::Cursor::new(Vec::new()); + image + .write_to(&mut bytes, ImageFormat::WebP) + .expect("encode webp"); + let payload = BASE64_STANDARD.encode(bytes.get_ref()); + let image_url = format!("data:image/webp;base64,{payload}"); + let item = ResponseItem::FunctionCallOutput { + call_id: "call-original-webp".to_string(), + output: FunctionCallOutputPayload::from_content_items(vec![ + FunctionCallOutputContentItem::InputImage { + image_url, + detail: Some(ImageDetail::Original), + }, + ]), + }; + + let raw_len = serde_json::to_string(&item).unwrap().len() as i64; + let estimated = estimate_response_item_model_visible_bytes(&item); + let expected = raw_len - payload.len() as i64 + EXPECTED_ORIGINAL_DETAIL_IMAGE_BYTES; assert_eq!(estimated, expected); } diff --git a/codex-rs/core/src/features.rs b/codex-rs/core/src/features.rs index d76c2dfc4..30c38e835 100644 --- a/codex-rs/core/src/features.rs +++ b/codex-rs/core/src/features.rs @@ -119,6 +119,8 @@ pub enum Feature { MemoryTool, /// Append additional AGENTS.md guidance to user instructions. ChildAgentsMd, + /// Allow `detail: "original"` image outputs on supported models. + ImageDetailOriginal, /// Enforce UTF8 output in Powershell. PowershellUtf8, /// Compress request bodies (zstd) when sending streaming requests to codex-backend. @@ -529,6 +531,12 @@ pub const FEATURES: &[FeatureSpec] = &[ stage: Stage::UnderDevelopment, default_enabled: false, }, + FeatureSpec { + id: Feature::ImageDetailOriginal, + key: "image_detail_original", + stage: Stage::UnderDevelopment, + default_enabled: false, + }, FeatureSpec { id: Feature::ApplyPatchFreeform, key: "apply_patch_freeform", diff --git a/codex-rs/core/src/models_manager/manager.rs b/codex-rs/core/src/models_manager/manager.rs index 1d7309020..35e723d25 100644 --- a/codex-rs/core/src/models_manager/manager.rs +++ b/codex-rs/core/src/models_manager/manager.rs @@ -471,6 +471,7 @@ mod tests { "apply_patch_tool_type": null, "truncation_policy": {"mode": "bytes", "limit": 10_000}, "supports_parallel_tool_calls": false, + "supports_image_detail_original": false, "context_window": 272_000, "experimental_supported_tools": [], })) @@ -549,6 +550,8 @@ mod tests { .build() .await .expect("load default test config"); + let mut overlay = remote_model("gpt-overlay", "Overlay", 0); + overlay.supports_image_detail_original = true; let auth_manager = AuthManager::from_auth_for_testing(CodexAuth::from_api_key("Test API Key")); @@ -556,7 +559,7 @@ mod tests { codex_home.path().to_path_buf(), auth_manager, Some(ModelsResponse { - models: vec![remote_model("gpt-overlay", "Overlay", 0)], + models: vec![overlay], }), CollaborationModesConfig::default(), ); @@ -568,6 +571,7 @@ mod tests { assert_eq!(model_info.slug, "gpt-overlay-experiment"); assert_eq!(model_info.display_name, "Overlay"); assert_eq!(model_info.context_window, Some(272_000)); + assert!(model_info.supports_image_detail_original); assert!(!model_info.supports_parallel_tool_calls); assert!(!model_info.used_fallback_model_metadata); } @@ -580,26 +584,24 @@ mod tests { .build() .await .expect("load default test config"); + let mut remote = remote_model("gpt-image", "Image", 0); + remote.supports_image_detail_original = true; let auth_manager = AuthManager::from_auth_for_testing(CodexAuth::from_api_key("Test API Key")); let manager = ModelsManager::new( codex_home.path().to_path_buf(), auth_manager, - None, + Some(ModelsResponse { + models: vec![remote], + }), CollaborationModesConfig::default(), ); - let known_slug = manager - .get_remote_models() - .await - .first() - .expect("bundled models should include at least one model") - .slug - .clone(); - let namespaced_model = format!("custom/{known_slug}"); + let namespaced_model = "custom/gpt-image".to_string(); let model_info = manager.get_model_info(&namespaced_model, &config).await; assert_eq!(model_info.slug, namespaced_model); + assert!(model_info.supports_image_detail_original); assert!(!model_info.used_fallback_model_metadata); } diff --git a/codex-rs/core/src/models_manager/model_info.rs b/codex-rs/core/src/models_manager/model_info.rs index 4824e4cd1..643d42571 100644 --- a/codex-rs/core/src/models_manager/model_info.rs +++ b/codex-rs/core/src/models_manager/model_info.rs @@ -80,6 +80,7 @@ pub(crate) fn model_info_from_slug(slug: &str) -> ModelInfo { apply_patch_tool_type: None, truncation_policy: TruncationPolicyConfig::bytes(10_000), supports_parallel_tool_calls: false, + supports_image_detail_original: false, context_window: Some(272_000), auto_compact_token_limit: None, effective_context_window_percent: 95, diff --git a/codex-rs/core/src/project_doc.rs b/codex-rs/core/src/project_doc.rs index 8323b86a1..6370b1544 100644 --- a/codex-rs/core/src/project_doc.rs +++ b/codex-rs/core/src/project_doc.rs @@ -55,6 +55,9 @@ fn render_js_repl_instructions(config: &Config) -> Option { section.push_str("- Helpers: `codex.tmpDir` and `codex.tool(name, args?)`.\n"); section.push_str("- `codex.tool` executes a normal tool call and resolves to the raw tool output object. Use it for shell and non-shell tools alike.\n"); section.push_str("- To share generated images with the model, write a file under `codex.tmpDir`, call `await codex.tool(\"view_image\", { path: \"/absolute/path\" })`, then delete the file.\n"); + if config.features.enabled(Feature::ImageDetailOriginal) { + section.push_str("- When generating or converting images for `view_image` in `js_repl`, prefer JPEG at 85% quality unless lossless quality is strictly required; other formats can be used if the user requests them. This keeps uploads smaller and reduces the chance of hitting image size caps.\n"); + } section.push_str("- Top-level bindings persist across cells. If you hit `SyntaxError: Identifier 'x' has already been declared`, reuse the binding, pick a new name, wrap in `{ ... }` for block scope, or reset the kernel with `js_repl_reset`.\n"); section.push_str("- Top-level static import declarations (for example `import x from \"pkg\"`) are currently unsupported in `js_repl`; use dynamic imports with `await import(\"pkg\")` instead.\n"); @@ -492,6 +495,21 @@ mod tests { assert_eq!(res, expected); } + #[tokio::test] + async fn js_repl_original_resolution_guidance_is_feature_gated() { + let tmp = tempfile::tempdir().expect("tempdir"); + let mut cfg = make_config(&tmp, 4096, None).await; + cfg.features + .enable(Feature::JsRepl) + .enable(Feature::ImageDetailOriginal); + + let res = get_user_instructions(&cfg, None) + .await + .expect("js_repl instructions expected"); + let expected = "## JavaScript REPL (Node)\n- Use `js_repl` for Node-backed JavaScript with top-level await in a persistent kernel.\n- `js_repl` is a freeform/custom tool. Direct `js_repl` calls must send raw JavaScript tool input (optionally with first-line `// codex-js-repl: timeout_ms=15000`). Do not wrap code in JSON (for example `{\"code\":\"...\"}`), quotes, or markdown code fences.\n- Helpers: `codex.tmpDir` and `codex.tool(name, args?)`.\n- `codex.tool` executes a normal tool call and resolves to the raw tool output object. Use it for shell and non-shell tools alike.\n- To share generated images with the model, write a file under `codex.tmpDir`, call `await codex.tool(\"view_image\", { path: \"/absolute/path\" })`, then delete the file.\n- When generating or converting images for `view_image` in `js_repl`, prefer JPEG at 85% quality unless lossless quality is strictly required; other formats can be used if the user requests them. This keeps uploads smaller and reduces the chance of hitting image size caps.\n- Top-level bindings persist across cells. If you hit `SyntaxError: Identifier 'x' has already been declared`, reuse the binding, pick a new name, wrap in `{ ... }` for block scope, or reset the kernel with `js_repl_reset`.\n- Top-level static import declarations (for example `import x from \"pkg\"`) are currently unsupported in `js_repl`; use dynamic imports with `await import(\"pkg\")` instead.\n- Avoid direct access to `process.stdout` / `process.stderr` / `process.stdin`; it can corrupt the JSON line protocol. Use `console.log` and `codex.tool(...)`."; + assert_eq!(res, expected); + } + /// When both system instructions *and* a project doc are present the two /// should be concatenated with the separator. #[tokio::test] diff --git a/codex-rs/core/src/tools/context.rs b/codex-rs/core/src/tools/context.rs index 0700b4d01..9f85e6acf 100644 --- a/codex-rs/core/src/tools/context.rs +++ b/codex-rs/core/src/tools/context.rs @@ -222,6 +222,7 @@ mod tests { }, FunctionCallOutputContentItem::InputImage { image_url: "data:image/png;base64,AAA".to_string(), + detail: None, }, FunctionCallOutputContentItem::InputText { text: "line 2".to_string(), @@ -239,6 +240,7 @@ mod tests { }, FunctionCallOutputContentItem::InputImage { image_url: "data:image/png;base64,AAA".to_string(), + detail: None, }, FunctionCallOutputContentItem::InputText { text: "line 2".to_string(), diff --git a/codex-rs/core/src/tools/handlers/view_image.rs b/codex-rs/core/src/tools/handlers/view_image.rs index 9bf03acb0..640175112 100644 --- a/codex-rs/core/src/tools/handlers/view_image.rs +++ b/codex-rs/core/src/tools/handlers/view_image.rs @@ -1,10 +1,15 @@ use async_trait::async_trait; +use codex_protocol::models::ContentItem; use codex_protocol::models::FunctionCallOutputBody; use codex_protocol::models::FunctionCallOutputContentItem; +use codex_protocol::models::ImageDetail; +use codex_protocol::models::local_image_content_items_with_label_number; use codex_protocol::openai_models::InputModality; +use codex_utils_image::PromptImageMode; use serde::Deserialize; use tokio::fs; +use crate::features::Feature; use crate::function_tool::FunctionCallError; use crate::protocol::EventMsg; use crate::protocol::ViewImageToolCallEvent; @@ -14,8 +19,6 @@ use crate::tools::context::ToolPayload; use crate::tools::handlers::parse_arguments; use crate::tools::registry::ToolHandler; use crate::tools::registry::ToolKind; -use codex_protocol::models::ContentItem; -use codex_protocol::models::local_image_content_items_with_label_number; pub struct ViewImageHandler; @@ -81,15 +84,26 @@ impl ToolHandler for ViewImageHandler { } let event_path = abs_path.clone(); - let content = local_image_content_items_with_label_number(&abs_path, None); - let content = content + let use_original_detail = turn.config.features.enabled(Feature::ImageDetailOriginal) + && turn.model_info.supports_image_detail_original; + let image_mode = if use_original_detail { + PromptImageMode::Original + } else { + PromptImageMode::ResizeToFit + }; + let image_detail = use_original_detail.then_some(ImageDetail::Original); + + let content = local_image_content_items_with_label_number(&abs_path, None, image_mode) .into_iter() .map(|item| match item { ContentItem::InputText { text } => { FunctionCallOutputContentItem::InputText { text } } ContentItem::InputImage { image_url } => { - FunctionCallOutputContentItem::InputImage { image_url } + FunctionCallOutputContentItem::InputImage { + image_url, + detail: image_detail, + } } ContentItem::OutputText { text } => { FunctionCallOutputContentItem::InputText { text } diff --git a/codex-rs/core/src/tools/js_repl/mod.rs b/codex-rs/core/src/tools/js_repl/mod.rs index a234fd0eb..ea9f42ce6 100644 --- a/codex-rs/core/src/tools/js_repl/mod.rs +++ b/codex-rs/core/src/tools/js_repl/mod.rs @@ -1900,6 +1900,7 @@ mod tests { output: FunctionCallOutputPayload::from_content_items(vec![ FunctionCallOutputContentItem::InputImage { image_url: "data:image/png;base64,abcd".to_string(), + detail: None, }, ]), }; @@ -1929,6 +1930,7 @@ mod tests { output: FunctionCallOutputPayload::from_content_items(vec![ FunctionCallOutputContentItem::InputImage { image_url: "data:image/png;base64,abcd".to_string(), + detail: None, }, ]), }; @@ -2417,15 +2419,17 @@ console.log(out.output?.body?.text ?? ""); image_url: "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR4nGP4z8DwHwAFAAH/iZk9HQAAAABJRU5ErkJggg==" .to_string(), + detail: None, }] .as_slice() ); - let [FunctionCallOutputContentItem::InputImage { image_url }] = + let [FunctionCallOutputContentItem::InputImage { image_url, detail }] = result.content_items.as_slice() else { panic!("view_image should return exactly one input_image content item"); }; assert!(image_url.starts_with("data:image/png;base64,")); + assert_eq!(*detail, None); assert!(session.get_pending_input().await.is_empty()); Ok(()) @@ -2515,6 +2519,7 @@ console.log(out.type); }, FunctionCallOutputContentItem::InputImage { image_url: image_url.to_string(), + detail: None, }, ] ); diff --git a/codex-rs/core/src/truncate.rs b/codex-rs/core/src/truncate.rs index 309ed91b6..fb275e6d4 100644 --- a/codex-rs/core/src/truncate.rs +++ b/codex-rs/core/src/truncate.rs @@ -138,9 +138,10 @@ pub(crate) fn truncate_function_output_items_with_policy( remaining_budget = 0; } } - FunctionCallOutputContentItem::InputImage { image_url } => { + FunctionCallOutputContentItem::InputImage { image_url, detail } => { out.push(FunctionCallOutputContentItem::InputImage { image_url: image_url.clone(), + detail: *detail, }); } } @@ -491,6 +492,7 @@ mod tests { FunctionCallOutputContentItem::InputText { text: t2.clone() }, FunctionCallOutputContentItem::InputImage { image_url: "img:mid".to_string(), + detail: None, }, FunctionCallOutputContentItem::InputText { text: t3 }, FunctionCallOutputContentItem::InputText { text: t4 }, @@ -518,7 +520,8 @@ mod tests { assert_eq!( output[2], FunctionCallOutputContentItem::InputImage { - image_url: "img:mid".to_string() + image_url: "img:mid".to_string(), + detail: None, } ); diff --git a/codex-rs/core/tests/suite/model_switching.rs b/codex-rs/core/tests/suite/model_switching.rs index 385072f91..e8fbcc851 100644 --- a/codex-rs/core/tests/suite/model_switching.rs +++ b/codex-rs/core/tests/suite/model_switching.rs @@ -278,6 +278,7 @@ async fn model_change_from_image_to_text_strips_prior_image_content() -> Result< apply_patch_tool_type: None, truncation_policy: TruncationPolicyConfig::bytes(10_000), supports_parallel_tool_calls: false, + supports_image_detail_original: false, context_window: Some(272_000), auto_compact_token_limit: None, effective_context_window_percent: 95, @@ -439,6 +440,7 @@ async fn model_switch_to_smaller_model_updates_token_context_window() -> Result< apply_patch_tool_type: None, truncation_policy: TruncationPolicyConfig::bytes(10_000), supports_parallel_tool_calls: false, + supports_image_detail_original: false, context_window: Some(large_context_window), auto_compact_token_limit: None, effective_context_window_percent, diff --git a/codex-rs/core/tests/suite/models_cache_ttl.rs b/codex-rs/core/tests/suite/models_cache_ttl.rs index d2202b58c..54b062924 100644 --- a/codex-rs/core/tests/suite/models_cache_ttl.rs +++ b/codex-rs/core/tests/suite/models_cache_ttl.rs @@ -344,6 +344,7 @@ fn test_remote_model(slug: &str, priority: i32) -> ModelInfo { apply_patch_tool_type: None, truncation_policy: TruncationPolicyConfig::bytes(10_000), supports_parallel_tool_calls: false, + supports_image_detail_original: false, context_window: Some(272_000), auto_compact_token_limit: None, effective_context_window_percent: 95, diff --git a/codex-rs/core/tests/suite/personality.rs b/codex-rs/core/tests/suite/personality.rs index 73f57ba0b..f3a1f2e4d 100644 --- a/codex-rs/core/tests/suite/personality.rs +++ b/codex-rs/core/tests/suite/personality.rs @@ -619,6 +619,7 @@ async fn remote_model_friendly_personality_instructions_with_feature() -> anyhow apply_patch_tool_type: None, truncation_policy: TruncationPolicyConfig::bytes(10_000), supports_parallel_tool_calls: false, + supports_image_detail_original: false, context_window: Some(128_000), auto_compact_token_limit: None, effective_context_window_percent: 95, @@ -729,6 +730,7 @@ async fn user_turn_personality_remote_model_template_includes_update_message() - apply_patch_tool_type: None, truncation_policy: TruncationPolicyConfig::bytes(10_000), supports_parallel_tool_calls: false, + supports_image_detail_original: false, context_window: Some(128_000), auto_compact_token_limit: None, effective_context_window_percent: 95, diff --git a/codex-rs/core/tests/suite/remote_models.rs b/codex-rs/core/tests/suite/remote_models.rs index 63f15f9c0..ac2bc7156 100644 --- a/codex-rs/core/tests/suite/remote_models.rs +++ b/codex-rs/core/tests/suite/remote_models.rs @@ -303,6 +303,7 @@ async fn remote_models_remote_model_uses_unified_exec() -> Result<()> { apply_patch_tool_type: None, truncation_policy: TruncationPolicyConfig::bytes(10_000), supports_parallel_tool_calls: false, + supports_image_detail_original: false, context_window: Some(272_000), auto_compact_token_limit: None, effective_context_window_percent: 95, @@ -543,6 +544,7 @@ async fn remote_models_apply_remote_base_instructions() -> Result<()> { apply_patch_tool_type: None, truncation_policy: TruncationPolicyConfig::bytes(10_000), supports_parallel_tool_calls: false, + supports_image_detail_original: false, context_window: Some(272_000), auto_compact_token_limit: None, effective_context_window_percent: 95, @@ -1007,6 +1009,7 @@ fn test_remote_model_with_policy( apply_patch_tool_type: None, truncation_policy, supports_parallel_tool_calls: false, + supports_image_detail_original: false, context_window: Some(272_000), auto_compact_token_limit: None, effective_context_window_percent: 95, diff --git a/codex-rs/core/tests/suite/rmcp_client.rs b/codex-rs/core/tests/suite/rmcp_client.rs index 06c004298..a8d414e99 100644 --- a/codex-rs/core/tests/suite/rmcp_client.rs +++ b/codex-rs/core/tests/suite/rmcp_client.rs @@ -409,6 +409,7 @@ async fn stdio_image_responses_are_sanitized_for_text_only_model() -> anyhow::Re apply_patch_tool_type: None, truncation_policy: TruncationPolicyConfig::bytes(10_000), supports_parallel_tool_calls: false, + supports_image_detail_original: false, context_window: Some(272_000), auto_compact_token_limit: None, effective_context_window_percent: 95, diff --git a/codex-rs/core/tests/suite/view_image.rs b/codex-rs/core/tests/suite/view_image.rs index 5b8b9e088..9b73afbc1 100644 --- a/codex-rs/core/tests/suite/view_image.rs +++ b/codex-rs/core/tests/suite/view_image.rs @@ -291,6 +291,208 @@ async fn view_image_tool_attaches_local_image() -> anyhow::Result<()> { Ok(()) } +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn view_image_tool_can_preserve_original_resolution_on_gpt5_3_codex() -> anyhow::Result<()> { + skip_if_no_network!(Ok(())); + + let server = start_mock_server().await; + let mut builder = test_codex() + .with_model("gpt-5.3-codex") + .with_config(|config| { + config.features.enable(Feature::ImageDetailOriginal); + }); + let TestCodex { + codex, + cwd, + session_configured, + .. + } = builder.build(&server).await?; + + let rel_path = "assets/original-example.png"; + let abs_path = cwd.path().join(rel_path); + if let Some(parent) = abs_path.parent() { + std::fs::create_dir_all(parent)?; + } + let original_width = 2304; + let original_height = 864; + let image = ImageBuffer::from_pixel(original_width, original_height, Rgba([0u8, 80, 255, 255])); + image.save(&abs_path)?; + + let call_id = "view-image-original"; + let arguments = serde_json::json!({ "path": rel_path }).to_string(); + + let first_response = sse(vec![ + ev_response_created("resp-1"), + ev_function_call(call_id, "view_image", &arguments), + ev_completed("resp-1"), + ]); + responses::mount_sse_once(&server, first_response).await; + + let second_response = sse(vec![ + ev_assistant_message("msg-1", "done"), + ev_completed("resp-2"), + ]); + let mock = responses::mount_sse_once(&server, second_response).await; + + let session_model = session_configured.model.clone(); + + codex + .submit(Op::UserTurn { + items: vec![UserInput::Text { + text: "please add the original screenshot".into(), + text_elements: Vec::new(), + }], + final_output_json_schema: None, + cwd: cwd.path().to_path_buf(), + approval_policy: AskForApproval::Never, + sandbox_policy: SandboxPolicy::DangerFullAccess, + model: session_model, + effort: None, + service_tier: None, + summary: None, + collaboration_mode: None, + personality: None, + }) + .await?; + + wait_for_event_with_timeout( + &codex, + |event| matches!(event, EventMsg::TurnComplete(_)), + Duration::from_secs(10), + ) + .await; + + let req = mock.single_request(); + let function_output = req.function_call_output(call_id); + let output_items = function_output + .get("output") + .and_then(Value::as_array) + .expect("function_call_output should be a content item array"); + assert_eq!(output_items.len(), 1); + assert_eq!( + output_items[0].get("detail").and_then(Value::as_str), + Some("original") + ); + let image_url = output_items[0] + .get("image_url") + .and_then(Value::as_str) + .expect("image_url present"); + + let (_, encoded) = image_url + .split_once(',') + .expect("image url contains data prefix"); + let decoded = BASE64_STANDARD + .decode(encoded) + .expect("image data decodes from base64 for request"); + let preserved = load_from_memory(&decoded).expect("load preserved image"); + let (width, height) = preserved.dimensions(); + assert_eq!(width, original_width); + assert_eq!(height, original_height); + + Ok(()) +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn view_image_tool_keeps_legacy_behavior_below_gpt5_3_codex() -> anyhow::Result<()> { + skip_if_no_network!(Ok(())); + + let server = start_mock_server().await; + let mut builder = test_codex().with_model("gpt-5.2").with_config(|config| { + config.features.enable(Feature::ImageDetailOriginal); + }); + let TestCodex { + codex, + cwd, + session_configured, + .. + } = builder.build(&server).await?; + + let rel_path = "assets/original-example-lower-model.png"; + let abs_path = cwd.path().join(rel_path); + if let Some(parent) = abs_path.parent() { + std::fs::create_dir_all(parent)?; + } + let original_width = 2304; + let original_height = 864; + let image = ImageBuffer::from_pixel(original_width, original_height, Rgba([0u8, 80, 255, 255])); + image.save(&abs_path)?; + + let call_id = "view-image-original-lower-model"; + let arguments = serde_json::json!({ "path": rel_path }).to_string(); + + let first_response = sse(vec![ + ev_response_created("resp-1"), + ev_function_call(call_id, "view_image", &arguments), + ev_completed("resp-1"), + ]); + responses::mount_sse_once(&server, first_response).await; + + let second_response = sse(vec![ + ev_assistant_message("msg-1", "done"), + ev_completed("resp-2"), + ]); + let mock = responses::mount_sse_once(&server, second_response).await; + + let session_model = session_configured.model.clone(); + + codex + .submit(Op::UserTurn { + items: vec![UserInput::Text { + text: "please add the screenshot".into(), + text_elements: Vec::new(), + }], + final_output_json_schema: None, + cwd: cwd.path().to_path_buf(), + approval_policy: AskForApproval::Never, + sandbox_policy: SandboxPolicy::DangerFullAccess, + model: session_model, + effort: None, + service_tier: None, + summary: None, + collaboration_mode: None, + personality: None, + }) + .await?; + + wait_for_event_with_timeout( + &codex, + |event| matches!(event, EventMsg::TurnComplete(_)), + Duration::from_secs(10), + ) + .await; + + let req = mock.single_request(); + let function_output = req.function_call_output(call_id); + let output_items = function_output + .get("output") + .and_then(Value::as_array) + .expect("function_call_output should be a content item array"); + assert_eq!(output_items.len(), 1); + assert_eq!(output_items[0].get("detail"), None); + + let image_url = output_items[0] + .get("image_url") + .and_then(Value::as_str) + .expect("image_url present"); + + let (prefix, encoded) = image_url + .split_once(',') + .expect("image url contains data prefix"); + assert_eq!(prefix, "data:image/png;base64"); + + let decoded = BASE64_STANDARD + .decode(encoded) + .expect("image data decodes from base64 for request"); + let resized = load_from_memory(&decoded).expect("load resized image"); + let (resized_width, resized_height) = resized.dimensions(); + assert!(resized_width <= 2048); + assert!(resized_height <= 768); + assert!(resized_width < original_width); + assert!(resized_height < original_height); + + Ok(()) +} + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn js_repl_view_image_tool_attaches_local_image() -> anyhow::Result<()> { skip_if_no_network!(Ok(())); @@ -674,6 +876,7 @@ async fn view_image_tool_returns_unsupported_message_for_text_only_model() -> an apply_patch_tool_type: None, truncation_policy: TruncationPolicyConfig::bytes(10_000), supports_parallel_tool_calls: false, + supports_image_detail_original: false, context_window: Some(272_000), auto_compact_token_limit: None, effective_context_window_percent: 95, diff --git a/codex-rs/protocol/src/models.rs b/codex-rs/protocol/src/models.rs index 846120982..8be7c9abf 100644 --- a/codex-rs/protocol/src/models.rs +++ b/codex-rs/protocol/src/models.rs @@ -1,7 +1,8 @@ use std::collections::HashMap; use std::path::Path; -use codex_utils_image::load_and_resize_to_fit; +use codex_utils_image::PromptImageMode; +use codex_utils_image::load_for_prompt; use serde::Deserialize; use serde::Deserializer; use serde::Serialize; @@ -175,6 +176,15 @@ pub enum ContentItem { OutputText { text: String }, } +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, JsonSchema, TS)] +#[serde(rename_all = "lowercase")] +pub enum ImageDetail { + Auto, + Low, + High, + Original, +} + #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, JsonSchema, TS)] #[serde(rename_all = "snake_case")] /// Classifies an assistant message as interim commentary or final answer text. @@ -710,8 +720,9 @@ fn unsupported_image_error_placeholder(path: &std::path::Path, mime: &str) -> Co pub fn local_image_content_items_with_label_number( path: &std::path::Path, label_number: Option, + mode: PromptImageMode, ) -> Vec { - match load_and_resize_to_fit(path) { + match load_for_prompt(path, mode) { Ok(image) => { let mut items = Vec::with_capacity(3); if let Some(label_number) = label_number { @@ -872,7 +883,11 @@ impl From> for ResponseInputItem { } UserInput::LocalImage { path } => { image_index += 1; - local_image_content_items_with_label_number(&path, Some(image_index)) + local_image_content_items_with_label_number( + &path, + Some(image_index), + PromptImageMode::ResizeToFit, + ) } UserInput::Skill { .. } | UserInput::Mention { .. } => Vec::new(), // Tool bodies are injected later in core }) @@ -937,9 +952,16 @@ pub struct ShellCommandToolCallParams { #[serde(tag = "type", rename_all = "snake_case")] pub enum FunctionCallOutputContentItem { // Do not rename, these are serialized and used directly in the responses API. - InputText { text: String }, + InputText { + text: String, + }, // Do not rename, these are serialized and used directly in the responses API. - InputImage { image_url: String }, + InputImage { + image_url: String, + #[serde(default, skip_serializing_if = "Option::is_none")] + #[ts(optional)] + detail: Option, + }, } /// Converts structured function-call output content into plain text for @@ -983,7 +1005,10 @@ impl From Self::InputText { text } } crate::dynamic_tools::DynamicToolCallOutputContentItem::InputImage { image_url } => { - Self::InputImage { image_url } + Self::InputImage { + image_url, + detail: None, + } } } } @@ -1185,7 +1210,10 @@ fn convert_mcp_content_to_items( let mime_type = mime_type.unwrap_or_else(|| "application/octet-stream".into()); format!("data:{mime_type};base64,{data}") }; - FunctionCallOutputContentItem::InputImage { image_url } + FunctionCallOutputContentItem::InputImage { + image_url, + detail: None, + } } Ok(McpContent::Unknown) | Err(_) => FunctionCallOutputContentItem::InputText { text: serde_json::to_string(content).unwrap_or_else(|_| "".to_string()), @@ -1239,6 +1267,7 @@ mod tests { items, vec![FunctionCallOutputContentItem::InputImage { image_url: "data:image/png;base64,Zm9v".to_string(), + detail: None, }] ); } @@ -1256,6 +1285,7 @@ mod tests { items, vec![FunctionCallOutputContentItem::InputImage { image_url: "data:image/png;base64,Zm9v".to_string(), + detail: None, }] ); } @@ -1278,6 +1308,7 @@ mod tests { }, FunctionCallOutputContentItem::InputImage { image_url: "data:image/png;base64,AAA".to_string(), + detail: None, }, FunctionCallOutputContentItem::InputText { text: "line 2".to_string(), @@ -1296,6 +1327,7 @@ mod tests { }, FunctionCallOutputContentItem::InputImage { image_url: "data:image/png;base64,AAA".to_string(), + detail: None, }, ]; @@ -1318,6 +1350,7 @@ mod tests { }, FunctionCallOutputContentItem::InputImage { image_url: "data:image/png;base64,AAA".to_string(), + detail: None, }, ]); @@ -1542,6 +1575,7 @@ mod tests { }, FunctionCallOutputContentItem::InputImage { image_url: "data:image/png;base64,BASE64".into(), + detail: None, }, ] ); @@ -1567,6 +1601,7 @@ mod tests { output: FunctionCallOutputPayload::from_content_items(vec![ FunctionCallOutputContentItem::InputImage { image_url: "data:image/png;base64,BASE64".into(), + detail: None, }, ]), }; @@ -1602,6 +1637,7 @@ mod tests { items, vec![FunctionCallOutputContentItem::InputImage { image_url: "data:image/png;base64,BASE64".into(), + detail: None, }] ); @@ -1624,6 +1660,7 @@ mod tests { }, FunctionCallOutputContentItem::InputImage { image_url: "data:image/png;base64,XYZ".into(), + detail: None, }, ]; assert_eq!( diff --git a/codex-rs/protocol/src/openai_models.rs b/codex-rs/protocol/src/openai_models.rs index b63f13744..089b00268 100644 --- a/codex-rs/protocol/src/openai_models.rs +++ b/codex-rs/protocol/src/openai_models.rs @@ -245,6 +245,8 @@ pub struct ModelInfo { pub apply_patch_tool_type: Option, pub truncation_policy: TruncationPolicyConfig, pub supports_parallel_tool_calls: bool, + #[serde(default)] + pub supports_image_detail_original: bool, #[serde(default, skip_serializing_if = "Option::is_none")] pub context_window: Option, /// Token threshold for automatic compaction. When omitted, core derives it @@ -515,6 +517,7 @@ mod tests { apply_patch_tool_type: None, truncation_policy: TruncationPolicyConfig::bytes(10_000), supports_parallel_tool_calls: false, + supports_image_detail_original: false, context_window: None, auto_compact_token_limit: None, effective_context_window_percent: 95, @@ -713,6 +716,7 @@ mod tests { .expect("deserialize model info"); assert_eq!(model.availability_nux, None); + assert!(!model.supports_image_detail_original); } #[test] diff --git a/codex-rs/utils/image/src/lib.rs b/codex-rs/utils/image/src/lib.rs index 0950bf176..d0aba2f60 100644 --- a/codex-rs/utils/image/src/lib.rs +++ b/codex-rs/utils/image/src/lib.rs @@ -14,6 +14,7 @@ use image::ImageEncoder; use image::ImageFormat; use image::codecs::jpeg::JpegEncoder; use image::codecs::png::PngEncoder; +use image::codecs::webp::WebPEncoder; use image::imageops::FilterType; /// Maximum width used when resizing images before uploading. pub const MAX_WIDTH: u32 = 2048; @@ -33,24 +34,48 @@ pub struct EncodedImage { impl EncodedImage { pub fn into_data_url(self) -> String { let encoded = BASE64_STANDARD.encode(&self.bytes); - format!("data:{};base64,{}", self.mime, encoded) + format!("data:{};base64,{encoded}", self.mime) } } -static IMAGE_CACHE: LazyLock> = +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum PromptImageMode { + ResizeToFit, + Original, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +struct ImageCacheKey { + digest: [u8; 20], + mode: PromptImageMode, +} + +static IMAGE_CACHE: LazyLock> = LazyLock::new(|| BlockingLruCache::new(NonZeroUsize::new(32).unwrap_or(NonZeroUsize::MIN))); pub fn load_and_resize_to_fit(path: &Path) -> Result { + load_for_prompt(path, PromptImageMode::ResizeToFit) +} + +pub fn load_for_prompt( + path: &Path, + mode: PromptImageMode, +) -> Result { let path_buf = path.to_path_buf(); let file_bytes = read_file_bytes(path, &path_buf)?; - let key = sha1_digest(&file_bytes); + let key = ImageCacheKey { + digest: sha1_digest(&file_bytes), + mode, + }; IMAGE_CACHE.get_or_try_insert_with(key, move || { let format = match image::guess_format(&file_bytes) { Ok(ImageFormat::Png) => Some(ImageFormat::Png), Ok(ImageFormat::Jpeg) => Some(ImageFormat::Jpeg), + Ok(ImageFormat::Gif) => Some(ImageFormat::Gif), + Ok(ImageFormat::WebP) => Some(ImageFormat::WebP), _ => None, }; @@ -63,42 +88,54 @@ pub fn load_and_resize_to_fit(path: &Path) -> Result bool { + // Public API docs explicitly call out non-animated GIF support only. + // Preserve byte-for-byte only for formats we can safely pass through. + matches!( + format, + ImageFormat::Png | ImageFormat::Jpeg | ImageFormat::WebP + ) +} + fn read_file_bytes(path: &Path, path_for_error: &Path) -> Result, ImageProcessingError> { match tokio::runtime::Handle::try_current() { // If we're inside a Tokio runtime, avoid block_on (it panics on worker threads). @@ -123,6 +160,7 @@ fn encode_image( ) -> Result<(Vec, ImageFormat), ImageProcessingError> { let target_format = match preferred_format { ImageFormat::Jpeg => ImageFormat::Jpeg, + ImageFormat::WebP => ImageFormat::WebP, _ => ImageFormat::Png, }; @@ -153,6 +191,21 @@ fn encode_image( source, })?; } + ImageFormat::WebP => { + let rgba = image.to_rgba8(); + let encoder = WebPEncoder::new_lossless(&mut buffer); + encoder + .write_image( + rgba.as_raw(), + image.width(), + image.height(), + ColorType::Rgba8.into(), + ) + .map_err(|source| ImageProcessingError::Encode { + format: target_format, + source, + })?; + } _ => unreachable!("unsupported target_format should have been handled earlier"), } @@ -162,6 +215,8 @@ fn encode_image( fn format_to_mime(format: ImageFormat) -> String { match format { ImageFormat::Jpeg => "image/jpeg".to_string(), + ImageFormat::Gif => "image/gif".to_string(), + ImageFormat::WebP => "image/webp".to_string(), _ => "image/png".to_string(), } } @@ -176,38 +231,70 @@ mod tests { #[tokio::test(flavor = "multi_thread")] async fn returns_original_image_when_within_bounds() { + for (format, mime) in [ + (ImageFormat::Png, "image/png"), + (ImageFormat::WebP, "image/webp"), + ] { + let temp_file = NamedTempFile::new().expect("temp file"); + let image = ImageBuffer::from_pixel(64, 32, Rgba([10u8, 20, 30, 255])); + image + .save_with_format(temp_file.path(), format) + .expect("write image to temp file"); + + let original_bytes = std::fs::read(temp_file.path()).expect("read written image"); + let encoded = load_and_resize_to_fit(temp_file.path()).expect("process image"); + + assert_eq!(encoded.width, 64); + assert_eq!(encoded.height, 32); + assert_eq!(encoded.mime, mime); + assert_eq!(encoded.bytes, original_bytes); + } + } + + #[tokio::test(flavor = "multi_thread")] + async fn downscales_large_image() { + for (format, mime) in [ + (ImageFormat::Png, "image/png"), + (ImageFormat::WebP, "image/webp"), + ] { + let temp_file = NamedTempFile::new().expect("temp file"); + let image = ImageBuffer::from_pixel(4096, 2048, Rgba([200u8, 10, 10, 255])); + image + .save_with_format(temp_file.path(), format) + .expect("write image to temp file"); + + let processed = load_and_resize_to_fit(temp_file.path()).expect("process image"); + + assert!(processed.width <= MAX_WIDTH); + assert!(processed.height <= MAX_HEIGHT); + assert_eq!(processed.mime, mime); + + let detected_format = + image::guess_format(&processed.bytes).expect("detect resized output format"); + assert_eq!(detected_format, format); + + let loaded = image::load_from_memory(&processed.bytes) + .expect("read resized bytes back into image"); + assert_eq!(loaded.dimensions(), (processed.width, processed.height)); + } + } + + #[tokio::test(flavor = "multi_thread")] + async fn preserves_large_image_in_original_mode() { let temp_file = NamedTempFile::new().expect("temp file"); - let image = ImageBuffer::from_pixel(64, 32, Rgba([10u8, 20, 30, 255])); + let image = ImageBuffer::from_pixel(4096, 2048, Rgba([180u8, 30, 30, 255])); image .save_with_format(temp_file.path(), ImageFormat::Png) .expect("write png to temp file"); let original_bytes = std::fs::read(temp_file.path()).expect("read written image"); + let processed = + load_for_prompt(temp_file.path(), PromptImageMode::Original).expect("process image"); - let encoded = load_and_resize_to_fit(temp_file.path()).expect("process image"); - - assert_eq!(encoded.width, 64); - assert_eq!(encoded.height, 32); - assert_eq!(encoded.mime, "image/png"); - assert_eq!(encoded.bytes, original_bytes); - } - - #[tokio::test(flavor = "multi_thread")] - async fn downscales_large_image() { - let temp_file = NamedTempFile::new().expect("temp file"); - let image = ImageBuffer::from_pixel(4096, 2048, Rgba([200u8, 10, 10, 255])); - image - .save_with_format(temp_file.path(), ImageFormat::Png) - .expect("write png to temp file"); - - let processed = load_and_resize_to_fit(temp_file.path()).expect("process image"); - - assert!(processed.width <= MAX_WIDTH); - assert!(processed.height <= MAX_HEIGHT); - - let loaded = - image::load_from_memory(&processed.bytes).expect("read resized bytes back into image"); - assert_eq!(loaded.dimensions(), (processed.width, processed.height)); + assert_eq!(processed.width, 4096); + assert_eq!(processed.height, 2048); + assert_eq!(processed.mime, "image/png"); + assert_eq!(processed.bytes, original_bytes); } #[tokio::test(flavor = "multi_thread")]