mirror of
https://github.com/pchuan98/codex.git
synced 2026-07-01 00:31:56 +08:00
Add under-development original-resolution view_image support (#13050)
## Summary
Add original-resolution support for `view_image` behind the
under-development `view_image_original_resolution` feature flag.
When the flag is enabled and the target model is `gpt-5.3-codex` or
newer, `view_image` now preserves original PNG/JPEG/WebP bytes and sends
`detail: "original"` to the Responses API instead of using the legacy
resize/compress path.
## What changed
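- Added `view_image_original_resolution` as an under-development feature
flag (registered in the diff below as `Feature::ImageDetailOriginal`,
config key `image_detail_original`).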
- Added `ImageDetail` to the protocol models and support for serializing
`detail: "original"` on tool-returned images.
- Added `PromptImageMode::Original` to `codex-utils-image`.
  - Preserves original PNG/JPEG/WebP bytes.
  - Keeps legacy behavior for the resize path.
- Updated `view_image` to:
  - use the shared `local_image_content_items_with_label_number(...)`
    helper in both code paths
  - select original-resolution mode only when:
    - the feature flag is enabled, and
    - the model slug parses as `gpt-5.3-codex` or newer
- Kept local user image attachments on the existing resize path; this
change is specific to `view_image`.
- Updated history/image accounting so only `detail: "original"` images
use the docs-based GPT-5 image cost calculation; legacy images still use
the old fixed estimate.
- Added JS REPL guidance, gated on the same feature flag, to prefer JPEG
at 85% quality unless lossless is required, while still allowing other
formats when explicitly requested.
- Updated tests and helper code that construct
`FunctionCallOutputContentItem::InputImage` to carry the new `detail`
field.
## Behavior
### Feature off
- `view_image` keeps the existing resize/re-encode behavior.
- History estimation keeps the existing fixed-cost heuristic.
### Feature on + `gpt-5.3-codex+`
- `view_image` sends original-resolution images with
`detail: "original"`.
- PNG/JPEG/WebP source bytes are preserved when possible.
- History estimation uses the GPT-5 docs-based image-cost calculation
for those `detail: "original"` images.
#### [git stack](https://github.com/magus/git-stack-cli)
- 👉 `1` https://github.com/openai/codex/pull/13050
- ⏳ `2` https://github.com/openai/codex/pull/13331
- ⏳ `3` https://github.com/openai/codex/pull/13049
This commit is contained in:
committed by
GitHub
Unverified
parent
935754baa3
commit
b92146d48b
Generated
+2
@@ -1820,8 +1820,10 @@ dependencies = [
|
||||
"codex-state",
|
||||
"codex-test-macros",
|
||||
"codex-utils-absolute-path",
|
||||
"codex-utils-cache",
|
||||
"codex-utils-cargo-bin",
|
||||
"codex-utils-home-dir",
|
||||
"codex-utils-image",
|
||||
"codex-utils-pty",
|
||||
"codex-utils-readiness",
|
||||
"codex-utils-stream-parser",
|
||||
|
||||
@@ -514,6 +514,16 @@
|
||||
},
|
||||
{
|
||||
"properties": {
|
||||
"detail": {
|
||||
"anyOf": [
|
||||
{
|
||||
"$ref": "#/definitions/ImageDetail"
|
||||
},
|
||||
{
|
||||
"type": "null"
|
||||
}
|
||||
]
|
||||
},
|
||||
"image_url": {
|
||||
"type": "string"
|
||||
},
|
||||
@@ -627,6 +637,15 @@
|
||||
],
|
||||
"type": "string"
|
||||
},
|
||||
"ImageDetail": {
|
||||
"enum": [
|
||||
"auto",
|
||||
"low",
|
||||
"high",
|
||||
"original"
|
||||
],
|
||||
"type": "string"
|
||||
},
|
||||
"InitializeCapabilities": {
|
||||
"description": "Client-declared capabilities negotiated during initialize.",
|
||||
"properties": {
|
||||
|
||||
@@ -3478,6 +3478,16 @@
|
||||
},
|
||||
{
|
||||
"properties": {
|
||||
"detail": {
|
||||
"anyOf": [
|
||||
{
|
||||
"$ref": "#/definitions/ImageDetail"
|
||||
},
|
||||
{
|
||||
"type": "null"
|
||||
}
|
||||
]
|
||||
},
|
||||
"image_url": {
|
||||
"type": "string"
|
||||
},
|
||||
@@ -3569,6 +3579,15 @@
|
||||
],
|
||||
"type": "object"
|
||||
},
|
||||
"ImageDetail": {
|
||||
"enum": [
|
||||
"auto",
|
||||
"low",
|
||||
"high",
|
||||
"original"
|
||||
],
|
||||
"type": "string"
|
||||
},
|
||||
"LocalShellAction": {
|
||||
"oneOf": [
|
||||
{
|
||||
|
||||
@@ -9700,6 +9700,16 @@
|
||||
},
|
||||
{
|
||||
"properties": {
|
||||
"detail": {
|
||||
"anyOf": [
|
||||
{
|
||||
"$ref": "#/definitions/v2/ImageDetail"
|
||||
},
|
||||
{
|
||||
"type": "null"
|
||||
}
|
||||
]
|
||||
},
|
||||
"image_url": {
|
||||
"type": "string"
|
||||
},
|
||||
@@ -9865,6 +9875,15 @@
|
||||
],
|
||||
"type": "string"
|
||||
},
|
||||
"ImageDetail": {
|
||||
"enum": [
|
||||
"auto",
|
||||
"low",
|
||||
"high",
|
||||
"original"
|
||||
],
|
||||
"type": "string"
|
||||
},
|
||||
"InputModality": {
|
||||
"description": "Canonical user-input modality tags advertised by a model.",
|
||||
"oneOf": [
|
||||
|
||||
@@ -6542,6 +6542,16 @@
|
||||
},
|
||||
{
|
||||
"properties": {
|
||||
"detail": {
|
||||
"anyOf": [
|
||||
{
|
||||
"$ref": "#/definitions/ImageDetail"
|
||||
},
|
||||
{
|
||||
"type": "null"
|
||||
}
|
||||
]
|
||||
},
|
||||
"image_url": {
|
||||
"type": "string"
|
||||
},
|
||||
@@ -6828,6 +6838,15 @@
|
||||
],
|
||||
"type": "object"
|
||||
},
|
||||
"ImageDetail": {
|
||||
"enum": [
|
||||
"auto",
|
||||
"low",
|
||||
"high",
|
||||
"original"
|
||||
],
|
||||
"type": "string"
|
||||
},
|
||||
"InitializeCapabilities": {
|
||||
"description": "Client-declared capabilities negotiated during initialize.",
|
||||
"properties": {
|
||||
|
||||
@@ -103,6 +103,16 @@
|
||||
},
|
||||
{
|
||||
"properties": {
|
||||
"detail": {
|
||||
"anyOf": [
|
||||
{
|
||||
"$ref": "#/definitions/ImageDetail"
|
||||
},
|
||||
{
|
||||
"type": "null"
|
||||
}
|
||||
]
|
||||
},
|
||||
"image_url": {
|
||||
"type": "string"
|
||||
},
|
||||
@@ -173,6 +183,15 @@
|
||||
],
|
||||
"type": "object"
|
||||
},
|
||||
"ImageDetail": {
|
||||
"enum": [
|
||||
"auto",
|
||||
"low",
|
||||
"high",
|
||||
"original"
|
||||
],
|
||||
"type": "string"
|
||||
},
|
||||
"LocalShellAction": {
|
||||
"oneOf": [
|
||||
{
|
||||
|
||||
@@ -145,6 +145,16 @@
|
||||
},
|
||||
{
|
||||
"properties": {
|
||||
"detail": {
|
||||
"anyOf": [
|
||||
{
|
||||
"$ref": "#/definitions/ImageDetail"
|
||||
},
|
||||
{
|
||||
"type": "null"
|
||||
}
|
||||
]
|
||||
},
|
||||
"image_url": {
|
||||
"type": "string"
|
||||
},
|
||||
@@ -215,6 +225,15 @@
|
||||
],
|
||||
"type": "object"
|
||||
},
|
||||
"ImageDetail": {
|
||||
"enum": [
|
||||
"auto",
|
||||
"low",
|
||||
"high",
|
||||
"original"
|
||||
],
|
||||
"type": "string"
|
||||
},
|
||||
"LocalShellAction": {
|
||||
"oneOf": [
|
||||
{
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
// GENERATED CODE! DO NOT MODIFY BY HAND!
|
||||
|
||||
// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually.
|
||||
import type { ImageDetail } from "./ImageDetail";
|
||||
|
||||
/**
|
||||
* Responses API compatible content items that can be returned by a tool call.
|
||||
* This is a subset of ContentItem with the types we support as function call outputs.
|
||||
*/
|
||||
export type FunctionCallOutputContentItem = { "type": "input_text", text: string, } | { "type": "input_image", image_url: string, };
|
||||
export type FunctionCallOutputContentItem = { "type": "input_text", text: string, } | { "type": "input_image", image_url: string, detail?: ImageDetail, };
|
||||
|
||||
@@ -0,0 +1,5 @@
|
||||
// GENERATED CODE! DO NOT MODIFY BY HAND!
|
||||
|
||||
// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually.
|
||||
|
||||
export type ImageDetail = "auto" | "low" | "high" | "original";
|
||||
@@ -83,6 +83,7 @@ export type { GitDiffToRemoteParams } from "./GitDiffToRemoteParams";
|
||||
export type { GitDiffToRemoteResponse } from "./GitDiffToRemoteResponse";
|
||||
export type { GitSha } from "./GitSha";
|
||||
export type { HistoryEntry } from "./HistoryEntry";
|
||||
export type { ImageDetail } from "./ImageDetail";
|
||||
export type { InitializeCapabilities } from "./InitializeCapabilities";
|
||||
export type { InitializeParams } from "./InitializeParams";
|
||||
export type { InitializeResponse } from "./InitializeResponse";
|
||||
|
||||
@@ -38,6 +38,7 @@ fn preset_to_info(preset: &ModelPreset, priority: i32) -> ModelInfo {
|
||||
apply_patch_tool_type: None,
|
||||
truncation_policy: TruncationPolicyConfig::bytes(10_000),
|
||||
supports_parallel_tool_calls: false,
|
||||
supports_image_detail_original: false,
|
||||
context_window: Some(272_000),
|
||||
auto_compact_token_limit: None,
|
||||
effective_context_window_percent: 95,
|
||||
|
||||
@@ -399,7 +399,10 @@ async fn dynamic_tool_call_round_trip_sends_content_items_to_model() -> Result<(
|
||||
FunctionCallOutputContentItem::InputText { text }
|
||||
}
|
||||
DynamicToolCallOutputContentItem::InputImage { image_url } => {
|
||||
FunctionCallOutputContentItem::InputImage { image_url }
|
||||
FunctionCallOutputContentItem::InputImage {
|
||||
image_url,
|
||||
detail: None,
|
||||
}
|
||||
}
|
||||
})
|
||||
.collect::<Vec<FunctionCallOutputContentItem>>();
|
||||
|
||||
@@ -208,6 +208,7 @@ mod tests {
|
||||
"apply_patch_tool_type": null,
|
||||
"truncation_policy": {"mode": "bytes", "limit": 10_000},
|
||||
"supports_parallel_tool_calls": false,
|
||||
"supports_image_detail_original": false,
|
||||
"context_window": 272_000,
|
||||
"experimental_supported_tools": [],
|
||||
}))
|
||||
|
||||
@@ -86,6 +86,7 @@ async fn models_client_hits_models_endpoint() {
|
||||
apply_patch_tool_type: None,
|
||||
truncation_policy: TruncationPolicyConfig::bytes(10_000),
|
||||
supports_parallel_tool_calls: false,
|
||||
supports_image_detail_original: false,
|
||||
context_window: Some(272_000),
|
||||
auto_compact_token_limit: None,
|
||||
effective_context_window_percent: 95,
|
||||
|
||||
@@ -48,6 +48,8 @@ codex-protocol = { workspace = true }
|
||||
codex-rmcp-client = { workspace = true }
|
||||
codex-state = { workspace = true }
|
||||
codex-utils-absolute-path = { workspace = true }
|
||||
codex-utils-cache = { workspace = true }
|
||||
codex-utils-image = { workspace = true }
|
||||
codex-utils-home-dir = { workspace = true }
|
||||
codex-utils-pty = { workspace = true }
|
||||
codex-utils-readiness = { workspace = true }
|
||||
@@ -64,6 +66,7 @@ eventsource-stream = { workspace = true }
|
||||
futures = { workspace = true }
|
||||
http = { workspace = true }
|
||||
iana-time-zone = { workspace = true }
|
||||
image = { workspace = true, features = ["jpeg", "png", "webp"] }
|
||||
indexmap = { workspace = true }
|
||||
keyring = { workspace = true, features = ["crypto-rust"] }
|
||||
libc = { workspace = true }
|
||||
@@ -88,7 +91,6 @@ sha2 = { workspace = true }
|
||||
shlex = { workspace = true }
|
||||
similar = { workspace = true }
|
||||
tempfile = { workspace = true }
|
||||
test-case = "3.3.1"
|
||||
test-log = { workspace = true }
|
||||
thiserror = { workspace = true }
|
||||
time = { workspace = true, features = [
|
||||
@@ -157,11 +159,11 @@ codex-test-macros = { workspace = true }
|
||||
codex-utils-cargo-bin = { workspace = true }
|
||||
core_test_support = { workspace = true }
|
||||
ctor = { workspace = true }
|
||||
image = { workspace = true, features = ["jpeg", "png"] }
|
||||
insta = { workspace = true }
|
||||
maplit = { workspace = true }
|
||||
predicates = { workspace = true }
|
||||
pretty_assertions = { workspace = true }
|
||||
test-case = "3.3.1"
|
||||
opentelemetry_sdk = { workspace = true, features = [
|
||||
"experimental_metrics_custom_reader",
|
||||
"metrics",
|
||||
|
||||
@@ -355,6 +355,9 @@
|
||||
"fast_mode": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"image_detail_original": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"include_apply_patch_tool": {
|
||||
"type": "boolean"
|
||||
},
|
||||
@@ -1739,6 +1742,9 @@
|
||||
"fast_mode": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"image_detail_original": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"include_apply_patch_tool": {
|
||||
"type": "boolean"
|
||||
},
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
"limit": 10000
|
||||
},
|
||||
"supports_parallel_tool_calls": true,
|
||||
"supports_image_detail_original": true,
|
||||
"context_window": 272000,
|
||||
"reasoning_summary_format": "experimental",
|
||||
"slug": "gpt-5.3-codex",
|
||||
@@ -83,6 +84,7 @@
|
||||
"limit": 10000
|
||||
},
|
||||
"supports_parallel_tool_calls": true,
|
||||
"supports_image_detail_original": false,
|
||||
"context_window": 272000,
|
||||
"reasoning_summary_format": "experimental",
|
||||
"slug": "gpt-5.2-codex",
|
||||
@@ -155,6 +157,7 @@
|
||||
"limit": 10000
|
||||
},
|
||||
"supports_parallel_tool_calls": false,
|
||||
"supports_image_detail_original": false,
|
||||
"context_window": 272000,
|
||||
"reasoning_summary_format": "experimental",
|
||||
"slug": "gpt-5.1-codex-max",
|
||||
@@ -220,6 +223,7 @@
|
||||
"limit": 10000
|
||||
},
|
||||
"supports_parallel_tool_calls": false,
|
||||
"supports_image_detail_original": false,
|
||||
"context_window": 272000,
|
||||
"reasoning_summary_format": "experimental",
|
||||
"slug": "gpt-5.1-codex",
|
||||
@@ -281,6 +285,7 @@
|
||||
"limit": 10000
|
||||
},
|
||||
"supports_parallel_tool_calls": true,
|
||||
"supports_image_detail_original": false,
|
||||
"context_window": 272000,
|
||||
"reasoning_summary_format": "none",
|
||||
"slug": "gpt-5.2",
|
||||
@@ -346,6 +351,7 @@
|
||||
"limit": 10000
|
||||
},
|
||||
"supports_parallel_tool_calls": true,
|
||||
"supports_image_detail_original": false,
|
||||
"context_window": 272000,
|
||||
"reasoning_summary_format": "none",
|
||||
"slug": "gpt-5.1",
|
||||
@@ -407,6 +413,7 @@
|
||||
"limit": 10000
|
||||
},
|
||||
"supports_parallel_tool_calls": false,
|
||||
"supports_image_detail_original": false,
|
||||
"context_window": 272000,
|
||||
"reasoning_summary_format": "experimental",
|
||||
"slug": "gpt-5-codex",
|
||||
@@ -468,6 +475,7 @@
|
||||
"limit": 10000
|
||||
},
|
||||
"supports_parallel_tool_calls": false,
|
||||
"supports_image_detail_original": false,
|
||||
"context_window": 272000,
|
||||
"reasoning_summary_format": "none",
|
||||
"slug": "gpt-5",
|
||||
@@ -532,6 +540,7 @@
|
||||
"limit": 10000
|
||||
},
|
||||
"supports_parallel_tool_calls": false,
|
||||
"supports_image_detail_original": false,
|
||||
"context_window": 128000,
|
||||
"reasoning_summary_format": "none",
|
||||
"slug": "gpt-oss-120b",
|
||||
@@ -589,6 +598,7 @@
|
||||
"limit": 10000
|
||||
},
|
||||
"supports_parallel_tool_calls": false,
|
||||
"supports_image_detail_original": false,
|
||||
"context_window": 128000,
|
||||
"reasoning_summary_format": "none",
|
||||
"slug": "gpt-oss-20b",
|
||||
@@ -647,6 +657,7 @@
|
||||
"limit": 10000
|
||||
},
|
||||
"supports_parallel_tool_calls": false,
|
||||
"supports_image_detail_original": false,
|
||||
"context_window": 272000,
|
||||
"reasoning_summary_format": "experimental",
|
||||
"slug": "gpt-5.1-codex-mini",
|
||||
@@ -704,6 +715,7 @@
|
||||
"limit": 10000
|
||||
},
|
||||
"supports_parallel_tool_calls": false,
|
||||
"supports_image_detail_original": false,
|
||||
"context_window": 272000,
|
||||
"reasoning_summary_format": "experimental",
|
||||
"slug": "gpt-5-codex-mini",
|
||||
|
||||
@@ -1305,6 +1305,7 @@ mod tests {
|
||||
"apply_patch_tool_type": null,
|
||||
"truncation_policy": {"mode": "bytes", "limit": 10000},
|
||||
"supports_parallel_tool_calls": false,
|
||||
"supports_image_detail_original": false,
|
||||
"context_window": 272000,
|
||||
"auto_compact_token_limit": null,
|
||||
"experimental_supported_tools": []
|
||||
|
||||
@@ -2,21 +2,29 @@ use crate::codex::TurnContext;
|
||||
use crate::context_manager::normalize;
|
||||
use crate::event_mapping::is_contextual_user_message_content;
|
||||
use crate::truncate::TruncationPolicy;
|
||||
use crate::truncate::approx_bytes_for_tokens;
|
||||
use crate::truncate::approx_token_count;
|
||||
use crate::truncate::approx_tokens_from_byte_count_i64;
|
||||
use crate::truncate::truncate_function_output_items_with_policy;
|
||||
use crate::truncate::truncate_text;
|
||||
use base64::Engine;
|
||||
use base64::engine::general_purpose::STANDARD as BASE64_STANDARD;
|
||||
use codex_protocol::models::BaseInstructions;
|
||||
use codex_protocol::models::ContentItem;
|
||||
use codex_protocol::models::FunctionCallOutputBody;
|
||||
use codex_protocol::models::FunctionCallOutputContentItem;
|
||||
use codex_protocol::models::FunctionCallOutputPayload;
|
||||
use codex_protocol::models::ImageDetail;
|
||||
use codex_protocol::models::ResponseItem;
|
||||
use codex_protocol::openai_models::InputModality;
|
||||
use codex_protocol::protocol::TokenUsage;
|
||||
use codex_protocol::protocol::TokenUsageInfo;
|
||||
use codex_protocol::protocol::TurnContextItem;
|
||||
use codex_utils_cache::BlockingLruCache;
|
||||
use codex_utils_cache::sha1_digest;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::ops::Deref;
|
||||
use std::sync::LazyLock;
|
||||
|
||||
/// Transcript of thread history
|
||||
#[derive(Debug, Clone, Default)]
|
||||
@@ -428,7 +436,19 @@ fn estimate_item_token_count(item: &ResponseItem) -> i64 {
|
||||
///
|
||||
/// The estimator later converts bytes to tokens using a 4-bytes/token heuristic
|
||||
/// with ceiling division, so 7,373 bytes maps to approximately 1,844 tokens.
|
||||
const IMAGE_BYTES_ESTIMATE: i64 = 7373;
|
||||
const RESIZED_IMAGE_BYTES_ESTIMATE: i64 = 7373;
|
||||
// See https://platform.openai.com/docs/guides/images-vision#calculating-costs.
|
||||
// Use a direct 32px patch count only for `detail: "original"`;
|
||||
// all other image inputs continue to use `RESIZED_IMAGE_BYTES_ESTIMATE`.
|
||||
const ORIGINAL_IMAGE_PATCH_SIZE: u32 = 32;
|
||||
const ORIGINAL_IMAGE_ESTIMATE_CACHE_SIZE: usize = 32;
|
||||
|
||||
static ORIGINAL_IMAGE_ESTIMATE_CACHE: LazyLock<BlockingLruCache<[u8; 20], Option<i64>>> =
|
||||
LazyLock::new(|| {
|
||||
BlockingLruCache::new(
|
||||
NonZeroUsize::new(ORIGINAL_IMAGE_ESTIMATE_CACHE_SIZE).unwrap_or(NonZeroUsize::MIN),
|
||||
)
|
||||
});
|
||||
|
||||
pub(crate) fn estimate_response_item_model_visible_bytes(item: &ResponseItem) -> i64 {
|
||||
match item {
|
||||
@@ -444,15 +464,15 @@ pub(crate) fn estimate_response_item_model_visible_bytes(item: &ResponseItem) ->
|
||||
let raw = serde_json::to_string(item)
|
||||
.map(|serialized| i64::try_from(serialized.len()).unwrap_or(i64::MAX))
|
||||
.unwrap_or_default();
|
||||
let (payload_bytes, image_count) = image_data_url_estimate_adjustment(item);
|
||||
if payload_bytes == 0 || image_count == 0 {
|
||||
let (payload_bytes, replacement_bytes) = image_data_url_estimate_adjustment(item);
|
||||
if payload_bytes == 0 || replacement_bytes == 0 {
|
||||
raw
|
||||
} else {
|
||||
// Replace raw base64 payload bytes with a fixed per-image cost.
|
||||
// We intentionally preserve the data URL prefix and JSON wrapper
|
||||
// bytes already included in `raw`.
|
||||
// Replace raw base64 payload bytes with a per-image estimate.
|
||||
// We intentionally preserve the data URL prefix and JSON
|
||||
// wrapper bytes already included in `raw`.
|
||||
raw.saturating_sub(payload_bytes)
|
||||
.saturating_add(image_count.saturating_mul(IMAGE_BYTES_ESTIMATE))
|
||||
.saturating_add(replacement_bytes)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -463,7 +483,7 @@ pub(crate) fn estimate_response_item_model_visible_bytes(item: &ResponseItem) ->
|
||||
///
|
||||
/// We only discount payloads for `data:image/...;base64,...` URLs (case
|
||||
/// insensitive markers) and leave everything else at raw serialized size.
|
||||
fn base64_data_url_payload_len(url: &str) -> Option<usize> {
|
||||
fn parse_base64_image_data_url(url: &str) -> Option<&str> {
|
||||
if !url
|
||||
.get(.."data:".len())
|
||||
.is_some_and(|prefix| prefix.eq_ignore_ascii_case("data:"))
|
||||
@@ -489,22 +509,62 @@ fn base64_data_url_payload_len(url: &str) -> Option<usize> {
|
||||
if !has_base64_marker {
|
||||
return None;
|
||||
}
|
||||
Some(payload.len())
|
||||
Some(payload)
|
||||
}
|
||||
|
||||
fn estimate_original_image_bytes(image_url: &str) -> Option<i64> {
|
||||
let key = sha1_digest(image_url.as_bytes());
|
||||
ORIGINAL_IMAGE_ESTIMATE_CACHE.get_or_insert_with(key, || {
|
||||
let payload = match parse_base64_image_data_url(image_url) {
|
||||
Some(payload) => payload,
|
||||
None => {
|
||||
tracing::trace!("skipping original-detail estimate for non-base64 image data URL");
|
||||
return None;
|
||||
}
|
||||
};
|
||||
let bytes = match BASE64_STANDARD.decode(payload) {
|
||||
Ok(bytes) => bytes,
|
||||
Err(error) => {
|
||||
tracing::trace!("failed to decode original-detail image payload: {error}");
|
||||
return None;
|
||||
}
|
||||
};
|
||||
let dynamic = match image::load_from_memory(&bytes) {
|
||||
Ok(dynamic) => dynamic,
|
||||
Err(error) => {
|
||||
tracing::trace!("failed to decode original-detail image bytes: {error}");
|
||||
return None;
|
||||
}
|
||||
};
|
||||
let width = i64::from(dynamic.width());
|
||||
let height = i64::from(dynamic.height());
|
||||
let patch_size = i64::from(ORIGINAL_IMAGE_PATCH_SIZE);
|
||||
let patches_wide = width.saturating_add(patch_size.saturating_sub(1)) / patch_size;
|
||||
let patches_high = height.saturating_add(patch_size.saturating_sub(1)) / patch_size;
|
||||
let patch_count = patches_wide.saturating_mul(patches_high);
|
||||
let patch_count = usize::try_from(patch_count).unwrap_or(usize::MAX);
|
||||
Some(i64::try_from(approx_bytes_for_tokens(patch_count)).unwrap_or(i64::MAX))
|
||||
})
|
||||
}
|
||||
|
||||
/// Scans one response item for discount-eligible inline image data URLs and
|
||||
/// returns:
|
||||
/// - total base64 payload bytes to subtract from raw serialized size
|
||||
/// - count of qualifying images to replace with `IMAGE_BYTES_ESTIMATE`
|
||||
/// - total replacement byte estimate for those images
|
||||
fn image_data_url_estimate_adjustment(item: &ResponseItem) -> (i64, i64) {
|
||||
let mut payload_bytes = 0i64;
|
||||
let mut image_count = 0i64;
|
||||
let mut replacement_bytes = 0i64;
|
||||
|
||||
let mut accumulate = |image_url: &str| {
|
||||
if let Some(payload_len) = base64_data_url_payload_len(image_url) {
|
||||
let mut accumulate = |image_url: &str, detail: Option<ImageDetail>| {
|
||||
if let Some(payload_len) = parse_base64_image_data_url(image_url).map(str::len) {
|
||||
payload_bytes =
|
||||
payload_bytes.saturating_add(i64::try_from(payload_len).unwrap_or(i64::MAX));
|
||||
image_count = image_count.saturating_add(1);
|
||||
replacement_bytes = replacement_bytes.saturating_add(match detail {
|
||||
Some(ImageDetail::Original) => {
|
||||
estimate_original_image_bytes(image_url).unwrap_or(RESIZED_IMAGE_BYTES_ESTIMATE)
|
||||
}
|
||||
_ => RESIZED_IMAGE_BYTES_ESTIMATE,
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
@@ -512,7 +572,7 @@ fn image_data_url_estimate_adjustment(item: &ResponseItem) -> (i64, i64) {
|
||||
ResponseItem::Message { content, .. } => {
|
||||
for content_item in content {
|
||||
if let ContentItem::InputImage { image_url } = content_item {
|
||||
accumulate(image_url);
|
||||
accumulate(image_url, None);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -520,8 +580,10 @@ fn image_data_url_estimate_adjustment(item: &ResponseItem) -> (i64, i64) {
|
||||
| ResponseItem::CustomToolCallOutput { output, .. } => {
|
||||
if let FunctionCallOutputBody::ContentItems(items) = &output.body {
|
||||
for content_item in items {
|
||||
if let FunctionCallOutputContentItem::InputImage { image_url } = content_item {
|
||||
accumulate(image_url);
|
||||
if let FunctionCallOutputContentItem::InputImage { image_url, detail } =
|
||||
content_item
|
||||
{
|
||||
accumulate(image_url, *detail);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -529,7 +591,7 @@ fn image_data_url_estimate_adjustment(item: &ResponseItem) -> (i64, i64) {
|
||||
_ => {}
|
||||
}
|
||||
|
||||
(payload_bytes, image_count)
|
||||
(payload_bytes, replacement_bytes)
|
||||
}
|
||||
|
||||
fn is_model_generated_item(item: &ResponseItem) -> bool {
|
||||
|
||||
@@ -1,12 +1,15 @@
|
||||
use super::*;
|
||||
use crate::truncate;
|
||||
use crate::truncate::TruncationPolicy;
|
||||
use base64::Engine;
|
||||
use base64::engine::general_purpose::STANDARD as BASE64_STANDARD;
|
||||
use codex_git::GhostCommit;
|
||||
use codex_protocol::models::BaseInstructions;
|
||||
use codex_protocol::models::ContentItem;
|
||||
use codex_protocol::models::FunctionCallOutputBody;
|
||||
use codex_protocol::models::FunctionCallOutputContentItem;
|
||||
use codex_protocol::models::FunctionCallOutputPayload;
|
||||
use codex_protocol::models::ImageDetail;
|
||||
use codex_protocol::models::LocalShellAction;
|
||||
use codex_protocol::models::LocalShellExecAction;
|
||||
use codex_protocol::models::LocalShellStatus;
|
||||
@@ -14,6 +17,9 @@ use codex_protocol::models::ReasoningItemContent;
|
||||
use codex_protocol::models::ReasoningItemReasoningSummary;
|
||||
use codex_protocol::openai_models::InputModality;
|
||||
use codex_protocol::openai_models::default_input_modalities;
|
||||
use image::ImageBuffer;
|
||||
use image::ImageFormat;
|
||||
use image::Rgba;
|
||||
use pretty_assertions::assert_eq;
|
||||
use regex_lite::Regex;
|
||||
|
||||
@@ -276,6 +282,7 @@ fn for_prompt_strips_images_when_model_does_not_support_images() {
|
||||
},
|
||||
FunctionCallOutputContentItem::InputImage {
|
||||
image_url: "https://example.com/result.png".to_string(),
|
||||
detail: None,
|
||||
},
|
||||
]),
|
||||
},
|
||||
@@ -294,6 +301,7 @@ fn for_prompt_strips_images_when_model_does_not_support_images() {
|
||||
},
|
||||
FunctionCallOutputContentItem::InputImage {
|
||||
image_url: "https://example.com/js-repl-result.png".to_string(),
|
||||
detail: None,
|
||||
},
|
||||
]),
|
||||
},
|
||||
@@ -489,6 +497,7 @@ fn replace_last_turn_images_replaces_tool_output_images() {
|
||||
body: FunctionCallOutputBody::ContentItems(vec![
|
||||
FunctionCallOutputContentItem::InputImage {
|
||||
image_url: "data:image/png;base64,AAA".to_string(),
|
||||
detail: None,
|
||||
},
|
||||
]),
|
||||
success: Some(true),
|
||||
@@ -1302,7 +1311,7 @@ fn image_data_url_payload_does_not_dominate_message_estimate() {
|
||||
|
||||
let raw_len = serde_json::to_string(&image_item).unwrap().len() as i64;
|
||||
let estimated = estimate_response_item_model_visible_bytes(&image_item);
|
||||
let expected = raw_len - payload.len() as i64 + IMAGE_BYTES_ESTIMATE;
|
||||
let expected = raw_len - payload.len() as i64 + RESIZED_IMAGE_BYTES_ESTIMATE;
|
||||
let text_only_estimated = estimate_response_item_model_visible_bytes(&text_only_item);
|
||||
|
||||
assert_eq!(estimated, expected);
|
||||
@@ -1320,13 +1329,16 @@ fn image_data_url_payload_does_not_dominate_function_call_output_estimate() {
|
||||
FunctionCallOutputContentItem::InputText {
|
||||
text: "Screenshot captured".to_string(),
|
||||
},
|
||||
FunctionCallOutputContentItem::InputImage { image_url },
|
||||
FunctionCallOutputContentItem::InputImage {
|
||||
image_url,
|
||||
detail: None,
|
||||
},
|
||||
]),
|
||||
};
|
||||
|
||||
let raw_len = serde_json::to_string(&item).unwrap().len() as i64;
|
||||
let estimated = estimate_response_item_model_visible_bytes(&item);
|
||||
let expected = raw_len - payload.len() as i64 + IMAGE_BYTES_ESTIMATE;
|
||||
let expected = raw_len - payload.len() as i64 + RESIZED_IMAGE_BYTES_ESTIMATE;
|
||||
|
||||
assert_eq!(estimated, expected);
|
||||
assert!(estimated < raw_len);
|
||||
@@ -1342,13 +1354,16 @@ fn image_data_url_payload_does_not_dominate_custom_tool_call_output_estimate() {
|
||||
FunctionCallOutputContentItem::InputText {
|
||||
text: "Screenshot captured".to_string(),
|
||||
},
|
||||
FunctionCallOutputContentItem::InputImage { image_url },
|
||||
FunctionCallOutputContentItem::InputImage {
|
||||
image_url,
|
||||
detail: None,
|
||||
},
|
||||
]),
|
||||
};
|
||||
|
||||
let raw_len = serde_json::to_string(&item).unwrap().len() as i64;
|
||||
let estimated = estimate_response_item_model_visible_bytes(&item);
|
||||
let expected = raw_len - payload.len() as i64 + IMAGE_BYTES_ESTIMATE;
|
||||
let expected = raw_len - payload.len() as i64 + RESIZED_IMAGE_BYTES_ESTIMATE;
|
||||
|
||||
assert_eq!(estimated, expected);
|
||||
assert!(estimated < raw_len);
|
||||
@@ -1370,6 +1385,7 @@ fn non_base64_image_urls_are_unchanged() {
|
||||
output: FunctionCallOutputPayload::from_content_items(vec![
|
||||
FunctionCallOutputContentItem::InputImage {
|
||||
image_url: "file:///tmp/foo.png".to_string(),
|
||||
detail: None,
|
||||
},
|
||||
]),
|
||||
};
|
||||
@@ -1409,7 +1425,10 @@ fn non_image_base64_data_url_is_unchanged() {
|
||||
let item = ResponseItem::FunctionCallOutput {
|
||||
call_id: "call-octet".to_string(),
|
||||
output: FunctionCallOutputPayload::from_content_items(vec![
|
||||
FunctionCallOutputContentItem::InputImage { image_url },
|
||||
FunctionCallOutputContentItem::InputImage {
|
||||
image_url,
|
||||
detail: None,
|
||||
},
|
||||
]),
|
||||
};
|
||||
|
||||
@@ -1433,7 +1452,7 @@ fn mixed_case_data_url_markers_are_adjusted() {
|
||||
|
||||
let raw_len = serde_json::to_string(&item).unwrap().len() as i64;
|
||||
let estimated = estimate_response_item_model_visible_bytes(&item);
|
||||
let expected = raw_len - payload.len() as i64 + IMAGE_BYTES_ESTIMATE;
|
||||
let expected = raw_len - payload.len() as i64 + RESIZED_IMAGE_BYTES_ESTIMATE;
|
||||
|
||||
assert_eq!(estimated, expected);
|
||||
}
|
||||
@@ -1465,7 +1484,70 @@ fn multiple_inline_images_apply_multiple_fixed_costs() {
|
||||
let raw_len = serde_json::to_string(&item).unwrap().len() as i64;
|
||||
let payload_sum = (payload_one.len() + payload_two.len()) as i64;
|
||||
let estimated = estimate_response_item_model_visible_bytes(&item);
|
||||
let expected = raw_len - payload_sum + (2 * IMAGE_BYTES_ESTIMATE);
|
||||
let expected = raw_len - payload_sum + (2 * RESIZED_IMAGE_BYTES_ESTIMATE);
|
||||
|
||||
assert_eq!(estimated, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn original_detail_images_scale_with_dimensions() {
|
||||
// 2304x864 at 32px patches yields 72 * 27 = 1,944 patches.
|
||||
// The byte heuristic uses 4 bytes per token, so the replacement cost is 7,776 bytes.
|
||||
const EXPECTED_ORIGINAL_DETAIL_IMAGE_BYTES: i64 = 7_776;
|
||||
|
||||
let width = 2304;
|
||||
let height = 864;
|
||||
let image = ImageBuffer::from_pixel(width, height, Rgba([12u8, 34, 56, 255]));
|
||||
let mut bytes = std::io::Cursor::new(Vec::new());
|
||||
image
|
||||
.write_to(&mut bytes, ImageFormat::Png)
|
||||
.expect("encode png");
|
||||
let payload = BASE64_STANDARD.encode(bytes.get_ref());
|
||||
let image_url = format!("data:image/png;base64,{payload}");
|
||||
let item = ResponseItem::FunctionCallOutput {
|
||||
call_id: "call-original".to_string(),
|
||||
output: FunctionCallOutputPayload::from_content_items(vec![
|
||||
FunctionCallOutputContentItem::InputImage {
|
||||
image_url,
|
||||
detail: Some(ImageDetail::Original),
|
||||
},
|
||||
]),
|
||||
};
|
||||
|
||||
let raw_len = serde_json::to_string(&item).unwrap().len() as i64;
|
||||
let estimated = estimate_response_item_model_visible_bytes(&item);
|
||||
let expected = raw_len - payload.len() as i64 + EXPECTED_ORIGINAL_DETAIL_IMAGE_BYTES;
|
||||
|
||||
assert_eq!(estimated, expected);
|
||||
}
|
||||
|
||||
/// WebP counterpart of the PNG original-detail estimate test: the same 2304x864 canvas encoded
/// as WebP must be charged the same replacement cost.
#[test]
fn original_detail_webp_images_scale_with_dimensions() {
    // The dimensions match the PNG case, so the patch-based replacement cost is unchanged.
    const EXPECTED_ORIGINAL_DETAIL_IMAGE_BYTES: i64 = 7_776;

    // Encode a solid-colour WebP entirely in memory.
    let mut webp = std::io::Cursor::new(Vec::new());
    ImageBuffer::from_pixel(2304, 864, Rgba([12u8, 34, 56, 255]))
        .write_to(&mut webp, ImageFormat::WebP)
        .expect("encode webp");
    let encoded = BASE64_STANDARD.encode(webp.get_ref());
    let encoded_len = encoded.len() as i64;

    let content = vec![FunctionCallOutputContentItem::InputImage {
        image_url: format!("data:image/webp;base64,{encoded}"),
        detail: Some(ImageDetail::Original),
    }];
    let item = ResponseItem::FunctionCallOutput {
        call_id: String::from("call-original-webp"),
        output: FunctionCallOutputPayload::from_content_items(content),
    };

    let serialized_len = serde_json::to_string(&item).unwrap().len() as i64;
    let estimated = estimate_response_item_model_visible_bytes(&item);

    // Only the base64 payload is replaced; the rest of the serialized item counts as-is.
    assert_eq!(
        estimated,
        serialized_len - encoded_len + EXPECTED_ORIGINAL_DETAIL_IMAGE_BYTES
    );
}
|
||||
|
||||
@@ -119,6 +119,8 @@ pub enum Feature {
|
||||
MemoryTool,
|
||||
/// Append additional AGENTS.md guidance to user instructions.
|
||||
ChildAgentsMd,
|
||||
/// Allow `detail: "original"` image outputs on supported models.
|
||||
ImageDetailOriginal,
|
||||
/// Enforce UTF8 output in Powershell.
|
||||
PowershellUtf8,
|
||||
/// Compress request bodies (zstd) when sending streaming requests to codex-backend.
|
||||
@@ -529,6 +531,12 @@ pub const FEATURES: &[FeatureSpec] = &[
|
||||
stage: Stage::UnderDevelopment,
|
||||
default_enabled: false,
|
||||
},
|
||||
FeatureSpec {
|
||||
id: Feature::ImageDetailOriginal,
|
||||
key: "image_detail_original",
|
||||
stage: Stage::UnderDevelopment,
|
||||
default_enabled: false,
|
||||
},
|
||||
FeatureSpec {
|
||||
id: Feature::ApplyPatchFreeform,
|
||||
key: "apply_patch_freeform",
|
||||
|
||||
@@ -471,6 +471,7 @@ mod tests {
|
||||
"apply_patch_tool_type": null,
|
||||
"truncation_policy": {"mode": "bytes", "limit": 10_000},
|
||||
"supports_parallel_tool_calls": false,
|
||||
"supports_image_detail_original": false,
|
||||
"context_window": 272_000,
|
||||
"experimental_supported_tools": [],
|
||||
}))
|
||||
@@ -549,6 +550,8 @@ mod tests {
|
||||
.build()
|
||||
.await
|
||||
.expect("load default test config");
|
||||
let mut overlay = remote_model("gpt-overlay", "Overlay", 0);
|
||||
overlay.supports_image_detail_original = true;
|
||||
|
||||
let auth_manager =
|
||||
AuthManager::from_auth_for_testing(CodexAuth::from_api_key("Test API Key"));
|
||||
@@ -556,7 +559,7 @@ mod tests {
|
||||
codex_home.path().to_path_buf(),
|
||||
auth_manager,
|
||||
Some(ModelsResponse {
|
||||
models: vec![remote_model("gpt-overlay", "Overlay", 0)],
|
||||
models: vec![overlay],
|
||||
}),
|
||||
CollaborationModesConfig::default(),
|
||||
);
|
||||
@@ -568,6 +571,7 @@ mod tests {
|
||||
assert_eq!(model_info.slug, "gpt-overlay-experiment");
|
||||
assert_eq!(model_info.display_name, "Overlay");
|
||||
assert_eq!(model_info.context_window, Some(272_000));
|
||||
assert!(model_info.supports_image_detail_original);
|
||||
assert!(!model_info.supports_parallel_tool_calls);
|
||||
assert!(!model_info.used_fallback_model_metadata);
|
||||
}
|
||||
@@ -580,26 +584,24 @@ mod tests {
|
||||
.build()
|
||||
.await
|
||||
.expect("load default test config");
|
||||
let mut remote = remote_model("gpt-image", "Image", 0);
|
||||
remote.supports_image_detail_original = true;
|
||||
let auth_manager =
|
||||
AuthManager::from_auth_for_testing(CodexAuth::from_api_key("Test API Key"));
|
||||
let manager = ModelsManager::new(
|
||||
codex_home.path().to_path_buf(),
|
||||
auth_manager,
|
||||
None,
|
||||
Some(ModelsResponse {
|
||||
models: vec![remote],
|
||||
}),
|
||||
CollaborationModesConfig::default(),
|
||||
);
|
||||
let known_slug = manager
|
||||
.get_remote_models()
|
||||
.await
|
||||
.first()
|
||||
.expect("bundled models should include at least one model")
|
||||
.slug
|
||||
.clone();
|
||||
let namespaced_model = format!("custom/{known_slug}");
|
||||
let namespaced_model = "custom/gpt-image".to_string();
|
||||
|
||||
let model_info = manager.get_model_info(&namespaced_model, &config).await;
|
||||
|
||||
assert_eq!(model_info.slug, namespaced_model);
|
||||
assert!(model_info.supports_image_detail_original);
|
||||
assert!(!model_info.used_fallback_model_metadata);
|
||||
}
|
||||
|
||||
|
||||
@@ -80,6 +80,7 @@ pub(crate) fn model_info_from_slug(slug: &str) -> ModelInfo {
|
||||
apply_patch_tool_type: None,
|
||||
truncation_policy: TruncationPolicyConfig::bytes(10_000),
|
||||
supports_parallel_tool_calls: false,
|
||||
supports_image_detail_original: false,
|
||||
context_window: Some(272_000),
|
||||
auto_compact_token_limit: None,
|
||||
effective_context_window_percent: 95,
|
||||
|
||||
@@ -55,6 +55,9 @@ fn render_js_repl_instructions(config: &Config) -> Option<String> {
|
||||
section.push_str("- Helpers: `codex.tmpDir` and `codex.tool(name, args?)`.\n");
|
||||
section.push_str("- `codex.tool` executes a normal tool call and resolves to the raw tool output object. Use it for shell and non-shell tools alike.\n");
|
||||
section.push_str("- To share generated images with the model, write a file under `codex.tmpDir`, call `await codex.tool(\"view_image\", { path: \"/absolute/path\" })`, then delete the file.\n");
|
||||
if config.features.enabled(Feature::ImageDetailOriginal) {
|
||||
section.push_str("- When generating or converting images for `view_image` in `js_repl`, prefer JPEG at 85% quality unless lossless quality is strictly required; other formats can be used if the user requests them. This keeps uploads smaller and reduces the chance of hitting image size caps.\n");
|
||||
}
|
||||
section.push_str("- Top-level bindings persist across cells. If you hit `SyntaxError: Identifier 'x' has already been declared`, reuse the binding, pick a new name, wrap in `{ ... }` for block scope, or reset the kernel with `js_repl_reset`.\n");
|
||||
section.push_str("- Top-level static import declarations (for example `import x from \"pkg\"`) are currently unsupported in `js_repl`; use dynamic imports with `await import(\"pkg\")` instead.\n");
|
||||
|
||||
@@ -492,6 +495,21 @@ mod tests {
|
||||
assert_eq!(res, expected);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn js_repl_original_resolution_guidance_is_feature_gated() {
|
||||
let tmp = tempfile::tempdir().expect("tempdir");
|
||||
let mut cfg = make_config(&tmp, 4096, None).await;
|
||||
cfg.features
|
||||
.enable(Feature::JsRepl)
|
||||
.enable(Feature::ImageDetailOriginal);
|
||||
|
||||
let res = get_user_instructions(&cfg, None)
|
||||
.await
|
||||
.expect("js_repl instructions expected");
|
||||
let expected = "## JavaScript REPL (Node)\n- Use `js_repl` for Node-backed JavaScript with top-level await in a persistent kernel.\n- `js_repl` is a freeform/custom tool. Direct `js_repl` calls must send raw JavaScript tool input (optionally with first-line `// codex-js-repl: timeout_ms=15000`). Do not wrap code in JSON (for example `{\"code\":\"...\"}`), quotes, or markdown code fences.\n- Helpers: `codex.tmpDir` and `codex.tool(name, args?)`.\n- `codex.tool` executes a normal tool call and resolves to the raw tool output object. Use it for shell and non-shell tools alike.\n- To share generated images with the model, write a file under `codex.tmpDir`, call `await codex.tool(\"view_image\", { path: \"/absolute/path\" })`, then delete the file.\n- When generating or converting images for `view_image` in `js_repl`, prefer JPEG at 85% quality unless lossless quality is strictly required; other formats can be used if the user requests them. This keeps uploads smaller and reduces the chance of hitting image size caps.\n- Top-level bindings persist across cells. If you hit `SyntaxError: Identifier 'x' has already been declared`, reuse the binding, pick a new name, wrap in `{ ... }` for block scope, or reset the kernel with `js_repl_reset`.\n- Top-level static import declarations (for example `import x from \"pkg\"`) are currently unsupported in `js_repl`; use dynamic imports with `await import(\"pkg\")` instead.\n- Avoid direct access to `process.stdout` / `process.stderr` / `process.stdin`; it can corrupt the JSON line protocol. Use `console.log` and `codex.tool(...)`.";
|
||||
assert_eq!(res, expected);
|
||||
}
|
||||
|
||||
/// When both system instructions *and* a project doc are present the two
|
||||
/// should be concatenated with the separator.
|
||||
#[tokio::test]
|
||||
|
||||
@@ -222,6 +222,7 @@ mod tests {
|
||||
},
|
||||
FunctionCallOutputContentItem::InputImage {
|
||||
image_url: "data:image/png;base64,AAA".to_string(),
|
||||
detail: None,
|
||||
},
|
||||
FunctionCallOutputContentItem::InputText {
|
||||
text: "line 2".to_string(),
|
||||
@@ -239,6 +240,7 @@ mod tests {
|
||||
},
|
||||
FunctionCallOutputContentItem::InputImage {
|
||||
image_url: "data:image/png;base64,AAA".to_string(),
|
||||
detail: None,
|
||||
},
|
||||
FunctionCallOutputContentItem::InputText {
|
||||
text: "line 2".to_string(),
|
||||
|
||||
@@ -1,10 +1,15 @@
|
||||
use async_trait::async_trait;
|
||||
use codex_protocol::models::ContentItem;
|
||||
use codex_protocol::models::FunctionCallOutputBody;
|
||||
use codex_protocol::models::FunctionCallOutputContentItem;
|
||||
use codex_protocol::models::ImageDetail;
|
||||
use codex_protocol::models::local_image_content_items_with_label_number;
|
||||
use codex_protocol::openai_models::InputModality;
|
||||
use codex_utils_image::PromptImageMode;
|
||||
use serde::Deserialize;
|
||||
use tokio::fs;
|
||||
|
||||
use crate::features::Feature;
|
||||
use crate::function_tool::FunctionCallError;
|
||||
use crate::protocol::EventMsg;
|
||||
use crate::protocol::ViewImageToolCallEvent;
|
||||
@@ -14,8 +19,6 @@ use crate::tools::context::ToolPayload;
|
||||
use crate::tools::handlers::parse_arguments;
|
||||
use crate::tools::registry::ToolHandler;
|
||||
use crate::tools::registry::ToolKind;
|
||||
use codex_protocol::models::ContentItem;
|
||||
use codex_protocol::models::local_image_content_items_with_label_number;
|
||||
|
||||
pub struct ViewImageHandler;
|
||||
|
||||
@@ -81,15 +84,26 @@ impl ToolHandler for ViewImageHandler {
|
||||
}
|
||||
let event_path = abs_path.clone();
|
||||
|
||||
let content = local_image_content_items_with_label_number(&abs_path, None);
|
||||
let content = content
|
||||
let use_original_detail = turn.config.features.enabled(Feature::ImageDetailOriginal)
|
||||
&& turn.model_info.supports_image_detail_original;
|
||||
let image_mode = if use_original_detail {
|
||||
PromptImageMode::Original
|
||||
} else {
|
||||
PromptImageMode::ResizeToFit
|
||||
};
|
||||
let image_detail = use_original_detail.then_some(ImageDetail::Original);
|
||||
|
||||
let content = local_image_content_items_with_label_number(&abs_path, None, image_mode)
|
||||
.into_iter()
|
||||
.map(|item| match item {
|
||||
ContentItem::InputText { text } => {
|
||||
FunctionCallOutputContentItem::InputText { text }
|
||||
}
|
||||
ContentItem::InputImage { image_url } => {
|
||||
FunctionCallOutputContentItem::InputImage { image_url }
|
||||
FunctionCallOutputContentItem::InputImage {
|
||||
image_url,
|
||||
detail: image_detail,
|
||||
}
|
||||
}
|
||||
ContentItem::OutputText { text } => {
|
||||
FunctionCallOutputContentItem::InputText { text }
|
||||
|
||||
@@ -1900,6 +1900,7 @@ mod tests {
|
||||
output: FunctionCallOutputPayload::from_content_items(vec![
|
||||
FunctionCallOutputContentItem::InputImage {
|
||||
image_url: "data:image/png;base64,abcd".to_string(),
|
||||
detail: None,
|
||||
},
|
||||
]),
|
||||
};
|
||||
@@ -1929,6 +1930,7 @@ mod tests {
|
||||
output: FunctionCallOutputPayload::from_content_items(vec![
|
||||
FunctionCallOutputContentItem::InputImage {
|
||||
image_url: "data:image/png;base64,abcd".to_string(),
|
||||
detail: None,
|
||||
},
|
||||
]),
|
||||
};
|
||||
@@ -2417,15 +2419,17 @@ console.log(out.output?.body?.text ?? "");
|
||||
image_url:
|
||||
"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR4nGP4z8DwHwAFAAH/iZk9HQAAAABJRU5ErkJggg=="
|
||||
.to_string(),
|
||||
detail: None,
|
||||
}]
|
||||
.as_slice()
|
||||
);
|
||||
let [FunctionCallOutputContentItem::InputImage { image_url }] =
|
||||
let [FunctionCallOutputContentItem::InputImage { image_url, detail }] =
|
||||
result.content_items.as_slice()
|
||||
else {
|
||||
panic!("view_image should return exactly one input_image content item");
|
||||
};
|
||||
assert!(image_url.starts_with("data:image/png;base64,"));
|
||||
assert_eq!(*detail, None);
|
||||
assert!(session.get_pending_input().await.is_empty());
|
||||
|
||||
Ok(())
|
||||
@@ -2515,6 +2519,7 @@ console.log(out.type);
|
||||
},
|
||||
FunctionCallOutputContentItem::InputImage {
|
||||
image_url: image_url.to_string(),
|
||||
detail: None,
|
||||
},
|
||||
]
|
||||
);
|
||||
|
||||
@@ -138,9 +138,10 @@ pub(crate) fn truncate_function_output_items_with_policy(
|
||||
remaining_budget = 0;
|
||||
}
|
||||
}
|
||||
FunctionCallOutputContentItem::InputImage { image_url } => {
|
||||
FunctionCallOutputContentItem::InputImage { image_url, detail } => {
|
||||
out.push(FunctionCallOutputContentItem::InputImage {
|
||||
image_url: image_url.clone(),
|
||||
detail: *detail,
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -491,6 +492,7 @@ mod tests {
|
||||
FunctionCallOutputContentItem::InputText { text: t2.clone() },
|
||||
FunctionCallOutputContentItem::InputImage {
|
||||
image_url: "img:mid".to_string(),
|
||||
detail: None,
|
||||
},
|
||||
FunctionCallOutputContentItem::InputText { text: t3 },
|
||||
FunctionCallOutputContentItem::InputText { text: t4 },
|
||||
@@ -518,7 +520,8 @@ mod tests {
|
||||
assert_eq!(
|
||||
output[2],
|
||||
FunctionCallOutputContentItem::InputImage {
|
||||
image_url: "img:mid".to_string()
|
||||
image_url: "img:mid".to_string(),
|
||||
detail: None,
|
||||
}
|
||||
);
|
||||
|
||||
|
||||
@@ -278,6 +278,7 @@ async fn model_change_from_image_to_text_strips_prior_image_content() -> Result<
|
||||
apply_patch_tool_type: None,
|
||||
truncation_policy: TruncationPolicyConfig::bytes(10_000),
|
||||
supports_parallel_tool_calls: false,
|
||||
supports_image_detail_original: false,
|
||||
context_window: Some(272_000),
|
||||
auto_compact_token_limit: None,
|
||||
effective_context_window_percent: 95,
|
||||
@@ -439,6 +440,7 @@ async fn model_switch_to_smaller_model_updates_token_context_window() -> Result<
|
||||
apply_patch_tool_type: None,
|
||||
truncation_policy: TruncationPolicyConfig::bytes(10_000),
|
||||
supports_parallel_tool_calls: false,
|
||||
supports_image_detail_original: false,
|
||||
context_window: Some(large_context_window),
|
||||
auto_compact_token_limit: None,
|
||||
effective_context_window_percent,
|
||||
|
||||
@@ -344,6 +344,7 @@ fn test_remote_model(slug: &str, priority: i32) -> ModelInfo {
|
||||
apply_patch_tool_type: None,
|
||||
truncation_policy: TruncationPolicyConfig::bytes(10_000),
|
||||
supports_parallel_tool_calls: false,
|
||||
supports_image_detail_original: false,
|
||||
context_window: Some(272_000),
|
||||
auto_compact_token_limit: None,
|
||||
effective_context_window_percent: 95,
|
||||
|
||||
@@ -619,6 +619,7 @@ async fn remote_model_friendly_personality_instructions_with_feature() -> anyhow
|
||||
apply_patch_tool_type: None,
|
||||
truncation_policy: TruncationPolicyConfig::bytes(10_000),
|
||||
supports_parallel_tool_calls: false,
|
||||
supports_image_detail_original: false,
|
||||
context_window: Some(128_000),
|
||||
auto_compact_token_limit: None,
|
||||
effective_context_window_percent: 95,
|
||||
@@ -729,6 +730,7 @@ async fn user_turn_personality_remote_model_template_includes_update_message() -
|
||||
apply_patch_tool_type: None,
|
||||
truncation_policy: TruncationPolicyConfig::bytes(10_000),
|
||||
supports_parallel_tool_calls: false,
|
||||
supports_image_detail_original: false,
|
||||
context_window: Some(128_000),
|
||||
auto_compact_token_limit: None,
|
||||
effective_context_window_percent: 95,
|
||||
|
||||
@@ -303,6 +303,7 @@ async fn remote_models_remote_model_uses_unified_exec() -> Result<()> {
|
||||
apply_patch_tool_type: None,
|
||||
truncation_policy: TruncationPolicyConfig::bytes(10_000),
|
||||
supports_parallel_tool_calls: false,
|
||||
supports_image_detail_original: false,
|
||||
context_window: Some(272_000),
|
||||
auto_compact_token_limit: None,
|
||||
effective_context_window_percent: 95,
|
||||
@@ -543,6 +544,7 @@ async fn remote_models_apply_remote_base_instructions() -> Result<()> {
|
||||
apply_patch_tool_type: None,
|
||||
truncation_policy: TruncationPolicyConfig::bytes(10_000),
|
||||
supports_parallel_tool_calls: false,
|
||||
supports_image_detail_original: false,
|
||||
context_window: Some(272_000),
|
||||
auto_compact_token_limit: None,
|
||||
effective_context_window_percent: 95,
|
||||
@@ -1007,6 +1009,7 @@ fn test_remote_model_with_policy(
|
||||
apply_patch_tool_type: None,
|
||||
truncation_policy,
|
||||
supports_parallel_tool_calls: false,
|
||||
supports_image_detail_original: false,
|
||||
context_window: Some(272_000),
|
||||
auto_compact_token_limit: None,
|
||||
effective_context_window_percent: 95,
|
||||
|
||||
@@ -409,6 +409,7 @@ async fn stdio_image_responses_are_sanitized_for_text_only_model() -> anyhow::Re
|
||||
apply_patch_tool_type: None,
|
||||
truncation_policy: TruncationPolicyConfig::bytes(10_000),
|
||||
supports_parallel_tool_calls: false,
|
||||
supports_image_detail_original: false,
|
||||
context_window: Some(272_000),
|
||||
auto_compact_token_limit: None,
|
||||
effective_context_window_percent: 95,
|
||||
|
||||
@@ -291,6 +291,208 @@ async fn view_image_tool_attaches_local_image() -> anyhow::Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn view_image_tool_can_preserve_original_resolution_on_gpt5_3_codex() -> anyhow::Result<()> {
|
||||
skip_if_no_network!(Ok(()));
|
||||
|
||||
let server = start_mock_server().await;
|
||||
let mut builder = test_codex()
|
||||
.with_model("gpt-5.3-codex")
|
||||
.with_config(|config| {
|
||||
config.features.enable(Feature::ImageDetailOriginal);
|
||||
});
|
||||
let TestCodex {
|
||||
codex,
|
||||
cwd,
|
||||
session_configured,
|
||||
..
|
||||
} = builder.build(&server).await?;
|
||||
|
||||
let rel_path = "assets/original-example.png";
|
||||
let abs_path = cwd.path().join(rel_path);
|
||||
if let Some(parent) = abs_path.parent() {
|
||||
std::fs::create_dir_all(parent)?;
|
||||
}
|
||||
let original_width = 2304;
|
||||
let original_height = 864;
|
||||
let image = ImageBuffer::from_pixel(original_width, original_height, Rgba([0u8, 80, 255, 255]));
|
||||
image.save(&abs_path)?;
|
||||
|
||||
let call_id = "view-image-original";
|
||||
let arguments = serde_json::json!({ "path": rel_path }).to_string();
|
||||
|
||||
let first_response = sse(vec![
|
||||
ev_response_created("resp-1"),
|
||||
ev_function_call(call_id, "view_image", &arguments),
|
||||
ev_completed("resp-1"),
|
||||
]);
|
||||
responses::mount_sse_once(&server, first_response).await;
|
||||
|
||||
let second_response = sse(vec![
|
||||
ev_assistant_message("msg-1", "done"),
|
||||
ev_completed("resp-2"),
|
||||
]);
|
||||
let mock = responses::mount_sse_once(&server, second_response).await;
|
||||
|
||||
let session_model = session_configured.model.clone();
|
||||
|
||||
codex
|
||||
.submit(Op::UserTurn {
|
||||
items: vec![UserInput::Text {
|
||||
text: "please add the original screenshot".into(),
|
||||
text_elements: Vec::new(),
|
||||
}],
|
||||
final_output_json_schema: None,
|
||||
cwd: cwd.path().to_path_buf(),
|
||||
approval_policy: AskForApproval::Never,
|
||||
sandbox_policy: SandboxPolicy::DangerFullAccess,
|
||||
model: session_model,
|
||||
effort: None,
|
||||
service_tier: None,
|
||||
summary: None,
|
||||
collaboration_mode: None,
|
||||
personality: None,
|
||||
})
|
||||
.await?;
|
||||
|
||||
wait_for_event_with_timeout(
|
||||
&codex,
|
||||
|event| matches!(event, EventMsg::TurnComplete(_)),
|
||||
Duration::from_secs(10),
|
||||
)
|
||||
.await;
|
||||
|
||||
let req = mock.single_request();
|
||||
let function_output = req.function_call_output(call_id);
|
||||
let output_items = function_output
|
||||
.get("output")
|
||||
.and_then(Value::as_array)
|
||||
.expect("function_call_output should be a content item array");
|
||||
assert_eq!(output_items.len(), 1);
|
||||
assert_eq!(
|
||||
output_items[0].get("detail").and_then(Value::as_str),
|
||||
Some("original")
|
||||
);
|
||||
let image_url = output_items[0]
|
||||
.get("image_url")
|
||||
.and_then(Value::as_str)
|
||||
.expect("image_url present");
|
||||
|
||||
let (_, encoded) = image_url
|
||||
.split_once(',')
|
||||
.expect("image url contains data prefix");
|
||||
let decoded = BASE64_STANDARD
|
||||
.decode(encoded)
|
||||
.expect("image data decodes from base64 for request");
|
||||
let preserved = load_from_memory(&decoded).expect("load preserved image");
|
||||
let (width, height) = preserved.dimensions();
|
||||
assert_eq!(width, original_width);
|
||||
assert_eq!(height, original_height);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn view_image_tool_keeps_legacy_behavior_below_gpt5_3_codex() -> anyhow::Result<()> {
|
||||
skip_if_no_network!(Ok(()));
|
||||
|
||||
let server = start_mock_server().await;
|
||||
let mut builder = test_codex().with_model("gpt-5.2").with_config(|config| {
|
||||
config.features.enable(Feature::ImageDetailOriginal);
|
||||
});
|
||||
let TestCodex {
|
||||
codex,
|
||||
cwd,
|
||||
session_configured,
|
||||
..
|
||||
} = builder.build(&server).await?;
|
||||
|
||||
let rel_path = "assets/original-example-lower-model.png";
|
||||
let abs_path = cwd.path().join(rel_path);
|
||||
if let Some(parent) = abs_path.parent() {
|
||||
std::fs::create_dir_all(parent)?;
|
||||
}
|
||||
let original_width = 2304;
|
||||
let original_height = 864;
|
||||
let image = ImageBuffer::from_pixel(original_width, original_height, Rgba([0u8, 80, 255, 255]));
|
||||
image.save(&abs_path)?;
|
||||
|
||||
let call_id = "view-image-original-lower-model";
|
||||
let arguments = serde_json::json!({ "path": rel_path }).to_string();
|
||||
|
||||
let first_response = sse(vec![
|
||||
ev_response_created("resp-1"),
|
||||
ev_function_call(call_id, "view_image", &arguments),
|
||||
ev_completed("resp-1"),
|
||||
]);
|
||||
responses::mount_sse_once(&server, first_response).await;
|
||||
|
||||
let second_response = sse(vec![
|
||||
ev_assistant_message("msg-1", "done"),
|
||||
ev_completed("resp-2"),
|
||||
]);
|
||||
let mock = responses::mount_sse_once(&server, second_response).await;
|
||||
|
||||
let session_model = session_configured.model.clone();
|
||||
|
||||
codex
|
||||
.submit(Op::UserTurn {
|
||||
items: vec![UserInput::Text {
|
||||
text: "please add the screenshot".into(),
|
||||
text_elements: Vec::new(),
|
||||
}],
|
||||
final_output_json_schema: None,
|
||||
cwd: cwd.path().to_path_buf(),
|
||||
approval_policy: AskForApproval::Never,
|
||||
sandbox_policy: SandboxPolicy::DangerFullAccess,
|
||||
model: session_model,
|
||||
effort: None,
|
||||
service_tier: None,
|
||||
summary: None,
|
||||
collaboration_mode: None,
|
||||
personality: None,
|
||||
})
|
||||
.await?;
|
||||
|
||||
wait_for_event_with_timeout(
|
||||
&codex,
|
||||
|event| matches!(event, EventMsg::TurnComplete(_)),
|
||||
Duration::from_secs(10),
|
||||
)
|
||||
.await;
|
||||
|
||||
let req = mock.single_request();
|
||||
let function_output = req.function_call_output(call_id);
|
||||
let output_items = function_output
|
||||
.get("output")
|
||||
.and_then(Value::as_array)
|
||||
.expect("function_call_output should be a content item array");
|
||||
assert_eq!(output_items.len(), 1);
|
||||
assert_eq!(output_items[0].get("detail"), None);
|
||||
|
||||
let image_url = output_items[0]
|
||||
.get("image_url")
|
||||
.and_then(Value::as_str)
|
||||
.expect("image_url present");
|
||||
|
||||
let (prefix, encoded) = image_url
|
||||
.split_once(',')
|
||||
.expect("image url contains data prefix");
|
||||
assert_eq!(prefix, "data:image/png;base64");
|
||||
|
||||
let decoded = BASE64_STANDARD
|
||||
.decode(encoded)
|
||||
.expect("image data decodes from base64 for request");
|
||||
let resized = load_from_memory(&decoded).expect("load resized image");
|
||||
let (resized_width, resized_height) = resized.dimensions();
|
||||
assert!(resized_width <= 2048);
|
||||
assert!(resized_height <= 768);
|
||||
assert!(resized_width < original_width);
|
||||
assert!(resized_height < original_height);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn js_repl_view_image_tool_attaches_local_image() -> anyhow::Result<()> {
|
||||
skip_if_no_network!(Ok(()));
|
||||
@@ -674,6 +876,7 @@ async fn view_image_tool_returns_unsupported_message_for_text_only_model() -> an
|
||||
apply_patch_tool_type: None,
|
||||
truncation_policy: TruncationPolicyConfig::bytes(10_000),
|
||||
supports_parallel_tool_calls: false,
|
||||
supports_image_detail_original: false,
|
||||
context_window: Some(272_000),
|
||||
auto_compact_token_limit: None,
|
||||
effective_context_window_percent: 95,
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
use std::collections::HashMap;
|
||||
use std::path::Path;
|
||||
|
||||
use codex_utils_image::load_and_resize_to_fit;
|
||||
use codex_utils_image::PromptImageMode;
|
||||
use codex_utils_image::load_for_prompt;
|
||||
use serde::Deserialize;
|
||||
use serde::Deserializer;
|
||||
use serde::Serialize;
|
||||
@@ -175,6 +176,15 @@ pub enum ContentItem {
|
||||
OutputText { text: String },
|
||||
}
|
||||
|
||||
// Detail level that can be attached to an image content item via its optional `detail` field.
// Variants serialize as lowercase strings ("auto", "low", "high", "original") because of
// `rename_all = "lowercase"`.
// NOTE(review): plain `//` comments are used on purpose; `///` doc comments may be picked up
// by the `JsonSchema`/`TS` derives as descriptions -- confirm before converting them.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, JsonSchema, TS)]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
pub enum ImageDetail {
|
||||
// NOTE(review): `Auto`, `Low` and `High` presumably mirror the Responses API image detail
// levels -- confirm against the API reference.
Auto,
|
||||
Low,
|
||||
High,
|
||||
// Serialized as "original"; the `view_image` handler attaches this when the
// `ImageDetailOriginal` feature is enabled and the model reports
// `supports_image_detail_original`.
Original,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, JsonSchema, TS)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
/// Classifies an assistant message as interim commentary or final answer text.
|
||||
@@ -710,8 +720,9 @@ fn unsupported_image_error_placeholder(path: &std::path::Path, mime: &str) -> Co
|
||||
pub fn local_image_content_items_with_label_number(
|
||||
path: &std::path::Path,
|
||||
label_number: Option<usize>,
|
||||
mode: PromptImageMode,
|
||||
) -> Vec<ContentItem> {
|
||||
match load_and_resize_to_fit(path) {
|
||||
match load_for_prompt(path, mode) {
|
||||
Ok(image) => {
|
||||
let mut items = Vec::with_capacity(3);
|
||||
if let Some(label_number) = label_number {
|
||||
@@ -872,7 +883,11 @@ impl From<Vec<UserInput>> for ResponseInputItem {
|
||||
}
|
||||
UserInput::LocalImage { path } => {
|
||||
image_index += 1;
|
||||
local_image_content_items_with_label_number(&path, Some(image_index))
|
||||
local_image_content_items_with_label_number(
|
||||
&path,
|
||||
Some(image_index),
|
||||
PromptImageMode::ResizeToFit,
|
||||
)
|
||||
}
|
||||
UserInput::Skill { .. } | UserInput::Mention { .. } => Vec::new(), // Tool bodies are injected later in core
|
||||
})
|
||||
@@ -937,9 +952,16 @@ pub struct ShellCommandToolCallParams {
|
||||
#[serde(tag = "type", rename_all = "snake_case")]
|
||||
pub enum FunctionCallOutputContentItem {
|
||||
// Do not rename, these are serialized and used directly in the responses API.
|
||||
InputText { text: String },
|
||||
InputText {
|
||||
text: String,
|
||||
},
|
||||
// Do not rename, these are serialized and used directly in the responses API.
|
||||
InputImage { image_url: String },
|
||||
InputImage {
|
||||
image_url: String,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
#[ts(optional)]
|
||||
detail: Option<ImageDetail>,
|
||||
},
|
||||
}
|
||||
|
||||
/// Converts structured function-call output content into plain text for
|
||||
@@ -983,7 +1005,10 @@ impl From<crate::dynamic_tools::DynamicToolCallOutputContentItem>
|
||||
Self::InputText { text }
|
||||
}
|
||||
crate::dynamic_tools::DynamicToolCallOutputContentItem::InputImage { image_url } => {
|
||||
Self::InputImage { image_url }
|
||||
Self::InputImage {
|
||||
image_url,
|
||||
detail: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1185,7 +1210,10 @@ fn convert_mcp_content_to_items(
|
||||
let mime_type = mime_type.unwrap_or_else(|| "application/octet-stream".into());
|
||||
format!("data:{mime_type};base64,{data}")
|
||||
};
|
||||
FunctionCallOutputContentItem::InputImage { image_url }
|
||||
FunctionCallOutputContentItem::InputImage {
|
||||
image_url,
|
||||
detail: None,
|
||||
}
|
||||
}
|
||||
Ok(McpContent::Unknown) | Err(_) => FunctionCallOutputContentItem::InputText {
|
||||
text: serde_json::to_string(content).unwrap_or_else(|_| "<content>".to_string()),
|
||||
@@ -1239,6 +1267,7 @@ mod tests {
|
||||
items,
|
||||
vec![FunctionCallOutputContentItem::InputImage {
|
||||
image_url: "data:image/png;base64,Zm9v".to_string(),
|
||||
detail: None,
|
||||
}]
|
||||
);
|
||||
}
|
||||
@@ -1256,6 +1285,7 @@ mod tests {
|
||||
items,
|
||||
vec![FunctionCallOutputContentItem::InputImage {
|
||||
image_url: "data:image/png;base64,Zm9v".to_string(),
|
||||
detail: None,
|
||||
}]
|
||||
);
|
||||
}
|
||||
@@ -1278,6 +1308,7 @@ mod tests {
|
||||
},
|
||||
FunctionCallOutputContentItem::InputImage {
|
||||
image_url: "data:image/png;base64,AAA".to_string(),
|
||||
detail: None,
|
||||
},
|
||||
FunctionCallOutputContentItem::InputText {
|
||||
text: "line 2".to_string(),
|
||||
@@ -1296,6 +1327,7 @@ mod tests {
|
||||
},
|
||||
FunctionCallOutputContentItem::InputImage {
|
||||
image_url: "data:image/png;base64,AAA".to_string(),
|
||||
detail: None,
|
||||
},
|
||||
];
|
||||
|
||||
@@ -1318,6 +1350,7 @@ mod tests {
|
||||
},
|
||||
FunctionCallOutputContentItem::InputImage {
|
||||
image_url: "data:image/png;base64,AAA".to_string(),
|
||||
detail: None,
|
||||
},
|
||||
]);
|
||||
|
||||
@@ -1542,6 +1575,7 @@ mod tests {
|
||||
},
|
||||
FunctionCallOutputContentItem::InputImage {
|
||||
image_url: "data:image/png;base64,BASE64".into(),
|
||||
detail: None,
|
||||
},
|
||||
]
|
||||
);
|
||||
@@ -1567,6 +1601,7 @@ mod tests {
|
||||
output: FunctionCallOutputPayload::from_content_items(vec![
|
||||
FunctionCallOutputContentItem::InputImage {
|
||||
image_url: "data:image/png;base64,BASE64".into(),
|
||||
detail: None,
|
||||
},
|
||||
]),
|
||||
};
|
||||
@@ -1602,6 +1637,7 @@ mod tests {
|
||||
items,
|
||||
vec![FunctionCallOutputContentItem::InputImage {
|
||||
image_url: "data:image/png;base64,BASE64".into(),
|
||||
detail: None,
|
||||
}]
|
||||
);
|
||||
|
||||
@@ -1624,6 +1660,7 @@ mod tests {
|
||||
},
|
||||
FunctionCallOutputContentItem::InputImage {
|
||||
image_url: "data:image/png;base64,XYZ".into(),
|
||||
detail: None,
|
||||
},
|
||||
];
|
||||
assert_eq!(
|
||||
|
||||
@@ -245,6 +245,8 @@ pub struct ModelInfo {
|
||||
pub apply_patch_tool_type: Option<ApplyPatchToolType>,
|
||||
pub truncation_policy: TruncationPolicyConfig,
|
||||
pub supports_parallel_tool_calls: bool,
|
||||
#[serde(default)]
|
||||
pub supports_image_detail_original: bool,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub context_window: Option<i64>,
|
||||
/// Token threshold for automatic compaction. When omitted, core derives it
|
||||
@@ -515,6 +517,7 @@ mod tests {
|
||||
apply_patch_tool_type: None,
|
||||
truncation_policy: TruncationPolicyConfig::bytes(10_000),
|
||||
supports_parallel_tool_calls: false,
|
||||
supports_image_detail_original: false,
|
||||
context_window: None,
|
||||
auto_compact_token_limit: None,
|
||||
effective_context_window_percent: 95,
|
||||
@@ -713,6 +716,7 @@ mod tests {
|
||||
.expect("deserialize model info");
|
||||
|
||||
assert_eq!(model.availability_nux, None);
|
||||
assert!(!model.supports_image_detail_original);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
+139
-52
@@ -14,6 +14,7 @@ use image::ImageEncoder;
|
||||
use image::ImageFormat;
|
||||
use image::codecs::jpeg::JpegEncoder;
|
||||
use image::codecs::png::PngEncoder;
|
||||
use image::codecs::webp::WebPEncoder;
|
||||
use image::imageops::FilterType;
|
||||
/// Maximum width used when resizing images before uploading.
|
||||
pub const MAX_WIDTH: u32 = 2048;
|
||||
@@ -33,24 +34,48 @@ pub struct EncodedImage {
|
||||
impl EncodedImage {
|
||||
pub fn into_data_url(self) -> String {
|
||||
let encoded = BASE64_STANDARD.encode(&self.bytes);
|
||||
format!("data:{};base64,{}", self.mime, encoded)
|
||||
format!("data:{};base64,{encoded}", self.mime)
|
||||
}
|
||||
}
|
||||
|
||||
static IMAGE_CACHE: LazyLock<BlockingLruCache<[u8; 20], EncodedImage>> =
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
pub enum PromptImageMode {
|
||||
ResizeToFit,
|
||||
Original,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
struct ImageCacheKey {
|
||||
digest: [u8; 20],
|
||||
mode: PromptImageMode,
|
||||
}
|
||||
|
||||
static IMAGE_CACHE: LazyLock<BlockingLruCache<ImageCacheKey, EncodedImage>> =
|
||||
LazyLock::new(|| BlockingLruCache::new(NonZeroUsize::new(32).unwrap_or(NonZeroUsize::MIN)));
|
||||
|
||||
pub fn load_and_resize_to_fit(path: &Path) -> Result<EncodedImage, ImageProcessingError> {
|
||||
load_for_prompt(path, PromptImageMode::ResizeToFit)
|
||||
}
|
||||
|
||||
pub fn load_for_prompt(
|
||||
path: &Path,
|
||||
mode: PromptImageMode,
|
||||
) -> Result<EncodedImage, ImageProcessingError> {
|
||||
let path_buf = path.to_path_buf();
|
||||
|
||||
let file_bytes = read_file_bytes(path, &path_buf)?;
|
||||
|
||||
let key = sha1_digest(&file_bytes);
|
||||
let key = ImageCacheKey {
|
||||
digest: sha1_digest(&file_bytes),
|
||||
mode,
|
||||
};
|
||||
|
||||
IMAGE_CACHE.get_or_try_insert_with(key, move || {
|
||||
let format = match image::guess_format(&file_bytes) {
|
||||
Ok(ImageFormat::Png) => Some(ImageFormat::Png),
|
||||
Ok(ImageFormat::Jpeg) => Some(ImageFormat::Jpeg),
|
||||
Ok(ImageFormat::Gif) => Some(ImageFormat::Gif),
|
||||
Ok(ImageFormat::WebP) => Some(ImageFormat::WebP),
|
||||
_ => None,
|
||||
};
|
||||
|
||||
@@ -63,42 +88,54 @@ pub fn load_and_resize_to_fit(path: &Path) -> Result<EncodedImage, ImageProcessi
|
||||
|
||||
let (width, height) = dynamic.dimensions();
|
||||
|
||||
let encoded = if width <= MAX_WIDTH && height <= MAX_HEIGHT {
|
||||
if let Some(format) = format {
|
||||
let mime = format_to_mime(format);
|
||||
EncodedImage {
|
||||
bytes: file_bytes,
|
||||
mime,
|
||||
width,
|
||||
height,
|
||||
let encoded =
|
||||
if mode == PromptImageMode::Original || (width <= MAX_WIDTH && height <= MAX_HEIGHT) {
|
||||
if let Some(format) = format.filter(|format| can_preserve_source_bytes(*format)) {
|
||||
let mime = format_to_mime(format);
|
||||
EncodedImage {
|
||||
bytes: file_bytes,
|
||||
mime,
|
||||
width,
|
||||
height,
|
||||
}
|
||||
} else {
|
||||
let (bytes, output_format) = encode_image(&dynamic, ImageFormat::Png)?;
|
||||
let mime = format_to_mime(output_format);
|
||||
EncodedImage {
|
||||
bytes,
|
||||
mime,
|
||||
width,
|
||||
height,
|
||||
}
|
||||
}
|
||||
} else {
|
||||
let (bytes, output_format) = encode_image(&dynamic, ImageFormat::Png)?;
|
||||
let resized = dynamic.resize(MAX_WIDTH, MAX_HEIGHT, FilterType::Triangle);
|
||||
let target_format = format
|
||||
.filter(|format| can_preserve_source_bytes(*format))
|
||||
.unwrap_or(ImageFormat::Png);
|
||||
let (bytes, output_format) = encode_image(&resized, target_format)?;
|
||||
let mime = format_to_mime(output_format);
|
||||
EncodedImage {
|
||||
bytes,
|
||||
mime,
|
||||
width,
|
||||
height,
|
||||
width: resized.width(),
|
||||
height: resized.height(),
|
||||
}
|
||||
}
|
||||
} else {
|
||||
let resized = dynamic.resize(MAX_WIDTH, MAX_HEIGHT, FilterType::Triangle);
|
||||
let target_format = format.unwrap_or(ImageFormat::Png);
|
||||
let (bytes, output_format) = encode_image(&resized, target_format)?;
|
||||
let mime = format_to_mime(output_format);
|
||||
EncodedImage {
|
||||
bytes,
|
||||
mime,
|
||||
width: resized.width(),
|
||||
height: resized.height(),
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
Ok(encoded)
|
||||
})
|
||||
}
|
||||
|
||||
fn can_preserve_source_bytes(format: ImageFormat) -> bool {
|
||||
// Public API docs explicitly call out non-animated GIF support only.
|
||||
// Preserve byte-for-byte only for formats we can safely pass through.
|
||||
matches!(
|
||||
format,
|
||||
ImageFormat::Png | ImageFormat::Jpeg | ImageFormat::WebP
|
||||
)
|
||||
}
|
||||
|
||||
fn read_file_bytes(path: &Path, path_for_error: &Path) -> Result<Vec<u8>, ImageProcessingError> {
|
||||
match tokio::runtime::Handle::try_current() {
|
||||
// If we're inside a Tokio runtime, avoid block_on (it panics on worker threads).
|
||||
@@ -123,6 +160,7 @@ fn encode_image(
|
||||
) -> Result<(Vec<u8>, ImageFormat), ImageProcessingError> {
|
||||
let target_format = match preferred_format {
|
||||
ImageFormat::Jpeg => ImageFormat::Jpeg,
|
||||
ImageFormat::WebP => ImageFormat::WebP,
|
||||
_ => ImageFormat::Png,
|
||||
};
|
||||
|
||||
@@ -153,6 +191,21 @@ fn encode_image(
|
||||
source,
|
||||
})?;
|
||||
}
|
||||
ImageFormat::WebP => {
|
||||
let rgba = image.to_rgba8();
|
||||
let encoder = WebPEncoder::new_lossless(&mut buffer);
|
||||
encoder
|
||||
.write_image(
|
||||
rgba.as_raw(),
|
||||
image.width(),
|
||||
image.height(),
|
||||
ColorType::Rgba8.into(),
|
||||
)
|
||||
.map_err(|source| ImageProcessingError::Encode {
|
||||
format: target_format,
|
||||
source,
|
||||
})?;
|
||||
}
|
||||
_ => unreachable!("unsupported target_format should have been handled earlier"),
|
||||
}
|
||||
|
||||
@@ -162,6 +215,8 @@ fn encode_image(
|
||||
fn format_to_mime(format: ImageFormat) -> String {
|
||||
match format {
|
||||
ImageFormat::Jpeg => "image/jpeg".to_string(),
|
||||
ImageFormat::Gif => "image/gif".to_string(),
|
||||
ImageFormat::WebP => "image/webp".to_string(),
|
||||
_ => "image/png".to_string(),
|
||||
}
|
||||
}
|
||||
@@ -176,38 +231,70 @@ mod tests {
|
||||
|
||||
#[tokio::test(flavor = "multi_thread")]
|
||||
async fn returns_original_image_when_within_bounds() {
|
||||
for (format, mime) in [
|
||||
(ImageFormat::Png, "image/png"),
|
||||
(ImageFormat::WebP, "image/webp"),
|
||||
] {
|
||||
let temp_file = NamedTempFile::new().expect("temp file");
|
||||
let image = ImageBuffer::from_pixel(64, 32, Rgba([10u8, 20, 30, 255]));
|
||||
image
|
||||
.save_with_format(temp_file.path(), format)
|
||||
.expect("write image to temp file");
|
||||
|
||||
let original_bytes = std::fs::read(temp_file.path()).expect("read written image");
|
||||
let encoded = load_and_resize_to_fit(temp_file.path()).expect("process image");
|
||||
|
||||
assert_eq!(encoded.width, 64);
|
||||
assert_eq!(encoded.height, 32);
|
||||
assert_eq!(encoded.mime, mime);
|
||||
assert_eq!(encoded.bytes, original_bytes);
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread")]
|
||||
async fn downscales_large_image() {
|
||||
for (format, mime) in [
|
||||
(ImageFormat::Png, "image/png"),
|
||||
(ImageFormat::WebP, "image/webp"),
|
||||
] {
|
||||
let temp_file = NamedTempFile::new().expect("temp file");
|
||||
let image = ImageBuffer::from_pixel(4096, 2048, Rgba([200u8, 10, 10, 255]));
|
||||
image
|
||||
.save_with_format(temp_file.path(), format)
|
||||
.expect("write image to temp file");
|
||||
|
||||
let processed = load_and_resize_to_fit(temp_file.path()).expect("process image");
|
||||
|
||||
assert!(processed.width <= MAX_WIDTH);
|
||||
assert!(processed.height <= MAX_HEIGHT);
|
||||
assert_eq!(processed.mime, mime);
|
||||
|
||||
let detected_format =
|
||||
image::guess_format(&processed.bytes).expect("detect resized output format");
|
||||
assert_eq!(detected_format, format);
|
||||
|
||||
let loaded = image::load_from_memory(&processed.bytes)
|
||||
.expect("read resized bytes back into image");
|
||||
assert_eq!(loaded.dimensions(), (processed.width, processed.height));
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread")]
|
||||
async fn preserves_large_image_in_original_mode() {
|
||||
let temp_file = NamedTempFile::new().expect("temp file");
|
||||
let image = ImageBuffer::from_pixel(64, 32, Rgba([10u8, 20, 30, 255]));
|
||||
let image = ImageBuffer::from_pixel(4096, 2048, Rgba([180u8, 30, 30, 255]));
|
||||
image
|
||||
.save_with_format(temp_file.path(), ImageFormat::Png)
|
||||
.expect("write png to temp file");
|
||||
|
||||
let original_bytes = std::fs::read(temp_file.path()).expect("read written image");
|
||||
let processed =
|
||||
load_for_prompt(temp_file.path(), PromptImageMode::Original).expect("process image");
|
||||
|
||||
let encoded = load_and_resize_to_fit(temp_file.path()).expect("process image");
|
||||
|
||||
assert_eq!(encoded.width, 64);
|
||||
assert_eq!(encoded.height, 32);
|
||||
assert_eq!(encoded.mime, "image/png");
|
||||
assert_eq!(encoded.bytes, original_bytes);
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread")]
|
||||
async fn downscales_large_image() {
|
||||
let temp_file = NamedTempFile::new().expect("temp file");
|
||||
let image = ImageBuffer::from_pixel(4096, 2048, Rgba([200u8, 10, 10, 255]));
|
||||
image
|
||||
.save_with_format(temp_file.path(), ImageFormat::Png)
|
||||
.expect("write png to temp file");
|
||||
|
||||
let processed = load_and_resize_to_fit(temp_file.path()).expect("process image");
|
||||
|
||||
assert!(processed.width <= MAX_WIDTH);
|
||||
assert!(processed.height <= MAX_HEIGHT);
|
||||
|
||||
let loaded =
|
||||
image::load_from_memory(&processed.bytes).expect("read resized bytes back into image");
|
||||
assert_eq!(loaded.dimensions(), (processed.width, processed.height));
|
||||
assert_eq!(processed.width, 4096);
|
||||
assert_eq!(processed.height, 2048);
|
||||
assert_eq!(processed.mime, "image/png");
|
||||
assert_eq!(processed.bytes, original_bytes);
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread")]
|
||||
|
||||
Reference in New Issue
Block a user