[codex] Test code-mode variable truncation (#28471)

## Summary

Code mode has two separate truncation points: the nested tool result
returned to JavaScript and the code-mode output later recorded for the
model. These tests now verify those behaviors independently.

- Have each test script report whether `result.output` was truncated, ahead of the printed value.
- Verify that an omitted or sufficiently large nested limit produces
`Variable truncated: False`, even though the printed value is still
truncated downstream by the history budget.
- Verify an explicit nested limit produces `Variable truncated: True`
when the command output exceeds it.
- Use a token-policy model fixture so downstream truncation is visible
as `…N tokens truncated…`.
- Update the explicit nested-truncation expectation to include the
`Warning: truncated output (original token count: N)` header.

This PR changes test coverage only; runtime truncation behavior is
unchanged.

## Validation

- `env -u CODEX_SANDBOX_NETWORK_DISABLED RUST_MIN_STACK=8388608 cargo
test -p codex-core --test all code_mode_exec -- --nocapture` (8 passed)
This commit is contained in:
Ahmed Ibrahim
2026-06-16 20:14:23 -07:00
committed by GitHub
Unverified
parent e2f074e16c
commit 0a3ad4c4ba
+91 -52
View File
@@ -940,9 +940,15 @@ text(JSON.stringify(results));
Ok(())
}
// This model uses token-based tool-output truncation, giving the downstream
// history assertions a stable `…N tokens truncated…` marker.
const TOKEN_POLICY_TEST_MODEL: &str = "gpt-5.4";
// A nested `exec_command` limit applies to `result.output` inside JavaScript.
// The outer code-mode and history budgets apply after the script calls `text`.
#[cfg_attr(windows, ignore = "no exec_command on Windows")]
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn code_mode_exec_command_explicit_max_output_tokens_truncates() -> Result<()> {
async fn code_mode_exec_nested_limit_formats_truncated_result_with_warning() -> Result<()> {
skip_if_no_network!(Ok(()));
let server = responses::start_mock_server().await;
@@ -964,7 +970,7 @@ text(result.output);
&custom_tool_output_items(&second_mock.single_request(), "call-1"),
/*index*/ 1
),
"Total output lines: 1\n\n0123456789…5 tokens truncated…0123456789"
"Warning: truncated output (original token count: 10)\nTotal output lines: 1\n\n0123456789…5 tokens truncated…0123456789"
);
Ok(())
@@ -972,7 +978,8 @@ text(result.output);
#[cfg_attr(windows, ignore = "no exec_command on Windows")]
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn code_mode_exec_explicit_max_above_default_preserves_output() -> Result<()> {
async fn code_mode_exec_nested_limit_preserves_result_variable_before_default_history_truncation()
-> Result<()> {
// TODO(anp): Remove after Wine exec returns complete nested-tool output to code mode.
skip_if_wine_exec!(
Ok(()),
@@ -981,7 +988,7 @@ async fn code_mode_exec_explicit_max_above_default_preserves_output() -> Result<
skip_if_no_network!(Ok(()));
let server = responses::start_mock_server().await;
let (_test, second_mock) = run_code_mode_turn(
let (_test, second_mock) = run_code_mode_turn_with_model_and_config(
&server,
"use exec_command from code mode",
r#"// @exec: {"max_output_tokens": 20000}
@@ -989,17 +996,19 @@ const result = await tools.exec_command({
cmd: "python3 -c \"import sys; sys.stdout.write('x' * 50000)\"",
max_output_tokens: 20000
});
text(result.output);
const resultVariableWasTruncated = result.output.length !== 50000;
text(`Variable truncated: ${resultVariableWasTruncated ? "True" : "False"}. Variable: ${result.output}`);
"#,
TOKEN_POLICY_TEST_MODEL,
|_| {},
)
.await?;
assert_eq!(
text_item(
&custom_tool_output_items(&second_mock.single_request(), "call-1"),
/*index*/ 1
),
"x".repeat(50_000)
let items = custom_tool_output_items(&second_mock.single_request(), "call-1");
let output = text_item(&items, /*index*/ 1);
assert_regex_match(
r"^Variable truncated: False\. Variable: x+…\d+ tokens truncated…x+$",
output,
);
Ok(())
@@ -1007,7 +1016,7 @@ text(result.output);
#[cfg_attr(windows, ignore = "no exec_command on Windows")]
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn code_mode_exec_explicit_max_above_default_truncates_larger_output() -> Result<()> {
async fn code_mode_exec_nested_limit_truncates_result_variable_when_exceeded() -> Result<()> {
// TODO(anp): Remove after Wine exec returns complete nested-tool output to code mode.
skip_if_wine_exec!(
Ok(()),
@@ -1016,7 +1025,7 @@ async fn code_mode_exec_explicit_max_above_default_truncates_larger_output() ->
skip_if_no_network!(Ok(()));
let server = responses::start_mock_server().await;
let (_test, second_mock) = run_code_mode_turn(
let (_test, second_mock) = run_code_mode_turn_with_model_and_config(
&server,
"use exec_command from code mode",
r#"// @exec: {"max_output_tokens": 25000}
@@ -1024,21 +1033,28 @@ const result = await tools.exec_command({
cmd: "python3 -c \"import sys; sys.stdout.write('A' * 90000)\"",
max_output_tokens: 20000
});
text(result.output);
const resultVariableWasTruncated = result.output.includes("…2500 tokens truncated…");
text(`Variable truncated: ${resultVariableWasTruncated ? "True" : "False"}. Variable: ${result.output}`);
"#,
TOKEN_POLICY_TEST_MODEL,
|_| {},
)
.await?;
assert_eq!(
text_item(
&custom_tool_output_items(&second_mock.single_request(), "call-1"),
/*index*/ 1
),
format!(
"Warning: truncated output (original token count: 22500)\nTotal output lines: 1\n\n{}…2500 tokens truncated…{}",
"A".repeat(40_000),
"A".repeat(40_000)
)
let items = custom_tool_output_items(&second_mock.single_request(), "call-1");
let output = text_item(&items, /*index*/ 1);
// The nested 20,000-token budget leaves about 80,000 characters. This
// ceiling independently proves that history applied its smaller cap.
assert!(
output.len() < 60_000,
"expected history to truncate the emitted value, got {} bytes",
output.len()
);
// The boolean describes the nested result; the marker below comes from
// history truncating the value emitted with `text` afterward.
assert_regex_match(
r"(?s)^Variable truncated: True\. Variable: .*…\d+ tokens truncated…A+$",
output,
);
Ok(())
@@ -1046,7 +1062,8 @@ text(result.output);
#[cfg_attr(windows, ignore = "no exec_command on Windows")]
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn code_mode_exec_explicit_max_above_truncation_policy_preserves_output() -> Result<()> {
async fn code_mode_exec_nested_limit_preserves_result_variable_before_configured_history_truncation()
-> Result<()> {
// TODO(anp): Remove after Wine exec returns complete nested-tool output to code mode.
skip_if_wine_exec!(
Ok(()),
@@ -1055,7 +1072,7 @@ async fn code_mode_exec_explicit_max_above_truncation_policy_preserves_output()
skip_if_no_network!(Ok(()));
let server = responses::start_mock_server().await;
let (_test, second_mock) = run_code_mode_turn_with_config(
let (_test, second_mock) = run_code_mode_turn_with_model_and_config(
&server,
"use exec_command from code mode",
r#"// @exec: {"max_output_tokens": 20000}
@@ -1063,20 +1080,28 @@ const result = await tools.exec_command({
cmd: "python3 -c \"import sys; sys.stdout.write('x' * 50000)\"",
max_output_tokens: 20000
});
text(result.output);
const resultVariableWasTruncated = result.output.length !== 50000;
text(`Variable truncated: ${resultVariableWasTruncated ? "True" : "False"}. Variable: ${result.output}`);
"#,
TOKEN_POLICY_TEST_MODEL,
|config| {
config.tool_output_token_limit = Some(50);
},
)
.await?;
assert_eq!(
text_item(
&custom_tool_output_items(&second_mock.single_request(), "call-1"),
/*index*/ 1
),
"x".repeat(50_000)
let items = custom_tool_output_items(&second_mock.single_request(), "call-1");
let output = text_item(&items, /*index*/ 1);
// The 50-token override must shrink this 50,000-character value far below
// what the default 10,000-token history cap would retain.
assert!(
output.len() < 1_000,
"expected configured history cap to truncate the emitted value, got {} bytes",
output.len()
);
assert_regex_match(
r"^Variable truncated: False\. Variable: x+…\d+ tokens truncated…x+$",
output,
);
Ok(())
@@ -1084,7 +1109,8 @@ text(result.output);
#[cfg_attr(windows, ignore = "no exec_command on Windows")]
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn code_mode_exec_without_max_preserves_output_beyond_default() -> Result<()> {
async fn code_mode_exec_without_nested_limit_preserves_result_variable_before_default_history_truncation()
-> Result<()> {
// TODO(anp): Remove after Wine exec returns complete nested-tool output to code mode.
skip_if_wine_exec!(
Ok(()),
@@ -1093,24 +1119,26 @@ async fn code_mode_exec_without_max_preserves_output_beyond_default() -> Result<
skip_if_no_network!(Ok(()));
let server = responses::start_mock_server().await;
let (_test, second_mock) = run_code_mode_turn(
let (_test, second_mock) = run_code_mode_turn_with_model_and_config(
&server,
"use exec_command from code mode",
r#"// @exec: {"max_output_tokens": 20000}
const result = await tools.exec_command({
cmd: "python3 -c \"import sys; sys.stdout.write('x' * 50000)\""
});
text(result.output);
const resultVariableWasTruncated = result.output.length !== 50000;
text(`Variable truncated: ${resultVariableWasTruncated ? "True" : "False"}. Variable: ${result.output}`);
"#,
TOKEN_POLICY_TEST_MODEL,
|_| {},
)
.await?;
assert_eq!(
text_item(
&custom_tool_output_items(&second_mock.single_request(), "call-1"),
/*index*/ 1
),
"x".repeat(50_000)
let items = custom_tool_output_items(&second_mock.single_request(), "call-1");
let output = text_item(&items, /*index*/ 1);
assert_regex_match(
r"^Variable truncated: False\. Variable: x+…\d+ tokens truncated…x+$",
output,
);
Ok(())
@@ -1118,7 +1146,8 @@ text(result.output);
#[cfg_attr(windows, ignore = "no exec_command on Windows")]
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn code_mode_exec_without_max_preserves_output_beyond_truncation_policy() -> Result<()> {
async fn code_mode_exec_without_nested_limit_preserves_result_variable_before_configured_history_truncation()
-> Result<()> {
// TODO(anp): Remove after Wine exec returns complete nested-tool output to code mode.
skip_if_wine_exec!(
Ok(()),
@@ -1127,35 +1156,45 @@ async fn code_mode_exec_without_max_preserves_output_beyond_truncation_policy()
skip_if_no_network!(Ok(()));
let server = responses::start_mock_server().await;
let (_test, second_mock) = run_code_mode_turn_with_config(
let (_test, second_mock) = run_code_mode_turn_with_model_and_config(
&server,
"use exec_command from code mode",
r#"// @exec: {"max_output_tokens": 20000}
const result = await tools.exec_command({
cmd: "python3 -c \"import sys; sys.stdout.write('x' * 50000)\""
});
text(result.output);
const resultVariableWasTruncated = result.output.length !== 50000;
text(`Variable truncated: ${resultVariableWasTruncated ? "True" : "False"}. Variable: ${result.output}`);
"#,
TOKEN_POLICY_TEST_MODEL,
|config| {
config.tool_output_token_limit = Some(50);
},
)
.await?;
assert_eq!(
text_item(
&custom_tool_output_items(&second_mock.single_request(), "call-1"),
/*index*/ 1
),
"x".repeat(50_000)
let items = custom_tool_output_items(&second_mock.single_request(), "call-1");
let output = text_item(&items, /*index*/ 1);
// The 50-token override must shrink this 50,000-character value far below
// what the default 10,000-token history cap would retain.
assert!(
output.len() < 1_000,
"expected configured history cap to truncate the emitted value, got {} bytes",
output.len()
);
assert_regex_match(
r"^Variable truncated: False\. Variable: x+…\d+ tokens truncated…x+$",
output,
);
Ok(())
}
// The outer directive limits output after JavaScript emits it; it does not
// limit `result.output` returned by the nested command.
#[cfg_attr(windows, ignore = "no exec_command on Windows")]
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn code_mode_exec_explicit_max_output_tokens_truncates() -> Result<()> {
async fn code_mode_exec_outer_limit_truncates_emitted_output() -> Result<()> {
skip_if_no_network!(Ok(()));
let server = responses::start_mock_server().await;