mirror of
https://github.com/pchuan98/codex.git
synced 2026-07-01 00:31:56 +08:00
[codex] Test code-mode variable truncation (#28471)
## Summary Code mode has two separate truncation points: the nested tool result returned to JavaScript and the code-mode output later recorded for the model. These tests now verify those behaviors independently. - Report whether `result.output` was truncated before printing it. - Verify omitted or sufficiently large nested limits produce `Variable truncated: False`, while allowing the printed value to be truncated downstream. - Verify an explicit nested limit produces `Variable truncated: True` when the command output exceeds it. - Use a token-policy model fixture so downstream truncation is visible as `…N tokens truncated…`. - Align the explicit nested-truncation expectation with the warning header. This PR changes test coverage only; runtime truncation behavior is unchanged. ## Validation - `env -u CODEX_SANDBOX_NETWORK_DISABLED RUST_MIN_STACK=8388608 cargo test -p codex-core --test all code_mode_exec -- --nocapture` (8 passed)
This commit is contained in:
committed by
GitHub
Unverified
parent
e2f074e16c
commit
0a3ad4c4ba
@@ -940,9 +940,15 @@ text(JSON.stringify(results));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// This model uses token-based tool-output truncation, giving the downstream
|
||||
// history assertions a stable `…N tokens truncated…` marker.
|
||||
const TOKEN_POLICY_TEST_MODEL: &str = "gpt-5.4";
|
||||
|
||||
// A nested `exec_command` limit applies to `result.output` inside JavaScript.
|
||||
// The outer code-mode and history budgets apply after the script calls `text`.
|
||||
#[cfg_attr(windows, ignore = "no exec_command on Windows")]
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn code_mode_exec_command_explicit_max_output_tokens_truncates() -> Result<()> {
|
||||
async fn code_mode_exec_nested_limit_formats_truncated_result_with_warning() -> Result<()> {
|
||||
skip_if_no_network!(Ok(()));
|
||||
|
||||
let server = responses::start_mock_server().await;
|
||||
@@ -964,7 +970,7 @@ text(result.output);
|
||||
&custom_tool_output_items(&second_mock.single_request(), "call-1"),
|
||||
/*index*/ 1
|
||||
),
|
||||
"Total output lines: 1\n\n0123456789…5 tokens truncated…0123456789"
|
||||
"Warning: truncated output (original token count: 10)\nTotal output lines: 1\n\n0123456789…5 tokens truncated…0123456789"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
@@ -972,7 +978,8 @@ text(result.output);
|
||||
|
||||
#[cfg_attr(windows, ignore = "no exec_command on Windows")]
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn code_mode_exec_explicit_max_above_default_preserves_output() -> Result<()> {
|
||||
async fn code_mode_exec_nested_limit_preserves_result_variable_before_default_history_truncation()
|
||||
-> Result<()> {
|
||||
// TODO(anp): Remove after Wine exec returns complete nested-tool output to code mode.
|
||||
skip_if_wine_exec!(
|
||||
Ok(()),
|
||||
@@ -981,7 +988,7 @@ async fn code_mode_exec_explicit_max_above_default_preserves_output() -> Result<
|
||||
skip_if_no_network!(Ok(()));
|
||||
|
||||
let server = responses::start_mock_server().await;
|
||||
let (_test, second_mock) = run_code_mode_turn(
|
||||
let (_test, second_mock) = run_code_mode_turn_with_model_and_config(
|
||||
&server,
|
||||
"use exec_command from code mode",
|
||||
r#"// @exec: {"max_output_tokens": 20000}
|
||||
@@ -989,17 +996,19 @@ const result = await tools.exec_command({
|
||||
cmd: "python3 -c \"import sys; sys.stdout.write('x' * 50000)\"",
|
||||
max_output_tokens: 20000
|
||||
});
|
||||
text(result.output);
|
||||
const resultVariableWasTruncated = result.output.length !== 50000;
|
||||
text(`Variable truncated: ${resultVariableWasTruncated ? "True" : "False"}. Variable: ${result.output}`);
|
||||
"#,
|
||||
TOKEN_POLICY_TEST_MODEL,
|
||||
|_| {},
|
||||
)
|
||||
.await?;
|
||||
|
||||
assert_eq!(
|
||||
text_item(
|
||||
&custom_tool_output_items(&second_mock.single_request(), "call-1"),
|
||||
/*index*/ 1
|
||||
),
|
||||
"x".repeat(50_000)
|
||||
let items = custom_tool_output_items(&second_mock.single_request(), "call-1");
|
||||
let output = text_item(&items, /*index*/ 1);
|
||||
assert_regex_match(
|
||||
r"^Variable truncated: False\. Variable: x+…\d+ tokens truncated…x+$",
|
||||
output,
|
||||
);
|
||||
|
||||
Ok(())
|
||||
@@ -1007,7 +1016,7 @@ text(result.output);
|
||||
|
||||
#[cfg_attr(windows, ignore = "no exec_command on Windows")]
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn code_mode_exec_explicit_max_above_default_truncates_larger_output() -> Result<()> {
|
||||
async fn code_mode_exec_nested_limit_truncates_result_variable_when_exceeded() -> Result<()> {
|
||||
// TODO(anp): Remove after Wine exec returns complete nested-tool output to code mode.
|
||||
skip_if_wine_exec!(
|
||||
Ok(()),
|
||||
@@ -1016,7 +1025,7 @@ async fn code_mode_exec_explicit_max_above_default_truncates_larger_output() ->
|
||||
skip_if_no_network!(Ok(()));
|
||||
|
||||
let server = responses::start_mock_server().await;
|
||||
let (_test, second_mock) = run_code_mode_turn(
|
||||
let (_test, second_mock) = run_code_mode_turn_with_model_and_config(
|
||||
&server,
|
||||
"use exec_command from code mode",
|
||||
r#"// @exec: {"max_output_tokens": 25000}
|
||||
@@ -1024,21 +1033,28 @@ const result = await tools.exec_command({
|
||||
cmd: "python3 -c \"import sys; sys.stdout.write('A' * 90000)\"",
|
||||
max_output_tokens: 20000
|
||||
});
|
||||
text(result.output);
|
||||
const resultVariableWasTruncated = result.output.includes("…2500 tokens truncated…");
|
||||
text(`Variable truncated: ${resultVariableWasTruncated ? "True" : "False"}. Variable: ${result.output}`);
|
||||
"#,
|
||||
TOKEN_POLICY_TEST_MODEL,
|
||||
|_| {},
|
||||
)
|
||||
.await?;
|
||||
|
||||
assert_eq!(
|
||||
text_item(
|
||||
&custom_tool_output_items(&second_mock.single_request(), "call-1"),
|
||||
/*index*/ 1
|
||||
),
|
||||
format!(
|
||||
"Warning: truncated output (original token count: 22500)\nTotal output lines: 1\n\n{}…2500 tokens truncated…{}",
|
||||
"A".repeat(40_000),
|
||||
"A".repeat(40_000)
|
||||
)
|
||||
let items = custom_tool_output_items(&second_mock.single_request(), "call-1");
|
||||
let output = text_item(&items, /*index*/ 1);
|
||||
// The nested 20,000-token budget leaves about 80,000 characters. This
|
||||
// ceiling independently proves that history applied its smaller cap.
|
||||
assert!(
|
||||
output.len() < 60_000,
|
||||
"expected history to truncate the emitted value, got {} bytes",
|
||||
output.len()
|
||||
);
|
||||
// The boolean describes the nested result; the marker below comes from
|
||||
// history truncating the value emitted with `text` afterward.
|
||||
assert_regex_match(
|
||||
r"(?s)^Variable truncated: True\. Variable: .*…\d+ tokens truncated…A+$",
|
||||
output,
|
||||
);
|
||||
|
||||
Ok(())
|
||||
@@ -1046,7 +1062,8 @@ text(result.output);
|
||||
|
||||
#[cfg_attr(windows, ignore = "no exec_command on Windows")]
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn code_mode_exec_explicit_max_above_truncation_policy_preserves_output() -> Result<()> {
|
||||
async fn code_mode_exec_nested_limit_preserves_result_variable_before_configured_history_truncation()
|
||||
-> Result<()> {
|
||||
// TODO(anp): Remove after Wine exec returns complete nested-tool output to code mode.
|
||||
skip_if_wine_exec!(
|
||||
Ok(()),
|
||||
@@ -1055,7 +1072,7 @@ async fn code_mode_exec_explicit_max_above_truncation_policy_preserves_output()
|
||||
skip_if_no_network!(Ok(()));
|
||||
|
||||
let server = responses::start_mock_server().await;
|
||||
let (_test, second_mock) = run_code_mode_turn_with_config(
|
||||
let (_test, second_mock) = run_code_mode_turn_with_model_and_config(
|
||||
&server,
|
||||
"use exec_command from code mode",
|
||||
r#"// @exec: {"max_output_tokens": 20000}
|
||||
@@ -1063,20 +1080,28 @@ const result = await tools.exec_command({
|
||||
cmd: "python3 -c \"import sys; sys.stdout.write('x' * 50000)\"",
|
||||
max_output_tokens: 20000
|
||||
});
|
||||
text(result.output);
|
||||
const resultVariableWasTruncated = result.output.length !== 50000;
|
||||
text(`Variable truncated: ${resultVariableWasTruncated ? "True" : "False"}. Variable: ${result.output}`);
|
||||
"#,
|
||||
TOKEN_POLICY_TEST_MODEL,
|
||||
|config| {
|
||||
config.tool_output_token_limit = Some(50);
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
|
||||
assert_eq!(
|
||||
text_item(
|
||||
&custom_tool_output_items(&second_mock.single_request(), "call-1"),
|
||||
/*index*/ 1
|
||||
),
|
||||
"x".repeat(50_000)
|
||||
let items = custom_tool_output_items(&second_mock.single_request(), "call-1");
|
||||
let output = text_item(&items, /*index*/ 1);
|
||||
// The 50-token override must shrink this 50,000-character value far below
|
||||
// what the default 10,000-token history cap would retain.
|
||||
assert!(
|
||||
output.len() < 1_000,
|
||||
"expected configured history cap to truncate the emitted value, got {} bytes",
|
||||
output.len()
|
||||
);
|
||||
assert_regex_match(
|
||||
r"^Variable truncated: False\. Variable: x+…\d+ tokens truncated…x+$",
|
||||
output,
|
||||
);
|
||||
|
||||
Ok(())
|
||||
@@ -1084,7 +1109,8 @@ text(result.output);
|
||||
|
||||
#[cfg_attr(windows, ignore = "no exec_command on Windows")]
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn code_mode_exec_without_max_preserves_output_beyond_default() -> Result<()> {
|
||||
async fn code_mode_exec_without_nested_limit_preserves_result_variable_before_default_history_truncation()
|
||||
-> Result<()> {
|
||||
// TODO(anp): Remove after Wine exec returns complete nested-tool output to code mode.
|
||||
skip_if_wine_exec!(
|
||||
Ok(()),
|
||||
@@ -1093,24 +1119,26 @@ async fn code_mode_exec_without_max_preserves_output_beyond_default() -> Result<
|
||||
skip_if_no_network!(Ok(()));
|
||||
|
||||
let server = responses::start_mock_server().await;
|
||||
let (_test, second_mock) = run_code_mode_turn(
|
||||
let (_test, second_mock) = run_code_mode_turn_with_model_and_config(
|
||||
&server,
|
||||
"use exec_command from code mode",
|
||||
r#"// @exec: {"max_output_tokens": 20000}
|
||||
const result = await tools.exec_command({
|
||||
cmd: "python3 -c \"import sys; sys.stdout.write('x' * 50000)\""
|
||||
});
|
||||
text(result.output);
|
||||
const resultVariableWasTruncated = result.output.length !== 50000;
|
||||
text(`Variable truncated: ${resultVariableWasTruncated ? "True" : "False"}. Variable: ${result.output}`);
|
||||
"#,
|
||||
TOKEN_POLICY_TEST_MODEL,
|
||||
|_| {},
|
||||
)
|
||||
.await?;
|
||||
|
||||
assert_eq!(
|
||||
text_item(
|
||||
&custom_tool_output_items(&second_mock.single_request(), "call-1"),
|
||||
/*index*/ 1
|
||||
),
|
||||
"x".repeat(50_000)
|
||||
let items = custom_tool_output_items(&second_mock.single_request(), "call-1");
|
||||
let output = text_item(&items, /*index*/ 1);
|
||||
assert_regex_match(
|
||||
r"^Variable truncated: False\. Variable: x+…\d+ tokens truncated…x+$",
|
||||
output,
|
||||
);
|
||||
|
||||
Ok(())
|
||||
@@ -1118,7 +1146,8 @@ text(result.output);
|
||||
|
||||
#[cfg_attr(windows, ignore = "no exec_command on Windows")]
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn code_mode_exec_without_max_preserves_output_beyond_truncation_policy() -> Result<()> {
|
||||
async fn code_mode_exec_without_nested_limit_preserves_result_variable_before_configured_history_truncation()
|
||||
-> Result<()> {
|
||||
// TODO(anp): Remove after Wine exec returns complete nested-tool output to code mode.
|
||||
skip_if_wine_exec!(
|
||||
Ok(()),
|
||||
@@ -1127,35 +1156,45 @@ async fn code_mode_exec_without_max_preserves_output_beyond_truncation_policy()
|
||||
skip_if_no_network!(Ok(()));
|
||||
|
||||
let server = responses::start_mock_server().await;
|
||||
let (_test, second_mock) = run_code_mode_turn_with_config(
|
||||
let (_test, second_mock) = run_code_mode_turn_with_model_and_config(
|
||||
&server,
|
||||
"use exec_command from code mode",
|
||||
r#"// @exec: {"max_output_tokens": 20000}
|
||||
const result = await tools.exec_command({
|
||||
cmd: "python3 -c \"import sys; sys.stdout.write('x' * 50000)\""
|
||||
});
|
||||
text(result.output);
|
||||
const resultVariableWasTruncated = result.output.length !== 50000;
|
||||
text(`Variable truncated: ${resultVariableWasTruncated ? "True" : "False"}. Variable: ${result.output}`);
|
||||
"#,
|
||||
TOKEN_POLICY_TEST_MODEL,
|
||||
|config| {
|
||||
config.tool_output_token_limit = Some(50);
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
|
||||
assert_eq!(
|
||||
text_item(
|
||||
&custom_tool_output_items(&second_mock.single_request(), "call-1"),
|
||||
/*index*/ 1
|
||||
),
|
||||
"x".repeat(50_000)
|
||||
let items = custom_tool_output_items(&second_mock.single_request(), "call-1");
|
||||
let output = text_item(&items, /*index*/ 1);
|
||||
// The 50-token override must shrink this 50,000-character value far below
|
||||
// what the default 10,000-token history cap would retain.
|
||||
assert!(
|
||||
output.len() < 1_000,
|
||||
"expected configured history cap to truncate the emitted value, got {} bytes",
|
||||
output.len()
|
||||
);
|
||||
assert_regex_match(
|
||||
r"^Variable truncated: False\. Variable: x+…\d+ tokens truncated…x+$",
|
||||
output,
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// The outer directive limits output after JavaScript emits it; it does not
|
||||
// limit `result.output` returned by the nested command.
|
||||
#[cfg_attr(windows, ignore = "no exec_command on Windows")]
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn code_mode_exec_explicit_max_output_tokens_truncates() -> Result<()> {
|
||||
async fn code_mode_exec_outer_limit_truncates_emitted_output() -> Result<()> {
|
||||
skip_if_no_network!(Ok(()));
|
||||
|
||||
let server = responses::start_mock_server().await;
|
||||
|
||||
Reference in New Issue
Block a user