diff --git a/codex-rs/core/tests/suite/code_mode.rs b/codex-rs/core/tests/suite/code_mode.rs index 6a4121cc5..60b476d61 100644 --- a/codex-rs/core/tests/suite/code_mode.rs +++ b/codex-rs/core/tests/suite/code_mode.rs @@ -940,9 +940,15 @@ text(JSON.stringify(results)); Ok(()) } +// This model uses token-based tool-output truncation, giving the downstream +// history assertions a stable `…N tokens truncated…` marker. +const TOKEN_POLICY_TEST_MODEL: &str = "gpt-5.4"; + +// A nested `exec_command` limit applies to `result.output` inside JavaScript. +// The outer code-mode and history budgets apply after the script calls `text`. #[cfg_attr(windows, ignore = "no exec_command on Windows")] #[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn code_mode_exec_command_explicit_max_output_tokens_truncates() -> Result<()> { +async fn code_mode_exec_nested_limit_formats_truncated_result_with_warning() -> Result<()> { skip_if_no_network!(Ok(())); let server = responses::start_mock_server().await; @@ -964,7 +970,7 @@ text(result.output); &custom_tool_output_items(&second_mock.single_request(), "call-1"), /*index*/ 1 ), - "Total output lines: 1\n\n0123456789…5 tokens truncated…0123456789" + "Warning: truncated output (original token count: 10)\nTotal output lines: 1\n\n0123456789…5 tokens truncated…0123456789" ); Ok(()) @@ -972,7 +978,8 @@ text(result.output); #[cfg_attr(windows, ignore = "no exec_command on Windows")] #[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn code_mode_exec_explicit_max_above_default_preserves_output() -> Result<()> { +async fn code_mode_exec_nested_limit_preserves_result_variable_before_default_history_truncation() +-> Result<()> { // TODO(anp): Remove after Wine exec returns complete nested-tool output to code mode. skip_if_wine_exec!( Ok(()), @@ -981,7 +988,7 @@ async fn code_mode_exec_explicit_max_above_default_preserves_output() -> Result< skip_if_no_network!(Ok(())); let server = responses::start_mock_server().await; - let (_test, second_mock) = run_code_mode_turn( + let (_test, second_mock) = run_code_mode_turn_with_model_and_config( &server, "use exec_command from code mode", r#"// @exec: {"max_output_tokens": 20000} @@ -989,17 +996,19 @@ const result = await tools.exec_command({ cmd: "python3 -c \"import sys; sys.stdout.write('x' * 50000)\"", max_output_tokens: 20000 }); -text(result.output); +const resultVariableWasTruncated = result.output.length !== 50000; +text(`Variable truncated: ${resultVariableWasTruncated ? "True" : "False"}. Variable: ${result.output}`); "#, + TOKEN_POLICY_TEST_MODEL, + |_| {}, ) .await?; - assert_eq!( - text_item( - &custom_tool_output_items(&second_mock.single_request(), "call-1"), - /*index*/ 1 - ), - "x".repeat(50_000) + let items = custom_tool_output_items(&second_mock.single_request(), "call-1"); + let output = text_item(&items, /*index*/ 1); + assert_regex_match( + r"^Variable truncated: False\. Variable: x+…\d+ tokens truncated…x+$", + output, ); Ok(()) @@ -1007,7 +1016,7 @@ text(result.output); #[cfg_attr(windows, ignore = "no exec_command on Windows")] #[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn code_mode_exec_explicit_max_above_default_truncates_larger_output() -> Result<()> { +async fn code_mode_exec_nested_limit_truncates_result_variable_when_exceeded() -> Result<()> { // TODO(anp): Remove after Wine exec returns complete nested-tool output to code mode. skip_if_wine_exec!( Ok(()), @@ -1016,7 +1025,7 @@ async fn code_mode_exec_explicit_max_above_default_truncates_larger_output() -> skip_if_no_network!(Ok(())); let server = responses::start_mock_server().await; - let (_test, second_mock) = run_code_mode_turn( + let (_test, second_mock) = run_code_mode_turn_with_model_and_config( &server, "use exec_command from code mode", r#"// @exec: {"max_output_tokens": 25000} @@ -1024,21 +1033,28 @@ const result = await tools.exec_command({ cmd: "python3 -c \"import sys; sys.stdout.write('A' * 90000)\"", max_output_tokens: 20000 }); -text(result.output); +const resultVariableWasTruncated = result.output.includes("…2500 tokens truncated…"); +text(`Variable truncated: ${resultVariableWasTruncated ? "True" : "False"}. Variable: ${result.output}`); "#, + TOKEN_POLICY_TEST_MODEL, + |_| {}, ) .await?; - assert_eq!( - text_item( - &custom_tool_output_items(&second_mock.single_request(), "call-1"), - /*index*/ 1 - ), - format!( - "Warning: truncated output (original token count: 22500)\nTotal output lines: 1\n\n{}…2500 tokens truncated…{}", - "A".repeat(40_000), - "A".repeat(40_000) - ) + let items = custom_tool_output_items(&second_mock.single_request(), "call-1"); + let output = text_item(&items, /*index*/ 1); + // The nested 20,000-token budget leaves about 80,000 characters. This + // ceiling independently proves that history applied its smaller cap. + assert!( + output.len() < 60_000, + "expected history to truncate the emitted value, got {} bytes", + output.len() + ); + // The boolean describes the nested result; the marker below comes from + // history truncating the value emitted with `text` afterward. + assert_regex_match( + r"(?s)^Variable truncated: True\. Variable: .*…\d+ tokens truncated…A+$", + output, ); Ok(()) @@ -1046,7 +1062,8 @@ text(result.output); #[cfg_attr(windows, ignore = "no exec_command on Windows")] #[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn code_mode_exec_explicit_max_above_truncation_policy_preserves_output() -> Result<()> { +async fn code_mode_exec_nested_limit_preserves_result_variable_before_configured_history_truncation() +-> Result<()> { // TODO(anp): Remove after Wine exec returns complete nested-tool output to code mode. skip_if_wine_exec!( Ok(()), @@ -1055,7 +1072,7 @@ async fn code_mode_exec_explicit_max_above_truncation_policy_preserves_output() skip_if_no_network!(Ok(())); let server = responses::start_mock_server().await; - let (_test, second_mock) = run_code_mode_turn_with_config( + let (_test, second_mock) = run_code_mode_turn_with_model_and_config( &server, "use exec_command from code mode", r#"// @exec: {"max_output_tokens": 20000} @@ -1063,20 +1080,28 @@ const result = await tools.exec_command({ cmd: "python3 -c \"import sys; sys.stdout.write('x' * 50000)\"", max_output_tokens: 20000 }); -text(result.output); +const resultVariableWasTruncated = result.output.length !== 50000; +text(`Variable truncated: ${resultVariableWasTruncated ? "True" : "False"}. Variable: ${result.output}`); "#, + TOKEN_POLICY_TEST_MODEL, |config| { config.tool_output_token_limit = Some(50); }, ) .await?; - assert_eq!( - text_item( - &custom_tool_output_items(&second_mock.single_request(), "call-1"), - /*index*/ 1 - ), - "x".repeat(50_000) + let items = custom_tool_output_items(&second_mock.single_request(), "call-1"); + let output = text_item(&items, /*index*/ 1); + // The 50-token override must shrink this 50,000-character value far below + // what the default 10,000-token history cap would retain. + assert!( + output.len() < 1_000, + "expected configured history cap to truncate the emitted value, got {} bytes", + output.len() + ); + assert_regex_match( + r"^Variable truncated: False\. Variable: x+…\d+ tokens truncated…x+$", + output, ); Ok(()) @@ -1084,7 +1109,8 @@ text(result.output); #[cfg_attr(windows, ignore = "no exec_command on Windows")] #[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn code_mode_exec_without_max_preserves_output_beyond_default() -> Result<()> { +async fn code_mode_exec_without_nested_limit_preserves_result_variable_before_default_history_truncation() +-> Result<()> { // TODO(anp): Remove after Wine exec returns complete nested-tool output to code mode. skip_if_wine_exec!( Ok(()), @@ -1093,24 +1119,26 @@ async fn code_mode_exec_without_max_preserves_output_beyond_default() -> Result< skip_if_no_network!(Ok(())); let server = responses::start_mock_server().await; - let (_test, second_mock) = run_code_mode_turn( + let (_test, second_mock) = run_code_mode_turn_with_model_and_config( &server, "use exec_command from code mode", r#"// @exec: {"max_output_tokens": 20000} const result = await tools.exec_command({ cmd: "python3 -c \"import sys; sys.stdout.write('x' * 50000)\"" }); -text(result.output); +const resultVariableWasTruncated = result.output.length !== 50000; +text(`Variable truncated: ${resultVariableWasTruncated ? "True" : "False"}. Variable: ${result.output}`); "#, + TOKEN_POLICY_TEST_MODEL, + |_| {}, ) .await?; - assert_eq!( - text_item( - &custom_tool_output_items(&second_mock.single_request(), "call-1"), - /*index*/ 1 - ), - "x".repeat(50_000) + let items = custom_tool_output_items(&second_mock.single_request(), "call-1"); + let output = text_item(&items, /*index*/ 1); + assert_regex_match( + r"^Variable truncated: False\. Variable: x+…\d+ tokens truncated…x+$", + output, ); Ok(()) @@ -1118,7 +1146,8 @@ text(result.output); #[cfg_attr(windows, ignore = "no exec_command on Windows")] #[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn code_mode_exec_without_max_preserves_output_beyond_truncation_policy() -> Result<()> { +async fn code_mode_exec_without_nested_limit_preserves_result_variable_before_configured_history_truncation() +-> Result<()> { // TODO(anp): Remove after Wine exec returns complete nested-tool output to code mode. skip_if_wine_exec!( Ok(()), @@ -1127,35 +1156,45 @@ async fn code_mode_exec_without_max_preserves_output_beyond_truncation_policy() skip_if_no_network!(Ok(())); let server = responses::start_mock_server().await; - let (_test, second_mock) = run_code_mode_turn_with_config( + let (_test, second_mock) = run_code_mode_turn_with_model_and_config( &server, "use exec_command from code mode", r#"// @exec: {"max_output_tokens": 20000} const result = await tools.exec_command({ cmd: "python3 -c \"import sys; sys.stdout.write('x' * 50000)\"" }); -text(result.output); +const resultVariableWasTruncated = result.output.length !== 50000; +text(`Variable truncated: ${resultVariableWasTruncated ? "True" : "False"}. Variable: ${result.output}`); "#, + TOKEN_POLICY_TEST_MODEL, |config| { config.tool_output_token_limit = Some(50); }, ) .await?; - assert_eq!( - text_item( - &custom_tool_output_items(&second_mock.single_request(), "call-1"), - /*index*/ 1 - ), - "x".repeat(50_000) + let items = custom_tool_output_items(&second_mock.single_request(), "call-1"); + let output = text_item(&items, /*index*/ 1); + // The 50-token override must shrink this 50,000-character value far below + // what the default 10,000-token history cap would retain. + assert!( + output.len() < 1_000, + "expected configured history cap to truncate the emitted value, got {} bytes", + output.len() + ); + assert_regex_match( + r"^Variable truncated: False\. Variable: x+…\d+ tokens truncated…x+$", + output, ); Ok(()) } +// The outer directive limits output after JavaScript emits it; it does not +// limit `result.output` returned by the nested command. #[cfg_attr(windows, ignore = "no exec_command on Windows")] #[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn code_mode_exec_explicit_max_output_tokens_truncates() -> Result<()> { +async fn code_mode_exec_outer_limit_truncates_emitted_output() -> Result<()> { skip_if_no_network!(Ok(())); let server = responses::start_mock_server().await;