[codex] Test code-mode variable truncation (#28471)

## Summary

Code mode has two separate truncation points: the nested tool result returned to JavaScript and the code-mode output later recorded for the model. These tests now verify those behaviors independently.

- Report whether `result.output` was truncated before printing it.
- Verify omitted or sufficiently large nested limits produce `Variable truncated: False`, while allowing the printed value to be truncated downstream.
- Verify an explicit nested limit produces `Variable truncated: True` when the command output exceeds it.
- Use a token-policy model fixture so downstream truncation is visible as `…N tokens truncated…`.
- Align the explicit nested-truncation expectation with the warning header.

This PR changes test coverage only; runtime truncation behavior is unchanged.

## Validation

- `env -u CODEX_SANDBOX_NETWORK_DISABLED RUST_MIN_STACK=8388608 cargo test -p codex-core --test all code_mode_exec -- --nocapture` (8 passed)
2026-07-01 00:31:56 +08:00 · 2026-06-16 20:14:23 -07:00
parent e2f074e16c
commit 0a3ad4c4ba
1 changed files with 91 additions and 52 deletions
@@ -940,9 +940,15 @@ text(JSON.stringify(results));
    Ok(())
 }

+// This model uses token-based tool-output truncation, giving the downstream
+// history assertions a stable `…N tokens truncated…` marker.
+const TOKEN_POLICY_TEST_MODEL: &str = "gpt-5.4";
+
+// A nested `exec_command` limit applies to `result.output` inside JavaScript.
+// The outer code-mode and history budgets apply after the script calls `text`.
 #[cfg_attr(windows, ignore = "no exec_command on Windows")]
 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
-async fn code_mode_exec_command_explicit_max_output_tokens_truncates() -> Result<()> {
+async fn code_mode_exec_nested_limit_formats_truncated_result_with_warning() -> Result<()> {
    skip_if_no_network!(Ok(()));

    let server = responses::start_mock_server().await;
@@ -964,7 +970,7 @@ text(result.output);
            &custom_tool_output_items(&second_mock.single_request(), "call-1"),
            /*index*/ 1
        ),
-        "Total output lines: 1\n\n0123456789…5 tokens truncated…0123456789"
+        "Warning: truncated output (original token count: 10)\nTotal output lines: 1\n\n0123456789…5 tokens truncated…0123456789"
    );

    Ok(())
@@ -972,7 +978,8 @@ text(result.output);

 #[cfg_attr(windows, ignore = "no exec_command on Windows")]
 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
-async fn code_mode_exec_explicit_max_above_default_preserves_output() -> Result<()> {
+async fn code_mode_exec_nested_limit_preserves_result_variable_before_default_history_truncation()
+-> Result<()> {
    // TODO(anp): Remove after Wine exec returns complete nested-tool output to code mode.
    skip_if_wine_exec!(
        Ok(()),
@@ -981,7 +988,7 @@ async fn code_mode_exec_explicit_max_above_default_preserves_output() -> Result<
    skip_if_no_network!(Ok(()));

    let server = responses::start_mock_server().await;
-    let (_test, second_mock) = run_code_mode_turn(
+    let (_test, second_mock) = run_code_mode_turn_with_model_and_config(
        &server,
        "use exec_command from code mode",
        r#"// @exec: {"max_output_tokens": 20000}
@@ -989,17 +996,19 @@ const result = await tools.exec_command({
  cmd: "python3 -c \"import sys; sys.stdout.write('x' * 50000)\"",
  max_output_tokens: 20000
 });
-text(result.output);
+const resultVariableWasTruncated = result.output.length !== 50000;
+text(`Variable truncated: ${resultVariableWasTruncated ? "True" : "False"}. Variable: ${result.output}`);
 "#,
+        TOKEN_POLICY_TEST_MODEL,
+        |_| {},
    )
    .await?;

-    assert_eq!(
-        text_item(
-            &custom_tool_output_items(&second_mock.single_request(), "call-1"),
-            /*index*/ 1
-        ),
-        "x".repeat(50_000)
+    let items = custom_tool_output_items(&second_mock.single_request(), "call-1");
+    let output = text_item(&items, /*index*/ 1);
+    assert_regex_match(
+        r"^Variable truncated: False\. Variable: x+…\d+ tokens truncated…x+$",
+        output,
    );

    Ok(())
@@ -1007,7 +1016,7 @@ text(result.output);

 #[cfg_attr(windows, ignore = "no exec_command on Windows")]
 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
-async fn code_mode_exec_explicit_max_above_default_truncates_larger_output() -> Result<()> {
+async fn code_mode_exec_nested_limit_truncates_result_variable_when_exceeded() -> Result<()> {
    // TODO(anp): Remove after Wine exec returns complete nested-tool output to code mode.
    skip_if_wine_exec!(
        Ok(()),
@@ -1016,7 +1025,7 @@ async fn code_mode_exec_explicit_max_above_default_truncates_larger_output() ->
    skip_if_no_network!(Ok(()));

    let server = responses::start_mock_server().await;
-    let (_test, second_mock) = run_code_mode_turn(
+    let (_test, second_mock) = run_code_mode_turn_with_model_and_config(
        &server,
        "use exec_command from code mode",
        r#"// @exec: {"max_output_tokens": 25000}
@@ -1024,21 +1033,28 @@ const result = await tools.exec_command({
  cmd: "python3 -c \"import sys; sys.stdout.write('A' * 90000)\"",
  max_output_tokens: 20000
 });
-text(result.output);
+const resultVariableWasTruncated = result.output.includes("…2500 tokens truncated…");
+text(`Variable truncated: ${resultVariableWasTruncated ? "True" : "False"}. Variable: ${result.output}`);
 "#,
+        TOKEN_POLICY_TEST_MODEL,
+        |_| {},
    )
    .await?;

-    assert_eq!(
-        text_item(
-            &custom_tool_output_items(&second_mock.single_request(), "call-1"),
-            /*index*/ 1
-        ),
-        format!(
-            "Warning: truncated output (original token count: 22500)\nTotal output lines: 1\n\n{}…2500 tokens truncated…{}",
-            "A".repeat(40_000),
-            "A".repeat(40_000)
-        )
+    let items = custom_tool_output_items(&second_mock.single_request(), "call-1");
+    let output = text_item(&items, /*index*/ 1);
+    // The nested 20,000-token budget leaves about 80,000 characters. This
+    // ceiling independently proves that history applied its smaller cap.
+    assert!(
+        output.len() < 60_000,
+        "expected history to truncate the emitted value, got {} bytes",
+        output.len()
+    );
+    // The boolean describes the nested result; the marker below comes from
+    // history truncating the value emitted with `text` afterward.
+    assert_regex_match(
+        r"(?s)^Variable truncated: True\. Variable: .*…\d+ tokens truncated…A+$",
+        output,
    );

    Ok(())
@@ -1046,7 +1062,8 @@ text(result.output);

 #[cfg_attr(windows, ignore = "no exec_command on Windows")]
 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
-async fn code_mode_exec_explicit_max_above_truncation_policy_preserves_output() -> Result<()> {
+async fn code_mode_exec_nested_limit_preserves_result_variable_before_configured_history_truncation()
+-> Result<()> {
    // TODO(anp): Remove after Wine exec returns complete nested-tool output to code mode.
    skip_if_wine_exec!(
        Ok(()),
@@ -1055,7 +1072,7 @@ async fn code_mode_exec_explicit_max_above_truncation_policy_preserves_output()
    skip_if_no_network!(Ok(()));

    let server = responses::start_mock_server().await;
-    let (_test, second_mock) = run_code_mode_turn_with_config(
+    let (_test, second_mock) = run_code_mode_turn_with_model_and_config(
        &server,
        "use exec_command from code mode",
        r#"// @exec: {"max_output_tokens": 20000}
@@ -1063,20 +1080,28 @@ const result = await tools.exec_command({
  cmd: "python3 -c \"import sys; sys.stdout.write('x' * 50000)\"",
  max_output_tokens: 20000
 });
-text(result.output);
+const resultVariableWasTruncated = result.output.length !== 50000;
+text(`Variable truncated: ${resultVariableWasTruncated ? "True" : "False"}. Variable: ${result.output}`);
 "#,
+        TOKEN_POLICY_TEST_MODEL,
        |config| {
            config.tool_output_token_limit = Some(50);
        },
    )
    .await?;

-    assert_eq!(
-        text_item(
-            &custom_tool_output_items(&second_mock.single_request(), "call-1"),
-            /*index*/ 1
-        ),
-        "x".repeat(50_000)
+    let items = custom_tool_output_items(&second_mock.single_request(), "call-1");
+    let output = text_item(&items, /*index*/ 1);
+    // The 50-token override must shrink this 50,000-character value far below
+    // what the default 10,000-token history cap would retain.
+    assert!(
+        output.len() < 1_000,
+        "expected configured history cap to truncate the emitted value, got {} bytes",
+        output.len()
+    );
+    assert_regex_match(
+        r"^Variable truncated: False\. Variable: x+…\d+ tokens truncated…x+$",
+        output,
    );

    Ok(())
@@ -1084,7 +1109,8 @@ text(result.output);

 #[cfg_attr(windows, ignore = "no exec_command on Windows")]
 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
-async fn code_mode_exec_without_max_preserves_output_beyond_default() -> Result<()> {
+async fn code_mode_exec_without_nested_limit_preserves_result_variable_before_default_history_truncation()
+-> Result<()> {
    // TODO(anp): Remove after Wine exec returns complete nested-tool output to code mode.
    skip_if_wine_exec!(
        Ok(()),
@@ -1093,24 +1119,26 @@ async fn code_mode_exec_without_max_preserves_output_beyond_default() -> Result<
    skip_if_no_network!(Ok(()));

    let server = responses::start_mock_server().await;
-    let (_test, second_mock) = run_code_mode_turn(
+    let (_test, second_mock) = run_code_mode_turn_with_model_and_config(
        &server,
        "use exec_command from code mode",
        r#"// @exec: {"max_output_tokens": 20000}
 const result = await tools.exec_command({
  cmd: "python3 -c \"import sys; sys.stdout.write('x' * 50000)\""
 });
-text(result.output);
+const resultVariableWasTruncated = result.output.length !== 50000;
+text(`Variable truncated: ${resultVariableWasTruncated ? "True" : "False"}. Variable: ${result.output}`);
 "#,
+        TOKEN_POLICY_TEST_MODEL,
+        |_| {},
    )
    .await?;

-    assert_eq!(
-        text_item(
-            &custom_tool_output_items(&second_mock.single_request(), "call-1"),
-            /*index*/ 1
-        ),
-        "x".repeat(50_000)
+    let items = custom_tool_output_items(&second_mock.single_request(), "call-1");
+    let output = text_item(&items, /*index*/ 1);
+    assert_regex_match(
+        r"^Variable truncated: False\. Variable: x+…\d+ tokens truncated…x+$",
+        output,
    );

    Ok(())
@@ -1118,7 +1146,8 @@ text(result.output);

 #[cfg_attr(windows, ignore = "no exec_command on Windows")]
 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
-async fn code_mode_exec_without_max_preserves_output_beyond_truncation_policy() -> Result<()> {
+async fn code_mode_exec_without_nested_limit_preserves_result_variable_before_configured_history_truncation()
+-> Result<()> {
    // TODO(anp): Remove after Wine exec returns complete nested-tool output to code mode.
    skip_if_wine_exec!(
        Ok(()),
@@ -1127,35 +1156,45 @@ async fn code_mode_exec_without_max_preserves_output_beyond_truncation_policy()
    skip_if_no_network!(Ok(()));

    let server = responses::start_mock_server().await;
-    let (_test, second_mock) = run_code_mode_turn_with_config(
+    let (_test, second_mock) = run_code_mode_turn_with_model_and_config(
        &server,
        "use exec_command from code mode",
        r#"// @exec: {"max_output_tokens": 20000}
 const result = await tools.exec_command({
  cmd: "python3 -c \"import sys; sys.stdout.write('x' * 50000)\""
 });
-text(result.output);
+const resultVariableWasTruncated = result.output.length !== 50000;
+text(`Variable truncated: ${resultVariableWasTruncated ? "True" : "False"}. Variable: ${result.output}`);
 "#,
+        TOKEN_POLICY_TEST_MODEL,
        |config| {
            config.tool_output_token_limit = Some(50);
        },
    )
    .await?;

-    assert_eq!(
-        text_item(
-            &custom_tool_output_items(&second_mock.single_request(), "call-1"),
-            /*index*/ 1
-        ),
-        "x".repeat(50_000)
+    let items = custom_tool_output_items(&second_mock.single_request(), "call-1");
+    let output = text_item(&items, /*index*/ 1);
+    // The 50-token override must shrink this 50,000-character value far below
+    // what the default 10,000-token history cap would retain.
+    assert!(
+        output.len() < 1_000,
+        "expected configured history cap to truncate the emitted value, got {} bytes",
+        output.len()
+    );
+    assert_regex_match(
+        r"^Variable truncated: False\. Variable: x+…\d+ tokens truncated…x+$",
+        output,
    );

    Ok(())
 }

+// The outer directive limits output after JavaScript emits it; it does not
+// limit `result.output` returned by the nested command.
 #[cfg_attr(windows, ignore = "no exec_command on Windows")]
 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
-async fn code_mode_exec_explicit_max_output_tokens_truncates() -> Result<()> {
+async fn code_mode_exec_outer_limit_truncates_emitted_output() -> Result<()> {
    skip_if_no_network!(Ok(()));

    let server = responses::start_mock_server().await;