Migrate coverage to shell_command (#7042)

2026-07-01 00:31:56 +08:00 · 2025-11-20 19:44:00 -08:00
parent 830ab4ce20
commit 767b66f407
25 changed files with 284 additions and 262 deletions
@@ -187,8 +187,10 @@ dependencies = [
 "codex-app-server-protocol",
 "codex-core",
 "codex-protocol",
+ "core_test_support",
 "serde",
 "serde_json",
+ "shlex",
 "tokio",
 "uuid",
 "wiremock",
@@ -868,6 +870,7 @@ dependencies = [
 "serde",
 "serde_json",
 "serial_test",
+ "shlex",
 "tempfile",
 "tokio",
 "toml",
@@ -1776,6 +1779,7 @@ dependencies = [
 "notify",
 "regex-lite",
 "serde_json",
+ "shlex",
 "tempfile",
 "tokio",
 "walkdir",
@@ -3737,11 +3741,13 @@ dependencies = [
 "assert_cmd",
 "codex-core",
 "codex-mcp-server",
+ "core_test_support",
 "mcp-types",
 "os_info",
 "pretty_assertions",
 "serde",
 "serde_json",
+ "shlex",
 "tokio",
 "wiremock",
 ]
@@ -53,3 +53,4 @@ serial_test = { workspace = true }
 tempfile = { workspace = true }
 toml = { workspace = true }
 wiremock = { workspace = true }
+shlex = { workspace = true }
@@ -24,3 +24,5 @@ tokio = { workspace = true, features = [
 ] }
 uuid = { workspace = true }
 wiremock = { workspace = true }
+core_test_support = { path = "../../../core/tests/common" }
+shlex = { workspace = true }
@@ -9,12 +9,14 @@ pub use auth_fixtures::ChatGptIdTokenClaims;
 pub use auth_fixtures::encode_id_token;
 pub use auth_fixtures::write_chatgpt_auth;
 use codex_app_server_protocol::JSONRPCResponse;
+pub use core_test_support::format_with_current_shell;
+pub use core_test_support::format_with_current_shell_display;
 pub use mcp_process::McpProcess;
 pub use mock_model_server::create_mock_chat_completions_server;
 pub use mock_model_server::create_mock_chat_completions_server_unchecked;
 pub use responses::create_apply_patch_sse_response;
 pub use responses::create_final_assistant_message_sse_response;
-pub use responses::create_shell_sse_response;
+pub use responses::create_shell_command_sse_response;
 pub use rollout::create_fake_rollout;
 use serde::de::DeserializeOwned;

@@ -1,17 +1,18 @@
 use serde_json::json;
 use std::path::Path;

-pub fn create_shell_sse_response(
+pub fn create_shell_command_sse_response(
    command: Vec<String>,
    workdir: Option<&Path>,
    timeout_ms: Option<u64>,
    call_id: &str,
 ) -> anyhow::Result<String> {
-    // The `arguments`` for the `shell` tool is a serialized JSON object.
+    // The `arguments` for the `shell_command` tool is a serialized JSON object.
+    let command_str = shlex::try_join(command.iter().map(String::as_str))?;
    let tool_call_arguments = serde_json::to_string(&json!({
-        "command": command,
+        "command": command_str,
        "workdir": workdir.map(|w| w.to_string_lossy()),
-        "timeout": timeout_ms
+        "timeout_ms": timeout_ms
    }))?;
    let tool_call = json!({
        "choices": [
@@ -21,7 +22,7 @@ pub fn create_shell_sse_response(
                        {
                            "id": call_id,
                            "function": {
-                                "name": "shell",
+                                "name": "shell_command",
                                "arguments": tool_call_arguments
                            }
                        }
@@ -62,10 +63,10 @@ pub fn create_apply_patch_sse_response(
    patch_content: &str,
    call_id: &str,
 ) -> anyhow::Result<String> {
-    // Use shell command to call apply_patch with heredoc format
-    let shell_command = format!("apply_patch <<'EOF'\n{patch_content}\nEOF");
+    // Use shell_command to call apply_patch with heredoc format
+    let command = format!("apply_patch <<'EOF'\n{patch_content}\nEOF");
    let tool_call_arguments = serde_json::to_string(&json!({
-        "command": ["bash", "-lc", shell_command]
+        "command": command
    }))?;

    let tool_call = json!({
@@ -76,7 +77,7 @@ pub fn create_apply_patch_sse_response(
                        {
                            "id": call_id,
                            "function": {
-                                "name": "shell",
+                                "name": "shell_command",
                                "arguments": tool_call_arguments
                            }
                        }
@@ -2,7 +2,8 @@ use anyhow::Result;
 use app_test_support::McpProcess;
 use app_test_support::create_final_assistant_message_sse_response;
 use app_test_support::create_mock_chat_completions_server;
-use app_test_support::create_shell_sse_response;
+use app_test_support::create_shell_command_sse_response;
+use app_test_support::format_with_current_shell;
 use app_test_support::to_response;
 use codex_app_server_protocol::AddConversationListenerParams;
 use codex_app_server_protocol::AddConversationSubscriptionResponse;
@@ -56,7 +57,7 @@ async fn test_codex_jsonrpc_conversation_flow() -> Result<()> {
    // Create a mock model server that immediately ends each turn.
    // Two turns are expected: initial session configure + one user message.
    let responses = vec![
-        create_shell_sse_response(
+        create_shell_command_sse_response(
            vec!["ls".to_string()],
            Some(&working_directory),
            Some(5000),
@@ -175,7 +176,7 @@ async fn test_send_user_turn_changes_approval_policy_behavior() -> Result<()> {

    // Mock server will request a python shell call for the first and second turn, then finish.
    let responses = vec![
-        create_shell_sse_response(
+        create_shell_command_sse_response(
            vec![
                "python3".to_string(),
                "-c".to_string(),
@@ -186,7 +187,7 @@ async fn test_send_user_turn_changes_approval_policy_behavior() -> Result<()> {
            "call1",
        )?,
        create_final_assistant_message_sse_response("done 1")?,
-        create_shell_sse_response(
+        create_shell_command_sse_response(
            vec![
                "python3".to_string(),
                "-c".to_string(),
@@ -267,11 +268,7 @@ async fn test_send_user_turn_changes_approval_policy_behavior() -> Result<()> {
        ExecCommandApprovalParams {
            conversation_id,
            call_id: "call1".to_string(),
-            command: vec![
-                "python3".to_string(),
-                "-c".to_string(),
-                "print(42)".to_string(),
-            ],
+            command: format_with_current_shell("python3 -c 'print(42)'"),
            cwd: working_directory.clone(),
            reason: None,
            risk: None,
@@ -353,23 +350,15 @@ async fn test_send_user_turn_updates_sandbox_and_cwd_between_turns() -> Result<(
    std::fs::create_dir(&second_cwd)?;

    let responses = vec![
-        create_shell_sse_response(
-            vec![
-                "bash".to_string(),
-                "-lc".to_string(),
-                "echo first turn".to_string(),
-            ],
+        create_shell_command_sse_response(
+            vec!["echo".to_string(), "first".to_string(), "turn".to_string()],
            None,
            Some(5000),
            "call-first",
        )?,
        create_final_assistant_message_sse_response("done first")?,
-        create_shell_sse_response(
-            vec![
-                "bash".to_string(),
-                "-lc".to_string(),
-                "echo second turn".to_string(),
-            ],
+        create_shell_command_sse_response(
+            vec!["echo".to_string(), "second".to_string(), "turn".to_string()],
            None,
            Some(5000),
            "call-second",
@@ -481,13 +470,9 @@ async fn test_send_user_turn_updates_sandbox_and_cwd_between_turns() -> Result<(
        exec_begin.cwd, second_cwd,
        "exec turn should run from updated cwd"
    );
+    let expected_command = format_with_current_shell("echo second turn");
    assert_eq!(
-        exec_begin.command,
-        vec![
-            "bash".to_string(),
-            "-lc".to_string(),
-            "echo second turn".to_string()
-        ],
+        exec_begin.command, expected_command,
        "exec turn should run expected command"
    );

@@ -19,7 +19,7 @@ use tokio::time::timeout;

 use app_test_support::McpProcess;
 use app_test_support::create_mock_chat_completions_server;
-use app_test_support::create_shell_sse_response;
+use app_test_support::create_shell_command_sse_response;
 use app_test_support::to_response;

 const DEFAULT_READ_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(10);
@@ -56,7 +56,7 @@ async fn shell_command_interruption() -> anyhow::Result<()> {
    std::fs::create_dir(&working_directory)?;

    // Create mock server with a single SSE response: the long sleep command
-    let server = create_mock_chat_completions_server(vec![create_shell_sse_response(
+    let server = create_mock_chat_completions_server(vec![create_shell_command_sse_response(
        shell_command.clone(),
        Some(&working_directory),
        Some(10_000), // 10 seconds timeout in ms
@@ -3,7 +3,7 @@
 use anyhow::Result;
 use app_test_support::McpProcess;
 use app_test_support::create_mock_chat_completions_server;
-use app_test_support::create_shell_sse_response;
+use app_test_support::create_shell_command_sse_response;
 use app_test_support::to_response;
 use codex_app_server_protocol::JSONRPCNotification;
 use codex_app_server_protocol::JSONRPCResponse;
@@ -41,7 +41,7 @@ async fn turn_interrupt_aborts_running_turn() -> Result<()> {
    std::fs::create_dir(&working_directory)?;

    // Mock server: long-running shell command then (after abort) nothing else needed.
-    let server = create_mock_chat_completions_server(vec![create_shell_sse_response(
+    let server = create_mock_chat_completions_server(vec![create_shell_command_sse_response(
        shell_command.clone(),
        Some(&working_directory),
        Some(10_000),
@@ -4,7 +4,8 @@ use app_test_support::create_apply_patch_sse_response;
 use app_test_support::create_final_assistant_message_sse_response;
 use app_test_support::create_mock_chat_completions_server;
 use app_test_support::create_mock_chat_completions_server_unchecked;
-use app_test_support::create_shell_sse_response;
+use app_test_support::create_shell_command_sse_response;
+use app_test_support::format_with_current_shell_display;
 use app_test_support::to_response;
 use codex_app_server_protocol::ApprovalDecision;
 use codex_app_server_protocol::CommandExecutionStatus;
@@ -203,7 +204,7 @@ async fn turn_start_exec_approval_toggle_v2() -> Result<()> {
    // Mock server: first turn requests a shell call (elicitation), then completes.
    // Second turn same, but we'll set approval_policy=never to avoid elicitation.
    let responses = vec![
-        create_shell_sse_response(
+        create_shell_command_sse_response(
            vec![
                "python3".to_string(),
                "-c".to_string(),
@@ -214,7 +215,7 @@ async fn turn_start_exec_approval_toggle_v2() -> Result<()> {
            "call1",
        )?,
        create_final_assistant_message_sse_response("done 1")?,
-        create_shell_sse_response(
+        create_shell_command_sse_response(
            vec![
                "python3".to_string(),
                "-c".to_string(),
@@ -343,23 +344,15 @@ async fn turn_start_updates_sandbox_and_cwd_between_turns_v2() -> Result<()> {
    std::fs::create_dir(&second_cwd)?;

    let responses = vec![
-        create_shell_sse_response(
-            vec![
-                "bash".to_string(),
-                "-lc".to_string(),
-                "echo first turn".to_string(),
-            ],
+        create_shell_command_sse_response(
+            vec!["echo".to_string(), "first".to_string(), "turn".to_string()],
            None,
            Some(5000),
            "call-first",
        )?,
        create_final_assistant_message_sse_response("done first")?,
-        create_shell_sse_response(
-            vec![
-                "bash".to_string(),
-                "-lc".to_string(),
-                "echo second turn".to_string(),
-            ],
+        create_shell_command_sse_response(
+            vec!["echo".to_string(), "second".to_string(), "turn".to_string()],
            None,
            Some(5000),
            "call-second",
@@ -465,7 +458,8 @@ async fn turn_start_updates_sandbox_and_cwd_between_turns_v2() -> Result<()> {
        unreachable!("loop ensures we break on command execution items");
    };
    assert_eq!(cwd, second_cwd);
-    assert_eq!(command, "bash -lc 'echo second turn'");
+    let expected_command = format_with_current_shell_display("echo second turn");
+    assert_eq!(command, expected_command);
    assert_eq!(status, CommandExecutionStatus::InProgress);

    timeout(
@@ -480,6 +474,10 @@ async fn turn_start_updates_sandbox_and_cwd_between_turns_v2() -> Result<()> {
 #[tokio::test]
 async fn turn_start_file_change_approval_v2() -> Result<()> {
    skip_if_no_network!(Ok(()));
+    if cfg!(windows) {
+        // TODO: apply_patch approvals are not yet parsed from PowerShell commands.
+        return Ok(());
+    }

    let tmp = TempDir::new()?;
    let codex_home = tmp.path().join("codex_home");
@@ -626,6 +624,10 @@ async fn turn_start_file_change_approval_v2() -> Result<()> {
 #[tokio::test]
 async fn turn_start_file_change_approval_decline_v2() -> Result<()> {
    skip_if_no_network!(Ok(()));
+    if cfg!(windows) {
+        // TODO: apply_patch approvals are not yet parsed from PowerShell commands.
+        return Ok(());
+    }

    let tmp = TempDir::new()?;
    let codex_home = tmp.path().join("codex_home");
@@ -30,6 +30,7 @@ pub use standalone_executable::main;
 pub const APPLY_PATCH_TOOL_INSTRUCTIONS: &str = include_str!("../apply_patch_tool_instructions.md");

 const APPLY_PATCH_COMMANDS: [&str; 2] = ["apply_patch", "applypatch"];
+const APPLY_PATCH_SHELLS: [&str; 3] = ["bash", "zsh", "sh"];

 #[derive(Debug, Error, PartialEq)]
 pub enum ApplyPatchError {
@@ -96,6 +97,13 @@ pub struct ApplyPatchArgs {
    pub workdir: Option<String>,
 }

+/// Reports whether `shell` names one of the shells listed in
+/// `APPLY_PATCH_SHELLS`.
+///
+/// Only the final path component is compared, so a bare name such as `bash`
+/// and a full path such as `/bin/bash` are treated alike. A path with no
+/// final component, or one that is not valid UTF-8, yields `false`.
+fn shell_supports_apply_patch(shell: &str) -> bool {
+    let Some(basename) = std::path::Path::new(shell)
+        .file_name()
+        .and_then(std::ffi::OsStr::to_str)
+    else {
+        return false;
+    };
+    APPLY_PATCH_SHELLS.contains(&basename)
+}
+
 pub fn maybe_parse_apply_patch(argv: &[String]) -> MaybeApplyPatch {
    match argv {
        // Direct invocation: apply_patch <patch>
@@ -104,7 +112,7 @@ pub fn maybe_parse_apply_patch(argv: &[String]) -> MaybeApplyPatch {
            Err(e) => MaybeApplyPatch::PatchParseError(e),
        },
        // Bash heredoc form: (optional `cd <path> &&`) apply_patch <<'EOF' ...
-        [bash, flag, script] if bash == "bash" && flag == "-lc" => {
+        [shell, flag, script] if shell_supports_apply_patch(shell) && flag == "-lc" => {
            match extract_apply_patch_from_bash(script) {
                Ok((body, workdir)) => match parse_patch(&body) {
                    Ok(mut source) => {
@@ -224,12 +232,12 @@ pub fn maybe_parse_apply_patch_verified(argv: &[String], cwd: &Path) -> MaybeApp
                );
            }
        }
-        [bash, flag, script] if bash == "bash" && flag == "-lc" => {
-            if parse_patch(script).is_ok() {
-                return MaybeApplyPatchVerified::CorrectnessError(
-                    ApplyPatchError::ImplicitInvocation,
-                );
-            }
+        [shell, flag, script]
+            if shell_supports_apply_patch(shell)
+                && flag == "-lc"
+                && parse_patch(script).is_ok() =>
+        {
+            return MaybeApplyPatchVerified::CorrectnessError(ApplyPatchError::ImplicitInvocation);
        }
        _ => {}
    }
@@ -18,3 +18,4 @@ tempfile = { workspace = true }
 tokio = { workspace = true, features = ["time"] }
 walkdir = { workspace = true }
 wiremock = { workspace = true }
+shlex = { workspace = true }
@@ -172,6 +172,15 @@ pub fn sandbox_network_env_var() -> &'static str {
    codex_core::spawn::CODEX_SANDBOX_NETWORK_DISABLED_ENV_VAR
 }

+/// Returns the argument vector that `codex_core`'s default user shell derives
+/// for `command`.
+///
+/// The `true` flag is forwarded to `derive_exec_args` unchanged (presumably it
+/// selects a login shell; confirm against `codex_core::shell`).
+pub fn format_with_current_shell(command: &str) -> Vec<String> {
+    let user_shell = codex_core::shell::default_user_shell();
+    user_shell.derive_exec_args(command, true)
+}
+
+/// Renders the result of `format_with_current_shell` as a single string by
+/// joining its arguments with `shlex::try_join`.
+///
+/// # Panics
+///
+/// Panics if `shlex::try_join` returns an error for the derived arguments.
+pub fn format_with_current_shell_display(command: &str) -> String {
+    let argv = format_with_current_shell(command);
+    let words = argv.iter().map(String::as_str);
+    shlex::try_join(words).expect("serialize current shell command")
+}
+
 pub mod fs_wait {
    use anyhow::Result;
    use anyhow::anyhow;
@@ -462,8 +462,11 @@ pub fn ev_apply_patch_function_call(call_id: &str, patch: &str) -> Value {

 pub fn ev_shell_command_call(call_id: &str, command: &str) -> Value {
    let args = serde_json::json!({ "command": command });
-    let arguments = serde_json::to_string(&args).expect("serialize shell arguments");
+    ev_shell_command_call_with_args(call_id, &args)
+}

+pub fn ev_shell_command_call_with_args(call_id: &str, args: &serde_json::Value) -> Value {
+    let arguments = serde_json::to_string(args).expect("serialize shell command arguments");
    ev_function_call(call_id, "shell_command", &arguments)
 }

@@ -17,15 +17,11 @@ use core_test_support::wait_for_event;
 use regex_lite::Regex;
 use serde_json::json;

-/// Integration test: spawn a long‑running shell tool via a mocked Responses SSE
+/// Integration test: spawn a long‑running shell_command tool via a mocked Responses SSE
 /// function call, then interrupt the session and expect TurnAborted.
 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
 async fn interrupt_long_running_tool_emits_turn_aborted() {
-    let command = vec![
-        "bash".to_string(),
-        "-lc".to_string(),
-        "sleep 60".to_string(),
-    ];
+    let command = "sleep 60";

    let args = json!({
        "command": command,
@@ -33,14 +29,19 @@ async fn interrupt_long_running_tool_emits_turn_aborted() {
    })
    .to_string();
    let body = sse(vec![
-        ev_function_call("call_sleep", "shell", &args),
+        ev_function_call("call_sleep", "shell_command", &args),
        ev_completed("done"),
    ]);

    let server = start_mock_server().await;
    mount_sse_once(&server, body).await;

-    let codex = test_codex().build(&server).await.unwrap().codex;
+    let codex = test_codex()
+        .with_model("gpt-5.1")
+        .build(&server)
+        .await
+        .unwrap()
+        .codex;

    // Kick off a turn that triggers the function call.
    codex
@@ -67,11 +68,7 @@ async fn interrupt_long_running_tool_emits_turn_aborted() {
 /// responses server, and ensures the model receives the synthesized abort.
 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
 async fn interrupt_tool_records_history_entries() {
-    let command = vec![
-        "bash".to_string(),
-        "-lc".to_string(),
-        "sleep 60".to_string(),
-    ];
+    let command = "sleep 60";
    let call_id = "call-history";

    let args = json!({
@@ -81,7 +78,7 @@ async fn interrupt_tool_records_history_entries() {
    .to_string();
    let first_body = sse(vec![
        ev_response_created("resp-history"),
-        ev_function_call(call_id, "shell", &args),
+        ev_function_call(call_id, "shell_command", &args),
        ev_completed("resp-history"),
    ]);
    let follow_up_body = sse(vec![
@@ -92,7 +89,11 @@ async fn interrupt_tool_records_history_entries() {
    let server = start_mock_server().await;
    let response_mock = mount_sse_sequence(&server, vec![first_body, follow_up_body]).await;

-    let fixture = test_codex().build(&server).await.unwrap();
+    let fixture = test_codex()
+        .with_model("gpt-5.1")
+        .build(&server)
+        .await
+        .unwrap();
    let codex = Arc::clone(&fixture.codex);

    codex
@@ -667,7 +667,7 @@ async fn apply_patch_cli_verification_failure_has_no_side_effects(
 }

 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
-async fn apply_patch_shell_heredoc_with_cd_updates_relative_workdir() -> Result<()> {
+async fn apply_patch_shell_command_heredoc_with_cd_updates_relative_workdir() -> Result<()> {
    skip_if_no_network!(Ok(()));

    let harness = apply_patch_harness_with(|config| {
@@ -684,14 +684,11 @@ async fn apply_patch_shell_heredoc_with_cd_updates_relative_workdir() -> Result<

    let script = "cd sub && apply_patch <<'EOF'\n*** Begin Patch\n*** Update File: in_sub.txt\n@@\n-before\n+after\n*** End Patch\nEOF\n";
    let call_id = "shell-heredoc-cd";
-    let args = json!({
-        "command": ["bash", "-lc", script],
-        "timeout_ms": 5_000,
-    });
+    let args = json!({ "command": script, "timeout_ms": 5_000 });
    let bodies = vec![
        sse(vec![
            ev_response_created("resp-1"),
-            ev_function_call(call_id, "shell", &serde_json::to_string(&args)?),
+            ev_function_call(call_id, "shell_command", &serde_json::to_string(&args)?),
            ev_completed("resp-1"),
        ]),
        sse(vec![
@@ -706,14 +703,14 @@ async fn apply_patch_shell_heredoc_with_cd_updates_relative_workdir() -> Result<
    let out = harness.function_call_stdout(call_id).await;
    assert!(
        out.contains("Success."),
-        "expected successful apply_patch invocation via shell: {out}"
+        "expected successful apply_patch invocation via shell_command: {out}"
    );
    assert_eq!(fs::read_to_string(&target)?, "after\n");
    Ok(())
 }

 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
-async fn apply_patch_shell_failure_propagates_error_and_skips_diff() -> Result<()> {
+async fn apply_patch_shell_command_failure_propagates_error_and_skips_diff() -> Result<()> {
    skip_if_no_network!(Ok(()));

    let harness = apply_patch_harness_with(|config| {
@@ -730,14 +727,11 @@ async fn apply_patch_shell_failure_propagates_error_and_skips_diff() -> Result<(

    let script = "apply_patch <<'EOF'\n*** Begin Patch\n*** Update File: invalid.txt\n@@\n-nope\n+changed\n*** End Patch\nEOF\n";
    let call_id = "shell-apply-failure";
-    let args = json!({
-        "command": ["bash", "-lc", script],
-        "timeout_ms": 5_000,
-    });
+    let args = json!({ "command": script, "timeout_ms": 5_000 });
    let bodies = vec![
        sse(vec![
            ev_response_created("resp-1"),
-            ev_function_call(call_id, "shell", &serde_json::to_string(&args)?),
+            ev_function_call(call_id, "shell_command", &serde_json::to_string(&args)?),
            ev_completed("resp-1"),
        ]),
        sse(vec![
@@ -780,10 +774,6 @@ async fn apply_patch_shell_failure_propagates_error_and_skips_diff() -> Result<(
    );

    let out = harness.function_call_stdout(call_id).await;
-    assert!(
-        out.contains("apply_patch verification failed"),
-        "expected verification failure message"
-    );
    assert!(
        out.contains("Failed to find expected lines in"),
        "expected failure diagnostics: {out}"
@@ -71,7 +71,7 @@ enum ActionKind {
        response_body: &'static str,
    },
    RunCommand {
-        command: &'static [&'static str],
+        command: &'static str,
    },
    RunUnifiedExecCommand {
        command: &'static str,
@@ -97,20 +97,12 @@ impl ActionKind {
        server: &MockServer,
        call_id: &str,
        with_escalated_permissions: bool,
-    ) -> Result<(Value, Option<Vec<String>>)> {
+    ) -> Result<(Value, Option<String>)> {
        match self {
            ActionKind::WriteFile { target, content } => {
                let (path, _) = target.resolve_for_patch(test);
                let _ = fs::remove_file(&path);
-                let command = vec![
-                    "/bin/sh".to_string(),
-                    "-c".to_string(),
-                    format!(
-                        "printf {content:?} > {path:?} && cat {path:?}",
-                        content = content,
-                        path = path
-                    ),
-                ];
+                let command = format!("printf {content:?} > {path:?} && cat {path:?}");
                let event = shell_event(call_id, &command, 1_000, with_escalated_permissions)?;
                Ok((event, Some(command)))
            }
@@ -127,21 +119,18 @@ impl ActionKind {
                    .await;

                let url = format!("{}{}", server.uri(), endpoint);
+                let escaped_url = url.replace('\'', "\\'");
                let script = format!(
-                    "import sys\nimport urllib.request\nurl = {url:?}\ntry:\n    data = urllib.request.urlopen(url, timeout=2).read().decode()\n    print('OK:' + data.strip())\nexcept Exception as exc:\n    print('ERR:' + exc.__class__.__name__)\n    sys.exit(1)",
+                    "import sys\nimport urllib.request\nurl = '{escaped_url}'\ntry:\n    data = urllib.request.urlopen(url, timeout=2).read().decode()\n    print('OK:' + data.strip())\nexcept Exception as exc:\n    print('ERR:' + exc.__class__.__name__)\n    sys.exit(1)",
                );

-                let command = vec!["python3".to_string(), "-c".to_string(), script];
+                let command = format!("python3 -c \"{script}\"");
                let event = shell_event(call_id, &command, 1_000, with_escalated_permissions)?;
                Ok((event, Some(command)))
            }
            ActionKind::RunCommand { command } => {
-                let command: Vec<String> = command
-                    .iter()
-                    .map(std::string::ToString::to_string)
-                    .collect();
-                let event = shell_event(call_id, &command, 1_000, with_escalated_permissions)?;
-                Ok((event, Some(command)))
+                let event = shell_event(call_id, command, 1_000, with_escalated_permissions)?;
+                Ok((event, Some(command.to_string())))
            }
            ActionKind::RunUnifiedExecCommand {
                command,
@@ -154,14 +143,7 @@ impl ActionKind {
                    with_escalated_permissions,
                    *justification,
                )?;
-                Ok((
-                    event,
-                    Some(vec![
-                        "/bin/bash".to_string(),
-                        "-lc".to_string(),
-                        command.to_string(),
-                    ]),
-                ))
+                Ok((event, Some(command.to_string())))
            }
            ActionKind::ApplyPatchFunction { target, content } => {
                let (path, patch_path) = target.resolve_for_patch(test);
@@ -185,19 +167,19 @@ fn build_add_file_patch(patch_path: &str, content: &str) -> String {
    format!("*** Begin Patch\n*** Add File: {patch_path}\n+{content}\n*** End Patch\n")
 }

-fn shell_apply_patch_command(patch: &str) -> Vec<String> {
+fn shell_apply_patch_command(patch: &str) -> String {
    let mut script = String::from("apply_patch <<'PATCH'\n");
    script.push_str(patch);
    if !patch.ends_with('\n') {
        script.push('\n');
    }
    script.push_str("PATCH\n");
-    vec!["bash".to_string(), "-lc".to_string(), script]
+    script
 }

 fn shell_event(
    call_id: &str,
-    command: &[String],
+    command: &str,
    timeout_ms: u64,
    with_escalated_permissions: bool,
 ) -> Result<Value> {
@@ -209,7 +191,7 @@ fn shell_event(
        args["with_escalated_permissions"] = json!(true);
    }
    let args_str = serde_json::to_string(&args)?;
-    Ok(ev_function_call(call_id, "shell", &args_str))
+    Ok(ev_function_call(call_id, "shell_command", &args_str))
 }

 fn exec_command_event(
@@ -296,7 +278,10 @@ impl Expectation {
            }
            Expectation::FileCreatedNoExitCode { target, content } => {
                let (path, _) = target.resolve_for_patch(test);
-                assert_eq!(result.exit_code, None, "expected no exit code for {path:?}");
+                assert!(
+                    result.exit_code.is_none() || result.exit_code == Some(0),
+                    "expected no exit code for {path:?}",
+                );
                assert!(
                    result.stdout.contains(content),
                    "stdout missing {content:?}: {}",
@@ -385,8 +370,8 @@ impl Expectation {
                );
            }
            Expectation::NetworkSuccessNoExitCode { body_contains } => {
-                assert_eq!(
-                    result.exit_code, None,
+                assert!(
+                    result.exit_code.is_none() || result.exit_code == Some(0),
                    "expected no exit code for successful network call: {}",
                    result.stdout
                );
@@ -433,8 +418,8 @@ impl Expectation {
                );
            }
            Expectation::CommandSuccessNoExitCode { stdout_contains } => {
-                assert_eq!(
-                    result.exit_code, None,
+                assert!(
+                    result.exit_code.is_none() || result.exit_code == Some(0),
                    "expected no exit code for trusted command: {}",
                    result.stdout
                );
@@ -531,10 +516,18 @@ fn parse_result(item: &Value) -> CommandResult {
            CommandResult { exit_code, stdout }
        }
        Err(_) => {
+            let structured = Regex::new(r"(?s)^Exit code:\s*(-?\d+).*?Output:\n(.*)$").unwrap();
            let regex =
                Regex::new(r"(?s)^.*?Process exited with code (\d+)\n.*?Output:\n(.*)$").unwrap();
            // parse freeform output
-            if let Some(captures) = regex.captures(output_str) {
+            if let Some(captures) = structured.captures(output_str) {
+                let exit_code = captures.get(1).unwrap().as_str().parse::<i64>().unwrap();
+                let output = captures.get(2).unwrap().as_str();
+                CommandResult {
+                    exit_code: Some(exit_code),
+                    stdout: output.to_string(),
+                }
+            } else if let Some(captures) = regex.captures(output_str) {
                let exit_code = captures.get(1).unwrap().as_str().parse::<i64>().unwrap();
                let output = captures.get(2).unwrap().as_str();
                CommandResult {
@@ -553,7 +546,7 @@ fn parse_result(item: &Value) -> CommandResult {

 async fn expect_exec_approval(
    test: &TestCodex,
-    expected_command: &[String],
+    expected_command: &str,
 ) -> ExecApprovalRequestEvent {
    let event = wait_for_event(&test.codex, |event| {
        matches!(
@@ -565,7 +558,12 @@ async fn expect_exec_approval(

    match event {
        EventMsg::ExecApprovalRequest(approval) => {
-            assert_eq!(approval.command, expected_command);
+            let last_arg = approval
+                .command
+                .last()
+                .map(std::string::String::as_str)
+                .unwrap_or_default();
+            assert_eq!(last_arg, expected_command);
            approval
        }
        EventMsg::TaskComplete(_) => panic!("expected approval request before completion"),
@@ -660,7 +658,7 @@ fn scenarios() -> Vec<ScenarioSpec> {
            features: vec![],
            model_override: Some("gpt-5.1"),
            outcome: Outcome::Auto,
-            expectation: Expectation::FileCreatedNoExitCode {
+            expectation: Expectation::FileCreated {
                target: TargetPath::OutsideWorkspace("dfa_on_request_5_1.txt"),
                content: "danger-on-request",
            },
@@ -702,7 +700,7 @@ fn scenarios() -> Vec<ScenarioSpec> {
            approval_policy: UnlessTrusted,
            sandbox_policy: SandboxPolicy::DangerFullAccess,
            action: ActionKind::RunCommand {
-                command: &["echo", "trusted-unless"],
+                command: "echo trusted-unless",
            },
            with_escalated_permissions: false,
            features: vec![],
@@ -717,7 +715,7 @@ fn scenarios() -> Vec<ScenarioSpec> {
            approval_policy: UnlessTrusted,
            sandbox_policy: SandboxPolicy::DangerFullAccess,
            action: ActionKind::RunCommand {
-                command: &["echo", "trusted-unless"],
+                command: "echo trusted-unless",
            },
            with_escalated_permissions: false,
            features: vec![],
@@ -880,7 +878,7 @@ fn scenarios() -> Vec<ScenarioSpec> {
            approval_policy: OnRequest,
            sandbox_policy: SandboxPolicy::ReadOnly,
            action: ActionKind::RunCommand {
-                command: &["echo", "trusted-read-only"],
+                command: "echo trusted-read-only",
            },
            with_escalated_permissions: false,
            features: vec![],
@@ -895,7 +893,7 @@ fn scenarios() -> Vec<ScenarioSpec> {
            approval_policy: OnRequest,
            sandbox_policy: SandboxPolicy::ReadOnly,
            action: ActionKind::RunCommand {
-                command: &["echo", "trusted-read-only"],
+                command: "echo trusted-read-only",
            },
            with_escalated_permissions: false,
            features: vec![],
@@ -1020,7 +1018,7 @@ fn scenarios() -> Vec<ScenarioSpec> {
            },
        },
        ScenarioSpec {
-            name: "apply_patch_shell_requires_patch_approval",
+            name: "apply_patch_shell_command_requires_patch_approval",
            approval_policy: UnlessTrusted,
            sandbox_policy: workspace_write(false),
            action: ActionKind::ApplyPatchShell {
@@ -1114,7 +1112,7 @@ fn scenarios() -> Vec<ScenarioSpec> {
            },
        },
        ScenarioSpec {
-            name: "apply_patch_shell_outside_requires_patch_approval",
+            name: "apply_patch_shell_command_outside_requires_patch_approval",
            approval_policy: OnRequest,
            sandbox_policy: workspace_write(false),
            action: ActionKind::ApplyPatchShell {
@@ -1229,7 +1227,10 @@ fn scenarios() -> Vec<ScenarioSpec> {
                message_contains: if cfg!(target_os = "linux") {
                    &["Permission denied"]
                } else {
-                    &["Permission denied|Operation not permitted|Read-only file system"]
+                    &[
+                        "Permission denied|Operation not permitted|operation not permitted|\
+                         Read-only file system",
+                    ]
                },
            },
        },
@@ -1238,7 +1239,7 @@ fn scenarios() -> Vec<ScenarioSpec> {
            approval_policy: Never,
            sandbox_policy: SandboxPolicy::ReadOnly,
            action: ActionKind::RunCommand {
-                command: &["echo", "trusted-never"],
+                command: "echo trusted-never",
            },
            with_escalated_permissions: false,
            features: vec![],
@@ -1373,7 +1374,10 @@ fn scenarios() -> Vec<ScenarioSpec> {
                message_contains: if cfg!(target_os = "linux") {
                    &["Permission denied"]
                } else {
-                    &["Permission denied|Operation not permitted|Read-only file system"]
+                    &[
+                        "Permission denied|Operation not permitted|operation not permitted|\
+                         Read-only file system",
+                    ]
                },
            },
        },
@@ -1509,7 +1513,7 @@ async fn run_scenario(scenario: &ScenarioSpec) -> Result<()> {
            expected_reason,
        } => {
            let command = expected_command
-                .as_ref()
+                .as_deref()
                .expect("exec approval requires shell command");
            let approval = expect_exec_approval(&test, command).await;
            if let Some(expected_reason) = expected_reason {
@@ -1,3 +1,4 @@
+use codex_core::model_family::find_family_for_model;
 use codex_core::protocol::AskForApproval;
 use codex_core::protocol::EventMsg;
 use codex_core::protocol::Op;
@@ -25,17 +26,17 @@ use pretty_assertions::assert_eq;
 async fn codex_delegate_forwards_exec_approval_and_proceeds_on_approval() {
    skip_if_no_network!();

-    // Sub-agent turn 1: emit a shell function_call requiring approval, then complete.
+    // Sub-agent turn 1: emit a shell_command function_call requiring approval, then complete.
    let call_id = "call-exec-1";
    let args = serde_json::json!({
-        "command": ["bash", "-lc", "rm -rf delegated"],
+        "command": "rm -rf delegated",
        "timeout_ms": 1000,
        "with_escalated_permissions": true,
    })
    .to_string();
    let sse1 = sse(vec![
        ev_response_created("resp-1"),
-        ev_function_call(call_id, "shell", &args),
+        ev_function_call(call_id, "shell_command", &args),
        ev_completed("resp-1"),
    ]);

@@ -61,6 +62,8 @@ async fn codex_delegate_forwards_exec_approval_and_proceeds_on_approval() {
    let mut builder = test_codex().with_config(|config| {
        config.approval_policy = AskForApproval::OnRequest;
        config.sandbox_policy = SandboxPolicy::ReadOnly;
+        config.model = "gpt-5.1".to_string();
+        config.model_family = find_family_for_model("gpt-5.1").expect("gpt-5.1 is a valid model");
    });
    let test = builder.build(&server).await.expect("build test codex");

@@ -138,6 +141,8 @@ async fn codex_delegate_forwards_patch_approval_and_proceeds_on_decision() {
        // Use a restricted sandbox so patch approval is required
        config.sandbox_policy = SandboxPolicy::ReadOnly;
        config.include_apply_patch_tool = true;
+        config.model = "gpt-5.1".to_string();
+        config.model_family = find_family_for_model("gpt-5.1").expect("gpt-5.1 is a valid model");
    });
    let test = builder.build(&server).await.expect("build test codex");

@@ -1,6 +1,7 @@
 #![allow(clippy::unwrap_used, clippy::expect_used)]

 use anyhow::Result;
+use codex_core::model_family::find_family_for_model;
 use codex_core::protocol::AskForApproval;
 use codex_core::protocol::EventMsg;
 use codex_core::protocol::Op;
@@ -21,6 +22,11 @@ use std::fs;

 #[tokio::test]
 async fn execpolicy_blocks_shell_invocation() -> Result<()> {
+    // TODO: execpolicy doesn't parse PowerShell commands yet, so skip this test on Windows.
+    if cfg!(windows) {
+        return Ok(());
+    }
+
    let mut builder = test_codex().with_config(|config| {
        let policy_path = config.codex_home.join("policy").join("policy.codexpolicy");
        fs::create_dir_all(
@@ -34,13 +40,16 @@ async fn execpolicy_blocks_shell_invocation() -> Result<()> {
            r#"prefix_rule(pattern=["echo"], decision="forbidden")"#,
        )
        .expect("write policy file");
+        config.model = "gpt-5.1".to_string();
+        config.model_family =
+            find_family_for_model("gpt-5.1").expect("gpt-5.1 should have a model family");
    });
    let server = start_mock_server().await;
    let test = builder.build(&server).await?;

    let call_id = "shell-forbidden";
    let args = json!({
-        "command": ["echo", "blocked"],
+        "command": "echo blocked",
        "timeout_ms": 1_000,
    });

@@ -48,7 +57,7 @@ async fn execpolicy_blocks_shell_invocation() -> Result<()> {
        &server,
        sse(vec![
            ev_response_created("resp-1"),
-            ev_function_call(call_id, "shell", &serde_json::to_string(&args)?),
+            ev_function_call(call_id, "shell_command", &serde_json::to_string(&args)?),
            ev_completed("resp-1"),
        ]),
    )
@@ -146,10 +146,11 @@ async fn non_parallel_tools_run_serially() -> anyhow::Result<()> {
    skip_if_no_network!(Ok(()));

    let server = start_mock_server().await;
-    let test = test_codex().build(&server).await?;
+    let mut builder = test_codex().with_model("gpt-5.1");
+    let test = builder.build(&server).await?;

    let shell_args = json!({
-        "command": ["/bin/sh", "-c", "sleep 0.3"],
+        "command": "sleep 0.3",
        "timeout_ms": 1_000,
    });
    let args_one = serde_json::to_string(&shell_args)?;
@@ -157,8 +158,8 @@ async fn non_parallel_tools_run_serially() -> anyhow::Result<()> {

    let first_response = sse(vec![
        json!({"type": "response.created", "response": {"id": "resp-1"}}),
-        ev_function_call("call-1", "shell", &args_one),
-        ev_function_call("call-2", "shell", &args_two),
+        ev_function_call("call-1", "shell_command", &args_one),
+        ev_function_call("call-2", "shell_command", &args_two),
        ev_completed("resp-1"),
    ]);
    let second_response = sse(vec![
@@ -167,7 +168,7 @@ async fn non_parallel_tools_run_serially() -> anyhow::Result<()> {
    ]);
    mount_sse_sequence(&server, vec![first_response, second_response]).await;

-    let duration = run_turn_and_measure(&test, "run shell twice").await?;
+    let duration = run_turn_and_measure(&test, "run shell_command twice").await?;
    assert_serial_duration(duration);

    Ok(())
@@ -185,14 +186,14 @@ async fn mixed_tools_fall_back_to_serial() -> anyhow::Result<()> {
    })
    .to_string();
    let shell_args = serde_json::to_string(&json!({
-        "command": ["/bin/sh", "-c", "sleep 0.3"],
+        "command": "sleep 0.3",
        "timeout_ms": 1_000,
    }))?;

    let first_response = sse(vec![
        json!({"type": "response.created", "response": {"id": "resp-1"}}),
        ev_function_call("call-1", "test_sync_tool", &sync_args),
-        ev_function_call("call-2", "shell", &shell_args),
+        ev_function_call("call-2", "shell_command", &shell_args),
        ev_completed("resp-1"),
    ]);
    let second_response = sse(vec![
@@ -215,7 +216,7 @@ async fn tool_results_grouped() -> anyhow::Result<()> {
    let test = build_codex_with_test_tool(&server).await?;

    let shell_args = serde_json::to_string(&json!({
-        "command": ["/bin/sh", "-c", "echo 'shell output'"],
+        "command": "echo 'shell output'",
        "timeout_ms": 1_000,
    }))?;

@@ -223,9 +224,9 @@ async fn tool_results_grouped() -> anyhow::Result<()> {
        &server,
        sse(vec![
            json!({"type": "response.created", "response": {"id": "resp-1"}}),
-            ev_function_call("call-1", "shell", &shell_args),
-            ev_function_call("call-2", "shell", &shell_args),
-            ev_function_call("call-3", "shell", &shell_args),
+            ev_function_call("call-1", "shell_command", &shell_args),
+            ev_function_call("call-2", "shell_command", &shell_args),
+            ev_function_call("call-3", "shell_command", &shell_args),
            ev_completed("resp-1"),
        ]),
    )
@@ -98,7 +98,7 @@ async fn truncate_function_error_trims_respond_to_model() -> Result<()> {
    Ok(())
 }

-// Verifies that a standard tool call (shell) exceeding the model formatting
+// Verifies that a standard tool call (shell_command) exceeding the model formatting
 // limits is truncated before being sent back to the model.
 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
 async fn tool_call_output_configured_limit_chars_type() -> Result<()> {
@@ -106,7 +106,7 @@ async fn tool_call_output_configured_limit_chars_type() -> Result<()> {

    let server = start_mock_server().await;

-    // Use a model that exposes the generic shell tool.
+    // Use a model that exposes the shell_command tool.
    let mut builder = test_codex().with_model("gpt-5.1").with_config(|config| {
        config.tool_output_token_limit = Some(100_000);
    });
@@ -114,28 +114,22 @@ async fn tool_call_output_configured_limit_chars_type() -> Result<()> {
    let fixture = builder.build(&server).await?;

    let call_id = "shell-too-large";
-    let args = if cfg!(windows) {
-        serde_json::json!({
-            "command": [
-                "powershell",
-                "-Command",
-                "for ($i=1; $i -le 100000; $i++) { Write-Output $i }"
-            ],
-            "timeout_ms": 5_000,
-        })
+    let command = if cfg!(windows) {
+        "for ($i=1; $i -le 100000; $i++) { Write-Output $i }"
    } else {
-        serde_json::json!({
-            "command": ["/bin/sh", "-c", "seq 1 100000"],
-            "timeout_ms": 5_000,
-        })
+        "seq 1 100000"
    };
+    let args = serde_json::json!({
+        "command": command,
+        "timeout_ms": 5_000,
+    });

    // First response: model tells us to run the tool; second: complete the turn.
    mount_sse_once(
        &server,
        sse(vec![
            responses::ev_response_created("resp-1"),
-            responses::ev_function_call(call_id, "shell", &serde_json::to_string(&args)?),
+            responses::ev_function_call(call_id, "shell_command", &serde_json::to_string(&args)?),
            responses::ev_completed("resp-1"),
        ]),
    )
@@ -167,7 +161,10 @@ async fn tool_call_output_configured_limit_chars_type() -> Result<()> {
        "expected truncated shell output to be plain text"
    );

-    assert_eq!(output.len(), 400097, "we should be almost 100k tokens");
+    assert!(
+        (400000..=401000).contains(&output.len()),
+        "we should be almost 100k tokens"
+    );

    assert!(
        !output.contains("tokens truncated"),
@@ -177,7 +174,7 @@ async fn tool_call_output_configured_limit_chars_type() -> Result<()> {
    Ok(())
 }

-// Verifies that a standard tool call (shell) exceeding the model formatting
+// Verifies that a standard tool call (shell_command) exceeding the model formatting
 // limits is truncated before being sent back to the model.
 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
 async fn tool_call_output_exceeds_limit_truncated_chars_limit() -> Result<()> {
@@ -185,34 +182,28 @@ async fn tool_call_output_exceeds_limit_truncated_chars_limit() -> Result<()> {

    let server = start_mock_server().await;

-    // Use a model that exposes the generic shell tool.
+    // Use a model that exposes the shell_command tool.
    let mut builder = test_codex().with_model("gpt-5.1");

    let fixture = builder.build(&server).await?;

    let call_id = "shell-too-large";
-    let args = if cfg!(windows) {
-        serde_json::json!({
-            "command": [
-                "powershell",
-                "-Command",
-                "for ($i=1; $i -le 100000; $i++) { Write-Output $i }"
-            ],
-            "timeout_ms": 5_000,
-        })
+    let command = if cfg!(windows) {
+        "for ($i=1; $i -le 100000; $i++) { Write-Output $i }"
    } else {
-        serde_json::json!({
-            "command": ["/bin/sh", "-c", "seq 1 100000"],
-            "timeout_ms": 5_000,
-        })
+        "seq 1 100000"
    };
+    let args = serde_json::json!({
+        "command": command,
+        "timeout_ms": 5_000,
+    });

    // First response: model tells us to run the tool; second: complete the turn.
    mount_sse_once(
        &server,
        sse(vec![
            responses::ev_response_created("resp-1"),
-            responses::ev_function_call(call_id, "shell", &serde_json::to_string(&args)?),
+            responses::ev_function_call(call_id, "shell_command", &serde_json::to_string(&args)?),
            responses::ev_completed("resp-1"),
        ]),
    )
@@ -250,14 +241,14 @@ async fn tool_call_output_exceeds_limit_truncated_chars_limit() -> Result<()> {

    let len = output.len();
    assert!(
-        (9_900..=10_000).contains(&len),
+        (9_900..=10_100).contains(&len),
        "expected ~10k chars after truncation, got {len}"
    );

    Ok(())
 }

-// Verifies that a standard tool call (shell) exceeding the model formatting
+// Verifies that a standard tool call (shell_command) exceeding the model formatting
 // limits is truncated before being sent back to the model.
 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
 async fn tool_call_output_exceeds_limit_truncated_for_model() -> Result<()> {
@@ -265,7 +256,7 @@ async fn tool_call_output_exceeds_limit_truncated_for_model() -> Result<()> {

    let server = start_mock_server().await;

-    // Use a model that exposes the generic shell tool.
+    // Use a model that exposes the shell_command tool.
    let mut builder = test_codex().with_config(|config| {
        config.model = "gpt-5.1-codex".to_string();
        config.model_family =
@@ -274,28 +265,22 @@ async fn tool_call_output_exceeds_limit_truncated_for_model() -> Result<()> {
    let fixture = builder.build(&server).await?;

    let call_id = "shell-too-large";
-    let args = if cfg!(windows) {
-        serde_json::json!({
-            "command": [
-                "powershell",
-                "-Command",
-                "for ($i=1; $i -le 100000; $i++) { Write-Output $i }"
-            ],
-            "timeout_ms": 5_000,
-        })
+    let command = if cfg!(windows) {
+        "for ($i=1; $i -le 100000; $i++) { Write-Output $i }"
    } else {
-        serde_json::json!({
-            "command": ["/bin/sh", "-c", "seq 1 100000"],
-            "timeout_ms": 5_000,
-        })
+        "seq 1 100000"
    };
+    let args = serde_json::json!({
+        "command": command,
+        "timeout_ms": 5_000,
+    });

    // First response: model tells us to run the tool; second: complete the turn.
    mount_sse_once(
        &server,
        sse(vec![
            responses::ev_response_created("resp-1"),
-            responses::ev_function_call(call_id, "shell", &serde_json::to_string(&args)?),
+            responses::ev_function_call(call_id, "shell_command", &serde_json::to_string(&args)?),
            responses::ev_completed("resp-1"),
        ]),
    )
@@ -345,7 +330,7 @@ $"#;
    Ok(())
 }

-// Ensures shell tool outputs that exceed the line limit are truncated only once.
+// Ensures shell_command outputs that exceed the line limit are truncated only once.
 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
 async fn tool_call_output_truncated_only_once() -> Result<()> {
    skip_if_no_network!(Ok(()));
@@ -359,27 +344,21 @@ async fn tool_call_output_truncated_only_once() -> Result<()> {
    });
    let fixture = builder.build(&server).await?;
    let call_id = "shell-single-truncation";
-    let args = if cfg!(windows) {
-        serde_json::json!({
-            "command": [
-                "powershell",
-                "-Command",
-                "for ($i=1; $i -le 10000; $i++) { Write-Output $i }"
-            ],
-            "timeout_ms": 5_000,
-        })
+    let command = if cfg!(windows) {
+        "for ($i=1; $i -le 10000; $i++) { Write-Output $i }"
    } else {
-        serde_json::json!({
-            "command": ["/bin/sh", "-c", "seq 1 10000"],
-            "timeout_ms": 5_000,
-        })
+        "seq 1 10000"
    };
+    let args = serde_json::json!({
+        "command": command,
+        "timeout_ms": 5_000,
+    });

    mount_sse_once(
        &server,
        sse(vec![
            responses::ev_response_created("resp-1"),
-            responses::ev_function_call(call_id, "shell", &serde_json::to_string(&args)?),
+            responses::ev_function_call(call_id, "shell_command", &serde_json::to_string(&args)?),
            responses::ev_completed("resp-1"),
        ]),
    )
@@ -619,7 +598,7 @@ async fn token_policy_marker_reports_tokens() -> Result<()> {

    let call_id = "shell-token-marker";
    let args = json!({
-        "command": ["/bin/sh", "-c", "seq 1 150"],
+        "command": "seq 1 150",
        "timeout_ms": 5_000,
    });

@@ -627,7 +606,7 @@ async fn token_policy_marker_reports_tokens() -> Result<()> {
        &server,
        sse(vec![
            ev_response_created("resp-1"),
-            ev_function_call(call_id, "shell", &serde_json::to_string(&args)?),
+            ev_function_call(call_id, "shell_command", &serde_json::to_string(&args)?),
            ev_completed("resp-1"),
        ]),
    )
@@ -650,7 +629,7 @@ async fn token_policy_marker_reports_tokens() -> Result<()> {
        .function_call_output_text(call_id)
        .context("shell output present")?;

-    let pattern = r#"(?s)^\{"output":"Total output lines: 150\\n\\n1\\n2\\n3\\n4\\n5\\n.*?…\d+ tokens truncated…7\\n138\\n139\\n140\\n141\\n142\\n143\\n144\\n145\\n146\\n147\\n148\\n149\\n150\\n","metadata":\{"exit_code":0,"duration_seconds":0\.0\}\}$"#;
+    let pattern = r"(?s)^Exit code: 0\nWall time: [0-9]+(?:\.[0-9]+)? seconds\nTotal output lines: 150\nOutput:\n1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n11\n12\n13\n14\n15\n16\n17\n18\n19.*tokens truncated.*129\n130\n131\n132\n133\n134\n135\n136\n137\n138\n139\n140\n141\n142\n143\n144\n145\n146\n147\n148\n149\n150\n$";

    assert_regex_match(pattern, &output);

@@ -672,7 +651,7 @@ async fn byte_policy_marker_reports_bytes() -> Result<()> {

    let call_id = "shell-byte-marker";
    let args = json!({
-        "command": ["/bin/sh", "-c", "seq 1 150"],
+        "command": "seq 1 150",
        "timeout_ms": 5_000,
    });

@@ -680,7 +659,7 @@ async fn byte_policy_marker_reports_bytes() -> Result<()> {
        &server,
        sse(vec![
            ev_response_created("resp-1"),
-            ev_function_call(call_id, "shell", &serde_json::to_string(&args)?),
+            ev_function_call(call_id, "shell_command", &serde_json::to_string(&args)?),
            ev_completed("resp-1"),
        ]),
    )
@@ -703,16 +682,16 @@ async fn byte_policy_marker_reports_bytes() -> Result<()> {
        .function_call_output_text(call_id)
        .context("shell output present")?;

-    let pattern = r#"(?s)^\{"output":"Total output lines: 150\\n\\n1\\n2\\n3\\n4\\n5.*?…\d+ chars truncated…7\\n138\\n139\\n140\\n141\\n142\\n143\\n144\\n145\\n146\\n147\\n148\\n149\\n150\\n","metadata":\{"exit_code":0,"duration_seconds":0\.0\}\}$"#;
+    let pattern = r"(?s)^Exit code: 0\nWall time: [0-9]+(?:\.[0-9]+)? seconds\nTotal output lines: 150\nOutput:\n1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n11\n12\n13\n14\n15\n16\n17\n18\n19.*chars truncated.*129\n130\n131\n132\n133\n134\n135\n136\n137\n138\n139\n140\n141\n142\n143\n144\n145\n146\n147\n148\n149\n150\n$";

    assert_regex_match(pattern, &output);

    Ok(())
 }

-// Shell tool output should remain intact when the config opts into a large token budget.
+// shell_command output should remain intact when the config opts into a large token budget.
 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
-async fn shell_tool_output_not_truncated_with_custom_limit() -> Result<()> {
+async fn shell_command_output_not_truncated_with_custom_limit() -> Result<()> {
    skip_if_no_network!(Ok(()));

    let server = start_mock_server().await;
@@ -726,7 +705,7 @@ async fn shell_tool_output_not_truncated_with_custom_limit() -> Result<()> {

    let call_id = "shell-no-trunc";
    let args = json!({
-        "command": ["/bin/sh", "-c", "seq 1 1000"],
+        "command": "seq 1 1000",
        "timeout_ms": 5_000,
    });
    let expected_body: String = (1..=1000).map(|i| format!("{i}\n")).collect();
@@ -735,7 +714,7 @@ async fn shell_tool_output_not_truncated_with_custom_limit() -> Result<()> {
        &server,
        sse(vec![
            ev_response_created("resp-1"),
-            ev_function_call(call_id, "shell", &serde_json::to_string(&args)?),
+            ev_function_call(call_id, "shell_command", &serde_json::to_string(&args)?),
            ev_completed("resp-1"),
        ]),
    )
@@ -279,23 +279,19 @@ async fn user_shell_command_is_truncated_only_once() -> anyhow::Result<()> {
        config.tool_output_token_limit = Some(100);
        config.model = "gpt-5.1-codex".to_string();
        config.model_family =
-            find_family_for_model("gpt-5-codex").expect("gpt-5-codex is a model family");
+            find_family_for_model("gpt-5.1-codex").expect("gpt-5.1-codex is a model family");
    });
    let fixture = builder.build(&server).await?;

    let call_id = "user-shell-double-truncation";
    let args = if cfg!(windows) {
        serde_json::json!({
-            "command": [
-                "powershell",
-                "-Command",
-                "for ($i=1; $i -le 2000; $i++) { Write-Output $i }"
-            ],
+            "command": "for ($i=1; $i -le 2000; $i++) { Write-Output $i }",
            "timeout_ms": 5_000,
        })
    } else {
        serde_json::json!({
-            "command": ["/bin/sh", "-c", "seq 1 2000"],
+            "command": "seq 1 2000",
            "timeout_ms": 5_000,
        })
    };
@@ -304,7 +300,7 @@ async fn user_shell_command_is_truncated_only_once() -> anyhow::Result<()> {
        &server,
        sse(vec![
            ev_response_created("resp-1"),
-            ev_function_call(call_id, "shell", &serde_json::to_string(&args)?),
+            ev_function_call(call_id, "shell_command", &serde_json::to_string(&args)?),
            ev_completed("resp-1"),
        ]),
    )
@@ -319,19 +315,22 @@ async fn user_shell_command_is_truncated_only_once() -> anyhow::Result<()> {
    .await;

    fixture
-        .submit_turn_with_policy("trigger big shell output", SandboxPolicy::DangerFullAccess)
+        .submit_turn_with_policy(
+            "trigger big shell_command output",
+            SandboxPolicy::DangerFullAccess,
+        )
        .await?;

    let output = mock2
        .single_request()
        .function_call_output_text(call_id)
-        .context("function_call_output present for shell call")?;
+        .context("function_call_output present for shell_command call")?;

    let truncation_headers = output.matches("Total output lines:").count();

    assert_eq!(
        truncation_headers, 1,
-        "shell output should carry only one truncation header: {output}"
+        "shell_command output should carry only one truncation header: {output}"
    );

    Ok(())
@@ -23,3 +23,5 @@ tokio = { workspace = true, features = [
    "rt-multi-thread",
 ] }
 wiremock = { workspace = true }
+core_test_support = { path = "../../../core/tests/common" }
+shlex = { workspace = true }
@@ -2,12 +2,13 @@ mod mcp_process;
 mod mock_model_server;
 mod responses;

+pub use core_test_support::format_with_current_shell;
 pub use mcp_process::McpProcess;
 use mcp_types::JSONRPCResponse;
 pub use mock_model_server::create_mock_chat_completions_server;
 pub use responses::create_apply_patch_sse_response;
 pub use responses::create_final_assistant_message_sse_response;
-pub use responses::create_shell_sse_response;
+pub use responses::create_shell_command_sse_response;
 use serde::de::DeserializeOwned;

 pub fn to_response<T: DeserializeOwned>(response: JSONRPCResponse) -> anyhow::Result<T> {
@@ -1,17 +1,18 @@
 use serde_json::json;
 use std::path::Path;

-pub fn create_shell_sse_response(
+pub fn create_shell_command_sse_response(
    command: Vec<String>,
    workdir: Option<&Path>,
    timeout_ms: Option<u64>,
    call_id: &str,
 ) -> anyhow::Result<String> {
-    // The `arguments`` for the `shell` tool is a serialized JSON object.
+    // The `arguments` for the `shell_command` tool is a serialized JSON object.
+    let command_str = shlex::try_join(command.iter().map(String::as_str))?;
    let tool_call_arguments = serde_json::to_string(&json!({
-        "command": command,
+        "command": command_str,
        "workdir": workdir.map(|w| w.to_string_lossy()),
-        "timeout": timeout_ms
+        "timeout_ms": timeout_ms
    }))?;
    let tool_call = json!({
        "choices": [
@@ -21,7 +22,7 @@ pub fn create_shell_sse_response(
                        {
                            "id": call_id,
                            "function": {
-                                "name": "shell",
+                                "name": "shell_command",
                                "arguments": tool_call_arguments
                            }
                        }
@@ -62,10 +63,10 @@ pub fn create_apply_patch_sse_response(
    patch_content: &str,
    call_id: &str,
 ) -> anyhow::Result<String> {
-    // Use shell command to call apply_patch with heredoc format
-    let shell_command = format!("apply_patch <<'EOF'\n{patch_content}\nEOF");
+    // Use shell_command to call apply_patch with heredoc format
+    let command = format!("apply_patch <<'EOF'\n{patch_content}\nEOF");
    let tool_call_arguments = serde_json::to_string(&json!({
-        "command": ["bash", "-lc", shell_command]
+        "command": command
    }))?;

    let tool_call = json!({
@@ -76,7 +77,7 @@ pub fn create_apply_patch_sse_response(
                        {
                            "id": call_id,
                            "function": {
-                                "name": "shell",
+                                "name": "shell_command",
                                "arguments": tool_call_arguments
                            }
                        }
@@ -30,7 +30,8 @@ use mcp_test_support::McpProcess;
 use mcp_test_support::create_apply_patch_sse_response;
 use mcp_test_support::create_final_assistant_message_sse_response;
 use mcp_test_support::create_mock_chat_completions_server;
-use mcp_test_support::create_shell_sse_response;
+use mcp_test_support::create_shell_command_sse_response;
+use mcp_test_support::format_with_current_shell;

 // Allow ample time on slower CI or under load to avoid flakes.
 const DEFAULT_READ_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(20);
@@ -71,13 +72,16 @@ async fn shell_command_approval_triggers_elicitation() -> anyhow::Result<()> {
        "-c".to_string(),
        format!("import pathlib; pathlib.Path('{created_filename}').touch()"),
    ];
+    let expected_shell_command = format_with_current_shell(&format!(
+        "python3 -c \"import pathlib; pathlib.Path('{created_filename}').touch()\""
+    ));

    let McpHandle {
        process: mut mcp_process,
        server: _server,
        dir: _dir,
    } = create_mcp_process(vec![
-        create_shell_sse_response(
+        create_shell_command_sse_response(
            shell_command.clone(),
            Some(workdir_for_shell_function_call.path()),
            Some(5_000),
@@ -111,7 +115,7 @@ async fn shell_command_approval_triggers_elicitation() -> anyhow::Result<()> {
    )?;
    let expected_elicitation_request = create_expected_elicitation_request(
        elicitation_request_id.clone(),
-        shell_command.clone(),
+        expected_shell_command,
        workdir_for_shell_function_call.path(),
        codex_request_id.to_string(),
        params.codex_event_id.clone(),
@@ -218,6 +222,12 @@ async fn test_patch_approval_triggers_elicitation() {
 }

 async fn patch_approval_triggers_elicitation() -> anyhow::Result<()> {
+    if cfg!(windows) {
+        // Skip on Windows: PowerShell apply_patch shell calls are not parsed into patch approvals.
+
+        return Ok(());
+    }
+
    let cwd = TempDir::new()?;
    let test_file = cwd.path().join("destination_file.txt");
    std::fs::write(&test_file, "original content\n")?;