avoid cloning sampling request input (#28306)

## Why

Every model request cloned the full prepared input just to keep it for the legacy after-agent hook. That copy gets more expensive as the conversation grows.

## What

Move the prepared input into the sampling loop and return it with the result. If the request retries, keep the first input so the hook still sees the same data as before.

This removes one `O(n)` clone per sampling request, where `n` is the size of the prepared input. It saves `O(n)` copy work and `O(n)` temporary memory. No behavior change is intended.

## Performance

Local rollout traces show turns reaching roughly 260k input tokens. On turns of that size, this removes the only unconditional full prepared-input clone on the happy path. That avoids one request-sized allocation/copy per sampling request for large conversations, and the savings scale linearly with request size.

## Testing

- `just test -p codex-core continue_after_stream_error`
- `just fix -p codex-core`
2026-07-01 00:31:56 +08:00 · 2026-06-15 17:26:44 +01:00
parent 828d7476a0
commit e67bc683f3
1 changed files with 9 additions and 4 deletions
@@ -238,12 +238,12 @@ pub(crate) async fn run_turn(
            Arc::clone(&turn_diff_tracker),
            &mut client_session,
            &responses_metadata,
-            sampling_request_input.clone(),
+            sampling_request_input,
            cancellation_token.child_token(),
        )
        .await
        {
-            Ok(sampling_request_output) => {
+            Ok((sampling_request_output, sampling_request_input)) => {
                let SamplingRequestResult {
                    needs_follow_up: model_needs_follow_up,
                    last_agent_message: sampling_request_last_agent_message,
@@ -1036,7 +1036,7 @@ async fn run_sampling_request(
    responses_metadata: &CodexResponsesMetadata,
    input: Vec<ResponseItem>,
    cancellation_token: CancellationToken,
-) -> CodexResult<SamplingRequestResult> {
+) -> CodexResult<(SamplingRequestResult, Vec<ResponseItem>)> {
    let router = built_tools(sess.as_ref(), turn_context.as_ref(), &cancellation_token).await?;

    let base_instructions = sess.get_base_instructions().await;
@@ -1056,6 +1056,7 @@ async fn run_sampling_request(
    let max_retries = turn_context.provider.info().stream_max_retries();
    let mut retries = 0;
    let mut initial_input = Some(input);
+    let mut original_input = None;
    loop {
        let prompt_input = if let Some(input) = initial_input.take() {
            input
@@ -1084,7 +1085,7 @@ async fn run_sampling_request(
        .await
        {
            Ok(output) => {
-                return Ok(output);
+                return Ok((output, original_input.unwrap_or(prompt.input)));
            }
            Err(CodexErr::ContextWindowExceeded) => {
                sess.set_total_tokens_full(&turn_context).await;
@@ -1100,6 +1101,10 @@ async fn run_sampling_request(
            Err(err) => err,
        };

+        if original_input.is_none() {
+            original_input = Some(prompt.input);
+        }
+
        if !err.is_retryable() {
            return Err(err);
        }