avoid cloning sampling request input (#28306)

## Why

Every model request cloned the full prepared input just to keep it for
the legacy after-agent hook. That copy gets more expensive as the
conversation grows.

## What

Move the prepared input into the sampling loop and return it alongside
the result. If the request is retried, return the input from the first
attempt so the hook still sees the same data as before.

This removes one `O(n)` clone per sampling request, where `n` is the
size of the prepared input. It saves `O(n)` copy work and `O(n)`
temporary memory.

No behavior change is intended.

## Performance

Local rollout traces show turns reaching roughly 260k input tokens. On
turns of that size, this removes the only unconditional full
prepared-input clone on the happy path. That avoids one request-sized
allocation/copy per sampling request for large conversations, and the
savings scale linearly with request size.

## Testing

- `just test -p codex-core continue_after_stream_error`
- `just fix -p codex-core`
This commit is contained in:
jif
2026-06-15 17:26:44 +01:00
committed by GitHub
Unverified
parent 828d7476a0
commit e67bc683f3
+9 -4
View File
@@ -238,12 +238,12 @@ pub(crate) async fn run_turn(
Arc::clone(&turn_diff_tracker),
&mut client_session,
&responses_metadata,
sampling_request_input.clone(),
sampling_request_input,
cancellation_token.child_token(),
)
.await
{
Ok(sampling_request_output) => {
Ok((sampling_request_output, sampling_request_input)) => {
let SamplingRequestResult {
needs_follow_up: model_needs_follow_up,
last_agent_message: sampling_request_last_agent_message,
@@ -1036,7 +1036,7 @@ async fn run_sampling_request(
responses_metadata: &CodexResponsesMetadata,
input: Vec<ResponseItem>,
cancellation_token: CancellationToken,
) -> CodexResult<SamplingRequestResult> {
) -> CodexResult<(SamplingRequestResult, Vec<ResponseItem>)> {
let router = built_tools(sess.as_ref(), turn_context.as_ref(), &cancellation_token).await?;
let base_instructions = sess.get_base_instructions().await;
@@ -1056,6 +1056,7 @@ async fn run_sampling_request(
let max_retries = turn_context.provider.info().stream_max_retries();
let mut retries = 0;
let mut initial_input = Some(input);
let mut original_input = None;
loop {
let prompt_input = if let Some(input) = initial_input.take() {
input
@@ -1084,7 +1085,7 @@ async fn run_sampling_request(
.await
{
Ok(output) => {
return Ok(output);
return Ok((output, original_input.unwrap_or(prompt.input)));
}
Err(CodexErr::ContextWindowExceeded) => {
sess.set_total_tokens_full(&turn_context).await;
@@ -1100,6 +1101,10 @@ async fn run_sampling_request(
Err(err) => err,
};
if original_input.is_none() {
original_input = Some(prompt.input);
}
if !err.is_retryable() {
return Err(err);
}