fix: scope context remaining to body window (#29665)

## Why

With `model_auto_compact_token_limit_scope = "body_after_prefix"`, the persistent prefix should not count against the active body window. `get_context_remaining` and the token-budget reminder should report the same usable body-after-prefix window that auto-compaction uses, rather than the total token count since the session began.

This is stacked on #29664 so the mechanical move from `turn.rs` is isolated from the behavior fix.

## What

- Extends `ContextWindowTokenStatus` with a precomputed `tokens_until_compaction` field (replacing the former method of the same name).
- Updates `get_context_remaining` to use the shared context-window accounting.
- Adds integration coverage for body-after-prefix reminder timing and `get_context_remaining` output.

## Testing

- `just test -p codex-core body_after_prefix_window`
- `just test -p codex-core auto_compact_body_after_prefix`
- `just fix -p codex-core`
2026-07-01 00:31:56 +08:00 · 2026-06-23 16:08:54 -07:00
parent 4dde907d27
commit 77e7ce1374
6 changed files with 204 additions and 45 deletions
@@ -8,23 +8,17 @@ pub(crate) struct ContextWindowTokenStatus {
    pub(crate) active_context_tokens: i64,
    // Usage counted against `model_auto_compact_token_limit` for the current scope.
    pub(crate) auto_compact_scope_tokens: i64,
-    pub(crate) auto_compact_scope_limit: i64,
+    pub(crate) auto_compact_scope_limit: Option<i64>,
    pub(crate) full_context_window_limit: Option<i64>,
+    pub(crate) tokens_until_compaction: Option<i64>,
    pub(crate) auto_compact_window_prefill_tokens: Option<i64>,
    pub(crate) full_context_window_limit_reached: bool,
    pub(crate) token_limit_reached: bool,
 }

-impl ContextWindowTokenStatus {
-    pub(crate) fn tokens_until_compaction(&self) -> i64 {
-        let full_context_remaining = self.full_context_window_limit.map_or(i64::MAX, |limit| {
-            limit.saturating_sub(self.active_context_tokens)
-        });
-        self.auto_compact_scope_limit
-            .saturating_sub(self.auto_compact_scope_tokens)
-            .min(full_context_remaining)
-            .max(0)
-    }
+struct BodyAfterPrefixWindowStatus {
+    full_context_window_limit: Option<i64>,
+    auto_compact_window_prefill_tokens: Option<i64>,
 }

 pub(crate) async fn context_window_token_status(
@@ -32,43 +26,65 @@ pub(crate) async fn context_window_token_status(
    turn_context: &TurnContext,
 ) -> ContextWindowTokenStatus {
    let active_context_tokens = sess.get_total_token_usage().await;
-    let mut auto_compact_window_prefill_tokens = None;
-    let (auto_compact_scope_tokens, auto_compact_scope_limit, full_context_window_limit) =
+
+    let (auto_compact_scope_tokens, auto_compact_scope_limit, body_window) =
        match turn_context.config.model_auto_compact_token_limit_scope {
            AutoCompactTokenLimitScope::Total => (
                active_context_tokens,
-                turn_context
-                    .model_info
-                    .auto_compact_token_limit()
-                    .unwrap_or(i64::MAX),
+                turn_context.model_info.auto_compact_token_limit(),
                None,
            ),
            AutoCompactTokenLimitScope::BodyAfterPrefix => {
                let window = sess.auto_compact_window_snapshot().await;
-                auto_compact_window_prefill_tokens = window.prefill_input_tokens;
                let baseline = window.prefill_input_tokens.unwrap_or(active_context_tokens);
+
+                let scope_limit = turn_context
+                    .config
+                    .model_auto_compact_token_limit
+                    .or_else(|| turn_context.model_info.auto_compact_token_limit());
+                let full_context_window_limit = turn_context.model_context_window();
+
                (
                    active_context_tokens.saturating_sub(baseline),
-                    turn_context
-                        .config
-                        .model_auto_compact_token_limit
-                        .or_else(|| turn_context.model_info.auto_compact_token_limit())
-                        .unwrap_or(i64::MAX),
-                    turn_context.model_context_window(),
+                    scope_limit,
+                    Some(BodyAfterPrefixWindowStatus {
+                        full_context_window_limit,
+                        auto_compact_window_prefill_tokens: window.prefill_input_tokens,
+                    }),
                )
            }
        };
+
+    let full_context_window_limit = body_window
+        .as_ref()
+        .and_then(|window| window.full_context_window_limit);
+    let auto_compact_window_prefill_tokens = body_window
+        .as_ref()
+        .and_then(|window| window.auto_compact_window_prefill_tokens);
+
    let full_context_window_limit_reached =
        full_context_window_limit.is_some_and(|full_context_window_limit| {
            active_context_tokens >= full_context_window_limit
        });
-    let token_limit_reached =
-        auto_compact_scope_tokens >= auto_compact_scope_limit || full_context_window_limit_reached;
+    let token_limit_reached = auto_compact_scope_limit
+        .is_some_and(|limit| auto_compact_scope_tokens >= limit)
+        || full_context_window_limit_reached;
+
+    let auto_compact_scope_remaining = auto_compact_scope_limit
+        .map(|limit| limit.saturating_sub(auto_compact_scope_tokens).max(0));
+    let full_context_remaining =
+        full_context_window_limit.map(|limit| limit.saturating_sub(active_context_tokens).max(0));
+    let tokens_until_compaction = match (auto_compact_scope_remaining, full_context_remaining) {
+        (Some(scope_remaining), Some(full_remaining)) => Some(scope_remaining.min(full_remaining)),
+        (scope_remaining, full_remaining) => scope_remaining.or(full_remaining),
+    };
+
    ContextWindowTokenStatus {
        active_context_tokens,
        auto_compact_scope_tokens,
        auto_compact_scope_limit,
        full_context_window_limit,
+        tokens_until_compaction,
        auto_compact_window_prefill_tokens,
        full_context_window_limit_reached,
        token_limit_reached,
@@ -6,11 +6,14 @@ use codex_features::Feature;
 pub(super) async fn maybe_record(
    sess: &Session,
    turn_context: &TurnContext,
-    tokens_until_compaction: i64,
+    tokens_until_compaction: Option<i64>,
 ) {
    if !turn_context.config.features.enabled(Feature::TokenBudget) {
        return;
    }
+    let Some(tokens_until_compaction) = tokens_until_compaction else {
+        return;
+    };

    let Some(config) = turn_context.config.token_budget.as_ref().filter(|config| {
        config
@@ -312,7 +312,7 @@ pub(crate) async fn run_turn(
                    total_usage_tokens = token_status.active_context_tokens,
                    auto_compact_scope_tokens = token_status.auto_compact_scope_tokens,
                    estimated_token_count = ?estimated_token_count,
-                    auto_compact_scope_limit = token_status.auto_compact_scope_limit,
+                    auto_compact_scope_limit = ?token_status.auto_compact_scope_limit,
                    auto_compact_limit_scope = ?turn_context.config.model_auto_compact_token_limit_scope,
                    auto_compact_window_prefill_tokens = ?token_status.auto_compact_window_prefill_tokens,
                    full_context_window_limit = ?token_status.full_context_window_limit,
@@ -324,11 +324,10 @@ pub(crate) async fn run_turn(
                    "post sampling token usage"
                );

-                let tokens_until_compaction = token_status.tokens_until_compaction();
                super::token_budget::maybe_record(
                    sess.as_ref(),
                    turn_context.as_ref(),
-                    tokens_until_compaction,
+                    token_status.tokens_until_compaction,
                )
                .await;

@@ -75,19 +75,15 @@ impl ToolExecutor<ToolInvocation> for GetContextRemainingHandler {
                ));
            }

-            let Some(model_context_window) = invocation.turn.model_context_window() else {
-                return Ok(boxed_tool_output(GetContextRemainingOutput::new(
-                    /*tokens_left*/ None,
-                )));
-            };
-            let active_context_tokens = invocation.session.get_total_token_usage().await.max(0);
-            let tokens_left = model_context_window
-                .saturating_sub(active_context_tokens)
-                .max(0);
+            let token_status = crate::session::context_window::context_window_token_status(
+                invocation.session.as_ref(),
+                invocation.turn.as_ref(),
+            )
+            .await;

-            Ok(boxed_tool_output(GetContextRemainingOutput::new(Some(
-                tokens_left,
-            ))))
+            Ok(boxed_tool_output(GetContextRemainingOutput::new(
+                token_status.tokens_until_compaction,
+            )))
        })
    }
 }
@@ -856,7 +856,7 @@ text(JSON.stringify(result));
    assert_eq!(
        parsed,
        serde_json::json!({
-            "tokens_left": 9500,
+            "tokens_left": 9000,
        })
    );

@@ -4,6 +4,7 @@ use codex_config::types::McpServerTransportConfig;
 use codex_core::config::TokenBudgetConfig;
 use codex_features::Feature;
 use codex_model_provider_info::built_in_model_providers;
+use codex_protocol::config_types::AutoCompactTokenLimitScope;
 use codex_protocol::protocol::CONTEXT_WINDOW_CLOSE_TAG;
 use codex_protocol::protocol::CONTEXT_WINDOW_OPEN_TAG;
 use codex_protocol::protocol::EventMsg;
@@ -79,6 +80,22 @@ fn tool_names(request: &ResponsesRequest) -> Vec<String> {
        .collect()
 }

+fn ev_completed_with_usage(id: &str, input_tokens: i64, output_tokens: i64) -> Value {
+    json!({
+        "type": "response.completed",
+        "response": {
+            "id": id,
+            "usage": {
+                "input_tokens": input_tokens,
+                "input_tokens_details": null,
+                "output_tokens": output_tokens,
+                "output_tokens_details": null,
+                "total_tokens": input_tokens + output_tokens
+            }
+        }
+    })
+}
+
 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
 async fn token_budget_context_is_only_emitted_with_full_context() -> Result<()> {
    skip_if_no_network!(Ok(()));
@@ -264,6 +281,70 @@ async fn token_budget_reminder_emits_after_crossing_compaction_threshold() -> Re
    Ok(())
 }

+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+async fn token_budget_reminder_uses_body_after_prefix_window() -> Result<()> {
+    skip_if_no_network!(Ok(()));
+
+    let server = start_mock_server().await;
+    let responses = mount_sse_sequence(
+        &server,
+        vec![
+            sse(vec![
+                ev_response_created("resp-1"),
+                ev_completed_with_tokens("resp-1", /*total_tokens*/ 8_000),
+            ]),
+            sse(vec![
+                ev_response_created("resp-2"),
+                ev_completed_with_tokens("resp-2", /*total_tokens*/ 8_600),
+            ]),
+            sse(vec![ev_response_created("resp-3"), ev_completed("resp-3")]),
+        ],
+    )
+    .await;
+    let test = test_codex()
+        .with_config(|config| {
+            config.model_context_window = Some(10_000);
+            config.model_auto_compact_token_limit = Some(1_000);
+            config.model_auto_compact_token_limit_scope =
+                AutoCompactTokenLimitScope::BodyAfterPrefix;
+            config.token_budget = Some(TokenBudgetConfig {
+                reminder_threshold_tokens: Some(600),
+                ..TokenBudgetConfig::default()
+            });
+            config
+                .features
+                .enable(Feature::TokenBudget)
+                .expect("test config should allow token budget");
+        })
+        .build(&server)
+        .await?;
+
+    test.submit_turn("establish prefix").await?;
+    test.submit_turn("grow body").await?;
+    test.submit_turn("observe reminder").await?;
+
+    let requests = responses.requests();
+    assert_eq!(requests.len(), 3);
+    let reminder = "Your context window is nearly exhausted (only 400 tokens remaining) and will be automatically reset for you soon. Once reset, message items in current context window will be cleared in the new window, but notes and history items will be persistent across windows.";
+    assert!(
+        requests[1]
+            .message_input_texts("developer")
+            .into_iter()
+            .all(|text| text != reminder),
+        "first-window prefix should not count against the body-after-prefix reminder threshold"
+    );
+    assert_eq!(
+        requests[2]
+            .message_input_texts("developer")
+            .into_iter()
+            .filter(|text| text == reminder)
+            .count(),
+        1
+    );
+
+    Ok(())
+}
+
 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
 async fn get_context_remaining_returns_token_budget_remaining_fragment() -> Result<()> {
    skip_if_no_network!(Ok(()));
@@ -315,7 +396,7 @@ async fn get_context_remaining_returns_token_budget_remaining_fragment() -> Resu
    );

    let thread_id = test.session_configured.thread_id;
-    let remaining_context = "You have 7000 tokens left in this context window.".to_string();
+    let remaining_context = "You have 6500 tokens left in this context window.".to_string();
    let token_budgets = token_budget_contexts(&requests[1]);
    assert_eq!(token_budgets.len(), 1);
    token_budget_window_ids(&token_budgets[0], thread_id);
@@ -328,7 +409,71 @@ async fn get_context_remaining_returns_token_budget_remaining_fragment() -> Resu
 }

 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
-async fn get_context_remaining_returns_unknown_when_window_is_unavailable() -> Result<()> {
+async fn get_context_remaining_uses_body_after_prefix_window() -> Result<()> {
+    skip_if_no_network!(Ok(()));
+
+    let server = start_mock_server().await;
+    let call_id = "remaining-call";
+    let responses = mount_sse_sequence(
+        &server,
+        vec![
+            sse(vec![
+                ev_response_created("resp-1"),
+                ev_assistant_message("msg-1", "noted"),
+                ev_completed_with_usage(
+                    "resp-1", /*input_tokens*/ 2_000, /*output_tokens*/ 500,
+                ),
+            ]),
+            sse(vec![
+                ev_response_created("resp-2"),
+                ev_function_call(call_id, "get_context_remaining", "{}"),
+                ev_completed_with_tokens("resp-2", /*total_tokens*/ 2_500),
+            ]),
+            sse(vec![
+                ev_response_created("resp-3"),
+                ev_assistant_message("msg-3", "done"),
+                ev_completed("resp-3"),
+            ]),
+        ],
+    )
+    .await;
+    let test = test_codex()
+        .with_config(|config| {
+            config.model_context_window = Some(10_000);
+            config.model_auto_compact_token_limit = Some(7_000);
+            config.model_auto_compact_token_limit_scope =
+                AutoCompactTokenLimitScope::BodyAfterPrefix;
+            config
+                .features
+                .enable(Feature::TokenBudget)
+                .expect("test config should allow token budget");
+        })
+        .build(&server)
+        .await?;
+
+    test.submit_turn("spend some tokens").await?;
+    test.submit_turn("check remaining context").await?;
+
+    let requests = responses.requests();
+    assert_eq!(requests.len(), 3);
+    assert!(
+        tool_names(&requests[1])
+            .iter()
+            .any(|name| name == "get_context_remaining"),
+        "get_context_remaining should be exposed when token budget is enabled"
+    );
+
+    let remaining_context = "You have 6500 tokens left in this context window.".to_string();
+    assert_eq!(
+        requests[2].function_call_output_content_and_success(call_id),
+        Some((Some(remaining_context), None))
+    );
+
+    Ok(())
+}
+
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+async fn get_context_remaining_returns_unknown_when_threshold_is_unbounded() -> Result<()> {
    skip_if_no_network!(Ok(()));

    let server = start_mock_server().await;