[codex] Add token budget context feature (#27438)

## Why

The model should be able to see bounded context-window budget metadata
when the `token_budget` feature is enabled. The full-window message is
only injected with full context, while normal turns get a smaller
follow-up only when reported usage first crosses a budget threshold.

## What changed

- Added the `TokenBudget` feature flag.
- Added `<token_budget>` developer fragments for full context-window
metadata and current-window remaining tokens.
- Inserted the threshold message during normal turn handling by
comparing token usage before and after sampling, avoiding persistent
threshold bookkeeping.
- Added core integration coverage for full-context-only metadata and
25/50/75 percent threshold messages.

## Verification

- `just test -p codex-core token_budget`
- `git diff --check`
This commit is contained in:
pakrym-oai
2026-06-10 20:07:06 -07:00
committed by GitHub
Unverified
parent ab4ce40042
commit 658af936fd
14 changed files with 399 additions and 7 deletions
+6
View File
@@ -624,6 +624,9 @@
"terminal_visualization_instructions": {
"type": "boolean"
},
"token_budget": {
"type": "boolean"
},
"tool_call_mcp_elicitation": {
"type": "boolean"
},
@@ -4759,6 +4762,9 @@
"terminal_visualization_instructions": {
"type": "boolean"
},
"token_budget": {
"type": "boolean"
},
"tool_call_mcp_elicitation": {
"type": "boolean"
},
+2 -1
View File
@@ -294,6 +294,7 @@ async fn run_compact_task_inner_impl(
let user_messages = collect_user_messages(history_items);
let mut new_history = build_compacted_history(Vec::new(), &user_messages, &summary_text);
let window_id = sess.advance_auto_compact_window_id().await;
if matches!(
initial_context_injection,
@@ -310,7 +311,7 @@ async fn run_compact_task_inner_impl(
let compacted_item = CompactedItem {
message: summary_text.clone(),
replacement_history: Some(new_history.clone()),
window_id: None,
window_id: Some(window_id),
};
sess.replace_compacted_history(new_history, reference_context_item, compacted_item)
.await;
+2 -1
View File
@@ -249,6 +249,7 @@ async fn run_remote_compact_task_inner_impl(
turn_metadata_header.as_deref(),
)
.await?;
let new_window_id = sess.advance_auto_compact_window_id().await;
new_history = process_compacted_history(
sess.as_ref(),
turn_context.as_ref(),
@@ -264,7 +265,7 @@ async fn run_remote_compact_task_inner_impl(
let compacted_item = CompactedItem {
message: String::new(),
replacement_history: Some(new_history.clone()),
window_id: None,
window_id: Some(new_window_id),
};
// Install is the semantic boundary where the compact endpoint's output becomes live
// thread history. Keep it distinct from the later inference request so the reducer can
+2 -1
View File
@@ -285,6 +285,7 @@ async fn run_remote_compact_task_inner_impl(
let (compacted_history, retained_images) =
build_v2_compacted_history(&prompt_input, compaction_output);
analytics_details.retained_image_count = Some(retained_images);
let new_window_id = sess.advance_auto_compact_window_id().await;
let new_history = process_compacted_history(
sess.as_ref(),
turn_context.as_ref(),
@@ -300,7 +301,7 @@ async fn run_remote_compact_task_inner_impl(
let compacted_item = CompactedItem {
message: String::new(),
replacement_history: Some(new_history.clone()),
window_id: None,
window_id: Some(new_window_id),
};
compaction_trace.record_installed(&CompactionCheckpointTracePayload {
input_history: &trace_input_history,
+3
View File
@@ -23,6 +23,7 @@ mod realtime_end_instructions;
mod realtime_start_instructions;
mod realtime_start_with_instructions;
mod subagent_notification;
mod token_budget_context;
mod turn_aborted;
mod user_instructions;
mod user_shell_command;
@@ -60,6 +61,8 @@ pub(crate) use realtime_end_instructions::RealtimeEndInstructions;
pub(crate) use realtime_start_instructions::RealtimeStartInstructions;
pub(crate) use realtime_start_with_instructions::RealtimeStartWithInstructions;
pub(crate) use subagent_notification::SubagentNotification;
pub(crate) use token_budget_context::TokenBudgetContext;
pub(crate) use token_budget_context::TokenBudgetRemainingContext;
pub(crate) use turn_aborted::TurnAborted;
pub(crate) use user_instructions::UserInstructions;
pub(crate) use user_shell_command::UserShellCommand;
@@ -0,0 +1,68 @@
use super::ContextualUserFragment;
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) struct TokenBudgetContext {
window_id: u64,
tokens_left: i64,
}
impl TokenBudgetContext {
pub(crate) fn new(window_id: u64, tokens_left: i64) -> Self {
Self {
window_id,
tokens_left,
}
}
}
impl ContextualUserFragment for TokenBudgetContext {
fn role(&self) -> &'static str {
"developer"
}
fn markers(&self) -> (&'static str, &'static str) {
Self::type_markers()
}
fn type_markers() -> (&'static str, &'static str) {
("<token_budget>\n", "\n</token_budget>")
}
fn body(&self) -> String {
let window_id = self.window_id;
let tokens_left = self.tokens_left;
format!(
"Current context window {window_id}.\nYou have {tokens_left} tokens left in this context window."
)
}
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) struct TokenBudgetRemainingContext {
tokens_left: i64,
}
impl TokenBudgetRemainingContext {
pub(crate) fn new(tokens_left: i64) -> Self {
Self { tokens_left }
}
}
impl ContextualUserFragment for TokenBudgetRemainingContext {
fn role(&self) -> &'static str {
"developer"
}
fn markers(&self) -> (&'static str, &'static str) {
Self::type_markers()
}
fn type_markers() -> (&'static str, &'static str) {
("<token_budget>\n", "\n</token_budget>")
}
fn body(&self) -> String {
let tokens_left = self.tokens_left;
format!("You have {tokens_left} tokens left in this context window.")
}
}
+1
View File
@@ -30,6 +30,7 @@ const CONTEXTUAL_DEVELOPER_PREFIXES: &[&str] = &[
COLLABORATION_MODE_OPEN_TAG,
REALTIME_CONVERSATION_OPEN_TAG,
"<personality_spec>",
"<token_budget>",
];
pub(crate) fn is_contextual_user_message_content(message: &[ContentItem]) -> bool {
+13
View File
@@ -1,3 +1,5 @@
use super::has_non_contextual_dev_message_content;
use super::is_contextual_dev_message_content;
use super::parse_turn_item;
use crate::context::ContextualUserFragment;
use crate::context::InternalContextSource;
@@ -16,6 +18,17 @@ use codex_protocol::models::WebSearchAction;
use codex_protocol::user_input::UserInput;
use pretty_assertions::assert_eq;
#[test]
fn recognizes_token_budget_as_contextual_developer_content() {
let content = vec![ContentItem::InputText {
text: "<token_budget>\nYou have 710 tokens left in this context window.\n</token_budget>"
.to_string(),
}];
assert!(is_contextual_dev_message_content(&content));
assert!(!has_non_contextual_dev_message_content(&content));
}
#[test]
fn parses_user_message_with_text_and_two_images() {
let img1 = "https://example.com/one.png".to_string();
+17 -4
View File
@@ -207,6 +207,7 @@ mod review;
mod rollout_reconstruction;
#[allow(clippy::module_inception)]
pub(crate) mod session;
mod token_budget;
pub(crate) mod turn;
pub(crate) mod turn_context;
use self::config_lock::export_config_lock_if_configured;
@@ -2712,15 +2713,13 @@ impl Session {
&self,
items: Vec<ResponseItem>,
reference_context_item: Option<TurnContextItem>,
mut compacted_item: CompactedItem,
compacted_item: CompactedItem,
) {
{
let mut state = self.state.lock().await;
state.replace_history(items, reference_context_item.clone());
}
compacted_item.window_id = Some(self.advance_auto_compact_window_id().await);
self.persist_rollout_items(&[RolloutItem::Compacted(compacted_item)])
.await;
if let Some(turn_context_item) = reference_context_item {
@@ -2805,6 +2804,7 @@ impl Session {
collaboration_mode,
base_instructions,
session_source,
auto_compact_window_id,
) = {
let state = self.state.lock().await;
(
@@ -2813,6 +2813,7 @@ impl Session {
state.session_configuration.collaboration_mode.clone(),
state.session_configuration.base_instructions.clone(),
state.session_configuration.session_source.clone(),
state.auto_compact_window_id(),
)
};
if let Some(model_switch_message) =
@@ -2962,6 +2963,18 @@ impl Session {
.render(),
);
}
// This is full-context metadata. Steady-state context diffs should not re-emit it.
if turn_context.features.enabled(Feature::TokenBudget)
&& let Some(model_context_window) = turn_context.model_context_window()
{
developer_sections.push(
crate::context::TokenBudgetContext::new(
auto_compact_window_id,
model_context_window,
)
.render(),
);
}
if turn_context.config.include_environment_context {
let shell = self.user_shell();
let subagents = self
@@ -3040,7 +3053,7 @@ impl Session {
format!("{thread_id}:{window_id}")
}
async fn advance_auto_compact_window_id(&self) -> u64 {
pub(crate) async fn advance_auto_compact_window_id(&self) -> u64 {
let mut state = self.state.lock().await;
state.advance_auto_compact_window_id()
}
+44
View File
@@ -0,0 +1,44 @@
use super::session::Session;
use super::turn_context::TurnContext;
use crate::context::ContextualUserFragment;
use codex_features::Feature;
const TOKEN_BUDGET_USAGE_THRESHOLDS: [i64; 3] = [25, 50, 75];
pub(super) async fn maybe_record_token_budget_remaining_context(
sess: &Session,
turn_context: &TurnContext,
tokens_before_sampling: i64,
tokens_after_sampling: i64,
) {
if !turn_context.features.enabled(Feature::TokenBudget) {
return;
}
let Some(model_context_window) = turn_context.model_context_window() else {
return;
};
if model_context_window <= 0 || tokens_after_sampling <= tokens_before_sampling {
return;
}
let tokens_before_sampling = tokens_before_sampling.max(0);
let tokens_after_sampling = tokens_after_sampling.max(0);
let crossed_threshold = TOKEN_BUDGET_USAGE_THRESHOLDS.iter().any(|threshold| {
tokens_before_sampling.saturating_mul(100) < model_context_window.saturating_mul(*threshold)
&& tokens_after_sampling.saturating_mul(100)
>= model_context_window.saturating_mul(*threshold)
});
if !crossed_threshold {
return;
}
let tokens_left = model_context_window
.saturating_sub(tokens_after_sampling)
.max(0);
let response_item = ContextualUserFragment::into(
crate::context::TokenBudgetRemainingContext::new(tokens_left),
);
sess.record_conversation_items(turn_context, std::slice::from_ref(&response_item))
.await;
}
+10
View File
@@ -225,6 +225,7 @@ pub(crate) async fn run_turn(
let turn_metadata_header = turn_context
.turn_metadata_state
.current_header_value_for_model_request(&window_id);
let tokens_before_sampling = sess.get_total_token_usage().await;
match run_sampling_request(
Arc::clone(&sess),
Arc::clone(&turn_context),
@@ -275,6 +276,15 @@ pub(crate) async fn run_turn(
"post sampling token usage"
);
let tokens_after_sampling = token_status.active_context_tokens;
super::token_budget::maybe_record_token_budget_remaining_context(
sess.as_ref(),
turn_context.as_ref(),
tokens_before_sampling,
tokens_after_sampling,
)
.await;
// as long as compaction works well in getting us way below the token limit, we shouldn't worry about being in an infinite loop.
if token_limit_reached && needs_follow_up {
if let Err(err) = run_auto_compact(
+1
View File
@@ -110,6 +110,7 @@ mod sqlite_state;
mod stream_error_allows_next_turn;
mod stream_no_completed;
mod subagent_notifications;
mod token_budget;
mod tool_harness;
mod tool_parallelism;
mod tools;
+222
View File
@@ -0,0 +1,222 @@
use anyhow::Result;
use codex_features::Feature;
use codex_model_provider_info::built_in_model_providers;
use codex_protocol::protocol::EventMsg;
use codex_protocol::protocol::Op;
use core_test_support::PathBufExt;
use core_test_support::responses::ResponsesRequest;
use core_test_support::responses::ev_assistant_message;
use core_test_support::responses::ev_completed;
use core_test_support::responses::ev_completed_with_tokens;
use core_test_support::responses::ev_response_created;
use core_test_support::responses::mount_sse_sequence;
use core_test_support::responses::sse;
use core_test_support::responses::start_mock_server;
use core_test_support::skip_if_no_network;
use core_test_support::test_codex::local;
use core_test_support::test_codex::test_codex;
use core_test_support::wait_for_event;
use pretty_assertions::assert_eq;
const CONFIGURED_CONTEXT_WINDOW: i64 = 128_000;
const EFFECTIVE_CONTEXT_WINDOW: i64 = CONFIGURED_CONTEXT_WINDOW * 95 / 100;
fn token_budget_texts(request: &ResponsesRequest) -> Vec<String> {
request
.message_input_texts("developer")
.into_iter()
.filter(|text| text.starts_with("<token_budget>"))
.collect()
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn token_budget_context_is_only_emitted_with_full_context() -> Result<()> {
skip_if_no_network!(Ok(()));
let server = start_mock_server().await;
let responses = mount_sse_sequence(
&server,
vec![
sse(vec![ev_response_created("resp-1"), ev_completed("resp-1")]),
sse(vec![ev_response_created("resp-2"), ev_completed("resp-2")]),
],
)
.await;
let test = test_codex()
.with_config(|config| {
config.model_context_window = Some(CONFIGURED_CONTEXT_WINDOW);
config
.features
.enable(Feature::TokenBudget)
.expect("test config should allow token budget");
})
.build(&server)
.await?;
test.submit_turn("first turn").await?;
let second_cwd = test.workspace_path("second-cwd");
std::fs::create_dir_all(&second_cwd)?;
test.submit_turn_with_environments("second turn", Some(vec![local(second_cwd.abs())]))
.await?;
let requests = responses.requests();
assert_eq!(requests.len(), 2);
let expected = vec![format!(
"<token_budget>\nCurrent context window 0.\nYou have {EFFECTIVE_CONTEXT_WINDOW} tokens left in this context window.\n</token_budget>"
)];
assert_eq!(
token_budget_texts(&requests[0]),
expected,
"initial full context should report context window 0"
);
assert_eq!(
token_budget_texts(&requests[1]),
expected,
"steady-state context update should not advance the context window"
);
Ok(())
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn token_budget_remaining_context_emits_on_first_threshold_crossing() -> Result<()> {
skip_if_no_network!(Ok(()));
let server = start_mock_server().await;
let responses = mount_sse_sequence(
&server,
vec![
sse(vec![
ev_response_created("resp-1"),
ev_completed_with_tokens("resp-1", /*total_tokens*/ 2_500),
]),
sse(vec![
ev_response_created("resp-2"),
ev_completed_with_tokens("resp-2", /*total_tokens*/ 3_000),
]),
sse(vec![
ev_response_created("resp-3"),
ev_completed_with_tokens("resp-3", /*total_tokens*/ 5_000),
]),
sse(vec![
ev_response_created("resp-4"),
ev_completed_with_tokens("resp-4", /*total_tokens*/ 8_000),
]),
sse(vec![ev_response_created("resp-5"), ev_completed("resp-5")]),
],
)
.await;
let test = test_codex()
.with_config(|config| {
config.model_context_window = Some(10_000);
config
.features
.enable(Feature::TokenBudget)
.expect("test config should allow token budget");
})
.build(&server)
.await?;
for turn in 1..=5 {
test.submit_turn(&format!("turn {turn}")).await?;
}
let requests = responses.requests();
assert_eq!(requests.len(), 5);
let full_context = "<token_budget>\nCurrent context window 0.\nYou have 9500 tokens left in this context window.\n</token_budget>"
.to_string();
let threshold_25 =
"<token_budget>\nYou have 7000 tokens left in this context window.\n</token_budget>"
.to_string();
let threshold_50 =
"<token_budget>\nYou have 4500 tokens left in this context window.\n</token_budget>"
.to_string();
let threshold_75 =
"<token_budget>\nYou have 1500 tokens left in this context window.\n</token_budget>"
.to_string();
assert_eq!(token_budget_texts(&requests[0]), vec![full_context.clone()]);
assert_eq!(
token_budget_texts(&requests[1]),
vec![full_context.clone(), threshold_25.clone()]
);
assert_eq!(
token_budget_texts(&requests[2]),
vec![full_context.clone(), threshold_25.clone()]
);
assert_eq!(
token_budget_texts(&requests[3]),
vec![
full_context.clone(),
threshold_25.clone(),
threshold_50.clone()
]
);
assert_eq!(
token_budget_texts(&requests[4]),
vec![full_context, threshold_25, threshold_50, threshold_75]
);
Ok(())
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn token_budget_context_uses_new_window_after_compaction() -> Result<()> {
skip_if_no_network!(Ok(()));
let server = start_mock_server().await;
let responses = mount_sse_sequence(
&server,
vec![
sse(vec![ev_response_created("resp-1"), ev_completed("resp-1")]),
sse(vec![
ev_response_created("resp-compact"),
ev_assistant_message("msg-compact", "compact summary"),
ev_completed("resp-compact"),
]),
sse(vec![ev_response_created("resp-2"), ev_completed("resp-2")]),
],
)
.await;
let mut model_provider = built_in_model_providers(/*openai_base_url*/ None)["openai"].clone();
model_provider.name = "OpenAI-compatible test provider".to_string();
model_provider.base_url = Some(format!("{}/v1", server.uri()));
model_provider.supports_websockets = false;
let test = test_codex()
.with_config(move |config| {
config.model_provider = model_provider;
config.model_context_window = Some(CONFIGURED_CONTEXT_WINDOW);
config
.features
.enable(Feature::TokenBudget)
.expect("test config should allow token budget");
})
.build(&server)
.await?;
test.submit_turn("before compact").await?;
test.codex.submit(Op::Compact).await?;
wait_for_event(&test.codex, |event| {
matches!(event, EventMsg::TurnComplete(_))
})
.await;
test.submit_turn("after compact").await?;
let requests = responses.requests();
assert_eq!(requests.len(), 3);
assert_eq!(
token_budget_texts(&requests[2]),
vec![format!(
"<token_budget>\nCurrent context window 1.\nYou have {EFFECTIVE_CONTEXT_WINDOW} tokens left in this context window.\n</token_budget>"
)],
"post-compaction full context should report context window 1"
);
Ok(())
}
+8
View File
@@ -197,6 +197,8 @@ pub enum Feature {
GuardianApproval,
/// Enable persisted thread goals and automatic goal continuation.
Goals,
/// Add current context-window metadata to model-visible context.
TokenBudget,
/// Route MCP tool approval prompts through the MCP elicitation request path.
ToolCallMcpElicitation,
/// Prompt Codex Apps connector auth failures through MCP URL elicitations.
@@ -1142,6 +1144,12 @@ pub const FEATURES: &[FeatureSpec] = &[
stage: Stage::Stable,
default_enabled: true,
},
FeatureSpec {
id: Feature::TokenBudget,
key: "token_budget",
stage: Stage::UnderDevelopment,
default_enabled: false,
},
FeatureSpec {
id: Feature::CollaborationModes,
key: "collaboration_modes",