mirror of
https://github.com/pchuan98/codex.git
synced 2026-07-01 00:31:56 +08:00
[codex] Retry temporarily offline exec-server recovery (#30098)
## Summary - retry ERS `409 environment_offline` responses inside the existing exec-server recovery loop - keep all other registry conflicts terminal - add focused coverage for both cases ## Root cause When an exec server disconnects and reconnects, the client already starts recovery and calls ERS `/connect`. During the transient executor presence gap, ERS can return `409 environment_offline`. The retry classifier treated every 409 as terminal, so the first response aborted the existing 25-second recovery window before the executor came back online. That then caused active processes to be marked lost. This change classifies only the structured `environment_offline` conflict as retryable. Recovery continues with the existing bounded deadline, exponential backoff, and jitter. ## Validation - `just test -p codex-exec-server client::recovery::tests` — 4 passed - `just fix -p codex-exec-server` — passed - `just fmt` — passed - Full `just test -p codex-exec-server` reached unrelated macOS filesystem-sandbox integration failures because nested `/usr/bin/sandbox-exec` is denied in this environment (`sandbox_apply: Operation not permitted`).
This commit is contained in:
committed by
GitHub
Unverified
parent
e8d4a1a411
commit
964b138c3d
@@ -530,10 +530,15 @@ fn is_retryable_registry_error(error: &ExecServerError) -> bool {
|
||||
if error.is_connect() || error.is_timeout()
|
||||
) || matches!(
|
||||
error,
|
||||
ExecServerError::EnvironmentRegistryHttp { status, .. }
|
||||
ExecServerError::EnvironmentRegistryHttp { status, code, .. }
|
||||
if status.is_server_error()
|
||||
|| *status == reqwest::StatusCode::REQUEST_TIMEOUT
|
||||
|| *status == reqwest::StatusCode::TOO_MANY_REQUESTS
|
||||
// TODO: Replace this coarse retry with an explicit registry/presence
|
||||
// recovery FSM so `environment_offline` is retried only while the
|
||||
// executor is expected to reconnect.
|
||||
|| (*status == reqwest::StatusCode::CONFLICT
|
||||
&& code.as_deref() == Some("environment_offline"))
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
@@ -2,10 +2,10 @@ use std::time::Duration;
|
||||
|
||||
use super::*;
|
||||
|
||||
fn registry_error() -> ExecServerError {
|
||||
fn registry_error(status: reqwest::StatusCode, code: Option<&str>) -> ExecServerError {
|
||||
ExecServerError::EnvironmentRegistryHttp {
|
||||
status: reqwest::StatusCode::TOO_MANY_REQUESTS,
|
||||
code: None,
|
||||
status,
|
||||
code: code.map(str::to_string),
|
||||
message: "registry unavailable".to_string(),
|
||||
}
|
||||
}
|
||||
@@ -33,8 +33,24 @@ fn registry_recovery_retry_delay_exponentially_backs_off_and_caps() {
|
||||
|
||||
#[test]
|
||||
fn recovery_retries_transient_registry_errors() {
|
||||
let error = registry_error();
|
||||
let error = registry_error(reqwest::StatusCode::TOO_MANY_REQUESTS, /*code*/ None);
|
||||
|
||||
assert!(is_retryable_registry_error(&error));
|
||||
assert!(is_retryable_recovery_error(&error));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn recovery_retries_environment_offline_conflicts() {
|
||||
let error = registry_error(reqwest::StatusCode::CONFLICT, Some("environment_offline"));
|
||||
|
||||
assert!(is_retryable_registry_error(&error));
|
||||
assert!(is_retryable_recovery_error(&error));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn recovery_does_not_retry_other_registry_conflicts() {
|
||||
let error = registry_error(reqwest::StatusCode::CONFLICT, Some("registration_conflict"));
|
||||
|
||||
assert!(!is_retryable_registry_error(&error));
|
||||
assert!(!is_retryable_recovery_error(&error));
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user