From b89d91f6ffee5474f23a97b23b401644c4e087d5 Mon Sep 17 00:00:00 2001 From: Celia Chen Date: Mon, 8 Jun 2026 14:41:05 -0700 Subject: [PATCH] feat: support oneOf and allOf in tool input schemas (#24118) ## Why Some connector golden schemas use JSON Schema composition keywords beyond `anyOf`, specifically top-level or nested `oneOf` and `allOf`. Codex currently needs to preserve those shapes when parsing MCP tool input schemas so connector tools do not lose valid schema structure during normalization. To prevent an increased Responses API error rate, this PR will be merged after the Responses API supports top-level `oneOf`/`allOf`. ## What Changed - Adds `oneOf` and `allOf` support to `JsonSchema`, matching the existing `anyOf` handling. - Traverses `oneOf` and `allOf` anywhere schema children are visited, including sanitization, definition reachability, description stripping, and deep schema compaction. - Adds a final large-schema compaction pass that prunes schema objects containing `anyOf`, `oneOf`, or `allOf` to `{}` if earlier compaction passes still leave the schema over budget. ## Validation Golden schema token validation over `2,025` schemas under `golden_schemas`, all parsed successfully. Token count is `o200k_base` over compact JSON from `parse_tool_input_schema`. | Percentile | Before PR | After oneOf/allOf | After pruning | |---|---:|---:|---:| | p0 | 9 | 9 | 9 | | p10 | 63 | 64 | 64 | | p25 | 86 | 87 | 87 | | p50 | 125 | 128 | 128 | | p75 | 203 | 206 | 206 | | p90 | 327 | 333 | 333 | | p95 | 460 | 473 | 473 | | p99 | 763 | 779 | 779 | | max | 891 | 955 | 955 | Totals: | Parser state | Total tokens | |---|---:| | Before PR | 345,713 | | After oneOf/allOf | 352,686 | | After pruning | 352,686 | The pruning column matches the oneOf/allOf column for this corpus because no parsed compact golden schema remains over the `4,000` compact-byte budget after the earlier compaction passes. --- codex-rs/tools/src/json_schema.rs | 65 +++++- codex-rs/tools/src/json_schema_tests.rs | 202 ++++++++++++++++++ .../json_schema_policy/google_drive.json | 13 +- 3 files changed, 270 insertions(+), 10 deletions(-) diff --git a/codex-rs/tools/src/json_schema.rs b/codex-rs/tools/src/json_schema.rs index 5352eaa91..528bb3adf 100644 --- a/codex-rs/tools/src/json_schema.rs +++ b/codex-rs/tools/src/json_schema.rs @@ -6,13 +6,15 @@ use std::collections::BTreeMap; use std::collections::BTreeSet; const DEFINITION_TABLE_KEYS: [&str; 2] = ["$defs", "definitions"]; -const SCHEMA_CHILD_KEYS: [&str; 2] = ["items", "anyOf"]; +const SCHEMA_CHILD_KEYS: [&str; 4] = ["items", "anyOf", "oneOf", "allOf"]; +const COMPOSITION_SCHEMA_KEYS: [&str; 3] = ["anyOf", "oneOf", "allOf"]; /// Primitive JSON Schema type names we support in tool definitions. /// /// This mirrors the OpenAI Structured Outputs subset for JSON Schema `type`: /// string, number, boolean, integer, object, array, and null. -/// Keywords such as `enum`, `const`, and `anyOf` are modeled separately. +/// Keywords such as `enum`, `const`, `anyOf`, `oneOf`, and `allOf` are modeled +/// separately. /// See . #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)] #[serde(rename_all = "lowercase")] @@ -61,6 +63,10 @@ pub struct JsonSchema { pub additional_properties: Option, #[serde(rename = "anyOf", skip_serializing_if = "Option::is_none")] pub any_of: Option>, + #[serde(rename = "oneOf", skip_serializing_if = "Option::is_none")] + pub one_of: Option>, + #[serde(rename = "allOf", skip_serializing_if = "Option::is_none")] + pub all_of: Option>, #[serde(rename = "$defs", skip_serializing_if = "Option::is_none")] pub defs: Option>, #[serde(skip_serializing_if = "Option::is_none")] @@ -85,6 +91,22 @@ impl JsonSchema { } } + pub fn one_of(variants: Vec, description: Option) -> Self { + Self { + description, + one_of: Some(variants), + ..Default::default() + } + } + + pub fn all_of(variants: Vec, description: Option) -> Self { + Self { + description, + all_of: Some(variants), + ..Default::default() + } + } + pub fn boolean(description: Option) -> Self { Self::typed(JsonSchemaPrimitiveType::Boolean, description) } @@ -219,6 +241,7 @@ const LARGE_SCHEMA_COMPACTION_PASSES: &[LargeSchemaCompactionPass] = &[ strip_schema_descriptions, drop_schema_definitions, collapse_deep_schema_objects_from_root, + prune_schema_compositions, ]; fn collapse_deep_schema_objects_from_root(value: &mut JsonValue) { @@ -397,6 +420,27 @@ fn collapse_deep_schema_objects(value: &mut JsonValue, depth: usize) { } } +fn prune_schema_compositions(value: &mut JsonValue) { + match value { + JsonValue::Array(values) => { + for value in values { + prune_schema_compositions(value); + } + } + JsonValue::Object(map) => { + if has_composition_keyword(map) { + *value = json!({}); + return; + } + + for_each_schema_child_mut(map, DefinitionTraversal::Skip, &mut |value| { + prune_schema_compositions(value); + }); + } + _ => {} + } +} + fn is_complex_schema_object(map: &serde_json::Map) -> bool { SCHEMA_CHILD_KEYS.iter().any(|key| map.contains_key(*key)) || map.contains_key("properties") @@ -404,10 +448,16 @@ fn is_complex_schema_object(map: &serde_json::Map) -> bool { || map.contains_key("$ref") } +fn has_composition_keyword(map: &serde_json::Map) -> bool { + COMPOSITION_SCHEMA_KEYS + .into_iter() + .any(|key| map.contains_key(key)) +} + /// Sanitize a JSON Schema (as serde_json::Value) so it can fit our limited /// schema representation. This function: /// - Ensures every typed schema object has a `"type"` when required. -/// - Preserves explicit `anyOf`. +/// - Preserves explicit `anyOf`, `oneOf`, and `allOf`. /// - Preserves `$ref` and reachable local `$defs` / `definitions`. /// - Collapses `const` into single-value `enum`. /// - Fills required child fields for object/array schema types, including @@ -443,8 +493,10 @@ fn sanitize_json_schema(value: &mut JsonValue) { if let Some(value) = map.get_mut("prefixItems") { sanitize_json_schema(value); } - if let Some(value) = map.get_mut("anyOf") { - sanitize_json_schema(value); + for key in COMPOSITION_SCHEMA_KEYS { + if let Some(value) = map.get_mut(key) { + sanitize_json_schema(value); + } } for table in DEFINITION_TABLE_KEYS { sanitize_schema_table(map, table); @@ -456,7 +508,8 @@ fn sanitize_json_schema(value: &mut JsonValue) { let mut schema_types = normalized_schema_types(map); - if schema_types.is_empty() && (map.contains_key("$ref") || map.contains_key("anyOf")) { + if schema_types.is_empty() && (map.contains_key("$ref") || has_composition_keyword(map)) + { return; } diff --git a/codex-rs/tools/src/json_schema_tests.rs b/codex-rs/tools/src/json_schema_tests.rs index 6973d06c0..1f245c6a6 100644 --- a/codex-rs/tools/src/json_schema_tests.rs +++ b/codex-rs/tools/src/json_schema_tests.rs @@ -728,6 +728,109 @@ fn parse_tool_input_schema_preserves_nested_any_of_property() { ); } +#[test] +fn parse_tool_input_schema_preserves_nested_one_of_property() { + // Example schema shape: + // { + // "type": "object", + // "properties": { + // "query": { + // "oneOf": [ + // { "const": "exact" }, + // { "type": "number" } + // ] + // } + // } + // } + // + // Expected normalization behavior: + // - The nested `oneOf` is preserved. + // - Child variants are recursively sanitized, including `const` to `enum`. + let schema = parse_tool_input_schema(&serde_json::json!({ + "type": "object", + "properties": { + "query": { + "oneOf": [ + { "const": "exact" }, + { "type": "number" } + ] + } + } + })) + .expect("parse schema"); + + assert_eq!( + schema, + JsonSchema::object( + BTreeMap::from([( + "query".to_string(), + JsonSchema::one_of( + vec![ + JsonSchema::string_enum( + vec![serde_json::json!("exact")], + /*description*/ None, + ), + JsonSchema::number(/*description*/ None), + ], + /*description*/ None, + ), + )]), + /*required*/ None, + /*additional_properties*/ None + ) + ); +} + +#[test] +fn parse_tool_input_schema_preserves_nested_all_of_property() { + // Example schema shape: + // { + // "type": "object", + // "properties": { + // "query": { + // "allOf": [ + // { "type": "string" }, + // { "description": "unrecognized by itself" } + // ] + // } + // } + // } + // + // Expected normalization behavior: + // - The nested `allOf` is preserved structurally rather than flattened. + // - Child variants are recursively sanitized. + let schema = parse_tool_input_schema(&serde_json::json!({ + "type": "object", + "properties": { + "query": { + "allOf": [ + { "type": "string" }, + { "description": "unrecognized by itself" } + ] + } + } + })) + .expect("parse schema"); + + assert_eq!( + schema, + JsonSchema::object( + BTreeMap::from([( + "query".to_string(), + JsonSchema::all_of( + vec![ + JsonSchema::string(/*description*/ None), + JsonSchema::default(), + ], + /*description*/ None, + ), + )]), + /*required*/ None, + /*additional_properties*/ None + ) + ); +} + #[test] fn parse_tool_input_schema_preserves_type_unions_without_rewriting_to_any_of() { // Example schema shape: @@ -1070,6 +1173,77 @@ fn parse_large_tool_input_schema_strips_descriptions_without_removing_descriptio ); } +#[test] +fn parse_large_tool_input_schema_prunes_compositions_as_last_resort() { + for composition_key in super::COMPOSITION_SCHEMA_KEYS { + let variants = vec![ + serde_json::json!({ + "type": "string", + "enum": ["first ".repeat(400)] + }), + serde_json::json!({ + "type": "string", + "enum": ["second ".repeat(400)] + }), + serde_json::json!({ + "type": "string", + "enum": ["third ".repeat(400)] + }), + ]; + + let mut choice = serde_json::Map::new(); + choice.insert( + composition_key.to_string(), + serde_json::Value::Array(variants), + ); + let schema = parse_tool_input_schema(&serde_json::json!({ + "type": "object", + "properties": { + "choice": choice + } + })) + .expect("parse schema"); + + assert_eq!( + serde_json::to_value(schema).expect("serialize schema"), + serde_json::json!({ + "type": "object", + "properties": { + "choice": {} + } + }) + ); + } +} + +#[test] +fn parse_large_tool_input_schema_prunes_single_composition_variant_if_still_over_budget() { + let schema = parse_tool_input_schema(&serde_json::json!({ + "type": "object", + "properties": { + "choice": { + "anyOf": [ + { + "type": "string", + "enum": ["x".repeat(4_500)] + } + ] + } + } + })) + .expect("parse schema"); + + assert_eq!( + serde_json::to_value(schema).expect("serialize schema"), + serde_json::json!({ + "type": "object", + "properties": { + "choice": {} + } + }) + ); +} + #[test] fn parse_large_tool_input_schema_preserves_object_enum_literal_descriptions() { let schema = parse_tool_input_schema(&serde_json::json!({ @@ -1397,10 +1571,24 @@ fn parse_tool_input_schema_collects_refs_from_schema_child_keywords() { {"$ref": "#/$defs/Choice"}, {"type": "string"} ] + }, + "exclusive_choice": { + "oneOf": [ + {"$ref": "#/$defs/ExclusiveChoice"}, + {"type": "integer"} + ] + }, + "combined": { + "allOf": [ + {"$ref": "#/$defs/Combined"}, + {"type": "object"} + ] } }, "$defs": { + "Combined": {"type": "object"}, "Choice": {"type": "boolean"}, + "ExclusiveChoice": {"type": "null"}, "Extra": {"type": "number"}, "Item": {"type": "string"}, "Unused": {"type": "null"} @@ -1419,6 +1607,18 @@ fn parse_tool_input_schema_collects_refs_from_schema_child_keywords() { {"type": "string"} ] }, + "combined": { + "allOf": [ + {"$ref": "#/$defs/Combined"}, + {"type": "object", "properties": {}} + ] + }, + "exclusive_choice": { + "oneOf": [ + {"$ref": "#/$defs/ExclusiveChoice"}, + {"type": "integer"} + ] + }, "items_holder": { "type": "array", "items": {"$ref": "#/$defs/Item"} @@ -1430,7 +1630,9 @@ fn parse_tool_input_schema_collects_refs_from_schema_child_keywords() { } }, "$defs": { + "Combined": {"type": "object", "properties": {}}, "Choice": {"type": "boolean"}, + "ExclusiveChoice": {"type": "null"}, "Extra": {"type": "number"}, "Item": {"type": "string"} } diff --git a/codex-rs/tools/tests/fixtures/json_schema_policy/google_drive.json b/codex-rs/tools/tests/fixtures/json_schema_policy/google_drive.json index bf37296cc..518038aee 100644 --- a/codex-rs/tools/tests/fixtures/json_schema_policy/google_drive.json +++ b/codex-rs/tools/tests/fixtures/json_schema_policy/google_drive.json @@ -53,13 +53,18 @@ { "pointer": "/properties/title/description", "value": "Copied file title." + }, + { + "pointer": "/properties/file/oneOf/0/type", + "value": "string" + }, + { + "pointer": "/properties/metadata/allOf/0/properties/source/type", + "value": "string" } ], "expected_pruned": [], - "expected_dropped_fields": [ - "/properties/file/oneOf", - "/properties/metadata/allOf" - ] + "expected_dropped_fields": [] } ] }