diff --git a/internal/api/modules/amp/fallback_handlers.go b/internal/api/modules/amp/fallback_handlers.go
index 3ec6c85e..c17f3f85 100644
--- a/internal/api/modules/amp/fallback_handlers.go
+++ b/internal/api/modules/amp/fallback_handlers.go
@@ -133,8 +133,9 @@ func (fh *FallbackHandler) WrapHandler(handler gin.HandlerFunc) gin.HandlerFunc
         return
     }
 
-    // Normalize model (handles Gemini thinking suffixes)
-    normalizedModel, _ := util.NormalizeGeminiThinkingModel(modelName)
+    // Normalize model (handles dynamic thinking suffixes)
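+    // Illustrative: a "-thinking-high" suffix is stripped here and carried forward as metadata.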
+    normalizedModel, _ := util.NormalizeThinkingModel(modelName)
 
     // Track resolved model for logging (may change if mapping is applied)
     resolvedModel := normalizedModel
diff --git a/internal/registry/model_definitions.go b/internal/registry/model_definitions.go
index de547182..2f87f195 100644
--- a/internal/registry/model_definitions.go
+++ b/internal/registry/model_definitions.go
@@ -26,60 +26,6 @@ func GetClaudeModels() []*ModelInfo {
             DisplayName: "Claude 4.5 Sonnet",
             ContextLength: 200000,
             MaxCompletionTokens: 64000,
-        },
-        {
-            ID: "claude-sonnet-4-5-thinking",
-            Object: "model",
-            Created: 1759104000, // 2025-09-29
-            OwnedBy: "anthropic",
-            Type: "claude",
-            DisplayName: "Claude 4.5 Sonnet Thinking",
-            ContextLength: 200000,
-            MaxCompletionTokens: 64000,
-            Thinking: &ThinkingSupport{Min: 1024, Max: 100000, ZeroAllowed: false, DynamicAllowed: true},
-        },
-        {
-            ID: "claude-opus-4-5-thinking",
-            Object: "model",
-            Created: 1761955200, // 2025-11-01
-            OwnedBy: "anthropic",
-            Type: "claude",
-            DisplayName: "Claude 4.5 Opus Thinking",
-            ContextLength: 200000,
-            MaxCompletionTokens: 64000,
-            Thinking: &ThinkingSupport{Min: 1024, Max: 100000, ZeroAllowed: false, DynamicAllowed: true},
-        },
-        {
-            ID: "claude-opus-4-5-thinking-low",
-            Object: "model",
-            Created: 1761955200, // 2025-11-01
-            OwnedBy: "anthropic",
-            Type: "claude",
-            DisplayName: "Claude 4.5 Opus Thinking Low",
-            ContextLength: 200000,
-            MaxCompletionTokens: 64000,
-            Thinking: &ThinkingSupport{Min: 1024, Max: 100000, ZeroAllowed: false, DynamicAllowed: true},
-        },
-        {
-            ID: "claude-opus-4-5-thinking-medium",
-            Object: "model",
-            Created: 1761955200, // 2025-11-01
-            OwnedBy: "anthropic",
-            Type: "claude",
-            DisplayName: "Claude 4.5 Opus Thinking Medium",
-            ContextLength: 200000,
-            MaxCompletionTokens: 64000,
-            Thinking: &ThinkingSupport{Min: 1024, Max: 100000, ZeroAllowed: false, DynamicAllowed: true},
-        },
-        {
-            ID: "claude-opus-4-5-thinking-high",
-            Object: "model",
-            Created: 1761955200, // 2025-11-01
-            OwnedBy: "anthropic",
-            Type: "claude",
-            DisplayName: "Claude 4.5 Opus Thinking High",
-            ContextLength: 200000,
-            MaxCompletionTokens: 64000,
             Thinking: &ThinkingSupport{Min: 1024, Max: 100000, ZeroAllowed: false, DynamicAllowed: true},
         },
         {
@@ -92,6 +38,7 @@ func GetClaudeModels() []*ModelInfo {
             Description: "Premium model combining maximum intelligence with practical performance",
             ContextLength: 200000,
             MaxCompletionTokens: 64000,
+            Thinking: &ThinkingSupport{Min: 1024, Max: 100000, ZeroAllowed: false, DynamicAllowed: true},
         },
         {
             ID: "claude-opus-4-1-20250805",
@@ -530,58 +477,6 @@ func GetOpenAIModels() []*ModelInfo {
             MaxCompletionTokens: 128000,
             SupportedParameters: []string{"tools"},
         },
-        {
-            ID: "gpt-5-minimal",
-            Object: "model",
-            Created: 1754524800,
-            OwnedBy: "openai",
-            Type: "openai",
-            Version: "gpt-5-2025-08-07",
-            DisplayName: "GPT 5 Minimal",
-            Description: "Stable version of GPT 5, The best model for coding and agentic tasks across domains.",
-            ContextLength: 400000,
-            MaxCompletionTokens: 128000,
-            SupportedParameters: []string{"tools"},
-        },
-        {
-            ID: "gpt-5-low",
-            Object: "model",
-            Created: 1754524800,
-            OwnedBy: "openai",
-            Type: "openai",
-            Version: "gpt-5-2025-08-07",
-            DisplayName: "GPT 5 Low",
-            Description: "Stable version of GPT 5, The best model for coding and agentic tasks across domains.",
-            ContextLength: 400000,
-            MaxCompletionTokens: 128000,
-            SupportedParameters: []string{"tools"},
-        },
-        {
-            ID: "gpt-5-medium",
-            Object: "model",
-            Created: 1754524800,
-            OwnedBy: "openai",
-            Type: "openai",
-            Version: "gpt-5-2025-08-07",
-            DisplayName: "GPT 5 Medium",
-            Description: "Stable version of GPT 5, The best model for coding and agentic tasks across domains.",
-            ContextLength: 400000,
-            MaxCompletionTokens: 128000,
-            SupportedParameters: []string{"tools"},
-        },
-        {
-            ID: "gpt-5-high",
-            Object: "model",
-            Created: 1754524800,
-            OwnedBy: "openai",
-            Type: "openai",
-            Version: "gpt-5-2025-08-07",
-            DisplayName: "GPT 5 High",
-            Description: "Stable version of GPT 5, The best model for coding and agentic tasks across domains.",
-            ContextLength: 400000,
-            MaxCompletionTokens: 128000,
-            SupportedParameters: []string{"tools"},
-        },
         {
             ID: "gpt-5-codex",
             Object: "model",
@@ -595,45 +490,6 @@ func GetOpenAIModels() []*ModelInfo {
             MaxCompletionTokens: 128000,
             SupportedParameters: []string{"tools"},
         },
-        {
-            ID: "gpt-5-codex-low",
-            Object: "model",
-            Created: 1757894400,
-            OwnedBy: "openai",
-            Type: "openai",
-            Version: "gpt-5-2025-09-15",
-            DisplayName: "GPT 5 Codex Low",
-            Description: "Stable version of GPT 5 Codex, The best model for coding and agentic tasks across domains.",
-            ContextLength: 400000,
-            MaxCompletionTokens: 128000,
-            SupportedParameters: []string{"tools"},
-        },
-        {
-            ID: "gpt-5-codex-medium",
-            Object: "model",
-            Created: 1757894400,
-            OwnedBy: "openai",
-            Type: "openai",
-            Version: "gpt-5-2025-09-15",
-            DisplayName: "GPT 5 Codex Medium",
-            Description: "Stable version of GPT 5 Codex, The best model for coding and agentic tasks across domains.",
-            ContextLength: 400000,
-            MaxCompletionTokens: 128000,
-            SupportedParameters: []string{"tools"},
-        },
-        {
-            ID: "gpt-5-codex-high",
-            Object: "model",
-            Created: 1757894400,
-            OwnedBy: "openai",
-            Type: "openai",
-            Version: "gpt-5-2025-09-15",
-            DisplayName: "GPT 5 Codex High",
-            Description: "Stable version of GPT 5 Codex, The best model for coding and agentic tasks across domains.",
-            ContextLength: 400000,
-            MaxCompletionTokens: 128000,
-            SupportedParameters: []string{"tools"},
-        },
         {
             ID: "gpt-5-codex-mini",
             Object: "model",
@@ -647,32 +503,6 @@ func GetOpenAIModels() []*ModelInfo {
             MaxCompletionTokens: 128000,
             SupportedParameters: []string{"tools"},
         },
-        {
-            ID: "gpt-5-codex-mini-medium",
-            Object: "model",
-            Created: 1762473600,
-            OwnedBy: "openai",
-            Type: "openai",
-            Version: "gpt-5-2025-11-07",
-            DisplayName: "GPT 5 Codex Mini Medium",
-            Description: "Stable version of GPT 5 Codex Mini: cheaper, faster, but less capable version of GPT 5 Codex.",
-            ContextLength: 400000,
-            MaxCompletionTokens: 128000,
-            SupportedParameters: []string{"tools"},
-        },
-        {
-            ID: "gpt-5-codex-mini-high",
-            Object: "model",
-            Created: 1762473600,
-            OwnedBy: "openai",
-            Type: "openai",
-            Version: "gpt-5-2025-11-07",
-            DisplayName: "GPT 5 Codex Mini High",
-            Description: "Stable version of GPT 5 Codex Mini: cheaper, faster, but less capable version of GPT 5 Codex.",
-            ContextLength: 400000,
-            MaxCompletionTokens: 128000,
-            SupportedParameters: []string{"tools"},
-        },
         {
             ID: "gpt-5.1",
             Object: "model",
@@ -686,58 +516,6 @@ func GetOpenAIModels() []*ModelInfo {
             MaxCompletionTokens: 128000,
             SupportedParameters: []string{"tools"},
         },
-        {
-            ID: "gpt-5.1-none",
-            Object: "model",
-            Created: 1762905600,
-            OwnedBy: "openai",
-            Type: "openai",
-            Version: "gpt-5.1-2025-11-12",
-            DisplayName: "GPT 5.1 Nothink",
-            Description: "Stable version of GPT 5.1, The best model for coding and agentic tasks across domains.",
-            ContextLength: 400000,
-            MaxCompletionTokens: 128000,
-            SupportedParameters: []string{"tools"},
-        },
-        {
-            ID: "gpt-5.1-low",
-            Object: "model",
-            Created: 1762905600,
-            OwnedBy: "openai",
-            Type: "openai",
-            Version: "gpt-5.1-2025-11-12",
-            DisplayName: "GPT 5 Low",
-            Description: "Stable version of GPT 5, The best model for coding and agentic tasks across domains.",
-            ContextLength: 400000,
-            MaxCompletionTokens: 128000,
-            SupportedParameters: []string{"tools"},
-        },
-        {
-            ID: "gpt-5.1-medium",
-            Object: "model",
-            Created: 1762905600,
-            OwnedBy: "openai",
-            Type: "openai",
-            Version: "gpt-5.1-2025-11-12",
-            DisplayName: "GPT 5.1 Medium",
-            Description: "Stable version of GPT 5.1, The best model for coding and agentic tasks across domains.",
-            ContextLength: 400000,
-            MaxCompletionTokens: 128000,
-            SupportedParameters: []string{"tools"},
-        },
-        {
-            ID: "gpt-5.1-high",
-            Object: "model",
-            Created: 1762905600,
-            OwnedBy: "openai",
-            Type: "openai",
-            Version: "gpt-5.1-2025-11-12",
-            DisplayName: "GPT 5.1 High",
-            Description: "Stable version of GPT 5.1, The best model for coding and agentic tasks across domains.",
-            ContextLength: 400000,
-            MaxCompletionTokens: 128000,
-            SupportedParameters: []string{"tools"},
-        },
         {
             ID: "gpt-5.1-codex",
             Object: "model",
@@ -751,45 +529,6 @@ func GetOpenAIModels() []*ModelInfo {
             MaxCompletionTokens: 128000,
             SupportedParameters: []string{"tools"},
         },
-        {
-            ID: "gpt-5.1-codex-low",
-            Object: "model",
-            Created: 1762905600,
-            OwnedBy: "openai",
-            Type: "openai",
-            Version: "gpt-5.1-2025-11-12",
-            DisplayName: "GPT 5.1 Codex Low",
-            Description: "Stable version of GPT 5.1 Codex, The best model for coding and agentic tasks across domains.",
-            ContextLength: 400000,
-            MaxCompletionTokens: 128000,
-            SupportedParameters: []string{"tools"},
-        },
-        {
-            ID: "gpt-5.1-codex-medium",
-            Object: "model",
-            Created: 1762905600,
-            OwnedBy: "openai",
-            Type: "openai",
-            Version: "gpt-5.1-2025-11-12",
-            DisplayName: "GPT 5.1 Codex Medium",
-            Description: "Stable version of GPT 5.1 Codex, The best model for coding and agentic tasks across domains.",
-            ContextLength: 400000,
-            MaxCompletionTokens: 128000,
-            SupportedParameters: []string{"tools"},
-        },
-        {
-            ID: "gpt-5.1-codex-high",
-            Object: "model",
-            Created: 1762905600,
-            OwnedBy: "openai",
-            Type: "openai",
-            Version: "gpt-5.1-2025-11-12",
-            DisplayName: "GPT 5.1 Codex High",
-            Description: "Stable version of GPT 5.1 Codex, The best model for coding and agentic tasks across domains.",
-            ContextLength: 400000,
-            MaxCompletionTokens: 128000,
-            SupportedParameters: []string{"tools"},
-        },
         {
             ID: "gpt-5.1-codex-mini",
             Object: "model",
@@ -803,33 +542,6 @@ func GetOpenAIModels() []*ModelInfo {
             MaxCompletionTokens: 128000,
             SupportedParameters: []string{"tools"},
         },
-        {
-            ID: "gpt-5.1-codex-mini-medium",
-            Object: "model",
-            Created: 1762905600,
-            OwnedBy: "openai",
-            Type: "openai",
-            Version: "gpt-5.1-2025-11-12",
-            DisplayName: "GPT 5.1 Codex Mini Medium",
-            Description: "Stable version of GPT 5.1 Codex Mini: cheaper, faster, but less capable version of GPT 5.1 Codex.",
-            ContextLength: 400000,
-            MaxCompletionTokens: 128000,
-            SupportedParameters: []string{"tools"},
-        },
-        {
-            ID: "gpt-5.1-codex-mini-high",
-            Object: "model",
-            Created: 1762905600,
-            OwnedBy: "openai",
-            Type: "openai",
-            Version: "gpt-5.1-2025-11-12",
-            DisplayName: "GPT 5.1 Codex Mini High",
-            Description: "Stable version of GPT 5.1 Codex Mini: cheaper, faster, but less capable version of GPT 5.1 Codex.",
-            ContextLength: 400000,
-            MaxCompletionTokens: 128000,
-            SupportedParameters: []string{"tools"},
-        },
         {
             ID: "gpt-5.1-codex-max",
             Object: "model",
[]string{"tools"}, - }, - { - ID: "gpt-5.1-codex-mini-high", - Object: "model", - Created: 1762905600, - OwnedBy: "openai", - Type: "openai", - Version: "gpt-5.1-2025-11-12", - DisplayName: "GPT 5.1 Codex Mini High", - Description: "Stable version of GPT 5.1 Codex Mini: cheaper, faster, but less capable version of GPT 5.1 Codex.", - ContextLength: 400000, - MaxCompletionTokens: 128000, - SupportedParameters: []string{"tools"}, - }, - { ID: "gpt-5.1-codex-max", Object: "model", @@ -843,58 +555,6 @@ func GetOpenAIModels() []*ModelInfo { MaxCompletionTokens: 128000, SupportedParameters: []string{"tools"}, }, - { - ID: "gpt-5.1-codex-max-low", - Object: "model", - Created: 1763424000, - OwnedBy: "openai", - Type: "openai", - Version: "gpt-5.1-max", - DisplayName: "GPT 5.1 Codex Max Low", - Description: "Stable version of GPT 5.1 Codex Max Low", - ContextLength: 400000, - MaxCompletionTokens: 128000, - SupportedParameters: []string{"tools"}, - }, - { - ID: "gpt-5.1-codex-max-medium", - Object: "model", - Created: 1763424000, - OwnedBy: "openai", - Type: "openai", - Version: "gpt-5.1-max", - DisplayName: "GPT 5.1 Codex Max Medium", - Description: "Stable version of GPT 5.1 Codex Max Medium", - ContextLength: 400000, - MaxCompletionTokens: 128000, - SupportedParameters: []string{"tools"}, - }, - { - ID: "gpt-5.1-codex-max-high", - Object: "model", - Created: 1763424000, - OwnedBy: "openai", - Type: "openai", - Version: "gpt-5.1-max", - DisplayName: "GPT 5.1 Codex Max High", - Description: "Stable version of GPT 5.1 Codex Max High", - ContextLength: 400000, - MaxCompletionTokens: 128000, - SupportedParameters: []string{"tools"}, - }, - { - ID: "gpt-5.1-codex-max-xhigh", - Object: "model", - Created: 1763424000, - OwnedBy: "openai", - Type: "openai", - Version: "gpt-5.1-max", - DisplayName: "GPT 5.1 Codex Max XHigh", - Description: "Stable version of GPT 5.1 Codex Max XHigh", - ContextLength: 400000, - MaxCompletionTokens: 128000, - SupportedParameters: []string{"tools"}, - }, } } diff --git a/internal/runtime/executor/claude_executor.go b/internal/runtime/executor/claude_executor.go index 1a18c46a..c7470954 100644 --- a/internal/runtime/executor/claude_executor.go +++ b/internal/runtime/executor/claude_executor.go @@ -54,15 +54,22 @@ func (e *ClaudeExecutor) Execute(ctx context.Context, auth *cliproxyauth.Auth, r // Use streaming translation to preserve function calling, except for claude. 
+    // Inject thinking config based on model metadata for thinking variants
+    body = e.injectThinkingConfig(req.Model, req.Metadata, body)
 
-    if !strings.HasPrefix(modelForUpstream, "claude-3-5-haiku") {
+    if !strings.HasPrefix(upstreamModel, "claude-3-5-haiku") {
         body = checkSystemInstructions(body)
     }
     body = applyPayloadConfig(e.cfg, req.Model, body)
@@ -161,11 +168,20 @@ func (e *ClaudeExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.A
     from := opts.SourceFormat
     to := sdktranslator.FromString("claude")
     body := sdktranslator.TranslateRequest(from, to, req.Model, bytes.Clone(req.Payload), true)
-    if modelOverride := e.resolveUpstreamModel(req.Model, auth); modelOverride != "" {
-        body, _ = sjson.SetBytes(body, "model", modelOverride)
+    upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata)
+    if upstreamModel == "" {
+        upstreamModel = req.Model
     }
-    // Inject thinking config based on model suffix for thinking variants
-    body = e.injectThinkingConfig(req.Model, body)
+    if modelOverride := e.resolveUpstreamModel(upstreamModel, auth); modelOverride != "" {
+        upstreamModel = modelOverride
+    } else if !strings.EqualFold(upstreamModel, req.Model) {
+        if modelOverride := e.resolveUpstreamModel(req.Model, auth); modelOverride != "" {
+            upstreamModel = modelOverride
+        }
+    }
+    body, _ = sjson.SetBytes(body, "model", upstreamModel)
+    // Inject thinking config based on model metadata for thinking variants
+    body = e.injectThinkingConfig(req.Model, req.Metadata, body)
     body = checkSystemInstructions(body)
     body = applyPayloadConfig(e.cfg, req.Model, body)
@@ -295,13 +311,20 @@ func (e *ClaudeExecutor) CountTokens(ctx context.Context, auth *cliproxyauth.Aut
     // Use streaming translation to preserve function calling, except for claude.
     stream := from != to
     body := sdktranslator.TranslateRequest(from, to, req.Model, bytes.Clone(req.Payload), stream)
-    modelForUpstream := req.Model
-    if modelOverride := e.resolveUpstreamModel(req.Model, auth); modelOverride != "" {
-        body, _ = sjson.SetBytes(body, "model", modelOverride)
-        modelForUpstream = modelOverride
+    upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata)
+    if upstreamModel == "" {
+        upstreamModel = req.Model
     }
+    if modelOverride := e.resolveUpstreamModel(upstreamModel, auth); modelOverride != "" {
+        upstreamModel = modelOverride
+    } else if !strings.EqualFold(upstreamModel, req.Model) {
+        if modelOverride := e.resolveUpstreamModel(req.Model, auth); modelOverride != "" {
+            upstreamModel = modelOverride
+        }
+    }
+    body, _ = sjson.SetBytes(body, "model", upstreamModel)
 
-    if !strings.HasPrefix(modelForUpstream, "claude-3-5-haiku") {
+    if !strings.HasPrefix(upstreamModel, "claude-3-5-haiku") {
         body = checkSystemInstructions(body)
     }
@@ -427,25 +450,15 @@ func extractAndRemoveBetas(body []byte) ([]string, []byte) {
     return betas, body
 }
 
-// injectThinkingConfig adds thinking configuration based on model name suffix
-func (e *ClaudeExecutor) injectThinkingConfig(modelName string, body []byte) []byte {
+// injectThinkingConfig adds thinking configuration based on metadata or legacy suffixes.
+func (e *ClaudeExecutor) injectThinkingConfig(modelName string, metadata map[string]any, body []byte) []byte {
     // Only inject if thinking config is not already present
     if gjson.GetBytes(body, "thinking").Exists() {
         return body
     }
 
-    var budgetTokens int
-    switch {
-    case strings.HasSuffix(modelName, "-thinking-low"):
-        budgetTokens = 1024
-    case strings.HasSuffix(modelName, "-thinking-medium"):
-        budgetTokens = 8192
-    case strings.HasSuffix(modelName, "-thinking-high"):
-        budgetTokens = 24576
-    case strings.HasSuffix(modelName, "-thinking"):
-        // Default thinking without suffix uses medium budget
-        budgetTokens = 8192
-    default:
+    budgetTokens, ok := resolveClaudeThinkingBudget(modelName, metadata)
+    if !ok || budgetTokens <= 0 {
         return body
     }
@@ -454,6 +467,47 @@ func (e *ClaudeExecutor) injectThinkingConfig(modelName string, body []byte) []b
     return body
 }
 
+func resolveClaudeThinkingBudget(modelName string, metadata map[string]any) (int, bool) {
+    budget, include, effort, matched := util.ThinkingFromMetadata(metadata)
+    if matched {
+        if include != nil && !*include {
+            return 0, false
+        }
+        if budget != nil {
+            normalized := util.NormalizeThinkingBudget(modelName, *budget)
+            if normalized > 0 {
+                return normalized, true
+            }
+            return 0, false
+        }
+        if effort != nil {
+            if derived, ok := util.ThinkingEffortToBudget(modelName, *effort); ok && derived > 0 {
+                return derived, true
+            }
+        }
+    }
+    return claudeBudgetFromSuffix(modelName)
+}
+
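+// Illustrative examples (assumed behaviour):
+//   claudeBudgetFromSuffix("claude-sonnet-4-5-thinking-low") -> 1024, true
+//   claudeBudgetFromSuffix("claude-sonnet-4-5")              -> 0, false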
+func claudeBudgetFromSuffix(modelName string) (int, bool) {
+    lower := strings.ToLower(strings.TrimSpace(modelName))
+    switch {
+    case strings.HasSuffix(lower, "-thinking-low"):
+        return 1024, true
+    case strings.HasSuffix(lower, "-thinking-medium"):
+        return 8192, true
+    case strings.HasSuffix(lower, "-thinking-high"):
+        return 24576, true
+    case strings.HasSuffix(lower, "-thinking"):
+        return 8192, true
+    default:
+        return 0, false
+    }
+}
+
 // ensureMaxTokensForThinking ensures max_tokens > thinking.budget_tokens when thinking is enabled.
 // Anthropic API requires this constraint; violating it returns a 400 error.
 // This function should be called after all thinking configuration is finalized.
@@ -491,35 +542,47 @@
 }
 
 func (e *ClaudeExecutor) resolveUpstreamModel(alias string, auth *cliproxyauth.Auth) string {
-    if alias == "" {
+    trimmed := strings.TrimSpace(alias)
+    if trimmed == "" {
         return ""
     }
-    // Hardcoded mappings for thinking models to actual Claude model names
-    switch alias {
-    case "claude-opus-4-5-thinking", "claude-opus-4-5-thinking-low", "claude-opus-4-5-thinking-medium", "claude-opus-4-5-thinking-high":
-        return "claude-opus-4-5-20251101"
-    case "claude-sonnet-4-5-thinking":
-        return "claude-sonnet-4-5-20250929"
-    }
+
     entry := e.resolveClaudeConfig(auth)
     if entry == nil {
         return ""
     }
+
+    normalizedModel, metadata := util.NormalizeThinkingModel(trimmed)
+
+    // Candidate names to match against configured aliases/names.
+    candidates := []string{strings.TrimSpace(normalizedModel)}
+    if !strings.EqualFold(normalizedModel, trimmed) {
+        candidates = append(candidates, trimmed)
+    }
+    if original := util.ResolveOriginalModel(normalizedModel, metadata); original != "" && !strings.EqualFold(original, normalizedModel) {
+        candidates = append(candidates, original)
+    }
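+    // Illustrative: alias "claude-opus-4-5-thinking" yields candidates
+    // ["claude-opus-4-5", "claude-opus-4-5-thinking"], matched case-insensitively below.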
"/responses" httpReq, err := e.cacheHelper(ctx, from, url, req, body) @@ -235,14 +238,16 @@ func (e *CodexExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.Au } func (e *CodexExecutor) CountTokens(ctx context.Context, auth *cliproxyauth.Auth, req cliproxyexecutor.Request, opts cliproxyexecutor.Options) (cliproxyexecutor.Response, error) { + upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata) + from := opts.SourceFormat to := sdktranslator.FromString("codex") body := sdktranslator.TranslateRequest(from, to, req.Model, bytes.Clone(req.Payload), false) modelForCounting := req.Model - body = e.setReasoningEffortByAlias(req.Model, body) - + body = applyReasoningEffortMetadata(body, req.Metadata, req.Model) + body, _ = sjson.SetBytes(body, "model", upstreamModel) body, _ = sjson.DeleteBytes(body, "previous_response_id") body, _ = sjson.SetBytes(body, "stream", false) @@ -261,83 +266,6 @@ func (e *CodexExecutor) CountTokens(ctx context.Context, auth *cliproxyauth.Auth return cliproxyexecutor.Response{Payload: []byte(translated)}, nil } -func (e *CodexExecutor) setReasoningEffortByAlias(modelName string, payload []byte) []byte { - if util.InArray([]string{"gpt-5", "gpt-5-minimal", "gpt-5-low", "gpt-5-medium", "gpt-5-high"}, modelName) { - payload, _ = sjson.SetBytes(payload, "model", "gpt-5") - switch modelName { - case "gpt-5-minimal": - payload, _ = sjson.SetBytes(payload, "reasoning.effort", "minimal") - case "gpt-5-low": - payload, _ = sjson.SetBytes(payload, "reasoning.effort", "low") - case "gpt-5-medium": - payload, _ = sjson.SetBytes(payload, "reasoning.effort", "medium") - case "gpt-5-high": - payload, _ = sjson.SetBytes(payload, "reasoning.effort", "high") - } - } else if util.InArray([]string{"gpt-5-codex", "gpt-5-codex-low", "gpt-5-codex-medium", "gpt-5-codex-high"}, modelName) { - payload, _ = sjson.SetBytes(payload, "model", "gpt-5-codex") - switch modelName { - case "gpt-5-codex-low": - payload, _ = sjson.SetBytes(payload, "reasoning.effort", "low") - case "gpt-5-codex-medium": - payload, _ = sjson.SetBytes(payload, "reasoning.effort", "medium") - case "gpt-5-codex-high": - payload, _ = sjson.SetBytes(payload, "reasoning.effort", "high") - } - } else if util.InArray([]string{"gpt-5-codex-mini", "gpt-5-codex-mini-medium", "gpt-5-codex-mini-high"}, modelName) { - payload, _ = sjson.SetBytes(payload, "model", "gpt-5-codex-mini") - switch modelName { - case "gpt-5-codex-mini-medium": - payload, _ = sjson.SetBytes(payload, "reasoning.effort", "medium") - case "gpt-5-codex-mini-high": - payload, _ = sjson.SetBytes(payload, "reasoning.effort", "high") - } - } else if util.InArray([]string{"gpt-5.1", "gpt-5.1-none", "gpt-5.1-low", "gpt-5.1-medium", "gpt-5.1-high"}, modelName) { - payload, _ = sjson.SetBytes(payload, "model", "gpt-5.1") - switch modelName { - case "gpt-5.1-none": - payload, _ = sjson.SetBytes(payload, "reasoning.effort", "none") - case "gpt-5.1-low": - payload, _ = sjson.SetBytes(payload, "reasoning.effort", "low") - case "gpt-5.1-medium": - payload, _ = sjson.SetBytes(payload, "reasoning.effort", "medium") - case "gpt-5.1-high": - payload, _ = sjson.SetBytes(payload, "reasoning.effort", "high") - } - } else if util.InArray([]string{"gpt-5.1-codex", "gpt-5.1-codex-low", "gpt-5.1-codex-medium", "gpt-5.1-codex-high"}, modelName) { - payload, _ = sjson.SetBytes(payload, "model", "gpt-5.1-codex") - switch modelName { - case "gpt-5.1-codex-low": - payload, _ = sjson.SetBytes(payload, "reasoning.effort", "low") - case "gpt-5.1-codex-medium": - 
     body = applyPayloadConfig(e.cfg, req.Model, body)
-
+    body, _ = sjson.SetBytes(body, "model", upstreamModel)
     body, _ = sjson.SetBytes(body, "stream", true)
     body, _ = sjson.DeleteBytes(body, "previous_response_id")
@@ -142,13 +145,16 @@ func (e *CodexExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.Au
     reporter := newUsageReporter(ctx, e.Identifier(), req.Model, auth)
     defer reporter.trackFailure(ctx, &err)
 
+    upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata)
+
     from := opts.SourceFormat
     to := sdktranslator.FromString("codex")
     body := sdktranslator.TranslateRequest(from, to, req.Model, bytes.Clone(req.Payload), true)
-    body = e.setReasoningEffortByAlias(req.Model, body)
+    body = applyReasoningEffortMetadata(body, req.Metadata, req.Model)
     body = applyPayloadConfig(e.cfg, req.Model, body)
 
     body, _ = sjson.DeleteBytes(body, "previous_response_id")
+    body, _ = sjson.SetBytes(body, "model", upstreamModel)
 
     url := strings.TrimSuffix(baseURL, "/") + "/responses"
     httpReq, err := e.cacheHelper(ctx, from, url, req, body)
@@ -235,14 +238,16 @@ func (e *CodexExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.Au
 }
 
 func (e *CodexExecutor) CountTokens(ctx context.Context, auth *cliproxyauth.Auth, req cliproxyexecutor.Request, opts cliproxyexecutor.Options) (cliproxyexecutor.Response, error) {
+    upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata)
+
     from := opts.SourceFormat
     to := sdktranslator.FromString("codex")
     body := sdktranslator.TranslateRequest(from, to, req.Model, bytes.Clone(req.Payload), false)
 
     modelForCounting := req.Model
-    body = e.setReasoningEffortByAlias(req.Model, body)
-
+    body = applyReasoningEffortMetadata(body, req.Metadata, req.Model)
+    body, _ = sjson.SetBytes(body, "model", upstreamModel)
     body, _ = sjson.DeleteBytes(body, "previous_response_id")
     body, _ = sjson.SetBytes(body, "stream", false)
@@ -261,83 +266,6 @@ func (e *CodexExecutor) CountTokens(ctx context.Context, auth *cliproxyauth.Auth
     return cliproxyexecutor.Response{Payload: []byte(translated)}, nil
 }
 
-func (e *CodexExecutor) setReasoningEffortByAlias(modelName string, payload []byte) []byte {
-    if util.InArray([]string{"gpt-5", "gpt-5-minimal", "gpt-5-low", "gpt-5-medium", "gpt-5-high"}, modelName) {
-        payload, _ = sjson.SetBytes(payload, "model", "gpt-5")
-        switch modelName {
-        case "gpt-5-minimal":
-            payload, _ = sjson.SetBytes(payload, "reasoning.effort", "minimal")
-        case "gpt-5-low":
-            payload, _ = sjson.SetBytes(payload, "reasoning.effort", "low")
-        case "gpt-5-medium":
-            payload, _ = sjson.SetBytes(payload, "reasoning.effort", "medium")
-        case "gpt-5-high":
-            payload, _ = sjson.SetBytes(payload, "reasoning.effort", "high")
-        }
-    } else if util.InArray([]string{"gpt-5-codex", "gpt-5-codex-low", "gpt-5-codex-medium", "gpt-5-codex-high"}, modelName) {
-        payload, _ = sjson.SetBytes(payload, "model", "gpt-5-codex")
-        switch modelName {
-        case "gpt-5-codex-low":
-            payload, _ = sjson.SetBytes(payload, "reasoning.effort", "low")
-        case "gpt-5-codex-medium":
-            payload, _ = sjson.SetBytes(payload, "reasoning.effort", "medium")
-        case "gpt-5-codex-high":
-            payload, _ = sjson.SetBytes(payload, "reasoning.effort", "high")
-        }
-    } else if util.InArray([]string{"gpt-5-codex-mini", "gpt-5-codex-mini-medium", "gpt-5-codex-mini-high"}, modelName) {
-        payload, _ = sjson.SetBytes(payload, "model", "gpt-5-codex-mini")
-        switch modelName {
-        case "gpt-5-codex-mini-medium":
-            payload, _ = sjson.SetBytes(payload, "reasoning.effort", "medium")
-        case "gpt-5-codex-mini-high":
-            payload, _ = sjson.SetBytes(payload, "reasoning.effort", "high")
-        }
-    } else if util.InArray([]string{"gpt-5.1", "gpt-5.1-none", "gpt-5.1-low", "gpt-5.1-medium", "gpt-5.1-high"}, modelName) {
-        payload, _ = sjson.SetBytes(payload, "model", "gpt-5.1")
-        switch modelName {
-        case "gpt-5.1-none":
-            payload, _ = sjson.SetBytes(payload, "reasoning.effort", "none")
-        case "gpt-5.1-low":
-            payload, _ = sjson.SetBytes(payload, "reasoning.effort", "low")
-        case "gpt-5.1-medium":
-            payload, _ = sjson.SetBytes(payload, "reasoning.effort", "medium")
-        case "gpt-5.1-high":
-            payload, _ = sjson.SetBytes(payload, "reasoning.effort", "high")
-        }
-    } else if util.InArray([]string{"gpt-5.1-codex", "gpt-5.1-codex-low", "gpt-5.1-codex-medium", "gpt-5.1-codex-high"}, modelName) {
-        payload, _ = sjson.SetBytes(payload, "model", "gpt-5.1-codex")
-        switch modelName {
-        case "gpt-5.1-codex-low":
-            payload, _ = sjson.SetBytes(payload, "reasoning.effort", "low")
-        case "gpt-5.1-codex-medium":
-            payload, _ = sjson.SetBytes(payload, "reasoning.effort", "medium")
-        case "gpt-5.1-codex-high":
-            payload, _ = sjson.SetBytes(payload, "reasoning.effort", "high")
-        }
-    } else if util.InArray([]string{"gpt-5.1-codex-mini", "gpt-5.1-codex-mini-medium", "gpt-5.1-codex-mini-high"}, modelName) {
-        payload, _ = sjson.SetBytes(payload, "model", "gpt-5.1-codex-mini")
-        switch modelName {
-        case "gpt-5.1-codex-mini-medium":
-            payload, _ = sjson.SetBytes(payload, "reasoning.effort", "medium")
-        case "gpt-5.1-codex-mini-high":
-            payload, _ = sjson.SetBytes(payload, "reasoning.effort", "high")
-        }
-    } else if util.InArray([]string{"gpt-5.1-codex-max", "gpt-5.1-codex-max-low", "gpt-5.1-codex-max-medium", "gpt-5.1-codex-max-high", "gpt-5.1-codex-max-xhigh"}, modelName) {
-        payload, _ = sjson.SetBytes(payload, "model", "gpt-5.1-codex-max")
-        switch modelName {
-        case "gpt-5.1-codex-max-low":
-            payload, _ = sjson.SetBytes(payload, "reasoning.effort", "low")
-        case "gpt-5.1-codex-max-medium":
-            payload, _ = sjson.SetBytes(payload, "reasoning.effort", "medium")
-        case "gpt-5.1-codex-max-high":
-            payload, _ = sjson.SetBytes(payload, "reasoning.effort", "high")
-        case "gpt-5.1-codex-max-xhigh":
-            payload, _ = sjson.SetBytes(payload, "reasoning.effort", "xhigh")
-        }
-    }
-    return payload
-}
-
 func tokenizerForCodexModel(model string) (tokenizer.Codec, error) {
     sanitized := strings.ToLower(strings.TrimSpace(model))
     switch {
diff --git a/internal/runtime/executor/gemini_executor.go b/internal/runtime/executor/gemini_executor.go
index 8879a4f1..bd214b10 100644
--- a/internal/runtime/executor/gemini_executor.go
+++ b/internal/runtime/executor/gemini_executor.go
@@ -75,6 +75,9 @@ func (e *GeminiExecutor) Execute(ctx context.Context, auth *cliproxyauth.Auth, r
     reporter := newUsageReporter(ctx, e.Identifier(), req.Model, auth)
     defer reporter.trackFailure(ctx, &err)
 
+    upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata)
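+    // Illustrative: "gemini-2.5-flash-nothinking" resolves to upstream "gemini-2.5-flash".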
+
     // Official Gemini API via API key or OAuth bearer
     from := opts.SourceFormat
     to := sdktranslator.FromString("gemini")
@@ -85,6 +87,7 @@ func (e *GeminiExecutor) Execute(ctx context.Context, auth *cliproxyauth.Auth, r
     body = util.StripThinkingConfigIfUnsupported(req.Model, body)
     body = fixGeminiImageAspectRatio(req.Model, body)
     body = applyPayloadConfig(e.cfg, req.Model, body)
+    body, _ = sjson.SetBytes(body, "model", upstreamModel)
 
     action := "generateContent"
     if req.Metadata != nil {
@@ -93,7 +96,7 @@ func (e *GeminiExecutor) Execute(ctx context.Context, auth *cliproxyauth.Auth, r
         }
     }
     baseURL := resolveGeminiBaseURL(auth)
-    url := fmt.Sprintf("%s/%s/models/%s:%s", baseURL, glAPIVersion, req.Model, action)
+    url := fmt.Sprintf("%s/%s/models/%s:%s", baseURL, glAPIVersion, upstreamModel, action)
     if opts.Alt != "" && action != "countTokens" {
         url = url + fmt.Sprintf("?$alt=%s", opts.Alt)
     }
@@ -167,6 +170,8 @@ func (e *GeminiExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.A
     reporter := newUsageReporter(ctx, e.Identifier(), req.Model, auth)
     defer reporter.trackFailure(ctx, &err)
 
+    upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata)
+
     from := opts.SourceFormat
     to := sdktranslator.FromString("gemini")
     body := sdktranslator.TranslateRequest(from, to, req.Model, bytes.Clone(req.Payload), true)
@@ -176,9 +181,10 @@ func (e *GeminiExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.A
     body = util.StripThinkingConfigIfUnsupported(req.Model, body)
     body = fixGeminiImageAspectRatio(req.Model, body)
     body = applyPayloadConfig(e.cfg, req.Model, body)
+    body, _ = sjson.SetBytes(body, "model", upstreamModel)
 
     baseURL := resolveGeminiBaseURL(auth)
-    url := fmt.Sprintf("%s/%s/models/%s:%s", baseURL, glAPIVersion, req.Model, "streamGenerateContent")
+    url := fmt.Sprintf("%s/%s/models/%s:%s", baseURL, glAPIVersion, upstreamModel, "streamGenerateContent")
     if opts.Alt == "" {
         url = url + "?alt=sse"
     } else {
diff --git a/internal/runtime/executor/gemini_vertex_executor.go b/internal/runtime/executor/gemini_vertex_executor.go
index c7d10a67..cb41df48 100644
--- a/internal/runtime/executor/gemini_vertex_executor.go
+++ b/internal/runtime/executor/gemini_vertex_executor.go
@@ -105,10 +105,12 @@ func (e *GeminiVertexExecutor) CountTokens(ctx context.Context, auth *cliproxyau
 
 // countTokensWithServiceAccount handles token counting using service account credentials.
 func (e *GeminiVertexExecutor) countTokensWithServiceAccount(ctx context.Context, auth *cliproxyauth.Auth, req cliproxyexecutor.Request, opts cliproxyexecutor.Options, projectID, location string, saJSON []byte) (cliproxyexecutor.Response, error) {
+    upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata)
+
     from := opts.SourceFormat
     to := sdktranslator.FromString("gemini")
     translatedReq := sdktranslator.TranslateRequest(from, to, req.Model, bytes.Clone(req.Payload), false)
-    if budgetOverride, includeOverride, ok := util.GeminiThinkingFromMetadata(req.Metadata); ok && util.ModelSupportsThinking(req.Model) {
+    if budgetOverride, includeOverride, ok := util.ResolveThinkingConfigFromMetadata(req.Model, req.Metadata); ok && util.ModelSupportsThinking(req.Model) {
         if budgetOverride != nil {
             norm := util.NormalizeThinkingBudget(req.Model, *budgetOverride)
             budgetOverride = &norm
@@ -117,13 +119,14 @@ func (e *GeminiVertexExecutor) countTokensWithServiceAccount(ctx context.Context
     }
     translatedReq = util.StripThinkingConfigIfUnsupported(req.Model, translatedReq)
     translatedReq = fixGeminiImageAspectRatio(req.Model, translatedReq)
+    translatedReq, _ = sjson.SetBytes(translatedReq, "model", upstreamModel)
     respCtx := context.WithValue(ctx, "alt", opts.Alt)
     translatedReq, _ = sjson.DeleteBytes(translatedReq, "tools")
     translatedReq, _ = sjson.DeleteBytes(translatedReq, "generationConfig")
     translatedReq, _ = sjson.DeleteBytes(translatedReq, "safetySettings")
     baseURL := vertexBaseURL(location)
-    url := fmt.Sprintf("%s/%s/projects/%s/locations/%s/publishers/google/models/%s:%s", baseURL, vertexAPIVersion, projectID, location, req.Model, "countTokens")
+    url := fmt.Sprintf("%s/%s/projects/%s/locations/%s/publishers/google/models/%s:%s", baseURL, vertexAPIVersion, projectID, location, upstreamModel, "countTokens")
 
     httpReq, errNewReq := http.NewRequestWithContext(respCtx, http.MethodPost, url, bytes.NewReader(translatedReq))
     if errNewReq != nil {
@@ -191,10 +194,12 @@ func (e *GeminiVertexExecutor) countTokensWithServiceAccount(ctx context.Context
 
 // countTokensWithAPIKey handles token counting using API key credentials.
 func (e *GeminiVertexExecutor) countTokensWithAPIKey(ctx context.Context, auth *cliproxyauth.Auth, req cliproxyexecutor.Request, opts cliproxyexecutor.Options, apiKey, baseURL string) (cliproxyexecutor.Response, error) {
+    upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata)
+
     from := opts.SourceFormat
     to := sdktranslator.FromString("gemini")
     translatedReq := sdktranslator.TranslateRequest(from, to, req.Model, bytes.Clone(req.Payload), false)
-    if budgetOverride, includeOverride, ok := util.GeminiThinkingFromMetadata(req.Metadata); ok && util.ModelSupportsThinking(req.Model) {
+    if budgetOverride, includeOverride, ok := util.ResolveThinkingConfigFromMetadata(req.Model, req.Metadata); ok && util.ModelSupportsThinking(req.Model) {
         if budgetOverride != nil {
             norm := util.NormalizeThinkingBudget(req.Model, *budgetOverride)
             budgetOverride = &norm
@@ -203,6 +208,7 @@ func (e *GeminiVertexExecutor) countTokensWithAPIKey(ctx context.Context, auth *
     }
     translatedReq = util.StripThinkingConfigIfUnsupported(req.Model, translatedReq)
     translatedReq = fixGeminiImageAspectRatio(req.Model, translatedReq)
+    translatedReq, _ = sjson.SetBytes(translatedReq, "model", upstreamModel)
     respCtx := context.WithValue(ctx, "alt", opts.Alt)
     translatedReq, _ = sjson.DeleteBytes(translatedReq, "tools")
     translatedReq, _ = sjson.DeleteBytes(translatedReq, "generationConfig")
@@ -286,10 +292,12 @@ func (e *GeminiVertexExecutor) executeWithServiceAccount(ctx context.Context, au
     reporter := newUsageReporter(ctx, e.Identifier(), req.Model, auth)
     defer reporter.trackFailure(ctx, &err)
 
+    upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata)
+
     from := opts.SourceFormat
     to := sdktranslator.FromString("gemini")
     body := sdktranslator.TranslateRequest(from, to, req.Model, bytes.Clone(req.Payload), false)
-    if budgetOverride, includeOverride, ok := util.GeminiThinkingFromMetadata(req.Metadata); ok && util.ModelSupportsThinking(req.Model) {
+    if budgetOverride, includeOverride, ok := util.ResolveThinkingConfigFromMetadata(req.Model, req.Metadata); ok && util.ModelSupportsThinking(req.Model) {
         if budgetOverride != nil {
             norm := util.NormalizeThinkingBudget(req.Model, *budgetOverride)
             budgetOverride = &norm
@@ -301,6 +309,7 @@ func (e *GeminiVertexExecutor) executeWithServiceAccount(ctx context.Context, au
     body = util.StripThinkingConfigIfUnsupported(req.Model, body)
     body = fixGeminiImageAspectRatio(req.Model, body)
     body = applyPayloadConfig(e.cfg, req.Model, body)
+    body, _ = sjson.SetBytes(body, "model", upstreamModel)
 
     action := "generateContent"
     if req.Metadata != nil {
@@ -309,7 +318,7 @@ func (e *GeminiVertexExecutor) executeWithServiceAccount(ctx context.Context, au
         }
     }
     baseURL := vertexBaseURL(location)
-    url := fmt.Sprintf("%s/%s/projects/%s/locations/%s/publishers/google/models/%s:%s", baseURL, vertexAPIVersion, projectID, location, req.Model, action)
+    url := fmt.Sprintf("%s/%s/projects/%s/locations/%s/publishers/google/models/%s:%s", baseURL, vertexAPIVersion, projectID, location, upstreamModel, action)
     if opts.Alt != "" && action != "countTokens" {
         url = url + fmt.Sprintf("?$alt=%s", opts.Alt)
     }
@@ -383,10 +392,12 @@ func (e *GeminiVertexExecutor) executeWithAPIKey(ctx context.Context, auth *clip
     reporter := newUsageReporter(ctx, e.Identifier(), req.Model, auth)
     defer reporter.trackFailure(ctx, &err)
 
+    upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata)
+
     from := opts.SourceFormat
     to := sdktranslator.FromString("gemini")
     body := sdktranslator.TranslateRequest(from, to, req.Model, bytes.Clone(req.Payload), false)
-    if budgetOverride, includeOverride, ok := util.GeminiThinkingFromMetadata(req.Metadata); ok && util.ModelSupportsThinking(req.Model) {
+    if budgetOverride, includeOverride, ok := util.ResolveThinkingConfigFromMetadata(req.Model, req.Metadata); ok && util.ModelSupportsThinking(req.Model) {
         if budgetOverride != nil {
             norm := util.NormalizeThinkingBudget(req.Model, *budgetOverride)
             budgetOverride = &norm
@@ -398,6 +409,7 @@ func (e *GeminiVertexExecutor) executeWithAPIKey(ctx context.Context, auth *clip
     body = util.StripThinkingConfigIfUnsupported(req.Model, body)
     body = fixGeminiImageAspectRatio(req.Model, body)
     body = applyPayloadConfig(e.cfg, req.Model, body)
+    body, _ = sjson.SetBytes(body, "model", upstreamModel)
 
     action := "generateContent"
     if req.Metadata != nil {
@@ -410,7 +422,7 @@ func (e *GeminiVertexExecutor) executeWithAPIKey(ctx context.Context, auth *clip
     if baseURL == "" {
         baseURL = "https://generativelanguage.googleapis.com"
     }
-    url := fmt.Sprintf("%s/%s/publishers/google/models/%s:%s", baseURL, vertexAPIVersion, req.Model, action)
+    url := fmt.Sprintf("%s/%s/publishers/google/models/%s:%s", baseURL, vertexAPIVersion, upstreamModel, action)
     if opts.Alt != "" && action != "countTokens" {
         url = url + fmt.Sprintf("?$alt=%s", opts.Alt)
     }
@@ -481,10 +493,12 @@ func (e *GeminiVertexExecutor) executeStreamWithServiceAccount(ctx context.Conte
     reporter := newUsageReporter(ctx, e.Identifier(), req.Model, auth)
     defer reporter.trackFailure(ctx, &err)
 
+    upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata)
+
     from := opts.SourceFormat
     to := sdktranslator.FromString("gemini")
     body := sdktranslator.TranslateRequest(from, to, req.Model, bytes.Clone(req.Payload), true)
-    if budgetOverride, includeOverride, ok := util.GeminiThinkingFromMetadata(req.Metadata); ok && util.ModelSupportsThinking(req.Model) {
+    if budgetOverride, includeOverride, ok := util.ResolveThinkingConfigFromMetadata(req.Model, req.Metadata); ok && util.ModelSupportsThinking(req.Model) {
         if budgetOverride != nil {
             norm := util.NormalizeThinkingBudget(req.Model, *budgetOverride)
             budgetOverride = &norm
@@ -496,9 +510,10 @@ func (e *GeminiVertexExecutor) executeStreamWithServiceAccount(ctx context.Conte
     body = util.StripThinkingConfigIfUnsupported(req.Model, body)
     body = fixGeminiImageAspectRatio(req.Model, body)
     body = applyPayloadConfig(e.cfg, req.Model, body)
+    body, _ = sjson.SetBytes(body, "model", upstreamModel)
 
     baseURL := vertexBaseURL(location)
-    url := fmt.Sprintf("%s/%s/projects/%s/locations/%s/publishers/google/models/%s:%s", baseURL, vertexAPIVersion, projectID, location, req.Model, "streamGenerateContent")
+    url := fmt.Sprintf("%s/%s/projects/%s/locations/%s/publishers/google/models/%s:%s", baseURL, vertexAPIVersion, projectID, location, upstreamModel, "streamGenerateContent")
     if opts.Alt == "" {
         url = url + "?alt=sse"
     } else {
@@ -595,10 +610,12 @@ func (e *GeminiVertexExecutor) executeStreamWithAPIKey(ctx context.Context, auth
     reporter := newUsageReporter(ctx, e.Identifier(), req.Model, auth)
     defer reporter.trackFailure(ctx, &err)
 
+    upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata)
+
     from := opts.SourceFormat
     to := sdktranslator.FromString("gemini")
     body := sdktranslator.TranslateRequest(from, to, req.Model, bytes.Clone(req.Payload), true)
-    if budgetOverride, includeOverride, ok := util.GeminiThinkingFromMetadata(req.Metadata); ok && util.ModelSupportsThinking(req.Model) {
+    if budgetOverride, includeOverride, ok := util.ResolveThinkingConfigFromMetadata(req.Model, req.Metadata); ok && util.ModelSupportsThinking(req.Model) {
         if budgetOverride != nil {
             norm := util.NormalizeThinkingBudget(req.Model, *budgetOverride)
             budgetOverride = &norm
@@ -610,12 +627,13 @@ func (e *GeminiVertexExecutor) executeStreamWithAPIKey(ctx context.Context, auth
     body = util.StripThinkingConfigIfUnsupported(req.Model, body)
     body = fixGeminiImageAspectRatio(req.Model, body)
     body = applyPayloadConfig(e.cfg, req.Model, body)
+    body, _ = sjson.SetBytes(body, "model", upstreamModel)
 
     // For API key auth, use simpler URL format without project/location
     if baseURL == "" {
         baseURL = "https://generativelanguage.googleapis.com"
     }
-    url := fmt.Sprintf("%s/%s/publishers/google/models/%s:%s", baseURL, vertexAPIVersion, req.Model, "streamGenerateContent")
+    url := fmt.Sprintf("%s/%s/publishers/google/models/%s:%s", baseURL, vertexAPIVersion, upstreamModel, "streamGenerateContent")
     if opts.Alt == "" {
         url = url + "?alt=sse"
     } else {
diff --git a/internal/runtime/executor/iflow_executor.go b/internal/runtime/executor/iflow_executor.go
index 3589e922..c68a6431 100644
--- a/internal/runtime/executor/iflow_executor.go
+++ b/internal/runtime/executor/iflow_executor.go
@@ -57,6 +57,10 @@ func (e *IFlowExecutor) Execute(ctx context.Context, auth *cliproxyauth.Auth, re
     from := opts.SourceFormat
     to := sdktranslator.FromString("openai")
     body := sdktranslator.TranslateRequest(from, to, req.Model, bytes.Clone(req.Payload), false)
+    body = applyReasoningEffortMetadataChatCompletions(body, req.Metadata, req.Model)
+    if upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata); upstreamModel != "" {
+        body, _ = sjson.SetBytes(body, "model", upstreamModel)
+    }
     body = applyPayloadConfig(e.cfg, req.Model, body)
 
     endpoint := strings.TrimSuffix(baseURL, "/") + iflowDefaultEndpoint
@@ -139,6 +143,10 @@ func (e *IFlowExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.Au
     to := sdktranslator.FromString("openai")
     body := sdktranslator.TranslateRequest(from, to, req.Model, bytes.Clone(req.Payload), true)
 
+    body = applyReasoningEffortMetadataChatCompletions(body, req.Metadata, req.Model)
+    if upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata); upstreamModel != "" {
+        body, _ = sjson.SetBytes(body, "model", upstreamModel)
+    }
     // Ensure tools array exists to avoid provider quirks similar to Qwen's behaviour.
     toolsResult := gjson.GetBytes(body, "tools")
     if toolsResult.Exists() && toolsResult.IsArray() && len(toolsResult.Array()) == 0 {
diff --git a/internal/runtime/executor/openai_compat_executor.go b/internal/runtime/executor/openai_compat_executor.go
index 55ec6dc9..93122c20 100644
--- a/internal/runtime/executor/openai_compat_executor.go
+++ b/internal/runtime/executor/openai_compat_executor.go
@@ -58,6 +58,10 @@ func (e *OpenAICompatExecutor) Execute(ctx context.Context, auth *cliproxyauth.A
         translated = e.overrideModel(translated, modelOverride)
     }
     translated = applyPayloadConfigWithRoot(e.cfg, req.Model, to.String(), "", translated)
+    translated = applyReasoningEffortMetadataChatCompletions(translated, req.Metadata, req.Model)
+    if upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata); upstreamModel != "" {
+        translated, _ = sjson.SetBytes(translated, "model", upstreamModel)
+    }
 
     url := strings.TrimSuffix(baseURL, "/") + "/chat/completions"
     httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(translated))
@@ -143,6 +147,10 @@ func (e *OpenAICompatExecutor) ExecuteStream(ctx context.Context, auth *cliproxy
         translated = e.overrideModel(translated, modelOverride)
     }
     translated = applyPayloadConfigWithRoot(e.cfg, req.Model, to.String(), "", translated)
+    translated = applyReasoningEffortMetadataChatCompletions(translated, req.Metadata, req.Model)
+    if upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata); upstreamModel != "" {
+        translated, _ = sjson.SetBytes(translated, "model", upstreamModel)
+    }
 
     url := strings.TrimSuffix(baseURL, "/") + "/chat/completions"
     httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(translated))
diff --git a/internal/runtime/executor/payload_helpers.go b/internal/runtime/executor/payload_helpers.go
index 1465533a..37e3141a 100644
--- a/internal/runtime/executor/payload_helpers.go
+++ b/internal/runtime/executor/payload_helpers.go
@@ -12,8 +12,8 @@ import (
 // applyThinkingMetadata applies thinking config from model suffix metadata (e.g., -reasoning, -thinking-N)
 // for standard Gemini format payloads. It normalizes the budget when the model supports thinking.
 func applyThinkingMetadata(payload []byte, metadata map[string]any, model string) []byte {
-    budgetOverride, includeOverride, ok := util.GeminiThinkingFromMetadata(metadata)
-    if !ok {
+    budgetOverride, includeOverride, ok := util.ResolveThinkingConfigFromMetadata(model, metadata)
+    if !ok || (budgetOverride == nil && includeOverride == nil) {
         return payload
     }
     if !util.ModelSupportsThinking(model) {
@@ -29,17 +29,63 @@ func applyThinkingMetadata(payload []byte, metadata map[string]any, model string
 
 // applyThinkingMetadataCLI applies thinking config from model suffix metadata (e.g., -reasoning, -thinking-N)
 // for Gemini CLI format payloads (nested under "request"). It normalizes the budget when the model supports thinking.
 func applyThinkingMetadataCLI(payload []byte, metadata map[string]any, model string) []byte {
-    budgetOverride, includeOverride, ok := util.GeminiThinkingFromMetadata(metadata)
-    if !ok {
+    budgetOverride, includeOverride, ok := util.ResolveThinkingConfigFromMetadata(model, metadata)
+    if !ok || (budgetOverride == nil && includeOverride == nil) {
         return payload
     }
-    if budgetOverride != nil && util.ModelSupportsThinking(model) {
+    if !util.ModelSupportsThinking(model) {
+        return payload
+    }
+    if budgetOverride != nil {
         norm := util.NormalizeThinkingBudget(model, *budgetOverride)
         budgetOverride = &norm
     }
     return util.ApplyGeminiCLIThinkingConfig(payload, budgetOverride, includeOverride)
 }
 
+// applyReasoningEffortMetadata applies reasoning effort overrides (reasoning.effort) when present in metadata.
+// It avoids overwriting an existing reasoning.effort field and only applies to models that support thinking.
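+// Illustrative: metadata {"reasoning_effort": "low"} on a thinking-capable model
+// becomes {"reasoning": {"effort": "low"}} unless the payload already sets it.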
+func applyReasoningEffortMetadata(payload []byte, metadata map[string]any, model string) []byte {
+    if len(metadata) == 0 {
+        return payload
+    }
+    if !util.ModelSupportsThinking(model) {
+        return payload
+    }
+    if gjson.GetBytes(payload, "reasoning.effort").Exists() {
+        return payload
+    }
+    if effort, ok := util.ReasoningEffortFromMetadata(metadata); ok && effort != "" {
+        if updated, err := sjson.SetBytes(payload, "reasoning.effort", effort); err == nil {
+            return updated
+        }
+    }
+    return payload
+}
+
+// applyReasoningEffortMetadataChatCompletions applies reasoning_effort (OpenAI chat completions field)
+// when present in metadata. It avoids overwriting an existing reasoning_effort field.
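+// Illustrative: the same metadata sets the flat "reasoning_effort" field used by chat completions payloads.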
+func applyReasoningEffortMetadataChatCompletions(payload []byte, metadata map[string]any, model string) []byte {
+    if len(metadata) == 0 {
+        return payload
+    }
+    if !util.ModelSupportsThinking(model) {
+        return payload
+    }
+    if gjson.GetBytes(payload, "reasoning_effort").Exists() {
+        return payload
+    }
+    if effort, ok := util.ReasoningEffortFromMetadata(metadata); ok && effort != "" {
+        if updated, err := sjson.SetBytes(payload, "reasoning_effort", effort); err == nil {
+            return updated
+        }
+    }
+    return payload
+}
+
 // applyPayloadConfig applies payload default and override rules from configuration
 // to the given JSON payload for the specified model.
 // Defaults only fill missing fields, while overrides always overwrite existing values.
diff --git a/internal/runtime/executor/qwen_executor.go b/internal/runtime/executor/qwen_executor.go
index 0c3e6b56..f060cb61 100644
--- a/internal/runtime/executor/qwen_executor.go
+++ b/internal/runtime/executor/qwen_executor.go
@@ -12,6 +12,7 @@ import (
 
     qwenauth "github.com/router-for-me/CLIProxyAPI/v6/internal/auth/qwen"
     "github.com/router-for-me/CLIProxyAPI/v6/internal/config"
+    "github.com/router-for-me/CLIProxyAPI/v6/internal/util"
     cliproxyauth "github.com/router-for-me/CLIProxyAPI/v6/sdk/cliproxy/auth"
     cliproxyexecutor "github.com/router-for-me/CLIProxyAPI/v6/sdk/cliproxy/executor"
     sdktranslator "github.com/router-for-me/CLIProxyAPI/v6/sdk/translator"
@@ -50,6 +51,10 @@ func (e *QwenExecutor) Execute(ctx context.Context, auth *cliproxyauth.Auth, req
     from := opts.SourceFormat
     to := sdktranslator.FromString("openai")
     body := sdktranslator.TranslateRequest(from, to, req.Model, bytes.Clone(req.Payload), false)
+    body = applyReasoningEffortMetadataChatCompletions(body, req.Metadata, req.Model)
+    if upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata); upstreamModel != "" {
+        body, _ = sjson.SetBytes(body, "model", upstreamModel)
+    }
     body = applyPayloadConfig(e.cfg, req.Model, body)
 
     url := strings.TrimSuffix(baseURL, "/") + "/chat/completions"
@@ -121,6 +126,10 @@ func (e *QwenExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.Aut
     to := sdktranslator.FromString("openai")
     body := sdktranslator.TranslateRequest(from, to, req.Model, bytes.Clone(req.Payload), true)
 
+    body = applyReasoningEffortMetadataChatCompletions(body, req.Metadata, req.Model)
+    if upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata); upstreamModel != "" {
+        body, _ = sjson.SetBytes(body, "model", upstreamModel)
+    }
     toolsResult := gjson.GetBytes(body, "tools")
     // I'm addressing the Qwen3 "poisoning" issue, which is caused by the model needing a tool to be defined. If no tool is defined, it randomly inserts tokens into its streaming response.
     // This will have no real consequences. It's just to scare Qwen3.
diff --git a/internal/util/gemini_thinking.go b/internal/util/gemini_thinking.go
index fc389511..f33928b0 100644
--- a/internal/util/gemini_thinking.go
+++ b/internal/util/gemini_thinking.go
@@ -1,8 +1,6 @@
 package util
 
 import (
-    "encoding/json"
-    "strconv"
     "strings"
 
     "github.com/tidwall/gjson"
@@ -15,80 +13,6 @@ const (
     GeminiOriginalModelMetadataKey = "gemini_original_model"
 )
 
-func ParseGeminiThinkingSuffix(model string) (string, *int, *bool, bool) {
-    if model == "" {
-        return model, nil, nil, false
-    }
-    lower := strings.ToLower(model)
-    if !strings.HasPrefix(lower, "gemini-") {
-        return model, nil, nil, false
-    }
-
-    if strings.HasSuffix(lower, "-nothinking") {
-        base := model[:len(model)-len("-nothinking")]
-        budgetValue := 0
-        if strings.HasPrefix(lower, "gemini-2.5-pro") {
-            budgetValue = 128
-        }
-        include := false
-        return base, &budgetValue, &include, true
-    }
-
-    // Handle "-reasoning" suffix: enables thinking with dynamic budget (-1)
-    // Maps: gemini-2.5-flash-reasoning -> gemini-2.5-flash with thinkingBudget=-1
-    if strings.HasSuffix(lower, "-reasoning") {
-        base := model[:len(model)-len("-reasoning")]
-        budgetValue := -1 // Dynamic budget
-        include := true
-        return base, &budgetValue, &include, true
-    }
-
-    idx := strings.LastIndex(lower, "-thinking-")
-    if idx == -1 {
-        return model, nil, nil, false
-    }
-
-    digits := model[idx+len("-thinking-"):]
-    if digits == "" {
-        return model, nil, nil, false
-    }
-    end := len(digits)
-    for i := 0; i < len(digits); i++ {
-        if digits[i] < '0' || digits[i] > '9' {
-            end = i
-            break
-        }
-    }
-    if end == 0 {
-        return model, nil, nil, false
-    }
-    valueStr := digits[:end]
-    value, err := strconv.Atoi(valueStr)
-    if err != nil {
-        return model, nil, nil, false
-    }
-    base := model[:idx]
-    budgetValue := value
-    return base, &budgetValue, nil, true
-}
-
-func NormalizeGeminiThinkingModel(modelName string) (string, map[string]any) {
-    baseModel, budget, include, matched := ParseGeminiThinkingSuffix(modelName)
-    if !matched {
-        return baseModel, nil
-    }
-    metadata := map[string]any{
-        GeminiOriginalModelMetadataKey: modelName,
-    }
-    if budget != nil {
-        metadata[GeminiThinkingBudgetMetadataKey] = *budget
-    }
-    if include != nil {
-        metadata[GeminiIncludeThoughtsMetadataKey] = *include
-    }
-    return baseModel, metadata
-}
-
 func ApplyGeminiThinkingConfig(body []byte, budget *int, includeThoughts *bool) []byte {
     if budget == nil && includeThoughts == nil {
         return body
@@ -133,80 +57,6 @@ func ApplyGeminiCLIThinkingConfig(body []byte, budget *int, includeThoughts *boo
     return updated
 }
 
-func GeminiThinkingFromMetadata(metadata map[string]any) (*int, *bool, bool) {
-    if len(metadata) == 0 {
-        return nil, nil, false
-    }
-    var (
-        budgetPtr  *int
-        includePtr *bool
-        matched    bool
-    )
-    if rawBudget, ok := metadata[GeminiThinkingBudgetMetadataKey]; ok {
-        switch v := rawBudget.(type) {
-        case int:
-            budget := v
-            budgetPtr = &budget
-            matched = true
-        case int32:
-            budget := int(v)
-            budgetPtr = &budget
-            matched = true
-        case int64:
-            budget := int(v)
-            budgetPtr = &budget
-            matched = true
-        case float64:
-            budget := int(v)
-            budgetPtr = &budget
-            matched = true
-        case json.Number:
-            if val, err := v.Int64(); err == nil {
-                budget := int(val)
-                budgetPtr = &budget
-                matched = true
-            }
-        }
-    }
-    if rawInclude, ok := metadata[GeminiIncludeThoughtsMetadataKey]; ok {
-        switch v := rawInclude.(type) {
-        case bool:
-            include := v
-            includePtr = &include
-            matched = true
-        case string:
-            if parsed, err := strconv.ParseBool(v); err == nil {
-                include := parsed
-                includePtr = &include
-                matched = true
-            }
-        case json.Number:
-            if val, err := v.Int64(); err == nil {
-                include := val != 0
-                includePtr = &include
-                matched = true
-            }
-        case int:
-            include := v != 0
-            includePtr = &include
-            matched = true
-        case int32:
-            include := v != 0
-            includePtr = &include
-            matched = true
-        case int64:
-            include := v != 0
-            includePtr = &include
-            matched = true
-        case float64:
-            include := v != 0
-            includePtr = &include
-            matched = true
-        }
-    }
-    return budgetPtr, includePtr, matched
-}
-
 // modelsWithDefaultThinking lists models that should have thinking enabled by default
 // when no explicit thinkingConfig is provided.
 var modelsWithDefaultThinking = map[string]bool{
diff --git a/internal/util/thinking_suffix.go b/internal/util/thinking_suffix.go
new file mode 100644
index 00000000..e3fd9136
--- /dev/null
+++ b/internal/util/thinking_suffix.go
@@ -0,0 +1,337 @@
+package util
+
+import (
+    "encoding/json"
+    "strconv"
+    "strings"
+)
+
+const (
+    ThinkingBudgetMetadataKey          = "thinking_budget"
+    ThinkingIncludeThoughtsMetadataKey = "thinking_include_thoughts"
+    ReasoningEffortMetadataKey         = "reasoning_effort"
+    ThinkingOriginalModelMetadataKey   = "thinking_original_model"
+)
+
+// NormalizeThinkingModel parses dynamic thinking suffixes on model names and returns
+// the normalized base model with extracted metadata. Supported patterns:
+//   - "-thinking-<number>" extracts a numeric budget
+//   - "-thinking-<effort>" extracts a reasoning effort level (minimal/low/medium/high/xhigh/auto/none)
+//   - "-thinking" maps to a default reasoning effort of "medium"
+//   - "-reasoning" maps to dynamic budget (-1) and include_thoughts=true
+//   - "-nothinking" maps to budget=0 and include_thoughts=false
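+//
+// Illustrative examples (assumed model names):
+//   NormalizeThinkingModel("gemini-2.5-flash-thinking-8192") -> "gemini-2.5-flash" with budget 8192
+//   NormalizeThinkingModel("gpt-5.1-thinking-high")          -> "gpt-5.1" with reasoning effort "high"
+//   NormalizeThinkingModel("gemini-2.5-pro-nothinking")      -> "gemini-2.5-pro" with budget 0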
includePtr = &include - matched = true - } - case json.Number: - if val, err := v.Int64(); err == nil { - include := val != 0 - includePtr = &include - matched = true - } - case int: - include := v != 0 - includePtr = &include - matched = true - case int32: - include := v != 0 - includePtr = &include - matched = true - case int64: - include := v != 0 - includePtr = &include - matched = true - case float64: - include := v != 0 - includePtr = &include - matched = true - } - } - return budgetPtr, includePtr, matched -} - // modelsWithDefaultThinking lists models that should have thinking enabled by default // when no explicit thinkingConfig is provided. var modelsWithDefaultThinking = map[string]bool{ diff --git a/internal/util/thinking_suffix.go b/internal/util/thinking_suffix.go new file mode 100644 index 00000000..e3fd9136 --- /dev/null +++ b/internal/util/thinking_suffix.go @@ -0,0 +1,327 @@ +package util + +import ( + "encoding/json" + "strconv" + "strings" +) + +const ( + ThinkingBudgetMetadataKey = "thinking_budget" + ThinkingIncludeThoughtsMetadataKey = "thinking_include_thoughts" + ReasoningEffortMetadataKey = "reasoning_effort" + ThinkingOriginalModelMetadataKey = "thinking_original_model" +) + +// NormalizeThinkingModel parses dynamic thinking suffixes on model names and returns +// the normalized base model with extracted metadata. Supported patterns: +// - "-thinking-" extracts a numeric budget +// - "-thinking-" extracts a reasoning effort level (minimal/low/medium/high/xhigh/auto/none) +// - "-thinking" maps to a default reasoning effort of "medium" +// - "-reasoning" maps to dynamic budget (-1) and include_thoughts=true +// - "-nothinking" maps to budget=0 and include_thoughts=false +func NormalizeThinkingModel(modelName string) (string, map[string]any) { + if modelName == "" { + return modelName, nil + } + + lower := strings.ToLower(modelName) + baseModel := modelName + + var ( + budgetOverride *int + includeThoughts *bool + reasoningEffort *string + matched bool + ) + + switch { + case strings.HasSuffix(lower, "-nothinking"): + baseModel = modelName[:len(modelName)-len("-nothinking")] + budget := 0 + include := false + budgetOverride = &budget + includeThoughts = &include + matched = true + case strings.HasSuffix(lower, "-reasoning"): + baseModel = modelName[:len(modelName)-len("-reasoning")] + budget := -1 + include := true + budgetOverride = &budget + includeThoughts = &include + matched = true + default: + if idx := strings.LastIndex(lower, "-thinking-"); idx != -1 { + value := modelName[idx+len("-thinking-"):] + if value != "" { + if parsed, ok := parseIntPrefix(value); ok { + baseModel = modelName[:idx] + budgetOverride = &parsed + matched = true + } else if effort, okEffort := normalizeReasoningEffort(value); okEffort { + baseModel = modelName[:idx] + reasoningEffort = &effort + matched = true + } + } + } else if strings.HasSuffix(lower, "-thinking") { + baseModel = modelName[:len(modelName)-len("-thinking")] + effort := "medium" + reasoningEffort = &effort + matched = true + } + } + + if !matched { + return baseModel, nil + } + + metadata := map[string]any{ + ThinkingOriginalModelMetadataKey: modelName, + } + if budgetOverride != nil { + metadata[ThinkingBudgetMetadataKey] = *budgetOverride + } + if includeThoughts != nil { + metadata[ThinkingIncludeThoughtsMetadataKey] = *includeThoughts + } + if reasoningEffort != nil { + metadata[ReasoningEffortMetadataKey] = *reasoningEffort + } + return baseModel, metadata +} + +// ThinkingFromMetadata extracts thinking overrides 
+func ThinkingFromMetadata(metadata map[string]any) (*int, *bool, *string, bool) {
+    if len(metadata) == 0 {
+        return nil, nil, nil, false
+    }
+
+    var (
+        budgetPtr  *int
+        includePtr *bool
+        effortPtr  *string
+        matched    bool
+    )
+
+    readBudget := func(key string) {
+        if budgetPtr != nil {
+            return
+        }
+        if raw, ok := metadata[key]; ok {
+            if v, okNumber := parseNumberToInt(raw); okNumber {
+                budget := v
+                budgetPtr = &budget
+                matched = true
+            }
+        }
+    }
+
+    readInclude := func(key string) {
+        if includePtr != nil {
+            return
+        }
+        if raw, ok := metadata[key]; ok {
+            switch v := raw.(type) {
+            case bool:
+                val := v
+                includePtr = &val
+                matched = true
+            case *bool:
+                if v != nil {
+                    val := *v
+                    includePtr = &val
+                    matched = true
+                }
+            }
+        }
+    }
+
+    readEffort := func(key string) {
+        if effortPtr != nil {
+            return
+        }
+        if raw, ok := metadata[key]; ok {
+            if val, okStr := raw.(string); okStr && strings.TrimSpace(val) != "" {
+                normalized := strings.ToLower(strings.TrimSpace(val))
+                effortPtr = &normalized
+                matched = true
+            }
+        }
+    }
+
+    readBudget(ThinkingBudgetMetadataKey)
+    readBudget(GeminiThinkingBudgetMetadataKey)
+    readInclude(ThinkingIncludeThoughtsMetadataKey)
+    readInclude(GeminiIncludeThoughtsMetadataKey)
+    readEffort(ReasoningEffortMetadataKey)
+    readEffort("reasoning.effort")
+
+    return budgetPtr, includePtr, effortPtr, matched
+}
+
+// ResolveThinkingConfigFromMetadata derives thinking budget/include overrides,
+// converting reasoning effort strings into budgets when possible.
+func ResolveThinkingConfigFromMetadata(model string, metadata map[string]any) (*int, *bool, bool) {
+    budget, include, effort, matched := ThinkingFromMetadata(metadata)
+    if !matched {
+        return nil, nil, false
+    }
+
+    if budget == nil && effort != nil {
+        if derived, ok := ThinkingEffortToBudget(model, *effort); ok {
+            budget = &derived
+        }
+    }
+    return budget, include, budget != nil || include != nil || effort != nil
+}
+
+// ReasoningEffortFromMetadata resolves a reasoning effort string from metadata,
+// inferring "auto" and "none" when budgets request dynamic or disabled thinking.
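+// Illustrative: budget -1 reads back as "auto"; budget 0 or include_thoughts=false reads back as "none".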
+
+// ThinkingEffortToBudget maps reasoning effort levels to approximate budgets,
+// clamping the result to the model's supported range.
+func ThinkingEffortToBudget(model, effort string) (int, bool) {
+	if effort == "" {
+		return 0, false
+	}
+	switch strings.ToLower(effort) {
+	case "none":
+		return 0, true
+	case "auto":
+		return NormalizeThinkingBudget(model, -1), true
+	case "minimal":
+		return NormalizeThinkingBudget(model, 512), true
+	case "low":
+		return NormalizeThinkingBudget(model, 1024), true
+	case "medium":
+		return NormalizeThinkingBudget(model, 8192), true
+	case "high":
+		return NormalizeThinkingBudget(model, 24576), true
+	case "xhigh":
+		return NormalizeThinkingBudget(model, 32768), true
+	default:
+		return 0, false
+	}
+}
+
+// ResolveOriginalModel returns the original model name stored in metadata (if present),
+// otherwise falls back to the provided model.
+func ResolveOriginalModel(model string, metadata map[string]any) string {
+	normalize := func(name string) string {
+		if name == "" {
+			return ""
+		}
+		if base, _ := NormalizeThinkingModel(name); base != "" {
+			return base
+		}
+		return strings.TrimSpace(name)
+	}
+
+	if metadata != nil {
+		if v, ok := metadata[ThinkingOriginalModelMetadataKey]; ok {
+			if s, okStr := v.(string); okStr && strings.TrimSpace(s) != "" {
+				if base := normalize(s); base != "" {
+					return base
+				}
+			}
+		}
+		if v, ok := metadata[GeminiOriginalModelMetadataKey]; ok {
+			if s, okStr := v.(string); okStr && strings.TrimSpace(s) != "" {
+				if base := normalize(s); base != "" {
+					return base
+				}
+			}
+		}
+	}
+	// Fallback: try to re-normalize the model name when metadata was dropped.
+	if base := normalize(model); base != "" {
+		return base
+	}
+	return model
+}
+
+func parseIntPrefix(value string) (int, bool) {
+	if value == "" {
+		return 0, false
+	}
+	digits := strings.TrimLeft(value, "-")
+	if digits == "" {
+		return 0, false
+	}
+	end := len(digits)
+	for i := 0; i < len(digits); i++ {
+		if digits[i] < '0' || digits[i] > '9' {
+			end = i
+			break
+		}
+	}
+	if end == 0 {
+		return 0, false
+	}
+	val, err := strconv.Atoi(digits[:end])
+	if err != nil {
+		return 0, false
+	}
+	return val, true
+}
+
+func parseNumberToInt(raw any) (int, bool) {
+	switch v := raw.(type) {
+	case int:
+		return v, true
+	case int32:
+		return int(v), true
+	case int64:
+		return int(v), true
+	case float64:
+		return int(v), true
+	case json.Number:
+		if val, err := v.Int64(); err == nil {
+			return int(val), true
+		}
+	case string:
+		if strings.TrimSpace(v) == "" {
+			return 0, false
+		}
+		if parsed, err := strconv.Atoi(strings.TrimSpace(v)); err == nil {
+			return parsed, true
+		}
+	}
+	return 0, false
+}
+
+func normalizeReasoningEffort(value string) (string, bool) {
+	if value == "" {
+		return "", false
+	}
+	effort := strings.ToLower(strings.TrimSpace(value))
+	switch effort {
+	case "minimal", "low", "medium", "high", "xhigh", "auto", "none":
+		return effort, true
+	default:
+		return "", false
+	}
+}
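+
+// Illustrative sketch (not part of the original change), assuming
+// NormalizeThinkingBudget clamps its input to the model's supported range:
+//
+//	budget, ok := ThinkingEffortToBudget("claude-sonnet-4-5", "low") // 1024 after clamping, ok == true
+//	_, ok = ThinkingEffortToBudget("claude-sonnet-4-5", "extreme")   // ok == false: unknown level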
diff --git a/sdk/api/handlers/handlers.go b/sdk/api/handlers/handlers.go
index 76280b3a..5bfdf314 100644
--- a/sdk/api/handlers/handlers.go
+++ b/sdk/api/handlers/handlers.go
@@ -323,18 +323,32 @@ func (h *BaseAPIHandler) getRequestDetails(modelName string) (providers []string
 	providerName, extractedModelName, isDynamic := h.parseDynamicModel(resolvedModelName)
 
-	// First, normalize the model name to handle suffixes like "-thinking-128"
-	// This needs to happen before determining the provider for non-dynamic models.
-	normalizedModel, metadata = normalizeModelMetadata(resolvedModelName)
+	targetModelName := resolvedModelName
+	if isDynamic {
+		targetModelName = extractedModelName
+	}
+
+	// Normalize the model name to handle dynamic thinking suffixes before determining the provider.
+	normalizedModel, metadata = normalizeModelMetadata(targetModelName)
 
 	if isDynamic {
 		providers = []string{providerName}
-		// For dynamic models, the extractedModelName is already normalized by parseDynamicModel
-		// so we use it as the final normalizedModel.
-		normalizedModel = extractedModelName
 	} else {
 		// For non-dynamic models, use the normalizedModel to get the provider name.
 		providers = util.GetProviderName(normalizedModel)
+		if len(providers) == 0 && metadata != nil {
+			if originalRaw, ok := metadata[util.ThinkingOriginalModelMetadataKey]; ok {
+				if originalModel, okStr := originalRaw.(string); okStr {
+					originalModel = strings.TrimSpace(originalModel)
+					if originalModel != "" && !strings.EqualFold(originalModel, normalizedModel) {
+						if altProviders := util.GetProviderName(originalModel); len(altProviders) > 0 {
+							providers = altProviders
+							normalizedModel = originalModel
+						}
+					}
+				}
+			}
+		}
 	}
 
 	if len(providers) == 0 {
@@ -382,7 +396,7 @@ func cloneBytes(src []byte) []byte {
 }
 
 func normalizeModelMetadata(modelName string) (string, map[string]any) {
-	return util.NormalizeGeminiThinkingModel(modelName)
+	return util.NormalizeThinkingModel(modelName)
 }
 
 func cloneMetadata(src map[string]any) map[string]any {