From 33e53a2a566cef8af25f96fc7c8173bb9a199f26 Mon Sep 17 00:00:00 2001
From: Luis Pater
Date: Fri, 26 Dec 2025 05:01:45 +0800
Subject: [PATCH] fix(translators): ensure correct handling and output of multimodal assistant content across request handlers

---
 .../antigravity_openai_request.go             | 25 +++++++++++++++++-
 .../gemini-cli_openai_request.go              | 25 ++++++++++++++++++-
 .../chat-completions/gemini_openai_request.go |  6 ++---
 3 files changed, 50 insertions(+), 6 deletions(-)

diff --git a/internal/translator/antigravity/openai/chat-completions/antigravity_openai_request.go b/internal/translator/antigravity/openai/chat-completions/antigravity_openai_request.go
index 573b8d45..ecabce95 100644
--- a/internal/translator/antigravity/openai/chat-completions/antigravity_openai_request.go
+++ b/internal/translator/antigravity/openai/chat-completions/antigravity_openai_request.go
@@ -249,8 +249,29 @@ func ConvertOpenAIRequestToAntigravity(modelName string, inputRawJSON []byte, _
 			p := 0
 			if content.Type == gjson.String {
 				node, _ = sjson.SetBytes(node, "parts.-1.text", content.String())
-				out, _ = sjson.SetRawBytes(out, "request.contents.-1", node)
 				p++
+			} else if content.IsArray() {
+				// Assistant multimodal content (e.g. text + image) -> single model content with parts
+				for _, item := range content.Array() {
+					switch item.Get("type").String() {
+					case "text":
+						node, _ = sjson.SetBytes(node, "parts."+itoa(p)+".text", item.Get("text").String())
+						p++
+					case "image_url":
+						// If the assistant returned an inline data URL, preserve it for history fidelity.
+						imageURL := item.Get("image_url.url").String()
+						if len(imageURL) > 5 { // expect data:...
+							pieces := strings.SplitN(imageURL[5:], ";", 2)
+							if len(pieces) == 2 && len(pieces[1]) > 7 {
+								mime := pieces[0]
+								data := pieces[1][7:]
+								node, _ = sjson.SetBytes(node, "parts."+itoa(p)+".inlineData.mime_type", mime)
+								node, _ = sjson.SetBytes(node, "parts."+itoa(p)+".inlineData.data", data)
+								p++
+							}
+						}
+					}
+				}
 			}
 
 			// Tool calls -> single model content with functionCall parts
@@ -305,6 +326,8 @@ func ConvertOpenAIRequestToAntigravity(modelName string, inputRawJSON []byte, _
 			if pp > 0 {
 				out, _ = sjson.SetRawBytes(out, "request.contents.-1", toolNode)
 			}
+		} else {
+			out, _ = sjson.SetRawBytes(out, "request.contents.-1", node)
 		}
 	}
 }
diff --git a/internal/translator/gemini-cli/openai/chat-completions/gemini-cli_openai_request.go b/internal/translator/gemini-cli/openai/chat-completions/gemini-cli_openai_request.go
index feb80f65..e1d1a40b 100644
--- a/internal/translator/gemini-cli/openai/chat-completions/gemini-cli_openai_request.go
+++ b/internal/translator/gemini-cli/openai/chat-completions/gemini-cli_openai_request.go
@@ -218,8 +218,29 @@ func ConvertOpenAIRequestToGeminiCLI(modelName string, inputRawJSON []byte, _ bo
 			if content.Type == gjson.String {
 				// Assistant text -> single model content
 				node, _ = sjson.SetBytes(node, "parts.-1.text", content.String())
-				out, _ = sjson.SetRawBytes(out, "request.contents.-1", node)
 				p++
+			} else if content.IsArray() {
+				// Assistant multimodal content (e.g. text + image) -> single model content with parts
+				for _, item := range content.Array() {
+					switch item.Get("type").String() {
+					case "text":
+						node, _ = sjson.SetBytes(node, "parts."+itoa(p)+".text", item.Get("text").String())
+						p++
+					case "image_url":
+						// If the assistant returned an inline data URL, preserve it for history fidelity.
+						imageURL := item.Get("image_url.url").String()
+						if len(imageURL) > 5 { // expect data:...
+							pieces := strings.SplitN(imageURL[5:], ";", 2)
+							if len(pieces) == 2 && len(pieces[1]) > 7 {
+								mime := pieces[0]
+								data := pieces[1][7:]
+								node, _ = sjson.SetBytes(node, "parts."+itoa(p)+".inlineData.mime_type", mime)
+								node, _ = sjson.SetBytes(node, "parts."+itoa(p)+".inlineData.data", data)
+								p++
+							}
+						}
+					}
+				}
 			}
 
 			// Tool calls -> single model content with functionCall parts
@@ -260,6 +281,8 @@ func ConvertOpenAIRequestToGeminiCLI(modelName string, inputRawJSON []byte, _ bo
 			if pp > 0 {
 				out, _ = sjson.SetRawBytes(out, "request.contents.-1", toolNode)
 			}
+		} else {
+			out, _ = sjson.SetRawBytes(out, "request.contents.-1", node)
 		}
 	}
 }
diff --git a/internal/translator/gemini/openai/chat-completions/gemini_openai_request.go b/internal/translator/gemini/openai/chat-completions/gemini_openai_request.go
index 7b8c5c68..f0902b38 100644
--- a/internal/translator/gemini/openai/chat-completions/gemini_openai_request.go
+++ b/internal/translator/gemini/openai/chat-completions/gemini_openai_request.go
@@ -233,17 +233,16 @@ func ConvertOpenAIRequestToGemini(modelName string, inputRawJSON []byte, _ bool)
 		} else if role == "assistant" {
 			node := []byte(`{"role":"model","parts":[]}`)
 			p := 0
 			if content.Type == gjson.String {
 				// Assistant text -> single model content
 				node, _ = sjson.SetBytes(node, "parts.-1.text", content.String())
-				out, _ = sjson.SetRawBytes(out, "contents.-1", node)
 				p++
 			} else if content.IsArray() {
 				// Assistant multimodal content (e.g. text + image) -> single model content with parts
 				for _, item := range content.Array() {
 					switch item.Get("type").String() {
 					case "text":
 						node, _ = sjson.SetBytes(node, "parts."+itoa(p)+".text", item.Get("text").String())
 						p++
 					case "image_url":
 						// If the assistant returned an inline data URL, preserve it for history fidelity.
@@ -261,7 +260,6 @@ func ConvertOpenAIRequestToGemini(modelName string, inputRawJSON []byte, _ bool)
 					}
 				}
 			}
-			out, _ = sjson.SetRawBytes(out, "contents.-1", node)
 		}
 
 		// Tool calls -> single model content with functionCall parts
@@ -302,6 +300,8 @@ func ConvertOpenAIRequestToGemini(modelName string, inputRawJSON []byte, _ bool)
 			if pp > 0 {
 				out, _ = sjson.SetRawBytes(out, "contents.-1", toolNode)
 			}
+		} else {
+			out, _ = sjson.SetRawBytes(out, "contents.-1", node)
 		}
 	}
 }