From 41effa5aebd335c91500933e1d3db7c5dd475c8b Mon Sep 17 00:00:00 2001 From: hkfires <10558748+hkfires@users.noreply.github.com> Date: Sat, 20 Sep 2025 19:34:53 +0800 Subject: [PATCH] feat(gemini-web): Add support for image generation with Gemini models through the OpenAI chat completions translator. --- .../chat-completions/gemini_openai_request.go | 25 +++++++ .../gemini_openai_response.go | 68 ++++++++++++++++++- 2 files changed, 90 insertions(+), 3 deletions(-) diff --git a/internal/translator/gemini/openai/chat-completions/gemini_openai_request.go b/internal/translator/gemini/openai/chat-completions/gemini_openai_request.go index 6e842ab2..97320333 100644 --- a/internal/translator/gemini/openai/chat-completions/gemini_openai_request.go +++ b/internal/translator/gemini/openai/chat-completions/gemini_openai_request.go @@ -170,6 +170,31 @@ func ConvertOpenAIRequestToGemini(modelName string, inputRawJSON []byte, _ bool) node := []byte(`{"role":"model","parts":[{"text":""}]}`) node, _ = sjson.SetBytes(node, "parts.0.text", content.String()) out, _ = sjson.SetRawBytes(out, "contents.-1", node) + } else if content.IsArray() { + // Assistant multimodal content (e.g. text + image) -> single model content with parts + node := []byte(`{"role":"model","parts":[]}`) + p := 0 + for _, item := range content.Array() { + switch item.Get("type").String() { + case "text": + node, _ = sjson.SetBytes(node, "parts."+itoa(p)+".text", item.Get("text").String()) + p++ + case "image_url": + // If the assistant returned an inline data URL, preserve it for history fidelity. + imageURL := item.Get("image_url.url").String() + if len(imageURL) > 5 { // expect data:... + pieces := strings.SplitN(imageURL[5:], ";", 2) + if len(pieces) == 2 && len(pieces[1]) > 7 { + mime := pieces[0] + data := pieces[1][7:] + node, _ = sjson.SetBytes(node, "parts."+itoa(p)+".inlineData.mime_type", mime) + node, _ = sjson.SetBytes(node, "parts."+itoa(p)+".inlineData.data", data) + p++ + } + } + } + } + out, _ = sjson.SetRawBytes(out, "contents.-1", node) } else if !content.Exists() || content.Type == gjson.Null { // Tool calls -> single model content with functionCall parts tcs := m.Get("tool_calls") diff --git a/internal/translator/gemini/openai/chat-completions/gemini_openai_response.go b/internal/translator/gemini/openai/chat-completions/gemini_openai_response.go index 420812cb..f7c23b78 100644 --- a/internal/translator/gemini/openai/chat-completions/gemini_openai_response.go +++ b/internal/translator/gemini/openai/chat-completions/gemini_openai_response.go @@ -8,6 +8,7 @@ package chat_completions import ( "bytes" "context" + "encoding/json" "fmt" "time" @@ -99,6 +100,10 @@ func ConvertGeminiResponseToOpenAI(_ context.Context, _ string, originalRequestR partResult := partResults[i] partTextResult := partResult.Get("text") functionCallResult := partResult.Get("functionCall") + inlineDataResult := partResult.Get("inlineData") + if !inlineDataResult.Exists() { + inlineDataResult = partResult.Get("inline_data") + } if partTextResult.Exists() { // Handle text content, distinguishing between regular content and reasoning/thoughts. @@ -124,6 +129,34 @@ func ConvertGeminiResponseToOpenAI(_ context.Context, _ string, originalRequestR } template, _ = sjson.Set(template, "choices.0.delta.role", "assistant") template, _ = sjson.SetRaw(template, "choices.0.delta.tool_calls.-1", functionCallTemplate) + } else if inlineDataResult.Exists() { + data := inlineDataResult.Get("data").String() + if data == "" { + continue + } + mimeType := inlineDataResult.Get("mimeType").String() + if mimeType == "" { + mimeType = inlineDataResult.Get("mime_type").String() + } + if mimeType == "" { + mimeType = "image/png" + } + imageURL := fmt.Sprintf("data:%s;base64,%s", mimeType, data) + imagePayload, err := json.Marshal(map[string]any{ + "type": "image_url", + "image_url": map[string]string{ + "url": imageURL, + }, + }) + if err != nil { + continue + } + imagesResult := gjson.Get(template, "choices.0.delta.images") + if !imagesResult.Exists() || !imagesResult.IsArray() { + template, _ = sjson.SetRaw(template, "choices.0.delta.images", `[]`) + } + template, _ = sjson.Set(template, "choices.0.delta.role", "assistant") + template, _ = sjson.SetRaw(template, "choices.0.delta.images.-1", string(imagePayload)) } } } @@ -193,6 +226,10 @@ func ConvertGeminiResponseToOpenAINonStream(_ context.Context, _ string, origina partResult := partsResults[i] partTextResult := partResult.Get("text") functionCallResult := partResult.Get("functionCall") + inlineDataResult := partResult.Get("inlineData") + if !inlineDataResult.Exists() { + inlineDataResult = partResult.Get("inline_data") + } if partTextResult.Exists() { // Append text content, distinguishing between regular content and reasoning. @@ -217,9 +254,34 @@ func ConvertGeminiResponseToOpenAINonStream(_ context.Context, _ string, origina } template, _ = sjson.Set(template, "choices.0.message.role", "assistant") template, _ = sjson.SetRaw(template, "choices.0.message.tool_calls.-1", functionCallItemTemplate) - } else { - // If no usable content is found, return an empty string. - return "" + } else if inlineDataResult.Exists() { + data := inlineDataResult.Get("data").String() + if data == "" { + continue + } + mimeType := inlineDataResult.Get("mimeType").String() + if mimeType == "" { + mimeType = inlineDataResult.Get("mime_type").String() + } + if mimeType == "" { + mimeType = "image/png" + } + imageURL := fmt.Sprintf("data:%s;base64,%s", mimeType, data) + imagePayload, err := json.Marshal(map[string]any{ + "type": "image_url", + "image_url": map[string]string{ + "url": imageURL, + }, + }) + if err != nil { + continue + } + imagesResult := gjson.Get(template, "choices.0.message.images") + if !imagesResult.Exists() || !imagesResult.IsArray() { + template, _ = sjson.SetRaw(template, "choices.0.message.images", `[]`) + } + template, _ = sjson.Set(template, "choices.0.message.role", "assistant") + template, _ = sjson.SetRaw(template, "choices.0.message.images.-1", string(imagePayload)) } } }