Refactor API handlers organization and simplify error response handling

- Modularized handlers into dedicated packages (`gemini`, `claude`, `cli`) for better structure. - Centralized `ErrorResponse` and `ErrorDetail` types under `handlers` package for reuse. - Updated all handlers to utilize the shared `ErrorResponse` model. - Introduced specialization of handler structs (`GeminiAPIHandlers`, `ClaudeCodeAPIHandlers`, `GeminiCLIAPIHandlers`) for improved clarity and separation of concerns. - Refactored `getClient` logic with additional properties and better state management. Refactor `translator` package by modularizing code for `claude` and `gemini` - Moved Claude-specific logic (`PrepareClaudeRequest`, `ConvertCliToClaude`) to `translator/claude/code`. - Moved Gemini-specific logic (`FixCLIToolResponse`) to `translator/gemini/cli` for better package structure. - Updated affected handler imports and method references. Add comprehensive package-level documentation across key modules - Introduced detailed package-level documentation for core modules: `auth`, `client`, `cmd`, `handlers`, `util`, `watcher`, `config`, `translator`, and `api`. - Enhanced code readability and maintainability by clarifying the purpose and functionality of each package. - Aligned documentation style and tone with existing codebase conventions. Refactor API handlers and translator modules for improved clarity and consistency - Standardized handler struct names (`GeminiAPIHandlers`, `ClaudeCodeAPIHandlers`, `GeminiCLIAPIHandlers`, `OpenAIAPIHandlers`) and updated related comments. - Fixed unnecessary `else` blocks in streaming logic for cleaner error handling. - Renamed variables for better readability (`responseIdResult` to `responseIDResult`, `activationUrl` to `activationURL`, etc.). - Addressed minor inconsistencies in API handler comments and SSE header initialization. - Improved modularization of `claude` and `gemini` translator components. Standardize configuration field naming for consistency across modules - Renamed `ProxyUrl` to `ProxyURL`, `ApiKeys` to `APIKeys`, and `ConfigQuotaExceeded` to `QuotaExceeded`. - Updated all relevant references and comments in `config`, `auth`, `api`, `util`, and `watcher`. - Ensured consistent casing for `GlAPIKey` debug logs.
2026-02-03 04:50:52 +08:00 · 2025-08-05 17:37:00 +08:00
parent 00f33f5f3a
commit 1483c31c73
23 changed files with 1484 additions and 1267 deletions
--- a/internal/api/handlers/claude/code-handlers.go
+++ b/internal/api/handlers/claude/code-handlers.go
@@ -0,0 +1,208 @@
+// Package claude provides HTTP handlers for Claude API code-related functionality.
+// This package implements Claude-compatible streaming chat completions with sophisticated
+// client rotation and quota management systems to ensure high availability and optimal
+// resource utilization across multiple backend clients. It handles request translation
+// between Claude API format and the underlying Gemini backend, providing seamless
+// API compatibility while maintaining robust error handling and connection management.
+package claude
+
+import (
+	"context"
+	"fmt"
+	"github.com/gin-gonic/gin"
+	"github.com/luispater/CLIProxyAPI/internal/api/handlers"
+	"github.com/luispater/CLIProxyAPI/internal/api/translator/claude/code"
+	"github.com/luispater/CLIProxyAPI/internal/client"
+	log "github.com/sirupsen/logrus"
+	"net/http"
+	"strings"
+	"time"
+)
+
+// ClaudeCodeAPIHandlers contains the handlers for Claude API endpoints.
+// It holds a pool of clients to interact with the backend service.
+type ClaudeCodeAPIHandlers struct {
+	*handlers.APIHandlers
+}
+
+// NewClaudeCodeAPIHandlers creates a new Claude API handlers instance.
+// It takes an APIHandlers instance as input and returns a ClaudeCodeAPIHandlers.
+func NewClaudeCodeAPIHandlers(apiHandlers *handlers.APIHandlers) *ClaudeCodeAPIHandlers {
+	return &ClaudeCodeAPIHandlers{
+		APIHandlers: apiHandlers,
+	}
+}
+
+// ClaudeMessages handles Claude-compatible streaming chat completions.
+// This function implements a sophisticated client rotation and quota management system
+// to ensure high availability and optimal resource utilization across multiple backend clients.
+func (h *ClaudeCodeAPIHandlers) ClaudeMessages(c *gin.Context) {
+	// Extract raw JSON data from the incoming request
+	rawJSON, err := c.GetRawData()
+	// If data retrieval fails, return a 400 Bad Request error.
+	if err != nil {
+		c.JSON(http.StatusBadRequest, handlers.ErrorResponse{
+			Error: handlers.ErrorDetail{
+				Message: fmt.Sprintf("Invalid request: %v", err),
+				Type:    "invalid_request_error",
+			},
+		})
+		return
+	}
+
+	// Set up Server-Sent Events (SSE) headers for streaming response
+	// These headers are essential for maintaining a persistent connection
+	// and enabling real-time streaming of chat completions
+	c.Header("Content-Type", "text/event-stream")
+	c.Header("Cache-Control", "no-cache")
+	c.Header("Connection", "keep-alive")
+	c.Header("Access-Control-Allow-Origin", "*")
+
+	// Get the http.Flusher interface to manually flush the response.
+	// This is crucial for streaming as it allows immediate sending of data chunks
+	flusher, ok := c.Writer.(http.Flusher)
+	if !ok {
+		c.JSON(http.StatusInternalServerError, handlers.ErrorResponse{
+			Error: handlers.ErrorDetail{
+				Message: "Streaming not supported",
+				Type:    "server_error",
+			},
+		})
+		return
+	}
+
+	// Parse and prepare the Claude request, extracting model name, system instructions,
+	// conversation contents, and available tools from the raw JSON
+	modelName, systemInstruction, contents, tools := code.PrepareClaudeRequest(rawJSON)
+
+	// Map Claude model names to corresponding Gemini models
+	// This allows the proxy to handle Claude API calls using Gemini backend
+	if modelName == "claude-sonnet-4-20250514" {
+		modelName = "gemini-2.5-pro"
+	} else if modelName == "claude-3-5-haiku-20241022" {
+		modelName = "gemini-2.5-flash"
+	}
+
+	// Create a cancellable context for the backend client request
+	// This allows proper cleanup and cancellation of ongoing requests
+	cliCtx, cliCancel := context.WithCancel(context.Background())
+	var cliClient *client.Client
+	defer func() {
+		// Ensure the client's mutex is unlocked on function exit.
+		// This prevents deadlocks and ensures proper resource cleanup
+		if cliClient != nil {
+			cliClient.RequestMutex.Unlock()
+		}
+	}()
+
+	// Main client rotation loop with quota management
+	// This loop implements a sophisticated load balancing and failover mechanism
+outLoop:
+	for {
+		var errorResponse *client.ErrorMessage
+		cliClient, errorResponse = h.GetClient(modelName)
+		if errorResponse != nil {
+			c.Status(errorResponse.StatusCode)
+			_, _ = fmt.Fprint(c.Writer, errorResponse.Error)
+			flusher.Flush()
+			cliCancel()
+			return
+		}
+
+		// Determine the authentication method being used by the selected client
+		// This affects how responses are formatted and logged
+		isGlAPIKey := false
+		if glAPIKey := cliClient.GetGenerativeLanguageAPIKey(); glAPIKey != "" {
+			log.Debugf("Request use generative language API Key: %s", glAPIKey)
+			isGlAPIKey = true
+		} else {
+			log.Debugf("Request use account: %s, project id: %s", cliClient.GetEmail(), cliClient.GetProjectID())
+		}
+		// Initiate streaming communication with the backend client
+		// This returns two channels: one for response chunks and one for errors
+
+		includeThoughts := false
+		if userAgent, hasKey := c.Request.Header["User-Agent"]; hasKey {
+			includeThoughts = !strings.Contains(userAgent[0], "claude-cli")
+		}
+
+		respChan, errChan := cliClient.SendMessageStream(cliCtx, rawJSON, modelName, systemInstruction, contents, tools, includeThoughts)
+
+		// Track response state for proper Claude format conversion
+		hasFirstResponse := false
+		responseType := 0
+		responseIndex := 0
+
+		// Main streaming loop - handles multiple concurrent events using Go channels
+		// This select statement manages four different types of events simultaneously
+		for {
+			select {
+			// Case 1: Handle client disconnection
+			// Detects when the HTTP client has disconnected and cleans up resources
+			case <-c.Request.Context().Done():
+				if c.Request.Context().Err().Error() == "context canceled" {
+					log.Debugf("Client disconnected: %v", c.Request.Context().Err())
+					cliCancel() // Cancel the backend request to prevent resource leaks
+					return
+				}
+
+			// Case 2: Process incoming response chunks from the backend
+			// This handles the actual streaming data from the AI model
+			case chunk, okStream := <-respChan:
+				if !okStream {
+					// Stream has ended - send the final message_stop event
+					// This follows the Claude API specification for stream termination
+					_, _ = c.Writer.Write([]byte(`event: message_stop`))
+					_, _ = c.Writer.Write([]byte("\n"))
+					_, _ = c.Writer.Write([]byte(`data: {"type":"message_stop"}`))
+					_, _ = c.Writer.Write([]byte("\n\n\n"))
+
+					flusher.Flush()
+					cliCancel()
+					return
+				}
+				// Convert the backend response to Claude-compatible format
+				// This translation layer ensures API compatibility
+				claudeFormat := code.ConvertCliToClaude(chunk, isGlAPIKey, hasFirstResponse, &responseType, &responseIndex)
+				if claudeFormat != "" {
+					_, _ = c.Writer.Write([]byte(claudeFormat))
+					flusher.Flush() // Immediately send the chunk to the client
+				}
+				hasFirstResponse = true
+
+			// Case 3: Handle errors from the backend
+			// This manages various error conditions and implements retry logic
+			case errInfo, okError := <-errChan:
+				if okError {
+					// Special handling for quota exceeded errors
+					// If configured, attempt to switch to a different project/client
+					if errInfo.StatusCode == 429 && h.Cfg.QuotaExceeded.SwitchProject {
+						continue outLoop // Restart the client selection process
+					} else {
+						// Forward other errors directly to the client
+						c.Status(errInfo.StatusCode)
+						_, _ = fmt.Fprint(c.Writer, errInfo.Error.Error())
+						flusher.Flush()
+						cliCancel()
+					}
+					return
+				}
+
+			// Case 4: Send periodic keep-alive signals
+			// Prevents connection timeouts during long-running requests
+			case <-time.After(500 * time.Millisecond):
+				if hasFirstResponse {
+					// Send a ping event to maintain the connection
+					// This is especially important for slow AI model responses
+					output := "event: ping\n"
+					output = output + `data: {"type": "ping"}`
+					output = output + "\n\n\n"
+					_, _ = c.Writer.Write([]byte(output))
+
+					flusher.Flush()
+				}
+			}
+		}
+	}
+
+}
--- a/internal/api/handlers/gemini/cli/cli-handlers.go
+++ b/internal/api/handlers/gemini/cli/cli-handlers.go
@@ -0,0 +1,268 @@
+// Package cli provides HTTP handlers for Gemini CLI API functionality.
+// This package implements handlers that process CLI-specific requests for Gemini API operations,
+// including content generation and streaming content generation endpoints.
+// The handlers restrict access to localhost only and manage communication with the backend service.
+package cli
+
+import (
+	"bytes"
+	"context"
+	"fmt"
+	"github.com/gin-gonic/gin"
+	"github.com/luispater/CLIProxyAPI/internal/api/handlers"
+	"github.com/luispater/CLIProxyAPI/internal/client"
+	"github.com/luispater/CLIProxyAPI/internal/util"
+	log "github.com/sirupsen/logrus"
+	"github.com/tidwall/gjson"
+	"github.com/tidwall/sjson"
+	"io"
+	"net/http"
+	"strings"
+	"time"
+)
+
+// GeminiCLIAPIHandlers contains the handlers for Gemini CLI API endpoints.
+// It holds a pool of clients to interact with the backend service.
+type GeminiCLIAPIHandlers struct {
+	*handlers.APIHandlers
+}
+
+// NewGeminiCLIAPIHandlers creates a new Gemini CLI API handlers instance.
+// It takes an APIHandlers instance as input and returns a GeminiCLIAPIHandlers.
+func NewGeminiCLIAPIHandlers(apiHandlers *handlers.APIHandlers) *GeminiCLIAPIHandlers {
+	return &GeminiCLIAPIHandlers{
+		APIHandlers: apiHandlers,
+	}
+}
+
+// CLIHandler handles CLI-specific requests for Gemini API operations.
+// It restricts access to localhost only and routes requests to appropriate internal handlers.
+func (h *GeminiCLIAPIHandlers) CLIHandler(c *gin.Context) {
+	if !strings.HasPrefix(c.Request.RemoteAddr, "127.0.0.1:") {
+		c.JSON(http.StatusForbidden, handlers.ErrorResponse{
+			Error: handlers.ErrorDetail{
+				Message: "CLI reply only allow local access",
+				Type:    "forbidden",
+			},
+		})
+		return
+	}
+
+	rawJSON, _ := c.GetRawData()
+	requestRawURI := c.Request.URL.Path
+	if requestRawURI == "/v1internal:generateContent" {
+		h.internalGenerateContent(c, rawJSON)
+	} else if requestRawURI == "/v1internal:streamGenerateContent" {
+		h.internalStreamGenerateContent(c, rawJSON)
+	} else {
+		reqBody := bytes.NewBuffer(rawJSON)
+		req, err := http.NewRequest("POST", fmt.Sprintf("https://cloudcode-pa.googleapis.com%s", c.Request.URL.RequestURI()), reqBody)
+		if err != nil {
+			c.JSON(http.StatusBadRequest, handlers.ErrorResponse{
+				Error: handlers.ErrorDetail{
+					Message: fmt.Sprintf("Invalid request: %v", err),
+					Type:    "invalid_request_error",
+				},
+			})
+			return
+		}
+		for key, value := range c.Request.Header {
+			req.Header[key] = value
+		}
+
+		httpClient, err := util.SetProxy(h.Cfg, &http.Client{})
+		if err != nil {
+			log.Fatalf("set proxy failed: %v", err)
+		}
+
+		resp, err := httpClient.Do(req)
+		if err != nil {
+			c.JSON(http.StatusBadRequest, handlers.ErrorResponse{
+				Error: handlers.ErrorDetail{
+					Message: fmt.Sprintf("Invalid request: %v", err),
+					Type:    "invalid_request_error",
+				},
+			})
+			return
+		}
+
+		if resp.StatusCode < 200 || resp.StatusCode >= 300 {
+			defer func() {
+				if err = resp.Body.Close(); err != nil {
+					log.Printf("warn: failed to close response body: %v", err)
+				}
+			}()
+			bodyBytes, _ := io.ReadAll(resp.Body)
+
+			c.JSON(http.StatusBadRequest, handlers.ErrorResponse{
+				Error: handlers.ErrorDetail{
+					Message: string(bodyBytes),
+					Type:    "invalid_request_error",
+				},
+			})
+			return
+		}
+
+		defer func() {
+			_ = resp.Body.Close()
+		}()
+
+		for key, value := range resp.Header {
+			c.Header(key, value[0])
+		}
+		output, err := io.ReadAll(resp.Body)
+		if err != nil {
+			log.Errorf("Failed to read response body: %v", err)
+			return
+		}
+		_, _ = c.Writer.Write(output)
+	}
+}
+
+func (h *GeminiCLIAPIHandlers) internalStreamGenerateContent(c *gin.Context, rawJSON []byte) {
+	alt := h.GetAlt(c)
+
+	if alt == "" {
+		c.Header("Content-Type", "text/event-stream")
+		c.Header("Cache-Control", "no-cache")
+		c.Header("Connection", "keep-alive")
+		c.Header("Access-Control-Allow-Origin", "*")
+	}
+
+	// Get the http.Flusher interface to manually flush the response.
+	flusher, ok := c.Writer.(http.Flusher)
+	if !ok {
+		c.JSON(http.StatusInternalServerError, handlers.ErrorResponse{
+			Error: handlers.ErrorDetail{
+				Message: "Streaming not supported",
+				Type:    "server_error",
+			},
+		})
+		return
+	}
+
+	modelResult := gjson.GetBytes(rawJSON, "model")
+	modelName := modelResult.String()
+
+	cliCtx, cliCancel := context.WithCancel(context.Background())
+	var cliClient *client.Client
+	defer func() {
+		// Ensure the client's mutex is unlocked on function exit.
+		if cliClient != nil {
+			cliClient.RequestMutex.Unlock()
+		}
+	}()
+
+outLoop:
+	for {
+		var errorResponse *client.ErrorMessage
+		cliClient, errorResponse = h.GetClient(modelName)
+		if errorResponse != nil {
+			c.Status(errorResponse.StatusCode)
+			_, _ = fmt.Fprint(c.Writer, errorResponse.Error)
+			flusher.Flush()
+			cliCancel()
+			return
+		}
+
+		if glAPIKey := cliClient.GetGenerativeLanguageAPIKey(); glAPIKey != "" {
+			log.Debugf("Request use generative language API Key: %s", glAPIKey)
+		} else {
+			log.Debugf("Request use account: %s, project id: %s", cliClient.GetEmail(), cliClient.GetProjectID())
+		}
+		// Send the message and receive response chunks and errors via channels.
+		respChan, errChan := cliClient.SendRawMessageStream(cliCtx, rawJSON, "")
+		hasFirstResponse := false
+		for {
+			select {
+			// Handle client disconnection.
+			case <-c.Request.Context().Done():
+				if c.Request.Context().Err().Error() == "context canceled" {
+					log.Debugf("Client disconnected: %v", c.Request.Context().Err())
+					cliCancel() // Cancel the backend request.
+					return
+				}
+			// Process incoming response chunks.
+			case chunk, okStream := <-respChan:
+				if !okStream {
+					cliCancel()
+					return
+				}
+				hasFirstResponse = true
+				if cliClient.GetGenerativeLanguageAPIKey() != "" {
+					chunk, _ = sjson.SetRawBytes(chunk, "response", chunk)
+				}
+				_, _ = c.Writer.Write([]byte("data: "))
+				_, _ = c.Writer.Write(chunk)
+				_, _ = c.Writer.Write([]byte("\n\n"))
+				flusher.Flush()
+			// Handle errors from the backend.
+			case err, okError := <-errChan:
+				if okError {
+					if err.StatusCode == 429 && h.Cfg.QuotaExceeded.SwitchProject {
+						continue outLoop
+					} else {
+						c.Status(err.StatusCode)
+						_, _ = fmt.Fprint(c.Writer, err.Error.Error())
+						flusher.Flush()
+						cliCancel()
+					}
+					return
+				}
+			// Send a keep-alive signal to the client.
+			case <-time.After(500 * time.Millisecond):
+				if hasFirstResponse {
+					_, _ = c.Writer.Write([]byte("\n"))
+					flusher.Flush()
+				}
+			}
+		}
+	}
+}
+
+func (h *GeminiCLIAPIHandlers) internalGenerateContent(c *gin.Context, rawJSON []byte) {
+	c.Header("Content-Type", "application/json")
+
+	modelResult := gjson.GetBytes(rawJSON, "model")
+	modelName := modelResult.String()
+	cliCtx, cliCancel := context.WithCancel(context.Background())
+	var cliClient *client.Client
+	defer func() {
+		if cliClient != nil {
+			cliClient.RequestMutex.Unlock()
+		}
+	}()
+
+	for {
+		var errorResponse *client.ErrorMessage
+		cliClient, errorResponse = h.GetClient(modelName)
+		if errorResponse != nil {
+			c.Status(errorResponse.StatusCode)
+			_, _ = fmt.Fprint(c.Writer, errorResponse.Error)
+			cliCancel()
+			return
+		}
+
+		if glAPIKey := cliClient.GetGenerativeLanguageAPIKey(); glAPIKey != "" {
+			log.Debugf("Request use generative language API Key: %s", glAPIKey)
+		} else {
+			log.Debugf("Request use account: %s, project id: %s", cliClient.GetEmail(), cliClient.GetProjectID())
+		}
+
+		resp, err := cliClient.SendRawMessage(cliCtx, rawJSON, "")
+		if err != nil {
+			if err.StatusCode == 429 && h.Cfg.QuotaExceeded.SwitchProject {
+				continue
+			} else {
+				c.Status(err.StatusCode)
+				_, _ = c.Writer.Write([]byte(err.Error.Error()))
+				cliCancel()
+			}
+			break
+		} else {
+			_, _ = c.Writer.Write(resp)
+			cliCancel()
+			break
+		}
+	}
+}
--- a/internal/api/handlers/gemini/gemini-handlers.go
+++ b/internal/api/handlers/gemini/gemini-handlers.go
@@ -0,0 +1,437 @@
+// Package gemini provides HTTP handlers for Gemini API endpoints.
+// This package implements handlers for managing Gemini model operations including
+// model listing, content generation, streaming content generation, and token counting.
+// It serves as a proxy layer between clients and the Gemini backend service,
+// handling request translation, client management, and response processing.
+package gemini
+
+import (
+	"context"
+	"fmt"
+	"github.com/gin-gonic/gin"
+	"github.com/luispater/CLIProxyAPI/internal/api/handlers"
+	"github.com/luispater/CLIProxyAPI/internal/api/translator/gemini/cli"
+	"github.com/luispater/CLIProxyAPI/internal/client"
+	log "github.com/sirupsen/logrus"
+	"github.com/tidwall/gjson"
+	"github.com/tidwall/sjson"
+	"net/http"
+	"strings"
+	"time"
+)
+
+// GeminiAPIHandlers contains the handlers for Gemini API endpoints.
+// It holds a pool of clients to interact with the backend service.
+type GeminiAPIHandlers struct {
+	*handlers.APIHandlers
+}
+
+// NewGeminiAPIHandlers creates a new Gemini API handlers instance.
+// It takes an APIHandlers instance as input and returns a GeminiAPIHandlers.
+func NewGeminiAPIHandlers(apiHandlers *handlers.APIHandlers) *GeminiAPIHandlers {
+	return &GeminiAPIHandlers{
+		APIHandlers: apiHandlers,
+	}
+}
+
+// GeminiModels handles the Gemini models listing endpoint.
+// It returns a JSON response containing available Gemini models and their specifications.
+func (h *GeminiAPIHandlers) GeminiModels(c *gin.Context) {
+	c.Status(http.StatusOK)
+	c.Header("Content-Type", "application/json; charset=UTF-8")
+	_, _ = c.Writer.Write([]byte(`{"models":[{"name":"models/gemini-2.5-flash","version":"001","displayName":"Gemini `))
+	_, _ = c.Writer.Write([]byte(`2.5 Flash","description":"Stable version of Gemini 2.5 Flash, our mid-size multimod`))
+	_, _ = c.Writer.Write([]byte(`al model that supports up to 1 million tokens, released in June of 2025.","inputTok`))
+	_, _ = c.Writer.Write([]byte(`enLimit":1048576,"outputTokenLimit":65536,"supportedGenerationMethods":["generateCo`))
+	_, _ = c.Writer.Write([]byte(`ntent","countTokens","createCachedContent","batchGenerateContent"],"temperature":1,`))
+	_, _ = c.Writer.Write([]byte(`"topP":0.95,"topK":64,"maxTemperature":2,"thinking":true},{"name":"models/gemini-2.`))
+	_, _ = c.Writer.Write([]byte(`5-pro","version":"2.5","displayName":"Gemini 2.5 Pro","description":"Stable release`))
+	_, _ = c.Writer.Write([]byte(` (June 17th, 2025) of Gemini 2.5 Pro","inputTokenLimit":1048576,"outputTokenLimit":`))
+	_, _ = c.Writer.Write([]byte(`65536,"supportedGenerationMethods":["generateContent","countTokens","createCachedCo`))
+	_, _ = c.Writer.Write([]byte(`ntent","batchGenerateContent"],"temperature":1,"topP":0.95,"topK":64,"maxTemperatur`))
+	_, _ = c.Writer.Write([]byte(`e":2,"thinking":true}],"nextPageToken":""}`))
+}
+
+// GeminiGetHandler handles GET requests for specific Gemini model information.
+// It returns detailed information about a specific Gemini model based on the action parameter.
+func (h *GeminiAPIHandlers) GeminiGetHandler(c *gin.Context) {
+	var request struct {
+		Action string `uri:"action" binding:"required"`
+	}
+	if err := c.ShouldBindUri(&request); err != nil {
+		c.JSON(http.StatusBadRequest, handlers.ErrorResponse{
+			Error: handlers.ErrorDetail{
+				Message: fmt.Sprintf("Invalid request: %v", err),
+				Type:    "invalid_request_error",
+			},
+		})
+		return
+	}
+	if request.Action == "gemini-2.5-pro" {
+		c.Status(http.StatusOK)
+		c.Header("Content-Type", "application/json; charset=UTF-8")
+		_, _ = c.Writer.Write([]byte(`{"name":"models/gemini-2.5-pro","version":"2.5","displayName":"Gemini 2.5 Pro",`))
+		_, _ = c.Writer.Write([]byte(`"description":"Stable release (June 17th, 2025) of Gemini 2.5 Pro","inputTokenL`))
+		_, _ = c.Writer.Write([]byte(`imit":1048576,"outputTokenLimit":65536,"supportedGenerationMethods":["generateC`))
+		_, _ = c.Writer.Write([]byte(`ontent","countTokens","createCachedContent","batchGenerateContent"],"temperatur`))
+		_, _ = c.Writer.Write([]byte(`e":1,"topP":0.95,"topK":64,"maxTemperature":2,"thinking":true}`))
+	} else if request.Action == "gemini-2.5-flash" {
+		c.Status(http.StatusOK)
+		c.Header("Content-Type", "application/json; charset=UTF-8")
+		_, _ = c.Writer.Write([]byte(`{"name":"models/gemini-2.5-flash","version":"001","displayName":"Gemini 2.5 Fla`))
+		_, _ = c.Writer.Write([]byte(`sh","description":"Stable version of Gemini 2.5 Flash, our mid-size multimodal `))
+		_, _ = c.Writer.Write([]byte(`model that supports up to 1 million tokens, released in June of 2025.","inputTo`))
+		_, _ = c.Writer.Write([]byte(`kenLimit":1048576,"outputTokenLimit":65536,"supportedGenerationMethods":["gener`))
+		_, _ = c.Writer.Write([]byte(`ateContent","countTokens","createCachedContent","batchGenerateContent"],"temper`))
+		_, _ = c.Writer.Write([]byte(`ature":1,"topP":0.95,"topK":64,"maxTemperature":2,"thinking":true}`))
+	} else {
+		c.Status(http.StatusNotFound)
+		_, _ = c.Writer.Write([]byte(
+			`{"error":{"message":"Not Found","code":404,"status":"NOT_FOUND"}}`,
+		))
+	}
+}
+
+// GeminiHandler handles POST requests for Gemini API operations.
+// It routes requests to appropriate handlers based on the action parameter (model:method format).
+func (h *GeminiAPIHandlers) GeminiHandler(c *gin.Context) {
+	var request struct {
+		Action string `uri:"action" binding:"required"`
+	}
+	if err := c.ShouldBindUri(&request); err != nil {
+		c.JSON(http.StatusBadRequest, handlers.ErrorResponse{
+			Error: handlers.ErrorDetail{
+				Message: fmt.Sprintf("Invalid request: %v", err),
+				Type:    "invalid_request_error",
+			},
+		})
+		return
+	}
+	action := strings.Split(request.Action, ":")
+	if len(action) != 2 {
+		c.JSON(http.StatusNotFound, handlers.ErrorResponse{
+			Error: handlers.ErrorDetail{
+				Message: fmt.Sprintf("%s not found.", c.Request.URL.Path),
+				Type:    "invalid_request_error",
+			},
+		})
+		return
+	}
+
+	modelName := action[0]
+	method := action[1]
+	rawJSON, _ := c.GetRawData()
+	rawJSON, _ = sjson.SetBytes(rawJSON, "model", []byte(modelName))
+
+	if method == "generateContent" {
+		h.geminiGenerateContent(c, rawJSON)
+	} else if method == "streamGenerateContent" {
+		h.geminiStreamGenerateContent(c, rawJSON)
+	} else if method == "countTokens" {
+		h.geminiCountTokens(c, rawJSON)
+	}
+}
+
+func (h *GeminiAPIHandlers) geminiStreamGenerateContent(c *gin.Context, rawJSON []byte) {
+	alt := h.GetAlt(c)
+
+	if alt == "" {
+		c.Header("Content-Type", "text/event-stream")
+		c.Header("Cache-Control", "no-cache")
+		c.Header("Connection", "keep-alive")
+		c.Header("Access-Control-Allow-Origin", "*")
+	}
+
+	// Get the http.Flusher interface to manually flush the response.
+	flusher, ok := c.Writer.(http.Flusher)
+	if !ok {
+		c.JSON(http.StatusInternalServerError, handlers.ErrorResponse{
+			Error: handlers.ErrorDetail{
+				Message: "Streaming not supported",
+				Type:    "server_error",
+			},
+		})
+		return
+	}
+
+	modelResult := gjson.GetBytes(rawJSON, "model")
+	modelName := modelResult.String()
+
+	cliCtx, cliCancel := context.WithCancel(context.Background())
+	var cliClient *client.Client
+	defer func() {
+		// Ensure the client's mutex is unlocked on function exit.
+		if cliClient != nil {
+			cliClient.RequestMutex.Unlock()
+		}
+	}()
+
+outLoop:
+	for {
+		var errorResponse *client.ErrorMessage
+		cliClient, errorResponse = h.GetClient(modelName)
+		if errorResponse != nil {
+			c.Status(errorResponse.StatusCode)
+			_, _ = fmt.Fprint(c.Writer, errorResponse.Error)
+			flusher.Flush()
+			cliCancel()
+			return
+		}
+
+		template := ""
+		parsed := gjson.Parse(string(rawJSON))
+		contents := parsed.Get("request.contents")
+		if contents.Exists() {
+			template = string(rawJSON)
+		} else {
+			template = `{"project":"","request":{},"model":""}`
+			template, _ = sjson.SetRaw(template, "request", string(rawJSON))
+			template, _ = sjson.Set(template, "model", gjson.Get(template, "request.model").String())
+			template, _ = sjson.Delete(template, "request.model")
+		}
+
+		template, errFixCLIToolResponse := cli.FixCLIToolResponse(template)
+		if errFixCLIToolResponse != nil {
+			c.JSON(http.StatusInternalServerError, handlers.ErrorResponse{
+				Error: handlers.ErrorDetail{
+					Message: errFixCLIToolResponse.Error(),
+					Type:    "server_error",
+				},
+			})
+			cliCancel()
+			return
+		}
+
+		systemInstructionResult := gjson.Get(template, "request.system_instruction")
+		if systemInstructionResult.Exists() {
+			template, _ = sjson.SetRaw(template, "request.systemInstruction", systemInstructionResult.Raw)
+			template, _ = sjson.Delete(template, "request.system_instruction")
+		}
+		rawJSON = []byte(template)
+
+		if glAPIKey := cliClient.GetGenerativeLanguageAPIKey(); glAPIKey != "" {
+			log.Debugf("Request use generative language API Key: %s", glAPIKey)
+		} else {
+			log.Debugf("Request use account: %s, project id: %s", cliClient.GetEmail(), cliClient.GetProjectID())
+		}
+
+		// Send the message and receive response chunks and errors via channels.
+		respChan, errChan := cliClient.SendRawMessageStream(cliCtx, rawJSON, alt)
+		for {
+			select {
+			// Handle client disconnection.
+			case <-c.Request.Context().Done():
+				if c.Request.Context().Err().Error() == "context canceled" {
+					log.Debugf("Client disconnected: %v", c.Request.Context().Err())
+					cliCancel() // Cancel the backend request.
+					return
+				}
+			// Process incoming response chunks.
+			case chunk, okStream := <-respChan:
+				if !okStream {
+					cliCancel()
+					return
+				}
+				if cliClient.GetGenerativeLanguageAPIKey() == "" {
+					if alt == "" {
+						responseResult := gjson.GetBytes(chunk, "response")
+						if responseResult.Exists() {
+							chunk = []byte(responseResult.Raw)
+						}
+					} else {
+						chunkTemplate := "[]"
+						responseResult := gjson.ParseBytes(chunk)
+						if responseResult.IsArray() {
+							responseResultItems := responseResult.Array()
+							for i := 0; i < len(responseResultItems); i++ {
+								responseResultItem := responseResultItems[i]
+								if responseResultItem.Get("response").Exists() {
+									chunkTemplate, _ = sjson.SetRaw(chunkTemplate, "-1", responseResultItem.Get("response").Raw)
+								}
+							}
+						}
+						chunk = []byte(chunkTemplate)
+					}
+				}
+				if alt == "" {
+					_, _ = c.Writer.Write([]byte("data: "))
+					_, _ = c.Writer.Write(chunk)
+					_, _ = c.Writer.Write([]byte("\n\n"))
+				} else {
+					_, _ = c.Writer.Write(chunk)
+				}
+				flusher.Flush()
+			// Handle errors from the backend.
+			case err, okError := <-errChan:
+				if okError {
+					if err.StatusCode == 429 && h.Cfg.QuotaExceeded.SwitchProject {
+						log.Debugf("quota exceeded, switch client")
+						continue outLoop
+					} else {
+						log.Debugf("error code :%d, error: %v", err.StatusCode, err.Error.Error())
+						c.Status(err.StatusCode)
+						_, _ = fmt.Fprint(c.Writer, err.Error.Error())
+						flusher.Flush()
+						cliCancel()
+					}
+					return
+				}
+			// Send a keep-alive signal to the client.
+			case <-time.After(500 * time.Millisecond):
+			}
+		}
+	}
+}
+
+func (h *GeminiAPIHandlers) geminiCountTokens(c *gin.Context, rawJSON []byte) {
+	c.Header("Content-Type", "application/json")
+
+	alt := h.GetAlt(c)
+	// orgrawJSON := rawJSON
+	modelResult := gjson.GetBytes(rawJSON, "model")
+	modelName := modelResult.String()
+	cliCtx, cliCancel := context.WithCancel(context.Background())
+	var cliClient *client.Client
+	defer func() {
+		if cliClient != nil {
+			cliClient.RequestMutex.Unlock()
+		}
+	}()
+
+	for {
+		var errorResponse *client.ErrorMessage
+		cliClient, errorResponse = h.GetClient(modelName, false)
+		if errorResponse != nil {
+			c.Status(errorResponse.StatusCode)
+			_, _ = fmt.Fprint(c.Writer, errorResponse.Error)
+			cliCancel()
+			return
+		}
+
+		if glAPIKey := cliClient.GetGenerativeLanguageAPIKey(); glAPIKey != "" {
+			log.Debugf("Request use generative language API Key: %s", glAPIKey)
+		} else {
+			log.Debugf("Request use account: %s, project id: %s", cliClient.GetEmail(), cliClient.GetProjectID())
+
+			template := `{"request":{}}`
+			if gjson.GetBytes(rawJSON, "generateContentRequest").Exists() {
+				template, _ = sjson.SetRaw(template, "request", gjson.GetBytes(rawJSON, "generateContentRequest").Raw)
+				template, _ = sjson.Delete(template, "generateContentRequest")
+			} else if gjson.GetBytes(rawJSON, "contents").Exists() {
+				template, _ = sjson.SetRaw(template, "request.contents", gjson.GetBytes(rawJSON, "contents").Raw)
+				template, _ = sjson.Delete(template, "contents")
+			}
+			rawJSON = []byte(template)
+		}
+
+		resp, err := cliClient.SendRawTokenCount(cliCtx, rawJSON, alt)
+		if err != nil {
+			if err.StatusCode == 429 && h.Cfg.QuotaExceeded.SwitchProject {
+				continue
+			} else {
+				c.Status(err.StatusCode)
+				_, _ = c.Writer.Write([]byte(err.Error.Error()))
+				cliCancel()
+				// log.Debugf(err.Error.Error())
+				// log.Debugf(string(rawJSON))
+				// log.Debugf(string(orgrawJSON))
+			}
+			break
+		} else {
+			if cliClient.GetGenerativeLanguageAPIKey() == "" {
+				responseResult := gjson.GetBytes(resp, "response")
+				if responseResult.Exists() {
+					resp = []byte(responseResult.Raw)
+				}
+			}
+			_, _ = c.Writer.Write(resp)
+			cliCancel()
+			break
+		}
+	}
+}
+
+func (h *GeminiAPIHandlers) geminiGenerateContent(c *gin.Context, rawJSON []byte) {
+	c.Header("Content-Type", "application/json")
+
+	alt := h.GetAlt(c)
+
+	modelResult := gjson.GetBytes(rawJSON, "model")
+	modelName := modelResult.String()
+	cliCtx, cliCancel := context.WithCancel(context.Background())
+	var cliClient *client.Client
+	defer func() {
+		if cliClient != nil {
+			cliClient.RequestMutex.Unlock()
+		}
+	}()
+
+	for {
+		var errorResponse *client.ErrorMessage
+		cliClient, errorResponse = h.GetClient(modelName)
+		if errorResponse != nil {
+			c.Status(errorResponse.StatusCode)
+			_, _ = fmt.Fprint(c.Writer, errorResponse.Error)
+			cliCancel()
+			return
+		}
+
+		template := ""
+		parsed := gjson.Parse(string(rawJSON))
+		contents := parsed.Get("request.contents")
+		if contents.Exists() {
+			template = string(rawJSON)
+		} else {
+			template = `{"project":"","request":{},"model":""}`
+			template, _ = sjson.SetRaw(template, "request", string(rawJSON))
+			template, _ = sjson.Set(template, "model", gjson.Get(template, "request.model").String())
+			template, _ = sjson.Delete(template, "request.model")
+		}
+
+		template, errFixCLIToolResponse := cli.FixCLIToolResponse(template)
+		if errFixCLIToolResponse != nil {
+			c.JSON(http.StatusInternalServerError, handlers.ErrorResponse{
+				Error: handlers.ErrorDetail{
+					Message: errFixCLIToolResponse.Error(),
+					Type:    "server_error",
+				},
+			})
+			cliCancel()
+			return
+		}
+
+		systemInstructionResult := gjson.Get(template, "request.system_instruction")
+		if systemInstructionResult.Exists() {
+			template, _ = sjson.SetRaw(template, "request.systemInstruction", systemInstructionResult.Raw)
+			template, _ = sjson.Delete(template, "request.system_instruction")
+		}
+		rawJSON = []byte(template)
+
+		if glAPIKey := cliClient.GetGenerativeLanguageAPIKey(); glAPIKey != "" {
+			log.Debugf("Request use generative language API Key: %s", glAPIKey)
+		} else {
+			log.Debugf("Request use account: %s, project id: %s", cliClient.GetEmail(), cliClient.GetProjectID())
+		}
+		resp, err := cliClient.SendRawMessage(cliCtx, rawJSON, alt)
+		if err != nil {
+			if err.StatusCode == 429 && h.Cfg.QuotaExceeded.SwitchProject {
+				continue
+			} else {
+				c.Status(err.StatusCode)
+				_, _ = c.Writer.Write([]byte(err.Error.Error()))
+				cliCancel()
+			}
+			break
+		} else {
+			if cliClient.GetGenerativeLanguageAPIKey() == "" {
+				responseResult := gjson.GetBytes(resp, "response")
+				if responseResult.Exists() {
+					resp = []byte(responseResult.Raw)
+				}
+			}
+			_, _ = c.Writer.Write(resp)
+			cliCancel()
+			break
+		}
+	}
+}
--- a/internal/api/handlers/handlers.go
+++ b/internal/api/handlers/handlers.go
@@ -0,0 +1,122 @@
+// Package handlers provides core API handler functionality for the CLI Proxy API server.
+// It includes common types, client management, load balancing, and error handling
+// shared across all API endpoint handlers (OpenAI, Claude, Gemini).
+package handlers
+
+import (
+	"fmt"
+	"github.com/gin-gonic/gin"
+	"github.com/luispater/CLIProxyAPI/internal/client"
+	"github.com/luispater/CLIProxyAPI/internal/config"
+	log "github.com/sirupsen/logrus"
+	"sync"
+)
+
+// ErrorResponse represents a standard error response format for the API.
+// It contains a single ErrorDetail field.
+type ErrorResponse struct {
+	Error ErrorDetail `json:"error"`
+}
+
+// ErrorDetail provides specific information about an error that occurred.
+// It includes a human-readable message, an error type, and an optional error code.
+type ErrorDetail struct {
+	// A human-readable message providing more details about the error.
+	Message string `json:"message"`
+	// The type of error that occurred (e.g., "invalid_request_error").
+	Type string `json:"type"`
+	// A short code identifying the error, if applicable.
+	Code string `json:"code,omitempty"`
+}
+
+// APIHandlers contains the handlers for API endpoints.
+// It holds a pool of clients to interact with the backend service.
+type APIHandlers struct {
+	CliClients          []*client.Client
+	Cfg                 *config.Config
+	Mutex               *sync.Mutex
+	LastUsedClientIndex int
+}
+
+// NewAPIHandlers creates a new API handlers instance.
+// It takes a slice of clients and a debug flag as input.
+func NewAPIHandlers(cliClients []*client.Client, cfg *config.Config) *APIHandlers {
+	return &APIHandlers{
+		CliClients:          cliClients,
+		Cfg:                 cfg,
+		Mutex:               &sync.Mutex{},
+		LastUsedClientIndex: 0,
+	}
+}
+
+// UpdateClients updates the handlers' client list and configuration
+func (h *APIHandlers) UpdateClients(clients []*client.Client, cfg *config.Config) {
+	h.CliClients = clients
+	h.Cfg = cfg
+}
+
+// GetClient returns an available client from the pool using round-robin load balancing.
+// It checks for quota limits and tries to find an unlocked client for immediate use.
+// The modelName parameter is used to check quota status for specific models.
+func (h *APIHandlers) GetClient(modelName string, isGenerateContent ...bool) (*client.Client, *client.ErrorMessage) {
+	if len(h.CliClients) == 0 {
+		return nil, &client.ErrorMessage{StatusCode: 500, Error: fmt.Errorf("no clients available")}
+	}
+
+	var cliClient *client.Client
+
+	// Lock the mutex to update the last used client index
+	h.Mutex.Lock()
+	startIndex := h.LastUsedClientIndex
+	if (len(isGenerateContent) > 0 && isGenerateContent[0]) || len(isGenerateContent) == 0 {
+		currentIndex := (startIndex + 1) % len(h.CliClients)
+		h.LastUsedClientIndex = currentIndex
+	}
+	h.Mutex.Unlock()
+
+	// Reorder the client to start from the last used index
+	reorderedClients := make([]*client.Client, 0)
+	for i := 0; i < len(h.CliClients); i++ {
+		cliClient = h.CliClients[(startIndex+1+i)%len(h.CliClients)]
+		if cliClient.IsModelQuotaExceeded(modelName) {
+			log.Debugf("Model %s is quota exceeded for account %s, project id: %s", modelName, cliClient.GetEmail(), cliClient.GetProjectID())
+			cliClient = nil
+			continue
+		}
+		reorderedClients = append(reorderedClients, cliClient)
+	}
+
+	if len(reorderedClients) == 0 {
+		return nil, &client.ErrorMessage{StatusCode: 429, Error: fmt.Errorf(`{"error":{"code":429,"message":"All the models of '%s' are quota exceeded","status":"RESOURCE_EXHAUSTED"}}`, modelName)}
+	}
+
+	locked := false
+	for i := 0; i < len(reorderedClients); i++ {
+		cliClient = reorderedClients[i]
+		if cliClient.RequestMutex.TryLock() {
+			locked = true
+			break
+		}
+	}
+	if !locked {
+		cliClient = h.CliClients[0]
+		cliClient.RequestMutex.Lock()
+	}
+
+	return cliClient, nil
+}
+
+// GetAlt extracts the 'alt' parameter from the request query string.
+// It checks both 'alt' and '$alt' parameters and returns the appropriate value.
+func (h *APIHandlers) GetAlt(c *gin.Context) string {
+	var alt string
+	var hasAlt bool
+	alt, hasAlt = c.GetQuery("alt")
+	if !hasAlt {
+		alt, _ = c.GetQuery("$alt")
+	}
+	if alt == "sse" {
+		return ""
+	}
+	return alt
+}
--- a/internal/api/handlers/openai/openai-handlers.go
+++ b/internal/api/handlers/openai/openai-handlers.go
@@ -0,0 +1,264 @@
+// Package openai provides HTTP handlers for OpenAI API endpoints.
+// This package implements the OpenAI-compatible API interface, including model listing
+// and chat completion functionality. It supports both streaming and non-streaming responses,
+// and manages a pool of clients to interact with backend services.
+// The handlers translate OpenAI API requests to the appropriate backend format and
+// convert responses back to OpenAI-compatible format.
+package openai
+
+import (
+	"context"
+	"fmt"
+	"github.com/luispater/CLIProxyAPI/internal/api/handlers"
+	"github.com/luispater/CLIProxyAPI/internal/api/translator/openai"
+	"github.com/luispater/CLIProxyAPI/internal/client"
+	log "github.com/sirupsen/logrus"
+	"github.com/tidwall/gjson"
+	"net/http"
+	"time"
+
+	"github.com/gin-gonic/gin"
+)
+
+// OpenAIAPIHandlers contains the handlers for OpenAI API endpoints.
+// It holds a pool of clients to interact with the backend service.
+type OpenAIAPIHandlers struct {
+	*handlers.APIHandlers
+}
+
+// NewOpenAIAPIHandlers creates a new OpenAI API handlers instance.
+// It takes an APIHandlers instance as input and returns an OpenAIAPIHandlers.
+func NewOpenAIAPIHandlers(apiHandlers *handlers.APIHandlers) *OpenAIAPIHandlers {
+	return &OpenAIAPIHandlers{
+		APIHandlers: apiHandlers,
+	}
+}
+
+// Models handles the /v1/models endpoint.
+// It returns a hardcoded list of available AI models.
+func (h *OpenAIAPIHandlers) Models(c *gin.Context) {
+	c.JSON(http.StatusOK, gin.H{
+		"data": []map[string]any{
+			{
+				"id":                    "gemini-2.5-pro",
+				"object":                "model",
+				"version":               "2.5",
+				"name":                  "Gemini 2.5 Pro",
+				"description":           "Stable release (June 17th, 2025) of Gemini 2.5 Pro",
+				"context_length":        1048576,
+				"max_completion_tokens": 65536,
+				"supported_parameters": []string{
+					"tools",
+					"temperature",
+					"top_p",
+					"top_k",
+				},
+				"temperature":    1,
+				"topP":           0.95,
+				"topK":           64,
+				"maxTemperature": 2,
+				"thinking":       true,
+			},
+			{
+				"id":                    "gemini-2.5-flash",
+				"object":                "model",
+				"version":               "001",
+				"name":                  "Gemini 2.5 Flash",
+				"description":           "Stable version of Gemini 2.5 Flash, our mid-size multimodal model that supports up to 1 million tokens, released in June of 2025.",
+				"context_length":        1048576,
+				"max_completion_tokens": 65536,
+				"supported_parameters": []string{
+					"tools",
+					"temperature",
+					"top_p",
+					"top_k",
+				},
+				"temperature":    1,
+				"topP":           0.95,
+				"topK":           64,
+				"maxTemperature": 2,
+				"thinking":       true,
+			},
+		},
+	})
+}
+
+// ChatCompletions handles the /v1/chat/completions endpoint.
+// It determines whether the request is for a streaming or non-streaming response
+// and calls the appropriate handler.
+func (h *OpenAIAPIHandlers) ChatCompletions(c *gin.Context) {
+	rawJSON, err := c.GetRawData()
+	// If data retrieval fails, return a 400 Bad Request error.
+	if err != nil {
+		c.JSON(http.StatusBadRequest, handlers.ErrorResponse{
+			Error: handlers.ErrorDetail{
+				Message: fmt.Sprintf("Invalid request: %v", err),
+				Type:    "invalid_request_error",
+			},
+		})
+		return
+	}
+
+	// Check if the client requested a streaming response.
+	streamResult := gjson.GetBytes(rawJSON, "stream")
+	if streamResult.Type == gjson.True {
+		h.handleStreamingResponse(c, rawJSON)
+	} else {
+		h.handleNonStreamingResponse(c, rawJSON)
+	}
+}
+
+// handleNonStreamingResponse handles non-streaming chat completion responses.
+// It selects a client from the pool, sends the request, and aggregates the response
+// before sending it back to the client.
+func (h *OpenAIAPIHandlers) handleNonStreamingResponse(c *gin.Context, rawJSON []byte) {
+	c.Header("Content-Type", "application/json")
+
+	modelName, systemInstruction, contents, tools := openai.PrepareRequest(rawJSON)
+	cliCtx, cliCancel := context.WithCancel(context.Background())
+	var cliClient *client.Client
+	defer func() {
+		if cliClient != nil {
+			cliClient.RequestMutex.Unlock()
+		}
+	}()
+
+	for {
+		var errorResponse *client.ErrorMessage
+		cliClient, errorResponse = h.GetClient(modelName)
+		if errorResponse != nil {
+			c.Status(errorResponse.StatusCode)
+			_, _ = fmt.Fprint(c.Writer, errorResponse.Error)
+			cliCancel()
+			return
+		}
+
+		isGlAPIKey := false
+		if glAPIKey := cliClient.GetGenerativeLanguageAPIKey(); glAPIKey != "" {
+			log.Debugf("Request use generative language API Key: %s", glAPIKey)
+			isGlAPIKey = true
+		} else {
+			log.Debugf("Request use account: %s, project id: %s", cliClient.GetEmail(), cliClient.GetProjectID())
+		}
+
+		resp, err := cliClient.SendMessage(cliCtx, rawJSON, modelName, systemInstruction, contents, tools)
+		if err != nil {
+			if err.StatusCode == 429 && h.Cfg.QuotaExceeded.SwitchProject {
+				continue
+			} else {
+				c.Status(err.StatusCode)
+				_, _ = c.Writer.Write([]byte(err.Error.Error()))
+				cliCancel()
+			}
+			break
+		} else {
+			openAIFormat := openai.ConvertCliToOpenAINonStream(resp, time.Now().Unix(), isGlAPIKey)
+			if openAIFormat != "" {
+				_, _ = c.Writer.Write([]byte(openAIFormat))
+			}
+			cliCancel()
+			break
+		}
+	}
+}
+
+// handleStreamingResponse handles streaming responses
+func (h *OpenAIAPIHandlers) handleStreamingResponse(c *gin.Context, rawJSON []byte) {
+	c.Header("Content-Type", "text/event-stream")
+	c.Header("Cache-Control", "no-cache")
+	c.Header("Connection", "keep-alive")
+	c.Header("Access-Control-Allow-Origin", "*")
+
+	// Get the http.Flusher interface to manually flush the response.
+	flusher, ok := c.Writer.(http.Flusher)
+	if !ok {
+		c.JSON(http.StatusInternalServerError, handlers.ErrorResponse{
+			Error: handlers.ErrorDetail{
+				Message: "Streaming not supported",
+				Type:    "server_error",
+			},
+		})
+		return
+	}
+
+	// Prepare the request for the backend client.
+	modelName, systemInstruction, contents, tools := openai.PrepareRequest(rawJSON)
+	cliCtx, cliCancel := context.WithCancel(context.Background())
+	var cliClient *client.Client
+	defer func() {
+		// Ensure the client's mutex is unlocked on function exit.
+		if cliClient != nil {
+			cliClient.RequestMutex.Unlock()
+		}
+	}()
+
+outLoop:
+	for {
+		var errorResponse *client.ErrorMessage
+		cliClient, errorResponse = h.GetClient(modelName)
+		if errorResponse != nil {
+			c.Status(errorResponse.StatusCode)
+			_, _ = fmt.Fprint(c.Writer, errorResponse.Error)
+			flusher.Flush()
+			cliCancel()
+			return
+		}
+
+		isGlAPIKey := false
+		if glAPIKey := cliClient.GetGenerativeLanguageAPIKey(); glAPIKey != "" {
+			log.Debugf("Request use generative language API Key: %s", glAPIKey)
+			isGlAPIKey = true
+		} else {
+			log.Debugf("Request use account: %s, project id: %s", cliClient.GetEmail(), cliClient.GetProjectID())
+		}
+		// Send the message and receive response chunks and errors via channels.
+		respChan, errChan := cliClient.SendMessageStream(cliCtx, rawJSON, modelName, systemInstruction, contents, tools)
+		hasFirstResponse := false
+		for {
+			select {
+			// Handle client disconnection.
+			case <-c.Request.Context().Done():
+				if c.Request.Context().Err().Error() == "context canceled" {
+					log.Debugf("Client disconnected: %v", c.Request.Context().Err())
+					cliCancel() // Cancel the backend request.
+					return
+				}
+			// Process incoming response chunks.
+			case chunk, okStream := <-respChan:
+				if !okStream {
+					// Stream is closed, send the final [DONE] message.
+					_, _ = fmt.Fprintf(c.Writer, "data: [DONE]\n\n")
+					flusher.Flush()
+					cliCancel()
+					return
+				}
+				// Convert the chunk to OpenAI format and send it to the client.
+				hasFirstResponse = true
+				openAIFormat := openai.ConvertCliToOpenAI(chunk, time.Now().Unix(), isGlAPIKey)
+				if openAIFormat != "" {
+					_, _ = fmt.Fprintf(c.Writer, "data: %s\n\n", openAIFormat)
+					flusher.Flush()
+				}
+			// Handle errors from the backend.
+			case err, okError := <-errChan:
+				if okError {
+					if err.StatusCode == 429 && h.Cfg.QuotaExceeded.SwitchProject {
+						continue outLoop
+					} else {
+						c.Status(err.StatusCode)
+						_, _ = fmt.Fprint(c.Writer, err.Error.Error())
+						flusher.Flush()
+						cliCancel()
+					}
+					return
+				}
+			// Send a keep-alive signal to the client.
+			case <-time.After(500 * time.Millisecond):
+				if hasFirstResponse {
+					_, _ = c.Writer.Write([]byte(": CLI-PROXY-API PROCESSING\n\n"))
+					flusher.Flush()
+				}
+			}
+		}
+	}
+}