mirror of
https://github.com/router-for-me/CLIProxyAPI.git
synced 2026-02-19 04:40:52 +08:00
Add ClaudeMessages handler for SSE-compatible chat completions
- Introduced `ClaudeMessages` to handle Claude-compatible streaming chat completions.
- Implemented client rotation, quota management, and dynamic model-name mapping for better load balancing and resource utilization.
- Enhanced response streaming with real-time chunking and Claude-format conversion.
- Added error handling for quota exhaustion, client disconnections, and backend failures.
This commit is contained in:
234
internal/api/claude-code-handlers.go
Normal file
234
internal/api/claude-code-handlers.go
Normal file
@@ -0,0 +1,234 @@
|
|||||||
|
package api
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"github.com/gin-gonic/gin"
|
||||||
|
"github.com/luispater/CLIProxyAPI/internal/api/translator"
|
||||||
|
"github.com/luispater/CLIProxyAPI/internal/client"
|
||||||
|
log "github.com/sirupsen/logrus"
|
||||||
|
"net/http"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ClaudeMessages handles Claude-compatible streaming chat completions.
|
||||||
|
// This function implements a sophisticated client rotation and quota management system
|
||||||
|
// to ensure high availability and optimal resource utilization across multiple backend clients.
|
||||||
|
func (h *APIHandlers) ClaudeMessages(c *gin.Context) {
|
||||||
|
// Extract raw JSON data from the incoming request
|
||||||
|
rawJson, err := c.GetRawData()
|
||||||
|
// If data retrieval fails, return a 400 Bad Request error.
|
||||||
|
if err != nil {
|
||||||
|
c.JSON(http.StatusBadRequest, ErrorResponse{
|
||||||
|
Error: ErrorDetail{
|
||||||
|
Message: fmt.Sprintf("Invalid request: %v", err),
|
||||||
|
Type: "invalid_request_error",
|
||||||
|
},
|
||||||
|
})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Set up Server-Sent Events (SSE) headers for streaming response
|
||||||
|
// These headers are essential for maintaining a persistent connection
|
||||||
|
// and enabling real-time streaming of chat completions
|
||||||
|
c.Header("Content-Type", "text/event-stream")
|
||||||
|
c.Header("Cache-Control", "no-cache")
|
||||||
|
c.Header("Connection", "keep-alive")
|
||||||
|
c.Header("Access-Control-Allow-Origin", "*")
|
||||||
|
|
||||||
|
// Get the http.Flusher interface to manually flush the response.
|
||||||
|
// This is crucial for streaming as it allows immediate sending of data chunks
|
||||||
|
flusher, ok := c.Writer.(http.Flusher)
|
||||||
|
if !ok {
|
||||||
|
c.JSON(http.StatusInternalServerError, ErrorResponse{
|
||||||
|
Error: ErrorDetail{
|
||||||
|
Message: "Streaming not supported",
|
||||||
|
Type: "server_error",
|
||||||
|
},
|
||||||
|
})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Parse and prepare the Claude request, extracting model name, system instructions,
|
||||||
|
// conversation contents, and available tools from the raw JSON
|
||||||
|
modelName, systemInstruction, contents, tools := translator.PrepareClaudeRequest(rawJson)
|
||||||
|
|
||||||
|
// Map Claude model names to corresponding Gemini models
|
||||||
|
// This allows the proxy to handle Claude API calls using Gemini backend
|
||||||
|
if modelName == "claude-sonnet-4-20250514" {
|
||||||
|
modelName = "gemini-2.5-pro"
|
||||||
|
} else if modelName == "claude-3-5-haiku-20241022" {
|
||||||
|
modelName = "gemini-2.5-flash"
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create a cancellable context for the backend client request
|
||||||
|
// This allows proper cleanup and cancellation of ongoing requests
|
||||||
|
cliCtx, cliCancel := context.WithCancel(context.Background())
|
||||||
|
var cliClient *client.Client
|
||||||
|
defer func() {
|
||||||
|
// Ensure the client's mutex is unlocked on function exit.
|
||||||
|
// This prevents deadlocks and ensures proper resource cleanup
|
||||||
|
if cliClient != nil {
|
||||||
|
cliClient.RequestMutex.Unlock()
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
// Main client rotation loop with quota management
|
||||||
|
// This loop implements a sophisticated load balancing and failover mechanism
|
||||||
|
outLoop:
|
||||||
|
for {
|
||||||
|
// Thread-safe client index rotation to distribute load evenly
|
||||||
|
// This ensures fair usage across all available clients
|
||||||
|
mutex.Lock()
|
||||||
|
startIndex := lastUsedClientIndex
|
||||||
|
currentIndex := (startIndex + 1) % len(h.cliClients)
|
||||||
|
lastUsedClientIndex = currentIndex
|
||||||
|
mutex.Unlock()
|
||||||
|
|
||||||
|
// Build a list of available clients, starting from the next client in rotation
|
||||||
|
// This implements round-robin load balancing while filtering out quota-exceeded clients
|
||||||
|
reorderedClients := make([]*client.Client, 0)
|
||||||
|
for i := 0; i < len(h.cliClients); i++ {
|
||||||
|
cliClient = h.cliClients[(startIndex+1+i)%len(h.cliClients)]
|
||||||
|
|
||||||
|
// Skip clients that have exceeded their quota for the requested model
|
||||||
|
if cliClient.IsModelQuotaExceeded(modelName) {
|
||||||
|
// Log different messages based on authentication method (API key vs account)
|
||||||
|
if cliClient.GetGenerativeLanguageAPIKey() == "" {
|
||||||
|
log.Debugf("Model %s is quota exceeded for account %s, project id: %s", modelName, cliClient.GetEmail(), cliClient.GetProjectID())
|
||||||
|
} else {
|
||||||
|
log.Debugf("Model %s is quota exceeded for generative language API Key: %s", modelName, cliClient.GetGenerativeLanguageAPIKey())
|
||||||
|
}
|
||||||
|
|
||||||
|
cliClient = nil
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
reorderedClients = append(reorderedClients, cliClient)
|
||||||
|
}
|
||||||
|
|
||||||
|
// If all clients have exceeded quota, return a 429 Too Many Requests error
|
||||||
|
if len(reorderedClients) == 0 {
|
||||||
|
c.Status(429)
|
||||||
|
_, _ = fmt.Fprint(c.Writer, fmt.Sprintf(`{"error":{"code":429,"message":"All the models of '%s' are quota exceeded","status":"RESOURCE_EXHAUSTED"}}`, modelName))
|
||||||
|
flusher.Flush()
|
||||||
|
cliCancel()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Attempt to acquire a lock on an available client using non-blocking TryLock
|
||||||
|
// This prevents blocking when a client is busy with another request
|
||||||
|
locked := false
|
||||||
|
for i := 0; i < len(reorderedClients); i++ {
|
||||||
|
cliClient = reorderedClients[i]
|
||||||
|
if cliClient.RequestMutex.TryLock() {
|
||||||
|
locked = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// If no client is immediately available, fall back to blocking on the first client
|
||||||
|
// This ensures the request will eventually be processed
|
||||||
|
if !locked {
|
||||||
|
cliClient = h.cliClients[0]
|
||||||
|
cliClient.RequestMutex.Lock()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Determine the authentication method being used by the selected client
|
||||||
|
// This affects how responses are formatted and logged
|
||||||
|
isGlAPIKey := false
|
||||||
|
if glAPIKey := cliClient.GetGenerativeLanguageAPIKey(); glAPIKey != "" {
|
||||||
|
log.Debugf("Request use generative language API Key: %s", glAPIKey)
|
||||||
|
isGlAPIKey = true
|
||||||
|
} else {
|
||||||
|
log.Debugf("Request use account: %s, project id: %s", cliClient.GetEmail(), cliClient.GetProjectID())
|
||||||
|
}
|
||||||
|
// Initiate streaming communication with the backend client
|
||||||
|
// This returns two channels: one for response chunks and one for errors
|
||||||
|
|
||||||
|
includeThoughts := false
|
||||||
|
if userAgent, hasKey := c.Request.Header["User-Agent"]; hasKey {
|
||||||
|
includeThoughts = !strings.Contains(userAgent[0], "claude-cli")
|
||||||
|
}
|
||||||
|
|
||||||
|
respChan, errChan := cliClient.SendMessageStream(cliCtx, rawJson, modelName, systemInstruction, contents, tools, includeThoughts)
|
||||||
|
|
||||||
|
// Track response state for proper Claude format conversion
|
||||||
|
hasFirstResponse := false
|
||||||
|
responseType := 0
|
||||||
|
responseIndex := 0
|
||||||
|
|
||||||
|
// Main streaming loop - handles multiple concurrent events using Go channels
|
||||||
|
// This select statement manages four different types of events simultaneously
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
// Case 1: Handle client disconnection
|
||||||
|
// Detects when the HTTP client has disconnected and cleans up resources
|
||||||
|
case <-c.Request.Context().Done():
|
||||||
|
if c.Request.Context().Err().Error() == "context canceled" {
|
||||||
|
log.Debugf("Client disconnected: %v", c.Request.Context().Err())
|
||||||
|
cliCancel() // Cancel the backend request to prevent resource leaks
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Case 2: Process incoming response chunks from the backend
|
||||||
|
// This handles the actual streaming data from the AI model
|
||||||
|
case chunk, okStream := <-respChan:
|
||||||
|
if !okStream {
|
||||||
|
// Stream has ended - send the final message_stop event
|
||||||
|
// This follows the Claude API specification for stream termination
|
||||||
|
_, _ = c.Writer.Write([]byte(`event: message_stop`))
|
||||||
|
_, _ = c.Writer.Write([]byte("\n"))
|
||||||
|
_, _ = c.Writer.Write([]byte(`data: {"type":"message_stop"}`))
|
||||||
|
_, _ = c.Writer.Write([]byte("\n\n\n"))
|
||||||
|
|
||||||
|
flusher.Flush()
|
||||||
|
cliCancel()
|
||||||
|
return
|
||||||
|
} else {
|
||||||
|
// Convert the backend response to Claude-compatible format
|
||||||
|
// This translation layer ensures API compatibility
|
||||||
|
claudeFormat := translator.ConvertCliToClaude(chunk, isGlAPIKey, hasFirstResponse, &responseType, &responseIndex)
|
||||||
|
if claudeFormat != "" {
|
||||||
|
_, _ = c.Writer.Write([]byte(claudeFormat))
|
||||||
|
flusher.Flush() // Immediately send the chunk to the client
|
||||||
|
}
|
||||||
|
hasFirstResponse = true
|
||||||
|
}
|
||||||
|
|
||||||
|
// Case 3: Handle errors from the backend
|
||||||
|
// This manages various error conditions and implements retry logic
|
||||||
|
case errInfo, okError := <-errChan:
|
||||||
|
if okError {
|
||||||
|
// Special handling for quota exceeded errors
|
||||||
|
// If configured, attempt to switch to a different project/client
|
||||||
|
if errInfo.StatusCode == 429 && h.cfg.QuotaExceeded.SwitchProject {
|
||||||
|
continue outLoop // Restart the client selection process
|
||||||
|
} else {
|
||||||
|
// Forward other errors directly to the client
|
||||||
|
c.Status(errInfo.StatusCode)
|
||||||
|
_, _ = fmt.Fprint(c.Writer, errInfo.Error.Error())
|
||||||
|
flusher.Flush()
|
||||||
|
cliCancel()
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Case 4: Send periodic keep-alive signals
|
||||||
|
// Prevents connection timeouts during long-running requests
|
||||||
|
case <-time.After(500 * time.Millisecond):
|
||||||
|
if hasFirstResponse {
|
||||||
|
// Send a ping event to maintain the connection
|
||||||
|
// This is especially important for slow AI model responses
|
||||||
|
output := "event: ping\n"
|
||||||
|
output = output + `data: {"type": "ping"}`
|
||||||
|
output = output + "\n\n\n"
|
||||||
|
_, _ = c.Writer.Write([]byte(output))
|
||||||
|
|
||||||
|
flusher.Flush()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user