feat(executor): add intelligent retry logic for 429 rate limits

Implement Google RetryInfo.retryDelay support for handling 429 rate limit errors. Retries the same model up to 3 times, using the exact delays from Google's API, before trying fallback models.

- Add parseRetryDelay() to extract Google's retry guidance
- Implement inner retry loop in Execute() and ExecuteStream()
- Context-aware waiting with cancellation support
- Cap delays at 60s maximum for safety
2026-02-19 04:40:52 +08:00 · 2025-11-19 12:45:59 -07:00
parent 782bba0bc4
commit 6a3de3a89c
1 changed files with 266 additions and 144 deletions
--- a/internal/runtime/executor/gemini_cli_executor.go
+++ b/internal/runtime/executor/gemini_cli_executor.go
@@ -99,7 +99,15 @@ func (e *GeminiCLIExecutor) Execute(ctx context.Context, auth *cliproxyauth.Auth
 	var lastStatus int
 	var lastBody []byte
 	// Get max retry count from config, default to 3 if not set
 	maxRetries := e.cfg.RequestRetry
 	if maxRetries <= 0 {
 		maxRetries = 3
 	}
 	for idx, attemptModel := range models {
 		// Inner retry loop for 429 errors on the same model
 		for retryCount := 0; retryCount <= maxRetries; retryCount++ {
 			payload := append([]byte(nil), basePayload...)
 			if action == "countTokens" {
 				payload = deleteJSONField(payload, "project")
@@ -171,19 +179,45 @@ func (e *GeminiCLIExecutor) Execute(ctx context.Context, auth *cliproxyauth.Auth
 			lastStatus = httpResp.StatusCode
 			lastBody = append([]byte(nil), data...)
 			log.Debugf("request error, error status: %d, error body: %s", httpResp.StatusCode, summarizeErrorBody(httpResp.Header.Get("Content-Type"), data))
 		if httpResp.StatusCode == 429 {
 			if idx+1 < len(models) {
 				log.Debugf("gemini cli executor: rate limited, retrying with next model: %s", models[idx+1])
 			} else {
 				log.Debug("gemini cli executor: rate limited, no additional fallback model")
 			}
 			continue
 		}
 			// Handle 429 rate limit errors with retry
 			if httpResp.StatusCode == 429 {
 				if retryCount < maxRetries {
 					// Parse retry delay from Google's response
 					retryDelay := parseRetryDelay(data)
 					log.Infof("gemini cli executor: rate limited (429), retrying model %s in %v (attempt %d/%d)", attemptModel, retryDelay, retryCount+1, maxRetries)
 					// Wait for the specified delay
 					select {
 					case <-time.After(retryDelay):
 						// Continue to next retry iteration
 						continue
 					case <-ctx.Done():
 						// Context cancelled, return immediately
 						err = ctx.Err()
 						return resp, err
 					}
 				} else {
 					// Exhausted retries for this model, try next model if available
 					if idx+1 < len(models) {
 						log.Infof("gemini cli executor: rate limited, exhausted %d retries for model %s, trying fallback model: %s", maxRetries, attemptModel, models[idx+1])
 						break // Break inner loop to try next model
 					} else {
 						log.Infof("gemini cli executor: rate limited, exhausted %d retries for model %s, no additional fallback model", maxRetries, attemptModel)
 						// No more models to try, will return error below
 					}
 				}
 			} else {
 				// Non-429 error, don't retry this model
 				err = statusErr{code: httpResp.StatusCode, msg: string(data)}
 				return resp, err
 			}
 			// Break inner loop if we hit this point (no retry needed or exhausted retries)
 			break
 		}
 	}
 	if len(lastBody) > 0 {
 		appendAPIResponseChunk(ctx, e.cfg, lastBody)
 	}
@@ -235,8 +269,20 @@ func (e *GeminiCLIExecutor) ExecuteStream(ctx context.Context, auth *cliproxyaut
 	var lastStatus int
 	var lastBody []byte
 	// Get max retry count from config, default to 3 if not set
 	maxRetries := e.cfg.RequestRetry
 	if maxRetries <= 0 {
 		maxRetries = 3
 	}
 	for idx, attemptModel := range models {
-		payload := append([]byte(nil), basePayload...)
+		var httpResp *http.Response
 		var payload []byte
 		var errDo error
 		// Inner retry loop for 429 errors on the same model
 		for retryCount := 0; retryCount <= maxRetries; retryCount++ {
 			payload = append([]byte(nil), basePayload...)
 			payload = setJSONField(payload, "project", projectID)
 			payload = setJSONField(payload, "model", attemptModel)
@@ -275,7 +321,7 @@ func (e *GeminiCLIExecutor) ExecuteStream(ctx context.Context, auth *cliproxyaut
 				AuthValue: authValue,
 			})
-		httpResp, errDo := httpClient.Do(reqHTTP)
+			httpResp, errDo = httpClient.Do(reqHTTP)
 			if errDo != nil {
 				recordAPIResponseError(ctx, e.cfg, errDo)
 				err = errDo
@@ -296,18 +342,49 @@ func (e *GeminiCLIExecutor) ExecuteStream(ctx context.Context, auth *cliproxyaut
 				lastStatus = httpResp.StatusCode
 				lastBody = append([]byte(nil), data...)
 				log.Debugf("request error, error status: %d, error body: %s", httpResp.StatusCode, summarizeErrorBody(httpResp.Header.Get("Content-Type"), data))
 				// Handle 429 rate limit errors with retry
 				if httpResp.StatusCode == 429 {
-				if idx+1 < len(models) {
+					if retryCount < maxRetries {
-					log.Debugf("gemini cli executor: rate limited, retrying with next model: %s", models[idx+1])
+						// Parse retry delay from Google's response
-				} else {
+						retryDelay := parseRetryDelay(data)
-					log.Debug("gemini cli executor: rate limited, no additional fallback model")
+						log.Infof("gemini cli executor: rate limited (429), retrying stream model %s in %v (attempt %d/%d)", attemptModel, retryDelay, retryCount+1, maxRetries)
-				}
+
 						// Wait for the specified delay
 						select {
 						case <-time.After(retryDelay):
 							// Continue to next retry iteration
 							continue
 						case <-ctx.Done():
 							// Context cancelled, return immediately
 							err = ctx.Err()
 							return nil, err
 						}
 					} else {
 						// Exhausted retries for this model, try next model if available
 						if idx+1 < len(models) {
 							log.Infof("gemini cli executor: rate limited, exhausted %d retries for stream model %s, trying fallback model: %s", maxRetries, attemptModel, models[idx+1])
 							break // Break inner loop to try next model
 						} else {
 							log.Infof("gemini cli executor: rate limited, exhausted %d retries for stream model %s, no additional fallback model", maxRetries, attemptModel)
 							// No more models to try, will return error below
 						}
 					}
 				} else {
 					// Non-429 error, don't retry this model
 					err = statusErr{code: httpResp.StatusCode, msg: string(data)}
 					return nil, err
 				}
 				// Break inner loop if we hit this point (no retry needed or exhausted retries)
 				break
 			}
 			// Success - httpResp.StatusCode is 2xx, break out of retry loop
 			// and proceed to streaming logic below
 			break
 		}
 		out := make(chan cliproxyexecutor.StreamChunk)
 		stream = out
 		go func(resp *http.Response, reqBody []byte, attempt string) {
@@ -769,3 +846,48 @@ func fixGeminiCLIImageAspectRatio(modelName string, rawJSON []byte) []byte {
 	}
 	return rawJSON
 }
 // parseRetryDelay reads the wait time advertised in a Google API 429 error body.
 //
 // It scans the error.details array for an entry whose @type is
 // "type.googleapis.com/google.rpc.RetryInfo" and parses that entry's
 // retryDelay string (for example "0.847655010s") as a Go duration.
 //
 // The result is:
 //   - the parsed duration when it lies in [0, 60s];
 //   - 60s when the parsed duration exceeds 60s;
 //   - 1s when error.details is missing or not an array, when no RetryInfo
 //     entry carries a non-empty retryDelay, when the string cannot be
 //     parsed, or when the parsed duration is negative.
 func parseRetryDelay(errorBody []byte) time.Duration {
 	const (
 		fallbackWait  = 1 * time.Second
 		ceilingWait   = 60 * time.Second
 		retryInfoType = "type.googleapis.com/google.rpc.RetryInfo"
 	)
 	// Expected location: error.details[] entries tagged with the RetryInfo @type.
 	detailList := gjson.GetBytes(errorBody, "error.details")
 	if !(detailList.Exists() && detailList.IsArray()) {
 		log.Debugf("parseRetryDelay: no error.details found, using default delay %v", fallbackWait)
 		return fallbackWait
 	}
 	for _, entry := range detailList.Array() {
 		// Skip detail entries of any other kind.
 		if entry.Get("@type").String() != retryInfoType {
 			continue
 		}
 		raw := entry.Get("retryDelay").String()
 		if raw == "" {
 			// A RetryInfo entry without a delay: keep scanning the remaining entries.
 			continue
 		}
 		// The delay is a duration string such as "0.847655010s".
 		wait, parseErr := time.ParseDuration(raw)
 		if parseErr != nil {
 			log.Debugf("parseRetryDelay: failed to parse duration %q: %v, using default", raw, parseErr)
 			return fallbackWait
 		}
 		switch {
 		case wait > ceilingWait:
 			// Clamp so a large server-provided value cannot stall the caller.
 			log.Debugf("parseRetryDelay: capping delay from %v to %v", wait, ceilingWait)
 			return ceilingWait
 		case wait < 0:
 			log.Debugf("parseRetryDelay: negative delay %v, using default", wait)
 			return fallbackWait
 		}
 		log.Debugf("parseRetryDelay: using delay %v from API response", wait)
 		return wait
 	}
 	log.Debugf("parseRetryDelay: no RetryInfo found, using default delay %v", fallbackWait)
 	return fallbackWait
 }