fix(server): resolve memory leaks causing OOM in k8s deployment

- usage/logger_plugin: cap modelStats.Details at 1000 entries per model
- cache/signature_cache: add background cleanup for expired sessions (10 min)
- management/handler: add background cleanup for stale IP rate-limit entries (1 hr)
- executor/cache_helpers: add mutex protection and TTL cleanup for codexCacheMap (15 min)
- executor/codex_executor: use thread-safe cache accessors

Add reproduction tests demonstrating leak behavior before/after fixes.

Amp-Thread-ID: https://ampcode.com/threads/T-019ba0fc-1d7b-7338-8e1d-ca0520412777
Co-authored-by: Amp <amp@ampcode.com>
This commit is contained in:
hemanta212
2026-01-09 13:33:46 +05:45
parent d47b7dc79a
commit 47dacce6ea
7 changed files with 526 additions and 6 deletions

View File

@@ -24,8 +24,15 @@ import (
type attemptInfo struct {
count int
blockedUntil time.Time
lastActivity time.Time // track last activity for cleanup
}
// attemptCleanupInterval controls how often stale IP entries are purged
const attemptCleanupInterval = 1 * time.Hour
// attemptMaxIdleTime controls how long an IP can be idle before cleanup
const attemptMaxIdleTime = 2 * time.Hour
// Handler aggregates config reference, persistence path and helpers.
type Handler struct {
cfg *config.Config
@@ -47,7 +54,7 @@ func NewHandler(cfg *config.Config, configFilePath string, manager *coreauth.Man
envSecret, _ := os.LookupEnv("MANAGEMENT_PASSWORD")
envSecret = strings.TrimSpace(envSecret)
return &Handler{
h := &Handler{
cfg: cfg,
configFilePath: configFilePath,
failedAttempts: make(map[string]*attemptInfo),
@@ -57,6 +64,38 @@ func NewHandler(cfg *config.Config, configFilePath string, manager *coreauth.Man
allowRemoteOverride: envSecret != "",
envSecret: envSecret,
}
h.startAttemptCleanup()
return h
}
// startAttemptCleanup launches a background goroutine that periodically
// removes stale IP entries from failedAttempts to prevent memory leaks.
func (h *Handler) startAttemptCleanup() {
go func() {
ticker := time.NewTicker(attemptCleanupInterval)
defer ticker.Stop()
for range ticker.C {
h.purgeStaleAttempts()
}
}()
}
// purgeStaleAttempts removes IP entries that have been idle beyond attemptMaxIdleTime
// and whose ban (if any) has expired.
func (h *Handler) purgeStaleAttempts() {
now := time.Now()
h.attemptsMu.Lock()
defer h.attemptsMu.Unlock()
for ip, ai := range h.failedAttempts {
// Skip if still banned
if !ai.blockedUntil.IsZero() && now.Before(ai.blockedUntil) {
continue
}
// Remove if idle too long
if now.Sub(ai.lastActivity) > attemptMaxIdleTime {
delete(h.failedAttempts, ip)
}
}
}
// NewHandler creates a new management handler instance.
@@ -149,6 +188,7 @@ func (h *Handler) Middleware() gin.HandlerFunc {
h.failedAttempts[clientIP] = aip
}
aip.count++
aip.lastActivity = time.Now()
if aip.count >= maxFailures {
aip.blockedUntil = time.Now().Add(banDuration)
aip.count = 0