feat(daemon): add adaptive backoff for heartbeat pokes
Implements per-agent backoff tracking to reduce noise for busy agents: - AgentBackoff type tracks interval, miss count, and last activity - BackoffManager manages state across all agents - Geometric backoff strategy (1.5x factor, 10min cap) - Integrates with keepalive to skip pokes when agents are fresh - Resets backoff immediately when activity detected Closes gt-8bx 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
187
internal/daemon/backoff.go
Normal file
187
internal/daemon/backoff.go
Normal file
@@ -0,0 +1,187 @@
|
||||
package daemon
|
||||
|
||||
import (
|
||||
"time"
|
||||
)
|
||||
|
||||
// BackoffStrategy defines how intervals grow.
|
||||
type BackoffStrategy string
|
||||
|
||||
const (
|
||||
// StrategyFixed keeps the same interval (no backoff).
|
||||
StrategyFixed BackoffStrategy = "fixed"
|
||||
|
||||
// StrategyGeometric multiplies by a factor each miss (1.5x).
|
||||
StrategyGeometric BackoffStrategy = "geometric"
|
||||
|
||||
// StrategyExponential doubles interval each miss (2x).
|
||||
StrategyExponential BackoffStrategy = "exponential"
|
||||
)
|
||||
|
||||
// BackoffConfig holds backoff configuration.
|
||||
type BackoffConfig struct {
|
||||
// Strategy determines how intervals grow.
|
||||
Strategy BackoffStrategy
|
||||
|
||||
// BaseInterval is the starting interval (default 60s).
|
||||
BaseInterval time.Duration
|
||||
|
||||
// MaxInterval is the cap on how large intervals can grow (default 10m).
|
||||
MaxInterval time.Duration
|
||||
|
||||
// Factor is the multiplier for geometric backoff (default 1.5).
|
||||
Factor float64
|
||||
}
|
||||
|
||||
// DefaultBackoffConfig returns sensible defaults.
|
||||
func DefaultBackoffConfig() *BackoffConfig {
|
||||
return &BackoffConfig{
|
||||
Strategy: StrategyGeometric,
|
||||
BaseInterval: 60 * time.Second,
|
||||
MaxInterval: 10 * time.Minute,
|
||||
Factor: 1.5,
|
||||
}
|
||||
}
|
||||
|
||||
// AgentBackoff tracks backoff state for a single agent.
|
||||
type AgentBackoff struct {
|
||||
// AgentID identifies the agent (e.g., "mayor", "gastown-witness").
|
||||
AgentID string
|
||||
|
||||
// BaseInterval is the starting interval.
|
||||
BaseInterval time.Duration
|
||||
|
||||
// CurrentInterval is the current (possibly backed-off) interval.
|
||||
CurrentInterval time.Duration
|
||||
|
||||
// MaxInterval caps how large intervals can grow.
|
||||
MaxInterval time.Duration
|
||||
|
||||
// ConsecutiveMiss counts pokes with no response.
|
||||
ConsecutiveMiss int
|
||||
|
||||
// LastPoke is when we last poked this agent.
|
||||
LastPoke time.Time
|
||||
|
||||
// LastActivity is when the agent last showed activity.
|
||||
LastActivity time.Time
|
||||
}
|
||||
|
||||
// NewAgentBackoff creates backoff state for an agent.
|
||||
func NewAgentBackoff(agentID string, config *BackoffConfig) *AgentBackoff {
|
||||
if config == nil {
|
||||
config = DefaultBackoffConfig()
|
||||
}
|
||||
return &AgentBackoff{
|
||||
AgentID: agentID,
|
||||
BaseInterval: config.BaseInterval,
|
||||
CurrentInterval: config.BaseInterval,
|
||||
MaxInterval: config.MaxInterval,
|
||||
}
|
||||
}
|
||||
|
||||
// ShouldPoke returns true if enough time has passed since the last poke.
|
||||
func (ab *AgentBackoff) ShouldPoke() bool {
|
||||
if ab.LastPoke.IsZero() {
|
||||
return true // Never poked
|
||||
}
|
||||
return time.Since(ab.LastPoke) >= ab.CurrentInterval
|
||||
}
|
||||
|
||||
// RecordPoke records that we poked the agent.
|
||||
func (ab *AgentBackoff) RecordPoke() {
|
||||
ab.LastPoke = time.Now()
|
||||
}
|
||||
|
||||
// RecordMiss records that the agent didn't respond since last poke.
|
||||
// This increases the backoff interval.
|
||||
func (ab *AgentBackoff) RecordMiss(config *BackoffConfig) {
|
||||
ab.ConsecutiveMiss++
|
||||
|
||||
if config == nil {
|
||||
config = DefaultBackoffConfig()
|
||||
}
|
||||
|
||||
switch config.Strategy {
|
||||
case StrategyFixed:
|
||||
// No change
|
||||
case StrategyGeometric:
|
||||
ab.CurrentInterval = time.Duration(float64(ab.CurrentInterval) * config.Factor)
|
||||
case StrategyExponential:
|
||||
ab.CurrentInterval = ab.CurrentInterval * 2
|
||||
}
|
||||
|
||||
// Cap at max interval
|
||||
if ab.CurrentInterval > ab.MaxInterval {
|
||||
ab.CurrentInterval = ab.MaxInterval
|
||||
}
|
||||
}
|
||||
|
||||
// RecordActivity records that the agent showed activity.
|
||||
// This resets the backoff to the base interval.
|
||||
func (ab *AgentBackoff) RecordActivity() {
|
||||
ab.ConsecutiveMiss = 0
|
||||
ab.CurrentInterval = ab.BaseInterval
|
||||
ab.LastActivity = time.Now()
|
||||
}
|
||||
|
||||
// BackoffManager tracks backoff state for all agents.
|
||||
type BackoffManager struct {
|
||||
config *BackoffConfig
|
||||
agents map[string]*AgentBackoff
|
||||
}
|
||||
|
||||
// NewBackoffManager creates a new backoff manager.
|
||||
func NewBackoffManager(config *BackoffConfig) *BackoffManager {
|
||||
if config == nil {
|
||||
config = DefaultBackoffConfig()
|
||||
}
|
||||
return &BackoffManager{
|
||||
config: config,
|
||||
agents: make(map[string]*AgentBackoff),
|
||||
}
|
||||
}
|
||||
|
||||
// GetOrCreate returns backoff state for an agent, creating if needed.
|
||||
func (bm *BackoffManager) GetOrCreate(agentID string) *AgentBackoff {
|
||||
if ab, ok := bm.agents[agentID]; ok {
|
||||
return ab
|
||||
}
|
||||
ab := NewAgentBackoff(agentID, bm.config)
|
||||
bm.agents[agentID] = ab
|
||||
return ab
|
||||
}
|
||||
|
||||
// ShouldPoke returns true if we should poke the given agent.
|
||||
func (bm *BackoffManager) ShouldPoke(agentID string) bool {
|
||||
return bm.GetOrCreate(agentID).ShouldPoke()
|
||||
}
|
||||
|
||||
// RecordPoke records that we poked an agent.
|
||||
func (bm *BackoffManager) RecordPoke(agentID string) {
|
||||
bm.GetOrCreate(agentID).RecordPoke()
|
||||
}
|
||||
|
||||
// RecordMiss records that an agent didn't respond.
|
||||
func (bm *BackoffManager) RecordMiss(agentID string) {
|
||||
bm.GetOrCreate(agentID).RecordMiss(bm.config)
|
||||
}
|
||||
|
||||
// RecordActivity records that an agent showed activity.
|
||||
func (bm *BackoffManager) RecordActivity(agentID string) {
|
||||
bm.GetOrCreate(agentID).RecordActivity()
|
||||
}
|
||||
|
||||
// GetInterval returns the current interval for an agent.
|
||||
func (bm *BackoffManager) GetInterval(agentID string) time.Duration {
|
||||
return bm.GetOrCreate(agentID).CurrentInterval
|
||||
}
|
||||
|
||||
// Stats returns a map of agent ID to current interval for logging.
|
||||
func (bm *BackoffManager) Stats() map[string]time.Duration {
|
||||
stats := make(map[string]time.Duration, len(bm.agents))
|
||||
for id, ab := range bm.agents {
|
||||
stats[id] = ab.CurrentInterval
|
||||
}
|
||||
return stats
|
||||
}
|
||||
Reference in New Issue
Block a user