Remove StateIdle and idle polecat concept
The transient polecat model says: "Polecats exist only while working."

This removes the deprecated StateIdle and updates the codebase:

- Remove StateIdle from polecat/types.go (keep StateActive for legacy data)
- Update manager.go: Get() returns StateDone (not StateIdle) when no work
- Update manager.go: Add/Recreate return StateWorking (not StateIdle)
- Remove zombie scan logic from deacon.go (no idle polecats to scan for)
- Update tests to reflect new behavior

The correct lifecycle is now:

- Spawn: polecat created with work (StateWorking)
- Work: sessions cycle, sandbox persists
- Done: polecat signals completion (StateDone)
- Nuke: Witness destroys sandbox

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -186,33 +186,6 @@ This helps the Deacon understand which agents may need attention.`,
|
||||
RunE: runDeaconHealthState,
|
||||
}
|
||||
|
||||
var deaconZombieScanCmd = &cobra.Command{
|
||||
Use: "zombie-scan [rig]",
|
||||
Short: "Scan for idle polecats that should have been nuked",
|
||||
Long: `Backup check for polecats the Witness should have cleaned up.
|
||||
|
||||
Scans for "zombie" polecats that meet ALL of these criteria:
|
||||
- State: idle or done (no active work)
|
||||
- Session: not running (tmux session dead)
|
||||
- No hooked work
|
||||
- Last activity: older than threshold (default 10 minutes)
|
||||
|
||||
These are polecats that the Witness should have nuked but didn't.
|
||||
This provides defense-in-depth against Witness failures.
|
||||
|
||||
Actions:
|
||||
1. Log warning about witness failure
|
||||
2. Nuke the zombie polecat directly
|
||||
3. Notify mayor of witness issue (optional)
|
||||
|
||||
Examples:
|
||||
gt deacon zombie-scan # Scan all rigs
|
||||
gt deacon zombie-scan gastown # Scan specific rig
|
||||
gt deacon zombie-scan --dry-run # Preview only
|
||||
gt deacon zombie-scan --threshold=5m # Custom staleness threshold`,
|
||||
Args: cobra.MaximumNArgs(1),
|
||||
RunE: runDeaconZombieScan,
|
||||
}
|
||||
|
||||
var (
|
||||
triggerTimeout time.Duration
|
||||
@@ -225,11 +198,6 @@ var (
|
||||
// Force kill flags
|
||||
forceKillReason string
|
||||
forceKillSkipNotify bool
|
||||
|
||||
// Zombie scan flags
|
||||
zombieScanDryRun bool
|
||||
zombieScanThreshold time.Duration
|
||||
zombieScanNuke bool
|
||||
)
|
||||
|
||||
func init() {
|
||||
@@ -243,7 +211,6 @@ func init() {
|
||||
deaconCmd.AddCommand(deaconHealthCheckCmd)
|
||||
deaconCmd.AddCommand(deaconForceKillCmd)
|
||||
deaconCmd.AddCommand(deaconHealthStateCmd)
|
||||
deaconCmd.AddCommand(deaconZombieScanCmd)
|
||||
|
||||
// Flags for trigger-pending
|
||||
deaconTriggerPendingCmd.Flags().DurationVar(&triggerTimeout, "timeout", 2*time.Second,
|
||||
@@ -263,14 +230,6 @@ func init() {
|
||||
deaconForceKillCmd.Flags().BoolVar(&forceKillSkipNotify, "skip-notify", false,
|
||||
"Skip sending notification mail to mayor")
|
||||
|
||||
// Flags for zombie-scan
|
||||
deaconZombieScanCmd.Flags().BoolVarP(&zombieScanDryRun, "dry-run", "n", false,
|
||||
"Show what would be done without nuking")
|
||||
deaconZombieScanCmd.Flags().DurationVar(&zombieScanThreshold, "threshold", 10*time.Minute,
|
||||
"Staleness threshold for zombie detection")
|
||||
deaconZombieScanCmd.Flags().BoolVar(&zombieScanNuke, "nuke", true,
|
||||
"Nuke detected zombies (use --nuke=false to report only)")
|
||||
|
||||
rootCmd.AddCommand(deaconCmd)
|
||||
}
|
||||
|
||||
@@ -865,258 +824,6 @@ func runDeaconHealthState(cmd *cobra.Command, args []string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// runDeaconZombieScan scans for idle polecats that should have been nuked by the Witness.
|
||||
// This is a defense-in-depth backup check.
|
||||
func runDeaconZombieScan(cmd *cobra.Command, args []string) error {
|
||||
townRoot, err := workspace.FindFromCwdOrError()
|
||||
if err != nil {
|
||||
return fmt.Errorf("not in a Gas Town workspace: %w", err)
|
||||
}
|
||||
|
||||
t := tmux.NewTmux()
|
||||
|
||||
// Get list of rigs to scan
|
||||
var rigsToScan []string
|
||||
if len(args) > 0 {
|
||||
rigsToScan = []string{args[0]}
|
||||
} else {
|
||||
// Scan all rigs by finding directories with polecats/ subdirectories
|
||||
entries, err := os.ReadDir(townRoot)
|
||||
if err != nil {
|
||||
return fmt.Errorf("reading town root: %w", err)
|
||||
}
|
||||
for _, entry := range entries {
|
||||
if !entry.IsDir() {
|
||||
continue
|
||||
}
|
||||
// Skip non-rig directories
|
||||
if entry.Name() == "deacon" || entry.Name() == "mayor" ||
|
||||
entry.Name() == "plugins" || entry.Name() == "docs" ||
|
||||
strings.HasPrefix(entry.Name(), ".") {
|
||||
continue
|
||||
}
|
||||
// Check if it has a polecats directory
|
||||
polecatsDir := filepath.Join(townRoot, entry.Name(), "polecats")
|
||||
if info, err := os.Stat(polecatsDir); err == nil && info.IsDir() {
|
||||
rigsToScan = append(rigsToScan, entry.Name())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if len(rigsToScan) == 0 {
|
||||
fmt.Printf("%s No rigs found to scan\n", style.Dim.Render("○"))
|
||||
return nil
|
||||
}
|
||||
|
||||
fmt.Printf("%s Scanning for zombie polecats (threshold: %s)...\n",
|
||||
style.Bold.Render("🧟"), zombieScanThreshold)
|
||||
|
||||
var zombies []zombieInfo
|
||||
for _, rigName := range rigsToScan {
|
||||
rigZombies, err := scanRigForZombies(townRoot, rigName, t)
|
||||
if err != nil {
|
||||
style.PrintWarning("failed to scan rig %s: %v", rigName, err)
|
||||
continue
|
||||
}
|
||||
zombies = append(zombies, rigZombies...)
|
||||
}
|
||||
|
||||
if len(zombies) == 0 {
|
||||
fmt.Printf("%s No zombies found (all polecats healthy)\n", style.Bold.Render("✓"))
|
||||
return nil
|
||||
}
|
||||
|
||||
// Report zombies
|
||||
fmt.Printf("\n%s Found %d zombie(s):\n\n", style.Bold.Render("⚠"), len(zombies))
|
||||
for _, z := range zombies {
|
||||
fmt.Printf(" %s %s/%s\n", style.Dim.Render("🧟"), z.rig, z.name)
|
||||
fmt.Printf(" State: %s, Session: %s\n", z.state, z.sessionStatus)
|
||||
fmt.Printf(" Hooked work: %s\n", z.hookedWork)
|
||||
fmt.Printf(" Last activity: %s ago\n", z.staleness.Round(time.Second))
|
||||
fmt.Printf(" Reason: %s\n", z.reason)
|
||||
fmt.Println()
|
||||
}
|
||||
|
||||
// Nuke zombies if enabled
|
||||
if zombieScanNuke && !zombieScanDryRun {
|
||||
fmt.Printf("%s Nuking zombies...\n", style.Bold.Render("💀"))
|
||||
for _, z := range zombies {
|
||||
if err := nukeZombie(townRoot, z, t); err != nil {
|
||||
style.PrintWarning("failed to nuke %s/%s: %v", z.rig, z.name, err)
|
||||
} else {
|
||||
fmt.Printf(" %s Nuked %s/%s\n", style.Bold.Render("✓"), z.rig, z.name)
|
||||
}
|
||||
}
|
||||
|
||||
// Notify mayor about witness failure
|
||||
notifyMayorOfWitnessFailure(townRoot, zombies)
|
||||
} else if zombieScanDryRun {
|
||||
fmt.Printf("%s Dry run - would nuke %d zombie(s)\n", style.Dim.Render("ℹ"), len(zombies))
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// zombieInfo holds information about a detected zombie polecat.
|
||||
type zombieInfo struct {
|
||||
rig string
|
||||
name string
|
||||
state string
|
||||
sessionStatus string
|
||||
hookedWork string
|
||||
staleness time.Duration
|
||||
reason string
|
||||
sessionName string
|
||||
}
|
||||
|
||||
// scanRigForZombies scans a rig for zombie polecats.
|
||||
func scanRigForZombies(townRoot, rigName string, t *tmux.Tmux) ([]zombieInfo, error) {
|
||||
rigPath := filepath.Join(townRoot, rigName)
|
||||
polecatsDir := filepath.Join(rigPath, "polecats")
|
||||
|
||||
entries, err := os.ReadDir(polecatsDir)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
return nil, nil // No polecats dir
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var zombies []zombieInfo
|
||||
for _, entry := range entries {
|
||||
if !entry.IsDir() {
|
||||
continue
|
||||
}
|
||||
name := entry.Name()
|
||||
|
||||
// Build session name for this polecat
|
||||
sessionName := fmt.Sprintf("gt-%s-%s", rigName, name)
|
||||
|
||||
// Check if session is running
|
||||
sessionRunning, _ := t.HasSession(sessionName)
|
||||
|
||||
// Check for hooked work
|
||||
hookedWork := checkPolecatHookedWork(townRoot, rigName, name)
|
||||
|
||||
// Get last activity time from polecat directory
|
||||
polecatPath := filepath.Join(polecatsDir, name)
|
||||
staleness := getPolecatStaleness(polecatPath)
|
||||
|
||||
// Determine if this is a zombie
|
||||
state := "unknown"
|
||||
if sessionRunning {
|
||||
state = "session_running"
|
||||
continue // Not a zombie if session is running
|
||||
}
|
||||
state = "session_dead"
|
||||
|
||||
// Check all zombie criteria
|
||||
if hookedWork != "" {
|
||||
// Has hooked work - not a zombie (just needs to be started)
|
||||
continue
|
||||
}
|
||||
|
||||
if staleness < zombieScanThreshold {
|
||||
// Recently active - not stale enough
|
||||
continue
|
||||
}
|
||||
|
||||
// This is a zombie
|
||||
zombies = append(zombies, zombieInfo{
|
||||
rig: rigName,
|
||||
name: name,
|
||||
state: state,
|
||||
sessionStatus: "not running",
|
||||
hookedWork: "none",
|
||||
staleness: staleness,
|
||||
reason: fmt.Sprintf("idle for %s with no session or hooked work", staleness.Round(time.Minute)),
|
||||
sessionName: sessionName,
|
||||
})
|
||||
}
|
||||
|
||||
return zombies, nil
|
||||
}
|
||||
|
||||
// checkPolecatHookedWork checks if a polecat has hooked work.
|
||||
func checkPolecatHookedWork(townRoot, rigName, polecatName string) string {
|
||||
// Query beads for hooked issues assigned to this polecat
|
||||
assignee := fmt.Sprintf("%s/polecats/%s", rigName, polecatName)
|
||||
cmd := exec.Command("bd", "list", "--status=hooked", "--assignee="+assignee, "--json")
|
||||
cmd.Dir = townRoot
|
||||
|
||||
output, err := cmd.Output()
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
|
||||
var issues []struct {
|
||||
ID string `json:"id"`
|
||||
Title string `json:"title"`
|
||||
}
|
||||
if err := json.Unmarshal(output, &issues); err != nil || len(issues) == 0 {
|
||||
return ""
|
||||
}
|
||||
|
||||
return issues[0].ID
|
||||
}
|
||||
|
||||
// getPolecatStaleness returns how long since the polecat was last active.
|
||||
func getPolecatStaleness(polecatPath string) time.Duration {
|
||||
// Check .beads/last-touched if it exists
|
||||
lastTouchedPath := filepath.Join(polecatPath, ".beads", "last-touched")
|
||||
if info, err := os.Stat(lastTouchedPath); err == nil {
|
||||
return time.Since(info.ModTime())
|
||||
}
|
||||
|
||||
// Fall back to directory modification time
|
||||
if info, err := os.Stat(polecatPath); err == nil {
|
||||
return time.Since(info.ModTime())
|
||||
}
|
||||
|
||||
// Very stale if we can't determine
|
||||
return 24 * time.Hour
|
||||
}
|
||||
|
||||
// nukeZombie cleans up a zombie polecat.
|
||||
func nukeZombie(townRoot string, z zombieInfo, t *tmux.Tmux) error { //nolint:unparam // error return kept for future use
|
||||
// Step 1: Kill tmux session if somehow still exists
|
||||
if exists, _ := t.HasSession(z.sessionName); exists {
|
||||
_ = t.KillSession(z.sessionName)
|
||||
}
|
||||
|
||||
// Step 2: Run gt polecat nuke to clean up
|
||||
cmd := exec.Command("gt", "polecat", "nuke", z.name, "--rig="+z.rig, "--force")
|
||||
cmd.Dir = townRoot
|
||||
if err := cmd.Run(); err != nil {
|
||||
// Non-fatal - polecat might already be cleaned up
|
||||
style.PrintWarning("polecat nuke returned error (may be already cleaned): %v", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// notifyMayorOfWitnessFailure notifies the mayor about witness cleanup failures.
|
||||
func notifyMayorOfWitnessFailure(townRoot string, zombies []zombieInfo) {
|
||||
if len(zombies) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
// Group by rig
|
||||
rigCounts := make(map[string]int)
|
||||
for _, z := range zombies {
|
||||
rigCounts[z.rig]++
|
||||
}
|
||||
|
||||
var details strings.Builder
|
||||
details.WriteString("Deacon detected zombie polecats that Witness should have cleaned:\n\n")
|
||||
for rig, count := range rigCounts {
|
||||
details.WriteString(fmt.Sprintf("- %s: %d zombie(s)\n", rig, count))
|
||||
}
|
||||
details.WriteString("\nDeacon has nuked them directly. Check Witness health.")
|
||||
|
||||
sendMail(townRoot, "mayor/", "⚠️ Witness cleanup failure detected", details.String())
|
||||
}
|
||||
|
||||
// agentAddressToIDs converts an agent address to bead ID and session name.
|
||||
// Supports formats: "gastown/polecats/max", "gastown/witness", "deacon", "mayor"
|
||||
// Note: Town-level agents (Mayor, Deacon) use hq- prefix bead IDs stored in town beads.
|
||||
|
||||
@@ -259,13 +259,13 @@ func (m *Manager) AddWithOptions(name string, opts AddOptions) (*Polecat, error)
|
||||
fmt.Printf("Warning: could not create agent bead: %v\n", err)
|
||||
}
|
||||
|
||||
// Return polecat with derived state (no issue assigned yet = idle)
|
||||
// Return polecat with working state (transient model: polecats are spawned with work)
|
||||
// State is derived from beads, not stored in state.json
|
||||
now := time.Now()
|
||||
polecat := &Polecat{
|
||||
Name: name,
|
||||
Rig: m.rig.Name,
|
||||
State: StateIdle, // No issue assigned yet
|
||||
State: StateWorking, // Transient model: polecat spawns with work
|
||||
ClonePath: polecatPath,
|
||||
Branch: branchName,
|
||||
CreatedAt: now,
|
||||
@@ -478,12 +478,12 @@ func (m *Manager) RepairWorktreeWithOptions(name string, force bool, opts AddOpt
|
||||
fmt.Printf("Warning: could not create agent bead: %v\n", err)
|
||||
}
|
||||
|
||||
// Return fresh polecat
|
||||
// Return fresh polecat in working state (transient model: polecats are spawned with work)
|
||||
now := time.Now()
|
||||
return &Polecat{
|
||||
Name: name,
|
||||
Rig: m.rig.Name,
|
||||
State: StateIdle,
|
||||
State: StateWorking,
|
||||
ClonePath: polecatPath,
|
||||
Branch: branchName,
|
||||
CreatedAt: now,
|
||||
@@ -543,9 +543,8 @@ func (m *Manager) List() ([]*Polecat, error) {
|
||||
|
||||
// Get returns a specific polecat by name.
|
||||
// State is derived from beads assignee field:
|
||||
// - If an issue is assigned to this polecat and is open/in_progress: StateWorking
|
||||
// - If an issue is assigned but closed: StateDone
|
||||
// - If no issue assigned: StateIdle
|
||||
// - If an issue is assigned to this polecat: StateWorking
|
||||
// - If no issue assigned: StateDone (ready for cleanup - transient polecats should have work)
|
||||
func (m *Manager) Get(name string) (*Polecat, error) {
|
||||
if !m.exists(name) {
|
||||
return nil, ErrPolecatNotFound
|
||||
@@ -557,7 +556,7 @@ func (m *Manager) Get(name string) (*Polecat, error) {
|
||||
// SetState updates a polecat's state.
|
||||
// In the beads model, state is derived from issue status:
|
||||
// - StateWorking/StateActive: issue status set to in_progress
|
||||
// - StateDone/StateIdle: assignee cleared from issue
|
||||
// - StateDone: assignee cleared from issue (polecat ready for cleanup)
|
||||
// - StateStuck: issue status set to blocked (if supported)
|
||||
// If beads is not available, this is a no-op.
|
||||
func (m *Manager) SetState(name string, state State) error {
|
||||
@@ -582,8 +581,8 @@ func (m *Manager) SetState(name string, state State) error {
|
||||
return fmt.Errorf("setting issue status: %w", err)
|
||||
}
|
||||
}
|
||||
case StateDone, StateIdle:
|
||||
// Clear assignment when done/idle
|
||||
case StateDone:
|
||||
// Clear assignment when done (polecat ready for cleanup)
|
||||
if issue != nil {
|
||||
empty := ""
|
||||
if err := m.beads.Update(issue.ID, beads.UpdateOptions{Assignee: &empty}); err != nil {
|
||||
@@ -654,7 +653,8 @@ func (m *Manager) ClearIssue(name string) error {
|
||||
}
|
||||
|
||||
// loadFromBeads gets polecat info from beads assignee field.
|
||||
// State is simple: issue assigned → working, no issue → idle.
|
||||
// State is simple: issue assigned → working, no issue → done (ready for cleanup).
|
||||
// Transient polecats should always have work; no work means ready for Witness cleanup.
|
||||
// We don't interpret issue status (ZFC: Go is transport, not decision-maker).
|
||||
func (m *Manager) loadFromBeads(name string) (*Polecat, error) {
|
||||
polecatPath := m.polecatDir(name)
|
||||
@@ -671,20 +671,20 @@ func (m *Manager) loadFromBeads(name string) (*Polecat, error) {
|
||||
assignee := m.assigneeID(name)
|
||||
issue, beadsErr := m.beads.GetAssignedIssue(assignee)
|
||||
if beadsErr != nil {
|
||||
// If beads query fails, return basic polecat info
|
||||
// This allows the system to work even if beads is not available
|
||||
// If beads query fails, return basic polecat info as working
|
||||
// (assume polecat is doing something if it exists)
|
||||
return &Polecat{
|
||||
Name: name,
|
||||
Rig: m.rig.Name,
|
||||
State: StateIdle,
|
||||
State: StateWorking,
|
||||
ClonePath: polecatPath,
|
||||
Branch: branchName,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Simple rule: has issue = working, no issue = idle
|
||||
// We don't interpret issue.Status - that's for Claude to decide
|
||||
state := StateIdle
|
||||
// Transient model: has issue = working, no issue = done (ready for cleanup)
|
||||
// Polecats without work should be nuked by the Witness
|
||||
state := StateDone
|
||||
issueID := ""
|
||||
if issue != nil {
|
||||
issueID = issue.ID
|
||||
|
||||
@@ -17,8 +17,7 @@ func TestStateIsActive(t *testing.T) {
|
||||
{StateWorking, true},
|
||||
{StateDone, false},
|
||||
{StateStuck, false},
|
||||
// Legacy states are treated as active
|
||||
{StateIdle, true},
|
||||
// Legacy active state is treated as active
|
||||
{StateActive, true},
|
||||
}
|
||||
|
||||
@@ -34,7 +33,6 @@ func TestStateIsWorking(t *testing.T) {
|
||||
state State
|
||||
working bool
|
||||
}{
|
||||
{StateIdle, false},
|
||||
{StateActive, false},
|
||||
{StateWorking, true},
|
||||
{StateDone, false},
|
||||
@@ -143,8 +141,9 @@ func TestAssigneeID(t *testing.T) {
|
||||
// Note: State persistence tests removed - state is now derived from beads assignee field.
|
||||
// Integration tests should verify beads-based state management.
|
||||
|
||||
func TestGetReturnsIdleWithoutBeads(t *testing.T) {
|
||||
// When beads is not available, Get should return StateIdle
|
||||
func TestGetReturnsWorkingWithoutBeads(t *testing.T) {
|
||||
// When beads is not available, Get should return StateWorking
|
||||
// (assume the polecat is doing something if it exists)
|
||||
root := t.TempDir()
|
||||
polecatDir := filepath.Join(root, "polecats", "Test")
|
||||
if err := os.MkdirAll(polecatDir, 0755); err != nil {
|
||||
@@ -163,7 +162,7 @@ func TestGetReturnsIdleWithoutBeads(t *testing.T) {
|
||||
}
|
||||
m := NewManager(r, git.NewGit(root))
|
||||
|
||||
// Get should return polecat with StateIdle (no beads = no assignment)
|
||||
// Get should return polecat with StateWorking (assume active if beads unavailable)
|
||||
polecat, err := m.Get("Test")
|
||||
if err != nil {
|
||||
t.Fatalf("Get: %v", err)
|
||||
@@ -172,8 +171,8 @@ func TestGetReturnsIdleWithoutBeads(t *testing.T) {
|
||||
if polecat.Name != "Test" {
|
||||
t.Errorf("Name = %q, want Test", polecat.Name)
|
||||
}
|
||||
if polecat.State != StateIdle {
|
||||
t.Errorf("State = %v, want StateIdle (beads not available)", polecat.State)
|
||||
if polecat.State != StateWorking {
|
||||
t.Errorf("State = %v, want StateWorking (beads not available)", polecat.State)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -19,10 +19,9 @@ const (
|
||||
// StateStuck means the polecat needs assistance.
|
||||
StateStuck State = "stuck"
|
||||
|
||||
// Legacy states for backward compatibility during transition.
|
||||
// New code should not use these.
|
||||
StateIdle State = "idle" // Deprecated: use StateWorking
|
||||
StateActive State = "active" // Deprecated: use StateWorking
|
||||
// StateActive is deprecated: use StateWorking.
|
||||
// Kept only for backward compatibility with existing data.
|
||||
StateActive State = "active"
|
||||
)
|
||||
|
||||
// IsWorking returns true if the polecat is currently working.
|
||||
@@ -32,9 +31,9 @@ func (s State) IsWorking() bool {
|
||||
|
||||
// IsActive returns true if the polecat session is actively working.
|
||||
// For transient polecats, this is true for working state and
|
||||
// legacy idle/active states (treated as working).
|
||||
// legacy active state (treated as working).
|
||||
func (s State) IsActive() bool {
|
||||
return s == StateWorking || s == StateIdle || s == StateActive
|
||||
return s == StateWorking || s == StateActive
|
||||
}
|
||||
|
||||
// Polecat represents a worker agent in a rig.
|
||||
|
||||
Reference in New Issue
Block a user