From af95b7b7f4ab75b7a8446a76a5fb4729b0132625 Mon Sep 17 00:00:00 2001 From: prime Date: Sun, 4 Jan 2026 14:36:23 -0800 Subject: [PATCH] Remove StateIdle and idle polecat concept MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The transient polecat model says: "Polecats exist only while working." This removes the deprecated StateIdle and updates the codebase: - Remove StateIdle from polecat/types.go (keep StateActive for legacy data) - Update manager.go: Get() returns StateDone (not StateIdle) when no work - Update manager.go: Add/Recreate return StateWorking (not StateIdle) - Remove zombie scan logic from deacon.go (no idle polecats to scan for) - Update tests to reflect new behavior The correct lifecycle is now: - Spawn: polecat created with work (StateWorking) - Work: sessions cycle, sandbox persists - Done: polecat signals completion (StateDone) - Nuke: Witness destroys sandbox 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- internal/cmd/deacon.go | 293 ------------------------------- internal/polecat/manager.go | 34 ++-- internal/polecat/manager_test.go | 15 +- internal/polecat/types.go | 11 +- 4 files changed, 29 insertions(+), 324 deletions(-) diff --git a/internal/cmd/deacon.go b/internal/cmd/deacon.go index 34b5f6fe..33d45b3c 100644 --- a/internal/cmd/deacon.go +++ b/internal/cmd/deacon.go @@ -186,33 +186,6 @@ This helps the Deacon understand which agents may need attention.`, RunE: runDeaconHealthState, } -var deaconZombieScanCmd = &cobra.Command{ - Use: "zombie-scan [rig]", - Short: "Scan for idle polecats that should have been nuked", - Long: `Backup check for polecats the Witness should have cleaned up. - -Scans for "zombie" polecats that meet ALL of these criteria: -- State: idle or done (no active work) -- Session: not running (tmux session dead) -- No hooked work -- Last activity: older than threshold (default 10 minutes) - -These are polecats that the Witness should have nuked but didn't. -This provides defense-in-depth against Witness failures. - -Actions: -1. Log warning about witness failure -2. Nuke the zombie polecat directly -3. Notify mayor of witness issue (optional) - -Examples: - gt deacon zombie-scan # Scan all rigs - gt deacon zombie-scan gastown # Scan specific rig - gt deacon zombie-scan --dry-run # Preview only - gt deacon zombie-scan --threshold=5m # Custom staleness threshold`, - Args: cobra.MaximumNArgs(1), - RunE: runDeaconZombieScan, -} var ( triggerTimeout time.Duration @@ -225,11 +198,6 @@ var ( // Force kill flags forceKillReason string forceKillSkipNotify bool - - // Zombie scan flags - zombieScanDryRun bool - zombieScanThreshold time.Duration - zombieScanNuke bool ) func init() { @@ -243,7 +211,6 @@ func init() { deaconCmd.AddCommand(deaconHealthCheckCmd) deaconCmd.AddCommand(deaconForceKillCmd) deaconCmd.AddCommand(deaconHealthStateCmd) - deaconCmd.AddCommand(deaconZombieScanCmd) // Flags for trigger-pending deaconTriggerPendingCmd.Flags().DurationVar(&triggerTimeout, "timeout", 2*time.Second, @@ -263,14 +230,6 @@ func init() { deaconForceKillCmd.Flags().BoolVar(&forceKillSkipNotify, "skip-notify", false, "Skip sending notification mail to mayor") - // Flags for zombie-scan - deaconZombieScanCmd.Flags().BoolVarP(&zombieScanDryRun, "dry-run", "n", false, - "Show what would be done without nuking") - deaconZombieScanCmd.Flags().DurationVar(&zombieScanThreshold, "threshold", 10*time.Minute, - "Staleness threshold for zombie detection") - deaconZombieScanCmd.Flags().BoolVar(&zombieScanNuke, "nuke", true, - "Nuke detected zombies (use --nuke=false to report only)") - rootCmd.AddCommand(deaconCmd) } @@ -865,258 +824,6 @@ func runDeaconHealthState(cmd *cobra.Command, args []string) error { return nil } -// runDeaconZombieScan scans for idle polecats that should have been nuked by the Witness. -// This is a defense-in-depth backup check. -func runDeaconZombieScan(cmd *cobra.Command, args []string) error { - townRoot, err := workspace.FindFromCwdOrError() - if err != nil { - return fmt.Errorf("not in a Gas Town workspace: %w", err) - } - - t := tmux.NewTmux() - - // Get list of rigs to scan - var rigsToScan []string - if len(args) > 0 { - rigsToScan = []string{args[0]} - } else { - // Scan all rigs by finding directories with polecats/ subdirectories - entries, err := os.ReadDir(townRoot) - if err != nil { - return fmt.Errorf("reading town root: %w", err) - } - for _, entry := range entries { - if !entry.IsDir() { - continue - } - // Skip non-rig directories - if entry.Name() == "deacon" || entry.Name() == "mayor" || - entry.Name() == "plugins" || entry.Name() == "docs" || - strings.HasPrefix(entry.Name(), ".") { - continue - } - // Check if it has a polecats directory - polecatsDir := filepath.Join(townRoot, entry.Name(), "polecats") - if info, err := os.Stat(polecatsDir); err == nil && info.IsDir() { - rigsToScan = append(rigsToScan, entry.Name()) - } - } - } - - if len(rigsToScan) == 0 { - fmt.Printf("%s No rigs found to scan\n", style.Dim.Render("○")) - return nil - } - - fmt.Printf("%s Scanning for zombie polecats (threshold: %s)...\n", - style.Bold.Render("🧟"), zombieScanThreshold) - - var zombies []zombieInfo - for _, rigName := range rigsToScan { - rigZombies, err := scanRigForZombies(townRoot, rigName, t) - if err != nil { - style.PrintWarning("failed to scan rig %s: %v", rigName, err) - continue - } - zombies = append(zombies, rigZombies...) - } - - if len(zombies) == 0 { - fmt.Printf("%s No zombies found (all polecats healthy)\n", style.Bold.Render("✓")) - return nil - } - - // Report zombies - fmt.Printf("\n%s Found %d zombie(s):\n\n", style.Bold.Render("⚠"), len(zombies)) - for _, z := range zombies { - fmt.Printf(" %s %s/%s\n", style.Dim.Render("🧟"), z.rig, z.name) - fmt.Printf(" State: %s, Session: %s\n", z.state, z.sessionStatus) - fmt.Printf(" Hooked work: %s\n", z.hookedWork) - fmt.Printf(" Last activity: %s ago\n", z.staleness.Round(time.Second)) - fmt.Printf(" Reason: %s\n", z.reason) - fmt.Println() - } - - // Nuke zombies if enabled - if zombieScanNuke && !zombieScanDryRun { - fmt.Printf("%s Nuking zombies...\n", style.Bold.Render("💀")) - for _, z := range zombies { - if err := nukeZombie(townRoot, z, t); err != nil { - style.PrintWarning("failed to nuke %s/%s: %v", z.rig, z.name, err) - } else { - fmt.Printf(" %s Nuked %s/%s\n", style.Bold.Render("✓"), z.rig, z.name) - } - } - - // Notify mayor about witness failure - notifyMayorOfWitnessFailure(townRoot, zombies) - } else if zombieScanDryRun { - fmt.Printf("%s Dry run - would nuke %d zombie(s)\n", style.Dim.Render("ℹ"), len(zombies)) - } - - return nil -} - -// zombieInfo holds information about a detected zombie polecat. -type zombieInfo struct { - rig string - name string - state string - sessionStatus string - hookedWork string - staleness time.Duration - reason string - sessionName string -} - -// scanRigForZombies scans a rig for zombie polecats. -func scanRigForZombies(townRoot, rigName string, t *tmux.Tmux) ([]zombieInfo, error) { - rigPath := filepath.Join(townRoot, rigName) - polecatsDir := filepath.Join(rigPath, "polecats") - - entries, err := os.ReadDir(polecatsDir) - if err != nil { - if os.IsNotExist(err) { - return nil, nil // No polecats dir - } - return nil, err - } - - var zombies []zombieInfo - for _, entry := range entries { - if !entry.IsDir() { - continue - } - name := entry.Name() - - // Build session name for this polecat - sessionName := fmt.Sprintf("gt-%s-%s", rigName, name) - - // Check if session is running - sessionRunning, _ := t.HasSession(sessionName) - - // Check for hooked work - hookedWork := checkPolecatHookedWork(townRoot, rigName, name) - - // Get last activity time from polecat directory - polecatPath := filepath.Join(polecatsDir, name) - staleness := getPolecatStaleness(polecatPath) - - // Determine if this is a zombie - state := "unknown" - if sessionRunning { - state = "session_running" - continue // Not a zombie if session is running - } - state = "session_dead" - - // Check all zombie criteria - if hookedWork != "" { - // Has hooked work - not a zombie (just needs to be started) - continue - } - - if staleness < zombieScanThreshold { - // Recently active - not stale enough - continue - } - - // This is a zombie - zombies = append(zombies, zombieInfo{ - rig: rigName, - name: name, - state: state, - sessionStatus: "not running", - hookedWork: "none", - staleness: staleness, - reason: fmt.Sprintf("idle for %s with no session or hooked work", staleness.Round(time.Minute)), - sessionName: sessionName, - }) - } - - return zombies, nil -} - -// checkPolecatHookedWork checks if a polecat has hooked work. -func checkPolecatHookedWork(townRoot, rigName, polecatName string) string { - // Query beads for hooked issues assigned to this polecat - assignee := fmt.Sprintf("%s/polecats/%s", rigName, polecatName) - cmd := exec.Command("bd", "list", "--status=hooked", "--assignee="+assignee, "--json") - cmd.Dir = townRoot - - output, err := cmd.Output() - if err != nil { - return "" - } - - var issues []struct { - ID string `json:"id"` - Title string `json:"title"` - } - if err := json.Unmarshal(output, &issues); err != nil || len(issues) == 0 { - return "" - } - - return issues[0].ID -} - -// getPolecatStaleness returns how long since the polecat was last active. -func getPolecatStaleness(polecatPath string) time.Duration { - // Check .beads/last-touched if it exists - lastTouchedPath := filepath.Join(polecatPath, ".beads", "last-touched") - if info, err := os.Stat(lastTouchedPath); err == nil { - return time.Since(info.ModTime()) - } - - // Fall back to directory modification time - if info, err := os.Stat(polecatPath); err == nil { - return time.Since(info.ModTime()) - } - - // Very stale if we can't determine - return 24 * time.Hour -} - -// nukeZombie cleans up a zombie polecat. -func nukeZombie(townRoot string, z zombieInfo, t *tmux.Tmux) error { //nolint:unparam // error return kept for future use - // Step 1: Kill tmux session if somehow still exists - if exists, _ := t.HasSession(z.sessionName); exists { - _ = t.KillSession(z.sessionName) - } - - // Step 2: Run gt polecat nuke to clean up - cmd := exec.Command("gt", "polecat", "nuke", z.name, "--rig="+z.rig, "--force") - cmd.Dir = townRoot - if err := cmd.Run(); err != nil { - // Non-fatal - polecat might already be cleaned up - style.PrintWarning("polecat nuke returned error (may be already cleaned): %v", err) - } - - return nil -} - -// notifyMayorOfWitnessFailure notifies the mayor about witness cleanup failures. -func notifyMayorOfWitnessFailure(townRoot string, zombies []zombieInfo) { - if len(zombies) == 0 { - return - } - - // Group by rig - rigCounts := make(map[string]int) - for _, z := range zombies { - rigCounts[z.rig]++ - } - - var details strings.Builder - details.WriteString("Deacon detected zombie polecats that Witness should have cleaned:\n\n") - for rig, count := range rigCounts { - details.WriteString(fmt.Sprintf("- %s: %d zombie(s)\n", rig, count)) - } - details.WriteString("\nDeacon has nuked them directly. Check Witness health.") - - sendMail(townRoot, "mayor/", "⚠️ Witness cleanup failure detected", details.String()) -} - // agentAddressToIDs converts an agent address to bead ID and session name. // Supports formats: "gastown/polecats/max", "gastown/witness", "deacon", "mayor" // Note: Town-level agents (Mayor, Deacon) use hq- prefix bead IDs stored in town beads. diff --git a/internal/polecat/manager.go b/internal/polecat/manager.go index de6e109a..2e2a5f10 100644 --- a/internal/polecat/manager.go +++ b/internal/polecat/manager.go @@ -259,13 +259,13 @@ func (m *Manager) AddWithOptions(name string, opts AddOptions) (*Polecat, error) fmt.Printf("Warning: could not create agent bead: %v\n", err) } - // Return polecat with derived state (no issue assigned yet = idle) + // Return polecat with working state (transient model: polecats are spawned with work) // State is derived from beads, not stored in state.json now := time.Now() polecat := &Polecat{ Name: name, Rig: m.rig.Name, - State: StateIdle, // No issue assigned yet + State: StateWorking, // Transient model: polecat spawns with work ClonePath: polecatPath, Branch: branchName, CreatedAt: now, @@ -478,12 +478,12 @@ func (m *Manager) RepairWorktreeWithOptions(name string, force bool, opts AddOpt fmt.Printf("Warning: could not create agent bead: %v\n", err) } - // Return fresh polecat + // Return fresh polecat in working state (transient model: polecats are spawned with work) now := time.Now() return &Polecat{ Name: name, Rig: m.rig.Name, - State: StateIdle, + State: StateWorking, ClonePath: polecatPath, Branch: branchName, CreatedAt: now, @@ -543,9 +543,8 @@ func (m *Manager) List() ([]*Polecat, error) { // Get returns a specific polecat by name. // State is derived from beads assignee field: -// - If an issue is assigned to this polecat and is open/in_progress: StateWorking -// - If an issue is assigned but closed: StateDone -// - If no issue assigned: StateIdle +// - If an issue is assigned to this polecat: StateWorking +// - If no issue assigned: StateDone (ready for cleanup - transient polecats should have work) func (m *Manager) Get(name string) (*Polecat, error) { if !m.exists(name) { return nil, ErrPolecatNotFound @@ -557,7 +556,7 @@ func (m *Manager) Get(name string) (*Polecat, error) { // SetState updates a polecat's state. // In the beads model, state is derived from issue status: // - StateWorking/StateActive: issue status set to in_progress -// - StateDone/StateIdle: assignee cleared from issue +// - StateDone: assignee cleared from issue (polecat ready for cleanup) // - StateStuck: issue status set to blocked (if supported) // If beads is not available, this is a no-op. func (m *Manager) SetState(name string, state State) error { @@ -582,8 +581,8 @@ func (m *Manager) SetState(name string, state State) error { return fmt.Errorf("setting issue status: %w", err) } } - case StateDone, StateIdle: - // Clear assignment when done/idle + case StateDone: + // Clear assignment when done (polecat ready for cleanup) if issue != nil { empty := "" if err := m.beads.Update(issue.ID, beads.UpdateOptions{Assignee: &empty}); err != nil { @@ -654,7 +653,8 @@ func (m *Manager) ClearIssue(name string) error { } // loadFromBeads gets polecat info from beads assignee field. -// State is simple: issue assigned → working, no issue → idle. +// State is simple: issue assigned → working, no issue → done (ready for cleanup). +// Transient polecats should always have work; no work means ready for Witness cleanup. // We don't interpret issue status (ZFC: Go is transport, not decision-maker). func (m *Manager) loadFromBeads(name string) (*Polecat, error) { polecatPath := m.polecatDir(name) @@ -671,20 +671,20 @@ func (m *Manager) loadFromBeads(name string) (*Polecat, error) { assignee := m.assigneeID(name) issue, beadsErr := m.beads.GetAssignedIssue(assignee) if beadsErr != nil { - // If beads query fails, return basic polecat info - // This allows the system to work even if beads is not available + // If beads query fails, return basic polecat info as working + // (assume polecat is doing something if it exists) return &Polecat{ Name: name, Rig: m.rig.Name, - State: StateIdle, + State: StateWorking, ClonePath: polecatPath, Branch: branchName, }, nil } - // Simple rule: has issue = working, no issue = idle - // We don't interpret issue.Status - that's for Claude to decide - state := StateIdle + // Transient model: has issue = working, no issue = done (ready for cleanup) + // Polecats without work should be nuked by the Witness + state := StateDone issueID := "" if issue != nil { issueID = issue.ID diff --git a/internal/polecat/manager_test.go b/internal/polecat/manager_test.go index 53293dbb..a5b7d317 100644 --- a/internal/polecat/manager_test.go +++ b/internal/polecat/manager_test.go @@ -17,8 +17,7 @@ func TestStateIsActive(t *testing.T) { {StateWorking, true}, {StateDone, false}, {StateStuck, false}, - // Legacy states are treated as active - {StateIdle, true}, + // Legacy active state is treated as active {StateActive, true}, } @@ -34,7 +33,6 @@ func TestStateIsWorking(t *testing.T) { state State working bool }{ - {StateIdle, false}, {StateActive, false}, {StateWorking, true}, {StateDone, false}, @@ -143,8 +141,9 @@ func TestAssigneeID(t *testing.T) { // Note: State persistence tests removed - state is now derived from beads assignee field. // Integration tests should verify beads-based state management. -func TestGetReturnsIdleWithoutBeads(t *testing.T) { - // When beads is not available, Get should return StateIdle +func TestGetReturnsWorkingWithoutBeads(t *testing.T) { + // When beads is not available, Get should return StateWorking + // (assume the polecat is doing something if it exists) root := t.TempDir() polecatDir := filepath.Join(root, "polecats", "Test") if err := os.MkdirAll(polecatDir, 0755); err != nil { @@ -163,7 +162,7 @@ func TestGetReturnsIdleWithoutBeads(t *testing.T) { } m := NewManager(r, git.NewGit(root)) - // Get should return polecat with StateIdle (no beads = no assignment) + // Get should return polecat with StateWorking (assume active if beads unavailable) polecat, err := m.Get("Test") if err != nil { t.Fatalf("Get: %v", err) @@ -172,8 +171,8 @@ func TestGetReturnsIdleWithoutBeads(t *testing.T) { if polecat.Name != "Test" { t.Errorf("Name = %q, want Test", polecat.Name) } - if polecat.State != StateIdle { - t.Errorf("State = %v, want StateIdle (beads not available)", polecat.State) + if polecat.State != StateWorking { + t.Errorf("State = %v, want StateWorking (beads not available)", polecat.State) } } diff --git a/internal/polecat/types.go b/internal/polecat/types.go index a38c33bd..cf071f38 100644 --- a/internal/polecat/types.go +++ b/internal/polecat/types.go @@ -19,10 +19,9 @@ const ( // StateStuck means the polecat needs assistance. StateStuck State = "stuck" - // Legacy states for backward compatibility during transition. - // New code should not use these. - StateIdle State = "idle" // Deprecated: use StateWorking - StateActive State = "active" // Deprecated: use StateWorking + // StateActive is deprecated: use StateWorking. + // Kept only for backward compatibility with existing data. + StateActive State = "active" ) // IsWorking returns true if the polecat is currently working. @@ -32,9 +31,9 @@ func (s State) IsWorking() bool { // IsActive returns true if the polecat session is actively working. // For transient polecats, this is true for working state and -// legacy idle/active states (treated as working). +// legacy active state (treated as working). func (s State) IsActive() bool { - return s == StateWorking || s == StateIdle || s == StateActive + return s == StateWorking || s == StateActive } // Polecat represents a worker agent in a rig.