fix(polecat): kill orphan sessions and clear stale hooks during allocation (#448)

ReconcilePool now detects and kills orphan tmux sessions (sessions without
corresponding polecat directories). This prevents allocation from being
blocked by broken state from crashed polecats.

Changes:
- Add tmux to Manager to check for orphan sessions during reconciliation
- Add ReconcilePoolWith for testable session/directory reconciliation logic
- Always clear hook_bead slot when reopening agent beads (fixes stale hooks)
- Prune stale git worktree entries during reconciliation

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Julian Knutsen
2026-01-13 21:37:00 +00:00
committed by GitHub
parent bd2a5ab56a
commit 65c5e05c43
10 changed files with 297 additions and 34 deletions

View File

@@ -228,10 +228,11 @@ func (b *Beads) CreateOrReopenAgentBead(id, title string, fields *AgentFields) (
}
}
// Clear any existing hook slot (handles stale state from previous lifecycle)
_, _ = b.run("slot", "clear", id, "hook")
// Set the hook slot if specified
if fields != nil && fields.HookBead != "" {
// Clear any existing hook first, then set new one
_, _ = b.run("slot", "clear", id, "hook")
if _, err := b.run("slot", "set", id, "hook", fields.HookBead); err != nil {
// Non-fatal: warn but continue
fmt.Printf("Warning: could not set hook slot: %v\n", err)

View File

@@ -330,7 +330,8 @@ func getPolecatManager(rigName string) (*polecat.Manager, *rig.Rig, error) {
}
polecatGit := git.NewGit(r.Path)
mgr := polecat.NewManager(r, polecatGit)
t := tmux.NewTmux()
mgr := polecat.NewManager(r, polecatGit, t)
return mgr, r, nil
}
@@ -363,7 +364,7 @@ func runPolecatList(cmd *cobra.Command, args []string) error {
for _, r := range rigs {
polecatGit := git.NewGit(r.Path)
mgr := polecat.NewManager(r, polecatGit)
mgr := polecat.NewManager(r, polecatGit, t)
polecatMgr := polecat.NewSessionManager(t, r)
polecats, err := mgr.List()

View File

@@ -221,7 +221,8 @@ func runPolecatIdentityAdd(cmd *cobra.Command, args []string) error {
// Generate name if not provided
if polecatName == "" {
polecatGit := git.NewGit(r.Path)
mgr := polecat.NewManager(r, polecatGit)
t := tmux.NewTmux()
mgr := polecat.NewManager(r, polecatGit, t)
polecatName, err = mgr.AllocateName()
if err != nil {
return fmt.Errorf("generating polecat name: %w", err)
@@ -294,7 +295,7 @@ func runPolecatIdentityList(cmd *cobra.Command, args []string) error {
// Check if worktree exists
worktreeExists := false
mgr := polecat.NewManager(r, nil)
mgr := polecat.NewManager(r, nil, t)
if p, err := mgr.Get(name); err == nil && p != nil {
worktreeExists = true
}
@@ -396,7 +397,7 @@ func runPolecatIdentityShow(cmd *cobra.Command, args []string) error {
// Check worktree and session
t := tmux.NewTmux()
polecatMgr := polecat.NewSessionManager(t, r)
mgr := polecat.NewManager(r, nil)
mgr := polecat.NewManager(r, nil, t)
worktreeExists := false
var clonePath string

View File

@@ -64,9 +64,10 @@ func SpawnPolecatForSling(rigName string, opts SlingSpawnOptions) (*SpawnedPolec
return nil, fmt.Errorf("rig '%s' not found", rigName)
}
// Get polecat manager
// Get polecat manager (with tmux for session-aware allocation)
polecatGit := git.NewGit(r.Path)
polecatMgr := polecat.NewManager(r, polecatGit)
t := tmux.NewTmux()
polecatMgr := polecat.NewManager(r, polecatGit, t)
// Allocate a new polecat name
polecatName, err := polecatMgr.AllocateName()
@@ -124,8 +125,7 @@ func SpawnPolecatForSling(rigName string, opts SlingSpawnOptions) (*SpawnedPolec
fmt.Printf("Using account: %s\n", accountHandle)
}
// Start session
t := tmux.NewTmux()
// Start session (reuse tmux from manager)
polecatSessMgr := polecat.NewSessionManager(t, r)
// Check if already running

View File

@@ -921,7 +921,7 @@ func runRigShutdown(cmd *cobra.Command, args []string) error {
// Check all polecats for uncommitted work (unless nuclear)
if !rigShutdownNuclear {
polecatGit := git.NewGit(r.Path)
polecatMgr := polecat.NewManager(r, polecatGit)
polecatMgr := polecat.NewManager(r, polecatGit, nil) // nil tmux: just listing
polecats, err := polecatMgr.List()
if err == nil && len(polecats) > 0 {
var problemPolecats []struct {
@@ -1105,7 +1105,7 @@ func runRigStatus(cmd *cobra.Command, args []string) error {
// Polecats
polecatGit := git.NewGit(r.Path)
polecatMgr := polecat.NewManager(r, polecatGit)
polecatMgr := polecat.NewManager(r, polecatGit, t)
polecats, err := polecatMgr.List()
fmt.Printf("%s", style.Bold.Render("Polecats"))
if err != nil || len(polecats) == 0 {
@@ -1198,7 +1198,7 @@ func runRigStop(cmd *cobra.Command, args []string) error {
// Check all polecats for uncommitted work (unless nuclear)
if !rigStopNuclear {
polecatGit := git.NewGit(r.Path)
polecatMgr := polecat.NewManager(r, polecatGit)
polecatMgr := polecat.NewManager(r, polecatGit, nil) // nil tmux: just listing
polecats, err := polecatMgr.List()
if err == nil && len(polecats) > 0 {
var problemPolecats []struct {
@@ -1330,7 +1330,7 @@ func runRigRestart(cmd *cobra.Command, args []string) error {
// Check all polecats for uncommitted work (unless nuclear)
if !rigRestartNuclear {
polecatGit := git.NewGit(r.Path)
polecatMgr := polecat.NewManager(r, polecatGit)
polecatMgr := polecat.NewManager(r, polecatGit, nil) // nil tmux: just listing
polecats, err := polecatMgr.List()
if err == nil && len(polecats) > 0 {
var problemPolecats []struct {

View File

@@ -682,7 +682,7 @@ func cleanupPolecats(townRoot string) {
for _, r := range rigs {
polecatGit := git.NewGit(r.Path)
polecatMgr := polecat.NewManager(r, polecatGit)
polecatMgr := polecat.NewManager(r, polecatGit, nil) // nil tmux: just listing, not allocating
polecats, err := polecatMgr.List()
if err != nil {

View File

@@ -494,7 +494,7 @@ func spawnSwarmWorkersFromBeads(r *rig.Rig, townRoot string, swarmID string, wor
t := tmux.NewTmux()
polecatSessMgr := polecat.NewSessionManager(t, r)
polecatGit := git.NewGit(r.Path)
polecatMgr := polecat.NewManager(r, polecatGit)
polecatMgr := polecat.NewManager(r, polecatGit, t)
// Pair workers with tasks (round-robin if more tasks than workers)
workerIdx := 0

View File

@@ -16,6 +16,7 @@ import (
"github.com/steveyegge/gastown/internal/config"
"github.com/steveyegge/gastown/internal/git"
"github.com/steveyegge/gastown/internal/rig"
"github.com/steveyegge/gastown/internal/tmux"
"github.com/steveyegge/gastown/internal/workspace"
)
@@ -47,10 +48,11 @@ type Manager struct {
git *git.Git
beads *beads.Beads
namePool *NamePool
tmux *tmux.Tmux
}
// NewManager creates a new polecat manager.
func NewManager(r *rig.Rig, g *git.Git) *Manager {
func NewManager(r *rig.Rig, g *git.Git, t *tmux.Tmux) *Manager {
// Use the resolved beads directory to find where bd commands should run.
// For tracked beads: rig/.beads/redirect -> mayor/rig/.beads, so use mayor/rig
// For local beads: rig/.beads is the database, so use rig root
@@ -82,6 +84,7 @@ func NewManager(r *rig.Rig, g *git.Git) *Manager {
git: g,
beads: beads.NewWithBeadsDir(beadsPath, resolvedBeads),
namePool: pool,
tmux: t,
}
}
@@ -631,21 +634,70 @@ func (m *Manager) RepairWorktreeWithOptions(name string, force bool, opts AddOpt
}, nil
}
// ReconcilePool derives pool InUse state from existing polecat directories.
// This implements ZFC: InUse is discovered from filesystem, not tracked separately.
// ReconcilePool derives pool InUse state from existing polecat directories and active sessions.
// This implements ZFC: InUse is discovered from filesystem and tmux, not tracked separately.
// Called before each allocation to ensure InUse reflects reality.
//
// In addition to directory checks, this also:
// - Kills orphaned tmux sessions (sessions without directories are broken)
func (m *Manager) ReconcilePool() {
// Get polecats with existing directories
polecats, err := m.List()
if err != nil {
return
}
var names []string
var namesWithDirs []string
for _, p := range polecats {
names = append(names, p.Name)
namesWithDirs = append(namesWithDirs, p.Name)
}
m.namePool.Reconcile(names)
// Get names with tmux sessions
var namesWithSessions []string
if m.tmux != nil {
poolNames := m.namePool.getNames()
for _, name := range poolNames {
sessionName := fmt.Sprintf("gt-%s-%s", m.rig.Name, name)
hasSession, _ := m.tmux.HasSession(sessionName)
if hasSession {
namesWithSessions = append(namesWithSessions, name)
}
}
}
m.ReconcilePoolWith(namesWithDirs, namesWithSessions)
// Prune any stale git worktree entries (handles manually deleted directories)
if repoGit, err := m.repoBase(); err == nil {
_ = repoGit.WorktreePrune()
}
}
// ReconcilePoolWith reconciles the name pool given lists of names from different sources.
// This is the testable core of ReconcilePool.
//
// - namesWithDirs: names that have existing worktree directories (in use)
// - namesWithSessions: names that have tmux sessions
//
// Names with sessions but no directories are orphans and their sessions are killed.
// Only namesWithDirs are marked as in-use for allocation.
func (m *Manager) ReconcilePoolWith(namesWithDirs, namesWithSessions []string) {
dirSet := make(map[string]bool)
for _, name := range namesWithDirs {
dirSet[name] = true
}
// Kill orphaned sessions (session exists but no directory)
if m.tmux != nil {
for _, name := range namesWithSessions {
if !dirSet[name] {
sessionName := fmt.Sprintf("gt-%s-%s", m.rig.Name, name)
_ = m.tmux.KillSession(sessionName)
}
}
}
m.namePool.Reconcile(namesWithDirs)
// Note: No Save() needed - InUse is transient state, only OverflowNext is persisted
}

View File

@@ -4,6 +4,7 @@ import (
"os"
"os/exec"
"path/filepath"
"sort"
"testing"
"github.com/steveyegge/gastown/internal/git"
@@ -72,7 +73,7 @@ func TestListEmpty(t *testing.T) {
Name: "test-rig",
Path: root,
}
m := NewManager(r, git.NewGit(root))
m := NewManager(r, git.NewGit(root), nil)
polecats, err := m.List()
if err != nil {
@@ -89,7 +90,7 @@ func TestGetNotFound(t *testing.T) {
Name: "test-rig",
Path: root,
}
m := NewManager(r, git.NewGit(root))
m := NewManager(r, git.NewGit(root), nil)
_, err := m.Get("nonexistent")
if err != ErrPolecatNotFound {
@@ -103,7 +104,7 @@ func TestRemoveNotFound(t *testing.T) {
Name: "test-rig",
Path: root,
}
m := NewManager(r, git.NewGit(root))
m := NewManager(r, git.NewGit(root), nil)
err := m.Remove("nonexistent", false)
if err != ErrPolecatNotFound {
@@ -116,7 +117,7 @@ func TestPolecatDir(t *testing.T) {
Name: "test-rig",
Path: "/home/user/ai/test-rig",
}
m := NewManager(r, git.NewGit(r.Path))
m := NewManager(r, git.NewGit(r.Path), nil)
dir := m.polecatDir("Toast")
expected := "/home/user/ai/test-rig/polecats/Toast"
@@ -130,7 +131,7 @@ func TestAssigneeID(t *testing.T) {
Name: "test-rig",
Path: "/home/user/ai/test-rig",
}
m := NewManager(r, git.NewGit(r.Path))
m := NewManager(r, git.NewGit(r.Path), nil)
id := m.assigneeID("Toast")
expected := "test-rig/Toast"
@@ -168,7 +169,7 @@ func TestGetReturnsWorkingWithoutBeads(t *testing.T) {
Name: "test-rig",
Path: root,
}
m := NewManager(r, git.NewGit(root))
m := NewManager(r, git.NewGit(root), nil)
// Get should return polecat with StateWorking (assume active if beads unavailable)
polecat, err := m.Get("Test")
@@ -207,7 +208,7 @@ func TestListWithPolecats(t *testing.T) {
Name: "test-rig",
Path: root,
}
m := NewManager(r, git.NewGit(root))
m := NewManager(r, git.NewGit(root), nil)
polecats, err := m.List()
if err != nil {
@@ -240,7 +241,7 @@ func TestSetStateWithoutBeads(t *testing.T) {
Name: "test-rig",
Path: root,
}
m := NewManager(r, git.NewGit(root))
m := NewManager(r, git.NewGit(root), nil)
// SetState should succeed (no-op when no issue assigned)
err := m.SetState("Test", StateActive)
@@ -266,7 +267,7 @@ func TestClearIssueWithoutAssignment(t *testing.T) {
Name: "test-rig",
Path: root,
}
m := NewManager(r, git.NewGit(root))
m := NewManager(r, git.NewGit(root), nil)
// ClearIssue should succeed even when no issue assigned
err := m.ClearIssue("Test")
@@ -334,7 +335,7 @@ func TestAddWithOptions_HasAgentsMD(t *testing.T) {
Name: "rig",
Path: root,
}
m := NewManager(r, git.NewGit(root))
m := NewManager(r, git.NewGit(root), nil)
// Create polecat via AddWithOptions
polecat, err := m.AddWithOptions("TestAgent", AddOptions{})
@@ -417,7 +418,7 @@ func TestAddWithOptions_AgentsMDFallback(t *testing.T) {
Name: "rig",
Path: root,
}
m := NewManager(r, git.NewGit(root))
m := NewManager(r, git.NewGit(root), nil)
// Create polecat via AddWithOptions
polecat, err := m.AddWithOptions("TestFallback", AddOptions{})
@@ -440,3 +441,208 @@ func TestAddWithOptions_AgentsMDFallback(t *testing.T) {
t.Errorf("AGENTS.md content = %q, want %q", string(content), string(agentsMDContent))
}
}
// TestReconcilePoolWith tests all permutations of directory and session existence.
// This is the core allocation policy logic.
//
// Truth table:
// HasDir | HasSession | Result
// -------|------------|------------------
// false | false | available (not in-use)
// true | false | in-use (normal finished polecat)
// false | true | orphan → kill session, available
// true | true | in-use (normal working polecat)
func TestReconcilePoolWith(t *testing.T) {
t.Parallel()
tests := []struct {
name string
namesWithDirs []string
namesWithSessions []string
wantInUse []string // names that should be marked in-use
wantOrphans []string // sessions that should be killed
}{
{
name: "no dirs, no sessions - all available",
namesWithDirs: []string{},
namesWithSessions: []string{},
wantInUse: []string{},
wantOrphans: []string{},
},
{
name: "has dir, no session - in use",
namesWithDirs: []string{"toast"},
namesWithSessions: []string{},
wantInUse: []string{"toast"},
wantOrphans: []string{},
},
{
name: "no dir, has session - orphan killed",
namesWithDirs: []string{},
namesWithSessions: []string{"nux"},
wantInUse: []string{},
wantOrphans: []string{"nux"},
},
{
name: "has dir, has session - in use",
namesWithDirs: []string{"capable"},
namesWithSessions: []string{"capable"},
wantInUse: []string{"capable"},
wantOrphans: []string{},
},
{
name: "mixed: one with dir, one orphan session",
namesWithDirs: []string{"toast"},
namesWithSessions: []string{"toast", "nux"},
wantInUse: []string{"toast"},
wantOrphans: []string{"nux"},
},
{
name: "multiple dirs, no sessions",
namesWithDirs: []string{"toast", "nux", "capable"},
namesWithSessions: []string{},
wantInUse: []string{"capable", "nux", "toast"},
wantOrphans: []string{},
},
{
name: "multiple orphan sessions",
namesWithDirs: []string{},
namesWithSessions: []string{"slit", "rictus"},
wantInUse: []string{},
wantOrphans: []string{"rictus", "slit"},
},
{
name: "complex: dirs, valid sessions, orphan sessions",
namesWithDirs: []string{"toast", "capable"},
namesWithSessions: []string{"toast", "nux", "slit"},
wantInUse: []string{"capable", "toast"},
wantOrphans: []string{"nux", "slit"},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
// Create temporary directory for pool state
tmpDir, err := os.MkdirTemp("", "reconcile-test-*")
if err != nil {
t.Fatal(err)
}
defer func() { _ = os.RemoveAll(tmpDir) }()
// Create rig and manager (nil tmux for unit test)
r := &rig.Rig{
Name: "testrig",
Path: tmpDir,
}
m := NewManager(r, nil, nil)
// Call ReconcilePoolWith
m.ReconcilePoolWith(tt.namesWithDirs, tt.namesWithSessions)
// Verify in-use names
gotInUse := m.namePool.ActiveNames()
sort.Strings(gotInUse)
sort.Strings(tt.wantInUse)
if len(gotInUse) != len(tt.wantInUse) {
t.Errorf("in-use count: got %d, want %d", len(gotInUse), len(tt.wantInUse))
}
for i := range tt.wantInUse {
if i >= len(gotInUse) || gotInUse[i] != tt.wantInUse[i] {
t.Errorf("in-use names: got %v, want %v", gotInUse, tt.wantInUse)
break
}
}
// Verify orphans would be identified correctly
// (actual killing requires tmux, tested separately)
dirSet := make(map[string]bool)
for _, name := range tt.namesWithDirs {
dirSet[name] = true
}
var gotOrphans []string
for _, name := range tt.namesWithSessions {
if !dirSet[name] {
gotOrphans = append(gotOrphans, name)
}
}
sort.Strings(gotOrphans)
sort.Strings(tt.wantOrphans)
if len(gotOrphans) != len(tt.wantOrphans) {
t.Errorf("orphan count: got %d, want %d", len(gotOrphans), len(tt.wantOrphans))
}
for i := range tt.wantOrphans {
if i >= len(gotOrphans) || gotOrphans[i] != tt.wantOrphans[i] {
t.Errorf("orphans: got %v, want %v", gotOrphans, tt.wantOrphans)
break
}
}
})
}
}
// TestReconcilePoolWith_Allocation verifies that allocation respects reconciled state.
func TestReconcilePoolWith_Allocation(t *testing.T) {
t.Parallel()
tmpDir, err := os.MkdirTemp("", "reconcile-alloc-test-*")
if err != nil {
t.Fatal(err)
}
defer func() { _ = os.RemoveAll(tmpDir) }()
r := &rig.Rig{
Name: "testrig",
Path: tmpDir,
}
m := NewManager(r, nil, nil)
// Mark first few pool names as in-use via directories
// (furiosa, nux, slit are first 3 in mad-max theme)
m.ReconcilePoolWith([]string{"furiosa", "nux", "slit"}, []string{})
// First allocation should skip in-use names
name, err := m.namePool.Allocate()
if err != nil {
t.Fatalf("Allocate: %v", err)
}
// Should get "rictus" (4th in mad-max theme), not furiosa/nux/slit
if name == "furiosa" || name == "nux" || name == "slit" {
t.Errorf("allocated in-use name %q, should have skipped", name)
}
if name != "rictus" {
t.Errorf("expected rictus (4th name), got %q", name)
}
}
// TestReconcilePoolWith_OrphanDoesNotBlockAllocation verifies orphan sessions
// don't prevent name allocation (they're killed, freeing the name).
func TestReconcilePoolWith_OrphanDoesNotBlockAllocation(t *testing.T) {
t.Parallel()
tmpDir, err := os.MkdirTemp("", "reconcile-orphan-test-*")
if err != nil {
t.Fatal(err)
}
defer func() { _ = os.RemoveAll(tmpDir) }()
r := &rig.Rig{
Name: "testrig",
Path: tmpDir,
}
m := NewManager(r, nil, nil)
// furiosa has orphan session (no dir) - should NOT block allocation
m.ReconcilePoolWith([]string{}, []string{"furiosa"})
// furiosa should be available (orphan session killed, name freed)
name, err := m.namePool.Allocate()
if err != nil {
t.Fatalf("Allocate: %v", err)
}
if name != "furiosa" {
t.Errorf("expected furiosa (orphan freed), got %q", name)
}
}

View File

@@ -146,6 +146,8 @@ func (m *SessionManager) Start(polecat string, opts SessionStartOptions) error {
sessionID := m.SessionName(polecat)
// Check if session already exists
// Note: Orphan sessions are cleaned up by ReconcilePool during AllocateName,
// so by this point, any existing session should be legitimately in use.
running, err := m.tmux.HasSession(sessionID)
if err != nil {
return fmt.Errorf("checking session: %w", err)