Files
gastown/internal/cmd/start.go
Roland Tritsch 6bfe61f796 Fix daemon shutdown detection bug
## Problem
gt shutdown failed to stop orphaned daemon processes because the
detection mechanism ignored errors and had no fallback.

## Root Cause
stopDaemonIfRunning() ignored errors from daemon.IsRunning(), causing:
1. Stale PID files to hide running daemons
2. Corrupted PID files to return silent false
3. No fallback detection for orphaned processes
4. Early return when no sessions running prevented daemon check

## Solution
1. Enhanced IsRunning() to return detailed errors
2. Added process name verification (prevents PID reuse false positives)
3. Added fallback orphan detection using pgrep
4. Fixed stopDaemonIfRunning() to handle errors and use fallback
5. Added daemon check even when no sessions are running

## Testing
Verified shutdown now:
- Detects and reports stale/corrupted PID files
- Finds orphaned daemon processes
- Kills all daemon processes reliably
- Reports detailed status during shutdown
- Works even when no other sessions are running

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-20 20:25:25 -08:00

1084 lines
33 KiB
Go

package cmd
import (
"bufio"
"errors"
"fmt"
"os"
"path/filepath"
"strings"
"sync"
"sync/atomic"
"syscall"
"time"
"github.com/spf13/cobra"
"github.com/steveyegge/gastown/internal/config"
"github.com/steveyegge/gastown/internal/constants"
"github.com/steveyegge/gastown/internal/crew"
"github.com/steveyegge/gastown/internal/daemon"
"github.com/steveyegge/gastown/internal/deacon"
"github.com/steveyegge/gastown/internal/git"
"github.com/steveyegge/gastown/internal/mayor"
"github.com/steveyegge/gastown/internal/polecat"
"github.com/steveyegge/gastown/internal/refinery"
"github.com/steveyegge/gastown/internal/rig"
"github.com/steveyegge/gastown/internal/session"
"github.com/steveyegge/gastown/internal/style"
"github.com/steveyegge/gastown/internal/tmux"
"github.com/steveyegge/gastown/internal/util"
"github.com/steveyegge/gastown/internal/witness"
"github.com/steveyegge/gastown/internal/workspace"
)
// Flag values for 'gt start', 'gt start crew', and 'gt shutdown'.
// All are bound in init() below.
var (
	startAll               bool   // --all: also start Witnesses and Refineries for every rig
	startAgentOverride     string // --agent: agent alias for Mayor/Deacon (overrides town default)
	startCrewRig           string // --rig: rig the crew member belongs to
	startCrewAccount       string // --account: Claude Code account handle
	startCrewAgentOverride string // --agent: agent alias for the crew worker

	shutdownGraceful            bool // --graceful: interrupt agents and wait for handoff before killing
	shutdownWait                int  // --wait: seconds to wait during graceful shutdown
	shutdownAll                 bool // --all: also stop crew sessions
	shutdownForce               bool // --force: skip confirmation (alias for --yes)
	shutdownYes                 bool // --yes: skip confirmation
	shutdownPolecatsOnly        bool // --polecats-only: stop only polecats, keep infrastructure
	shutdownNuclear             bool // --nuclear: clean up polecats even with uncommitted work
	shutdownCleanupOrphans      bool // --cleanup-orphans: kill orphaned TTY-less Claude processes
	shutdownCleanupOrphansGrace int  // --cleanup-orphans-grace-secs: SIGTERM→SIGKILL grace period
)
// startCmd implements 'gt start [path]': boots the core agents (Deacon and
// Mayor) and, with --all, every rig's Witness and Refinery. A path of the
// form rig/crew/name is routed to the crew-start path instead (see runStart).
var startCmd = &cobra.Command{
	Use:     "start [path]",
	GroupID: GroupServices,
	Short:   "Start Gas Town or a crew workspace",
	Long: `Start Gas Town by launching the Deacon and Mayor.
The Deacon is the health-check orchestrator that monitors Mayor and Witnesses.
The Mayor is the global coordinator that dispatches work.
By default, other agents (Witnesses, Refineries) are started lazily as needed.
Use --all to start Witnesses and Refineries for all registered rigs immediately.
Crew shortcut:
If a path like "rig/crew/name" is provided, starts that crew workspace.
This is equivalent to 'gt start crew rig/name'.
To stop Gas Town, use 'gt shutdown'.`,
	Args: cobra.MaximumNArgs(1),
	RunE: runStart,
}
// shutdownCmd implements 'gt shutdown': the permanent "done for the day"
// teardown. Unlike 'gt down' (reversible pause), it kills sessions AND
// removes polecat worktrees/branches, then stops the daemon. See runShutdown.
var shutdownCmd = &cobra.Command{
	Use:     "shutdown",
	GroupID: GroupServices,
	Short:   "Shutdown Gas Town with cleanup",
	Long: `Shutdown Gas Town by stopping agents and cleaning up polecats.
This is the "done for the day" command - it stops everything AND removes
polecat worktrees/branches. For a reversible pause, use 'gt down' instead.
Comparison:
gt down - Pause (stop processes, keep worktrees) - reversible
gt shutdown - Done (stop + cleanup worktrees) - permanent cleanup
After killing sessions, polecats are cleaned up:
- Worktrees are removed
- Polecat branches are deleted
- Polecats with uncommitted work are SKIPPED (protected)
Shutdown levels (progressively more aggressive):
(default) - Stop infrastructure + polecats + cleanup
--all - Also stop crew sessions
--polecats-only - Only stop polecats (leaves infrastructure running)
Use --force or --yes to skip confirmation prompt.
Use --graceful to allow agents time to save state before killing.
Use --nuclear to force cleanup even if polecats have uncommitted work (DANGER).
Use --cleanup-orphans to kill orphaned Claude processes (TTY-less, older than 60s).
Use --cleanup-orphans-grace-secs to set the grace period (default 60s).`,
	RunE: runShutdown,
}
// startCrewCmd implements 'gt start crew <name>': create-if-missing plus
// detached attach in one step. Handled by runStartCrew.
var startCrewCmd = &cobra.Command{
	Use:   "crew <name>",
	Short: "Start a crew workspace (creates if needed)",
	Long: `Start a crew workspace, creating it if it doesn't exist.
This is a convenience command that combines 'gt crew add' and 'gt crew at --detached'.
The crew session starts in the background with Claude running and ready.
The name can include the rig in slash format (e.g., greenplace/joe).
If not specified, the rig is inferred from the current directory.
Examples:
gt start crew joe # Start joe in current rig
gt start crew greenplace/joe # Start joe in gastown rig
gt start crew joe --rig beads # Start joe in beads rig`,
	Args: cobra.ExactArgs(1),
	RunE: runStartCrew,
}
// init wires flags onto the start/shutdown commands and registers them
// on the root command. startCrewCmd is a subcommand of startCmd.
func init() {
	// 'gt start' flags.
	startCmd.Flags().BoolVarP(&startAll, "all", "a", false,
		"Also start Witnesses and Refineries for all rigs")
	startCmd.Flags().StringVar(&startAgentOverride, "agent", "", "Agent alias to run Mayor/Deacon with (overrides town default)")

	// 'gt start crew' flags.
	startCrewCmd.Flags().StringVar(&startCrewRig, "rig", "", "Rig to use")
	startCrewCmd.Flags().StringVar(&startCrewAccount, "account", "", "Claude Code account handle to use")
	startCrewCmd.Flags().StringVar(&startCrewAgentOverride, "agent", "", "Agent alias to run crew worker with (overrides rig/town default)")
	startCmd.AddCommand(startCrewCmd)

	// 'gt shutdown' flags.
	shutdownCmd.Flags().BoolVarP(&shutdownGraceful, "graceful", "g", false,
		"Send ESC to agents and wait for them to handoff before killing")
	shutdownCmd.Flags().IntVarP(&shutdownWait, "wait", "w", 30,
		"Seconds to wait for graceful shutdown (default 30)")
	shutdownCmd.Flags().BoolVarP(&shutdownAll, "all", "a", false,
		"Also stop crew sessions (by default, crew is preserved)")
	shutdownCmd.Flags().BoolVarP(&shutdownForce, "force", "f", false,
		"Skip confirmation prompt (alias for --yes)")
	shutdownCmd.Flags().BoolVarP(&shutdownYes, "yes", "y", false,
		"Skip confirmation prompt")
	shutdownCmd.Flags().BoolVar(&shutdownPolecatsOnly, "polecats-only", false,
		"Only stop polecats (minimal shutdown)")
	shutdownCmd.Flags().BoolVar(&shutdownNuclear, "nuclear", false,
		"Force cleanup even if polecats have uncommitted work (DANGER: may lose work)")
	shutdownCmd.Flags().BoolVar(&shutdownCleanupOrphans, "cleanup-orphans", false,
		"Clean up orphaned Claude processes (TTY-less processes older than 60s)")
	shutdownCmd.Flags().IntVar(&shutdownCleanupOrphansGrace, "cleanup-orphans-grace-secs", 60,
		"Grace period in seconds between SIGTERM and SIGKILL when cleaning orphans (default 60)")

	rootCmd.AddCommand(startCmd)
	rootCmd.AddCommand(shutdownCmd)
}
// runStart is the RunE handler for 'gt start'. It routes rig/crew/name
// arguments to runStartCrew, then starts core agents (Mayor, Deacon), the
// per-rig agents when --all is set, and any configured crew — all in
// parallel goroutines. Returns the first core-agent start error, if any;
// rig-agent and crew failures are reported but non-fatal.
func runStart(cmd *cobra.Command, args []string) error {
	// Check if arg looks like a crew path (rig/crew/name)
	if len(args) == 1 && strings.Contains(args[0], "/crew/") {
		// Parse rig/crew/name format
		parts := strings.SplitN(args[0], "/crew/", 2)
		if len(parts) == 2 && parts[0] != "" && parts[1] != "" {
			// Route to crew start with rig/name format
			crewArg := parts[0] + "/" + parts[1]
			return runStartCrew(cmd, []string{crewArg})
		}
	}
	// Verify we're in a Gas Town workspace
	townRoot, err := workspace.FindFromCwdOrError()
	if err != nil {
		return fmt.Errorf("not in a Gas Town workspace: %w", err)
	}
	// Best-effort: a missing daemon config is reported but does not block startup.
	if err := config.EnsureDaemonPatrolConfig(townRoot); err != nil {
		fmt.Printf(" %s Could not ensure daemon config: %v\n", style.Dim.Render("○"), err)
	}
	t := tmux.NewTmux()
	fmt.Printf("Starting Gas Town from %s\n\n", style.Dim.Render(townRoot))
	fmt.Println("Starting all agents in parallel...")
	fmt.Println()
	// Discover rigs once upfront to avoid redundant calls from parallel goroutines
	rigs, rigsErr := discoverAllRigs(townRoot)
	if rigsErr != nil {
		fmt.Printf(" %s Could not discover rigs: %v\n", style.Dim.Render("○"), rigsErr)
		// Continue anyway - core agents don't need rigs
	}
	// Start all agent groups in parallel for maximum speed
	var wg sync.WaitGroup
	var mu sync.Mutex // Protects stdout
	var coreErr error // First error from core-agent startup; guarded by mu
	// Start core agents (Mayor and Deacon) in background
	wg.Add(1)
	go func() {
		defer wg.Done()
		if err := startCoreAgents(townRoot, startAgentOverride, &mu); err != nil {
			mu.Lock()
			coreErr = err
			mu.Unlock()
		}
	}()
	// Start rig agents (witnesses, refineries) if --all
	if startAll && rigs != nil {
		wg.Add(1)
		go func() {
			defer wg.Done()
			startRigAgents(rigs, &mu)
		}()
	}
	// Start configured crew
	if rigs != nil {
		wg.Add(1)
		go func() {
			defer wg.Done()
			startConfiguredCrew(t, rigs, townRoot, &mu)
		}()
	}
	wg.Wait()
	// Only core-agent failures abort the command; everything else was best-effort.
	if coreErr != nil {
		return coreErr
	}
	fmt.Println()
	fmt.Printf("%s Gas Town is running\n", style.Bold.Render("✓"))
	fmt.Println()
	fmt.Printf(" Attach to Mayor: %s\n", style.Dim.Render("gt mayor attach"))
	fmt.Printf(" Attach to Deacon: %s\n", style.Dim.Render("gt deacon attach"))
	fmt.Printf(" Check status: %s\n", style.Dim.Render("gt status"))
	return nil
}
// startCoreAgents starts Mayor and Deacon sessions in parallel using the Manager pattern.
// The mutex is used to synchronize output with other parallel startup operations.
func startCoreAgents(townRoot string, agentOverride string, mu *sync.Mutex) error {
var wg sync.WaitGroup
var firstErr error
var errMu sync.Mutex
// Start Mayor in goroutine
wg.Add(1)
go func() {
defer wg.Done()
mayorMgr := mayor.NewManager(townRoot)
if err := mayorMgr.Start(agentOverride); err != nil {
if errors.Is(err, mayor.ErrAlreadyRunning) {
mu.Lock()
fmt.Printf(" %s Mayor already running\n", style.Dim.Render("○"))
mu.Unlock()
} else {
errMu.Lock()
if firstErr == nil {
firstErr = fmt.Errorf("starting Mayor: %w", err)
}
errMu.Unlock()
mu.Lock()
fmt.Printf(" %s Mayor failed: %v\n", style.Dim.Render("○"), err)
mu.Unlock()
}
} else {
mu.Lock()
fmt.Printf(" %s Mayor started\n", style.Bold.Render("✓"))
mu.Unlock()
}
}()
// Start Deacon in goroutine
wg.Add(1)
go func() {
defer wg.Done()
deaconMgr := deacon.NewManager(townRoot)
if err := deaconMgr.Start(agentOverride); err != nil {
if errors.Is(err, deacon.ErrAlreadyRunning) {
mu.Lock()
fmt.Printf(" %s Deacon already running\n", style.Dim.Render("○"))
mu.Unlock()
} else {
errMu.Lock()
if firstErr == nil {
firstErr = fmt.Errorf("starting Deacon: %w", err)
}
errMu.Unlock()
mu.Lock()
fmt.Printf(" %s Deacon failed: %v\n", style.Dim.Render("○"), err)
mu.Unlock()
}
} else {
mu.Lock()
fmt.Printf(" %s Deacon started\n", style.Bold.Render("✓"))
mu.Unlock()
}
}()
wg.Wait()
return firstErr
}
// startRigAgents starts witness and refinery for all rigs in parallel.
// Called when --all flag is passed to gt start.
// startRigAgents launches the witness and refinery for every rig
// concurrently. Invoked when 'gt start' is run with --all.
func startRigAgents(rigs []*rig.Rig, mu *sync.Mutex) {
	var wg sync.WaitGroup
	// report serializes status output from the worker goroutines.
	report := func(msg string) {
		mu.Lock()
		fmt.Print(msg)
		mu.Unlock()
	}
	for _, r := range rigs {
		wg.Add(2) // one goroutine each for witness and refinery
		go func(r *rig.Rig) {
			defer wg.Done()
			report(startWitnessForRig(r))
		}(r)
		go func(r *rig.Rig) {
			defer wg.Done()
			report(startRefineryForRig(r))
		}(r)
	}
	wg.Wait()
}
// startWitnessForRig starts the witness for a single rig and returns a status message.
// startWitnessForRig launches the witness for one rig and returns a
// human-readable status line (trailing newline included).
func startWitnessForRig(r *rig.Rig) string {
	err := witness.NewManager(r).Start(false, "", nil)
	switch {
	case err == nil:
		return fmt.Sprintf(" %s %s witness started\n", style.Bold.Render("✓"), r.Name)
	case errors.Is(err, witness.ErrAlreadyRunning):
		return fmt.Sprintf(" %s %s witness already running\n", style.Dim.Render("○"), r.Name)
	default:
		return fmt.Sprintf(" %s %s witness failed: %v\n", style.Dim.Render("○"), r.Name, err)
	}
}
// startRefineryForRig starts the refinery for a single rig and returns a status message.
// startRefineryForRig launches the refinery for one rig and returns a
// human-readable status line (trailing newline included).
func startRefineryForRig(r *rig.Rig) string {
	err := refinery.NewManager(r).Start(false, "")
	switch {
	case err == nil:
		return fmt.Sprintf(" %s %s refinery started\n", style.Bold.Render("✓"), r.Name)
	case errors.Is(err, refinery.ErrAlreadyRunning):
		return fmt.Sprintf(" %s %s refinery already running\n", style.Dim.Render("○"), r.Name)
	default:
		return fmt.Sprintf(" %s %s refinery failed: %v\n", style.Dim.Render("○"), r.Name, err)
	}
}
// startConfiguredCrew starts crew members configured in rig settings in parallel.
// startConfiguredCrew starts every crew member listed in each rig's
// crew.startup setting, in parallel. Prints a placeholder line when
// nothing was started.
func startConfiguredCrew(t *tmux.Tmux, rigs []*rig.Rig, townRoot string, mu *sync.Mutex) {
	var (
		wg         sync.WaitGroup
		startedAny int32 // set to 1 atomically when any member is (re)started
	)
	for _, r := range rigs {
		for _, member := range getCrewToStart(r) {
			wg.Add(1)
			go func(r *rig.Rig, member string) {
				defer wg.Done()
				msg, started := startOrRestartCrewMember(t, r, member, townRoot)
				mu.Lock()
				fmt.Print(msg)
				mu.Unlock()
				if started {
					atomic.StoreInt32(&startedAny, 1)
				}
			}(r, member)
		}
	}
	wg.Wait()
	if atomic.LoadInt32(&startedAny) == 0 {
		mu.Lock()
		fmt.Printf(" %s No crew configured or all already running\n", style.Dim.Render("○"))
		mu.Unlock()
	}
}
// startOrRestartCrewMember starts or restarts a single crew member and returns a status message.
// startOrRestartCrewMember ensures a single crew member is up:
//   - session exists and agent alive: no-op, reports "already running"
//   - session exists but agent exited: re-launches the agent inside the
//     existing tmux session, passing a startup beacon so the new agent can
//     discover its predecessor via /resume
//   - no session: creates and starts a fresh one via startCrewMember
//
// Returns a status line for display and whether anything was (re)started.
func startOrRestartCrewMember(t *tmux.Tmux, r *rig.Rig, crewName, townRoot string) (msg string, started bool) {
	sessionID := crewSessionName(r.Name, crewName)
	if running, _ := t.HasSession(sessionID); running {
		// Session exists - check if agent is still running
		agentCfg := config.ResolveRoleAgentConfig(constants.RoleCrew, townRoot, r.Path)
		if !t.IsAgentRunning(sessionID, config.ExpectedPaneCommands(agentCfg)...) {
			// Agent has exited, restart it
			// Build startup beacon for predecessor discovery via /resume
			address := fmt.Sprintf("%s/crew/%s", r.Name, crewName)
			beacon := session.FormatStartupNudge(session.StartupNudgeConfig{
				Recipient: address,
				Sender:    "human",
				Topic:     "restart",
			})
			agentCmd := config.BuildCrewStartupCommand(r.Name, crewName, r.Path, beacon)
			if err := t.SendKeys(sessionID, agentCmd); err != nil {
				return fmt.Sprintf(" %s %s/%s restart failed: %v\n", style.Dim.Render("○"), r.Name, crewName, err), false
			}
			return fmt.Sprintf(" %s %s/%s agent restarted\n", style.Bold.Render("✓"), r.Name, crewName), true
		}
		return fmt.Sprintf(" %s %s/%s already running\n", style.Dim.Render("○"), r.Name, crewName), false
	}
	if err := startCrewMember(r.Name, crewName, townRoot); err != nil {
		return fmt.Sprintf(" %s %s/%s failed: %v\n", style.Dim.Render("○"), r.Name, crewName, err), false
	}
	return fmt.Sprintf(" %s %s/%s started\n", style.Bold.Render("✓"), r.Name, crewName), true
}
// discoverAllRigs finds all rigs in the workspace.
// discoverAllRigs loads mayor/rigs.json and asks the rig manager to
// enumerate every rig registered in this workspace.
func discoverAllRigs(townRoot string) ([]*rig.Rig, error) {
	cfg, err := config.LoadRigsConfig(filepath.Join(townRoot, "mayor", "rigs.json"))
	if err != nil {
		return nil, fmt.Errorf("loading rigs config: %w", err)
	}
	mgr := rig.NewManager(townRoot, cfg, git.NewGit(townRoot))
	return mgr.DiscoverRigs()
}
// runShutdown is the RunE handler for 'gt shutdown'. It lists tmux sessions,
// splits them into stop/preserve sets per the shutdown flags, asks for
// confirmation (unless --yes/--force), and delegates to the graceful or
// immediate shutdown path. Even when no sessions exist it still checks for
// an orphaned daemon — a daemon can outlive every session.
func runShutdown(cmd *cobra.Command, args []string) error {
	t := tmux.NewTmux()
	// Find workspace root for polecat cleanup
	townRoot, _ := workspace.FindFromCwd()
	// Collect sessions to show what will be stopped
	sessions, err := t.ListSessions()
	if err != nil {
		return fmt.Errorf("listing sessions: %w", err)
	}
	// Get session names for categorization
	mayorSession := getMayorSessionName()
	deaconSession := getDeaconSessionName()
	toStop, preserved := categorizeSessions(sessions, mayorSession, deaconSession)
	if len(toStop) == 0 {
		fmt.Printf("%s Gas Town was not running\n", style.Dim.Render("○"))
		// Still check for orphaned daemons even if no sessions are running
		if townRoot != "" {
			fmt.Println()
			fmt.Println("Checking for orphaned daemon...")
			stopDaemonIfRunning(townRoot)
		}
		return nil
	}
	// Show what will happen
	fmt.Println("Sessions to stop:")
	for _, sess := range toStop {
		fmt.Printf(" %s %s\n", style.Bold.Render("→"), sess)
	}
	if len(preserved) > 0 && !shutdownAll {
		fmt.Println()
		fmt.Println("Sessions preserved (crew):")
		for _, sess := range preserved {
			fmt.Printf(" %s %s\n", style.Dim.Render("○"), sess)
		}
	}
	fmt.Println()
	// Confirmation prompt (skipped with --yes or --force)
	if !shutdownYes && !shutdownForce {
		fmt.Printf("Proceed with shutdown? [y/N] ")
		reader := bufio.NewReader(os.Stdin)
		response, _ := reader.ReadString('\n')
		response = strings.TrimSpace(strings.ToLower(response))
		if response != "y" && response != "yes" {
			fmt.Println("Shutdown canceled.")
			return nil
		}
	}
	if shutdownGraceful {
		return runGracefulShutdown(t, toStop, townRoot)
	}
	return runImmediateShutdown(t, toStop, townRoot)
}
// categorizeSessions splits sessions into those to stop and those to preserve.
// mayorSession and deaconSession are the dynamic session names for the current town.
// categorizeSessions splits tmux sessions into those to stop and those to
// preserve, honoring the shutdownPolecatsOnly and shutdownAll flags.
// mayorSession and deaconSession are the dynamic session names for the
// current town. Non-Gas-Town sessions are ignored entirely.
func categorizeSessions(sessions []string, mayorSession, deaconSession string) (toStop, preserved []string) {
	for _, sess := range sessions {
		// Gas Town owns sessions prefixed gt- (rig-level) or hq- (town-level).
		if !strings.HasPrefix(sess, "gt-") && !strings.HasPrefix(sess, "hq-") {
			continue
		}
		isCrew := strings.Contains(sess, "-crew-")
		// Polecat pattern: gt-<rig>-<name> where <name> is not a known role.
		isPolecat := false
		if !isCrew && sess != mayorSession && sess != deaconSession {
			if parts := strings.Split(sess, "-"); len(parts) >= 3 {
				switch parts[2] {
				case "witness", "refinery", "crew":
					// infrastructure role, not a polecat
				default:
					isPolecat = true
				}
			}
		}
		switch {
		case shutdownPolecatsOnly:
			// Only polecats go down; everything else is kept.
			if isPolecat {
				toStop = append(toStop, sess)
			} else {
				preserved = append(preserved, sess)
			}
		case shutdownAll:
			// Everything goes down, crew included.
			toStop = append(toStop, sess)
		case isCrew:
			// Default policy: crew survives shutdown.
			preserved = append(preserved, sess)
		default:
			toStop = append(toStop, sess)
		}
	}
	return
}
// runGracefulShutdown performs a seven-phase shutdown: interrupt agents
// (ESC), ask them to handoff, wait up to shutdownWait seconds, kill sessions
// in dependency order (Deacon first, Mayor last), then optionally reap
// orphaned Claude processes, clean up polecats, and stop the daemon.
func runGracefulShutdown(t *tmux.Tmux, gtSessions []string, townRoot string) error {
	fmt.Printf("Graceful shutdown of Gas Town (waiting up to %ds)...\n\n", shutdownWait)
	// Phase 1: Send ESC to all agents to interrupt them
	fmt.Printf("Phase 1: Sending ESC to %d agent(s)...\n", len(gtSessions))
	for _, sess := range gtSessions {
		fmt.Printf(" %s Interrupting %s\n", style.Bold.Render("→"), sess)
		_ = t.SendKeysRaw(sess, "Escape") // best-effort interrupt
	}
	// Phase 2: Send shutdown message asking agents to handoff
	fmt.Printf("\nPhase 2: Requesting handoff from agents...\n")
	shutdownMsg := "[SHUTDOWN] Gas Town is shutting down. Please save your state and update your handoff bead, then type /exit or wait to be terminated."
	for _, sess := range gtSessions {
		// Small delay then send the message
		time.Sleep(constants.ShutdownNotifyDelay)
		_ = t.SendKeys(sess, shutdownMsg) // best-effort notification
	}
	// Phase 3: Wait for agents to complete handoff
	fmt.Printf("\nPhase 3: Waiting %ds for agents to complete handoff...\n", shutdownWait)
	fmt.Printf(" %s\n", style.Dim.Render("(Press Ctrl-C to force immediate shutdown)"))
	// Wait with a countdown printed every 5 seconds; the final sleep is
	// clamped so the total wait never exceeds shutdownWait.
	for remaining := shutdownWait; remaining > 0; remaining -= 5 {
		if remaining < shutdownWait {
			fmt.Printf(" %s %ds remaining...\n", style.Dim.Render("⏳"), remaining)
		}
		sleepTime := 5
		if remaining < 5 {
			sleepTime = remaining
		}
		time.Sleep(time.Duration(sleepTime) * time.Second)
	}
	// Phase 4: Kill sessions in correct order (Deacon first so it cannot
	// restart the others; Mayor last)
	fmt.Printf("\nPhase 4: Terminating sessions...\n")
	mayorSession := getMayorSessionName()
	deaconSession := getDeaconSessionName()
	stopped := killSessionsInOrder(t, gtSessions, mayorSession, deaconSession)
	// Phase 5: Cleanup orphaned Claude processes if requested
	if shutdownCleanupOrphans {
		fmt.Printf("\nPhase 5: Cleaning up orphaned Claude processes...\n")
		cleanupOrphanedClaude(shutdownCleanupOrphansGrace)
	}
	// Phase 6: Cleanup polecat worktrees and branches
	fmt.Printf("\nPhase 6: Cleaning up polecats...\n")
	if townRoot != "" {
		cleanupPolecats(townRoot)
	}
	// Phase 7: Stop the daemon so it cannot resurrect anything just killed
	fmt.Printf("\nPhase 7: Stopping daemon...\n")
	if townRoot != "" {
		stopDaemonIfRunning(townRoot)
	}
	fmt.Println()
	fmt.Printf("%s Graceful shutdown complete (%d sessions stopped)\n", style.Bold.Render("✓"), stopped)
	return nil
}
// runImmediateShutdown kills all target sessions without any grace period,
// then performs the same post-kill cleanup as the graceful path: optional
// orphan reaping, polecat cleanup, and daemon stop.
func runImmediateShutdown(t *tmux.Tmux, gtSessions []string, townRoot string) error {
	fmt.Println("Shutting down Gas Town...")
	stopped := killSessionsInOrder(t, gtSessions, getMayorSessionName(), getDeaconSessionName())

	if shutdownCleanupOrphans {
		fmt.Println()
		fmt.Println("Cleaning up orphaned Claude processes...")
		cleanupOrphanedClaude(shutdownCleanupOrphansGrace)
	}

	// Workspace-rooted cleanup steps only run when we know the town root.
	if townRoot != "" {
		fmt.Println()
		fmt.Println("Cleaning up polecats...")
		cleanupPolecats(townRoot)

		fmt.Println()
		fmt.Println("Stopping daemon...")
		stopDaemonIfRunning(townRoot)
	}

	fmt.Println()
	fmt.Printf("%s Gas Town shutdown complete (%d sessions stopped)\n", style.Bold.Render("✓"), stopped)
	return nil
}
// killSessionsInOrder stops sessions in the correct order:
// 1. Deacon first (so it doesn't restart others)
// 2. Everything except Mayor
// 3. Mayor last
// mayorSession and deaconSession are the dynamic session names for the current town.
//
// Returns the count of sessions that were successfully stopped (verified by checking
// if the session no longer exists after the kill attempt).
// killSessionsInOrder stops sessions in dependency order:
//  1. Deacon first (so it doesn't restart others)
//  2. Everything except Mayor
//  3. Mayor last
//
// mayorSession and deaconSession are the dynamic session names for the
// current town. Returns the count of sessions verified stopped (the session
// no longer exists after the kill attempt).
func killSessionsInOrder(t *tmux.Tmux, sessions []string, mayorSession, deaconSession string) int {
	stopped := 0

	// contains reports whether target is one of the sessions we were given.
	contains := func(target string) bool {
		for _, s := range sessions {
			if s == target {
				return true
			}
		}
		return false
	}

	// stop kills one session and counts it only if it is verifiably gone.
	stop := func(sess string) {
		if exists, _ := t.HasSession(sess); !exists {
			return // already gone
		}
		// KillSessionWithProcesses may report an error even when the
		// processes died and the session auto-closed, so the error is
		// ignored and existence is re-checked instead.
		_ = t.KillSessionWithProcesses(sess)
		if still, _ := t.HasSession(sess); !still {
			fmt.Printf(" %s %s stopped\n", style.Bold.Render("✓"), sess)
			stopped++
		}
	}

	// 1. Deacon first.
	if contains(deaconSession) {
		stop(deaconSession)
	}
	// 2. Everything else except Mayor.
	for _, sess := range sessions {
		if sess != deaconSession && sess != mayorSession {
			stop(sess)
		}
	}
	// 3. Mayor last.
	if contains(mayorSession) {
		stop(mayorSession)
	}
	return stopped
}
// cleanupPolecats removes polecat worktrees and branches for all rigs.
// It refuses to clean up polecats with uncommitted work unless --nuclear is set.
// cleanupPolecats removes polecat worktrees and branches for all rigs.
// Polecats with uncommitted work (or whose status can't be determined) are
// skipped unless --nuclear is set, in which case removal proceeds with a
// warning. Prints a per-polecat status line and a final summary. All
// failures are reported but never abort the sweep.
func cleanupPolecats(townRoot string) {
	// Load rigs config
	rigsConfigPath := filepath.Join(townRoot, "mayor", "rigs.json")
	rigsConfig, err := config.LoadRigsConfig(rigsConfigPath)
	if err != nil {
		fmt.Printf(" %s Could not load rigs config: %v\n", style.Dim.Render("○"), err)
		return
	}
	g := git.NewGit(townRoot)
	rigMgr := rig.NewManager(townRoot, rigsConfig, g)
	// Discover all rigs
	rigs, err := rigMgr.DiscoverRigs()
	if err != nil {
		fmt.Printf(" %s Could not discover rigs: %v\n", style.Dim.Render("○"), err)
		return
	}
	totalCleaned := 0
	totalSkipped := 0
	var uncommittedPolecats []string // names listed in the final warning summary
	for _, r := range rigs {
		polecatGit := git.NewGit(r.Path)
		polecatMgr := polecat.NewManager(r, polecatGit, nil) // nil tmux: just listing, not allocating
		polecats, err := polecatMgr.List()
		if err != nil {
			// Can't list this rig's polecats; move on to the next rig.
			continue
		}
		for _, p := range polecats {
			// Check for uncommitted work
			pGit := git.NewGit(p.ClonePath)
			status, err := pGit.CheckUncommittedWork()
			if err != nil {
				// Can't check, be safe and skip unless nuclear
				if !shutdownNuclear {
					fmt.Printf(" %s %s/%s: could not check status, skipping\n",
						style.Dim.Render("○"), r.Name, p.Name)
					totalSkipped++
					continue
				}
			} else if !status.Clean() {
				// Has uncommitted work
				if !shutdownNuclear {
					uncommittedPolecats = append(uncommittedPolecats,
						fmt.Sprintf("%s/%s (%s)", r.Name, p.Name, status.String()))
					totalSkipped++
					continue
				}
				// Nuclear mode: warn but proceed
				fmt.Printf(" %s %s/%s: NUCLEAR - removing despite %s\n",
					style.Bold.Render("⚠"), r.Name, p.Name, status.String())
			}
			// Clean: remove worktree and branch
			if err := polecatMgr.RemoveWithOptions(p.Name, true, shutdownNuclear); err != nil {
				fmt.Printf(" %s %s/%s: cleanup failed: %v\n",
					style.Dim.Render("○"), r.Name, p.Name, err)
				totalSkipped++
				continue
			}
			// Delete the polecat branch from mayor's clone
			branchName := fmt.Sprintf("polecat/%s", p.Name)
			mayorPath := filepath.Join(r.Path, "mayor", "rig")
			mayorGit := git.NewGit(mayorPath)
			_ = mayorGit.DeleteBranch(branchName, true) // Ignore errors
			fmt.Printf(" %s %s/%s: cleaned up\n", style.Bold.Render("✓"), r.Name, p.Name)
			totalCleaned++
		}
	}
	// Summary
	if len(uncommittedPolecats) > 0 {
		fmt.Println()
		fmt.Printf(" %s Polecats with uncommitted work (use --nuclear to force):\n",
			style.Bold.Render("⚠"))
		for _, pc := range uncommittedPolecats {
			fmt.Printf(" • %s\n", pc)
		}
	}
	if totalCleaned > 0 || totalSkipped > 0 {
		fmt.Printf(" Cleaned: %d, Skipped: %d\n", totalCleaned, totalSkipped)
	} else {
		fmt.Printf(" %s No polecats to clean up\n", style.Dim.Render("○"))
	}
}
// stopDaemonIfRunning stops the daemon if it is running.
// This prevents the daemon from restarting agents after shutdown.
// Uses robust detection with fallback to process search.
// stopDaemonIfRunning stops the daemon if it is running, preventing it from
// restarting agents after shutdown. Detection is two-layered:
//
//  1. Primary: the PID file via daemon.IsRunning. Errors (stale or corrupted
//     PID files) are reported as warnings but do NOT abort — the fallback
//     still runs, which is the fix for the earlier silent-failure bug.
//  2. Fallback: daemon.FindOrphanedDaemons searches for daemon processes not
//     tracked by the PID file and kills them via daemon.KillOrphanedDaemons.
//
// All outcomes are reported to stdout; this function never returns an error
// because shutdown should proceed regardless.
func stopDaemonIfRunning(townRoot string) {
	// Primary detection: PID file
	running, pid, err := daemon.IsRunning(townRoot)
	if err != nil {
		// Detection error - report it but continue with fallback
		fmt.Printf(" %s Daemon detection warning: %s\n", style.Bold.Render("⚠"), err.Error())
	}
	if running {
		// PID file points to live daemon - stop it
		if err := daemon.StopDaemon(townRoot); err != nil {
			fmt.Printf(" %s Failed to stop daemon (PID %d): %s\n",
				style.Bold.Render("✗"), pid, err.Error())
		} else {
			fmt.Printf(" %s Daemon stopped (was PID %d)\n", style.Bold.Render("✓"), pid)
		}
	} else {
		fmt.Printf(" %s Daemon not tracked by PID file\n", style.Dim.Render("○"))
	}
	// Fallback: Search for orphaned daemon processes (runs even when the
	// PID-file path succeeded, to catch processes the file doesn't track)
	orphaned, err := daemon.FindOrphanedDaemons()
	if err != nil {
		fmt.Printf(" %s Warning: failed to search for orphaned daemons: %v\n",
			style.Dim.Render("○"), err)
		return
	}
	if len(orphaned) > 0 {
		fmt.Printf(" %s Found %d orphaned daemon process(es): %v\n",
			style.Bold.Render("⚠"), len(orphaned), orphaned)
		killed, err := daemon.KillOrphanedDaemons()
		if err != nil {
			fmt.Printf(" %s Failed to kill orphaned daemons: %v\n",
				style.Bold.Render("✗"), err)
		} else if killed > 0 {
			fmt.Printf(" %s Killed %d orphaned daemon(s)\n",
				style.Bold.Render("✓"), killed)
		}
	}
}
// runStartCrew starts a crew workspace, creating it if it doesn't exist.
// This combines the functionality of 'gt crew add' and 'gt crew at --detached'.
// runStartCrew is the RunE handler for 'gt start crew <name>'. It resolves
// the target rig (from rig/name syntax, --rig, or the current directory, in
// that precedence), resolves the Claude account, and delegates to the crew
// manager's Start — which creates the workspace if needed and launches a
// detached session. An already-running session is reported, not an error.
func runStartCrew(cmd *cobra.Command, args []string) error {
	name := args[0]
	// Parse rig/name format (e.g., "greenplace/joe" -> rig=gastown, name=joe)
	rigName := startCrewRig
	if parsedRig, crewName, ok := parseRigSlashName(name); ok {
		// An explicit --rig flag wins over the rig embedded in the argument.
		if rigName == "" {
			rigName = parsedRig
		}
		name = crewName
	}
	// Find workspace
	townRoot, err := workspace.FindFromCwdOrError()
	if err != nil {
		return fmt.Errorf("not in a Gas Town workspace: %w", err)
	}
	// If rig still not specified, try to infer from cwd
	if rigName == "" {
		rigName, err = inferRigFromCwd(townRoot)
		if err != nil {
			return fmt.Errorf("could not determine rig (use --rig flag or rig/name format): %w", err)
		}
	}
	// Load rigs config; fall back to an empty config on error so GetRig
	// can still produce a clear "not found" message.
	rigsConfigPath := filepath.Join(townRoot, "mayor", "rigs.json")
	rigsConfig, err := config.LoadRigsConfig(rigsConfigPath)
	if err != nil {
		rigsConfig = &config.RigsConfig{Rigs: make(map[string]config.RigEntry)}
	}
	// Get rig
	g := git.NewGit(townRoot)
	rigMgr := rig.NewManager(townRoot, rigsConfig, g)
	r, err := rigMgr.GetRig(rigName)
	if err != nil {
		return fmt.Errorf("rig '%s' not found", rigName)
	}
	// Create crew manager
	crewGit := git.NewGit(r.Path)
	crewMgr := crew.NewManager(r, crewGit)
	// Resolve account for Claude config
	accountsPath := constants.MayorAccountsPath(townRoot)
	claudeConfigDir, accountHandle, err := config.ResolveAccountConfigDir(accountsPath, startCrewAccount)
	if err != nil {
		return fmt.Errorf("resolving account: %w", err)
	}
	if accountHandle != "" {
		fmt.Printf("Using account: %s\n", accountHandle)
	}
	// Use manager's Start() method - handles workspace creation, settings, and session
	err = crewMgr.Start(name, crew.StartOptions{
		Account:         startCrewAccount,
		ClaudeConfigDir: claudeConfigDir,
		AgentOverride:   startCrewAgentOverride,
	})
	if err != nil {
		if errors.Is(err, crew.ErrSessionRunning) {
			fmt.Printf("%s Session already running: %s\n", style.Dim.Render("○"), crewMgr.SessionName(name))
		} else {
			return err
		}
	} else {
		fmt.Printf("%s Started crew workspace: %s/%s\n",
			style.Bold.Render("✓"), rigName, name)
	}
	fmt.Printf("Attach with: %s\n", style.Dim.Render(fmt.Sprintf("gt crew at %s", name)))
	return nil
}
// getCrewToStart reads rig settings and parses the crew.startup field.
// Returns a list of crew names to start.
// getCrewToStart reads the rig's settings and interprets the crew.startup
// field: "" or "none" mean nobody, "all" means every existing crew member,
// otherwise a human-friendly list like "max", "max and joe", or
// "max, joe, emma". Returns the crew names to start (nil when none).
func getCrewToStart(r *rig.Rig) []string {
	settings, err := config.LoadRigSettings(filepath.Join(r.Path, "settings", "config.json"))
	if err != nil || settings.Crew == nil {
		return nil
	}
	startup := settings.Crew.Startup
	switch startup {
	case "", "none":
		return nil
	case "all":
		// Enumerate every existing crew member for this rig.
		workers, err := crew.NewManager(r, git.NewGit(r.Path)).List()
		if err != nil {
			return nil
		}
		var names []string
		for _, w := range workers {
			names = append(names, w.Name)
		}
		return names
	}
	// Normalize " and " to a comma so every form splits uniformly.
	var names []string
	for _, part := range strings.Split(strings.ReplaceAll(startup, " and ", ", "), ",") {
		if name := strings.TrimSpace(part); name != "" {
			names = append(names, name)
		}
	}
	return names
}
// startCrewMember starts a single crew member, creating if needed.
// This is a simplified version of runStartCrew that doesn't print output.
// startCrewMember starts a single crew member, creating its workspace if
// needed. This is the quiet counterpart of runStartCrew: no output, and an
// already-running session counts as success.
func startCrewMember(rigName, crewName, townRoot string) error {
	// Fall back to an empty rigs config on load failure so GetRig can still
	// produce a clear "not found" error.
	cfg, err := config.LoadRigsConfig(filepath.Join(townRoot, "mayor", "rigs.json"))
	if err != nil {
		cfg = &config.RigsConfig{Rigs: make(map[string]config.RigEntry)}
	}
	r, err := rig.NewManager(townRoot, cfg, git.NewGit(townRoot)).GetRig(rigName)
	if err != nil {
		return fmt.Errorf("rig '%s' not found", rigName)
	}
	// Start handles workspace creation, settings, and session in one call.
	crewMgr := crew.NewManager(r, git.NewGit(r.Path))
	if err := crewMgr.Start(crewName, crew.StartOptions{}); err != nil && !errors.Is(err, crew.ErrSessionRunning) {
		return err
	}
	return nil
}
// cleanupOrphanedClaude finds and kills orphaned Claude processes with a grace period.
// This is a simpler synchronous implementation that:
// 1. Finds orphaned processes (TTY-less, older than 60s, not in Gas Town sessions)
// 2. Sends SIGTERM to all of them
// 3. Waits for the grace period (graceSecs seconds)
// 4. Sends SIGKILL to any that are still alive
//
// ESRCH ("no such process") is never treated as a failure: a process that
// died between discovery and signaling is exactly the desired outcome.
func cleanupOrphanedClaude(graceSecs int) {
	// Find orphaned processes
	orphans, err := util.FindOrphanedClaudeProcesses()
	if err != nil {
		fmt.Printf(" %s Warning: %v\n", style.Bold.Render("⚠"), err)
		return
	}
	if len(orphans) == 0 {
		fmt.Printf(" %s No orphaned processes found\n", style.Dim.Render("○"))
		return
	}
	// Send SIGTERM to all orphans; remember which ones actually received it
	var termPIDs []int
	for _, orphan := range orphans {
		if err := syscall.Kill(orphan.PID, syscall.SIGTERM); err != nil {
			if err != syscall.ESRCH {
				fmt.Printf(" %s PID %d: failed to send SIGTERM: %v\n",
					style.Bold.Render("⚠"), orphan.PID, err)
			}
			continue
		}
		termPIDs = append(termPIDs, orphan.PID)
		fmt.Printf(" %s PID %d: sent SIGTERM (waiting %ds before SIGKILL)\n",
			style.Bold.Render("→"), orphan.PID, graceSecs)
	}
	if len(termPIDs) == 0 {
		return
	}
	// Wait for grace period
	fmt.Printf(" %s Waiting %d seconds for processes to terminate gracefully...\n",
		style.Dim.Render("⏳"), graceSecs)
	time.Sleep(time.Duration(graceSecs) * time.Second)
	// Check which processes are still alive and send SIGKILL
	var killedCount, alreadyDeadCount int
	for _, pid := range termPIDs {
		// Signal 0 probes for existence without delivering anything
		if err := syscall.Kill(pid, 0); err != nil {
			// Process is gone (either died from SIGTERM or doesn't exist)
			alreadyDeadCount++
			continue
		}
		// Process still alive - send SIGKILL
		if err := syscall.Kill(pid, syscall.SIGKILL); err != nil {
			if err != syscall.ESRCH {
				fmt.Printf(" %s PID %d: failed to send SIGKILL: %v\n",
					style.Bold.Render("⚠"), pid, err)
			}
			continue
		}
		killedCount++
		fmt.Printf(" %s PID %d: sent SIGKILL (did not respond to SIGTERM)\n",
			style.Bold.Render("✓"), pid)
	}
	// Summary lines
	if alreadyDeadCount > 0 {
		fmt.Printf(" %s %d process(es) terminated gracefully from SIGTERM\n",
			style.Bold.Render("✓"), alreadyDeadCount)
	}
	if killedCount == 0 && alreadyDeadCount > 0 {
		fmt.Printf(" %s All processes cleaned up successfully\n",
			style.Bold.Render("✓"))
	}
}