## Problem gt shutdown failed to stop orphaned daemon processes because the detection mechanism ignored errors and had no fallback. ## Root Cause stopDaemonIfRunning() ignored errors from daemon.IsRunning(), causing: 1. Stale PID files to hide running daemons 2. Corrupted PID files to return silent false 3. No fallback detection for orphaned processes 4. Early return when no sessions running prevented daemon check ## Solution 1. Enhanced IsRunning() to return detailed errors 2. Added process name verification (prevents PID reuse false positives) 3. Added fallback orphan detection using pgrep 4. Fixed stopDaemonIfRunning() to handle errors and use fallback 5. Added daemon check even when no sessions are running ## Testing Verified shutdown now: - Detects and reports stale/corrupted PID files - Finds orphaned daemon processes - Kills all daemon processes reliably - Reports detailed status during shutdown - Works even when no other sessions are running Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
1084 lines
33 KiB
Go
1084 lines
33 KiB
Go
package cmd
|
|
|
|
import (
|
|
"bufio"
|
|
"errors"
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
"sync"
|
|
"sync/atomic"
|
|
"syscall"
|
|
"time"
|
|
|
|
"github.com/spf13/cobra"
|
|
"github.com/steveyegge/gastown/internal/config"
|
|
"github.com/steveyegge/gastown/internal/constants"
|
|
"github.com/steveyegge/gastown/internal/crew"
|
|
"github.com/steveyegge/gastown/internal/daemon"
|
|
"github.com/steveyegge/gastown/internal/deacon"
|
|
"github.com/steveyegge/gastown/internal/git"
|
|
"github.com/steveyegge/gastown/internal/mayor"
|
|
"github.com/steveyegge/gastown/internal/polecat"
|
|
"github.com/steveyegge/gastown/internal/refinery"
|
|
"github.com/steveyegge/gastown/internal/rig"
|
|
"github.com/steveyegge/gastown/internal/session"
|
|
"github.com/steveyegge/gastown/internal/style"
|
|
"github.com/steveyegge/gastown/internal/tmux"
|
|
"github.com/steveyegge/gastown/internal/util"
|
|
"github.com/steveyegge/gastown/internal/witness"
|
|
"github.com/steveyegge/gastown/internal/workspace"
|
|
)
|
|
|
|
// Command-line flag storage for 'gt start', 'gt start crew', and 'gt shutdown'.
// All values are bound to their flags in init() below.
var (
	// gt start
	startAll               bool   // --all: also start Witnesses/Refineries for every rig
	startAgentOverride     string // --agent: agent alias for Mayor/Deacon
	// gt start crew
	startCrewRig           string // --rig: target rig
	startCrewAccount       string // --account: Claude Code account handle
	startCrewAgentOverride string // --agent: agent alias for the crew worker
	// gt shutdown
	shutdownGraceful            bool // --graceful: request handoff before killing
	shutdownWait                int  // --wait: seconds to wait for graceful handoff
	shutdownAll                 bool // --all: also stop crew sessions
	shutdownForce               bool // --force: skip confirmation (alias for --yes)
	shutdownYes                 bool // --yes: skip confirmation
	shutdownPolecatsOnly        bool // --polecats-only: leave infrastructure running
	shutdownNuclear             bool // --nuclear: cleanup even with uncommitted work
	shutdownCleanupOrphans      bool // --cleanup-orphans: kill orphaned Claude processes
	shutdownCleanupOrphansGrace int  // --cleanup-orphans-grace-secs: SIGTERM->SIGKILL grace
)
|
|
|
|
// startCmd implements 'gt start': boots the core agents (Deacon and Mayor),
// optionally all rig agents with --all, and routes "rig/crew/name" paths to
// the crew-start shortcut.
var startCmd = &cobra.Command{
	Use:     "start [path]",
	GroupID: GroupServices,
	Short:   "Start Gas Town or a crew workspace",
	Long: `Start Gas Town by launching the Deacon and Mayor.

The Deacon is the health-check orchestrator that monitors Mayor and Witnesses.
The Mayor is the global coordinator that dispatches work.

By default, other agents (Witnesses, Refineries) are started lazily as needed.
Use --all to start Witnesses and Refineries for all registered rigs immediately.

Crew shortcut:
  If a path like "rig/crew/name" is provided, starts that crew workspace.
  This is equivalent to 'gt start crew rig/name'.

To stop Gas Town, use 'gt shutdown'.`,
	Args: cobra.MaximumNArgs(1),
	RunE: runStart,
}
|
|
|
|
// shutdownCmd implements 'gt shutdown': the permanent "done for the day"
// teardown (kill sessions + cleanup worktrees), as opposed to the reversible
// pause of 'gt down'.
var shutdownCmd = &cobra.Command{
	Use:     "shutdown",
	GroupID: GroupServices,
	Short:   "Shutdown Gas Town with cleanup",
	Long: `Shutdown Gas Town by stopping agents and cleaning up polecats.

This is the "done for the day" command - it stops everything AND removes
polecat worktrees/branches. For a reversible pause, use 'gt down' instead.

Comparison:
  gt down     - Pause (stop processes, keep worktrees) - reversible
  gt shutdown - Done (stop + cleanup worktrees) - permanent cleanup

After killing sessions, polecats are cleaned up:
  - Worktrees are removed
  - Polecat branches are deleted
  - Polecats with uncommitted work are SKIPPED (protected)

Shutdown levels (progressively more aggressive):
  (default)       - Stop infrastructure + polecats + cleanup
  --all           - Also stop crew sessions
  --polecats-only - Only stop polecats (leaves infrastructure running)

Use --force or --yes to skip confirmation prompt.
Use --graceful to allow agents time to save state before killing.
Use --nuclear to force cleanup even if polecats have uncommitted work (DANGER).
Use --cleanup-orphans to kill orphaned Claude processes (TTY-less, older than 60s).
Use --cleanup-orphans-grace-secs to set the grace period (default 60s).`,
	RunE: runShutdown,
}
|
|
|
|
// startCrewCmd implements 'gt start crew': a convenience wrapper that creates
// the crew workspace if needed and starts its session detached.
var startCrewCmd = &cobra.Command{
	Use:   "crew <name>",
	Short: "Start a crew workspace (creates if needed)",
	Long: `Start a crew workspace, creating it if it doesn't exist.

This is a convenience command that combines 'gt crew add' and 'gt crew at --detached'.
The crew session starts in the background with Claude running and ready.

The name can include the rig in slash format (e.g., greenplace/joe).
If not specified, the rig is inferred from the current directory.

Examples:
  gt start crew joe              # Start joe in current rig
  gt start crew greenplace/joe   # Start joe in gastown rig
  gt start crew joe --rig beads  # Start joe in beads rig`,
	Args: cobra.ExactArgs(1),
	RunE: runStartCrew,
}
|
|
|
|
// init binds all start/shutdown flags and registers the command tree
// (startCrewCmd under startCmd; startCmd and shutdownCmd under root).
func init() {
	startCmd.Flags().BoolVarP(&startAll, "all", "a", false,
		"Also start Witnesses and Refineries for all rigs")
	startCmd.Flags().StringVar(&startAgentOverride, "agent", "", "Agent alias to run Mayor/Deacon with (overrides town default)")

	startCrewCmd.Flags().StringVar(&startCrewRig, "rig", "", "Rig to use")
	startCrewCmd.Flags().StringVar(&startCrewAccount, "account", "", "Claude Code account handle to use")
	startCrewCmd.Flags().StringVar(&startCrewAgentOverride, "agent", "", "Agent alias to run crew worker with (overrides rig/town default)")
	startCmd.AddCommand(startCrewCmd)

	shutdownCmd.Flags().BoolVarP(&shutdownGraceful, "graceful", "g", false,
		"Send ESC to agents and wait for them to handoff before killing")
	shutdownCmd.Flags().IntVarP(&shutdownWait, "wait", "w", 30,
		"Seconds to wait for graceful shutdown (default 30)")
	shutdownCmd.Flags().BoolVarP(&shutdownAll, "all", "a", false,
		"Also stop crew sessions (by default, crew is preserved)")
	shutdownCmd.Flags().BoolVarP(&shutdownForce, "force", "f", false,
		"Skip confirmation prompt (alias for --yes)")
	shutdownCmd.Flags().BoolVarP(&shutdownYes, "yes", "y", false,
		"Skip confirmation prompt")
	shutdownCmd.Flags().BoolVar(&shutdownPolecatsOnly, "polecats-only", false,
		"Only stop polecats (minimal shutdown)")
	shutdownCmd.Flags().BoolVar(&shutdownNuclear, "nuclear", false,
		"Force cleanup even if polecats have uncommitted work (DANGER: may lose work)")
	shutdownCmd.Flags().BoolVar(&shutdownCleanupOrphans, "cleanup-orphans", false,
		"Clean up orphaned Claude processes (TTY-less processes older than 60s)")
	shutdownCmd.Flags().IntVar(&shutdownCleanupOrphansGrace, "cleanup-orphans-grace-secs", 60,
		"Grace period in seconds between SIGTERM and SIGKILL when cleaning orphans (default 60)")

	rootCmd.AddCommand(startCmd)
	rootCmd.AddCommand(shutdownCmd)
}
|
|
|
|
// runStart implements 'gt start'. It verifies the workspace, then launches
// the core agents (Mayor and Deacon), the rig agents (with --all), and any
// configured crew, all in parallel goroutines. Console output is serialized
// through a shared mutex. Only a core-agent failure is fatal; everything
// else is reported inline and tolerated.
func runStart(cmd *cobra.Command, args []string) error {
	// Check if arg looks like a crew path (rig/crew/name)
	if len(args) == 1 && strings.Contains(args[0], "/crew/") {
		// Parse rig/crew/name format
		parts := strings.SplitN(args[0], "/crew/", 2)
		if len(parts) == 2 && parts[0] != "" && parts[1] != "" {
			// Route to crew start with rig/name format
			crewArg := parts[0] + "/" + parts[1]
			return runStartCrew(cmd, []string{crewArg})
		}
	}

	// Verify we're in a Gas Town workspace
	townRoot, err := workspace.FindFromCwdOrError()
	if err != nil {
		return fmt.Errorf("not in a Gas Town workspace: %w", err)
	}

	// Best-effort: daemon config problems are reported but do not block startup.
	if err := config.EnsureDaemonPatrolConfig(townRoot); err != nil {
		fmt.Printf(" %s Could not ensure daemon config: %v\n", style.Dim.Render("○"), err)
	}

	t := tmux.NewTmux()

	fmt.Printf("Starting Gas Town from %s\n\n", style.Dim.Render(townRoot))
	fmt.Println("Starting all agents in parallel...")
	fmt.Println()

	// Discover rigs once upfront to avoid redundant calls from parallel goroutines
	rigs, rigsErr := discoverAllRigs(townRoot)
	if rigsErr != nil {
		fmt.Printf(" %s Could not discover rigs: %v\n", style.Dim.Render("○"), rigsErr)
		// Continue anyway - core agents don't need rigs
	}

	// Start all agent groups in parallel for maximum speed
	var wg sync.WaitGroup
	var mu sync.Mutex // Protects stdout
	var coreErr error

	// Start core agents (Mayor and Deacon) in background.
	// coreErr is written only by this goroutine, under mu, and read only
	// after wg.Wait() below, so no separate error mutex is needed here.
	wg.Add(1)
	go func() {
		defer wg.Done()
		if err := startCoreAgents(townRoot, startAgentOverride, &mu); err != nil {
			mu.Lock()
			coreErr = err
			mu.Unlock()
		}
	}()

	// Start rig agents (witnesses, refineries) if --all
	if startAll && rigs != nil {
		wg.Add(1)
		go func() {
			defer wg.Done()
			startRigAgents(rigs, &mu)
		}()
	}

	// Start configured crew
	if rigs != nil {
		wg.Add(1)
		go func() {
			defer wg.Done()
			startConfiguredCrew(t, rigs, townRoot, &mu)
		}()
	}

	wg.Wait()

	// Core agent failure is the only fatal outcome; rig/crew problems were
	// already reported inline by the goroutines above.
	if coreErr != nil {
		return coreErr
	}

	fmt.Println()
	fmt.Printf("%s Gas Town is running\n", style.Bold.Render("✓"))
	fmt.Println()
	fmt.Printf(" Attach to Mayor: %s\n", style.Dim.Render("gt mayor attach"))
	fmt.Printf(" Attach to Deacon: %s\n", style.Dim.Render("gt deacon attach"))
	fmt.Printf(" Check status: %s\n", style.Dim.Render("gt status"))

	return nil
}
|
|
|
|
// startCoreAgents starts Mayor and Deacon sessions in parallel using the Manager pattern.
|
|
// The mutex is used to synchronize output with other parallel startup operations.
|
|
func startCoreAgents(townRoot string, agentOverride string, mu *sync.Mutex) error {
|
|
var wg sync.WaitGroup
|
|
var firstErr error
|
|
var errMu sync.Mutex
|
|
|
|
// Start Mayor in goroutine
|
|
wg.Add(1)
|
|
go func() {
|
|
defer wg.Done()
|
|
mayorMgr := mayor.NewManager(townRoot)
|
|
if err := mayorMgr.Start(agentOverride); err != nil {
|
|
if errors.Is(err, mayor.ErrAlreadyRunning) {
|
|
mu.Lock()
|
|
fmt.Printf(" %s Mayor already running\n", style.Dim.Render("○"))
|
|
mu.Unlock()
|
|
} else {
|
|
errMu.Lock()
|
|
if firstErr == nil {
|
|
firstErr = fmt.Errorf("starting Mayor: %w", err)
|
|
}
|
|
errMu.Unlock()
|
|
mu.Lock()
|
|
fmt.Printf(" %s Mayor failed: %v\n", style.Dim.Render("○"), err)
|
|
mu.Unlock()
|
|
}
|
|
} else {
|
|
mu.Lock()
|
|
fmt.Printf(" %s Mayor started\n", style.Bold.Render("✓"))
|
|
mu.Unlock()
|
|
}
|
|
}()
|
|
|
|
// Start Deacon in goroutine
|
|
wg.Add(1)
|
|
go func() {
|
|
defer wg.Done()
|
|
deaconMgr := deacon.NewManager(townRoot)
|
|
if err := deaconMgr.Start(agentOverride); err != nil {
|
|
if errors.Is(err, deacon.ErrAlreadyRunning) {
|
|
mu.Lock()
|
|
fmt.Printf(" %s Deacon already running\n", style.Dim.Render("○"))
|
|
mu.Unlock()
|
|
} else {
|
|
errMu.Lock()
|
|
if firstErr == nil {
|
|
firstErr = fmt.Errorf("starting Deacon: %w", err)
|
|
}
|
|
errMu.Unlock()
|
|
mu.Lock()
|
|
fmt.Printf(" %s Deacon failed: %v\n", style.Dim.Render("○"), err)
|
|
mu.Unlock()
|
|
}
|
|
} else {
|
|
mu.Lock()
|
|
fmt.Printf(" %s Deacon started\n", style.Bold.Render("✓"))
|
|
mu.Unlock()
|
|
}
|
|
}()
|
|
|
|
wg.Wait()
|
|
return firstErr
|
|
}
|
|
|
|
// startRigAgents starts witness and refinery for all rigs in parallel.
|
|
// Called when --all flag is passed to gt start.
|
|
func startRigAgents(rigs []*rig.Rig, mu *sync.Mutex) {
|
|
var wg sync.WaitGroup
|
|
|
|
for _, r := range rigs {
|
|
wg.Add(2) // Witness + Refinery
|
|
|
|
// Start Witness in goroutine
|
|
go func(r *rig.Rig) {
|
|
defer wg.Done()
|
|
msg := startWitnessForRig(r)
|
|
mu.Lock()
|
|
fmt.Print(msg)
|
|
mu.Unlock()
|
|
}(r)
|
|
|
|
// Start Refinery in goroutine
|
|
go func(r *rig.Rig) {
|
|
defer wg.Done()
|
|
msg := startRefineryForRig(r)
|
|
mu.Lock()
|
|
fmt.Print(msg)
|
|
mu.Unlock()
|
|
}(r)
|
|
}
|
|
|
|
wg.Wait()
|
|
}
|
|
|
|
// startWitnessForRig starts the witness for a single rig and returns a status message.
|
|
func startWitnessForRig(r *rig.Rig) string {
|
|
witMgr := witness.NewManager(r)
|
|
if err := witMgr.Start(false, "", nil); err != nil {
|
|
if errors.Is(err, witness.ErrAlreadyRunning) {
|
|
return fmt.Sprintf(" %s %s witness already running\n", style.Dim.Render("○"), r.Name)
|
|
}
|
|
return fmt.Sprintf(" %s %s witness failed: %v\n", style.Dim.Render("○"), r.Name, err)
|
|
}
|
|
return fmt.Sprintf(" %s %s witness started\n", style.Bold.Render("✓"), r.Name)
|
|
}
|
|
|
|
// startRefineryForRig starts the refinery for a single rig and returns a status message.
|
|
func startRefineryForRig(r *rig.Rig) string {
|
|
refineryMgr := refinery.NewManager(r)
|
|
if err := refineryMgr.Start(false, ""); err != nil {
|
|
if errors.Is(err, refinery.ErrAlreadyRunning) {
|
|
return fmt.Sprintf(" %s %s refinery already running\n", style.Dim.Render("○"), r.Name)
|
|
}
|
|
return fmt.Sprintf(" %s %s refinery failed: %v\n", style.Dim.Render("○"), r.Name, err)
|
|
}
|
|
return fmt.Sprintf(" %s %s refinery started\n", style.Bold.Render("✓"), r.Name)
|
|
}
|
|
|
|
// startConfiguredCrew starts crew members configured in rig settings in parallel.
|
|
func startConfiguredCrew(t *tmux.Tmux, rigs []*rig.Rig, townRoot string, mu *sync.Mutex) {
|
|
var wg sync.WaitGroup
|
|
var startedAny int32 // Use atomic for thread-safe flag
|
|
|
|
for _, r := range rigs {
|
|
crewToStart := getCrewToStart(r)
|
|
for _, crewName := range crewToStart {
|
|
wg.Add(1)
|
|
go func(r *rig.Rig, crewName string) {
|
|
defer wg.Done()
|
|
msg, started := startOrRestartCrewMember(t, r, crewName, townRoot)
|
|
mu.Lock()
|
|
fmt.Print(msg)
|
|
mu.Unlock()
|
|
if started {
|
|
atomic.StoreInt32(&startedAny, 1)
|
|
}
|
|
}(r, crewName)
|
|
}
|
|
}
|
|
|
|
wg.Wait()
|
|
|
|
if atomic.LoadInt32(&startedAny) == 0 {
|
|
mu.Lock()
|
|
fmt.Printf(" %s No crew configured or all already running\n", style.Dim.Render("○"))
|
|
mu.Unlock()
|
|
}
|
|
}
|
|
|
|
// startOrRestartCrewMember ensures a single crew member is up. Three cases:
//   - session exists and its agent is alive: report "already running"
//   - session exists but the agent exited: re-launch the agent inside the
//     existing tmux session (with a startup beacon so /resume can find the
//     predecessor conversation)
//   - no session: start the member from scratch via startCrewMember
//
// Returns the status line to print and whether anything was (re)started.
func startOrRestartCrewMember(t *tmux.Tmux, r *rig.Rig, crewName, townRoot string) (msg string, started bool) {
	sessionID := crewSessionName(r.Name, crewName)
	if running, _ := t.HasSession(sessionID); running {
		// Session exists - check if agent is still running
		agentCfg := config.ResolveRoleAgentConfig(constants.RoleCrew, townRoot, r.Path)
		if !t.IsAgentRunning(sessionID, config.ExpectedPaneCommands(agentCfg)...) {
			// Agent has exited, restart it.
			// Build startup beacon for predecessor discovery via /resume.
			address := fmt.Sprintf("%s/crew/%s", r.Name, crewName)
			beacon := session.FormatStartupNudge(session.StartupNudgeConfig{
				Recipient: address,
				Sender:    "human",
				Topic:     "restart",
			})
			agentCmd := config.BuildCrewStartupCommand(r.Name, crewName, r.Path, beacon)
			// The restart command is typed into the still-open pane.
			if err := t.SendKeys(sessionID, agentCmd); err != nil {
				return fmt.Sprintf(" %s %s/%s restart failed: %v\n", style.Dim.Render("○"), r.Name, crewName, err), false
			}
			return fmt.Sprintf(" %s %s/%s agent restarted\n", style.Bold.Render("✓"), r.Name, crewName), true
		}
		return fmt.Sprintf(" %s %s/%s already running\n", style.Dim.Render("○"), r.Name, crewName), false
	}

	// No session at all: cold start.
	if err := startCrewMember(r.Name, crewName, townRoot); err != nil {
		return fmt.Sprintf(" %s %s/%s failed: %v\n", style.Dim.Render("○"), r.Name, crewName, err), false
	}
	return fmt.Sprintf(" %s %s/%s started\n", style.Bold.Render("✓"), r.Name, crewName), true
}
|
|
|
|
// discoverAllRigs finds all rigs in the workspace.
|
|
func discoverAllRigs(townRoot string) ([]*rig.Rig, error) {
|
|
rigsConfigPath := filepath.Join(townRoot, "mayor", "rigs.json")
|
|
rigsConfig, err := config.LoadRigsConfig(rigsConfigPath)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("loading rigs config: %w", err)
|
|
}
|
|
|
|
g := git.NewGit(townRoot)
|
|
rigMgr := rig.NewManager(townRoot, rigsConfig, g)
|
|
|
|
return rigMgr.DiscoverRigs()
|
|
}
|
|
|
|
// runShutdown implements 'gt shutdown'. It lists Gas Town tmux sessions,
// shows what will be stopped vs preserved, asks for confirmation (unless
// --yes/--force), then delegates to the graceful or immediate path.
// Even when no sessions are running it still checks for an orphaned daemon,
// since a daemon can outlive its sessions.
func runShutdown(cmd *cobra.Command, args []string) error {
	t := tmux.NewTmux()

	// Find workspace root for polecat cleanup.
	// Best-effort: an empty townRoot simply disables cleanup/daemon steps.
	townRoot, _ := workspace.FindFromCwd()

	// Collect sessions to show what will be stopped
	sessions, err := t.ListSessions()
	if err != nil {
		return fmt.Errorf("listing sessions: %w", err)
	}

	// Get session names for categorization
	mayorSession := getMayorSessionName()
	deaconSession := getDeaconSessionName()
	toStop, preserved := categorizeSessions(sessions, mayorSession, deaconSession)

	if len(toStop) == 0 {
		fmt.Printf("%s Gas Town was not running\n", style.Dim.Render("○"))

		// Still check for orphaned daemons even if no sessions are running
		if townRoot != "" {
			fmt.Println()
			fmt.Println("Checking for orphaned daemon...")
			stopDaemonIfRunning(townRoot)
		}

		return nil
	}

	// Show what will happen before asking for confirmation.
	fmt.Println("Sessions to stop:")
	for _, sess := range toStop {
		fmt.Printf(" %s %s\n", style.Bold.Render("→"), sess)
	}
	if len(preserved) > 0 && !shutdownAll {
		fmt.Println()
		fmt.Println("Sessions preserved (crew):")
		for _, sess := range preserved {
			fmt.Printf(" %s %s\n", style.Dim.Render("○"), sess)
		}
	}
	fmt.Println()

	// Confirmation prompt (skipped with --yes or --force).
	// Anything other than "y"/"yes" (case-insensitive) cancels.
	if !shutdownYes && !shutdownForce {
		fmt.Printf("Proceed with shutdown? [y/N] ")
		reader := bufio.NewReader(os.Stdin)
		response, _ := reader.ReadString('\n')
		response = strings.TrimSpace(strings.ToLower(response))
		if response != "y" && response != "yes" {
			fmt.Println("Shutdown canceled.")
			return nil
		}
	}

	if shutdownGraceful {
		return runGracefulShutdown(t, toStop, townRoot)
	}
	return runImmediateShutdown(t, toStop, townRoot)
}
|
|
|
|
// categorizeSessions splits sessions into those to stop and those to preserve.
|
|
// mayorSession and deaconSession are the dynamic session names for the current town.
|
|
func categorizeSessions(sessions []string, mayorSession, deaconSession string) (toStop, preserved []string) {
|
|
for _, sess := range sessions {
|
|
// Gas Town sessions use gt- (rig-level) or hq- (town-level) prefix
|
|
if !strings.HasPrefix(sess, "gt-") && !strings.HasPrefix(sess, "hq-") {
|
|
continue // Not a Gas Town session
|
|
}
|
|
|
|
// Check if it's a crew session (pattern: gt-<rig>-crew-<name>)
|
|
isCrew := strings.Contains(sess, "-crew-")
|
|
|
|
// Check if it's a polecat session (pattern: gt-<rig>-<name> where name is not crew/witness/refinery)
|
|
isPolecat := false
|
|
if !isCrew && sess != mayorSession && sess != deaconSession {
|
|
parts := strings.Split(sess, "-")
|
|
if len(parts) >= 3 {
|
|
role := parts[2]
|
|
if role != "witness" && role != "refinery" && role != "crew" {
|
|
isPolecat = true
|
|
}
|
|
}
|
|
}
|
|
|
|
// Decide based on flags
|
|
if shutdownPolecatsOnly {
|
|
// Only stop polecats
|
|
if isPolecat {
|
|
toStop = append(toStop, sess)
|
|
} else {
|
|
preserved = append(preserved, sess)
|
|
}
|
|
} else if shutdownAll {
|
|
// Stop everything including crew
|
|
toStop = append(toStop, sess)
|
|
} else {
|
|
// Default: preserve crew
|
|
if isCrew {
|
|
preserved = append(preserved, sess)
|
|
} else {
|
|
toStop = append(toStop, sess)
|
|
}
|
|
}
|
|
}
|
|
return
|
|
}
|
|
|
|
// runGracefulShutdown stops Gas Town in phases: interrupt agents (ESC),
// request a handoff, wait up to --wait seconds for them to save state, then
// kill sessions in dependency order and run the same cleanup steps as an
// immediate shutdown (orphaned Claude processes, polecats, daemon).
// All tmux sends are best-effort; a dead pane must not abort the shutdown.
func runGracefulShutdown(t *tmux.Tmux, gtSessions []string, townRoot string) error {
	fmt.Printf("Graceful shutdown of Gas Town (waiting up to %ds)...\n\n", shutdownWait)

	// Phase 1: Send ESC to all agents to interrupt them
	fmt.Printf("Phase 1: Sending ESC to %d agent(s)...\n", len(gtSessions))
	for _, sess := range gtSessions {
		fmt.Printf(" %s Interrupting %s\n", style.Bold.Render("→"), sess)
		_ = t.SendKeysRaw(sess, "Escape") // best-effort interrupt
	}

	// Phase 2: Send shutdown message asking agents to handoff
	fmt.Printf("\nPhase 2: Requesting handoff from agents...\n")
	shutdownMsg := "[SHUTDOWN] Gas Town is shutting down. Please save your state and update your handoff bead, then type /exit or wait to be terminated."
	for _, sess := range gtSessions {
		// Small delay then send the message
		time.Sleep(constants.ShutdownNotifyDelay)
		_ = t.SendKeys(sess, shutdownMsg) // best-effort notification
	}

	// Phase 3: Wait for agents to complete handoff
	fmt.Printf("\nPhase 3: Waiting %ds for agents to complete handoff...\n", shutdownWait)
	fmt.Printf(" %s\n", style.Dim.Render("(Press Ctrl-C to force immediate shutdown)"))

	// Wait with countdown: print a reminder every 5 seconds (skipping the
	// first iteration), clamping the final sleep so we never overshoot.
	for remaining := shutdownWait; remaining > 0; remaining -= 5 {
		if remaining < shutdownWait {
			fmt.Printf(" %s %ds remaining...\n", style.Dim.Render("⏳"), remaining)
		}
		sleepTime := 5
		if remaining < 5 {
			sleepTime = remaining
		}
		time.Sleep(time.Duration(sleepTime) * time.Second)
	}

	// Phase 4: Kill sessions in correct order (Deacon first, Mayor last)
	fmt.Printf("\nPhase 4: Terminating sessions...\n")
	mayorSession := getMayorSessionName()
	deaconSession := getDeaconSessionName()
	stopped := killSessionsInOrder(t, gtSessions, mayorSession, deaconSession)

	// Phase 5: Cleanup orphaned Claude processes if requested
	if shutdownCleanupOrphans {
		fmt.Printf("\nPhase 5: Cleaning up orphaned Claude processes...\n")
		cleanupOrphanedClaude(shutdownCleanupOrphansGrace)
	}

	// Phase 6: Cleanup polecat worktrees and branches
	fmt.Printf("\nPhase 6: Cleaning up polecats...\n")
	if townRoot != "" {
		cleanupPolecats(townRoot)
	}

	// Phase 7: Stop the daemon so it cannot restart the agents just killed
	fmt.Printf("\nPhase 7: Stopping daemon...\n")
	if townRoot != "" {
		stopDaemonIfRunning(townRoot)
	}

	fmt.Println()
	fmt.Printf("%s Graceful shutdown complete (%d sessions stopped)\n", style.Bold.Render("✓"), stopped)
	return nil
}
|
|
|
|
func runImmediateShutdown(t *tmux.Tmux, gtSessions []string, townRoot string) error {
|
|
fmt.Println("Shutting down Gas Town...")
|
|
|
|
mayorSession := getMayorSessionName()
|
|
deaconSession := getDeaconSessionName()
|
|
stopped := killSessionsInOrder(t, gtSessions, mayorSession, deaconSession)
|
|
|
|
// Cleanup orphaned Claude processes if requested
|
|
if shutdownCleanupOrphans {
|
|
fmt.Println()
|
|
fmt.Println("Cleaning up orphaned Claude processes...")
|
|
cleanupOrphanedClaude(shutdownCleanupOrphansGrace)
|
|
}
|
|
|
|
// Cleanup polecat worktrees and branches
|
|
if townRoot != "" {
|
|
fmt.Println()
|
|
fmt.Println("Cleaning up polecats...")
|
|
cleanupPolecats(townRoot)
|
|
}
|
|
|
|
// Stop the daemon
|
|
if townRoot != "" {
|
|
fmt.Println()
|
|
fmt.Println("Stopping daemon...")
|
|
stopDaemonIfRunning(townRoot)
|
|
}
|
|
|
|
fmt.Println()
|
|
fmt.Printf("%s Gas Town shutdown complete (%d sessions stopped)\n", style.Bold.Render("✓"), stopped)
|
|
|
|
return nil
|
|
}
|
|
|
|
// killSessionsInOrder stops sessions in the correct order:
|
|
// 1. Deacon first (so it doesn't restart others)
|
|
// 2. Everything except Mayor
|
|
// 3. Mayor last
|
|
// mayorSession and deaconSession are the dynamic session names for the current town.
|
|
//
|
|
// Returns the count of sessions that were successfully stopped (verified by checking
|
|
// if the session no longer exists after the kill attempt).
|
|
func killSessionsInOrder(t *tmux.Tmux, sessions []string, mayorSession, deaconSession string) int {
|
|
stopped := 0
|
|
|
|
// Helper to check if session is in our list
|
|
inList := func(sess string) bool {
|
|
for _, s := range sessions {
|
|
if s == sess {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// Helper to kill a session and verify it was stopped
|
|
killAndVerify := func(sess string) bool {
|
|
// Check if session exists before attempting to kill
|
|
exists, _ := t.HasSession(sess)
|
|
if !exists {
|
|
return false // Session already gone
|
|
}
|
|
|
|
// Attempt to kill the session and its processes
|
|
_ = t.KillSessionWithProcesses(sess)
|
|
|
|
// Verify the session is actually gone (ignore error, check existence)
|
|
// KillSessionWithProcesses might return an error even if it successfully
|
|
// killed the processes and the session auto-closed
|
|
stillExists, _ := t.HasSession(sess)
|
|
if !stillExists {
|
|
fmt.Printf(" %s %s stopped\n", style.Bold.Render("✓"), sess)
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
// 1. Stop Deacon first
|
|
if inList(deaconSession) {
|
|
if killAndVerify(deaconSession) {
|
|
stopped++
|
|
}
|
|
}
|
|
|
|
// 2. Stop others (except Mayor)
|
|
for _, sess := range sessions {
|
|
if sess == deaconSession || sess == mayorSession {
|
|
continue
|
|
}
|
|
if killAndVerify(sess) {
|
|
stopped++
|
|
}
|
|
}
|
|
|
|
// 3. Stop Mayor last
|
|
if inList(mayorSession) {
|
|
if killAndVerify(mayorSession) {
|
|
stopped++
|
|
}
|
|
}
|
|
|
|
return stopped
|
|
}
|
|
|
|
// cleanupPolecats removes polecat worktrees and branches for all rigs.
|
|
// It refuses to clean up polecats with uncommitted work unless --nuclear is set.
|
|
func cleanupPolecats(townRoot string) {
|
|
// Load rigs config
|
|
rigsConfigPath := filepath.Join(townRoot, "mayor", "rigs.json")
|
|
rigsConfig, err := config.LoadRigsConfig(rigsConfigPath)
|
|
if err != nil {
|
|
fmt.Printf(" %s Could not load rigs config: %v\n", style.Dim.Render("○"), err)
|
|
return
|
|
}
|
|
|
|
g := git.NewGit(townRoot)
|
|
rigMgr := rig.NewManager(townRoot, rigsConfig, g)
|
|
|
|
// Discover all rigs
|
|
rigs, err := rigMgr.DiscoverRigs()
|
|
if err != nil {
|
|
fmt.Printf(" %s Could not discover rigs: %v\n", style.Dim.Render("○"), err)
|
|
return
|
|
}
|
|
|
|
totalCleaned := 0
|
|
totalSkipped := 0
|
|
var uncommittedPolecats []string
|
|
|
|
for _, r := range rigs {
|
|
polecatGit := git.NewGit(r.Path)
|
|
polecatMgr := polecat.NewManager(r, polecatGit, nil) // nil tmux: just listing, not allocating
|
|
|
|
polecats, err := polecatMgr.List()
|
|
if err != nil {
|
|
continue
|
|
}
|
|
|
|
for _, p := range polecats {
|
|
// Check for uncommitted work
|
|
pGit := git.NewGit(p.ClonePath)
|
|
status, err := pGit.CheckUncommittedWork()
|
|
if err != nil {
|
|
// Can't check, be safe and skip unless nuclear
|
|
if !shutdownNuclear {
|
|
fmt.Printf(" %s %s/%s: could not check status, skipping\n",
|
|
style.Dim.Render("○"), r.Name, p.Name)
|
|
totalSkipped++
|
|
continue
|
|
}
|
|
} else if !status.Clean() {
|
|
// Has uncommitted work
|
|
if !shutdownNuclear {
|
|
uncommittedPolecats = append(uncommittedPolecats,
|
|
fmt.Sprintf("%s/%s (%s)", r.Name, p.Name, status.String()))
|
|
totalSkipped++
|
|
continue
|
|
}
|
|
// Nuclear mode: warn but proceed
|
|
fmt.Printf(" %s %s/%s: NUCLEAR - removing despite %s\n",
|
|
style.Bold.Render("⚠"), r.Name, p.Name, status.String())
|
|
}
|
|
|
|
// Clean: remove worktree and branch
|
|
if err := polecatMgr.RemoveWithOptions(p.Name, true, shutdownNuclear); err != nil {
|
|
fmt.Printf(" %s %s/%s: cleanup failed: %v\n",
|
|
style.Dim.Render("○"), r.Name, p.Name, err)
|
|
totalSkipped++
|
|
continue
|
|
}
|
|
|
|
// Delete the polecat branch from mayor's clone
|
|
branchName := fmt.Sprintf("polecat/%s", p.Name)
|
|
mayorPath := filepath.Join(r.Path, "mayor", "rig")
|
|
mayorGit := git.NewGit(mayorPath)
|
|
_ = mayorGit.DeleteBranch(branchName, true) // Ignore errors
|
|
|
|
fmt.Printf(" %s %s/%s: cleaned up\n", style.Bold.Render("✓"), r.Name, p.Name)
|
|
totalCleaned++
|
|
}
|
|
}
|
|
|
|
// Summary
|
|
if len(uncommittedPolecats) > 0 {
|
|
fmt.Println()
|
|
fmt.Printf(" %s Polecats with uncommitted work (use --nuclear to force):\n",
|
|
style.Bold.Render("⚠"))
|
|
for _, pc := range uncommittedPolecats {
|
|
fmt.Printf(" • %s\n", pc)
|
|
}
|
|
}
|
|
|
|
if totalCleaned > 0 || totalSkipped > 0 {
|
|
fmt.Printf(" Cleaned: %d, Skipped: %d\n", totalCleaned, totalSkipped)
|
|
} else {
|
|
fmt.Printf(" %s No polecats to clean up\n", style.Dim.Render("○"))
|
|
}
|
|
}
|
|
|
|
// stopDaemonIfRunning stops the daemon if it is running.
// This prevents the daemon from restarting agents after shutdown.
// Uses robust detection with fallback to process search:
//  1. Primary: the PID file via daemon.IsRunning. Detection errors (stale or
//     corrupted PID file) are reported rather than swallowed, then the
//     fallback still runs.
//  2. Fallback: daemon.FindOrphanedDaemons scans for daemon processes the
//     PID file does not track and kills them via KillOrphanedDaemons.
func stopDaemonIfRunning(townRoot string) {
	// Primary detection: PID file
	running, pid, err := daemon.IsRunning(townRoot)

	if err != nil {
		// Detection error - report it but continue with fallback
		fmt.Printf(" %s Daemon detection warning: %s\n", style.Bold.Render("⚠"), err.Error())
	}

	if running {
		// PID file points to live daemon - stop it
		if err := daemon.StopDaemon(townRoot); err != nil {
			fmt.Printf(" %s Failed to stop daemon (PID %d): %s\n",
				style.Bold.Render("✗"), pid, err.Error())
		} else {
			fmt.Printf(" %s Daemon stopped (was PID %d)\n", style.Bold.Render("✓"), pid)
		}
	} else {
		fmt.Printf(" %s Daemon not tracked by PID file\n", style.Dim.Render("○"))
	}

	// Fallback: Search for orphaned daemon processes that the PID file
	// does not know about (e.g. after PID-file loss or corruption).
	orphaned, err := daemon.FindOrphanedDaemons()
	if err != nil {
		fmt.Printf(" %s Warning: failed to search for orphaned daemons: %v\n",
			style.Dim.Render("○"), err)
		return
	}

	if len(orphaned) > 0 {
		fmt.Printf(" %s Found %d orphaned daemon process(es): %v\n",
			style.Bold.Render("⚠"), len(orphaned), orphaned)

		killed, err := daemon.KillOrphanedDaemons()
		if err != nil {
			fmt.Printf(" %s Failed to kill orphaned daemons: %v\n",
				style.Bold.Render("✗"), err)
		} else if killed > 0 {
			fmt.Printf(" %s Killed %d orphaned daemon(s)\n",
				style.Bold.Render("✓"), killed)
		}
	}
}
|
|
|
|
// runStartCrew starts a crew workspace, creating it if it doesn't exist.
// This combines the functionality of 'gt crew add' and 'gt crew at --detached'.
// The name may be plain ("joe") or rig-qualified ("greenplace/joe"); an
// explicit --rig flag takes precedence over the name's rig component, and a
// missing rig is inferred from the current directory as a last resort.
func runStartCrew(cmd *cobra.Command, args []string) error {
	name := args[0]

	// Parse rig/name format (e.g., "greenplace/joe" -> rig=greenplace, name=joe)
	rigName := startCrewRig
	if parsedRig, crewName, ok := parseRigSlashName(name); ok {
		if rigName == "" {
			rigName = parsedRig
		}
		name = crewName
	}

	// Find workspace
	townRoot, err := workspace.FindFromCwdOrError()
	if err != nil {
		return fmt.Errorf("not in a Gas Town workspace: %w", err)
	}

	// If rig still not specified, try to infer from cwd
	if rigName == "" {
		rigName, err = inferRigFromCwd(townRoot)
		if err != nil {
			return fmt.Errorf("could not determine rig (use --rig flag or rig/name format): %w", err)
		}
	}

	// Load rigs config. On failure fall back to an empty config; GetRig
	// below will then report the rig as missing with a clear message.
	rigsConfigPath := filepath.Join(townRoot, "mayor", "rigs.json")
	rigsConfig, err := config.LoadRigsConfig(rigsConfigPath)
	if err != nil {
		rigsConfig = &config.RigsConfig{Rigs: make(map[string]config.RigEntry)}
	}

	// Get rig
	g := git.NewGit(townRoot)
	rigMgr := rig.NewManager(townRoot, rigsConfig, g)
	r, err := rigMgr.GetRig(rigName)
	if err != nil {
		return fmt.Errorf("rig '%s' not found", rigName)
	}

	// Create crew manager
	crewGit := git.NewGit(r.Path)
	crewMgr := crew.NewManager(r, crewGit)

	// Resolve account for Claude config
	accountsPath := constants.MayorAccountsPath(townRoot)
	claudeConfigDir, accountHandle, err := config.ResolveAccountConfigDir(accountsPath, startCrewAccount)
	if err != nil {
		return fmt.Errorf("resolving account: %w", err)
	}
	if accountHandle != "" {
		fmt.Printf("Using account: %s\n", accountHandle)
	}

	// Use manager's Start() method - handles workspace creation, settings, and session
	err = crewMgr.Start(name, crew.StartOptions{
		Account:         startCrewAccount,
		ClaudeConfigDir: claudeConfigDir,
		AgentOverride:   startCrewAgentOverride,
	})
	if err != nil {
		if errors.Is(err, crew.ErrSessionRunning) {
			// Already running is not a failure for this command.
			fmt.Printf("%s Session already running: %s\n", style.Dim.Render("○"), crewMgr.SessionName(name))
		} else {
			return err
		}
	} else {
		fmt.Printf("%s Started crew workspace: %s/%s\n",
			style.Bold.Render("✓"), rigName, name)
	}

	fmt.Printf("Attach with: %s\n", style.Dim.Render(fmt.Sprintf("gt crew at %s", name)))
	return nil
}
|
|
|
|
// getCrewToStart reads rig settings and parses the crew.startup field.
|
|
// Returns a list of crew names to start.
|
|
func getCrewToStart(r *rig.Rig) []string {
|
|
// Load rig settings
|
|
settingsPath := filepath.Join(r.Path, "settings", "config.json")
|
|
settings, err := config.LoadRigSettings(settingsPath)
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
|
|
if settings.Crew == nil || settings.Crew.Startup == "" || settings.Crew.Startup == "none" {
|
|
return nil
|
|
}
|
|
|
|
startup := settings.Crew.Startup
|
|
|
|
// Handle "all" - list all existing crew
|
|
if startup == "all" {
|
|
crewGit := git.NewGit(r.Path)
|
|
crewMgr := crew.NewManager(r, crewGit)
|
|
workers, err := crewMgr.List()
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
var names []string
|
|
for _, w := range workers {
|
|
names = append(names, w.Name)
|
|
}
|
|
return names
|
|
}
|
|
|
|
// Parse names: "max", "max and joe", "max, joe", "max, joe, emma"
|
|
// Replace "and" with comma for uniform parsing
|
|
startup = strings.ReplaceAll(startup, " and ", ", ")
|
|
parts := strings.Split(startup, ",")
|
|
|
|
var names []string
|
|
for _, part := range parts {
|
|
name := strings.TrimSpace(part)
|
|
if name != "" {
|
|
names = append(names, name)
|
|
}
|
|
}
|
|
|
|
return names
|
|
}
|
|
|
|
// startCrewMember starts a single crew member, creating if needed.
|
|
// This is a simplified version of runStartCrew that doesn't print output.
|
|
func startCrewMember(rigName, crewName, townRoot string) error {
|
|
// Load rigs config
|
|
rigsConfigPath := filepath.Join(townRoot, "mayor", "rigs.json")
|
|
rigsConfig, err := config.LoadRigsConfig(rigsConfigPath)
|
|
if err != nil {
|
|
rigsConfig = &config.RigsConfig{Rigs: make(map[string]config.RigEntry)}
|
|
}
|
|
|
|
// Get rig
|
|
g := git.NewGit(townRoot)
|
|
rigMgr := rig.NewManager(townRoot, rigsConfig, g)
|
|
r, err := rigMgr.GetRig(rigName)
|
|
if err != nil {
|
|
return fmt.Errorf("rig '%s' not found", rigName)
|
|
}
|
|
|
|
// Create crew manager and use Start() method
|
|
crewGit := git.NewGit(r.Path)
|
|
crewMgr := crew.NewManager(r, crewGit)
|
|
|
|
// Start handles workspace creation, settings, and session all in one
|
|
err = crewMgr.Start(crewName, crew.StartOptions{})
|
|
if err != nil && !errors.Is(err, crew.ErrSessionRunning) {
|
|
return err
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// cleanupOrphanedClaude finds and kills orphaned Claude processes with a grace period.
|
|
// This is a simpler synchronous implementation that:
|
|
// 1. Finds orphaned processes (TTY-less, older than 60s, not in Gas Town sessions)
|
|
// 2. Sends SIGTERM to all of them
|
|
// 3. Waits for the grace period
|
|
// 4. Sends SIGKILL to any that are still alive
|
|
func cleanupOrphanedClaude(graceSecs int) {
|
|
// Find orphaned processes
|
|
orphans, err := util.FindOrphanedClaudeProcesses()
|
|
if err != nil {
|
|
fmt.Printf(" %s Warning: %v\n", style.Bold.Render("⚠"), err)
|
|
return
|
|
}
|
|
|
|
if len(orphans) == 0 {
|
|
fmt.Printf(" %s No orphaned processes found\n", style.Dim.Render("○"))
|
|
return
|
|
}
|
|
|
|
// Send SIGTERM to all orphans
|
|
var termPIDs []int
|
|
for _, orphan := range orphans {
|
|
if err := syscall.Kill(orphan.PID, syscall.SIGTERM); err != nil {
|
|
if err != syscall.ESRCH {
|
|
fmt.Printf(" %s PID %d: failed to send SIGTERM: %v\n",
|
|
style.Bold.Render("⚠"), orphan.PID, err)
|
|
}
|
|
continue
|
|
}
|
|
termPIDs = append(termPIDs, orphan.PID)
|
|
fmt.Printf(" %s PID %d: sent SIGTERM (waiting %ds before SIGKILL)\n",
|
|
style.Bold.Render("→"), orphan.PID, graceSecs)
|
|
}
|
|
|
|
if len(termPIDs) == 0 {
|
|
return
|
|
}
|
|
|
|
// Wait for grace period
|
|
fmt.Printf(" %s Waiting %d seconds for processes to terminate gracefully...\n",
|
|
style.Dim.Render("⏳"), graceSecs)
|
|
time.Sleep(time.Duration(graceSecs) * time.Second)
|
|
|
|
// Check which processes are still alive and send SIGKILL
|
|
var killedCount, alreadyDeadCount int
|
|
for _, pid := range termPIDs {
|
|
// Check if process still exists
|
|
if err := syscall.Kill(pid, 0); err != nil {
|
|
// Process is gone (either died from SIGTERM or doesn't exist)
|
|
alreadyDeadCount++
|
|
continue
|
|
}
|
|
|
|
// Process still alive - send SIGKILL
|
|
if err := syscall.Kill(pid, syscall.SIGKILL); err != nil {
|
|
if err != syscall.ESRCH {
|
|
fmt.Printf(" %s PID %d: failed to send SIGKILL: %v\n",
|
|
style.Bold.Render("⚠"), pid, err)
|
|
}
|
|
continue
|
|
}
|
|
killedCount++
|
|
fmt.Printf(" %s PID %d: sent SIGKILL (did not respond to SIGTERM)\n",
|
|
style.Bold.Render("✓"), pid)
|
|
}
|
|
|
|
if alreadyDeadCount > 0 {
|
|
fmt.Printf(" %s %d process(es) terminated gracefully from SIGTERM\n",
|
|
style.Bold.Render("✓"), alreadyDeadCount)
|
|
}
|
|
if killedCount == 0 && alreadyDeadCount > 0 {
|
|
fmt.Printf(" %s All processes cleaned up successfully\n",
|
|
style.Bold.Render("✓"))
|
|
}
|
|
}
|