When Claude sessions were terminated using KillSession(), bash subprocesses spawned by Claude's Bash tool could survive because they ignore SIGHUP. This caused zombie processes to accumulate over time. Changed all critical session termination paths to use KillSessionWithProcesses() which explicitly kills all descendant processes before terminating the session. Fixes: gt-ew3tk Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
361 lines
10 KiB
Go
361 lines
10 KiB
Go
package cmd
|
|
|
|
import (
|
|
"encoding/json"
|
|
"fmt"
|
|
"os"
|
|
"os/exec"
|
|
|
|
"github.com/spf13/cobra"
|
|
"github.com/steveyegge/gastown/internal/style"
|
|
"github.com/steveyegge/gastown/internal/tmux"
|
|
"github.com/steveyegge/gastown/internal/witness"
|
|
"github.com/steveyegge/gastown/internal/workspace"
|
|
)
|
|
|
|
// Witness command flags
|
|
var (
|
|
witnessForeground bool
|
|
witnessStatusJSON bool
|
|
witnessAgentOverride string
|
|
witnessEnvOverrides []string
|
|
)
|
|
|
|
var witnessCmd = &cobra.Command{
|
|
Use: "witness",
|
|
GroupID: GroupAgents,
|
|
Short: "Manage the Witness (per-rig polecat health monitor)",
|
|
RunE: requireSubcommand,
|
|
Long: `Manage the Witness - the per-rig polecat health monitor.
|
|
|
|
The Witness patrols a single rig, watching over its polecats:
|
|
- Detects stalled polecats (crashed or stuck mid-work)
|
|
- Nudges unresponsive sessions back to life
|
|
- Cleans up zombie polecats (finished but failed to exit)
|
|
- Nukes sandboxes when polecats complete via 'gt done'
|
|
|
|
The Witness does NOT force session cycles or interrupt working polecats.
|
|
Polecats manage their own sessions (via gt handoff). The Witness handles
|
|
failures and edge cases only.
|
|
|
|
One Witness per rig. The Deacon monitors all Witnesses.
|
|
|
|
Role shortcuts: "witness" in mail/nudge addresses resolves to this rig's Witness.`,
|
|
}
|
|
|
|
var witnessStartCmd = &cobra.Command{
|
|
Use: "start <rig>",
|
|
Aliases: []string{"spawn"},
|
|
Short: "Start the witness",
|
|
Long: `Start the Witness for a rig.
|
|
|
|
Launches the monitoring agent which watches for stuck polecats and orphaned
|
|
sandboxes, taking action to keep work flowing.
|
|
|
|
Self-Cleaning Model: Polecats nuke themselves after work. The Witness handles
|
|
crash recovery (restart with hooked work) and orphan cleanup (nuke abandoned
|
|
sandboxes). There is no "idle" state - polecats either have work or don't exist.
|
|
|
|
Examples:
|
|
gt witness start greenplace
|
|
gt witness start greenplace --agent codex
|
|
gt witness start greenplace --env ANTHROPIC_MODEL=claude-3-haiku
|
|
gt witness start greenplace --foreground`,
|
|
Args: cobra.ExactArgs(1),
|
|
RunE: runWitnessStart,
|
|
}
|
|
|
|
var witnessStopCmd = &cobra.Command{
|
|
Use: "stop <rig>",
|
|
Short: "Stop the witness",
|
|
Long: `Stop a running Witness.
|
|
|
|
Gracefully stops the witness monitoring agent.`,
|
|
Args: cobra.ExactArgs(1),
|
|
RunE: runWitnessStop,
|
|
}
|
|
|
|
var witnessStatusCmd = &cobra.Command{
|
|
Use: "status <rig>",
|
|
Short: "Show witness status",
|
|
Long: `Show the status of a rig's Witness.
|
|
|
|
Displays running state, monitored polecats, and statistics.`,
|
|
Args: cobra.ExactArgs(1),
|
|
RunE: runWitnessStatus,
|
|
}
|
|
|
|
var witnessAttachCmd = &cobra.Command{
|
|
Use: "attach [rig]",
|
|
Aliases: []string{"at"},
|
|
Short: "Attach to witness session",
|
|
Long: `Attach to the Witness tmux session for a rig.
|
|
|
|
Attaches the current terminal to the witness's tmux session.
|
|
Detach with Ctrl-B D.
|
|
|
|
If the witness is not running, this will start it first.
|
|
If rig is not specified, infers it from the current directory.
|
|
|
|
Examples:
|
|
gt witness attach greenplace
|
|
gt witness attach # infer rig from cwd`,
|
|
Args: cobra.MaximumNArgs(1),
|
|
RunE: runWitnessAttach,
|
|
}
|
|
|
|
var witnessRestartCmd = &cobra.Command{
|
|
Use: "restart <rig>",
|
|
Short: "Restart the witness",
|
|
Long: `Restart the Witness for a rig.
|
|
|
|
Stops the current session (if running) and starts a fresh one.
|
|
|
|
Examples:
|
|
gt witness restart greenplace
|
|
gt witness restart greenplace --agent codex
|
|
gt witness restart greenplace --env ANTHROPIC_MODEL=claude-3-haiku`,
|
|
Args: cobra.ExactArgs(1),
|
|
RunE: runWitnessRestart,
|
|
}
|
|
|
|
func init() {
|
|
// Start flags
|
|
witnessStartCmd.Flags().BoolVar(&witnessForeground, "foreground", false, "Run in foreground (default: background)")
|
|
witnessStartCmd.Flags().StringVar(&witnessAgentOverride, "agent", "", "Agent alias to run the Witness with (overrides town default)")
|
|
witnessStartCmd.Flags().StringArrayVar(&witnessEnvOverrides, "env", nil, "Environment variable override (KEY=VALUE, can be repeated)")
|
|
|
|
// Status flags
|
|
witnessStatusCmd.Flags().BoolVar(&witnessStatusJSON, "json", false, "Output as JSON")
|
|
|
|
// Restart flags
|
|
witnessRestartCmd.Flags().StringVar(&witnessAgentOverride, "agent", "", "Agent alias to run the Witness with (overrides town default)")
|
|
witnessRestartCmd.Flags().StringArrayVar(&witnessEnvOverrides, "env", nil, "Environment variable override (KEY=VALUE, can be repeated)")
|
|
|
|
// Add subcommands
|
|
witnessCmd.AddCommand(witnessStartCmd)
|
|
witnessCmd.AddCommand(witnessStopCmd)
|
|
witnessCmd.AddCommand(witnessRestartCmd)
|
|
witnessCmd.AddCommand(witnessStatusCmd)
|
|
witnessCmd.AddCommand(witnessAttachCmd)
|
|
|
|
rootCmd.AddCommand(witnessCmd)
|
|
}
|
|
|
|
// getWitnessManager creates a witness manager for a rig.
|
|
func getWitnessManager(rigName string) (*witness.Manager, error) {
|
|
_, r, err := getRig(rigName)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
mgr := witness.NewManager(r)
|
|
return mgr, nil
|
|
}
|
|
|
|
func runWitnessStart(cmd *cobra.Command, args []string) error {
|
|
rigName := args[0]
|
|
|
|
mgr, err := getWitnessManager(rigName)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
fmt.Printf("Starting witness for %s...\n", rigName)
|
|
|
|
if err := mgr.Start(witnessForeground, witnessAgentOverride, witnessEnvOverrides); err != nil {
|
|
if err == witness.ErrAlreadyRunning {
|
|
fmt.Printf("%s Witness is already running\n", style.Dim.Render("⚠"))
|
|
fmt.Printf(" %s\n", style.Dim.Render("Use 'gt witness attach' to connect"))
|
|
return nil
|
|
}
|
|
return fmt.Errorf("starting witness: %w", err)
|
|
}
|
|
|
|
if witnessForeground {
|
|
fmt.Printf("%s Note: Foreground mode no longer runs patrol loop\n", style.Dim.Render("⚠"))
|
|
fmt.Printf(" %s\n", style.Dim.Render("Patrol logic is now handled by mol-witness-patrol molecule"))
|
|
return nil
|
|
}
|
|
|
|
fmt.Printf("%s Witness started for %s\n", style.Bold.Render("✓"), rigName)
|
|
fmt.Printf(" %s\n", style.Dim.Render("Use 'gt witness attach' to connect"))
|
|
fmt.Printf(" %s\n", style.Dim.Render("Use 'gt witness status' to check progress"))
|
|
return nil
|
|
}
|
|
|
|
func runWitnessStop(cmd *cobra.Command, args []string) error {
|
|
rigName := args[0]
|
|
|
|
mgr, err := getWitnessManager(rigName)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// Kill tmux session if it exists.
|
|
// Use KillSessionWithProcesses to ensure all descendant processes are killed.
|
|
t := tmux.NewTmux()
|
|
sessionName := witnessSessionName(rigName)
|
|
running, _ := t.HasSession(sessionName)
|
|
if running {
|
|
if err := t.KillSessionWithProcesses(sessionName); err != nil {
|
|
style.PrintWarning("failed to kill session: %v", err)
|
|
}
|
|
}
|
|
|
|
// Update state file
|
|
if err := mgr.Stop(); err != nil {
|
|
if err == witness.ErrNotRunning && !running {
|
|
fmt.Printf("%s Witness is not running\n", style.Dim.Render("⚠"))
|
|
return nil
|
|
}
|
|
// Even if manager.Stop fails, if we killed the session it's stopped
|
|
if !running {
|
|
return fmt.Errorf("stopping witness: %w", err)
|
|
}
|
|
}
|
|
|
|
fmt.Printf("%s Witness stopped for %s\n", style.Bold.Render("✓"), rigName)
|
|
return nil
|
|
}
|
|
|
|
// WitnessStatusOutput is the JSON output format for witness status.
|
|
type WitnessStatusOutput struct {
|
|
Running bool `json:"running"`
|
|
RigName string `json:"rig_name"`
|
|
Session string `json:"session,omitempty"`
|
|
MonitoredPolecats []string `json:"monitored_polecats,omitempty"`
|
|
}
|
|
|
|
func runWitnessStatus(cmd *cobra.Command, args []string) error {
|
|
rigName := args[0]
|
|
|
|
// Get rig for polecat info
|
|
_, r, err := getRig(rigName)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
mgr := witness.NewManager(r)
|
|
|
|
// ZFC: tmux is source of truth for running state
|
|
running, _ := mgr.IsRunning()
|
|
sessionInfo, _ := mgr.Status() // may be nil if not running
|
|
|
|
// Polecats come from rig config, not state file
|
|
polecats := r.Polecats
|
|
|
|
// JSON output
|
|
if witnessStatusJSON {
|
|
output := WitnessStatusOutput{
|
|
Running: running,
|
|
RigName: rigName,
|
|
MonitoredPolecats: polecats,
|
|
}
|
|
if sessionInfo != nil {
|
|
output.Session = sessionInfo.Name
|
|
}
|
|
enc := json.NewEncoder(os.Stdout)
|
|
enc.SetIndent("", " ")
|
|
return enc.Encode(output)
|
|
}
|
|
|
|
// Human-readable output
|
|
fmt.Printf("%s Witness: %s\n\n", style.Bold.Render(AgentTypeIcons[AgentWitness]), rigName)
|
|
|
|
if running {
|
|
fmt.Printf(" State: %s\n", style.Bold.Render("● running"))
|
|
if sessionInfo != nil {
|
|
fmt.Printf(" Session: %s\n", sessionInfo.Name)
|
|
}
|
|
} else {
|
|
fmt.Printf(" State: %s\n", style.Dim.Render("○ stopped"))
|
|
}
|
|
|
|
// Show monitored polecats
|
|
fmt.Printf("\n %s\n", style.Bold.Render("Monitored Polecats:"))
|
|
if len(polecats) == 0 {
|
|
fmt.Printf(" %s\n", style.Dim.Render("(none)"))
|
|
} else {
|
|
for _, p := range polecats {
|
|
fmt.Printf(" • %s\n", p)
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// witnessSessionName returns the tmux session name for a rig's witness.
|
|
func witnessSessionName(rigName string) string {
|
|
return fmt.Sprintf("gt-%s-witness", rigName)
|
|
}
|
|
|
|
func runWitnessAttach(cmd *cobra.Command, args []string) error {
|
|
rigName := ""
|
|
if len(args) > 0 {
|
|
rigName = args[0]
|
|
}
|
|
|
|
// Infer rig from cwd if not provided
|
|
if rigName == "" {
|
|
townRoot, err := workspace.FindFromCwdOrError()
|
|
if err != nil {
|
|
return fmt.Errorf("not in a Gas Town workspace: %w", err)
|
|
}
|
|
rigName, err = inferRigFromCwd(townRoot)
|
|
if err != nil {
|
|
return fmt.Errorf("could not determine rig: %w\nUsage: gt witness attach <rig>", err)
|
|
}
|
|
}
|
|
|
|
// Verify rig exists and get manager
|
|
mgr, err := getWitnessManager(rigName)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
sessionName := witnessSessionName(rigName)
|
|
|
|
// Ensure session exists (creates if needed)
|
|
if err := mgr.Start(false, "", nil); err != nil && err != witness.ErrAlreadyRunning {
|
|
return err
|
|
} else if err == nil {
|
|
fmt.Printf("Started witness session for %s\n", rigName)
|
|
}
|
|
|
|
// Attach to the session
|
|
tmuxPath, err := exec.LookPath("tmux")
|
|
if err != nil {
|
|
return fmt.Errorf("tmux not found: %w", err)
|
|
}
|
|
|
|
attachCmd := exec.Command(tmuxPath, "attach-session", "-t", sessionName)
|
|
attachCmd.Stdin = os.Stdin
|
|
attachCmd.Stdout = os.Stdout
|
|
attachCmd.Stderr = os.Stderr
|
|
return attachCmd.Run()
|
|
}
|
|
|
|
func runWitnessRestart(cmd *cobra.Command, args []string) error {
|
|
rigName := args[0]
|
|
|
|
mgr, err := getWitnessManager(rigName)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
fmt.Printf("Restarting witness for %s...\n", rigName)
|
|
|
|
// Stop existing session (non-fatal: may not be running)
|
|
_ = mgr.Stop()
|
|
|
|
// Start fresh
|
|
if err := mgr.Start(false, witnessAgentOverride, witnessEnvOverrides); err != nil {
|
|
return fmt.Errorf("starting witness: %w", err)
|
|
}
|
|
|
|
fmt.Printf("%s Witness restarted for %s\n", style.Bold.Render("✓"), rigName)
|
|
fmt.Printf(" %s\n", style.Dim.Render("Use 'gt witness attach' to connect"))
|
|
return nil
|
|
}
|