diff --git a/internal/cmd/start.go b/internal/cmd/start.go index c4c5311c..bfd7fb82 100644 --- a/internal/cmd/start.go +++ b/internal/cmd/start.go @@ -450,6 +450,14 @@ func runShutdown(cmd *cobra.Command, args []string) error { if len(toStop) == 0 { fmt.Printf("%s Gas Town was not running\n", style.Dim.Render("○")) + + // Still check for orphaned daemons even if no sessions are running + if townRoot != "" { + fmt.Println() + fmt.Println("Checking for orphaned daemon...") + stopDaemonIfRunning(townRoot) + } + return nil } @@ -797,16 +805,48 @@ func cleanupPolecats(townRoot string) { // stopDaemonIfRunning stops the daemon if it is running. // This prevents the daemon from restarting agents after shutdown. +// Uses robust detection with fallback to process search. func stopDaemonIfRunning(townRoot string) { - running, _, _ := daemon.IsRunning(townRoot) + // Primary detection: PID file + running, pid, err := daemon.IsRunning(townRoot) + + if err != nil { + // Detection error - report it but continue with fallback + fmt.Printf(" %s Daemon detection warning: %s\n", style.Bold.Render("⚠"), err.Error()) + } + if running { + // PID file points to live daemon - stop it if err := daemon.StopDaemon(townRoot); err != nil { - fmt.Printf(" %s Daemon: %s\n", style.Dim.Render("○"), err.Error()) + fmt.Printf(" %s Failed to stop daemon (PID %d): %s\n", + style.Bold.Render("✗"), pid, err.Error()) } else { - fmt.Printf(" %s Daemon stopped\n", style.Bold.Render("✓")) + fmt.Printf(" %s Daemon stopped (was PID %d)\n", style.Bold.Render("✓"), pid) } } else { - fmt.Printf(" %s Daemon not running\n", style.Dim.Render("○")) + fmt.Printf(" %s Daemon not tracked by PID file\n", style.Dim.Render("○")) + } + + // Fallback: Search for orphaned daemon processes + orphaned, err := daemon.FindOrphanedDaemons() + if err != nil { + fmt.Printf(" %s Warning: failed to search for orphaned daemons: %v\n", + style.Dim.Render("○"), err) + return + } + + if len(orphaned) > 0 { + fmt.Printf(" %s Found %d 
orphaned daemon process(es): %v\n", + style.Bold.Render("⚠"), len(orphaned), orphaned) + + killed, err := daemon.KillOrphanedDaemons() + if err != nil { + fmt.Printf(" %s Failed to kill orphaned daemons: %v\n", + style.Bold.Render("✗"), err) + } else if killed > 0 { + fmt.Printf(" %s Killed %d orphaned daemon(s)\n", + style.Bold.Render("✓"), killed) + } } } diff --git a/internal/daemon/daemon.go b/internal/daemon/daemon.go index 3f3b6a6b..7c25dc2a 100755 --- a/internal/daemon/daemon.go +++ b/internal/daemon/daemon.go @@ -1,6 +1,7 @@ package daemon import ( + "bytes" "context" "encoding/json" "fmt" @@ -680,31 +681,63 @@ func IsRunning(townRoot string) (bool, int, error) { if os.IsNotExist(err) { return false, 0, nil } - return false, 0, err + // Return error for other failures (permissions, I/O) + return false, 0, fmt.Errorf("reading PID file: %w", err) } - pid, err := strconv.Atoi(string(data)) + pidStr := strings.TrimSpace(string(data)) + pid, err := strconv.Atoi(pidStr) if err != nil { - return false, 0, nil + // Corrupted PID file - return error, not silent false + return false, 0, fmt.Errorf("invalid PID in file %q: %w", pidStr, err) } - // Check if process is running + // Check if process is alive process, err := os.FindProcess(pid) if err != nil { return false, 0, nil } // On Unix, FindProcess always succeeds. Send signal 0 to check if alive. 
// isGasTownDaemon reports whether the process with the given PID was started
// as "gt daemon run" (optionally via a path such as "/path/to/gt").
//
// It reads /proc/<pid>/cmdline and inspects the NUL-separated argument vector.
// If that file cannot be read (the process is gone, or the system has no
// /proc), it returns false.
//
// The result guards both stale-PID-file cleanup and process killing, so the
// match is deliberately on whole arguments rather than substrings.
func isGasTownDaemon(pid int) bool {
	raw, err := os.ReadFile(fmt.Sprintf("/proc/%d/cmdline", pid))
	if err != nil {
		return false
	}
	return isDaemonCmdline(raw)
}

// isDaemonCmdline reports whether raw, in /proc/<pid>/cmdline format
// (arguments separated and terminated by NUL bytes), describes a
// "gt daemon run" invocation: the basename of argv[0] is exactly "gt" and the
// arguments "daemon" and "run" appear next to each other, in that order.
func isDaemonCmdline(raw []byte) bool {
	// Drop the trailing terminator(s) so Split does not yield empty tail args.
	argv := bytes.Split(bytes.TrimRight(raw, "\x00"), []byte{0})
	if len(argv) < 3 {
		// Need at least: executable, "daemon", "run". Also covers the empty
		// cmdline that kernel threads and zombies expose.
		return false
	}

	exe := argv[0]
	if slash := bytes.LastIndexByte(exe, '/'); slash >= 0 {
		exe = exe[slash+1:]
	}
	if string(exe) != "gt" {
		return false
	}

	// Allow flags between the executable and the subcommand, but require the
	// subcommand words themselves to be adjacent whole arguments.
	for i := 1; i+1 < len(argv); i++ {
		if string(argv[i]) == "daemon" && string(argv[i+1]) == "run" {
			return true
		}
	}
	return false
}
+func FindOrphanedDaemons() ([]int, error) { + // Use pgrep to find all "daemon run" processes (broad search, then verify with isGasTownDaemon) + cmd := exec.Command("pgrep", "-f", "daemon run") + output, err := cmd.Output() + if err != nil { + // Exit code 1 means no processes found - that's OK + if exitErr, ok := err.(*exec.ExitError); ok && exitErr.ExitCode() == 1 { + return nil, nil + } + return nil, fmt.Errorf("pgrep failed: %w", err) + } + + // Parse PIDs + var pids []int + for _, line := range strings.Split(strings.TrimSpace(string(output)), "\n") { + if line == "" { + continue + } + pid, err := strconv.Atoi(line) + if err != nil { + continue + } + // Verify it's actually gt daemon (filters out unrelated processes) + if isGasTownDaemon(pid) { + pids = append(pids, pid) + } + } + + return pids, nil +} + +// KillOrphanedDaemons finds and kills any orphaned gt daemon processes. +// Returns number of processes killed. +func KillOrphanedDaemons() (int, error) { + pids, err := FindOrphanedDaemons() + if err != nil { + return 0, err + } + + killed := 0 + for _, pid := range pids { + process, err := os.FindProcess(pid) + if err != nil { + continue + } + + // Try SIGTERM first + if err := process.Signal(syscall.SIGTERM); err != nil { + continue + } + + // Wait for graceful shutdown + time.Sleep(200 * time.Millisecond) + + // Check if still alive + if err := process.Signal(syscall.Signal(0)); err == nil { + // Still alive, force kill + _ = process.Signal(syscall.SIGKILL) + } + + killed++ + } + + return killed, nil +} + // checkPolecatSessionHealth proactively validates polecat tmux sessions. // This detects crashed polecats that: // 1. Have work-on-hook (assigned work)