Fix daemon shutdown detection bug

## Problem
gt shutdown failed to stop orphaned daemon processes because the
detection mechanism ignored errors and had no fallback.

## Root Cause
stopDaemonIfRunning() ignored errors from daemon.IsRunning(), and shutdown
had no other way to locate the daemon. As a result:
1. Stale PID files hid running daemons
2. Corrupted PID files silently reported the daemon as not running
3. There was no fallback detection for orphaned processes
4. An early return when no sessions were running skipped the daemon check entirely

## Solution
1. Enhanced IsRunning() to return detailed errors
2. Added process name verification (prevents PID reuse false positives)
3. Added fallback orphan detection using pgrep
4. Fixed stopDaemonIfRunning() to handle errors and use fallback
5. Added daemon check even when no sessions are running

## Testing
Verified shutdown now:
- Detects and reports stale/corrupted PID files
- Finds orphaned daemon processes
- Kills all daemon processes reliably
- Reports detailed status during shutdown
- Works even when no other sessions are running

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
Roland Tritsch
2026-01-18 07:51:48 +00:00
committed by Steve Yegge
parent 2aadb0165b
commit 6bfe61f796
2 changed files with 152 additions and 11 deletions

View File

@@ -450,6 +450,14 @@ func runShutdown(cmd *cobra.Command, args []string) error {
if len(toStop) == 0 {
fmt.Printf("%s Gas Town was not running\n", style.Dim.Render("○"))
// Still check for orphaned daemons even if no sessions are running
if townRoot != "" {
fmt.Println()
fmt.Println("Checking for orphaned daemon...")
stopDaemonIfRunning(townRoot)
}
return nil
}
@@ -797,16 +805,48 @@ func cleanupPolecats(townRoot string) {
// stopDaemonIfRunning stops the daemon if it is running.
// This prevents the daemon from restarting agents after shutdown.
// Uses robust detection with fallback to process search.
//
// townRoot is passed to daemon.IsRunning and daemon.StopDaemon to select the
// daemon to stop. The function returns nothing; every outcome (detection
// warning, stop success/failure, orphan search result) is printed to stdout.
//
// NOTE(review): several statements below appear in old/new pairs (the two
// IsRunning calls and the paired status Printf lines). This looks like a
// rendered diff whose +/- markers were lost; presumably only the second line
// of each pair is live code - confirm against the repository.
func stopDaemonIfRunning(townRoot string) {
running, _, _ := daemon.IsRunning(townRoot)
// Primary detection: PID file
running, pid, err := daemon.IsRunning(townRoot)
if err != nil {
// Detection error - report it but continue with fallback
fmt.Printf(" %s Daemon detection warning: %s\n", style.Bold.Render("⚠"), err.Error())
}
if running {
// PID file points to live daemon - stop it
if err := daemon.StopDaemon(townRoot); err != nil {
fmt.Printf(" %s Daemon: %s\n", style.Dim.Render("○"), err.Error())
fmt.Printf(" %s Failed to stop daemon (PID %d): %s\n",
style.Bold.Render("✗"), pid, err.Error())
} else {
fmt.Printf(" %s Daemon stopped\n", style.Bold.Render("✓"))
fmt.Printf(" %s Daemon stopped (was PID %d)\n", style.Bold.Render("✓"), pid)
}
} else {
fmt.Printf(" %s Daemon not running\n", style.Dim.Render("○"))
fmt.Printf(" %s Daemon not tracked by PID file\n", style.Dim.Render("○"))
}
// Fallback: Search for orphaned daemon processes
// This runs whatever the PID-file check found; nothing above returns early.
// NOTE(review): FindOrphanedDaemons is called without townRoot, so the search
// is not limited to this town's daemon - confirm that is intended on hosts
// running more than one town.
orphaned, err := daemon.FindOrphanedDaemons()
if err != nil {
// A failed search ends the function here; no kill is attempted.
fmt.Printf(" %s Warning: failed to search for orphaned daemons: %v\n",
style.Dim.Render("○"), err)
return
}
if len(orphaned) > 0 {
fmt.Printf(" %s Found %d orphaned daemon process(es): %v\n",
style.Bold.Render("⚠"), len(orphaned), orphaned)
// KillOrphanedDaemons is not handed the list printed above; it is called
// with no arguments, so the processes it signals may differ from that list.
killed, err := daemon.KillOrphanedDaemons()
if err != nil {
fmt.Printf(" %s Failed to kill orphaned daemons: %v\n",
style.Bold.Render("✗"), err)
} else if killed > 0 {
fmt.Printf(" %s Killed %d orphaned daemon(s)\n",
style.Bold.Render("✓"), killed)
}
// killed == 0 with a nil error prints nothing.
}
}

View File

@@ -1,6 +1,7 @@
package daemon
import (
"bytes"
"context"
"encoding/json"
"fmt"
@@ -680,31 +681,63 @@ func IsRunning(townRoot string) (bool, int, error) {
if os.IsNotExist(err) {
return false, 0, nil
}
return false, 0, err
// Return error for other failures (permissions, I/O)
return false, 0, fmt.Errorf("reading PID file: %w", err)
}
pid, err := strconv.Atoi(string(data))
pidStr := strings.TrimSpace(string(data))
pid, err := strconv.Atoi(pidStr)
if err != nil {
return false, 0, nil
// Corrupted PID file - return error, not silent false
return false, 0, fmt.Errorf("invalid PID in file %q: %w", pidStr, err)
}
// Check if process is running
// Check if process is alive
process, err := os.FindProcess(pid)
if err != nil {
return false, 0, nil
}
// On Unix, FindProcess always succeeds. Send signal 0 to check if alive.
err = process.Signal(syscall.Signal(0))
if err != nil {
if err := process.Signal(syscall.Signal(0)); err != nil {
// Process not running, clean up stale PID file
_ = os.Remove(pidFile)
if err := os.Remove(pidFile); err == nil {
// Successfully cleaned up stale file
return false, 0, fmt.Errorf("removed stale PID file (process %d not found)", pid)
}
return false, 0, nil
}
// CRITICAL: Verify it's actually our daemon, not PID reuse
if !isGasTownDaemon(pid) {
// PID reused by different process
if err := os.Remove(pidFile); err == nil {
return false, 0, fmt.Errorf("removed stale PID file (PID %d is not gt daemon)", pid)
}
return false, 0, nil
}
return true, pid, nil
}
// isGasTownDaemon reports whether pid is a "gt daemon run" process.
//
// It reads /proc/<pid>/cmdline and checks the argument vector, which guards
// against acting on a PID that has been reused by an unrelated program.
// Any failure to read the file (process gone, no permission, or a system
// without /proc) yields false.
func isGasTownDaemon(pid int) bool {
	data, err := os.ReadFile(fmt.Sprintf("/proc/%d/cmdline", pid))
	if err != nil {
		return false
	}
	return isGasTownDaemonCmdline(data)
}

// isGasTownDaemonCmdline reports whether raw, the NUL-separated contents of a
// /proc/<pid>/cmdline file, describes "gt daemon run" or "/path/to/gt daemon run".
//
// The executable's base name must be exactly "gt" and the arguments must
// contain "daemon" immediately followed by "run". Matching whole arguments
// instead of substrings matters because the result decides which processes
// get signalled: a substring test would also accept command lines such as
// "bigtable-emulator daemon run".
func isGasTownDaemonCmdline(raw []byte) bool {
	// The kernel terminates the last argument with NUL; drop it so Split does
	// not produce a trailing empty argument. An empty cmdline means no match.
	raw = bytes.TrimRight(raw, "\x00")
	if len(raw) == 0 {
		return false
	}
	args := bytes.Split(raw, []byte{0})

	exe := string(args[0])
	if i := strings.LastIndexByte(exe, '/'); i >= 0 {
		exe = exe[i+1:]
	}
	if exe != "gt" {
		return false
	}

	// Look for the "daemon run" subcommand; flags may precede it.
	for i := 1; i+1 < len(args); i++ {
		if string(args[i]) == "daemon" && string(args[i+1]) == "run" {
			return true
		}
	}
	return false
}
// StopDaemon stops the running daemon for the given town.
// Note: The file lock in Run() prevents multiple daemons per town, so we only
// need to kill the process from the PID file.
@@ -743,6 +776,74 @@ func StopDaemon(townRoot string) error {
return nil
}
// FindOrphanedDaemons searches the process table for gt daemon processes and
// returns their PIDs.
//
// Candidates come from `pgrep -f "daemon run"` (a deliberately broad match)
// and each one is then confirmed with isGasTownDaemon. The calling process is
// never included, so a caller that is itself a daemon cannot end up
// signalling itself.
//
// NOTE: the result is not cross-checked against any PID file and the search
// takes no town root, so every matching gt daemon visible to pgrep is
// returned, including one that is still tracked by a PID file.
//
// It returns (nil, nil) when pgrep exits with status 1, which pgrep uses for
// "no processes matched". Any other pgrep failure (including pgrep not being
// installed) is returned wrapped.
func FindOrphanedDaemons() ([]int, error) {
	out, err := exec.Command("pgrep", "-f", "daemon run").Output()
	if err != nil {
		// Exit code 1 means no processes found - that's OK.
		if exitErr, ok := err.(*exec.ExitError); ok && exitErr.ExitCode() == 1 {
			return nil, nil
		}
		return nil, fmt.Errorf("pgrep failed: %w", err)
	}

	self := os.Getpid()
	var pids []int
	// pgrep prints one PID per line; Fields also tolerates padding and blank lines.
	for _, field := range strings.Fields(string(out)) {
		pid, err := strconv.Atoi(field)
		if err != nil || pid <= 0 || pid == self {
			// Unparseable token, a value that is not a real PID, or ourselves.
			continue
		}
		// Verify it's actually gt daemon (filters out unrelated processes).
		if isGasTownDaemon(pid) {
			pids = append(pids, pid)
		}
	}
	return pids, nil
}
// KillOrphanedDaemons finds gt daemon processes via FindOrphanedDaemons and
// terminates them.
//
// Every process is first sent SIGTERM. After a single shared grace period,
// any process that still responds to signal 0 is sent SIGKILL.
//
// The returned count covers only processes that either disappeared after
// SIGTERM or accepted SIGKILL. Processes that could not be sent SIGTERM at
// all (already gone, or not ours to signal) are skipped and not counted.
// If a process survived SIGTERM and SIGKILL could not be delivered, the count
// of the others is still returned together with an error listing the
// surviving PIDs. An error from FindOrphanedDaemons is returned unchanged
// with a count of 0.
func KillOrphanedDaemons() (int, error) {
	pids, err := FindOrphanedDaemons()
	if err != nil {
		return 0, err
	}

	// How long processes get to exit on SIGTERM before SIGKILL is used.
	const gracePeriod = 200 * time.Millisecond

	// Phase 1: ask every process to shut down, so they all share one grace
	// period instead of costing one sleep each.
	pending := make([]*os.Process, 0, len(pids))
	for _, pid := range pids {
		proc, err := os.FindProcess(pid)
		if err != nil {
			continue
		}
		if err := proc.Signal(syscall.SIGTERM); err != nil {
			_ = proc.Release()
			continue
		}
		pending = append(pending, proc)
	}
	if len(pending) == 0 {
		return 0, nil
	}

	time.Sleep(gracePeriod)

	// Phase 2: force-kill whatever is still alive and count confirmed results.
	killed := 0
	var survivors []int
	for _, proc := range pending {
		if stillAlive := proc.Signal(syscall.Signal(0)) == nil; !stillAlive {
			// Exited on SIGTERM.
			killed++
		} else if err := proc.Signal(syscall.SIGKILL); err == nil {
			killed++
		} else if proc.Signal(syscall.Signal(0)) != nil {
			// SIGKILL failed only because the process exited in the meantime.
			killed++
		} else {
			survivors = append(survivors, proc.Pid)
		}
		// Wait is never called on these handles, so release them explicitly.
		_ = proc.Release()
	}

	if len(survivors) > 0 {
		return killed, fmt.Errorf("could not kill daemon process(es) %v", survivors)
	}
	return killed, nil
}
// checkPolecatSessionHealth proactively validates polecat tmux sessions.
// This detects crashed polecats that:
// 1. Have work-on-hook (assigned work)