Fix daemon shutdown detection bug
## Problem

`gt shutdown` failed to stop orphaned daemon processes because the detection mechanism ignored errors and had no fallback.

## Root Cause

`stopDaemonIfRunning()` ignored errors from `daemon.IsRunning()`, which caused:

1. Stale PID files to hide running daemons
2. Corrupted PID files to silently return false
3. No fallback detection for orphaned processes
4. An early return when no sessions were running, which skipped the daemon check

## Solution

1. Enhanced `IsRunning()` to return detailed errors
2. Added process name verification (prevents PID reuse false positives)
3. Added fallback orphan detection using `pgrep`
4. Fixed `stopDaemonIfRunning()` to handle errors and use the fallback
5. Added a daemon check even when no sessions are running

## Testing

Verified that shutdown now:

- Detects and reports stale/corrupted PID files
- Finds orphaned daemon processes
- Kills all daemon processes reliably
- Reports detailed status during shutdown
- Works even when no other sessions are running

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
committed by
Steve Yegge
parent
2aadb0165b
commit
6bfe61f796
@@ -1,6 +1,7 @@
|
||||
package daemon
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
@@ -680,31 +681,63 @@ func IsRunning(townRoot string) (bool, int, error) {
|
||||
if os.IsNotExist(err) {
|
||||
return false, 0, nil
|
||||
}
|
||||
return false, 0, err
|
||||
// Return error for other failures (permissions, I/O)
|
||||
return false, 0, fmt.Errorf("reading PID file: %w", err)
|
||||
}
|
||||
|
||||
pid, err := strconv.Atoi(string(data))
|
||||
pidStr := strings.TrimSpace(string(data))
|
||||
pid, err := strconv.Atoi(pidStr)
|
||||
if err != nil {
|
||||
return false, 0, nil
|
||||
// Corrupted PID file - return error, not silent false
|
||||
return false, 0, fmt.Errorf("invalid PID in file %q: %w", pidStr, err)
|
||||
}
|
||||
|
||||
// Check if process is running
|
||||
// Check if process is alive
|
||||
process, err := os.FindProcess(pid)
|
||||
if err != nil {
|
||||
return false, 0, nil
|
||||
}
|
||||
|
||||
// On Unix, FindProcess always succeeds. Send signal 0 to check if alive.
|
||||
err = process.Signal(syscall.Signal(0))
|
||||
if err != nil {
|
||||
if err := process.Signal(syscall.Signal(0)); err != nil {
|
||||
// Process not running, clean up stale PID file
|
||||
_ = os.Remove(pidFile)
|
||||
if err := os.Remove(pidFile); err == nil {
|
||||
// Successfully cleaned up stale file
|
||||
return false, 0, fmt.Errorf("removed stale PID file (process %d not found)", pid)
|
||||
}
|
||||
return false, 0, nil
|
||||
}
|
||||
|
||||
// CRITICAL: Verify it's actually our daemon, not PID reuse
|
||||
if !isGasTownDaemon(pid) {
|
||||
// PID reused by different process
|
||||
if err := os.Remove(pidFile); err == nil {
|
||||
return false, 0, fmt.Errorf("removed stale PID file (PID %d is not gt daemon)", pid)
|
||||
}
|
||||
return false, 0, nil
|
||||
}
|
||||
|
||||
return true, pid, nil
|
||||
}
|
||||
|
||||
// isGasTownDaemon reports whether pid is currently running "gt daemon run".
//
// It guards against PID reuse: a PID taken from a stale PID file may now
// belong to an unrelated process. The command line is matched argument by
// argument rather than by substring, so a command that merely contains the
// letters "gt", "daemon" and "run" somewhere (for example
// "some-daemon --pidfile /run/x.pid --max-length 10") is not mistaken for the
// daemon.
//
// Any failure to obtain the command line yields false.
func isGasTownDaemon(pid int) bool {
	return isGasTownDaemonArgv(processArgv(pid))
}

// processArgv returns the argument vector of pid, or nil if it cannot be
// determined. It reads /proc/<pid>/cmdline and, when that file cannot be read
// (for example on systems without /proc), falls back to asking ps.
func processArgv(pid int) []string {
	if data, err := os.ReadFile(fmt.Sprintf("/proc/%d/cmdline", pid)); err == nil {
		// cmdline is NUL-separated and normally NUL-terminated.
		return strings.Split(strings.TrimRight(string(data), "\x00"), "\x00")
	}

	out, err := exec.Command("ps", "-p", strconv.Itoa(pid), "-o", "args=").Output()
	if err != nil {
		return nil
	}
	// ps prints the arguments space-joined, so arguments that themselves
	// contain whitespace are split here; that is acceptable for this check.
	return strings.Fields(string(out))
}

// isGasTownDaemonArgv reports whether argv is "gt daemon run" or
// "/path/to/gt daemon run", optionally with other arguments between the
// executable and the adjacent "daemon" "run" pair.
func isGasTownDaemonArgv(argv []string) bool {
	if len(argv) < 3 {
		return false
	}

	// Compare only the executable's base name so absolute paths match too.
	exe := argv[0]
	if i := strings.LastIndexByte(exe, '/'); i >= 0 {
		exe = exe[i+1:]
	}
	if exe != "gt" {
		return false
	}

	for i := 1; i+1 < len(argv); i++ {
		if argv[i] == "daemon" && argv[i+1] == "run" {
			return true
		}
	}
	return false
}
|
||||
|
||||
// StopDaemon stops the running daemon for the given town.
|
||||
// Note: The file lock in Run() prevents multiple daemons per town, so we only
|
||||
// need to kill the process from the PID file.
|
||||
@@ -743,6 +776,74 @@ func StopDaemon(townRoot string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// FindOrphanedDaemons finds all gt daemon run processes that aren't tracked by PID file.
|
||||
// Returns list of orphaned PIDs.
|
||||
func FindOrphanedDaemons() ([]int, error) {
|
||||
// Use pgrep to find all "daemon run" processes (broad search, then verify with isGasTownDaemon)
|
||||
cmd := exec.Command("pgrep", "-f", "daemon run")
|
||||
output, err := cmd.Output()
|
||||
if err != nil {
|
||||
// Exit code 1 means no processes found - that's OK
|
||||
if exitErr, ok := err.(*exec.ExitError); ok && exitErr.ExitCode() == 1 {
|
||||
return nil, nil
|
||||
}
|
||||
return nil, fmt.Errorf("pgrep failed: %w", err)
|
||||
}
|
||||
|
||||
// Parse PIDs
|
||||
var pids []int
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(output)), "\n") {
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
pid, err := strconv.Atoi(line)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
// Verify it's actually gt daemon (filters out unrelated processes)
|
||||
if isGasTownDaemon(pid) {
|
||||
pids = append(pids, pid)
|
||||
}
|
||||
}
|
||||
|
||||
return pids, nil
|
||||
}
|
||||
|
||||
// KillOrphanedDaemons finds and kills any orphaned gt daemon processes.
|
||||
// Returns number of processes killed.
|
||||
func KillOrphanedDaemons() (int, error) {
|
||||
pids, err := FindOrphanedDaemons()
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
killed := 0
|
||||
for _, pid := range pids {
|
||||
process, err := os.FindProcess(pid)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
// Try SIGTERM first
|
||||
if err := process.Signal(syscall.SIGTERM); err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
// Wait for graceful shutdown
|
||||
time.Sleep(200 * time.Millisecond)
|
||||
|
||||
// Check if still alive
|
||||
if err := process.Signal(syscall.Signal(0)); err == nil {
|
||||
// Still alive, force kill
|
||||
_ = process.Signal(syscall.SIGKILL)
|
||||
}
|
||||
|
||||
killed++
|
||||
}
|
||||
|
||||
return killed, nil
|
||||
}
|
||||
|
||||
// checkPolecatSessionHealth proactively validates polecat tmux sessions.
|
||||
// This detects crashed polecats that:
|
||||
// 1. Have work-on-hook (assigned work)
|
||||
|
||||
Reference in New Issue
Block a user