Merge pull request #108 from jakehemmerle/fix/daemon-orphan-race-condition

fix(daemon): prevent orphan daemons via file locking
This commit is contained in:
Steve Yegge
2026-01-04 13:01:12 -08:00
committed by GitHub
2 changed files with 34 additions and 4 deletions
+10 -1
View File
@@ -117,7 +117,7 @@ func runDaemonStart(cmd *cobra.Command, args []string) error {
return fmt.Errorf("starting daemon: %w", err)
}
// Wait a moment for the daemon to initialize
// Wait a moment for the daemon to initialize and acquire the lock
time.Sleep(200 * time.Millisecond)
// Verify it started
@@ -129,6 +129,15 @@ func runDaemonStart(cmd *cobra.Command, args []string) error {
return fmt.Errorf("daemon failed to start (check logs with 'gt daemon logs')")
}
// Check if our spawned process is the one that won the race.
// If another concurrent start won, our process would have exited after
// failing to acquire the lock, and the PID file would have a different PID.
if pid != daemonCmd.Process.Pid {
// Another daemon won the race - that's fine, report it
fmt.Printf("%s Daemon already running (PID %d)\n", style.Bold.Render("●"), pid)
return nil
}
fmt.Printf("%s Daemon started (PID %d)\n", style.Bold.Render("✓"), pid)
return nil
}
+24 -3
View File
@@ -67,6 +67,22 @@ func New(config *Config) (*Daemon, error) {
func (d *Daemon) Run() error {
d.logger.Printf("Daemon starting (PID %d)", os.Getpid())
// Acquire exclusive lock to prevent multiple daemons from running.
// This prevents the TOCTOU race condition where multiple concurrent starts
// can all pass the IsRunning() check before any writes the PID file.
lockFile := filepath.Join(d.config.TownRoot, "daemon", "daemon.lock")
lock, err := os.OpenFile(lockFile, os.O_CREATE|os.O_RDWR, 0644)
if err != nil {
return fmt.Errorf("opening lock file: %w", err)
}
defer lock.Close()
// Try to acquire exclusive lock (non-blocking)
if err := syscall.Flock(int(lock.Fd()), syscall.LOCK_EX|syscall.LOCK_NB); err != nil {
return fmt.Errorf("daemon already running (lock held by another process)")
}
defer func() { _ = syscall.Flock(int(lock.Fd()), syscall.LOCK_UN) }()
// Write PID file
if err := os.WriteFile(d.config.PidFile, []byte(strconv.Itoa(os.Getpid())), 0644); err != nil {
return fmt.Errorf("writing PID file: %w", err)
@@ -684,6 +700,9 @@ func (d *Daemon) Stop() {
}
// IsRunning checks if a daemon is running for the given town.
// It checks the PID file and verifies the process is alive.
// Note: The file lock in Run() is the authoritative mechanism for preventing
// duplicate daemons. This function is for status checks and cleanup.
func IsRunning(townRoot string) (bool, int, error) {
pidFile := filepath.Join(townRoot, "daemon", "daemon.pid")
data, err := os.ReadFile(pidFile)
@@ -708,7 +727,7 @@ func IsRunning(townRoot string) (bool, int, error) {
// On Unix, FindProcess always succeeds. Send signal 0 to check if alive.
err = process.Signal(syscall.Signal(0))
if err != nil {
// Process not running, clean up stale PID file (best-effort cleanup)
// Process not running, clean up stale PID file
_ = os.Remove(pidFile)
return false, 0, nil
}
@@ -717,6 +736,8 @@ func IsRunning(townRoot string) (bool, int, error) {
}
// StopDaemon stops the running daemon for the given town.
// Note: The file lock in Run() prevents multiple daemons per town, so we only
// need to kill the process from the PID file.
func StopDaemon(townRoot string) error {
running, pid, err := IsRunning(townRoot)
if err != nil {
@@ -741,11 +762,11 @@ func StopDaemon(townRoot string) error {
// Check if still running
if err := process.Signal(syscall.Signal(0)); err == nil {
// Still running, force kill (best-effort)
// Still running, force kill
_ = process.Signal(syscall.SIGKILL)
}
// Clean up PID file (best-effort cleanup)
// Clean up PID file
pidFile := filepath.Join(townRoot, "daemon", "daemon.pid")
_ = os.Remove(pidFile)