Fix daemon orphaning: track parent PID and exit when parent dies

- Add ParentPID field to DaemonLockInfo struct
- Daemon monitors parent process every 10 seconds
- Gracefully exits when parent process dies
- Prevents accumulation of orphaned daemons from dead sessions
- Fixes race conditions from multiple daemons on same database

Closes bd-zpnq
This commit is contained in:
Steve Yegge
2025-11-07 18:57:43 -08:00
parent a6c9579645
commit 7fa0c93195
5 changed files with 105 additions and 5 deletions

View File

@@ -372,6 +372,10 @@ func runDaemonLoop(interval time.Duration, autoCommit, autoPush bool, logPath, p
doSync := createSyncFunc(ctx, store, autoCommit, autoPush, log)
doSync()
// Get parent PID for monitoring (exit if parent dies)
parentPID := os.Getppid()
log.log("Monitoring parent process (PID %d)", parentPID)
// Choose event loop based on BEADS_DAEMON_MODE
daemonMode := os.Getenv("BEADS_DAEMON_MODE")
if daemonMode == "" {
@@ -385,18 +389,18 @@ func runDaemonLoop(interval time.Duration, autoCommit, autoPush bool, logPath, p
if jsonlPath == "" {
log.log("Error: JSONL path not found, cannot use event-driven mode")
log.log("Falling back to polling mode")
runEventLoop(ctx, cancel, ticker, doSync, server, serverErrChan, log)
runEventLoop(ctx, cancel, ticker, doSync, server, serverErrChan, parentPID, log)
} else {
// Event-driven mode uses separate export-only and import-only functions
doExport := createExportFunc(ctx, store, autoCommit, autoPush, log)
doAutoImport := createAutoImportFunc(ctx, store, log)
runEventDrivenLoop(ctx, cancel, server, serverErrChan, store, jsonlPath, doExport, doAutoImport, log)
runEventDrivenLoop(ctx, cancel, server, serverErrChan, store, jsonlPath, doExport, doAutoImport, parentPID, log)
}
case "poll":
log.log("Using polling mode (interval: %v)", interval)
runEventLoop(ctx, cancel, ticker, doSync, server, serverErrChan, log)
runEventLoop(ctx, cancel, ticker, doSync, server, serverErrChan, parentPID, log)
default:
log.log("Unknown BEADS_DAEMON_MODE: %s (valid: poll, events), defaulting to poll", daemonMode)
runEventLoop(ctx, cancel, ticker, doSync, server, serverErrChan, log)
runEventLoop(ctx, cancel, ticker, doSync, server, serverErrChan, parentPID, log)
}
}

View File

@@ -15,6 +15,7 @@ import (
// - File system changes (JSONL modifications)
// - RPC mutations (create, update, delete)
// - Git operations (via hooks, optional)
// - Parent process monitoring (exit if parent dies)
func runEventDrivenLoop(
ctx context.Context,
cancel context.CancelFunc,
@@ -24,6 +25,7 @@ func runEventDrivenLoop(
jsonlPath string,
doExport func(),
doAutoImport func(),
parentPID int,
log daemonLogger,
) {
sigChan := make(chan os.Signal, 1)
@@ -83,6 +85,10 @@ func runEventDrivenLoop(
healthTicker := time.NewTicker(60 * time.Second)
defer healthTicker.Stop()
// Parent process check (every 10 seconds)
parentCheckTicker := time.NewTicker(10 * time.Second)
defer parentCheckTicker.Stop()
// Dropped events safety net (faster recovery than health check)
droppedEventsTicker := time.NewTicker(1 * time.Second)
defer droppedEventsTicker.Stop()
@@ -101,6 +107,17 @@ func runEventDrivenLoop(
// Periodic health validation (not sync)
checkDaemonHealth(ctx, store, log)
case <-parentCheckTicker.C:
// Check if parent process is still alive
if !checkParentProcessAlive(parentPID) {
log.log("Parent process (PID %d) died, shutting down daemon", parentPID)
cancel()
if err := server.Stop(); err != nil {
log.log("Error stopping server: %v", err)
}
return
}
case <-func() <-chan time.Time {
if fallbackTicker != nil {
return fallbackTicker.C

View File

@@ -16,6 +16,7 @@ var ErrDaemonLocked = errors.New("daemon lock already held by another process")
// DaemonLockInfo represents the metadata stored in the daemon.lock file
type DaemonLockInfo struct {
PID int `json:"pid"`
ParentPID int `json:"parent_pid,omitempty"` // Parent process ID (0 if not tracked)
Database string `json:"database"`
Version string `json:"version"`
StartedAt time.Time `json:"started_at"`
@@ -63,6 +64,7 @@ func acquireDaemonLock(beadsDir string, dbPath string) (*DaemonLock, error) {
// Write JSON metadata to the lock file
lockInfo := DaemonLockInfo{
PID: os.Getpid(),
ParentPID: os.Getppid(),
Database: dbPath,
Version: Version,
StartedAt: time.Now().UTC(),

View File

@@ -0,0 +1,45 @@
package main
import (
"os/exec"
"path/filepath"
"testing"
)
// TestDaemonExitsWhenParentDies verifies that the daemon exits when its parent process dies
func TestDaemonExitsWhenParentDies(t *testing.T) {
if testing.Short() {
t.Skip("Skipping integration test in short mode")
}
t.Skip("Manual test - requires daemon to be run externally")
// This is a manual test scenario:
// 1. Start a shell process that spawns the daemon
// 2. Verify daemon tracks parent PID
// 3. Kill the shell process
// 4. Verify daemon exits within 10-15 seconds
//
// To test manually:
// $ sh -c 'bd daemon --interval 5s & sleep 100' &
// $ SHELL_PID=$!
// $ # Check daemon.lock has parent_pid set to SHELL_PID
// $ kill $SHELL_PID
// $ # Daemon should exit within 10-15 seconds
}
func mustAbs(t *testing.T, path string) string {
abs, err := filepath.Abs(path)
if err != nil {
t.Fatalf("Failed to get absolute path: %v", err)
}
return abs
}
func runGitCmd(t *testing.T, dir string, args ...string) {
cmd := exec.Command("git", args...)
cmd.Dir = dir
if err := cmd.Run(); err != nil {
t.Fatalf("git %v failed: %v", args, err)
}
}

View File

@@ -73,12 +73,34 @@ func runGlobalDaemon(log daemonLogger) {
log.log("Global daemon stopped")
}
// checkParentProcessAlive checks if the parent process is still running.
// Returns true if parent is alive, false if it died.
// Returns true if parent PID is 0 or 1 (not tracked, or adopted by init).
func checkParentProcessAlive(parentPID int) bool {
if parentPID == 0 {
// Parent PID not tracked (older lock files)
return true
}
if parentPID == 1 {
// Adopted by init - parent died
return false
}
// Check if parent process is running
return isProcessRunning(parentPID)
}
// runEventLoop runs the daemon event loop (polling mode)
func runEventLoop(ctx context.Context, cancel context.CancelFunc, ticker *time.Ticker, doSync func(), server *rpc.Server, serverErrChan chan error, log daemonLogger) {
func runEventLoop(ctx context.Context, cancel context.CancelFunc, ticker *time.Ticker, doSync func(), server *rpc.Server, serverErrChan chan error, parentPID int, log daemonLogger) {
sigChan := make(chan os.Signal, 1)
signal.Notify(sigChan, daemonSignals...)
defer signal.Stop(sigChan)
// Parent process check (every 10 seconds)
parentCheckTicker := time.NewTicker(10 * time.Second)
defer parentCheckTicker.Stop()
for {
select {
case <-ticker.C:
@@ -86,6 +108,16 @@ func runEventLoop(ctx context.Context, cancel context.CancelFunc, ticker *time.T
return
}
doSync()
case <-parentCheckTicker.C:
// Check if parent process is still alive
if !checkParentProcessAlive(parentPID) {
log.log("Parent process (PID %d) died, shutting down daemon", parentPID)
cancel()
if err := server.Stop(); err != nil {
log.log("Error stopping server: %v", err)
}
return
}
case sig := <-sigChan:
if isReloadSignal(sig) {
log.log("Received reload signal, ignoring (daemon continues running)")