Files
beads/cmd/bd/daemon_event_loop.go
Steve Yegge 58fe00057c feat: Complete GH #353 follow-up phases (bd-9nw, bd-u3t, bd-e0o)
Implements all three follow-up phases for sandbox environment support:

**Phase 1 (bd-9nw): Documentation** 
- Comprehensive sandbox troubleshooting section in TROUBLESHOOTING.md
  - Detailed symptoms, root causes, and escape hatches
  - Step-by-step troubleshooting workflow
  - Comparison table for --sandbox, --force, and --allow-stale flags
- Global flags section added to CLI_REFERENCE.md
  - Documents --sandbox, --allow-stale, and --force flags
  - Usage examples and when to use each flag
- GitHub issue #353 comment with immediate workarounds

**Phase 2 (bd-u3t): Sandbox Auto-Detection** 
- Automatic sandbox detection using syscall.Kill permission checks
  - cmd/bd/sandbox_unix.go: Unix/Linux/macOS implementation
  - cmd/bd/sandbox_windows.go: Windows stub (conservative approach)
  - cmd/bd/sandbox_test.go: Comprehensive test coverage
- Auto-enables sandbox mode when detected
  - Shows: "ℹ️  Sandbox detected, using direct mode"
  - Respects explicit --sandbox or --no-daemon flags
- Updated documentation to reflect auto-detection (v0.21.1+)

**Phase 3 (bd-e0o): Enhanced Daemon Robustness** 
- Permission-aware process checks in cmd/bd/daemon_unix.go
  - Correctly handles EPERM (operation not permitted) from syscall.Kill
  - Treats EPERM as "process exists but not signable" = running
  - Prevents false negatives in sandboxed environments
- Metadata health check in cmd/bd/daemon_event_loop.go
  - Periodic verification that metadata is accessible
  - Helps detect external import operations (bd import --force)
  - Non-fatal logging for diagnostics
- Comprehensive test suite in cmd/bd/daemon_unix_test.go
  - Self-check, init process, nonexistent process, parent process tests

**Impact:**
- Codex users: No manual intervention needed, auto-detected
- Stuck states: Three escape hatches (--sandbox, --force, --allow-stale)
- Daemon robustness: Handles permission-restricted environments gracefully
- All three follow-up issues (bd-9nw, bd-u3t, bd-e0o) closed

**Files changed:**
- cmd/bd/main.go: Auto-detection logic in PersistentPreRun
- cmd/bd/sandbox_unix.go: Unix sandbox detection (new)
- cmd/bd/sandbox_windows.go: Windows sandbox detection stub (new)
- cmd/bd/sandbox_test.go: Sandbox detection tests (new)
- cmd/bd/daemon_unix.go: Permission-aware isProcessRunning()
- cmd/bd/daemon_unix_test.go: Process check tests (new)
- cmd/bd/daemon_event_loop.go: Metadata health check
- docs/TROUBLESHOOTING.md: Comprehensive sandbox section
- docs/CLI_REFERENCE.md: Global flags documentation

Closes bd-9nw, bd-u3t, bd-e0o
Related: GH #353

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-21 19:32:45 -05:00

186 lines
5.0 KiB
Go

package main
import (
"context"
"os"
"os/signal"
"time"
"github.com/steveyegge/beads/internal/rpc"
"github.com/steveyegge/beads/internal/storage"
)
// runEventDrivenLoop implements event-driven daemon architecture.
// Replaces polling ticker with reactive event handlers for:
// - File system changes (JSONL modifications)
// - RPC mutations (create, update, delete)
// - Git operations (via hooks, optional)
// - Parent process monitoring (exit if parent dies)
func runEventDrivenLoop(
ctx context.Context,
cancel context.CancelFunc,
server *rpc.Server,
serverErrChan chan error,
store storage.Storage,
jsonlPath string,
doExport func(),
doAutoImport func(),
parentPID int,
log daemonLogger,
) {
sigChan := make(chan os.Signal, 1)
signal.Notify(sigChan, daemonSignals...)
defer signal.Stop(sigChan)
// Debounced sync actions
exportDebouncer := NewDebouncer(500*time.Millisecond, func() {
log.log("Export triggered by mutation events")
doExport()
})
defer exportDebouncer.Cancel()
importDebouncer := NewDebouncer(500*time.Millisecond, func() {
log.log("Import triggered by file change")
doAutoImport()
})
defer importDebouncer.Cancel()
// Start file watcher for JSONL changes
watcher, err := NewFileWatcher(jsonlPath, func() {
importDebouncer.Trigger()
})
var fallbackTicker *time.Ticker
if err != nil {
log.log("WARNING: File watcher unavailable (%v), using 60s polling fallback", err)
watcher = nil
// Fallback ticker to check for remote changes when watcher unavailable
fallbackTicker = time.NewTicker(60 * time.Second)
defer fallbackTicker.Stop()
} else {
watcher.Start(ctx, log)
defer func() { _ = watcher.Close() }()
}
// Handle mutation events from RPC server
mutationChan := server.MutationChan()
go func() {
for {
select {
case event, ok := <-mutationChan:
if !ok {
// Channel closed (should never happen, but handle defensively)
log.log("Mutation channel closed; exiting listener")
return
}
log.log("Mutation detected: %s %s", event.Type, event.IssueID)
exportDebouncer.Trigger()
case <-ctx.Done():
return
}
}
}()
// Periodic health check
healthTicker := time.NewTicker(60 * time.Second)
defer healthTicker.Stop()
// Parent process check (every 10 seconds)
parentCheckTicker := time.NewTicker(10 * time.Second)
defer parentCheckTicker.Stop()
// Dropped events safety net (faster recovery than health check)
droppedEventsTicker := time.NewTicker(1 * time.Second)
defer droppedEventsTicker.Stop()
for {
select {
case <-droppedEventsTicker.C:
// Check for dropped mutation events every second
dropped := server.ResetDroppedEventsCount()
if dropped > 0 {
log.log("WARNING: %d mutation events were dropped, triggering export", dropped)
exportDebouncer.Trigger()
}
case <-healthTicker.C:
// Periodic health validation (not sync)
checkDaemonHealth(ctx, store, log)
case <-parentCheckTicker.C:
// Check if parent process is still alive
if !checkParentProcessAlive(parentPID) {
log.log("Parent process (PID %d) died, shutting down daemon", parentPID)
cancel()
if err := server.Stop(); err != nil {
log.log("Error stopping server: %v", err)
}
return
}
case <-func() <-chan time.Time {
if fallbackTicker != nil {
return fallbackTicker.C
}
// Never fire if watcher is available
return make(chan time.Time)
}():
log.log("Fallback ticker: checking for remote changes")
importDebouncer.Trigger()
case sig := <-sigChan:
if isReloadSignal(sig) {
log.log("Received reload signal, ignoring")
continue
}
log.log("Received signal %v, shutting down...", sig)
cancel()
if err := server.Stop(); err != nil {
log.log("Error stopping server: %v", err)
}
return
case <-ctx.Done():
log.log("Context canceled, shutting down")
if watcher != nil {
_ = watcher.Close()
}
if err := server.Stop(); err != nil {
log.log("Error stopping server: %v", err)
}
return
case err := <-serverErrChan:
log.log("RPC server failed: %v", err)
cancel()
if watcher != nil {
_ = watcher.Close()
}
if stopErr := server.Stop(); stopErr != nil {
log.log("Error stopping server: %v", stopErr)
}
return
}
}
}
// checkDaemonHealth performs periodic health validation.
// Separate from sync operations - just validates state.
//
// Implements bd-e0o: Phase 3 daemon robustness for GH #353
func checkDaemonHealth(ctx context.Context, store storage.Storage, log daemonLogger) {
// Health check: Verify metadata is accessible
// This helps detect if external operations (like bd import --force) have modified metadata
// Without this, daemon may continue operating with stale metadata cache
if _, err := store.GetMetadata(ctx, "last_import_hash"); err != nil {
log.log("Health check: metadata read failed: %v", err)
// Non-fatal: daemon continues but logs the issue
// This helps diagnose stuck states in sandboxed environments
}
// TODO(bd-gqo): Add additional health checks:
// - Database integrity check
// - Disk space check
// - Memory usage check
}