fix(daemon): prevent zombie state after database file replacement (#1213)

fix(daemon): prevent zombie state after database file replacement

Adds checkFreshness() to health check paths (GetMetadata, GetConfig, GetAllConfig) and refactors reconnect() to validate new connection before closing old.

PR-URL: https://github.com/steveyegge/beads/pull/1213
Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
Roland Tritsch
2026-01-22 06:46:59 +00:00
committed by GitHub
parent 78e1d6f229
commit 5264d7aa60
8 changed files with 588 additions and 23 deletions

View File

@@ -24,6 +24,7 @@ func (s *SQLiteStorage) SetConfig(ctx context.Context, key, value string) error
// GetConfig gets a configuration value
func (s *SQLiteStorage) GetConfig(ctx context.Context, key string) (string, error) {
s.checkFreshness()
// Hold read lock during database operations to prevent reconnect() from
// closing the connection mid-query (GH#607 race condition fix)
s.reconnectMu.RLock()
@@ -39,6 +40,7 @@ func (s *SQLiteStorage) GetConfig(ctx context.Context, key string) (string, erro
// GetAllConfig gets all configuration key-value pairs
func (s *SQLiteStorage) GetAllConfig(ctx context.Context) (map[string]string, error) {
s.checkFreshness()
// Hold read lock during database operations to prevent reconnect() from
// closing the connection mid-query (GH#607 race condition fix)
s.reconnectMu.RLock()
@@ -114,6 +116,7 @@ func (s *SQLiteStorage) SetMetadata(ctx context.Context, key, value string) erro
// GetMetadata gets a metadata value (for internal state like import hashes)
func (s *SQLiteStorage) GetMetadata(ctx context.Context, key string) (string, error) {
s.checkFreshness()
// Hold read lock during database operations to prevent reconnect() from
// closing the connection mid-query (GH#607 race condition fix)
s.reconnectMu.RLock()

View File

@@ -2,6 +2,7 @@
package sqlite
import (
"log"
"os"
"sync"
"time"
@@ -112,10 +113,11 @@ func (fc *FreshnessChecker) Check() bool {
return false
}
// debugPrintf is a no-op in production but can be enabled for debugging
// debugPrintf conditionally logs debug messages when BD_DEBUG_FRESHNESS is set
var debugPrintf = func(format string, args ...interface{}) {
// Uncomment for debugging:
// fmt.Printf(format, args...)
if os.Getenv("BD_DEBUG_FRESHNESS") != "" {
log.Printf("[freshness] "+format, args...)
}
}
// DebugState returns the current tracked state for testing/debugging.

View File

@@ -983,3 +983,381 @@ func BenchmarkGetIssueWithFreshness(b *testing.B) {
}
}
}
// TestGetMetadataTriggersReconnection verifies that GetMetadata() detects file replacement.
// This addresses the bug where health checks call GetMetadata() but reconnection never triggers.
func TestGetMetadataTriggersReconnection(t *testing.T) {
// Create temp directory
tmpDir, err := os.MkdirTemp("", "beads-metadata-reconnect-*")
if err != nil {
t.Fatalf("failed to create temp dir: %v", err)
}
defer os.RemoveAll(tmpDir)
dbPath := filepath.Join(tmpDir, "test.db")
ctx := context.Background()
// Create daemon store
store, err := New(ctx, dbPath)
if err != nil {
t.Fatalf("failed to create store: %v", err)
}
defer store.Close()
// Initialize
if err := store.SetConfig(ctx, "issue_prefix", "bd"); err != nil {
t.Fatalf("failed to set issue_prefix: %v", err)
}
// Set a metadata value
if err := store.SetMetadata(ctx, "test_key", "original_value"); err != nil {
t.Fatalf("failed to set metadata: %v", err)
}
// Enable freshness checking
store.EnableFreshnessChecking()
// Get the current inode
inodeBefore := getInode(dbPath)
// Create a second database with different metadata
tmpDir2, err := os.MkdirTemp("", "beads-metadata-reconnect-2-*")
if err != nil {
t.Fatalf("failed to create temp dir 2: %v", err)
}
defer os.RemoveAll(tmpDir2)
dbPath2 := filepath.Join(tmpDir2, "test.db")
store2, err := New(ctx, dbPath2)
if err != nil {
t.Fatalf("failed to create store2: %v", err)
}
store2.SetConfig(ctx, "issue_prefix", "bd")
store2.SetMetadata(ctx, "test_key", "replaced_value")
store2.Close()
// Replace the database file (simulate git merge)
os.Remove(dbPath + "-wal")
os.Remove(dbPath + "-shm")
dbContent, err := os.ReadFile(dbPath2)
if err != nil {
t.Fatalf("failed to read replacement db: %v", err)
}
tempFile := dbPath + ".new"
if err := os.WriteFile(tempFile, dbContent, 0644); err != nil {
t.Fatalf("failed to write temp file: %v", err)
}
if err := os.Rename(tempFile, dbPath); err != nil {
t.Fatalf("failed to rename: %v", err)
}
inodeAfter := getInode(dbPath)
t.Logf("Inode changed: %d -> %d", inodeBefore, inodeAfter)
// Small delay to ensure filesystem settles
time.Sleep(100 * time.Millisecond)
// Call GetMetadata - this should trigger reconnection
value, err := store.GetMetadata(ctx, "test_key")
if err != nil {
t.Fatalf("GetMetadata failed: %v", err)
}
// Verify we see the new value from the replaced database
if value != "replaced_value" {
t.Errorf("GetMetadata did not trigger reconnection: got %q, want %q", value, "replaced_value")
} else {
t.Logf("SUCCESS: GetMetadata triggered reconnection and saw new value")
}
}
// TestGetConfigTriggersReconnection verifies that GetConfig() detects file replacement.
// This is critical because daemon auto-settings use GetConfig() frequently.
func TestGetConfigTriggersReconnection(t *testing.T) {
// Create temp directory
tmpDir, err := os.MkdirTemp("", "beads-config-reconnect-*")
if err != nil {
t.Fatalf("failed to create temp dir: %v", err)
}
defer os.RemoveAll(tmpDir)
dbPath := filepath.Join(tmpDir, "test.db")
ctx := context.Background()
// Create daemon store
store, err := New(ctx, dbPath)
if err != nil {
t.Fatalf("failed to create store: %v", err)
}
defer store.Close()
// Initialize with original config
if err := store.SetConfig(ctx, "issue_prefix", "bd"); err != nil {
t.Fatalf("failed to set issue_prefix: %v", err)
}
if err := store.SetConfig(ctx, "test_setting", "original"); err != nil {
t.Fatalf("failed to set test_setting: %v", err)
}
// Enable freshness checking
store.EnableFreshnessChecking()
// Get the current inode
inodeBefore := getInode(dbPath)
// Create a second database with different config
tmpDir2, err := os.MkdirTemp("", "beads-config-reconnect-2-*")
if err != nil {
t.Fatalf("failed to create temp dir 2: %v", err)
}
defer os.RemoveAll(tmpDir2)
dbPath2 := filepath.Join(tmpDir2, "test.db")
store2, err := New(ctx, dbPath2)
if err != nil {
t.Fatalf("failed to create store2: %v", err)
}
store2.SetConfig(ctx, "issue_prefix", "bd")
store2.SetConfig(ctx, "test_setting", "replaced")
store2.Close()
// Replace the database file
os.Remove(dbPath + "-wal")
os.Remove(dbPath + "-shm")
dbContent, err := os.ReadFile(dbPath2)
if err != nil {
t.Fatalf("failed to read replacement db: %v", err)
}
tempFile := dbPath + ".new"
if err := os.WriteFile(tempFile, dbContent, 0644); err != nil {
t.Fatalf("failed to write temp file: %v", err)
}
if err := os.Rename(tempFile, dbPath); err != nil {
t.Fatalf("failed to rename: %v", err)
}
inodeAfter := getInode(dbPath)
t.Logf("Inode changed: %d -> %d", inodeBefore, inodeAfter)
// Small delay to ensure filesystem settles
time.Sleep(100 * time.Millisecond)
// Call GetConfig - this should trigger reconnection
value, err := store.GetConfig(ctx, "test_setting")
if err != nil {
t.Fatalf("GetConfig failed: %v", err)
}
// Verify we see the new value from the replaced database
if value != "replaced" {
t.Errorf("GetConfig did not trigger reconnection: got %q, want %q", value, "replaced")
} else {
t.Logf("SUCCESS: GetConfig triggered reconnection and saw new value")
}
}
// TestHealthCheckAfterFileReplacement simulates the exact production bug scenario.
// Health checks call GetMetadata() every 60 seconds. If the database file is replaced
// and reconnection fails, the daemon enters a zombie state.
func TestHealthCheckAfterFileReplacement(t *testing.T) {
// Create temp directory
tmpDir, err := os.MkdirTemp("", "beads-health-check-*")
if err != nil {
t.Fatalf("failed to create temp dir: %v", err)
}
defer os.RemoveAll(tmpDir)
dbPath := filepath.Join(tmpDir, "test.db")
ctx := context.Background()
// Create daemon store
store, err := New(ctx, dbPath)
if err != nil {
t.Fatalf("failed to create store: %v", err)
}
defer store.Close()
// Initialize
if err := store.SetConfig(ctx, "issue_prefix", "bd"); err != nil {
t.Fatalf("failed to set issue_prefix: %v", err)
}
// Set health check metadata
if err := store.SetMetadata(ctx, "last_health_check", "2024-01-01T00:00:00Z"); err != nil {
t.Fatalf("failed to set health check metadata: %v", err)
}
// Enable freshness checking
store.EnableFreshnessChecking()
// Simulate first health check (before file replacement)
healthTime1, err := store.GetMetadata(ctx, "last_health_check")
if err != nil {
t.Fatalf("initial health check failed: %v", err)
}
t.Logf("Initial health check: %s", healthTime1)
// Get the current inode
inodeBefore := getInode(dbPath)
// Create a second database (simulating git merge bringing new database)
tmpDir2, err := os.MkdirTemp("", "beads-health-check-2-*")
if err != nil {
t.Fatalf("failed to create temp dir 2: %v", err)
}
defer os.RemoveAll(tmpDir2)
dbPath2 := filepath.Join(tmpDir2, "test.db")
store2, err := New(ctx, dbPath2)
if err != nil {
t.Fatalf("failed to create store2: %v", err)
}
store2.SetConfig(ctx, "issue_prefix", "bd")
store2.SetMetadata(ctx, "last_health_check", "2024-01-02T00:00:00Z")
store2.Close()
// Replace the database file (simulate git pull/merge)
os.Remove(dbPath + "-wal")
os.Remove(dbPath + "-shm")
dbContent, err := os.ReadFile(dbPath2)
if err != nil {
t.Fatalf("failed to read replacement db: %v", err)
}
tempFile := dbPath + ".new"
if err := os.WriteFile(tempFile, dbContent, 0644); err != nil {
t.Fatalf("failed to write temp file: %v", err)
}
if err := os.Rename(tempFile, dbPath); err != nil {
t.Fatalf("failed to rename: %v", err)
}
inodeAfter := getInode(dbPath)
t.Logf("File replacement: inode %d -> %d", inodeBefore, inodeAfter)
// Small delay to simulate time passing
time.Sleep(100 * time.Millisecond)
// Simulate health check after file replacement
// This is where the production bug occurred - GetMetadata() would fail with
// "sql: database is closed" and never recover
healthTime2, err := store.GetMetadata(ctx, "last_health_check")
if err != nil {
t.Errorf("Health check after file replacement failed: %v (ZOMBIE STATE BUG)", err)
} else {
t.Logf("Health check after replacement: %s", healthTime2)
if healthTime2 != "2024-01-02T00:00:00Z" {
t.Errorf("Health check returned stale data: got %q, want %q", healthTime2, "2024-01-02T00:00:00Z")
} else {
t.Logf("SUCCESS: Health check saw new data after reconnection")
}
}
// Verify subsequent health checks also work
for i := range 3 {
healthTime, err := store.GetMetadata(ctx, "last_health_check")
if err != nil {
t.Errorf("Health check %d failed: %v", i+1, err)
} else {
t.Logf("Health check %d: %s", i+1, healthTime)
}
time.Sleep(50 * time.Millisecond)
}
}
// TestReconnectNewConnectionFailure verifies that if a new connection fails to open,
// the old connection remains usable. This is the key improvement from Phase 2 of the fix.
func TestReconnectNewConnectionFailure(t *testing.T) {
// Create temp directory
tmpDir, err := os.MkdirTemp("", "beads-reconnect-failure-*")
if err != nil {
t.Fatalf("failed to create temp dir: %v", err)
}
defer os.RemoveAll(tmpDir)
dbPath := filepath.Join(tmpDir, "test.db")
ctx := context.Background()
// Create daemon store
store, err := New(ctx, dbPath)
if err != nil {
t.Fatalf("failed to create store: %v", err)
}
defer store.Close()
// Initialize
if err := store.SetConfig(ctx, "issue_prefix", "bd"); err != nil {
t.Fatalf("failed to set issue_prefix: %v", err)
}
// Create an issue
issue := &types.Issue{
Title: "Test Issue",
Status: types.StatusOpen,
Priority: 2,
IssueType: types.TypeTask,
}
if err := store.CreateIssue(ctx, issue, "test-user"); err != nil {
t.Fatalf("failed to create issue: %v", err)
}
issueID := issue.ID
// Enable freshness checking
store.EnableFreshnessChecking()
// Verify issue is accessible
gotIssue, err := store.GetIssue(ctx, issueID)
if err != nil || gotIssue == nil {
t.Fatalf("issue should be accessible: err=%v", err)
}
// Now simulate a scenario where reconnect() would be called but would fail
// We'll delete the database file entirely, which will cause the new connection
// to fail to open properly
os.Remove(dbPath)
os.Remove(dbPath + "-wal")
os.Remove(dbPath + "-shm")
t.Logf("Database file deleted")
// Touch the file to trigger freshness check (create empty file)
// This will change the inode and trigger reconnection attempt
if err := os.WriteFile(dbPath, []byte{}, 0644); err != nil {
t.Fatalf("failed to create empty db file: %v", err)
}
// Small delay
time.Sleep(100 * time.Millisecond)
// Try to access the issue - with Phase 2 fix, the OLD connection should still work
// even though reconnect() fails (because new connection is invalid)
// NOTE: This might still fail because the old connection's file descriptor
// points to the deleted file, which is expected. The key test is that we don't
// get a panic or enter a zombie state.
_, err = store.GetIssue(ctx, issueID)
// The operation might fail (which is okay since file was deleted),
// but it should NOT panic and should handle the error gracefully
t.Logf("GetIssue after file deletion: err=%v", err)
// The important thing is that the store is still in a consistent state
// and can recover once the database is restored
// Now restore a valid database
store2, err := New(ctx, dbPath)
if err != nil {
t.Fatalf("failed to create new store: %v", err)
}
store2.SetConfig(ctx, "issue_prefix", "bd")
store2.Close()
// After restoration, freshness check should detect the change and reconnect
time.Sleep(100 * time.Millisecond)
// Try to use the store again - should work after database is restored
_, err = store.GetConfig(ctx, "issue_prefix")
t.Logf("GetConfig after database restoration: err=%v", err)
// The key test: verify that store didn't enter zombie state
// If Phase 2 fix is working, GetConfig should either succeed or fail gracefully
// without leaving the store permanently broken
}

View File

@@ -491,7 +491,11 @@ func (s *SQLiteStorage) DisableFreshnessChecking() {
// This should be called before read operations in daemon mode.
func (s *SQLiteStorage) checkFreshness() {
if s.freshness != nil {
s.freshness.Check()
wasReplaced := s.freshness.Check()
if wasReplaced {
// Log that reconnection was attempted
debugPrintf("Database file replaced, reconnection triggered\n")
}
}
}
@@ -502,25 +506,19 @@ func (s *SQLiteStorage) reconnect() error {
defer s.reconnectMu.Unlock()
if s.closed.Load() {
return nil
return fmt.Errorf("storage is closed")
}
// Close the old connection - log but continue since connection may be stale/invalid
if err := s.db.Close(); err != nil {
// Old connection might already be broken after file replacement - this is expected
debugPrintf("reconnect: close old connection: %v (continuing)\n", err)
}
// Open a new connection
// Open NEW connection FIRST (don't close old one yet)
db, err := sql.Open("sqlite3", s.connStr)
if err != nil {
return fmt.Errorf("failed to reconnect: %w", err)
return fmt.Errorf("failed to open new connection: %w", err)
}
// Restore connection pool settings
// Configure connection pool for new connection
s.configureConnectionPool(db)
// Re-enable WAL mode for file-based databases (or DELETE for WSL2 Windows paths)
// Re-enable WAL mode (or DELETE for WSL2)
isInMemory := s.dbPath == ":memory:" ||
(strings.HasPrefix(s.connStr, "file:") && strings.Contains(s.connStr, "mode=memory"))
if !isInMemory {
@@ -530,19 +528,25 @@ func (s *SQLiteStorage) reconnect() error {
}
if _, err := db.Exec("PRAGMA journal_mode=" + journalMode); err != nil {
_ = db.Close()
return fmt.Errorf("failed to enable %s mode on reconnect: %w", journalMode, err)
return fmt.Errorf("failed to enable %s mode: %w", journalMode, err)
}
}
// Test the new connection
// VALIDATE new connection works
if err := db.Ping(); err != nil {
_ = db.Close()
return fmt.Errorf("failed to ping on reconnect: %w", err)
return fmt.Errorf("failed to ping new connection: %w", err)
}
// Swap in the new connection
// SUCCESS: Swap connections (old one can now fail safely)
oldDB := s.db
s.db = db
// Close old connection (errors are non-fatal since file may be deleted)
if err := oldDB.Close(); err != nil {
debugPrintf("reconnect: close old connection: %v (non-fatal)\n", err)
}
// Update freshness checker state
if s.freshness != nil {
s.freshness.UpdateState()