fix(sqlite): use BEGIN IMMEDIATE without retry loop (GH#1272)
Some checks failed
CI / Check version consistency (push) Successful in 3s
CI / Check for .beads changes (push) Has been skipped
CI / Test (ubuntu-latest) (push) Failing after 8m13s
CI / Lint (push) Failing after 3m18s
CI / Test Nix Flake (push) Failing after 1m7s
Nightly Full Tests / Full Test Suite (push) Failing after 36m59s
CI / Test (macos-latest) (push) Has been cancelled
CI / Test (Windows - smoke) (push) Has been cancelled

The original PR added retry logic on top of BEGIN IMMEDIATE, but this caused
multi-minute hangs because:

1. Connection has busy_timeout=30s set via pragma
2. Each BEGIN IMMEDIATE waits up to 30s before returning SQLITE_BUSY
3. With 5 retries (6 attempts in total: the initial try plus 5 retries), worst case was 6 × 30s = 180+ seconds

The fix removes the retry loop since SQLite's busy_timeout already handles
retries internally. BEGIN IMMEDIATE still acquires the write lock early,
preventing deadlocks - we just let busy_timeout handle contention.

Root cause analysis in bd-9ldm.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
ruby
2026-01-22 18:55:30 -08:00
committed by John Ogle
parent 442ad0f0e5
commit a45b441bc5
5 changed files with 32 additions and 99 deletions

View File

@@ -273,8 +273,9 @@ func (s *SQLiteStorage) CreateIssuesWithFullOptions(ctx context.Context, issues
}
defer func() { _ = conn.Close() }()
// Use retry logic with exponential backoff to handle SQLITE_BUSY under concurrent load
if err := beginImmediateWithRetry(ctx, conn, 5, 10*time.Millisecond); err != nil {
// Start IMMEDIATE transaction to acquire write lock early.
// The connection's busy_timeout pragma (30s) handles retries if locked.
if _, err := conn.ExecContext(ctx, "BEGIN IMMEDIATE"); err != nil {
return fmt.Errorf("failed to begin immediate transaction: %w", err)
}

View File

@@ -158,8 +158,8 @@ func (s *SQLiteStorage) CreateIssue(ctx context.Context, issue *types.Issue, act
// We use raw Exec instead of BeginTx because database/sql doesn't support transaction
// modes in BeginTx, and modernc.org/sqlite's BeginTx always uses DEFERRED mode.
//
// Use retry logic with exponential backoff to handle SQLITE_BUSY under concurrent load
if err := beginImmediateWithRetry(ctx, conn, 5, 10*time.Millisecond); err != nil {
// The connection's busy_timeout pragma (30s) handles retries if locked.
if _, err := conn.ExecContext(ctx, "BEGIN IMMEDIATE"); err != nil {
return fmt.Errorf("failed to begin immediate transaction: %w", err)
}

View File

@@ -48,8 +48,9 @@ func (s *SQLiteStorage) RunInTransaction(ctx context.Context, fn func(tx storage
defer func() { _ = conn.Close() }()
// Start IMMEDIATE transaction to acquire write lock early.
// Use retry logic with exponential backoff to handle SQLITE_BUSY
if err := beginImmediateWithRetry(ctx, conn, 5, 10*time.Millisecond); err != nil {
// BEGIN IMMEDIATE prevents deadlocks by acquiring the write lock upfront.
// The connection's busy_timeout pragma (30s) handles retries if locked.
if _, err := conn.ExecContext(ctx, "BEGIN IMMEDIATE"); err != nil {
return fmt.Errorf("failed to begin transaction: %w", err)
}

View File

@@ -4,7 +4,6 @@ import (
"context"
"database/sql"
"strings"
"time"
)
// QueryContext exposes the underlying database QueryContext method for advanced queries
@@ -18,10 +17,13 @@ func (s *SQLiteStorage) BeginTx(ctx context.Context) (*sql.Tx, error) {
return s.db.BeginTx(ctx, nil)
}
// withTx executes a function within a database transaction with retry logic.
// Uses BEGIN IMMEDIATE with exponential backoff retry on SQLITE_BUSY errors.
// If the function returns an error, the transaction is rolled back.
// Otherwise, the transaction is committed.
// withTx executes a function within a database transaction.
// Uses BEGIN IMMEDIATE to acquire the write lock early, preventing deadlocks
// in concurrent scenarios. If the function returns an error, the transaction
// is rolled back. Otherwise, the transaction is committed.
//
// The connection's busy_timeout pragma (30s by default) handles SQLITE_BUSY
// retries internally - no additional retry logic is needed here.
//
// This fixes GH#1272: database lock errors during concurrent operations.
func (s *SQLiteStorage) withTx(ctx context.Context, fn func(*sql.Conn) error) error {
@@ -34,8 +36,10 @@ func (s *SQLiteStorage) withTx(ctx context.Context, fn func(*sql.Conn) error) er
defer func() { _ = conn.Close() }()
// Start IMMEDIATE transaction to acquire write lock early.
// Use retry logic with exponential backoff to handle SQLITE_BUSY
if err := beginImmediateWithRetry(ctx, conn, 5, 10*time.Millisecond); err != nil {
// BEGIN IMMEDIATE prevents deadlocks by acquiring the write lock upfront
// rather than upgrading from a read lock later. The connection's
// busy_timeout pragma (30s) handles retries if another writer holds the lock.
if _, err := conn.ExecContext(ctx, "BEGIN IMMEDIATE"); err != nil {
return wrapDBError("begin transaction", err)
}
@@ -103,64 +107,3 @@ func IsBusyError(err error) bool {
return strings.Contains(errStr, "database is locked") ||
strings.Contains(errStr, "SQLITE_BUSY")
}
// beginImmediateWithRetry issues "BEGIN IMMEDIATE" on conn and, while the
// statement keeps failing with a busy error (as reported by IsBusyError),
// re-issues it after a pause that doubles on every retry (bd-ola6).
//
// Arguments:
//   - ctx: checked before every attempt and while waiting between attempts.
//   - conn: the dedicated connection the transaction will run on; the caller
//     must keep using this same connection for the rest of the transaction.
//   - maxRetries: number of retries after the first attempt; values <= 0
//     fall back to 5.
//   - initialDelay: pause before the first retry; values <= 0 fall back to
//     10ms.
//
// The result is nil once the transaction has been started. Otherwise it is
// one of:
//   - the context's error, if ctx is done before an attempt or during a wait;
//   - the statement's error, returned immediately when it is not a busy error;
//   - the busy error from the final attempt once all retries are used up.
func beginImmediateWithRetry(ctx context.Context, conn *sql.Conn, maxRetries int, initialDelay time.Duration) error {
	const (
		fallbackRetries = 5
		fallbackDelay   = 10 * time.Millisecond
	)
	if maxRetries <= 0 {
		maxRetries = fallbackRetries
	}
	if initialDelay <= 0 {
		initialDelay = fallbackDelay
	}

	backoff := initialDelay
	// remaining counts the retries still available after the current attempt,
	// so the statement is tried maxRetries+1 times in total.
	for remaining := maxRetries; ; remaining-- {
		// Give up right away if the caller has already cancelled.
		if ctxErr := ctx.Err(); ctxErr != nil {
			return ctxErr
		}

		_, execErr := conn.ExecContext(ctx, "BEGIN IMMEDIATE")
		switch {
		case execErr == nil:
			return nil // transaction is open
		case !IsBusyError(execErr):
			return execErr // only busy errors are worth retrying
		case remaining == 0:
			return execErr // out of retries: surface the last busy error without sleeping
		}

		// Wait out the current backoff, then double it for the next round.
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(backoff):
			backoff *= 2
		}
	}
}

View File

@@ -4,6 +4,7 @@ import (
"context"
"database/sql"
"errors"
"strings"
"testing"
)
@@ -257,21 +258,24 @@ func TestIsBusyError(t *testing.T) {
}
}
func TestBeginImmediateWithRetry(t *testing.T) {
// TestBeginImmediate tests that BEGIN IMMEDIATE transactions work correctly.
// Note: The retry logic was removed because SQLite's busy_timeout pragma (30s)
// already handles retries internally. See GH#1272 for details.
func TestBeginImmediate(t *testing.T) {
ctx := context.Background()
store := newTestStore(t, t.TempDir()+"/test.db")
defer store.Close()
t.Run("successful on first try", func(t *testing.T) {
t.Run("successful BEGIN IMMEDIATE", func(t *testing.T) {
conn, err := store.db.Conn(ctx)
if err != nil {
t.Fatalf("Failed to acquire connection: %v", err)
}
defer conn.Close()
err = beginImmediateWithRetry(ctx, conn, 5, 10)
_, err = conn.ExecContext(ctx, "BEGIN IMMEDIATE")
if err != nil {
t.Errorf("beginImmediateWithRetry failed: %v", err)
t.Errorf("BEGIN IMMEDIATE failed: %v", err)
}
// Rollback to clean up
@@ -288,30 +292,14 @@ func TestBeginImmediateWithRetry(t *testing.T) {
cancelCtx, cancel := context.WithCancel(ctx)
cancel() // Cancel immediately
err = beginImmediateWithRetry(cancelCtx, conn, 5, 10)
_, err = conn.ExecContext(cancelCtx, "BEGIN IMMEDIATE")
if err == nil {
t.Error("Expected context cancellation error, got nil")
t.Error("Expected error due to canceled context, got nil")
_, _ = conn.ExecContext(context.Background(), "ROLLBACK")
}
if !errors.Is(err, context.Canceled) {
t.Errorf("Expected context.Canceled, got %v", err)
// sqlite3 driver returns "interrupted" error rather than wrapping context.Canceled
if err != nil && !errors.Is(err, context.Canceled) && !strings.Contains(err.Error(), "interrupt") {
t.Errorf("Expected context cancellation or interrupt error, got %v", err)
}
})
t.Run("defaults for invalid parameters", func(t *testing.T) {
conn, err := store.db.Conn(ctx)
if err != nil {
t.Fatalf("Failed to acquire connection: %v", err)
}
defer conn.Close()
// Should use defaults (5 retries, 10ms delay) when passed invalid values
err = beginImmediateWithRetry(ctx, conn, 0, 0)
if err != nil {
t.Errorf("beginImmediateWithRetry with invalid params failed: %v", err)
}
// Rollback to clean up
_, _ = conn.ExecContext(context.Background(), "ROLLBACK")
})
}