fix(sqlite): use BEGIN IMMEDIATE without retry loop (GH#1272)
Some checks failed
CI / Check version consistency (push) Successful in 3s
CI / Check for .beads changes (push) Has been skipped
CI / Test (ubuntu-latest) (push) Failing after 8m13s
CI / Lint (push) Failing after 3m18s
CI / Test Nix Flake (push) Failing after 1m7s
Nightly Full Tests / Full Test Suite (push) Failing after 36m59s
CI / Test (macos-latest) (push) Has been cancelled
CI / Test (Windows - smoke) (push) Has been cancelled
Some checks failed
CI / Check version consistency (push) Successful in 3s
CI / Check for .beads changes (push) Has been skipped
CI / Test (ubuntu-latest) (push) Failing after 8m13s
CI / Lint (push) Failing after 3m18s
CI / Test Nix Flake (push) Failing after 1m7s
Nightly Full Tests / Full Test Suite (push) Failing after 36m59s
CI / Test (macos-latest) (push) Has been cancelled
CI / Test (Windows - smoke) (push) Has been cancelled
The original PR added retry logic on top of BEGIN IMMEDIATE, but this caused multi-minute hangs because:

1. The connection has busy_timeout=30s set via pragma.
2. Each BEGIN IMMEDIATE waits up to 30s before returning SQLITE_BUSY.
3. With 5 retries, the worst case was 5 × 30s = 150+ seconds.

The fix removes the retry loop, since SQLite's busy_timeout already handles retries internally. BEGIN IMMEDIATE still acquires the write lock early, preventing deadlocks; we just let busy_timeout handle contention.

Root cause analysis in bd-9ldm.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -273,8 +273,9 @@ func (s *SQLiteStorage) CreateIssuesWithFullOptions(ctx context.Context, issues
|
||||
}
|
||||
defer func() { _ = conn.Close() }()
|
||||
|
||||
// Use retry logic with exponential backoff to handle SQLITE_BUSY under concurrent load
|
||||
if err := beginImmediateWithRetry(ctx, conn, 5, 10*time.Millisecond); err != nil {
|
||||
// Start IMMEDIATE transaction to acquire write lock early.
|
||||
// The connection's busy_timeout pragma (30s) handles retries if locked.
|
||||
if _, err := conn.ExecContext(ctx, "BEGIN IMMEDIATE"); err != nil {
|
||||
return fmt.Errorf("failed to begin immediate transaction: %w", err)
|
||||
}
|
||||
|
||||
|
||||
@@ -158,8 +158,8 @@ func (s *SQLiteStorage) CreateIssue(ctx context.Context, issue *types.Issue, act
|
||||
// We use raw Exec instead of BeginTx because database/sql doesn't support transaction
|
||||
// modes in BeginTx, and modernc.org/sqlite's BeginTx always uses DEFERRED mode.
|
||||
//
|
||||
// Use retry logic with exponential backoff to handle SQLITE_BUSY under concurrent load
|
||||
if err := beginImmediateWithRetry(ctx, conn, 5, 10*time.Millisecond); err != nil {
|
||||
// The connection's busy_timeout pragma (30s) handles retries if locked.
|
||||
if _, err := conn.ExecContext(ctx, "BEGIN IMMEDIATE"); err != nil {
|
||||
return fmt.Errorf("failed to begin immediate transaction: %w", err)
|
||||
}
|
||||
|
||||
|
||||
@@ -48,8 +48,9 @@ func (s *SQLiteStorage) RunInTransaction(ctx context.Context, fn func(tx storage
|
||||
defer func() { _ = conn.Close() }()
|
||||
|
||||
// Start IMMEDIATE transaction to acquire write lock early.
|
||||
// Use retry logic with exponential backoff to handle SQLITE_BUSY
|
||||
if err := beginImmediateWithRetry(ctx, conn, 5, 10*time.Millisecond); err != nil {
|
||||
// BEGIN IMMEDIATE prevents deadlocks by acquiring the write lock upfront.
|
||||
// The connection's busy_timeout pragma (30s) handles retries if locked.
|
||||
if _, err := conn.ExecContext(ctx, "BEGIN IMMEDIATE"); err != nil {
|
||||
return fmt.Errorf("failed to begin transaction: %w", err)
|
||||
}
|
||||
|
||||
|
||||
@@ -4,7 +4,6 @@ import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// QueryContext exposes the underlying database QueryContext method for advanced queries
|
||||
@@ -18,10 +17,13 @@ func (s *SQLiteStorage) BeginTx(ctx context.Context) (*sql.Tx, error) {
|
||||
return s.db.BeginTx(ctx, nil)
|
||||
}
|
||||
|
||||
// withTx executes a function within a database transaction with retry logic.
|
||||
// Uses BEGIN IMMEDIATE with exponential backoff retry on SQLITE_BUSY errors.
|
||||
// If the function returns an error, the transaction is rolled back.
|
||||
// Otherwise, the transaction is committed.
|
||||
// withTx executes a function within a database transaction.
|
||||
// Uses BEGIN IMMEDIATE to acquire the write lock early, preventing deadlocks
|
||||
// in concurrent scenarios. If the function returns an error, the transaction
|
||||
// is rolled back. Otherwise, the transaction is committed.
|
||||
//
|
||||
// The connection's busy_timeout pragma (30s by default) handles SQLITE_BUSY
|
||||
// retries internally - no additional retry logic is needed here.
|
||||
//
|
||||
// This fixes GH#1272: database lock errors during concurrent operations.
|
||||
func (s *SQLiteStorage) withTx(ctx context.Context, fn func(*sql.Conn) error) error {
|
||||
@@ -34,8 +36,10 @@ func (s *SQLiteStorage) withTx(ctx context.Context, fn func(*sql.Conn) error) er
|
||||
defer func() { _ = conn.Close() }()
|
||||
|
||||
// Start IMMEDIATE transaction to acquire write lock early.
|
||||
// Use retry logic with exponential backoff to handle SQLITE_BUSY
|
||||
if err := beginImmediateWithRetry(ctx, conn, 5, 10*time.Millisecond); err != nil {
|
||||
// BEGIN IMMEDIATE prevents deadlocks by acquiring the write lock upfront
|
||||
// rather than upgrading from a read lock later. The connection's
|
||||
// busy_timeout pragma (30s) handles retries if another writer holds the lock.
|
||||
if _, err := conn.ExecContext(ctx, "BEGIN IMMEDIATE"); err != nil {
|
||||
return wrapDBError("begin transaction", err)
|
||||
}
|
||||
|
||||
@@ -103,64 +107,3 @@ func IsBusyError(err error) bool {
|
||||
return strings.Contains(errStr, "database is locked") ||
|
||||
strings.Contains(errStr, "SQLITE_BUSY")
|
||||
}
|
||||
|
||||
// beginImmediateWithRetry starts an IMMEDIATE transaction with exponential backoff retry
|
||||
// on SQLITE_BUSY errors. This addresses bd-ola6: under concurrent write load, BEGIN IMMEDIATE
|
||||
// can fail with SQLITE_BUSY, so we retry with exponential backoff instead of failing immediately.
|
||||
//
|
||||
// Parameters:
|
||||
// - ctx: context for cancellation checking
|
||||
// - conn: dedicated database connection (must use same connection for entire transaction)
|
||||
// - maxRetries: maximum number of retry attempts (default: 5)
|
||||
// - initialDelay: initial backoff delay (default: 10ms)
|
||||
//
|
||||
// Returns error if:
|
||||
// - Context is canceled
|
||||
// - BEGIN IMMEDIATE fails with non-busy error
|
||||
// - All retries exhausted with SQLITE_BUSY
|
||||
func beginImmediateWithRetry(ctx context.Context, conn *sql.Conn, maxRetries int, initialDelay time.Duration) error {
|
||||
if maxRetries <= 0 {
|
||||
maxRetries = 5
|
||||
}
|
||||
if initialDelay <= 0 {
|
||||
initialDelay = 10 * time.Millisecond
|
||||
}
|
||||
|
||||
var lastErr error
|
||||
delay := initialDelay
|
||||
|
||||
for attempt := 0; attempt <= maxRetries; attempt++ {
|
||||
// Check context cancellation before each attempt
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Attempt to begin transaction
|
||||
_, err := conn.ExecContext(ctx, "BEGIN IMMEDIATE")
|
||||
if err == nil {
|
||||
return nil // Success
|
||||
}
|
||||
|
||||
lastErr = err
|
||||
|
||||
// If not a busy error, fail immediately
|
||||
if !IsBusyError(err) {
|
||||
return err
|
||||
}
|
||||
|
||||
// On last attempt, don't sleep
|
||||
if attempt == maxRetries {
|
||||
break
|
||||
}
|
||||
|
||||
// Exponential backoff: sleep before retry
|
||||
select {
|
||||
case <-time.After(delay):
|
||||
delay *= 2 // Double the delay for next attempt
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
}
|
||||
}
|
||||
|
||||
return lastErr // Return the last SQLITE_BUSY error after exhausting retries
|
||||
}
|
||||
|
||||
@@ -4,6 +4,7 @@ import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"errors"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
@@ -257,21 +258,24 @@ func TestIsBusyError(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestBeginImmediateWithRetry(t *testing.T) {
|
||||
// TestBeginImmediate tests that BEGIN IMMEDIATE transactions work correctly.
|
||||
// Note: The retry logic was removed because SQLite's busy_timeout pragma (30s)
|
||||
// already handles retries internally. See GH#1272 for details.
|
||||
func TestBeginImmediate(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
store := newTestStore(t, t.TempDir()+"/test.db")
|
||||
defer store.Close()
|
||||
|
||||
t.Run("successful on first try", func(t *testing.T) {
|
||||
t.Run("successful BEGIN IMMEDIATE", func(t *testing.T) {
|
||||
conn, err := store.db.Conn(ctx)
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to acquire connection: %v", err)
|
||||
}
|
||||
defer conn.Close()
|
||||
|
||||
err = beginImmediateWithRetry(ctx, conn, 5, 10)
|
||||
_, err = conn.ExecContext(ctx, "BEGIN IMMEDIATE")
|
||||
if err != nil {
|
||||
t.Errorf("beginImmediateWithRetry failed: %v", err)
|
||||
t.Errorf("BEGIN IMMEDIATE failed: %v", err)
|
||||
}
|
||||
|
||||
// Rollback to clean up
|
||||
@@ -288,30 +292,14 @@ func TestBeginImmediateWithRetry(t *testing.T) {
|
||||
cancelCtx, cancel := context.WithCancel(ctx)
|
||||
cancel() // Cancel immediately
|
||||
|
||||
err = beginImmediateWithRetry(cancelCtx, conn, 5, 10)
|
||||
_, err = conn.ExecContext(cancelCtx, "BEGIN IMMEDIATE")
|
||||
if err == nil {
|
||||
t.Error("Expected context cancellation error, got nil")
|
||||
t.Error("Expected error due to canceled context, got nil")
|
||||
_, _ = conn.ExecContext(context.Background(), "ROLLBACK")
|
||||
}
|
||||
if !errors.Is(err, context.Canceled) {
|
||||
t.Errorf("Expected context.Canceled, got %v", err)
|
||||
// sqlite3 driver returns "interrupted" error rather than wrapping context.Canceled
|
||||
if err != nil && !errors.Is(err, context.Canceled) && !strings.Contains(err.Error(), "interrupt") {
|
||||
t.Errorf("Expected context cancellation or interrupt error, got %v", err)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("defaults for invalid parameters", func(t *testing.T) {
|
||||
conn, err := store.db.Conn(ctx)
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to acquire connection: %v", err)
|
||||
}
|
||||
defer conn.Close()
|
||||
|
||||
// Should use defaults (5 retries, 10ms delay) when passed invalid values
|
||||
err = beginImmediateWithRetry(ctx, conn, 0, 0)
|
||||
if err != nil {
|
||||
t.Errorf("beginImmediateWithRetry with invalid params failed: %v", err)
|
||||
}
|
||||
|
||||
// Rollback to clean up
|
||||
_, _ = conn.ExecContext(context.Background(), "ROLLBACK")
|
||||
})
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user