Implement transaction retry logic for SQLITE_BUSY (bd-ola6)

Add exponential backoff retry for BEGIN IMMEDIATE transactions to handle
concurrent write load without spurious failures.

Changes:
- Add IsBusyError() helper to detect database locked errors
- Add beginImmediateWithRetry() with exponential backoff (10ms, 20ms, 40ms, 80ms, 160ms)
- Update CreateIssue and CreateIssuesInBatch to use retry logic
- Add comprehensive tests for error detection and retry behavior
- Handles context cancellation between retry attempts
- Fails fast on non-busy errors

This eliminates spurious SQLITE_BUSY failures under normal concurrent usage
while maintaining proper error handling for other failure modes.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Steve Yegge
2025-11-24 01:10:04 -08:00
parent a8919fde9f
commit 125d02860c
5 changed files with 256 additions and 76 deletions

File diff suppressed because one or more lines are too long

View File

@@ -144,7 +144,8 @@ func (s *SQLiteStorage) CreateIssuesWithOptions(ctx context.Context, issues []*t
} }
defer func() { _ = conn.Close() }() defer func() { _ = conn.Close() }()
if _, err := conn.ExecContext(ctx, "BEGIN IMMEDIATE"); err != nil { // Use retry logic with exponential backoff to handle SQLITE_BUSY under concurrent load (bd-ola6)
if err := beginImmediateWithRetry(ctx, conn, 5, 10*time.Millisecond); err != nil {
return fmt.Errorf("failed to begin immediate transaction: %w", err) return fmt.Errorf("failed to begin immediate transaction: %w", err)
} }

View File

@@ -58,7 +58,9 @@ func (s *SQLiteStorage) CreateIssue(ctx context.Context, issue *types.Issue, act
// //
// We use raw Exec instead of BeginTx because database/sql doesn't support transaction // We use raw Exec instead of BeginTx because database/sql doesn't support transaction
// modes in BeginTx, and modernc.org/sqlite's BeginTx always uses DEFERRED mode. // modes in BeginTx, and modernc.org/sqlite's BeginTx always uses DEFERRED mode.
if _, err := conn.ExecContext(ctx, "BEGIN IMMEDIATE"); err != nil { //
// Use retry logic with exponential backoff to handle SQLITE_BUSY under concurrent load (bd-ola6)
if err := beginImmediateWithRetry(ctx, conn, 5, 10*time.Millisecond); err != nil {
return fmt.Errorf("failed to begin immediate transaction: %w", err) return fmt.Errorf("failed to begin immediate transaction: %w", err)
} }

View File

@@ -4,6 +4,7 @@ import (
"context" "context"
"database/sql" "database/sql"
"strings" "strings"
"time"
) )
// QueryContext exposes the underlying database QueryContext method for advanced queries // QueryContext exposes the underlying database QueryContext method for advanced queries
@@ -62,4 +63,73 @@ func IsForeignKeyConstraintError(err error) bool {
strings.Contains(errStr, "foreign key constraint failed") strings.Contains(errStr, "foreign key constraint failed")
} }
// IsBusyError checks if an error is a database busy/locked error
func IsBusyError(err error) bool {
if err == nil {
return false
}
errStr := err.Error()
return strings.Contains(errStr, "database is locked") ||
strings.Contains(errStr, "SQLITE_BUSY")
}
// beginImmediateWithRetry starts an IMMEDIATE transaction with exponential backoff retry
// on SQLITE_BUSY errors. This addresses bd-ola6: under concurrent write load, BEGIN IMMEDIATE
// can fail with SQLITE_BUSY, so we retry with exponential backoff instead of failing immediately.
//
// Parameters:
// - ctx: context for cancellation checking
// - conn: dedicated database connection (must use same connection for entire transaction)
// - maxRetries: maximum number of retry attempts (default: 5)
// - initialDelay: initial backoff delay (default: 10ms)
//
// Returns error if:
// - Context is cancelled
// - BEGIN IMMEDIATE fails with non-busy error
// - All retries exhausted with SQLITE_BUSY
func beginImmediateWithRetry(ctx context.Context, conn *sql.Conn, maxRetries int, initialDelay time.Duration) error {
if maxRetries <= 0 {
maxRetries = 5
}
if initialDelay <= 0 {
initialDelay = 10 * time.Millisecond
}
var lastErr error
delay := initialDelay
for attempt := 0; attempt <= maxRetries; attempt++ {
// Check context cancellation before each attempt
if err := ctx.Err(); err != nil {
return err
}
// Attempt to begin transaction
_, err := conn.ExecContext(ctx, "BEGIN IMMEDIATE")
if err == nil {
return nil // Success
}
lastErr = err
// If not a busy error, fail immediately
if !IsBusyError(err) {
return err
}
// On last attempt, don't sleep
if attempt == maxRetries {
break
}
// Exponential backoff: sleep before retry
select {
case <-time.After(delay):
delay *= 2 // Double the delay for next attempt
case <-ctx.Done():
return ctx.Err()
}
}
return lastErr // Return the last SQLITE_BUSY error after exhausting retries
}

View File

@@ -208,3 +208,110 @@ func TestQueryContext(t *testing.T) {
t.Error("Expected only one row") t.Error("Expected only one row")
} }
} }
func TestIsBusyError(t *testing.T) {
tests := []struct {
name string
err error
expected bool
}{
{
name: "nil error",
err: nil,
expected: false,
},
{
name: "database is locked",
err: errors.New("database is locked"),
expected: true,
},
{
name: "SQLITE_BUSY",
err: errors.New("SQLITE_BUSY"),
expected: true,
},
{
name: "SQLITE_BUSY with context",
err: errors.New("failed to begin: SQLITE_BUSY: database is locked"),
expected: true,
},
{
name: "other error",
err: errors.New("some other database error"),
expected: false,
},
{
name: "UNIQUE constraint error",
err: errors.New("UNIQUE constraint failed: issues.id"),
expected: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := IsBusyError(tt.err)
if result != tt.expected {
t.Errorf("IsBusyError(%v) = %v, want %v", tt.err, result, tt.expected)
}
})
}
}
func TestBeginImmediateWithRetry(t *testing.T) {
ctx := context.Background()
store := newTestStore(t, t.TempDir()+"/test.db")
defer store.Close()
t.Run("successful on first try", func(t *testing.T) {
conn, err := store.db.Conn(ctx)
if err != nil {
t.Fatalf("Failed to acquire connection: %v", err)
}
defer conn.Close()
err = beginImmediateWithRetry(ctx, conn, 5, 10)
if err != nil {
t.Errorf("beginImmediateWithRetry failed: %v", err)
}
// Rollback to clean up
_, _ = conn.ExecContext(context.Background(), "ROLLBACK")
})
t.Run("context cancellation", func(t *testing.T) {
conn, err := store.db.Conn(ctx)
if err != nil {
t.Fatalf("Failed to acquire connection: %v", err)
}
defer conn.Close()
cancelCtx, cancel := context.WithCancel(ctx)
cancel() // Cancel immediately
err = beginImmediateWithRetry(cancelCtx, conn, 5, 10)
if err == nil {
t.Error("Expected context cancellation error, got nil")
_, _ = conn.ExecContext(context.Background(), "ROLLBACK")
}
if !errors.Is(err, context.Canceled) {
t.Errorf("Expected context.Canceled, got %v", err)
}
})
t.Run("defaults for invalid parameters", func(t *testing.T) {
conn, err := store.db.Conn(ctx)
if err != nil {
t.Fatalf("Failed to acquire connection: %v", err)
}
defer conn.Close()
// Should use defaults (5 retries, 10ms delay) when passed invalid values
err = beginImmediateWithRetry(ctx, conn, 0, 0)
if err != nil {
t.Errorf("beginImmediateWithRetry with invalid params failed: %v", err)
}
// Rollback to clean up
_, _ = conn.ExecContext(context.Background(), "ROLLBACK")
})
}