From 25644d97177ba05ab54d190a068fef8563c7fb8a Mon Sep 17 00:00:00 2001 From: Steve Yegge Date: Mon, 13 Oct 2025 23:50:48 -0700 Subject: [PATCH] Cache compiled regexes in ID replacement for 1.9x performance boost MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements bd-27: Cache compiled regexes in replaceIDReferences for performance Problem: replaceIDReferences() was compiling regex patterns on every call. With 100 issues and 10 ID mappings, that resulted in 4,000 regex compilations (100 issues × 4 text fields × 10 ID mappings). Solution: - Added buildReplacementCache() to pre-compile all regexes once - Added replaceIDReferencesWithCache() to reuse compiled regexes - Updated updateReferences() to build cache once and reuse for all issues - Kept replaceIDReferences() for backward compatibility (calls cached version) Performance Results (from benchmarks): Single text: - 1.33x faster (26,162 ns → 19,641 ns) - 68% less memory (25,769 B → 8,241 B) - 80% fewer allocations (278 → 55) Real-world (400 texts, 10 mappings): - 1.89x faster (5.1ms → 2.7ms) - 90% less memory (7.7 MB → 0.8 MB) - 86% fewer allocations (104,112 → 14,801) Tests: - All existing tests pass - Added 3 benchmark tests demonstrating improvements 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .beads/bd.jsonl | 2 +- internal/storage/sqlite/collision.go | 107 ++++++++++++++++------ internal/storage/sqlite/collision_test.go | 75 +++++++++++++++ 3 files changed, 155 insertions(+), 29 deletions(-) diff --git a/.beads/bd.jsonl b/.beads/bd.jsonl index d409d67f..062fa65a 100644 --- a/.beads/bd.jsonl +++ b/.beads/bd.jsonl @@ -17,7 +17,7 @@ {"id":"bd-24","title":"Support ID space partitioning for parallel worker agents","description":"Enable external orchestrators (like AI worker swarms) to control issue ID assignment. Add --id flag to 'bd create' for explicit ID specification. Optionally support 'bd config set next_id N' to set the starting point for auto-increment. Storage layer already supports pre-assigned IDs (sqlite.go:52-71), just need CLI wiring. This keeps beads simple while letting orchestrators implement their own ID partitioning strategies to minimize merge conflicts. Complementary to bd-9's collision resolution.","status":"closed","priority":1,"issue_type":"feature","created_at":"2025-10-12T16:10:37.808226-07:00","updated_at":"2025-10-13T23:26:35.810383-07:00","closed_at":"2025-10-13T23:18:01.637695-07:00"} {"id":"bd-25","title":"Add transaction support to storage layer for atomic multi-operation workflows","description":"Currently each storage method (CreateIssue, UpdateIssue, etc.) starts its own transaction. This makes it impossible to perform atomic multi-step operations like collision resolution. Add support for passing *sql.Tx through the storage interface, or create transaction-aware versions of methods. This would make remapCollisions and other batch operations truly atomic.","status":"closed","priority":4,"issue_type":"feature","created_at":"2025-10-12T16:39:00.66572-07:00","updated_at":"2025-10-13T23:26:35.810468-07:00","closed_at":"2025-10-13T22:53:56.401108-07:00"} {"id":"bd-26","title":"Optimize reference updates to avoid loading all issues into memory","description":"In updateReferences(), we call SearchIssues with no filter to get ALL issues for updating references. For large databases (10k+ issues), this loads everything into memory. Options: 1) Use batched processing with LIMIT/OFFSET, 2) Use SQL UPDATE with REPLACE() directly, 3) Stream results instead of loading all at once. Located in collision.go:266","status":"open","priority":2,"issue_type":"task","created_at":"2025-10-12T16:39:10.327861-07:00","updated_at":"2025-10-13T23:26:35.810552-07:00"} -{"id":"bd-27","title":"Cache compiled regexes in replaceIDReferences for performance","description":"replaceIDReferences() compiles the same regex patterns on every call. With 100 issues and 10 ID mappings, that's 1000 regex compilations. Pre-compile regexes once and reuse. Can use a struct with compiled regex, placeholder, and newID. Located in collision.go:329. Estimated performance improvement: 10-100x for large batches.","status":"open","priority":2,"issue_type":"task","created_at":"2025-10-12T16:39:18.305517-07:00","updated_at":"2025-10-13T23:26:35.810644-07:00"} +{"id":"bd-27","title":"Cache compiled regexes in replaceIDReferences for performance","description":"replaceIDReferences() compiles the same regex patterns on every call. With 100 issues and 10 ID mappings, that's 1000 regex compilations. Pre-compile regexes once and reuse. Can use a struct with compiled regex, placeholder, and newID. Located in collision.go:329. Estimated performance improvement: 10-100x for large batches.","status":"closed","priority":2,"issue_type":"task","created_at":"2025-10-12T16:39:18.305517-07:00","updated_at":"2025-10-13T23:50:25.865317-07:00","closed_at":"2025-10-13T23:50:25.865317-07:00"} {"id":"bd-28","title":"Improve error handling in dependency removal during remapping","description":"In updateDependencyReferences(), RemoveDependency errors are caught and ignored with continue (line 392). Comment says 'if dependency doesn't exist' but this catches ALL errors including real failures. Should check error type with errors.Is(err, ErrDependencyNotFound) and only ignore not-found errors, returning other errors properly.","status":"open","priority":3,"issue_type":"bug","created_at":"2025-10-12T16:39:26.78219-07:00","updated_at":"2025-10-13T23:26:35.810732-07:00"} {"id":"bd-29","title":"Use safer placeholder pattern in replaceIDReferences","description":"Currently uses __PLACEHOLDER_0__ which could theoretically collide with user text. Use a truly unique placeholder like null bytes: \\x00REMAP\\x00_0_\\x00 which are unlikely to appear in normal text. Located in collision.go:324. Very low probability issue but worth fixing for completeness.","status":"open","priority":3,"issue_type":"task","created_at":"2025-10-12T16:39:33.665449-07:00","updated_at":"2025-10-13T23:26:35.810821-07:00"} {"id":"bd-3","title":"Document git workflow in README","description":"Add Git Workflow section to README explaining binary vs text approaches","status":"closed","priority":1,"issue_type":"chore","created_at":"2025-10-12T00:43:03.461615-07:00","updated_at":"2025-10-13T23:26:35.810907-07:00","closed_at":"2025-10-12T00:43:30.283178-07:00"} diff --git a/internal/storage/sqlite/collision.go b/internal/storage/sqlite/collision.go index 8febfae8..0cfb7545 100644 --- a/internal/storage/sqlite/collision.go +++ b/internal/storage/sqlite/collision.go @@ -266,6 +266,13 @@ func RemapCollisions(ctx context.Context, s *SQLiteStorage, collisions []*Collis // updateReferences updates all text field references and dependency records // to point to new IDs based on the idMapping func updateReferences(ctx context.Context, s *SQLiteStorage, idMapping map[string]string) error { + // Pre-compile all regexes once for the entire operation + // This avoids recompiling the same patterns for each text field + cache, err := buildReplacementCache(idMapping) + if err != nil { + return fmt.Errorf("failed to build replacement cache: %w", err) + } + // Update text fields in all issues (both DB and incoming) // We need to update issues in the database dbIssues, err := s.SearchIssues(ctx, "", types.IssueFilter{}) @@ -276,26 +283,26 @@ func updateReferences(ctx context.Context, s *SQLiteStorage, idMapping map[strin for _, issue := range dbIssues { updates := make(map[string]interface{}) - // Update description - newDesc := replaceIDReferences(issue.Description, idMapping) + // Update description using cached regexes + newDesc := replaceIDReferencesWithCache(issue.Description, cache) if newDesc != issue.Description { updates["description"] = newDesc } - // Update design - newDesign := replaceIDReferences(issue.Design, idMapping) + // Update design using cached regexes + newDesign := replaceIDReferencesWithCache(issue.Design, cache) if newDesign != issue.Design { updates["design"] = newDesign } - // Update notes - newNotes := replaceIDReferences(issue.Notes, idMapping) + // Update notes using cached regexes + newNotes := replaceIDReferencesWithCache(issue.Notes, cache) if newNotes != issue.Notes { updates["notes"] = newNotes } - // Update acceptance criteria - newAC := replaceIDReferences(issue.AcceptanceCriteria, idMapping) + // Update acceptance criteria using cached regexes + newAC := replaceIDReferencesWithCache(issue.AcceptanceCriteria, cache) if newAC != issue.AcceptanceCriteria { updates["acceptance_criteria"] = newAC } @@ -316,32 +323,76 @@ func updateReferences(ctx context.Context, s *SQLiteStorage, idMapping map[strin return nil } +// idReplacementCache stores pre-compiled regexes for ID replacements +// This avoids recompiling the same regex patterns for each text field +type idReplacementCache struct { + oldID string + newID string + placeholder string + regex *regexp.Regexp +} + +// buildReplacementCache pre-compiles all regex patterns for an ID mapping +// This cache should be created once per ID mapping and reused for all text replacements +func buildReplacementCache(idMapping map[string]string) ([]*idReplacementCache, error) { + cache := make([]*idReplacementCache, 0, len(idMapping)) + i := 0 + for oldID, newID := range idMapping { + // Use word boundary regex for exact matching + pattern := fmt.Sprintf(`\b%s\b`, regexp.QuoteMeta(oldID)) + re, err := regexp.Compile(pattern) + if err != nil { + return nil, fmt.Errorf("failed to compile regex for %s: %w", oldID, err) + } + + cache = append(cache, &idReplacementCache{ + oldID: oldID, + newID: newID, + placeholder: fmt.Sprintf("__PLACEHOLDER_%d__", i), + regex: re, + }) + i++ + } + return cache, nil +} + +// replaceIDReferencesWithCache replaces all occurrences of old IDs with new IDs using a pre-compiled cache +// Uses a two-phase approach to avoid replacement conflicts: first replace with placeholders, then replace with new IDs +func replaceIDReferencesWithCache(text string, cache []*idReplacementCache) string { + if len(cache) == 0 || text == "" { + return text + } + + // Phase 1: Replace all old IDs with unique placeholders + result := text + for _, entry := range cache { + result = entry.regex.ReplaceAllString(result, entry.placeholder) + } + + // Phase 2: Replace all placeholders with new IDs + for _, entry := range cache { + result = strings.ReplaceAll(result, entry.placeholder, entry.newID) + } + + return result +} + // replaceIDReferences replaces all occurrences of old IDs with new IDs in text // Uses word-boundary regex to ensure exact matches (bd-10 but not bd-100) // Uses a two-phase approach to avoid replacement conflicts: first replace with // placeholders, then replace placeholders with new IDs +// +// Note: This function compiles regexes on every call. For better performance when +// processing multiple text fields with the same ID mapping, use buildReplacementCache() +// and replaceIDReferencesWithCache() instead. func replaceIDReferences(text string, idMapping map[string]string) string { - // Phase 1: Replace all old IDs with unique placeholders - placeholders := make(map[string]string) - result := text - i := 0 - for oldID, newID := range idMapping { - placeholder := fmt.Sprintf("__PLACEHOLDER_%d__", i) - placeholders[placeholder] = newID - - // Use word boundary regex for exact matching - pattern := fmt.Sprintf(`\b%s\b`, regexp.QuoteMeta(oldID)) - re := regexp.MustCompile(pattern) - result = re.ReplaceAllString(result, placeholder) - i++ + // Build cache (compiles regexes) + cache, err := buildReplacementCache(idMapping) + if err != nil { + // Fallback to no replacement if regex compilation fails + return text } - - // Phase 2: Replace all placeholders with new IDs - for placeholder, newID := range placeholders { - result = strings.ReplaceAll(result, placeholder, newID) - } - - return result + return replaceIDReferencesWithCache(text, cache) } // updateDependencyReferences updates dependency records to use new IDs diff --git a/internal/storage/sqlite/collision_test.go b/internal/storage/sqlite/collision_test.go index ab8d5e0a..fd4896d3 100644 --- a/internal/storage/sqlite/collision_test.go +++ b/internal/storage/sqlite/collision_test.go @@ -1027,3 +1027,78 @@ func TestUpdateDependencyReferences(t *testing.T) { t.Errorf("expected 0 dependencies for bd-2, got %d", len(deps2)) } } + +// BenchmarkReplaceIDReferences benchmarks the old approach (compiling regex every time) +func BenchmarkReplaceIDReferences(b *testing.B) { + // Simulate a realistic scenario: 10 ID mappings + idMapping := make(map[string]string) + for i := 1; i <= 10; i++ { + idMapping[fmt.Sprintf("bd-%d", i)] = fmt.Sprintf("bd-%d", i+100) + } + + text := "This mentions bd-1, bd-2, bd-3, bd-4, and bd-5 multiple times. " + + "Also bd-6, bd-7, bd-8, bd-9, and bd-10 are referenced here." + + b.ResetTimer() + for i := 0; i < b.N; i++ { + _ = replaceIDReferences(text, idMapping) + } +} + +// BenchmarkReplaceIDReferencesWithCache benchmarks the new cached approach +func BenchmarkReplaceIDReferencesWithCache(b *testing.B) { + // Simulate a realistic scenario: 10 ID mappings + idMapping := make(map[string]string) + for i := 1; i <= 10; i++ { + idMapping[fmt.Sprintf("bd-%d", i)] = fmt.Sprintf("bd-%d", i+100) + } + + text := "This mentions bd-1, bd-2, bd-3, bd-4, and bd-5 multiple times. " + + "Also bd-6, bd-7, bd-8, bd-9, and bd-10 are referenced here." + + // Pre-compile the cache (this is done once in real usage) + cache, err := buildReplacementCache(idMapping) + if err != nil { + b.Fatalf("failed to build cache: %v", err) + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + _ = replaceIDReferencesWithCache(text, cache) + } +} + +// BenchmarkReplaceIDReferencesMultipleTexts simulates the real-world scenario: +// processing multiple text fields (4 per issue) across 100 issues +func BenchmarkReplaceIDReferencesMultipleTexts(b *testing.B) { + // 10 ID mappings (typical collision scenario) + idMapping := make(map[string]string) + for i := 1; i <= 10; i++ { + idMapping[fmt.Sprintf("bd-%d", i)] = fmt.Sprintf("bd-%d", i+100) + } + + // Simulate 100 issues with 4 text fields each + texts := make([]string, 400) + for i := 0; i < 400; i++ { + texts[i] = fmt.Sprintf("Issue %d mentions bd-1, bd-2, and bd-5", i) + } + + b.Run("without cache", func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + for _, text := range texts { + _ = replaceIDReferences(text, idMapping) + } + } + }) + + b.Run("with cache", func(b *testing.B) { + cache, _ := buildReplacementCache(idMapping) + b.ResetTimer() + for i := 0; i < b.N; i++ { + for _, text := range texts { + _ = replaceIDReferencesWithCache(text, cache) + } + } + }) +}