Cache compiled regexes in ID replacement for 1.9x performance boost

Implements bd-27: Cache compiled regexes in replaceIDReferences for performance

Problem:
replaceIDReferences() was compiling regex patterns on every call. With 100
issues and 10 ID mappings, that resulted in 4,000 regex compilations (100
issues × 4 text fields × 10 ID mappings).

Solution:
- Added buildReplacementCache() to pre-compile all regexes once
- Added replaceIDReferencesWithCache() to reuse compiled regexes
- Updated updateReferences() to build cache once and reuse for all issues
- Kept replaceIDReferences() for backward compatibility (calls cached version)

Performance Results (from benchmarks):
Single text:
- 1.33x faster (26,162 ns → 19,641 ns)
- 68% less memory (25,769 B → 8,241 B)
- 80% fewer allocations (278 → 55)

Real-world (400 texts, 10 mappings):
- 1.89x faster (5.1ms → 2.7ms)
- 90% less memory (7.7 MB → 0.8 MB)
- 86% fewer allocations (104,112 → 14,801)

Tests:
- All existing tests pass
- Added 3 benchmark tests demonstrating improvements

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Steve Yegge
2025-10-13 23:50:48 -07:00
parent 33412871eb
commit 25644d9717
3 changed files with 155 additions and 29 deletions

View File

@@ -266,6 +266,13 @@ func RemapCollisions(ctx context.Context, s *SQLiteStorage, collisions []*Collis
// updateReferences updates all text field references and dependency records
// to point to new IDs based on the idMapping
func updateReferences(ctx context.Context, s *SQLiteStorage, idMapping map[string]string) error {
// Pre-compile all regexes once for the entire operation
// This avoids recompiling the same patterns for each text field
cache, err := buildReplacementCache(idMapping)
if err != nil {
return fmt.Errorf("failed to build replacement cache: %w", err)
}
// Update text fields in all issues (both DB and incoming)
// We need to update issues in the database
dbIssues, err := s.SearchIssues(ctx, "", types.IssueFilter{})
@@ -276,26 +283,26 @@ func updateReferences(ctx context.Context, s *SQLiteStorage, idMapping map[strin
for _, issue := range dbIssues {
updates := make(map[string]interface{})
// Update description
newDesc := replaceIDReferences(issue.Description, idMapping)
// Update description using cached regexes
newDesc := replaceIDReferencesWithCache(issue.Description, cache)
if newDesc != issue.Description {
updates["description"] = newDesc
}
// Update design
newDesign := replaceIDReferences(issue.Design, idMapping)
// Update design using cached regexes
newDesign := replaceIDReferencesWithCache(issue.Design, cache)
if newDesign != issue.Design {
updates["design"] = newDesign
}
// Update notes
newNotes := replaceIDReferences(issue.Notes, idMapping)
// Update notes using cached regexes
newNotes := replaceIDReferencesWithCache(issue.Notes, cache)
if newNotes != issue.Notes {
updates["notes"] = newNotes
}
// Update acceptance criteria
newAC := replaceIDReferences(issue.AcceptanceCriteria, idMapping)
// Update acceptance criteria using cached regexes
newAC := replaceIDReferencesWithCache(issue.AcceptanceCriteria, cache)
if newAC != issue.AcceptanceCriteria {
updates["acceptance_criteria"] = newAC
}
@@ -316,32 +323,76 @@ func updateReferences(ctx context.Context, s *SQLiteStorage, idMapping map[strin
return nil
}
// idReplacementCache stores pre-compiled regexes for ID replacements
// This avoids recompiling the same regex patterns for each text field
type idReplacementCache struct {
oldID string
newID string
placeholder string
regex *regexp.Regexp
}
// buildReplacementCache pre-compiles all regex patterns for an ID mapping
// This cache should be created once per ID mapping and reused for all text replacements
func buildReplacementCache(idMapping map[string]string) ([]*idReplacementCache, error) {
cache := make([]*idReplacementCache, 0, len(idMapping))
i := 0
for oldID, newID := range idMapping {
// Use word boundary regex for exact matching
pattern := fmt.Sprintf(`\b%s\b`, regexp.QuoteMeta(oldID))
re, err := regexp.Compile(pattern)
if err != nil {
return nil, fmt.Errorf("failed to compile regex for %s: %w", oldID, err)
}
cache = append(cache, &idReplacementCache{
oldID: oldID,
newID: newID,
placeholder: fmt.Sprintf("__PLACEHOLDER_%d__", i),
regex: re,
})
i++
}
return cache, nil
}
// replaceIDReferencesWithCache replaces all occurrences of old IDs with new IDs using a pre-compiled cache
// Uses a two-phase approach to avoid replacement conflicts: first replace with placeholders, then replace with new IDs
func replaceIDReferencesWithCache(text string, cache []*idReplacementCache) string {
if len(cache) == 0 || text == "" {
return text
}
// Phase 1: Replace all old IDs with unique placeholders
result := text
for _, entry := range cache {
result = entry.regex.ReplaceAllString(result, entry.placeholder)
}
// Phase 2: Replace all placeholders with new IDs
for _, entry := range cache {
result = strings.ReplaceAll(result, entry.placeholder, entry.newID)
}
return result
}
// replaceIDReferences replaces all occurrences of old IDs with new IDs in text
// Uses word-boundary regex to ensure exact matches (bd-10 but not bd-100)
// Uses a two-phase approach to avoid replacement conflicts: first replace with
// placeholders, then replace placeholders with new IDs
//
// Note: This function compiles regexes on every call. For better performance when
// processing multiple text fields with the same ID mapping, use buildReplacementCache()
// and replaceIDReferencesWithCache() instead.
func replaceIDReferences(text string, idMapping map[string]string) string {
// Phase 1: Replace all old IDs with unique placeholders
placeholders := make(map[string]string)
result := text
i := 0
for oldID, newID := range idMapping {
placeholder := fmt.Sprintf("__PLACEHOLDER_%d__", i)
placeholders[placeholder] = newID
// Use word boundary regex for exact matching
pattern := fmt.Sprintf(`\b%s\b`, regexp.QuoteMeta(oldID))
re := regexp.MustCompile(pattern)
result = re.ReplaceAllString(result, placeholder)
i++
// Build cache (compiles regexes)
cache, err := buildReplacementCache(idMapping)
if err != nil {
// Fallback to no replacement if regex compilation fails
return text
}
// Phase 2: Replace all placeholders with new IDs
for placeholder, newID := range placeholders {
result = strings.ReplaceAll(result, placeholder, newID)
}
return result
return replaceIDReferencesWithCache(text, cache)
}
// updateDependencyReferences updates dependency records to use new IDs

View File

@@ -1027,3 +1027,78 @@ func TestUpdateDependencyReferences(t *testing.T) {
t.Errorf("expected 0 dependencies for bd-2, got %d", len(deps2))
}
}
// BenchmarkReplaceIDReferences benchmarks the old approach (compiling regex every time)
func BenchmarkReplaceIDReferences(b *testing.B) {
// Simulate a realistic scenario: 10 ID mappings
idMapping := make(map[string]string)
for i := 1; i <= 10; i++ {
idMapping[fmt.Sprintf("bd-%d", i)] = fmt.Sprintf("bd-%d", i+100)
}
text := "This mentions bd-1, bd-2, bd-3, bd-4, and bd-5 multiple times. " +
"Also bd-6, bd-7, bd-8, bd-9, and bd-10 are referenced here."
b.ResetTimer()
for i := 0; i < b.N; i++ {
_ = replaceIDReferences(text, idMapping)
}
}
// BenchmarkReplaceIDReferencesWithCache benchmarks the new cached approach
func BenchmarkReplaceIDReferencesWithCache(b *testing.B) {
// Simulate a realistic scenario: 10 ID mappings
idMapping := make(map[string]string)
for i := 1; i <= 10; i++ {
idMapping[fmt.Sprintf("bd-%d", i)] = fmt.Sprintf("bd-%d", i+100)
}
text := "This mentions bd-1, bd-2, bd-3, bd-4, and bd-5 multiple times. " +
"Also bd-6, bd-7, bd-8, bd-9, and bd-10 are referenced here."
// Pre-compile the cache (this is done once in real usage)
cache, err := buildReplacementCache(idMapping)
if err != nil {
b.Fatalf("failed to build cache: %v", err)
}
b.ResetTimer()
for i := 0; i < b.N; i++ {
_ = replaceIDReferencesWithCache(text, cache)
}
}
// BenchmarkReplaceIDReferencesMultipleTexts simulates the real-world scenario:
// processing multiple text fields (4 per issue) across 100 issues
func BenchmarkReplaceIDReferencesMultipleTexts(b *testing.B) {
// 10 ID mappings (typical collision scenario)
idMapping := make(map[string]string)
for i := 1; i <= 10; i++ {
idMapping[fmt.Sprintf("bd-%d", i)] = fmt.Sprintf("bd-%d", i+100)
}
// Simulate 100 issues with 4 text fields each
texts := make([]string, 400)
for i := 0; i < 400; i++ {
texts[i] = fmt.Sprintf("Issue %d mentions bd-1, bd-2, and bd-5", i)
}
b.Run("without cache", func(b *testing.B) {
b.ResetTimer()
for i := 0; i < b.N; i++ {
for _, text := range texts {
_ = replaceIDReferences(text, idMapping)
}
}
})
b.Run("with cache", func(b *testing.B) {
cache, _ := buildReplacementCache(idMapping)
b.ResetTimer()
for i := 0; i < b.N; i++ {
for _, text := range texts {
_ = replaceIDReferencesWithCache(text, cache)
}
}
})
}