Implement content-first idempotent import (bd-98)

- Refactored upsertIssues to match by content hash first, then by ID
- Added buildHashMap, buildIDMap, and handleRename helper functions
- Import now detects and handles renames (same content, different ID)
- Importing same data multiple times is idempotent (reports Unchanged)
- Exported BuildReplacementCache and ReplaceIDReferencesWithCache for reuse
- All 30+ existing import tests pass
- Improved convergence for N-way collision scenarios

Changes:
- internal/importer/importer.go: Content-first matching in upsertIssues
- internal/storage/sqlite/collision.go: Exported helper functions
- internal/storage/sqlite/collision_test.go: Updated function names

Amp-Thread-ID: https://ampcode.com/threads/T-3df96ad8-7c0e-4190-87b5-6d5327718f0a
Co-authored-by: Amp <amp@ampcode.com>
This commit is contained in:
Steve Yegge
2025-10-28 20:40:36 -07:00
parent 44df03ae55
commit ff02615f61
3 changed files with 179 additions and 68 deletions

View File

@@ -263,68 +263,179 @@ func handleCollisions(ctx context.Context, sqliteStore *sqlite.SQLiteStorage, is
return issues, nil
}
// upsertIssues creates new issues or updates existing ones
func upsertIssues(ctx context.Context, sqliteStore *sqlite.SQLiteStorage, issues []*types.Issue, opts Options, result *Result) error {
var newIssues []*types.Issue
seenNew := make(map[string]int)
// buildHashMap creates a map of content hash → issue for O(1) lookup
func buildHashMap(issues []*types.Issue) map[string]*types.Issue {
result := make(map[string]*types.Issue)
for _, issue := range issues {
// Check if issue exists in DB
existing, err := sqliteStore.GetIssue(ctx, issue.ID)
if err != nil {
return fmt.Errorf("error checking issue %s: %w", issue.ID, err)
if issue.ContentHash != "" {
result[issue.ContentHash] = issue
}
}
return result
}
// buildIDMap creates a map of ID → issue for O(1) lookup
func buildIDMap(issues []*types.Issue) map[string]*types.Issue {
result := make(map[string]*types.Issue)
for _, issue := range issues {
result[issue.ID] = issue
}
return result
}
// handleRename handles content match with different IDs (rename detected)
func handleRename(ctx context.Context, s *sqlite.SQLiteStorage, existing *types.Issue, incoming *types.Issue) error {
// Delete old ID
if err := s.DeleteIssue(ctx, existing.ID); err != nil {
return fmt.Errorf("failed to delete old ID %s: %w", existing.ID, err)
}
// Create with new ID
if err := s.CreateIssue(ctx, incoming, "import-rename"); err != nil {
return fmt.Errorf("failed to create renamed issue %s: %w", incoming.ID, err)
}
// Update references from old ID to new ID
idMapping := map[string]string{existing.ID: incoming.ID}
cache, err := sqlite.BuildReplacementCache(idMapping)
if err != nil {
return fmt.Errorf("failed to build replacement cache: %w", err)
}
// Get all issues to update references
dbIssues, err := s.SearchIssues(ctx, "", types.IssueFilter{})
if err != nil {
return fmt.Errorf("failed to get issues for reference update: %w", err)
}
// Update text field references in all issues
for _, issue := range dbIssues {
updates := make(map[string]interface{})
newDesc := sqlite.ReplaceIDReferencesWithCache(issue.Description, cache)
if newDesc != issue.Description {
updates["description"] = newDesc
}
if existing != nil {
// Issue exists - update it unless SkipUpdate is set
if opts.SkipUpdate {
result.Skipped++
continue
newDesign := sqlite.ReplaceIDReferencesWithCache(issue.Design, cache)
if newDesign != issue.Design {
updates["design"] = newDesign
}
newNotes := sqlite.ReplaceIDReferencesWithCache(issue.Notes, cache)
if newNotes != issue.Notes {
updates["notes"] = newNotes
}
newAC := sqlite.ReplaceIDReferencesWithCache(issue.AcceptanceCriteria, cache)
if newAC != issue.AcceptanceCriteria {
updates["acceptance_criteria"] = newAC
}
if len(updates) > 0 {
if err := s.UpdateIssue(ctx, issue.ID, updates, "import-rename"); err != nil {
return fmt.Errorf("failed to update references in issue %s: %w", issue.ID, err)
}
}
}
// Build updates map
updates := make(map[string]interface{})
updates["title"] = issue.Title
updates["description"] = issue.Description
updates["status"] = issue.Status
updates["priority"] = issue.Priority
updates["issue_type"] = issue.IssueType
updates["design"] = issue.Design
updates["acceptance_criteria"] = issue.AcceptanceCriteria
updates["notes"] = issue.Notes
return nil
}
if issue.Assignee != "" {
updates["assignee"] = issue.Assignee
} else {
updates["assignee"] = nil
}
// upsertIssues creates new issues or updates existing ones using content-first matching
func upsertIssues(ctx context.Context, sqliteStore *sqlite.SQLiteStorage, issues []*types.Issue, opts Options, result *Result) error {
// Get all DB issues once
dbIssues, err := sqliteStore.SearchIssues(ctx, "", types.IssueFilter{})
if err != nil {
return fmt.Errorf("failed to get DB issues: %w", err)
}
dbByHash := buildHashMap(dbIssues)
dbByID := buildIDMap(dbIssues)
if issue.ExternalRef != nil && *issue.ExternalRef != "" {
updates["external_ref"] = *issue.ExternalRef
} else {
updates["external_ref"] = nil
}
// Track what we need to create
var newIssues []*types.Issue
seenHashes := make(map[string]bool)
// Only update if data actually changed
if IssueDataChanged(existing, updates) {
if err := sqliteStore.UpdateIssue(ctx, issue.ID, updates, "import"); err != nil {
return fmt.Errorf("error updating issue %s: %w", issue.ID, err)
}
result.Updated++
} else {
for _, incoming := range issues {
hash := incoming.ContentHash
if hash == "" {
// Shouldn't happen (computed earlier), but be defensive
hash = incoming.ComputeContentHash()
incoming.ContentHash = hash
}
// Skip duplicates within incoming batch
if seenHashes[hash] {
result.Skipped++
continue
}
seenHashes[hash] = true
// Phase 1: Match by content hash first
if existing, found := dbByHash[hash]; found {
// Same content exists
if existing.ID == incoming.ID {
// Exact match (same content, same ID) - idempotent case
result.Unchanged++
} else {
// Same content, different ID - rename detected
if !opts.SkipUpdate {
if err := handleRename(ctx, sqliteStore, existing, incoming); err != nil {
return fmt.Errorf("failed to handle rename %s -> %s: %w", existing.ID, incoming.ID, err)
}
result.Updated++
} else {
result.Skipped++
}
}
continue
}
// Phase 2: New content - check for ID collision
if existingWithID, found := dbByID[incoming.ID]; found {
// ID exists but different content - this is a collision
// The collision should have been handled earlier by handleCollisions
// If we reach here, it means collision wasn't resolved - treat as update
if !opts.SkipUpdate {
// Build updates map
updates := make(map[string]interface{})
updates["title"] = incoming.Title
updates["description"] = incoming.Description
updates["status"] = incoming.Status
updates["priority"] = incoming.Priority
updates["issue_type"] = incoming.IssueType
updates["design"] = incoming.Design
updates["acceptance_criteria"] = incoming.AcceptanceCriteria
updates["notes"] = incoming.Notes
if incoming.Assignee != "" {
updates["assignee"] = incoming.Assignee
} else {
updates["assignee"] = nil
}
if incoming.ExternalRef != nil && *incoming.ExternalRef != "" {
updates["external_ref"] = *incoming.ExternalRef
} else {
updates["external_ref"] = nil
}
// Only update if data actually changed
if IssueDataChanged(existingWithID, updates) {
if err := sqliteStore.UpdateIssue(ctx, incoming.ID, updates, "import"); err != nil {
return fmt.Errorf("error updating issue %s: %w", incoming.ID, err)
}
result.Updated++
} else {
result.Unchanged++
}
} else {
result.Skipped++
}
} else {
// New issue - check for duplicates in import batch
if idx, seen := seenNew[issue.ID]; seen {
if opts.Strict {
return fmt.Errorf("duplicate issue ID %s in import (line %d)", issue.ID, idx)
}
result.Skipped++
continue
}
seenNew[issue.ID] = len(newIssues)
newIssues = append(newIssues, issue)
// Truly new issue
newIssues = append(newIssues, incoming)
}
}

View File

@@ -478,7 +478,7 @@ func RemapCollisions(ctx context.Context, s *SQLiteStorage, collisions []*Collis
func updateReferences(ctx context.Context, s *SQLiteStorage, idMapping map[string]string) error {
// Pre-compile all regexes once for the entire operation
// This avoids recompiling the same patterns for each text field
cache, err := buildReplacementCache(idMapping)
cache, err := BuildReplacementCache(idMapping)
if err != nil {
return fmt.Errorf("failed to build replacement cache: %w", err)
}
@@ -494,25 +494,25 @@ func updateReferences(ctx context.Context, s *SQLiteStorage, idMapping map[strin
updates := make(map[string]interface{})
// Update description using cached regexes
newDesc := replaceIDReferencesWithCache(issue.Description, cache)
newDesc := ReplaceIDReferencesWithCache(issue.Description, cache)
if newDesc != issue.Description {
updates["description"] = newDesc
}
// Update design using cached regexes
newDesign := replaceIDReferencesWithCache(issue.Design, cache)
newDesign := ReplaceIDReferencesWithCache(issue.Design, cache)
if newDesign != issue.Design {
updates["design"] = newDesign
}
// Update notes using cached regexes
newNotes := replaceIDReferencesWithCache(issue.Notes, cache)
newNotes := ReplaceIDReferencesWithCache(issue.Notes, cache)
if newNotes != issue.Notes {
updates["notes"] = newNotes
}
// Update acceptance criteria using cached regexes
newAC := replaceIDReferencesWithCache(issue.AcceptanceCriteria, cache)
newAC := ReplaceIDReferencesWithCache(issue.AcceptanceCriteria, cache)
if newAC != issue.AcceptanceCriteria {
updates["acceptance_criteria"] = newAC
}
@@ -542,9 +542,9 @@ type idReplacementCache struct {
regex *regexp.Regexp
}
// buildReplacementCache pre-compiles all regex patterns for an ID mapping
// BuildReplacementCache pre-compiles all regex patterns for an ID mapping
// This cache should be created once per ID mapping and reused for all text replacements
func buildReplacementCache(idMapping map[string]string) ([]*idReplacementCache, error) {
func BuildReplacementCache(idMapping map[string]string) ([]*idReplacementCache, error) {
cache := make([]*idReplacementCache, 0, len(idMapping))
i := 0
for oldID, newID := range idMapping {
@@ -566,9 +566,9 @@ func buildReplacementCache(idMapping map[string]string) ([]*idReplacementCache,
return cache, nil
}
// replaceIDReferencesWithCache replaces all occurrences of old IDs with new IDs using a pre-compiled cache
// ReplaceIDReferencesWithCache replaces all occurrences of old IDs with new IDs using a pre-compiled cache
// Uses a two-phase approach to avoid replacement conflicts: first replace with placeholders, then replace with new IDs
func replaceIDReferencesWithCache(text string, cache []*idReplacementCache) string {
func ReplaceIDReferencesWithCache(text string, cache []*idReplacementCache) string {
if len(cache) == 0 || text == "" {
return text
}
@@ -593,16 +593,16 @@ func replaceIDReferencesWithCache(text string, cache []*idReplacementCache) stri
// placeholders, then replace placeholders with new IDs
//
// Note: This function compiles regexes on every call. For better performance when
// processing multiple text fields with the same ID mapping, use buildReplacementCache()
// and replaceIDReferencesWithCache() instead.
// processing multiple text fields with the same ID mapping, use BuildReplacementCache()
// and ReplaceIDReferencesWithCache() instead.
func replaceIDReferences(text string, idMapping map[string]string) string {
// Build cache (compiles regexes)
cache, err := buildReplacementCache(idMapping)
cache, err := BuildReplacementCache(idMapping)
if err != nil {
// Fallback to no replacement if regex compilation fails
return text
}
return replaceIDReferencesWithCache(text, cache)
return ReplaceIDReferencesWithCache(text, cache)
}
// updateDependencyReferences updates dependency records to use new IDs

View File

@@ -802,14 +802,14 @@ func BenchmarkReplaceIDReferencesWithCache(b *testing.B) {
"Also bd-6, bd-7, bd-8, bd-9, and bd-10 are referenced here."
// Pre-compile the cache (this is done once in real usage)
cache, err := buildReplacementCache(idMapping)
cache, err := BuildReplacementCache(idMapping)
if err != nil {
b.Fatalf("failed to build cache: %v", err)
}
b.ResetTimer()
for i := 0; i < b.N; i++ {
_ = replaceIDReferencesWithCache(text, cache)
_ = ReplaceIDReferencesWithCache(text, cache)
}
}
@@ -838,11 +838,11 @@ func BenchmarkReplaceIDReferencesMultipleTexts(b *testing.B) {
})
b.Run("with cache", func(b *testing.B) {
cache, _ := buildReplacementCache(idMapping)
cache, _ := BuildReplacementCache(idMapping)
b.ResetTimer()
for i := 0; i < b.N; i++ {
for _, text := range texts {
_ = replaceIDReferencesWithCache(text, cache)
_ = ReplaceIDReferencesWithCache(text, cache)
}
}
})