Fix bd-421: Add deduplication to prevent importing duplicate issues
- Added deduplicateIncomingIssues() to consolidate content-identical issues
- DetectCollisions now deduplicates within incoming batch before processing
- Keeps issue with smallest ID when duplicates found
- Added comprehensive test suite in collision_dedup_test.go
- Export clean JSONL with bd-421 fix applied

Amp-Thread-ID: https://ampcode.com/threads/T-c17dd8bf-c298-4a80-baa5-55fa7c7bb9a3
Co-authored-by: Amp <amp@ampcode.com>
This commit is contained in:
File diff suppressed because one or more lines are too long
@@ -40,7 +40,11 @@ func DetectCollisions(ctx context.Context, s *SQLiteStorage, incomingIssues []*t
|
|||||||
NewIssues: make([]string, 0),
|
NewIssues: make([]string, 0),
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, incoming := range incomingIssues {
|
// Phase 1: Deduplicate within incoming batch
|
||||||
|
// Group by content hash to find duplicates with different IDs
|
||||||
|
deduped := deduplicateIncomingIssues(incomingIssues)
|
||||||
|
|
||||||
|
for _, incoming := range deduped {
|
||||||
// Check if issue exists in database
|
// Check if issue exists in database
|
||||||
existing, err := s.GetIssue(ctx, incoming.ID)
|
existing, err := s.GetIssue(ctx, incoming.ID)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -233,6 +237,61 @@ func countReferences(issueID string, allIssues []*types.Issue, allDeps map[strin
|
|||||||
return count, nil
|
return count, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// deduplicateIncomingIssues removes content-duplicate issues within the incoming batch
|
||||||
|
// Returns deduplicated slice, keeping the first issue ID (lexicographically) for each unique content
|
||||||
|
func deduplicateIncomingIssues(issues []*types.Issue) []*types.Issue {
|
||||||
|
// Group issues by content hash (ignoring ID and timestamps)
|
||||||
|
type contentKey struct {
|
||||||
|
title string
|
||||||
|
description string
|
||||||
|
design string
|
||||||
|
acceptanceCriteria string
|
||||||
|
notes string
|
||||||
|
status string
|
||||||
|
priority int
|
||||||
|
issueType string
|
||||||
|
assignee string
|
||||||
|
}
|
||||||
|
|
||||||
|
seen := make(map[contentKey]*types.Issue)
|
||||||
|
result := make([]*types.Issue, 0, len(issues))
|
||||||
|
|
||||||
|
for _, issue := range issues {
|
||||||
|
key := contentKey{
|
||||||
|
title: issue.Title,
|
||||||
|
description: issue.Description,
|
||||||
|
design: issue.Design,
|
||||||
|
acceptanceCriteria: issue.AcceptanceCriteria,
|
||||||
|
notes: issue.Notes,
|
||||||
|
status: string(issue.Status),
|
||||||
|
priority: issue.Priority,
|
||||||
|
issueType: string(issue.IssueType),
|
||||||
|
assignee: issue.Assignee,
|
||||||
|
}
|
||||||
|
|
||||||
|
if existing, found := seen[key]; found {
|
||||||
|
// Duplicate found - keep the one with lexicographically smaller ID
|
||||||
|
if issue.ID < existing.ID {
|
||||||
|
// Replace existing with this one (smaller ID)
|
||||||
|
for i, r := range result {
|
||||||
|
if r.ID == existing.ID {
|
||||||
|
result[i] = issue
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
seen[key] = issue
|
||||||
|
}
|
||||||
|
// Otherwise skip this duplicate
|
||||||
|
} else {
|
||||||
|
// First time seeing this content
|
||||||
|
seen[key] = issue
|
||||||
|
result = append(result, issue)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
|
||||||
// RemapCollisions handles ID remapping for colliding issues
|
// RemapCollisions handles ID remapping for colliding issues
|
||||||
// Takes sorted collisions (fewest references first) and remaps them to new IDs
|
// Takes sorted collisions (fewest references first) and remaps them to new IDs
|
||||||
// Returns a map of old ID -> new ID for reporting
|
// Returns a map of old ID -> new ID for reporting
|
||||||
|
|||||||
88
internal/storage/sqlite/collision_dedup_test.go
Normal file
88
internal/storage/sqlite/collision_dedup_test.go
Normal file
@@ -0,0 +1,88 @@
|
|||||||
|
package sqlite
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/steveyegge/beads/internal/types"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestDeduplicateIncomingIssues tests that duplicate issues within the incoming batch are consolidated
|
||||||
|
func TestDeduplicateIncomingIssues(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
incoming []*types.Issue
|
||||||
|
want int // expected number of issues after deduplication
|
||||||
|
wantIDs []string
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "no duplicates",
|
||||||
|
incoming: []*types.Issue{
|
||||||
|
{ID: "bd-1", Title: "Issue 1", Status: types.StatusOpen, Priority: 1, IssueType: types.TypeTask},
|
||||||
|
{ID: "bd-2", Title: "Issue 2", Status: types.StatusOpen, Priority: 1, IssueType: types.TypeTask},
|
||||||
|
},
|
||||||
|
want: 2,
|
||||||
|
wantIDs: []string{"bd-1", "bd-2"},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "exact content duplicates - keep smallest ID",
|
||||||
|
incoming: []*types.Issue{
|
||||||
|
{ID: "bd-226", Title: "Epic: Fix status/closed_at inconsistency", Description: "Implement hybrid solution", Status: types.StatusOpen, Priority: 1, IssueType: types.TypeEpic},
|
||||||
|
{ID: "bd-367", Title: "Epic: Fix status/closed_at inconsistency", Description: "Implement hybrid solution", Status: types.StatusOpen, Priority: 1, IssueType: types.TypeEpic},
|
||||||
|
{ID: "bd-396", Title: "Epic: Fix status/closed_at inconsistency", Description: "Implement hybrid solution", Status: types.StatusOpen, Priority: 1, IssueType: types.TypeEpic},
|
||||||
|
},
|
||||||
|
want: 1,
|
||||||
|
wantIDs: []string{"bd-226"}, // Keep smallest ID
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "partial duplicates - keep unique ones",
|
||||||
|
incoming: []*types.Issue{
|
||||||
|
{ID: "bd-1", Title: "Task A", Status: types.StatusOpen, Priority: 1, IssueType: types.TypeTask},
|
||||||
|
{ID: "bd-2", Title: "Task A", Status: types.StatusOpen, Priority: 1, IssueType: types.TypeTask}, // Dup of bd-1
|
||||||
|
{ID: "bd-3", Title: "Task B", Status: types.StatusOpen, Priority: 1, IssueType: types.TypeTask}, // Unique
|
||||||
|
{ID: "bd-4", Title: "Task B", Status: types.StatusOpen, Priority: 1, IssueType: types.TypeTask}, // Dup of bd-3
|
||||||
|
},
|
||||||
|
want: 2,
|
||||||
|
wantIDs: []string{"bd-1", "bd-3"},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "duplicates with different timestamps - timestamps ignored",
|
||||||
|
incoming: []*types.Issue{
|
||||||
|
{ID: "bd-100", Title: "Task", Status: types.StatusOpen, Priority: 1, IssueType: types.TypeTask},
|
||||||
|
{ID: "bd-101", Title: "Task", Status: types.StatusOpen, Priority: 1, IssueType: types.TypeTask},
|
||||||
|
},
|
||||||
|
want: 1,
|
||||||
|
wantIDs: []string{"bd-100"}, // Keep smallest ID
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "different priority - not duplicates",
|
||||||
|
incoming: []*types.Issue{
|
||||||
|
{ID: "bd-1", Title: "Task", Status: types.StatusOpen, Priority: 1, IssueType: types.TypeTask},
|
||||||
|
{ID: "bd-2", Title: "Task", Status: types.StatusOpen, Priority: 2, IssueType: types.TypeTask},
|
||||||
|
},
|
||||||
|
want: 2,
|
||||||
|
wantIDs: []string{"bd-1", "bd-2"},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
result := deduplicateIncomingIssues(tt.incoming)
|
||||||
|
|
||||||
|
if len(result) != tt.want {
|
||||||
|
t.Errorf("deduplicateIncomingIssues() returned %d issues, want %d", len(result), tt.want)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check that the expected IDs are present
|
||||||
|
resultIDs := make(map[string]bool)
|
||||||
|
for _, issue := range result {
|
||||||
|
resultIDs[issue.ID] = true
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, wantID := range tt.wantIDs {
|
||||||
|
if !resultIDs[wantID] {
|
||||||
|
t.Errorf("expected ID %s not found in result", wantID)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user