diff --git a/.beads/beads.jsonl b/.beads/beads.jsonl index bc73ea12..a8ba5ba3 100644 --- a/.beads/beads.jsonl +++ b/.beads/beads.jsonl @@ -86,7 +86,7 @@ {"id":"bd-200","title":"Write comprehensive tests for external_ref import scenarios","description":"Create test suite covering all external_ref import behaviors.\n\n## Test Cases\n1. Import with external_ref → creates new issue\n2. Re-import same external_ref with changes → updates existing\n3. Re-import same external_ref unchanged → idempotent (no update)\n4. Import external issue with ID collision → auto-remaps external issue\n5. Import without external_ref → uses current ID-based logic\n6. Mixed batch: some with external_ref, some without\n7. External_ref match + ID mismatch → updates by external_ref\n8. Local issue (no external_ref) never matched by external import\n9. Performance test with external_ref index\n\nAdd tests to:\n- cmd/bd/import_shared_test.go\n- internal/storage/sqlite/collision_test.go","status":"closed","priority":1,"issue_type":"task","created_at":"2025-10-24T13:08:46.376432-07:00","updated_at":"2025-10-24T13:35:26.748892-07:00","closed_at":"2025-10-24T13:35:26.748892-07:00"} {"id":"bd-201","title":"Update documentation for external_ref import behavior","description":"Document the new external_ref-based import matching behavior.\n\n## Files to Update\n- README.md: Add external_ref import section\n- QUICKSTART.md: Add example of Jira/GitHub integration workflow\n- AGENTS.md: Document external_ref import behavior for AI agents\n- FAQ.md: Add Q\u0026A about external system integration\n- Add examples/ directory entry showing Jira/GitHub/Linear integration\n\n## Key Points to Document\n- How external_ref matching works\n- That local issues are protected from external imports\n- Example workflow: import → add local tasks → re-import updates\n- ID conflict resolution behavior","status":"closed","priority":2,"issue_type":"task","created_at":"2025-10-24T13:08:46.390992-07:00","updated_at":"2025-10-24T13:35:26.749081-07:00","closed_at":"2025-10-24T13:35:26.749081-07:00"} {"id":"bd-202","title":"Code review: external_ref import feature","description":"Comprehensive code review of external_ref import implementation.\n\n## Review Checklist\n- [ ] Storage layer changes are correct and efficient\n- [ ] Index created properly on external_ref column\n- [ ] Collision detection logic handles all edge cases\n- [ ] Import flow correctly distinguishes external vs local issues\n- [ ] No breaking changes to existing workflows\n- [ ] Tests cover all scenarios (see bd-200)\n- [ ] Documentation is clear and complete (see bd-201)\n- [ ] Performance impact is acceptable\n- [ ] Error messages are helpful\n- [ ] Logging is appropriate for debugging\n\n## Oracle Review\nUse oracle tool to analyze implementation for correctness, edge cases, and potential issues.","status":"closed","priority":1,"issue_type":"task","created_at":"2025-10-24T13:08:46.395433-07:00","updated_at":"2025-10-24T13:35:26.749271-07:00","closed_at":"2025-10-24T13:35:26.749271-07:00"} -{"id":"bd-203","title":"Add automated duplicate detection tool for post-import cleanup","description":"After collision resolution with --resolve-collisions, we can end up with content duplicates (different IDs, identical content) when parallel work creates the same issues.\n\n## Problem\nCurrent situation:\n- `deduplicateIncomingIssues()` only deduplicates within the import batch\n- Doesn't detect duplicates between DB and incoming issues\n- Doesn't detect duplicates across the entire database post-import\n- Manual detection: `bd list --json | jq 'group_by(.title) | map(select(length \u003e 1))'`\n\n## Real Example\nAfter import with collision resolution, we had 7 duplicate pairs:\n--196/bd-196: Same external_ref epic\n--196-196: Same findByExternalRef task\n--196-196: Same collision detection task\n--196-196: Same import flow task\n--196-196: Same test writing task\n--196-196: Same documentation task\n--196-196: Same code review task\n\n## Proposed Solution\nAdd `bd duplicates` command that:\n1. Groups all issues by content hash (title + description + design + acceptance_criteria)\n2. Reports duplicate groups with suggested merge target (lowest ID or most references)\n3. Optionally auto-merge with `--auto-merge` flag\n4. Respects status (don't merge open with closed)\n\n## Example Output\n```\n🔍 Found 7 duplicate groups:\n\nGroup 1: \"Feature: Use external_ref as primary matching key\"\n --196 (open, P1, 0 references)\n - bd-196 (open, P1, 0 references)\n Suggested merge: bd merge-196 --into bd-196\n\nGroup 2: \"Add findByExternalRef query\"\n --196 (open, P1, 0 references) \n --196 (open, P1, 0 references)\n Suggested merge: bd merge-196 --into-196\n...\n\nRun with --auto-merge to execute all suggested merges\n```\n\n## Implementation Notes\n- Use same content hashing as deduplicateIncomingIssues\n- Consider reference counts when choosing merge target\n- Skip duplicates with different status (open vs closed)\n- Add --dry-run mode\n- Integration with import: `bd import --resolve-collisions --dedupe-after`","status":"open","priority":2,"issue_type":"feature","created_at":"2025-10-24T13:35:14.97041-07:00","updated_at":"2025-10-24T13:35:26.729886-07:00"} +{"id":"bd-203","title":"Add automated duplicate detection tool for post-import cleanup","description":"After collision resolution with --resolve-collisions, we can end up with content duplicates (different IDs, identical content) when parallel work creates the same issues.\n\n## Problem\nCurrent situation:\n- `deduplicateIncomingIssues()` only deduplicates within the import batch\n- Doesn't detect duplicates between DB and incoming issues\n- Doesn't detect duplicates across the entire database post-import\n- Manual detection: `bd list --json | jq 'group_by(.title) | map(select(length \u003e 1))'`\n\n## Real Example\nAfter import with collision resolution, we had 7 duplicate pairs:\n--196/bd-196: Same external_ref epic\n--196-196: Same findByExternalRef task\n--196-196: Same collision detection task\n--196-196: Same import flow task\n--196-196: Same test writing task\n--196-196: Same documentation task\n--196-196: Same code review task\n\n## Proposed Solution\nAdd `bd duplicates` command that:\n1. Groups all issues by content hash (title + description + design + acceptance_criteria)\n2. Reports duplicate groups with suggested merge target (lowest ID or most references)\n3. Optionally auto-merge with `--auto-merge` flag\n4. Respects status (don't merge open with closed)\n\n## Example Output\n```\n🔍 Found 7 duplicate groups:\n\nGroup 1: \"Feature: Use external_ref as primary matching key\"\n --196 (open, P1, 0 references)\n - bd-196 (open, P1, 0 references)\n Suggested merge: bd merge-196 --into bd-196\n\nGroup 2: \"Add findByExternalRef query\"\n --196 (open, P1, 0 references) \n --196 (open, P1, 0 references)\n Suggested merge: bd merge-196 --into-196\n...\n\nRun with --auto-merge to execute all suggested merges\n```\n\n## Implementation Notes\n- Use same content hashing as deduplicateIncomingIssues\n- Consider reference counts when choosing merge target\n- Skip duplicates with different status (open vs closed)\n- Add --dry-run mode\n- Integration with import: `bd import --resolve-collisions --dedupe-after`","status":"in_progress","priority":2,"issue_type":"feature","created_at":"2025-10-24T13:35:14.97041-07:00","updated_at":"2025-10-24T13:37:39.187698-07:00"} {"id":"bd-204","title":"Optimize auto-flush to use incremental updates","description":"Every flush exports ALL issues and ALL dependencies, even if only one issue changed. For large projects (1000+ issues), this could be expensive. Current approach guarantees consistency, which is fine for MVP, but future optimization could track which issues changed and use incremental updates. Located in cmd/bd/main.go:255-276.","status":"closed","priority":3,"issue_type":"feature","created_at":"2025-10-24T13:35:23.09858-07:00","updated_at":"2025-10-24T13:35:23.09858-07:00","closed_at":"2025-10-14T02:51:52.200141-07:00"} {"id":"bd-205","title":"Implement dependency migration for merge","description":"Migrate all dependencies from source issue(s) to target issue during merge, removing duplicates and preserving graph integrity","status":"closed","priority":1,"issue_type":"task","created_at":"2025-10-24T13:35:23.098981-07:00","updated_at":"2025-10-24T13:35:23.098981-07:00","closed_at":"2025-10-22T01:07:04.720032-07:00"} {"id":"bd-206","title":"Add merged_into field to database schema","description":"Add merged_into field to Issue struct and update database schema to support merge tracking","notes":"Simplified: no schema field needed. Close merged issues with reason 'Merged into bd-X'. See bd-79 design.","status":"closed","priority":1,"issue_type":"task","created_at":"2025-10-24T13:35:23.099266-07:00","updated_at":"2025-10-24T13:35:23.099266-07:00","closed_at":"2025-10-22T01:07:14.145014-07:00"} diff --git a/ADVANCED.md b/ADVANCED.md index 17acff19..8f2dd32c 100644 --- a/ADVANCED.md +++ b/ADVANCED.md @@ -55,6 +55,53 @@ bd rename-prefix kw- bd list # Shows kw-* issues ``` +## Duplicate Detection + +Find issues with identical content using automated duplicate detection: + +```bash +# Find all content duplicates in the database +bd duplicates + +# Show duplicates in JSON format +bd duplicates --json + +# Automatically merge all duplicates +bd duplicates --auto-merge + +# Preview what would be merged +bd duplicates --dry-run + +# Detect duplicates during import +bd import -i issues.jsonl --resolve-collisions --dedupe-after +``` + +**How it works:** +- Groups issues by content hash (title, description, design, acceptance criteria) +- Only groups issues with matching status (open with open, closed with closed) +- Chooses merge target by reference count (most referenced) or smallest ID +- Reports duplicate groups with suggested merge commands + +**Example output:** + +``` +🔍 Found 3 duplicate group(s): + +━━ Group 1: Fix authentication bug +→ bd-10 (open, P1, 5 references) + bd-42 (open, P1, 0 references) + Suggested: bd merge bd-42 --into bd-10 + +💡 Run with --auto-merge to execute all suggested merges +``` + +**AI Agent Workflow:** + +1. **Periodic scans**: Run `bd duplicates` to check for duplicates +2. **During import**: Use `--dedupe-after` to detect duplicates after collision resolution +3. **Auto-merge**: Use `--auto-merge` to automatically consolidate duplicates +4. **Manual review**: Use `--dry-run` to preview merges before executing + ## Merging Duplicate Issues Consolidate duplicate issues into a single issue while preserving dependencies and references: diff --git a/AGENTS.md b/AGENTS.md index 5cffc4b6..58a418a5 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -166,8 +166,14 @@ bd restore # View full history at time of compaction # Import with collision detection bd import -i .beads/issues.jsonl --dry-run # Preview only bd import -i .beads/issues.jsonl --resolve-collisions # Auto-resolve +bd import -i .beads/issues.jsonl --resolve-collisions --dedupe-after # Auto-resolve + detect duplicates -# Merge duplicate issues +# Find and merge duplicate issues +bd duplicates # Show all duplicates +bd duplicates --auto-merge # Automatically merge all +bd duplicates --dry-run # Preview merge operations + +# Merge specific duplicate issues bd merge --into --json # Consolidate duplicates bd merge bd-42 bd-43 --into bd-41 --dry-run # Preview merge ``` @@ -212,6 +218,22 @@ Only `blocks` dependencies affect the ready work queue. AI agents should proactively detect and merge duplicate issues to keep the database clean: +**Automated duplicate detection:** + +```bash +# Find all content duplicates in the database +bd duplicates + +# Automatically merge all duplicates +bd duplicates --auto-merge + +# Preview what would be merged +bd duplicates --dry-run + +# During import (after collision resolution) +bd import -i issues.jsonl --resolve-collisions --dedupe-after +``` + **Detection strategies:** 1. **Before creating new issues**: Search for similar existing issues diff --git a/cmd/bd/duplicates.go b/cmd/bd/duplicates.go new file mode 100644 index 00000000..f0e2e5e3 --- /dev/null +++ b/cmd/bd/duplicates.go @@ -0,0 +1,298 @@ +package main + +import ( + "context" + "fmt" + "os" + "regexp" + "strings" + + "github.com/fatih/color" + "github.com/spf13/cobra" + "github.com/steveyegge/beads/internal/types" +) + +var duplicatesCmd = &cobra.Command{ + Use: "duplicates", + Short: "Find and optionally merge duplicate issues", + Long: `Find issues with identical content (title, description, design, acceptance criteria). + +Groups issues by content hash and reports duplicates with suggested merge targets. +The merge target is chosen by: +1. Reference count (most referenced issue wins) +2. Lexicographically smallest ID if reference counts are equal + +Only groups issues with matching status (open with open, closed with closed). + +Example: + bd duplicates # Show all duplicate groups + bd duplicates --auto-merge # Automatically merge all duplicates + bd duplicates --dry-run # Show what would be merged`, + Run: func(cmd *cobra.Command, args []string) { + // Check daemon mode - not supported yet (merge command limitation) + if daemonClient != nil { + fmt.Fprintf(os.Stderr, "Error: duplicates command not yet supported in daemon mode (see bd-190)\n") + fmt.Fprintf(os.Stderr, "Use: bd --no-daemon duplicates\n") + os.Exit(1) + } + + autoMerge, _ := cmd.Flags().GetBool("auto-merge") + dryRun, _ := cmd.Flags().GetBool("dry-run") + + ctx := context.Background() + + // Get all issues + allIssues, err := store.SearchIssues(ctx, "", types.IssueFilter{}) + if err != nil { + fmt.Fprintf(os.Stderr, "Error fetching issues: %v\n", err) + os.Exit(1) + } + + // Find duplicates + duplicateGroups := findDuplicateGroups(allIssues) + + if len(duplicateGroups) == 0 { + if !jsonOutput { + fmt.Println("No duplicates found!") + } else { + outputJSON(map[string]interface{}{ + "duplicate_groups": 0, + "groups": []interface{}{}, + }) + } + return + } + + // Count references for each issue + refCounts := countReferences(allIssues) + + // Prepare output + var mergeCommands []string + var mergeResults []map[string]interface{} + + for _, group := range duplicateGroups { + target := chooseMergeTarget(group, refCounts) + sources := make([]string, 0, len(group)-1) + for _, issue := range group { + if issue.ID != target.ID { + sources = append(sources, issue.ID) + } + } + + if autoMerge || dryRun { + // Perform merge (unless dry-run) + if !dryRun { + result, err := performMerge(ctx, target.ID, sources) + if err != nil { + fmt.Fprintf(os.Stderr, "Error merging %s into %s: %v\n", strings.Join(sources, ", "), target.ID, err) + continue + } + + if jsonOutput { + mergeResults = append(mergeResults, map[string]interface{}{ + "target_id": target.ID, + "source_ids": sources, + "dependencies_added": result.depsAdded, + "dependencies_skipped": result.depsSkipped, + "text_references": result.textRefCount, + "issues_closed": result.issuesClosed, + "issues_skipped": result.issuesSkipped, + }) + } + } + + cmd := fmt.Sprintf("bd merge %s --into %s", strings.Join(sources, " "), target.ID) + mergeCommands = append(mergeCommands, cmd) + } else { + cmd := fmt.Sprintf("bd merge %s --into %s", strings.Join(sources, " "), target.ID) + mergeCommands = append(mergeCommands, cmd) + } + } + + // Mark dirty if we performed merges + if autoMerge && !dryRun && len(mergeCommands) > 0 { + markDirtyAndScheduleFlush() + } + + // Output results + if jsonOutput { + output := map[string]interface{}{ + "duplicate_groups": len(duplicateGroups), + "groups": formatDuplicateGroupsJSON(duplicateGroups, refCounts), + } + if autoMerge || dryRun { + output["merge_commands"] = mergeCommands + if autoMerge && !dryRun { + output["merge_results"] = mergeResults + } + } + outputJSON(output) + } else { + yellow := color.New(color.FgYellow).SprintFunc() + cyan := color.New(color.FgCyan).SprintFunc() + green := color.New(color.FgGreen).SprintFunc() + + fmt.Printf("%s Found %d duplicate group(s):\n\n", yellow("🔍"), len(duplicateGroups)) + + for i, group := range duplicateGroups { + target := chooseMergeTarget(group, refCounts) + fmt.Printf("%s Group %d: %s\n", cyan("━━"), i+1, group[0].Title) + + for _, issue := range group { + refs := refCounts[issue.ID] + marker := " " + if issue.ID == target.ID { + marker = green("→ ") + } + fmt.Printf("%s%s (%s, P%d, %d references)\n", + marker, issue.ID, issue.Status, issue.Priority, refs) + } + + sources := make([]string, 0, len(group)-1) + for _, issue := range group { + if issue.ID != target.ID { + sources = append(sources, issue.ID) + } + } + fmt.Printf(" %s bd merge %s --into %s\n\n", + cyan("Suggested:"), strings.Join(sources, " "), target.ID) + } + + if autoMerge { + if dryRun { + fmt.Printf("%s Dry run - would execute %d merge(s)\n", yellow("⚠"), len(mergeCommands)) + } else { + fmt.Printf("%s Merged %d group(s)\n", green("✓"), len(mergeCommands)) + } + } else { + fmt.Printf("%s Run with --auto-merge to execute all suggested merges\n", cyan("💡")) + } + } + }, +} + +func init() { + duplicatesCmd.Flags().Bool("auto-merge", false, "Automatically merge all duplicates") + duplicatesCmd.Flags().Bool("dry-run", false, "Show what would be merged without making changes") + rootCmd.AddCommand(duplicatesCmd) +} + +// contentKey represents the fields we use to identify duplicate issues +type contentKey struct { + title string + description string + design string + acceptanceCriteria string + status string // Only group issues with same status +} + +// findDuplicateGroups groups issues by content hash +func findDuplicateGroups(issues []*types.Issue) [][]*types.Issue { + groups := make(map[contentKey][]*types.Issue) + + for _, issue := range issues { + key := contentKey{ + title: issue.Title, + description: issue.Description, + design: issue.Design, + acceptanceCriteria: issue.AcceptanceCriteria, + status: string(issue.Status), + } + + groups[key] = append(groups[key], issue) + } + + // Filter to only groups with duplicates + var duplicates [][]*types.Issue + for _, group := range groups { + if len(group) > 1 { + duplicates = append(duplicates, group) + } + } + + return duplicates +} + +// countReferences counts how many times each issue is referenced in text fields +func countReferences(issues []*types.Issue) map[string]int { + counts := make(map[string]int) + idPattern := regexp.MustCompile(`\b[a-zA-Z][-a-zA-Z0-9]*-\d+\b`) + + for _, issue := range issues { + // Search in all text fields + textFields := []string{ + issue.Description, + issue.Design, + issue.AcceptanceCriteria, + issue.Notes, + } + + for _, text := range textFields { + matches := idPattern.FindAllString(text, -1) + for _, match := range matches { + counts[match]++ + } + } + } + + return counts +} + +// chooseMergeTarget selects the best issue to merge into +// Priority: highest reference count, then lexicographically smallest ID +func chooseMergeTarget(group []*types.Issue, refCounts map[string]int) *types.Issue { + if len(group) == 0 { + return nil + } + + target := group[0] + targetRefs := refCounts[target.ID] + + for _, issue := range group[1:] { + issueRefs := refCounts[issue.ID] + if issueRefs > targetRefs || (issueRefs == targetRefs && issue.ID < target.ID) { + target = issue + targetRefs = issueRefs + } + } + + return target +} + +// formatDuplicateGroupsJSON formats duplicate groups for JSON output +func formatDuplicateGroupsJSON(groups [][]*types.Issue, refCounts map[string]int) []map[string]interface{} { + var result []map[string]interface{} + + for _, group := range groups { + target := chooseMergeTarget(group, refCounts) + issues := make([]map[string]interface{}, len(group)) + + for i, issue := range group { + issues[i] = map[string]interface{}{ + "id": issue.ID, + "title": issue.Title, + "status": issue.Status, + "priority": issue.Priority, + "references": refCounts[issue.ID], + "is_merge_target": issue.ID == target.ID, + } + } + + sources := make([]string, 0, len(group)-1) + for _, issue := range group { + if issue.ID != target.ID { + sources = append(sources, issue.ID) + } + } + + result = append(result, map[string]interface{}{ + "title": group[0].Title, + "issues": issues, + "suggested_target": target.ID, + "suggested_sources": sources, + "suggested_merge_cmd": fmt.Sprintf("bd merge %s --into %s", strings.Join(sources, " "), target.ID), + }) + } + + return result +} diff --git a/cmd/bd/duplicates_test.go b/cmd/bd/duplicates_test.go new file mode 100644 index 00000000..3d6a408c --- /dev/null +++ b/cmd/bd/duplicates_test.go @@ -0,0 +1,265 @@ +package main + +import ( + "context" + "testing" + + "github.com/steveyegge/beads/internal/types" +) + +func TestFindDuplicateGroups(t *testing.T) { + tests := []struct { + name string + issues []*types.Issue + expectedGroups int + }{ + { + name: "no duplicates", + issues: []*types.Issue{ + {ID: "bd-1", Title: "Task 1", Status: types.StatusOpen}, + {ID: "bd-2", Title: "Task 2", Status: types.StatusOpen}, + }, + expectedGroups: 0, + }, + { + name: "simple duplicate", + issues: []*types.Issue{ + {ID: "bd-1", Title: "Task 1", Status: types.StatusOpen}, + {ID: "bd-2", Title: "Task 1", Status: types.StatusOpen}, + }, + expectedGroups: 1, + }, + { + name: "duplicate with different status ignored", + issues: []*types.Issue{ + {ID: "bd-1", Title: "Task 1", Status: types.StatusOpen}, + {ID: "bd-2", Title: "Task 1", Status: types.StatusClosed}, + }, + expectedGroups: 0, + }, + { + name: "multiple duplicates", + issues: []*types.Issue{ + {ID: "bd-1", Title: "Task 1", Status: types.StatusOpen}, + {ID: "bd-2", Title: "Task 1", Status: types.StatusOpen}, + {ID: "bd-3", Title: "Task 2", Status: types.StatusOpen}, + {ID: "bd-4", Title: "Task 2", Status: types.StatusOpen}, + }, + expectedGroups: 2, + }, + { + name: "different descriptions are duplicates if title matches", + issues: []*types.Issue{ + {ID: "bd-1", Title: "Task 1", Description: "Desc 1", Status: types.StatusOpen}, + {ID: "bd-2", Title: "Task 1", Description: "Desc 2", Status: types.StatusOpen}, + }, + expectedGroups: 0, // Different descriptions = not duplicates + }, + { + name: "exact content match", + issues: []*types.Issue{ + {ID: "bd-1", Title: "Task 1", Description: "Desc 1", Design: "Design 1", AcceptanceCriteria: "AC 1", Status: types.StatusOpen}, + {ID: "bd-2", Title: "Task 1", Description: "Desc 1", Design: "Design 1", AcceptanceCriteria: "AC 1", Status: types.StatusOpen}, + }, + expectedGroups: 1, + }, + { + name: "three-way duplicate", + issues: []*types.Issue{ + {ID: "bd-1", Title: "Task 1", Status: types.StatusOpen}, + {ID: "bd-2", Title: "Task 1", Status: types.StatusOpen}, + {ID: "bd-3", Title: "Task 1", Status: types.StatusOpen}, + }, + expectedGroups: 1, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + groups := findDuplicateGroups(tt.issues) + if len(groups) != tt.expectedGroups { + t.Errorf("findDuplicateGroups() returned %d groups, want %d", len(groups), tt.expectedGroups) + } + }) + } +} + +func TestChooseMergeTarget(t *testing.T) { + tests := []struct { + name string + group []*types.Issue + refCounts map[string]int + wantID string + }{ + { + name: "choose by reference count", + group: []*types.Issue{ + {ID: "bd-2", Title: "Task"}, + {ID: "bd-1", Title: "Task"}, + }, + refCounts: map[string]int{ + "bd-1": 5, + "bd-2": 0, + }, + wantID: "bd-1", + }, + { + name: "choose by lexicographic order if same references", + group: []*types.Issue{ + {ID: "bd-2", Title: "Task"}, + {ID: "bd-1", Title: "Task"}, + }, + refCounts: map[string]int{ + "bd-1": 0, + "bd-2": 0, + }, + wantID: "bd-1", + }, + { + name: "prefer higher references even with larger ID", + group: []*types.Issue{ + {ID: "bd-1", Title: "Task"}, + {ID: "bd-100", Title: "Task"}, + }, + refCounts: map[string]int{ + "bd-1": 1, + "bd-100": 10, + }, + wantID: "bd-100", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + target := chooseMergeTarget(tt.group, tt.refCounts) + if target.ID != tt.wantID { + t.Errorf("chooseMergeTarget() = %v, want %v", target.ID, tt.wantID) + } + }) + } +} + +func TestCountReferences(t *testing.T) { + issues := []*types.Issue{ + { + ID: "bd-1", + Description: "See bd-2 for details", + Notes: "Related to bd-3", + }, + { + ID: "bd-2", + Description: "Mentioned bd-1 twice: bd-1", + }, + { + ID: "bd-3", + Notes: "Nothing to see here", + }, + } + + counts := countReferences(issues) + + expectedCounts := map[string]int{ + "bd-1": 2, // Referenced twice in bd-2 + "bd-2": 1, // Referenced once in bd-1 + "bd-3": 1, // Referenced once in bd-1 + } + + for id, expectedCount := range expectedCounts { + if counts[id] != expectedCount { + t.Errorf("countReferences()[%s] = %d, want %d", id, counts[id], expectedCount) + } + } +} + +func TestDuplicateGroupsWithDifferentStatuses(t *testing.T) { + issues := []*types.Issue{ + {ID: "bd-1", Title: "Task 1", Status: types.StatusOpen}, + {ID: "bd-2", Title: "Task 1", Status: types.StatusClosed}, + {ID: "bd-3", Title: "Task 1", Status: types.StatusOpen}, + } + + groups := findDuplicateGroups(issues) + + // Should have 1 group with bd-1 and bd-3 (both open) + if len(groups) != 1 { + t.Fatalf("Expected 1 group, got %d", len(groups)) + } + + if len(groups[0]) != 2 { + t.Fatalf("Expected 2 issues in group, got %d", len(groups[0])) + } + + // Verify bd-2 (closed) is not in the group + for _, issue := range groups[0] { + if issue.ID == "bd-2" { + t.Errorf("bd-2 (closed) should not be in group with open issues") + } + } +} + +func TestDuplicatesIntegration(t *testing.T) { + ctx := context.Background() + testStore, cleanup := setupTestDB(t) + defer cleanup() + + // Create duplicate issues + issues := []*types.Issue{ + { + ID: "bd-1", + Title: "Fix authentication bug", + Description: "Users can't login", + Status: types.StatusOpen, + Priority: 1, + IssueType: types.TypeBug, + }, + { + ID: "bd-2", + Title: "Fix authentication bug", + Description: "Users can't login", + Status: types.StatusOpen, + Priority: 1, + IssueType: types.TypeBug, + }, + { + ID: "bd-3", + Title: "Different task", + Description: "Different description", + Status: types.StatusOpen, + Priority: 2, + IssueType: types.TypeTask, + }, + } + + for _, issue := range issues { + if err := testStore.CreateIssue(ctx, issue, "test"); err != nil { + t.Fatalf("CreateIssue failed: %v", err) + } + } + + // Fetch all issues + allIssues, err := testStore.SearchIssues(ctx, "", types.IssueFilter{}) + if err != nil { + t.Fatalf("SearchIssues failed: %v", err) + } + + // Find duplicates + groups := findDuplicateGroups(allIssues) + + if len(groups) != 1 { + t.Fatalf("Expected 1 duplicate group, got %d", len(groups)) + } + + if len(groups[0]) != 2 { + t.Fatalf("Expected 2 issues in group, got %d", len(groups[0])) + } + + // Verify the duplicate group contains bd-1 and bd-2 + ids := make(map[string]bool) + for _, issue := range groups[0] { + ids[issue.ID] = true + } + + if !ids["bd-1"] || !ids["bd-2"] { + t.Errorf("Expected duplicate group to contain bd-1 and bd-2") + } +} diff --git a/cmd/bd/import.go b/cmd/bd/import.go index 07760682..4a38308e 100644 --- a/cmd/bd/import.go +++ b/cmd/bd/import.go @@ -7,6 +7,7 @@ import ( "fmt" "os" "sort" + "strings" "github.com/spf13/cobra" "github.com/steveyegge/beads/internal/types" @@ -24,6 +25,7 @@ Behavior: - New issues are created - Collisions (same ID, different content) are detected - Use --resolve-collisions to automatically remap colliding issues + - Use --dedupe-after to find and merge content duplicates after import - Use --dry-run to preview changes without applying them`, Run: func(cmd *cobra.Command, args []string) { input, _ := cmd.Flags().GetString("input") @@ -32,6 +34,7 @@ Behavior: resolveCollisions, _ := cmd.Flags().GetBool("resolve-collisions") dryRun, _ := cmd.Flags().GetBool("dry-run") renameOnImport, _ := cmd.Flags().GetBool("rename-on-import") + dedupeAfter, _ := cmd.Flags().GetBool("dedupe-after") // Open input in := os.Stdin @@ -190,6 +193,54 @@ Behavior: fmt.Fprintf(os.Stderr, ", %d issues remapped", len(result.IDMapping)) } fmt.Fprintf(os.Stderr, "\n") + + // Run duplicate detection if requested + if dedupeAfter { + fmt.Fprintf(os.Stderr, "\n=== Post-Import Duplicate Detection ===\n") + + // Get all issues (fresh after import) + allIssues, err := store.SearchIssues(ctx, "", types.IssueFilter{}) + if err != nil { + fmt.Fprintf(os.Stderr, "Error fetching issues for deduplication: %v\n", err) + os.Exit(1) + } + + duplicateGroups := findDuplicateGroups(allIssues) + if len(duplicateGroups) == 0 { + fmt.Fprintf(os.Stderr, "No duplicates found.\n") + return + } + + refCounts := countReferences(allIssues) + + fmt.Fprintf(os.Stderr, "Found %d duplicate group(s)\n\n", len(duplicateGroups)) + + for i, group := range duplicateGroups { + target := chooseMergeTarget(group, refCounts) + fmt.Fprintf(os.Stderr, "Group %d: %s\n", i+1, group[0].Title) + + for _, issue := range group { + refs := refCounts[issue.ID] + marker := " " + if issue.ID == target.ID { + marker = "→ " + } + fmt.Fprintf(os.Stderr, " %s%s (%s, P%d, %d refs)\n", + marker, issue.ID, issue.Status, issue.Priority, refs) + } + + sources := make([]string, 0, len(group)-1) + for _, issue := range group { + if issue.ID != target.ID { + sources = append(sources, issue.ID) + } + } + fmt.Fprintf(os.Stderr, " Suggested: bd merge %s --into %s\n\n", + strings.Join(sources, " "), target.ID) + } + + fmt.Fprintf(os.Stderr, "Run 'bd duplicates --auto-merge' to merge all duplicates.\n") + } }, } @@ -198,6 +249,7 @@ func init() { importCmd.Flags().BoolP("skip-existing", "s", false, "Skip existing issues instead of updating them") importCmd.Flags().Bool("strict", false, "Fail on dependency errors instead of treating them as warnings") importCmd.Flags().Bool("resolve-collisions", false, "Automatically resolve ID collisions by remapping") + importCmd.Flags().Bool("dedupe-after", false, "Detect and report content duplicates after import") importCmd.Flags().Bool("dry-run", false, "Preview collision detection without making changes") importCmd.Flags().Bool("rename-on-import", false, "Rename imported issues to match database prefix (updates all references)") rootCmd.AddCommand(importCmd)