Add bd duplicates command for automated duplicate detection (bd-203)

- New 'bd duplicates' command finds content duplicates across database
- Groups by content hash (title, description, design, acceptance criteria)
- Chooses merge target by reference count or smallest ID
- Supports --auto-merge and --dry-run flags
- Added --dedupe-after flag to 'bd import' for post-import detection
- Comprehensive test coverage for duplicate detection
- Updated AGENTS.md and ADVANCED.md with usage examples

Amp-Thread-ID: https://ampcode.com/threads/T-6f99566f-c979-43ed-bd8f-5aa38b0f6191
Co-authored-by: Amp <amp@ampcode.com>
This commit is contained in:
Steve Yegge
2025-10-24 13:45:04 -07:00
parent 9b2c551923
commit 3195b8062b
6 changed files with 686 additions and 2 deletions

298
cmd/bd/duplicates.go Normal file
View File

@@ -0,0 +1,298 @@
package main
import (
	"context"
	"fmt"
	"os"
	"regexp"
	"sort"
	"strings"

	"github.com/fatih/color"
	"github.com/spf13/cobra"
	"github.com/steveyegge/beads/internal/types"
)
// duplicatesCmd implements "bd duplicates": it loads every issue from the
// store, groups issues whose content fields match exactly (see
// findDuplicateGroups), and either reports the groups with suggested merge
// commands or — with --auto-merge — performs the merges via performMerge.
var duplicatesCmd = &cobra.Command{
Use: "duplicates",
Short: "Find and optionally merge duplicate issues",
Long: `Find issues with identical content (title, description, design, acceptance criteria).
Groups issues by content hash and reports duplicates with suggested merge targets.
The merge target is chosen by:
1. Reference count (most referenced issue wins)
2. Lexicographically smallest ID if reference counts are equal
Only groups issues with matching status (open with open, closed with closed).
Example:
bd duplicates # Show all duplicate groups
bd duplicates --auto-merge # Automatically merge all duplicates
bd duplicates --dry-run # Show what would be merged`,
Run: func(cmd *cobra.Command, args []string) {
// Check daemon mode - not supported yet (merge command limitation)
if daemonClient != nil {
fmt.Fprintf(os.Stderr, "Error: duplicates command not yet supported in daemon mode (see bd-190)\n")
fmt.Fprintf(os.Stderr, "Use: bd --no-daemon duplicates\n")
os.Exit(1)
}
// Flag lookup errors are deliberately ignored: both flags are registered
// in init() with these exact names, so lookup cannot fail.
autoMerge, _ := cmd.Flags().GetBool("auto-merge")
dryRun, _ := cmd.Flags().GetBool("dry-run")
ctx := context.Background()
// Get all issues (empty query + empty filter appears to mean a full scan;
// NOTE(review): confirm SearchIssues semantics against the store package).
allIssues, err := store.SearchIssues(ctx, "", types.IssueFilter{})
if err != nil {
fmt.Fprintf(os.Stderr, "Error fetching issues: %v\n", err)
os.Exit(1)
}
// Find duplicates
duplicateGroups := findDuplicateGroups(allIssues)
// Nothing to do: report success in the requested format and return early.
if len(duplicateGroups) == 0 {
if !jsonOutput {
fmt.Println("No duplicates found!")
} else {
outputJSON(map[string]interface{}{
"duplicate_groups": 0,
"groups": []interface{}{},
})
}
return
}
// Count references for each issue (used to rank merge targets).
refCounts := countReferences(allIssues)
// Prepare output
var mergeCommands []string
var mergeResults []map[string]interface{}
// For each group: pick the target, collect the remaining IDs as sources,
// and either merge now (--auto-merge) or just record the suggestion.
for _, group := range duplicateGroups {
target := chooseMergeTarget(group, refCounts)
sources := make([]string, 0, len(group)-1)
for _, issue := range group {
if issue.ID != target.ID {
sources = append(sources, issue.ID)
}
}
// NOTE(review): both branches below build the same suggested command;
// the split only exists so --auto-merge (without --dry-run) actually
// performs the merge.
if autoMerge || dryRun {
// Perform merge (unless dry-run)
if !dryRun {
// performMerge is defined elsewhere in the package; its result
// presumably carries per-merge counters — confirm field meanings there.
result, err := performMerge(ctx, target.ID, sources)
if err != nil {
// A failed merge skips this group but continues with the rest.
fmt.Fprintf(os.Stderr, "Error merging %s into %s: %v\n", strings.Join(sources, ", "), target.ID, err)
continue
}
if jsonOutput {
mergeResults = append(mergeResults, map[string]interface{}{
"target_id": target.ID,
"source_ids": sources,
"dependencies_added": result.depsAdded,
"dependencies_skipped": result.depsSkipped,
"text_references": result.textRefCount,
"issues_closed": result.issuesClosed,
"issues_skipped": result.issuesSkipped,
})
}
}
cmd := fmt.Sprintf("bd merge %s --into %s", strings.Join(sources, " "), target.ID)
mergeCommands = append(mergeCommands, cmd)
} else {
cmd := fmt.Sprintf("bd merge %s --into %s", strings.Join(sources, " "), target.ID)
mergeCommands = append(mergeCommands, cmd)
}
}
// Mark dirty if we performed merges
if autoMerge && !dryRun && len(mergeCommands) > 0 {
markDirtyAndScheduleFlush()
}
// Output results
if jsonOutput {
output := map[string]interface{}{
"duplicate_groups": len(duplicateGroups),
"groups": formatDuplicateGroupsJSON(duplicateGroups, refCounts),
}
if autoMerge || dryRun {
output["merge_commands"] = mergeCommands
if autoMerge && !dryRun {
output["merge_results"] = mergeResults
}
}
outputJSON(output)
} else {
// Human-readable output: one section per group, merge target marked
// with a green arrow, followed by the suggested merge command.
yellow := color.New(color.FgYellow).SprintFunc()
cyan := color.New(color.FgCyan).SprintFunc()
green := color.New(color.FgGreen).SprintFunc()
fmt.Printf("%s Found %d duplicate group(s):\n\n", yellow("🔍"), len(duplicateGroups))
for i, group := range duplicateGroups {
target := chooseMergeTarget(group, refCounts)
fmt.Printf("%s Group %d: %s\n", cyan("━━"), i+1, group[0].Title)
for _, issue := range group {
refs := refCounts[issue.ID]
marker := " "
if issue.ID == target.ID {
marker = green("→ ")
}
fmt.Printf("%s%s (%s, P%d, %d references)\n",
marker, issue.ID, issue.Status, issue.Priority, refs)
}
// Rebuild the source list for display (mirrors the loop above).
sources := make([]string, 0, len(group)-1)
for _, issue := range group {
if issue.ID != target.ID {
sources = append(sources, issue.ID)
}
}
fmt.Printf(" %s bd merge %s --into %s\n\n",
cyan("Suggested:"), strings.Join(sources, " "), target.ID)
}
if autoMerge {
if dryRun {
fmt.Printf("%s Dry run - would execute %d merge(s)\n", yellow("⚠"), len(mergeCommands))
} else {
fmt.Printf("%s Merged %d group(s)\n", green("✓"), len(mergeCommands))
}
} else {
fmt.Printf("%s Run with --auto-merge to execute all suggested merges\n", cyan("💡"))
}
}
},
}
// init wires the duplicates command into the CLI: registers its two
// boolean flags and attaches the command to the root command.
func init() {
	flags := duplicatesCmd.Flags()
	flags.Bool("auto-merge", false, "Automatically merge all duplicates")
	flags.Bool("dry-run", false, "Show what would be merged without making changes")
	rootCmd.AddCommand(duplicatesCmd)
}
// contentKey represents the fields we use to identify duplicate issues.
// It is used as a comparable map key in findDuplicateGroups: two issues
// are considered duplicates only if every field here matches exactly.
type contentKey struct {
title string
description string
design string
acceptanceCriteria string
status string // Only group issues with same status
}
// findDuplicateGroups groups issues that share identical content
// (title, description, design, acceptance criteria) and status.
//
// Only groups with two or more members are returned. Groups are sorted
// by the lexicographically smallest issue ID in each group so that
// repeated runs report groups in a stable order (plain map iteration
// order is randomized in Go, which previously made command output and
// JSON nondeterministic).
func findDuplicateGroups(issues []*types.Issue) [][]*types.Issue {
	groups := make(map[contentKey][]*types.Issue)
	for _, issue := range issues {
		key := contentKey{
			title:              issue.Title,
			description:        issue.Description,
			design:             issue.Design,
			acceptanceCriteria: issue.AcceptanceCriteria,
			status:             string(issue.Status),
		}
		groups[key] = append(groups[key], issue)
	}

	// Filter to only groups with duplicates.
	var duplicates [][]*types.Issue
	for _, group := range groups {
		if len(group) > 1 {
			duplicates = append(duplicates, group)
		}
	}

	// minID returns the smallest issue ID in a group; used as a stable
	// sort key for deterministic ordering.
	minID := func(group []*types.Issue) string {
		id := group[0].ID
		for _, issue := range group[1:] {
			if issue.ID < id {
				id = issue.ID
			}
		}
		return id
	}
	sort.Slice(duplicates, func(i, j int) bool {
		return minID(duplicates[i]) < minID(duplicates[j])
	})
	return duplicates
}
// issueIDPattern matches issue identifiers such as "bd-123" embedded in
// free-form text: a letter, then letters/digits/hyphens, then "-<digits>".
// Compiled once at package scope rather than on every call (Go idiom;
// the previous per-call regexp.MustCompile recompiled the pattern each time).
var issueIDPattern = regexp.MustCompile(`\b[a-zA-Z][-a-zA-Z0-9]*-\d+\b`)

// countReferences counts how many times each issue ID appears in the
// text fields (description, design, acceptance criteria, notes) of the
// given issues. The counts are used to rank merge targets: the most
// referenced duplicate wins.
//
// Note: every match is counted wherever it appears, including an issue's
// own text mentioning its own ID.
func countReferences(issues []*types.Issue) map[string]int {
	counts := make(map[string]int)
	for _, issue := range issues {
		// Search in all text fields.
		textFields := []string{
			issue.Description,
			issue.Design,
			issue.AcceptanceCriteria,
			issue.Notes,
		}
		for _, text := range textFields {
			for _, match := range issueIDPattern.FindAllString(text, -1) {
				counts[match]++
			}
		}
	}
	return counts
}
// chooseMergeTarget selects the issue the rest of the group should be
// merged into: the one with the highest reference count, ties broken by
// the lexicographically smallest ID. Returns nil for an empty group.
func chooseMergeTarget(group []*types.Issue, refCounts map[string]int) *types.Issue {
	if len(group) == 0 {
		return nil
	}
	best := group[0]
	for _, candidate := range group[1:] {
		bestRefs := refCounts[best.ID]
		candRefs := refCounts[candidate.ID]
		switch {
		case candRefs > bestRefs:
			best = candidate
		case candRefs == bestRefs && candidate.ID < best.ID:
			best = candidate
		}
	}
	return best
}
// formatDuplicateGroupsJSON renders duplicate groups as JSON-friendly maps.
// Each group entry carries the member issues (annotated with reference
// counts and the merge-target marker), the suggested target and source IDs,
// and a ready-to-run "bd merge" command string.
func formatDuplicateGroupsJSON(groups [][]*types.Issue, refCounts map[string]int) []map[string]interface{} {
	var out []map[string]interface{}
	for _, group := range groups {
		target := chooseMergeTarget(group, refCounts)

		// Build the per-issue entries and the source-ID list in one pass.
		issueEntries := make([]map[string]interface{}, 0, len(group))
		sources := make([]string, 0, len(group)-1)
		for _, issue := range group {
			isTarget := issue.ID == target.ID
			if !isTarget {
				sources = append(sources, issue.ID)
			}
			issueEntries = append(issueEntries, map[string]interface{}{
				"id": issue.ID,
				"title": issue.Title,
				"status": issue.Status,
				"priority": issue.Priority,
				"references": refCounts[issue.ID],
				"is_merge_target": isTarget,
			})
		}

		out = append(out, map[string]interface{}{
			"title": group[0].Title,
			"issues": issueEntries,
			"suggested_target": target.ID,
			"suggested_sources": sources,
			"suggested_merge_cmd": fmt.Sprintf("bd merge %s --into %s", strings.Join(sources, " "), target.ID),
		})
	}
	return out
}

265
cmd/bd/duplicates_test.go Normal file
View File

@@ -0,0 +1,265 @@
package main
import (
"context"
"testing"
"github.com/steveyegge/beads/internal/types"
)
// TestFindDuplicateGroups exercises the content-grouping logic with a
// table of fixtures. Issues group together only when title, description,
// design, acceptance criteria, AND status all match exactly; each case
// asserts only the number of resulting groups.
func TestFindDuplicateGroups(t *testing.T) {
tests := []struct {
name string
issues []*types.Issue
expectedGroups int
}{
{
name: "no duplicates",
issues: []*types.Issue{
{ID: "bd-1", Title: "Task 1", Status: types.StatusOpen},
{ID: "bd-2", Title: "Task 2", Status: types.StatusOpen},
},
expectedGroups: 0,
},
{
name: "simple duplicate",
issues: []*types.Issue{
{ID: "bd-1", Title: "Task 1", Status: types.StatusOpen},
{ID: "bd-2", Title: "Task 1", Status: types.StatusOpen},
},
expectedGroups: 1,
},
{
// Status is part of the grouping key, so open/closed never pair up.
name: "duplicate with different status ignored",
issues: []*types.Issue{
{ID: "bd-1", Title: "Task 1", Status: types.StatusOpen},
{ID: "bd-2", Title: "Task 1", Status: types.StatusClosed},
},
expectedGroups: 0,
},
{
name: "multiple duplicates",
issues: []*types.Issue{
{ID: "bd-1", Title: "Task 1", Status: types.StatusOpen},
{ID: "bd-2", Title: "Task 1", Status: types.StatusOpen},
{ID: "bd-3", Title: "Task 2", Status: types.StatusOpen},
{ID: "bd-4", Title: "Task 2", Status: types.StatusOpen},
},
expectedGroups: 2,
},
{
// Matching title alone is NOT enough — all content fields must match.
name: "different descriptions are duplicates if title matches",
issues: []*types.Issue{
{ID: "bd-1", Title: "Task 1", Description: "Desc 1", Status: types.StatusOpen},
{ID: "bd-2", Title: "Task 1", Description: "Desc 2", Status: types.StatusOpen},
},
expectedGroups: 0, // Different descriptions = not duplicates
},
{
name: "exact content match",
issues: []*types.Issue{
{ID: "bd-1", Title: "Task 1", Description: "Desc 1", Design: "Design 1", AcceptanceCriteria: "AC 1", Status: types.StatusOpen},
{ID: "bd-2", Title: "Task 1", Description: "Desc 1", Design: "Design 1", AcceptanceCriteria: "AC 1", Status: types.StatusOpen},
},
expectedGroups: 1,
},
{
// A group can hold more than two issues; it still counts as one group.
name: "three-way duplicate",
issues: []*types.Issue{
{ID: "bd-1", Title: "Task 1", Status: types.StatusOpen},
{ID: "bd-2", Title: "Task 1", Status: types.StatusOpen},
{ID: "bd-3", Title: "Task 1", Status: types.StatusOpen},
},
expectedGroups: 1,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
groups := findDuplicateGroups(tt.issues)
if len(groups) != tt.expectedGroups {
t.Errorf("findDuplicateGroups() returned %d groups, want %d", len(groups), tt.expectedGroups)
}
})
}
}
// TestChooseMergeTarget verifies the merge-target ranking rules:
// highest reference count wins, and equal counts fall back to the
// lexicographically smallest ID.
func TestChooseMergeTarget(t *testing.T) {
tests := []struct {
name string
group []*types.Issue
refCounts map[string]int
wantID string
}{
{
name: "choose by reference count",
group: []*types.Issue{
{ID: "bd-2", Title: "Task"},
{ID: "bd-1", Title: "Task"},
},
refCounts: map[string]int{
"bd-1": 5,
"bd-2": 0,
},
wantID: "bd-1",
},
{
name: "choose by lexicographic order if same references",
group: []*types.Issue{
{ID: "bd-2", Title: "Task"},
{ID: "bd-1", Title: "Task"},
},
refCounts: map[string]int{
"bd-1": 0,
"bd-2": 0,
},
wantID: "bd-1",
},
{
// Reference count dominates even though "bd-100" sorts after "bd-1".
name: "prefer higher references even with larger ID",
group: []*types.Issue{
{ID: "bd-1", Title: "Task"},
{ID: "bd-100", Title: "Task"},
},
refCounts: map[string]int{
"bd-1": 1,
"bd-100": 10,
},
wantID: "bd-100",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
target := chooseMergeTarget(tt.group, tt.refCounts)
if target.ID != tt.wantID {
t.Errorf("chooseMergeTarget() = %v, want %v", target.ID, tt.wantID)
}
})
}
}
// TestCountReferences verifies that issue-ID mentions in text fields are
// tallied across all issues, including repeated mentions within a single
// field, and that non-matching text contributes nothing.
func TestCountReferences(t *testing.T) {
	issues := []*types.Issue{
		{
			ID:          "bd-1",
			Description: "See bd-2 for details",
			Notes:       "Related to bd-3",
		},
		{
			ID:          "bd-2",
			Description: "Mentioned bd-1 twice: bd-1",
		},
		{
			ID:    "bd-3",
			Notes: "Nothing to see here",
		},
	}

	got := countReferences(issues)
	want := map[string]int{
		"bd-1": 2, // mentioned twice in bd-2's description
		"bd-2": 1, // mentioned once in bd-1's description
		"bd-3": 1, // mentioned once in bd-1's notes
	}
	for id, n := range want {
		if got[id] != n {
			t.Errorf("countReferences()[%s] = %d, want %d", id, got[id], n)
		}
	}
}
// TestDuplicateGroupsWithDifferentStatuses checks that status is part of
// the grouping key: two open issues with identical content group together,
// while a closed issue with the same content stays out of the group.
func TestDuplicateGroupsWithDifferentStatuses(t *testing.T) {
	issues := []*types.Issue{
		{ID: "bd-1", Title: "Task 1", Status: types.StatusOpen},
		{ID: "bd-2", Title: "Task 1", Status: types.StatusClosed},
		{ID: "bd-3", Title: "Task 1", Status: types.StatusOpen},
	}

	groups := findDuplicateGroups(issues)
	if len(groups) != 1 {
		t.Fatalf("Expected 1 group, got %d", len(groups))
	}
	group := groups[0]
	if len(group) != 2 {
		t.Fatalf("Expected 2 issues in group, got %d", len(group))
	}

	// The closed duplicate must not appear alongside the open ones.
	for _, issue := range group {
		if issue.ID == "bd-2" {
			t.Errorf("bd-2 (closed) should not be in group with open issues")
		}
	}
}
// TestDuplicatesIntegration is an end-to-end check against a real test
// store: create two content-identical issues plus one distinct issue,
// fetch them back via SearchIssues, and confirm exactly one duplicate
// group containing the two identical issues is found.
func TestDuplicatesIntegration(t *testing.T) {
ctx := context.Background()
// setupTestDB is a test helper defined elsewhere in this package;
// presumably it provisions a throwaway store — confirm there.
testStore, cleanup := setupTestDB(t)
defer cleanup()
// Create duplicate issues: bd-1 and bd-2 share every content field;
// bd-3 differs in title, description, priority, and type.
issues := []*types.Issue{
{
ID: "bd-1",
Title: "Fix authentication bug",
Description: "Users can't login",
Status: types.StatusOpen,
Priority: 1,
IssueType: types.TypeBug,
},
{
ID: "bd-2",
Title: "Fix authentication bug",
Description: "Users can't login",
Status: types.StatusOpen,
Priority: 1,
IssueType: types.TypeBug,
},
{
ID: "bd-3",
Title: "Different task",
Description: "Different description",
Status: types.StatusOpen,
Priority: 2,
IssueType: types.TypeTask,
},
}
for _, issue := range issues {
if err := testStore.CreateIssue(ctx, issue, "test"); err != nil {
t.Fatalf("CreateIssue failed: %v", err)
}
}
// Fetch all issues (empty query + empty filter, mirroring the command).
allIssues, err := testStore.SearchIssues(ctx, "", types.IssueFilter{})
if err != nil {
t.Fatalf("SearchIssues failed: %v", err)
}
// Find duplicates
groups := findDuplicateGroups(allIssues)
if len(groups) != 1 {
t.Fatalf("Expected 1 duplicate group, got %d", len(groups))
}
if len(groups[0]) != 2 {
t.Fatalf("Expected 2 issues in group, got %d", len(groups[0]))
}
// Verify the duplicate group contains bd-1 and bd-2 (order-independent).
ids := make(map[string]bool)
for _, issue := range groups[0] {
ids[issue.ID] = true
}
if !ids["bd-1"] || !ids["bd-2"] {
t.Errorf("Expected duplicate group to contain bd-1 and bd-2")
}
}

View File

@@ -7,6 +7,7 @@ import (
"fmt"
"os"
"sort"
"strings"
"github.com/spf13/cobra"
"github.com/steveyegge/beads/internal/types"
@@ -24,6 +25,7 @@ Behavior:
- New issues are created
- Collisions (same ID, different content) are detected
- Use --resolve-collisions to automatically remap colliding issues
- Use --dedupe-after to find and merge content duplicates after import
- Use --dry-run to preview changes without applying them`,
Run: func(cmd *cobra.Command, args []string) {
input, _ := cmd.Flags().GetString("input")
@@ -32,6 +34,7 @@ Behavior:
resolveCollisions, _ := cmd.Flags().GetBool("resolve-collisions")
dryRun, _ := cmd.Flags().GetBool("dry-run")
renameOnImport, _ := cmd.Flags().GetBool("rename-on-import")
dedupeAfter, _ := cmd.Flags().GetBool("dedupe-after")
// Open input
in := os.Stdin
@@ -190,6 +193,54 @@ Behavior:
fmt.Fprintf(os.Stderr, ", %d issues remapped", len(result.IDMapping))
}
fmt.Fprintf(os.Stderr, "\n")
// Run duplicate detection if requested
if dedupeAfter {
fmt.Fprintf(os.Stderr, "\n=== Post-Import Duplicate Detection ===\n")
// Get all issues (fresh after import)
allIssues, err := store.SearchIssues(ctx, "", types.IssueFilter{})
if err != nil {
fmt.Fprintf(os.Stderr, "Error fetching issues for deduplication: %v\n", err)
os.Exit(1)
}
duplicateGroups := findDuplicateGroups(allIssues)
if len(duplicateGroups) == 0 {
fmt.Fprintf(os.Stderr, "No duplicates found.\n")
return
}
refCounts := countReferences(allIssues)
fmt.Fprintf(os.Stderr, "Found %d duplicate group(s)\n\n", len(duplicateGroups))
for i, group := range duplicateGroups {
target := chooseMergeTarget(group, refCounts)
fmt.Fprintf(os.Stderr, "Group %d: %s\n", i+1, group[0].Title)
for _, issue := range group {
refs := refCounts[issue.ID]
marker := " "
if issue.ID == target.ID {
marker = "→ "
}
fmt.Fprintf(os.Stderr, " %s%s (%s, P%d, %d refs)\n",
marker, issue.ID, issue.Status, issue.Priority, refs)
}
sources := make([]string, 0, len(group)-1)
for _, issue := range group {
if issue.ID != target.ID {
sources = append(sources, issue.ID)
}
}
fmt.Fprintf(os.Stderr, " Suggested: bd merge %s --into %s\n\n",
strings.Join(sources, " "), target.ID)
}
fmt.Fprintf(os.Stderr, "Run 'bd duplicates --auto-merge' to merge all duplicates.\n")
}
},
}
@@ -198,6 +249,7 @@ func init() {
importCmd.Flags().BoolP("skip-existing", "s", false, "Skip existing issues instead of updating them")
importCmd.Flags().Bool("strict", false, "Fail on dependency errors instead of treating them as warnings")
importCmd.Flags().Bool("resolve-collisions", false, "Automatically resolve ID collisions by remapping")
importCmd.Flags().Bool("dedupe-after", false, "Detect and report content duplicates after import")
importCmd.Flags().Bool("dry-run", false, "Preview collision detection without making changes")
importCmd.Flags().Bool("rename-on-import", false, "Rename imported issues to match database prefix (updates all references)")
rootCmd.AddCommand(importCmd)