Switch from hex to Base36 encoding for issue IDs (GH #213)

This change improves information density by using Base36 (0-9, a-z) instead of hex (0-9, a-f) for hash-based issue IDs.

Key benefits:
- Shorter IDs: Can now use 3-char IDs (was 4-char minimum)
- Better scaling: 3 chars good for ~160 issues, 4 chars for ~980 issues
- Case-insensitive: Maintains excellent CLI usability
- Backward compatible: Old hex IDs continue to work

Changes:
- Implemented Base36 encoding with proper truncation (keep LSB)
- Updated adaptive length thresholds (3-8 chars instead of 4-8)
- Fixed collision probability math to match encoding (was calculating for base36 but encoding in hex - now both use base36)
- Fixed ID parser bug (use prefixWithHyphen for substring matching)
- Updated all tests and test data patterns

Fixes #213

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-03 12:02:15 -08:00
parent add5599d7e
commit b4cb636d92
14 changed files with 176 additions and 89 deletions
@@ -50,36 +50,18 @@ func TestAdaptiveIDLength_E2E(t *testing.T) {
 		return issue.ID
 	}
 	
-	// Test 1: First few issues should use 4-char IDs
-	t.Run("first_50_issues_use_4_chars", func(t *testing.T) {
+	// Test 1: First few issues should use 3-char IDs (base36 allows shorter IDs)
+	t.Run("first_50_issues_use_3_chars", func(t *testing.T) {
 		for i := 0; i < 50; i++ {
 			title := formatTitle("Issue %d", i)
-			createAndCheckLength(title, 4)
+			createAndCheckLength(title, 3)
 		}
 	})
-	
-	// Test 2: Issues 50-500 should still use 4 chars (7% collision at 500)
-	t.Run("issues_50_to_500_use_4_chars", func(t *testing.T) {
-		for i := 50; i < 500; i++ {
-			title := formatTitle("Issue %d", i)
-			id := createAndCheckLength(title, 4)
-			// Most should be 4 chars, but collisions might push some to 5
-			// We allow up to 5 chars as progressive fallback
-			hashPart := strings.TrimPrefix(id, "test-")
-			if len(hashPart) > 5 {
-				t.Errorf("Issue %d has hash length %d, expected 4-5", i, len(hashPart))
-			}
-		}
-	})
-	
-	// Test 3: At 1000 issues, should scale to 5 chars
-	// Note: We don't enforce exact length in this test because the adaptive
-	// algorithm will keep using 4 chars until collision probability exceeds 25%
-	// At 600 issues we're still below that threshold
-	t.Run("verify_adaptive_scaling_works", func(t *testing.T) {
-		// Just verify that we can create more issues and the algorithm doesn't break
-		// The actual length will be determined by the adaptive algorithm
-		for i := 500; i < 550; i++ {
+
+	// Test 2: Issues 50-200 should transition to 4 chars
+	// (3 chars good up to ~160 issues with 25% threshold)
+	t.Run("issues_50_to_200_use_3_or_4_chars", func(t *testing.T) {
+		for i := 50; i < 200; i++ {
 			title := formatTitle("Issue %d", i)
 			issue := &types.Issue{
 				Title:       title,
@@ -88,15 +70,42 @@ func TestAdaptiveIDLength_E2E(t *testing.T) {
 				Priority:    1,
 				IssueType:   "task",
 			}
-			
+
 			if err := db.CreateIssue(ctx, issue, "test@example.com"); err != nil {
 				t.Fatalf("Failed to create issue: %v", err)
 			}
-			
-			// Should use 4-6 chars depending on database size
+
+			// Most should be 3 chars initially, transitioning to 4 after ~160
 			hashPart := strings.TrimPrefix(issue.ID, "test-")
-			if len(hashPart) < 4 || len(hashPart) > 6 {
-				t.Errorf("Issue %d has hash length %d, expected 4-6", i, len(hashPart))
+			if len(hashPart) < 3 || len(hashPart) > 4 {
+				t.Errorf("Issue %d has hash length %d, expected 3-4", i, len(hashPart))
+			}
+		}
+	})
+	
+	// Test 3: Issues 200-250 are past the ~160-issue limit for 3 chars, so new IDs should use 4 chars
+	// (4 chars good up to ~980 issues with 25% threshold; the check below tolerates 3-5)
+	t.Run("verify_adaptive_scaling_works", func(t *testing.T) {
+		// Just verify that we can create more issues and the algorithm doesn't break
+		// The actual length will be determined by the adaptive algorithm
+		for i := 200; i < 250; i++ {
+			title := formatTitle("Issue %d", i)
+			issue := &types.Issue{
+				Title:       title,
+				Description: "Test",
+				Status:      "open",
+				Priority:    1,
+				IssueType:   "task",
+			}
+
+			if err := db.CreateIssue(ctx, issue, "test@example.com"); err != nil {
+				t.Fatalf("Failed to create issue: %v", err)
+			}
+
+			// Accept 3-5 chars; the exact length depends on database size
+			hashPart := strings.TrimPrefix(issue.ID, "test-")
+			if len(hashPart) < 3 || len(hashPart) > 5 {
+				t.Errorf("Issue %d has hash length %d, expected 3-5", i, len(hashPart))
 			}
 		}
 	})
@@ -11,19 +11,26 @@ import (
 type AdaptiveIDConfig struct {
 	// MaxCollisionProbability is the threshold at which we scale up ID length (e.g., 0.25 = 25%)
 	MaxCollisionProbability float64
-	
-	// MinLength is the minimum hash length to use (default 4)
+
+	// MinLength is the minimum hash length, in base36 characters, to use (default 3)
 	MinLength int
-	
+
 	// MaxLength is the maximum hash length to use (default 8)
 	MaxLength int
 }

-// DefaultAdaptiveConfig returns sensible defaults
+// DefaultAdaptiveConfig returns sensible defaults for base36 encoding.
+// With base36 (0-9, a-z) IDs can be shorter than hex; each length lasts until P(collision) reaches 25%:
+//   3 chars: ~47K namespace, good for up to ~160 issues
+//   4 chars: ~1.7M namespace, good for up to ~980 issues
+//   5 chars: ~60M namespace, good for up to ~5.9K issues
+//   6 chars: ~2.2B namespace, good for up to ~35K issues
+//   7 chars: ~78B namespace, good for up to ~212K issues
+//   8 chars: ~2.8T namespace, good for up to ~1.27M issues
 func DefaultAdaptiveConfig() AdaptiveIDConfig {
 	return AdaptiveIDConfig{
 		MaxCollisionProbability: 0.25, // 25% threshold
-		MinLength:               4,
+		MinLength:               3,
 		MaxLength:               8,
 	}
 }
@@ -32,7 +39,7 @@ func DefaultAdaptiveConfig() AdaptiveIDConfig {
 // P(collision) ≈ 1 - e^(-n²/2N)
 // where n = number of items, N = total possible values
 func collisionProbability(numIssues int, idLength int) float64 {
-	const base = 36.0 // lowercase alphanumeric (0-9, a-z)
+	const base = 36.0 // base36 encoding (0-9, a-z)
 	totalPossibilities := math.Pow(base, float64(idLength))
 	exponent := -float64(numIssues*numIssues) / (2.0 * totalPossibilities)
 	return 1.0 - math.Exp(exponent)
@@ -45,35 +45,41 @@ func TestComputeAdaptiveLength(t *testing.T) {
 		want      int
 	}{
 		{
-			name:      "small database uses 4 chars",
+			name:      "tiny database uses 3 chars",
 			numIssues: 50,
 			config:    DefaultAdaptiveConfig(),
-			want:      4,
+			want:      3,
 		},
 		{
-			name:      "medium database uses 4 chars",
+			name:      "small database uses 4 chars",
 			numIssues: 500,
 			config:    DefaultAdaptiveConfig(),
 			want:      4,
 		},
 		{
-			name:      "large database uses 5 chars",
-			numIssues: 1000,
+			name:      "medium database uses 5 chars",
+			numIssues: 3000,
 			config:    DefaultAdaptiveConfig(),
 			want:      5,
 		},
 		{
-			name:      "very large database uses 6 chars",
-			numIssues: 10000,
+			name:      "large database uses 6 chars",
+			numIssues: 20000,
 			config:    DefaultAdaptiveConfig(),
 			want:      6,
 		},
+		{
+			name:      "very large database uses 7 chars",
+			numIssues: 100000,
+			config:    DefaultAdaptiveConfig(),
+			want:      7,
+		},
 		{
 			name:      "custom threshold - stricter",
 			numIssues: 200,
 			config: AdaptiveIDConfig{
 				MaxCollisionProbability: 0.01, // 1% threshold
-				MinLength:               4,
+				MinLength:               3,
 				MaxLength:               8,
 			},
 			want: 5,
@@ -83,7 +89,7 @@ func TestComputeAdaptiveLength(t *testing.T) {
 			numIssues: 1000,
 			config: AdaptiveIDConfig{
 				MaxCollisionProbability: 0.50, // 50% threshold
-				MinLength:               4,
+				MinLength:               3,
 				MaxLength:               8,
 			},
 			want: 4,
@@ -112,6 +118,7 @@ func TestGenerateHashID_VariableLengths(t *testing.T) {
 		length       int
 		expectedLen  int // length of hash portion (without prefix)
 	}{
+		{3, 3},
 		{4, 4},
 		{5, 5},
 		{6, 6},
@@ -152,20 +159,20 @@ func TestGetAdaptiveIDLength_Integration(t *testing.T) {
 		t.Fatalf("Failed to set prefix: %v", err)
 	}
 	
-	// Test default config (should use 4 chars for empty database)
+	// Test default config (should use 3 chars for empty database)
 	conn, err := db.db.Conn(ctx)
 	if err != nil {
 		t.Fatalf("Failed to get connection: %v", err)
 	}
 	defer conn.Close()
-	
+
 	length, err := GetAdaptiveIDLength(ctx, conn, "test")
 	if err != nil {
 		t.Fatalf("GetAdaptiveIDLength failed: %v", err)
 	}
-	
-	if length != 4 {
-		t.Errorf("Empty database should use 4 chars, got %d", length)
+
+	if length != 3 {
+		t.Errorf("Empty database should use 3 chars, got %d", length)
 	}
 	
 	// Test custom config
@@ -35,10 +35,10 @@ func TestHashIDGeneration(t *testing.T) {
 		t.Fatalf("Failed to create issue: %v", err)
 	}

-	// Verify hash ID format: bd-<4-8 hex chars> with adaptive length (bd-ea2a13)
-	// For empty/small database, should use 4 chars
-	if len(issue.ID) < 7 || len(issue.ID) > 11 { // "bd-" (3) + 4-8 hex chars = 7-11
-		t.Errorf("Expected ID length 7-11, got %d: %s", len(issue.ID), issue.ID)
+	// Verify hash ID format: bd-<3-8 base36 chars> with adaptive length
+	// For empty/small database, should use 3 chars
+	if len(issue.ID) < 6 || len(issue.ID) > 11 { // "bd-" (3) + 3-8 base36 chars = 6-11
+		t.Errorf("Expected ID length 6-11, got %d: %s", len(issue.ID), issue.ID)
 	}

 	if issue.ID[:3] != "bd-" {
@@ -182,9 +182,9 @@ func TestHashIDBatchCreation(t *testing.T) {
 		}
 		ids[issue.ID] = true

-		// Verify hash ID format (4-8 chars with adaptive length)
-		if len(issue.ID) < 7 || len(issue.ID) > 11 {
-			t.Errorf("Expected ID length 7-11, got %d: %s", len(issue.ID), issue.ID)
+		// Verify hash ID format (3-8 chars with adaptive length)
+		if len(issue.ID) < 6 || len(issue.ID) > 11 {
+			t.Errorf("Expected ID length 6-11, got %d: %s", len(issue.ID), issue.ID)
 		}
 		if issue.ID[:3] != "bd-" {
 			t.Errorf("Expected ID to start with 'bd-', got: %s", issue.ID)
@@ -4,14 +4,75 @@ import (
 	"context"
 	"crypto/sha256"
 	"database/sql"
-	"encoding/hex"
 	"fmt"
+	"math/big"
 	"strings"
 	"time"

 	"github.com/steveyegge/beads/internal/types"
 )

+// base36Alphabet is the character set for base36 encoding (0-9, a-z)
+const base36Alphabet = "0123456789abcdefghijklmnopqrstuvwxyz"
+
+// encodeBase36 renders data as a lowercase base36 string of exactly length
+// characters.
+//
+// The whole slice is read as one big-endian unsigned integer; the caller
+// chooses how many bytes to pass. Values with fewer digits are left-padded
+// with '0'; values with more digits keep only their least significant ones.
+// A nil or empty slice encodes as all zeros, and a non-positive length
+// returns "".
+//
+// Examples:
+//
+//	encodeBase36([]byte{0xff, 0xff}, 3) // "ekf"    (65535 is "1ekf"; truncated)
+//	encodeBase36([]byte{0xff, 0xff}, 6) // "001ekf" (zero-padded)
+//	encodeBase36(nil, 2)                // "00"
+//	encodeBase36([]byte{1}, 0)          // ""
+func encodeBase36(data []byte, length int) string {
+	if length <= 0 {
+		return ""
+	}
+
+	// Interpret the bytes as a big-endian unsigned integer.
+	num := new(big.Int).SetBytes(data)
+	base := big.NewInt(36)
+	mod := new(big.Int)
+
+	// Emit digits least-significant first, filling the buffer from the right.
+	// Stopping after length digits performs the truncation; once num reaches
+	// zero every further DivMod yields 0, which performs the '0' padding.
+	out := make([]byte, length)
+	for i := length - 1; i >= 0; i-- {
+		num.DivMod(num, base, mod)
+		// mod is always in [0, 35], so the index is in range.
+		out[i] = base36Alphabet[mod.Int64()]
+	}
+
+	return string(out)
+}
+
+// isValidBase36 reports whether s consists solely of lowercase base36
+// characters (0-9, a-z). Uppercase letters are rejected; "" is accepted.
+func isValidBase36(s string) bool {
+	outsideAlphabet := func(r rune) bool {
+		return (r < '0' || r > '9') && (r < 'a' || r > 'z')
+	}
+	// No rune outside the alphabet means the whole string is valid.
+	return strings.IndexFunc(s, outsideAlphabet) == -1
+}
+
+// isValidHex reports whether s consists solely of lowercase hexadecimal
+// characters (0-9, a-f). Uppercase letters are rejected; "" is accepted.
+func isValidHex(s string) bool {
+	outsideAlphabet := func(r rune) bool {
+		return (r < '0' || r > '9') && (r < 'a' || r > 'f')
+	}
+	// No rune outside the alphabet means the whole string is valid.
+	return strings.IndexFunc(s, outsideAlphabet) == -1
+}
+
 // ValidateIssueIDPrefix validates that an issue ID matches the configured prefix
 // Supports both top-level (bd-a3f8e9) and hierarchical (bd-a3f8e9.1) IDs
 func ValidateIssueIDPrefix(id, prefix string) error {
@@ -150,36 +211,39 @@ func EnsureIDs(ctx context.Context, conn *sql.Conn, prefix string, issues []*typ
 }

 // generateHashID creates a hash-based ID for a top-level issue.
-// For child issues, use the parent ID with a numeric suffix (e.g., "bd-a3f8e9.1").
-// Supports adaptive length from 4-8 chars based on database size (bd-ea2a13).
+// For child issues, use the parent ID with a numeric suffix (e.g., "bd-x7k9p.1").
+// Supports adaptive length from 3-8 chars based on database size; other lengths fall back to 3 hash bytes.
 // Includes a nonce parameter to handle same-length collisions.
+// Uses base36 encoding (0-9, a-z) for better information density than hex.
 func generateHashID(prefix, title, description, creator string, timestamp time.Time, length, nonce int) string {
 	// Combine inputs into a stable content string
 	// Include nonce to handle hash collisions
 	content := fmt.Sprintf("%s|%s|%s|%d|%d", title, description, creator, timestamp.UnixNano(), nonce)
-	
+
 	// Hash the content
 	hash := sha256.Sum256([]byte(content))
-	
-	// Use variable length (4-8 hex chars)
-	// length determines how many bytes to use (2, 2.5, 3, 3.5, or 4)
-	var shortHash string
+
+	// Use base36 encoding with variable length (3-8 chars).
+	// Take enough hash bytes to span all 36^length values (ceil(length*log2(36)/8)).
+	var numBytes int
 	switch length {
+	case 3:
+		numBytes = 2 // 2 bytes = 16 bits ≈ 3.09 base36 chars
 	case 4:
-		shortHash = hex.EncodeToString(hash[:2])
+		numBytes = 3 // 3 bytes = 24 bits ≈ 4.64 base36 chars
 	case 5:
-		// 2.5 bytes: use 3 bytes but take only first 5 chars
-		shortHash = hex.EncodeToString(hash[:3])[:5]
+		numBytes = 4 // 4 bytes = 32 bits ≈ 6.18 base36 chars
 	case 6:
-		shortHash = hex.EncodeToString(hash[:3])
+		numBytes = 4 // 4 bytes = 32 bits ≈ 6.18 base36 chars
 	case 7:
-		// 3.5 bytes: use 4 bytes but take only first 7 chars
-		shortHash = hex.EncodeToString(hash[:4])[:7]
+		numBytes = 5 // 5 bytes = 40 bits ≈ 7.73 base36 chars
 	case 8:
-		shortHash = hex.EncodeToString(hash[:4])
+		numBytes = 6 // 6 bytes = 48 bits ≈ 9.28 base36 chars (5 bytes ≈ 7.73 chars cannot fill 8)
 	default:
-		shortHash = hex.EncodeToString(hash[:3]) // default to 6
+		numBytes = 3 // unexpected length: fall back to 3 bytes (24 bits)
 	}
-	
+
+	shortHash := encodeBase36(hash[:numBytes], length)
+
 	return fmt.Sprintf("%s-%s", prefix, shortHash)
 }
@@ -78,11 +78,11 @@ func ResolvePartialID(ctx context.Context, store storage.Storage, input string)
 	}
 	
 	// Extract the hash part for substring matching
-	hashPart := strings.TrimPrefix(normalizedID, prefix)
-	
+	hashPart := strings.TrimPrefix(normalizedID, prefixWithHyphen)
+
 	var matches []string
 	for _, issue := range issues {
-		issueHash := strings.TrimPrefix(issue.ID, prefix)
+		issueHash := strings.TrimPrefix(issue.ID, prefixWithHyphen)
 		// Check if the issue hash contains the input hash as substring
 		if strings.Contains(issueHash, hashPart) {
 			matches = append(matches, issue.ID)