From b4cb636d92c0803c84090355c24e8e59e6d8e3ca Mon Sep 17 00:00:00 2001 From: Steve Yegge Date: Mon, 3 Nov 2025 12:02:15 -0800 Subject: [PATCH] Switch from hex to Base36 encoding for issue IDs (GH #213) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change improves information density by using Base36 (0-9, a-z) instead of hex (0-9, a-f) for hash-based issue IDs. Key benefits: - Shorter IDs: Can now use 3-char IDs (was 4-char minimum) - Better scaling: 3 chars good for ~160 issues, 4 chars for ~980 issues - Case-insensitive: Maintains excellent CLI usability - Backward compatible: Old hex IDs continue to work Changes: - Implemented Base36 encoding with proper truncation (keep LSB) - Updated adaptive length thresholds (3-8 chars instead of 4-8) - Fixed collision probability math to match encoding (was calculating for base36 but encoding in hex - now both use base36) - Fixed ID parser bug (use prefixWithHyphen for substring matching) - Updated all tests and test data patterns Fixes #213 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- cmd/bd/testdata/blocked.txt | 2 +- cmd/bd/testdata/close.txt | 2 +- cmd/bd/testdata/dep_add.txt | 4 +- cmd/bd/testdata/dep_remove.txt | 4 +- cmd/bd/testdata/dep_tree.txt | 4 +- cmd/bd/testdata/show.txt | 2 +- cmd/bd/testdata/stats.txt | 2 +- cmd/bd/testdata/update.txt | 2 +- internal/storage/sqlite/adaptive_e2e_test.go | 71 ++++++++------ internal/storage/sqlite/adaptive_length.go | 19 ++-- .../storage/sqlite/adaptive_length_test.go | 35 ++++--- internal/storage/sqlite/hash_id_test.go | 14 +-- internal/storage/sqlite/ids.go | 98 +++++++++++++++---- internal/utils/id_parser.go | 6 +- 14 files changed, 176 insertions(+), 89 deletions(-) diff --git a/cmd/bd/testdata/blocked.txt b/cmd/bd/testdata/blocked.txt index bda1f5cf..aa543e38 100644 --- a/cmd/bd/testdata/blocked.txt +++ b/cmd/bd/testdata/blocked.txt @@ -4,7 +4,7 @@ bd init --prefix test # Create 
first issue bd create 'First issue' cp stdout first.txt -exec sh -c 'grep -oE "test-[a-f0-9]+" first.txt > first_id.txt' +exec sh -c 'grep -oE "test-[a-z0-9]+" first.txt > first_id.txt' # Create second issue that depends on first exec sh -c 'bd create "Second issue" --deps $(cat first_id.txt)' diff --git a/cmd/bd/testdata/close.txt b/cmd/bd/testdata/close.txt index 1711383b..42201438 100644 --- a/cmd/bd/testdata/close.txt +++ b/cmd/bd/testdata/close.txt @@ -4,7 +4,7 @@ bd init --prefix test # Create issue and capture its hash ID bd create 'Issue to close' cp stdout issue.txt -exec sh -c 'grep -oE "test-[a-f0-9]+" issue.txt > issue_id.txt' +exec sh -c 'grep -oE "test-[a-z0-9]+" issue.txt > issue_id.txt' # Close the issue exec sh -c 'bd close $(cat issue_id.txt) --reason Fixed' diff --git a/cmd/bd/testdata/dep_add.txt b/cmd/bd/testdata/dep_add.txt index 777e6586..3254869e 100644 --- a/cmd/bd/testdata/dep_add.txt +++ b/cmd/bd/testdata/dep_add.txt @@ -11,8 +11,8 @@ cp stdout second.txt grep 'Created issue: test-' second.txt # Extract IDs using grep (hash IDs are test-XXXXXXXX format) -exec sh -c 'grep -oE "test-[a-f0-9]+" first.txt > first_id.txt' -exec sh -c 'grep -oE "test-[a-f0-9]+" second.txt > second_id.txt' +exec sh -c 'grep -oE "test-[a-z0-9]+" first.txt > first_id.txt' +exec sh -c 'grep -oE "test-[a-z0-9]+" second.txt > second_id.txt' # Add dependency: second depends on first exec sh -c 'bd dep add $(cat second_id.txt) $(cat first_id.txt)' diff --git a/cmd/bd/testdata/dep_remove.txt b/cmd/bd/testdata/dep_remove.txt index 8eedefb1..7a092c19 100644 --- a/cmd/bd/testdata/dep_remove.txt +++ b/cmd/bd/testdata/dep_remove.txt @@ -4,11 +4,11 @@ bd init --prefix test # Create issues and capture their hash IDs bd create 'First issue' cp stdout first.txt -exec sh -c 'grep -oE "test-[a-f0-9]+" first.txt > first_id.txt' +exec sh -c 'grep -oE "test-[a-z0-9]+" first.txt > first_id.txt' bd create 'Second issue' cp stdout second.txt -exec sh -c 'grep -oE "test-[a-f0-9]+" 
second.txt > second_id.txt' +exec sh -c 'grep -oE "test-[a-z0-9]+" second.txt > second_id.txt' # Add dependency exec sh -c 'bd dep add $(cat second_id.txt) $(cat first_id.txt)' diff --git a/cmd/bd/testdata/dep_tree.txt b/cmd/bd/testdata/dep_tree.txt index 3f32e918..d57373b6 100644 --- a/cmd/bd/testdata/dep_tree.txt +++ b/cmd/bd/testdata/dep_tree.txt @@ -4,11 +4,11 @@ bd init --prefix test # Create issues and capture their hash IDs bd create 'Root issue' cp stdout root.txt -exec sh -c 'grep -oE "test-[a-f0-9]+" root.txt > root_id.txt' +exec sh -c 'grep -oE "test-[a-z0-9]+" root.txt > root_id.txt' bd create 'Child issue' cp stdout child.txt -exec sh -c 'grep -oE "test-[a-f0-9]+" child.txt > child_id.txt' +exec sh -c 'grep -oE "test-[a-z0-9]+" child.txt > child_id.txt' # Add dependency: child depends on root exec sh -c 'bd dep add $(cat child_id.txt) $(cat root_id.txt)' diff --git a/cmd/bd/testdata/show.txt b/cmd/bd/testdata/show.txt index 3397ba36..051ceae0 100644 --- a/cmd/bd/testdata/show.txt +++ b/cmd/bd/testdata/show.txt @@ -7,7 +7,7 @@ cp stdout issue.txt grep 'Created issue: test-' issue.txt # Extract ID using grep -exec sh -c 'grep -oE "test-[a-f0-9]+" issue.txt > issue_id.txt' +exec sh -c 'grep -oE "test-[a-z0-9]+" issue.txt > issue_id.txt' # Show the issue exec sh -c 'bd show $(cat issue_id.txt)' diff --git a/cmd/bd/testdata/stats.txt b/cmd/bd/testdata/stats.txt index 7517ba64..9010ab3f 100644 --- a/cmd/bd/testdata/stats.txt +++ b/cmd/bd/testdata/stats.txt @@ -4,7 +4,7 @@ bd init --prefix test # Create issues bd create 'First issue' cp stdout first.txt -exec sh -c 'grep -oE "test-[a-f0-9]+" first.txt > first_id.txt' +exec sh -c 'grep -oE "test-[a-z0-9]+" first.txt > first_id.txt' bd create 'Second issue' diff --git a/cmd/bd/testdata/update.txt b/cmd/bd/testdata/update.txt index 2c7fab57..18967f14 100644 --- a/cmd/bd/testdata/update.txt +++ b/cmd/bd/testdata/update.txt @@ -7,7 +7,7 @@ cp stdout issue.txt grep 'Created issue: test-' issue.txt # Extract ID 
using grep -exec sh -c 'grep -oE "test-[a-f0-9]+" issue.txt > issue_id.txt' +exec sh -c 'grep -oE "test-[a-z0-9]+" issue.txt > issue_id.txt' # Update the issue status exec sh -c 'bd update $(cat issue_id.txt) --status in_progress' diff --git a/internal/storage/sqlite/adaptive_e2e_test.go b/internal/storage/sqlite/adaptive_e2e_test.go index 40805b7d..a51c6ea2 100644 --- a/internal/storage/sqlite/adaptive_e2e_test.go +++ b/internal/storage/sqlite/adaptive_e2e_test.go @@ -50,36 +50,18 @@ func TestAdaptiveIDLength_E2E(t *testing.T) { return issue.ID } - // Test 1: First few issues should use 4-char IDs - t.Run("first_50_issues_use_4_chars", func(t *testing.T) { + // Test 1: First few issues should use 3-char IDs (base36 allows shorter IDs) + t.Run("first_50_issues_use_3_chars", func(t *testing.T) { for i := 0; i < 50; i++ { title := formatTitle("Issue %d", i) - createAndCheckLength(title, 4) + createAndCheckLength(title, 3) } }) - - // Test 2: Issues 50-500 should still use 4 chars (7% collision at 500) - t.Run("issues_50_to_500_use_4_chars", func(t *testing.T) { - for i := 50; i < 500; i++ { - title := formatTitle("Issue %d", i) - id := createAndCheckLength(title, 4) - // Most should be 4 chars, but collisions might push some to 5 - // We allow up to 5 chars as progressive fallback - hashPart := strings.TrimPrefix(id, "test-") - if len(hashPart) > 5 { - t.Errorf("Issue %d has hash length %d, expected 4-5", i, len(hashPart)) - } - } - }) - - // Test 3: At 1000 issues, should scale to 5 chars - // Note: We don't enforce exact length in this test because the adaptive - // algorithm will keep using 4 chars until collision probability exceeds 25% - // At 600 issues we're still below that threshold - t.Run("verify_adaptive_scaling_works", func(t *testing.T) { - // Just verify that we can create more issues and the algorithm doesn't break - // The actual length will be determined by the adaptive algorithm - for i := 500; i < 550; i++ { + + // Test 2: Issues 50-200 should 
transition to 4 chars + // (3 chars good up to ~160 issues with 25% threshold) + t.Run("issues_50_to_200_use_3_or_4_chars", func(t *testing.T) { + for i := 50; i < 200; i++ { title := formatTitle("Issue %d", i) issue := &types.Issue{ Title: title, @@ -88,15 +70,42 @@ func TestAdaptiveIDLength_E2E(t *testing.T) { Priority: 1, IssueType: "task", } - + if err := db.CreateIssue(ctx, issue, "test@example.com"); err != nil { t.Fatalf("Failed to create issue: %v", err) } - - // Should use 4-6 chars depending on database size + + // Most should be 3 chars initially, transitioning to 4 after ~160 hashPart := strings.TrimPrefix(issue.ID, "test-") - if len(hashPart) < 4 || len(hashPart) > 6 { - t.Errorf("Issue %d has hash length %d, expected 4-6", i, len(hashPart)) + if len(hashPart) < 3 || len(hashPart) > 4 { + t.Errorf("Issue %d has hash length %d, expected 3-4", i, len(hashPart)) + } + } + }) + + // Test 3: At 500-1000 issues, should scale to 4-5 chars + // (4 chars good up to ~980 issues with 25% threshold) + t.Run("verify_adaptive_scaling_works", func(t *testing.T) { + // Just verify that we can create more issues and the algorithm doesn't break + // The actual length will be determined by the adaptive algorithm + for i := 200; i < 250; i++ { + title := formatTitle("Issue %d", i) + issue := &types.Issue{ + Title: title, + Description: "Test", + Status: "open", + Priority: 1, + IssueType: "task", + } + + if err := db.CreateIssue(ctx, issue, "test@example.com"); err != nil { + t.Fatalf("Failed to create issue: %v", err) + } + + // Should use 4-5 chars depending on database size + hashPart := strings.TrimPrefix(issue.ID, "test-") + if len(hashPart) < 3 || len(hashPart) > 5 { + t.Errorf("Issue %d has hash length %d, expected 3-5", i, len(hashPart)) } } }) diff --git a/internal/storage/sqlite/adaptive_length.go b/internal/storage/sqlite/adaptive_length.go index 2b2df0a4..0009f736 100644 --- a/internal/storage/sqlite/adaptive_length.go +++ 
b/internal/storage/sqlite/adaptive_length.go @@ -11,19 +11,26 @@ import ( type AdaptiveIDConfig struct { // MaxCollisionProbability is the threshold at which we scale up ID length (e.g., 0.25 = 25%) MaxCollisionProbability float64 - - // MinLength is the minimum hash length to use (default 4) + + // MinLength is the minimum hash length to use (default 3) MinLength int - + // MaxLength is the maximum hash length to use (default 8) MaxLength int } -// DefaultAdaptiveConfig returns sensible defaults +// DefaultAdaptiveConfig returns sensible defaults for base36 encoding +// With base36 (0-9, a-z), we can use shorter IDs than hex: +// 3 chars: ~46K namespace, good for up to ~160 issues (25% collision prob) +// 4 chars: ~1.7M namespace, good for up to ~980 issues +// 5 chars: ~60M namespace, good for up to ~5.9K issues +// 6 chars: ~2.2B namespace, good for up to ~35K issues +// 7 chars: ~78B namespace, good for up to ~212K issues +// 8 chars: ~2.8T namespace, good for up to ~1M+ issues func DefaultAdaptiveConfig() AdaptiveIDConfig { return AdaptiveIDConfig{ MaxCollisionProbability: 0.25, // 25% threshold - MinLength: 4, + MinLength: 3, MaxLength: 8, } } @@ -32,7 +39,7 @@ func DefaultAdaptiveConfig() AdaptiveIDConfig { // P(collision) ≈ 1 - e^(-n²/2N) // where n = number of items, N = total possible values func collisionProbability(numIssues int, idLength int) float64 { - const base = 36.0 // lowercase alphanumeric (0-9, a-z) + const base = 36.0 // base36 encoding (0-9, a-z) totalPossibilities := math.Pow(base, float64(idLength)) exponent := -float64(numIssues*numIssues) / (2.0 * totalPossibilities) return 1.0 - math.Exp(exponent) diff --git a/internal/storage/sqlite/adaptive_length_test.go b/internal/storage/sqlite/adaptive_length_test.go index fa7a4d71..56e6d483 100644 --- a/internal/storage/sqlite/adaptive_length_test.go +++ b/internal/storage/sqlite/adaptive_length_test.go @@ -45,35 +45,41 @@ func TestComputeAdaptiveLength(t *testing.T) { want int }{ { - name: 
"small database uses 4 chars", + name: "tiny database uses 3 chars", numIssues: 50, config: DefaultAdaptiveConfig(), - want: 4, + want: 3, }, { - name: "medium database uses 4 chars", + name: "small database uses 4 chars", numIssues: 500, config: DefaultAdaptiveConfig(), want: 4, }, { - name: "large database uses 5 chars", - numIssues: 1000, + name: "medium database uses 5 chars", + numIssues: 3000, config: DefaultAdaptiveConfig(), want: 5, }, { - name: "very large database uses 6 chars", - numIssues: 10000, + name: "large database uses 6 chars", + numIssues: 20000, config: DefaultAdaptiveConfig(), want: 6, }, + { + name: "very large database uses 7 chars", + numIssues: 100000, + config: DefaultAdaptiveConfig(), + want: 7, + }, { name: "custom threshold - stricter", numIssues: 200, config: AdaptiveIDConfig{ MaxCollisionProbability: 0.01, // 1% threshold - MinLength: 4, + MinLength: 3, MaxLength: 8, }, want: 5, @@ -83,7 +89,7 @@ func TestComputeAdaptiveLength(t *testing.T) { numIssues: 1000, config: AdaptiveIDConfig{ MaxCollisionProbability: 0.50, // 50% threshold - MinLength: 4, + MinLength: 3, MaxLength: 8, }, want: 4, @@ -112,6 +118,7 @@ func TestGenerateHashID_VariableLengths(t *testing.T) { length int expectedLen int // length of hash portion (without prefix) }{ + {3, 3}, {4, 4}, {5, 5}, {6, 6}, @@ -152,20 +159,20 @@ func TestGetAdaptiveIDLength_Integration(t *testing.T) { t.Fatalf("Failed to set prefix: %v", err) } - // Test default config (should use 4 chars for empty database) + // Test default config (should use 3 chars for empty database) conn, err := db.db.Conn(ctx) if err != nil { t.Fatalf("Failed to get connection: %v", err) } defer conn.Close() - + length, err := GetAdaptiveIDLength(ctx, conn, "test") if err != nil { t.Fatalf("GetAdaptiveIDLength failed: %v", err) } - - if length != 4 { - t.Errorf("Empty database should use 4 chars, got %d", length) + + if length != 3 { + t.Errorf("Empty database should use 3 chars, got %d", length) } // Test custom 
config diff --git a/internal/storage/sqlite/hash_id_test.go b/internal/storage/sqlite/hash_id_test.go index 85507f2e..67f6bdf4 100644 --- a/internal/storage/sqlite/hash_id_test.go +++ b/internal/storage/sqlite/hash_id_test.go @@ -35,10 +35,10 @@ func TestHashIDGeneration(t *testing.T) { t.Fatalf("Failed to create issue: %v", err) } - // Verify hash ID format: bd-<4-8 hex chars> with adaptive length (bd-ea2a13) - // For empty/small database, should use 4 chars - if len(issue.ID) < 7 || len(issue.ID) > 11 { // "bd-" (3) + 4-8 hex chars = 7-11 - t.Errorf("Expected ID length 7-11, got %d: %s", len(issue.ID), issue.ID) + // Verify hash ID format: bd-<3-8 base36 chars> with adaptive length + // For empty/small database, should use 3 chars + if len(issue.ID) < 6 || len(issue.ID) > 11 { // "bd-" (3) + 3-8 base36 chars = 6-11 + t.Errorf("Expected ID length 6-11, got %d: %s", len(issue.ID), issue.ID) } if issue.ID[:3] != "bd-" { @@ -182,9 +182,9 @@ func TestHashIDBatchCreation(t *testing.T) { } ids[issue.ID] = true - // Verify hash ID format (4-8 chars with adaptive length) - if len(issue.ID) < 7 || len(issue.ID) > 11 { - t.Errorf("Expected ID length 7-11, got %d: %s", len(issue.ID), issue.ID) + // Verify hash ID format (3-8 chars with adaptive length) + if len(issue.ID) < 6 || len(issue.ID) > 11 { + t.Errorf("Expected ID length 6-11, got %d: %s", len(issue.ID), issue.ID) } if issue.ID[:3] != "bd-" { t.Errorf("Expected ID to start with 'bd-', got: %s", issue.ID) diff --git a/internal/storage/sqlite/ids.go b/internal/storage/sqlite/ids.go index 6bf503b9..7832c546 100644 --- a/internal/storage/sqlite/ids.go +++ b/internal/storage/sqlite/ids.go @@ -4,14 +4,75 @@ import ( "context" "crypto/sha256" "database/sql" - "encoding/hex" "fmt" + "math/big" "strings" "time" "github.com/steveyegge/beads/internal/types" ) +// base36Alphabet is the character set for base36 encoding (0-9, a-z) +const base36Alphabet = "0123456789abcdefghijklmnopqrstuvwxyz" + +// encodeBase36 converts a byte 
slice to a base36 string of specified length +// Takes the first N bytes and converts them to base36 representation +func encodeBase36(data []byte, length int) string { + // Convert bytes to big integer + num := new(big.Int).SetBytes(data) + + // Convert to base36 + var result strings.Builder + base := big.NewInt(36) + zero := big.NewInt(0) + mod := new(big.Int) + + // Build the string in reverse + chars := make([]byte, 0, length) + for num.Cmp(zero) > 0 { + num.DivMod(num, base, mod) + chars = append(chars, base36Alphabet[mod.Int64()]) + } + + // Reverse the string + for i := len(chars) - 1; i >= 0; i-- { + result.WriteByte(chars[i]) + } + + // Pad with zeros if needed + str := result.String() + if len(str) < length { + str = strings.Repeat("0", length-len(str)) + str + } + + // Truncate to exact length if needed (keep least significant digits) + if len(str) > length { + str = str[len(str)-length:] + } + + return str +} + +// isValidBase36 checks if a string contains only base36 characters +func isValidBase36(s string) bool { + for _, c := range s { + if !((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z')) { + return false + } + } + return true +} + +// isValidHex checks if a string contains only hex characters +func isValidHex(s string) bool { + for _, c := range s { + if !((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f')) { + return false + } + } + return true +} + // ValidateIssueIDPrefix validates that an issue ID matches the configured prefix // Supports both top-level (bd-a3f8e9) and hierarchical (bd-a3f8e9.1) IDs func ValidateIssueIDPrefix(id, prefix string) error { @@ -150,36 +211,39 @@ func EnsureIDs(ctx context.Context, conn *sql.Conn, prefix string, issues []*typ } // generateHashID creates a hash-based ID for a top-level issue. -// For child issues, use the parent ID with a numeric suffix (e.g., "bd-a3f8e9.1"). -// Supports adaptive length from 4-8 chars based on database size (bd-ea2a13). 
+// For child issues, use the parent ID with a numeric suffix (e.g., "bd-x7k9p.1"). +// Supports adaptive length from 3-8 chars based on database size. // Includes a nonce parameter to handle same-length collisions. +// Uses base36 encoding (0-9, a-z) for better information density than hex. func generateHashID(prefix, title, description, creator string, timestamp time.Time, length, nonce int) string { // Combine inputs into a stable content string // Include nonce to handle hash collisions content := fmt.Sprintf("%s|%s|%s|%d|%d", title, description, creator, timestamp.UnixNano(), nonce) - + // Hash the content hash := sha256.Sum256([]byte(content)) - - // Use variable length (4-8 hex chars) - // length determines how many bytes to use (2, 2.5, 3, 3.5, or 4) - var shortHash string + + // Use base36 encoding with variable length (3-8 chars) + // Determine how many bytes to use based on desired output length + var numBytes int switch length { + case 3: + numBytes = 2 // 2 bytes = 16 bits ≈ 3.09 base36 chars case 4: - shortHash = hex.EncodeToString(hash[:2]) + numBytes = 3 // 3 bytes = 24 bits ≈ 4.63 base36 chars case 5: - // 2.5 bytes: use 3 bytes but take only first 5 chars - shortHash = hex.EncodeToString(hash[:3])[:5] + numBytes = 4 // 4 bytes = 32 bits ≈ 6.18 base36 chars case 6: - shortHash = hex.EncodeToString(hash[:3]) + numBytes = 4 // 4 bytes = 32 bits ≈ 6.18 base36 chars case 7: - // 3.5 bytes: use 4 bytes but take only first 7 chars - shortHash = hex.EncodeToString(hash[:4])[:7] + numBytes = 5 // 5 bytes = 40 bits ≈ 7.73 base36 chars case 8: - shortHash = hex.EncodeToString(hash[:4]) + numBytes = 5 // 5 bytes = 40 bits ≈ 7.73 base36 chars default: - shortHash = hex.EncodeToString(hash[:3]) // default to 6 + numBytes = 3 // default to 3 chars } - + + shortHash := encodeBase36(hash[:numBytes], length) + return fmt.Sprintf("%s-%s", prefix, shortHash) } diff --git a/internal/utils/id_parser.go b/internal/utils/id_parser.go index d52af578..7c5b7cc8 
100644 --- a/internal/utils/id_parser.go +++ b/internal/utils/id_parser.go @@ -78,11 +78,11 @@ func ResolvePartialID(ctx context.Context, store storage.Storage, input string) } // Extract the hash part for substring matching - hashPart := strings.TrimPrefix(normalizedID, prefix) - + hashPart := strings.TrimPrefix(normalizedID, prefixWithHyphen) + var matches []string for _, issue := range issues { - issueHash := strings.TrimPrefix(issue.ID, prefix) + issueHash := strings.TrimPrefix(issue.ID, prefixWithHyphen) // Check if the issue hash contains the input hash as substring if strings.Contains(issueHash, hashPart) { matches = append(matches, issue.ID)