Files
beads/scripts/collision-calculator.go
Steve Yegge 76d3403d0a Implement adaptive ID length scaling (bd-ea2a13)
- Start with 4-char IDs for small databases (0-500 issues)
- Scale to 5-char at 500-1500 issues, 6-char at 1500+
- Configurable via max_collision_prob, min/max_hash_length
- Birthday paradox math ensures collision probability stays under threshold
- Comprehensive tests and documentation
- Collision calculator tool for analysis

Also filed bd-aa744b to remove sequential ID code path.
2025-10-30 21:40:52 -07:00

121 lines
3.1 KiB
Go

package main
import (
"fmt"
"math"
)
// collisionProbability estimates the chance that at least two of numIssues
// randomly generated IDs of idLength base-36 characters are identical.
//
// It uses the birthday-paradox approximation
//
//	P(collision) ≈ 1 - e^(-n²/2N)
//
// where n is the number of items and N = 36^idLength is the total number of
// possible values.
func collisionProbability(numIssues int, idLength int) float64 {
	const base = 36.0 // lowercase alphanumeric

	totalPossibilities := math.Pow(base, float64(idLength))

	// Square in float64: the integer product numIssues*numIssues wraps around
	// for large counts and would produce a negative "probability".
	n := float64(numIssues)
	exponent := -(n * n) / (2.0 * totalPossibilities)

	// -Expm1(x) equals 1 - e^x but stays accurate when x is close to zero,
	// where 1 - Exp(x) would round to exactly 0.
	return -math.Expm1(exponent)
}
// expectedCollisions returns the expected number of colliding ID pairs among
// numIssues random IDs of idLength base-36 characters: the number of distinct
// pairs, n(n-1)/2, multiplied by the 1/36^idLength chance that one given pair
// matches. It returns 0 when numIssues is less than 2, since no pair exists.
func expectedCollisions(numIssues int, idLength int) float64 {
	if numIssues < 2 {
		return 0
	}

	// Count pairs in float64: the integer product numIssues*(numIssues-1)
	// wraps around for large counts and would yield a negative expectation.
	n := float64(numIssues)
	totalPairs := n * (n - 1) / 2

	return totalPairs * (1.0 / math.Pow(36, float64(idLength)))
}
// optimalIdLength returns the shortest ID length, searching from 3 up to 12
// characters, whose collision probability for numIssues issues does not
// exceed maxCollisionProb. When no length in that range qualifies, it falls
// back to 12.
func optimalIdLength(numIssues int, maxCollisionProb float64) int {
	const (
		shortest = 3
		longest  = 12
	)

	candidate := shortest
	// Written as !(p <= max) rather than p > max so that a NaN threshold keeps
	// advancing to the fallback length.
	for candidate < longest && !(collisionProbability(numIssues, candidate) <= maxCollisionProb) {
		candidate++
	}
	return candidate
}
// main prints four plain-text tables to standard output:
//
//  1. collision probability for each database size and ID length,
//  2. the recommended ID length for each database size at several
//     collision-probability thresholds,
//  3. the expected number of collisions for each database size and ID length,
//  4. the ID length chosen at a 25% threshold for a series of database size
//     ranges, evaluated at the upper bound of each range.
func main() {
	dbSizes := []int{50, 100, 200, 500, 1000, 2000, 5000, 10000}
	idLengths := []int{4, 5, 6, 7, 8}

	fmt.Println("=== Collision Probability Analysis ===")
	// Header cells are 12 characters wide ("%7d" plus "-char") so they line up
	// with the 12-character data cells ("%11.2f" plus "%") below.
	fmt.Printf("%-10s", "DB Size")
	for _, length := range idLengths {
		fmt.Printf("%7d-char", length)
	}
	fmt.Println()
	fmt.Println("----------------------------------------------------------")
	for _, size := range dbSizes {
		fmt.Printf("%-10d", size)
		for _, length := range idLengths {
			prob := collisionProbability(size, length)
			fmt.Printf("%11.2f%%", prob*100)
		}
		fmt.Println()
	}

	fmt.Println("\n=== Recommended ID Length by Threshold ===")
	thresholds := []float64{0.10, 0.25, 0.50}
	// Header cells are 10 characters wide ("%9.0f" plus "%") to match the
	// "%10d" data cells.
	fmt.Printf("%-10s", "DB Size")
	for _, threshold := range thresholds {
		fmt.Printf("%9.0f%%", threshold*100)
	}
	fmt.Println()
	fmt.Println("----------------------------------")
	for _, size := range dbSizes {
		fmt.Printf("%-10d", size)
		for _, threshold := range thresholds {
			optimal := optimalIdLength(size, threshold)
			fmt.Printf("%10d", optimal)
		}
		fmt.Println()
	}

	fmt.Println("\n=== Expected Number of Collisions ===")
	// Header cells are 14 characters wide ("%9d" plus "-char") to match the
	// "%14.2f" data cells.
	fmt.Printf("%-10s", "DB Size")
	for _, length := range idLengths {
		fmt.Printf("%9d-char", length)
	}
	fmt.Println()
	fmt.Println("----------------------------------------------------------")
	for _, size := range dbSizes {
		fmt.Printf("%-10d", size)
		for _, length := range idLengths {
			expected := expectedCollisions(size, length)
			fmt.Printf("%14.2f", expected)
		}
		fmt.Println()
	}

	fmt.Println("\n=== Adaptive Scaling Strategy ===")
	fmt.Println("Threshold: 25% collision probability")
	// The last header cell is right-aligned in 19 characters so it shares a
	// right edge with the "%18.2f" plus "%" data cell.
	fmt.Printf("%-15s %-12s %19s\n", "DB Size Range", "ID Length", "Collision Prob")
	fmt.Println("-------------------------------------------------------")
	ranges := []struct {
		min, max int
	}{
		{0, 50},
		{51, 150},
		{151, 500},
		{501, 1500},
		{1501, 5000},
		{5001, 15000},
	}
	threshold := 0.25
	for _, r := range ranges {
		// Size each range for its upper bound, the largest database it covers.
		optimal := optimalIdLength(r.max, threshold)
		prob := collisionProbability(r.max, optimal)
		fmt.Printf("%-15s %-12d %18.2f%%\n",
			fmt.Sprintf("%d-%d", r.min, r.max),
			optimal,
			prob*100)
	}
}