- Start with 4-char IDs for small databases (0-500 issues)
- Scale to 5-char at 500-1500 issues, 6-char at 1500+
- Configurable via max_collision_prob and min/max_hash_length
- Birthday paradox math ensures collision probability stays under threshold
- Comprehensive tests and documentation
- Collision calculator tool for analysis

Also filed bd-aa744b to remove sequential ID code path.
121 lines
3.1 KiB
Go
121 lines
3.1 KiB
Go
package main
|
|
|
|
import (
|
|
"fmt"
|
|
"math"
|
|
)
|
|
|
|
// collisionProbability estimates the chance that at least two of numIssues
// randomly generated IDs of idLength characters are identical.
//
// It uses the birthday-paradox approximation P ≈ 1 - e^(-n²/2N), where n is
// numIssues and N = 36^idLength is the number of distinct IDs (base 36:
// lowercase alphanumeric characters).
func collisionProbability(numIssues int, idLength int) float64 {
	const base = 36.0 // lowercase alphanumeric

	// Square in floating point: an int product silently wraps around for
	// very large inputs and would produce a result outside [0, 1].
	n := float64(numIssues)
	totalPossibilities := math.Pow(base, float64(idLength))
	exponent := -(n * n) / (2.0 * totalPossibilities)

	// -Expm1(x) is mathematically 1 - e^x, but stays accurate when x is
	// close to zero, where 1 - math.Exp(x) rounds to exactly 0.
	return -math.Expm1(exponent)
}
|
|
|
|
// expectedCollisions returns the expected number of colliding ID pairs among
// numIssues random IDs of idLength characters: the number of distinct pairs,
// n(n-1)/2, multiplied by the 1/36^idLength chance that one given pair
// matches. With fewer than two issues there are no pairs, so the result is 0.
func expectedCollisions(numIssues int, idLength int) float64 {
	if numIssues < 2 {
		return 0
	}

	// Count pairs in floating point: the int product n*(n-1) silently wraps
	// around for very large inputs and would yield a negative count.
	n := float64(numIssues)
	totalPairs := n * (n - 1) / 2

	return totalPairs / math.Pow(36, float64(idLength))
}
|
|
|
|
// Find optimal ID length for a given database size and max collision probability
|
|
func optimalIdLength(numIssues int, maxCollisionProb float64) int {
|
|
for length := 3; length <= 12; length++ {
|
|
prob := collisionProbability(numIssues, length)
|
|
if prob <= maxCollisionProb {
|
|
return length
|
|
}
|
|
}
|
|
return 12 // fallback
|
|
}
|
|
|
|
func main() {
|
|
fmt.Println("=== Collision Probability Analysis ===")
|
|
|
|
dbSizes := []int{50, 100, 200, 500, 1000, 2000, 5000, 10000}
|
|
idLengths := []int{4, 5, 6, 7, 8}
|
|
|
|
// Print table header
|
|
fmt.Printf("%-10s", "DB Size")
|
|
for _, length := range idLengths {
|
|
fmt.Printf("%8d-char", length)
|
|
}
|
|
fmt.Println()
|
|
fmt.Println("----------------------------------------------------------")
|
|
|
|
// Print collision probabilities
|
|
for _, size := range dbSizes {
|
|
fmt.Printf("%-10d", size)
|
|
for _, length := range idLengths {
|
|
prob := collisionProbability(size, length)
|
|
fmt.Printf("%11.2f%%", prob*100)
|
|
}
|
|
fmt.Println()
|
|
}
|
|
|
|
fmt.Println("\n=== Recommended ID Length by Threshold ===")
|
|
|
|
thresholds := []float64{0.10, 0.25, 0.50}
|
|
fmt.Printf("%-10s", "DB Size")
|
|
for _, threshold := range thresholds {
|
|
fmt.Printf("%10.0f%%", threshold*100)
|
|
}
|
|
fmt.Println()
|
|
fmt.Println("----------------------------------")
|
|
|
|
for _, size := range dbSizes {
|
|
fmt.Printf("%-10d", size)
|
|
for _, threshold := range thresholds {
|
|
optimal := optimalIdLength(size, threshold)
|
|
fmt.Printf("%10d", optimal)
|
|
}
|
|
fmt.Println()
|
|
}
|
|
|
|
fmt.Println("\n=== Expected Number of Collisions ===")
|
|
fmt.Printf("%-10s", "DB Size")
|
|
for _, length := range idLengths {
|
|
fmt.Printf("%10d-char", length)
|
|
}
|
|
fmt.Println()
|
|
fmt.Println("----------------------------------------------------------")
|
|
|
|
for _, size := range dbSizes {
|
|
fmt.Printf("%-10d", size)
|
|
for _, length := range idLengths {
|
|
expected := expectedCollisions(size, length)
|
|
fmt.Printf("%14.2f", expected)
|
|
}
|
|
fmt.Println()
|
|
}
|
|
|
|
fmt.Println("\n=== Adaptive Scaling Strategy ===")
|
|
fmt.Println("Threshold: 25% collision probability")
|
|
fmt.Printf("%-15s %-12s %-20s\n", "DB Size Range", "ID Length", "Collision Prob")
|
|
fmt.Println("-------------------------------------------------------")
|
|
|
|
ranges := []struct {
|
|
min, max int
|
|
}{
|
|
{0, 50},
|
|
{51, 150},
|
|
{151, 500},
|
|
{501, 1500},
|
|
{1501, 5000},
|
|
{5001, 15000},
|
|
}
|
|
|
|
threshold := 0.25
|
|
for _, r := range ranges {
|
|
optimal := optimalIdLength(r.max, threshold)
|
|
prob := collisionProbability(r.max, optimal)
|
|
fmt.Printf("%-15s %-12d %18.2f%%\n",
|
|
fmt.Sprintf("%d-%d", r.min, r.max),
|
|
optimal,
|
|
prob*100)
|
|
}
|
|
}
|