Implement adaptive ID length scaling (bd-ea2a13)
- Start with 4-char IDs for small databases (0-500 issues) - Scale to 5-char at 500-1500 issues, 6-char at 1500+ - Configurable via max_collision_prob, min/max_hash_length - Birthday paradox math ensures collision probability stays under threshold - Comprehensive tests and documentation - Collision calculator tool for analysis Also filed bd-aa744b to remove sequential ID code path.
This commit is contained in:
120
scripts/collision-calculator.go
Normal file
120
scripts/collision-calculator.go
Normal file
@@ -0,0 +1,120 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"math"
|
||||
)
|
||||
|
||||
// Birthday paradox: P(collision) ≈ 1 - e^(-n²/2N)
|
||||
// where n = number of items, N = total possible values
|
||||
func collisionProbability(numIssues int, idLength int) float64 {
|
||||
base := 36.0 // lowercase alphanumeric
|
||||
totalPossibilities := math.Pow(base, float64(idLength))
|
||||
exponent := -float64(numIssues*numIssues) / (2.0 * totalPossibilities)
|
||||
return 1.0 - math.Exp(exponent)
|
||||
}
|
||||
|
||||
// Find the expected number of collisions
|
||||
func expectedCollisions(numIssues int, idLength int) float64 {
|
||||
// Expected number of pairs that collide
|
||||
totalPairs := float64(numIssues * (numIssues - 1) / 2)
|
||||
return totalPairs * (1.0 / math.Pow(36, float64(idLength)))
|
||||
}
|
||||
|
||||
// Find optimal ID length for a given database size and max collision probability
|
||||
func optimalIdLength(numIssues int, maxCollisionProb float64) int {
|
||||
for length := 3; length <= 12; length++ {
|
||||
prob := collisionProbability(numIssues, length)
|
||||
if prob <= maxCollisionProb {
|
||||
return length
|
||||
}
|
||||
}
|
||||
return 12 // fallback
|
||||
}
|
||||
|
||||
func main() {
|
||||
fmt.Println("=== Collision Probability Analysis ===")
|
||||
|
||||
dbSizes := []int{50, 100, 200, 500, 1000, 2000, 5000, 10000}
|
||||
idLengths := []int{4, 5, 6, 7, 8}
|
||||
|
||||
// Print table header
|
||||
fmt.Printf("%-10s", "DB Size")
|
||||
for _, length := range idLengths {
|
||||
fmt.Printf("%8d-char", length)
|
||||
}
|
||||
fmt.Println()
|
||||
fmt.Println("----------------------------------------------------------")
|
||||
|
||||
// Print collision probabilities
|
||||
for _, size := range dbSizes {
|
||||
fmt.Printf("%-10d", size)
|
||||
for _, length := range idLengths {
|
||||
prob := collisionProbability(size, length)
|
||||
fmt.Printf("%11.2f%%", prob*100)
|
||||
}
|
||||
fmt.Println()
|
||||
}
|
||||
|
||||
fmt.Println("\n=== Recommended ID Length by Threshold ===")
|
||||
|
||||
thresholds := []float64{0.10, 0.25, 0.50}
|
||||
fmt.Printf("%-10s", "DB Size")
|
||||
for _, threshold := range thresholds {
|
||||
fmt.Printf("%10.0f%%", threshold*100)
|
||||
}
|
||||
fmt.Println()
|
||||
fmt.Println("----------------------------------")
|
||||
|
||||
for _, size := range dbSizes {
|
||||
fmt.Printf("%-10d", size)
|
||||
for _, threshold := range thresholds {
|
||||
optimal := optimalIdLength(size, threshold)
|
||||
fmt.Printf("%10d", optimal)
|
||||
}
|
||||
fmt.Println()
|
||||
}
|
||||
|
||||
fmt.Println("\n=== Expected Number of Collisions ===")
|
||||
fmt.Printf("%-10s", "DB Size")
|
||||
for _, length := range idLengths {
|
||||
fmt.Printf("%10d-char", length)
|
||||
}
|
||||
fmt.Println()
|
||||
fmt.Println("----------------------------------------------------------")
|
||||
|
||||
for _, size := range dbSizes {
|
||||
fmt.Printf("%-10d", size)
|
||||
for _, length := range idLengths {
|
||||
expected := expectedCollisions(size, length)
|
||||
fmt.Printf("%14.2f", expected)
|
||||
}
|
||||
fmt.Println()
|
||||
}
|
||||
|
||||
fmt.Println("\n=== Adaptive Scaling Strategy ===")
|
||||
fmt.Println("Threshold: 25% collision probability")
|
||||
fmt.Printf("%-15s %-12s %-20s\n", "DB Size Range", "ID Length", "Collision Prob")
|
||||
fmt.Println("-------------------------------------------------------")
|
||||
|
||||
ranges := []struct {
|
||||
min, max int
|
||||
}{
|
||||
{0, 50},
|
||||
{51, 150},
|
||||
{151, 500},
|
||||
{501, 1500},
|
||||
{1501, 5000},
|
||||
{5001, 15000},
|
||||
}
|
||||
|
||||
threshold := 0.25
|
||||
for _, r := range ranges {
|
||||
optimal := optimalIdLength(r.max, threshold)
|
||||
prob := collisionProbability(r.max, optimal)
|
||||
fmt.Printf("%-15s %-12d %18.2f%%\n",
|
||||
fmt.Sprintf("%d-%d", r.min, r.max),
|
||||
optimal,
|
||||
prob*100)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user