gh2jsonl: add intra-batch collision detection for hash IDs

When using --id-mode hash, the script now tracks the IDs it generates.
If a candidate ID collides with one already generated in the same import
batch, it retries with an increasing nonce (0-9) and then with an
increasing hash length (up to 8), raising an error if every attempt
collides.

This matches the collision-handling behavior of the Go implementation
(internal/storage/sqlite/ids.go).

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Steve Yegge
2025-11-25 20:33:35 -08:00
parent 384debad0f
commit 6c06f5cb5f

View File

@@ -140,6 +140,7 @@ class GitHubToBeads:
self.hash_length = hash_length # 3-8 chars for hash mode
self.issues: List[Dict[str, Any]] = []
self.gh_id_to_bd_id: Dict[int, str] = {}
self.used_ids: set = set() # Track generated IDs for collision detection
def fetch_from_api(self, repo: str, token: Optional[str] = None, state: str = "all"):
"""Fetch issues from GitHub API."""
@@ -321,21 +322,40 @@ class GitHubToBeads:
created_at_str = created_at_str[:-1] + '+00:00'
created_at = datetime.fromisoformat(created_at_str)
# Generate hash ID
bd_id = generate_hash_id(
prefix=self.prefix,
title=gh_issue["title"],
description=gh_issue.get("body") or "",
creator=creator,
timestamp=created_at,
length=self.hash_length,
nonce=0
)
# Generate hash ID with collision detection
# Try increasing nonce, then increasing length (matching Go implementation)
bd_id = None
max_length = 8
for length in range(self.hash_length, max_length + 1):
for nonce in range(10):
candidate = generate_hash_id(
prefix=self.prefix,
title=gh_issue["title"],
description=gh_issue.get("body") or "",
creator=creator,
timestamp=created_at,
length=length,
nonce=nonce
)
if candidate not in self.used_ids:
bd_id = candidate
break
if bd_id:
break
if not bd_id:
raise RuntimeError(
f"Failed to generate unique ID for issue #{gh_id} after trying "
f"lengths {self.hash_length}-{max_length} with 10 nonces each"
)
else:
# Sequential mode (existing behavior)
bd_id = f"{self.prefix}-{self.issue_counter}"
self.issue_counter += 1
# Track used ID
self.used_ids.add(bd_id)
# Store mapping
self.gh_id_to_bd_id[gh_id] = bd_id