gh2jsonl: add intra-batch collision detection for hash IDs

When using --id-mode hash, the script now tracks the IDs it generates.
If a collision is detected within the same import batch, it retries with
an increasing nonce (0-9) and then an increasing hash length (up to 8),
raising a RuntimeError if no unique ID can be found.

This matches the collision-handling behavior of the Go implementation
(internal/storage/sqlite/ids.go).

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Steve Yegge
2025-11-25 20:33:35 -08:00
parent 384debad0f
commit 6c06f5cb5f

View File

@@ -140,6 +140,7 @@ class GitHubToBeads:
self.hash_length = hash_length # 3-8 chars for hash mode self.hash_length = hash_length # 3-8 chars for hash mode
self.issues: List[Dict[str, Any]] = [] self.issues: List[Dict[str, Any]] = []
self.gh_id_to_bd_id: Dict[int, str] = {} self.gh_id_to_bd_id: Dict[int, str] = {}
self.used_ids: set = set() # Track generated IDs for collision detection
def fetch_from_api(self, repo: str, token: Optional[str] = None, state: str = "all"): def fetch_from_api(self, repo: str, token: Optional[str] = None, state: str = "all"):
"""Fetch issues from GitHub API.""" """Fetch issues from GitHub API."""
@@ -321,21 +322,40 @@ class GitHubToBeads:
created_at_str = created_at_str[:-1] + '+00:00' created_at_str = created_at_str[:-1] + '+00:00'
created_at = datetime.fromisoformat(created_at_str) created_at = datetime.fromisoformat(created_at_str)
# Generate hash ID # Generate hash ID with collision detection
bd_id = generate_hash_id( # Try increasing nonce, then increasing length (matching Go implementation)
bd_id = None
max_length = 8
for length in range(self.hash_length, max_length + 1):
for nonce in range(10):
candidate = generate_hash_id(
prefix=self.prefix, prefix=self.prefix,
title=gh_issue["title"], title=gh_issue["title"],
description=gh_issue.get("body") or "", description=gh_issue.get("body") or "",
creator=creator, creator=creator,
timestamp=created_at, timestamp=created_at,
length=self.hash_length, length=length,
nonce=0 nonce=nonce
)
if candidate not in self.used_ids:
bd_id = candidate
break
if bd_id:
break
if not bd_id:
raise RuntimeError(
f"Failed to generate unique ID for issue #{gh_id} after trying "
f"lengths {self.hash_length}-{max_length} with 10 nonces each"
) )
else: else:
# Sequential mode (existing behavior) # Sequential mode (existing behavior)
bd_id = f"{self.prefix}-{self.issue_counter}" bd_id = f"{self.prefix}-{self.issue_counter}"
self.issue_counter += 1 self.issue_counter += 1
# Track used ID
self.used_ids.add(bd_id)
# Store mapping # Store mapping
self.gh_id_to_bd_id[gh_id] = bd_id self.gh_id_to_bd_id[gh_id] = bd_id