From 6c06f5cb5fcfd420b20e04f1ab4af81b2a9f12a2 Mon Sep 17 00:00:00 2001 From: Steve Yegge Date: Tue, 25 Nov 2025 20:33:35 -0800 Subject: [PATCH] gh2jsonl: add intra-batch collision detection for hash IDs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When using --id-mode hash, the script now tracks generated IDs and retries with increasing nonce (0-9) then increasing length (up to 8) if a collision is detected within the same import batch. This matches the collision handling behavior in the Go implementation (internal/storage/sqlite/ids.go). 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- examples/github-import/gh2jsonl.py | 40 ++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/examples/github-import/gh2jsonl.py b/examples/github-import/gh2jsonl.py index 12d12c68..14082995 100755 --- a/examples/github-import/gh2jsonl.py +++ b/examples/github-import/gh2jsonl.py @@ -140,6 +140,7 @@ class GitHubToBeads: self.hash_length = hash_length # 3-8 chars for hash mode self.issues: List[Dict[str, Any]] = [] self.gh_id_to_bd_id: Dict[int, str] = {} + self.used_ids: set = set() # Track generated IDs for collision detection def fetch_from_api(self, repo: str, token: Optional[str] = None, state: str = "all"): """Fetch issues from GitHub API.""" @@ -321,21 +322,40 @@ class GitHubToBeads: created_at_str = created_at_str[:-1] + '+00:00' created_at = datetime.fromisoformat(created_at_str) - # Generate hash ID - bd_id = generate_hash_id( - prefix=self.prefix, - title=gh_issue["title"], - description=gh_issue.get("body") or "", - creator=creator, - timestamp=created_at, - length=self.hash_length, - nonce=0 - ) + # Generate hash ID with collision detection + # Try increasing nonce, then increasing length (matching Go implementation) + bd_id = None + max_length = 8 + for length in range(self.hash_length, max_length + 1): + for nonce in range(10): + candidate = generate_hash_id( + prefix=self.prefix, + title=gh_issue["title"], + description=gh_issue.get("body") or "", + creator=creator, + timestamp=created_at, + length=length, + nonce=nonce + ) + if candidate not in self.used_ids: + bd_id = candidate + break + if bd_id: + break + + if not bd_id: + raise RuntimeError( + f"Failed to generate unique ID for issue #{gh_id} after trying " + f"lengths {self.hash_length}-{max_length} with 10 nonces each" + ) else: # Sequential mode (existing behavior) bd_id = f"{self.prefix}-{self.issue_counter}" self.issue_counter += 1 + # Track used ID + self.used_ids.add(bd_id) + # Store mapping self.gh_id_to_bd_id[gh_id] = bd_id