gh2jsonl: add intra-batch collision detection for hash IDs

When using --id-mode hash, the script now tracks the IDs it has generated and, if a collision is detected within the same import batch, retries first with an increasing nonce (0-9) and then with an increasing hash length (up to 8). If every combination collides, the import fails with a RuntimeError. This matches the collision handling behavior in the Go implementation (internal/storage/sqlite/ids.go).

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-25 20:33:35 -08:00
parent 384debad0f
commit 6c06f5cb5f
1 changed files with 30 additions and 10 deletions
--- a/examples/github-import/gh2jsonl.py
+++ b/examples/github-import/gh2jsonl.py
@@ -140,6 +140,7 @@ class GitHubToBeads:
        self.hash_length = hash_length  # 3-8 chars for hash mode
        self.issues: List[Dict[str, Any]] = []
        self.gh_id_to_bd_id: Dict[int, str] = {}
        self.used_ids: set = set()  # Track generated IDs for collision detection
    def fetch_from_api(self, repo: str, token: Optional[str] = None, state: str = "all"):
        """Fetch issues from GitHub API."""
@@ -321,21 +322,40 @@ class GitHubToBeads:
                created_at_str = created_at_str[:-1] + '+00:00'
            created_at = datetime.fromisoformat(created_at_str)
-            # Generate hash ID
+            # Generate hash ID with collision detection
-            bd_id = generate_hash_id(
+            # Try increasing nonce, then increasing length (matching Go implementation)
            bd_id = None
            max_length = 8
            for length in range(self.hash_length, max_length + 1):
                for nonce in range(10):
                    candidate = generate_hash_id(
                        prefix=self.prefix,
                        title=gh_issue["title"],
                        description=gh_issue.get("body") or "",
                        creator=creator,
                        timestamp=created_at,
-                length=self.hash_length,
+                        length=length,
-                nonce=0
+                        nonce=nonce
                    )
                    if candidate not in self.used_ids:
                        bd_id = candidate
                        break
                if bd_id:
                    break
            if not bd_id:
                raise RuntimeError(
                    f"Failed to generate unique ID for issue #{gh_id} after trying "
                    f"lengths {self.hash_length}-{max_length} with 10 nonces each"
                )
        else:
            # Sequential mode (existing behavior)
            bd_id = f"{self.prefix}-{self.issue_counter}"
            self.issue_counter += 1
        # Track used ID
        self.used_ids.add(bd_id)
        # Store mapping
        self.gh_id_to_bd_id[gh_id] = bd_id