gh2jsonl: add intra-batch collision detection for hash IDs
When using --id-mode hash, the script now tracks the IDs it has generated and, if a collision is detected within the same import batch, retries with nonces 0-9 at the current hash length and then repeats at each longer length (up to 8). This matches the collision handling behavior in the Go implementation (internal/storage/sqlite/ids.go).

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -140,6 +140,7 @@ class GitHubToBeads:
|
|||||||
self.hash_length = hash_length # 3-8 chars for hash mode
|
self.hash_length = hash_length # 3-8 chars for hash mode
|
||||||
self.issues: List[Dict[str, Any]] = []
|
self.issues: List[Dict[str, Any]] = []
|
||||||
self.gh_id_to_bd_id: Dict[int, str] = {}
|
self.gh_id_to_bd_id: Dict[int, str] = {}
|
||||||
|
self.used_ids: set = set() # Track generated IDs for collision detection
|
||||||
|
|
||||||
def fetch_from_api(self, repo: str, token: Optional[str] = None, state: str = "all"):
|
def fetch_from_api(self, repo: str, token: Optional[str] = None, state: str = "all"):
|
||||||
"""Fetch issues from GitHub API."""
|
"""Fetch issues from GitHub API."""
|
||||||
@@ -321,21 +322,40 @@ class GitHubToBeads:
|
|||||||
created_at_str = created_at_str[:-1] + '+00:00'
|
created_at_str = created_at_str[:-1] + '+00:00'
|
||||||
created_at = datetime.fromisoformat(created_at_str)
|
created_at = datetime.fromisoformat(created_at_str)
|
||||||
|
|
||||||
# Generate hash ID
|
# Generate hash ID with collision detection
|
||||||
bd_id = generate_hash_id(
|
# Try increasing nonce, then increasing length (matching Go implementation)
|
||||||
|
bd_id = None
|
||||||
|
max_length = 8
|
||||||
|
for length in range(self.hash_length, max_length + 1):
|
||||||
|
for nonce in range(10):
|
||||||
|
candidate = generate_hash_id(
|
||||||
prefix=self.prefix,
|
prefix=self.prefix,
|
||||||
title=gh_issue["title"],
|
title=gh_issue["title"],
|
||||||
description=gh_issue.get("body") or "",
|
description=gh_issue.get("body") or "",
|
||||||
creator=creator,
|
creator=creator,
|
||||||
timestamp=created_at,
|
timestamp=created_at,
|
||||||
length=self.hash_length,
|
length=length,
|
||||||
nonce=0
|
nonce=nonce
|
||||||
|
)
|
||||||
|
if candidate not in self.used_ids:
|
||||||
|
bd_id = candidate
|
||||||
|
break
|
||||||
|
if bd_id:
|
||||||
|
break
|
||||||
|
|
||||||
|
if not bd_id:
|
||||||
|
raise RuntimeError(
|
||||||
|
f"Failed to generate unique ID for issue #{gh_id} after trying "
|
||||||
|
f"lengths {self.hash_length}-{max_length} with 10 nonces each"
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
# Sequential mode (existing behavior)
|
# Sequential mode (existing behavior)
|
||||||
bd_id = f"{self.prefix}-{self.issue_counter}"
|
bd_id = f"{self.prefix}-{self.issue_counter}"
|
||||||
self.issue_counter += 1
|
self.issue_counter += 1
|
||||||
|
|
||||||
|
# Track used ID
|
||||||
|
self.used_ids.add(bd_id)
|
||||||
|
|
||||||
# Store mapping
|
# Store mapping
|
||||||
self.gh_id_to_bd_id[gh_id] = bd_id
|
self.gh_id_to_bd_id[gh_id] = bd_id
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user