Merge pull request #383 from deangiberson/main

gh2jsonl: Add hash-based ID generation support
This commit is contained in:
Steve Yegge
2025-11-25 20:29:38 -08:00
committed by GitHub

View File

@@ -6,18 +6,29 @@ Supports two input modes:
1. GitHub API - Fetch issues directly from a repository 1. GitHub API - Fetch issues directly from a repository
2. JSON Export - Parse exported GitHub issues JSON 2. JSON Export - Parse exported GitHub issues JSON
ID Modes:
1. Sequential - Traditional numeric IDs (bd-1, bd-2, ...)
2. Hash - Content-based hash IDs (bd-a3f2dd, bd-7k9p1x, ...)
Usage: Usage:
# From GitHub API # From GitHub API (sequential IDs)
export GITHUB_TOKEN=ghp_your_token_here export GITHUB_TOKEN=ghp_your_token_here
python gh2jsonl.py --repo owner/repo | bd import python gh2jsonl.py --repo owner/repo | bd import
# Hash-based IDs (matches bd create behavior)
python gh2jsonl.py --repo owner/repo --id-mode hash | bd import
# From exported JSON file # From exported JSON file
python gh2jsonl.py --file issues.json | bd import python gh2jsonl.py --file issues.json | bd import
# Hash IDs with custom length (4-8 chars)
python gh2jsonl.py --repo owner/repo --id-mode hash --hash-length 4 | bd import
# Save to file first # Save to file first
python gh2jsonl.py --repo owner/repo > issues.jsonl python gh2jsonl.py --repo owner/repo > issues.jsonl
""" """
import hashlib
import json import json
import os import os
import re import re
@@ -29,12 +40,104 @@ from urllib.request import Request, urlopen
from urllib.error import HTTPError, URLError from urllib.error import HTTPError, URLError
def encode_base36(data: bytes, length: int) -> str:
"""
Convert bytes to base36 string of specified length.
Matches the Go implementation in internal/storage/sqlite/ids.go:encodeBase36
Uses lowercase alphanumeric characters (0-9, a-z) for encoding.
"""
# Convert bytes to integer (big-endian)
num = int.from_bytes(data, byteorder='big')
# Base36 alphabet (0-9, a-z)
alphabet = '0123456789abcdefghijklmnopqrstuvwxyz'
# Convert to base36
if num == 0:
result = '0'
else:
result = ''
while num > 0:
num, remainder = divmod(num, 36)
result = alphabet[remainder] + result
# Pad with zeros if needed
result = result.zfill(length)
# Truncate to exact length (keep rightmost/least significant digits)
if len(result) > length:
result = result[-length:]
return result
def generate_hash_id(
prefix: str,
title: str,
description: str,
creator: str,
timestamp: datetime,
length: int = 6,
nonce: int = 0
) -> str:
"""
Generate hash-based ID matching bd's algorithm.
Matches the Go implementation in internal/storage/sqlite/ids.go:generateHashID
Args:
prefix: Issue prefix (e.g., "bd", "myproject")
title: Issue title
description: Issue description/body
creator: Issue creator username
timestamp: Issue creation timestamp
length: Hash length in characters (3-8)
nonce: Nonce for collision handling (default: 0)
Returns:
Formatted ID like "bd-a3f2dd" or "myproject-7k9p1x"
"""
# Convert timestamp to nanoseconds (matching Go's UnixNano())
timestamp_nano = int(timestamp.timestamp() * 1_000_000_000)
# Combine inputs with pipe delimiter (matching Go format string)
content = f"{title}|{description}|{creator}|{timestamp_nano}|{nonce}"
# SHA256 hash
hash_bytes = hashlib.sha256(content.encode('utf-8')).digest()
# Determine byte count based on length (from ids.go:258-273)
num_bytes_map = {
3: 2, # 2 bytes = 16 bits ≈ 3.09 base36 chars
4: 3, # 3 bytes = 24 bits ≈ 4.63 base36 chars
5: 4, # 4 bytes = 32 bits ≈ 6.18 base36 chars
6: 4, # 4 bytes = 32 bits ≈ 6.18 base36 chars
7: 5, # 5 bytes = 40 bits ≈ 7.73 base36 chars
8: 5, # 5 bytes = 40 bits ≈ 7.73 base36 chars
}
num_bytes = num_bytes_map.get(length, 3)
# Encode first num_bytes to base36
short_hash = encode_base36(hash_bytes[:num_bytes], length)
return f"{prefix}-{short_hash}"
class GitHubToBeads: class GitHubToBeads:
"""Convert GitHub Issues to bd JSONL format.""" """Convert GitHub Issues to bd JSONL format."""
def __init__(self, prefix: str = "bd", start_id: int = 1): def __init__(
self,
prefix: str = "bd",
start_id: int = 1,
id_mode: str = "sequential",
hash_length: int = 6
):
self.prefix = prefix self.prefix = prefix
self.issue_counter = start_id self.issue_counter = start_id
self.id_mode = id_mode # "sequential" or "hash"
self.hash_length = hash_length # 3-8 chars for hash mode
self.issues: List[Dict[str, Any]] = [] self.issues: List[Dict[str, Any]] = []
self.gh_id_to_bd_id: Dict[int, str] = {} self.gh_id_to_bd_id: Dict[int, str] = {}
@@ -202,8 +305,36 @@ class GitHubToBeads:
def convert_issue(self, gh_issue: Dict[str, Any]) -> Dict[str, Any]: def convert_issue(self, gh_issue: Dict[str, Any]) -> Dict[str, Any]:
"""Convert a single GitHub issue to bd format.""" """Convert a single GitHub issue to bd format."""
gh_id = gh_issue["number"] gh_id = gh_issue["number"]
bd_id = f"{self.prefix}-{self.issue_counter}"
self.issue_counter += 1 # Generate ID based on mode
if self.id_mode == "hash":
# Extract creator (use "github-import" as fallback)
creator = "github-import"
if gh_issue.get("user"):
if isinstance(gh_issue["user"], dict):
creator = gh_issue["user"].get("login", "github-import")
# Parse created_at timestamp
created_at_str = gh_issue["created_at"]
# Handle both ISO format with Z and +00:00
if created_at_str.endswith('Z'):
created_at_str = created_at_str[:-1] + '+00:00'
created_at = datetime.fromisoformat(created_at_str)
# Generate hash ID
bd_id = generate_hash_id(
prefix=self.prefix,
title=gh_issue["title"],
description=gh_issue.get("body") or "",
creator=creator,
timestamp=created_at,
length=self.hash_length,
nonce=0
)
else:
# Sequential mode (existing behavior)
bd_id = f"{self.prefix}-{self.issue_counter}"
self.issue_counter += 1
# Store mapping # Store mapping
self.gh_id_to_bd_id[gh_id] = bd_id self.gh_id_to_bd_id[gh_id] = bd_id
@@ -307,18 +438,24 @@ def main():
formatter_class=argparse.RawDescriptionHelpFormatter, formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=""" epilog="""
Examples: Examples:
# From GitHub API # From GitHub API (sequential IDs)
export GITHUB_TOKEN=ghp_... export GITHUB_TOKEN=ghp_...
python gh2jsonl.py --repo owner/repo | bd import python gh2jsonl.py --repo owner/repo | bd import
# Hash-based IDs (matches bd create behavior)
python gh2jsonl.py --repo owner/repo --id-mode hash | bd import
# From JSON file # From JSON file
python gh2jsonl.py --file issues.json > issues.jsonl python gh2jsonl.py --file issues.json > issues.jsonl
# Hash IDs with custom length
python gh2jsonl.py --repo owner/repo --id-mode hash --hash-length 4 | bd import
# Fetch only open issues # Fetch only open issues
python gh2jsonl.py --repo owner/repo --state open python gh2jsonl.py --repo owner/repo --state open
# Custom prefix and start ID # Custom prefix with hash IDs
python gh2jsonl.py --repo owner/repo --prefix myproject --start-id 100 python gh2jsonl.py --repo owner/repo --prefix myproject --id-mode hash
""" """
) )
@@ -352,6 +489,19 @@ Examples:
default=1, default=1,
help="Starting issue number (default: 1)" help="Starting issue number (default: 1)"
) )
parser.add_argument(
"--id-mode",
choices=["sequential", "hash"],
default="sequential",
help="ID generation mode: sequential (bd-1, bd-2) or hash (bd-a3f2dd) (default: sequential)"
)
parser.add_argument(
"--hash-length",
type=int,
default=6,
choices=[3, 4, 5, 6, 7, 8],
help="Hash ID length in characters when using --id-mode hash (default: 6)"
)
args = parser.parse_args() args = parser.parse_args()
@@ -363,7 +513,12 @@ Examples:
parser.error("Cannot use both --repo and --file") parser.error("Cannot use both --repo and --file")
# Create converter # Create converter
converter = GitHubToBeads(prefix=args.prefix, start_id=args.start_id) converter = GitHubToBeads(
prefix=args.prefix,
start_id=args.start_id,
id_mode=args.id_mode,
hash_length=args.hash_length
)
# Load issues # Load issues
if args.repo: if args.repo: