Merge pull request #383 from deangiberson/main
gh2jsonl: Add hash-based ID generation support
This commit is contained in:
@@ -6,18 +6,29 @@ Supports two input modes:
|
|||||||
1. GitHub API - Fetch issues directly from a repository
|
1. GitHub API - Fetch issues directly from a repository
|
||||||
2. JSON Export - Parse exported GitHub issues JSON
|
2. JSON Export - Parse exported GitHub issues JSON
|
||||||
|
|
||||||
|
ID Modes:
|
||||||
|
1. Sequential - Traditional numeric IDs (bd-1, bd-2, ...)
|
||||||
|
2. Hash - Content-based hash IDs (bd-a3f2dd, bd-7k9p1x, ...)
|
||||||
|
|
||||||
Usage:
|
Usage:
|
||||||
# From GitHub API
|
# From GitHub API (sequential IDs)
|
||||||
export GITHUB_TOKEN=ghp_your_token_here
|
export GITHUB_TOKEN=ghp_your_token_here
|
||||||
python gh2jsonl.py --repo owner/repo | bd import
|
python gh2jsonl.py --repo owner/repo | bd import
|
||||||
|
|
||||||
|
# Hash-based IDs (matches bd create behavior)
|
||||||
|
python gh2jsonl.py --repo owner/repo --id-mode hash | bd import
|
||||||
|
|
||||||
# From exported JSON file
|
# From exported JSON file
|
||||||
python gh2jsonl.py --file issues.json | bd import
|
python gh2jsonl.py --file issues.json | bd import
|
||||||
|
|
||||||
|
# Hash IDs with custom length (3-8 chars)
|
||||||
|
python gh2jsonl.py --repo owner/repo --id-mode hash --hash-length 4 | bd import
|
||||||
|
|
||||||
# Save to file first
|
# Save to file first
|
||||||
python gh2jsonl.py --repo owner/repo > issues.jsonl
|
python gh2jsonl.py --repo owner/repo > issues.jsonl
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import hashlib
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
@@ -29,12 +40,104 @@ from urllib.request import Request, urlopen
|
|||||||
from urllib.error import HTTPError, URLError
|
from urllib.error import HTTPError, URLError
|
||||||
|
|
||||||
|
|
||||||
|
def encode_base36(data: bytes, length: int) -> str:
    """
    Convert bytes to a base36 string of exactly ``length`` characters.

    Matches the Go implementation in internal/storage/sqlite/ids.go:encodeBase36
    Uses lowercase alphanumeric characters (0-9, a-z) for encoding.

    Args:
        data: Raw bytes, interpreted as a single big-endian unsigned integer.
        length: Exact number of characters to return (must be >= 0).

    Returns:
        The base36 digits of ``data``, left-padded with "0" when shorter than
        ``length`` and reduced to the rightmost (least significant) ``length``
        digits when longer.

    Raises:
        ValueError: If ``length`` is negative.
    """
    if length < 0:
        raise ValueError(f"length must be non-negative, got {length}")

    # Base36 alphabet (0-9, a-z)
    alphabet = '0123456789abcdefghijklmnopqrstuvwxyz'

    # Convert bytes to integer (big-endian); empty input yields 0
    num = int.from_bytes(data, byteorder='big')

    # Collect digits least-significant first, then reverse once at the end
    digits = []
    while num > 0:
        num, remainder = divmod(num, 36)
        digits.append(alphabet[remainder])
    result = ''.join(reversed(digits)) or '0'

    # Pad with zeros if needed
    result = result.rjust(length, '0')

    # Truncate to exact length (keep rightmost/least significant digits).
    # Slice from an explicit start index: result[-length:] would return the
    # whole string when length == 0 because -0 == 0.
    return result[len(result) - length:]
|
||||||
|
|
||||||
|
|
||||||
|
def generate_hash_id(
    prefix: str,
    title: str,
    description: str,
    creator: str,
    timestamp: datetime,
    length: int = 6,
    nonce: int = 0
) -> str:
    """
    Generate hash-based ID matching bd's algorithm.

    Matches the Go implementation in internal/storage/sqlite/ids.go:generateHashID

    Args:
        prefix: Issue prefix (e.g., "bd", "myproject")
        title: Issue title
        description: Issue description/body
        creator: Issue creator username
        timestamp: Issue creation timestamp. A naive datetime is interpreted
            as system local time (same convention as ``datetime.timestamp()``).
        length: Hash length in characters (3-8)
        nonce: Nonce for collision handling (default: 0)

    Returns:
        Formatted ID like "bd-a3f2dd" or "myproject-7k9p1x"
    """
    # Local import keeps this block self-contained; the datetime class itself
    # is already in module scope (it is used in the signature above).
    from datetime import timedelta, timezone

    # Convert timestamp to nanoseconds (matching Go's UnixNano()).
    # Use exact integer arithmetic: going through float seconds
    # (timestamp.timestamp() * 1e9) cannot represent present-day epoch
    # nanoseconds exactly and can be off by a few hundred ns whenever the
    # timestamp has a sub-second component, which would change the hash.
    epoch = datetime(1970, 1, 1, tzinfo=timezone.utc)
    elapsed = timestamp.astimezone(timezone.utc) - epoch
    timestamp_nano = (elapsed // timedelta(microseconds=1)) * 1000

    # Combine inputs with pipe delimiter (matching Go format string)
    content = f"{title}|{description}|{creator}|{timestamp_nano}|{nonce}"

    # SHA256 hash
    hash_bytes = hashlib.sha256(content.encode('utf-8')).digest()

    # Determine byte count based on length (from ids.go:258-273)
    num_bytes_map = {
        3: 2,  # 2 bytes = 16 bits ≈ 3.09 base36 chars
        4: 3,  # 3 bytes = 24 bits ≈ 4.63 base36 chars
        5: 4,  # 4 bytes = 32 bits ≈ 6.18 base36 chars
        6: 4,  # 4 bytes = 32 bits ≈ 6.18 base36 chars
        7: 5,  # 5 bytes = 40 bits ≈ 7.73 base36 chars
        8: 5,  # 5 bytes = 40 bits ≈ 7.73 base36 chars
    }
    # Lengths outside the table fall back to 3 bytes
    num_bytes = num_bytes_map.get(length, 3)

    # Encode first num_bytes to base36
    short_hash = encode_base36(hash_bytes[:num_bytes], length)

    return f"{prefix}-{short_hash}"
|
||||||
|
|
||||||
|
|
||||||
class GitHubToBeads:
|
class GitHubToBeads:
|
||||||
"""Convert GitHub Issues to bd JSONL format."""
|
"""Convert GitHub Issues to bd JSONL format."""
|
||||||
|
|
||||||
def __init__(self, prefix: str = "bd", start_id: int = 1):
|
def __init__(
|
||||||
|
self,
|
||||||
|
prefix: str = "bd",
|
||||||
|
start_id: int = 1,
|
||||||
|
id_mode: str = "sequential",
|
||||||
|
hash_length: int = 6
|
||||||
|
):
|
||||||
self.prefix = prefix
|
self.prefix = prefix
|
||||||
self.issue_counter = start_id
|
self.issue_counter = start_id
|
||||||
|
self.id_mode = id_mode # "sequential" or "hash"
|
||||||
|
self.hash_length = hash_length # 3-8 chars for hash mode
|
||||||
self.issues: List[Dict[str, Any]] = []
|
self.issues: List[Dict[str, Any]] = []
|
||||||
self.gh_id_to_bd_id: Dict[int, str] = {}
|
self.gh_id_to_bd_id: Dict[int, str] = {}
|
||||||
|
|
||||||
@@ -202,8 +305,36 @@ class GitHubToBeads:
|
|||||||
def convert_issue(self, gh_issue: Dict[str, Any]) -> Dict[str, Any]:
|
def convert_issue(self, gh_issue: Dict[str, Any]) -> Dict[str, Any]:
|
||||||
"""Convert a single GitHub issue to bd format."""
|
"""Convert a single GitHub issue to bd format."""
|
||||||
gh_id = gh_issue["number"]
|
gh_id = gh_issue["number"]
|
||||||
bd_id = f"{self.prefix}-{self.issue_counter}"
|
|
||||||
self.issue_counter += 1
|
# Generate ID based on mode
|
||||||
|
if self.id_mode == "hash":
|
||||||
|
# Extract creator (use "github-import" as fallback)
|
||||||
|
creator = "github-import"
|
||||||
|
if gh_issue.get("user"):
|
||||||
|
if isinstance(gh_issue["user"], dict):
|
||||||
|
creator = gh_issue["user"].get("login", "github-import")
|
||||||
|
|
||||||
|
# Parse created_at timestamp
|
||||||
|
created_at_str = gh_issue["created_at"]
|
||||||
|
# Handle both ISO format with Z and +00:00
|
||||||
|
if created_at_str.endswith('Z'):
|
||||||
|
created_at_str = created_at_str[:-1] + '+00:00'
|
||||||
|
created_at = datetime.fromisoformat(created_at_str)
|
||||||
|
|
||||||
|
# Generate hash ID
|
||||||
|
bd_id = generate_hash_id(
|
||||||
|
prefix=self.prefix,
|
||||||
|
title=gh_issue["title"],
|
||||||
|
description=gh_issue.get("body") or "",
|
||||||
|
creator=creator,
|
||||||
|
timestamp=created_at,
|
||||||
|
length=self.hash_length,
|
||||||
|
nonce=0
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Sequential mode (existing behavior)
|
||||||
|
bd_id = f"{self.prefix}-{self.issue_counter}"
|
||||||
|
self.issue_counter += 1
|
||||||
|
|
||||||
# Store mapping
|
# Store mapping
|
||||||
self.gh_id_to_bd_id[gh_id] = bd_id
|
self.gh_id_to_bd_id[gh_id] = bd_id
|
||||||
@@ -307,18 +438,24 @@ def main():
|
|||||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||||
epilog="""
|
epilog="""
|
||||||
Examples:
|
Examples:
|
||||||
# From GitHub API
|
# From GitHub API (sequential IDs)
|
||||||
export GITHUB_TOKEN=ghp_...
|
export GITHUB_TOKEN=ghp_...
|
||||||
python gh2jsonl.py --repo owner/repo | bd import
|
python gh2jsonl.py --repo owner/repo | bd import
|
||||||
|
|
||||||
|
# Hash-based IDs (matches bd create behavior)
|
||||||
|
python gh2jsonl.py --repo owner/repo --id-mode hash | bd import
|
||||||
|
|
||||||
# From JSON file
|
# From JSON file
|
||||||
python gh2jsonl.py --file issues.json > issues.jsonl
|
python gh2jsonl.py --file issues.json > issues.jsonl
|
||||||
|
|
||||||
|
# Hash IDs with custom length
|
||||||
|
python gh2jsonl.py --repo owner/repo --id-mode hash --hash-length 4 | bd import
|
||||||
|
|
||||||
# Fetch only open issues
|
# Fetch only open issues
|
||||||
python gh2jsonl.py --repo owner/repo --state open
|
python gh2jsonl.py --repo owner/repo --state open
|
||||||
|
|
||||||
# Custom prefix and start ID
|
# Custom prefix with hash IDs
|
||||||
python gh2jsonl.py --repo owner/repo --prefix myproject --start-id 100
|
python gh2jsonl.py --repo owner/repo --prefix myproject --id-mode hash
|
||||||
"""
|
"""
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -352,6 +489,19 @@ Examples:
|
|||||||
default=1,
|
default=1,
|
||||||
help="Starting issue number (default: 1)"
|
help="Starting issue number (default: 1)"
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--id-mode",
|
||||||
|
choices=["sequential", "hash"],
|
||||||
|
default="sequential",
|
||||||
|
help="ID generation mode: sequential (bd-1, bd-2) or hash (bd-a3f2dd) (default: sequential)"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--hash-length",
|
||||||
|
type=int,
|
||||||
|
default=6,
|
||||||
|
choices=[3, 4, 5, 6, 7, 8],
|
||||||
|
help="Hash ID length in characters when using --id-mode hash (default: 6)"
|
||||||
|
)
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
@@ -363,7 +513,12 @@ Examples:
|
|||||||
parser.error("Cannot use both --repo and --file")
|
parser.error("Cannot use both --repo and --file")
|
||||||
|
|
||||||
# Create converter
|
# Create converter
|
||||||
converter = GitHubToBeads(prefix=args.prefix, start_id=args.start_id)
|
converter = GitHubToBeads(
|
||||||
|
prefix=args.prefix,
|
||||||
|
start_id=args.start_id,
|
||||||
|
id_mode=args.id_mode,
|
||||||
|
hash_length=args.hash_length
|
||||||
|
)
|
||||||
|
|
||||||
# Load issues
|
# Load issues
|
||||||
if args.repo:
|
if args.repo:
|
||||||
|
|||||||
Reference in New Issue
Block a user