feat(refinery): Add parallel worker support with MR claiming (gt-kgszr)

Implements Option A from the issue: multiple refinery workers with locking.

Changes to mrqueue:
- Add ClaimedBy/ClaimedAt fields to MR struct
- Add Claim(id, worker) method with atomic file operations
- Add Release(id) method to unclaim on failure
- Add ListUnclaimed() to find available work
- Add ListClaimedBy(worker) for worker-specific queries
- Claims expire after 10 minutes for crash recovery

New CLI commands:
- gt refinery claim <mr-id> - Claim MR for processing
- gt refinery release <mr-id> - Release claim back to queue
- gt refinery unclaimed - List available MRs

Formula updates:
- queue-scan now uses gt refinery unclaimed
- process-branch claims MR before processing
- handle-failures releases claim on test failure
- Claims prevent double-processing by parallel workers

Worker ID comes from GT_REFINERY_WORKER env var (default: refinery-1).

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
toast
2026-01-01 18:46:29 -08:00
committed by Steve Yegge
parent e159489edb
commit 3ecaf9d6fe
3 changed files with 307 additions and 5 deletions

View File

@@ -27,6 +27,10 @@ type MR struct {
Priority int `json:"priority"` // Priority (lower = higher priority)
CreatedAt time.Time `json:"created_at"`
AgentBead string `json:"agent_bead,omitempty"` // Agent bead ID that created this MR (for traceability)
// Claiming fields for parallel refinery workers
ClaimedBy string `json:"claimed_by,omitempty"` // Worker ID that claimed this MR
ClaimedAt *time.Time `json:"claimed_at,omitempty"` // When the MR was claimed
}
// Queue manages the MR storage.
@@ -182,3 +186,128 @@ func (q *Queue) Count() int {
func (q *Queue) Dir() string {
return q.dir
}
// ClaimStaleTimeout is how long before a claimed MR is considered stale.
// If a worker claims an MR but doesn't process it within this time,
// another worker can reclaim it.
const ClaimStaleTimeout = 10 * time.Minute
// Claim attempts to claim an MR for processing by a specific worker.
// Returns nil if successful, ErrAlreadyClaimed if another worker has it,
// or ErrNotFound if the MR doesn't exist.
// Uses atomic file operations to prevent race conditions.
func (q *Queue) Claim(id, workerID string) error {
path := filepath.Join(q.dir, id+".json")
// Read current state
mr, err := q.load(path)
if err != nil {
if os.IsNotExist(err) {
return ErrNotFound
}
return fmt.Errorf("loading MR: %w", err)
}
// Check if already claimed by another worker
if mr.ClaimedBy != "" && mr.ClaimedBy != workerID {
// Check if claim is stale (worker may have crashed)
if mr.ClaimedAt != nil && time.Since(*mr.ClaimedAt) < ClaimStaleTimeout {
return ErrAlreadyClaimed
}
// Stale claim - allow reclaim
}
// Claim the MR
now := time.Now()
mr.ClaimedBy = workerID
mr.ClaimedAt = &now
// Write atomically
data, err := json.MarshalIndent(mr, "", " ")
if err != nil {
return fmt.Errorf("marshaling MR: %w", err)
}
// Write to temp file first, then rename (atomic on most filesystems)
tmpPath := path + ".tmp"
if err := os.WriteFile(tmpPath, data, 0644); err != nil {
return fmt.Errorf("writing temp file: %w", err)
}
if err := os.Rename(tmpPath, path); err != nil {
os.Remove(tmpPath) // cleanup
return fmt.Errorf("renaming temp file: %w", err)
}
return nil
}
// Release releases a claimed MR back to the queue.
// Called when processing fails and the MR should be retried.
func (q *Queue) Release(id string) error {
path := filepath.Join(q.dir, id+".json")
mr, err := q.load(path)
if err != nil {
if os.IsNotExist(err) {
return nil // Already removed
}
return fmt.Errorf("loading MR: %w", err)
}
// Clear claim
mr.ClaimedBy = ""
mr.ClaimedAt = nil
data, err := json.MarshalIndent(mr, "", " ")
if err != nil {
return fmt.Errorf("marshaling MR: %w", err)
}
return os.WriteFile(path, data, 0644)
}
// ListUnclaimed returns MRs that are not claimed or have stale claims.
// Sorted by priority then creation time.
func (q *Queue) ListUnclaimed() ([]*MR, error) {
all, err := q.List()
if err != nil {
return nil, err
}
var unclaimed []*MR
for _, mr := range all {
if mr.ClaimedBy == "" {
unclaimed = append(unclaimed, mr)
continue
}
// Check if claim is stale
if mr.ClaimedAt != nil && time.Since(*mr.ClaimedAt) >= ClaimStaleTimeout {
unclaimed = append(unclaimed, mr)
}
}
return unclaimed, nil
}
// ListClaimedBy returns MRs claimed by a specific worker.
func (q *Queue) ListClaimedBy(workerID string) ([]*MR, error) {
all, err := q.List()
if err != nil {
return nil, err
}
var claimed []*MR
for _, mr := range all {
if mr.ClaimedBy == workerID {
claimed = append(claimed, mr)
}
}
return claimed, nil
}
// Common errors for claiming
var (
ErrNotFound = fmt.Errorf("merge request not found")
ErrAlreadyClaimed = fmt.Errorf("merge request already claimed by another worker")
)