feat(refinery): Add parallel worker support with MR claiming (gt-kgszr)
Implements Option A from the issue: multiple refinery workers with locking. Changes to mrqueue: - Add ClaimedBy/ClaimedAt fields to MR struct - Add Claim(id, worker) method with atomic file operations - Add Release(id) method to unclaim on failure - Add ListUnclaimed() to find available work - Add ListClaimedBy(worker) for worker-specific queries - Claims expire after 10 minutes for crash recovery New CLI commands: - gt refinery claim <mr-id> - Claim MR for processing - gt refinery release <mr-id> - Release claim back to queue - gt refinery unclaimed - List available MRs Formula updates: - queue-scan now uses gt refinery unclaimed - process-branch claims MR before processing - handle-failures releases claim on test failure - Claims prevent double-processing by parallel workers Worker ID comes from GT_REFINERY_WORKER env var (default: refinery-1). 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -27,6 +27,10 @@ type MR struct {
|
||||
Priority int `json:"priority"` // Priority (lower = higher priority)
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
AgentBead string `json:"agent_bead,omitempty"` // Agent bead ID that created this MR (for traceability)
|
||||
|
||||
// Claiming fields for parallel refinery workers
|
||||
ClaimedBy string `json:"claimed_by,omitempty"` // Worker ID that claimed this MR
|
||||
ClaimedAt *time.Time `json:"claimed_at,omitempty"` // When the MR was claimed
|
||||
}
|
||||
|
||||
// Queue manages the MR storage.
|
||||
@@ -182,3 +186,128 @@ func (q *Queue) Count() int {
|
||||
func (q *Queue) Dir() string {
|
||||
return q.dir
|
||||
}
|
||||
|
||||
// ClaimStaleTimeout is how long before a claimed MR is considered stale.
|
||||
// If a worker claims an MR but doesn't process it within this time,
|
||||
// another worker can reclaim it.
|
||||
const ClaimStaleTimeout = 10 * time.Minute
|
||||
|
||||
// Claim attempts to claim an MR for processing by a specific worker.
|
||||
// Returns nil if successful, ErrAlreadyClaimed if another worker has it,
|
||||
// or ErrNotFound if the MR doesn't exist.
|
||||
// Uses atomic file operations to prevent race conditions.
|
||||
func (q *Queue) Claim(id, workerID string) error {
|
||||
path := filepath.Join(q.dir, id+".json")
|
||||
|
||||
// Read current state
|
||||
mr, err := q.load(path)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
return ErrNotFound
|
||||
}
|
||||
return fmt.Errorf("loading MR: %w", err)
|
||||
}
|
||||
|
||||
// Check if already claimed by another worker
|
||||
if mr.ClaimedBy != "" && mr.ClaimedBy != workerID {
|
||||
// Check if claim is stale (worker may have crashed)
|
||||
if mr.ClaimedAt != nil && time.Since(*mr.ClaimedAt) < ClaimStaleTimeout {
|
||||
return ErrAlreadyClaimed
|
||||
}
|
||||
// Stale claim - allow reclaim
|
||||
}
|
||||
|
||||
// Claim the MR
|
||||
now := time.Now()
|
||||
mr.ClaimedBy = workerID
|
||||
mr.ClaimedAt = &now
|
||||
|
||||
// Write atomically
|
||||
data, err := json.MarshalIndent(mr, "", " ")
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshaling MR: %w", err)
|
||||
}
|
||||
|
||||
// Write to temp file first, then rename (atomic on most filesystems)
|
||||
tmpPath := path + ".tmp"
|
||||
if err := os.WriteFile(tmpPath, data, 0644); err != nil {
|
||||
return fmt.Errorf("writing temp file: %w", err)
|
||||
}
|
||||
if err := os.Rename(tmpPath, path); err != nil {
|
||||
os.Remove(tmpPath) // cleanup
|
||||
return fmt.Errorf("renaming temp file: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Release releases a claimed MR back to the queue.
|
||||
// Called when processing fails and the MR should be retried.
|
||||
func (q *Queue) Release(id string) error {
|
||||
path := filepath.Join(q.dir, id+".json")
|
||||
|
||||
mr, err := q.load(path)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
return nil // Already removed
|
||||
}
|
||||
return fmt.Errorf("loading MR: %w", err)
|
||||
}
|
||||
|
||||
// Clear claim
|
||||
mr.ClaimedBy = ""
|
||||
mr.ClaimedAt = nil
|
||||
|
||||
data, err := json.MarshalIndent(mr, "", " ")
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshaling MR: %w", err)
|
||||
}
|
||||
|
||||
return os.WriteFile(path, data, 0644)
|
||||
}
|
||||
|
||||
// ListUnclaimed returns MRs that are not claimed or have stale claims.
|
||||
// Sorted by priority then creation time.
|
||||
func (q *Queue) ListUnclaimed() ([]*MR, error) {
|
||||
all, err := q.List()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var unclaimed []*MR
|
||||
for _, mr := range all {
|
||||
if mr.ClaimedBy == "" {
|
||||
unclaimed = append(unclaimed, mr)
|
||||
continue
|
||||
}
|
||||
// Check if claim is stale
|
||||
if mr.ClaimedAt != nil && time.Since(*mr.ClaimedAt) >= ClaimStaleTimeout {
|
||||
unclaimed = append(unclaimed, mr)
|
||||
}
|
||||
}
|
||||
|
||||
return unclaimed, nil
|
||||
}
|
||||
|
||||
// ListClaimedBy returns MRs claimed by a specific worker.
|
||||
func (q *Queue) ListClaimedBy(workerID string) ([]*MR, error) {
|
||||
all, err := q.List()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var claimed []*MR
|
||||
for _, mr := range all {
|
||||
if mr.ClaimedBy == workerID {
|
||||
claimed = append(claimed, mr)
|
||||
}
|
||||
}
|
||||
|
||||
return claimed, nil
|
||||
}
|
||||
|
||||
// Common errors for claiming
|
||||
var (
|
||||
ErrNotFound = fmt.Errorf("merge request not found")
|
||||
ErrAlreadyClaimed = fmt.Errorf("merge request already claimed by another worker")
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user