Add identity collision prevention, detection, and correction (gt-xp2s)
- internal/lock: New package with PID-based lockfiles for worker identity
- gt prime: Acquire identity lock for crew/polecat roles, fail on collision
- gt agents check: Detect stale locks and identity collisions
- gt agents fix: Clean up stale locks
- gt doctor: New identity-collision check with --fix support

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -1,14 +1,19 @@
|
||||
package cmd
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"github.com/spf13/cobra"
|
||||
"github.com/steveyegge/gastown/internal/lock"
|
||||
"github.com/steveyegge/gastown/internal/style"
|
||||
"github.com/steveyegge/gastown/internal/tmux"
|
||||
"github.com/steveyegge/gastown/internal/workspace"
|
||||
)
|
||||
|
||||
// AgentType represents the type of Gas Town agent.
|
||||
@@ -71,11 +76,49 @@ var agentsListCmd = &cobra.Command{
|
||||
RunE: runAgentsList,
|
||||
}
|
||||
|
||||
var agentsAllFlag bool
|
||||
// agentsCheckCmd is the "check" subcommand. It is handled by runAgentsCheck
// and reports identity collisions and stale locks without modifying anything.
var agentsCheckCmd = &cobra.Command{
|
||||
Use: "check",
|
||||
Short: "Check for identity collisions and stale locks",
|
||||
Long: `Check for identity collisions and stale locks.
|
||||
|
||||
This command helps detect situations where multiple Claude processes
|
||||
think they own the same worker identity.
|
||||
|
||||
Output shows:
|
||||
- Active tmux sessions with gt- prefix
|
||||
- Identity locks in worker directories
|
||||
- Collisions (multiple agents claiming same identity)
|
||||
- Stale locks (dead PIDs)`,
|
||||
RunE: runAgentsCheck,
|
||||
}
|
||||
|
||||
// agentsFixCmd is the "fix" subcommand. It is handled by runAgentsFix, which
// removes stale locks and then reports any issues that still need a human.
var agentsFixCmd = &cobra.Command{
|
||||
Use: "fix",
|
||||
Short: "Fix identity collisions and clean up stale locks",
|
||||
Long: `Clean up identity collisions and stale locks.
|
||||
|
||||
This command:
|
||||
1. Removes stale locks (where the PID is dead)
|
||||
2. Reports collisions that need manual intervention
|
||||
|
||||
For collisions with live processes, you must manually:
|
||||
- Kill the duplicate session, OR
|
||||
- Decide which agent should own the identity`,
|
||||
RunE: runAgentsFix,
|
||||
}
|
||||
|
||||
// Flag storage for the agents command tree; bound to flags in init.
var (
|
||||
// agentsAllFlag backs the persistent --all/-a flag ("Include polecats in the menu").
agentsAllFlag bool
|
||||
// agentsCheckJSON backs the --json flag of "agents check"; when set,
// runAgentsCheck emits the report as JSON instead of text.
agentsCheckJSON bool
|
||||
)
|
||||
|
||||
func init() {
|
||||
agentsCmd.PersistentFlags().BoolVarP(&agentsAllFlag, "all", "a", false, "Include polecats in the menu")
|
||||
agentsCheckCmd.Flags().BoolVar(&agentsCheckJSON, "json", false, "Output as JSON")
|
||||
|
||||
agentsCmd.AddCommand(agentsListCmd)
|
||||
agentsCmd.AddCommand(agentsCheckCmd)
|
||||
agentsCmd.AddCommand(agentsFixCmd)
|
||||
rootCmd.AddCommand(agentsCmd)
|
||||
}
|
||||
|
||||
@@ -333,3 +376,200 @@ func runAgentsList(cmd *cobra.Command, args []string) error {
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// CollisionReport holds the results of a collision check.
// It is produced by buildCollisionReport and is what "agents check --json"
// serializes.
|
||||
type CollisionReport struct {
|
||||
// TotalSessions is the number of tmux sessions whose name starts with "gt-".
TotalSessions int `json:"total_sessions"`
|
||||
// TotalLocks is the number of identity locks found under the town root.
TotalLocks int `json:"total_locks"`
|
||||
// Collisions counts live (non-stale) locks whose expected tmux session
// was not found; these are recorded as issues of type "orphaned".
Collisions int `json:"collisions"`
|
||||
// StaleLocks counts locks for which IsStale reported true.
StaleLocks int `json:"stale_locks"`
|
||||
// Issues lists every stale or orphaned lock found; omitted from JSON when empty.
Issues []CollisionIssue `json:"issues,omitempty"`
|
||||
// Locks is the raw result of lock.FindAllLocks, keyed by worker directory.
Locks map[string]*lock.LockInfo `json:"locks,omitempty"`
|
||||
}
|
||||
|
||||
// CollisionIssue describes a single collision or lock issue.
|
||||
type CollisionIssue struct {
|
||||
// Type classifies the issue. NOTE(review): buildCollisionReport in this file
// only emits "stale" and "orphaned"; "collision" is listed but no visible
// code produces it — confirm whether another producer exists.
Type string `json:"type"` // "stale", "collision", "orphaned"
|
||||
// WorkerDir is the worker directory the lock was found for.
WorkerDir string `json:"worker_dir"`
|
||||
// Message is the human-readable description printed by the text output.
Message string `json:"message"`
|
||||
// PID is the process ID recorded in the lock; omitted from JSON when zero.
PID int `json:"pid,omitempty"`
|
||||
// SessionID is the session identifier recorded in the lock; omitted when empty.
SessionID string `json:"session_id,omitempty"`
|
||||
}
|
||||
|
||||
func runAgentsCheck(cmd *cobra.Command, args []string) error {
|
||||
townRoot, err := workspace.FindFromCwdOrError()
|
||||
if err != nil {
|
||||
return fmt.Errorf("not in a Gas Town workspace: %w", err)
|
||||
}
|
||||
|
||||
report, err := buildCollisionReport(townRoot)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if agentsCheckJSON {
|
||||
enc := json.NewEncoder(os.Stdout)
|
||||
enc.SetIndent("", " ")
|
||||
return enc.Encode(report)
|
||||
}
|
||||
|
||||
// Text output
|
||||
if len(report.Issues) == 0 {
|
||||
fmt.Printf("%s All agents healthy\n", style.Bold.Render("✓"))
|
||||
fmt.Printf(" Sessions: %d, Locks: %d\n", report.TotalSessions, report.TotalLocks)
|
||||
return nil
|
||||
}
|
||||
|
||||
fmt.Printf("%s\n\n", style.Bold.Render("⚠️ Issues Detected"))
|
||||
fmt.Printf("Collisions: %d, Stale locks: %d\n\n", report.Collisions, report.StaleLocks)
|
||||
|
||||
for _, issue := range report.Issues {
|
||||
fmt.Printf("%s %s\n", style.Bold.Render("!"), issue.Message)
|
||||
fmt.Printf(" Dir: %s\n", issue.WorkerDir)
|
||||
if issue.PID > 0 {
|
||||
fmt.Printf(" PID: %d\n", issue.PID)
|
||||
}
|
||||
fmt.Println()
|
||||
}
|
||||
|
||||
fmt.Printf("Run %s to fix stale locks\n", style.Dim.Render("gt agents fix"))
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func runAgentsFix(cmd *cobra.Command, args []string) error {
|
||||
townRoot, err := workspace.FindFromCwdOrError()
|
||||
if err != nil {
|
||||
return fmt.Errorf("not in a Gas Town workspace: %w", err)
|
||||
}
|
||||
|
||||
// Clean stale locks
|
||||
cleaned, err := lock.CleanStaleLocks(townRoot)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cleaning stale locks: %w", err)
|
||||
}
|
||||
|
||||
if cleaned > 0 {
|
||||
fmt.Printf("%s Cleaned %d stale lock(s)\n", style.Bold.Render("✓"), cleaned)
|
||||
} else {
|
||||
fmt.Printf("%s No stale locks found\n", style.Dim.Render("○"))
|
||||
}
|
||||
|
||||
// Check for remaining issues
|
||||
report, err := buildCollisionReport(townRoot)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if report.Collisions > 0 {
|
||||
fmt.Println()
|
||||
fmt.Printf("%s %d collision(s) require manual intervention:\n\n",
|
||||
style.Bold.Render("⚠"), report.Collisions)
|
||||
|
||||
for _, issue := range report.Issues {
|
||||
if issue.Type == "collision" {
|
||||
fmt.Printf(" %s %s\n", style.Bold.Render("!"), issue.Message)
|
||||
}
|
||||
}
|
||||
|
||||
fmt.Println()
|
||||
fmt.Printf("To fix, close duplicate sessions or remove lock files manually.\n")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func buildCollisionReport(townRoot string) (*CollisionReport, error) {
|
||||
report := &CollisionReport{
|
||||
Locks: make(map[string]*lock.LockInfo),
|
||||
}
|
||||
|
||||
// Get all tmux sessions
|
||||
t := tmux.NewTmux()
|
||||
sessions, err := t.ListSessions()
|
||||
if err != nil {
|
||||
sessions = []string{} // Continue even if tmux not running
|
||||
}
|
||||
|
||||
// Filter to gt- sessions
|
||||
var gtSessions []string
|
||||
for _, s := range sessions {
|
||||
if strings.HasPrefix(s, "gt-") {
|
||||
gtSessions = append(gtSessions, s)
|
||||
}
|
||||
}
|
||||
report.TotalSessions = len(gtSessions)
|
||||
|
||||
// Find all locks
|
||||
locks, err := lock.FindAllLocks(townRoot)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("finding locks: %w", err)
|
||||
}
|
||||
report.TotalLocks = len(locks)
|
||||
report.Locks = locks
|
||||
|
||||
// Check each lock for issues
|
||||
for workerDir, lockInfo := range locks {
|
||||
if lockInfo.IsStale() {
|
||||
report.StaleLocks++
|
||||
report.Issues = append(report.Issues, CollisionIssue{
|
||||
Type: "stale",
|
||||
WorkerDir: workerDir,
|
||||
Message: fmt.Sprintf("Stale lock (dead PID %d)", lockInfo.PID),
|
||||
PID: lockInfo.PID,
|
||||
SessionID: lockInfo.SessionID,
|
||||
})
|
||||
continue
|
||||
}
|
||||
|
||||
// Check if the locked session exists in tmux
|
||||
expectedSession := guessSessionFromWorkerDir(workerDir, townRoot)
|
||||
if expectedSession != "" {
|
||||
found := false
|
||||
for _, s := range gtSessions {
|
||||
if s == expectedSession {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
// Lock exists but session doesn't - potential orphan or collision
|
||||
report.Collisions++
|
||||
report.Issues = append(report.Issues, CollisionIssue{
|
||||
Type: "orphaned",
|
||||
WorkerDir: workerDir,
|
||||
Message: fmt.Sprintf("Lock exists (PID %d) but no tmux session '%s'", lockInfo.PID, expectedSession),
|
||||
PID: lockInfo.PID,
|
||||
SessionID: lockInfo.SessionID,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return report, nil
|
||||
}
|
||||
|
||||
// guessSessionFromWorkerDir derives the tmux session name expected for a
// worker directory from its path relative to townRoot.
//
// The relative path is read as <rig>/<workerType>/<workerName>[/...]:
//
//	<rig>/crew/<name>     -> "gt-<rig>-crew-<name>"
//	<rig>/polecats/<name> -> "gt-<rig>-<name>"
//
// It returns "" when the path cannot be made relative to townRoot, lies
// outside townRoot, has fewer than three components, or names any other
// worker type.
func guessSessionFromWorkerDir(workerDir, townRoot string) string {
	relPath, err := filepath.Rel(townRoot, workerDir)
	if err != nil {
		return ""
	}

	parts := strings.Split(filepath.ToSlash(relPath), "/")
	if len(parts) < 3 {
		return ""
	}
	// filepath.Rel returns a cleaned path, so a directory outside townRoot
	// always starts with "..". Without this guard ".." would be taken as the
	// rig name and yield a bogus session such as "gt-..-crew-max".
	if parts[0] == ".." {
		return ""
	}

	rig, workerType, workerName := parts[0], parts[1], parts[2]

	switch workerType {
	case "crew":
		return fmt.Sprintf("gt-%s-crew-%s", rig, workerName)
	case "polecats":
		return fmt.Sprintf("gt-%s-%s", rig, workerName)
	}

	return ""
}
|
||||
|
||||
@@ -60,6 +60,7 @@ func runDoctor(cmd *cobra.Command, args []string) error {
|
||||
d.Register(doctor.NewOrphanProcessCheck())
|
||||
d.Register(doctor.NewBranchCheck())
|
||||
d.Register(doctor.NewBeadsSyncOrphanCheck())
|
||||
d.Register(doctor.NewIdentityCollisionCheck())
|
||||
|
||||
// Ephemeral beads checks
|
||||
d.Register(doctor.NewEphemeralExistsCheck())
|
||||
|
||||
@@ -2,6 +2,7 @@ package cmd
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
@@ -10,6 +11,7 @@ import (
|
||||
|
||||
"github.com/spf13/cobra"
|
||||
"github.com/steveyegge/gastown/internal/beads"
|
||||
"github.com/steveyegge/gastown/internal/lock"
|
||||
"github.com/steveyegge/gastown/internal/style"
|
||||
"github.com/steveyegge/gastown/internal/templates"
|
||||
"github.com/steveyegge/gastown/internal/workspace"
|
||||
@@ -74,6 +76,11 @@ func runPrime(cmd *cobra.Command, args []string) error {
|
||||
// Detect role
|
||||
ctx := detectRole(cwd, townRoot)
|
||||
|
||||
// Check and acquire identity lock for worker roles
|
||||
if err := acquireIdentityLock(ctx); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Ensure beads redirect exists for worktree-based roles
|
||||
ensureBeadsRedirect(ctx)
|
||||
|
||||
@@ -668,6 +675,57 @@ func outputDeaconPatrolContext(ctx RoleContext) {
|
||||
fmt.Println(" gt mol bond mol-deacon-patrol")
|
||||
}
|
||||
|
||||
// acquireIdentityLock checks and acquires the identity lock for worker roles.
|
||||
// This prevents multiple agents from claiming the same worker identity.
|
||||
// Returns an error if another agent already owns this identity.
|
||||
func acquireIdentityLock(ctx RoleContext) error {
|
||||
// Only lock worker roles (polecat, crew)
|
||||
// Infrastructure roles (mayor, witness, refinery, deacon) are singletons
|
||||
// managed by tmux session names, so they don't need file-based locks
|
||||
if ctx.Role != RolePolecat && ctx.Role != RoleCrew {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Create lock for this worker directory
|
||||
l := lock.New(ctx.WorkDir)
|
||||
|
||||
// Determine session ID from environment or context
|
||||
sessionID := os.Getenv("TMUX_PANE")
|
||||
if sessionID == "" {
|
||||
// Fall back to a descriptive identifier
|
||||
sessionID = fmt.Sprintf("%s/%s", ctx.Rig, ctx.Polecat)
|
||||
}
|
||||
|
||||
// Try to acquire the lock
|
||||
if err := l.Acquire(sessionID); err != nil {
|
||||
if errors.Is(err, lock.ErrLocked) {
|
||||
// Another agent owns this identity
|
||||
fmt.Printf("\n%s\n\n", style.Bold.Render("⚠️ IDENTITY COLLISION DETECTED"))
|
||||
fmt.Printf("Another agent already claims this worker identity.\n\n")
|
||||
|
||||
// Show lock details
|
||||
if info, readErr := l.Read(); readErr == nil {
|
||||
fmt.Printf("Lock holder:\n")
|
||||
fmt.Printf(" PID: %d\n", info.PID)
|
||||
fmt.Printf(" Session: %s\n", info.SessionID)
|
||||
fmt.Printf(" Acquired: %s\n", info.AcquiredAt.Format("2006-01-02 15:04:05"))
|
||||
fmt.Println()
|
||||
}
|
||||
|
||||
fmt.Printf("To resolve:\n")
|
||||
fmt.Printf(" 1. Find the other session and close it, OR\n")
|
||||
fmt.Printf(" 2. Run: gt doctor --fix (cleans stale locks)\n")
|
||||
fmt.Printf(" 3. If lock is stale: rm %s/.gastown/agent.lock\n", ctx.WorkDir)
|
||||
fmt.Println()
|
||||
|
||||
return fmt.Errorf("cannot claim identity %s/%s: %w", ctx.Rig, ctx.Polecat, err)
|
||||
}
|
||||
return fmt.Errorf("acquiring identity lock: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// ensureBeadsRedirect ensures the .beads/redirect file exists for worktree-based roles.
|
||||
// This handles cases where git clean or other operations delete the redirect file.
|
||||
func ensureBeadsRedirect(ctx RoleContext) {
|
||||
|
||||
Reference in New Issue
Block a user