Add identity collision prevention, detection, and correction (gt-xp2s)

- internal/lock: New package with PID-based lockfiles for worker identity
- gt prime: Acquire identity lock for crew/polecat roles, fail on collision
- gt agents check: Detect stale locks and identity collisions
- gt agents fix: Clean up stale locks
- gt doctor: New identity-collision check with --fix support

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Steve Yegge
2025-12-22 00:05:46 -08:00
parent 6ee0f98bf3
commit 65baefdc06
5 changed files with 734 additions and 1 deletions

View File

@@ -1,14 +1,19 @@
package cmd
import (
"encoding/json"
"fmt"
"os"
"os/exec"
"path/filepath"
"sort"
"strings"
"github.com/spf13/cobra"
"github.com/steveyegge/gastown/internal/lock"
"github.com/steveyegge/gastown/internal/style"
"github.com/steveyegge/gastown/internal/tmux"
"github.com/steveyegge/gastown/internal/workspace"
)
// AgentType represents the type of Gas Town agent.
@@ -71,11 +76,49 @@ var agentsListCmd = &cobra.Command{
RunE: runAgentsList,
}
// agentsAllFlag receives the value of the persistent --all/-a flag that init
// registers on agentsCmd ("Include polecats in the menu").
var agentsAllFlag bool
// agentsCheckCmd defines the "check" subcommand. init attaches it to agentsCmd
// and gives it a --json flag; execution is delegated to runAgentsCheck.
var agentsCheckCmd = &cobra.Command{
Use: "check",
Short: "Check for identity collisions and stale locks",
Long: `Check for identity collisions and stale locks.
This command helps detect situations where multiple Claude processes
think they own the same worker identity.
Output shows:
- Active tmux sessions with gt- prefix
- Identity locks in worker directories
- Collisions (multiple agents claiming same identity)
- Stale locks (dead PIDs)`,
RunE: runAgentsCheck,
}
// agentsFixCmd defines the "fix" subcommand. init attaches it to agentsCmd;
// execution is delegated to runAgentsFix.
var agentsFixCmd = &cobra.Command{
Use: "fix",
Short: "Fix identity collisions and clean up stale locks",
Long: `Clean up identity collisions and stale locks.
This command:
1. Removes stale locks (where the PID is dead)
2. Reports collisions that need manual intervention
For collisions with live processes, you must manually:
- Kill the duplicate session, OR
- Decide which agent should own the identity`,
RunE: runAgentsFix,
}
// Destinations for command-line flags; the bindings are created in init.
var (
// agentsAllFlag backs the persistent --all/-a flag on agentsCmd.
agentsAllFlag bool
// agentsCheckJSON backs the --json flag on agentsCheckCmd. When true,
// runAgentsCheck writes the report as JSON instead of text.
agentsCheckJSON bool
)
// init binds the flags used by the agents command tree and wires the tree
// (list, check, fix) into rootCmd.
func init() {
	// Flag bindings.
	persistent := agentsCmd.PersistentFlags()
	persistent.BoolVarP(&agentsAllFlag, "all", "a", false, "Include polecats in the menu")

	checkFlags := agentsCheckCmd.Flags()
	checkFlags.BoolVar(&agentsCheckJSON, "json", false, "Output as JSON")

	// Subcommands are registered in the same order as before: list, check, fix.
	agentsCmd.AddCommand(agentsListCmd, agentsCheckCmd, agentsFixCmd)

	rootCmd.AddCommand(agentsCmd)
}
@@ -333,3 +376,200 @@ func runAgentsList(cmd *cobra.Command, args []string) error {
return nil
}
// CollisionReport holds the results of a collision check.
// It is produced by buildCollisionReport and is what "gt agents check --json"
// encodes to stdout.
type CollisionReport struct {
// TotalSessions is the number of tmux sessions whose name starts with "gt-".
TotalSessions int `json:"total_sessions"`
// TotalLocks is the number of locks returned by lock.FindAllLocks.
TotalLocks int `json:"total_locks"`
// Collisions counts non-stale locks whose expected tmux session was not
// found among the gt- sessions.
Collisions int `json:"collisions"`
// StaleLocks counts locks for which LockInfo.IsStale reported true.
StaleLocks int `json:"stale_locks"`
// Issues has one entry per problem counted in Collisions or StaleLocks.
Issues []CollisionIssue `json:"issues,omitempty"`
// Locks is the map returned by lock.FindAllLocks, keyed by worker directory.
Locks map[string]*lock.LockInfo `json:"locks,omitempty"`
}
// CollisionIssue describes a single collision or lock issue.
type CollisionIssue struct {
// Type classifies the issue. buildCollisionReport emits "stale" and
// "orphaned"; runAgentsFix selects issues to print by this value.
// NOTE(review): nothing visible in this file emits "collision" — confirm
// whether another producer exists.
Type string `json:"type"` // "stale", "collision", "orphaned"
// WorkerDir is the worker directory the lock was found under (the key in
// the map returned by lock.FindAllLocks).
WorkerDir string `json:"worker_dir"`
// Message is the human-readable description printed by the agents commands.
Message string `json:"message"`
// PID is the process ID recorded in the lock; omitted from JSON when zero.
PID int `json:"pid,omitempty"`
// SessionID is the session ID recorded in the lock; omitted from JSON when empty.
SessionID string `json:"session_id,omitempty"`
}
// runAgentsCheck implements "gt agents check".
//
// It locates the workspace from the current directory, builds a collision
// report, and prints it: as indented JSON when --json is set, otherwise as a
// one-line healthy summary or a per-issue listing. Finding issues is not an
// error; a non-nil error is returned only when the workspace cannot be found,
// the report cannot be built, or JSON encoding fails.
func runAgentsCheck(cmd *cobra.Command, args []string) error {
	root, err := workspace.FindFromCwdOrError()
	if err != nil {
		return fmt.Errorf("not in a Gas Town workspace: %w", err)
	}

	rpt, err := buildCollisionReport(root)
	if err != nil {
		return err
	}

	switch {
	case agentsCheckJSON:
		// Machine-readable output takes precedence over every text path.
		encoder := json.NewEncoder(os.Stdout)
		encoder.SetIndent("", " ")
		return encoder.Encode(rpt)

	case len(rpt.Issues) == 0:
		fmt.Printf("%s All agents healthy\n", style.Bold.Render("✓"))
		fmt.Printf(" Sessions: %d, Locks: %d\n", rpt.TotalSessions, rpt.TotalLocks)
		return nil
	}

	// Text output: summary header followed by one paragraph per issue.
	fmt.Printf("%s\n\n", style.Bold.Render("⚠️ Issues Detected"))
	fmt.Printf("Collisions: %d, Stale locks: %d\n\n", rpt.Collisions, rpt.StaleLocks)

	for i := range rpt.Issues {
		entry := &rpt.Issues[i]
		fmt.Printf("%s %s\n", style.Bold.Render("!"), entry.Message)
		fmt.Printf(" Dir: %s\n", entry.WorkerDir)
		if entry.PID > 0 {
			fmt.Printf(" PID: %d\n", entry.PID)
		}
		fmt.Println()
	}

	fmt.Printf("Run %s to fix stale locks\n", style.Dim.Render("gt agents fix"))
	return nil
}
// runAgentsFix implements "gt agents fix".
//
// It removes stale locks under the workspace via lock.CleanStaleLocks, reports
// how many were cleaned, then rebuilds the collision report and lists every
// remaining issue that needs manual intervention.
//
// Returns an error when the workspace cannot be located, when cleaning fails,
// or when the follow-up report cannot be built. Remaining collisions are
// reported on stdout but are not themselves an error.
func runAgentsFix(cmd *cobra.Command, args []string) error {
	townRoot, err := workspace.FindFromCwdOrError()
	if err != nil {
		return fmt.Errorf("not in a Gas Town workspace: %w", err)
	}

	// Clean stale locks
	cleaned, err := lock.CleanStaleLocks(townRoot)
	if err != nil {
		return fmt.Errorf("cleaning stale locks: %w", err)
	}

	if cleaned > 0 {
		fmt.Printf("%s Cleaned %d stale lock(s)\n", style.Bold.Render("✓"), cleaned)
	} else {
		fmt.Printf("%s No stale locks found\n", style.Dim.Render("○"))
	}

	// Check for remaining issues
	report, err := buildCollisionReport(townRoot)
	if err != nil {
		return err
	}

	if report.Collisions == 0 {
		return nil
	}

	fmt.Println()
	fmt.Printf("%s %d collision(s) require manual intervention:\n\n",
		style.Bold.Render("⚠"), report.Collisions)
	for _, issue := range report.Issues {
		// buildCollisionReport counts "orphaned" issues in report.Collisions,
		// so they must be listed here too; matching only "collision" would
		// print the header above followed by an empty list.
		switch issue.Type {
		case "collision", "orphaned":
			fmt.Printf(" %s %s\n", style.Bold.Render("!"), issue.Message)
		}
	}
	fmt.Println()
	fmt.Printf("To fix, close duplicate sessions or remove lock files manually.\n")

	return nil
}
// buildCollisionReport inspects the tmux sessions and the identity locks under
// townRoot and summarizes any problems.
//
// For every lock returned by lock.FindAllLocks it records:
//   - a "stale" issue when LockInfo.IsStale reports true, or
//   - an "orphaned" issue (counted in Collisions) when the lock is live but the
//     tmux session name derived from its worker directory is not running.
//
// Locks whose directory does not map to a session name are counted in
// TotalLocks but otherwise ignored. Issues are appended in sorted worker-dir
// order so that repeated runs produce identical output.
//
// Returns an error only when the locks cannot be enumerated; a failure to list
// tmux sessions is treated as "no sessions".
func buildCollisionReport(townRoot string) (*CollisionReport, error) {
	report := &CollisionReport{}

	// Get all tmux sessions
	t := tmux.NewTmux()
	sessions, err := t.ListSessions()
	if err != nil {
		sessions = nil // Continue even if tmux not running
	}

	// Index the gt- sessions by name for constant-time lookup below.
	gtSessions := make(map[string]struct{}, len(sessions))
	for _, s := range sessions {
		if strings.HasPrefix(s, "gt-") {
			gtSessions[s] = struct{}{}
			report.TotalSessions++
		}
	}

	// Find all locks
	locks, err := lock.FindAllLocks(townRoot)
	if err != nil {
		return nil, fmt.Errorf("finding locks: %w", err)
	}
	report.TotalLocks = len(locks)
	report.Locks = locks

	// Map iteration order is randomized; walk the keys in sorted order so the
	// Issues slice (and therefore text/JSON output) is deterministic.
	workerDirs := make([]string, 0, len(locks))
	for workerDir := range locks {
		workerDirs = append(workerDirs, workerDir)
	}
	sort.Strings(workerDirs)

	// Check each lock for issues
	for _, workerDir := range workerDirs {
		lockInfo := locks[workerDir]

		if lockInfo.IsStale() {
			report.StaleLocks++
			report.Issues = append(report.Issues, CollisionIssue{
				Type:      "stale",
				WorkerDir: workerDir,
				Message:   fmt.Sprintf("Stale lock (dead PID %d)", lockInfo.PID),
				PID:       lockInfo.PID,
				SessionID: lockInfo.SessionID,
			})
			continue
		}

		// Check if the locked session exists in tmux
		expectedSession := guessSessionFromWorkerDir(workerDir, townRoot)
		if expectedSession == "" {
			continue
		}
		if _, running := gtSessions[expectedSession]; running {
			continue
		}

		// Lock exists but session doesn't - potential orphan or collision
		report.Collisions++
		report.Issues = append(report.Issues, CollisionIssue{
			Type:      "orphaned",
			WorkerDir: workerDir,
			Message:   fmt.Sprintf("Lock exists (PID %d) but no tmux session '%s'", lockInfo.PID, expectedSession),
			PID:       lockInfo.PID,
			SessionID: lockInfo.SessionID,
		})
	}

	return report, nil
}
// guessSessionFromWorkerDir derives the tmux session name expected for a
// worker directory, based on its path relative to townRoot.
//
// The relative path must have at least three components,
// <rig>/<workerType>/<workerName>; deeper components are ignored.
//
//	<rig>/crew/<name>     -> "gt-<rig>-crew-<name>"
//	<rig>/polecats/<name> -> "gt-<rig>-<name>"
//
// It returns "" when the path cannot be made relative to townRoot, lies
// outside townRoot, is too short, or has any other worker type.
func guessSessionFromWorkerDir(workerDir, townRoot string) string {
	relPath, err := filepath.Rel(townRoot, workerDir)
	if err != nil {
		return ""
	}

	parts := strings.Split(filepath.ToSlash(relPath), "/")
	if len(parts) < 3 {
		return ""
	}

	rig, workerType, workerName := parts[0], parts[1], parts[2]

	// filepath.Rel yields a leading ".." when workerDir is not under townRoot.
	// Such a path has no rig, so do not fabricate a name like "gt-..-crew-x".
	if rig == ".." {
		return ""
	}

	switch workerType {
	case "crew":
		return fmt.Sprintf("gt-%s-crew-%s", rig, workerName)
	case "polecats":
		return fmt.Sprintf("gt-%s-%s", rig, workerName)
	}

	return ""
}