From 6b2a7438e1ce2f18112a062458003394d0a54bbe Mon Sep 17 00:00:00 2001 From: gus Date: Mon, 12 Jan 2026 18:12:28 -0800 Subject: [PATCH] feat(deacon): add dog-health-check step to patrol Adds supervision for dispatched dogs that may get stuck. The new step (between dog-pool-maintenance and orphan-check): - Lists dogs in "working" state - Checks work duration vs plugin timeout (default 10m) - Decision matrix based on how long overdue: - < 2x timeout: log warning, check next cycle - 2x-5x timeout: file death warrant - > 5x timeout: force clear + escalate to Mayor - Tracks chronic failures for repeat offenders This closes the supervision gap where dogs could hang forever after being dispatched via `gt dog dispatch --plugin`. Closes: gt-s4dp3 Co-Authored-By: Claude Opus 4.5 --- .../formulas/mol-deacon-patrol.formula.toml | 66 ++++++++++++++++++- .../formulas/mol-deacon-patrol.formula.toml | 66 ++++++++++++++++++- 2 files changed, 130 insertions(+), 2 deletions(-) diff --git a/.beads/formulas/mol-deacon-patrol.formula.toml b/.beads/formulas/mol-deacon-patrol.formula.toml index aca940d5..de62749c 100644 --- a/.beads/formulas/mol-deacon-patrol.formula.toml +++ b/.beads/formulas/mol-deacon-patrol.formula.toml @@ -499,10 +499,74 @@ gt dog status **Exit criteria:** Pool has at least 1 idle dog.""" +[[steps]] +id = "dog-health-check" +title = "Check for stuck dogs" +needs = ["dog-pool-maintenance"] +description = """ +Check for dogs that have been working too long (stuck). + +Dogs dispatched via `gt dog dispatch --plugin` are marked as "working" with +a work description like "plugin:rebuild-gt". If a dog hangs, crashes, or +takes too long, it needs intervention. 
+ +**Step 1: List working dogs** +```bash +gt dog list --json +# Filter for state: "working" +``` + +**Step 2: Check work duration** +For each working dog: +```bash +gt dog status <name> --json +# Check: work_started_at, current_work +``` + +Compare against timeout: +- If plugin has [execution] timeout in plugin.md, use that +- Default timeout: 10 minutes for infrastructure tasks + +**Duration calculation:** +``` +stuck_threshold = plugin_timeout or 10m +duration = now - work_started_at +is_stuck = duration > stuck_threshold +``` + +**Step 3: Handle stuck dogs** + +For dogs working > timeout: +```bash +# Option A: File death warrant (Boot handles termination) +gt warrant file deacon/dogs/<name> --reason "Stuck: working on <work> for <duration>" + +# Option B: Force clear work and notify +gt dog clear <name> --force +gt mail send deacon/ -s "DOG_TIMEOUT <name>" -m "Dog <name> timed out on <work> after <duration>" +``` + +**Decision matrix:** + +| Duration over timeout | Action | +|----------------------|--------| +| < 2x timeout | Log warning, check next cycle | +| 2x - 5x timeout | File death warrant | +| > 5x timeout | Force clear + escalate to Mayor | + +**Step 4: Track chronic failures** +If same dog gets stuck repeatedly: +```bash +gt mail send mayor/ -s "Dog chronic failures" \ + -m "Dog <name> has timed out N times in last 24h. Consider removing from pool." +``` + +**Exit criteria:** All stuck dogs handled (warrant filed or cleared).""" + [[steps]] id = "orphan-check" title = "Detect abandoned work" -needs = ["dog-pool-maintenance"] +needs = ["dog-health-check"] description = """ **DETECT ONLY** - Check for orphaned state and dispatch to dog if found. 
diff --git a/internal/formula/formulas/mol-deacon-patrol.formula.toml b/internal/formula/formulas/mol-deacon-patrol.formula.toml index aca940d5..de62749c 100644 --- a/internal/formula/formulas/mol-deacon-patrol.formula.toml +++ b/internal/formula/formulas/mol-deacon-patrol.formula.toml @@ -499,10 +499,74 @@ gt dog status **Exit criteria:** Pool has at least 1 idle dog.""" +[[steps]] +id = "dog-health-check" +title = "Check for stuck dogs" +needs = ["dog-pool-maintenance"] +description = """ +Check for dogs that have been working too long (stuck). + +Dogs dispatched via `gt dog dispatch --plugin` are marked as "working" with +a work description like "plugin:rebuild-gt". If a dog hangs, crashes, or +takes too long, it needs intervention. + +**Step 1: List working dogs** +```bash +gt dog list --json +# Filter for state: "working" +``` + +**Step 2: Check work duration** +For each working dog: +```bash +gt dog status <name> --json +# Check: work_started_at, current_work +``` + +Compare against timeout: +- If plugin has [execution] timeout in plugin.md, use that +- Default timeout: 10 minutes for infrastructure tasks + +**Duration calculation:** +``` +stuck_threshold = plugin_timeout or 10m +duration = now - work_started_at +is_stuck = duration > stuck_threshold +``` + +**Step 3: Handle stuck dogs** + +For dogs working > timeout: +```bash +# Option A: File death warrant (Boot handles termination) +gt warrant file deacon/dogs/<name> --reason "Stuck: working on <work> for <duration>" + +# Option B: Force clear work and notify +gt dog clear <name> --force +gt mail send deacon/ -s "DOG_TIMEOUT <name>" -m "Dog <name> timed out on <work> after <duration>" +``` + +**Decision matrix:** + +| Duration over timeout | Action | +|----------------------|--------| +| < 2x timeout | Log warning, check next cycle | +| 2x - 5x timeout | File death warrant | +| > 5x timeout | Force clear + escalate to Mayor | + +**Step 4: Track chronic failures** +If same dog gets stuck repeatedly: +```bash +gt mail send mayor/ -s "Dog chronic failures" \ + -m "Dog <name> has
timed out N times in last 24h. Consider removing from pool." +``` + +**Exit criteria:** All stuck dogs handled (warrant filed or cleared).""" + [[steps]] id = "orphan-check" title = "Detect abandoned work" -needs = ["dog-pool-maintenance"] +needs = ["dog-health-check"] description = """ **DETECT ONLY** - Check for orphaned state and dispatch to dog if found.