From afc1ff04b1657d8332ddf0ee94032fdd75b365e9 Mon Sep 17 00:00:00 2001
From: Subhrajit Makur <Avyukth@users.noreply.github.com>
Date: Thu, 22 Jan 2026 00:08:45 +0530
Subject: [PATCH] feat(formula): add verification legs and presets to
 code-review (#841)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add three new verification legs to the code-review convoy formula:
- wiring: detects dependencies added but not actually used
- commit-discipline: reviews commit quality and atomicity
- test-quality: verifies tests are meaningful, not just present

Also adds presets for common review modes:
- gate: light review (wiring, security, smells, test-quality)
- full: all 10 legs for comprehensive review
- security-focused: security-heavy for sensitive changes
- refactor: code quality focus

This is a minimal subset of PR #4's Worker → Reviewer pattern,
focusing on the most valuable additions without Go code changes.

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
---
 .beads/formulas/code-review.formula.toml      | 140 +++++++++++++++++-
 .../formula/formulas/code-review.formula.toml | 140 +++++++++++++++++-
 2 files changed, 274 insertions(+), 6 deletions(-)

diff --git a/.beads/formulas/code-review.formula.toml b/.beads/formulas/code-review.formula.toml
index dc09c950..13fcae5f 100644
--- a/.beads/formulas/code-review.formula.toml
+++ b/.beads/formulas/code-review.formula.toml
@@ -15,6 +15,8 @@ Each leg examines the code from a different perspective. Findings are
 collected and synthesized into a prioritized, actionable review.
 
 ## Legs (parallel execution)
+
+### Analysis Legs (read and analyze code)
 - **correctness**: Logic errors, bugs, edge cases
 - **performance**: Bottlenecks, efficiency issues
 - **security**: Vulnerabilities, OWASP concerns
@@ -23,6 +25,16 @@ collected and synthesized into a prioritized, actionable review.
 - **style**: Convention compliance, consistency
 - **smells**: Anti-patterns, technical debt
 
+### Verification Legs (check implementation quality)
+- **wiring**: Installed-but-not-wired gaps (deps added but not used)
+- **commit-discipline**: Commit quality and atomicity
+- **test-quality**: Test meaningfulness, not just coverage
+
+## Presets
+- **gate**: Light review for automatic flow (wiring, security, smells, test-quality)
+- **full**: Comprehensive review (all 10 legs)
+- **custom**: Select specific legs via --legs flag
+
 ## Execution Model
 1. Each leg spawns as a separate polecat
 2. Polecats work in parallel
@@ -293,6 +305,125 @@ Review the code for code smells and anti-patterns.
 - Is technical debt being added or paid down?
 """
 
+# ============================================================================
+# VERIFICATION LEGS - Check implementation quality (not just code analysis)
+# ============================================================================
+
+[[legs]]
+id = "wiring"
+title = "Wiring Review"
+focus = "Installed-but-not-wired gaps"
+description = """
+Detect dependencies, configs, or libraries that were added but not actually used.
+
+This catches subtle bugs where the implementer THINKS they integrated something,
+but the old implementation is still being used.
+
+**Look for:**
+- New dependency in manifest but never imported
+  - Go: module in go.mod but no import
+  - Rust: crate in Cargo.toml but no `use`
+  - Node: package in package.json but no import/require
+
+- SDK added but old implementation remains
+  - Added Sentry but still using console.error for errors
+  - Added Zod but still using manual typeof validation
+
+- Config/env var defined but never loaded
+  - New .env var that isn't accessed in code
+
+**Questions to answer:**
+- Is every new dependency actually used?
+- Are there old patterns that should have been replaced?
+- Is there dead config that suggests incomplete migration?
+"""
+
+[[legs]]
+id = "commit-discipline"
+title = "Commit Discipline Review"
+focus = "Commit quality and atomicity"
+description = """
+Review commit history for good practices.
+
+Good commits make the codebase easier to understand, bisect, and revert.
+
+**Look for:**
+- Giant "WIP" or "fix" commits
+  - Multiple unrelated changes in one commit
+  - Commits that touch 20+ files across different features
+
+- Poor commit messages
+  - "stuff", "update", "asdf", "fix"
+  - No context about WHY the change was made
+
+- Unatomic commits
+  - Feature + refactor + bugfix in same commit
+  - Should be separable logical units
+
+- Missing type prefixes (if project uses conventional commits)
+  - feat:, fix:, refactor:, test:, docs:, chore:
+
+**Questions to answer:**
+- Could this history be bisected effectively?
+- Would a reviewer understand the progression?
+- Are commits atomic (one logical change each)?
+"""
+
+[[legs]]
+id = "test-quality"
+title = "Test Quality Review"
+focus = "Test meaningfulness, not just coverage"
+description = """
+Verify tests are actually testing something meaningful.
+
+Coverage numbers lie. A test that can't fail provides no value.
+
+**Look for:**
+- Weak assertions
+  - Only checking != nil / !== null / is not None
+  - Using .is_ok() without checking the value
+  - assertTrue(true) or equivalent
+
+- Missing negative test cases
+  - Happy path only, no error cases
+  - No boundary testing
+  - No invalid input testing
+
+- Tests that can't fail
+  - Mocked so heavily the test is meaningless
+  - Testing implementation details, not behavior
+
+- Flaky test indicators
+  - Sleep/delay in tests
+  - Time-dependent assertions
+
+**Questions to answer:**
+- Do these tests actually verify behavior?
+- Would a bug in the implementation cause a test failure?
+- Are edge cases and error paths tested?
+"""
+
+# ============================================================================
+# PRESETS - Configurable leg selection
+# ============================================================================
+
+[presets]
+[presets.gate]
+description = "Light review for automatic flow - fast, focused on blockers"
+legs = ["wiring", "security", "smells", "test-quality"]
+
+[presets.full]
+description = "Comprehensive review - all legs, for major features"
+legs = ["correctness", "performance", "security", "elegance", "resilience", "style", "smells", "wiring", "commit-discipline", "test-quality"]
+
+[presets.security-focused]
+description = "Security-heavy review for sensitive changes"
+legs = ["security", "resilience", "correctness", "wiring"]
+
+[presets.refactor]
+description = "Review focused on code quality during refactoring"
+legs = ["elegance", "smells", "style", "commit-discipline"]
+
 # Synthesis step - combines all leg outputs
 [synthesis]
 title = "Review Synthesis"
@@ -310,10 +441,13 @@ A synthesized review at: {{.output.directory}}/{{.output.synthesis}}
 2. **Critical Issues** - P0 items from all legs, deduplicated
 3. **Major Issues** - P1 items, grouped by theme
 4. **Minor Issues** - P2 items, briefly listed
-5. **Positive Observations** - What's done well
-6. **Recommendations** - Actionable next steps
+5. **Wiring Gaps** - Dependencies added but not used (from wiring leg)
+6. **Commit Quality** - Notes on commit discipline
+7. **Test Quality** - Assessment of test meaningfulness
+8. **Positive Observations** - What's done well
+9. **Recommendations** - Actionable next steps
 
 Deduplicate issues found by multiple legs (note which legs found them).
 Prioritize by impact and effort. Be actionable.
 """
-depends_on = ["correctness", "performance", "security", "elegance", "resilience", "style", "smells"]
+depends_on = ["correctness", "performance", "security", "elegance", "resilience", "style", "smells", "wiring", "commit-discipline", "test-quality"]
diff --git a/internal/formula/formulas/code-review.formula.toml b/internal/formula/formulas/code-review.formula.toml
index dc09c950..13fcae5f 100644
--- a/internal/formula/formulas/code-review.formula.toml
+++ b/internal/formula/formulas/code-review.formula.toml
@@ -15,6 +15,8 @@ Each leg examines the code from a different perspective. Findings are
 collected and synthesized into a prioritized, actionable review.
 
 ## Legs (parallel execution)
+
+### Analysis Legs (read and analyze code)
 - **correctness**: Logic errors, bugs, edge cases
 - **performance**: Bottlenecks, efficiency issues
 - **security**: Vulnerabilities, OWASP concerns
@@ -23,6 +25,16 @@ collected and synthesized into a prioritized, actionable review.
 - **style**: Convention compliance, consistency
 - **smells**: Anti-patterns, technical debt
 
+### Verification Legs (check implementation quality)
+- **wiring**: Installed-but-not-wired gaps (deps added but not used)
+- **commit-discipline**: Commit quality and atomicity
+- **test-quality**: Test meaningfulness, not just coverage
+
+## Presets
+- **gate**: Light review for automatic flow (wiring, security, smells, test-quality)
+- **full**: Comprehensive review (all 10 legs)
+- **custom**: Select specific legs via --legs flag
+
 ## Execution Model
 1. Each leg spawns as a separate polecat
 2. Polecats work in parallel
@@ -293,6 +305,125 @@ Review the code for code smells and anti-patterns.
 - Is technical debt being added or paid down?
 """
 
+# ============================================================================
+# VERIFICATION LEGS - Check implementation quality (not just code analysis)
+# ============================================================================
+
+[[legs]]
+id = "wiring"
+title = "Wiring Review"
+focus = "Installed-but-not-wired gaps"
+description = """
+Detect dependencies, configs, or libraries that were added but not actually used.
+
+This catches subtle bugs where the implementer THINKS they integrated something,
+but the old implementation is still being used.
+
+**Look for:**
+- New dependency in manifest but never imported
+  - Go: module in go.mod but no import
+  - Rust: crate in Cargo.toml but no `use`
+  - Node: package in package.json but no import/require
+
+- SDK added but old implementation remains
+  - Added Sentry but still using console.error for errors
+  - Added Zod but still using manual typeof validation
+
+- Config/env var defined but never loaded
+  - New .env var that isn't accessed in code
+
+**Questions to answer:**
+- Is every new dependency actually used?
+- Are there old patterns that should have been replaced?
+- Is there dead config that suggests incomplete migration?
+"""
+
+[[legs]]
+id = "commit-discipline"
+title = "Commit Discipline Review"
+focus = "Commit quality and atomicity"
+description = """
+Review commit history for good practices.
+
+Good commits make the codebase easier to understand, bisect, and revert.
+
+**Look for:**
+- Giant "WIP" or "fix" commits
+  - Multiple unrelated changes in one commit
+  - Commits that touch 20+ files across different features
+
+- Poor commit messages
+  - "stuff", "update", "asdf", "fix"
+  - No context about WHY the change was made
+
+- Unatomic commits
+  - Feature + refactor + bugfix in same commit
+  - Should be separable logical units
+
+- Missing type prefixes (if project uses conventional commits)
+  - feat:, fix:, refactor:, test:, docs:, chore:
+
+**Questions to answer:**
+- Could this history be bisected effectively?
+- Would a reviewer understand the progression?
+- Are commits atomic (one logical change each)?
+"""
+
+[[legs]]
+id = "test-quality"
+title = "Test Quality Review"
+focus = "Test meaningfulness, not just coverage"
+description = """
+Verify tests are actually testing something meaningful.
+
+Coverage numbers lie. A test that can't fail provides no value.
+
+**Look for:**
+- Weak assertions
+  - Only checking != nil / !== null / is not None
+  - Using .is_ok() without checking the value
+  - assertTrue(true) or equivalent
+
+- Missing negative test cases
+  - Happy path only, no error cases
+  - No boundary testing
+  - No invalid input testing
+
+- Tests that can't fail
+  - Mocked so heavily the test is meaningless
+  - Testing implementation details, not behavior
+
+- Flaky test indicators
+  - Sleep/delay in tests
+  - Time-dependent assertions
+
+**Questions to answer:**
+- Do these tests actually verify behavior?
+- Would a bug in the implementation cause a test failure?
+- Are edge cases and error paths tested?
+"""
+
+# ============================================================================
+# PRESETS - Configurable leg selection
+# ============================================================================
+
+[presets]
+[presets.gate]
+description = "Light review for automatic flow - fast, focused on blockers"
+legs = ["wiring", "security", "smells", "test-quality"]
+
+[presets.full]
+description = "Comprehensive review - all legs, for major features"
+legs = ["correctness", "performance", "security", "elegance", "resilience", "style", "smells", "wiring", "commit-discipline", "test-quality"]
+
+[presets.security-focused]
+description = "Security-heavy review for sensitive changes"
+legs = ["security", "resilience", "correctness", "wiring"]
+
+[presets.refactor]
+description = "Review focused on code quality during refactoring"
+legs = ["elegance", "smells", "style", "commit-discipline"]
+
 # Synthesis step - combines all leg outputs
 [synthesis]
 title = "Review Synthesis"
@@ -310,10 +441,13 @@ A synthesized review at: {{.output.directory}}/{{.output.synthesis}}
 2. **Critical Issues** - P0 items from all legs, deduplicated
 3. **Major Issues** - P1 items, grouped by theme
 4. **Minor Issues** - P2 items, briefly listed
-5. **Positive Observations** - What's done well
-6. **Recommendations** - Actionable next steps
+5. **Wiring Gaps** - Dependencies added but not used (from wiring leg)
+6. **Commit Quality** - Notes on commit discipline
+7. **Test Quality** - Assessment of test meaningfulness
+8. **Positive Observations** - What's done well
+9. **Recommendations** - Actionable next steps
 
 Deduplicate issues found by multiple legs (note which legs found them).
 Prioritize by impact and effort. Be actionable.
 """
-depends_on = ["correctness", "performance", "security", "elegance", "resilience", "style", "smells"]
+depends_on = ["correctness", "performance", "security", "elegance", "resilience", "style", "smells", "wiring", "commit-discipline", "test-quality"]