# gastown/internal/formula/formulas/code-review.formula.toml

# Code Review Convoy Formula
#
# A convoy-style formula that spawns multiple polecats in parallel,
# each focusing on a different review aspect. Results are collected
# and synthesized into a unified review.
#
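# Usage (supply one of the inputs declared under [inputs]):
#   gt formula run code-review --pr=123
#   gt formula run code-review --files="src/*.go"
#   gt formula run code-review --branch=my-feature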

# Human-readable overview of the formula. The "Legs" and "Presets" sections
# mirror the [[legs]] and [presets.*] tables further down; keep them in sync
# when adding or removing a leg or preset.
description = """
Comprehensive code review via parallel specialized reviewers.

Each leg examines the code from a different perspective. Findings are
collected and synthesized into a prioritized, actionable review.

## Legs (parallel execution)

### Analysis Legs (read and analyze code)
- **correctness**: Logic errors, bugs, edge cases
- **performance**: Bottlenecks, efficiency issues
- **security**: Vulnerabilities, OWASP concerns
- **elegance**: Design clarity, abstraction quality
- **resilience**: Error handling, failure modes
- **style**: Convention compliance, consistency
- **smells**: Anti-patterns, technical debt

### Verification Legs (check implementation quality)
- **wiring**: Installed-but-not-wired gaps (deps added but not used)
- **commit-discipline**: Commit quality and atomicity
- **test-quality**: Test meaningfulness, not just coverage

## Presets
- **gate**: Light review for automatic flow (wiring, security, smells, test-quality)
- **full**: Comprehensive review (all 10 legs)
- **security-focused**: Security-heavy review for sensitive changes (security, resilience, correctness, wiring)
- **refactor**: Code-quality review during refactoring (elegance, smells, style, commit-discipline)
- **custom**: Select specific legs via --legs flag

## Execution Model
1. Each leg spawns as a separate polecat
2. Polecats work in parallel
3. Each writes findings to their designated output
4. Synthesis step combines all findings into unified review
"""
# Formula identity: the name used on the command line (`gt formula run code-review`),
# the convoy execution style described in the header comment, and the schema/revision number.
formula = "code-review"
type = "convoy"
version = 1

# Runtime inputs that identify what to review. Each input lists the other two
# in `required_unless`, so the three are alternatives to one another.
# NOTE(review): the exact semantics of `required_unless` are defined by the
# formula runner - presumably "required unless one of these is supplied".
[inputs.pr]
type = "number"
required_unless = ["files", "branch"]
description = "Pull request number to review"

[inputs.files]
type = "string"
required_unless = ["pr", "branch"]
description = "File glob pattern to review"

[inputs.branch]
type = "string"
required_unless = ["pr", "files"]
description = "Branch name to review (diff against main)"

# Base prompt template - injected into all leg prompts.
# NOTE: Uses Go text/template syntax (not Handlebars). A trailing `-}}` trims
# the whitespace (including the newline) that follows the action, which keeps
# the rendered file lists free of blank lines.
#
# Template data referenced below:
#   .formula_name, .target_description, .output_path
#   .leg.id / .leg.title / .leg.focus / .leg.description  (from each [[legs]] entry)
#   .pr_number, .pr_title, .changed_files[].path/.additions/.deletions  (PR reviews)
#   .files[]  (used when .pr_number is empty)
#
# The report heading uses {{.leg.title}} on its own: every leg title already
# ends in "Review", so appending the word again would render "... Review Review".
[prompts]
base = """
# Code Review Assignment

You are a specialized code reviewer participating in a convoy review.

## Context
- **Formula**: {{.formula_name}}
- **Review target**: {{.target_description}}
- **Your focus**: {{.leg.focus}}
- **Leg ID**: {{.leg.id}}

## Files Under Review
{{if .pr_number -}}
PR #{{.pr_number}}: {{.pr_title}}

Changed files:
{{range .changed_files -}}
- {{.path}} (+{{.additions}}/-{{.deletions}})
{{end -}}
{{else -}}
{{range .files -}}
- {{.}}
{{end -}}
{{end}}

## Your Task
{{.leg.description}}

## Output Requirements
Write your findings to: **{{.output_path}}**

Structure your output as follows:
```markdown
# {{.leg.title}}

## Summary
(1-2 paragraph overview of findings)

## Critical Issues
(P0 - Must fix before merge)
- Issue description with file:line reference
- Explanation of impact
- Suggested fix

## Major Issues
(P1 - Should fix before merge)
- ...

## Minor Issues
(P2 - Nice to fix)
- ...

## Observations
(Non-blocking notes and suggestions)
- ...
```

Use specific file:line references. Be actionable. Prioritize impact.
"""

# Output configuration
# All three values are path templates. `directory` is keyed by the review id
# and `leg_pattern` by each leg's id. The synthesis prompt builds its output
# path as {{.output.directory}}/{{.output.synthesis}}, so `synthesis` is a file
# name inside `directory`.
# NOTE(review): presumably `leg_pattern` is likewise resolved inside
# `directory` - confirm against the formula runner.
[output]
directory = ".reviews/{{.review_id}}"
leg_pattern = "{{.leg.id}}-findings.md"
synthesis = "review-summary.md"

# Leg definitions - each spawns a parallel polecat
# Per-leg keys feed the base prompt: `id` -> {{.leg.id}} (also used by
# [output].leg_pattern), `title` -> {{.leg.title}}, `focus` -> {{.leg.focus}},
# `description` -> {{.leg.description}}.

# Analysis leg: correctness. Selected by presets: full, security-focused.
[[legs]]
id = "correctness"
title = "Correctness Review"
focus = "Logical correctness and edge case handling"
description = """
Review the code for logical errors and edge case handling.

**Look for:**
- Logic errors and bugs
- Off-by-one errors
- Null/nil/undefined handling
- Unhandled edge cases
- Race conditions in concurrent code
- Dead code or unreachable branches
- Incorrect assumptions in comments vs code
- Integer overflow/underflow potential
- Floating point comparison issues

**Questions to answer:**
- Does the code do what it claims to do?
- What inputs could cause unexpected behavior?
- Are all code paths tested or obviously correct?
"""

# Analysis leg: performance. Selected by presets: full.
[[legs]]
id = "performance"
title = "Performance Review"
focus = "Performance bottlenecks and efficiency"
description = """
Review the code for performance issues.

**Look for:**
- O(n²) or worse algorithms where O(n) is possible
- Unnecessary allocations in hot paths
- Missing caching opportunities
- N+1 query patterns (database or API)
- Blocking operations in async contexts
- Memory leaks or unbounded growth
- Excessive string concatenation
- Unoptimized regex or parsing

**Questions to answer:**
- What happens at 10x, 100x, 1000x scale?
- Are there obvious optimizations being missed?
- Is performance being traded for readability appropriately?
"""

# Analysis leg: security. Selected by presets: gate, full, security-focused.
[[legs]]
id = "security"
title = "Security Review"
focus = "Security vulnerabilities and attack surface"
description = """
Review the code for security vulnerabilities.

**Look for:**
- Input validation gaps
- Authentication/authorization bypasses
- Injection vulnerabilities (SQL, XSS, command, LDAP)
- Sensitive data exposure (logs, errors, responses)
- Hardcoded secrets or credentials
- Insecure cryptographic usage
- Path traversal vulnerabilities
- SSRF (Server-Side Request Forgery)
- Deserialization vulnerabilities
- OWASP Top 10 concerns

**Questions to answer:**
- What can a malicious user do with this code?
- What data could be exposed if this fails?
- Are there defense-in-depth gaps?
"""

# Analysis leg: elegance. Selected by presets: full, refactor.
[[legs]]
id = "elegance"
title = "Elegance Review"
focus = "Design clarity and abstraction quality"
description = """
Review the code for design quality.

**Look for:**
- Unclear abstractions or naming
- Functions doing too many things
- Missing or over-engineered abstractions
- Coupling that should be loose
- Dependencies that flow the wrong direction
- Unclear data flow or control flow
- Magic numbers/strings without explanation
- Inconsistent design patterns
- Violation of SOLID principles
- Reinventing existing utilities

**Questions to answer:**
- Would a new team member understand this?
- Does the structure match the problem domain?
- Is the complexity justified?
"""

# Analysis leg: resilience. Selected by presets: full, security-focused.
[[legs]]
id = "resilience"
title = "Resilience Review"
focus = "Error handling and failure modes"
description = """
Review the code for resilience and error handling.

**Look for:**
- Swallowed errors or empty catch blocks
- Missing error propagation
- Unclear error messages
- Insufficient retry/backoff logic
- Missing timeout handling
- Resource cleanup on failure (files, connections)
- Partial failure states
- Missing circuit breakers for external calls
- Unhelpful panic/crash behavior
- Recovery path gaps

**Questions to answer:**
- What happens when external services fail?
- Can the system recover from partial failures?
- Are errors actionable for operators?
"""

# Analysis leg: style. Selected by presets: full, refactor.
[[legs]]
id = "style"
title = "Style Review"
focus = "Convention compliance and consistency"
description = """
Review the code for style and convention compliance.

**Look for:**
- Naming convention violations
- Formatting inconsistencies
- Import organization issues
- Comment quality (missing, outdated, or obvious)
- Documentation gaps for public APIs
- Inconsistent patterns within the codebase
- Lint/format violations
- Test naming and organization
- Log message quality and levels

**Questions to answer:**
- Does this match the rest of the codebase?
- Would the style guide approve?
- Is the code self-documenting where possible?
"""

# Analysis leg: smells. Selected by presets: gate, full, refactor.
[[legs]]
id = "smells"
title = "Code Smells Review"
focus = "Anti-patterns and technical debt"
description = """
Review the code for code smells and anti-patterns.

**Look for:**
- Long methods (>50 lines is suspicious)
- Deep nesting (>3 levels)
- Shotgun surgery patterns
- Feature envy
- Data clumps
- Primitive obsession
- Temporary fields
- Refused bequest
- Speculative generality
- God classes/functions
- Copy-paste code (DRY violations)
- TODO/FIXME accumulation

**Questions to answer:**
- What will cause pain during the next change?
- What would you refactor if you owned this code?
- Is technical debt being added or paid down?
"""

# ============================================================================
# VERIFICATION LEGS - Check implementation quality (not just code analysis)
# ============================================================================

# Verification leg: wiring. Selected by presets: gate, full, security-focused.
# Its findings feed the "Wiring Gaps" section of the synthesis prompt.
[[legs]]
id = "wiring"
title = "Wiring Review"
focus = "Installed-but-not-wired gaps"
description = """
Detect dependencies, configs, or libraries that were added but not actually used.

This catches subtle bugs where the implementer THINKS they integrated something,
but the old implementation is still being used.

**Look for:**
- New dependency in manifest but never imported
  - Go: module in go.mod but no import
  - Rust: crate in Cargo.toml but no `use`
  - Node: package in package.json but no import/require

- SDK added but old implementation remains
  - Added Sentry but still using console.error for errors
  - Added Zod but still using manual typeof validation

- Config/env var defined but never loaded
  - New .env var that isn't accessed in code

**Questions to answer:**
- Is every new dependency actually used?
- Are there old patterns that should have been replaced?
- Is there dead config that suggests incomplete migration?
"""

# Verification leg: commit-discipline. Selected by presets: full, refactor.
# Its findings feed the "Commit Quality" section of the synthesis prompt.
[[legs]]
id = "commit-discipline"
title = "Commit Discipline Review"
focus = "Commit quality and atomicity"
description = """
Review commit history for good practices.

Good commits make the codebase easier to understand, bisect, and revert.

**Look for:**
- Giant "WIP" or "fix" commits
  - Multiple unrelated changes in one commit
  - Commits that touch 20+ files across different features

- Poor commit messages
  - "stuff", "update", "asdf", "fix"
  - No context about WHY the change was made

- Unatomic commits
  - Feature + refactor + bugfix in same commit
  - Should be separable logical units

- Missing type prefixes (if project uses conventional commits)
  - feat:, fix:, refactor:, test:, docs:, chore:

**Questions to answer:**
- Could this history be bisected effectively?
- Would a reviewer understand the progression?
- Are commits atomic (one logical change each)?
"""

# Verification leg: test-quality. Selected by presets: gate, full.
# Its findings feed the "Test Quality" section of the synthesis prompt.
[[legs]]
id = "test-quality"
title = "Test Quality Review"
focus = "Test meaningfulness, not just coverage"
description = """
Verify tests are actually testing something meaningful.

Coverage numbers lie. A test that can't fail provides no value.

**Look for:**
- Weak assertions
  - Only checking != nil / !== null / is not None
  - Using .is_ok() without checking the value
  - assertTrue(true) or equivalent

- Missing negative test cases
  - Happy path only, no error cases
  - No boundary testing
  - No invalid input testing

- Tests that can't fail
  - Mocked so heavily the test is meaningless
  - Testing implementation details, not behavior

- Flaky test indicators
  - Sleep/delay in tests
  - Time-dependent assertions

**Questions to answer:**
- Do these tests actually verify behavior?
- Would a bug in the implementation cause a test failure?
- Are edge cases and error paths tested?
"""

# ============================================================================
# PRESETS - Configurable leg selection
# ============================================================================

# Each preset names a subset of the leg ids declared in the [[legs]] entries.
# Keep the "Presets" list in the top-level description in sync with this table.

# Fast blocker check: two verification legs plus security and smells.
[presets.gate]
description = "Light review for automatic flow - fast, focused on blockers"
legs = ["wiring", "security", "smells", "test-quality"]

# Every leg defined in this file.
[presets.full]
description = "Comprehensive review - all legs, for major features"
legs = [
    "correctness",
    "performance",
    "security",
    "elegance",
    "resilience",
    "style",
    "smells",
    "wiring",
    "commit-discipline",
    "test-quality",
]

[presets.security-focused]
description = "Security-heavy review for sensitive changes"
legs = ["security", "resilience", "correctness", "wiring"]

[presets.refactor]
description = "Review focused on code quality during refactoring"
legs = ["elegance", "smells", "style", "commit-discipline"]

# Synthesis step - combines all leg outputs into a single review document.
# The prompt reads and writes under [output].directory and names its result
# after [output].synthesis. Because presets run only a subset of legs, the
# prompt tells the synthesizer to drop sections whose source leg did not run.
[synthesis]
title = "Review Synthesis"
description = """
Combine all leg findings into a unified, prioritized review.

**Your input:**
All leg findings from: {{.output.directory}}/

**Your output:**
A synthesized review at: {{.output.directory}}/{{.output.synthesis}}

**Structure:**
1. **Executive Summary** - Overall assessment, merge recommendation
2. **Critical Issues** - P0 items from all legs, deduplicated
3. **Major Issues** - P1 items, grouped by theme
4. **Minor Issues** - P2 items, briefly listed
5. **Wiring Gaps** - Dependencies added but not used (from wiring leg)
6. **Commit Quality** - Notes on commit discipline
7. **Test Quality** - Assessment of test meaningfulness
8. **Positive Observations** - What's done well
9. **Recommendations** - Actionable next steps

If a leg did not run (presets select a subset of legs), omit its dedicated
section instead of inventing findings, and note which legs were included.

Deduplicate issues found by multiple legs (note which legs found them).
Prioritize by impact and effort. Be actionable.
"""
# Lists every leg id defined in this file.
# NOTE(review): presets may exclude some of these legs - confirm the runner
# ignores dependencies on legs that were not selected for the run.
depends_on = [
    "correctness",
    "performance",
    "security",
    "elegance",
    "resilience",
    "style",
    "smells",
    "wiring",
    "commit-discipline",
    "test-quality",
]