WIP: bd-164 timestamp-only export deduplication (~80% complete)

Implemented content hash-based deduplication to skip exports when only
timestamps changed. Core logic complete, needs export_hashes table wiring.

Completed:
- Added computeIssueContentHash() excluding timestamps
- Created shouldSkipExport() logic
- Updated export loop to skip timestamp-only changes
- Added hash.go with content hashing
- Extended Storage interface

Remaining:
- Complete export_hashes table migration
- Add SetExportHash/GetExportHash to interface
- Revert content_hash from dirty_issues approach
- Wire up hash persistence in export
- Testing

See bd-164 notes for details.

Amp-Thread-ID: https://ampcode.com/threads/T-d70657d1-4433-4f7e-b10a-3fccf8bf17fb
Co-authored-by: Amp <amp@ampcode.com>
This commit is contained in:
Steve Yegge
2025-10-26 20:29:10 -07:00
parent d05e94d80d
commit a898df6915
8 changed files with 210 additions and 21 deletions

View File

@@ -2,18 +2,71 @@ package main
import (
"context"
"crypto/sha256"
"encoding/hex"
"encoding/json"
"fmt"
"os"
"path/filepath"
"sort"
"strings"
"time"
"github.com/spf13/cobra"
"github.com/steveyegge/beads/internal/storage"
"github.com/steveyegge/beads/internal/storage/sqlite"
"github.com/steveyegge/beads/internal/types"
)
// computeIssueContentHash computes a SHA256 hash of an issue's content, excluding timestamps.
// This is used for detecting timestamp-only changes during export deduplication.
func computeIssueContentHash(issue *types.Issue) (string, error) {
// Clone issue and zero out timestamps to exclude them from hash
normalized := *issue
normalized.CreatedAt = time.Time{}
normalized.UpdatedAt = time.Time{}
// Also zero out ClosedAt if present
if normalized.ClosedAt != nil {
zeroTime := time.Time{}
normalized.ClosedAt = &zeroTime
}
// Serialize to JSON
data, err := json.Marshal(normalized)
if err != nil {
return "", err
}
// SHA256 hash
hash := sha256.Sum256(data)
return hex.EncodeToString(hash[:]), nil
}
// shouldSkipExport checks if an issue should be skipped during export because
// it only has timestamp changes (no actual content changes).
func shouldSkipExport(ctx context.Context, store storage.Storage, issue *types.Issue) (bool, error) {
// Get the stored hash from dirty_issues table
storedHash, err := store.GetDirtyIssueHash(ctx, issue.ID)
if err != nil {
return false, err
}
// If no hash stored, we must export (first export or old data)
if storedHash == "" {
return false, nil
}
// Compute current hash
currentHash, err := computeIssueContentHash(issue)
if err != nil {
return false, err
}
// If hashes match, only timestamps changed - skip export
return currentHash == storedHash, nil
}
// countIssuesInJSONL counts the number of issues in a JSONL file
func countIssuesInJSONL(path string) (int, error) {
file, err := os.Open(path)
@@ -221,16 +274,35 @@ Output to stdout by default, or use -o flag for file output.`,
out = tempFile
}
// Write JSONL
// Write JSONL (with timestamp-only deduplication for bd-164)
encoder := json.NewEncoder(out)
exportedIDs := make([]string, 0, len(issues))
skippedCount := 0
for _, issue := range issues {
// Check if this is only a timestamp change (bd-164)
skip, err := shouldSkipExport(ctx, store, issue)
if err != nil {
// Log warning but continue - don't fail export on hash check errors
fmt.Fprintf(os.Stderr, "Warning: failed to check if %s should skip: %v\n", issue.ID, err)
skip = false
}
if skip {
skippedCount++
continue
}
if err := encoder.Encode(issue); err != nil {
fmt.Fprintf(os.Stderr, "Error encoding issue %s: %v\n", issue.ID, err)
os.Exit(1)
}
exportedIDs = append(exportedIDs, issue.ID)
}
// Report skipped issues if any (helps debugging bd-159)
if skippedCount > 0 && (output == "" || output == findJSONLPath()) {
fmt.Fprintf(os.Stderr, "Skipped %d issue(s) with timestamp-only changes\n", skippedCount)
}
// Only clear dirty issues and auto-flush state if exporting to the default JSONL path
// This prevents clearing dirty flags when exporting to custom paths (e.g., bd export -o backup.jsonl)