Fix daemon crash recovery race conditions (bd-147)

Improvements based on oracle code review:
- Move socket cleanup AFTER lock acquisition (prevents unlinking live sockets)
- Add PID liveness check before removing stale socket
- Add stale lock detection with retry mechanism
- Tighten directory permissions to 0700 for security
- Improve socket readiness probing with shorter timeouts
- Make removeOldSocket() ignore ENOENT errors

Fixes race condition where socket could be removed during daemon startup window,
potentially orphaning a running daemon process.

Amp-Thread-ID: https://ampcode.com/threads/T-63542c60-b5b9-4a34-9f22-415d9d7e8223
Co-authored-by: Amp <amp@ampcode.com>
This commit is contained in:
Steve Yegge
2025-10-18 13:59:06 -07:00
parent 9e2ee1889f
commit bb13f4634b
3 changed files with 128 additions and 10 deletions

View File

@@ -161,15 +161,27 @@ func (s *Server) Stop() error {
func (s *Server) ensureSocketDir() error {
dir := filepath.Dir(s.socketPath)
if err := os.MkdirAll(dir, 0755); err != nil {
if err := os.MkdirAll(dir, 0700); err != nil {
return err
}
// Best-effort tighten permissions if directory already existed
_ = os.Chmod(dir, 0700)
return nil
}
func (s *Server) removeOldSocket() error {
if _, err := os.Stat(s.socketPath); err == nil {
if err := os.Remove(s.socketPath); err != nil {
// Socket exists - check if it's stale before removing
// Try to connect to see if a daemon is actually using it
conn, err := net.DialTimeout("unix", s.socketPath, 500*time.Millisecond)
if err == nil {
// Socket is active - another daemon is running
conn.Close()
return fmt.Errorf("socket %s is in use by another daemon", s.socketPath)
}
// Socket is stale - safe to remove
if err := os.Remove(s.socketPath); err != nil && !os.IsNotExist(err) {
return err
}
}