Fix daemon crash recovery race conditions (bd-147)
Improvements based on oracle code review:

- Move socket cleanup AFTER lock acquisition (prevents unlinking live sockets)
- Add a PID liveness check before removing a stale socket
- Add stale lock detection with a retry mechanism
- Tighten directory permissions to 0700 for security
- Improve socket readiness probing with shorter timeouts
- Make removeOldSocket() ignore ENOENT errors

Fixes a race condition where the socket could be removed during the daemon startup window, potentially orphaning a running daemon process.

Amp-Thread-ID: https://ampcode.com/threads/T-63542c60-b5b9-4a34-9f22-415d9d7e8223
Co-authored-by: Amp <amp@ampcode.com>
This commit is contained in:
@@ -161,15 +161,27 @@ func (s *Server) Stop() error {
|
||||
|
||||
func (s *Server) ensureSocketDir() error {
|
||||
dir := filepath.Dir(s.socketPath)
|
||||
if err := os.MkdirAll(dir, 0755); err != nil {
|
||||
if err := os.MkdirAll(dir, 0700); err != nil {
|
||||
return err
|
||||
}
|
||||
// Best-effort tighten permissions if directory already existed
|
||||
_ = os.Chmod(dir, 0700)
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *Server) removeOldSocket() error {
|
||||
if _, err := os.Stat(s.socketPath); err == nil {
|
||||
if err := os.Remove(s.socketPath); err != nil {
|
||||
// Socket exists - check if it's stale before removing
|
||||
// Try to connect to see if a daemon is actually using it
|
||||
conn, err := net.DialTimeout("unix", s.socketPath, 500*time.Millisecond)
|
||||
if err == nil {
|
||||
// Socket is active - another daemon is running
|
||||
conn.Close()
|
||||
return fmt.Errorf("socket %s is in use by another daemon", s.socketPath)
|
||||
}
|
||||
|
||||
// Socket is stale - safe to remove
|
||||
if err := os.Remove(s.socketPath); err != nil && !os.IsNotExist(err) {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user