From 170a27310efa062302ada48de32cb5719768a180 Mon Sep 17 00:00:00 2001
From: John Ogle <john@ogle.fyi>
Date: Thu, 16 Apr 2026 15:37:02 -0700
Subject: [PATCH] feat(local-inference): add TTL support for automatic model
 unloading

Add globalTTL and per-model ttl options to llama-swap config,
allowing idle models to be automatically unloaded from memory.
---
 machines/zix790prors/configuration.nix |  1 +
 roles/local-inference/default.nix      | 24 ++++++++++++++++++++----
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/machines/zix790prors/configuration.nix b/machines/zix790prors/configuration.nix
index f9b9ca6..6e16303 100644
--- a/machines/zix790prors/configuration.nix
+++ b/machines/zix790prors/configuration.nix
@@ -30,6 +30,7 @@ with lib;
       enable = true;
       host = "zix790prors.oglehome";
       openFirewall = true;
+      globalTTL = 900;
       models = {
         "Qwen3.6-35B-A3B" = {
           hf-model = "unsloth/Qwen3.6-35B-A3B-GGUF:UD-Q8_K_XL";
diff --git a/roles/local-inference/default.nix b/roles/local-inference/default.nix
index a55e94c..f5ecaba 100644
--- a/roles/local-inference/default.nix
+++ b/roles/local-inference/default.nix
@@ -48,6 +48,11 @@ in
               default = [ ];
               description = "Extra arguments passed to llama-server";
             };
+            ttl = mkOption {
+              type = types.addCheck types.int (v: v >= -1); # -1 is the only meaningful negative value (sentinel)
+              default = -1;
+              description = "Seconds before unloading model (-1 = use global default, 0 = never unload)";
+            };
           };
         }
       );
@@ -78,6 +83,12 @@ in
       default = 600;
       description = "Seconds to wait for llama-server health check (model download can take a while)";
     };
+
+    globalTTL = mkOption {
+      type = types.ints.unsigned; # reject negative timeouts at evaluation time
+      default = 0;
+      description = "Default TTL in seconds before unloading an idle model (0 = never unload)";
+    };
   };
 
   config = mkIf cfg.enable {
@@ -98,10 +109,15 @@ in
       openFirewall = cfg.openFirewall;
       settings = {
         healthCheckTimeout = cfg.healthCheckTimeout;
-        models = mapAttrs (_: m: {
-          cmd = "${llama-server} --port \${PORT} -hf ${m.hf-model} -ngl ${toString m.n-gpu-layers} --no-webui ${optionalString m.cpu-moe "--cpu-moe"} ${concatStringsSep " " m.extraArgs}";
-          aliases = m.aliases;
-        }) cfg.models;
+        globalTTL = cfg.globalTTL;
+        models = mapAttrs (
+          _: m:
+          {
+            cmd = "${llama-server} --port \${PORT} -hf ${m.hf-model} -ngl ${toString m.n-gpu-layers} --no-webui ${optionalString m.cpu-moe "--cpu-moe"} ${concatStringsSep " " m.extraArgs}";
+            aliases = m.aliases;
+          }
+          // optionalAttrs (m.ttl != -1) { ttl = m.ttl; } # leave ttl out entirely when it is the -1 sentinel
+        ) cfg.models;
       };
     };
   };