feat(local-inference): add TTL support for automatic model unloading
Add globalTTL and per-model ttl options to llama-swap config, allowing idle models to be automatically unloaded from memory.
This commit is contained in:
@@ -48,6 +48,11 @@ in
|
||||
default = [ ];
|
||||
description = "Extra arguments passed to llama-server";
|
||||
};
|
||||
ttl = mkOption {
|
||||
type = types.int;
|
||||
default = -1;
|
||||
description = "Idle seconds before this model is unloaded (-1 = use globalTTL, 0 = never unload)";
|
||||
};
|
||||
};
|
||||
}
|
||||
);
|
||||
@@ -78,6 +83,12 @@ in
|
||||
default = 600;
|
||||
description = "Seconds to wait for llama-server health check (model download can take a while)";
|
||||
};
|
||||
|
||||
globalTTL = mkOption {
|
||||
type = types.int;
|
||||
default = 0;
|
||||
description = "Default idle TTL in seconds applied to models whose ttl is -1 (0 = never unload)";
|
||||
};
|
||||
};
|
||||
|
||||
config = mkIf cfg.enable {
|
||||
@@ -98,10 +109,15 @@ in
|
||||
openFirewall = cfg.openFirewall;
|
||||
settings = {
|
||||
healthCheckTimeout = cfg.healthCheckTimeout;
|
||||
models = mapAttrs (_: m: {
|
||||
cmd = "${llama-server} --port \${PORT} -hf ${m.hf-model} -ngl ${toString m.n-gpu-layers} --no-webui ${optionalString m.cpu-moe "--cpu-moe"} ${concatStringsSep " " m.extraArgs}";
|
||||
aliases = m.aliases;
|
||||
}) cfg.models;
|
||||
globalTTL = cfg.globalTTL;
|
||||
models = mapAttrs (
|
||||
name: m:
|
||||
{
|
||||
cmd = "${llama-server} --port \${PORT} -hf ${m.hf-model} -ngl ${toString m.n-gpu-layers} --no-webui ${optionalString m.cpu-moe "--cpu-moe"} ${concatStringsSep " " m.extraArgs}";
|
||||
aliases = m.aliases;
|
||||
}
|
||||
// optionalAttrs (m.ttl != -1) { ttl = m.ttl; }
|
||||
) cfg.models;
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user