From 170a27310efa062302ada48de32cb5719768a180 Mon Sep 17 00:00:00 2001
From: John Ogle
Date: Thu, 16 Apr 2026 15:37:02 -0700
Subject: [PATCH] feat(local-inference): add TTL support for automatic model unloading

Add globalTTL and per-model ttl options to llama-swap config, allowing
idle models to be automatically unloaded from memory.
---
Review notes (text in this section is not part of the commit message):

* Line breaks and indentation were rebuilt from a whitespace-collapsed
  copy of this patch. Context-line indentation is a best guess, so use
  `git apply --ignore-whitespace` if a hunk fails to match. The post-image
  blob hash on the second "index" line predates the edits below.
* ttl: type narrowed from types.int to integers >= -1. -1 is the only
  documented sentinel ("use global default"); other negative values were
  previously forwarded to the generated settings unchecked.
* globalTTL: type narrowed from types.int to types.ints.unsigned, matching
  its description (seconds, 0 = never unload).
* models: the mapAttrs callback ignores the attribute name, so it keeps the
  `_` placeholder used by the code it replaces instead of an unused `name`.
* A per-model `ttl` key is only emitted when ttl != -1; otherwise the key
  is omitted from that model's settings.

 machines/zix790prors/configuration.nix |  1 +
 roles/local-inference/default.nix      | 24 ++++++++++++++++++++----
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/machines/zix790prors/configuration.nix b/machines/zix790prors/configuration.nix
index f9b9ca6..6e16303 100644
--- a/machines/zix790prors/configuration.nix
+++ b/machines/zix790prors/configuration.nix
@@ -30,6 +30,7 @@ with lib;
       enable = true;
       host = "zix790prors.oglehome";
       openFirewall = true;
+      globalTTL = 900;
       models = {
         "Qwen3.6-35B-A3B" = {
           hf-model = "unsloth/Qwen3.6-35B-A3B-GGUF:UD-Q8_K_XL";
diff --git a/roles/local-inference/default.nix b/roles/local-inference/default.nix
index a55e94c..f5ecaba 100644
--- a/roles/local-inference/default.nix
+++ b/roles/local-inference/default.nix
@@ -48,6 +48,11 @@ in
               default = [ ];
               description = "Extra arguments passed to llama-server";
             };
+            ttl = mkOption {
+              type = types.addCheck types.int (seconds: seconds >= -1);
+              default = -1;
+              description = "Seconds before unloading model (-1 = use global default, 0 = never unload)";
+            };
           };
         }
       );
@@ -78,6 +83,12 @@ in
       default = 600;
       description = "Seconds to wait for llama-server health check (model download can take a while)";
     };
+
+    globalTTL = mkOption {
+      type = types.ints.unsigned;
+      default = 0;
+      description = "Default TTL in seconds before unloading an idle model (0 = never unload)";
+    };
   };
 
   config = mkIf cfg.enable {
@@ -98,10 +109,15 @@ in
       openFirewall = cfg.openFirewall;
       settings = {
         healthCheckTimeout = cfg.healthCheckTimeout;
-        models = mapAttrs (_: m: {
-          cmd = "${llama-server} --port \${PORT} -hf ${m.hf-model} -ngl ${toString m.n-gpu-layers} --no-webui ${optionalString m.cpu-moe "--cpu-moe"} ${concatStringsSep " " m.extraArgs}";
-          aliases = m.aliases;
-        }) cfg.models;
+        globalTTL = cfg.globalTTL;
+        models = mapAttrs (
+          _: m:
+          {
+            cmd = "${llama-server} --port \${PORT} -hf ${m.hf-model} -ngl ${toString m.n-gpu-layers} --no-webui ${optionalString m.cpu-moe "--cpu-moe"} ${concatStringsSep " " m.extraArgs}";
+            aliases = m.aliases;
+          }
+          // optionalAttrs (m.ttl != -1) { ttl = m.ttl; }
+        ) cfg.models;
       };
     };
   };