feat(local-inference): add TTL support for automatic model unloading
Some checks failed
CI / check (push) Failing after 1m44s
CI / build-and-cache (push) Has been skipped

Add globalTTL and per-model ttl options to llama-swap config,
allowing idle models to be automatically unloaded from memory.
This commit is contained in:
2026-04-16 15:37:02 -07:00
parent bd377676ed
commit 170a27310e
2 changed files with 21 additions and 4 deletions

View File

@@ -30,6 +30,7 @@ with lib;
enable = true;
host = "zix790prors.oglehome";
openFirewall = true;
globalTTL = 900;
models = {
"Qwen3.6-35B-A3B" = {
hf-model = "unsloth/Qwen3.6-35B-A3B-GGUF:UD-Q8_K_XL";

View File

@@ -48,6 +48,11 @@ in
default = [ ];
description = "Extra arguments passed to llama-server";
};
# Per-model idle timeout in seconds. The default of -1 is a sentinel meaning
# "no per-model override".
ttl = mkOption {
  description = "Seconds before unloading model (-1 = use global default, 0 = never unload)";
  default = -1;
  type = types.int;
};
};
}
);
@@ -78,6 +83,12 @@ in
default = 600;
description = "Seconds to wait for llama-server health check (model download can take a while)";
};
# Default idle timeout in seconds; the default value of 0 means idle models
# are never unloaded.
globalTTL = mkOption {
  description = "Default TTL in seconds before unloading an idle model (0 = never unload)";
  default = 0;
  type = types.int;
};
};
config = mkIf cfg.enable {
@@ -98,10 +109,15 @@ in
openFirewall = cfg.openFirewall;
settings = {
healthCheckTimeout = cfg.healthCheckTimeout;
models = mapAttrs (_: m: {
cmd = "${llama-server} --port \${PORT} -hf ${m.hf-model} -ngl ${toString m.n-gpu-layers} --no-webui ${optionalString m.cpu-moe "--cpu-moe"} ${concatStringsSep " " m.extraArgs}";
aliases = m.aliases;
}) cfg.models;
globalTTL = cfg.globalTTL;
# Turn every configured model into a settings entry: the llama-server command
# line, its aliases, and (only when explicitly set) a per-model TTL.
models = mapAttrs (
  # The attribute name is not used when building the entry, so it is bound
  # to `_` instead of a named (and unused) argument.
  _: m:
  {
    # `\${PORT}` is escaped so the literal text ${PORT} reaches the generated
    # settings instead of being interpolated by Nix.
    cmd = "${llama-server} --port \${PORT} -hf ${m.hf-model} -ngl ${toString m.n-gpu-layers} --no-webui ${optionalString m.cpu-moe "--cpu-moe"} ${concatStringsSep " " m.extraArgs}";
    inherit (m) aliases;
  }
  # -1 is the "use global default" sentinel of the `ttl` option; in that case
  # the `ttl` key is left out of the entry entirely.
  // optionalAttrs (m.ttl != -1) { inherit (m) ttl; }
) cfg.models;
};
};
};