feat(local-inference): add TTL support for automatic model unloading
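Add a `globalTTL` option and a per-model `ttl` option to the llama-swap config so that idle models are automatically unloaded from memory. A per-model `ttl` of -1 (the default) falls back to `globalTTL`, and a value of 0 means the model is never unloaded.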
This commit is contained in:
@@ -30,6 +30,7 @@ with lib;
|
|||||||
enable = true;
|
enable = true;
|
||||||
host = "zix790prors.oglehome";
|
host = "zix790prors.oglehome";
|
||||||
openFirewall = true;
|
openFirewall = true;
|
||||||
|
globalTTL = 900;
|
||||||
models = {
|
models = {
|
||||||
"Qwen3.6-35B-A3B" = {
|
"Qwen3.6-35B-A3B" = {
|
||||||
hf-model = "unsloth/Qwen3.6-35B-A3B-GGUF:UD-Q8_K_XL";
|
hf-model = "unsloth/Qwen3.6-35B-A3B-GGUF:UD-Q8_K_XL";
|
||||||
|
|||||||
@@ -48,6 +48,11 @@ in
|
|||||||
default = [ ];
|
default = [ ];
|
||||||
description = "Extra arguments passed to llama-server";
|
description = "Extra arguments passed to llama-server";
|
||||||
};
|
};
|
||||||
|
ttl = mkOption {
|
||||||
|
type = types.int;
|
||||||
|
default = -1;
|
||||||
|
description = "Seconds before unloading model (-1 = use global default, 0 = never unload)";
|
||||||
|
};
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
@@ -78,6 +83,12 @@ in
|
|||||||
default = 600;
|
default = 600;
|
||||||
description = "Seconds to wait for llama-server health check (model download can take a while)";
|
description = "Seconds to wait for llama-server health check (model download can take a while)";
|
||||||
};
|
};
|
||||||
|
|
||||||
|
globalTTL = mkOption {
|
||||||
|
type = types.int;
|
||||||
|
default = 0;
|
||||||
|
description = "Default TTL in seconds before unloading an idle model (0 = never unload)";
|
||||||
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
config = mkIf cfg.enable {
|
config = mkIf cfg.enable {
|
||||||
@@ -98,10 +109,15 @@ in
|
|||||||
openFirewall = cfg.openFirewall;
|
openFirewall = cfg.openFirewall;
|
||||||
settings = {
|
settings = {
|
||||||
healthCheckTimeout = cfg.healthCheckTimeout;
|
healthCheckTimeout = cfg.healthCheckTimeout;
|
||||||
models = mapAttrs (_: m: {
|
globalTTL = cfg.globalTTL;
|
||||||
cmd = "${llama-server} --port \${PORT} -hf ${m.hf-model} -ngl ${toString m.n-gpu-layers} --no-webui ${optionalString m.cpu-moe "--cpu-moe"} ${concatStringsSep " " m.extraArgs}";
|
models = mapAttrs (
|
||||||
aliases = m.aliases;
|
name: m:
|
||||||
}) cfg.models;
|
{
|
||||||
|
cmd = "${llama-server} --port \${PORT} -hf ${m.hf-model} -ngl ${toString m.n-gpu-layers} --no-webui ${optionalString m.cpu-moe "--cpu-moe"} ${concatStringsSep " " m.extraArgs}";
|
||||||
|
aliases = m.aliases;
|
||||||
|
}
|
||||||
|
// optionalAttrs (m.ttl != -1) { ttl = m.ttl; }
|
||||||
|
) cfg.models;
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|||||||
Reference in New Issue
Block a user