feat(local-inference): add TTL support for automatic model unloading
Add globalTTL and per-model ttl options to llama-swap config, allowing idle models to be automatically unloaded from memory.
This commit is contained in:
@@ -48,6 +48,11 @@ in
|
||||
default = [ ];
|
||||
description = "Extra arguments passed to llama-server";
|
||||
};
|
||||
ttl = mkOption {
|
||||
type = types.int;
|
||||
default = -1;
|
||||
description = "Idle seconds before this model is unloaded (-1 = use globalTTL, 0 = never unload)";
|
||||
};
|
||||
};
|
||||
}
|
||||
);
|
||||
@@ -78,6 +83,12 @@ in
|
||||
default = 600;
|
||||
description = "Seconds to wait for llama-server health check (model download can take a while)";
|
||||
};
|
||||
|
||||
globalTTL = mkOption {
|
||||
type = types.int;
|
||||
default = 0;
|
||||
description = "Default idle TTL in seconds applied to models whose ttl is -1 (0 = never unload)";
|
||||
};
|
||||
};
|
||||
|
||||
config = mkIf cfg.enable {
|
||||
@@ -98,10 +109,15 @@ in
|
||||
openFirewall = cfg.openFirewall;
|
||||
settings = {
|
||||
healthCheckTimeout = cfg.healthCheckTimeout;
|
||||
models = mapAttrs (_: m: {
|
||||
cmd = "${llama-server} --port \${PORT} -hf ${m.hf-model} -ngl ${toString m.n-gpu-layers} --no-webui ${optionalString m.cpu-moe "--cpu-moe"} ${concatStringsSep " " m.extraArgs}";
|
||||
aliases = m.aliases;
|
||||
}) cfg.models;
|
||||
globalTTL = cfg.globalTTL;
|
||||
models = mapAttrs (
|
||||
name: m:
|
||||
{
|
||||
cmd = "${llama-server} --port \${PORT} -hf ${m.hf-model} -ngl ${toString m.n-gpu-layers} --no-webui ${optionalString m.cpu-moe "--cpu-moe"} ${concatStringsSep " " m.extraArgs}";
|
||||
aliases = m.aliases;
|
||||
}
|
||||
// optionalAttrs (m.ttl != -1) { ttl = m.ttl; }
|
||||
) cfg.models;
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user