# Adds globalTTL and per-model ttl options to the llama-swap config, allowing
# idle models to be automatically unloaded from memory.
{
  config,
  lib,
  pkgs,
  nixpkgs-unstable,
  ...
}:

let
  inherit (lib)
    concatStringsSep
    getExe'
    mapAttrs
    mkEnableOption
    mkIf
    mkOption
    optionalAttrs
    optionalString
    types
    ;

  cfg = config.roles.local-inference;

  # llama.cpp taken from the unstable package set, rebuilt with CUDA support
  # so layers can actually be offloaded to the GPU (see n-gpu-layers below).
  llama-cpp-cuda = pkgs.unstable.llama-cpp.override { cudaSupport = true; };
  llama-server = getExe' llama-cpp-cuda "llama-server";
in
{
  # Pull the llama-swap module from nixpkgs-unstable and disable the copy
  # shipped with the system's (stable) nixpkgs so the two do not conflict.
  imports = [ "${nixpkgs-unstable}/nixos/modules/services/networking/llama-swap.nix" ];
  disabledModules = [ "services/networking/llama-swap.nix" ];

  options.roles.local-inference = {
    # mkEnableOption prepends "Whether to enable …", so the argument must be
    # a noun phrase (the previous "Enable local LLM …" rendered as
    # "Whether to enable Enable local LLM …").
    enable = mkEnableOption "local LLM inference via llama-swap + llama.cpp";

    models = mkOption {
      type = types.attrsOf (
        types.submodule {
          options = {
            hf-model = mkOption {
              type = types.str;
              description = "HuggingFace model shorthand (e.g. unsloth/Qwen3.6-35B-A3B-GGUF:UD-Q8_K_XL)";
            };

            aliases = mkOption {
              type = types.listOf types.str;
              default = [ ];
              description = "Aliases for the model in the API";
            };

            n-gpu-layers = mkOption {
              type = types.int;
              default = 99;
              description = "Number of layers to offload to GPU";
            };

            cpu-moe = mkOption {
              type = types.bool;
              default = false;
              description = "Offload MoE expert layers to CPU";
            };

            extraArgs = mkOption {
              type = types.listOf types.str;
              default = [ ];
              description = "Extra arguments passed to llama-server";
            };

            # -1 is a sentinel meaning "omit the per-model ttl key entirely",
            # which makes llama-swap fall back to globalTTL (see below).
            ttl = mkOption {
              type = types.int;
              default = -1;
              description = "Seconds before unloading model (-1 = use global default, 0 = never unload)";
            };
          };
        }
      );
      default = { };
      description = "Models to serve from HuggingFace";
    };

    host = mkOption {
      type = types.str;
      default = "127.0.0.1";
      description = "IP address llama-swap listens on";
    };

    port = mkOption {
      type = types.port;
      default = 8080;
      description = "Port llama-swap listens on";
    };

    openFirewall = mkOption {
      type = types.bool;
      default = false;
      description = "Open the server port in the firewall";
    };

    healthCheckTimeout = mkOption {
      type = types.int;
      default = 600;
      description = "Seconds to wait for llama-server health check (model download can take a while)";
    };

    globalTTL = mkOption {
      type = types.int;
      default = 0;
      description = "Default TTL in seconds before unloading an idle model (0 = never unload)";
    };
  };

  config = mkIf cfg.enable {
    # llama-server downloads -hf models into LLAMA_CACHE; HOME is pointed at
    # the state dir so any tooling that writes under $HOME stays contained.
    systemd.services.llama-swap.environment = {
      LLAMA_CACHE = "/var/cache/llama-swap";
      HOME = "/var/lib/llama-swap";
    };

    # systemd creates /var/cache/llama-swap and /var/lib/llama-swap with
    # ownership matching the service user.
    systemd.services.llama-swap.serviceConfig = {
      CacheDirectory = "llama-swap";
      StateDirectory = "llama-swap";
    };

    services.llama-swap = {
      enable = true;
      listenAddress = cfg.host;
      port = cfg.port;
      openFirewall = cfg.openFirewall;

      settings = {
        healthCheckTimeout = cfg.healthCheckTimeout;
        # NOTE(review): verify that the deployed llama-swap version accepts a
        # top-level `globalTTL` key in its config; per-model `ttl` is the
        # long-documented mechanism — TODO confirm against upstream docs.
        globalTTL = cfg.globalTTL;

        models = mapAttrs (
          name: m:
          {
            # \${PORT} is escaped so llama-swap (not Nix) substitutes the port
            # it assigns to this model's llama-server instance.
            cmd = "${llama-server} --port \${PORT} -hf ${m.hf-model} -ngl ${toString m.n-gpu-layers} --no-webui ${optionalString m.cpu-moe "--cpu-moe"} ${concatStringsSep " " m.extraArgs}";
            aliases = m.aliases;
          }
          # Only emit a per-model ttl when the user set one; otherwise the
          # key is absent and globalTTL applies.
          // optionalAttrs (m.ttl != -1) { ttl = m.ttl; }
        ) cfg.models;
      };
    };
  };
}