{ config, lib, pkgs, nixpkgs-unstable, ... }:

with lib;

let
  cfg = config.roles.local-inference;

  # llama.cpp from the unstable package set, rebuilt with CUDA enabled.
  llama-cpp-cuda = pkgs.unstable.llama-cpp.override { cudaSupport = true; };
  llama-server = getExe' llama-cpp-cuda "llama-server";

  # Schema for a single entry of `roles.local-inference.models`.
  modelOptions = types.submodule {
    options = {
      hf-model = mkOption {
        type = types.str;
        description = "HuggingFace model shorthand (e.g. unsloth/Qwen3.6-35B-A3B-GGUF:UD-Q8_K_XL)";
      };
      aliases = mkOption {
        type = types.listOf types.str;
        default = [ ];
        description = "Aliases for the model in the API";
      };
      n-gpu-layers = mkOption {
        type = types.int;
        default = 99;
        description = "Number of layers to offload to GPU";
      };
      cpu-moe = mkOption {
        type = types.bool;
        default = false;
        description = "Offload MoE expert layers to CPU";
      };
      extraArgs = mkOption {
        type = types.listOf types.str;
        default = [ ];
        description = "Extra arguments passed to llama-server";
      };
      ttl = mkOption {
        type = types.int;
        default = -1;
        description = "Seconds before unloading model (-1 = use global default, 0 = never unload)";
      };
    };
  };
in
{
  # Replace the stable nixpkgs llama-swap module with the unstable version,
  # which may have newer features. For systems already built on unstable
  # (e.g., nix-deck), this module is excluded from roles/default.nix instead.
  imports = [ "${nixpkgs-unstable}/nixos/modules/services/networking/llama-swap.nix" ];
  disabledModules = [ "services/networking/llama-swap.nix" ];

  options.roles.local-inference = {
    enable = mkEnableOption "Enable local LLM inference via llama-swap + llama.cpp";

    models = mkOption {
      type = types.attrsOf modelOptions;
      default = { };
      description = "Models to serve from HuggingFace";
    };

    host = mkOption {
      type = types.str;
      default = "127.0.0.1";
      description = "IP address llama-swap listens on";
    };

    port = mkOption {
      type = types.port;
      default = 8080;
      description = "Port llama-swap listens on";
    };

    openFirewall = mkOption {
      type = types.bool;
      default = false;
      description = "Open the server port in the firewall";
    };

    healthCheckTimeout = mkOption {
      type = types.int;
      default = 600;
      description = "Seconds to wait for llama-server health check (model download can take a while)";
    };

    globalTTL = mkOption {
      type = types.int;
      default = 0;
      description = "Default TTL in seconds before unloading an idle model (0 = never unload)";
    };
  };

  config = mkIf cfg.enable {
    # Point the model cache and HOME at systemd-managed directories so the
    # service user has somewhere writable for downloads and HF metadata.
    systemd.services.llama-swap.environment = {
      LLAMA_CACHE = "/var/cache/llama-swap";
      HOME = "/var/lib/llama-swap";
    };
    systemd.services.llama-swap.serviceConfig = {
      CacheDirectory = "llama-swap";
      StateDirectory = "llama-swap";
    };

    services.llama-swap = {
      enable = true;
      listenAddress = cfg.host;
      port = cfg.port;
      openFirewall = cfg.openFirewall;
      settings = {
        healthCheckTimeout = cfg.healthCheckTimeout;
        globalTTL = cfg.globalTTL;
        # One llama-swap model entry per attribute. The escaped \${PORT} is
        # left in the generated config for llama-swap to substitute at launch.
        models = mapAttrs (
          _name: model:
          {
            cmd = "${llama-server} --port \${PORT} -hf ${model.hf-model} -ngl ${toString model.n-gpu-layers} --no-webui ${optionalString model.cpu-moe "--cpu-moe"} ${concatStringsSep " " model.extraArgs}";
            aliases = model.aliases;
          }
          # -1 means "inherit globalTTL": omit the per-model ttl key entirely.
          // optionalAttrs (model.ttl != -1) { ttl = model.ttl; }
        ) cfg.models;
      };
    };
  };
}