Files
nixos-configs/roles/local-inference/default.nix
John Ogle 10efafd92e feat(local-inference): replace ollama with llama-swap + llama.cpp on zix790prors
- Add local-inference NixOS role using llama-swap (from nixpkgs-unstable)
  with llama.cpp (CUDA-enabled, from nixpkgs-unstable)
- Serves Qwen3.6-35B-A3B via HuggingFace auto-download with --cpu-moe
- Add nixosSpecialArgs for nixpkgs-unstable module access
- Configure opencode with llama-local provider pointing to zix790prors:8080
- Update gptel from Ollama backend to OpenAI-compatible llama-swap backend
- Remove ollama service from zix790prors
2026-04-16 15:20:37 -07:00

109 lines
3.0 KiB
Nix

{
config,
lib,
pkgs,
nixpkgs-unstable,
...
}:
with lib;
let
# Option values declared under roles.local-inference.
cfg = config.roles.local-inference;
# llama.cpp from the `unstable` package set on pkgs, overridden to build with CUDA.
llama-cpp-cuda = pkgs.unstable.llama-cpp.override {
  cudaSupport = true;
};
# Path to the llama-server executable inside the CUDA-enabled package.
llama-server = lib.getExe' llama-cpp-cuda "llama-server";
in
{
# Load the llama-swap service module from the nixpkgs-unstable source tree
# that is passed in as a module argument.
imports = [ "${nixpkgs-unstable}/nixos/modules/services/networking/llama-swap.nix" ];
# Disable the module at the same relative path in the default module set;
# presumably so the unstable copy above replaces it instead of clashing with it.
disabledModules = [ "services/networking/llama-swap.nix" ];
options.roles.local-inference = {
  # mkEnableOption prefixes its argument with "Whether to enable ", so the
  # argument is a noun phrase (it must not itself start with "Enable").
  enable = mkEnableOption "local LLM inference via llama-swap + llama.cpp";

  # One entry per served model. Each attribute name becomes a key of
  # services.llama-swap.settings.models; the value is rendered into the
  # llama-server command line for that model.
  models = mkOption {
    type = types.attrsOf (
      types.submodule {
        options = {
          # Passed to llama-server as the value of `-hf`.
          hf-model = mkOption {
            type = types.str;
            description = "HuggingFace model shorthand (e.g. unsloth/Qwen3.6-35B-A3B-GGUF:UD-Q8_K_XL)";
          };
          # Copied unchanged into the model's `aliases` setting.
          aliases = mkOption {
            type = types.listOf types.str;
            default = [ ];
            description = "Aliases for the model in the API";
          };
          # Passed to llama-server as the value of `-ngl`.
          n-gpu-layers = mkOption {
            type = types.int;
            default = 99;
            description = "Number of layers to offload to GPU";
          };
          # When true, `--cpu-moe` is added to the llama-server command line.
          cpu-moe = mkOption {
            type = types.bool;
            default = false;
            description = "Offload MoE expert layers to CPU";
          };
          # Elements are joined with single spaces and appended to the command
          # verbatim; no quoting is applied.
          # NOTE(review): an element containing whitespace or quote characters
          # is therefore not guaranteed to reach llama-server as one argument.
          extraArgs = mkOption {
            type = types.listOf types.str;
            default = [ ];
            description = "Extra arguments passed to llama-server";
          };
        };
      }
    );
    default = { };
    description = "Models to serve from HuggingFace";
  };

  # Forwarded to services.llama-swap.listenAddress.
  host = mkOption {
    type = types.str;
    default = "127.0.0.1";
    description = "IP address llama-swap listens on";
  };

  # Forwarded to services.llama-swap.port.
  port = mkOption {
    type = types.port;
    default = 8080;
    description = "Port llama-swap listens on";
  };

  # Forwarded to services.llama-swap.openFirewall.
  openFirewall = mkOption {
    type = types.bool;
    default = false;
    description = "Open the server port in the firewall";
  };

  # Forwarded to services.llama-swap.settings.healthCheckTimeout.
  healthCheckTimeout = mkOption {
    type = types.int;
    default = 600;
    description = "Seconds to wait for llama-server health check (model download can take a while)";
  };
};
config = mkIf cfg.enable (
  let
    # Turn one roles.local-inference.models value into a llama-swap model
    # entry. The attribute name is not needed: mapAttrs keeps it as the key.
    # `\${PORT}` is escaped so the literal text ${PORT} ends up in the command
    # instead of being interpolated by Nix.
    renderModel = _name: model: {
      inherit (model) aliases;
      cmd = "${llama-server} --port \${PORT} -hf ${model.hf-model} -ngl ${toString model.n-gpu-layers} --no-webui ${optionalString model.cpu-moe "--cpu-moe"} ${concatStringsSep " " model.extraArgs}";
    };
  in
  {
    # Extra settings layered onto the llama-swap unit: a state directory and a
    # cache directory, with HOME and LLAMA_CACHE pointing at the matching
    # /var/lib and /var/cache paths.
    systemd.services.llama-swap = {
      serviceConfig = {
        StateDirectory = "llama-swap";
        CacheDirectory = "llama-swap";
      };
      environment = {
        HOME = "/var/lib/llama-swap";
        LLAMA_CACHE = "/var/cache/llama-swap";
      };
    };

    # NOTE(review): no `package` is set here, so whatever the llama-swap module
    # defaults to is used; confirm that matches the intended nixpkgs channel.
    services.llama-swap = {
      enable = true;
      listenAddress = cfg.host;
      inherit (cfg) port openFirewall;
      settings = {
        inherit (cfg) healthCheckTimeout;
        models = mapAttrs renderModel cfg.models;
      };
    };
  }
);
}