# Adds globalTTL and per-model ttl options to the llama-swap config, allowing
# idle models to be automatically unloaded from memory.
{
  config,
  lib,
  pkgs,
  nixpkgs-unstable,
  ...
}:

let
  inherit (lib)
    concatStringsSep
    getExe'
    mapAttrs
    mkEnableOption
    mkIf
    mkOption
    optionalAttrs
    optionalString
    types
    ;

  cfg = config.roles.local-inference;

  # llama.cpp taken from the unstable package set, rebuilt with CUDA support
  # so layers can actually be offloaded to the GPU (see n-gpu-layers below).
  llama-cpp-cuda = pkgs.unstable.llama-cpp.override { cudaSupport = true; };
  llama-server = getExe' llama-cpp-cuda "llama-server";
in
{
  # Pull the llama-swap module from nixpkgs-unstable and disable the copy
  # shipped with the system's (stable) nixpkgs so the two do not conflict.
  imports = [ "${nixpkgs-unstable}/nixos/modules/services/networking/llama-swap.nix" ];
  disabledModules = [ "services/networking/llama-swap.nix" ];

  options.roles.local-inference = {
    # mkEnableOption prepends "Whether to enable …", so the argument must be
    # a noun phrase (the previous "Enable local LLM …" rendered as
    # "Whether to enable Enable local LLM …").
    enable = mkEnableOption "local LLM inference via llama-swap + llama.cpp";

    models = mkOption {
      type = types.attrsOf (
        types.submodule {
          options = {
            hf-model = mkOption {
              type = types.str;
              description = "HuggingFace model shorthand (e.g. unsloth/Qwen3.6-35B-A3B-GGUF:UD-Q8_K_XL)";
            };

            aliases = mkOption {
              type = types.listOf types.str;
              default = [ ];
              description = "Aliases for the model in the API";
            };

            n-gpu-layers = mkOption {
              type = types.int;
              default = 99;
              description = "Number of layers to offload to GPU";
            };

            cpu-moe = mkOption {
              type = types.bool;
              default = false;
              description = "Offload MoE expert layers to CPU";
            };

            extraArgs = mkOption {
              type = types.listOf types.str;
              default = [ ];
              description = "Extra arguments passed to llama-server";
            };

            # -1 is a sentinel meaning "omit the per-model ttl key entirely",
            # which makes llama-swap fall back to globalTTL (see below).
            ttl = mkOption {
              type = types.int;
              default = -1;
              description = "Seconds before unloading model (-1 = use global default, 0 = never unload)";
            };
          };
        }
      );
      default = { };
      description = "Models to serve from HuggingFace";
    };

    host = mkOption {
      type = types.str;
      default = "127.0.0.1";
      description = "IP address llama-swap listens on";
    };

    port = mkOption {
      type = types.port;
      default = 8080;
      description = "Port llama-swap listens on";
    };

    openFirewall = mkOption {
      type = types.bool;
      default = false;
      description = "Open the server port in the firewall";
    };

    healthCheckTimeout = mkOption {
      type = types.int;
      default = 600;
      description = "Seconds to wait for llama-server health check (model download can take a while)";
    };

    globalTTL = mkOption {
      type = types.int;
      default = 0;
      description = "Default TTL in seconds before unloading an idle model (0 = never unload)";
    };
  };

  config = mkIf cfg.enable {
    # llama-server downloads -hf models into LLAMA_CACHE; HOME is pointed at
    # the state dir so any tooling that writes under $HOME stays contained.
    systemd.services.llama-swap.environment = {
      LLAMA_CACHE = "/var/cache/llama-swap";
      HOME = "/var/lib/llama-swap";
    };

    # systemd creates /var/cache/llama-swap and /var/lib/llama-swap with
    # ownership matching the service user.
    systemd.services.llama-swap.serviceConfig = {
      CacheDirectory = "llama-swap";
      StateDirectory = "llama-swap";
    };

    services.llama-swap = {
      enable = true;
      listenAddress = cfg.host;
      port = cfg.port;
      openFirewall = cfg.openFirewall;

      settings = {
        healthCheckTimeout = cfg.healthCheckTimeout;
        # NOTE(review): verify that the deployed llama-swap version accepts a
        # top-level `globalTTL` key in its config; per-model `ttl` is the
        # long-documented mechanism — TODO confirm against upstream docs.
        globalTTL = cfg.globalTTL;

        models = mapAttrs (
          name: m:
          {
            # \${PORT} is escaped so llama-swap (not Nix) substitutes the port
            # it assigns to this model's llama-server instance.
            cmd = "${llama-server} --port \${PORT} -hf ${m.hf-model} -ngl ${toString m.n-gpu-layers} --no-webui ${optionalString m.cpu-moe "--cpu-moe"} ${concatStringsSep " " m.extraArgs}";
            aliases = m.aliases;
          }
          # Only emit a per-model ttl when the user set one; otherwise the
          # key is absent and globalTTL applies.
          // optionalAttrs (m.ttl != -1) { ttl = m.ttl; }
        ) cfg.models;
      };
    };
  };
}