feat(local-inference): replace ollama with llama-swap + llama.cpp on zix790prors
- Add local-inference NixOS role using llama-swap (from nixpkgs-unstable) with llama.cpp (CUDA-enabled, from nixpkgs-unstable)
- Serve Qwen3.6-35B-A3B via HuggingFace auto-download with --cpu-moe
- Add nixosSpecialArgs for nixpkgs-unstable module access
- Configure opencode with llama-local provider pointing to zix790prors:8080
- Update gptel from Ollama backend to OpenAI-compatible llama-swap backend
- Remove ollama service from zix790prors
This commit is contained in:
108
roles/local-inference/default.nix
Normal file
108
roles/local-inference/default.nix
Normal file
@@ -0,0 +1,108 @@
|
||||
{
|
||||
config,
|
||||
lib,
|
||||
pkgs,
|
||||
nixpkgs-unstable,
|
||||
...
|
||||
}:
|
||||
|
||||
with lib;
|
||||
|
||||
let
|
||||
cfg = config.roles.local-inference;
|
||||
llama-cpp-cuda = pkgs.unstable.llama-cpp.override { cudaSupport = true; };
|
||||
llama-server = getExe' llama-cpp-cuda "llama-server";
|
||||
in
|
||||
{
|
||||
imports = [ "${nixpkgs-unstable}/nixos/modules/services/networking/llama-swap.nix" ];
|
||||
disabledModules = [ "services/networking/llama-swap.nix" ];
|
||||
|
||||
options.roles.local-inference = {
|
||||
enable = mkEnableOption "Enable local LLM inference via llama-swap + llama.cpp";
|
||||
|
||||
models = mkOption {
|
||||
type = types.attrsOf (
|
||||
types.submodule {
|
||||
options = {
|
||||
hf-model = mkOption {
|
||||
type = types.str;
|
||||
description = "HuggingFace model shorthand (e.g. unsloth/Qwen3.6-35B-A3B-GGUF:UD-Q8_K_XL)";
|
||||
};
|
||||
aliases = mkOption {
|
||||
type = types.listOf types.str;
|
||||
default = [ ];
|
||||
description = "Aliases for the model in the API";
|
||||
};
|
||||
n-gpu-layers = mkOption {
|
||||
type = types.int;
|
||||
default = 99;
|
||||
description = "Number of layers to offload to GPU";
|
||||
};
|
||||
cpu-moe = mkOption {
|
||||
type = types.bool;
|
||||
default = false;
|
||||
description = "Offload MoE expert layers to CPU";
|
||||
};
|
||||
extraArgs = mkOption {
|
||||
type = types.listOf types.str;
|
||||
default = [ ];
|
||||
description = "Extra arguments passed to llama-server";
|
||||
};
|
||||
};
|
||||
}
|
||||
);
|
||||
default = { };
|
||||
description = "Models to serve from HuggingFace";
|
||||
};
|
||||
|
||||
host = mkOption {
|
||||
type = types.str;
|
||||
default = "127.0.0.1";
|
||||
description = "IP address llama-swap listens on";
|
||||
};
|
||||
|
||||
port = mkOption {
|
||||
type = types.port;
|
||||
default = 8080;
|
||||
description = "Port llama-swap listens on";
|
||||
};
|
||||
|
||||
openFirewall = mkOption {
|
||||
type = types.bool;
|
||||
default = false;
|
||||
description = "Open the server port in the firewall";
|
||||
};
|
||||
|
||||
healthCheckTimeout = mkOption {
|
||||
type = types.int;
|
||||
default = 600;
|
||||
description = "Seconds to wait for llama-server health check (model download can take a while)";
|
||||
};
|
||||
};
|
||||
|
||||
config = mkIf cfg.enable {
|
||||
systemd.services.llama-swap.environment = {
|
||||
LLAMA_CACHE = "/var/cache/llama-swap";
|
||||
HOME = "/var/lib/llama-swap";
|
||||
};
|
||||
|
||||
systemd.services.llama-swap.serviceConfig = {
|
||||
CacheDirectory = "llama-swap";
|
||||
StateDirectory = "llama-swap";
|
||||
};
|
||||
|
||||
services.llama-swap = {
|
||||
enable = true;
|
||||
listenAddress = cfg.host;
|
||||
port = cfg.port;
|
||||
openFirewall = cfg.openFirewall;
|
||||
settings = {
|
||||
healthCheckTimeout = cfg.healthCheckTimeout;
|
||||
models = mapAttrs (_: m: {
|
||||
cmd = "${llama-server} --port \${PORT} -hf ${m.hf-model} -ngl ${toString m.n-gpu-layers} --no-webui ${optionalString m.cpu-moe "--cpu-moe"} ${concatStringsSep " " m.extraArgs}";
|
||||
aliases = m.aliases;
|
||||
}) cfg.models;
|
||||
};
|
||||
};
|
||||
};
|
||||
}
|
||||
Reference in New Issue
Block a user