feat(local-inference): replace ollama with llama-swap + llama.cpp on zix790prors

- Add local-inference NixOS role using llama-swap with CUDA-enabled llama.cpp (both from nixpkgs-unstable)
- Serve Qwen3.6-35B-A3B via HuggingFace auto-download with --cpu-moe
- Add nixosSpecialArgs for nixpkgs-unstable module access
- Configure opencode with a llama-local provider pointing to zix790prors:8080
- Update gptel from the Ollama backend to the OpenAI-compatible llama-swap backend
- Remove the ollama service from zix790prors
2026-04-16 15:20:37 -07:00
parent d16c8aa67e
commit 10efafd92e
7 changed files with 165 additions and 11 deletions
--- a/home/roles/base/default.nix
+++ b/home/roles/base/default.nix
@@ -99,6 +99,10 @@ in
      };
    };

+    xdg.configFile."opencode/opencode.json" = {
+      source = ./opencode-config.json;
+    };
+
    # Note: modules must be imported at top-level home config
  };
 }
--- a/home/roles/base/opencode-config.json
+++ b/home/roles/base/opencode-config.json
@@ -0,0 +1,23 @@
+{
+  "$schema": "https://opencode.ai/config.json",
+  "provider": {
+    "llama-local": {
+      "name": "Llama.cpp (zix790prors RTX 4070 Ti)",
+      "npm": "@ai-sdk/openai-compatible",
+      "options": {
+        "baseURL": "http://zix790prors.oglehome:8080/v1"
+      },
+      "models": {
+        "Qwen3.6-35B-A3B": {
+          "name": "Qwen3.6-35B-A3B (UD-Q8_K_XL)",
+          "reasoning": true,
+          "tool_call": true,
+          "limit": {
+            "context": 32768,
+            "output": 8192
+          }
+        }
+      }
+    }
+  }
+}
--- a/home/roles/emacs/doom/config.el
+++ b/home/roles/emacs/doom/config.el
@@ -233,14 +233,15 @@ rbw is unavailable or the entry is not found."
         gptel-use-tools t
         gptel-confirm-tool-calls 'always
         gptel-include-reasoning 'ignore
-         gptel-model "qwen3:30b")
+         gptel-model "Qwen3.6-35B-A3B")

-  ;; Set default backend to be Ollama-Local
+  ;; Set default backend to llama-swap (OpenAI-compatible, served over plain HTTP)
  (setq! gptel-backend
-         (gptel-make-ollama "Ollama-Local"
-           :host "localhost:11434"
+         (gptel-make-openai "llama-swap"
+           :protocol "http" :host "localhost:8080" ; gptel-make-openai defaults to https
+           :endpoint "/v1/chat/completions"
           :stream t
-           :models '(deepseek-r1 deepseek-r1-fullctx qwen3:30b qwen3:4b llama3.1 qwen2.5-coder mistral-nemo gpt-oss)))
+           :models '("Qwen3.6-35B-A3B")))

  ;; Define custom tools
  (gptel-make-tool