Compare commits

...

5 Commits

Author SHA1 Message Date
5a82554884 feat(opencode): add oh-my-openagent plugin with omo config for ollama-cloud/glm-5.1
Some checks failed
CI / check (push) Failing after 2m8s
CI / build-and-cache (push) Has been skipped
Configure oh-my-openagent (omo) plugin for multi-agent orchestration
using ollama-cloud and local llama-swap providers. Primary model is
ollama-cloud/glm-5.1 with fallback chains. Add runtime fallback,
background task concurrency limits, and disable incompatible agents
(hephaestus, multimodal-looker).
2026-04-17 13:43:08 -07:00
170a27310e feat(local-inference): add TTL support for automatic model unloading
Some checks failed
CI / check (push) Failing after 1m44s
CI / build-and-cache (push) Has been skipped
Add globalTTL and per-model ttl options to llama-swap config,
allowing idle models to be automatically unloaded from memory.
2026-04-16 15:37:02 -07:00
bd377676ed fix(opencode): increase context/output limits for local model
Some checks failed
CI / check (push) Failing after 1m59s
CI / build-and-cache (push) Has been skipped
2026-04-16 15:20:51 -07:00
10efafd92e feat(local-inference): replace ollama with llama-swap + llama.cpp on zix790prors
- Add local-inference NixOS role using llama-swap (from nixpkgs-unstable)
  with llama.cpp (CUDA-enabled, from nixpkgs-unstable)
- Serves Qwen3.6-35B-A3B via HuggingFace auto-download with --cpu-moe
- Add nixosSpecialArgs for nixpkgs-unstable module access
- Configure opencode with llama-local provider pointing to zix790prors:8080
- Update gptel from Ollama backend to OpenAI-compatible llama-swap backend
- Remove ollama service from zix790prors
2026-04-16 15:20:37 -07:00
d16c8aa67e Merge pull request 'feat(app-launcher): workout card launcher + URL args' (#54) from ash/workout-card-launcher into main
All checks were successful
CI / check (push) Successful in 1m40s
CI / build-and-cache (push) Successful in 3h10m42s
Reviewed-on: #54
2026-04-13 17:13:41 -07:00
8 changed files with 324 additions and 11 deletions

View File

@@ -104,6 +104,11 @@
}; };
}; };
# Common specialArgs passed to all NixOS systems
nixosSpecialArgs = {
inherit nixpkgs-unstable;
};
# Shared unstable overlays for custom package builds # Shared unstable overlays for custom package builds
customUnstableOverlays = [ customUnstableOverlays = [
# Override claude-code in unstable to use our custom GCS-based build # Override claude-code in unstable to use our custom GCS-based build
@@ -149,6 +154,7 @@
in in
{ {
nixosConfigurations.nix-book = nixpkgs.lib.nixosSystem rec { nixosConfigurations.nix-book = nixpkgs.lib.nixosSystem rec {
specialArgs = nixosSpecialArgs;
system = "x86_64-linux"; system = "x86_64-linux";
modules = nixosModules ++ [ modules = nixosModules ++ [
./machines/nix-book/configuration.nix ./machines/nix-book/configuration.nix
@@ -166,6 +172,7 @@
}; };
nixosConfigurations.boxy = nixpkgs.lib.nixosSystem rec { nixosConfigurations.boxy = nixpkgs.lib.nixosSystem rec {
specialArgs = nixosSpecialArgs;
system = "x86_64-linux"; system = "x86_64-linux";
modules = nixosModules ++ [ modules = nixosModules ++ [
./machines/boxy/configuration.nix ./machines/boxy/configuration.nix
@@ -179,6 +186,7 @@
}; };
nixosConfigurations.gym-box = nixpkgs.lib.nixosSystem rec { nixosConfigurations.gym-box = nixpkgs.lib.nixosSystem rec {
specialArgs = nixosSpecialArgs;
system = "x86_64-linux"; system = "x86_64-linux";
modules = nixosModules ++ [ modules = nixosModules ++ [
./machines/gym-box/configuration.nix ./machines/gym-box/configuration.nix
@@ -191,6 +199,7 @@
}; };
nixosConfigurations.zix790prors = nixpkgs.lib.nixosSystem rec { nixosConfigurations.zix790prors = nixpkgs.lib.nixosSystem rec {
specialArgs = nixosSpecialArgs;
system = "x86_64-linux"; system = "x86_64-linux";
modules = nixosModules ++ [ modules = nixosModules ++ [
./machines/zix790prors/configuration.nix ./machines/zix790prors/configuration.nix
@@ -212,6 +221,7 @@
# Live USB ISO configuration # Live USB ISO configuration
nixosConfigurations.live-usb = nixpkgs.lib.nixosSystem rec { nixosConfigurations.live-usb = nixpkgs.lib.nixosSystem rec {
specialArgs = nixosSpecialArgs;
system = "x86_64-linux"; system = "x86_64-linux";
modules = nixosModules ++ [ modules = nixosModules ++ [
./machines/live-usb/configuration.nix ./machines/live-usb/configuration.nix
@@ -236,6 +246,7 @@
# ZFS/NFS server configuration # ZFS/NFS server configuration
nixosConfigurations.john-endesktop = nixpkgs.lib.nixosSystem rec { nixosConfigurations.john-endesktop = nixpkgs.lib.nixosSystem rec {
specialArgs = nixosSpecialArgs;
system = "x86_64-linux"; system = "x86_64-linux";
modules = nixosModules ++ [ modules = nixosModules ++ [
./machines/john-endesktop/configuration.nix ./machines/john-endesktop/configuration.nix
@@ -283,6 +294,7 @@
"custom-tea-rbw" = pkgs.custom.tea-rbw; "custom-tea-rbw" = pkgs.custom.tea-rbw;
"custom-rclone-torbox-setup" = pkgs.custom.rclone-torbox-setup; "custom-rclone-torbox-setup" = pkgs.custom.rclone-torbox-setup;
"custom-opencode" = pkgs.custom.opencode; "custom-opencode" = pkgs.custom.opencode;
"qt-pinned-jellyfin-media-player" = pkgsQt.jellyfin-media-player; "qt-pinned-jellyfin-media-player" = pkgsQt.jellyfin-media-player;
"qt-pinned-stremio" = pkgsQt.stremio; "qt-pinned-stremio" = pkgsQt.stremio;
} }

View File

@@ -99,6 +99,14 @@ in
}; };
}; };
xdg.configFile."opencode/opencode.json" = {
source = ./opencode-config.json;
};
xdg.configFile."opencode/oh-my-openagent.jsonc" = {
source = ./opencode-omo-config.jsonc;
};
# Note: modules must be imported at top-level home config # Note: modules must be imported at top-level home config
}; };
} }

View File

@@ -0,0 +1,24 @@
{
"$schema": "https://opencode.ai/config.json",
"plugin": ["oh-my-openagent"],
"provider": {
"llama-local": {
"name": "Llama.cpp (zix790prors RTX 4070 Ti)",
"npm": "@ai-sdk/openai-compatible",
"options": {
"baseURL": "http://zix790prors.oglehome:8080/v1"
},
"models": {
"Qwen3.6-35B-A3B": {
"name": "Qwen3.6-35B-A3B (UD-Q8_K_XL)",
"reasoning": true,
"tool_call": true,
"limit": {
"context": 131072,
"output": 32768
}
}
}
}
}
}

View File

@@ -0,0 +1,136 @@
// oh-my-openagent (omo) model routing for opencode.
// Primary model throughout is ollama-cloud/glm-5.1, with fallback chains
// through other ollama-cloud models and the local llama-swap provider
// (llama-local/Qwen3.6-35B-A3B).
{
  "$schema": "https://raw.githubusercontent.com/code-yeongyu/oh-my-openagent/dev/assets/oh-my-opencode.schema.json",

  // Per-agent model assignment: each agent uses "model" first and walks
  // "fallback_models" in order when runtime_fallback (below) triggers.
  "agents": {
    "sisyphus": {
      "model": "ollama-cloud/glm-5.1",
      "fallback_models": [
        "ollama-cloud/kimi-k2.5",
        "llama-local/Qwen3.6-35B-A3B",
        "ollama-cloud/qwen3-coder-next"
      ]
    },
    "prometheus": {
      "model": "ollama-cloud/glm-5.1",
      "fallback_models": [
        "ollama-cloud/kimi-k2.5",
        "ollama-cloud/qwen3-coder-next"
      ]
    },
    "atlas": {
      "model": "ollama-cloud/glm-5.1",
      "fallback_models": [
        "ollama-cloud/gemma4:31b",
        "ollama-cloud/kimi-k2.5"
      ]
    },
    // Lighter-weight agents get the smaller gemma4 model first.
    "explore": {
      "model": "ollama-cloud/gemma4:31b",
      "fallback_models": [
        "ollama-cloud/ministral-3:14b",
        "llama-local/Qwen3.6-35B-A3B"
      ]
    },
    "librarian": {
      "model": "ollama-cloud/gemma4:31b",
      "fallback_models": [
        "ollama-cloud/ministral-3:14b"
      ]
    },
    "oracle": {
      "model": "ollama-cloud/qwen3-coder-next",
      "fallback_models": [
        "ollama-cloud/deepseek-v3.2",
        "ollama-cloud/glm-5.1"
      ]
    },
    // Disabled agents — incompatible with this provider setup
    // (see the commit introducing this file).
    "multimodal-looker": {
      "disable": true
    },
    "hephaestus": {
      "disable": true
    },
    "momus": {
      "model": "ollama-cloud/glm-5.1",
      "fallback_models": [
        "ollama-cloud/qwen3-coder-next"
      ]
    },
    "metis": {
      "model": "ollama-cloud/glm-5.1",
      "fallback_models": [
        "ollama-cloud/kimi-k2.5"
      ]
    }
  },

  // Task-category routing, same model/fallback_models shape as agents.
  "categories": {
    "quick": {
      "model": "ollama-cloud/gemma4:31b",
      "fallback_models": [
        "ollama-cloud/ministral-3:14b"
      ]
    },
    "unspecified-low": {
      "model": "ollama-cloud/glm-5.1",
      "fallback_models": [
        "ollama-cloud/kimi-k2.5",
        "llama-local/Qwen3.6-35B-A3B"
      ]
    },
    "unspecified-high": {
      "model": "ollama-cloud/glm-5.1",
      "fallback_models": [
        "ollama-cloud/kimi-k2.5",
        "ollama-cloud/qwen3-coder-next"
      ]
    },
    "deep": {
      "model": "ollama-cloud/qwen3-coder-next",
      "fallback_models": [
        "ollama-cloud/deepseek-v3.2",
        "ollama-cloud/glm-5.1"
      ]
    },
    "ultrabrain": {
      "model": "ollama-cloud/qwen3-coder-next",
      "fallback_models": [
        "ollama-cloud/deepseek-v3.2",
        "ollama-cloud/glm-5.1"
      ]
    },
    "writing": {
      "model": "ollama-cloud/glm-5.1",
      "fallback_models": [
        "ollama-cloud/kimi-k2.5"
      ]
    },
    "visual-engineering": {
      "model": "ollama-cloud/glm-5.1",
      "fallback_models": [
        "ollama-cloud/qwen3-coder-next"
      ]
    }
  },

  // Automatic failover: on the listed HTTP errors, retry with the next
  // fallback model (up to 3 attempts), cool the failed model down for 60s,
  // and surface a notification when a fallback occurs.
  "runtime_fallback": {
    "enabled": true,
    "retry_on_errors": [400, 429, 503, 529],
    "max_fallback_attempts": 3,
    "cooldown_seconds": 60,
    "notify_on_fallback": true
  },

  // Background-task concurrency caps per provider; the local llama-swap
  // server is held to 2 concurrent tasks.
  "background_task": {
    "defaultConcurrency": 5,
    "providerConcurrency": {
      "ollama-cloud": 10,
      "llama-local": 2
    }
  },

  "disabled_hooks": ["no-sisyphus-gpt"],

  // Prompt fed to the comment-checker hook (runtime string — do not edit
  // casually; "{{comments}}" is a placeholder it substitutes).
  "comment_checker": {
    "custom_prompt": "Check for AI-generated filler phrases, redundant obvious statements, and excessively verbose explanations. Comments should add value beyond what the code itself expresses. Flag: 'TODO' without ticket references, 'Note that...' when obvious, repeating the function name in the comment, and any form of 'simply' or 'simply just'. Use {{comments}} placeholder."
  },

  "tmux": { "enabled": false },

  "experimental": {
    "aggressive_truncation": true,
    "task_system": true
  }
}

View File

@@ -233,14 +233,15 @@ rbw is unavailable or the entry is not found."
gptel-use-tools t gptel-use-tools t
gptel-confirm-tool-calls 'always gptel-confirm-tool-calls 'always
gptel-include-reasoning 'ignore gptel-include-reasoning 'ignore
gptel-model "qwen3:30b") gptel-model "Qwen3.6-35B-A3B")
;; Set default backend to be Ollama-Local ;; Set default backend to llama-swap (OpenAI-compatible)
(setq! gptel-backend (setq! gptel-backend
(gptel-make-ollama "Ollama-Local" (gptel-make-openai "llama-swap"
:host "localhost:11434" :host "localhost:8080"
:endpoint "/v1/chat/completions"
:stream t :stream t
:models '(deepseek-r1 deepseek-r1-fullctx qwen3:30b qwen3:4b llama3.1 qwen2.5-coder mistral-nemo gpt-oss))) :models '("Qwen3.6-35B-A3B")))
;; Define custom tools ;; Define custom tools
(gptel-make-tool (gptel-make-tool

View File

@@ -26,6 +26,19 @@ with lib;
x11 = true; x11 = true;
}; };
kodi.enable = true; kodi.enable = true;
local-inference = {
enable = true;
host = "zix790prors.oglehome";
openFirewall = true;
globalTTL = 900;
models = {
"Qwen3.6-35B-A3B" = {
hf-model = "unsloth/Qwen3.6-35B-A3B-GGUF:UD-Q8_K_XL";
aliases = [ "Qwen3.6-35B-A3B" ];
cpu-moe = true;
};
};
};
nfs-mounts.enable = true; nfs-mounts.enable = true;
nvidia = { nvidia = {
enable = true; enable = true;
@@ -56,12 +69,6 @@ with lib;
${pkgs.xorg.xrandr}/bin/xrandr --output DP-0 --mode 3440x1440 --rate 164.90 --primary ${pkgs.xorg.xrandr}/bin/xrandr --output DP-0 --mode 3440x1440 --rate 164.90 --primary
''; '';
services.ollama = {
enable = true;
acceleration = "cuda";
loadModels = [ "gpt-oss" "deepseek-r1" "qwen3:30b" ];
};
# This option defines the first version of NixOS you have installed on this particular machine, # This option defines the first version of NixOS you have installed on this particular machine,
# and is used to maintain compatibility with application data (e.g. databases) created on older NixOS versions. # and is used to maintain compatibility with application data (e.g. databases) created on older NixOS versions.
# #

View File

@@ -11,6 +11,7 @@ with lib;
./desktop ./desktop
./k3s-node ./k3s-node
./kodi ./kodi
./local-inference
./nfs-mounts ./nfs-mounts
./plasma-bigscreen ./plasma-bigscreen
./nvidia ./nvidia

View File

@@ -0,0 +1,124 @@
# NixOS role: local LLM inference.
# Runs llama-swap (an OpenAI-compatible model router) which launches
# llama.cpp's llama-server per configured model. Both the llama-swap module
# and the llama-cpp package come from nixpkgs-unstable, which is passed in
# via specialArgs; the stable module of the same path is disabled below so
# the two option declarations cannot collide.
{
  config,
  lib,
  pkgs,
  nixpkgs-unstable,
  ...
}:
with lib;
let
  cfg = config.roles.local-inference;
  # CUDA-enabled build of llama.cpp from the unstable package set.
  llama-cpp-cuda = pkgs.unstable.llama-cpp.override { cudaSupport = true; };
  llama-server = getExe' llama-cpp-cuda "llama-server";
in
{
  # Import the llama-swap module from nixpkgs-unstable and disable the
  # stable-channel module at the same path.
  imports = [ "${nixpkgs-unstable}/nixos/modules/services/networking/llama-swap.nix" ];
  disabledModules = [ "services/networking/llama-swap.nix" ];

  options.roles.local-inference = {
    enable = mkEnableOption "Enable local LLM inference via llama-swap + llama.cpp";

    # Attribute set of models to serve; each attribute name becomes the
    # model's id in the llama-swap config.
    models = mkOption {
      type = types.attrsOf (
        types.submodule {
          options = {
            hf-model = mkOption {
              type = types.str;
              description = "HuggingFace model shorthand (e.g. unsloth/Qwen3.6-35B-A3B-GGUF:UD-Q8_K_XL)";
            };
            aliases = mkOption {
              type = types.listOf types.str;
              default = [ ];
              description = "Aliases for the model in the API";
            };
            n-gpu-layers = mkOption {
              type = types.int;
              default = 99;
              description = "Number of layers to offload to GPU";
            };
            cpu-moe = mkOption {
              type = types.bool;
              default = false;
              description = "Offload MoE expert layers to CPU";
            };
            extraArgs = mkOption {
              type = types.listOf types.str;
              default = [ ];
              description = "Extra arguments passed to llama-server";
            };
            ttl = mkOption {
              type = types.int;
              default = -1;
              description = "Seconds before unloading model (-1 = use global default, 0 = never unload)";
            };
          };
        }
      );
      default = { };
      description = "Models to serve from HuggingFace";
    };
    host = mkOption {
      type = types.str;
      default = "127.0.0.1";
      description = "IP address llama-swap listens on";
    };
    port = mkOption {
      type = types.port;
      default = 8080;
      description = "Port llama-swap listens on";
    };
    openFirewall = mkOption {
      type = types.bool;
      default = false;
      description = "Open the server port in the firewall";
    };
    healthCheckTimeout = mkOption {
      type = types.int;
      default = 600;
      description = "Seconds to wait for llama-server health check (model download can take a while)";
    };
    globalTTL = mkOption {
      type = types.int;
      default = 0;
      description = "Default TTL in seconds before unloading an idle model (0 = never unload)";
    };
  };

  config = mkIf cfg.enable {
    # Give the service writable cache/state dirs and point the HF download
    # cache (LLAMA_CACHE) and HOME at them.
    systemd.services.llama-swap.environment = {
      LLAMA_CACHE = "/var/cache/llama-swap";
      HOME = "/var/lib/llama-swap";
    };
    systemd.services.llama-swap.serviceConfig = {
      CacheDirectory = "llama-swap";
      StateDirectory = "llama-swap";
    };

    services.llama-swap = {
      enable = true;
      listenAddress = cfg.host;
      port = cfg.port;
      openFirewall = cfg.openFirewall;
      settings = {
        healthCheckTimeout = cfg.healthCheckTimeout;
        globalTTL = cfg.globalTTL;
        # One llama-server command per configured model. `\${PORT}` is
        # escaped so llama-swap — not Nix — substitutes the runtime port.
        models = mapAttrs (
          name: m:
          {
            cmd = "${llama-server} --port \${PORT} -hf ${m.hf-model} -ngl ${toString m.n-gpu-layers} --no-webui ${optionalString m.cpu-moe "--cpu-moe"} ${concatStringsSep " " m.extraArgs}";
            aliases = m.aliases;
          }
          # Emit a per-model ttl only when it overrides the -1 sentinel
          # (i.e. the global default).
          // optionalAttrs (m.ttl != -1) { ttl = m.ttl; }
        ) cfg.models;
      };
    };
  };
}