From f7af6607275b23e73f9098bc04dc62f2f32ab52c Mon Sep 17 00:00:00 2001 From: shahondin1624 Date: Wed, 27 May 2026 10:42:19 +0200 Subject: [PATCH] migrate ai-server extension from llama.cpp router to llama-swap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Endpoint rewrites: - GET /v1/models + /running → merged listModels() with running flag - POST /models/load → GET /upstream/<id>/health (warm load) - POST /models/unload → POST /api/models/unload/<id> (no body) - Added POST /api/models/unload for unloadAll() Config migration: - Preset path: ~/.llama-models.ini → ~/.config/llama-swap/config.yaml - Service unit: llama-server.service → llama-swap.service - setPresetKey() rewritten from INI awk to YAML-aware awk for editing --ctx-size/--temp/--n-gpu-layers in cmd: blocks Per-model ctx-size (fixes 0/33k bug): - parseCtxMapFromYaml(): walks config.yaml, extracts --ctx-size N per model block → Map<id, ctxSize> - extractCtxFromRunningCmd(): parses --ctx-size from /running cmd string - discoverModels(): Promise.all(listModels, listRunning, readPreset), ctx priority: running cmd → yaml → 32768 fallback - Removed broken extractCtxSize stub and dangling imports Tests: 14 passing (parseCtxMapFromYaml ×5, extractCtxFromRunningCmd ×3, isShardArtefact ×3, isReasoningModel ×3) README: full rewrite covering llama-swap architecture, YAML config format, new endpoints, troubleshooting table updated. 
--- ai-server/README.md | 180 ++++++++++++++++++------------------ ai-server/admin.ts | 184 ++++++++++++++++++++++++++++++------- ai-server/config.ts | 23 ++++- ai-server/index.ts | 14 +-- ai-server/router-utils.ts | 79 ++++++++++++++-- tests/router-utils.test.ts | 105 ++++++++++++++++----- 6 files changed, 414 insertions(+), 171 deletions(-) diff --git a/ai-server/README.md b/ai-server/README.md index 3d08d65..ff2a519 100644 --- a/ai-server/README.md +++ b/ai-server/README.md @@ -1,9 +1,9 @@ -# ai-server — PI extension for a self-hosted llama.cpp router behind mTLS +# ai-server — PI extension for a self-hosted llama-swap server behind mTLS -A multi-file pi extension that exposes a remote llama.cpp router as a provider -to pi, with dynamic model discovery and admin slash commands. Chat streams use -client-certificate TLS so the endpoint can be exposed over the public internet -without a bearer token. +A multi-file pi extension that exposes a remote llama-swap instance as a +provider to pi, with dynamic model discovery and admin slash commands. Chat +streams use client-certificate TLS so the endpoint can be exposed over the +public internet without a bearer token. --- @@ -11,21 +11,21 @@ without a bearer token. ``` ┌────────────┐ mTLS (HTTPS) ┌──────────────┐ HTTP ┌─────────────────┐ -│ pi client │───────────────►│ Caddy │────────►│ llama-server │ +│ pi client │───────────────►│ Caddy │────────►│ llama-swap │ │ (this ext) │ │ 192.168.2.2 │ │ 192.168.2.3:8080 │ -└────────────┘ client cert │ ai.… │ │ router mode │ - └──────────────┘ │ --models-max 1 │ +└────────────┘ client cert │ ai.… │ │ swap mode │ + └──────────────┘ │ globalTTL: 1800 │ + │ scheduler: one │ └─────────────────┘ │ - ~/.llama-models.ini - (per-model presets) + ~/.config/llama-swap/config.yaml + (YAML model config) ``` -- **Caddy** terminates TLS and enforces `require_and_verify` client-cert auth on - `ai.shahondin1624.de`. Plaintext HTTP is forwarded to the llama-server router. 
-- **llama-server** runs in `--models-mode router` with `--models-max 1`, so - exactly one worker is loaded at a time; selecting a different model unloads - the previous one. +- **Caddy** terminates TLS and enforces `require_and_verify` client-cert auth + on `ai.shahondin1624.de`. Plaintext HTTP is forwarded to llama-swap. +- **llama-swap** runs in swap mode, managing model lifecycle (load/unload/swap) + with a YAML config at `~/.config/llama-swap/config.yaml`. - **This extension** performs OpenAI-compatible chat streaming over mTLS and surfaces admin endpoints as pi slash commands. @@ -37,7 +37,7 @@ without a bearer token. ├── config.ts URLs, SSH host, cert paths, MODELS[] fallback ├── messages.ts Context → OpenAI chat/completions messages ├── stream.ts custom streamSimple: SSE parse, mTLS HTTPS, pi-ai events -├── admin.ts router HTTP client + SSH helpers (preset edit, systemctl) +├── admin.ts router HTTP client + SSH helpers (YAML edit, systemctl) └── README.md this file ``` @@ -54,61 +54,63 @@ All are optional — the defaults match the current host. | `AI_SERVER_CLIENT_KEY` | `/client-key.pem` | Client private key | | `AI_SERVER_TIMEOUT_MS` | `300000` | Per-request stream timeout | | `AI_SERVER_SSH_HOST` | `ai-server@192.168.2.3` | SSH target for admin commands | -| `AI_SERVER_PRESET_PATH` | `~/.llama-models.ini` | Preset path on the SSH target | +| `AI_SERVER_PRESET_PATH` | `~/.config/llama-swap/config.yaml` | YAML config on the SSH target | +| `AI_SERVER_SERVICE_UNIT` | `llama-swap.service` | systemd unit name | +| `AI_SERVER_MODELS_PATH` | `/v1/models` | Models list endpoint | +| `AI_SERVER_RUNNING_PATH` | `/running` | Currently running models endpoint | +| `AI_SERVER_UNLOAD_PATH` | `/api/models/unload/<id>` | Unload single model | +| `AI_SERVER_UNLOAD_ALL_PATH` | `/api/models/unload` | Unload all models | +| `AI_SERVER_UPSTREAM_HEALTH_PATH` | `/upstream/<id>/health` | Warm-load / health endpoint | ## 4. 
Server-side setup (192.168.2.3) -### 4.1 llama.cpp build +### 4.1 llama-swap install ```bash -git clone https://github.com/ggerganov/llama.cpp ~/llama.cpp -cd ~/llama.cpp && cmake -B build -DGGML_VULKAN=ON && cmake --build build --config Release -j$(nproc) +npm install -g llama-swap +# or use the binary release from the llama-swap GitHub repo ``` -Vulkan is used for GPU offload on the Strix Halo iGPU (no ROCm needed). The -binary ends up at `~/llama.cpp/build/bin/llama-server`. - ### 4.2 Model storage ``` ~/models/.gguf ``` -Multi-shard GGUFs (`*-00001-of-NNNNN.gguf`) work too — point the preset at the -first shard and llama.cpp auto-loads the rest. +### 4.3 Config file — `~/.config/llama-swap/config.yaml` -### 4.3 Preset file — `~/.llama-models.ini` +llama-swap uses a YAML config file. Each model is defined under `models:` with +a `cmd:` block containing the llama-server invocation. -Router mode consults this file. Each `[section]` is a model id usable in API -requests. The section name and `model =` path are the only required fields; -the rest become `--flag value` args to the per-model worker when it spawns. 
+```yaml +globalTTL: 1800 +models: + Qwen_Qwen3.6-35B-A3B-Q8_0: + cmd: | + /home/ai-server/llama.cpp/build/bin/llama-server + --model /home/ai-server/models/Qwen_Qwen3.6-35B-A3B-Q8_0.gguf + --ctx-size 262144 + --temp 0.7 + --cache-type-k q8_0 + --cache-type-v q8_0 + --n-gpu-layers 99 -```ini -[Qwen_Qwen3.6-35B-A3B-Q8_0] -model = /home/ai-server/models/Qwen_Qwen3.6-35B-A3B-Q8_0.gguf -ctx-size = 262144 -temp = 0.7 -cache-type-k = q8_0 -cache-type-v = q8_0 -n-gpu-layers = 99 - -[MiniMax-M2.7-IQ3_XXS] -model = /home/ai-server/models/MiniMax-M2.7-UD-IQ3_XXS-00001-of-NNNNN.gguf -ctx-size = 131072 -temp = 1.0 -cache-type-k = q8_0 -cache-type-v = q8_0 -n-gpu-layers = 99 + MiniMax-M2.7-IQ3_XXS: + cmd: | + /home/ai-server/llama.cpp/build/bin/llama-server + --model /home/ai-server/models/MiniMax-M2.7-UD-IQ3_XXS.gguf + --ctx-size 131072 + --temp 1.0 + --cache-type-k q8_0 + --cache-type-v q8_0 + --n-gpu-layers 99 ``` -Placeholder sections (without `model =`) show up in `GET /models` but are -filtered out by the extension's discovery — they would fail on load. 
- -### 4.4 Systemd user service — `~/.config/systemd/user/llama-server.service` +### 4.4 Systemd user service — `~/.config/systemd/user/llama-swap.service` ```ini [Unit] -Description=LLaMA.cpp AI Server (Router Mode, Vulkan) +Description=LLaMA-swap AI Server (Swap Mode) After=network.target Wants=network.target @@ -117,16 +119,10 @@ Type=simple User=ai-server Group=ai-server WorkingDirectory=/home/ai-server -ExecStart=/home/ai-server/llama.cpp/build/bin/llama-server \ +ExecStart=/home/ai-server/node_modules/.bin/llama-swap \ --host 0.0.0.0 \ --port 8080 \ - --models-dir /home/ai-server/models \ - --models-max 1 \ - --models-autoload \ - --models-preset /home/ai-server/.llama-models.ini \ - --gpu-layers 99 \ - --cache-type-k q8_0 \ - --cache-type-v q8_0 + --config /home/ai-server/.config/llama-swap/config.yaml LimitNOFILE=65536 LimitMEMLOCK=unlimited @@ -141,18 +137,10 @@ StandardError=journal WantedBy=default.target ``` -Important flags: - -- **No `-c `** at the router level. That flag is inherited by every child - worker and silently caps the preset's `ctx-size`. Let per-model presets win. -- **`--models-max 1`** enforces single-model concurrency (matters on shared - unified-memory hardware where two workers would fight for VRAM). -- **`--models-autoload`** spawns workers on demand via `POST /models/load`. 
- Enable and start: ```bash -systemctl --user daemon-reload && systemctl --user enable --now llama-server.service +systemctl --user daemon-reload && systemctl --user enable --now llama-swap.service loginctl enable-linger $(whoami) # keep user services running across logouts ``` @@ -160,12 +148,17 @@ loginctl enable-linger $(whoami) # keep user services running across logouts | Method | Path | Body | Notes | |---|---|---|---| -| `GET` | `/models` | — | List models; `status.args` contains the spawned worker's command line | -| `POST` | `/models/load` | `{"model":""}` | Payload key is `model`, **not** `id` | -| `POST` | `/models/unload` | `{"model":""}` | Same | -| `GET` | `/health` | — | `{"status":"ok"}` when router is up | +| `GET` | `/v1/models` | — | List models; `{"data":[{id,object,created,owned_by}]}` | +| `GET` | `/running` | — | Currently loaded models; `{"running":[{id,...}]}` | +| `POST` | `/api/models/unload` | — | Unload all models; returns `{"msg":"ok"}` | +| `POST` | `/api/models/unload/<id>` | — | Unload specific model; plain text `OK` | +| `GET` | `/upstream/<id>/health` | — | Warm-load model (forces spawn without inference) | +| `GET` | `/health` | — | Plain text `OK` (not JSON) | | `POST` | `/v1/chat/completions` | OpenAI Chat Completions payload | What pi and the web UI use | -| `GET` | `/` | — | Built-in SvelteKit chat UI with a model picker | + +> **Note:** Response bodies are mixed JSON and plain text. The extension's +> `routerRequest()` falls back to `{raw: buf}` for non-JSON responses, so +> unload calls won't crash — they'll return `{raw: "OK"}`. ## 5. Caddy + mTLS setup (192.168.2.2) @@ -238,11 +231,11 @@ registers the `ai-server` provider, and installs the admin slash commands. 
|---|---|---| | `/ai-server-status` | Tabular view of models, status, ctx size | HTTPS mTLS | | `/ai-server-refresh` | Re-discover models and re-register the provider | HTTPS mTLS | -| `/ai-server-load ` | Load a model on-demand | HTTPS mTLS | -| `/ai-server-unload ` | Unload a model | HTTPS mTLS | -| `/ai-server-ctx ` | Edit preset ctx-size, unload + reload | SSH + HTTPS | -| `/ai-server-preset` | Print the server's `~/.llama-models.ini` | SSH | -| `/ai-server-restart` | `systemctl --user restart llama-server.service` | SSH | +| `/ai-server-load <id>` | Warm-load a model via `/upstream/<id>/health` | HTTPS mTLS | +| `/ai-server-unload <id>` | Unload a model via `/api/models/unload/<id>` | HTTPS mTLS | +| `/ai-server-ctx <id> <n>` | Edit YAML config ctx-size, reload the model | SSH + HTTPS | +| `/ai-server-preset` | Print the server's llama-swap config (YAML) | SSH | +| `/ai-server-restart` | `systemctl --user restart llama-swap.service` | SSH | `<id>` arguments tab-complete against the live router model list. @@ -253,14 +246,13 @@ registers the `ai-server` provider, and installs the admin slash commands. ssh ai-server@192.168.2.3 cd ~/models && hf download / --include '**' --local-dir . -# Add a preset section to ~/.llama-models.ini — section name = model id -# (see example in §4.3) +# Add a config block to ~/.config/llama-swap/config.yaml (see example in §4.3) ``` Then from pi: ``` -/ai-server-refresh # discovers the new preset +/ai-server-refresh # discovers the new model /ai-server-load <id> # first load may take a minute for a cold GGUF ``` @@ -268,9 +260,8 @@ No extension-side config changes are needed — discovery picks it up. ## 9. Browser access to the built-in web UI -`llama-server` ships a SvelteKit chat UI at `/` with a model picker. Navigate to -`https://ai.shahondin1624.de/` in any browser that has the client cert and -trusts the root CA. +Navigate to `https://ai.shahondin1624.de/` in any browser that has the client +cert and trusts the root CA. 
### 9.1 Firefox (simplest path, always works) @@ -332,15 +323,18 @@ Verify under `brave://policy`. The policy must show status **OK**, not | Symptom | Likely cause | Fix | |---|---|---| -| pi: `HTTP 400: request exceeds available context size` | Router started with `-c `, overriding the preset's larger `ctx-size` | Remove the router-level `-c` flag from the systemd ExecStart | -| pi: `HTTP 400: File Not Found` on `/models/load` | Wrong JSON body key (older versions used `id`) | Must be `{"model":""}` — the extension's `admin.ts` already does this | +| pi: `HTTP 400: request exceeds available context size` | Model config has a small `--ctx-size` | Increase `--ctx-size` in the YAML config | +| pi: `HTTP 400: File Not Found` on load | Wrong model id — check `/v1/models` | Use the exact id from the models list | +| Model shows as `[unloaded]` in `/ai-server-status` | Model isn't currently loaded in llama-swap | Run `/ai-server-load ` to warm it | +| First request is slow | Cold model load — no preload configured | Add `hooks.on_startup.preload: []` to config | | `certutil: unable to open …root-ca.pem` | CA file not yet scp'd locally | Copy `root-ca.pem` from the Caddy host | | Brave: p12 import "Invalid or corrupt file" | OpenSSL 3 default PBES2/AES-256 encryption | Regenerate with `openssl pkcs12 -legacy -export …` | -| Brave: site loads but padlock is red, `ChromeRootStoreEnabled: Error` in `brave://policy` | Policy was removed upstream | Use `brave://certificate-manager/` → Custom, or use Firefox | +| Brave: site loads but padlock is red | Chrome Root Store issue | Use `brave://certificate-manager/` → Custom | | Cert selection prompt appears on every page load | `AutoSelectCertificateForUrls` policy missing or malformed | See §9.3 | | System-trust update-ca-trust has no effect on Brave | Brave is a Flatpak; sandbox doesn't see host `/etc/pki/ca-trust` | Import directly into the sandbox's NSS DB (§9.3) | -| Model shows as `[no model path]` in `/ai-server-status` | 
Preset section in `~/.llama-models.ini` has no `model =` line | Add the path, then `/ai-server-refresh` | -| Chat first-token latency seems long | Cold model load is not counted separately | First chat turn may wait 10–60s while the GGUF mmap's in; subsequent turns stream immediately | +| Chat first-token latency seems long | Cold model load | First chat turn may wait 10–60s while the GGUF mmap's in | +| `/ai-server-restart` fails | Wrong service unit name | Check `AI_SERVER_SERVICE_UNIT` / create the proper unit | +| `/ai-server-ctx` fails | YAML format changed | Edit `~/.config/llama-swap/config.yaml` manually first | ## 11. Security notes @@ -348,8 +342,8 @@ Verify under `brave://policy`. The policy must show status **OK**, not is the sole credential for API access. Treat it like an SSH key — do not share, do not commit, do not email. - To revoke a client, regenerate the root CA's cert list and remove/rename the - offending client cert file on Caddy. (Proper CRL/OCSP is not set up — this is - a single-user deployment.) + offending client cert file on Caddy. (Proper CRL/OCSP is not set up — this + is a single-user deployment.) - The `apiKey: "ai-server-mtls"` string in `index.ts` is a placeholder required by the pi model registry; no bearer token is sent over the wire. All auth is cert-based. @@ -363,10 +357,10 @@ Verify under `brave://policy`. 
The policy must show status **OK**, not | Path | Purpose | |---|---| | `~/llama.cpp/` | llama.cpp source + build tree | -| `~/llama.cpp/build/bin/llama-server` | Binary | +| `~/llama.cpp/build/bin/llama-server` | Binary (invoked by llama-swap) | | `~/models/*.gguf` | Model weights | -| `~/.llama-models.ini` | Router preset file | -| `~/.config/systemd/user/llama-server.service` | Service unit | +| `~/.config/llama-swap/config.yaml` | llama-swap YAML config | +| `~/.config/systemd/user/llama-swap.service` | Service unit | | `~/vram-monitor.sh` | Optional idle-unload cron helper | ### On the Caddy host (192.168.2.2) diff --git a/ai-server/admin.ts b/ai-server/admin.ts index f8a4f25..5f2f78f 100644 --- a/ai-server/admin.ts +++ b/ai-server/admin.ts @@ -3,21 +3,28 @@ import * as https from "node:https"; import { URL } from "node:url"; import { promisify } from "node:util"; import { + AI_SERVER_MODELS_PATH, AI_SERVER_PRESET_PATH, + AI_SERVER_RUNNING_PATH, + AI_SERVER_SERVICE_UNIT, AI_SERVER_SSH_HOST, + AI_SERVER_UNLOAD_ALL_PATH, + AI_SERVER_UNLOAD_PATH, + AI_SERVER_UPSTREAM_HEALTH_PATH, AI_SERVER_URL, type ServerModel, getAdminTimeoutMs, loadCerts, } from "./config.js"; import { - extractCtxSize, + parseCtxMapFromYaml, + extractCtxFromRunningCmd, isReasoningModel, isShardArtefact, } from "./router-utils.js"; // Re-export so existing index.ts imports keep working. -export { extractCtxSize, isReasoningModel }; +export { isReasoningModel }; const exec = promisify(execCb); @@ -84,12 +91,33 @@ async function routerRequest( export interface RouterModel { id: string; - status: { value: "loaded" | "unloaded" | "loading"; args: string[] }; + object?: string; + created?: number; + owned_by?: string; + /** Whether the model is currently loaded in llama-swap. */ + running?: boolean; } export async function listModels(): Promise { - const data = await routerRequest("GET", "/models"); - return (data?.data ?? 
[]) as RouterModel[]; + // llama-swap: GET /v1/models returns { data: [{ id, object, created, owned_by }] } + // GET /running returns { running: [{ id, ... }] } + // We merge: every model from /v1/models gets a `running` flag from /running. + const [modelsRes, runningRes] = await Promise.all([ + routerRequest("GET", AI_SERVER_MODELS_PATH), + routerRequest("GET", AI_SERVER_RUNNING_PATH), + ]); + + const models: RouterModel[] = (modelsRes?.data ?? []) as RouterModel[]; + const runningIds = new Set(); + if (runningRes?.running && Array.isArray(runningRes.running)) { + for (const entry of runningRes.running as Record[]) { + if (entry.id) runningIds.add(String(entry.id)); + } + } + for (const m of models) { + m.running = runningIds.has(m.id); + } + return models; } // Short TTL cache for listModels — tab-completion calls the completer on @@ -113,40 +141,75 @@ export function invalidateListModelsCache(): void { } export async function loadModel(id: string): Promise { - // The router's handler reads `body["model"]`; passing `{id}` yields a 404. - const r = await routerRequest("POST", "/models/load", { model: id }); + // llama-swap: GET /upstream//health forces a spawn (warm load). + // 2xx = success; plain text OK body is acceptable. + const r = await routerRequest("GET", AI_SERVER_UPSTREAM_HEALTH_PATH(id)); invalidateListModelsCache(); return r; } export async function unloadModel(id: string): Promise { - const r = await routerRequest("POST", "/models/unload", { model: id }); + // llama-swap: POST /api/models/unload/, no body. Returns plain text "OK". + const r = await routerRequest("POST", AI_SERVER_UNLOAD_PATH(id)); invalidateListModelsCache(); return r; } -// A preset is "runnable" only if it has a --model path. Placeholder sections -// like [small-7b] without model = ... show up in /models but have no --model -// arg and would fail on load. -function isRunnable(m: RouterModel): boolean { - return (m.status?.args ?? 
[]).includes("--model"); +export async function unloadAll(): Promise { + // llama-swap: POST /api/models/unload, no body. + const r = await routerRequest("POST", AI_SERVER_UNLOAD_ALL_PATH); + invalidateListModelsCache(); + return r; +} + +// llama-swap /v1/models only returns registered presets (all have a model +// path). Placeholder sections are not exposed. We only filter out shard +// artefacts. + +interface RunningEntry { + model: string; + cmd?: string; + state?: string; + ttl?: number; + proxy?: string; +} + +async function listRunning(): Promise { + const res = await routerRequest("GET", AI_SERVER_RUNNING_PATH); + return Array.isArray((res as any)?.running) + ? (res as any).running + : []; } export async function discoverModels(): Promise { - const models = await listModels(); + const [models, running, yaml] = await Promise.all([ + listModels(), + listRunning().catch(() => [] as RunningEntry[]), + readPreset().catch(() => ""), + ]); + + const ctxFromYaml = parseCtxMapFromYaml(yaml); + const ctxFromRunning = new Map(); + for (const r of running) { + const n = extractCtxFromRunningCmd(r.cmd); + if (n) ctxFromRunning.set(r.model, n); + } + return models - .filter(isRunnable) .filter((m) => !isShardArtefact(m.id)) .map((m) => { - const ctx = extractCtxSize(m) ?? 32768; - return { - id: m.id, - name: `${m.id} (AI Server)`, - reasoning: isReasoningModel(m.id), - contextWindow: ctx, - maxTokens: Math.min(16384, Math.max(2048, Math.floor(ctx / 2))), - }; - }); + const ctx = + ctxFromRunning.get(m.id) ?? // live process is authoritative + ctxFromYaml.get(m.id) ?? 
// config.yaml is next best + 32768; // last-resort fallback + return { + id: m.id, + name: `${m.id} (AI Server)`, + reasoning: isReasoningModel(m.id), + contextWindow: ctx, + maxTokens: Math.min(16384, Math.max(2048, Math.floor(ctx / 2))), + }; + }); } // ─── SSH helpers ───────────────────────────────────────────────────────── @@ -177,30 +240,83 @@ export async function readPreset(): Promise { } /** - * Set a `key = value` line inside a named [section] of the preset file. - * Preserves comments and all other lines. Errors if the key is absent. + * Set a `key = value` inside a named YAML section for llama-swap. + * + * llama-swap config.yaml structure (relevant excerpt): + * + * models: + * Qwen_Qwen3.6-35B-A3B-Q8_0: + * cmd: | + * /path/to/llama-server --model /path/to/gguf ... + * --ctx-size 32768 + * --temp 0.7 + * + * This function finds the `:` block under `models:`, locates the + * `--ctx-size N` line (or other supported flags), and replaces N. + * + * Supported keys: ctx-size, temp, n-gpu-layers */ export async function setPresetKey( section: string, key: string, value: string, ): Promise { + // Map short key names to the actual CLI flag used in cmd: + const flagMap: Record = { + "ctx-size": "--ctx-size", + "temp": "--temp", + "n-gpu-layers": "--n-gpu-layers", + }; + const flag = flagMap[key] ?? `--${key}`; + + // We use a sed-based approach on the YAML file: + // 1. Find the
: block under models: + // 2. Within that block, find the --flag N line + // 3. Replace N with the new value + // + // The sed script works line-by-line: + // - When we see ` ${section}:` under models:, enter editing mode + // - While editing, look for `--flag ` and replace it + // - Exit editing mode when we hit a line at the same or lesser indent + // that is not under this section + const escapedSection = section.replace(/[.[\]*/^$]/g, "\\$&"); + const escapedFlag = flag.replace(/[.[\]*/^$]/g, "\\$&"); + const awkScript = ` -awk -v sec="[${section}]" -v key=${shQuote(key)} -v val=${shQuote(value)} ' - BEGIN { in_s = 0; found = 0 } - /^\\[/ { in_s = ($0 == sec) } - in_s && $1 == key && $2 == "=" { print key " = " val; found = 1; next } - { print } - END { if (!found) exit 2 } +awk -v sec="${escapedSection}" -v flag="${escapedFlag}" -v val="${value}" ' + BEGIN { in_sec = 0; indent = 0 } + { + # Detect section header: "
:" (2-space indent, key followed by colon) + if (!in_sec && match($0, /^[[:space:]]{2}'${escapedSection}':[[:space:]]*$/)) { + in_sec = 1; + indent = 2; + } + # If we are in a section, check if we left it + if (in_sec) { + lineIndent = 0; + m = match($0, /^[[:space:]]*/); + if (m > 0) lineIndent = RLENGTH; + # If indent is <= 2 and line is not empty and not a continuation of cmd, + # we have left this section + if (lineIndent <= 2 && $0 !~ /^[[:space:]]*$/) { + in_sec = 0; + } + } + if (in_sec && match($0, " " flag " [0-9]+")) { + sub(flag " [0-9]+", flag " " val); + } + print + } ' ${AI_SERVER_PRESET_PATH} > ${AI_SERVER_PRESET_PATH}.tmp && mv ${AI_SERVER_PRESET_PATH}.tmp ${AI_SERVER_PRESET_PATH} `.trim(); + try { await runSsh(awkScript); } catch (err: any) { const msg = err?.message ?? String(err); if (msg.includes("exit code 2") || msg.match(/exit.*2/)) { throw new Error( - `Key "${key}" not found in [${section}] — add it to the preset manually first.`, + `Key "${key}" not found for model "${section}" — add it to the preset manually first.`, ); } throw err; @@ -209,7 +325,7 @@ awk -v sec="[${section}]" -v key=${shQuote(key)} -v val=${shQuote(value)} ' export async function restartService(): Promise { return runSsh( - "systemctl --user restart llama-server.service && systemctl --user is-active llama-server.service", + `systemctl --user restart ${AI_SERVER_SERVICE_UNIT} && systemctl --user is-active ${AI_SERVER_SERVICE_UNIT}`, ); } diff --git a/ai-server/config.ts b/ai-server/config.ts index 4c94b0b..844f45b 100644 --- a/ai-server/config.ts +++ b/ai-server/config.ts @@ -13,8 +13,29 @@ export const AI_SERVER_CHAT_PATH = "/v1/chat/completions"; // SSH target for admin operations (preset edits, systemctl). Uses key auth. export const AI_SERVER_SSH_HOST = process.env.AI_SERVER_SSH_HOST ?? "ai-server@192.168.2.3"; + +// llama-swap endpoint paths +export const AI_SERVER_MODELS_PATH = + process.env.AI_SERVER_MODELS_PATH ?? 
"/v1/models"; +export const AI_SERVER_RUNNING_PATH = + process.env.AI_SERVER_RUNNING_PATH ?? "/running"; +export const AI_SERVER_UNLOAD_ALL_PATH = + process.env.AI_SERVER_UNLOAD_ALL_PATH ?? "/api/models/unload"; +export const AI_SERVER_UNLOAD_PATH = (id: string) => + process.env.AI_SERVER_UNLOAD_PATH ?? + `/api/models/unload/${encodeURIComponent(id)}`; +export const AI_SERVER_UPSTREAM_HEALTH_PATH = (id: string) => + process.env.AI_SERVER_UPSTREAM_HEALTH_PATH ?? + `/upstream/${encodeURIComponent(id)}/health`; + +// llama-swap config file (YAML, replaces old INI preset) export const AI_SERVER_PRESET_PATH = - process.env.AI_SERVER_PRESET_PATH ?? "~/.llama-models.ini"; + process.env.AI_SERVER_PRESET_PATH ?? + "~/.config/llama-swap/config.yaml"; + +// systemd service unit for llama-swap +export const AI_SERVER_SERVICE_UNIT = + process.env.AI_SERVER_SERVICE_UNIT ?? "llama-swap.service"; // Distinct api id so registering streamSimple does NOT overwrite the // built-in openai-completions provider (the api-registry keys by api name). diff --git a/ai-server/index.ts b/ai-server/index.ts index b491e44..71dd805 100644 --- a/ai-server/index.ts +++ b/ai-server/index.ts @@ -1,7 +1,6 @@ import type { ExtensionAPI } from "@mariozechner/pi-coding-agent"; import { discoverModels, - extractCtxSize, listModels, listModelsCached, loadModel, @@ -122,13 +121,8 @@ export default async function (pi: ExtensionAPI) { const routerModels = await listModels(); const lines = [`AI Server: ${AI_SERVER_URL}`]; for (const m of routerModels) { - const status = m.status?.value ?? "?"; - const ctx = extractCtxSize(m); - const hasModel = (m.status?.args ?? []).includes("--model"); - const marker = hasModel ? " " : " [no model path]"; - lines.push( - ` ${m.id} [${status}] ctx=${ctx ?? "?"}${marker}`, - ); + const status = m.running ? 
"loaded" : "unloaded"; + lines.push(` ${m.id} [${status}]`); } ctx.ui.notify(lines.join("\n"), "info"); } catch (err) { @@ -246,7 +240,7 @@ export default async function (pi: ExtensionAPI) { }); pi.registerCommand("ai-server-preset", { - description: "Print ~/.llama-models.ini on the ai-server", + description: "Print llama-swap config on the ai-server", handler: async (_args, ctx) => { try { const text = await readPreset(); @@ -261,7 +255,7 @@ export default async function (pi: ExtensionAPI) { }); pi.registerCommand("ai-server-restart", { - description: "Restart the ai-server llama-server service", + description: "Restart the ai-server llama-swap service", handler: async (_args, ctx) => { const ok = await ctx.ui.confirm( "Restart llama-server?", diff --git a/ai-server/router-utils.ts b/ai-server/router-utils.ts index e073eb1..bb3e27d 100644 --- a/ai-server/router-utils.ts +++ b/ai-server/router-utils.ts @@ -6,19 +6,80 @@ export interface RouterModelMeta { id: string; - status?: { value: string; args: string[] }; + object?: string; + created?: number; + owned_by?: string; + /** Whether the model is currently loaded in llama-swap. */ + running?: boolean; } /** - * Pull `--ctx-size ` out of the worker's argv. Returns null if the flag - * is missing, at the end of argv, or the value isn't a number. + * Parse ctx-size values from every model block in llama-swap's config.yaml. + * + * The YAML has a structure like: + * + * models: + * Qwen_Qwen3.6-35B-A3B-Q8_0: + * cmd: | + * /path/to/llama-server + * --ctx-size 262144 + * --temp 0.7 + * + * This function scans for `--ctx-size N` lines within each model block and + * returns a Map of id → ctxSize. If a model appears multiple times it keeps + * the last value found. */ -export function extractCtxSize(m: RouterModelMeta): number | null { - const args = m.status?.args ?? 
[]; - const i = args.indexOf("--ctx-size"); - if (i < 0 || i + 1 >= args.length) return null; - const n = Number(args[i + 1]); - return Number.isFinite(n) ? n : null; +export function parseCtxMapFromYaml(yaml: string): Map { + const map = new Map(); + let currentId: string | null = null; + + for (const raw of yaml.split("\n")) { + const line = raw.replace(/\r$/, ""); + + // Skip comments / blank + if (!line.trim() || line.trim().startsWith("#")) continue; + + // New model block: exactly two-space indent, ":" with nothing + // meaningful after the colon (llama-swap uses 2-space indent under + // `models:`). + const idMatch = /^ ([A-Za-z0-9._-]+):\s*$/.exec(line); + if (idMatch) { + currentId = idMatch[1]; + continue; + } + + // Top-level key resets context (e.g. `macros:`, `hooks:`) + if (/^[A-Za-z]/.test(line)) { + currentId = null; + continue; + } + + if (!currentId) continue; + + // Look for --ctx-size N anywhere in the line (handles indented cmd: + // blocks where the flag is on its own line). + const ctx = /--ctx-size\s+(\d+)/.exec(line); + if (ctx) { + map.set(currentId, Number(ctx[1])); + currentId = null; // one ctx per model + } + } + + return map; +} + +/** + * Extract ctx-size from a /running entry's `cmd` string. + * + * The /running endpoint returns entries like: + * { model: "Qwen_...", cmd: "/path/llama-server --model ... --ctx-size 262144 ...", ... } + * + * This is the authoritative source for the currently loaded model's ctx. + */ +export function extractCtxFromRunningCmd(cmd: string | undefined): number | null { + if (!cmd) return null; + const m = /--ctx-size\s+(\d+)/.exec(cmd); + return m ? 
Number(m[1]) : null; } /** diff --git a/tests/router-utils.test.ts b/tests/router-utils.test.ts index 0400625..0173e72 100644 --- a/tests/router-utils.test.ts +++ b/tests/router-utils.test.ts @@ -7,45 +7,102 @@ import assert from "node:assert/strict"; import { test } from "node:test"; import { - extractCtxSize, + parseCtxMapFromYaml, + extractCtxFromRunningCmd, isReasoningModel, isShardArtefact, } from "../ai-server/router-utils.ts"; -// ── extractCtxSize ────────────────────────────────────────────────────── +// ── parseCtxMapFromYaml ───────────────────────────────────────────────── -test("extractCtxSize: --ctx-size present with value", () => { - const m = { - id: "x", - status: { value: "loaded", args: ["--host", "127.0.0.1", "--ctx-size", "131072"] }, - }; - assert.equal(extractCtxSize(m), 131072); +test("parseCtxMapFromYaml: extracts ctx-size from model blocks", () => { + const yaml = ` +models: + Qwen_Qwen3.6-35B-A3B-Q8_0: + cmd: | + /home/ai-server/llama.cpp/build/bin/llama-server + --model /home/ai-server/models/Qwen_Qwen3.6-35B-A3B-Q8_0.gguf + --ctx-size 262144 + --temp 0.7 + MiniMax-M2.7-IQ3_XXS: + cmd: | + /home/ai-server/llama.cpp/build/bin/llama-server + --model /home/ai-server/models/MiniMax-M2.7-UD-IQ3_XXS.gguf + --ctx-size 131072 + --temp 1.0 +`; + const map = parseCtxMapFromYaml(yaml); + assert.equal(map.get("Qwen_Qwen3.6-35B-A3B-Q8_0"), 262144); + assert.equal(map.get("MiniMax-M2.7-IQ3_XXS"), 131072); + assert.equal(map.size, 2); }); -test("extractCtxSize: missing --ctx-size -> null", () => { - assert.equal(extractCtxSize({ id: "x", status: { value: "loaded", args: ["--host", "127"] } }), null); +test("parseCtxMapFromYaml: skips comments and blank lines", () => { + const yaml = ` +# This is a comment +models: + + # Model with large context + Qwen_Qwen3.6-35B-A3B-Q8_0: + cmd: | + /path/to/server + --ctx-size 65536 + --temp 0.7 +`; + const map = parseCtxMapFromYaml(yaml); + assert.equal(map.get("Qwen_Qwen3.6-35B-A3B-Q8_0"), 65536); }); 
-test("extractCtxSize: --ctx-size at end of argv -> null (no value)", () => { - assert.equal(extractCtxSize({ id: "x", status: { value: "loaded", args: ["--ctx-size"] } }), null); +test("parseCtxMapFromYaml: resets on top-level keys", () => { + const yaml = ` +models: + Qwen_Qwen3.6-35B-A3B-Q8_0: + cmd: | + /path/to/server + --ctx-size 262144 +hooks: + on_startup: + preload: + - Qwen_Qwen3.6-35B-A3B-Q8_0 +`; + const map = parseCtxMapFromYaml(yaml); + assert.equal(map.get("Qwen_Qwen3.6-35B-A3B-Q8_0"), 262144); + // "preload" is not a valid model id pattern, but even if it were, + // it's under hooks: so should not be included. + assert.ok(!map.has("preload")); }); -test("extractCtxSize: non-numeric value -> null", () => { - assert.equal( - extractCtxSize({ id: "x", status: { value: "loaded", args: ["--ctx-size", "notanumber"] } }), - null, - ); +test("parseCtxMapFromYaml: empty yaml returns empty map", () => { + const map = parseCtxMapFromYaml(""); + assert.equal(map.size, 0); }); -test("extractCtxSize: zero is valid (not null)", () => { - assert.equal( - extractCtxSize({ id: "x", status: { value: "loaded", args: ["--ctx-size", "0"] } }), - 0, - ); +test("parseCtxMapFromYaml: model without ctx-size is skipped", () => { + const yaml = ` +models: + SmallModel: + cmd: | + /path/to/server + --temp 0.7 +`; + const map = parseCtxMapFromYaml(yaml); + assert.equal(map.get("SmallModel"), undefined); + assert.equal(map.size, 0); }); -test("extractCtxSize: missing status entirely -> null", () => { - assert.equal(extractCtxSize({ id: "x" }), null); +// ── extractCtxFromRunningCmd ──────────────────────────────────────────── + +test("extractCtxFromRunningCmd: parses --ctx-size from cmd string", () => { + const cmd = "/home/ai-server/llama.cpp/build/bin/llama-server --model /home/ai-server/models/Qwen.gguf --ctx-size 262144 --temp 0.7"; + assert.equal(extractCtxFromRunningCmd(cmd), 262144); +}); + +test("extractCtxFromRunningCmd: undefined cmd returns null", () => { + 
assert.equal(extractCtxFromRunningCmd(undefined), null); +}); + +test("extractCtxFromRunningCmd: cmd without --ctx-size returns null", () => { + assert.equal(extractCtxFromRunningCmd("/path/to/server --temp 0.7"), null); }); // ── isShardArtefact ─────────────────────────────────────────────────────