From f7af6607275b23e73f9098bc04dc62f2f32ab52c Mon Sep 17 00:00:00 2001
From: shahondin1624 <shahondin1624@gmail.com>
Date: Wed, 27 May 2026 10:42:19 +0200
Subject: [PATCH] migrate ai-server extension from llama.cpp router to
 llama-swap
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Endpoint rewrites:
  - GET /v1/models + /running → merged listModels() with running flag
  - POST /models/load → GET /upstream/<id>/health (warm load)
  - POST /models/unload → POST /api/models/unload/<id> (no body)
  - Added POST /api/models/unload for unloadAll()

Config migration:
  - Preset path: ~/.llama-models.ini → ~/.config/llama-swap/config.yaml
  - Service unit: llama-server.service → llama-swap.service
  - setPresetKey() rewritten from INI awk to YAML-aware awk for
    editing --ctx-size/--temp/--n-gpu-layers in cmd: blocks

Per-model ctx-size (fixes 0/33k bug):
  - parseCtxMapFromYaml(): walks config.yaml, extracts --ctx-size N per
    model block → Map<id, ctxSize>
  - extractCtxFromRunningCmd(): parses --ctx-size from /running cmd string
  - discoverModels(): Promise.all(listModels, listRunning, readPreset),
    ctx priority: running cmd → yaml → 32768 fallback
  - Removed broken extractCtxSize stub and dangling imports

Tests: 14 passing (parseCtxMapFromYaml ×5, extractCtxFromRunningCmd ×3,
isShardArtefact ×3, isReasoningModel ×3)

README: full rewrite covering llama-swap architecture, YAML config format,
new endpoints, troubleshooting table updated.
---
 ai-server/README.md        | 180 ++++++++++++++++++------------------
 ai-server/admin.ts         | 184 ++++++++++++++++++++++++++++++-------
 ai-server/config.ts        |  23 ++++-
 ai-server/index.ts         |  14 +--
 ai-server/router-utils.ts  |  79 ++++++++++++++--
 tests/router-utils.test.ts | 105 ++++++++++++++++-----
 6 files changed, 414 insertions(+), 171 deletions(-)
diff --git a/ai-server/README.md b/ai-server/README.md
index 3d08d65..ff2a519 100644
--- a/ai-server/README.md
+++ b/ai-server/README.md
@@ -1,9 +1,9 @@
-# ai-server — PI extension for a self-hosted llama.cpp router behind mTLS
+# ai-server — PI extension for a self-hosted llama-swap server behind mTLS
 
-A multi-file pi extension that exposes a remote llama.cpp router as a provider
-to pi, with dynamic model discovery and admin slash commands. Chat streams use
-client-certificate TLS so the endpoint can be exposed over the public internet
-without a bearer token.
+A multi-file pi extension that exposes a remote llama-swap instance as a
+provider to pi, with dynamic model discovery and admin slash commands. Chat
+streams use client-certificate TLS so the endpoint can be exposed over the
+public internet without a bearer token.
 
 ---
 
@@ -11,21 +11,21 @@ without a bearer token.
 
 ```
 ┌────────────┐    mTLS (HTTPS) ┌──────────────┐   HTTP  ┌─────────────────┐
-│ pi client  │───────────────►│ Caddy        │────────►│ llama-server     │
+│ pi client  │───────────────►│ Caddy        │────────►│ llama-swap       │
 │ (this ext) │                │ 192.168.2.2  │         │ 192.168.2.3:8080 │
-└────────────┘   client cert  │ ai.…         │         │ router mode      │
-                              └──────────────┘         │ --models-max 1   │
+└────────────┘   client cert  │ ai.…         │         │ swap mode        │
+                              └──────────────┘         │ globalTTL: 1800  │
+                                                       │ scheduler: one   │
                                                        └─────────────────┘
                                                                │
-                                                      ~/.llama-models.ini
-                                                      (per-model presets)
+                                                      ~/.config/llama-swap/config.yaml
+                                                      (YAML model config)
 ```
 
-- **Caddy** terminates TLS and enforces `require_and_verify` client-cert auth on
-  `ai.shahondin1624.de`. Plaintext HTTP is forwarded to the llama-server router.
-- **llama-server** runs in `--models-mode router` with `--models-max 1`, so
-  exactly one worker is loaded at a time; selecting a different model unloads
-  the previous one.
+- **Caddy** terminates TLS and enforces `require_and_verify` client-cert auth
+  on `ai.shahondin1624.de`. Plaintext HTTP is forwarded to llama-swap.
+- **llama-swap** runs in swap mode, managing model lifecycle (load/unload/swap)
+  with a YAML config at `~/.config/llama-swap/config.yaml`.
 - **This extension** performs OpenAI-compatible chat streaming over mTLS and
   surfaces admin endpoints as pi slash commands.
 
@@ -37,7 +37,7 @@ without a bearer token.
 ├── config.ts      URLs, SSH host, cert paths, MODELS[] fallback
 ├── messages.ts    Context → OpenAI chat/completions messages
 ├── stream.ts      custom streamSimple: SSE parse, mTLS HTTPS, pi-ai events
-├── admin.ts       router HTTP client + SSH helpers (preset edit, systemctl)
+├── admin.ts       router HTTP client + SSH helpers (YAML edit, systemctl)
 └── README.md      this file
 ```
 
@@ -54,61 +54,63 @@ All are optional — the defaults match the current host.
 | `AI_SERVER_CLIENT_KEY` | `<certs>/client-key.pem` | Client private key |
 | `AI_SERVER_TIMEOUT_MS` | `300000` | Per-request stream timeout |
 | `AI_SERVER_SSH_HOST` | `ai-server@192.168.2.3` | SSH target for admin commands |
-| `AI_SERVER_PRESET_PATH` | `~/.llama-models.ini` | Preset path on the SSH target |
+| `AI_SERVER_PRESET_PATH` | `~/.config/llama-swap/config.yaml` | YAML config on the SSH target |
+| `AI_SERVER_SERVICE_UNIT` | `llama-swap.service` | systemd unit name |
+| `AI_SERVER_MODELS_PATH` | `/v1/models` | Models list endpoint |
+| `AI_SERVER_RUNNING_PATH` | `/running` | Currently running models endpoint |
+| `AI_SERVER_UNLOAD_PATH` | `/api/models/unload/<id>` | Unload single model |
+| `AI_SERVER_UNLOAD_ALL_PATH` | `/api/models/unload` | Unload all models |
+| `AI_SERVER_UPSTREAM_HEALTH_PATH` | `/upstream/<id>/health` | Warm-load / health endpoint |
 
 ## 4. Server-side setup (192.168.2.3)
 
-### 4.1 llama.cpp build
+### 4.1 llama-swap install
 
 ```bash
-git clone https://github.com/ggerganov/llama.cpp ~/llama.cpp
-cd ~/llama.cpp && cmake -B build -DGGML_VULKAN=ON && cmake --build build --config Release -j$(nproc)
+npm install -g llama-swap
+# or use the binary release from the llama-swap GitHub repo
 ```
 
-Vulkan is used for GPU offload on the Strix Halo iGPU (no ROCm needed). The
-binary ends up at `~/llama.cpp/build/bin/llama-server`.
-
 ### 4.2 Model storage
 
 ```
 ~/models/<model-name>.gguf
 ```
 
-Multi-shard GGUFs (`*-00001-of-NNNNN.gguf`) work too — point the preset at the
-first shard and llama.cpp auto-loads the rest.
+### 4.3 Config file — `~/.config/llama-swap/config.yaml`
 
-### 4.3 Preset file — `~/.llama-models.ini`
+llama-swap uses a YAML config file. Each model is defined under `models:` with
+a `cmd:` block containing the llama-server invocation.
 
-Router mode consults this file. Each `[section]` is a model id usable in API
-requests. The section name and `model =` path are the only required fields;
-the rest become `--flag value` args to the per-model worker when it spawns.
+```yaml
+globalTTL: 1800
+models:
+  Qwen_Qwen3.6-35B-A3B-Q8_0:
+    cmd: |
+      /home/ai-server/llama.cpp/build/bin/llama-server
+      --model /home/ai-server/models/Qwen_Qwen3.6-35B-A3B-Q8_0.gguf
+      --ctx-size 262144
+      --temp 0.7
+      --cache-type-k q8_0
+      --cache-type-v q8_0
+      --n-gpu-layers 99
 
-```ini
-[Qwen_Qwen3.6-35B-A3B-Q8_0]
-model = /home/ai-server/models/Qwen_Qwen3.6-35B-A3B-Q8_0.gguf
-ctx-size = 262144
-temp = 0.7
-cache-type-k = q8_0
-cache-type-v = q8_0
-n-gpu-layers = 99
-
-[MiniMax-M2.7-IQ3_XXS]
-model = /home/ai-server/models/MiniMax-M2.7-UD-IQ3_XXS-00001-of-NNNNN.gguf
-ctx-size = 131072
-temp = 1.0
-cache-type-k = q8_0
-cache-type-v = q8_0
-n-gpu-layers = 99
+  MiniMax-M2.7-IQ3_XXS:
+    cmd: |
+      /home/ai-server/llama.cpp/build/bin/llama-server
+      --model /home/ai-server/models/MiniMax-M2.7-UD-IQ3_XXS.gguf
+      --ctx-size 131072
+      --temp 1.0
+      --cache-type-k q8_0
+      --cache-type-v q8_0
+      --n-gpu-layers 99
 ```
 
-Placeholder sections (without `model =`) show up in `GET /models` but are
-filtered out by the extension's discovery — they would fail on load.
-
-### 4.4 Systemd user service — `~/.config/systemd/user/llama-server.service`
+### 4.4 Systemd user service — `~/.config/systemd/user/llama-swap.service`
 
 ```ini
 [Unit]
-Description=LLaMA.cpp AI Server (Router Mode, Vulkan)
+Description=LLaMA-swap AI Server (Swap Mode)
 After=network.target
 Wants=network.target
 
@@ -117,16 +119,10 @@ Type=simple
 User=ai-server
 Group=ai-server
 WorkingDirectory=/home/ai-server
-ExecStart=/home/ai-server/llama.cpp/build/bin/llama-server \
+ExecStart=/home/ai-server/node_modules/.bin/llama-swap \
     --host 0.0.0.0 \
     --port 8080 \
-    --models-dir /home/ai-server/models \
-    --models-max 1 \
-    --models-autoload \
-    --models-preset /home/ai-server/.llama-models.ini \
-    --gpu-layers 99 \
-    --cache-type-k q8_0 \
-    --cache-type-v q8_0
+    --config /home/ai-server/.config/llama-swap/config.yaml
 
 LimitNOFILE=65536
 LimitMEMLOCK=unlimited
@@ -141,18 +137,10 @@ StandardError=journal
 WantedBy=default.target
 ```
 
-Important flags:
-
-- **No `-c <N>`** at the router level. That flag is inherited by every child
-  worker and silently caps the preset's `ctx-size`. Let per-model presets win.
-- **`--models-max 1`** enforces single-model concurrency (matters on shared
-  unified-memory hardware where two workers would fight for VRAM).
-- **`--models-autoload`** spawns workers on demand via `POST /models/load`.
-
 Enable and start:
 
 ```bash
-systemctl --user daemon-reload && systemctl --user enable --now llama-server.service
+systemctl --user daemon-reload && systemctl --user enable --now llama-swap.service
 loginctl enable-linger $(whoami)   # keep user services running across logouts
 ```
 
@@ -160,12 +148,17 @@ loginctl enable-linger $(whoami)   # keep user services running across logouts
 
 | Method | Path | Body | Notes |
 |---|---|---|---|
-| `GET`  | `/models` | — | List models; `status.args` contains the spawned worker's command line |
-| `POST` | `/models/load` | `{"model":"<id>"}` | Payload key is `model`, **not** `id` |
-| `POST` | `/models/unload` | `{"model":"<id>"}` | Same |
-| `GET`  | `/health` | — | `{"status":"ok"}` when router is up |
+| `GET`  | `/v1/models` | — | List models; `{"data":[{id,object,created,owned_by}]}` |
+| `GET`  | `/running` | — | Currently loaded models; `{"running":[{model,cmd,state,...}]}` |
+| `POST` | `/api/models/unload` | — | Unload all models; returns `{"msg":"ok"}` |
+| `POST` | `/api/models/unload/<id>` | — | Unload specific model; plain text `OK` |
+| `GET`  | `/upstream/<id>/health` | — | Warm-load model (forces spawn without inference) |
+| `GET`  | `/health` | — | Plain text `OK` (not JSON) |
 | `POST` | `/v1/chat/completions` | OpenAI Chat Completions payload | What pi and the web UI use |
-| `GET`  | `/` | — | Built-in SvelteKit chat UI with a model picker |
+
+> **Note:** Response bodies are mixed JSON and plain text. The extension's
+> `routerRequest()` falls back to `{raw: buf}` for non-JSON responses, so
+> unload calls won't crash — they'll return `{raw: "OK"}`.
 
 ## 5. Caddy + mTLS setup (192.168.2.2)
 
@@ -238,11 +231,11 @@ registers the `ai-server` provider, and installs the admin slash commands.
 |---|---|---|
 | `/ai-server-status` | Tabular view of models, status, ctx size | HTTPS mTLS |
 | `/ai-server-refresh` | Re-discover models and re-register the provider | HTTPS mTLS |
-| `/ai-server-load <id>` | Load a model on-demand | HTTPS mTLS |
-| `/ai-server-unload <id>` | Unload a model | HTTPS mTLS |
-| `/ai-server-ctx <id> <size>` | Edit preset ctx-size, unload + reload | SSH + HTTPS |
-| `/ai-server-preset` | Print the server's `~/.llama-models.ini` | SSH |
-| `/ai-server-restart` | `systemctl --user restart llama-server.service` | SSH |
+| `/ai-server-load <id>` | Warm-load a model via `/upstream/<id>/health` | HTTPS mTLS |
+| `/ai-server-unload <id>` | Unload a model via `/api/models/unload/<id>` | HTTPS mTLS |
+| `/ai-server-ctx <id> <size>` | Edit YAML config ctx-size, reload the model | SSH + HTTPS |
+| `/ai-server-preset` | Print the server's llama-swap config (YAML) | SSH |
+| `/ai-server-restart` | `systemctl --user restart llama-swap.service` | SSH |
 
 `<id>` arguments tab-complete against the live router model list.
 
@@ -253,14 +246,13 @@ registers the `ai-server` provider, and installs the admin slash commands.
 ssh ai-server@192.168.2.3
 cd ~/models && hf download <author>/<repo> --include '*<quant>*' --local-dir .
 
-# Add a preset section to ~/.llama-models.ini — section name = model id
-# (see example in §4.3)
+# Add a config block to ~/.config/llama-swap/config.yaml (see example in §4.3)
 ```
 
 Then from pi:
 
 ```
-/ai-server-refresh      # discovers the new preset
+/ai-server-refresh      # discovers the new model
 /ai-server-load <id>    # first load may take a minute for a cold GGUF
 ```
 
@@ -268,9 +260,8 @@ No extension-side config changes are needed — discovery picks it up.
 
 ## 9. Browser access to the built-in web UI
 
-`llama-server` ships a SvelteKit chat UI at `/` with a model picker. Navigate to
-`https://ai.shahondin1624.de/` in any browser that has the client cert and
-trusts the root CA.
+Navigate to `https://ai.shahondin1624.de/` in any browser that has the client
+cert and trusts the root CA.
 
 ### 9.1 Firefox (simplest path, always works)
 
@@ -332,15 +323,18 @@ Verify under `brave://policy`. The policy must show status **OK**, not
 
 | Symptom | Likely cause | Fix |
 |---|---|---|
-| pi: `HTTP 400: request exceeds available context size` | Router started with `-c <small>`, overriding the preset's larger `ctx-size` | Remove the router-level `-c` flag from the systemd ExecStart |
-| pi: `HTTP 400: File Not Found` on `/models/load` | Wrong JSON body key (older versions used `id`) | Must be `{"model":"<id>"}` — the extension's `admin.ts` already does this |
+| pi: `HTTP 400: request exceeds available context size` | Model config has a small `--ctx-size` | Increase `--ctx-size` in the YAML config |
+| pi: `HTTP 400: File Not Found` on load | Wrong model id — check `/v1/models` | Use the exact id from the models list |
+| Model shows as `[unloaded]` in `/ai-server-status` | Model isn't currently loaded in llama-swap | Run `/ai-server-load <id>` to warm it |
+| First request is slow | Cold model load — no preload configured | Add `hooks.on_startup.preload: [<id>]` to config |
 | `certutil: unable to open …root-ca.pem` | CA file not yet scp'd locally | Copy `root-ca.pem` from the Caddy host |
 | Brave: p12 import "Invalid or corrupt file" | OpenSSL 3 default PBES2/AES-256 encryption | Regenerate with `openssl pkcs12 -legacy -export …` |
-| Brave: site loads but padlock is red, `ChromeRootStoreEnabled: Error` in `brave://policy` | Policy was removed upstream | Use `brave://certificate-manager/` → Custom, or use Firefox |
+| Brave: site loads but padlock is red | Chrome Root Store issue | Use `brave://certificate-manager/` → Custom |
 | Cert selection prompt appears on every page load | `AutoSelectCertificateForUrls` policy missing or malformed | See §9.3 |
 | System-trust update-ca-trust has no effect on Brave | Brave is a Flatpak; sandbox doesn't see host `/etc/pki/ca-trust` | Import directly into the sandbox's NSS DB (§9.3) |
-| Model shows as `[no model path]` in `/ai-server-status` | Preset section in `~/.llama-models.ini` has no `model =` line | Add the path, then `/ai-server-refresh` |
-| Chat first-token latency seems long | Cold model load is not counted separately | First chat turn may wait 10–60s while the GGUF mmap's in; subsequent turns stream immediately |
+| Chat first-token latency seems long | Cold model load | First chat turn may wait 10–60s while the GGUF mmap's in |
+| `/ai-server-restart` fails | Wrong service unit name | Check `AI_SERVER_SERVICE_UNIT` / create the proper unit |
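+| `/ai-server-ctx` fails | The model's `cmd:` block has no `--ctx-size` line, or its layout differs from the example in §4.3 | Add `--ctx-size <N>` to that block in `~/.config/llama-swap/config.yaml` manually, then retry |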
 
 ## 11. Security notes
 
@@ -348,8 +342,8 @@ Verify under `brave://policy`. The policy must show status **OK**, not
   is the sole credential for API access. Treat it like an SSH key — do not
   share, do not commit, do not email.
 - To revoke a client, regenerate the root CA's cert list and remove/rename the
-  offending client cert file on Caddy. (Proper CRL/OCSP is not set up — this is
-  a single-user deployment.)
+  offending client cert file on Caddy. (Proper CRL/OCSP is not set up — this
+  is a single-user deployment.)
 - The `apiKey: "ai-server-mtls"` string in `index.ts` is a placeholder required
   by the pi model registry; no bearer token is sent over the wire. All auth is
   cert-based.
@@ -363,10 +357,10 @@ Verify under `brave://policy`. The policy must show status **OK**, not
 | Path | Purpose |
 |---|---|
 | `~/llama.cpp/` | llama.cpp source + build tree |
-| `~/llama.cpp/build/bin/llama-server` | Binary |
+| `~/llama.cpp/build/bin/llama-server` | Binary (invoked by llama-swap) |
 | `~/models/*.gguf` | Model weights |
-| `~/.llama-models.ini` | Router preset file |
-| `~/.config/systemd/user/llama-server.service` | Service unit |
+| `~/.config/llama-swap/config.yaml` | llama-swap YAML config |
+| `~/.config/systemd/user/llama-swap.service` | Service unit |
 | `~/vram-monitor.sh` | Optional idle-unload cron helper |
 
 ### On the Caddy host (192.168.2.2)
diff --git a/ai-server/admin.ts b/ai-server/admin.ts
index f8a4f25..5f2f78f 100644
--- a/ai-server/admin.ts
+++ b/ai-server/admin.ts
@@ -3,21 +3,28 @@ import * as https from "node:https";
 import { URL } from "node:url";
 import { promisify } from "node:util";
 import {
+	AI_SERVER_MODELS_PATH,
 	AI_SERVER_PRESET_PATH,
+	AI_SERVER_RUNNING_PATH,
+	AI_SERVER_SERVICE_UNIT,
 	AI_SERVER_SSH_HOST,
+	AI_SERVER_UNLOAD_ALL_PATH,
+	AI_SERVER_UNLOAD_PATH,
+	AI_SERVER_UPSTREAM_HEALTH_PATH,
 	AI_SERVER_URL,
 	type ServerModel,
 	getAdminTimeoutMs,
 	loadCerts,
 } from "./config.js";
 import {
-	extractCtxSize,
+	parseCtxMapFromYaml,
+	extractCtxFromRunningCmd,
 	isReasoningModel,
 	isShardArtefact,
 } from "./router-utils.js";
 
 // Re-export so existing index.ts imports keep working.
-export { extractCtxSize, isReasoningModel };
+export { isReasoningModel };
 
 const exec = promisify(execCb);
 
@@ -84,12 +91,33 @@ async function routerRequest(
 
 export interface RouterModel {
 	id: string;
-	status: { value: "loaded" | "unloaded" | "loading"; args: string[] };
+	object?: string;
+	created?: number;
+	owned_by?: string;
+	/** Whether the model is currently loaded in llama-swap. */
+	running?: boolean;
 }
 
 export async function listModels(): Promise<RouterModel[]> {
-	const data = await routerRequest("GET", "/models");
-	return (data?.data ?? []) as RouterModel[];
+	// llama-swap: GET /v1/models returns { data: [{ id, object, created, owned_by }] }
+	// GET /running returns { running: [{ model, cmd, state, ... }] } (see RunningEntry).
+	// We merge: every model from /v1/models gets a `running` flag from /running.
+	const [modelsRes, runningRes] = await Promise.all([
+		routerRequest("GET", AI_SERVER_MODELS_PATH),
+		routerRequest("GET", AI_SERVER_RUNNING_PATH),
+	]);
+
+	const models: RouterModel[] = (modelsRes?.data ?? []) as RouterModel[];
+	const runningIds = new Set<string>();
+	if (Array.isArray(runningRes?.running)) {
+		for (const entry of runningRes.running as Record<string, unknown>[]) {
+			// Entries are keyed by `model`; `id` is accepted as a fallback.
+			const key = entry.model ?? entry.id;
+			if (key) runningIds.add(String(key));
+		}
+	}
+	for (const m of models) m.running = runningIds.has(m.id);
+	return models;
 }
 
 // Short TTL cache for listModels — tab-completion calls the completer on
@@ -113,40 +141,75 @@ export function invalidateListModelsCache(): void {
 }
 
 export async function loadModel(id: string): Promise<unknown> {
-	// The router's handler reads `body["model"]`; passing `{id}` yields a 404.
-	const r = await routerRequest("POST", "/models/load", { model: id });
+	// llama-swap: GET /upstream/<id>/health forces a spawn (warm load).
+	// 2xx = success; plain text OK body is acceptable.
+	const r = await routerRequest("GET", AI_SERVER_UPSTREAM_HEALTH_PATH(id));
 	invalidateListModelsCache();
 	return r;
 }
 
 export async function unloadModel(id: string): Promise<unknown> {
-	const r = await routerRequest("POST", "/models/unload", { model: id });
+	// llama-swap: POST /api/models/unload/<id>, no body. Returns plain text "OK".
+	const r = await routerRequest("POST", AI_SERVER_UNLOAD_PATH(id));
 	invalidateListModelsCache();
 	return r;
 }
 
-// A preset is "runnable" only if it has a --model path. Placeholder sections
-// like [small-7b] without model = ... show up in /models but have no --model
-// arg and would fail on load.
-function isRunnable(m: RouterModel): boolean {
-	return (m.status?.args ?? []).includes("--model");
+export async function unloadAll(): Promise<unknown> {
+	// llama-swap: POST /api/models/unload, no body.
+	const r = await routerRequest("POST", AI_SERVER_UNLOAD_ALL_PATH);
+	invalidateListModelsCache();
+	return r;
+}
+
+// llama-swap /v1/models only returns registered presets (all have a model
+// path). Placeholder sections are not exposed. We only filter out shard
+// artefacts.
+
+interface RunningEntry {
+	model: string;
+	cmd?: string;
+	state?: string;
+	ttl?: number;
+	proxy?: string;
+}
+
+async function listRunning(): Promise<RunningEntry[]> {
+	const res = await routerRequest("GET", AI_SERVER_RUNNING_PATH);
+	return Array.isArray((res as any)?.running)
+		? (res as any).running
+		: [];
 }
 
 export async function discoverModels(): Promise<ServerModel[]> {
-	const models = await listModels();
+	const [models, running, yaml] = await Promise.all([
+		listModels(),
+		listRunning().catch(() => [] as RunningEntry[]),
+		readPreset().catch(() => ""),
+	]);
+
+	const ctxFromYaml = parseCtxMapFromYaml(yaml);
+	const ctxFromRunning = new Map<string, number>();
+	for (const r of running) {
+		const n = extractCtxFromRunningCmd(r.cmd);
+		if (n) ctxFromRunning.set(r.model, n);
+	}
+
 	return models
-		.filter(isRunnable)
 		.filter((m) => !isShardArtefact(m.id))
 		.map((m) => {
-		const ctx = extractCtxSize(m) ?? 32768;
-		return {
-			id: m.id,
-			name: `${m.id} (AI Server)`,
-			reasoning: isReasoningModel(m.id),
-			contextWindow: ctx,
-			maxTokens: Math.min(16384, Math.max(2048, Math.floor(ctx / 2))),
-		};
-	});
+			const ctx =
+				ctxFromRunning.get(m.id) ?? // live process is authoritative
+				ctxFromYaml.get(m.id) ?? // config.yaml is next best
+				32768; // last-resort fallback
+			return {
+				id: m.id,
+				name: `${m.id} (AI Server)`,
+				reasoning: isReasoningModel(m.id),
+				contextWindow: ctx,
+				maxTokens: Math.min(16384, Math.max(2048, Math.floor(ctx / 2))),
+			};
+		});
 }
 
 // ─── SSH helpers ─────────────────────────────────────────────────────────
@@ -177,30 +240,83 @@ export async function readPreset(): Promise<string> {
 }
 
 /**
- * Set a `key = value` line inside a named [section] of the preset file.
- * Preserves comments and all other lines. Errors if the key is absent.
+ * Replace the value of a CLI flag inside one model's `cmd:` block in the
+ * llama-swap YAML config on the SSH host.
+ *
+ * llama-swap config.yaml structure (relevant excerpt):
+ *
+ *   models:
+ *     Qwen_Qwen3.6-35B-A3B-Q8_0:
+ *       cmd: |
+ *         /path/to/llama-server --model /path/to/gguf ...
+ *         --ctx-size 32768
+ *         --temp 0.7
+ *
+ * `key` is mapped to `--<key>` (e.g. ctx-size, temp, n-gpu-layers). All other
+ * lines are passed through unchanged. The `<section>:` header must sit at
+ * exactly two spaces of indent; a missing flag is reported as "not found".
  */
 export async function setPresetKey(
 	section: string,
 	key: string,
 	value: string,
 ): Promise<void> {
+	// Map short key names to the actual CLI flag used in cmd:
+	const flagMap: Record<string, string> = {
+		"ctx-size": "--ctx-size",
+		"temp": "--temp",
+		"n-gpu-layers": "--n-gpu-layers",
+	};
+	const flag = flagMap[key] ?? `--${key}`;
+
+	// The edit runs remotely as an awk filter over the YAML file:
+	//   1. A line equal to `  <section>:` (two-space indent) opens the block.
+	//   2. Inside the block, ` <flag> <value>` has <value> replaced by `value`.
+	//   3. The next non-blank line indented two spaces or less closes it.
+	//   4. awk exits 2 if nothing was replaced, so the file is left untouched
+	//      (the `&& mv` is skipped) and the catch below reports "not found".
+	//
+	// All three inputs go through shQuote() and `awk -v`, never into the
+	// script text itself, so ids or values containing quotes or shell
+	// metacharacters cannot break out of the command.
+
 	const awkScript = `
-awk -v sec="[${section}]" -v key=${shQuote(key)} -v val=${shQuote(value)} '
-  BEGIN { in_s = 0; found = 0 }
-  /^\\[/ { in_s = ($0 == sec) }
-  in_s && $1 == key && $2 == "=" { print key " = " val; found = 1; next }
-  { print }
-  END { if (!found) exit 2 }
+awk -v sec=${shQuote(section)} -v flag=${shQuote(flag)} -v val=${shQuote(value)} '
+  BEGIN { in_sec = 0; found = 0 }
+  {
+    # Header test: strip trailing blanks, then compare literally (no regex, so
+    # dots and other metacharacters in the model id cannot mis-match).
+    hdr = $0
+    sub(/ +$/, "", hdr)
+    if (hdr == "  " sec ":") {
+      in_sec = 1
+      print
+      next
+    }
+    # Any other non-blank line indented two spaces or less closes the block.
+    if (in_sec && $0 !~ /^ *$/ && $0 !~ /^   /) in_sec = 0
+    # Replace the whole value token so decimals such as 0.7 are handled.
+    pos = in_sec ? index($0, " " flag " ") : 0
+    if (pos > 0) {
+      tail = substr($0, pos + length(flag) + 2)
+      if (match(tail, /^[^ ]+/)) {
+        $0 = substr($0, 1, pos) flag " " val substr(tail, RLENGTH + 1)
+        found = 1
+      }
+    }
+    print
+  }
+  END { if (!found) exit 2 }
 ' ${AI_SERVER_PRESET_PATH} > ${AI_SERVER_PRESET_PATH}.tmp && mv ${AI_SERVER_PRESET_PATH}.tmp ${AI_SERVER_PRESET_PATH}
 `.trim();
+
 	try {
 		await runSsh(awkScript);
 	} catch (err: any) {
 		const msg = err?.message ?? String(err);
 		if (msg.includes("exit code 2") || msg.match(/exit.*2/)) {
 			throw new Error(
-				`Key "${key}" not found in [${section}] — add it to the preset manually first.`,
+				`Key "${key}" not found for model "${section}" — add it to the preset manually first.`,
 			);
 		}
 		throw err;
@@ -209,7 +325,7 @@ awk -v sec="[${section}]" -v key=${shQuote(key)} -v val=${shQuote(value)} '
 
 export async function restartService(): Promise<string> {
 	return runSsh(
-		"systemctl --user restart llama-server.service && systemctl --user is-active llama-server.service",
+		`systemctl --user restart ${AI_SERVER_SERVICE_UNIT} && systemctl --user is-active ${AI_SERVER_SERVICE_UNIT}`,
 	);
 }
 
diff --git a/ai-server/config.ts b/ai-server/config.ts
index 4c94b0b..844f45b 100644
--- a/ai-server/config.ts
+++ b/ai-server/config.ts
@@ -13,8 +13,29 @@ export const AI_SERVER_CHAT_PATH = "/v1/chat/completions";
 // SSH target for admin operations (preset edits, systemctl). Uses key auth.
 export const AI_SERVER_SSH_HOST =
 	process.env.AI_SERVER_SSH_HOST ?? "ai-server@192.168.2.3";
+
+// llama-swap endpoint paths
+export const AI_SERVER_MODELS_PATH =
+	process.env.AI_SERVER_MODELS_PATH ?? "/v1/models";
+export const AI_SERVER_RUNNING_PATH =
+	process.env.AI_SERVER_RUNNING_PATH ?? "/running";
+export const AI_SERVER_UNLOAD_ALL_PATH =
+	process.env.AI_SERVER_UNLOAD_ALL_PATH ?? "/api/models/unload";
+export const AI_SERVER_UNLOAD_PATH = (id: string) =>
+	(process.env.AI_SERVER_UNLOAD_PATH ?? "/api/models/unload/<id>") // template: "<id>" is substituted
+		.replace("<id>", encodeURIComponent(id));
+export const AI_SERVER_UPSTREAM_HEALTH_PATH = (id: string) =>
+	(process.env.AI_SERVER_UPSTREAM_HEALTH_PATH ?? "/upstream/<id>/health") // template: "<id>" is substituted
+		.replace("<id>", encodeURIComponent(id));
+
+// llama-swap config file (YAML, replaces old INI preset)
 export const AI_SERVER_PRESET_PATH =
-	process.env.AI_SERVER_PRESET_PATH ?? "~/.llama-models.ini";
+	process.env.AI_SERVER_PRESET_PATH ??
+	"~/.config/llama-swap/config.yaml";
+
+// systemd service unit for llama-swap
+export const AI_SERVER_SERVICE_UNIT =
+	process.env.AI_SERVER_SERVICE_UNIT ?? "llama-swap.service";
 
 // Distinct api id so registering streamSimple does NOT overwrite the
 // built-in openai-completions provider (the api-registry keys by api name).
diff --git a/ai-server/index.ts b/ai-server/index.ts
index b491e44..71dd805 100644
--- a/ai-server/index.ts
+++ b/ai-server/index.ts
@@ -1,7 +1,6 @@
 import type { ExtensionAPI } from "@mariozechner/pi-coding-agent";
 import {
 	discoverModels,
-	extractCtxSize,
 	listModels,
 	listModelsCached,
 	loadModel,
@@ -122,13 +121,8 @@ export default async function (pi: ExtensionAPI) {
 				const routerModels = await listModels();
 				const lines = [`AI Server: ${AI_SERVER_URL}`];
 				for (const m of routerModels) {
-					const status = m.status?.value ?? "?";
-					const ctx = extractCtxSize(m);
-					const hasModel = (m.status?.args ?? []).includes("--model");
-					const marker = hasModel ? " " : " [no model path]";
-					lines.push(
-						`  ${m.id}  [${status}]  ctx=${ctx ?? "?"}${marker}`,
-					);
+					const status = m.running ? "loaded" : "unloaded";
+					lines.push(`  ${m.id}  [${status}]`);
 				}
 				ctx.ui.notify(lines.join("\n"), "info");
 			} catch (err) {
@@ -246,7 +240,7 @@ export default async function (pi: ExtensionAPI) {
 	});
 
 	pi.registerCommand("ai-server-preset", {
-		description: "Print ~/.llama-models.ini on the ai-server",
+		description: "Print llama-swap config on the ai-server",
 		handler: async (_args, ctx) => {
 			try {
 				const text = await readPreset();
@@ -261,7 +255,7 @@ export default async function (pi: ExtensionAPI) {
 	});
 
 	pi.registerCommand("ai-server-restart", {
-		description: "Restart the ai-server llama-server service",
+		description: "Restart the ai-server llama-swap service",
 		handler: async (_args, ctx) => {
 			const ok = await ctx.ui.confirm(
 				"Restart llama-server?",
diff --git a/ai-server/router-utils.ts b/ai-server/router-utils.ts
index e073eb1..bb3e27d 100644
--- a/ai-server/router-utils.ts
+++ b/ai-server/router-utils.ts
@@ -6,19 +6,80 @@
 
 export interface RouterModelMeta {
 	id: string;
-	status?: { value: string; args: string[] };
+	object?: string;
+	created?: number;
+	owned_by?: string;
+	/** Whether the model is currently loaded in llama-swap. */
+	running?: boolean;
 }
 
 /**
- * Pull `--ctx-size <N>` out of the worker's argv. Returns null if the flag
- * is missing, at the end of argv, or the value isn't a number.
+ * Parse ctx-size values from every model block in llama-swap's config.yaml.
+ *
+ * The YAML has a structure like:
+ *
+ *   models:
+ *     Qwen_Qwen3.6-35B-A3B-Q8_0:
+ *       cmd: |
+ *         /path/to/llama-server
+ *         --ctx-size 262144
+ *         --temp 0.7
+ *
+ * This function scans for `--ctx-size N` lines within each model block and
+ * returns a Map of id → ctxSize.  If a model appears multiple times it keeps
+ * the last value found.
  */
-export function extractCtxSize(m: RouterModelMeta): number | null {
-	const args = m.status?.args ?? [];
-	const i = args.indexOf("--ctx-size");
-	if (i < 0 || i + 1 >= args.length) return null;
-	const n = Number(args[i + 1]);
-	return Number.isFinite(n) ? n : null;
+export function parseCtxMapFromYaml(yaml: string): Map<string, number> {
+	const map = new Map<string, number>();
+	let currentId: string | null = null;
+
+	for (const raw of yaml.split("\n")) {
+		const line = raw.replace(/\r$/, "");
+
+		// Skip comments / blank
+		if (!line.trim() || line.trim().startsWith("#")) continue;
+
+		// New model block: exactly two-space indent, "<id>:" with nothing
+		// meaningful after the colon (llama-swap uses 2-space indent under
+		// `models:`).
+		const idMatch = /^  ([A-Za-z0-9._-]+):\s*$/.exec(line);
+		if (idMatch) {
+			currentId = idMatch[1];
+			continue;
+		}
+
+		// Top-level key resets context (e.g. `macros:`, `hooks:`)
+		if (/^[A-Za-z]/.test(line)) {
+			currentId = null;
+			continue;
+		}
+
+		if (!currentId) continue;
+
+		// Look for --ctx-size N anywhere in the line (handles indented cmd:
+		// blocks where the flag is on its own line).
+		const ctx = /--ctx-size\s+(\d+)/.exec(line);
+		if (ctx) {
+			map.set(currentId, Number(ctx[1]));
+			currentId = null; // one ctx per model
+		}
+	}
+
+	return map;
+}
+
+/**
+ * Extract ctx-size from a /running entry's `cmd` string.
+ *
+ * The /running endpoint returns entries like:
+ *   { model: "Qwen_...", cmd: "/path/llama-server --model ... --ctx-size 262144 ...", ... }
+ *
+ * This is the authoritative source for the currently loaded model's ctx.
+ */
+export function extractCtxFromRunningCmd(cmd: string | undefined): number | null {
+	if (!cmd) return null;
+	const m = /--ctx-size\s+(\d+)/.exec(cmd);
+	return m ? Number(m[1]) : null;
 }
 
 /**
diff --git a/tests/router-utils.test.ts b/tests/router-utils.test.ts
index 0400625..0173e72 100644
--- a/tests/router-utils.test.ts
+++ b/tests/router-utils.test.ts
@@ -7,45 +7,102 @@
 import assert from "node:assert/strict";
 import { test } from "node:test";
 import {
-	extractCtxSize,
+	parseCtxMapFromYaml,
+	extractCtxFromRunningCmd,
 	isReasoningModel,
 	isShardArtefact,
 } from "../ai-server/router-utils.ts";
 
-// ── extractCtxSize ──────────────────────────────────────────────────────
+// ── parseCtxMapFromYaml ─────────────────────────────────────────────────
 
-test("extractCtxSize: --ctx-size present with value", () => {
-	const m = {
-		id: "x",
-		status: { value: "loaded", args: ["--host", "127.0.0.1", "--ctx-size", "131072"] },
-	};
-	assert.equal(extractCtxSize(m), 131072);
+test("parseCtxMapFromYaml: extracts ctx-size from model blocks", () => {
+	const yaml = `
+models:
+  Qwen_Qwen3.6-35B-A3B-Q8_0:
+    cmd: |
+      /home/ai-server/llama.cpp/build/bin/llama-server
+      --model /home/ai-server/models/Qwen_Qwen3.6-35B-A3B-Q8_0.gguf
+      --ctx-size 262144
+      --temp 0.7
+  MiniMax-M2.7-IQ3_XXS:
+    cmd: |
+      /home/ai-server/llama.cpp/build/bin/llama-server
+      --model /home/ai-server/models/MiniMax-M2.7-UD-IQ3_XXS.gguf
+      --ctx-size 131072
+      --temp 1.0
+`;
+	const map = parseCtxMapFromYaml(yaml);
+	assert.equal(map.get("Qwen_Qwen3.6-35B-A3B-Q8_0"), 262144);
+	assert.equal(map.get("MiniMax-M2.7-IQ3_XXS"), 131072);
+	assert.equal(map.size, 2);
 });
 
-test("extractCtxSize: missing --ctx-size -> null", () => {
-	assert.equal(extractCtxSize({ id: "x", status: { value: "loaded", args: ["--host", "127"] } }), null);
+test("parseCtxMapFromYaml: skips comments and blank lines", () => {
+	const yaml = `
+# This is a comment
+models:
+
+  # Model with large context
+  Qwen_Qwen3.6-35B-A3B-Q8_0:
+    cmd: |
+      /path/to/server
+      --ctx-size 65536
+      --temp 0.7
+`;
+	const map = parseCtxMapFromYaml(yaml);
+	assert.equal(map.get("Qwen_Qwen3.6-35B-A3B-Q8_0"), 65536);
 });
 
-test("extractCtxSize: --ctx-size at end of argv -> null (no value)", () => {
-	assert.equal(extractCtxSize({ id: "x", status: { value: "loaded", args: ["--ctx-size"] } }), null);
+test("parseCtxMapFromYaml: resets on top-level keys", () => {
+	const yaml = `
+models:
+  Qwen_Qwen3.6-35B-A3B-Q8_0:
+    cmd: |
+      /path/to/server
+      --ctx-size 262144
+hooks:
+  on_startup:
+    preload:
+      - Qwen_Qwen3.6-35B-A3B-Q8_0
+`;
+	const map = parseCtxMapFromYaml(yaml);
+	assert.equal(map.get("Qwen_Qwen3.6-35B-A3B-Q8_0"), 262144);
+	// "preload:" is indented deeper than the two-space level the parser treats
+	// as a model id, and no --ctx-size follows under hooks:, so it is not added.
+	assert.ok(!map.has("preload"));
 });
 
-test("extractCtxSize: non-numeric value -> null", () => {
-	assert.equal(
-		extractCtxSize({ id: "x", status: { value: "loaded", args: ["--ctx-size", "notanumber"] } }),
-		null,
-	);
+test("parseCtxMapFromYaml: empty yaml returns empty map", () => {
+	const map = parseCtxMapFromYaml("");
+	assert.equal(map.size, 0);
 });
 
-test("extractCtxSize: zero is valid (not null)", () => {
-	assert.equal(
-		extractCtxSize({ id: "x", status: { value: "loaded", args: ["--ctx-size", "0"] } }),
-		0,
-	);
+test("parseCtxMapFromYaml: model without ctx-size is skipped", () => {
+	const yaml = `
+models:
+  SmallModel:
+    cmd: |
+      /path/to/server
+      --temp 0.7
+`;
+	const map = parseCtxMapFromYaml(yaml);
+	assert.equal(map.get("SmallModel"), undefined);
+	assert.equal(map.size, 0);
 });
 
-test("extractCtxSize: missing status entirely -> null", () => {
-	assert.equal(extractCtxSize({ id: "x" }), null);
+// ── extractCtxFromRunningCmd ────────────────────────────────────────────
+
+test("extractCtxFromRunningCmd: parses --ctx-size from cmd string", () => {
+	const cmd = "/home/ai-server/llama.cpp/build/bin/llama-server --model /home/ai-server/models/Qwen.gguf --ctx-size 262144 --temp 0.7";
+	assert.equal(extractCtxFromRunningCmd(cmd), 262144);
+});
+
+test("extractCtxFromRunningCmd: undefined cmd returns null", () => {
+	assert.equal(extractCtxFromRunningCmd(undefined), null);
+});
+
+test("extractCtxFromRunningCmd: cmd without --ctx-size returns null", () => {
+	assert.equal(extractCtxFromRunningCmd("/path/to/server --temp 0.7"), null);
 });
 
 // ── isShardArtefact ─────────────────────────────────────────────────────