wire real ctx-size parsing into discoverModels, fix dangling extractCtxSize import

router-utils.ts:
- Replace broken extractCtxSize() stub with parseCtxMapFromYaml(), which walks the YAML and extracts --ctx-size N per model block
- Add extractCtxFromRunningCmd() to parse --ctx-size from /running cmd strings

admin.ts:
- Import parseCtxMapFromYaml and extractCtxFromRunningCmd (drop dangling extractCtxSize)
- Add RunningEntry interface and listRunning() helper
- discoverModels() now runs Promise.all of listModels + listRunning + readPreset, then resolves ctx per model:
  1. ctxFromRunning.get(id) — live process is authoritative
  2. ctxFromYaml.get(id) — config.yaml fallback
  3. 32768 — last-resort default

index.ts:
- Remove dangling extractCtxSize import and ctx display from ai-server-status (ctx is per-model, not per-routerModel)
2026-05-27 10:05:25 +02:00
parent ede2645189
commit fe82d33d94
3 changed files with 109 additions and 23 deletions
@@ -17,13 +17,14 @@ import {
 	loadCerts,
 } from "./config.js";
 import {
-	extractCtxSize,
+	parseCtxMapFromYaml,
+	extractCtxFromRunningCmd,
 	isReasoningModel,
 	isShardArtefact,
 } from "./router-utils.js";

 // Re-export so existing index.ts imports keep working.
-export { extractCtxSize, isReasoningModel };
+export { isReasoningModel };

 const exec = promisify(execCb);

@@ -165,21 +166,50 @@ export async function unloadAll(): Promise<unknown> {
 // path). Placeholder sections are not exposed. We only filter out shard
 // artefacts.

+interface RunningEntry {
+	model: string;
+	cmd?: string;
+	state?: string;
+	ttl?: number;
+	proxy?: string;
+}
+
+async function listRunning(): Promise<RunningEntry[]> {
+	const res = await routerRequest("GET", AI_SERVER_RUNNING_PATH);
+	return Array.isArray((res as any)?.running)
+		? (res as any).running
+		: [];
+}
+
 export async function discoverModels(): Promise<ServerModel[]> {
-	const models = await listModels();
+	const [models, running, yaml] = await Promise.all([
+		listModels(),
+		listRunning().catch(() => [] as RunningEntry[]),
+		readPreset().catch(() => ""),
+	]);
+
+	const ctxFromYaml = parseCtxMapFromYaml(yaml);
+	const ctxFromRunning = new Map<string, number>();
+	for (const r of running) {
+		const n = extractCtxFromRunningCmd(r.cmd);
+		if (n) ctxFromRunning.set(r.model, n);
+	}
+
 	return models
 		.filter((m) => !isShardArtefact(m.id))
 		.map((m) => {
-		// llama-swap doesn't expose ctx-size in the API; use a sensible default.
-		const ctx = 32768;
-		return {
-			id: m.id,
-			name: `${m.id} (AI Server)`,
-			reasoning: isReasoningModel(m.id),
-			contextWindow: ctx,
-			maxTokens: Math.min(16384, Math.max(2048, Math.floor(ctx / 2))),
-		};
-	});
+			const ctx =
+				ctxFromRunning.get(m.id) ?? // live process is authoritative
+				ctxFromYaml.get(m.id) ?? // config.yaml is next best
+				32768; // last-resort fallback
+			return {
+				id: m.id,
+				name: `${m.id} (AI Server)`,
+				reasoning: isReasoningModel(m.id),
+				contextWindow: ctx,
+				maxTokens: Math.min(16384, Math.max(2048, Math.floor(ctx / 2))),
+			};
+		});
 }

 // ─── SSH helpers ─────────────────────────────────────────────────────────
@@ -1,7 +1,6 @@
 import type { ExtensionAPI } from "@mariozechner/pi-coding-agent";
 import {
 	discoverModels,
-	extractCtxSize,
 	listModels,
 	listModelsCached,
 	loadModel,
@@ -123,10 +122,7 @@ export default async function (pi: ExtensionAPI) {
 				const lines = [`AI Server: ${AI_SERVER_URL}`];
 				for (const m of routerModels) {
 					const status = m.running ? "loaded" : "unloaded";
-					const ctx = extractCtxSize(m);
-					lines.push(
-						`  ${m.id}  [${status}]  ctx=${ctx ?? "?"}`,
-					);
+					lines.push(`  ${m.id}  [${status}]`);
 				}
 				ctx.ui.notify(lines.join("\n"), "info");
 			} catch (err) {
@@ -14,12 +14,72 @@ export interface RouterModelMeta {
 }

 /**
- * llama-swap does not expose the worker command line, so ctx-size cannot be
- * read from running args. Returns null — callers fall back to a default
- * (32768 in discoverModels).
+ * Parse ctx-size values from every model block in llama-swap's config.yaml.
+ *
+ * The YAML has a structure like:
+ *
+ *   models:
+ *     Qwen_Qwen3.6-35B-A3B-Q8_0:
+ *       cmd: |
+ *         /path/to/llama-server
+ *         --ctx-size 262144
+ *         --temp 0.7
+ *
+ * This function scans each model block for its first `--ctx-size N` and
+ * returns a Map of id → ctxSize.  If a model id appears in several blocks,
+ * the value from the last block wins.
 */
-export function extractCtxSize(m: RouterModelMeta): number | null {
-	return null;
+export function parseCtxMapFromYaml(yaml: string): Map<string, number> {
+	const map = new Map<string, number>();
+	let currentId: string | null = null;
+
+	for (const raw of yaml.split("\n")) {
+		const line = raw.replace(/\r$/, "");
+
+		// Skip comments / blank
+		if (!line.trim() || line.trim().startsWith("#")) continue;
+
+		// New model block: exactly two-space indent, "<id>:" with nothing
+		// meaningful after the colon (llama-swap uses 2-space indent under
+		// `models:`).
+		const idMatch = /^  ([A-Za-z0-9._-]+):\s*$/.exec(line);
+		if (idMatch) {
+			currentId = idMatch[1];
+			continue;
+		}
+
+		// Top-level key resets context (e.g. `macros:`, `hooks:`)
+		if (/^[A-Za-z]/.test(line)) {
+			currentId = null;
+			continue;
+		}
+
+		if (!currentId) continue;
+
+		// Look for --ctx-size N anywhere in the line (handles indented cmd:
+		// blocks where the flag is on its own line).
+		const ctx = /--ctx-size\s+(\d+)/.exec(line);
+		if (ctx) {
+			map.set(currentId, Number(ctx[1]));
+			currentId = null; // one ctx per model
+		}
+	}
+
+	return map;
+}
+
+/**
+ * Extract ctx-size from a /running entry's `cmd` string.
+ *
+ * The /running endpoint returns entries like:
+ *   { model: "Qwen_...", cmd: "/path/llama-server --model ... --ctx-size 262144 ...", ... }
+ *
+ * This is the authoritative source for the currently loaded model's ctx.
+ */
+export function extractCtxFromRunningCmd(cmd: string | undefined): number | null {
+	if (!cmd) return null;
+	const m = /--ctx-size\s+(\d+)/.exec(cmd);
+	return m ? Number(m[1]) : null;
 }

 /**