wire real ctx-size parsing into discoverModels, fix dangling extractCtxSize import

router-utils.ts:
  - Replace broken extractCtxSize() stub with parseCtxMapFromYaml() that
    walks the YAML and extracts --ctx-size N per model block
  - Add extractCtxFromRunningCmd() to parse --ctx-size from /running cmd strings

admin.ts:
  - Import parseCtxMapFromYaml and extractCtxFromRunningCmd (drop
    dangling extractCtxSize)
  - Add RunningEntry interface and listRunning() helper
  - discoverModels() now runs Promise.all of listModels + listRunning +
    readPreset, then resolves ctx per-model:
      1. ctxFromRunning.get(id)  — live process is authoritative
      2. ctxFromYaml.get(id)     — config.yaml fallback
      3. 32768                   — last-resort

index.ts:
  - Remove dangling extractCtxSize import and ctx display from
    ai-server-status (ctx is per-model, not per-routerModel)
This commit is contained in:
shahondin1624
2026-05-27 10:05:25 +02:00
parent ede2645189
commit fe82d33d94
3 changed files with 109 additions and 23 deletions
+35 -5
View File
@@ -17,13 +17,14 @@ import {
loadCerts,
} from "./config.js";
import {
extractCtxSize,
parseCtxMapFromYaml,
extractCtxFromRunningCmd,
isReasoningModel,
isShardArtefact,
} from "./router-utils.js";
// Re-export so existing index.ts imports keep working.
export { extractCtxSize, isReasoningModel };
export { isReasoningModel };
const exec = promisify(execCb);
@@ -165,13 +166,42 @@ export async function unloadAll(): Promise<unknown> {
// path). Placeholder sections are not exposed. We only filter out shard
// artefacts.
// One element of the `running` array returned by the router's /running
// endpoint (see listRunning). Only `model` and `cmd` are read by the code
// visible in this file.
interface RunningEntry {
// Model identifier; discoverModels uses it as the lookup key and compares it
// against the ids returned by listModels().
model: string;
// Command line of the running worker, when reported. discoverModels passes it
// to extractCtxFromRunningCmd() to recover the `--ctx-size` value.
cmd?: string;
// NOTE(review): presumably the process lifecycle state; not read here, confirm
// against the llama-swap API.
state?: string;
// NOTE(review): presumably the idle time-to-live of the process; units are not
// visible here, confirm before relying on it.
ttl?: number;
// NOTE(review): presumably the upstream address the router proxies to; not
// read here, confirm.
proxy?: string;
}
/**
 * Fetch the entries currently reported by the router's running-process
 * endpoint.
 *
 * Issues a GET to `AI_SERVER_RUNNING_PATH` and returns the `running` property
 * of the response when it is an array. Any other response shape (null, a
 * primitive, an object without `running`, or a non-array `running`) yields an
 * empty list. Errors raised by `routerRequest` propagate to the caller.
 *
 * The individual entries are not validated; they are trusted to match
 * `RunningEntry`.
 */
async function listRunning(): Promise<RunningEntry[]> {
  // Treat the response as `unknown` and narrow it, rather than casting to
  // `any`, so the rest of the function stays type-checked.
  const res: unknown = await routerRequest("GET", AI_SERVER_RUNNING_PATH);
  const running = (res as { running?: unknown } | null | undefined)?.running;
  return Array.isArray(running) ? (running as RunningEntry[]) : [];
}
export async function discoverModels(): Promise<ServerModel[]> {
const models = await listModels();
const [models, running, yaml] = await Promise.all([
listModels(),
listRunning().catch(() => [] as RunningEntry[]),
readPreset().catch(() => ""),
]);
const ctxFromYaml = parseCtxMapFromYaml(yaml);
const ctxFromRunning = new Map<string, number>();
for (const r of running) {
const n = extractCtxFromRunningCmd(r.cmd);
if (n) ctxFromRunning.set(r.model, n);
}
return models
.filter((m) => !isShardArtefact(m.id))
.map((m) => {
// llama-swap doesn't expose ctx-size in the API; use a sensible default.
const ctx = 32768;
const ctx =
ctxFromRunning.get(m.id) ?? // live process is authoritative
ctxFromYaml.get(m.id) ?? // config.yaml is next best
32768; // last-resort fallback
return {
id: m.id,
name: `${m.id} (AI Server)`,
+1 -5
View File
@@ -1,7 +1,6 @@
import type { ExtensionAPI } from "@mariozechner/pi-coding-agent";
import {
discoverModels,
extractCtxSize,
listModels,
listModelsCached,
loadModel,
@@ -123,10 +122,7 @@ export default async function (pi: ExtensionAPI) {
const lines = [`AI Server: ${AI_SERVER_URL}`];
for (const m of routerModels) {
const status = m.running ? "loaded" : "unloaded";
const ctx = extractCtxSize(m);
lines.push(
` ${m.id} [${status}] ctx=${ctx ?? "?"}`,
);
lines.push(` ${m.id} [${status}]`);
}
ctx.ui.notify(lines.join("\n"), "info");
} catch (err) {
+65 -5
View File
@@ -14,12 +14,72 @@ export interface RouterModelMeta {
}
/**
* llama-swap does not expose the worker command line, so ctx-size cannot be
* read from running args. Returns null — callers fall back to a default
* (32768 in discoverModels).
* Parse ctx-size values from every model block in llama-swap's config.yaml.
*
* The YAML has a structure like:
*
* models:
* Qwen_Qwen3.6-35B-A3B-Q8_0:
* cmd: |
* /path/to/llama-server
* --ctx-size 262144
* --temp 0.7
*
* This function scans for `--ctx-size N` lines within each model block and
* returns a Map of id → ctxSize. If a model appears multiple times it keeps
* the last value found.
*/
export function extractCtxSize(m: RouterModelMeta): number | null {
return null;
/**
 * Build a map of model id → context size from the text of llama-swap's
 * config.yaml.
 *
 * This is a line scanner, not a YAML parser. It recognises:
 *   - a top-level `models:` key, which opens the section that is scanned;
 *     any other top-level key closes it, so two-space keys under e.g.
 *     `macros:` or `groups:` are never mistaken for model ids;
 *   - model keys at exactly two spaces of indent with nothing after the colon
 *     except an optional trailing comment; the key may be bare
 *     (`[A-Za-z0-9._-]+`) or wrapped in single or double quotes;
 *   - the first `--ctx-size N` occurrence inside a model block.
 *
 * Input that contains no top-level key at all is scanned as if it were the
 * body of `models:`, so bare fragments still work.
 *
 * @param yaml Raw config.yaml text; LF and CRLF line endings are accepted.
 * @returns Map from model id to ctx size. Models without a `--ctx-size` flag
 *   are absent. If an id occurs in several blocks, the last block wins.
 */
export function parseCtxMapFromYaml(yaml: string): Map<string, number> {
  // Built once, outside the per-line loop.
  const topLevelKey = /^\S/;
  const modelsHeader = /^models:\s*(?:#.*)?$/;
  const modelKey =
    /^ {2}(?:"([^"]+)"|'([^']+)'|([A-Za-z0-9._-]+)):\s*(?:#.*)?$/;
  const ctxFlag = /--ctx-size\s+(\d+)/;

  const map = new Map<string, number>();
  // True until a top-level key other than `models:` is seen.
  let inModels = true;
  let currentId: string | null = null;

  for (const raw of yaml.split("\n")) {
    const line = raw.replace(/\r$/, "");
    const trimmed = line.trim();
    // Skip blank lines and whole-line comments.
    if (trimmed === "" || trimmed.startsWith("#")) continue;

    // Any unindented line starts a new top-level section and ends the
    // current model block.
    if (topLevelKey.test(line)) {
      inModels = modelsHeader.test(line);
      currentId = null;
      continue;
    }
    if (!inModels) continue;

    const idMatch = modelKey.exec(line);
    if (idMatch) {
      // Exactly one of the three alternatives captured the id.
      currentId = idMatch[1] ?? idMatch[2] ?? idMatch[3] ?? null;
      continue;
    }
    if (currentId === null) continue;

    // The flag may sit on its own line inside a `cmd: |` block or inline in
    // a single-line `cmd:` value.
    const ctx = ctxFlag.exec(line);
    if (ctx) {
      map.set(currentId, Number(ctx[1]));
      // Only the first flag in a block counts; ignore the rest of the block.
      currentId = null;
    }
  }
  return map;
}
/**
* Extract ctx-size from a /running entry's `cmd` string.
*
* The /running endpoint returns entries like:
* { model: "Qwen_...", cmd: "/path/llama-server --model ... --ctx-size 262144 ...", ... }
*
* This is the authoritative source for the currently loaded model's ctx.
*/
export function extractCtxFromRunningCmd(cmd: string | undefined): number | null {
  // A missing or empty command line cannot carry the flag.
  const found = cmd ? /--ctx-size\s+(\d+)/.exec(cmd) : null;
  if (found === null) {
    return null;
  }
  // Capture group 1 holds the digits that follow the first `--ctx-size`.
  return Number(found[1]);
}
/**