wire real ctx-size parsing into discoverModels, fix dangling extractCtxSize import
router-utils.ts:
- Replace broken extractCtxSize() stub with parseCtxMapFromYaml() that
walks the YAML and extracts --ctx-size N per model block
- Add extractCtxFromRunningCmd() to parse --ctx-size from /running cmd strings
admin.ts:
- Import parseCtxMapFromYaml and extractCtxFromRunningCmd (drop
dangling extractCtxSize)
- Add RunningEntry interface and listRunning() helper
- discoverModels() now runs Promise.all of listModels + listRunning +
readPreset, then resolves ctx per-model:
1. ctxFromRunning.get(id) — live process is authoritative
2. ctxFromYaml.get(id) — config.yaml fallback
3. 32768 — last-resort
index.ts:
- Remove dangling extractCtxSize import and ctx display from
ai-server-status (ctx is per-model, not per-routerModel)
This commit is contained in:
+43
-13
@@ -17,13 +17,14 @@ import {
|
||||
loadCerts,
|
||||
} from "./config.js";
|
||||
import {
|
||||
extractCtxSize,
|
||||
parseCtxMapFromYaml,
|
||||
extractCtxFromRunningCmd,
|
||||
isReasoningModel,
|
||||
isShardArtefact,
|
||||
} from "./router-utils.js";
|
||||
|
||||
// Re-export so existing index.ts imports keep working.
|
||||
export { extractCtxSize, isReasoningModel };
|
||||
export { isReasoningModel };
|
||||
|
||||
const exec = promisify(execCb);
|
||||
|
||||
@@ -165,21 +166,50 @@ export async function unloadAll(): Promise<unknown> {
|
||||
// path). Placeholder sections are not exposed. We only filter out shard
|
||||
// artefacts.
|
||||
|
||||
// One element of the `running` array that listRunning() reads from the
// AI server's running endpoint. Only `model` is required.
interface RunningEntry {
|
||||
// Model identifier; discoverModels() uses it as the key when recording the
// ctx size of a live process.
model: string;
|
||||
// Command line of the running process; discoverModels() passes it to
// extractCtxFromRunningCmd() to read `--ctx-size`.
cmd?: string;
|
||||
// NOTE(review): `state`, `ttl` and `proxy` are not read anywhere in the
// visible code — their exact meaning and units should be confirmed against
// the server's API before relying on them.
state?: string;
|
||||
ttl?: number;
|
||||
proxy?: string;
|
||||
}
|
||||
|
||||
/**
 * Fetch the entries the AI server currently reports as running.
 *
 * The response is treated as untrusted JSON: anything that is not an object
 * with an array-valued `running` property yields an empty list, and array
 * elements that are not objects with a string `model` are dropped so callers
 * can dereference every entry safely.
 *
 * @returns The well-formed running entries, or `[]` when there are none.
 * @throws Whatever `routerRequest` rejects with (callers may `.catch` it).
 */
async function listRunning(): Promise<RunningEntry[]> {
  const res: unknown = await routerRequest("GET", AI_SERVER_RUNNING_PATH);
  if (typeof res !== "object" || res === null) return [];

  const running = (res as { running?: unknown }).running;
  if (!Array.isArray(running)) return [];

  // Drop null / malformed elements instead of handing them to callers that
  // read `.model` and `.cmd` without checking.
  return running.filter(
    (entry): entry is RunningEntry =>
      typeof entry === "object" &&
      entry !== null &&
      typeof (entry as { model?: unknown }).model === "string",
  );
}
|
||||
|
||||
/**
 * Build the list of models exposed by the AI server, each with a resolved
 * context window.
 *
 * The model list, the running-process list and the preset YAML are fetched
 * concurrently. A failure of the running list or the preset is tolerated
 * (treated as "no data"); a failure of `listModels()` rejects the call.
 *
 * Context size per model is resolved in this order:
 *   1. `--ctx-size` from the live process command line,
 *   2. `--ctx-size` from the preset YAML,
 *   3. 32768 as a last-resort default.
 *
 * @returns One `ServerModel` per non-shard model id.
 */
export async function discoverModels(): Promise<ServerModel[]> {
  const [models, running, yaml] = await Promise.all([
    listModels(),
    listRunning().catch(() => [] as RunningEntry[]),
    readPreset().catch(() => ""),
  ]);

  const ctxFromYaml = parseCtxMapFromYaml(yaml);
  const ctxFromRunning = new Map<string, number>();
  for (const r of running) {
    const n = extractCtxFromRunningCmd(r.cmd);
    if (n) ctxFromRunning.set(r.model, n);
  }

  return models
    .filter((m) => !isShardArtefact(m.id))
    .map((m) => {
      // `||` rather than `??` on purpose: a recorded size of 0 is not a usable
      // context window and must fall through to the next source.
      const ctx =
        ctxFromRunning.get(m.id) || // live process is authoritative
        ctxFromYaml.get(m.id) || // config.yaml is next best
        32768; // last-resort fallback
      return {
        id: m.id,
        name: `${m.id} (AI Server)`,
        reasoning: isReasoningModel(m.id),
        contextWindow: ctx,
        // Half the window, clamped to [2048, 16384].
        maxTokens: Math.min(16384, Math.max(2048, Math.floor(ctx / 2))),
      };
    });
}
|
||||
|
||||
// ─── SSH helpers ─────────────────────────────────────────────────────────
|
||||
|
||||
+1
-5
@@ -1,7 +1,6 @@
|
||||
import type { ExtensionAPI } from "@mariozechner/pi-coding-agent";
|
||||
import {
|
||||
discoverModels,
|
||||
extractCtxSize,
|
||||
listModels,
|
||||
listModelsCached,
|
||||
loadModel,
|
||||
@@ -123,10 +122,7 @@ export default async function (pi: ExtensionAPI) {
|
||||
const lines = [`AI Server: ${AI_SERVER_URL}`];
|
||||
for (const m of routerModels) {
|
||||
const status = m.running ? "loaded" : "unloaded";
|
||||
const ctx = extractCtxSize(m);
|
||||
lines.push(
|
||||
` ${m.id} [${status}] ctx=${ctx ?? "?"}`,
|
||||
);
|
||||
lines.push(` ${m.id} [${status}]`);
|
||||
}
|
||||
ctx.ui.notify(lines.join("\n"), "info");
|
||||
} catch (err) {
|
||||
|
||||
@@ -14,12 +14,72 @@ export interface RouterModelMeta {
|
||||
}
|
||||
|
||||
/**
|
||||
* llama-swap does not expose the worker command line, so ctx-size cannot be
|
||||
* read from running args. Returns null — callers fall back to a default
|
||||
* (32768 in discoverModels).
|
||||
* Parse ctx-size values from every model block in llama-swap's config.yaml.
|
||||
*
|
||||
* The YAML has a structure like:
|
||||
*
|
||||
* models:
|
||||
* Qwen_Qwen3.6-35B-A3B-Q8_0:
|
||||
* cmd: |
|
||||
* /path/to/llama-server
|
||||
* --ctx-size 262144
|
||||
* --temp 0.7
|
||||
*
|
||||
* This function scans for `--ctx-size N` lines within each model block and
|
||||
* returns a Map of id → ctxSize. If a model appears multiple times it keeps
|
||||
* the last value found.
|
||||
*/
|
||||
export function extractCtxSize(m: RouterModelMeta): number | null {
|
||||
return null;
|
||||
/**
 * Collect `--ctx-size N` values per model block from a llama-swap style
 * config.yaml, using a line scan rather than a full YAML parse.
 *
 * Recognised structure:
 *
 *   models:
 *     some-model-id:
 *       cmd: |
 *         /path/to/llama-server
 *         --ctx-size 262144
 *
 * Rules applied per line:
 *   - blank lines and `#` comment lines are ignored;
 *   - a line indented by exactly two spaces of the form `<id>:` (nothing after
 *     the colon) opens a model block;
 *   - a line starting with a letter in column 0 is a top-level key and closes
 *     the current block;
 *   - inside a block, the first `--ctx-size N` is taken and the block is then
 *     closed, so later flags in the same block are ignored.
 *
 * A size of 0 (or one too large to be a safe integer) is not recorded, so the
 * caller's fallback applies instead of an unusable window size. If the same id
 * opens several blocks, the last recorded value wins.
 *
 * @param yaml Raw config text; may be empty and may use CRLF line endings.
 * @returns Map of model id to context size.
 */
export function parseCtxMapFromYaml(yaml: string): Map<string, number> {
  // Compiled once, outside the per-line loop. The explicit `{2}` quantifier
  // pins the two-space indent used for ids under `models:`.
  const modelHeader = /^ {2}([A-Za-z0-9._-]+):\s*$/;
  const topLevelKey = /^[A-Za-z]/;
  const ctxFlag = /--ctx-size\s+(\d+)/;

  const ctxById = new Map<string, number>();
  let currentId: string | null = null;

  for (const raw of yaml.split("\n")) {
    const line = raw.replace(/\r$/, "");
    const trimmed = line.trim();

    // Skip comments / blank lines.
    if (trimmed === "" || trimmed.startsWith("#")) continue;

    const header = modelHeader.exec(line);
    if (header) {
      currentId = header[1] ?? null;
      continue;
    }

    // Top-level key resets context (e.g. `macros:`, `hooks:`).
    if (topLevelKey.test(line)) {
      currentId = null;
      continue;
    }

    if (currentId === null) continue;

    const flag = ctxFlag.exec(line);
    if (!flag) continue;

    const size = Number(flag[1]);
    if (Number.isSafeInteger(size) && size > 0) {
      ctxById.set(currentId, size);
    }
    currentId = null; // one ctx per model block
  }

  return ctxById;
}
|
||||
|
||||
/**
 * Extract the context size from a /running entry's `cmd` string.
 *
 * The /running endpoint returns entries like:
 *   { model: "Qwen_...", cmd: "/path/llama-server --model ... --ctx-size 262144 ...", ... }
 *
 * Only the first `--ctx-size <digits>` occurrence is considered.
 *
 * @param cmd Command line of the running process; may be undefined or empty.
 * @returns The positive context size, or `null` when `cmd` is missing, has no
 *   `--ctx-size` flag, or the value is 0 / not a safe integer. A value of 0 is
 *   not a usable window size (llama-server uses it to mean "take the size from
 *   the model"), so it is reported as unknown.
 */
export function extractCtxFromRunningCmd(cmd: string | undefined): number | null {
  if (!cmd) return null;

  const match = /--ctx-size\s+(\d+)/.exec(cmd);
  if (!match) return null;

  const size = Number(match[1]);
  return Number.isSafeInteger(size) && size > 0 ? size : null;
}
|
||||
|
||||
/**
|
||||
|
||||
Reference in New Issue
Block a user