f7af660727
Endpoint rewrites:
- GET /v1/models + /running → merged listModels() with running flag
- POST /models/load → GET /upstream/<id>/health (warm load)
- POST /models/unload → POST /api/models/unload/<id> (no body)
- Added POST /api/models/unload for unloadAll()
Config migration:
- Preset path: ~/.llama-models.ini → ~/.config/llama-swap/config.yaml
- Service unit: llama-server.service → llama-swap.service
- setPresetKey() rewritten from INI awk to YAML-aware awk for
  editing --ctx-size/--temp/--n-gpu-layers in cmd: blocks
Per-model ctx-size (fixes 0/33k bug):
- parseCtxMapFromYaml(): walks config.yaml, extracts --ctx-size N per
  model block → Map<id, ctxSize>
- extractCtxFromRunningCmd(): parses --ctx-size from /running cmd string
- discoverModels(): Promise.all(listModels, listRunning, readPreset),
  ctx priority: running cmd → yaml → 32768 fallback
- Removed broken extractCtxSize stub and dangling imports
Tests: 14 passing (parseCtxMapFromYaml ×5, extractCtxFromRunningCmd ×3,
  isShardArtefact ×3, isReasoningModel ×3)
README: full rewrite covering llama-swap architecture, YAML config format,
  new endpoints, and an updated troubleshooting table.
281 lines
8.0 KiB
TypeScript
281 lines
8.0 KiB
TypeScript
import type { ExtensionAPI } from "@mariozechner/pi-coding-agent";
|
|
import {
|
|
discoverModels,
|
|
listModels,
|
|
listModelsCached,
|
|
loadModel,
|
|
readPreset,
|
|
reloadOneModel,
|
|
restartService,
|
|
setPresetKey,
|
|
unloadModel,
|
|
} from "./admin.js";
|
|
import {
|
|
AI_SERVER_API_BASE,
|
|
AI_SERVER_API_ID,
|
|
AI_SERVER_PROVIDER_ID,
|
|
AI_SERVER_URL,
|
|
MODELS as STATIC_MODELS,
|
|
type ServerModel,
|
|
} from "./config.js";
|
|
import { streamAiServer } from "./stream.js";
|
|
|
|
/**
 * Argument completer shared by the model-id commands.
 *
 * @param prefix - Text typed so far for the argument.
 * @returns One `{ value, label }` entry per model id that starts with
 *   `prefix`, or `null` when nothing matches or the model list cannot be
 *   fetched.
 */
async function completeModelId(prefix: string) {
  try {
    // The completer runs on every keystroke, so it goes through the cached
    // listing (5s cache) instead of hitting the network each time. A user
    // normally types a single model id per command, so slightly stale data
    // does no harm here.
    const available = await listModelsCached();
    const matchingIds = available
      .map((model) => model.id)
      .filter((modelId) => modelId.startsWith(prefix));

    if (matchingIds.length === 0) {
      return null;
    }
    return matchingIds.map((modelId) => ({ value: modelId, label: modelId }));
  } catch {
    // Completion is best-effort: any failure simply means "no suggestions".
    return null;
  }
}
|
|
|
|
/**
 * Registers the ai-server provider on `pi` with the given model list.
 *
 * Each model is advertised as text-only input with zero cost, and requests
 * are streamed through `streamAiServer`.
 *
 * @param pi - Extension API handle the provider is registered on.
 * @param models - Models to expose under `AI_SERVER_PROVIDER_ID`.
 */
function registerProviderWithModels(
  pi: ExtensionAPI,
  models: ServerModel[],
): void {
  pi.registerProvider(AI_SERVER_PROVIDER_ID, {
    baseUrl: AI_SERVER_API_BASE,
    apiKey: "ai-server-mtls",
    api: AI_SERVER_API_ID as any,
    models: models.map(
      ({ id, name, reasoning, contextWindow, maxTokens }) => ({
        id,
        name,
        reasoning,
        input: ["text"],
        cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
        contextWindow,
        maxTokens,
        // Informational only: the custom `streamSimple` below assembles the
        // request body itself (see stream.ts), but pi-mono's model-list UI
        // and capability detection still read these compat fields.
        compat: {
          thinkingFormat: "qwen-chat-template",
          supportsReasoningEffort: reasoning,
          supportsDeveloperRole: false,
          supportsUsageInStreaming: true,
          maxTokensField: "max_tokens",
        },
      }),
    ),
    streamSimple: streamAiServer,
  });
}
|
|
|
|
// How long (in milliseconds) startup waits for live model discovery before
// carrying on with the statically configured model list.
const DISCOVERY_FAST_TIMEOUT_MS = 300;
|
|
|
|
/**
 * Turns an arbitrary thrown value into text for a notification or log line.
 *
 * `catch` can receive non-Error values; reading `.message` on those would
 * print "undefined", so anything that is not an `Error` is stringified.
 */
function describeError(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/**
 * Extension entry point.
 *
 * Registers the ai-server provider (first with the static model list, then
 * with the discovered list once available) and the `/ai-server-*` admin
 * commands.
 *
 * @param pi - Extension API handle used to register the provider and commands.
 */
export default async function (pi: ExtensionAPI) {
  // Register the provider IMMEDIATELY with the static fallback list so pi
  // startup isn't blocked on the HTTPS round-trip in the worst case.
  registerProviderWithModels(pi, STATIC_MODELS);

  // Then race real discovery against a short timeout. On LAN the router
  // answers in ~40ms and pi --list-models sees the live list. On slow
  // networks we bail at DISCOVERY_FAST_TIMEOUT_MS and the fallback is what
  // the user sees; the background promise keeps running and re-registers
  // later.
  const discovery = discoverModels().catch((err) => {
    if (process.env.PI_DEBUG) {
      console.log(
        `[ai-server] Discovery failed (${describeError(err)}); fallback remains`,
      );
    }
    return null;
  });

  let fastTimer: ReturnType<typeof setTimeout> | undefined;
  const timeout = new Promise<null>((resolve) => {
    fastTimer = setTimeout(() => resolve(null), DISCOVERY_FAST_TIMEOUT_MS);
  });
  const fastResult = await Promise.race([discovery, timeout]);
  // Cancel the timer once the race is decided; a still-pending timer would
  // otherwise keep the event loop alive after discovery has already won.
  clearTimeout(fastTimer);

  if (fastResult && fastResult.length > 0) {
    registerProviderWithModels(pi, fastResult);
    if (process.env.PI_DEBUG) {
      console.log(
        `[ai-server] Discovered ${fastResult.length} model(s) on ${AI_SERVER_URL}: ${fastResult.map((m) => m.id).join(", ")}`,
      );
    }
  } else {
    // Slow network or discovery still pending — keep waiting in the
    // background and update the provider once it arrives. The chain ends in
    // a catch so a failure while re-registering cannot surface as an
    // unhandled rejection; the fallback registration stays in place.
    void discovery
      .then((models) => {
        if (models && models.length > 0) {
          registerProviderWithModels(pi, models);
          if (process.env.PI_DEBUG) {
            console.log(
              `[ai-server] Late discovery: ${models.length} model(s)`,
            );
          }
        }
      })
      .catch((err) => {
        if (process.env.PI_DEBUG) {
          console.log(
            `[ai-server] Late registration failed (${describeError(err)}); fallback remains`,
          );
        }
      });
  }

  // ─── Admin commands ──────────────────────────────────────────────────

  // NOTE(review): the description mentions ctx sizes, but the handler only
  // prints each model's loaded/unloaded state — confirm which is intended.
  pi.registerCommand("ai-server-status", {
    description: "Show ai-server model status and ctx sizes",
    handler: async (_args, ctx) => {
      try {
        const routerModels = await listModels();
        const lines = [`AI Server: ${AI_SERVER_URL}`];
        for (const m of routerModels) {
          const status = m.running ? "loaded" : "unloaded";
          lines.push(` ${m.id} [${status}]`);
        }
        ctx.ui.notify(lines.join("\n"), "info");
      } catch (err) {
        ctx.ui.notify(
          `ai-server-status failed: ${describeError(err)}`,
          "error",
        );
      }
    },
  });

  pi.registerCommand("ai-server-refresh", {
    description: "Re-discover models from the ai-server router",
    handler: async (_args, ctx) => {
      try {
        const discovered = await discoverModels();
        if (discovered.length === 0) {
          // Keep the current registration rather than replacing it with an
          // empty model list.
          ctx.ui.notify(
            "No runnable models on server (all presets missing --model path)",
            "warning",
          );
          return;
        }
        registerProviderWithModels(pi, discovered);
        ctx.ui.notify(
          `Registered ${discovered.length} model(s): ${discovered
            .map((m) => m.id)
            .join(", ")}`,
          "info",
        );
      } catch (err) {
        ctx.ui.notify(`Refresh failed: ${describeError(err)}`, "error");
      }
    },
  });

  pi.registerCommand("ai-server-load", {
    description: "Load a model (usage: /ai-server-load <id>)",
    getArgumentCompletions: completeModelId,
    handler: async (args, ctx) => {
      const id = args.trim();
      if (!id) {
        ctx.ui.notify("Usage: /ai-server-load <model-id>", "error");
        return;
      }
      try {
        ctx.ui.setStatus("ai-server", `Loading ${id}…`);
        await loadModel(id);
        ctx.ui.setStatus("ai-server", undefined);
        ctx.ui.notify(`Loaded ${id}`, "info");
      } catch (err) {
        ctx.ui.setStatus("ai-server", undefined);
        ctx.ui.notify(`Load failed: ${describeError(err)}`, "error");
      }
    },
  });

  pi.registerCommand("ai-server-unload", {
    description: "Unload a model (usage: /ai-server-unload <id>)",
    getArgumentCompletions: completeModelId,
    handler: async (args, ctx) => {
      const id = args.trim();
      if (!id) {
        ctx.ui.notify("Usage: /ai-server-unload <model-id>", "error");
        return;
      }
      try {
        await unloadModel(id);
        ctx.ui.notify(`Unloaded ${id}`, "info");
      } catch (err) {
        ctx.ui.notify(`Unload failed: ${describeError(err)}`, "error");
      }
    },
  });

  pi.registerCommand("ai-server-ctx", {
    description:
      "Set ctx-size for a model and reload (usage: /ai-server-ctx <id> <size>)",
    getArgumentCompletions: completeModelId,
    handler: async (args, ctx) => {
      const parts = args.trim().split(/\s+/).filter(Boolean);
      if (parts.length !== 2) {
        ctx.ui.notify("Usage: /ai-server-ctx <model-id> <size>", "error");
        return;
      }
      const [id, sizeStr] = parts;
      const size = Number(sizeStr);
      // Only whole numbers of at least 512 are accepted.
      if (!Number.isInteger(size) || size < 512) {
        ctx.ui.notify(`Invalid size: ${sizeStr}`, "error");
        return;
      }
      const ok = await ctx.ui.confirm(
        "Set ctx-size?",
        `Edit preset [${id}] → ctx-size=${size} and reload the model?`,
      );
      if (!ok) return;
      try {
        ctx.ui.setStatus("ai-server", "Editing preset…");
        await setPresetKey(id, "ctx-size", String(size));
        ctx.ui.setStatus("ai-server", `Reloading ${id}…`);
        await reloadOneModel(id);
        ctx.ui.setStatus("ai-server", undefined);
        ctx.ui.notify(`${id}: ctx-size=${size}, reloaded`, "info");
      } catch (err) {
        ctx.ui.setStatus("ai-server", undefined);
        ctx.ui.notify(`ctx update failed: ${describeError(err)}`, "error");
      }
    },
  });

  pi.registerCommand("ai-server-preset", {
    description: "Print llama-swap config on the ai-server",
    handler: async (_args, ctx) => {
      try {
        const text = await readPreset();
        ctx.ui.notify(text, "info");
      } catch (err) {
        ctx.ui.notify(`Preset read failed: ${describeError(err)}`, "error");
      }
    },
  });

  pi.registerCommand("ai-server-restart", {
    description: "Restart the ai-server llama-swap service",
    handler: async (_args, ctx) => {
      // The prompt names llama-swap so it matches the service named in the
      // command description.
      const ok = await ctx.ui.confirm(
        "Restart llama-swap?",
        "This unloads all models and kills in-flight requests.",
      );
      if (!ok) return;
      try {
        ctx.ui.setStatus("ai-server", "Restarting…");
        const status = await restartService();
        ctx.ui.setStatus("ai-server", undefined);
        ctx.ui.notify(`Service: ${status.trim()}`, "info");
      } catch (err) {
        ctx.ui.setStatus("ai-server", undefined);
        ctx.ui.notify(`Restart failed: ${describeError(err)}`, "error");
      }
    },
  });
}
|