Files
pi-extensions/ai-server/index.ts
T
shahondin1624 f7af660727 migrate ai-server extension from llama.cpp router to llama-swap
Endpoint rewrites:
  - GET /v1/models + /running → merged listModels() with running flag
  - POST /models/load → GET /upstream/<id>/health (warm load)
  - POST /models/unload → POST /api/models/unload/<id> (no body)
  - Added POST /api/models/unload for unloadAll()

Config migration:
  - Preset path: ~/.llama-models.ini → ~/.config/llama-swap/config.yaml
  - Service unit: llama-server.service → llama-swap.service
  - setPresetKey() rewritten from INI awk to YAML-aware awk for
    editing --ctx-size/--temp/--n-gpu-layers in cmd: blocks

Per-model ctx-size (fixes 0/33k bug):
  - parseCtxMapFromYaml(): walks config.yaml, extracts --ctx-size N per
    model block → Map<id, ctxSize>
  - extractCtxFromRunningCmd(): parses --ctx-size from /running cmd string
  - discoverModels(): Promise.all(listModels, listRunning, readPreset),
    ctx priority: running cmd → yaml → 32768 fallback
  - Removed broken extractCtxSize stub and dangling imports

Tests: 14 passing (parseCtxMapFromYaml ×5, extractCtxFromRunningCmd ×3,
isShardArtefact ×3, isReasoningModel ×3)

README: full rewrite covering llama-swap architecture, YAML config format,
new endpoints, troubleshooting table updated.
2026-05-27 10:42:19 +02:00

281 lines
8.0 KiB
TypeScript

import type { ExtensionAPI } from "@mariozechner/pi-coding-agent";
import {
discoverModels,
listModels,
listModelsCached,
loadModel,
readPreset,
reloadOneModel,
restartService,
setPresetKey,
unloadModel,
} from "./admin.js";
import {
AI_SERVER_API_BASE,
AI_SERVER_API_ID,
AI_SERVER_PROVIDER_ID,
AI_SERVER_URL,
MODELS as STATIC_MODELS,
type ServerModel,
} from "./config.js";
import { streamAiServer } from "./stream.js";
/**
 * Argument completer for the admin commands that take a model id.
 *
 * Resolves to one `{ value, label }` entry per known model whose id starts
 * with `prefix`, or to `null` when nothing matches or the lookup fails.
 */
async function completeModelId(prefix: string) {
  try {
    // The cached lookup (5s) matters because the completer fires on every
    // keystroke while the user is usually typing a single model id; the
    // cache collapses those round-trips without risking stale results.
    const known = await listModelsCached();
    const suggestions: { value: string; label: string }[] = [];
    for (const model of known) {
      if (model.id.startsWith(prefix)) {
        suggestions.push({ value: model.id, label: model.id });
      }
    }
    if (suggestions.length === 0) {
      return null;
    }
    return suggestions;
  } catch {
    // Completion is best-effort: any failure simply yields no suggestions.
    return null;
  }
}
/**
 * Registers the ai-server provider on `pi`, advertising the given models.
 *
 * Every model is exposed as a zero-cost, text-only entry and all requests
 * are streamed through `streamAiServer`.
 *
 * @param pi     Extension API the provider is registered on.
 * @param models Models to advertise under the provider.
 */
function registerProviderWithModels(
  pi: ExtensionAPI,
  models: ServerModel[],
): void {
  pi.registerProvider(AI_SERVER_PROVIDER_ID, {
    baseUrl: AI_SERVER_API_BASE,
    apiKey: "ai-server-mtls",
    api: AI_SERVER_API_ID as any,
    models: models.map(
      ({ id, name, reasoning, contextWindow, maxTokens }) => ({
        id,
        name,
        reasoning,
        input: ["text"],
        cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
        contextWindow,
        maxTokens,
        // Purely informational: the custom `streamSimple` assembles the
        // request body on its own (see stream.ts), yet pi-mono's model-list
        // UI / capability detection still consults this compat block.
        compat: {
          thinkingFormat: "qwen-chat-template",
          supportsReasoningEffort: reasoning,
          supportsDeveloperRole: false,
          supportsUsageInStreaming: true,
          maxTokensField: "max_tokens",
        },
      }),
    ),
    streamSimple: streamAiServer,
  });
}
// How long (in milliseconds) extension startup waits for live model
// discovery before carrying on with the static fallback model list.
const DISCOVERY_FAST_TIMEOUT_MS = 300;
/**
 * Extension entry point.
 *
 * Registers the ai-server provider (first with the static fallback list,
 * then with the discovered models once discovery answers) and installs the
 * `ai-server-*` admin commands on `pi`.
 *
 * @param pi Extension API supplied by the host.
 * @returns A promise that settles once the fast discovery window has elapsed
 *          (or discovery answered) and all commands are registered.
 */
export default async function (pi: ExtensionAPI) {
  // Register the provider IMMEDIATELY with the static fallback list so pi
  // startup isn't blocked on the HTTPS round-trip in the worst case.
  registerProviderWithModels(pi, STATIC_MODELS);

  // Then race real discovery against a short timeout. On LAN the router
  // answers in ~40ms and pi --list-models sees the live list. On slow
  // networks we bail at 300ms and the fallback is what the user sees; the
  // background promise keeps running and re-registers later.
  const discovery = discoverModels().catch((err: unknown) => {
    if (process.env.PI_DEBUG) {
      console.log(
        `[ai-server] Discovery failed (${describeError(err)}); fallback remains`,
      );
    }
    return null;
  });

  let timer: ReturnType<typeof setTimeout> | undefined;
  const timeout = new Promise<null>((resolve) => {
    timer = setTimeout(() => resolve(null), DISCOVERY_FAST_TIMEOUT_MS);
  });
  const fastResult = await Promise.race([discovery, timeout]);
  // Whoever won, the timer has served its purpose; don't leave it pending.
  clearTimeout(timer);

  if (fastResult && fastResult.length > 0) {
    registerProviderWithModels(pi, fastResult);
    if (process.env.PI_DEBUG) {
      console.log(
        `[ai-server] Discovered ${fastResult.length} model(s) on ${AI_SERVER_URL}: ${fastResult.map((m) => m.id).join(", ")}`,
      );
    }
  } else {
    // Slow network or discovery still pending — keep waiting in the
    // background and update the provider once it arrives. This is
    // fire-and-forget, so a failing late registration must be caught here
    // or it would surface as an unhandled promise rejection.
    void discovery
      .then((models) => {
        if (models && models.length > 0) {
          registerProviderWithModels(pi, models);
          if (process.env.PI_DEBUG) {
            console.log(
              `[ai-server] Late discovery: ${models.length} model(s)`,
            );
          }
        }
      })
      .catch((err: unknown) => {
        if (process.env.PI_DEBUG) {
          console.log(
            `[ai-server] Late registration failed (${describeError(err)}); fallback remains`,
          );
        }
      });
  }

  // ─── Admin commands ──────────────────────────────────────────────────

  // Lists every model reported by the server with its loaded/unloaded state.
  pi.registerCommand("ai-server-status", {
    description: "Show ai-server model status and ctx sizes",
    handler: async (_args, ctx) => {
      try {
        const routerModels = await listModels();
        const lines = [`AI Server: ${AI_SERVER_URL}`];
        for (const m of routerModels) {
          const status = m.running ? "loaded" : "unloaded";
          lines.push(`  ${m.id} [${status}]`);
        }
        ctx.ui.notify(lines.join("\n"), "info");
      } catch (err) {
        ctx.ui.notify(
          `ai-server-status failed: ${describeError(err)}`,
          "error",
        );
      }
    },
  });

  // Re-runs discovery and re-registers the provider with the fresh list.
  pi.registerCommand("ai-server-refresh", {
    description: "Re-discover models from the ai-server router",
    handler: async (_args, ctx) => {
      try {
        const discovered = await discoverModels();
        if (discovered.length === 0) {
          // Keep the current registration rather than replacing it with
          // an empty model list.
          ctx.ui.notify(
            "No runnable models on server (all presets missing --model path)",
            "warning",
          );
          return;
        }
        registerProviderWithModels(pi, discovered);
        ctx.ui.notify(
          `Registered ${discovered.length} model(s): ${discovered
            .map((m) => m.id)
            .join(", ")}`,
          "info",
        );
      } catch (err) {
        ctx.ui.notify(`Refresh failed: ${describeError(err)}`, "error");
      }
    },
  });

  // Loads a single model by id, showing a status line while it loads.
  pi.registerCommand("ai-server-load", {
    description: "Load a model (usage: /ai-server-load <id>)",
    getArgumentCompletions: completeModelId,
    handler: async (args, ctx) => {
      const id = args.trim();
      if (!id) {
        ctx.ui.notify("Usage: /ai-server-load <model-id>", "error");
        return;
      }
      try {
        ctx.ui.setStatus("ai-server", `Loading ${id}`);
        await loadModel(id);
        ctx.ui.setStatus("ai-server", undefined);
        ctx.ui.notify(`Loaded ${id}`, "info");
      } catch (err) {
        ctx.ui.setStatus("ai-server", undefined);
        ctx.ui.notify(`Load failed: ${describeError(err)}`, "error");
      }
    },
  });

  // Unloads a single model by id.
  pi.registerCommand("ai-server-unload", {
    description: "Unload a model (usage: /ai-server-unload <id>)",
    getArgumentCompletions: completeModelId,
    handler: async (args, ctx) => {
      const id = args.trim();
      if (!id) {
        ctx.ui.notify("Usage: /ai-server-unload <model-id>", "error");
        return;
      }
      try {
        await unloadModel(id);
        ctx.ui.notify(`Unloaded ${id}`, "info");
      } catch (err) {
        ctx.ui.notify(`Unload failed: ${describeError(err)}`, "error");
      }
    },
  });

  // Rewrites a model's ctx-size preset key (after confirmation) and reloads
  // that model.
  pi.registerCommand("ai-server-ctx", {
    description:
      "Set ctx-size for a model and reload (usage: /ai-server-ctx <id> <size>)",
    getArgumentCompletions: completeModelId,
    handler: async (args, ctx) => {
      const parts = args.trim().split(/\s+/).filter(Boolean);
      if (parts.length !== 2) {
        ctx.ui.notify("Usage: /ai-server-ctx <model-id> <size>", "error");
        return;
      }
      const [id, sizeStr] = parts;
      const size = Number(sizeStr);
      // Safe-integer check (not just isInteger): it rejects huge values such
      // as 1e21 whose String() form is exponent notation, which must never
      // be written into the preset.
      if (!Number.isSafeInteger(size) || size < 512) {
        ctx.ui.notify(`Invalid size: ${sizeStr}`, "error");
        return;
      }
      const ok = await ctx.ui.confirm(
        "Set ctx-size?",
        `Edit preset [${id}] → ctx-size=${size} and reload the model?`,
      );
      if (!ok) return;
      try {
        ctx.ui.setStatus("ai-server", "Editing preset…");
        await setPresetKey(id, "ctx-size", String(size));
        ctx.ui.setStatus("ai-server", `Reloading ${id}`);
        await reloadOneModel(id);
        ctx.ui.setStatus("ai-server", undefined);
        ctx.ui.notify(`${id}: ctx-size=${size}, reloaded`, "info");
      } catch (err) {
        ctx.ui.setStatus("ai-server", undefined);
        ctx.ui.notify(`ctx update failed: ${describeError(err)}`, "error");
      }
    },
  });

  // Shows the text returned by readPreset() as a notification.
  pi.registerCommand("ai-server-preset", {
    description: "Print llama-swap config on the ai-server",
    handler: async (_args, ctx) => {
      try {
        const text = await readPreset();
        ctx.ui.notify(text, "info");
      } catch (err) {
        ctx.ui.notify(`Preset read failed: ${describeError(err)}`, "error");
      }
    },
  });

  // Restarts the service after an explicit confirmation.
  pi.registerCommand("ai-server-restart", {
    description: "Restart the ai-server llama-swap service",
    handler: async (_args, ctx) => {
      // The title names llama-swap so it agrees with the command description.
      const ok = await ctx.ui.confirm(
        "Restart llama-swap?",
        "This unloads all models and kills in-flight requests.",
      );
      if (!ok) return;
      try {
        ctx.ui.setStatus("ai-server", "Restarting…");
        const status = await restartService();
        ctx.ui.setStatus("ai-server", undefined);
        ctx.ui.notify(`Service: ${status.trim()}`, "info");
      } catch (err) {
        ctx.ui.setStatus("ai-server", undefined);
        ctx.ui.notify(`Restart failed: ${describeError(err)}`, "error");
      }
    },
  });
}

/**
 * Turns a caught value of unknown type into a printable message.
 *
 * Returns `.message` for Error instances and for any object carrying a
 * `message` property; every other value is converted with `String()`, so a
 * thrown string or number never renders as "undefined".
 */
function describeError(err: unknown): string {
  if (err instanceof Error) return err.message;
  if (typeof err === "object" && err !== null && "message" in err) {
    return String((err as { message: unknown }).message);
  }
  return String(err);
}