Files
pi-extensions/ai-server/admin.ts
T
shahondin1624 ede2645189 migrate ai-server extension from llama.cpp router to llama-swap
- Update endpoint paths: /v1/models, /running, /api/models/unload/<id>,
  /upstream/<id>/health
- Change preset config from INI (~/.llama-models.ini) to YAML
  (~/.config/llama-swap/config.yaml)
- Rewrite setPresetKey() for YAML-aware awk editing of --ctx-size in
  cmd: blocks
- Update listModels() to merge /v1/models + /running data
- Rewrite loadModel() to use GET /upstream/<id>/health
- Update unloadModel() to POST /api/models/unload/<id>
- Add unloadAll() for POST /api/models/unload
- Update RouterModel interface: replace status.value/args with running bool
- extractCtxSize() now returns null (llama-swap doesn't expose worker args)
- Update discoverModels() to use default ctx=32768
- Update ai-server-status to use m.running boolean
- Update restartService() to use AI_SERVER_SERVICE_UNIT
- Update all README docs and troubleshooting table
- Update router-utils tests for new interface
2026-05-27 09:48:08 +02:00

312 lines
9.4 KiB
TypeScript

import { exec as execCb } from "node:child_process";
import * as https from "node:https";
import { URL } from "node:url";
import { promisify } from "node:util";
import {
AI_SERVER_MODELS_PATH,
AI_SERVER_PRESET_PATH,
AI_SERVER_RUNNING_PATH,
AI_SERVER_SERVICE_UNIT,
AI_SERVER_SSH_HOST,
AI_SERVER_UNLOAD_ALL_PATH,
AI_SERVER_UNLOAD_PATH,
AI_SERVER_UPSTREAM_HEALTH_PATH,
AI_SERVER_URL,
type ServerModel,
getAdminTimeoutMs,
loadCerts,
} from "./config.js";
import {
extractCtxSize,
isReasoningModel,
isShardArtefact,
} from "./router-utils.js";
// Re-export so existing index.ts imports keep working.
export { extractCtxSize, isReasoningModel };
// Promise-returning form of child_process.exec; resolves with { stdout, stderr }
// and is what runSsh() uses to launch the local `ssh` client.
const exec = promisify(execCb);
// ─── HTTP router API (via Caddy mTLS) ────────────────────────────────────
/**
 * Send one JSON request to the AI server and resolve with the decoded reply.
 *
 * The client certificate and key from loadCerts() are presented on every
 * call. A non-2xx status rejects with `HTTP <code>: <first 400 chars of body>`.
 * A 2xx reply resolves with the parsed JSON, `{}` for a blank body, or
 * `{ raw: <body> }` when the body is not valid JSON (e.g. a plain-text "OK").
 *
 * @param method HTTP verb to use.
 * @param path   Path (and optional query string) appended to AI_SERVER_URL.
 * @param body   Optional payload; serialised as JSON when provided.
 */
async function routerRequest(
  method: "GET" | "POST",
  path: string,
  body?: unknown,
): Promise<any> {
  const tls = loadCerts();
  const target = new URL(AI_SERVER_URL + path);
  const payload = body === undefined ? undefined : JSON.stringify(body);

  // Content headers are only sent when there is something to write.
  const headers: Record<string, string | number> = {
    Accept: "application/json",
  };
  if (payload !== undefined) {
    headers["Content-Type"] = "application/json";
    headers["Content-Length"] = Buffer.byteLength(payload);
  }

  // No `ca:` — server cert is LE-issued; Node's default Mozilla bundle
  // covers validation. Client cert/key still required for mTLS.
  const options: https.RequestOptions = {
    hostname: target.hostname,
    port: target.port ? Number(target.port) : 443,
    path: target.pathname + target.search,
    method,
    headers,
    cert: tls.cert,
    key: tls.key,
    timeout: getAdminTimeoutMs(),
  };

  // Turn a successful response body into the value handed to the caller.
  const decode = (text: string): unknown => {
    if (!text.trim()) return {};
    try {
      return JSON.parse(text);
    } catch {
      return { raw: text };
    }
  };

  return new Promise((resolve, reject) => {
    const req = https.request(options, (res) => {
      const parts: string[] = [];
      res.setEncoding("utf-8");
      res.on("data", (chunk: string) => {
        parts.push(chunk);
      });
      res.on("error", reject);
      res.on("end", () => {
        const text = parts.join("");
        const status = res.statusCode ?? 0;
        const ok = status >= 200 && status < 300;
        if (!ok) {
          reject(new Error(`HTTP ${status}: ${text.slice(0, 400).trim()}`));
          return;
        }
        resolve(decode(text));
      });
    });
    req.on("error", reject);
    // The socket timeout only emits an event; destroying the request is what
    // actually aborts it and surfaces the error through the "error" handler.
    req.on("timeout", () => req.destroy(new Error("Router request timed out")));
    if (payload !== undefined) req.write(payload);
    req.end();
  });
}
/**
 * One model entry as produced by listModels(): the fields of a `/v1/models`
 * item plus a `running` flag derived from the `/running` endpoint.
 */
export interface RouterModel {
/** Model identifier; this is the value compared against the running set. */
id: string;
/** Passed through unchanged from the `/v1/models` entry, when present. */
object?: string;
/** Passed through unchanged from the `/v1/models` entry, when present. */
created?: number;
/** Passed through unchanged from the `/v1/models` entry, when present. */
owned_by?: string;
/** Whether the model is currently loaded in llama-swap. */
running?: boolean;
}
/**
 * Fetch the configured models and mark which of them are currently loaded.
 *
 * Issues `GET /v1/models` and `GET /running` in parallel and returns every
 * entry of the former with a `running` flag taken from the latter. Malformed
 * payloads (missing or non-array lists, non-object entries) are treated as
 * empty rather than raising a TypeError.
 *
 * @returns The models in the order `/v1/models` reported them.
 * @throws Error when either HTTP request fails or returns a non-2xx status.
 */
export async function listModels(): Promise<RouterModel[]> {
  // llama-swap: GET /v1/models returns { data: [{ id, object, created, owned_by }] }
  //             GET /running   returns { running: [{ ... }] }
  const [modelsRes, runningRes] = await Promise.all([
    routerRequest("GET", AI_SERVER_MODELS_PATH),
    routerRequest("GET", AI_SERVER_RUNNING_PATH),
  ]);

  const models: RouterModel[] = Array.isArray(modelsRes?.data)
    ? (modelsRes.data as RouterModel[])
    : [];
  const runningEntries: unknown[] = Array.isArray(runningRes?.running)
    ? (runningRes.running as unknown[])
    : [];

  const runningIds = new Set<string>();
  for (const entry of runningEntries) {
    if (entry === null || typeof entry !== "object") continue;
    // NOTE(review): llama-swap appears to name the model under `model` rather
    // than `id` in /running entries — confirm against the deployed version.
    // Accepting both keeps the flag correct either way.
    const { id, model } = entry as { id?: unknown; model?: unknown };
    for (const candidate of [id, model]) {
      if (candidate) runningIds.add(String(candidate));
    }
  }

  for (const m of models) {
    m.running = runningIds.has(m.id);
  }
  return models;
}
// Tab-completion invokes the completer on every Tab press; without a cache
// each press would cost an HTTPS round-trip. A five-second lifetime collapses
// back-to-back completions while keeping the view close to live state after
// a /ai-server-load.
const LIST_MODELS_TTL_MS = 5_000;
let cachedList: { at: number; models: RouterModel[] } | null = null;

/**
 * Same result as listModels(), but served from an in-memory snapshot while
 * that snapshot is younger than LIST_MODELS_TTL_MS.
 */
export async function listModelsCached(): Promise<RouterModel[]> {
  const now = Date.now();
  if (cachedList !== null) {
    const age = now - cachedList.at;
    if (age < LIST_MODELS_TTL_MS) return cachedList.models;
  }
  const fresh = await listModels();
  // Stamp with the time the response arrived, not the time it was requested.
  cachedList = { at: Date.now(), models: fresh };
  return fresh;
}
/**
 * Drop the snapshot held for listModelsCached() so its next call goes back
 * to the server.
 */
export function invalidateListModelsCache(): void {
  cachedList = null;
}
/**
 * Warm-load a model.
 *
 * llama-swap spawns the upstream process when `GET /upstream/<id>/health` is
 * requested, so a 2xx reply means the model is up; a plain-text OK body is
 * acceptable. The model-list cache is dropped after a successful call.
 *
 * @param id Model identifier.
 * @returns The decoded response from the health endpoint.
 */
export async function loadModel(id: string): Promise<unknown> {
  const healthPath = AI_SERVER_UPSTREAM_HEALTH_PATH(id);
  const response = await routerRequest("GET", healthPath);
  invalidateListModelsCache();
  return response;
}
/**
 * Unload a single model via `POST /api/models/unload/<id>` (no request body;
 * llama-swap answers with plain text "OK"). The model-list cache is dropped
 * after a successful call.
 *
 * @param id Model identifier.
 * @returns The decoded response from the unload endpoint.
 */
export async function unloadModel(id: string): Promise<unknown> {
  const unloadPath = AI_SERVER_UNLOAD_PATH(id);
  const response = await routerRequest("POST", unloadPath);
  invalidateListModelsCache();
  return response;
}
/**
 * Unload every model via `POST /api/models/unload` (no request body). The
 * model-list cache is dropped after a successful call.
 *
 * @returns The decoded response from the unload endpoint.
 */
export async function unloadAll(): Promise<unknown> {
  const response = await routerRequest("POST", AI_SERVER_UNLOAD_ALL_PATH);
  invalidateListModelsCache();
  return response;
}
/**
 * Build the provider-facing model list from the live server.
 *
 * llama-swap's /v1/models only lists registered presets (each has a model
 * path) and does not expose placeholder sections, so the only entries that
 * need filtering out are shard artefacts.
 */
export async function discoverModels(): Promise<ServerModel[]> {
  // llama-swap doesn't expose ctx-size in the API; use a sensible default.
  const defaultCtx = 32768;
  // Half the context window, clamped to the 2048..16384 range.
  const maxTokens = Math.min(16384, Math.max(2048, Math.floor(defaultCtx / 2)));

  const discovered: ServerModel[] = [];
  for (const { id } of await listModels()) {
    if (isShardArtefact(id)) continue;
    discovered.push({
      id,
      name: `${id} (AI Server)`,
      reasoning: isReasoningModel(id),
      contextWindow: defaultCtx,
      maxTokens,
    });
  }
  return discovered;
}
// ─── SSH helpers ─────────────────────────────────────────────────────────
/**
 * Wrap a string in single quotes for a POSIX shell. Each embedded single
 * quote is emitted as `'\''` (close quote, escaped quote, reopen quote).
 */
function shQuote(s: string): string {
  const pieces = s.split("'");
  return "'" + pieces.join("'\\''") + "'";
}
/**
 * Run a command on the AI server through the local `ssh` client.
 *
 * The connection is non-interactive (BatchMode), gives up connecting after
 * five seconds, and accepts a host key only the first time it is seen.
 *
 * @param remoteCmd Command line handed to the remote shell.
 * @param timeoutMs Overall time limit for the local ssh process.
 * @returns Whatever the remote command wrote to stdout.
 * @throws Error prefixed with `ssh <host>:` carrying stderr when available,
 *         otherwise the underlying failure message.
 */
async function runSsh(remoteCmd: string, timeoutMs = 60_000): Promise<string> {
  const cmd = [
    "ssh",
    "-o BatchMode=yes",
    "-o ConnectTimeout=5",
    "-o StrictHostKeyChecking=accept-new",
    shQuote(AI_SERVER_SSH_HOST),
    shQuote(remoteCmd),
  ].join(" ");

  const result = await exec(cmd, {
    timeout: timeoutMs,
    maxBuffer: 4 * 1024 * 1024,
  }).catch((err: any) => {
    // Prefer what ssh / the remote command printed over Node's generic text.
    const stderr = (err?.stderr ?? "").toString().trim();
    const msg = stderr || err?.message || String(err);
    throw new Error(`ssh ${AI_SERVER_SSH_HOST}: ${msg}`);
  });
  return result.stdout;
}
/**
 * Fetch the preset file from the server by running `cat` on it over SSH.
 *
 * @returns The file's contents as text.
 */
export async function readPreset(): Promise<string> {
  const remoteCmd = `cat ${AI_SERVER_PRESET_PATH}`;
  return runSsh(remoteCmd);
}
// awk program used by setPresetKey(). It reads its three inputs from the
// environment (ENVIRON) so that no caller-supplied text is ever spliced into
// the program source or interpreted as a regular expression.
//
// The "not found" marker is emitted as two concatenated literals on purpose:
// when ssh fails without writing to stderr, the error message falls back to
// Node's "Command failed: <command line>" text, which contains this program.
// Keeping the marker non-contiguous in the source avoids a false match there.
const PRESET_AWK_PROGRAM = String.raw`
# Replace the first numeric argument of "flag" on this line, if there is one.
function set_value(line,    n, off, p, prev, rest, wlen, tail, tlen, next_ch) {
  n = length(flag)
  off = 0
  while ((p = index(substr(line, off + 1), flag)) > 0) {
    p += off
    off = p
    prev = (p == 1) ? " " : substr(line, p - 1, 1)
    if (prev !~ /[[:space:]]/) continue
    rest = substr(line, p + n)
    if (!match(rest, /^[[:space:]]+/)) continue
    wlen = RLENGTH
    tail = substr(rest, wlen + 1)
    if (!match(tail, /^-?[0-9]+(\.[0-9]+)?/)) continue
    tlen = RLENGTH
    next_ch = substr(tail, tlen + 1, 1)
    if (next_ch != "" && next_ch !~ /[[:space:]]/) continue
    hit = 1
    return substr(line, 1, p + n - 1) substr(rest, 1, wlen) val substr(tail, tlen + 1)
  }
  return line
}
BEGIN {
  sec = ENVIRON["PRESET_SECTION"]
  flag = ENVIRON["PRESET_FLAG"]
  val = ENVIRON["PRESET_VALUE"]
  in_models = 0
  in_sec = 0
  hit = 0
}
{
  # Blank and comment-only lines never open or close a block.
  if ($0 ~ /^[[:space:]]*(#.*)?$/) { print; next }
  match($0, /^ */)
  ind = RLENGTH
  out = $0
  if (ind == 0) {
    # A new top-level key: only "models:" can contain the target block.
    in_models = ($0 ~ /^models:/)
    in_sec = 0
  } else if (ind <= 2) {
    # A sibling model header: we are inside the target only if it is ours.
    head = $0
    sub(/[[:space:]]+$/, "", head)
    in_sec = (in_models && head == ("  " sec ":"))
  } else if (in_sec) {
    out = set_value($0)
  }
  print out
}
END {
  if (!hit) {
    msg = "preset-key" "-not-found"
    print msg > "/dev/stderr"
    exit 2
  }
}
`.trim();

// What PRESET_AWK_PROGRAM prints on stderr when nothing was replaced.
const PRESET_NOT_FOUND_MARKER = "preset-key-not-found";

/**
 * Change the numeric argument of one llama-server CLI flag inside a model's
 * block of the llama-swap YAML config on the server.
 *
 * llama-swap config.yaml structure (relevant excerpt):
 *
 *   models:
 *     Qwen_Qwen3.6-35B-A3B-Q8_0:
 *       cmd: |
 *         /path/to/llama-server --model /path/to/gguf ...
 *         --ctx-size 32768
 *         --temp 0.7
 *
 * The function finds the two-space-indented `<section>:` header under the
 * top-level `models:` key. Within that block it rewrites every line holding
 * `--<key> <number>` so that the number becomes `value`. Integer and decimal
 * numbers are recognised. The edit is written to `<preset>.tmp` and moved
 * over the original only when at least one replacement was made; otherwise
 * the original file is left untouched.
 *
 * Typical keys: ctx-size, temp, n-gpu-layers (any `--<key>` flag works).
 *
 * @param section Model name exactly as it appears as a key under `models:`.
 * @param key     Flag name without the leading dashes, e.g. `ctx-size`.
 * @param value   New numeric value, e.g. `32768` or `0.7`.
 * @throws Error when an argument is malformed, when the model block or the
 *         flag cannot be found in the preset, or when the SSH command fails.
 */
export async function setPresetKey(
  section: string,
  key: string,
  value: string,
): Promise<void> {
  // Reject anything that could not be a YAML key line / CLI flag / number
  // before it gets anywhere near a shell or the config file.
  if (section === "" || /[\r\n\0]/.test(section)) {
    throw new Error(`Invalid model name: ${JSON.stringify(section)}`);
  }
  if (!/^[A-Za-z0-9][A-Za-z0-9-]*$/.test(key)) {
    throw new Error(`Invalid preset key: ${JSON.stringify(key)}`);
  }
  if (!/^-?\d+(\.\d+)?$/.test(value)) {
    throw new Error(
      `Invalid value for "${key}": ${JSON.stringify(value)} (expected a number)`,
    );
  }

  const flag = `--${key}`;
  const tmpPath = `${AI_SERVER_PRESET_PATH}.tmp`;

  // `env NAME=value cmd` passes the inputs without relying on the remote
  // shell's assignment syntax. The preset path is intentionally left
  // unquoted, as elsewhere in this file — presumably so a leading `~` is
  // still expanded by the remote shell (TODO confirm against config).
  const remoteCmd = [
    "env",
    `PRESET_SECTION=${shQuote(section)}`,
    `PRESET_FLAG=${shQuote(flag)}`,
    `PRESET_VALUE=${shQuote(value)}`,
    "awk",
    shQuote(PRESET_AWK_PROGRAM),
    AI_SERVER_PRESET_PATH,
    ">",
    tmpPath,
    "&&",
    "mv",
    tmpPath,
    AI_SERVER_PRESET_PATH,
  ].join(" ");

  try {
    await runSsh(remoteCmd);
  } catch (err: unknown) {
    const msg = err instanceof Error ? err.message : String(err);
    if (msg.includes(PRESET_NOT_FOUND_MARKER)) {
      throw new Error(
        `Key "${key}" not found for model "${section}" — add it to the preset manually first.`,
      );
    }
    throw err;
  }
}
/**
 * Restart the server's systemd user unit over SSH and, if the restart
 * command succeeds, query whether the unit is active.
 *
 * @returns stdout of the remote command (the `is-active` output).
 */
export async function restartService(): Promise<string> {
  const unit = AI_SERVER_SERVICE_UNIT;
  const steps = [
    `systemctl --user restart ${unit}`,
    `systemctl --user is-active ${unit}`,
  ];
  return runSsh(steps.join(" && "));
}
/**
 * Cycle one model: unload it (best effort), wait two seconds, then load it.
 *
 * @param id Model identifier.
 * @throws Error when the final load fails; unload failures are ignored.
 */
export async function reloadOneModel(id: string): Promise<void> {
  const slotClearDelayMs = 2000;
  // The model may simply not be loaded, so a failed unload is not an error.
  await unloadModel(id).catch(() => undefined);
  // Router needs a beat to clear the slot.
  await new Promise<void>((done) => {
    setTimeout(done, slotClearDelayMs);
  });
  await loadModel(id);
}