Files
shahondin1624 f7af660727 migrate ai-server extension from llama.cpp router to llama-swap
Endpoint rewrites:
  - GET /v1/models + /running → merged listModels() with running flag
  - POST /models/load → GET /upstream/<id>/health (warm load)
  - POST /models/unload → POST /api/models/unload/<id> (no body)
  - Added POST /api/models/unload for unloadAll()

Config migration:
  - Preset path: ~/.llama-models.ini → ~/.config/llama-swap/config.yaml
  - Service unit: llama-server.service → llama-swap.service
  - setPresetKey() rewritten from INI awk to YAML-aware awk for
    editing --ctx-size/--temp/--n-gpu-layers in cmd: blocks

Per-model ctx-size (fixes 0/33k bug):
  - parseCtxMapFromYaml(): walks config.yaml, extracts --ctx-size N per
    model block → Map<id, ctxSize>
  - extractCtxFromRunningCmd(): parses --ctx-size from /running cmd string
  - discoverModels(): Promise.all(listModels, listRunning, readPreset),
    ctx priority: running cmd → yaml → 32768 fallback
  - Removed broken extractCtxSize stub and dangling imports

Tests: 14 passing (parseCtxMapFromYaml ×5, extractCtxFromRunningCmd ×3,
isShardArtefact ×3, isReasoningModel ×3)

README: full rewrite covering llama-swap architecture, YAML config format,
new endpoints, troubleshooting table updated.
2026-05-27 10:42:19 +02:00

342 lines
10 KiB
TypeScript

import { exec as execCb } from "node:child_process";
import * as https from "node:https";
import { URL } from "node:url";
import { promisify } from "node:util";
import {
AI_SERVER_MODELS_PATH,
AI_SERVER_PRESET_PATH,
AI_SERVER_RUNNING_PATH,
AI_SERVER_SERVICE_UNIT,
AI_SERVER_SSH_HOST,
AI_SERVER_UNLOAD_ALL_PATH,
AI_SERVER_UNLOAD_PATH,
AI_SERVER_UPSTREAM_HEALTH_PATH,
AI_SERVER_URL,
type ServerModel,
getAdminTimeoutMs,
loadCerts,
} from "./config.js";
import {
parseCtxMapFromYaml,
extractCtxFromRunningCmd,
isReasoningModel,
isShardArtefact,
} from "./router-utils.js";
// Re-export so existing index.ts imports keep working.
export { isReasoningModel };
// Promise-returning wrapper around child_process.exec so callers can `await`
// it and read `{ stdout, stderr }` from the resolved value.
const exec = promisify(execCb);
// ─── HTTP router API (via Caddy mTLS) ────────────────────────────────────
/**
 * Sends one HTTPS request to the router and resolves with the decoded body.
 *
 * @param method HTTP verb to use.
 * @param path   Path (and optional query) appended to `AI_SERVER_URL`.
 * @param body   Optional payload; when given it is JSON-encoded and sent with
 *               matching Content-Type / Content-Length headers.
 * @returns The parsed JSON body, `{}` for an empty body, or `{ raw }` when the
 *          body is not valid JSON.
 * @throws Error on a non-2xx status (message carries the first 400 body
 *         characters), on socket errors, or when the request times out.
 */
async function routerRequest(
  method: "GET" | "POST",
  path: string,
  body?: unknown,
): Promise<any> {
  const certs = loadCerts();
  const target = new URL(AI_SERVER_URL + path);
  const payload = body === undefined ? undefined : JSON.stringify(body);

  // Entity headers are only attached when there is something to send.
  const headers: Record<string, string | number> = {
    Accept: "application/json",
  };
  if (payload !== undefined) {
    headers["Content-Type"] = "application/json";
    headers["Content-Length"] = Buffer.byteLength(payload);
  }

  return new Promise((resolve, reject) => {
    // No `ca:` — server cert is LE-issued; Node's default Mozilla bundle
    // covers validation. Client cert/key still required for mTLS.
    const options: https.RequestOptions = {
      hostname: target.hostname,
      port: target.port ? Number(target.port) : 443,
      path: target.pathname + target.search,
      method,
      headers,
      cert: certs.cert,
      key: certs.key,
      timeout: getAdminTimeoutMs(),
    };

    const req = https.request(options, (res) => {
      const chunks: string[] = [];
      res.setEncoding("utf-8");
      res.on("data", (chunk: string) => {
        chunks.push(chunk);
      });
      res.on("error", reject);
      res.on("end", () => {
        const text = chunks.join("");
        const status = res.statusCode ?? 0;
        const ok = status >= 200 && status < 300;
        if (!ok) {
          reject(new Error(`HTTP ${status}: ${text.slice(0, 400).trim()}`));
          return;
        }
        if (!text.trim()) {
          resolve({});
          return;
        }
        try {
          resolve(JSON.parse(text));
        } catch {
          // Plain-text replies are handed back verbatim.
          resolve({ raw: text });
        }
      });
    });

    req.on("error", reject);
    // Destroying with an error surfaces the timeout through the "error" event.
    req.on("timeout", () => req.destroy(new Error("Router request timed out")));
    if (payload !== undefined) req.write(payload);
    req.end();
  });
}
/**
 * One entry of the router's model list, as returned by listModels().
 * `id`, `object`, `created` and `owned_by` come from the `/v1/models`
 * response; `running` is filled in locally by listModels().
 */
export interface RouterModel {
/** Model identifier; also the key used to match entries from `/running`. */
id: string;
object?: string;
created?: number;
owned_by?: string;
/** Whether the model is currently loaded in llama-swap. */
running?: boolean;
}
/**
 * Fetches the registered models and flags the ones that are currently loaded.
 *
 * llama-swap: GET /v1/models returns `{ data: [{ id, object, created, owned_by }] }`
 * and GET /running returns `{ running: [{ model, state, ... }] }`. Every model
 * from /v1/models gets a `running` flag derived from /running.
 *
 * @returns The model list with `running` set on every entry.
 * @throws Error when either router request fails.
 */
export async function listModels(): Promise<RouterModel[]> {
  const [modelsRes, runningRes] = await Promise.all([
    routerRequest("GET", AI_SERVER_MODELS_PATH),
    routerRequest("GET", AI_SERVER_RUNNING_PATH),
  ]);

  // Tolerate a missing or malformed `data` field instead of crashing later.
  const models: RouterModel[] = Array.isArray(modelsRes?.data)
    ? modelsRes.data
    : [];

  const runningIds = new Set<string>();
  const runningList: unknown = runningRes?.running;
  if (Array.isArray(runningList)) {
    for (const entry of runningList) {
      if (entry === null || typeof entry !== "object") continue;
      // /running entries are keyed by `model` (see RunningEntry and
      // discoverModels); `id` is kept as a fallback for older payloads.
      const { model, id } = entry as { model?: unknown; id?: unknown };
      const name = model || id;
      if (name) runningIds.add(String(name));
    }
  }

  return models.map((m) => ({ ...m, running: runningIds.has(m.id) }));
}
// Short TTL cache for listModels — tab-completion calls the completer on
// every Tab press, which would otherwise fire an HTTPS round-trip each
// time. Five seconds is long enough to dedupe back-to-back completions
// but short enough that a /ai-server-load still sees near-fresh state.
const LIST_MODELS_TTL_MS = 5_000;
// Last successful listModels() result; `at` is the Date.now() timestamp taken
// when it was stored. `null` means nothing is cached (initially, or after
// invalidateListModelsCache()).
let cachedList: { at: number; models: RouterModel[] } | null = null;
/**
 * Returns the model list, reusing the cached copy while it is younger than
 * `LIST_MODELS_TTL_MS`; otherwise refetches via listModels() and stores the
 * new result together with the time it was stored.
 */
export async function listModelsCached(): Promise<RouterModel[]> {
  if (cachedList) {
    const age = Date.now() - cachedList.at;
    if (age < LIST_MODELS_TTL_MS) {
      return cachedList.models;
    }
  }

  const fresh = await listModels();
  cachedList = { at: Date.now(), models: fresh };
  return fresh;
}
/** Drops the cached model list so the next listModelsCached() call refetches. */
export function invalidateListModelsCache(): void {
cachedList = null;
}
/**
 * Warm-loads a model.
 *
 * llama-swap: GET /upstream/<id>/health forces a spawn (warm load).
 * 2xx = success; plain text OK body is acceptable.
 *
 * @param id Model identifier to load.
 * @returns Whatever the router request resolved with.
 * @throws Error when the router request fails.
 */
export async function loadModel(id: string): Promise<unknown> {
  try {
    return await routerRequest("GET", AI_SERVER_UPSTREAM_HEALTH_PATH(id));
  } finally {
    // Invalidate even on failure: a request that errors or times out may
    // still have changed what is loaded, so the cached list cannot be trusted.
    invalidateListModelsCache();
  }
}
/**
 * Unloads a single model.
 *
 * llama-swap: POST /api/models/unload/<id>, no body. Returns plain text "OK".
 *
 * @param id Model identifier to unload.
 * @returns Whatever the router request resolved with.
 * @throws Error when the router request fails.
 */
export async function unloadModel(id: string): Promise<unknown> {
  try {
    return await routerRequest("POST", AI_SERVER_UNLOAD_PATH(id));
  } finally {
    // Invalidate even on failure: a request that errors or times out may
    // still have changed what is loaded, so the cached list cannot be trusted.
    invalidateListModelsCache();
  }
}
/**
 * Unloads every loaded model.
 *
 * llama-swap: POST /api/models/unload, no body.
 *
 * @returns Whatever the router request resolved with.
 * @throws Error when the router request fails.
 */
export async function unloadAll(): Promise<unknown> {
  try {
    return await routerRequest("POST", AI_SERVER_UNLOAD_ALL_PATH);
  } finally {
    // Invalidate even on failure: a request that errors or times out may
    // still have changed what is loaded, so the cached list cannot be trusted.
    invalidateListModelsCache();
  }
}
// Note for discoverModels(): llama-swap /v1/models only returns registered
// presets (all have a model path). Placeholder sections are not exposed. We
// only filter out shard artefacts.

/** One element of the `running` array returned by the router's /running endpoint. */
interface RunningEntry {
/** Model identifier; discoverModels() matches it against the ids from listModels(). */
model: string;
/** Command line of the running process; discoverModels() extracts `--ctx-size` from it. */
cmd?: string;
// The fields below are declared but not read in the code visible here;
// presumably they mirror the /running payload — TODO confirm against llama-swap.
state?: string;
ttl?: number;
proxy?: string;
}
/**
 * Fetches the router's /running endpoint and returns its `running` array,
 * or an empty list when the response does not contain an array there.
 */
async function listRunning(): Promise<RunningEntry[]> {
  const payload: any = await routerRequest("GET", AI_SERVER_RUNNING_PATH);
  const entries = payload?.running;
  if (!Array.isArray(entries)) {
    return [];
  }
  return entries;
}
/**
 * Builds the list of server models with a per-model context window.
 *
 * The model list, the running processes and the preset file are fetched in
 * parallel; failures of the latter two degrade to "no data" rather than
 * failing the whole discovery. Shard artefacts are skipped.
 *
 * Context-window priority: `--ctx-size` of the running process, then the
 * value from the preset YAML, then 32768.
 */
export async function discoverModels(): Promise<ServerModel[]> {
  const modelsPromise = listModels();
  const runningPromise = listRunning().catch(() => [] as RunningEntry[]);
  const presetPromise = readPreset().catch(() => "");
  const [models, running, yaml] = await Promise.all([
    modelsPromise,
    runningPromise,
    presetPromise,
  ]);

  const yamlCtx = parseCtxMapFromYaml(yaml);

  const liveCtx = new Map<string, number>();
  running.forEach((proc) => {
    const size = extractCtxFromRunningCmd(proc.cmd);
    if (size) {
      liveCtx.set(proc.model, size);
    }
  });

  const discovered: ServerModel[] = [];
  for (const { id } of models) {
    if (isShardArtefact(id)) continue;

    // Live process is authoritative, config.yaml is next best, and the
    // constant is the last-resort fallback.
    const contextWindow = liveCtx.get(id) ?? yamlCtx.get(id) ?? 32768;
    const halfWindow = Math.floor(contextWindow / 2);
    const maxTokens = Math.min(16384, Math.max(2048, halfWindow));

    discovered.push({
      id,
      name: `${id} (AI Server)`,
      reasoning: isReasoningModel(id),
      contextWindow,
      maxTokens,
    });
  }
  return discovered;
}
// ─── SSH helpers ─────────────────────────────────────────────────────────
/**
 * Wraps a string in single quotes for a POSIX shell. Each embedded single
 * quote is rewritten as `'\''` (close quote, escaped quote, reopen quote).
 */
function shQuote(s: string): string {
  const escaped = s.split("'").join("'\\''");
  return "'" + escaped + "'";
}
/**
 * Runs a command on the AI server over ssh and resolves with its stdout.
 *
 * @param remoteCmd Command line handed to the remote side as one quoted argument.
 * @param timeoutMs Timeout passed to `exec` (default 60 s).
 * @throws Error prefixed with `ssh <host>:` carrying the trimmed stderr, or
 *         the underlying error message when stderr is empty.
 */
async function runSsh(remoteCmd: string, timeoutMs = 60_000): Promise<string> {
  const parts = [
    "ssh",
    "-o BatchMode=yes",
    "-o ConnectTimeout=5",
    "-o StrictHostKeyChecking=accept-new",
    shQuote(AI_SERVER_SSH_HOST),
    shQuote(remoteCmd),
  ];
  const cmd = parts.join(" ");

  try {
    const result = await exec(cmd, {
      timeout: timeoutMs,
      maxBuffer: 4 * 1024 * 1024,
    });
    return result.stdout;
  } catch (err: any) {
    // Prefer what the remote side printed; fall back to the exec error text.
    let detail = (err?.stderr ?? "").toString().trim();
    if (!detail) {
      detail = err?.message || String(err);
    }
    throw new Error(`ssh ${AI_SERVER_SSH_HOST}: ${detail}`);
  }
}
/** Returns the contents of the preset file by running `cat` on it over ssh. */
export async function readPreset(): Promise<string> {
  const remoteCmd = "cat " + AI_SERVER_PRESET_PATH;
  return runSsh(remoteCmd);
}
/**
 * Replace the numeric value of a CLI flag inside one model block of the
 * llama-swap preset file.
 *
 * llama-swap config.yaml structure (relevant excerpt):
 *
 *   models:
 *     Qwen_Qwen3.6-35B-A3B-Q8_0:
 *       cmd: |
 *         /path/to/llama-server --model /path/to/gguf ...
 *         --ctx-size 32768
 *         --temp 0.7
 *
 * The edit runs remotely as an awk script: it finds the `<section>:` header
 * at two-space indent, and on every deeper-indented line of that block
 * replaces the number following the flag (`--flag N` or `--flag=N`, integer
 * or decimal). The file is rewritten through a `.tmp` sibling and only
 * replaced when a value was actually changed.
 *
 * Known keys: ctx-size, temp, n-gpu-layers; any other key is mapped to
 * `--<key>`.
 *
 * @param section Model id whose block should be edited.
 * @param key     Short flag name (without the leading dashes).
 * @param value   New value; must be a single non-empty token without whitespace.
 * @throws Error when `value` is empty or contains whitespace, when the flag is
 *         not present in the model's block, or when the ssh command fails.
 */
export async function setPresetKey(
  section: string,
  key: string,
  value: string,
): Promise<void> {
  // Map short key names to the actual CLI flag used in cmd:
  const knownFlags = new Map<string, string>([
    ["ctx-size", "--ctx-size"],
    ["temp", "--temp"],
    ["n-gpu-layers", "--n-gpu-layers"],
  ]);
  const flag = knownFlags.get(key) ?? `--${key}`;

  // A value with whitespace or a newline would split the command line or
  // break the YAML block, so refuse it before touching the file.
  if (!/^\S+$/.test(value)) {
    throw new Error(
      `Invalid value ${JSON.stringify(value)} for "${key}" — expected a single token without whitespace.`,
    );
  }

  // POSIX single-quote escaping for text embedded in the remote command line.
  const quote = (s: string): string => `'${s.replace(/'/g, `'\\''`)}'`;

  // Printed on stdout by the remote side when the flag was not found. It is
  // checked on stdout only, so the script text itself can never trigger it.
  const notFoundMarker = "PRESET_KEY_NOT_FOUND";

  // The awk program receives section/flag/value through the environment and
  // handles them as literal strings (index/substr), so none of them is ever
  // interpreted as shell syntax, as a regex, or as a sub() replacement.
  //
  //   - Blank lines never change state (they occur inside `cmd: |` blocks).
  //   - A non-comment line at indent <= 2 either opens the target block (bare
  //     or quoted key) or closes whatever block was open. Only later, deeper
  //     lines are edited — the header line itself must not close the block.
  //   - Exit 2: nothing was read (missing or empty file). Exit 3: the flag
  //     was not found in the block.
  const awkProgram = `
BEGIN {
  sec = ENVIRON["PRESET_SEC"]
  flag = ENVIRON["PRESET_FLAG"]
  val = ENVIRON["PRESET_VAL"]
  dq = sprintf("%c", 34)
  sq = sprintf("%c", 39)
  in_sec = 0
  changed = 0
}
function set_flag(line,    start, pos, after, rest, lead) {
  start = 1
  while ((pos = index(substr(line, start), " " flag)) > 0) {
    after = start + pos + length(flag)
    rest = substr(line, after)
    if (match(rest, /^([[:space:]]+|=)/)) {
      lead = RLENGTH
      rest = substr(rest, lead + 1)
      if (match(rest, /^-?([0-9]+([.][0-9]*)?|[.][0-9]+)/)) {
        changed = 1
        return substr(line, 1, after - 1 + lead) val substr(rest, RLENGTH + 1)
      }
    }
    start = after
  }
  return line
}
{
  line = $0
  if (line !~ /^[[:space:]]*$/) {
    match(line, /^ */)
    if (RLENGTH <= 2) {
      if (line !~ /^ *#/) {
        head = line
        sub(/[[:space:]]+$/, "", head)
        in_sec = (head == "  " sec ":" || head == "  " dq sec dq ":" || head == "  " sq sec sq ":")
      }
    } else if (in_sec) {
      line = set_flag(line)
    }
  }
  print line
}
END {
  if (NR == 0) exit 2
  if (!changed) exit 3
}
`.trim();

  // The preset path is left unquoted on purpose so a leading `~` still expands.
  const tmpPath = `${AI_SERVER_PRESET_PATH}.tmp`;
  const remoteCmd = [
    `PRESET_SEC=${quote(section)} PRESET_FLAG=${quote(flag)} PRESET_VAL=${quote(value)}`,
    `awk ${quote(awkProgram)} ${AI_SERVER_PRESET_PATH} > ${tmpPath};`,
    `rc=$?;`,
    `if [ "$rc" -eq 0 ]; then mv ${tmpPath} ${AI_SERVER_PRESET_PATH};`,
    `elif [ "$rc" -eq 3 ]; then rm -f ${tmpPath}; echo ${notFoundMarker};`,
    `else rm -f ${tmpPath}; exit "$rc"; fi`,
  ].join(" ");

  const stdout = await runSsh(remoteCmd);
  if (stdout.includes(notFoundMarker)) {
    throw new Error(
      `Key "${key}" not found for model "${section}" — add it to the preset manually first.`,
    );
  }
}
/**
 * Restarts the user-level systemd unit on the AI server and, if that
 * succeeds, queries its active state. Resolves with the ssh stdout.
 */
export async function restartService(): Promise<string> {
  const unit = AI_SERVER_SERVICE_UNIT;
  const steps = [
    `systemctl --user restart ${unit}`,
    `systemctl --user is-active ${unit}`,
  ];
  return runSsh(steps.join(" && "));
}
/**
 * Reloads one model: best-effort unload, a two-second pause, then a load.
 * Errors from the load step propagate to the caller.
 */
export async function reloadOneModel(id: string): Promise<void> {
  // Unload failures are ignored on purpose (model may not be loaded).
  await unloadModel(id).catch(() => undefined);

  // Router needs a beat to clear the slot.
  await new Promise<void>((resolve) => {
    setTimeout(resolve, 2000);
  });

  await loadModel(id);
}