// pi-extensions/ai-server/admin.ts

import { exec as execCb } from "node:child_process";
import * as https from "node:https";
import { URL } from "node:url";
import { promisify } from "node:util";
import {
	AI_SERVER_PRESET_PATH,
	AI_SERVER_SSH_HOST,
	AI_SERVER_URL,
	type ServerModel,
	loadCerts,
} from "./config.js";

// Promise-returning form of child_process.exec; callers await it and read `stdout` from the result.
const exec = promisify(execCb);

// ─── HTTP router API (via Caddy mTLS) ────────────────────────────────────

/**
 * Issue one HTTPS request against the router at AI_SERVER_URL, presenting the
 * CA / client certificate / key returned by `loadCerts()`.
 *
 * @param method HTTP verb to use.
 * @param path   Path (and optional query) appended verbatim to AI_SERVER_URL.
 * @param body   Optional payload; when given it is sent as a JSON document.
 * @returns The parsed JSON response, `{}` for a blank body, or `{ raw }`
 *          carrying the unparsed text when the body is not valid JSON.
 * @throws Error `HTTP <status>: <first 400 chars of body>` for non-2xx
 *         responses, "Router request timed out" after 30 s of socket
 *         inactivity, or whatever transport error the request emits.
 */
async function routerRequest(
	method: "GET" | "POST",
	path: string,
	body?: unknown,
): Promise<any> {
	const tls = loadCerts();
	const target = new URL(AI_SERVER_URL + path);
	const payload = body === undefined ? undefined : JSON.stringify(body);

	// Entity headers are only attached when there is something to send.
	const headers: Record<string, string | number> = {
		Accept: "application/json",
	};
	if (payload !== undefined) {
		headers["Content-Type"] = "application/json";
		headers["Content-Length"] = Buffer.byteLength(payload);
	}

	return new Promise((resolve, reject) => {
		const request = https.request(
			{
				hostname: target.hostname,
				// No explicit port in the URL means the HTTPS default.
				port: target.port ? Number(target.port) : 443,
				path: target.pathname + target.search,
				method,
				headers,
				ca: tls.ca,
				cert: tls.cert,
				key: tls.key,
				timeout: 30_000,
			},
			(response) => {
				const pieces: string[] = [];
				response.setEncoding("utf-8");
				response.on("data", (piece: string) => {
					pieces.push(piece);
				});
				response.on("error", reject);
				response.on("end", () => {
					const text = pieces.join("");
					const status = response.statusCode ?? 0;
					const succeeded = status >= 200 && status < 300;
					if (!succeeded) {
						reject(new Error(`HTTP ${status}: ${text.slice(0, 400).trim()}`));
						return;
					}
					if (!text.trim()) {
						resolve({});
						return;
					}
					try {
						resolve(JSON.parse(text));
					} catch {
						// Successful status but not JSON: surface the text as-is.
						resolve({ raw: text });
					}
				});
			},
		);
		request.on("error", reject);
		// The "timeout" event alone does not abort the request; destroying it
		// routes the failure through the "error" handler above.
		request.on("timeout", () => {
			request.destroy(new Error("Router request timed out"));
		});
		if (payload !== undefined) {
			request.write(payload);
		}
		request.end();
	});
}

/**
 * One entry of the `data` array that `listModels()` reads from the router's
 * `GET /models` response.
 */
export interface RouterModel {
	/** Model identifier; sent back as `model` when loading or unloading. */
	id: string;
	/**
	 * `value` is the load state; `args` is the argument list for the model, in
	 * which this file looks up `--model` and `--ctx-size`.
	 */
	status: { value: "loaded" | "unloaded" | "loading"; args: string[] };
}

/**
 * Fetch the router's model list.
 *
 * @returns The `data` array of the `GET /models` response, or an empty array
 *          when the response has no `data` field.
 */
export async function listModels(): Promise<RouterModel[]> {
	const response = await routerRequest("GET", "/models");
	const entries = response?.data ?? [];
	return entries as RouterModel[];
}

/**
 * Ask the router to load a model.
 *
 * @param id Model identifier as reported by `listModels()`.
 * @returns Whatever the router answers to `POST /models/load`.
 */
export async function loadModel(id: string): Promise<unknown> {
	// The field must be named `model`: the router's handler reads
	// `body["model"]`, and sending `{id}` instead yields a 404.
	const payload = { model: id };
	return routerRequest("POST", "/models/load", payload);
}

/**
 * Ask the router to unload a model.
 *
 * @param id Model identifier as reported by `listModels()`.
 * @returns Whatever the router answers to `POST /models/unload`.
 */
export async function unloadModel(id: string): Promise<unknown> {
	const payload = { model: id };
	return routerRequest("POST", "/models/unload", payload);
}
/**
 * Read the context size configured for a model from its `--ctx-size` argument.
 *
 * @param m Router model entry whose `status.args` is inspected.
 * @returns The context size as a positive integer, or `null` when the flag is
 *          absent, has no following value, or the value is not a positive
 *          decimal integer.
 */
export function extractCtxSize(m: RouterModel): number | null {
	const args = m.status?.args ?? [];
	const flagAt = args.indexOf("--ctx-size");
	if (flagAt < 0 || flagAt + 1 >= args.length) return null;

	// Validate the text before converting: Number("") and Number(" ") are 0,
	// which would otherwise be reported as a real context size.
	const raw = String(args[flagAt + 1]).trim();
	if (!/^\d+$/.test(raw)) return null;

	// A size of 0 carries no usable window (llama.cpp treats it as "take the
	// size from the model"), so report it as unknown and let callers apply
	// their own default.
	const size = Number(raw);
	return Number.isSafeInteger(size) && size > 0 ? size : null;
}

// Only presets that carry a --model path can actually be started. Placeholder
// sections such as [small-7b] with no `model = ...` still appear in /models,
// but they lack the --model argument and loading them would fail.
function isRunnable(m: RouterModel): boolean {
	const launchArgs = m.status?.args ?? [];
	return launchArgs.some((arg) => arg === "--model");
}

// With --models-autoload, llama.cpp scans --models-dir and registers every
// .gguf it finds, including each shard of a split GGUF. A multi-shard model
// therefore shows up once per shard under ids like "<name>-00001-of-00003".
// Those entries duplicate the preset section that points at the first shard,
// so callers drop them.
function isShardArtefact(id: string): boolean {
	const shardSuffix = /-\d+-of-\d+$/;
	return shardSuffix.test(id);
}

/**
 * Build the list of usable models from the router's model list.
 *
 * Entries without a `--model` argument and per-shard duplicates are skipped.
 * The context window comes from `--ctx-size` (32768 when it cannot be
 * determined); the output limit is half the window, kept within
 * 2048..16384.
 */
export async function discoverModels(): Promise<ServerModel[]> {
	const discovered: ServerModel[] = [];
	for (const model of await listModels()) {
		if (!isRunnable(model) || isShardArtefact(model.id)) continue;

		const contextWindow = extractCtxSize(model) ?? 32768;
		const halfWindow = Math.floor(contextWindow / 2);
		discovered.push({
			id: model.id,
			name: `${model.id} (AI Server)`,
			reasoning: true,
			contextWindow,
			maxTokens: Math.min(16384, Math.max(2048, halfWindow)),
		});
	}
	return discovered;
}

// ─── SSH helpers ─────────────────────────────────────────────────────────

/**
 * Wrap a string in single quotes for a POSIX shell. Each embedded single
 * quote becomes `'\''` (close the quote, emit an escaped quote, reopen).
 */
function shQuote(s: string): string {
	const escaped = s.split("'").join("'\\''");
	return "'" + escaped + "'";
}

/**
 * Run a command on AI_SERVER_SSH_HOST through the local `ssh` client.
 *
 * @param remoteCmd Command line handed to ssh as a single quoted argument.
 * @param timeoutMs Limit passed to `exec` as its `timeout` option.
 * @returns The command's stdout.
 * @throws Error prefixed with `ssh <host>: ` and carrying the trimmed stderr,
 *         or the underlying error message when stderr is empty.
 */
async function runSsh(remoteCmd: string, timeoutMs = 60_000): Promise<string> {
	const commandLine = [
		"ssh",
		"-o BatchMode=yes",
		"-o ConnectTimeout=5",
		"-o StrictHostKeyChecking=accept-new",
		shQuote(AI_SERVER_SSH_HOST),
		shQuote(remoteCmd),
	].join(" ");
	const limits = { timeout: timeoutMs, maxBuffer: 4 * 1024 * 1024 };

	try {
		const result = await exec(commandLine, limits);
		return result.stdout;
	} catch (err: any) {
		// Prefer what the remote side (or ssh itself) printed to stderr.
		const remoteStderr = (err?.stderr ?? "").toString().trim();
		const detail = remoteStderr || err?.message || String(err);
		throw new Error(`ssh ${AI_SERVER_SSH_HOST}: ${detail}`);
	}
}

/**
 * Fetch the preset file's contents from the server with `cat` over SSH.
 *
 * AI_SERVER_PRESET_PATH is placed into the remote command unquoted, so the
 * remote shell expands it (and would split it on whitespace).
 */
export async function readPreset(): Promise<string> {
	const remoteCommand = `cat ${AI_SERVER_PRESET_PATH}`;
	return runSsh(remoteCommand);
}

/**
 * Set a `key = value` line inside a named [section] of the preset file.
 * Preserves comments and all other lines. Errors if the key is absent.
 *
 * The file is rewritten through `<path>.tmp` followed by `mv`, and only when
 * the key was found. Existing lines are matched whether or not there is
 * whitespace around `=`; the replacement is always written as `key = value`.
 *
 * @param section Section name without the surrounding brackets.
 * @param key     Key to replace within that section.
 * @param value   New value, written verbatim.
 * @throws Error when any argument contains a line break, when the key does
 *         not exist in the section, or when the SSH command fails.
 */
export async function setPresetKey(
	section: string,
	key: string,
	value: string,
): Promise<void> {
	// A line break in any field would add or split lines in the preset file.
	for (const [label, text] of Object.entries({ section, key, value })) {
		if (/[\r\n]/.test(text)) {
			throw new Error(`Preset ${label} must not contain line breaks.`);
		}
	}

	// Marker awk prints on stderr when the key is missing. The awk program
	// assembles it from two string literals so that the complete marker never
	// occurs in the command text itself: a failed exec includes the command in
	// its error message, and that must not be mistaken for the marker.
	const notFoundMarker = "preset-key-not-found";

	// Values travel through the environment (ENVIRON) rather than `awk -v`,
	// because -v assignments undergo backslash-escape processing. Every
	// caller-supplied string is shell-quoted. The preset path is left unquoted
	// so the remote shell still expands it.
	const awkScript = `
env PRESET_SEC=${shQuote(`[${section}]`)} PRESET_KEY=${shQuote(key)} PRESET_VAL=${shQuote(value)} awk '
  BEGIN { sec = ENVIRON["PRESET_SEC"]; key = ENVIRON["PRESET_KEY"]; val = ENVIRON["PRESET_VAL"]; in_s = 0; found = 0 }
  /^\\[/ { in_s = ($0 == sec) }
  in_s {
    eq = index($0, "=")
    if (eq > 0) {
      name = substr($0, 1, eq - 1)
      gsub(/^[ \\t]+|[ \\t]+$/, "", name)
      if (name == key) { print key " = " val; found = 1; next }
    }
  }
  { print }
  END { if (!found) { print "preset-key" "-not-found" > "/dev/stderr"; exit 2 } }
' ${AI_SERVER_PRESET_PATH} > ${AI_SERVER_PRESET_PATH}.tmp && mv ${AI_SERVER_PRESET_PATH}.tmp ${AI_SERVER_PRESET_PATH}
`.trim();

	try {
		await runSsh(awkScript);
	} catch (err: unknown) {
		const msg = err instanceof Error ? err.message : String(err);
		if (msg.includes(notFoundMarker)) {
			throw new Error(
				`Key "${key}" not found in [${section}] — add it to the preset manually first.`,
			);
		}
		throw err;
	}
}

/**
 * Restart the user-level llama-server systemd unit on the server and, if the
 * restart command succeeds, query its active state.
 *
 * @returns The stdout of the remote command chain.
 */
export async function restartService(): Promise<string> {
	const unit = "llama-server.service";
	const steps = [
		`systemctl --user restart ${unit}`,
		`systemctl --user is-active ${unit}`,
	];
	return runSsh(steps.join(" && "));
}

/**
 * Unload a model, wait two seconds, then load it again.
 *
 * Failures while unloading are deliberately ignored; failures while loading
 * propagate to the caller.
 */
export async function reloadOneModel(id: string): Promise<void> {
	const settleDelayMs = 2000;

	// Best effort: the model may simply not be loaded right now.
	await unloadModel(id).catch(() => undefined);

	// Give the router a moment to clear the slot before loading again.
	await new Promise<void>((done) => {
		setTimeout(done, settleDelayMs);
	});

	await loadModel(id);
}