pi-extensions/ai-server/admin.ts

import { exec as execCb } from "node:child_process";
import * as https from "node:https";
import { URL } from "node:url";
import { promisify } from "node:util";
import {
	AI_SERVER_MODELS_PATH,
	AI_SERVER_PRESET_PATH,
	AI_SERVER_RUNNING_PATH,
	AI_SERVER_SERVICE_UNIT,
	AI_SERVER_SSH_HOST,
	AI_SERVER_UNLOAD_ALL_PATH,
	AI_SERVER_UNLOAD_PATH,
	AI_SERVER_UPSTREAM_HEALTH_PATH,
	AI_SERVER_URL,
	type ServerModel,
	getAdminTimeoutMs,
	loadCerts,
} from "./config.js";
import {
	parseCtxMapFromYaml,
	extractCtxFromRunningCmd,
	isReasoningModel,
	isShardArtefact,
} from "./router-utils.js";

// Re-export so existing index.ts imports keep working.
export { isReasoningModel };

// Promise-returning wrapper around child_process.exec, so shell commands
// (the ssh invocations below) can be awaited instead of using callbacks.
const exec = promisify(execCb);

// ─── HTTP router API (via Caddy mTLS) ────────────────────────────────────

/**
 * Issue one HTTPS request against the router, authenticating with the
 * client certificate returned by `loadCerts()`.
 *
 * @param method HTTP verb to use.
 * @param path   Path (and optional query) appended to `AI_SERVER_URL`.
 * @param body   Optional payload; when given it is sent JSON-encoded.
 * @returns The parsed JSON body on a 2xx response. An empty body yields `{}`
 *          and a body that is not valid JSON yields `{ raw: <text> }`.
 * @throws Error on any non-2xx status (message carries the status and the
 *         first 400 characters of the body), on socket errors, or when the
 *         request exceeds `getAdminTimeoutMs()`.
 */
async function routerRequest(
	method: "GET" | "POST",
	path: string,
	body?: unknown,
): Promise<any> {
	const { cert, key } = loadCerts();
	const target = new URL(AI_SERVER_URL + path);
	const payload = body === undefined ? undefined : JSON.stringify(body);

	const headers: Record<string, string | number> = {
		Accept: "application/json",
	};
	if (payload !== undefined) {
		headers["Content-Type"] = "application/json";
		headers["Content-Length"] = Buffer.byteLength(payload);
	}

	// No `ca:` — server cert is LE-issued; Node's default Mozilla bundle
	// covers validation. Client cert/key still required for mTLS.
	const options: https.RequestOptions = {
		hostname: target.hostname,
		port: target.port ? Number(target.port) : 443,
		path: target.pathname + target.search,
		method,
		headers,
		cert,
		key,
		timeout: getAdminTimeoutMs(),
	};

	return new Promise((resolve, reject) => {
		const req = https.request(options, (res) => {
			const pieces: string[] = [];
			res.setEncoding("utf-8");
			res.on("data", (piece: string) => {
				pieces.push(piece);
			});
			res.on("end", () => {
				const text = pieces.join("");
				const status = res.statusCode ?? 0;
				const succeeded = status >= 200 && status < 300;
				if (!succeeded) {
					reject(new Error(`HTTP ${status}: ${text.slice(0, 400).trim()}`));
					return;
				}
				if (!text.trim()) {
					resolve({});
					return;
				}
				// Some endpoints answer with plain text ("OK"); hand that back
				// wrapped rather than failing the call.
				try {
					resolve(JSON.parse(text));
				} catch {
					resolve({ raw: text });
				}
			});
			res.on("error", reject);
		});

		req.on("error", reject);
		// The "timeout" event alone does not abort the request; destroying it
		// surfaces the failure through the "error" handler above.
		req.on("timeout", () => req.destroy(new Error("Router request timed out")));
		if (payload !== undefined) req.write(payload);
		req.end();
	});
}

/**
 * One model entry as returned by `listModels()`.
 *
 * `id`, `object`, `created` and `owned_by` are passed through from the
 * router's model-list response; `running` is added locally by
 * `listModels()` from the running-models endpoint.
 */
export interface RouterModel {
	/** Model identifier; also the key used for load/unload calls. */
	id: string;
	object?: string;
	/** NOTE(review): presumably a Unix timestamp, as in OpenAI-style model lists — confirm. */
	created?: number;
	owned_by?: string;
	/** Whether the model is currently loaded in llama-swap. */
	running?: boolean;
}

/**
 * Fetch every model the router knows about and mark which ones are
 * currently loaded.
 *
 * @returns The entries from the model-list endpoint, each with `running`
 *          set to `true` when the running-models endpoint lists it.
 * @throws Error if either router request fails.
 */
export async function listModels(): Promise<RouterModel[]> {
	// llama-swap: GET /v1/models returns { data: [{ id, object, created, owned_by }] }
	// GET /running returns { running: [{ model, state, ... }] }
	// We merge: every model from /v1/models gets a `running` flag from /running.
	const [modelsRes, runningRes] = await Promise.all([
		routerRequest("GET", AI_SERVER_MODELS_PATH),
		routerRequest("GET", AI_SERVER_RUNNING_PATH),
	]);

	// A plain-text or malformed answer must not blow up the iteration below.
	const models: RouterModel[] = Array.isArray(modelsRes?.data)
		? (modelsRes.data as RouterModel[])
		: [];

	const runningIds = new Set<string>();
	if (Array.isArray(runningRes?.running)) {
		for (const entry of runningRes.running as Record<string, unknown>[]) {
			// Running entries are keyed by `model` (see RunningEntry); `id` is
			// kept as a fallback so a router that reports `id` keeps working.
			const name = entry?.model ?? entry?.id;
			if (name) runningIds.add(String(name));
		}
	}
	for (const m of models) {
		m.running = runningIds.has(m.id);
	}
	return models;
}

// Short TTL cache for listModels — tab-completion calls the completer on
// every Tab press, which would otherwise fire an HTTPS round-trip each
// time. Five seconds is long enough to dedupe back-to-back completions
// but short enough that a /ai-server-load still sees near-fresh state.
/** How long a fetched model list is served from memory, in milliseconds. */
const LIST_MODELS_TTL_MS = 5_000;
/** Last fetched list and the `Date.now()` timestamp it was stored at. */
let cachedList: { at: number; models: RouterModel[] } | null = null;

/**
 * Same result as `listModels()`, but reuses the previous answer while it is
 * younger than `LIST_MODELS_TTL_MS`.
 *
 * @returns The cached list when still fresh, otherwise a newly fetched one
 *          (which then becomes the cached value).
 */
export async function listModelsCached(): Promise<RouterModel[]> {
	const hit = cachedList;
	const expired =
		!hit || !(Date.now() - hit.at < LIST_MODELS_TTL_MS);
	if (expired) {
		const fresh = await listModels();
		cachedList = { at: Date.now(), models: fresh };
		return fresh;
	}
	return hit.models;
}

/**
 * Drop the cached model list so the next `listModelsCached()` call fetches
 * fresh data from the router.
 */
export function invalidateListModelsCache(): void {
	cachedList = null;
}

/**
 * Ask the router to warm-load a model.
 *
 * @param id Model identifier as listed by the router.
 * @returns Whatever the router answered (parsed JSON, or `{ raw }` for text).
 * @throws Error if the router request fails or times out.
 */
export async function loadModel(id: string): Promise<unknown> {
	// llama-swap: GET /upstream/<id>/health forces a spawn (warm load).
	// 2xx = success; plain text OK body is acceptable.
	try {
		return await routerRequest("GET", AI_SERVER_UPSTREAM_HEALTH_PATH(id));
	} finally {
		// Invalidate on failure too: a request that timed out on our side may
		// still have started the model, so the cached list cannot be trusted.
		invalidateListModelsCache();
	}
}

/**
 * Ask the router to unload a single model.
 *
 * @param id Model identifier as listed by the router.
 * @returns Whatever the router answered (parsed JSON, or `{ raw }` for text).
 * @throws Error if the router request fails or times out.
 */
export async function unloadModel(id: string): Promise<unknown> {
	// llama-swap: POST /api/models/unload/<id>, no body. Returns plain text "OK".
	try {
		return await routerRequest("POST", AI_SERVER_UNLOAD_PATH(id));
	} finally {
		// Invalidate on failure too: the router may have acted on the request
		// even though we never saw a successful response.
		invalidateListModelsCache();
	}
}

/**
 * Ask the router to unload every loaded model.
 *
 * @returns Whatever the router answered (parsed JSON, or `{ raw }` for text).
 * @throws Error if the router request fails or times out.
 */
export async function unloadAll(): Promise<unknown> {
	// llama-swap: POST /api/models/unload, no body.
	try {
		return await routerRequest("POST", AI_SERVER_UNLOAD_ALL_PATH);
	} finally {
		// Invalidate on failure too: some models may already have been
		// unloaded before the request failed or timed out.
		invalidateListModelsCache();
	}
}

// llama-swap /v1/models only returns registered presets (all have a model
// path). Placeholder sections are not exposed. We only filter out shard
// artefacts.

/**
 * One element of the `running` array returned by the router's
 * running-models endpoint (see `listRunning()`).
 */
interface RunningEntry {
	/** Model identifier; `discoverModels()` matches it against model ids. */
	model: string;
	/** Command line of the live process; parsed for its context size when present. */
	cmd?: string;
	// The remaining fields are not read anywhere in this file.
	// NOTE(review): meanings below are assumed from the names — confirm against llama-swap.
	state?: string; // presumably the process lifecycle state
	ttl?: number; // presumably the idle time-to-live
	proxy?: string; // presumably the upstream address being proxied
}

/**
 * Fetch the raw list of running models from the router.
 *
 * @returns The `running` array of the response, or an empty array when the
 *          response carries no such array.
 * @throws Error if the router request fails.
 */
async function listRunning(): Promise<RunningEntry[]> {
	const payload: any = await routerRequest("GET", AI_SERVER_RUNNING_PATH);
	const entries = payload?.running;
	if (!Array.isArray(entries)) {
		return [];
	}
	return entries;
}

/**
 * Build the list of models to register, with a context window resolved for
 * each one.
 *
 * The model list itself is mandatory; the running-process list and the
 * preset file are best-effort and fall back to "nothing known" on failure.
 *
 * @returns One `ServerModel` per router model that is not a shard artefact.
 * @throws Error if fetching the model list fails.
 */
export async function discoverModels(): Promise<ServerModel[]> {
	const [routerModels, liveEntries, presetYaml] = await Promise.all([
		listModels(),
		listRunning().catch(() => [] as RunningEntry[]),
		readPreset().catch(() => ""),
	]);

	const presetCtx = parseCtxMapFromYaml(presetYaml);

	// Context sizes recovered from the command lines of live processes.
	const liveCtx = new Map<string, number>();
	liveEntries.forEach((entry) => {
		const size = extractCtxFromRunningCmd(entry.cmd);
		if (size) liveCtx.set(entry.model, size);
	});

	const discovered: ServerModel[] = [];
	for (const { id } of routerModels) {
		if (isShardArtefact(id)) continue;

		// Precedence: live process, then config.yaml, then a fixed fallback.
		const contextWindow = liveCtx.get(id) ?? presetCtx.get(id) ?? 32768;
		// Half the context, clamped to the range [2048, 16384].
		const halfWindow = Math.floor(contextWindow / 2);
		const maxTokens = Math.min(16384, Math.max(2048, halfWindow));

		discovered.push({
			id,
			name: `${id} (AI Server)`,
			reasoning: isReasoningModel(id),
			contextWindow,
			maxTokens,
		});
	}
	return discovered;
}

// ─── SSH helpers ─────────────────────────────────────────────────────────

/**
 * Wrap a string in single quotes for a POSIX shell.
 *
 * Each embedded single quote is rewritten as `'\''` (close the quote, emit
 * an escaped quote, reopen), so the shell reads the result as one literal
 * word.
 *
 * @param s Text to quote; may be empty.
 * @returns The single-quoted form of `s`.
 */
function shQuote(s: string): string {
	const escaped = s.split("'").join("'\\''");
	return "'" + escaped + "'";
}

/**
 * Run a shell command on the AI server over ssh and return its stdout.
 *
 * ssh runs in batch mode (no password prompts) with a 5 second connect
 * timeout, and accepts the host key of a host it has not seen before.
 *
 * @param remoteCmd Command line handed to the remote shell.
 * @param timeoutMs Upper bound for the whole invocation, in milliseconds.
 * @returns Everything the remote command wrote to stdout.
 * @throws Error prefixed with `ssh <host>:`, carrying stderr when there is
 *         any and the underlying error message otherwise.
 */
async function runSsh(remoteCmd: string, timeoutMs = 60_000): Promise<string> {
	const commandLine = [
		"ssh",
		"-o BatchMode=yes",
		"-o ConnectTimeout=5",
		"-o StrictHostKeyChecking=accept-new",
		shQuote(AI_SERVER_SSH_HOST),
		shQuote(remoteCmd),
	].join(" ");
	const execOptions = { timeout: timeoutMs, maxBuffer: 4 * 1024 * 1024 };

	try {
		const result = await exec(commandLine, execOptions);
		return result.stdout;
	} catch (err: any) {
		// Prefer what the remote side printed over Node's generic message.
		const stderrText = (err?.stderr ?? "").toString().trim();
		let reason = stderrText;
		if (!reason) {
			reason = err?.message || String(err);
		}
		throw new Error(`ssh ${AI_SERVER_SSH_HOST}: ${reason}`);
	}
}

/**
 * Read the preset/config file from the AI server.
 *
 * @returns The file's full text as printed by `cat` on the remote host.
 * @throws Error if the ssh invocation fails.
 */
export async function readPreset(): Promise<string> {
	const catCommand = `cat ${AI_SERVER_PRESET_PATH}`;
	return runSsh(catCommand);
}

/**
 * Replace the numeric argument of a llama-server CLI flag inside one model's
 * block of the llama-swap config file on the AI server.
 *
 * llama-swap config.yaml structure (relevant excerpt):
 *
 *   models:
 *     Qwen_Qwen3.6-35B-A3B-Q8_0:
 *       cmd: |
 *         /path/to/llama-server --model /path/to/gguf ...
 *         --ctx-size 32768
 *         --temp 0.7
 *
 * The edit runs remotely as an awk script: it finds the `<section>:` header
 * at two-space indentation, and on every following line that is indented
 * deeper than that header it replaces the number after `<flag>` with
 * `value`. The result goes to `<preset>.tmp`, which replaces the original
 * only when at least one replacement was made.
 *
 * Supported keys: ctx-size, temp, n-gpu-layers. Any other key is treated as
 * the flag `--<key>`.
 *
 * @param section Model id, i.e. the YAML key of the block to edit.
 * @param key     Short flag name (see above).
 * @param value   New argument, inserted verbatim.
 * @throws Error if an argument contains a NUL or line-break character, if
 *         the flag (followed by a number) is not present in the section, or
 *         if the ssh invocation fails.
 */
export async function setPresetKey(
	section: string,
	key: string,
	value: string,
): Promise<void> {
	// Map short key names to the actual CLI flag used in cmd. A Map (not an
	// object literal) so keys such as "constructor" cannot resolve to
	// inherited Object.prototype members.
	const flagByKey = new Map<string, string>([
		["ctx-size", "--ctx-size"],
		["temp", "--temp"],
		["n-gpu-layers", "--n-gpu-layers"],
	]);
	const flag = flagByKey.get(key) ?? `--${key}`;

	// A line break could never match a single config line and, written back
	// as a value, would corrupt the YAML; NUL cannot travel through the
	// environment at all.
	const inputs: ReadonlyArray<readonly [string, string]> = [
		["section", section],
		["key", key],
		["value", value],
	];
	for (const [label, text] of inputs) {
		if (/[\0\r\n]/.test(text)) {
			throw new Error(
				`setPresetKey: ${label} must not contain NUL or line-break characters`,
			);
		}
	}

	// Printed to stderr by the remote script when nothing was replaced;
	// runSsh puts stderr into the error message, which is how the condition
	// is recognised below.
	const notFoundMarker = "ai-server-preset-flag-not-found";

	// The awk program is a constant: section, flag and value reach it through
	// the environment (ENVIRON), so they are never parsed as awk source, as a
	// regular expression, or as a `sub()` replacement (where `&` is special).
	const awkProgram = String.raw`
# Replace the number following the first stand-alone occurrence of flag.
function edit(line,    from, hit, pos, prev, rest, gap, after, tail) {
  from = 1
  while ((hit = index(substr(line, from), flag)) > 0) {
    pos = from + hit - 1
    prev = (pos > 1) ? substr(line, pos - 1, 1) : " "
    rest = substr(line, pos + length(flag))
    # The flag must be a whole word: blank before it, blank(s) after it.
    if ((prev == " " || prev == "\t") && match(rest, /^[ \t]+/)) {
      gap = substr(rest, 1, RLENGTH)
      after = substr(rest, RLENGTH + 1)
      # Integers and decimals, optionally signed: 32768, 0.7, -1.
      if (match(after, /^[-+]?[0-9]*[.]?[0-9]+/)) {
        tail = substr(after, RLENGTH + 1)
        if (tail == "" || tail ~ /^[ \t\r]/) {
          changed = 1
          return substr(line, 1, pos - 1) flag gap val tail
        }
      }
    }
    from = pos + 1
  }
  return line
}
BEGIN {
  sec = ENVIRON["PRESET_SEC"]
  flag = ENVIRON["PRESET_FLAG"]
  val = ENVIRON["PRESET_VAL"]
  in_sec = 0
  changed = 0
}
{
  line = $0
  text = line
  sub(/^[ \t]+/, "", text)
  indent = length(line) - length(text)
  sub(/[ \t\r]+$/, "", text)
  if (text != "" && indent <= 2) {
    # Any non-blank line at indent <= 2 starts another entry; editing is
    # enabled only while inside the requested one. The header line itself
    # is never edited.
    in_sec = (indent == 2 && text == (sec ":"))
  } else if (in_sec) {
    line = edit(line)
  }
  print line
}
END {
  if (!changed) {
    print "${notFoundMarker}" > "/dev/stderr"
    exit 2
  }
}
`.trim();

	// Single-quote a word for the remote POSIX shell.
	const quote = (text: string): string => `'${text.replace(/'/g, `'\\''`)}'`;

	// The preset path is interpolated unquoted, as in readPreset(), so a path
	// relying on remote expansion (e.g. a leading `~`) keeps working.
	const presetPath = AI_SERVER_PRESET_PATH;
	const tmpPath = `${presetPath}.tmp`;
	const remoteScript =
		`PRESET_SEC=${quote(section)} PRESET_FLAG=${quote(flag)} PRESET_VAL=${quote(value)} ` +
		`awk ${quote(awkProgram)} ${presetPath} > ${tmpPath} ` +
		`&& mv ${tmpPath} ${presetPath} ` +
		// On any failure leave the original file untouched and clean up.
		`|| { rm -f ${tmpPath}; exit 1; }`;

	try {
		await runSsh(remoteScript);
	} catch (err: unknown) {
		const msg = err instanceof Error ? err.message : String(err);
		if (msg.includes(notFoundMarker)) {
			throw new Error(
				`Key "${key}" not found for model "${section}" — add it to the preset manually first.`,
			);
		}
		throw err;
	}
}

/**
 * Restart the AI server's user-level systemd unit and report its state.
 *
 * @returns stdout of `systemctl --user is-active`, which only runs when the
 *          restart command succeeded.
 * @throws Error if the ssh invocation or either systemctl command fails.
 */
export async function restartService(): Promise<string> {
	const unit = AI_SERVER_SERVICE_UNIT;
	const steps = [
		`systemctl --user restart ${unit}`,
		`systemctl --user is-active ${unit}`,
	];
	return runSsh(steps.join(" && "));
}

/**
 * Unload a model and load it again, e.g. so a changed preset takes effect.
 *
 * @param id Model identifier as listed by the router.
 * @throws Error if the load fails. Unload failures are ignored.
 */
export async function reloadOneModel(id: string): Promise<void> {
	// Ignore unload errors (model may not be loaded).
	await unloadModel(id).catch(() => undefined);

	// Router needs a beat to clear the slot.
	const settleMs = 2000;
	await new Promise<void>((resolve) => {
		setTimeout(resolve, settleMs);
	});

	await loadModel(id);
}