// pi-extensions/ai-server/index.ts

import type { ExtensionAPI } from "@mariozechner/pi-coding-agent";
import {
	discoverModels,
	listModels,
	listModelsCached,
	loadModel,
	readPreset,
	reloadOneModel,
	restartService,
	setPresetKey,
	unloadModel,
} from "./admin.js";
import {
	AI_SERVER_API_BASE,
	AI_SERVER_API_ID,
	AI_SERVER_PROVIDER_ID,
	AI_SERVER_URL,
	MODELS as STATIC_MODELS,
	type ServerModel,
} from "./config.js";
import { streamAiServer } from "./stream.js";

/**
 * Argument completer shared by the admin commands that take a model id.
 *
 * Resolves to one `{ value, label }` suggestion per model whose id starts
 * with `prefix`, or to `null` when nothing matches or the lookup throws.
 */
async function completeModelId(prefix: string) {
	try {
		// The cached listing is used on purpose: the completer fires on every
		// keystroke, so a short-lived cache spares the server repeated
		// round-trips while the user is typing a single model id.
		const available = await listModelsCached();
		const suggestions = available.flatMap(({ id }) =>
			id.startsWith(prefix) ? [{ value: id, label: id }] : [],
		);
		if (suggestions.length === 0) {
			return null;
		}
		return suggestions;
	} catch {
		// Best effort: a failed lookup simply means "no completions".
		return null;
	}
}

/**
 * Registers the ai-server provider on `pi`, exposing the given models.
 *
 * Every model is advertised as text-only with zero cost, and all requests
 * are streamed through `streamAiServer`.
 */
function registerProviderWithModels(
	pi: ExtensionAPI,
	models: ServerModel[],
): void {
	pi.registerProvider(AI_SERVER_PROVIDER_ID, {
		baseUrl: AI_SERVER_API_BASE,
		apiKey: "ai-server-mtls",
		api: AI_SERVER_API_ID as any,
		models: models.map(
			({ id, name, reasoning, contextWindow, maxTokens }) => {
				return {
					id,
					name,
					reasoning,
					input: ["text"],
					cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
					contextWindow,
					maxTokens,
					// This compat block is informational only. The custom
					// `streamSimple` assembles the request body on its own
					// (see stream.ts); pi-mono's model-list UI / capability
					// detection is what reads these fields.
					compat: {
						thinkingFormat: "qwen-chat-template",
						supportsReasoningEffort: reasoning,
						supportsDeveloperRole: false,
						supportsUsageInStreaming: true,
						maxTokensField: "max_tokens",
					},
				};
			},
		),
		streamSimple: streamAiServer,
	});
}

// Upper bound, in milliseconds, on how long extension startup waits for live
// model discovery before carrying on with the static fallback model list.
const DISCOVERY_FAST_TIMEOUT_MS = 300;

/**
 * Turns a caught value into a printable message.
 *
 * Returns `.message` for `Error` instances and for any object carrying a
 * string `message`; every other thrown value is stringified, so a
 * notification never shows "undefined".
 */
function errorMessage(err: unknown): string {
	if (err instanceof Error) {
		return err.message;
	}
	if (typeof err === "object" && err !== null && "message" in err) {
		const { message } = err as { message: unknown };
		if (typeof message === "string") {
			return message;
		}
	}
	return String(err);
}

/**
 * Extension entry point.
 *
 * Registers the ai-server provider (static fallback first, then the live
 * model list once discovery answers) and the `/ai-server-*` admin commands.
 *
 * @param pi Extension API handle supplied by the host.
 * @returns A promise that settles once the fast discovery window has elapsed
 *   (or discovery answered sooner) and all commands are registered.
 */
export default async function (pi: ExtensionAPI) {
	// Register the provider IMMEDIATELY with the static fallback list so pi
	// startup isn't blocked on the HTTPS round-trip in the worst case.
	registerProviderWithModels(pi, STATIC_MODELS);

	// Then race real discovery against a short timeout. If discovery answers
	// within the window, the live list is registered before startup
	// continues; otherwise the fallback stays in place and the background
	// promise re-registers later. Failures are folded into `null`, so
	// `discovery` itself never rejects.
	const discovery = discoverModels().catch((err) => {
		if (process.env.PI_DEBUG) {
			console.log(
				`[ai-server] Discovery failed (${errorMessage(err)}); fallback remains`,
			);
		}
		return null;
	});
	let timer: ReturnType<typeof setTimeout> | undefined;
	const timeout = new Promise<null>((resolve) => {
		timer = setTimeout(() => resolve(null), DISCOVERY_FAST_TIMEOUT_MS);
	});
	const fastResult = await Promise.race([discovery, timeout]);
	// Cancel the timer once the race is decided so a fast discovery does not
	// leave a pending timeout keeping the event loop alive.
	clearTimeout(timer);

	if (fastResult && fastResult.length > 0) {
		registerProviderWithModels(pi, fastResult);
		if (process.env.PI_DEBUG) {
			console.log(
				`[ai-server] Discovered ${fastResult.length} model(s) on ${AI_SERVER_URL}: ${fastResult.map((m) => m.id).join(", ")}`,
			);
		}
	} else {
		// Slow network or discovery still pending — keep waiting in the
		// background and update the provider once it arrives. The trailing
		// catch keeps a throwing re-registration from surfacing as an
		// unhandled promise rejection.
		void discovery
			.then((models) => {
				if (models && models.length > 0) {
					registerProviderWithModels(pi, models);
					if (process.env.PI_DEBUG) {
						console.log(
							`[ai-server] Late discovery: ${models.length} model(s)`,
						);
					}
				}
			})
			.catch((err: unknown) => {
				if (process.env.PI_DEBUG) {
					console.log(
						`[ai-server] Late registration failed: ${errorMessage(err)}`,
					);
				}
			});
	}

	// ─── Admin commands ──────────────────────────────────────────────────

	// Lists every model known to the router together with its loaded state.
	// NOTE(review): the description mentions ctx sizes, but only the id and
	// loaded/unloaded state are printed here — confirm which is intended.
	pi.registerCommand("ai-server-status", {
		description: "Show ai-server model status and ctx sizes",
		handler: async (_args, ctx) => {
			try {
				const routerModels = await listModels();
				const lines = [`AI Server: ${AI_SERVER_URL}`];
				for (const m of routerModels) {
					const status = m.running ? "loaded" : "unloaded";
					lines.push(`  ${m.id}  [${status}]`);
				}
				ctx.ui.notify(lines.join("\n"), "info");
			} catch (err) {
				ctx.ui.notify(
					`ai-server-status failed: ${errorMessage(err)}`,
					"error",
				);
			}
		},
	});

	// Re-runs discovery on demand and replaces the registered model list.
	// An empty result leaves the current registration untouched.
	pi.registerCommand("ai-server-refresh", {
		description: "Re-discover models from the ai-server router",
		handler: async (_args, ctx) => {
			try {
				const discovered = await discoverModels();
				if (discovered.length === 0) {
					ctx.ui.notify(
						"No runnable models on server (all presets missing --model path)",
						"warning",
					);
					return;
				}
				registerProviderWithModels(pi, discovered);
				ctx.ui.notify(
					`Registered ${discovered.length} model(s): ${discovered
						.map((m) => m.id)
						.join(", ")}`,
					"info",
				);
			} catch (err) {
				ctx.ui.notify(`Refresh failed: ${errorMessage(err)}`, "error");
			}
		},
	});

	// Loads one model by id, showing a status line while the load is running.
	pi.registerCommand("ai-server-load", {
		description: "Load a model (usage: /ai-server-load <id>)",
		getArgumentCompletions: completeModelId,
		handler: async (args, ctx) => {
			const id = args.trim();
			if (!id) {
				ctx.ui.notify("Usage: /ai-server-load <model-id>", "error");
				return;
			}
			try {
				ctx.ui.setStatus("ai-server", `Loading ${id}…`);
				await loadModel(id);
				ctx.ui.notify(`Loaded ${id}`, "info");
			} catch (err) {
				ctx.ui.notify(`Load failed: ${errorMessage(err)}`, "error");
			} finally {
				// Always drop the status line, whichever path was taken.
				ctx.ui.setStatus("ai-server", undefined);
			}
		},
	});

	// Unloads one model by id.
	pi.registerCommand("ai-server-unload", {
		description: "Unload a model (usage: /ai-server-unload <id>)",
		getArgumentCompletions: completeModelId,
		handler: async (args, ctx) => {
			const id = args.trim();
			if (!id) {
				ctx.ui.notify("Usage: /ai-server-unload <model-id>", "error");
				return;
			}
			try {
				await unloadModel(id);
				ctx.ui.notify(`Unloaded ${id}`, "info");
			} catch (err) {
				ctx.ui.notify(`Unload failed: ${errorMessage(err)}`, "error");
			}
		},
	});

	// Writes a new ctx-size into the model's preset and reloads that model,
	// after the user confirms. Sizes must be integers of at least 512.
	pi.registerCommand("ai-server-ctx", {
		description:
			"Set ctx-size for a model and reload (usage: /ai-server-ctx <id> <size>)",
		getArgumentCompletions: completeModelId,
		handler: async (args, ctx) => {
			const parts = args.trim().split(/\s+/).filter(Boolean);
			if (parts.length !== 2) {
				ctx.ui.notify("Usage: /ai-server-ctx <model-id> <size>", "error");
				return;
			}
			const [id, sizeStr] = parts;
			const size = Number(sizeStr);
			if (!Number.isInteger(size) || size < 512) {
				ctx.ui.notify(`Invalid size: ${sizeStr}`, "error");
				return;
			}
			const ok = await ctx.ui.confirm(
				"Set ctx-size?",
				`Edit preset [${id}] → ctx-size=${size} and reload the model?`,
			);
			if (!ok) return;
			try {
				ctx.ui.setStatus("ai-server", "Editing preset…");
				await setPresetKey(id, "ctx-size", String(size));
				ctx.ui.setStatus("ai-server", `Reloading ${id}…`);
				await reloadOneModel(id);
				ctx.ui.notify(`${id}: ctx-size=${size}, reloaded`, "info");
			} catch (err) {
				ctx.ui.notify(`ctx update failed: ${errorMessage(err)}`, "error");
			} finally {
				// Always drop the status line, whichever step failed.
				ctx.ui.setStatus("ai-server", undefined);
			}
		},
	});

	// Shows the text returned by `readPreset()` as a notification.
	pi.registerCommand("ai-server-preset", {
		description: "Print llama-swap config on the ai-server",
		handler: async (_args, ctx) => {
			try {
				const text = await readPreset();
				ctx.ui.notify(text, "info");
			} catch (err) {
				ctx.ui.notify(
					`Preset read failed: ${errorMessage(err)}`,
					"error",
				);
			}
		},
	});

	// Restarts the remote service after an explicit confirmation and reports
	// the status string returned by `restartService()`.
	pi.registerCommand("ai-server-restart", {
		description: "Restart the ai-server llama-swap service",
		handler: async (_args, ctx) => {
			const ok = await ctx.ui.confirm(
				"Restart llama-server?",
				"This unloads all models and kills in-flight requests.",
			);
			if (!ok) return;
			try {
				ctx.ui.setStatus("ai-server", "Restarting…");
				const status = await restartService();
				ctx.ui.notify(`Service: ${status.trim()}`, "info");
			} catch (err) {
				ctx.ui.notify(`Restart failed: ${errorMessage(err)}`, "error");
			} finally {
				// Always drop the status line, whichever path was taken.
				ctx.ui.setStatus("ai-server", undefined);
			}
		},
	});
}