feat(ai-server): wire pi settings → mTLS extension; per-model reasoning; configurable admin timeout

- config.ts: add getAdminTimeoutMs() reading from AI_SERVER_ADMIN_TIMEOUT_MS env or settings.json retry.provider.adminTimeoutMs (default = inference timeout, capped at 5min). Refactor settings access into a cached readPiSettings() helper shared by both timeout resolvers.
- stream.ts: forward options.reasoning (pi-mono's defaultThinkingLevel) to llama.cpp via chat_template_kwargs.enable_thinking + reasoning_effort, gated on per-model reasoning capability. Add TCP keepalive (30s) on the request socket to prevent NAT/middlebox idle drops during long silent prefills (root cause of the recent read ETIMEDOUT).
- router-utils.ts: add isReasoningModel(id) with a substring-match list of known reasoning families (MiniMax-M, Qwen3.6, Qwen3-Coder, Qwen3-VL, MiMo-V2, gpt-oss, Devstral). Unanchored to handle HF-style Org_Model ids.
- admin.ts: replace hardcoded 30s router HTTP timeout with getAdminTimeoutMs; use isReasoningModel(id) in discoverModels() instead of blanket reasoning: true.
- index.ts: add informational compat block (thinkingFormat, supportsReasoningEffort, maxTokensField, etc.) to model registrations so pi-mono's UI / capability detection reflects per-model reasoning support.
- tests: 3 new isReasoningModel test groups (positive, negative, unknown).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-28 19:33:56 +02:00
parent 5a1d738892
commit 98c140ac03
6 changed files with 192 additions and 11 deletions
@@ -7,12 +7,17 @@ import {
 	AI_SERVER_SSH_HOST,
 	AI_SERVER_URL,
 	type ServerModel,
+	getAdminTimeoutMs,
 	loadCerts,
 } from "./config.js";
-import { extractCtxSize, isShardArtefact } from "./router-utils.js";
+import {
+	extractCtxSize,
+	isReasoningModel,
+	isShardArtefact,
+} from "./router-utils.js";

 // Re-export so existing index.ts imports keep working.
-export { extractCtxSize };
+export { extractCtxSize, isReasoningModel };

 const exec = promisify(execCb);

@@ -47,7 +52,7 @@ async function routerRequest(
 				},
 				cert: certs.cert,
 				key: certs.key,
-				timeout: 30_000,
+				timeout: getAdminTimeoutMs(),
 			},
 			(res) => {
 				let buf = "";
@@ -137,7 +142,7 @@ export async function discoverModels(): Promise<ServerModel[]> {
 		return {
 			id: m.id,
 			name: `${m.id} (AI Server)`,
-			reasoning: true,
+			reasoning: isReasoningModel(m.id),
 			contextWindow: ctx,
 			maxTokens: Math.min(16384, Math.max(2048, Math.floor(ctx / 2))),
 		};
@@ -21,10 +21,86 @@ export const AI_SERVER_PRESET_PATH =
 export const AI_SERVER_API_ID = "ai-server-mtls";
 export const AI_SERVER_PROVIDER_ID = "ai-server";

-// 5 minutes — large models can take a while for the first token.
-export const REQUEST_TIMEOUT_MS = Number(
-	process.env.AI_SERVER_TIMEOUT_MS ?? 300_000,
-);
+// Resolve timeouts in priority order:
+//   1. <ENV_VAR>                          — explicit override (tests, one-offs)
+//   2. ~/.pi/agent/settings.json          — kept in sync with pi-mono so
+//                                           inner timeouts never give up
+//                                           before the outer wrapper does
+//   3. <DEFAULT_MS>                       — sensible fallback
+//
+// Location of the settings file consulted in step 2. The PI_SETTINGS_PATH
+// env var, when set, replaces the default location under HOME.
+const PI_SETTINGS_PATH =
+	process.env.PI_SETTINGS_PATH ?? path.join(HOME, ".pi/agent/settings.json");
+
+/**
+ * The `retry.provider` section of settings.json. Values come from
+ * unvalidated JSON, so they are typed `unknown` and must be checked
+ * before use.
+ */
+interface PiProviderRetrySettings {
+	/** Candidate value for the request timeout, in milliseconds. */
+	timeoutMs?: unknown;
+	/** Candidate value for the admin-call timeout, in milliseconds. */
+	adminTimeoutMs?: unknown;
+}
+
+/**
+ * Partial shape of the pi settings file: only the keys this module reads
+ * are declared, and every level is optional.
+ */
+interface PiSettings {
+	retry?: {
+		provider?: PiProviderRetrySettings;
+	};
+}
+
+// Parsed settings (null when the file could not be read or parsed) and a
+// flag recording that the one-time load attempt has already happened.
+let cachedSettings: PiSettings | null = null;
+let cachedSettingsLoaded = false;
+
+/**
+ * Returns the parsed settings file, reading it from disk at most once.
+ *
+ * A missing, unreadable or unparseable file is treated as "no settings":
+ * the failure is cached too, so the read is never retried.
+ *
+ * @returns The parsed settings, or an empty object when none are available.
+ */
+function readPiSettings(): PiSettings {
+	if (!cachedSettingsLoaded) {
+		// Flip the flag before touching the disk so a failing read is not
+		// repeated on every call.
+		cachedSettingsLoaded = true;
+		let parsed: PiSettings | null;
+		try {
+			const text = fs.readFileSync(PI_SETTINGS_PATH, "utf-8");
+			parsed = JSON.parse(text) as PiSettings;
+		} catch {
+			// settings missing / unreadable / unparseable — fall through
+			parsed = null;
+		}
+		cachedSettings = parsed;
+	}
+	return cachedSettings ?? {};
+}
+
+/**
+ * Returns the first candidate that is a finite number greater than zero.
+ *
+ * Only two input kinds are considered: real numbers, and strings (env
+ * vars), which are converted with `Number()`. Every other type is skipped.
+ * Blanket coercion would turn `true` into 1 and `[5000]` into 5000, so a
+ * malformed settings value could silently become a timeout.
+ *
+ * @param candidates Values to try, in priority order.
+ * @returns The first usable number, or `null` when none qualifies.
+ */
+function pickPositiveNumber(...candidates: unknown[]): number | null {
+	for (const candidate of candidates) {
+		let n: number;
+		if (typeof candidate === "number") {
+			n = candidate;
+		} else if (typeof candidate === "string") {
+			// Blank strings convert to 0 and are rejected by the check below.
+			n = Number(candidate);
+		} else {
+			continue;
+		}
+		if (Number.isFinite(n) && n > 0) return n;
+	}
+	return null;
+}
+
+/**
+ * Resolves a timeout from its configuration sources.
+ *
+ * @param envVar Raw environment-variable value; takes priority when usable.
+ * @param settingsValue Raw value from the settings file; tried second.
+ * @param defaultMs Result when neither source yields a positive number.
+ * @returns The timeout in milliseconds.
+ */
+function resolveTimeoutMs(
+	envVar: string | undefined,
+	settingsValue: unknown,
+	defaultMs: number,
+): number {
+	const configured = pickPositiveNumber(envVar, settingsValue);
+	return configured === null ? defaultMs : configured;
+}
+
+// Memoised results of the two timeout getters; null means "not resolved yet".
+let cachedRequestTimeoutMs: number | null = null;
+let cachedAdminTimeoutMs: number | null = null;
+
+/**
+ * Returns the request timeout in milliseconds, resolving it on first use.
+ *
+ * Sources, in priority order: the AI_SERVER_TIMEOUT_MS env var, then
+ * `retry.provider.timeoutMs` from the settings file, then 300_000
+ * (5 minutes). The result is cached, so later changes to either source
+ * are not picked up.
+ */
+export function getRequestTimeoutMs(): number {
+	if (cachedRequestTimeoutMs === null) {
+		const fromSettings = readPiSettings().retry?.provider?.timeoutMs;
+		cachedRequestTimeoutMs = resolveTimeoutMs(
+			process.env.AI_SERVER_TIMEOUT_MS,
+			fromSettings,
+			300_000,
+		);
+	}
+	return cachedRequestTimeoutMs;
+}
+
+/**
+ * Returns the timeout in milliseconds for admin calls (model
+ * load/unload/list), resolving it on first use and caching the result.
+ *
+ * Sources, in priority order: the AI_SERVER_ADMIN_TIMEOUT_MS env var, then
+ * `retry.provider.adminTimeoutMs` from the settings file. When neither is
+ * usable, the fallback is the request timeout capped at 300_000 (5 minutes).
+ */
+export function getAdminTimeoutMs(): number {
+	if (cachedAdminTimeoutMs === null) {
+		const providerSettings = readPiSettings().retry?.provider;
+		// Admin calls are usually quick, but a cold-cache load of a >100GB
+		// model can take much longer. Defaulting to "as patient as
+		// inference" avoids surprise admin-call failures during big loads;
+		// the cap stops a very long inference timeout from carrying over.
+		const fallbackMs = Math.min(getRequestTimeoutMs(), 300_000);
+		cachedAdminTimeoutMs = resolveTimeoutMs(
+			process.env.AI_SERVER_ADMIN_TIMEOUT_MS,
+			providerSettings?.adminTimeoutMs,
+			fallbackMs,
+		);
+	}
+	return cachedAdminTimeoutMs;
+}

 const CA_CERT_PATH =
 	process.env.AI_SERVER_CA ?? path.join(CERTS_DIR, "root-ca.pem");
@@ -52,6 +52,16 @@ function registerProviderWithModels(
 			cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
 			contextWindow: m.contextWindow,
 			maxTokens: m.maxTokens,
+			// Informational compat block. Our `streamSimple` is custom and
+			// builds the request body itself (see stream.ts), but pi-mono's
+			// model-list UI / capability detection reads these fields.
+			compat: {
+				thinkingFormat: "qwen-chat-template",
+				supportsReasoningEffort: m.reasoning,
+				supportsDeveloperRole: false,
+				supportsUsageInStreaming: true,
+				maxTokensField: "max_tokens",
+			},
 		})),
 		streamSimple: streamAiServer,
 	});
@@ -30,3 +30,32 @@ export function extractCtxSize(m: RouterModelMeta): number | null {
 export function isShardArtefact(id: string): boolean {
 	return /-\d+-of-\d+$/.test(id);
 }
+
+/**
+ * Model-id patterns that mark a model as reasoning (thinking-capable).
+ * discoverModels() uses them to set the `reasoning` flag on registered
+ * model entries.
+ *
+ * The list is deliberately conservative. A missed reasoning model only
+ * loses its thinking UI, which is recoverable. A false positive gets
+ * `enable_thinking: true` injected, which most chat templates ignore but
+ * some reject. Add new families as you load them.
+ *
+ * Patterns are case-insensitive and unanchored: HuggingFace-style ids are
+ * sometimes prefixed with the org name (e.g. "Qwen_Qwen3.6-35B-A3B-Q8_0"),
+ * so a leading ^ would miss them, and the substrings are distinctive
+ * enough that a mid-string match is still meaningful.
+ */
+const REASONING_MODEL_PATTERNS: RegExp[] = [
+	/MiniMax-M/i,
+	/Qwen3\.6/i,
+	/Qwen3-Coder/i,
+	/Qwen3-VL/i,
+	/MiMo-V2/i,
+	/gpt-oss/i,
+	/Devstral/i,
+];
+
+/**
+ * Reports whether a model id belongs to a known reasoning family.
+ *
+ * @param id Model identifier as reported by the router.
+ * @returns `true` when any pattern matches anywhere in `id`; `false` for
+ *   unknown ids and for the empty string.
+ */
+export function isReasoningModel(id: string): boolean {
+	for (const pattern of REASONING_MODEL_PATTERNS) {
+		if (pattern.test(id)) return true;
+	}
+	return false;
+}
@@ -15,7 +15,7 @@ import {
 import {
 	AI_SERVER_CHAT_PATH,
 	AI_SERVER_URL,
-	REQUEST_TIMEOUT_MS,
+	getRequestTimeoutMs,
 	loadCerts,
 } from "./config.js";
 import {
@@ -130,6 +130,26 @@ export function streamAiServer(
 				stream_options: { include_usage: true },
 			};

+			// Reasoning / thinking-level forwarding. pi-mono passes
+			// `options.reasoning` (a ThinkingLevel: minimal|low|medium|high|xhigh)
+			// from `defaultThinkingLevel` in ~/.pi/agent/settings.json. Forward
+			// it to llama.cpp two ways simultaneously so both qwen-style chat
+			// templates and openai-style providers see the directive:
+			//   • chat_template_kwargs.enable_thinking — Qwen3 / MiMo / Devstral
+			//     family templates respect this boolean.
+			//   • reasoning_effort                     — passed through as a
+			//     chat-template kwarg by llama-server; the few templates that
+			//     read it (gpt-oss, MiniMax) get full granularity.
+			// Skip entirely for non-reasoning models so we don't poison their
+			// chat templates with kwargs they don't understand.
+			const reasoning = options?.reasoning;
+			if (reasoning && model.reasoning) {
+				bodyObj.chat_template_kwargs = {
+					enable_thinking: reasoning !== "minimal",
+				};
+				bodyObj.reasoning_effort = reasoning;
+			}
+
 			const openaiTools = toolsToOpenAI(context.tools);
 			if (openaiTools) {
 				bodyObj.tools = openaiTools;
@@ -141,6 +161,7 @@ export function streamAiServer(
 			const body = JSON.stringify(bodyObj);
 			const certs = loadCerts();
 			const url = new URL(AI_SERVER_URL + AI_SERVER_CHAT_PATH);
+			const requestTimeoutMs = getRequestTimeoutMs();

 			// No `ca:` — server cert is publicly-trusted (Let's Encrypt), so
 			// rely on Node's default Mozilla CA bundle. mTLS client auth still
@@ -157,7 +178,16 @@ export function streamAiServer(
 				},
 				cert: certs.cert,
 				key: certs.key,
-				timeout: REQUEST_TIMEOUT_MS,
+				timeout: requestTimeoutMs,
+			});
+
+			// TCP keepalive: after 30s of idle the kernel starts sending
+			// keepalive probes. Stops NAT / stateful firewalls on the LAN path
+			// from silently dropping the flow during long prefills (when
+			// llama.cpp emits no SSE bytes yet) and surfaces real drops fast
+			// instead of after the kernel retransmit deadline (~15min).
+			req.on("socket", (socket) => {
+				socket.setKeepAlive(true, 30_000);
 			});

 			const onAbort = () => {
@@ -174,7 +204,7 @@ export function streamAiServer(

 			req.on("timeout", () => {
 				req.destroy(
-					new Error(`Request timed out after ${REQUEST_TIMEOUT_MS}ms`),
+					new Error(`Request timed out after ${requestTimeoutMs}ms`),
 				);
 			});

@@ -8,6 +8,7 @@ import assert from "node:assert/strict";
 import { test } from "node:test";
 import {
 	extractCtxSize,
+	isReasoningModel,
 	isShardArtefact,
 } from "../ai-server/router-utils.ts";

@@ -66,3 +67,33 @@ test("isShardArtefact: non-shard numeric patterns are not matched", () => {
 	assert.equal(isShardArtefact("model-001"), false, "trailing digits alone don't match");
 	assert.equal(isShardArtefact("00001-of-00003-mid"), false, "must be at the end of the id");
 });
+
+// ── isReasoningModel ────────────────────────────────────────────────────
+
+test("isReasoningModel: known reasoning families return true", () => {
+	// Each entry is [model id, optional assertion message].
+	const reasoningIds: Array<[string, string?]> = [
+		["MiniMax-M2.7-IQ3_XXS"],
+		["MiniMax-M2.7-IQ4_XS"],
+		["Qwen3.6-35B-Claude-Opus-Distilled-Q5_K_M"],
+		["Qwen_Qwen3.6-35B-A3B-Q8_0", "underscore-prefixed variant still matches"],
+		["Qwen3-Coder-30B-Q8_0"],
+		["Qwen3-Coder-Next-IQ4_XS"],
+		["Qwen3-VL-30B-Q8_0"],
+		["MiMo-V2-Flash-IQ2_M"],
+		["MiMo-V2-Flash-IQ2_XXS"],
+		["gpt-oss-120b-MXFP4"],
+		["Devstral-2-123B-IQ3_XXS"],
+	];
+	for (const [id, message] of reasoningIds) {
+		assert.equal(isReasoningModel(id), true, message);
+	}
+});
+
+test("isReasoningModel: non-reasoning families return false", () => {
+	// Families that match none of the reasoning patterns.
+	const plainIds = [
+		"Anubis-70B-v1.2-Q5_K_M",
+		"Euryale-v2.3-IQ4_XS",
+		"Gemma-4-31B-Q8_0",
+		"Skyfall-31B-v4.2-Q8_0",
+		"Voxtral-Small-24B-Q8_0",
+	];
+	for (const id of plainIds) {
+		assert.equal(isReasoningModel(id), false);
+	}
+});
+
+test("isReasoningModel: unknown model id returns false (conservative default)", () => {
+	// Unlisted ids, including the empty string, must not be flagged.
+	const unknownIds = ["Mistral-7B-Instruct-Q4", "RandomModel-Q8", ""];
+	for (const id of unknownIds) {
+		assert.equal(isReasoningModel(id), false);
+	}
+});