feat(ai-server): wire pi settings → mTLS extension; per-model reasoning; configurable admin timeout

- config.ts: add getAdminTimeoutMs() reading from
  AI_SERVER_ADMIN_TIMEOUT_MS env or settings.json
  retry.provider.adminTimeoutMs (default = inference timeout, capped at 5min).
  Refactor settings access into a cached readPiSettings() helper shared by
  both timeout resolvers.
- stream.ts: forward options.reasoning (pi-mono's defaultThinkingLevel) to
  llama.cpp via chat_template_kwargs.enable_thinking +
  reasoning_effort, gated on per-model reasoning capability. Add TCP keepalive
  (30s) on the request socket to prevent NAT/middlebox idle drops during long
  silent prefills (root cause of the recent read ETIMEDOUT).
- router-utils.ts: add isReasoningModel(id) with a substring-match list of
  known reasoning families (MiniMax-M, Qwen3.6, Qwen3-Coder, Qwen3-VL,
  MiMo-V2, gpt-oss, Devstral). Unanchored to handle HF-style Org_Model ids.
- admin.ts: replace hardcoded 30s router HTTP timeout with getAdminTimeoutMs;
  use isReasoningModel(id) in discoverModels() instead of blanket
  reasoning: true.
- index.ts: add informational compat block (thinkingFormat,
  supportsReasoningEffort, maxTokensField, etc.) to model registrations so
  pi-mono's UI / capability detection reflects per-model reasoning support.
- tests: 3 new isReasoningModel test groups (positive, negative, unknown).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
shahondin1624
2026-04-28 19:33:56 +02:00
parent 5a1d738892
commit 98c140ac03
6 changed files with 192 additions and 11 deletions
+9 -4
View File
@@ -7,12 +7,17 @@ import {
AI_SERVER_SSH_HOST,
AI_SERVER_URL,
type ServerModel,
getAdminTimeoutMs,
loadCerts,
} from "./config.js";
import { extractCtxSize, isShardArtefact } from "./router-utils.js";
import {
extractCtxSize,
isReasoningModel,
isShardArtefact,
} from "./router-utils.js";
// Re-export so existing index.ts imports keep working.
export { extractCtxSize };
export { extractCtxSize, isReasoningModel };
const exec = promisify(execCb);
@@ -47,7 +52,7 @@ async function routerRequest(
},
cert: certs.cert,
key: certs.key,
timeout: 30_000,
timeout: getAdminTimeoutMs(),
},
(res) => {
let buf = "";
@@ -137,7 +142,7 @@ export async function discoverModels(): Promise<ServerModel[]> {
return {
id: m.id,
name: `${m.id} (AI Server)`,
reasoning: true,
reasoning: isReasoningModel(m.id),
contextWindow: ctx,
maxTokens: Math.min(16384, Math.max(2048, Math.floor(ctx / 2))),
};
+80 -4
View File
@@ -21,10 +21,86 @@ export const AI_SERVER_PRESET_PATH =
export const AI_SERVER_API_ID = "ai-server-mtls";
export const AI_SERVER_PROVIDER_ID = "ai-server";
// 5 minutes — large models can take a while for the first token.
// AI_SERVER_TIMEOUT_MS overrides the default, but only when it parses to a
// finite, positive number. A typo ("5m") would otherwise become NaN and an
// empty value would become 0, neither of which is a usable request timeout.
export const REQUEST_TIMEOUT_MS: number = (() => {
  const fromEnv = Number(process.env.AI_SERVER_TIMEOUT_MS);
  return Number.isFinite(fromEnv) && fromEnv > 0 ? fromEnv : 300_000;
})();
// Resolve timeouts in priority order:
// 1. <ENV_VAR> — explicit override (tests, one-offs)
// 2. ~/.pi/agent/settings.json — kept in sync with pi-mono so
// inner timeouts never give up
// before the outer wrapper does
// 3. <DEFAULT_MS> — sensible fallback
// Path of the settings file the timeout resolvers consult. The
// PI_SETTINGS_PATH environment variable overrides the default location,
// which is ".pi/agent/settings.json" under HOME.
const PI_SETTINGS_PATH =
process.env.PI_SETTINGS_PATH ?? path.join(HOME, ".pi/agent/settings.json");
/**
 * Timeout overrides stored under `retry.provider` in the settings file.
 * Values are typed `unknown` because they come straight from parsed JSON and
 * are validated only when a timeout is resolved.
 */
interface PiProviderRetrySettings {
  timeoutMs?: unknown;
  adminTimeoutMs?: unknown;
}

/** The subset of the pi settings file that this module reads. */
interface PiSettings {
  retry?: {
    provider?: PiProviderRetrySettings;
  };
}
// Result of the single settings read. `cachedSettingsLoaded` records that a
// read was attempted, so a failed read (cached as null) is never retried.
let cachedSettings: PiSettings | null = null;
let cachedSettingsLoaded = false;

/**
 * Returns the parsed settings file, reading it from disk at most once.
 *
 * A missing, unreadable or unparseable file is treated as "no settings":
 * the failure is cached and an empty object is returned on every call.
 */
function readPiSettings(): PiSettings {
  if (!cachedSettingsLoaded) {
    // Flip the flag before touching the disk so a throw still counts as done.
    cachedSettingsLoaded = true;
    let parsed: PiSettings | null;
    try {
      parsed = JSON.parse(
        fs.readFileSync(PI_SETTINGS_PATH, "utf-8"),
      ) as PiSettings;
    } catch {
      // settings missing / unreadable / unparseable — behave as if empty
      parsed = null;
    }
    cachedSettings = parsed;
  }
  return cachedSettings ?? {};
}
/**
 * Returns the first candidate that represents a finite number greater than
 * zero, or `null` when none does.
 *
 * Only `number` values and non-blank strings (e.g. environment variables)
 * are considered. Other JSON value types are skipped rather than coerced:
 * `Number(true)` is 1 and `Number([5000])` is 5000, so blind coercion would
 * turn a malformed settings entry into a bogus (possibly 1 ms) timeout.
 *
 * @param candidates Values to inspect, in priority order.
 * @returns The first valid positive number, or `null`.
 */
function pickPositiveNumber(...candidates: unknown[]): number | null {
  for (const candidate of candidates) {
    let parsed: number;
    if (typeof candidate === "number") {
      parsed = candidate;
    } else if (typeof candidate === "string" && candidate.trim() !== "") {
      parsed = Number(candidate);
    } else {
      // undefined, null, booleans, arrays, objects, blank strings
      continue;
    }
    if (Number.isFinite(parsed) && parsed > 0) return parsed;
  }
  return null;
}
/**
 * Resolves a timeout using the priority order env var → settings → default.
 *
 * @param envVar Raw environment variable value, if set.
 * @param settingsValue Raw value read from the settings file.
 * @param defaultMs Fallback used when neither source yields a positive number.
 * @returns The timeout in milliseconds.
 */
function resolveTimeoutMs(
  envVar: string | undefined,
  settingsValue: unknown,
  defaultMs: number,
): number {
  const override = pickPositiveNumber(envVar, settingsValue);
  if (override === null) {
    return defaultMs;
  }
  return override;
}
// Resolved timeouts, computed on first use and reused afterwards.
let cachedRequestTimeoutMs: number | null = null;
let cachedAdminTimeoutMs: number | null = null;

/**
 * Timeout for inference requests, in milliseconds.
 *
 * Resolved once from AI_SERVER_TIMEOUT_MS, then `retry.provider.timeoutMs`
 * in the settings file, then a 5-minute default; the result is cached.
 */
export function getRequestTimeoutMs(): number {
  if (cachedRequestTimeoutMs === null) {
    const fromSettings = readPiSettings().retry?.provider?.timeoutMs;
    cachedRequestTimeoutMs = resolveTimeoutMs(
      process.env.AI_SERVER_TIMEOUT_MS,
      fromSettings,
      300_000,
    );
  }
  return cachedRequestTimeoutMs;
}
/**
 * Timeout for admin calls to the router, in milliseconds.
 *
 * Resolved once from AI_SERVER_ADMIN_TIMEOUT_MS, then
 * `retry.provider.adminTimeoutMs` in the settings file. When neither is set
 * the fallback is the inference timeout, capped at 5 minutes. Admin calls
 * are usually quick, but a cold-cache load of a very large model can take
 * much longer, so the default is deliberately generous. The result is cached.
 */
export function getAdminTimeoutMs(): number {
  if (cachedAdminTimeoutMs === null) {
    const fromSettings = readPiSettings().retry?.provider?.adminTimeoutMs;
    // Default: as patient as inference, but never more than 5 minutes.
    const fallbackMs = Math.min(getRequestTimeoutMs(), 300_000);
    cachedAdminTimeoutMs = resolveTimeoutMs(
      process.env.AI_SERVER_ADMIN_TIMEOUT_MS,
      fromSettings,
      fallbackMs,
    );
  }
  return cachedAdminTimeoutMs;
}
const CA_CERT_PATH =
process.env.AI_SERVER_CA ?? path.join(CERTS_DIR, "root-ca.pem");
+10
View File
@@ -52,6 +52,16 @@ function registerProviderWithModels(
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
contextWindow: m.contextWindow,
maxTokens: m.maxTokens,
// Informational compat block. Our `streamSimple` is custom and
// builds the request body itself (see stream.ts), but pi-mono's
// model-list UI / capability detection reads these fields.
compat: {
thinkingFormat: "qwen-chat-template",
supportsReasoningEffort: m.reasoning,
supportsDeveloperRole: false,
supportsUsageInStreaming: true,
maxTokensField: "max_tokens",
},
})),
streamSimple: streamAiServer,
});
+29
View File
@@ -30,3 +30,32 @@ export function extractCtxSize(m: RouterModelMeta): number | null {
/**
 * Reports whether a model id ends in a shard suffix of the form
 * `-<digits>-of-<digits>`. The suffix must be at the very end of the id.
 */
export function isShardArtefact(id: string): boolean {
  const shardSuffix = /-\d+-of-\d+$/;
  return id.match(shardSuffix) !== null;
}
/**
 * Case-insensitive substring patterns identifying model families that are
 * treated as reasoning (thinking-capable). discoverModels() uses the result
 * to set the `reasoning` flag on each registered model.
 *
 * The list is deliberately conservative. A missed reasoning model merely
 * shows no thinking UI, whereas a false positive gets thinking kwargs
 * injected into a chat template that may not accept them. Add new families
 * as you load them.
 *
 * Patterns are unanchored on purpose: HuggingFace-style ids are sometimes
 * prefixed with the org name (e.g. "Qwen_Qwen3.6-35B-A3B-Q8_0"), so a leading
 * ^ would miss them. The substrings are distinctive enough that a mid-string
 * match is still meaningful.
 */
const REASONING_MODEL_PATTERNS: RegExp[] = [
  /MiniMax-M/i,
  /Qwen3\.6/i,
  /Qwen3-Coder/i,
  /Qwen3-VL/i,
  /MiMo-V2/i,
  /gpt-oss/i,
  /Devstral/i,
];

/**
 * Reports whether a model id belongs to a known reasoning family.
 *
 * @param id Model id as reported by the router.
 * @returns `true` when any known pattern matches; unknown ids return `false`.
 */
export function isReasoningModel(id: string): boolean {
  for (const pattern of REASONING_MODEL_PATTERNS) {
    if (pattern.test(id)) {
      return true;
    }
  }
  return false;
}
+33 -3
View File
@@ -15,7 +15,7 @@ import {
import {
AI_SERVER_CHAT_PATH,
AI_SERVER_URL,
REQUEST_TIMEOUT_MS,
getRequestTimeoutMs,
loadCerts,
} from "./config.js";
import {
@@ -130,6 +130,26 @@ export function streamAiServer(
stream_options: { include_usage: true },
};
// Reasoning / thinking-level forwarding. pi-mono passes
// `options.reasoning` (a ThinkingLevel: minimal|low|medium|high|xhigh)
// from `defaultThinkingLevel` in ~/.pi/agent/settings.json. Forward
// it to llama.cpp two ways simultaneously so both qwen-style chat
// templates and openai-style providers see the directive:
// • chat_template_kwargs.enable_thinking — Qwen3 / MiMo / Devstral
// family templates respect this boolean.
// • reasoning_effort — passed through as a
// chat-template kwarg by llama-server; the few templates that
// read it (gpt-oss, MiniMax) get full granularity.
// Skip entirely for non-reasoning models so we don't poison their
// chat templates with kwargs they don't understand.
const reasoning = options?.reasoning;
if (reasoning && model.reasoning) {
bodyObj.chat_template_kwargs = {
enable_thinking: reasoning !== "minimal",
};
bodyObj.reasoning_effort = reasoning;
}
const openaiTools = toolsToOpenAI(context.tools);
if (openaiTools) {
bodyObj.tools = openaiTools;
@@ -141,6 +161,7 @@ export function streamAiServer(
const body = JSON.stringify(bodyObj);
const certs = loadCerts();
const url = new URL(AI_SERVER_URL + AI_SERVER_CHAT_PATH);
const requestTimeoutMs = getRequestTimeoutMs();
// No `ca:` — server cert is publicly-trusted (Let's Encrypt), so
// rely on Node's default Mozilla CA bundle. mTLS client auth still
@@ -157,7 +178,16 @@ export function streamAiServer(
},
cert: certs.cert,
key: certs.key,
timeout: REQUEST_TIMEOUT_MS,
timeout: requestTimeoutMs,
});
// TCP keepalive: kernel starts probing after 30s of idle. Stops NAT /
// stateful firewalls on the LAN path from silently dropping the flow
// during long prefills (when llama.cpp emits no SSE bytes yet) and
// surfaces real drops fast instead of after the kernel retransmit
// deadline (~15min).
req.on("socket", (socket) => {
socket.setKeepAlive(true, 30_000);
});
const onAbort = () => {
@@ -174,7 +204,7 @@ export function streamAiServer(
req.on("timeout", () => {
req.destroy(
new Error(`Request timed out after ${REQUEST_TIMEOUT_MS}ms`),
new Error(`Request timed out after ${requestTimeoutMs}ms`),
);
});
+31
View File
@@ -8,6 +8,7 @@ import assert from "node:assert/strict";
import { test } from "node:test";
import {
extractCtxSize,
isReasoningModel,
isShardArtefact,
} from "../ai-server/router-utils.ts";
@@ -66,3 +67,33 @@ test("isShardArtefact: non-shard numeric patterns are not matched", () => {
assert.equal(isShardArtefact("model-001"), false, "trailing digits alone don't match");
assert.equal(isShardArtefact("00001-of-00003-mid"), false, "must be at the end of the id");
});
// ── isReasoningModel ────────────────────────────────────────────────────
test("isReasoningModel: known reasoning families return true", () => {
  // Each entry is [model id, optional assertion message].
  const reasoningIds: Array<[string, string?]> = [
    ["MiniMax-M2.7-IQ3_XXS"],
    ["MiniMax-M2.7-IQ4_XS"],
    ["Qwen3.6-35B-Claude-Opus-Distilled-Q5_K_M"],
    ["Qwen_Qwen3.6-35B-A3B-Q8_0", "underscore-prefixed variant still matches"],
    ["Qwen3-Coder-30B-Q8_0"],
    ["Qwen3-Coder-Next-IQ4_XS"],
    ["Qwen3-VL-30B-Q8_0"],
    ["MiMo-V2-Flash-IQ2_M"],
    ["MiMo-V2-Flash-IQ2_XXS"],
    ["gpt-oss-120b-MXFP4"],
    ["Devstral-2-123B-IQ3_XXS"],
  ];
  for (const [id, message] of reasoningIds) {
    assert.equal(isReasoningModel(id), true, message);
  }
});
test("isReasoningModel: non-reasoning families return false", () => {
  // Families that are loaded on the server but are not on the pattern list.
  const plainIds = [
    "Anubis-70B-v1.2-Q5_K_M",
    "Euryale-v2.3-IQ4_XS",
    "Gemma-4-31B-Q8_0",
    "Skyfall-31B-v4.2-Q8_0",
    "Voxtral-Small-24B-Q8_0",
  ];
  for (const id of plainIds) {
    assert.equal(isReasoningModel(id), false);
  }
});
test("isReasoningModel: unknown model id returns false (conservative default)", () => {
  // Anything not on the pattern list, including the empty id, is non-reasoning.
  const unknownIds = ["Mistral-7B-Instruct-Q4", "RandomModel-Q8", ""];
  for (const id of unknownIds) {
    assert.equal(isReasoningModel(id), false);
  }
});