feat(ai-server): wire pi settings → mTLS extension; per-model reasoning; configurable admin timeout
- config.ts: add getAdminTimeoutMs() reading from AI_SERVER_ADMIN_TIMEOUT_MS env or settings.json retry.provider.adminTimeoutMs (default = inference timeout, capped at 5min). Refactor settings access into a cached readPiSettings() helper shared by both timeout resolvers.
- stream.ts: forward options.reasoning (pi-mono's defaultThinkingLevel) to llama.cpp via chat_template_kwargs.enable_thinking + reasoning_effort, gated on per-model reasoning capability. Add TCP keepalive (30s) on the request socket to prevent NAT/middlebox idle drops during long silent prefills (root cause of the recent read ETIMEDOUT).
- router-utils.ts: add isReasoningModel(id) with a substring-match list of known reasoning families (MiniMax-M, Qwen3.6, Qwen3-Coder, Qwen3-VL, MiMo-V2, gpt-oss, Devstral). Unanchored to handle HF-style Org_Model ids.
- admin.ts: replace hardcoded 30s router HTTP timeout with getAdminTimeoutMs; use isReasoningModel(id) in discoverModels() instead of blanket reasoning: true.
- index.ts: add informational compat block (thinkingFormat, supportsReasoningEffort, maxTokensField, etc.) to model registrations so pi-mono's UI / capability detection reflects per-model reasoning support.
- tests: 3 new isReasoningModel test groups (positive, negative, unknown).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
+9
-4
@@ -7,12 +7,17 @@ import {
|
||||
AI_SERVER_SSH_HOST,
|
||||
AI_SERVER_URL,
|
||||
type ServerModel,
|
||||
getAdminTimeoutMs,
|
||||
loadCerts,
|
||||
} from "./config.js";
|
||||
import { extractCtxSize, isShardArtefact } from "./router-utils.js";
|
||||
import {
|
||||
extractCtxSize,
|
||||
isReasoningModel,
|
||||
isShardArtefact,
|
||||
} from "./router-utils.js";
|
||||
|
||||
// Re-export so existing index.ts imports keep working.
|
||||
export { extractCtxSize };
|
||||
export { extractCtxSize, isReasoningModel };
|
||||
|
||||
const exec = promisify(execCb);
|
||||
|
||||
@@ -47,7 +52,7 @@ async function routerRequest(
|
||||
},
|
||||
cert: certs.cert,
|
||||
key: certs.key,
|
||||
timeout: 30_000,
|
||||
timeout: getAdminTimeoutMs(),
|
||||
},
|
||||
(res) => {
|
||||
let buf = "";
|
||||
@@ -137,7 +142,7 @@ export async function discoverModels(): Promise<ServerModel[]> {
|
||||
return {
|
||||
id: m.id,
|
||||
name: `${m.id} (AI Server)`,
|
||||
reasoning: true,
|
||||
reasoning: isReasoningModel(m.id),
|
||||
contextWindow: ctx,
|
||||
maxTokens: Math.min(16384, Math.max(2048, Math.floor(ctx / 2))),
|
||||
};
|
||||
|
||||
+79
-3
@@ -21,10 +21,86 @@ export const AI_SERVER_PRESET_PATH =
|
||||
export const AI_SERVER_API_ID = "ai-server-mtls";
|
||||
export const AI_SERVER_PROVIDER_ID = "ai-server";
|
||||
|
||||
// 5 minutes — large models can take a while for the first token.
|
||||
export const REQUEST_TIMEOUT_MS = Number(
|
||||
process.env.AI_SERVER_TIMEOUT_MS ?? 300_000,
|
||||
// Resolve timeouts in priority order:
|
||||
// 1. <ENV_VAR> — explicit override (tests, one-offs)
|
||||
// 2. ~/.pi/agent/settings.json — kept in sync with pi-mono so
|
||||
// inner timeouts never give up
|
||||
// before the outer wrapper does
|
||||
// 3. <DEFAULT_MS> — sensible fallback
|
||||
const PI_SETTINGS_PATH =
|
||||
process.env.PI_SETTINGS_PATH ?? path.join(HOME, ".pi/agent/settings.json");
|
||||
|
||||
/**
 * Timeout-related keys read from `retry.provider` in the pi settings file.
 * Values are typed `unknown` because the file is parsed without schema
 * validation; callers must narrow before use.
 */
interface PiProviderRetrySettings {
  /** Inference request timeout in milliseconds (read by getRequestTimeoutMs). */
  timeoutMs?: unknown;
  /** Admin/router call timeout in milliseconds (read by getAdminTimeoutMs). */
  adminTimeoutMs?: unknown;
}

/**
 * The subset of the pi settings file (PI_SETTINGS_PATH) that this module
 * consumes. Every level is optional so a missing or partial file is valid.
 */
interface PiSettings {
  retry?: {
    provider?: PiProviderRetrySettings;
  };
}
|
||||
|
||||
let cachedSettings: PiSettings | null = null;
|
||||
let cachedSettingsLoaded = false;
|
||||
|
||||
/**
 * Load the pi settings file at PI_SETTINGS_PATH, memoising the result.
 *
 * The file is read at most once per process: the "loaded" flag is set before
 * the read so that a missing or broken file is not retried on every call.
 *
 * @returns The parsed settings object, or `{}` when the file is missing,
 *   unreadable, not valid JSON, or valid JSON that is not a plain object.
 */
function readPiSettings(): PiSettings {
  if (cachedSettingsLoaded) return cachedSettings ?? {};
  cachedSettingsLoaded = true;
  cachedSettings = null;
  try {
    const parsed: unknown = JSON.parse(
      fs.readFileSync(PI_SETTINGS_PATH, "utf-8"),
    );
    // JSON.parse may legally return a string, number, boolean, null or array.
    // Only a plain object can be treated as settings; anything else is
    // handled like a missing file instead of being cast blindly.
    if (
      typeof parsed === "object" &&
      parsed !== null &&
      !Array.isArray(parsed)
    ) {
      cachedSettings = parsed as PiSettings;
    }
  } catch {
    // settings missing / unreadable / unparseable — fall through to {}
  }
  return cachedSettings ?? {};
}
|
||||
|
||||
/**
 * Return the first candidate that denotes a finite, strictly positive number.
 *
 * Only two input kinds are considered:
 *   - `number` values, used as-is;
 *   - non-blank `string` values (e.g. environment variables), converted with
 *     `Number()`.
 * Every other type is skipped. This matters because `Number()` coerces
 * `true` to 1, `[5000]` to 5000 and a `Date` to its epoch milliseconds, so a
 * malformed settings value could otherwise be accepted as a timeout.
 *
 * @param candidates Values to inspect, in priority order.
 * @returns The first acceptable number, or `null` when none qualifies.
 */
function pickPositiveNumber(...candidates: unknown[]): number | null {
  for (const candidate of candidates) {
    let value: number;
    if (typeof candidate === "number") {
      value = candidate;
    } else if (typeof candidate === "string" && candidate.trim() !== "") {
      value = Number(candidate);
    } else {
      // undefined, null, booleans, arrays, objects, bigint, … are not numbers.
      continue;
    }
    if (Number.isFinite(value) && value > 0) return value;
  }
  return null;
}
|
||||
|
||||
/**
 * Resolve a timeout from its sources in priority order: environment
 * variable first, then the settings-file value, then the supplied default.
 *
 * @param envVar Raw environment variable value, if set.
 * @param settingsValue Unvalidated value read from the settings file.
 * @param defaultMs Fallback used when neither source yields a positive number.
 * @returns The resolved timeout in milliseconds.
 */
function resolveTimeoutMs(
  envVar: string | undefined,
  settingsValue: unknown,
  defaultMs: number,
): number {
  const configured = pickPositiveNumber(envVar, settingsValue);
  return configured === null ? defaultMs : configured;
}
|
||||
|
||||
let cachedRequestTimeoutMs: number | null = null;
|
||||
let cachedAdminTimeoutMs: number | null = null;
|
||||
|
||||
/**
 * Timeout for inference requests, in milliseconds.
 *
 * Resolved once and then served from the module-level cache. Sources, in
 * priority order: the AI_SERVER_TIMEOUT_MS environment variable, then
 * `retry.provider.timeoutMs` from the pi settings file, then 300 000 ms.
 */
export function getRequestTimeoutMs(): number {
  if (cachedRequestTimeoutMs === null) {
    const fromSettings = readPiSettings().retry?.provider?.timeoutMs;
    cachedRequestTimeoutMs = resolveTimeoutMs(
      process.env.AI_SERVER_TIMEOUT_MS,
      fromSettings,
      300_000,
    );
  }
  return cachedRequestTimeoutMs;
}
|
||||
|
||||
/**
 * Timeout for admin calls, in milliseconds.
 *
 * Resolved once and then served from the module-level cache. Sources, in
 * priority order: the AI_SERVER_ADMIN_TIMEOUT_MS environment variable, then
 * `retry.provider.adminTimeoutMs` from the pi settings file, then the
 * inference timeout capped at 300 000 ms.
 */
export function getAdminTimeoutMs(): number {
  if (cachedAdminTimeoutMs === null) {
    const fromSettings = readPiSettings().retry?.provider?.adminTimeoutMs;
    // Admin calls (model load/unload/list) are usually quick, but a
    // cold-cache load of a very large model can take much longer. Defaulting
    // to the inference timeout (capped at 5 minutes) keeps admin calls as
    // patient as inference and avoids surprise failures during big loads.
    const inferenceBound = Math.min(getRequestTimeoutMs(), 300_000);
    cachedAdminTimeoutMs = resolveTimeoutMs(
      process.env.AI_SERVER_ADMIN_TIMEOUT_MS,
      fromSettings,
      inferenceBound,
    );
  }
  return cachedAdminTimeoutMs;
}
|
||||
|
||||
const CA_CERT_PATH =
|
||||
process.env.AI_SERVER_CA ?? path.join(CERTS_DIR, "root-ca.pem");
|
||||
|
||||
@@ -52,6 +52,16 @@ function registerProviderWithModels(
|
||||
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
|
||||
contextWindow: m.contextWindow,
|
||||
maxTokens: m.maxTokens,
|
||||
// Informational compat block. Our `streamSimple` is custom and
|
||||
// builds the request body itself (see stream.ts), but pi-mono's
|
||||
// model-list UI / capability detection reads these fields.
|
||||
compat: {
|
||||
thinkingFormat: "qwen-chat-template",
|
||||
supportsReasoningEffort: m.reasoning,
|
||||
supportsDeveloperRole: false,
|
||||
supportsUsageInStreaming: true,
|
||||
maxTokensField: "max_tokens",
|
||||
},
|
||||
})),
|
||||
streamSimple: streamAiServer,
|
||||
});
|
||||
|
||||
@@ -30,3 +30,32 @@ export function extractCtxSize(m: RouterModelMeta): number | null {
|
||||
/**
 * Tell whether a model id ends with a multi-part shard suffix of the form
 * `-<digits>-of-<digits>` (for example `-00001-of-00003`).
 *
 * @param id Model id to inspect.
 * @returns `true` only when the shard suffix is at the very end of the id.
 */
export function isShardArtefact(id: string): boolean {
  const shardSuffix = /-\d+-of-\d+$/;
  return shardSuffix.test(id);
}
|
||||
|
||||
/**
|
||||
* Pattern list of model-id prefixes / substrings that identify reasoning
|
||||
* (thinking-capable) models. Used by discoverModels() to set the `reasoning`
|
||||
* flag on registered model entries — which in turn drives whether pi-mono
|
||||
* forwards `defaultThinkingLevel` to the model and renders thinking blocks.
|
||||
*
|
||||
* Conservative by design: false negatives (a real reasoning model shows no
|
||||
* thinking UI) are recoverable; false positives (non-reasoning model gets
|
||||
* `enable_thinking: true` injected, which most chat templates ignore but
|
||||
* some reject) are noisier. Add new families as you load them.
|
||||
*/
|
||||
// Unanchored on purpose: HuggingFace-style ids are sometimes prefixed with the
|
||||
// org name (e.g. "Qwen_Qwen3.6-35B-A3B-Q8_0"), so anchoring with ^ would miss
|
||||
// them. The substrings below are distinctive enough that mid-string matches
|
||||
// are still meaningful.
|
||||
// Case-insensitive, unanchored substrings; one entry per known reasoning
// family. An id is classified as reasoning-capable if any entry matches.
const REASONING_MODEL_PATTERNS: RegExp[] = [
  /MiniMax-M/i,
  /Qwen3\.6/i,
  /Qwen3-Coder/i,
  /Qwen3-VL/i,
  /MiMo-V2/i,
  /gpt-oss/i,
  /Devstral/i,
];

/**
 * Tell whether a model id belongs to one of the known reasoning families.
 *
 * @param id Model id as reported by the router (may carry an org prefix).
 * @returns `true` if any known pattern occurs anywhere in the id; `false`
 *   for unknown ids, including the empty string.
 */
export function isReasoningModel(id: string): boolean {
  for (const pattern of REASONING_MODEL_PATTERNS) {
    if (pattern.test(id)) return true;
  }
  return false;
}
|
||||
|
||||
+33
-3
@@ -15,7 +15,7 @@ import {
|
||||
import {
|
||||
AI_SERVER_CHAT_PATH,
|
||||
AI_SERVER_URL,
|
||||
REQUEST_TIMEOUT_MS,
|
||||
getRequestTimeoutMs,
|
||||
loadCerts,
|
||||
} from "./config.js";
|
||||
import {
|
||||
@@ -130,6 +130,26 @@ export function streamAiServer(
|
||||
stream_options: { include_usage: true },
|
||||
};
|
||||
|
||||
// Reasoning / thinking-level forwarding. pi-mono passes
|
||||
// `options.reasoning` (a ThinkingLevel: minimal|low|medium|high|xhigh)
|
||||
// from `defaultThinkingLevel` in ~/.pi/agent/settings.json. Forward
|
||||
// it to llama.cpp two ways simultaneously so both qwen-style chat
|
||||
// templates and openai-style providers see the directive:
|
||||
// • chat_template_kwargs.enable_thinking — Qwen3 / MiMo / Devstral
|
||||
// family templates respect this boolean.
|
||||
// • reasoning_effort — passed through as a
|
||||
// chat-template kwarg by llama-server; the few templates that
|
||||
// read it (gpt-oss, MiniMax) get full granularity.
|
||||
// Skip entirely for non-reasoning models so we don't poison their
|
||||
// chat templates with kwargs they don't understand.
|
||||
const reasoning = options?.reasoning;
|
||||
if (reasoning && model.reasoning) {
|
||||
bodyObj.chat_template_kwargs = {
|
||||
enable_thinking: reasoning !== "minimal",
|
||||
};
|
||||
bodyObj.reasoning_effort = reasoning;
|
||||
}
|
||||
|
||||
const openaiTools = toolsToOpenAI(context.tools);
|
||||
if (openaiTools) {
|
||||
bodyObj.tools = openaiTools;
|
||||
@@ -141,6 +161,7 @@ export function streamAiServer(
|
||||
const body = JSON.stringify(bodyObj);
|
||||
const certs = loadCerts();
|
||||
const url = new URL(AI_SERVER_URL + AI_SERVER_CHAT_PATH);
|
||||
const requestTimeoutMs = getRequestTimeoutMs();
|
||||
|
||||
// No `ca:` — server cert is publicly-trusted (Let's Encrypt), so
|
||||
// rely on Node's default Mozilla CA bundle. mTLS client auth still
|
||||
@@ -157,7 +178,16 @@ export function streamAiServer(
|
||||
},
|
||||
cert: certs.cert,
|
||||
key: certs.key,
|
||||
timeout: REQUEST_TIMEOUT_MS,
|
||||
timeout: requestTimeoutMs,
|
||||
});
|
||||
|
||||
// TCP keepalive: kernel sends probes every 30s of idle. Stops NAT /
|
||||
// stateful firewalls on the LAN path from silently dropping the flow
|
||||
// during long prefills (when llama.cpp emits no SSE bytes yet) and
|
||||
// surfaces real drops fast instead of after the kernel retransmit
|
||||
// deadline (~15min).
|
||||
req.on("socket", (socket) => {
|
||||
socket.setKeepAlive(true, 30_000);
|
||||
});
|
||||
|
||||
const onAbort = () => {
|
||||
@@ -174,7 +204,7 @@ export function streamAiServer(
|
||||
|
||||
req.on("timeout", () => {
|
||||
req.destroy(
|
||||
new Error(`Request timed out after ${REQUEST_TIMEOUT_MS}ms`),
|
||||
new Error(`Request timed out after ${requestTimeoutMs}ms`),
|
||||
);
|
||||
});
|
||||
|
||||
|
||||
@@ -8,6 +8,7 @@ import assert from "node:assert/strict";
|
||||
import { test } from "node:test";
|
||||
import {
|
||||
extractCtxSize,
|
||||
isReasoningModel,
|
||||
isShardArtefact,
|
||||
} from "../ai-server/router-utils.ts";
|
||||
|
||||
@@ -66,3 +67,33 @@ test("isShardArtefact: non-shard numeric patterns are not matched", () => {
|
||||
assert.equal(isShardArtefact("model-001"), false, "trailing digits alone don't match");
|
||||
assert.equal(isShardArtefact("00001-of-00003-mid"), false, "must be at the end of the id");
|
||||
});
|
||||
|
||||
// ── isReasoningModel ────────────────────────────────────────────────────
|
||||
|
||||
// Each row is [model id, optional assertion message]; every id must be
// classified as a reasoning model.
test("isReasoningModel: known reasoning families return true", () => {
  const reasoningIds: Array<[string, string?]> = [
    ["MiniMax-M2.7-IQ3_XXS"],
    ["MiniMax-M2.7-IQ4_XS"],
    ["Qwen3.6-35B-Claude-Opus-Distilled-Q5_K_M"],
    ["Qwen_Qwen3.6-35B-A3B-Q8_0", "underscore-prefixed variant still matches"],
    ["Qwen3-Coder-30B-Q8_0"],
    ["Qwen3-Coder-Next-IQ4_XS"],
    ["Qwen3-VL-30B-Q8_0"],
    ["MiMo-V2-Flash-IQ2_M"],
    ["MiMo-V2-Flash-IQ2_XXS"],
    ["gpt-oss-120b-MXFP4"],
    ["Devstral-2-123B-IQ3_XXS"],
  ];
  for (const [id, message] of reasoningIds) {
    assert.equal(isReasoningModel(id), true, message);
  }
});
|
||||
|
||||
// Ids from families that are not on the reasoning list must be rejected.
test("isReasoningModel: non-reasoning families return false", () => {
  const plainIds = [
    "Anubis-70B-v1.2-Q5_K_M",
    "Euryale-v2.3-IQ4_XS",
    "Gemma-4-31B-Q8_0",
    "Skyfall-31B-v4.2-Q8_0",
    "Voxtral-Small-24B-Q8_0",
  ];
  for (const id of plainIds) {
    assert.equal(isReasoningModel(id), false);
  }
});
|
||||
|
||||
// Unrecognised ids, including the empty string, default to "not reasoning".
test("isReasoningModel: unknown model id returns false (conservative default)", () => {
  const unknownIds = ["Mistral-7B-Instruct-Q4", "RandomModel-Q8", ""];
  for (const id of unknownIds) {
    assert.equal(isReasoningModel(id), false);
  }
});
|
||||
|
||||
Reference in New Issue
Block a user