Files
pi-extensions/ai-server/config.ts
T
shahondin1624 f7af660727 migrate ai-server extension from llama.cpp router to llama-swap
Endpoint rewrites:
  - GET /v1/models + /running → merged listModels() with running flag
  - POST /models/load → GET /upstream/<id>/health (warm load)
  - POST /models/unload → POST /api/models/unload/<id> (no body)
  - Added POST /api/models/unload for unloadAll()

Config migration:
  - Preset path: ~/.llama-models.ini → ~/.config/llama-swap/config.yaml
  - Service unit: llama-server.service → llama-swap.service
  - setPresetKey() rewritten from INI awk to YAML-aware awk for
    editing --ctx-size/--temp/--n-gpu-layers in cmd: blocks

Per-model ctx-size (fixes 0/33k bug):
  - parseCtxMapFromYaml(): walks config.yaml, extracts --ctx-size N per
    model block → Map<id, ctxSize>
  - extractCtxFromRunningCmd(): parses --ctx-size from /running cmd string
  - discoverModels(): Promise.all(listModels, listRunning, readPreset),
    ctx priority: running cmd → yaml → 32768 fallback
  - Removed broken extractCtxSize stub and dangling imports

Tests: 14 passing (parseCtxMapFromYaml ×5, extractCtxFromRunningCmd ×3,
isShardArtefact ×3, isReasoningModel ×3)

README: full rewrite covering the llama-swap architecture, the YAML config
format, and the new endpoints; the troubleshooting table was updated as well.
2026-05-27 10:42:19 +02:00

179 lines
5.7 KiB
TypeScript

import * as fs from "node:fs";
import * as path from "node:path";
// Pull the two relevant variables out of the environment once.
const { HOME: homeFromEnv, AI_SERVER_CERTS_DIR: certsDirOverride } =
  process.env;
// Home directory of the current user. An unset $HOME degrades to "" so the
// default paths derived from it become relative instead of throwing.
const HOME = homeFromEnv ?? "";
// Directory holding the mTLS material; AI_SERVER_CERTS_DIR wins when set.
const CERTS_DIR =
  certsDirOverride !== undefined
    ? certsDirOverride
    : path.join(HOME, ".pi/agent/certs");
// Base URL of the AI server. AI_SERVER_URL in the environment overrides the
// built-in default.
const urlOverride = process.env.AI_SERVER_URL;
export const AI_SERVER_URL =
  urlOverride !== undefined ? urlOverride : "https://ai.shahondin1624.de";
// OpenAI-style API root, always derived from the resolved base URL.
export const AI_SERVER_API_BASE = AI_SERVER_URL + "/v1";
// Chat-completions path, relative to AI_SERVER_URL.
export const AI_SERVER_CHAT_PATH = "/v1/chat/completions";
// SSH target for admin operations (preset edits, systemctl). Uses key auth.
const sshHostOverride = process.env.AI_SERVER_SSH_HOST;
export const AI_SERVER_SSH_HOST =
  sshHostOverride !== undefined ? sshHostOverride : "ai-server@192.168.2.3";
// llama-swap endpoint paths. Every path can be overridden through the
// environment variable that shares its name.
export const AI_SERVER_MODELS_PATH =
  process.env.AI_SERVER_MODELS_PATH ?? "/v1/models";
export const AI_SERVER_RUNNING_PATH =
  process.env.AI_SERVER_RUNNING_PATH ?? "/running";
export const AI_SERVER_UNLOAD_ALL_PATH =
  process.env.AI_SERVER_UNLOAD_ALL_PATH ?? "/api/models/unload";

/**
 * Expands a per-model path template.
 *
 * Every literal `{id}` in the template is replaced by the URI-encoded model
 * id. An override that contains no `{id}` placeholder is returned verbatim,
 * which keeps overrides written before placeholders were supported working.
 *
 * @param override - Value of the overriding environment variable, if set.
 * @param id - Model id; URI-encoded before insertion.
 * @param defaultTemplate - Template used when `override` is undefined.
 * @returns The resolved request path.
 */
function buildModelPath(
  override: string | undefined,
  id: string,
  defaultTemplate: string,
): string {
  const encodedId = encodeURIComponent(id);
  // split/join replaces every occurrence and never re-scans the inserted id.
  return (override ?? defaultTemplate).split("{id}").join(encodedId);
}

/**
 * Path that unloads a single model.
 *
 * AI_SERVER_UNLOAD_PATH may override it; include `{id}` in the override so
 * the path still addresses the requested model, e.g. `/custom/{id}/unload`.
 * The environment is read on every call.
 */
export const AI_SERVER_UNLOAD_PATH = (id: string): string =>
  buildModelPath(
    process.env.AI_SERVER_UNLOAD_PATH,
    id,
    "/api/models/unload/{id}",
  );

/**
 * Health path of a single upstream model.
 *
 * AI_SERVER_UPSTREAM_HEALTH_PATH may override it; include `{id}` in the
 * override so the path still addresses the requested model.
 * The environment is read on every call.
 */
export const AI_SERVER_UPSTREAM_HEALTH_PATH = (id: string): string =>
  buildModelPath(
    process.env.AI_SERVER_UPSTREAM_HEALTH_PATH,
    id,
    "/upstream/{id}/health",
  );
// llama-swap config file (YAML, replaces the old INI preset). The leading "~"
// is stored literally; nothing in this file expands it.
const presetPathOverride = process.env.AI_SERVER_PRESET_PATH;
export const AI_SERVER_PRESET_PATH =
  presetPathOverride === undefined
    ? "~/.config/llama-swap/config.yaml"
    : presetPathOverride;
// Name of the systemd service unit that runs llama-swap.
const serviceUnitOverride = process.env.AI_SERVER_SERVICE_UNIT;
export const AI_SERVER_SERVICE_UNIT =
  serviceUnitOverride === undefined
    ? "llama-swap.service"
    : serviceUnitOverride;
// The api id is deliberately distinct: the api-registry keys by api name, so
// registering streamSimple under this id does NOT overwrite the built-in
// openai-completions provider.
export const AI_SERVER_API_ID = "ai-server-mtls";
export const AI_SERVER_PROVIDER_ID = "ai-server";
// Timeouts are resolved in priority order:
//   1. <ENV_VAR>                  — explicit override (tests, one-offs)
//   2. ~/.pi/agent/settings.json  — kept in sync with pi-mono so inner
//                                   timeouts never give up before the outer
//                                   wrapper does
//   3. <DEFAULT_MS>               — sensible fallback
// PI_SETTINGS_PATH in the environment relocates the file used in step 2.
const settingsPathOverride = process.env.PI_SETTINGS_PATH;
const PI_SETTINGS_PATH =
  settingsPathOverride !== undefined
    ? settingsPathOverride
    : path.join(HOME, ".pi/agent/settings.json");
/**
 * Timeout entries found under `retry.provider`. They come straight out of
 * parsed JSON, so both stay `unknown` until a consumer validates them.
 */
interface PiProviderRetrySettings {
  timeoutMs?: unknown;
  adminTimeoutMs?: unknown;
}
/** The `retry` section of the settings file, as far as this module reads it. */
interface PiRetrySettings {
  provider?: PiProviderRetrySettings;
}
/** The slice of the pi settings file that this module looks at. */
interface PiSettings {
  retry?: PiRetrySettings;
}
// Tri-state cache: undefined = file not read yet, null = read attempted but
// nothing usable came back, otherwise the parsed settings object.
let cachedSettings: PiSettings | null | undefined;
/**
 * Returns the parsed settings file, reading it from disk at most once.
 *
 * The file at PI_SETTINGS_PATH is read synchronously on the first call. A
 * missing, unreadable or unparseable file is not an error: the failure is
 * remembered and every call yields an empty settings object.
 *
 * @returns The parsed settings, or `{}` when none could be loaded. The JSON
 *   is cast, not validated, so field values must be checked by the caller.
 */
function readPiSettings(): PiSettings {
  if (cachedSettings === undefined) {
    try {
      const text = fs.readFileSync(PI_SETTINGS_PATH, "utf-8");
      cachedSettings = JSON.parse(text) as PiSettings;
    } catch {
      // settings missing / unreadable / unparseable — remember the miss
      cachedSettings = null;
    }
  }
  return cachedSettings ?? {};
}
/**
 * Returns the first candidate that represents a finite number greater than 0.
 *
 * Only numbers and numeric strings are considered. Every other type is
 * skipped, because blanket `Number()` coercion would accept values that were
 * never meant as numbers: `true` becomes 1, `[42]` becomes 42 and a `Date`
 * becomes its epoch value.
 *
 * @param candidates - Values to inspect, in priority order.
 * @returns The first usable positive number, or `null` when there is none.
 */
function pickPositiveNumber(...candidates: unknown[]): number | null {
  for (const candidate of candidates) {
    let value: number;
    if (typeof candidate === "number") {
      value = candidate;
    } else if (typeof candidate === "string") {
      // Environment variables arrive as strings; "" and "  " convert to 0 and
      // are rejected by the positivity check below.
      value = Number(candidate);
    } else {
      continue;
    }
    if (Number.isFinite(value) && value > 0) return value;
  }
  return null;
}
/**
 * Resolves one timeout from its sources.
 *
 * @param envVar - Raw value of the overriding environment variable, if set.
 * @param settingsValue - Unvalidated value taken from the settings file.
 * @param defaultMs - Value returned when neither source is usable.
 * @returns `envVar` if usable, else `settingsValue` if usable, else
 *   `defaultMs`; usability is decided by `pickPositiveNumber`.
 */
function resolveTimeoutMs(
  envVar: string | undefined,
  settingsValue: unknown,
  defaultMs: number,
): number {
  const configured = pickPositiveNumber(envVar, settingsValue);
  if (configured === null) {
    return defaultMs;
  }
  return configured;
}
// Resolved timeouts, computed on first use; null means "not resolved yet".
let cachedRequestTimeoutMs: number | null = null;
let cachedAdminTimeoutMs: number | null = null;
/**
 * Timeout, in milliseconds, for inference requests.
 *
 * Resolved on the first call from AI_SERVER_TIMEOUT_MS, then
 * `retry.provider.timeoutMs` in the settings file, then a default of
 * 300_000 ms. The result is cached for all later calls.
 *
 * @returns The request timeout in milliseconds.
 */
export function getRequestTimeoutMs(): number {
  if (cachedRequestTimeoutMs === null) {
    const fromSettings = readPiSettings().retry?.provider?.timeoutMs;
    cachedRequestTimeoutMs = resolveTimeoutMs(
      process.env.AI_SERVER_TIMEOUT_MS,
      fromSettings,
      300_000,
    );
  }
  return cachedRequestTimeoutMs;
}
/**
 * Timeout, in milliseconds, for admin calls (model load/unload/list).
 *
 * Resolved on the first call from AI_SERVER_ADMIN_TIMEOUT_MS, then
 * `retry.provider.adminTimeoutMs` in the settings file, then a default equal
 * to the inference timeout capped at 300_000 ms. The result is cached.
 *
 * Admin calls are usually quick, but a cold-cache load of a >100GB model can
 * take much longer, hence a default tied to the inference timeout.
 *
 * NOTE(review): because of the 300_000 ms cap, the default is NOT "as patient
 * as inference" once the inference timeout is configured above five minutes.
 * Confirm whether the cap is intended.
 *
 * @returns The admin timeout in milliseconds.
 */
export function getAdminTimeoutMs(): number {
  if (cachedAdminTimeoutMs === null) {
    const fromSettings = readPiSettings().retry?.provider?.adminTimeoutMs;
    // Computed unconditionally, exactly like the other two sources.
    const fallbackMs = Math.min(getRequestTimeoutMs(), 300_000);
    cachedAdminTimeoutMs = resolveTimeoutMs(
      process.env.AI_SERVER_ADMIN_TIMEOUT_MS,
      fromSettings,
      fallbackMs,
    );
  }
  return cachedAdminTimeoutMs;
}
// Locations of the three files read by loadCerts(). Each one can be
// overridden individually; otherwise it is looked up inside CERTS_DIR under
// its default file name.
const {
  AI_SERVER_CA: caPathOverride,
  AI_SERVER_CLIENT_CERT: clientCertPathOverride,
  AI_SERVER_CLIENT_KEY: clientKeyPathOverride,
} = process.env;
const CA_CERT_PATH = caPathOverride ?? path.join(CERTS_DIR, "root-ca.pem");
const CLIENT_CERT_PATH =
  clientCertPathOverride ?? path.join(CERTS_DIR, "client.pem");
const CLIENT_KEY_PATH =
  clientKeyPathOverride ?? path.join(CERTS_DIR, "client-key.pem");
/**
 * Raw contents of the three certificate/key files, as returned by
 * loadCerts().
 */
export interface CertBundle {
// Contents of the CA certificate file (AI_SERVER_CA, default root-ca.pem).
ca: Buffer;
// Contents of the client certificate file (AI_SERVER_CLIENT_CERT, default client.pem).
cert: Buffer;
// Contents of the client key file (AI_SERVER_CLIENT_KEY, default client-key.pem).
key: Buffer;
}
// Bundle from the first successful load; stays null until then.
let cachedCerts: CertBundle | null = null;
/**
 * Reads the CA certificate, client certificate and client key from disk.
 *
 * The files are read synchronously, in that order. A fully successful read is
 * cached and reused by later calls; a failed read caches nothing, so the next
 * call tries again.
 *
 * @returns The contents of the three files.
 * @throws Error when any of the files cannot be read. The message carries the
 *   underlying failure text and names the overriding environment variables.
 */
export function loadCerts(): CertBundle {
  if (cachedCerts !== null) {
    return cachedCerts;
  }
  let bundle: CertBundle;
  try {
    const ca = fs.readFileSync(CA_CERT_PATH);
    const cert = fs.readFileSync(CLIENT_CERT_PATH);
    const key = fs.readFileSync(CLIENT_KEY_PATH);
    bundle = { ca, cert, key };
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    const headline = `[ai-server] Failed to load mTLS certs from ${CERTS_DIR}: ${reason}. `;
    const hint =
      "Set AI_SERVER_CA / AI_SERVER_CLIENT_CERT / AI_SERVER_CLIENT_KEY to override.";
    throw new Error(headline + hint);
  }
  cachedCerts = bundle;
  return bundle;
}
/** Description of one model offered by the AI server. */
export interface ServerModel {
  /** Model identifier. */
  id: string;
  /** Display name. */
  name: string;
  /** Whether the model is flagged as a reasoning model. */
  reasoning: boolean;
  /** Context window size (presumably in tokens — confirm against consumers). */
  contextWindow: number;
  /** Output limit (presumably in tokens — confirm against consumers). */
  maxTokens: number;
}
// The single hard-coded entry of the fallback list below.
const fallbackQwenModel: ServerModel = {
  id: "Qwen_Qwen3.6-35B-A3B-Q8_0",
  name: "Qwen3.6-35B-A3B (AI Server, mTLS)",
  reasoning: true,
  contextWindow: 32 * 1024,
  maxTokens: 16 * 1024,
};
// Fallback list, used only when the server is unreachable at extension load
// time. In normal operation the models are discovered from the server
// (AI_SERVER_MODELS_PATH, `/v1/models` by default) via
// admin.ts::discoverModels(), so this stays in sync with server-side presets.
export const MODELS: ServerModel[] = [fallbackQwenModel];