f7af660727
Endpoint rewrites:
- GET /v1/models + /running → merged listModels() with running flag
- POST /models/load → GET /upstream/<id>/health (warm load)
- POST /models/unload → POST /api/models/unload/<id> (no body)
- Added POST /api/models/unload for unloadAll()
Config migration:
- Preset path: ~/.llama-models.ini → ~/.config/llama-swap/config.yaml
- Service unit: llama-server.service → llama-swap.service
- setPresetKey() rewritten from INI awk to YAML-aware awk for
editing --ctx-size/--temp/--n-gpu-layers in cmd: blocks
Per-model ctx-size (fixes 0/33k bug):
- parseCtxMapFromYaml(): walks config.yaml, extracts --ctx-size N per
model block → Map<id, ctxSize>
- extractCtxFromRunningCmd(): parses --ctx-size from /running cmd string
- discoverModels(): Promise.all(listModels, listRunning, readPreset),
ctx priority: running cmd → yaml → 32768 fallback
- Removed broken extractCtxSize stub and dangling imports
Tests: 14 passing (parseCtxMapFromYaml ×5, extractCtxFromRunningCmd ×3,
isShardArtefact ×3, isReasoningModel ×3)
README: full rewrite covering llama-swap architecture, YAML config format,
new endpoints, troubleshooting table updated.
179 lines
5.7 KiB
TypeScript
179 lines
5.7 KiB
TypeScript
import * as fs from "node:fs";
|
|
import * as path from "node:path";
|
|
|
|
// Home directory taken from the environment; empty string when HOME is unset.
const HOME = process.env.HOME ?? "";

// Directory the default mTLS certificate paths are built from.
// AI_SERVER_CERTS_DIR, when set, replaces the default location under HOME.
const CERTS_DIR =
  process.env.AI_SERVER_CERTS_DIR ?? path.join(HOME, ".pi/agent/certs");
|
|
|
|
/** Base URL of the AI server; the AI_SERVER_URL environment variable overrides it. */
export const AI_SERVER_URL =
  process.env.AI_SERVER_URL ?? "https://ai.shahondin1624.de";

/** `/v1` API root, derived from {@link AI_SERVER_URL}. */
export const AI_SERVER_API_BASE = AI_SERVER_URL + "/v1";

/** Request path of the chat-completions endpoint (relative to the server URL). */
export const AI_SERVER_CHAT_PATH = "/v1/chat/completions";
|
|
|
|
/**
 * SSH target (`user@host`) for admin operations such as preset edits and
 * systemctl calls. Uses key auth. Override with AI_SERVER_SSH_HOST.
 */
export const AI_SERVER_SSH_HOST =
  process.env.AI_SERVER_SSH_HOST ?? "ai-server@192.168.2.3";
|
|
|
|
// llama-swap endpoint paths. Each default can be replaced through the
// environment variable that shares the constant's name.

/** Path of the model-listing endpoint. */
export const AI_SERVER_MODELS_PATH =
  process.env.AI_SERVER_MODELS_PATH ?? "/v1/models";

/** Path of the endpoint reporting currently running models. */
export const AI_SERVER_RUNNING_PATH =
  process.env.AI_SERVER_RUNNING_PATH ?? "/running";

/** Path of the endpoint that unloads every model at once. */
export const AI_SERVER_UNLOAD_ALL_PATH =
  process.env.AI_SERVER_UNLOAD_ALL_PATH ?? "/api/models/unload";
|
|
/**
 * Builds the path used to unload a single model.
 *
 * @param id - Model id; URI-component-encoded before being placed in the path.
 * @returns `/api/models/unload/<encoded id>`, or the value of the
 *   AI_SERVER_UNLOAD_PATH environment variable when that is set.
 *
 * NOTE(review): the environment override is returned verbatim, so `id` is
 * ignored whenever it is set — confirm this is intended.
 */
export const AI_SERVER_UNLOAD_PATH = (id: string): string => {
  const override = process.env.AI_SERVER_UNLOAD_PATH;
  if (override != null) {
    return override;
  }
  return "/api/models/unload/" + encodeURIComponent(id);
};
|
|
/**
 * Builds the per-model upstream health path.
 *
 * @param id - Model id; URI-component-encoded before being placed in the path.
 * @returns `/upstream/<encoded id>/health`, or the value of the
 *   AI_SERVER_UPSTREAM_HEALTH_PATH environment variable when that is set.
 *
 * NOTE(review): the environment override is returned verbatim, so `id` is
 * ignored whenever it is set — confirm this is intended.
 */
export const AI_SERVER_UPSTREAM_HEALTH_PATH = (id: string): string => {
  const override = process.env.AI_SERVER_UPSTREAM_HEALTH_PATH;
  if (override != null) {
    return override;
  }
  return "/upstream/" + encodeURIComponent(id) + "/health";
};
|
|
|
|
/**
 * llama-swap config file (YAML, replaces the old INI preset).
 * The default keeps a literal leading `~`; nothing here expands it.
 * Override with AI_SERVER_PRESET_PATH.
 */
export const AI_SERVER_PRESET_PATH =
  process.env.AI_SERVER_PRESET_PATH ?? "~/.config/llama-swap/config.yaml";

/** systemd service unit for llama-swap. Override with AI_SERVER_SERVICE_UNIT. */
export const AI_SERVER_SERVICE_UNIT =
  process.env.AI_SERVER_SERVICE_UNIT ?? "llama-swap.service";
|
|
|
|
/**
 * Api id used when registering streamSimple. It is deliberately distinct so
 * the registration does NOT overwrite the built-in openai-completions
 * provider (the api-registry keys by api name).
 */
export const AI_SERVER_API_ID = "ai-server-mtls";

/** Provider id for this server. */
export const AI_SERVER_PROVIDER_ID = "ai-server";
|
|
|
|
// Timeouts are resolved in this priority order:
//   1. <ENV_VAR>                   — explicit override (tests, one-offs)
//   2. ~/.pi/agent/settings.json   — kept in sync with pi-mono so inner
//                                    timeouts never give up before the
//                                    outer wrapper does
//   3. <DEFAULT_MS>                — sensible fallback
//
// Location of the settings file used for step 2; PI_SETTINGS_PATH overrides it.
const PI_SETTINGS_PATH =
  process.env.PI_SETTINGS_PATH ?? path.join(HOME, ".pi/agent/settings.json");
|
|
|
|
/**
 * The slice of the pi settings file that this module looks at.
 * Both timeout fields are typed `unknown` because the file content is not
 * validated when it is parsed; consumers must check the values themselves.
 */
interface PiSettings {
  retry?: {
    provider?: {
      /** Request timeout in milliseconds, if configured. */
      timeoutMs?: unknown;
      /** Admin-call timeout in milliseconds, if configured. */
      adminTimeoutMs?: unknown;
    };
  };
}
|
|
|
|
// Memo state for readPiSettings(): the parsed settings (null when the file
// could not be used) and a flag recording that a read was already attempted.
let cachedSettings: PiSettings | null = null;
let cachedSettingsLoaded = false;

/**
 * Returns the parsed content of PI_SETTINGS_PATH, reading the file at most
 * once per process.
 *
 * The file is read synchronously and parsed as JSON; the result is cast to
 * PiSettings without validation. Any failure (missing, unreadable or
 * unparseable file) is swallowed on purpose and yields an empty object, as
 * does a file whose JSON content is `null`.
 *
 * @returns The cached settings, or `{}` when none are available.
 */
function readPiSettings(): PiSettings {
  if (!cachedSettingsLoaded) {
    // Mark as attempted up front so a failing read is not retried.
    cachedSettingsLoaded = true;
    let parsed: PiSettings | null;
    try {
      const text = fs.readFileSync(PI_SETTINGS_PATH, "utf-8");
      parsed = JSON.parse(text) as PiSettings;
    } catch {
      // settings missing / unreadable / unparseable — treat as absent
      parsed = null;
    }
    cachedSettings = parsed;
  }
  return cachedSettings ?? {};
}
|
|
|
|
/**
 * Returns the first candidate that represents a finite number greater than
 * zero, or `null` when none does.
 *
 * Only two kinds of candidate are considered:
 *  - `number` values, used as-is;
 *  - `string` values, converted with `Number()` (this is how environment
 *    variables arrive).
 *
 * Everything else (`undefined`, `null`, booleans, arrays, objects, ...) is
 * skipped. Blanket `Number()` coercion would otherwise turn a stray `true`
 * in the settings file into `1` and `[5]` into `5`, yielding a bogus value.
 *
 * @param candidates - Values to inspect, in priority order.
 * @returns The first usable positive number, or `null`.
 */
function pickPositiveNumber(...candidates: unknown[]): number | null {
  for (const candidate of candidates) {
    let value: number;
    if (typeof candidate === "number") {
      value = candidate;
    } else if (typeof candidate === "string") {
      // Empty / whitespace-only strings convert to 0 and are rejected below.
      value = Number(candidate);
    } else {
      continue;
    }
    if (Number.isFinite(value) && value > 0) return value;
  }
  return null;
}
|
|
|
|
/**
 * Picks a timeout following the env → settings → default priority order.
 *
 * @param envVar - Raw value of the overriding environment variable, if any.
 * @param settingsValue - Value read from the settings file, if any.
 * @param defaultMs - Timeout used when neither source gives a usable value.
 * @returns The first value accepted by pickPositiveNumber(), else `defaultMs`.
 */
function resolveTimeoutMs(
  envVar: string | undefined,
  settingsValue: unknown,
  defaultMs: number,
): number {
  const configured = pickPositiveNumber(envVar, settingsValue);
  if (configured === null) {
    return defaultMs;
  }
  return configured;
}
|
|
|
|
// Memoised results of getRequestTimeoutMs() and getAdminTimeoutMs();
// each stays null until the corresponding getter has run once.
let cachedRequestTimeoutMs: number | null = null;
let cachedAdminTimeoutMs: number | null = null;
|
|
|
|
/**
 * Timeout for requests, in milliseconds.
 *
 * Resolved once and then served from cache. Sources, in priority order:
 * the AI_SERVER_TIMEOUT_MS environment variable, `retry.provider.timeoutMs`
 * from the pi settings file, then a default of 300 000 ms.
 *
 * @returns The resolved request timeout.
 */
export function getRequestTimeoutMs(): number {
  if (cachedRequestTimeoutMs === null) {
    const fromSettings = readPiSettings().retry?.provider?.timeoutMs;
    cachedRequestTimeoutMs = resolveTimeoutMs(
      process.env.AI_SERVER_TIMEOUT_MS,
      fromSettings,
      300_000,
    );
  }
  return cachedRequestTimeoutMs;
}
|
|
|
|
/**
 * Timeout for admin calls (model load/unload/list), in milliseconds.
 *
 * Resolved once and then served from cache. Sources, in priority order:
 * the AI_SERVER_ADMIN_TIMEOUT_MS environment variable,
 * `retry.provider.adminTimeoutMs` from the pi settings file, then a fallback
 * of the request timeout capped at 300 000 ms.
 *
 * Admin calls are usually quick, but a cold-cache load of a >100GB model can
 * take much longer, hence the generous fallback.
 *
 * NOTE(review): the earlier comment described the fallback as simply "the
 * inference timeout", but the `Math.min` means a request timeout above
 * 300 000 ms is not inherited — confirm which behaviour is wanted.
 *
 * @returns The resolved admin timeout.
 */
export function getAdminTimeoutMs(): number {
  if (cachedAdminTimeoutMs === null) {
    const providerSettings = readPiSettings().retry?.provider;
    // Request timeout, but never more than five minutes.
    const fallbackMs = Math.min(getRequestTimeoutMs(), 300_000);
    cachedAdminTimeoutMs = resolveTimeoutMs(
      process.env.AI_SERVER_ADMIN_TIMEOUT_MS,
      providerSettings?.adminTimeoutMs,
      fallbackMs,
    );
  }
  return cachedAdminTimeoutMs;
}
|
|
|
|
// Locations of the three mTLS files. Each defaults to a file inside
// CERTS_DIR and can be pointed elsewhere through its own environment variable.

// Root CA certificate (AI_SERVER_CA).
const CA_CERT_PATH =
  process.env.AI_SERVER_CA ?? path.join(CERTS_DIR, "root-ca.pem");

// Client certificate (AI_SERVER_CLIENT_CERT).
const CLIENT_CERT_PATH =
  process.env.AI_SERVER_CLIENT_CERT ?? path.join(CERTS_DIR, "client.pem");

// Client private key (AI_SERVER_CLIENT_KEY).
const CLIENT_KEY_PATH =
  process.env.AI_SERVER_CLIENT_KEY ?? path.join(CERTS_DIR, "client-key.pem");
|
|
|
|
/** Raw contents of the files needed for an mTLS connection. */
export interface CertBundle {
  /** CA certificate file contents. */
  ca: Buffer;
  /** Client certificate file contents. */
  cert: Buffer;
  /** Client private key file contents. */
  key: Buffer;
}
|
|
|
|
// Bundle produced by the first successful loadCerts() call; null until then.
let cachedCerts: CertBundle | null = null;

/**
 * Loads the mTLS CA certificate, client certificate and client key.
 *
 * The three files are read synchronously on the first successful call and
 * the resulting bundle is cached. A failed attempt caches nothing, so a
 * later call reads the files again.
 *
 * @returns The certificate bundle.
 * @throws Error when any of the files cannot be read; the message carries
 *   the underlying error text and names the overriding environment variables.
 */
export function loadCerts(): CertBundle {
  if (cachedCerts !== null) {
    return cachedCerts;
  }

  let bundle: CertBundle;
  try {
    bundle = {
      ca: fs.readFileSync(CA_CERT_PATH),
      cert: fs.readFileSync(CLIENT_CERT_PATH),
      key: fs.readFileSync(CLIENT_KEY_PATH),
    };
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(
      `[ai-server] Failed to load mTLS certs from ${CERTS_DIR}: ${reason}. ` +
        "Set AI_SERVER_CA / AI_SERVER_CLIENT_CERT / AI_SERVER_CLIENT_KEY to override.",
    );
  }

  cachedCerts = bundle;
  return bundle;
}
|
|
|
|
/** Description of one model offered by the AI server. */
export interface ServerModel {
  /** Model identifier. */
  id: string;
  /** Human-readable label. */
  name: string;
  /** Whether the model is treated as a reasoning model. */
  reasoning: boolean;
  /** Context window size (presumably in tokens — confirm against callers). */
  contextWindow: number;
  /** Maximum output size (presumably in tokens — confirm against callers). */
  maxTokens: number;
}
|
|
|
|
/**
 * Fallback model list, used only when the router is unreachable at extension
 * load time. In normal operation, models are discovered from the server's
 * model listing (AI_SERVER_MODELS_PATH, `/v1/models` by default) via
 * admin.ts::discoverModels(), so this stays in sync with server-side presets.
 */
export const MODELS: ServerModel[] = [
  {
    id: "Qwen_Qwen3.6-35B-A3B-Q8_0",
    name: "Qwen3.6-35B-A3B (AI Server, mTLS)",
    reasoning: true,
    contextWindow: 32_768,
    maxTokens: 16_384,
  },
];
|