2a8a3e8e22
Minimal shell wrapper around llama.cpp router's OpenAI-compatible API
(/v1/chat/completions), gated by the same mTLS cert as the pi extension.
Single-file, runtime deps: bash + curl + jq. Useful for scripts and agents
(Claude Code, etc.) that want to delegate generation without pulling in
a full SDK.
Features:
- --list / --status / --load <model>
- --stream <model> "..." for SSE token-stream output
- --raw <model> '...' for full OpenAI-format JSON bodies (also @file)
- --prompt-file <path> reads the prompt from disk via jq --rawfile, bypassing
  Linux's MAX_ARG_STRLEN (~128KB per argv) so prompts up to the model's
  context window work
- --temperature / --top-p / --max-tokens / --system sampling overrides
- Auto-retry with exponential backoff on transient empty/non-JSON
  responses (model-loading window). Short-circuits on structured 4xx
  errors (e.g. exceed_context_size).
- AI_CERT_DIR / AI_ENDPOINT / AI_RETRIES env overrides.
Includes scripts/AI-COMPLETE.md with install + usage docs and a row in
the top-level README's scripts table.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
168 lines
8.0 KiB
Bash
Executable File
168 lines
8.0 KiB
Bash
Executable File
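#!/usr/bin/env bash
# ai-complete — minimal CLI for the home AI server.
#
# Usage:
#   ai-complete <model> "<prompt>"                        # one-shot completion
#   echo "prompt" | ai-complete <model>                   # stdin prompt
#   ai-complete --prompt-file <path> <model>              # prompt read from a file (no argv size cap)
#   ai-complete --list                                    # list available models
#   ai-complete --load <model>                            # warm up a model (cold load can take 1-3 min)
#   ai-complete --status                                  # show currently loaded model
#   ai-complete --raw <model> '<full-json-body>'          # send full /v1/chat/completions body
#   ai-complete --raw <model> @<file>                     # same, with the body read from a file
#   ai-complete --stream <model> "<prompt>"               # streaming SSE output
#   ai-complete --stream --prompt-file <path> <model>     # streaming, prompt read from a file
#   ai-complete --temperature N --top-p N <model> "..."   # sampling overrides
#   ai-complete --max-tokens N --system "<text>" <model> "..."   # token limit / system message
#   ai-complete --help                                    # show this text
#
# Auth: uses the client cert at ~/.pi/agent/certs/{client.pem,client-key.pem}
#       Override with $AI_CERT_DIR.
#
# Environment: $AI_ENDPOINT overrides the server URL; $AI_RETRIES sets how many
#              attempts the one-shot mode makes before giving up (default 4).
#
# Output: prints just the assistant text to stdout (no JSON wrapping);
#         --raw prints the server's response body unmodified.
#         Errors go to stderr and exit non-zero.
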
set -euo pipefail

# Connection settings. Each one can be overridden from the environment:
#   AI_CERT_DIR  directory holding client.pem and client-key.pem
#   AI_ENDPOINT  base URL of the server
CERT_DIR="${AI_CERT_DIR:-$HOME/.pi/agent/certs}"
ENDPOINT="${AI_ENDPOINT:-https://ai.shahondin1624.de}"
CRT="$CERT_DIR/client.pem"
KEY="$CERT_DIR/client-key.pem"

# Preflight. --help/-h only prints the header comment of this file, so it must
# keep working on a machine where the certs or tools are not installed yet;
# every other mode needs the client cert pair, jq and curl.
if [[ "${1:-}" != "--help" && "${1:-}" != "-h" ]]; then
  [[ -f "$CRT" && -f "$KEY" ]] || { echo "missing $CRT or $KEY (set AI_CERT_DIR)" >&2; exit 1; }
  command -v jq >/dev/null || { echo "needs jq" >&2; exit 1; }
  command -v curl >/dev/null || { echo "needs curl" >&2; exit 1; }
fi

# Options shared by every request: no progress meter but still print errors
# (-sS), and present the client certificate and key.
curl_args=(-sS --cert "$CRT" --key "$KEY")

# Temp files created by any mode below; removed on every exit path.
cleanup_paths=()

# Delete the registered temp files. The length guard keeps an empty list from
# being expanded under `set -u`, which bash older than 4.4 treats as an error.
cleanup_tmp() {
  if (( ${#cleanup_paths[@]} > 0 )); then
    rm -f -- "${cleanup_paths[@]}"
  fi
}
trap cleanup_tmp EXIT

# Dispatch on the first argument. Anything that is not one of the mode flags
# falls through to the default one-shot completion.
case "${1:-}" in
  --list)
    # One model id per line.
    curl "${curl_args[@]}" "$ENDPOINT/v1/models" | jq -r '.data[].id'
    ;;

  --status)
    # Router exposes load state at /models (not /v1/models) under .data[].status.value
    out=$(curl "${curl_args[@]}" "$ENDPOINT/models" \
      | jq -r '.data[] | select(.status.value=="loaded") | .id')
    if [[ -z "$out" ]]; then echo "none"; else printf '%s\n' "$out"; fi
    ;;

  --load)
    # No dedicated load endpoint; the router auto-loads on first request.
    # Send a 1-token completion to trigger the load and wait until it's done.
    model="${2:-}"
    [[ -n "$model" ]] || { echo "usage: ai-complete --load <model>" >&2; exit 1; }
    body=$(jq -n --arg m "$model" \
      '{model:$m, messages:[{role:"user",content:"."}], max_tokens:1, stream:false}')
    resp=$(curl "${curl_args[@]}" -X POST --max-time 600 "$ENDPOINT/v1/chat/completions" \
      -H 'Content-Type: application/json' -d "$body")
    # Report success only when the reply actually is a completion; an error
    # body (or anything that is not JSON) must not be announced as "loaded".
    if ! jq -e '.choices' >/dev/null 2>&1 <<<"$resp"; then
      echo "load failed for $model:" >&2
      printf '%s\n' "$resp" >&2
      exit 1
    fi
    echo "loaded: $model"
    ;;

  --raw)
    # Usage: --raw <model> '<json-body>'   or   --raw <model> @<path>
    # The body is sent unmodified, so the model it names is the one used; the
    # <model> positional is required only to keep the argument layout uniform.
    (( $# >= 3 )) || { echo "usage: ai-complete --raw <model> '<json-body>' | @<file>" >&2; exit 1; }
    body_arg="$3"
    body_path=$(mktemp); cleanup_paths+=("$body_path")
    if [[ "$body_arg" == @* ]]; then
      body_file="${body_arg:1}"
      [[ -f "$body_file" ]] || { echo "body file not found: $body_file" >&2; exit 1; }
      cp -- "$body_file" "$body_path"
    else
      printf '%s' "$body_arg" > "$body_path"
    fi
    curl "${curl_args[@]}" -X POST "$ENDPOINT/v1/chat/completions" \
      -H 'Content-Type: application/json' --data-binary @"$body_path"
    ;;

  --stream)
    # Usage: --stream [--prompt-file <path>] <model> ["<prompt>"]
    # Prompt source: --prompt-file, else the argument after <model>, else stdin.
    shift
    prompt_path=""
    if [[ "${1:-}" == "--prompt-file" ]]; then
      [[ -f "${2:-}" ]] || { echo "prompt-file not found: ${2:-}" >&2; exit 1; }
      prompt_path="$2"; shift 2
    fi
    model="${1:-}"
    [[ -n "$model" ]] || {
      echo "usage: ai-complete --stream [--prompt-file <path>] <model> [\"prompt\"]" >&2
      exit 1
    }
    shift
    if [[ -z "$prompt_path" ]]; then
      prompt_path=$(mktemp); cleanup_paths+=("$prompt_path")
      if [[ -n "${1:-}" ]]; then
        printf '%s' "$1" > "$prompt_path"
      else
        cat > "$prompt_path"
      fi
    fi
    # Prompt and body both travel via files so neither is limited by argv size.
    body_path=$(mktemp); cleanup_paths+=("$body_path")
    jq -n --arg m "$model" --rawfile p "$prompt_path" \
      '{model:$m,messages:[{role:"user",content:$p}],stream:true}' > "$body_path"
    # Turn the SSE stream into a stream of JSON chunks: strip the "data: "
    # prefix, drop blank separator lines and the final [DONE] marker. All of it
    # happens in one unbuffered sed — a grep stage writing to a pipe would
    # block-buffer its output and hold tokens back, and grep -v exits 1 when it
    # selects no lines, which aborts the script under pipefail.
    curl "${curl_args[@]}" -N -X POST "$ENDPOINT/v1/chat/completions" \
      -H 'Content-Type: application/json' --data-binary @"$body_path" \
      | sed -u -e 's/^data: //' -e '/^$/d' -e '/^\[DONE\]/d' \
      | jq -j --unbuffered '.choices[0].delta.content // empty'
    echo
    ;;

  --help|-h)
    # Print this file from line 2 up to the first blank line, minus the "# ".
    sed -n '2,/^$/p' "$0" | sed 's/^# \{0,1\}//'
    ;;

  *)
    # Default: one-shot non-streaming completion.
    # Optional leading flags: --temperature N  --top-p N  --max-tokens N
    #                         --system TEXT    --prompt-file PATH
    opts='{}'        # JSON object of numeric overrides, merged into the body
    sys=""           # system message; empty means none is sent
    prompt_file=""   # file to read the prompt from; empty means not given
    while [[ "${1:-}" == --* ]]; do
      flag="$1"
      case "$flag" in
        --temperature|--top-p|--max-tokens|--system|--prompt-file) ;;
        *) echo "unknown flag $flag" >&2; exit 1 ;;
      esac
      (( $# >= 2 )) || { echo "flag $flag needs a value" >&2; exit 1; }
      value="$2"
      shift 2
      case "$flag" in
        # --argjson makes jq reject anything that is not valid JSON (a number).
        --temperature) opts=$(jq --argjson v "$value" '.temperature=$v' <<<"$opts") ;;
        --top-p)       opts=$(jq --argjson v "$value" '.top_p=$v' <<<"$opts") ;;
        --max-tokens)  opts=$(jq --argjson v "$value" '.max_tokens=$v' <<<"$opts") ;;
        --system)      sys="$value" ;;
        --prompt-file) prompt_file="$value" ;;
      esac
    done
    model="${1:-}"
    [[ -n "$model" ]] || { echo "usage: ai-complete <model> \"prompt\"" >&2; exit 1; }

    # Materialize the prompt to a file so it never travels through any argv
    # (argv is capped at MAX_ARG_STRLEN = 128KB per element on Linux).
    # Source priority: --prompt-file (used in place) > $2 (capped at 128KB) > stdin (uncapped).
    if [[ -n "$prompt_file" ]]; then
      [[ -f "$prompt_file" ]] || { echo "prompt-file not found: $prompt_file" >&2; exit 1; }
      prompt_path="$prompt_file"
    else
      prompt_path=$(mktemp); cleanup_paths+=("$prompt_path")
      if [[ -n "${2:-}" ]]; then
        printf '%s' "$2" > "$prompt_path"
      else
        cat > "$prompt_path"
      fi
    fi

    # Build the full request body in a single jq call — --rawfile reads the
    # prompt from disk, and the body goes to a file too so curl is not subject
    # to the argv cap either.
    body_path=$(mktemp); cleanup_paths+=("$body_path")
    jq -n --arg m "$model" --arg s "$sys" --rawfile p "$prompt_path" --argjson o "$opts" \
      'def msgs: if $s == "" then [{role:"user",content:$p}] else [{role:"system",content:$s},{role:"user",content:$p}] end;
       {model:$m, messages:msgs, stream:false} + $o' > "$body_path"

    # Retry loop: model auto-load can take 1-3 min for big models. On the first call
    # after eviction, the response can be empty / non-JSON / a 503 — wait and retry.
    attempts="${AI_RETRIES:-4}"
    [[ "$attempts" =~ ^[0-9]+$ ]] || {
      echo "AI_RETRIES must be a non-negative integer, got: $attempts" >&2
      exit 1
    }
    delay=2
    text=""
    err=""
    curl_err=""
    last_resp=""
    resp_path=$(mktemp); cleanup_paths+=("$resp_path")
    for ((i=1; i<=attempts; i++)); do
      # Start from an empty file so a request that never gets a reply cannot
      # leave the previous attempt's body behind.
      : > "$resp_path"
      # A failed request is expected while the model loads, so it must not
      # abort the script; curl's message is kept for the final report.
      curl_err=$(curl "${curl_args[@]}" -X POST --max-time 600 \
        "$ENDPOINT/v1/chat/completions" -H 'Content-Type: application/json' \
        --data-binary @"$body_path" -o "$resp_path" 2>&1) || true
      last_resp=$(cat "$resp_path" 2>/dev/null || echo "")
      text=$(jq -r '.choices[0].message.content // empty' "$resp_path" 2>/dev/null || echo "")
      if [[ -n "$text" ]]; then break; fi
      # If the response is structured JSON with an "error" key (eg 400 exceed_context_size),
      # don't retry — it's a real client-side error, not a transient load timeout.
      err=$(jq -r '.error.message // empty' "$resp_path" 2>/dev/null || echo "")
      if [[ -n "$err" ]]; then
        echo "[ai-complete] API error (no retry): $err" >&2
        break
      fi
      # Back off only when another attempt follows; sleeping after the last
      # one would just delay the failure report.
      if (( i < attempts )); then
        echo "[ai-complete] attempt $i/$attempts produced no text (likely model loading), retrying in ${delay}s…" >&2
        sleep "$delay"
        delay=$(( delay * 2 ))
      fi
    done

    if [[ -n "$text" ]]; then
      printf '%s\n' "$text"
    elif [[ -n "$err" ]]; then
      # The error line was already printed; add the full body for context.
      printf '%s\n' "$last_resp" >&2
      exit 1
    else
      echo "[ai-complete] gave up after $attempts attempts. Last response:" >&2
      printf '%s\n' "$last_resp" >&2
      if [[ -n "$curl_err" ]]; then
        echo "[ai-complete] last curl error: $curl_err" >&2
      fi
      exit 1
    fi
    ;;
esac