853cef84af
The context handler consumed the handoff marker after the first turn (and agent_end cleared all markers), so every subsequent turn fell through to the fast path and re-sent the full, ever-growing history. Across a task chain the context grew monotonically instead of resetting per task, defeating the handoff and eventually overrunning the model's context window. Track a single activeHandoffId that is never consumed and is overwritten when a later handoff supersedes it, so truncation re-applies on every turn and the context stays pinned to the carry-over prompt for the whole task. Add a pure, testable resolveHandoffContext helper and a multi-turn regression test. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
456 lines
16 KiB
TypeScript
456 lines
16 KiB
TypeScript
/**
 * Unit tests for session-handoff/state.ts (pure helpers + sentinel detection).
 *
 * node --experimental-strip-types --test tests/session-handoff.test.ts
 */
|
|
|
|
import assert from "node:assert/strict";
|
|
import { test } from "node:test";
|
|
import {
|
|
buildHandoffMessage,
|
|
buildHandoffSentinel,
|
|
extractSentinelId,
|
|
findPendingHandoff,
|
|
normalizePrompt,
|
|
normalizeReason,
|
|
resolveHandoffContext,
|
|
SENTINEL_RE,
|
|
stripSentinel,
|
|
truncateForHandoff,
|
|
} from "../session-handoff/state.ts";
|
|
|
|
// Helper for constructing message fixtures
|
|
/**
 * Builds a user-role message fixture holding a single text content part.
 *
 * @param text - Text placed in the message's only content part.
 * @returns A message object with `role: "user"`.
 */
function user(text: string): {
  role: string;
  content: Array<{ type: string; text: string }>;
} {
  return { role: "user", content: [{ type: "text", text }] };
}
|
|
/**
 * Builds an assistant-role message fixture holding a single text content part.
 *
 * @param text - Text placed in the message's only content part.
 * @returns A message object with `role: "assistant"`.
 */
function assistant(text: string): {
  role: string;
  content: Array<{ type: string; text: string }>;
} {
  return { role: "assistant", content: [{ type: "text", text }] };
}
|
|
/**
 * Builds a toolResult-role message fixture holding a single text content part.
 *
 * @param text - Text placed in the message's only content part.
 * @returns A message object with `role: "toolResult"`.
 */
function toolResult(text: string): {
  role: string;
  content: Array<{ type: string; text: string }>;
} {
  return { role: "toolResult", content: [{ type: "text", text }] };
}
|
|
|
|
// ── normalizePrompt ─────────────────────────────────────────────────────
// Expected contract exercised below: surrounding whitespace is trimmed,
// blank or nullish input becomes "", and interior whitespace is untouched.

test("normalizePrompt: trims surrounding whitespace", () => {
  assert.equal(normalizePrompt(" hello "), "hello");
  assert.equal(normalizePrompt("\n\ntask\n"), "task");
});

test("normalizePrompt: collapses blank inputs to empty string", () => {
  assert.equal(normalizePrompt(""), "");
  assert.equal(normalizePrompt(" "), "");
  assert.equal(normalizePrompt("\t\n "), "");
});

test("normalizePrompt: coerces nullish to empty string", () => {
  assert.equal(normalizePrompt(undefined), "");
  assert.equal(normalizePrompt(null), "");
});

test("normalizePrompt: preserves interior whitespace and newlines", () => {
  const multi = "line one\nline two\n\nline four";
  assert.equal(normalizePrompt(` ${multi} `), multi);
});
|
|
|
|
// ── normalizeReason ─────────────────────────────────────────────────────
// Unlike normalizePrompt, an empty result is expected to be `undefined`
// rather than "".

test("normalizeReason: trims and returns non-empty reasons", () => {
  assert.equal(normalizeReason(" context drift "), "context drift");
});

test("normalizeReason: blank/whitespace/nullish all collapse to undefined", () => {
  assert.equal(normalizeReason(""), undefined);
  assert.equal(normalizeReason(" "), undefined);
  assert.equal(normalizeReason("\n\t"), undefined);
  assert.equal(normalizeReason(undefined), undefined);
  assert.equal(normalizeReason(null), undefined);
});
|
|
|
|
// ── buildHandoffSentinel / buildHandoffMessage ──────────────────────────
// Pins the literal wire format: "<pi-handoff-resume:ID>" followed by a
// newline and the prompt.

test("buildHandoffSentinel: produces the expected literal form", () => {
  assert.equal(
    buildHandoffSentinel("abc-123"),
    "<pi-handoff-resume:abc-123>",
  );
});

test("buildHandoffMessage: prepends sentinel + newline to prompt", () => {
  assert.equal(
    buildHandoffMessage("abc-123", "next task"),
    "<pi-handoff-resume:abc-123>\nnext task",
  );
});

test("buildHandoffMessage: preserves multi-line prompts verbatim", () => {
  const prompt = "## Context\nDo X\n## Task\nDo Y";
  assert.equal(
    buildHandoffMessage("id", prompt),
    `<pi-handoff-resume:id>\n${prompt}`,
  );
});
|
|
|
|
// ── extractSentinelId ───────────────────────────────────────────────────
// The id is only expected back when the sentinel opens the text.

test("extractSentinelId: returns id when sentinel present at start", () => {
  assert.equal(
    extractSentinelId("<pi-handoff-resume:abc-123>\nthe prompt"),
    "abc-123",
  );
});

test("extractSentinelId: tolerates missing trailing newline", () => {
  assert.equal(extractSentinelId("<pi-handoff-resume:xyz>"), "xyz");
});

test("extractSentinelId: returns undefined when no sentinel", () => {
  assert.equal(extractSentinelId("just a regular prompt"), undefined);
  assert.equal(extractSentinelId(""), undefined);
});

test("extractSentinelId: only matches when sentinel is at start (anchored)", () => {
  // Mid-string match should NOT trigger — anchored regex prevents the LLM
  // or another extension from sneaking in a sentinel via prompt content.
  assert.equal(
    extractSentinelId("preface text\n<pi-handoff-resume:fake>\nrest"),
    undefined,
  );
});

test("extractSentinelId: handles UUID-shaped ids", () => {
  const uuid = "018f3a2c-7b1e-7c4d-9e8a-1234567890ab";
  assert.equal(
    extractSentinelId(`<pi-handoff-resume:${uuid}>\nprompt`),
    uuid,
  );
});
|
|
|
|
// ── stripSentinel ───────────────────────────────────────────────────────
// Only a leading sentinel (and its newline) is expected to be removed;
// everything else must come back unchanged.

test("stripSentinel: removes sentinel and trailing newline cleanly", () => {
  assert.equal(
    stripSentinel("<pi-handoff-resume:id>\nthe prompt"),
    "the prompt",
  );
});

test("stripSentinel: leaves text unchanged when no sentinel", () => {
  assert.equal(stripSentinel("just a prompt"), "just a prompt");
  assert.equal(stripSentinel(""), "");
});

test("stripSentinel: does not strip mid-string sentinel (anchored)", () => {
  const text = "preface\n<pi-handoff-resume:fake>\nrest";
  assert.equal(stripSentinel(text), text);
});
|
|
|
|
// ── SENTINEL_RE direct ──────────────────────────────────────────────────
// Exercises the exported regex itself rather than the helpers built on it.

test("SENTINEL_RE: matches both ASCII and UUID-shaped ids", () => {
  assert.match("<pi-handoff-resume:simple>\n", SENTINEL_RE);
  assert.match(
    "<pi-handoff-resume:018f3a2c-7b1e-7c4d-9e8a-1234567890ab>\n",
    SENTINEL_RE,
  );
});

test("SENTINEL_RE: does not match when prefix differs", () => {
  assert.doesNotMatch("<pi-handoff:id>\n", SENTINEL_RE);
  assert.doesNotMatch("pi-handoff-resume:id\n", SENTINEL_RE);
});
|
|
|
|
// ── findPendingHandoff ──────────────────────────────────────────────────
// Expected result shape is `{ index, id }` for the latest user message whose
// leading sentinel id is in the pending set, or `undefined` when none match.

test("findPendingHandoff: returns undefined when pendingIds is empty", () => {
  const messages = [user(buildHandoffMessage("id-1", "task"))];
  assert.equal(findPendingHandoff(messages, new Set()), undefined);
});

test("findPendingHandoff: returns undefined when no message has a sentinel", () => {
  const messages = [user("regular prompt"), assistant("response")];
  assert.equal(findPendingHandoff(messages, new Set(["id-1"])), undefined);
});

test("findPendingHandoff: finds a matching sentinel", () => {
  const messages = [
    user("first"),
    assistant("response"),
    user(buildHandoffMessage("id-1", "next task")),
  ];
  assert.deepEqual(findPendingHandoff(messages, new Set(["id-1"])), {
    index: 2,
    id: "id-1",
  });
});

test("findPendingHandoff: ignores sentinel whose id is not pending", () => {
  // Past handoff sentinels persist in the session log but are inert when
  // their id is not the active one (e.g. a handoff from before a reload).
  const messages = [user(buildHandoffMessage("inactive", "old task"))];
  assert.equal(findPendingHandoff(messages, new Set(["different"])), undefined);
});

test("findPendingHandoff: picks the LATEST pending sentinel when several exist", () => {
  const messages = [
    user(buildHandoffMessage("id-1", "first")),
    assistant("ack"),
    user(buildHandoffMessage("id-2", "second")),
  ];
  const pending = new Set(["id-1", "id-2"]);
  assert.deepEqual(findPendingHandoff(messages, pending), {
    index: 2,
    id: "id-2",
  });
});

test("findPendingHandoff: skips non-user messages even with sentinel-like text", () => {
  const messages = [
    assistant(buildHandoffMessage("id-1", "shouldn't match")),
    toolResult(buildHandoffMessage("id-1", "also shouldn't match")),
  ];
  assert.equal(findPendingHandoff(messages, new Set(["id-1"])), undefined);
});

test("findPendingHandoff: skips non-text content (e.g. images)", () => {
  const messages = [
    {
      role: "user",
      content: [
        { type: "image", data: "..." },
        { type: "text", text: "no sentinel here" },
      ],
    },
  ];
  assert.equal(findPendingHandoff(messages, new Set(["id-1"])), undefined);
});

test("findPendingHandoff: only considers anchored sentinels (not mid-text)", () => {
  const messages = [
    user(`some preface\n<pi-handoff-resume:id-1>\nrest of message`),
  ];
  assert.equal(findPendingHandoff(messages, new Set(["id-1"])), undefined);
});
|
|
|
|
// ── truncateForHandoff ──────────────────────────────────────────────────
// Expected result shape is `{ matchedId, messages }`, where `messages` starts
// at the handoff message with its sentinel stripped; `undefined` when no
// pending sentinel is found. Neither input is expected to be mutated.

test("truncateForHandoff: returns undefined when no pending sentinel matches", () => {
  const messages = [user("regular"), assistant("response")];
  assert.equal(truncateForHandoff(messages, new Set(["id-1"])), undefined);
});

test("truncateForHandoff: slices to the handoff message and strips the sentinel", () => {
  const messages = [
    user("task one"),
    assistant("doing task one"),
    toolResult("done"),
    user(buildHandoffMessage("id-1", "task two: do this fresh")),
  ];
  const result = truncateForHandoff(messages, new Set(["id-1"]));
  assert.ok(result, "expected truncation result");
  assert.equal(result.matchedId, "id-1");
  assert.equal(result.messages.length, 1);
  assert.equal(result.messages[0].role, "user");
  const content = result.messages[0].content as Array<{
    type: string;
    text: string;
  }>;
  assert.equal(content.length, 1);
  assert.equal(content[0].text, "task two: do this fresh");
});

test("truncateForHandoff: keeps subsequent messages after the handoff verbatim", () => {
  // Unlikely in practice (followUp adds at end) but a sane invariant.
  const messages = [
    user("old task"),
    user(buildHandoffMessage("id-1", "new task")),
    assistant("started"),
  ];
  const result = truncateForHandoff(messages, new Set(["id-1"]));
  assert.ok(result);
  assert.equal(result.messages.length, 2);
  assert.equal(
    (result.messages[0].content as Array<{ text: string }>)[0].text,
    "new task",
  );
  assert.equal(result.messages[1].role, "assistant");
});

test("truncateForHandoff: does NOT mutate the pendingIds set (caller's job)", () => {
  const pending = new Set(["id-1"]);
  const messages = [user(buildHandoffMessage("id-1", "task"))];
  truncateForHandoff(messages, pending);
  assert.equal(pending.has("id-1"), true, "set should still contain id");
});

test("truncateForHandoff: does NOT mutate the input messages array", () => {
  const messages = [
    user("first"),
    user(buildHandoffMessage("id-1", "second")),
  ];
  // Serialized snapshot catches both array-level and nested content changes.
  const before = JSON.stringify(messages);
  truncateForHandoff(messages, new Set(["id-1"]));
  assert.equal(
    JSON.stringify(messages),
    before,
    "input messages should be untouched",
  );
});

test("truncateForHandoff: preserves non-text content alongside text in the handoff message", () => {
  const messages = [
    {
      role: "user",
      content: [
        {
          type: "text",
          text: buildHandoffMessage("id-1", "next task"),
        },
        { type: "image", data: "abc" },
      ],
    },
  ];
  const result = truncateForHandoff(messages, new Set(["id-1"]));
  assert.ok(result);
  const content = result.messages[0].content as Array<{
    type: string;
    text?: string;
    data?: string;
  }>;
  assert.equal(content.length, 2);
  assert.equal(content[0].text, "next task");
  assert.equal(content[1].type, "image");
  assert.equal(content[1].data, "abc");
});

test("truncateForHandoff: picks the LATEST handoff when multiple are pending", () => {
  const messages = [
    user(buildHandoffMessage("id-1", "first task")),
    user(buildHandoffMessage("id-2", "second task")),
  ];
  const result = truncateForHandoff(messages, new Set(["id-1", "id-2"]));
  assert.ok(result);
  assert.equal(result.matchedId, "id-2");
  assert.equal(result.messages.length, 1);
  assert.equal(
    (result.messages[0].content as Array<{ text: string }>)[0].text,
    "second task",
  );
});

test("truncateForHandoff: keeps matching across turns while the id stays pending", () => {
  // Regression guard: the id must NOT be removed after the first match. As
  // long as it stays in the pending set, every call keeps slicing to the
  // sentinel — that is what pins the context to the carry-over prompt for the
  // whole task instead of just one turn.
  const pending = new Set<string>();
  const messages = [user("task 1"), assistant("response")];

  // No pending markers: no truncation.
  assert.equal(truncateForHandoff(messages, pending), undefined);

  // Handoff fires; its followUp drains into the log.
  pending.add("id-1");
  messages.push(user(buildHandoffMessage("id-1", "task 2")));

  // Every subsequent turn keeps truncating to the same sentinel.
  for (let turn = 0; turn < 3; turn++) {
    const result = truncateForHandoff(messages, pending);
    assert.ok(result, `turn ${turn}: expected truncation`);
    assert.equal(result.matchedId, "id-1");
    assert.equal(result.messages[0].role, "user");
    assert.equal(
      (result.messages[0].content as Array<{ text: string }>)[0].text,
      "task 2",
    );
    messages.push(assistant(`step ${turn}`));
  }
});
|
|
|
|
// ── resolveHandoffContext (the per-turn decision used by index.ts) ──────────
// Takes the message log plus the single active handoff id (or null) and is
// expected to return the truncated message array, or `undefined` when the
// context should be left untouched.

test("resolveHandoffContext: returns undefined when no handoff is active", () => {
  const messages = [user(buildHandoffMessage("id-1", "task")), assistant("x")];
  assert.equal(resolveHandoffContext(messages, null), undefined);
});

test("resolveHandoffContext: slices to the active handoff and strips the sentinel", () => {
  const messages = [
    user("old task"),
    assistant("did it"),
    user(buildHandoffMessage("id-1", "new task")),
  ];
  const result = resolveHandoffContext(messages, "id-1");
  assert.ok(result);
  assert.equal(result.length, 1);
  assert.equal(
    (result[0].content as Array<{ text: string }>)[0].text,
    "new task",
  );
});

test("resolveHandoffContext: returns undefined when the active sentinel is not in the log", () => {
  // Handoff staged but its followUp has not drained yet (or was aborted):
  // the sentinel isn't a message, so the context is left untouched.
  const messages = [user("ordinary"), assistant("reply")];
  assert.equal(resolveHandoffContext(messages, "id-1"), undefined);
});

test("resolveHandoffContext: PERSISTS truncation across every turn (regression)", () => {
  // The original bug consumed the handoff id after one turn, so the second
  // turn sent the full pre-handoff history again (context ballooned across a
  // task chain instead of resetting). Here we simulate several turns of a task
  // growing after a handoff and assert the context stays pinned to the
  // carry-over prompt on EVERY turn.
  const history = [user("task 11"), assistant("done 11"), toolResult("ok")];
  const messages = [
    ...history,
    user(buildHandoffMessage("id-1", "task 12")),
  ];
  const activeId = "id-1";

  for (let turn = 0; turn < 5; turn++) {
    const result = resolveHandoffContext(messages, activeId);
    assert.ok(result, `turn ${turn}: expected truncation`);
    // First visible message is always the (stripped) carry-over prompt.
    assert.equal(result[0].role, "user");
    assert.equal(
      (result[0].content as Array<{ text: string }>)[0].text.startsWith(
        "task 12",
      ),
      true,
      `turn ${turn}: context not pinned to carry-over prompt`,
    );
    // Stale pre-handoff history must never leak back in.
    assert.equal(
      result.some((m) =>
        (m.content as Array<{ text?: string }>).some(
          (c) => c.text === "task 11",
        ),
      ),
      false,
      `turn ${turn}: stale pre-handoff history leaked back in`,
    );
    // The task does more work; the log grows.
    messages.push(assistant(`work step ${turn}`));
    messages.push(toolResult(`result ${turn}`));
  }
});

test("resolveHandoffContext: a later handoff supersedes the earlier one", () => {
  const messages = [
    user(buildHandoffMessage("id-1", "first task")),
    assistant("worked on first"),
    toolResult("done"),
    user(buildHandoffMessage("id-2", "second task")),
  ];
  // index.ts overwrites activeHandoffId with the newest id on each handoff.
  const result = resolveHandoffContext(messages, "id-2");
  assert.ok(result);
  assert.equal(result.length, 1);
  assert.equal(
    (result[0].content as Array<{ text: string }>)[0].text,
    "second task",
  );
});
|