Files
pi-extensions/tests/session-handoff.test.ts
T
shahondin1624 853cef84af fix session-handoff truncation persisting only one turn
The context handler consumed the handoff marker after the first turn (and
agent_end cleared all markers), so every subsequent turn fell through to the
fast path and re-sent the full, ever-growing history. Across a task chain the
context grew monotonically instead of resetting per task, defeating the handoff
and eventually overrunning the model's context window.

Track a single activeHandoffId that is never consumed and is overwritten when a
later handoff supersedes it, so truncation re-applies on every turn and the
context stays pinned to the carry-over prompt for the whole task. Add a pure,
testable resolveHandoffContext helper and a multi-turn regression test.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-28 19:26:55 +02:00

456 lines
16 KiB
TypeScript

/**
* Unit tests for session-handoff/state.ts (pure helpers + sentinel detection).
*
* node --experimental-strip-types --test tests/session-handoff.test.ts
*/
import assert from "node:assert/strict";
import { test } from "node:test";
import {
buildHandoffMessage,
buildHandoffSentinel,
extractSentinelId,
findPendingHandoff,
normalizePrompt,
normalizeReason,
resolveHandoffContext,
SENTINEL_RE,
stripSentinel,
truncateForHandoff,
} from "../session-handoff/state.ts";
// Message fixture builders shared by the tests below.
/** Builds a user-role message whose content is a single text part. */
function user(text: string) {
  const part = { type: "text", text };
  return { role: "user", content: [part] };
}
/** Builds an assistant-role message whose content is a single text part. */
function assistant(text: string) {
  const content = [{ type: "text", text }];
  return { role: "assistant", content };
}
/** Builds a toolResult-role message whose content is a single text part. */
function toolResult(text: string) {
  const message = {
    role: "toolResult",
    content: [{ type: "text", text }],
  };
  return message;
}
// ── normalizePrompt ─────────────────────────────────────────────────────
test("normalizePrompt: trims surrounding whitespace", () => {
  // [raw input, expected normalized output]
  const cases: Array<[string, string]> = [
    [" hello ", "hello"],
    ["\n\ntask\n", "task"],
  ];
  for (const [raw, expected] of cases) {
    assert.equal(normalizePrompt(raw), expected);
  }
});
test("normalizePrompt: collapses blank inputs to empty string", () => {
  const blanks = ["", " ", "\t\n "];
  for (const blank of blanks) {
    assert.equal(normalizePrompt(blank), "");
  }
});
test("normalizePrompt: coerces nullish to empty string", () => {
  for (const missing of [undefined, null]) {
    assert.equal(normalizePrompt(missing), "");
  }
});
test("normalizePrompt: preserves interior whitespace and newlines", () => {
  // Only the padding around the prompt should go; the inner layout stays.
  const inner = "line one\nline two\n\nline four";
  const padded = " " + inner + " ";
  assert.equal(normalizePrompt(padded), inner);
});
// ── normalizeReason ─────────────────────────────────────────────────────
test("normalizeReason: trims and returns non-empty reasons", () => {
  const reason = normalizeReason(" context drift ");
  assert.equal(reason, "context drift");
});
test("normalizeReason: blank/whitespace/nullish all collapse to undefined", () => {
  // Every "no reason given" spelling must map to the same undefined result.
  const emptyInputs = ["", " ", "\n\t", undefined, null];
  for (const input of emptyInputs) {
    assert.equal(normalizeReason(input), undefined);
  }
});
// ── buildHandoffSentinel / buildHandoffMessage ──────────────────────────
test("buildHandoffSentinel: produces the expected literal form", () => {
  const sentinel = buildHandoffSentinel("abc-123");
  assert.equal(sentinel, "<pi-handoff-resume:abc-123>");
});
test("buildHandoffMessage: prepends sentinel + newline to prompt", () => {
  const message = buildHandoffMessage("abc-123", "next task");
  assert.equal(message, "<pi-handoff-resume:abc-123>\nnext task");
});
test("buildHandoffMessage: preserves multi-line prompts verbatim", () => {
  // A markdown-style prompt with several lines must come through untouched.
  const body = ["## Context", "Do X", "## Task", "Do Y"].join("\n");
  const expected = "<pi-handoff-resume:id>\n" + body;
  assert.equal(buildHandoffMessage("id", body), expected);
});
// ── extractSentinelId ───────────────────────────────────────────────────
test("extractSentinelId: returns id when sentinel present at start", () => {
  const id = extractSentinelId("<pi-handoff-resume:abc-123>\nthe prompt");
  assert.equal(id, "abc-123");
});
test("extractSentinelId: tolerates missing trailing newline", () => {
  const id = extractSentinelId("<pi-handoff-resume:xyz>");
  assert.equal(id, "xyz");
});
test("extractSentinelId: returns undefined when no sentinel", () => {
  for (const plain of ["just a regular prompt", ""]) {
    assert.equal(extractSentinelId(plain), undefined);
  }
});
test("extractSentinelId: only matches when sentinel is at start (anchored)", () => {
  // A sentinel buried in the middle of the text must be ignored, so neither
  // the LLM nor another extension can inject one through prompt content.
  const buried = "preface text\n<pi-handoff-resume:fake>\nrest";
  assert.equal(extractSentinelId(buried), undefined);
});
test("extractSentinelId: handles UUID-shaped ids", () => {
  const id = "018f3a2c-7b1e-7c4d-9e8a-1234567890ab";
  const message = "<pi-handoff-resume:" + id + ">\nprompt";
  assert.equal(extractSentinelId(message), id);
});
// ── stripSentinel ───────────────────────────────────────────────────────
test("stripSentinel: removes sentinel and trailing newline cleanly", () => {
  const stripped = stripSentinel("<pi-handoff-resume:id>\nthe prompt");
  assert.equal(stripped, "the prompt");
});
test("stripSentinel: leaves text unchanged when no sentinel", () => {
  // Inputs without a sentinel must round-trip exactly.
  for (const plain of ["just a prompt", ""]) {
    assert.equal(stripSentinel(plain), plain);
  }
});
test("stripSentinel: does not strip mid-string sentinel (anchored)", () => {
  const buried = "preface\n<pi-handoff-resume:fake>\nrest";
  const stripped = stripSentinel(buried);
  assert.equal(stripped, buried);
});
// ── SENTINEL_RE direct ──────────────────────────────────────────────────
test("SENTINEL_RE: matches both ASCII and UUID-shaped ids", () => {
  const accepted = [
    "<pi-handoff-resume:simple>\n",
    "<pi-handoff-resume:018f3a2c-7b1e-7c4d-9e8a-1234567890ab>\n",
  ];
  for (const candidate of accepted) {
    assert.match(candidate, SENTINEL_RE);
  }
});
test("SENTINEL_RE: does not match when prefix differs", () => {
  // First a different tag name, then the right name without angle brackets.
  const rejected = ["<pi-handoff:id>\n", "pi-handoff-resume:id\n"];
  for (const candidate of rejected) {
    assert.doesNotMatch(candidate, SENTINEL_RE);
  }
});
// ── findPendingHandoff ──────────────────────────────────────────────────
test("findPendingHandoff: returns undefined when pendingIds is empty", () => {
  const log = [user(buildHandoffMessage("id-1", "task"))];
  const nothingPending = new Set<string>();
  assert.equal(findPendingHandoff(log, nothingPending), undefined);
});
test("findPendingHandoff: returns undefined when no message has a sentinel", () => {
  const log = [user("regular prompt"), assistant("response")];
  const pending = new Set(["id-1"]);
  assert.equal(findPendingHandoff(log, pending), undefined);
});
test("findPendingHandoff: finds a matching sentinel", () => {
  const handoff = user(buildHandoffMessage("id-1", "next task"));
  const log = [user("first"), assistant("response"), handoff];
  const found = findPendingHandoff(log, new Set(["id-1"]));
  assert.deepEqual(found, { index: 2, id: "id-1" });
});
test("findPendingHandoff: ignores sentinel whose id is not pending", () => {
  // Sentinels from earlier handoffs remain in the session log; they must be
  // inert unless their id is pending (e.g. a handoff from before a reload).
  const log = [user(buildHandoffMessage("inactive", "old task"))];
  const pending = new Set(["different"]);
  assert.equal(findPendingHandoff(log, pending), undefined);
});
test("findPendingHandoff: picks the LATEST pending sentinel when several exist", () => {
  const older = user(buildHandoffMessage("id-1", "first"));
  const newer = user(buildHandoffMessage("id-2", "second"));
  const log = [older, assistant("ack"), newer];
  const found = findPendingHandoff(log, new Set(["id-1", "id-2"]));
  assert.deepEqual(found, { index: 2, id: "id-2" });
});
test("findPendingHandoff: skips non-user messages even with sentinel-like text", () => {
  const log = [
    assistant(buildHandoffMessage("id-1", "shouldn't match")),
    toolResult(buildHandoffMessage("id-1", "also shouldn't match")),
  ];
  const pending = new Set(["id-1"]);
  assert.equal(findPendingHandoff(log, pending), undefined);
});
test("findPendingHandoff: skips non-text content (e.g. images)", () => {
  // A user message that mixes an image part with sentinel-free text.
  const imagePart = { type: "image", data: "..." };
  const textPart = { type: "text", text: "no sentinel here" };
  const log = [{ role: "user", content: [imagePart, textPart] }];
  assert.equal(findPendingHandoff(log, new Set(["id-1"])), undefined);
});
test("findPendingHandoff: only considers anchored sentinels (not mid-text)", () => {
  const buried = `some preface\n<pi-handoff-resume:id-1>\nrest of message`;
  const log = [user(buried)];
  assert.equal(findPendingHandoff(log, new Set(["id-1"])), undefined);
});
// ── truncateForHandoff ──────────────────────────────────────────────────
test("truncateForHandoff: returns undefined when no pending sentinel matches", () => {
  const log = [user("regular"), assistant("response")];
  const pending = new Set(["id-1"]);
  assert.equal(truncateForHandoff(log, pending), undefined);
});
test("truncateForHandoff: slices to the handoff message and strips the sentinel", () => {
  const handoff = user(buildHandoffMessage("id-1", "task two: do this fresh"));
  const log = [
    user("task one"),
    assistant("doing task one"),
    toolResult("done"),
    handoff,
  ];
  const outcome = truncateForHandoff(log, new Set(["id-1"]));
  assert.ok(outcome, "expected truncation result");
  assert.equal(outcome.matchedId, "id-1");
  assert.equal(outcome.messages.length, 1);
  const [first] = outcome.messages;
  assert.equal(first.role, "user");
  const parts = first.content as Array<{
    type: string;
    text: string;
  }>;
  assert.equal(parts.length, 1);
  assert.equal(parts[0].text, "task two: do this fresh");
});
test("truncateForHandoff: keeps subsequent messages after the handoff verbatim", () => {
  // followUp normally appends the handoff at the end of the log, so trailing
  // messages are unlikely in practice; the invariant is still worth pinning.
  const log = [
    user("old task"),
    user(buildHandoffMessage("id-1", "new task")),
    assistant("started"),
  ];
  const outcome = truncateForHandoff(log, new Set(["id-1"]));
  assert.ok(outcome);
  assert.equal(outcome.messages.length, 2);
  const headParts = outcome.messages[0].content as Array<{ text: string }>;
  assert.equal(headParts[0].text, "new task");
  assert.equal(outcome.messages[1].role, "assistant");
});
test("truncateForHandoff: does NOT mutate the pendingIds set (caller's job)", () => {
  const pendingIds = new Set(["id-1"]);
  const log = [user(buildHandoffMessage("id-1", "task"))];
  truncateForHandoff(log, pendingIds);
  assert.equal(pendingIds.has("id-1"), true, "set should still contain id");
});
test("truncateForHandoff: does NOT mutate the input messages array", () => {
  const log = [
    user("first"),
    user(buildHandoffMessage("id-1", "second")),
  ];
  // Snapshot the serialized form, run the function, then compare.
  const snapshot = JSON.stringify(log);
  truncateForHandoff(log, new Set(["id-1"]));
  const afterCall = JSON.stringify(log);
  assert.equal(afterCall, snapshot, "input messages should be untouched");
});
test("truncateForHandoff: preserves non-text content alongside text in the handoff message", () => {
  // The handoff text part is followed by an image part in the same message.
  const textPart = {
    type: "text",
    text: buildHandoffMessage("id-1", "next task"),
  };
  const imagePart = { type: "image", data: "abc" };
  const log = [{ role: "user", content: [textPart, imagePart] }];
  const outcome = truncateForHandoff(log, new Set(["id-1"]));
  assert.ok(outcome);
  const parts = outcome.messages[0].content as Array<{
    type: string;
    text?: string;
    data?: string;
  }>;
  assert.equal(parts.length, 2);
  assert.equal(parts[0].text, "next task");
  assert.equal(parts[1].type, "image");
  assert.equal(parts[1].data, "abc");
});
test("truncateForHandoff: picks the LATEST handoff when multiple are pending", () => {
  const log = [
    user(buildHandoffMessage("id-1", "first task")),
    user(buildHandoffMessage("id-2", "second task")),
  ];
  const bothPending = new Set(["id-1", "id-2"]);
  const outcome = truncateForHandoff(log, bothPending);
  assert.ok(outcome);
  assert.equal(outcome.matchedId, "id-2");
  assert.equal(outcome.messages.length, 1);
  const headParts = outcome.messages[0].content as Array<{ text: string }>;
  assert.equal(headParts[0].text, "second task");
});
test("truncateForHandoff: keeps matching across turns while the id stays pending", () => {
  // Regression guard: a match must not remove the id. While the id remains in
  // the pending set, each call slices to the sentinel again, which keeps the
  // context pinned to the carry-over prompt for the whole task rather than a
  // single turn.
  const pendingIds = new Set<string>();
  const log = [user("task 1"), assistant("response")];
  // Nothing is pending yet, so nothing is truncated.
  assert.equal(truncateForHandoff(log, pendingIds), undefined);
  // A handoff fires and its followUp message lands in the log.
  pendingIds.add("id-1");
  log.push(user(buildHandoffMessage("id-1", "task 2")));
  // Each later turn truncates to that same sentinel.
  for (const turn of [0, 1, 2]) {
    const outcome = truncateForHandoff(log, pendingIds);
    assert.ok(outcome, `turn ${turn}: expected truncation`);
    assert.equal(outcome.matchedId, "id-1");
    const head = outcome.messages[0];
    assert.equal(head.role, "user");
    const headParts = head.content as Array<{ text: string }>;
    assert.equal(headParts[0].text, "task 2");
    log.push(assistant(`step ${turn}`));
  }
});
// ── resolveHandoffContext (the per-turn decision used by index.ts) ──────────
test("resolveHandoffContext: returns undefined when no handoff is active", () => {
  // A sentinel is present in the log, but no active id is supplied.
  const log = [user(buildHandoffMessage("id-1", "task")), assistant("x")];
  const resolved = resolveHandoffContext(log, null);
  assert.equal(resolved, undefined);
});
test("resolveHandoffContext: slices to the active handoff and strips the sentinel", () => {
  const handoff = user(buildHandoffMessage("id-1", "new task"));
  const log = [user("old task"), assistant("did it"), handoff];
  const resolved = resolveHandoffContext(log, "id-1");
  assert.ok(resolved);
  assert.equal(resolved.length, 1);
  const headParts = resolved[0].content as Array<{ text: string }>;
  assert.equal(headParts[0].text, "new task");
});
test("resolveHandoffContext: returns undefined when the active sentinel is not in the log", () => {
  // The handoff is staged but its followUp has not drained (or was aborted),
  // so no message carries the sentinel and the context is left as it is.
  const log = [user("ordinary"), assistant("reply")];
  const resolved = resolveHandoffContext(log, "id-1");
  assert.equal(resolved, undefined);
});
test("resolveHandoffContext: PERSISTS truncation across every turn (regression)", () => {
  // The original bug consumed the handoff id after one turn, so the second
  // turn sent the full pre-handoff history again (context ballooned across a
  // task chain instead of resetting). Here we simulate several turns of a task
  // growing after a handoff and assert the context stays pinned to the
  // carry-over prompt on EVERY turn.
  const history = [user("task 11"), assistant("done 11"), toolResult("ok")];
  // Every text that existed before the handoff. Checking all of them (not
  // just the first user prompt) also catches a leak of the assistant reply or
  // the tool result.
  const staleTexts = new Set(
    history.flatMap((m) => m.content.map((part) => part.text)),
  );
  const messages = [
    ...history,
    user(buildHandoffMessage("id-1", "task 12")),
  ];
  const activeId = "id-1";
  for (let turn = 0; turn < 5; turn++) {
    const result = resolveHandoffContext(messages, activeId);
    assert.ok(result, `turn ${turn}: expected truncation`);
    // First visible message is always the (stripped) carry-over prompt.
    // Exact equality gives a readable diff on failure and also catches a
    // sentinel that was only partially stripped.
    assert.equal(result[0].role, "user");
    assert.equal(
      (result[0].content as Array<{ text: string }>)[0].text,
      "task 12",
      `turn ${turn}: context not pinned to carry-over prompt`,
    );
    // Stale pre-handoff history must never leak back in.
    const leaked = result.some((m) =>
      (m.content as Array<{ text?: string }>).some(
        (c) => c.text !== undefined && staleTexts.has(c.text),
      ),
    );
    assert.equal(
      leaked,
      false,
      `turn ${turn}: stale pre-handoff history leaked back in`,
    );
    // The task does more work; the log grows.
    messages.push(assistant(`work step ${turn}`));
    messages.push(toolResult(`result ${turn}`));
  }
});
test("resolveHandoffContext: a later handoff supersedes the earlier one", () => {
  const earlier = user(buildHandoffMessage("id-1", "first task"));
  const later = user(buildHandoffMessage("id-2", "second task"));
  const log = [earlier, assistant("worked on first"), toolResult("done"), later];
  // Per the original note, index.ts overwrites activeHandoffId with the newest
  // id on each handoff, so only "id-2" is passed as the active id here.
  const resolved = resolveHandoffContext(log, "id-2");
  assert.ok(resolved);
  assert.equal(resolved.length, 1);
  const headParts = resolved[0].content as Array<{ text: string }>;
  assert.equal(headParts[0].text, "second task");
});