fix session-handoff truncation persisting only one turn

The context handler consumed the handoff marker after the first turn (and agent_end cleared all markers), so every subsequent turn fell through to the fast path and re-sent the full, ever-growing history. Across a task chain the context grew monotonically instead of resetting per task, defeating the handoff and eventually overrunning the model's context window. Track a single activeHandoffId that is never consumed and is overwritten when a later handoff supersedes it, so truncation re-applies on every turn and the context stays pinned to the carry-over prompt for the whole task. Add a pure, testable resolveHandoffContext helper and a multi-turn regression test. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-28 19:26:55 +02:00
parent f7af660727
commit 853cef84af
3 changed files with 193 additions and 70 deletions
@@ -25,18 +25,21 @@
 * Flow
 * ────
 *   1. LLM calls `session_handoff({ prompt: "..." })`.
- *   2. Tool generates a unique sentinel ID, adds it to `pendingMarkers`.
+ *   2. Tool generates a unique sentinel ID and records it as `activeHandoffId`.
 *   3. Tool queues `<pi-handoff-resume:ID>\n<prompt>` as a followUp user
 *      message. Returns `terminate: true` to stop the current agent loop.
 *   4. Agent loop ends. followUp drains, the message is appended.
 *   5. New agent turn begins, LLM call requested.
- *   6. `context` event fires. Our handler finds the sentinel, slices the
+ *   6. `context` event fires. Our handler finds the active sentinel, slices the
- *      message list to start with that message, strips the sentinel from its
+ *      message list to start with that message, and strips the sentinel from
- *      text, removes the ID from `pendingMarkers` so the sentinel won't
+ *      its text. This runs on EVERY subsequent turn — the active id is NOT
- *      re-trigger on subsequent turns.
+ *      consumed — so the context stays pinned to the carry-over prompt for the
- *   7. LLM receives a single-user-message context: just the carry-over
+ *      whole task, not just the first turn. A later handoff overwrites
- *      prompt. System prompt and tool definitions are unaffected (they're
+ *      `activeHandoffId` and wins, since its sentinel sits further down.
- *      separate from `messages` in the LLM payload).
+ *   7. LLM receives a context that starts at the carry-over prompt — just that
 *      prompt on the first turn, plus the new task's own work on later turns.
 *      System prompt and tool definitions are unaffected (they're separate from
 *      `messages` in the LLM payload).
 *
 * Known caveats
 * ─────────────
@@ -65,37 +68,32 @@ import {
 	buildHandoffMessage,
 	normalizePrompt,
 	normalizeReason,
-	truncateForHandoff,
+	resolveHandoffContext,
 } from "./state.js";
 export default function (pi: ExtensionAPI) {
-	// Sentinel IDs awaiting consumption. The tool adds an ID per handoff; the
+	// ID of the handoff currently anchoring the context, or null when none is
-	// context handler removes it once the corresponding sentinel has been
+	// active. Set when the tool fires; overwritten when a later handoff
-	// detected in the message list and truncation has been applied. Closure-
+	// supersedes it. Closure-scoped — survives across turns within a single
-	// scoped — survives across turns within a single extension load, resets
+	// extension load, resets to null on /reload or session restart (correctly
-	// on /reload or session restart (correctly making historical sentinels in
+	// making historical sentinels in a resumed on-disk log inert: no active id
-	// the on-disk log inert).
+	// ⇒ no truncation until a NEW handoff fires).
-	const pendingMarkers = new Set<string>();
+	let activeHandoffId: string | null = null;
 	pi.on("context", async (event, _ctx) => {
-		// Fast path for normal turns (no pending handoff).
+		// Fast path for normal turns (no active handoff).
-		if (pendingMarkers.size === 0) return undefined;
+		if (activeHandoffId === null) return undefined;
-		const result = truncateForHandoff(event.messages, pendingMarkers);
+		// Re-applied on EVERY turn, not just the first one after the handoff.
-		if (!result) return undefined;
+		// The sentinel stays in the on-disk log (our return only rewrites what
 		// the LLM sees, never the stored messages), so slicing to it each turn
 		// keeps the context window pinned to the carry-over prompt for the whole
 		// task. Returns undefined until the sentinel actually appears (followUp
 		// not yet drained), leaving the context untouched.
 		const messages = resolveHandoffContext(event.messages, activeHandoffId);
 		if (!messages) return undefined;
-		pendingMarkers.delete(result.consumedId);
+		return { messages };
 		return { messages: result.messages };
 	});
 	// Defensive cleanup: drop any pending marker IDs at the end of each agent
 	// invocation. On the normal handoff path the set is already empty by this
 	// point (consumed by the context handler). On an aborted invocation the
 	// followUp message stays queued — clearing here ensures the orphaned
 	// sentinel is INERT when it eventually drains, instead of resurrecting the
 	// old handoff on top of whatever the user types next.
 	pi.on("agent_end", () => {
 		pendingMarkers.clear();
 	});
 	pi.registerTool({
@@ -143,7 +141,7 @@ export default function (pi: ExtensionAPI) {
 			const reason = normalizeReason(params.reason);
 			const id = randomUUID();
-			pendingMarkers.add(id);
+			activeHandoffId = id;
 			// Notify other extensions (logging, metrics, etc.) before we
 			// queue the actual handoff message.
@@ -32,19 +32,22 @@
 * log. The trailing newline is consumed by the regex so the stripped prompt
 * starts cleanly.
 *
- * Multi-handoff safety
+ * Persistence across turns
- * ────────────────────
+ * ────────────────────────
- * Sentinels persist in the underlying session log even after we consume them,
+ * Truncation must re-apply on EVERY turn after a handoff, not just the first.
- * so we can't rely on "is the sentinel present?" alone — on the LLM call
+ * The sentinel stays in the underlying session log — rewriting the LLM's view
- * AFTER the handoff turn, the sentinel is still in `context.messages` and a
+ * via the `context` event never writes back to the stored messages — so each
- * naive check would re-truncate, destroying the model's response to the
+ * turn we slice to the active handoff's sentinel again, keeping the context
- * carry-over prompt. We track pending sentinel IDs in a Set in the caller's
+ * pinned to the carry-over prompt for the whole task. `slice(index)` keeps the
- * closure: a sentinel only matches if its ID is still pending. After the
+ * sentinel message AND everything after it, so re-slicing never discards the
- * truncation fires, the caller removes the ID from the set, making the
+ * model's work on the new task; it only keeps trimming the stale pre-handoff
- * sentinel inert for all future context calls.
+ * history.
 *
- * This also makes resumed sessions safe — pending IDs start empty after
+ * The active handoff is tracked as a single id in the caller's closure (see
- * extension reload, so all historical sentinels in the log are inert.
+ * index.ts::activeHandoffId). A later handoff overwrites it and wins, since its
 * sentinel sits further down the message list. The id resets to null on
 * extension reload, so a resumed session's historical sentinels are inert until
 * a NEW handoff fires.
 */
 export interface PendingHandoff {
@@ -157,16 +160,17 @@ export function findPendingHandoff(
 /**
 * Slice the message list to start at the handoff message and strip the
- * sentinel from its text. Returns the new list and the consumed ID, or
+ * sentinel from its text. Returns the new list and the matched ID, or
- * undefined if no pending sentinel is present.
+ * undefined if no matching sentinel is present.
 *
- * Pure: does NOT mutate `pendingIds`. The caller is responsible for removing
+ * Pure: does NOT mutate `pendingIds`. Callers must NOT remove the matched id
- * the consumed ID from the set after a successful truncation.
+ * afterwards — truncation is meant to persist across turns (see
 * resolveHandoffContext).
 */
 export function truncateForHandoff<M extends MessageLike>(
 	messages: ReadonlyArray<M>,
 	pendingIds: ReadonlySet<string>,
-): { messages: M[]; consumedId: string } | undefined {
+): { messages: M[]; matchedId: string } | undefined {
 	const match = findPendingHandoff(messages, pendingIds);
 	if (!match) return undefined;
@@ -184,6 +188,34 @@ export function truncateForHandoff<M extends MessageLike>(
 	return {
 		messages: [cleanedFirst, ...tail.slice(1)],
-		consumedId: match.id,
+		matchedId: match.id,
 	};
 }
 /**
 * Compute the message list the LLM should see for a single `context` event.
 *
 * @param messages - The full message list supplied with the event.
 * @param activeId - Id of the handoff currently anchoring the context, or
 *   null when no handoff is active. Any falsy value (null, empty string) is
 *   treated as "no active handoff".
 * @returns The `messages` field of the `truncateForHandoff` result when the
 *   active id's sentinel is matched; otherwise undefined, meaning "leave the
 *   context untouched".
 *
 * The id is deliberately NOT consumed here. While `activeId` stays set and
 * its sentinel remains in the message log, every call truncates at the same
 * sentinel, so the pre-handoff history stays hidden for the whole task and
 * not only on the first turn after the handoff.
 *
 * When the sentinel has not appeared as a message (for example its followUp
 * has not drained yet), `truncateForHandoff` reports no match and this
 * function returns undefined.
 */
 export function resolveHandoffContext<M extends MessageLike>(
 	messages: ReadonlyArray<M>,
 	activeId: string | null,
 ): M[] | undefined {
 	if (activeId) {
 		// truncateForHandoff matches against a set of ids; wrap the single
 		// active id so that only its sentinel can match.
 		const truncation = truncateForHandoff(messages, new Set([activeId]));
 		if (truncation) return truncation.messages;
 	}
 	return undefined;
 }
@@ -13,6 +13,7 @@ import {
 	findPendingHandoff,
 	normalizePrompt,
 	normalizeReason,
 	resolveHandoffContext,
 	SENTINEL_RE,
 	stripSentinel,
 	truncateForHandoff,
@@ -184,9 +185,9 @@ test("findPendingHandoff: finds a matching sentinel", () => {
 });
 test("findPendingHandoff: ignores sentinel whose id is not pending", () => {
-	// Past handoff sentinels persist in the session log but are inert once
+	// Past handoff sentinels persist in the session log but are inert when
-	// their id has been consumed (removed from the pending set).
+	// their id is not the active one (e.g. a handoff from before a reload).
-	const messages = [user(buildHandoffMessage("consumed", "old task"))];
+	const messages = [user(buildHandoffMessage("inactive", "old task"))];
 	assert.equal(findPendingHandoff(messages, new Set(["different"])), undefined);
 });
@@ -247,7 +248,7 @@ test("truncateForHandoff: slices to the handoff message and strips the sentinel"
 	];
 	const result = truncateForHandoff(messages, new Set(["id-1"]));
 	assert.ok(result, "expected truncation result");
-	assert.equal(result.consumedId, "id-1");
+	assert.equal(result.matchedId, "id-1");
 	assert.equal(result.messages.length, 1);
 	assert.equal(result.messages[0].role, "user");
 	const content = result.messages[0].content as Array<{
@@ -329,7 +330,7 @@ test("truncateForHandoff: picks the LATEST handoff when multiple are pending", (
 	];
 	const result = truncateForHandoff(messages, new Set(["id-1", "id-2"]));
 	assert.ok(result);
-	assert.equal(result.consumedId, "id-2");
+	assert.equal(result.matchedId, "id-2");
 	assert.equal(result.messages.length, 1);
 	assert.equal(
 		(result.messages[0].content as Array<{ text: string }>)[0].text,
@@ -337,26 +338,118 @@ test("truncateForHandoff: picks the LATEST handoff when multiple are pending", (
 	);
 });
-test("truncateForHandoff: full lifecycle — pending → consume → inert", () => {
+test("truncateForHandoff: keeps matching across turns while the id stays pending", () => {
 	// Regression guard: the id must NOT be removed after the first match. As
 	// long as it stays in the pending set, every call keeps slicing to the
 	// sentinel — that is what pins the context to the carry-over prompt for the
 	// whole task instead of just one turn.
 	const pending = new Set<string>();
 	const messages = [user("task 1"), assistant("response")];
-	// First context call: no pending markers, no truncation
+	// No pending markers: no truncation.
 	assert.equal(truncateForHandoff(messages, pending), undefined);
-	// Tool fires, prompt drains
+	// Handoff fires; its followUp drains into the log.
 	pending.add("id-1");
 	messages.push(user(buildHandoffMessage("id-1", "task 2")));
-	// Next context call: truncation fires
+	// Every subsequent turn keeps truncating to the same sentinel.
-	const first = truncateForHandoff(messages, pending);
+	for (let turn = 0; turn < 3; turn++) {
-	assert.ok(first);
+		const result = truncateForHandoff(messages, pending);
-	assert.equal(first.consumedId, "id-1");
+		assert.ok(result, `turn ${turn}: expected truncation`);
-	pending.delete(first.consumedId);
+		assert.equal(result.matchedId, "id-1");
-
+		assert.equal(result.messages[0].role, "user");
-	// Subsequent context calls (within task 2): the sentinel is still in the
+		assert.equal(
-	// session log (we never write back), but the id is no longer pending —
+			(result.messages[0].content as Array<{ text: string }>)[0].text,
-	// no further truncation.
+			"task 2",
-	messages.push(assistant("starting task 2"));
+		);
-	assert.equal(truncateForHandoff(messages, pending), undefined);
+		messages.push(assistant(`step ${turn}`));
 	}
 });
 // ── resolveHandoffContext (the per-turn decision used by index.ts) ──────────
 test("resolveHandoffContext: returns undefined when no handoff is active", () => {
 	// A sentinel is present in the log, but with a null active id the helper
 	// must leave the context alone.
 	const log = [user(buildHandoffMessage("id-1", "task")), assistant("x")];
 	const visible = resolveHandoffContext(log, null);
 	assert.equal(visible, undefined);
 });
 test("resolveHandoffContext: slices to the active handoff and strips the sentinel", () => {
 	// Two pre-handoff messages followed by the handoff message itself.
 	const log = [
 		user("old task"),
 		assistant("did it"),
 		user(buildHandoffMessage("id-1", "new task")),
 	];
 	const visible = resolveHandoffContext(log, "id-1");
 	assert.ok(visible);
 	// Only the handoff message survives, and its text no longer carries the
 	// sentinel prefix.
 	assert.equal(visible.length, 1);
 	const parts = visible[0].content as Array<{ text: string }>;
 	assert.equal(parts[0].text, "new task");
 });
 test("resolveHandoffContext: returns undefined when the active sentinel is not in the log", () => {
 	// An id is active but no message carries its sentinel (followUp not yet
 	// drained, or the handoff was aborted), so nothing may be truncated.
 	const log = [user("ordinary"), assistant("reply")];
 	const visible = resolveHandoffContext(log, "id-1");
 	assert.equal(visible, undefined);
 });
 test("resolveHandoffContext: PERSISTS truncation across every turn (regression)", () => {
 	// The original bug consumed the handoff id after one turn, so the second
 	// turn sent the full pre-handoff history again (context ballooned across a
 	// task chain instead of resetting). Here we simulate several turns of a task
 	// growing after a handoff and assert the context stays pinned to the
 	// carry-over prompt on EVERY turn.
 	const history = [user("task 11"), assistant("done 11"), toolResult("ok")];
 	const messages = [
 		...history,
 		user(buildHandoffMessage("id-1", "task 12")),
 	];
 	const activeId = "id-1";
 	for (let turn = 0; turn < 5; turn++) {
 		const result = resolveHandoffContext(messages, activeId);
 		assert.ok(result, `turn ${turn}: expected truncation`);
 		// Exactly the pre-handoff history is dropped: the visible list is the
 		// handoff message plus everything appended after it. This catches a
 		// leak of ANY stale message, not only the one text probed below.
 		assert.equal(
 			result.length,
 			messages.length - history.length,
 			`turn ${turn}: unexpected number of visible messages`,
 		);
 		// First visible message is always the (stripped) carry-over prompt.
 		assert.equal(result[0].role, "user");
 		assert.equal(
 			(result[0].content as Array<{ text: string }>)[0].text.startsWith(
 				"task 12",
 			),
 			true,
 			`turn ${turn}: context not pinned to carry-over prompt`,
 		);
 		// Stale pre-handoff history must never leak back in.
 		assert.equal(
 			result.some((m) =>
 				(m.content as Array<{ text?: string }>).some(
 					(c) => c.text === "task 11",
 				),
 			),
 			false,
 			`turn ${turn}: stale pre-handoff history leaked back in`,
 		);
 		// The task does more work; the log grows.
 		messages.push(assistant(`work step ${turn}`));
 		messages.push(toolResult(`result ${turn}`));
 	}
 });
 test("resolveHandoffContext: a later handoff supersedes the earlier one", () => {
 	// Two handoffs in one log: the first with some work after it, then a
 	// second one at the very end.
 	const log = [
 		user(buildHandoffMessage("id-1", "first task")),
 		assistant("worked on first"),
 		toolResult("done"),
 		user(buildHandoffMessage("id-2", "second task")),
 	];
 	// The caller tracks only the newest id, so resolve against "id-2".
 	const visible = resolveHandoffContext(log, "id-2");
 	assert.ok(visible);
 	// Everything before the second sentinel, including the first handoff and
 	// its work, is cut away.
 	assert.equal(visible.length, 1);
 	const parts = visible[0].content as Array<{ text: string }>;
 	assert.equal(parts[0].text, "second task");
 });