fix session-handoff truncation persisting only one turn

The context handler consumed the handoff marker after the first turn (and agent_end cleared all markers), so every subsequent turn fell through to the fast path and re-sent the full, ever-growing history. Across a task chain the context grew monotonically instead of resetting per task, defeating the handoff and eventually overrunning the model's context window. Track a single activeHandoffId that is never consumed and is overwritten when a later handoff supersedes it, so truncation re-applies on every turn and the context stays pinned to the carry-over prompt for the whole task. Add a pure, testable resolveHandoffContext helper and a multi-turn regression test. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-28 19:26:55 +02:00
parent f7af660727
commit 853cef84af
3 changed files with 193 additions and 70 deletions
@@ -25,18 +25,21 @@
 * Flow
 * ────
 *   1. LLM calls `session_handoff({ prompt: "..." })`.
- *   2. Tool generates a unique sentinel ID, adds it to `pendingMarkers`.
+ *   2. Tool generates a unique sentinel ID and records it as `activeHandoffId`.
 *   3. Tool queues `<pi-handoff-resume:ID>\n<prompt>` as a followUp user
 *      message. Returns `terminate: true` to stop the current agent loop.
 *   4. Agent loop ends. followUp drains, the message is appended.
 *   5. New agent turn begins, LLM call requested.
- *   6. `context` event fires. Our handler finds the sentinel, slices the
- *      message list to start with that message, strips the sentinel from its
- *      text, removes the ID from `pendingMarkers` so the sentinel won't
- *      re-trigger on subsequent turns.
- *   7. LLM receives a single-user-message context: just the carry-over
- *      prompt. System prompt and tool definitions are unaffected (they're
- *      separate from `messages` in the LLM payload).
+ *   6. `context` event fires. Our handler finds the active sentinel, slices the
+ *      message list to start with that message, and strips the sentinel from
+ *      its text. This runs on EVERY subsequent turn — the active id is NOT
+ *      consumed — so the context stays pinned to the carry-over prompt for the
+ *      whole task, not just the first turn. A later handoff overwrites
+ *      `activeHandoffId` and wins, since its sentinel sits further down.
+ *   7. LLM receives a context that starts at the carry-over prompt — just that
+ *      prompt on the first turn, plus the new task's own work on later turns.
+ *      System prompt and tool definitions are unaffected (they're separate from
+ *      `messages` in the LLM payload).
 *
 * Known caveats
 * ─────────────
@@ -65,37 +68,32 @@ import {
 	buildHandoffMessage,
 	normalizePrompt,
 	normalizeReason,
-	truncateForHandoff,
+	resolveHandoffContext,
 } from "./state.js";

 export default function (pi: ExtensionAPI) {
-	// Sentinel IDs awaiting consumption. The tool adds an ID per handoff; the
-	// context handler removes it once the corresponding sentinel has been
-	// detected in the message list and truncation has been applied. Closure-
-	// scoped — survives across turns within a single extension load, resets
-	// on /reload or session restart (correctly making historical sentinels in
-	// the on-disk log inert).
-	const pendingMarkers = new Set<string>();
+	// ID of the handoff currently anchoring the context, or null when none is
+	// active. Set when the tool fires; overwritten when a later handoff
+	// supersedes it. Closure-scoped — survives across turns within a single
+	// extension load, resets to null on /reload or session restart (correctly
+	// making historical sentinels in a resumed on-disk log inert: no active id
+	// ⇒ no truncation until a NEW handoff fires).
+	let activeHandoffId: string | null = null;

 	pi.on("context", async (event, _ctx) => {
-		// Fast path for normal turns (no pending handoff).
-		if (pendingMarkers.size === 0) return undefined;
+		// Fast path for normal turns (no active handoff).
+		if (activeHandoffId === null) return undefined;

-		const result = truncateForHandoff(event.messages, pendingMarkers);
-		if (!result) return undefined;
+		// Re-applied on EVERY turn, not just the first one after the handoff.
+		// The sentinel stays in the on-disk log (our return only rewrites what
+		// the LLM sees, never the stored messages), so slicing to it each turn
+		// keeps the context window pinned to the carry-over prompt for the whole
+		// task. Returns undefined until the sentinel actually appears (followUp
+		// not yet drained), leaving the context untouched.
+		const messages = resolveHandoffContext(event.messages, activeHandoffId);
+		if (!messages) return undefined;

-		pendingMarkers.delete(result.consumedId);
-		return { messages: result.messages };
-	});
-
-	// Defensive cleanup: drop any pending marker IDs at the end of each agent
-	// invocation. On the normal handoff path the set is already empty by this
-	// point (consumed by the context handler). On an aborted invocation the
-	// followUp message stays queued — clearing here ensures the orphaned
-	// sentinel is INERT when it eventually drains, instead of resurrecting the
-	// old handoff on top of whatever the user types next.
-	pi.on("agent_end", () => {
-		pendingMarkers.clear();
+		return { messages };
 	});

 	pi.registerTool({
@@ -143,7 +141,7 @@ export default function (pi: ExtensionAPI) {
 			const reason = normalizeReason(params.reason);

 			const id = randomUUID();
-			pendingMarkers.add(id);
+			activeHandoffId = id;

 			// Notify other extensions (logging, metrics, etc.) before we
 			// queue the actual handoff message.
@@ -32,19 +32,22 @@
 * log. The trailing newline is consumed by the regex so the stripped prompt
 * starts cleanly.
 *
- * Multi-handoff safety
- * ────────────────────
- * Sentinels persist in the underlying session log even after we consume them,
- * so we can't rely on "is the sentinel present?" alone — on the LLM call
- * AFTER the handoff turn, the sentinel is still in `context.messages` and a
- * naive check would re-truncate, destroying the model's response to the
- * carry-over prompt. We track pending sentinel IDs in a Set in the caller's
- * closure: a sentinel only matches if its ID is still pending. After the
- * truncation fires, the caller removes the ID from the set, making the
- * sentinel inert for all future context calls.
+ * Persistence across turns
+ * ────────────────────────
+ * Truncation must re-apply on EVERY turn after a handoff, not just the first.
+ * The sentinel stays in the underlying session log — rewriting the LLM's view
+ * via the `context` event never writes back to the stored messages — so each
+ * turn we slice to the active handoff's sentinel again, keeping the context
+ * pinned to the carry-over prompt for the whole task. `slice(index)` keeps the
+ * sentinel message AND everything after it, so re-slicing never discards the
+ * model's work on the new task; it only keeps trimming the stale pre-handoff
+ * history.
 *
- * This also makes resumed sessions safe — pending IDs start empty after
- * extension reload, so all historical sentinels in the log are inert.
+ * The active handoff is tracked as a single id in the caller's closure (see
+ * index.ts::activeHandoffId). A later handoff overwrites it and wins, since its
+ * sentinel sits further down the message list. The id resets to null on
+ * extension reload, so a resumed session's historical sentinels are inert until
+ * a NEW handoff fires.
 */

 export interface PendingHandoff {
@@ -157,16 +160,17 @@ export function findPendingHandoff(

 /**
 * Slice the message list to start at the handoff message and strip the
- * sentinel from its text. Returns the new list and the consumed ID, or
- * undefined if no pending sentinel is present.
+ * sentinel from its text. Returns the new list and the matched ID, or
+ * undefined if no matching sentinel is present.
 *
- * Pure: does NOT mutate `pendingIds`. The caller is responsible for removing
- * the consumed ID from the set after a successful truncation.
+ * Pure: does NOT mutate `pendingIds`. Callers must NOT remove the matched id
+ * afterwards — truncation is meant to persist across turns (see
+ * resolveHandoffContext).
 */
 export function truncateForHandoff<M extends MessageLike>(
 	messages: ReadonlyArray<M>,
 	pendingIds: ReadonlySet<string>,
-): { messages: M[]; consumedId: string } | undefined {
+): { messages: M[]; matchedId: string } | undefined {
 	const match = findPendingHandoff(messages, pendingIds);
 	if (!match) return undefined;

@@ -184,6 +188,34 @@ export function truncateForHandoff<M extends MessageLike>(

 	return {
 		messages: [cleanedFirst, ...tail.slice(1)],
-		consumedId: match.id,
+		matchedId: match.id,
 	};
 }
+
+/**
+ * Resolve the LLM-visible message list for one `context` event, given the id of
+ * the currently active handoff (or null when none is active).
+ *
+ * Returns the truncated list — sliced to the active handoff's sentinel, with
+ * the sentinel stripped — or undefined to leave the context untouched.
+ *
+ * STABLE across turns: as long as `activeId` stays set and its sentinel is in
+ * the message log, every call returns the same truncation. That persistence is
+ * the whole point — it keeps the post-handoff context pinned to the carry-over
+ * prompt for the entire task, not just the first turn after the handoff. (The
+ * earlier design consumed the id after one turn, which let the full pre-handoff
+ * history snap back on the second turn — see the regression test in
+ * tests/session-handoff.test.ts.)
+ *
+ * Returns undefined when the active sentinel is not yet a message (its followUp
+ * hasn't drained, or the handoff was aborted before draining), leaving the
+ * context untouched until the sentinel actually appears.
+ */
+export function resolveHandoffContext<M extends MessageLike>(
+	messages: ReadonlyArray<M>,
+	activeId: string | null,
+): M[] | undefined {
+	if (!activeId) return undefined;
+	const result = truncateForHandoff(messages, new Set([activeId]));
+	return result?.messages;
+}
@@ -13,6 +13,7 @@ import {
 	findPendingHandoff,
 	normalizePrompt,
 	normalizeReason,
+	resolveHandoffContext,
 	SENTINEL_RE,
 	stripSentinel,
 	truncateForHandoff,
@@ -184,9 +185,9 @@ test("findPendingHandoff: finds a matching sentinel", () => {
 });

 test("findPendingHandoff: ignores sentinel whose id is not pending", () => {
-	// Past handoff sentinels persist in the session log but are inert once
-	// their id has been consumed (removed from the pending set).
-	const messages = [user(buildHandoffMessage("consumed", "old task"))];
+	// Past handoff sentinels persist in the session log but are inert when
+	// their id is not the active one (e.g. a handoff from before a reload).
+	const messages = [user(buildHandoffMessage("inactive", "old task"))];
 	assert.equal(findPendingHandoff(messages, new Set(["different"])), undefined);
 });

@@ -247,7 +248,7 @@ test("truncateForHandoff: slices to the handoff message and strips the sentinel"
 	];
 	const result = truncateForHandoff(messages, new Set(["id-1"]));
 	assert.ok(result, "expected truncation result");
-	assert.equal(result.consumedId, "id-1");
+	assert.equal(result.matchedId, "id-1");
 	assert.equal(result.messages.length, 1);
 	assert.equal(result.messages[0].role, "user");
 	const content = result.messages[0].content as Array<{
@@ -329,7 +330,7 @@ test("truncateForHandoff: picks the LATEST handoff when multiple are pending", (
 	];
 	const result = truncateForHandoff(messages, new Set(["id-1", "id-2"]));
 	assert.ok(result);
-	assert.equal(result.consumedId, "id-2");
+	assert.equal(result.matchedId, "id-2");
 	assert.equal(result.messages.length, 1);
 	assert.equal(
 		(result.messages[0].content as Array<{ text: string }>)[0].text,
@@ -337,26 +338,118 @@ test("truncateForHandoff: picks the LATEST handoff when multiple are pending", (
 	);
 });

-test("truncateForHandoff: full lifecycle — pending → consume → inert", () => {
+test("truncateForHandoff: keeps matching across turns while the id stays pending", () => {
+	// Regression guard: the id must NOT be removed after the first match. As
+	// long as it stays in the pending set, every call keeps slicing to the
+	// sentinel — that is what pins the context to the carry-over prompt for the
+	// whole task instead of just one turn.
 	const pending = new Set<string>();
 	const messages = [user("task 1"), assistant("response")];

-	// First context call: no pending markers, no truncation
+	// No pending markers: no truncation.
 	assert.equal(truncateForHandoff(messages, pending), undefined);

-	// Tool fires, prompt drains
+	// Handoff fires; its followUp drains into the log.
 	pending.add("id-1");
 	messages.push(user(buildHandoffMessage("id-1", "task 2")));

-	// Next context call: truncation fires
-	const first = truncateForHandoff(messages, pending);
-	assert.ok(first);
-	assert.equal(first.consumedId, "id-1");
-	pending.delete(first.consumedId);
-
-	// Subsequent context calls (within task 2): the sentinel is still in the
-	// session log (we never write back), but the id is no longer pending —
-	// no further truncation.
-	messages.push(assistant("starting task 2"));
-	assert.equal(truncateForHandoff(messages, pending), undefined);
+	// Every subsequent turn keeps truncating to the same sentinel.
+	for (let turn = 0; turn < 3; turn++) {
+		const result = truncateForHandoff(messages, pending);
+		assert.ok(result, `turn ${turn}: expected truncation`);
+		assert.equal(result.matchedId, "id-1");
+		assert.equal(result.messages[0].role, "user");
+		assert.equal(
+			(result.messages[0].content as Array<{ text: string }>)[0].text,
+			"task 2",
+		);
+		messages.push(assistant(`step ${turn}`));
+	}
+});
+
+// ── resolveHandoffContext (the per-turn decision used by index.ts) ──────────
+
+test("resolveHandoffContext: returns undefined when no handoff is active", () => {
+	const messages = [user(buildHandoffMessage("id-1", "task")), assistant("x")];
+	assert.equal(resolveHandoffContext(messages, null), undefined);
+});
+
+test("resolveHandoffContext: slices to the active handoff and strips the sentinel", () => {
+	const messages = [
+		user("old task"),
+		assistant("did it"),
+		user(buildHandoffMessage("id-1", "new task")),
+	];
+	const result = resolveHandoffContext(messages, "id-1");
+	assert.ok(result);
+	assert.equal(result.length, 1);
+	assert.equal(
+		(result[0].content as Array<{ text: string }>)[0].text,
+		"new task",
+	);
+});
+
+test("resolveHandoffContext: returns undefined when the active sentinel is not in the log", () => {
+	// Handoff staged but its followUp has not drained yet (or was aborted):
+	// the sentinel isn't a message, so the context is left untouched.
+	const messages = [user("ordinary"), assistant("reply")];
+	assert.equal(resolveHandoffContext(messages, "id-1"), undefined);
+});
+
+test("resolveHandoffContext: PERSISTS truncation across every turn (regression)", () => {
+	// The original bug consumed the handoff id after one turn, so the second
+	// turn sent the full pre-handoff history again (context ballooned across a
+	// task chain instead of resetting). Here we simulate several turns of a task
+	// growing after a handoff and assert the context stays pinned to the
+	// carry-over prompt on EVERY turn.
+	const history = [user("task 11"), assistant("done 11"), toolResult("ok")];
+	const messages = [
+		...history,
+		user(buildHandoffMessage("id-1", "task 12")),
+	];
+	const activeId = "id-1";
+
+	for (let turn = 0; turn < 5; turn++) {
+		const result = resolveHandoffContext(messages, activeId);
+		assert.ok(result, `turn ${turn}: expected truncation`);
+		// First visible message is always the (stripped) carry-over prompt.
+		assert.equal(result[0].role, "user");
+		assert.equal(
+			(result[0].content as Array<{ text: string }>)[0].text.startsWith(
+				"task 12",
+			),
+			true,
+			`turn ${turn}: context not pinned to carry-over prompt`,
+		);
+		// Stale pre-handoff history must never leak back in.
+		assert.equal(
+			result.some((m) =>
+				(m.content as Array<{ text?: string }>).some(
+					(c) => c.text === "task 11",
+				),
+			),
+			false,
+			`turn ${turn}: stale pre-handoff history leaked back in`,
+		);
+		// The task does more work; the log grows.
+		messages.push(assistant(`work step ${turn}`));
+		messages.push(toolResult(`result ${turn}`));
+	}
+});
+
+test("resolveHandoffContext: a later handoff supersedes the earlier one", () => {
+	const messages = [
+		user(buildHandoffMessage("id-1", "first task")),
+		assistant("worked on first"),
+		toolResult("done"),
+		user(buildHandoffMessage("id-2", "second task")),
+	];
+	// index.ts overwrites activeHandoffId with the newest id on each handoff.
+	const result = resolveHandoffContext(messages, "id-2");
+	assert.ok(result);
+	assert.equal(result.length, 1);
+	assert.equal(
+		(result[0].content as Array<{ text: string }>)[0].text,
+		"second task",
+	);
 });