From 853cef84af02434d17d4155ed810fd0cc3d90815 Mon Sep 17 00:00:00 2001 From: shahondin1624 Date: Thu, 28 May 2026 19:26:55 +0200 Subject: [PATCH] fix session-handoff truncation persisting only one turn The context handler consumed the handoff marker after the first turn (and agent_end cleared all markers), so every subsequent turn fell through to the fast path and re-sent the full, ever-growing history. Across a task chain the context grew monotonically instead of resetting per task, defeating the handoff and eventually overrunning the model's context window. Track a single activeHandoffId that is never consumed and is overwritten when a later handoff supersedes it, so truncation re-applies on every turn and the context stays pinned to the carry-over prompt for the whole task. Add a pure, testable resolveHandoffContext helper and a multi-turn regression test. Co-Authored-By: Claude Opus 4.7 (1M context) --- session-handoff/index.ts | 64 ++++++++--------- session-handoff/state.ts | 68 +++++++++++++----- tests/session-handoff.test.ts | 131 +++++++++++++++++++++++++++++----- 3 files changed, 193 insertions(+), 70 deletions(-) diff --git a/session-handoff/index.ts b/session-handoff/index.ts index 677a6b7..ca0dec4 100644 --- a/session-handoff/index.ts +++ b/session-handoff/index.ts @@ -25,18 +25,21 @@ * Flow * ──── * 1. LLM calls `session_handoff({ prompt: "..." })`. - * 2. Tool generates a unique sentinel ID, adds it to `pendingMarkers`. + * 2. Tool generates a unique sentinel ID and records it as `activeHandoffId`. * 3. Tool queues `\n` as a followUp user * message. Returns `terminate: true` to stop the current agent loop. * 4. Agent loop ends. followUp drains, the message is appended. * 5. New agent turn begins, LLM call requested. - * 6. `context` event fires. Our handler finds the sentinel, slices the - * message list to start with that message, strips the sentinel from its - * text, removes the ID from `pendingMarkers` so the sentinel won't - * re-trigger on subsequent turns. - * 7. LLM receives a single-user-message context: just the carry-over - * prompt. System prompt and tool definitions are unaffected (they're - * separate from `messages` in the LLM payload). + * 6. `context` event fires. Our handler finds the active sentinel, slices the + * message list to start with that message, and strips the sentinel from + * its text. This runs on EVERY subsequent turn — the active id is NOT + * consumed — so the context stays pinned to the carry-over prompt for the + * whole task, not just the first turn. A later handoff overwrites + * `activeHandoffId` and wins, since its sentinel sits further down. + * 7. LLM receives a context that starts at the carry-over prompt — just that + * prompt on the first turn, plus the new task's own work on later turns. + * System prompt and tool definitions are unaffected (they're separate from + * `messages` in the LLM payload). * * Known caveats * ───────────── @@ -65,37 +68,32 @@ import { buildHandoffMessage, normalizePrompt, normalizeReason, - truncateForHandoff, + resolveHandoffContext, } from "./state.js"; export default function (pi: ExtensionAPI) { - // Sentinel IDs awaiting consumption. The tool adds an ID per handoff; the - // context handler removes it once the corresponding sentinel has been - // detected in the message list and truncation has been applied. Closure- - // scoped — survives across turns within a single extension load, resets - // on /reload or session restart (correctly making historical sentinels in - // the on-disk log inert). - const pendingMarkers = new Set(); + // ID of the handoff currently anchoring the context, or null when none is + // active. Set when the tool fires; overwritten when a later handoff + // supersedes it. Closure-scoped — survives across turns within a single + // extension load, resets to null on /reload or session restart (correctly + // making historical sentinels in a resumed on-disk log inert: no active id + // ⇒ no truncation until a NEW handoff fires). + let activeHandoffId: string | null = null; pi.on("context", async (event, _ctx) => { - // Fast path for normal turns (no pending handoff). - if (pendingMarkers.size === 0) return undefined; + // Fast path for normal turns (no active handoff). + if (activeHandoffId === null) return undefined; - const result = truncateForHandoff(event.messages, pendingMarkers); - if (!result) return undefined; + // Re-applied on EVERY turn, not just the first one after the handoff. + // The sentinel stays in the on-disk log (our return only rewrites what + // the LLM sees, never the stored messages), so slicing to it each turn + // keeps the context window pinned to the carry-over prompt for the whole + // task. Returns undefined until the sentinel actually appears (followUp + // not yet drained), leaving the context untouched. + const messages = resolveHandoffContext(event.messages, activeHandoffId); + if (!messages) return undefined; - pendingMarkers.delete(result.consumedId); - return { messages: result.messages }; - }); - - // Defensive cleanup: drop any pending marker IDs at the end of each agent - // invocation. On the normal handoff path the set is already empty by this - // point (consumed by the context handler). On an aborted invocation the - // followUp message stays queued — clearing here ensures the orphaned - // sentinel is INERT when it eventually drains, instead of resurrecting the - // old handoff on top of whatever the user types next. - pi.on("agent_end", () => { - pendingMarkers.clear(); + return { messages }; }); pi.registerTool({ @@ -143,7 +141,7 @@ export default function (pi: ExtensionAPI) { const reason = normalizeReason(params.reason); const id = randomUUID(); - pendingMarkers.add(id); + activeHandoffId = id; // Notify other extensions (logging, metrics, etc.) before we // queue the actual handoff message. diff --git a/session-handoff/state.ts b/session-handoff/state.ts index ebd4905..9d026f0 100644 --- a/session-handoff/state.ts +++ b/session-handoff/state.ts @@ -32,19 +32,22 @@ * log. The trailing newline is consumed by the regex so the stripped prompt * starts cleanly. * - * Multi-handoff safety - * ──────────────────── - * Sentinels persist in the underlying session log even after we consume them, - * so we can't rely on "is the sentinel present?" alone — on the LLM call - * AFTER the handoff turn, the sentinel is still in `context.messages` and a - * naive check would re-truncate, destroying the model's response to the - * carry-over prompt. We track pending sentinel IDs in a Set in the caller's - * closure: a sentinel only matches if its ID is still pending. After the - * truncation fires, the caller removes the ID from the set, making the - * sentinel inert for all future context calls. + * Persistence across turns + * ──────────────────────── + * Truncation must re-apply on EVERY turn after a handoff, not just the first. + * The sentinel stays in the underlying session log — rewriting the LLM's view + * via the `context` event never writes back to the stored messages — so each + * turn we slice to the active handoff's sentinel again, keeping the context + * pinned to the carry-over prompt for the whole task. `slice(index)` keeps the + * sentinel message AND everything after it, so re-slicing never discards the + * model's work on the new task; it only keeps trimming the stale pre-handoff + * history. * - * This also makes resumed sessions safe — pending IDs start empty after - * extension reload, so all historical sentinels in the log are inert. + * The active handoff is tracked as a single id in the caller's closure (see + * index.ts::activeHandoffId). A later handoff overwrites it and wins, since its + * sentinel sits further down the message list. The id resets to null on + * extension reload, so a resumed session's historical sentinels are inert until + * a NEW handoff fires. */ export interface PendingHandoff { @@ -157,16 +160,17 @@ export function findPendingHandoff( /** * Slice the message list to start at the handoff message and strip the - * sentinel from its text. Returns the new list and the consumed ID, or - * undefined if no pending sentinel is present. + * sentinel from its text. Returns the new list and the matched ID, or + * undefined if no matching sentinel is present. * - * Pure: does NOT mutate `pendingIds`. The caller is responsible for removing - * the consumed ID from the set after a successful truncation. + * Pure: does NOT mutate `pendingIds`. Callers must NOT remove the matched id + * afterwards — truncation is meant to persist across turns (see + * resolveHandoffContext). */ export function truncateForHandoff( messages: ReadonlyArray, pendingIds: ReadonlySet, -): { messages: M[]; consumedId: string } | undefined { +): { messages: M[]; matchedId: string } | undefined { const match = findPendingHandoff(messages, pendingIds); if (!match) return undefined; @@ -184,6 +188,34 @@ export function truncateForHandoff( return { messages: [cleanedFirst, ...tail.slice(1)], - consumedId: match.id, + matchedId: match.id, }; } + +/** + * Resolve the LLM-visible message list for one `context` event, given the id of + * the currently active handoff (or null when none is active). + * + * Returns the truncated list — sliced to the active handoff's sentinel, with + * the sentinel stripped — or undefined to leave the context untouched. + * + * STABLE across turns: as long as `activeId` stays set and its sentinel is in + * the message log, every call returns the same truncation. That persistence is + * the whole point — it keeps the post-handoff context pinned to the carry-over + * prompt for the entire task, not just the first turn after the handoff. (The + * earlier design consumed the id after one turn, which let the full pre-handoff + * history snap back on the second turn — see the regression test in + * tests/session-handoff.test.ts.) + * + * Returns undefined when the active sentinel is not yet a message (its followUp + * hasn't drained, or the handoff was aborted before draining), leaving the + * context untouched until the sentinel actually appears. + */ +export function resolveHandoffContext( + messages: ReadonlyArray, + activeId: string | null, +): M[] | undefined { + if (!activeId) return undefined; + const result = truncateForHandoff(messages, new Set([activeId])); + return result?.messages; +} diff --git a/tests/session-handoff.test.ts b/tests/session-handoff.test.ts index 5d98378..ec8aebf 100644 --- a/tests/session-handoff.test.ts +++ b/tests/session-handoff.test.ts @@ -13,6 +13,7 @@ import { findPendingHandoff, normalizePrompt, normalizeReason, + resolveHandoffContext, SENTINEL_RE, stripSentinel, truncateForHandoff, @@ -184,9 +185,9 @@ test("findPendingHandoff: finds a matching sentinel", () => { }); test("findPendingHandoff: ignores sentinel whose id is not pending", () => { - // Past handoff sentinels persist in the session log but are inert once - // their id has been consumed (removed from the pending set). - const messages = [user(buildHandoffMessage("consumed", "old task"))]; + // Past handoff sentinels persist in the session log but are inert when + // their id is not the active one (e.g. a handoff from before a reload). + const messages = [user(buildHandoffMessage("inactive", "old task"))]; assert.equal(findPendingHandoff(messages, new Set(["different"])), undefined); }); @@ -247,7 +248,7 @@ test("truncateForHandoff: slices to the handoff message and strips the sentinel" ]; const result = truncateForHandoff(messages, new Set(["id-1"])); assert.ok(result, "expected truncation result"); - assert.equal(result.consumedId, "id-1"); + assert.equal(result.matchedId, "id-1"); assert.equal(result.messages.length, 1); assert.equal(result.messages[0].role, "user"); const content = result.messages[0].content as Array<{ @@ -329,7 +330,7 @@ test("truncateForHandoff: picks the LATEST handoff when multiple are pending", ( ]; const result = truncateForHandoff(messages, new Set(["id-1", "id-2"])); assert.ok(result); - assert.equal(result.consumedId, "id-2"); + assert.equal(result.matchedId, "id-2"); assert.equal(result.messages.length, 1); assert.equal( (result.messages[0].content as Array<{ text: string }>)[0].text, @@ -337,26 +338,118 @@ test("truncateForHandoff: picks the LATEST handoff when multiple are pending", ( ); }); -test("truncateForHandoff: full lifecycle — pending → consume → inert", () => { +test("truncateForHandoff: keeps matching across turns while the id stays pending", () => { + // Regression guard: the id must NOT be removed after the first match. As + // long as it stays in the pending set, every call keeps slicing to the + // sentinel — that is what pins the context to the carry-over prompt for the + // whole task instead of just one turn. const pending = new Set(); const messages = [user("task 1"), assistant("response")]; - // First context call: no pending markers, no truncation + // No pending markers: no truncation. assert.equal(truncateForHandoff(messages, pending), undefined); - // Tool fires, prompt drains + // Handoff fires; its followUp drains into the log. pending.add("id-1"); messages.push(user(buildHandoffMessage("id-1", "task 2"))); - // Next context call: truncation fires - const first = truncateForHandoff(messages, pending); - assert.ok(first); - assert.equal(first.consumedId, "id-1"); - pending.delete(first.consumedId); - - // Subsequent context calls (within task 2): the sentinel is still in the - // session log (we never write back), but the id is no longer pending — - // no further truncation. - messages.push(assistant("starting task 2")); - assert.equal(truncateForHandoff(messages, pending), undefined); + // Every subsequent turn keeps truncating to the same sentinel. + for (let turn = 0; turn < 3; turn++) { + const result = truncateForHandoff(messages, pending); + assert.ok(result, `turn ${turn}: expected truncation`); + assert.equal(result.matchedId, "id-1"); + assert.equal(result.messages[0].role, "user"); + assert.equal( + (result.messages[0].content as Array<{ text: string }>)[0].text, + "task 2", + ); + messages.push(assistant(`step ${turn}`)); + } +}); + +// ── resolveHandoffContext (the per-turn decision used by index.ts) ────────── + +test("resolveHandoffContext: returns undefined when no handoff is active", () => { + const messages = [user(buildHandoffMessage("id-1", "task")), assistant("x")]; + assert.equal(resolveHandoffContext(messages, null), undefined); +}); + +test("resolveHandoffContext: slices to the active handoff and strips the sentinel", () => { + const messages = [ + user("old task"), + assistant("did it"), + user(buildHandoffMessage("id-1", "new task")), + ]; + const result = resolveHandoffContext(messages, "id-1"); + assert.ok(result); + assert.equal(result.length, 1); + assert.equal( + (result[0].content as Array<{ text: string }>)[0].text, + "new task", + ); +}); + +test("resolveHandoffContext: returns undefined when the active sentinel is not in the log", () => { + // Handoff staged but its followUp has not drained yet (or was aborted): + // the sentinel isn't a message, so the context is left untouched. + const messages = [user("ordinary"), assistant("reply")]; + assert.equal(resolveHandoffContext(messages, "id-1"), undefined); +}); + +test("resolveHandoffContext: PERSISTS truncation across every turn (regression)", () => { + // The original bug consumed the handoff id after one turn, so the second + // turn sent the full pre-handoff history again (context ballooned across a + // task chain instead of resetting). Here we simulate several turns of a task + // growing after a handoff and assert the context stays pinned to the + // carry-over prompt on EVERY turn. + const history = [user("task 11"), assistant("done 11"), toolResult("ok")]; + const messages = [ + ...history, + user(buildHandoffMessage("id-1", "task 12")), + ]; + const activeId = "id-1"; + + for (let turn = 0; turn < 5; turn++) { + const result = resolveHandoffContext(messages, activeId); + assert.ok(result, `turn ${turn}: expected truncation`); + // First visible message is always the (stripped) carry-over prompt. + assert.equal(result[0].role, "user"); + assert.equal( + (result[0].content as Array<{ text: string }>)[0].text.startsWith( + "task 12", + ), + true, + `turn ${turn}: context not pinned to carry-over prompt`, + ); + // Stale pre-handoff history must never leak back in. + assert.equal( + result.some((m) => + (m.content as Array<{ text?: string }>).some( + (c) => c.text === "task 11", + ), + ), + false, + `turn ${turn}: stale pre-handoff history leaked back in`, + ); + // The task does more work; the log grows. + messages.push(assistant(`work step ${turn}`)); + messages.push(toolResult(`result ${turn}`)); + } +}); + +test("resolveHandoffContext: a later handoff supersedes the earlier one", () => { + const messages = [ + user(buildHandoffMessage("id-1", "first task")), + assistant("worked on first"), + toolResult("done"), + user(buildHandoffMessage("id-2", "second task")), + ]; + // index.ts overwrites activeHandoffId with the newest id on each handoff. + const result = resolveHandoffContext(messages, "id-2"); + assert.ok(result); + assert.equal(result.length, 1); + assert.equal( + (result[0].content as Array<{ text: string }>)[0].text, + "second task", + ); });