fix session-handoff truncation persisting only one turn
The context handler consumed the handoff marker after the first turn (and agent_end cleared all markers), so every subsequent turn fell through to the fast path and re-sent the full, ever-growing history. Across a task chain the context grew monotonically instead of resetting per task, defeating the handoff and eventually overrunning the model's context window. Track a single activeHandoffId that is never consumed and is overwritten when a later handoff supersedes it, so truncation re-applies on every turn and the context stays pinned to the carry-over prompt for the whole task. Add a pure, testable resolveHandoffContext helper and a multi-turn regression test. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
+31
-33
@@ -25,18 +25,21 @@
|
||||
* Flow
|
||||
* ────
|
||||
* 1. LLM calls `session_handoff({ prompt: "..." })`.
|
||||
* 2. Tool generates a unique sentinel ID, adds it to `pendingMarkers`.
|
||||
* 2. Tool generates a unique sentinel ID and records it as `activeHandoffId`.
|
||||
* 3. Tool queues `<pi-handoff-resume:ID>\n<prompt>` as a followUp user
|
||||
* message. Returns `terminate: true` to stop the current agent loop.
|
||||
* 4. Agent loop ends. followUp drains, the message is appended.
|
||||
* 5. New agent turn begins, LLM call requested.
|
||||
* 6. `context` event fires. Our handler finds the sentinel, slices the
|
||||
* message list to start with that message, strips the sentinel from its
|
||||
* text, removes the ID from `pendingMarkers` so the sentinel won't
|
||||
* re-trigger on subsequent turns.
|
||||
* 7. LLM receives a single-user-message context: just the carry-over
|
||||
* prompt. System prompt and tool definitions are unaffected (they're
|
||||
* separate from `messages` in the LLM payload).
|
||||
* 6. `context` event fires. Our handler finds the active sentinel, slices the
|
||||
* message list to start with that message, and strips the sentinel from
|
||||
* its text. This runs on EVERY subsequent turn — the active id is NOT
|
||||
* consumed — so the context stays pinned to the carry-over prompt for the
|
||||
* whole task, not just the first turn. A later handoff overwrites
|
||||
* `activeHandoffId` and wins, since its sentinel sits further down.
|
||||
* 7. LLM receives a context that starts at the carry-over prompt — just that
|
||||
* prompt on the first turn, plus the new task's own work on later turns.
|
||||
* System prompt and tool definitions are unaffected (they're separate from
|
||||
* `messages` in the LLM payload).
|
||||
*
|
||||
* Known caveats
|
||||
* ─────────────
|
||||
@@ -65,37 +68,32 @@ import {
|
||||
buildHandoffMessage,
|
||||
normalizePrompt,
|
||||
normalizeReason,
|
||||
truncateForHandoff,
|
||||
resolveHandoffContext,
|
||||
} from "./state.js";
|
||||
|
||||
export default function (pi: ExtensionAPI) {
|
||||
// Sentinel IDs awaiting consumption. The tool adds an ID per handoff; the
|
||||
// context handler removes it once the corresponding sentinel has been
|
||||
// detected in the message list and truncation has been applied. Closure-
|
||||
// scoped — survives across turns within a single extension load, resets
|
||||
// on /reload or session restart (correctly making historical sentinels in
|
||||
// the on-disk log inert).
|
||||
const pendingMarkers = new Set<string>();
|
||||
// ID of the handoff currently anchoring the context, or null when none is
|
||||
// active. Set when the tool fires; overwritten when a later handoff
|
||||
// supersedes it. Closure-scoped — survives across turns within a single
|
||||
// extension load, resets to null on /reload or session restart (correctly
|
||||
// making historical sentinels in a resumed on-disk log inert: no active id
|
||||
// ⇒ no truncation until a NEW handoff fires).
|
||||
let activeHandoffId: string | null = null;
|
||||
|
||||
pi.on("context", async (event, _ctx) => {
|
||||
// Fast path for normal turns (no pending handoff).
|
||||
if (pendingMarkers.size === 0) return undefined;
|
||||
// Fast path for normal turns (no active handoff).
|
||||
if (activeHandoffId === null) return undefined;
|
||||
|
||||
const result = truncateForHandoff(event.messages, pendingMarkers);
|
||||
if (!result) return undefined;
|
||||
// Re-applied on EVERY turn, not just the first one after the handoff.
|
||||
// The sentinel stays in the on-disk log (our return only rewrites what
|
||||
// the LLM sees, never the stored messages), so slicing to it each turn
|
||||
// keeps the context window pinned to the carry-over prompt for the whole
|
||||
// task. Returns undefined until the sentinel actually appears (followUp
|
||||
// not yet drained), leaving the context untouched.
|
||||
const messages = resolveHandoffContext(event.messages, activeHandoffId);
|
||||
if (!messages) return undefined;
|
||||
|
||||
pendingMarkers.delete(result.consumedId);
|
||||
return { messages: result.messages };
|
||||
});
|
||||
|
||||
// Defensive cleanup: drop any pending marker IDs at the end of each agent
|
||||
// invocation. On the normal handoff path the set is already empty by this
|
||||
// point (consumed by the context handler). On an aborted invocation the
|
||||
// followUp message stays queued — clearing here ensures the orphaned
|
||||
// sentinel is INERT when it eventually drains, instead of resurrecting the
|
||||
// old handoff on top of whatever the user types next.
|
||||
pi.on("agent_end", () => {
|
||||
pendingMarkers.clear();
|
||||
return { messages };
|
||||
});
|
||||
|
||||
pi.registerTool({
|
||||
@@ -143,7 +141,7 @@ export default function (pi: ExtensionAPI) {
|
||||
const reason = normalizeReason(params.reason);
|
||||
|
||||
const id = randomUUID();
|
||||
pendingMarkers.add(id);
|
||||
activeHandoffId = id;
|
||||
|
||||
// Notify other extensions (logging, metrics, etc.) before we
|
||||
// queue the actual handoff message.
|
||||
|
||||
+50
-18
@@ -32,19 +32,22 @@
|
||||
* log. The trailing newline is consumed by the regex so the stripped prompt
|
||||
* starts cleanly.
|
||||
*
|
||||
* Multi-handoff safety
|
||||
* ────────────────────
|
||||
* Sentinels persist in the underlying session log even after we consume them,
|
||||
* so we can't rely on "is the sentinel present?" alone — on the LLM call
|
||||
* AFTER the handoff turn, the sentinel is still in `context.messages` and a
|
||||
* naive check would re-truncate, destroying the model's response to the
|
||||
* carry-over prompt. We track pending sentinel IDs in a Set in the caller's
|
||||
* closure: a sentinel only matches if its ID is still pending. After the
|
||||
* truncation fires, the caller removes the ID from the set, making the
|
||||
* sentinel inert for all future context calls.
|
||||
* Persistence across turns
|
||||
* ────────────────────────
|
||||
* Truncation must re-apply on EVERY turn after a handoff, not just the first.
|
||||
* The sentinel stays in the underlying session log — rewriting the LLM's view
|
||||
* via the `context` event never writes back to the stored messages — so each
|
||||
* turn we slice to the active handoff's sentinel again, keeping the context
|
||||
* pinned to the carry-over prompt for the whole task. `slice(index)` keeps the
|
||||
* sentinel message AND everything after it, so re-slicing never discards the
|
||||
* model's work on the new task; it only keeps trimming the stale pre-handoff
|
||||
* history.
|
||||
*
|
||||
* This also makes resumed sessions safe — pending IDs start empty after
|
||||
* extension reload, so all historical sentinels in the log are inert.
|
||||
* The active handoff is tracked as a single id in the caller's closure (see
|
||||
* index.ts::activeHandoffId). A later handoff overwrites it and wins, since its
|
||||
* sentinel sits further down the message list. The id resets to null on
|
||||
* extension reload, so a resumed session's historical sentinels are inert until
|
||||
* a NEW handoff fires.
|
||||
*/
|
||||
|
||||
export interface PendingHandoff {
|
||||
@@ -157,16 +160,17 @@ export function findPendingHandoff(
|
||||
|
||||
/**
|
||||
* Slice the message list to start at the handoff message and strip the
|
||||
* sentinel from its text. Returns the new list and the consumed ID, or
|
||||
* undefined if no pending sentinel is present.
|
||||
* sentinel from its text. Returns the new list and the matched ID, or
|
||||
* undefined if no matching sentinel is present.
|
||||
*
|
||||
* Pure: does NOT mutate `pendingIds`. The caller is responsible for removing
|
||||
* the consumed ID from the set after a successful truncation.
|
||||
* Pure: does NOT mutate `pendingIds`. Callers must NOT remove the matched id
|
||||
* afterwards — truncation is meant to persist across turns (see
|
||||
* resolveHandoffContext).
|
||||
*/
|
||||
export function truncateForHandoff<M extends MessageLike>(
|
||||
messages: ReadonlyArray<M>,
|
||||
pendingIds: ReadonlySet<string>,
|
||||
): { messages: M[]; consumedId: string } | undefined {
|
||||
): { messages: M[]; matchedId: string } | undefined {
|
||||
const match = findPendingHandoff(messages, pendingIds);
|
||||
if (!match) return undefined;
|
||||
|
||||
@@ -184,6 +188,34 @@ export function truncateForHandoff<M extends MessageLike>(
|
||||
|
||||
return {
|
||||
messages: [cleanedFirst, ...tail.slice(1)],
|
||||
consumedId: match.id,
|
||||
matchedId: match.id,
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Resolve the LLM-visible message list for one `context` event, given the id of
|
||||
* the currently active handoff (or null when none is active).
|
||||
*
|
||||
* Returns the truncated list — sliced to the active handoff's sentinel, with
|
||||
* the sentinel stripped — or undefined to leave the context untouched.
|
||||
*
|
||||
* STABLE across turns: as long as `activeId` stays set and its sentinel is in
|
||||
* the message log, every call returns the same truncation. That persistence is
|
||||
* the whole point — it keeps the post-handoff context pinned to the carry-over
|
||||
* prompt for the entire task, not just the first turn after the handoff. (The
|
||||
* earlier design consumed the id after one turn, which let the full pre-handoff
|
||||
* history snap back on the second turn — see the regression test in
|
||||
* tests/session-handoff.test.ts.)
|
||||
*
|
||||
* Returns undefined when the active sentinel is not yet a message (its followUp
|
||||
* hasn't drained, or the handoff was aborted before draining), leaving the
|
||||
* context untouched until the sentinel actually appears.
|
||||
*/
|
||||
export function resolveHandoffContext<M extends MessageLike>(
|
||||
messages: ReadonlyArray<M>,
|
||||
activeId: string | null,
|
||||
): M[] | undefined {
|
||||
if (!activeId) return undefined;
|
||||
const result = truncateForHandoff(messages, new Set([activeId]));
|
||||
return result?.messages;
|
||||
}
|
||||
|
||||
+112
-19
@@ -13,6 +13,7 @@ import {
|
||||
findPendingHandoff,
|
||||
normalizePrompt,
|
||||
normalizeReason,
|
||||
resolveHandoffContext,
|
||||
SENTINEL_RE,
|
||||
stripSentinel,
|
||||
truncateForHandoff,
|
||||
@@ -184,9 +185,9 @@ test("findPendingHandoff: finds a matching sentinel", () => {
|
||||
});
|
||||
|
||||
test("findPendingHandoff: ignores sentinel whose id is not pending", () => {
|
||||
// Past handoff sentinels persist in the session log but are inert once
|
||||
// their id has been consumed (removed from the pending set).
|
||||
const messages = [user(buildHandoffMessage("consumed", "old task"))];
|
||||
// Past handoff sentinels persist in the session log but are inert when
|
||||
// their id is not the active one (e.g. a handoff from before a reload).
|
||||
const messages = [user(buildHandoffMessage("inactive", "old task"))];
|
||||
assert.equal(findPendingHandoff(messages, new Set(["different"])), undefined);
|
||||
});
|
||||
|
||||
@@ -247,7 +248,7 @@ test("truncateForHandoff: slices to the handoff message and strips the sentinel"
|
||||
];
|
||||
const result = truncateForHandoff(messages, new Set(["id-1"]));
|
||||
assert.ok(result, "expected truncation result");
|
||||
assert.equal(result.consumedId, "id-1");
|
||||
assert.equal(result.matchedId, "id-1");
|
||||
assert.equal(result.messages.length, 1);
|
||||
assert.equal(result.messages[0].role, "user");
|
||||
const content = result.messages[0].content as Array<{
|
||||
@@ -329,7 +330,7 @@ test("truncateForHandoff: picks the LATEST handoff when multiple are pending", (
|
||||
];
|
||||
const result = truncateForHandoff(messages, new Set(["id-1", "id-2"]));
|
||||
assert.ok(result);
|
||||
assert.equal(result.consumedId, "id-2");
|
||||
assert.equal(result.matchedId, "id-2");
|
||||
assert.equal(result.messages.length, 1);
|
||||
assert.equal(
|
||||
(result.messages[0].content as Array<{ text: string }>)[0].text,
|
||||
@@ -337,26 +338,118 @@ test("truncateForHandoff: picks the LATEST handoff when multiple are pending", (
|
||||
);
|
||||
});
|
||||
|
||||
test("truncateForHandoff: full lifecycle — pending → consume → inert", () => {
|
||||
test("truncateForHandoff: keeps matching across turns while the id stays pending", () => {
|
||||
// Regression guard: the id must NOT be removed after the first match. As
|
||||
// long as it stays in the pending set, every call keeps slicing to the
|
||||
// sentinel — that is what pins the context to the carry-over prompt for the
|
||||
// whole task instead of just one turn.
|
||||
const pending = new Set<string>();
|
||||
const messages = [user("task 1"), assistant("response")];
|
||||
|
||||
// First context call: no pending markers, no truncation
|
||||
// No pending markers: no truncation.
|
||||
assert.equal(truncateForHandoff(messages, pending), undefined);
|
||||
|
||||
// Tool fires, prompt drains
|
||||
// Handoff fires; its followUp drains into the log.
|
||||
pending.add("id-1");
|
||||
messages.push(user(buildHandoffMessage("id-1", "task 2")));
|
||||
|
||||
// Next context call: truncation fires
|
||||
const first = truncateForHandoff(messages, pending);
|
||||
assert.ok(first);
|
||||
assert.equal(first.consumedId, "id-1");
|
||||
pending.delete(first.consumedId);
|
||||
|
||||
// Subsequent context calls (within task 2): the sentinel is still in the
|
||||
// session log (we never write back), but the id is no longer pending —
|
||||
// no further truncation.
|
||||
messages.push(assistant("starting task 2"));
|
||||
assert.equal(truncateForHandoff(messages, pending), undefined);
|
||||
// Every subsequent turn keeps truncating to the same sentinel.
|
||||
for (let turn = 0; turn < 3; turn++) {
|
||||
const result = truncateForHandoff(messages, pending);
|
||||
assert.ok(result, `turn ${turn}: expected truncation`);
|
||||
assert.equal(result.matchedId, "id-1");
|
||||
assert.equal(result.messages[0].role, "user");
|
||||
assert.equal(
|
||||
(result.messages[0].content as Array<{ text: string }>)[0].text,
|
||||
"task 2",
|
||||
);
|
||||
messages.push(assistant(`step ${turn}`));
|
||||
}
|
||||
});
|
||||
|
||||
// ── resolveHandoffContext (the per-turn decision used by index.ts) ──────────
|
||||
|
||||
test("resolveHandoffContext: returns undefined when no handoff is active", () => {
|
||||
const messages = [user(buildHandoffMessage("id-1", "task")), assistant("x")];
|
||||
assert.equal(resolveHandoffContext(messages, null), undefined);
|
||||
});
|
||||
|
||||
test("resolveHandoffContext: slices to the active handoff and strips the sentinel", () => {
|
||||
const messages = [
|
||||
user("old task"),
|
||||
assistant("did it"),
|
||||
user(buildHandoffMessage("id-1", "new task")),
|
||||
];
|
||||
const result = resolveHandoffContext(messages, "id-1");
|
||||
assert.ok(result);
|
||||
assert.equal(result.length, 1);
|
||||
assert.equal(
|
||||
(result[0].content as Array<{ text: string }>)[0].text,
|
||||
"new task",
|
||||
);
|
||||
});
|
||||
|
||||
test("resolveHandoffContext: returns undefined when the active sentinel is not in the log", () => {
|
||||
// Handoff staged but its followUp has not drained yet (or was aborted):
|
||||
// the sentinel isn't a message, so the context is left untouched.
|
||||
const messages = [user("ordinary"), assistant("reply")];
|
||||
assert.equal(resolveHandoffContext(messages, "id-1"), undefined);
|
||||
});
|
||||
|
||||
test("resolveHandoffContext: PERSISTS truncation across every turn (regression)", () => {
|
||||
// The original bug consumed the handoff id after one turn, so the second
|
||||
// turn sent the full pre-handoff history again (context ballooned across a
|
||||
// task chain instead of resetting). Here we simulate several turns of a task
|
||||
// growing after a handoff and assert the context stays pinned to the
|
||||
// carry-over prompt on EVERY turn.
|
||||
const history = [user("task 11"), assistant("done 11"), toolResult("ok")];
|
||||
const messages = [
|
||||
...history,
|
||||
user(buildHandoffMessage("id-1", "task 12")),
|
||||
];
|
||||
const activeId = "id-1";
|
||||
|
||||
for (let turn = 0; turn < 5; turn++) {
|
||||
const result = resolveHandoffContext(messages, activeId);
|
||||
assert.ok(result, `turn ${turn}: expected truncation`);
|
||||
// First visible message is always the (stripped) carry-over prompt.
|
||||
assert.equal(result[0].role, "user");
|
||||
assert.equal(
|
||||
(result[0].content as Array<{ text: string }>)[0].text.startsWith(
|
||||
"task 12",
|
||||
),
|
||||
true,
|
||||
`turn ${turn}: context not pinned to carry-over prompt`,
|
||||
);
|
||||
// Stale pre-handoff history must never leak back in.
|
||||
assert.equal(
|
||||
result.some((m) =>
|
||||
(m.content as Array<{ text?: string }>).some(
|
||||
(c) => c.text === "task 11",
|
||||
),
|
||||
),
|
||||
false,
|
||||
`turn ${turn}: stale pre-handoff history leaked back in`,
|
||||
);
|
||||
// The task does more work; the log grows.
|
||||
messages.push(assistant(`work step ${turn}`));
|
||||
messages.push(toolResult(`result ${turn}`));
|
||||
}
|
||||
});
|
||||
|
||||
test("resolveHandoffContext: a later handoff supersedes the earlier one", () => {
|
||||
const messages = [
|
||||
user(buildHandoffMessage("id-1", "first task")),
|
||||
assistant("worked on first"),
|
||||
toolResult("done"),
|
||||
user(buildHandoffMessage("id-2", "second task")),
|
||||
];
|
||||
// index.ts overwrites activeHandoffId with the newest id on each handoff.
|
||||
const result = resolveHandoffContext(messages, "id-2");
|
||||
assert.ok(result);
|
||||
assert.equal(result.length, 1);
|
||||
assert.equal(
|
||||
(result[0].content as Array<{ text: string }>)[0].text,
|
||||
"second task",
|
||||
);
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user