fix session-handoff truncation persisting for only one turn
The context handler consumed the handoff marker after the first turn (and agent_end cleared all markers), so every subsequent turn fell through to the fast path and re-sent the full, ever-growing history. Across a task chain the context grew monotonically instead of resetting per task, defeating the handoff and eventually overrunning the model's context window. Track a single activeHandoffId that is never consumed and is overwritten when a later handoff supersedes it, so truncation re-applies on every turn and the context stays pinned to the carry-over prompt for the whole task. Add a pure, testable resolveHandoffContext helper and a multi-turn regression test. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
+31
-33
@@ -25,18 +25,21 @@
|
|||||||
* Flow
|
* Flow
|
||||||
* ────
|
* ────
|
||||||
* 1. LLM calls `session_handoff({ prompt: "..." })`.
|
* 1. LLM calls `session_handoff({ prompt: "..." })`.
|
||||||
* 2. Tool generates a unique sentinel ID, adds it to `pendingMarkers`.
|
* 2. Tool generates a unique sentinel ID and records it as `activeHandoffId`.
|
||||||
* 3. Tool queues `<pi-handoff-resume:ID>\n<prompt>` as a followUp user
|
* 3. Tool queues `<pi-handoff-resume:ID>\n<prompt>` as a followUp user
|
||||||
* message. Returns `terminate: true` to stop the current agent loop.
|
* message. Returns `terminate: true` to stop the current agent loop.
|
||||||
* 4. Agent loop ends. followUp drains, the message is appended.
|
* 4. Agent loop ends. followUp drains, the message is appended.
|
||||||
* 5. New agent turn begins, LLM call requested.
|
* 5. New agent turn begins, LLM call requested.
|
||||||
* 6. `context` event fires. Our handler finds the sentinel, slices the
|
* 6. `context` event fires. Our handler finds the active sentinel, slices the
|
||||||
* message list to start with that message, strips the sentinel from its
|
* message list to start with that message, and strips the sentinel from
|
||||||
* text, removes the ID from `pendingMarkers` so the sentinel won't
|
* its text. This runs on EVERY subsequent turn — the active id is NOT
|
||||||
* re-trigger on subsequent turns.
|
* consumed — so the context stays pinned to the carry-over prompt for the
|
||||||
* 7. LLM receives a single-user-message context: just the carry-over
|
* whole task, not just the first turn. A later handoff overwrites
|
||||||
* prompt. System prompt and tool definitions are unaffected (they're
|
* `activeHandoffId` and wins, because only the active id's sentinel is matched.
|
||||||
* separate from `messages` in the LLM payload).
|
* 7. LLM receives a context that starts at the carry-over prompt — just that
|
||||||
|
* prompt on the first turn, plus the new task's own work on later turns.
|
||||||
|
* System prompt and tool definitions are unaffected (they're separate from
|
||||||
|
* `messages` in the LLM payload).
|
||||||
*
|
*
|
||||||
* Known caveats
|
* Known caveats
|
||||||
* ─────────────
|
* ─────────────
|
||||||
@@ -65,37 +68,32 @@ import {
|
|||||||
buildHandoffMessage,
|
buildHandoffMessage,
|
||||||
normalizePrompt,
|
normalizePrompt,
|
||||||
normalizeReason,
|
normalizeReason,
|
||||||
truncateForHandoff,
|
resolveHandoffContext,
|
||||||
} from "./state.js";
|
} from "./state.js";
|
||||||
|
|
||||||
export default function (pi: ExtensionAPI) {
|
export default function (pi: ExtensionAPI) {
|
||||||
// Sentinel IDs awaiting consumption. The tool adds an ID per handoff; the
|
// ID of the handoff currently anchoring the context, or null when none is
|
||||||
// context handler removes it once the corresponding sentinel has been
|
// active. Set when the tool fires; overwritten when a later handoff
|
||||||
// detected in the message list and truncation has been applied. Closure-
|
// supersedes it. Closure-scoped — survives across turns within a single
|
||||||
// scoped — survives across turns within a single extension load, resets
|
// extension load, resets to null on /reload or session restart (correctly
|
||||||
// on /reload or session restart (correctly making historical sentinels in
|
// making historical sentinels in a resumed on-disk log inert: no active id
|
||||||
// the on-disk log inert).
|
// ⇒ no truncation until a NEW handoff fires).
|
||||||
const pendingMarkers = new Set<string>();
|
let activeHandoffId: string | null = null;
|
||||||
|
|
||||||
pi.on("context", async (event, _ctx) => {
|
pi.on("context", async (event, _ctx) => {
|
||||||
// Fast path for normal turns (no pending handoff).
|
// Fast path for normal turns (no active handoff).
|
||||||
if (pendingMarkers.size === 0) return undefined;
|
if (activeHandoffId === null) return undefined;
|
||||||
|
|
||||||
const result = truncateForHandoff(event.messages, pendingMarkers);
|
// Re-applied on EVERY turn, not just the first one after the handoff.
|
||||||
if (!result) return undefined;
|
// The sentinel stays in the on-disk log (our return only rewrites what
|
||||||
|
// the LLM sees, never the stored messages), so slicing to it each turn
|
||||||
|
// keeps the context window pinned to the carry-over prompt for the whole
|
||||||
|
// task. Returns undefined until the sentinel actually appears (followUp
|
||||||
|
// not yet drained), leaving the context untouched.
|
||||||
|
const messages = resolveHandoffContext(event.messages, activeHandoffId);
|
||||||
|
if (!messages) return undefined;
|
||||||
|
|
||||||
pendingMarkers.delete(result.consumedId);
|
return { messages };
|
||||||
return { messages: result.messages };
|
|
||||||
});
|
|
||||||
|
|
||||||
// Defensive cleanup: drop any pending marker IDs at the end of each agent
|
|
||||||
// invocation. On the normal handoff path the set is already empty by this
|
|
||||||
// point (consumed by the context handler). On an aborted invocation the
|
|
||||||
// followUp message stays queued — clearing here ensures the orphaned
|
|
||||||
// sentinel is INERT when it eventually drains, instead of resurrecting the
|
|
||||||
// old handoff on top of whatever the user types next.
|
|
||||||
pi.on("agent_end", () => {
|
|
||||||
pendingMarkers.clear();
|
|
||||||
});
|
});
|
||||||
|
|
||||||
pi.registerTool({
|
pi.registerTool({
|
||||||
@@ -143,7 +141,7 @@ export default function (pi: ExtensionAPI) {
|
|||||||
const reason = normalizeReason(params.reason);
|
const reason = normalizeReason(params.reason);
|
||||||
|
|
||||||
const id = randomUUID();
|
const id = randomUUID();
|
||||||
pendingMarkers.add(id);
|
activeHandoffId = id;
|
||||||
|
|
||||||
// Notify other extensions (logging, metrics, etc.) before we
|
// Notify other extensions (logging, metrics, etc.) before we
|
||||||
// queue the actual handoff message.
|
// queue the actual handoff message.
|
||||||
|
|||||||
+50
-18
@@ -32,19 +32,22 @@
|
|||||||
* log. The trailing newline is consumed by the regex so the stripped prompt
|
* log. The trailing newline is consumed by the regex so the stripped prompt
|
||||||
* starts cleanly.
|
* starts cleanly.
|
||||||
*
|
*
|
||||||
* Multi-handoff safety
|
* Persistence across turns
|
||||||
* ────────────────────
|
* ────────────────────────
|
||||||
* Sentinels persist in the underlying session log even after we consume them,
|
* Truncation must re-apply on EVERY turn after a handoff, not just the first.
|
||||||
* so we can't rely on "is the sentinel present?" alone — on the LLM call
|
* The sentinel stays in the underlying session log — rewriting the LLM's view
|
||||||
* AFTER the handoff turn, the sentinel is still in `context.messages` and a
|
* via the `context` event never writes back to the stored messages — so each
|
||||||
* naive check would re-truncate, destroying the model's response to the
|
* turn we slice to the active handoff's sentinel again, keeping the context
|
||||||
* carry-over prompt. We track pending sentinel IDs in a Set in the caller's
|
* pinned to the carry-over prompt for the whole task. `slice(index)` keeps the
|
||||||
* closure: a sentinel only matches if its ID is still pending. After the
|
* sentinel message AND everything after it, so re-slicing never discards the
|
||||||
* truncation fires, the caller removes the ID from the set, making the
|
* model's work on the new task; it only keeps trimming the stale pre-handoff
|
||||||
* sentinel inert for all future context calls.
|
* history.
|
||||||
*
|
*
|
||||||
* This also makes resumed sessions safe — pending IDs start empty after
|
* The active handoff is tracked as a single id in the caller's closure (see
|
||||||
* extension reload, so all historical sentinels in the log are inert.
|
* index.ts::activeHandoffId). A later handoff overwrites it and wins, since its
|
||||||
|
* sentinel sits further down the message list. The id resets to null on
|
||||||
|
* extension reload, so a resumed session's historical sentinels are inert until
|
||||||
|
* a NEW handoff fires.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
export interface PendingHandoff {
|
export interface PendingHandoff {
|
||||||
@@ -157,16 +160,17 @@ export function findPendingHandoff(
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Slice the message list to start at the handoff message and strip the
|
* Slice the message list to start at the handoff message and strip the
|
||||||
* sentinel from its text. Returns the new list and the consumed ID, or
|
* sentinel from its text. Returns the new list and the matched ID, or
|
||||||
* undefined if no pending sentinel is present.
|
* undefined if no matching sentinel is present.
|
||||||
*
|
*
|
||||||
* Pure: does NOT mutate `pendingIds`. The caller is responsible for removing
|
* Pure: does NOT mutate `pendingIds`. Callers must NOT remove the matched id
|
||||||
* the consumed ID from the set after a successful truncation.
|
* afterwards — truncation is meant to persist across turns (see
|
||||||
|
* resolveHandoffContext).
|
||||||
*/
|
*/
|
||||||
export function truncateForHandoff<M extends MessageLike>(
|
export function truncateForHandoff<M extends MessageLike>(
|
||||||
messages: ReadonlyArray<M>,
|
messages: ReadonlyArray<M>,
|
||||||
pendingIds: ReadonlySet<string>,
|
pendingIds: ReadonlySet<string>,
|
||||||
): { messages: M[]; consumedId: string } | undefined {
|
): { messages: M[]; matchedId: string } | undefined {
|
||||||
const match = findPendingHandoff(messages, pendingIds);
|
const match = findPendingHandoff(messages, pendingIds);
|
||||||
if (!match) return undefined;
|
if (!match) return undefined;
|
||||||
|
|
||||||
@@ -184,6 +188,34 @@ export function truncateForHandoff<M extends MessageLike>(
|
|||||||
|
|
||||||
return {
|
return {
|
||||||
messages: [cleanedFirst, ...tail.slice(1)],
|
messages: [cleanedFirst, ...tail.slice(1)],
|
||||||
consumedId: match.id,
|
matchedId: match.id,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Resolve the LLM-visible message list for one `context` event, given the id of
|
||||||
|
* the currently active handoff (or null when none is active).
|
||||||
|
*
|
||||||
|
* Returns the truncated list — sliced to the active handoff's sentinel, with
|
||||||
|
* the sentinel stripped — or undefined to leave the context untouched.
|
||||||
|
*
|
||||||
|
* STABLE across turns: as long as `activeId` stays set and its sentinel is in
|
||||||
|
* the message log, every call truncates at the same sentinel. That persistence is
|
||||||
|
* the whole point — it keeps the post-handoff context pinned to the carry-over
|
||||||
|
* prompt for the entire task, not just the first turn after the handoff. (The
|
||||||
|
* earlier design consumed the id after one turn, which let the full pre-handoff
|
||||||
|
* history snap back on the second turn — see the regression test in
|
||||||
|
* tests/session-handoff.test.ts.)
|
||||||
|
*
|
||||||
|
* Returns undefined when the active sentinel is not yet a message (its followUp
|
||||||
|
* hasn't drained, or the handoff was aborted before draining), leaving the
|
||||||
|
* context untouched until the sentinel actually appears.
|
||||||
|
*/
|
||||||
|
export function resolveHandoffContext<M extends MessageLike>(
|
||||||
|
messages: ReadonlyArray<M>,
|
||||||
|
activeId: string | null,
|
||||||
|
): M[] | undefined {
|
||||||
|
if (!activeId) return undefined;
|
||||||
|
const result = truncateForHandoff(messages, new Set([activeId]));
|
||||||
|
return result?.messages;
|
||||||
|
}
|
||||||
|
|||||||
+112
-19
@@ -13,6 +13,7 @@ import {
|
|||||||
findPendingHandoff,
|
findPendingHandoff,
|
||||||
normalizePrompt,
|
normalizePrompt,
|
||||||
normalizeReason,
|
normalizeReason,
|
||||||
|
resolveHandoffContext,
|
||||||
SENTINEL_RE,
|
SENTINEL_RE,
|
||||||
stripSentinel,
|
stripSentinel,
|
||||||
truncateForHandoff,
|
truncateForHandoff,
|
||||||
@@ -184,9 +185,9 @@ test("findPendingHandoff: finds a matching sentinel", () => {
|
|||||||
});
|
});
|
||||||
|
|
||||||
test("findPendingHandoff: ignores sentinel whose id is not pending", () => {
|
test("findPendingHandoff: ignores sentinel whose id is not pending", () => {
|
||||||
// Past handoff sentinels persist in the session log but are inert once
|
// Past handoff sentinels persist in the session log but are inert when
|
||||||
// their id has been consumed (removed from the pending set).
|
// their id is not the active one (e.g. a handoff from before a reload).
|
||||||
const messages = [user(buildHandoffMessage("consumed", "old task"))];
|
const messages = [user(buildHandoffMessage("inactive", "old task"))];
|
||||||
assert.equal(findPendingHandoff(messages, new Set(["different"])), undefined);
|
assert.equal(findPendingHandoff(messages, new Set(["different"])), undefined);
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -247,7 +248,7 @@ test("truncateForHandoff: slices to the handoff message and strips the sentinel"
|
|||||||
];
|
];
|
||||||
const result = truncateForHandoff(messages, new Set(["id-1"]));
|
const result = truncateForHandoff(messages, new Set(["id-1"]));
|
||||||
assert.ok(result, "expected truncation result");
|
assert.ok(result, "expected truncation result");
|
||||||
assert.equal(result.consumedId, "id-1");
|
assert.equal(result.matchedId, "id-1");
|
||||||
assert.equal(result.messages.length, 1);
|
assert.equal(result.messages.length, 1);
|
||||||
assert.equal(result.messages[0].role, "user");
|
assert.equal(result.messages[0].role, "user");
|
||||||
const content = result.messages[0].content as Array<{
|
const content = result.messages[0].content as Array<{
|
||||||
@@ -329,7 +330,7 @@ test("truncateForHandoff: picks the LATEST handoff when multiple are pending", (
|
|||||||
];
|
];
|
||||||
const result = truncateForHandoff(messages, new Set(["id-1", "id-2"]));
|
const result = truncateForHandoff(messages, new Set(["id-1", "id-2"]));
|
||||||
assert.ok(result);
|
assert.ok(result);
|
||||||
assert.equal(result.consumedId, "id-2");
|
assert.equal(result.matchedId, "id-2");
|
||||||
assert.equal(result.messages.length, 1);
|
assert.equal(result.messages.length, 1);
|
||||||
assert.equal(
|
assert.equal(
|
||||||
(result.messages[0].content as Array<{ text: string }>)[0].text,
|
(result.messages[0].content as Array<{ text: string }>)[0].text,
|
||||||
@@ -337,26 +338,118 @@ test("truncateForHandoff: picks the LATEST handoff when multiple are pending", (
|
|||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
test("truncateForHandoff: full lifecycle — pending → consume → inert", () => {
|
test("truncateForHandoff: keeps matching across turns while the id stays pending", () => {
|
||||||
|
// Regression guard: the id must NOT be removed after the first match. As
|
||||||
|
// long as it stays in the pending set, every call keeps slicing to the
|
||||||
|
// sentinel — that is what pins the context to the carry-over prompt for the
|
||||||
|
// whole task instead of just one turn.
|
||||||
const pending = new Set<string>();
|
const pending = new Set<string>();
|
||||||
const messages = [user("task 1"), assistant("response")];
|
const messages = [user("task 1"), assistant("response")];
|
||||||
|
|
||||||
// First context call: no pending markers, no truncation
|
// No pending markers: no truncation.
|
||||||
assert.equal(truncateForHandoff(messages, pending), undefined);
|
assert.equal(truncateForHandoff(messages, pending), undefined);
|
||||||
|
|
||||||
// Tool fires, prompt drains
|
// Handoff fires; its followUp drains into the log.
|
||||||
pending.add("id-1");
|
pending.add("id-1");
|
||||||
messages.push(user(buildHandoffMessage("id-1", "task 2")));
|
messages.push(user(buildHandoffMessage("id-1", "task 2")));
|
||||||
|
|
||||||
// Next context call: truncation fires
|
// Every subsequent turn keeps truncating to the same sentinel.
|
||||||
const first = truncateForHandoff(messages, pending);
|
for (let turn = 0; turn < 3; turn++) {
|
||||||
assert.ok(first);
|
const result = truncateForHandoff(messages, pending);
|
||||||
assert.equal(first.consumedId, "id-1");
|
assert.ok(result, `turn ${turn}: expected truncation`);
|
||||||
pending.delete(first.consumedId);
|
assert.equal(result.matchedId, "id-1");
|
||||||
|
assert.equal(result.messages[0].role, "user");
|
||||||
// Subsequent context calls (within task 2): the sentinel is still in the
|
assert.equal(
|
||||||
// session log (we never write back), but the id is no longer pending —
|
(result.messages[0].content as Array<{ text: string }>)[0].text,
|
||||||
// no further truncation.
|
"task 2",
|
||||||
messages.push(assistant("starting task 2"));
|
);
|
||||||
assert.equal(truncateForHandoff(messages, pending), undefined);
|
messages.push(assistant(`step ${turn}`));
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// ── resolveHandoffContext (the per-turn decision used by index.ts) ──────────
|
||||||
|
|
||||||
|
test("resolveHandoffContext: returns undefined when no handoff is active", () => {
|
||||||
|
const messages = [user(buildHandoffMessage("id-1", "task")), assistant("x")];
|
||||||
|
assert.equal(resolveHandoffContext(messages, null), undefined);
|
||||||
|
});
|
||||||
|
|
||||||
|
test("resolveHandoffContext: slices to the active handoff and strips the sentinel", () => {
|
||||||
|
const messages = [
|
||||||
|
user("old task"),
|
||||||
|
assistant("did it"),
|
||||||
|
user(buildHandoffMessage("id-1", "new task")),
|
||||||
|
];
|
||||||
|
const result = resolveHandoffContext(messages, "id-1");
|
||||||
|
assert.ok(result);
|
||||||
|
assert.equal(result.length, 1);
|
||||||
|
assert.equal(
|
||||||
|
(result[0].content as Array<{ text: string }>)[0].text,
|
||||||
|
"new task",
|
||||||
|
);
|
||||||
|
});
|
||||||
|
|
||||||
|
test("resolveHandoffContext: returns undefined when the active sentinel is not in the log", () => {
|
||||||
|
// Handoff staged but its followUp has not drained yet (or was aborted):
|
||||||
|
// the sentinel isn't a message, so the context is left untouched.
|
||||||
|
const messages = [user("ordinary"), assistant("reply")];
|
||||||
|
assert.equal(resolveHandoffContext(messages, "id-1"), undefined);
|
||||||
|
});
|
||||||
|
|
||||||
|
test("resolveHandoffContext: PERSISTS truncation across every turn (regression)", () => {
|
||||||
|
// The original bug consumed the handoff id after one turn, so the second
|
||||||
|
// turn sent the full pre-handoff history again (context ballooned across a
|
||||||
|
// task chain instead of resetting). Here we simulate several turns of a task
|
||||||
|
// growing after a handoff and assert the context stays pinned to the
|
||||||
|
// carry-over prompt on EVERY turn.
|
||||||
|
const history = [user("task 11"), assistant("done 11"), toolResult("ok")];
|
||||||
|
const messages = [
|
||||||
|
...history,
|
||||||
|
user(buildHandoffMessage("id-1", "task 12")),
|
||||||
|
];
|
||||||
|
const activeId = "id-1";
|
||||||
|
|
||||||
|
for (let turn = 0; turn < 5; turn++) {
|
||||||
|
const result = resolveHandoffContext(messages, activeId);
|
||||||
|
assert.ok(result, `turn ${turn}: expected truncation`);
|
||||||
|
// First visible message is always the (stripped) carry-over prompt.
|
||||||
|
assert.equal(result[0].role, "user");
|
||||||
|
assert.equal(
|
||||||
|
(result[0].content as Array<{ text: string }>)[0].text.startsWith(
|
||||||
|
"task 12",
|
||||||
|
),
|
||||||
|
true,
|
||||||
|
`turn ${turn}: context not pinned to carry-over prompt`,
|
||||||
|
);
|
||||||
|
// Stale pre-handoff history must never leak back in.
|
||||||
|
assert.equal(
|
||||||
|
result.some((m) =>
|
||||||
|
(m.content as Array<{ text?: string }>).some(
|
||||||
|
(c) => c.text === "task 11",
|
||||||
|
),
|
||||||
|
),
|
||||||
|
false,
|
||||||
|
`turn ${turn}: stale pre-handoff history leaked back in`,
|
||||||
|
);
|
||||||
|
// The task does more work; the log grows.
|
||||||
|
messages.push(assistant(`work step ${turn}`));
|
||||||
|
messages.push(toolResult(`result ${turn}`));
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
test("resolveHandoffContext: a later handoff supersedes the earlier one", () => {
|
||||||
|
const messages = [
|
||||||
|
user(buildHandoffMessage("id-1", "first task")),
|
||||||
|
assistant("worked on first"),
|
||||||
|
toolResult("done"),
|
||||||
|
user(buildHandoffMessage("id-2", "second task")),
|
||||||
|
];
|
||||||
|
// index.ts overwrites activeHandoffId with the newest id on each handoff.
|
||||||
|
const result = resolveHandoffContext(messages, "id-2");
|
||||||
|
assert.ok(result);
|
||||||
|
assert.equal(result.length, 1);
|
||||||
|
assert.equal(
|
||||||
|
(result[0].content as Array<{ text: string }>)[0].text,
|
||||||
|
"second task",
|
||||||
|
);
|
||||||
});
|
});
|
||||||
|
|||||||
Reference in New Issue
Block a user