From 687a35d3cc96cc0c4207f8c8643c42c7f6486d30 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Sat, 13 Jun 2026 18:59:20 -0600 Subject: [PATCH] =?UTF-8?q?fix(tool-loop):=20strict-model=20tool-call=20hi?= =?UTF-8?q?story=20(assistant+tool=5Fcalls=20/=20role:tool)=20=E2=80=94=20?= =?UTF-8?q?not=20a=20user-message=20fold?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit runToolLoop and streamToolLoop folded every tool outcome into a single { role: 'user', content: 'Tool results:\n…' } message and dropped the assistant turn's tool_calls. A strict tool-history validator (Claude, and any OpenAI-compatible provider that validates) can't see its own tool use that way and re-issues the same call in a loop. The loop now appends OpenAI function-calling shape: the assistant turn that emitted the calls is preserved as an assistant message carrying its tool_calls array (content null when the turn was tool-only), and each result is its own { role: 'tool', tool_call_id, content } message keyed to its call. A missing tool-call id is derived deterministically from the tool name so the assistant entry and its result still match. ToolLoopMessage widens additively: tool_calls / tool_call_id are optional and content is string | null. A streamTurn that reads only role + content is unaffected; one that forwards the whole message array to an OpenAI-compatible endpoint now sends correct tool history. --- src/index.ts | 1 + src/tool-loop.test.ts | 177 +++++++++++++++++++++++++++++++++++++----- src/tool-loop.ts | 79 ++++++++++++++++--- 3 files changed, 227 insertions(+), 30 deletions(-) diff --git a/src/index.ts b/src/index.ts index 9eb0749f..d021b9cc 100644 --- a/src/index.ts +++ b/src/index.ts @@ -224,6 +224,7 @@ export { type StreamToolLoopYield, streamToolLoop, type ToolCallOutcome, + type ToolLoopAssistantToolCall, type ToolLoopCall, type ToolLoopEvent, type ToolLoopMessage, diff --git a/src/tool-loop.test.ts b/src/tool-loop.test.ts index 2af92276..97423db6 100644 --- a/src/tool-loop.test.ts +++ b/src/tool-loop.test.ts @@ -7,6 +7,7 @@ import { type ToolCallOutcome, type ToolLoopCall, type ToolLoopEvent, + type ToolLoopMessage, } from './tool-loop' const isExec = (n: string) => n === 'submit_proposal' || n === 'schedule_followup' @@ -31,8 +32,8 @@ describe('runToolLoop', () => { it('executes a tool call, folds the result back, re-runs to the final answer', async () => { const calls: ToolLoopCall[] = [] - const streamTurn = async function* (messages: Array<{ role: string; content: string }>) { - if (!messages.some((m) => m.content.includes('Tool results'))) { + const streamTurn = async function* (messages: ToolLoopMessage[]) { + if (!messages.some((m) => m.role === 'tool')) { yield { type: 'text', text: 'Routing. ' } as ToolLoopEvent yield { type: 'tool_call', @@ -57,6 +58,113 @@ describe('runToolLoop', () => { expect(r.finalText).toBe('Routing. Done.') }) + it('appends tool history in OpenAI shape: assistant+tool_calls then role:tool per result, no user fold', async () => { + const seen: ToolLoopMessage[][] = [] + const streamTurn = async function* (messages: ToolLoopMessage[]) { + seen.push(messages) + if (!messages.some((m) => m.role === 'tool')) { + yield { type: 'text', text: 'Routing. ' } as ToolLoopEvent + yield { + type: 'tool_call', + call: { toolName: 'submit_proposal', toolCallId: 'p1', args: { title: 'A' } }, + } as ToolLoopEvent + return + } + yield { type: 'text', text: 'Done.' } as ToolLoopEvent + } + + await runToolLoop({ + systemPrompt: 's', + userMessage: 'u', + streamTurn, + executeToolCall: async () => ({ ok: true, result: { id: 1 } }), + isExecutableTool: isExec, + }) + + // The second turn's history carries the model's own tool use in OpenAI shape. + const second = seen[1] ?? [] + const assistant = second.find((m) => m.role === 'assistant' && m.tool_calls) + expect(assistant).toMatchObject({ + role: 'assistant', + content: 'Routing.', + tool_calls: [{ id: 'p1', type: 'function', function: { name: 'submit_proposal' } }], + }) + const tool = second.find((m) => m.role === 'tool') + expect(tool).toMatchObject({ role: 'tool', tool_call_id: 'p1' }) + // The broken shape — a user message folding results — must not appear. + expect( + second.some((m) => m.role === 'user' && (m.content ?? '').includes('Tool results')), + ).toBe(false) + }) + + it('derives a stable tool_call_id from the tool name when the model omits one', async () => { + const seen: ToolLoopMessage[][] = [] + const streamTurn = async function* (messages: ToolLoopMessage[]) { + seen.push(messages) + if (!messages.some((m) => m.role === 'tool')) { + yield { + type: 'tool_call', + call: { toolName: 'submit_proposal', args: { title: 'A' } }, + } as ToolLoopEvent + return + } + yield { type: 'text', text: 'ok' } as ToolLoopEvent + } + + await runToolLoop({ + systemPrompt: 's', + userMessage: 'u', + streamTurn, + executeToolCall: async () => ({ ok: true, result: {} }), + isExecutableTool: isExec, + }) + + const second = seen[1] ?? [] + const assistant = second.find((m) => m.role === 'assistant' && m.tool_calls) + const tool = second.find((m) => m.role === 'tool') + // A tool-only turn carries null content and a name-derived id matching its result. + expect(assistant?.content).toBeNull() + expect(assistant?.tool_calls?.[0]?.id).toBe('call_submit_proposal') + expect(tool?.tool_call_id).toBe('call_submit_proposal') + }) + + it('a strict model that re-issues on a user fold completes on the tool-history shape', async () => { + // This model mimics a strict tool-history validator (Claude, OpenAI-compat): + // if its own tool call is folded into a user message it cannot see the + // result and re-issues the same call forever; given the assistant+tool_calls + // / role:tool shape it reads the result back and finishes. + let calls = 0 + const streamTurn = async function* (messages: ToolLoopMessage[]) { + const sawToolResult = messages.some((m) => m.role === 'tool') + const wasFoldedIntoUser = messages.some( + (m) => m.role === 'user' && (m.content ?? '').includes('Tool results'), + ) + if (sawToolResult && !wasFoldedIntoUser) { + yield { type: 'text', text: 'Final answer.' } as ToolLoopEvent + return + } + // No readable result yet — re-issue the call (the looping pathology). + calls++ + yield { + type: 'tool_call', + call: { toolName: 'submit_proposal', toolCallId: 'p1', args: { title: 'A' } }, + } as ToolLoopEvent + } + + const r = await runToolLoop({ + systemPrompt: 's', + userMessage: 'u', + streamTurn, + executeToolCall: async () => ({ ok: true, result: { id: 1 } }), + isExecutableTool: isExec, + }) + + expect(r.stopReason).toBe('completed') + expect(r.finalText).toBe('Final answer.') + // One call, then it reads the result and stops — not a stuck loop. + expect(calls).toBe(1) + }) + it('ignores non-executable tool calls', async () => { const stream = async function* () { yield { type: 'text', text: 'done' } as ToolLoopEvent @@ -196,8 +304,8 @@ describe('runToolLoop', () => { }) it('turns an executor throw into a failed outcome', async () => { - const streamTurn = async function* (messages: Array<{ role: string; content: string }>) { - if (!messages.some((m) => m.content.includes('Tool results'))) { + const streamTurn = async function* (messages: ToolLoopMessage[]) { + if (!messages.some((m) => m.role === 'tool')) { yield { type: 'tool_call', call: { toolName: 'submit_proposal', args: {} }, @@ -224,8 +332,8 @@ describe('runToolLoop', () => { it('emits general before/after hook events around loop, turn, and tool call boundaries', async () => { const events: RuntimeHookEvent[] = [] - const streamTurn = async function* (messages: Array<{ role: string; content: string }>) { - if (!messages.some((m) => m.content.includes('Tool results'))) { + const streamTurn = async function* (messages: ToolLoopMessage[]) { + if (!messages.some((m) => m.role === 'tool')) { yield { type: 'tool_call', call: { toolName: 'submit_proposal', toolCallId: 'p1', args: { title: 'A' } }, @@ -270,8 +378,8 @@ describe('runToolLoop', () => { it('emits one failure-recovery decision point before the next model turn', async () => { const order: string[] = [] const points: RuntimeDecisionPoint[] = [] - const streamTurn = async function* (messages: Array<{ role: string; content: string }>) { - if (!messages.some((m) => m.content.includes('Tool results'))) { + const streamTurn = async function* (messages: ToolLoopMessage[]) { + if (!messages.some((m) => m.role === 'tool')) { order.push('first-turn') yield { type: 'text', text: 'I will call the tool.' } as ToolLoopEvent yield { @@ -331,8 +439,8 @@ describe('runToolLoop', () => { it('does not emit a failure-recovery decision point for successful tool results', async () => { const points: RuntimeDecisionPoint[] = [] - const streamTurn = async function* (messages: Array<{ role: string; content: string }>) { - if (!messages.some((m) => m.content.includes('Tool results'))) { + const streamTurn = async function* (messages: ToolLoopMessage[]) { + if (!messages.some((m) => m.role === 'tool')) { yield { type: 'tool_call', call: { toolName: 'submit_proposal', args: {} }, @@ -360,8 +468,8 @@ describe('runToolLoop', () => { it('isolates hook failures from the tool loop', async () => { const hookErrors: string[] = [] - const streamTurn = async function* (messages: Array<{ role: string; content: string }>) { - if (!messages.some((m) => m.content.includes('Tool results'))) { + const streamTurn = async function* (messages: ToolLoopMessage[]) { + if (!messages.some((m) => m.role === 'tool')) { yield { type: 'tool_call', call: { toolName: 'submit_proposal', args: {} }, @@ -437,10 +545,8 @@ describe('streamToolLoop', () => { } it('yields each raw event + each tool_result and drives the loop', async () => { - const streamTurn = async function* ( - messages: Array<{ role: string; content: string }>, - ): AsyncIterable { - if (!messages.some((m) => m.content.includes('Tool results'))) { + const streamTurn = async function* (messages: ToolLoopMessage[]): AsyncIterable { + if (!messages.some((m) => m.role === 'tool')) { yield { type: 'text_delta', text: 'Routing. ' } yield { type: 'tool_call', toolName: 'submit_proposal', toolCallId: 'p1', args: {} } return @@ -460,6 +566,39 @@ describe('streamToolLoop', () => { expect(ys.filter((y) => y.kind === 'tool_result').length).toBe(1) }) + it('appends tool history in OpenAI shape (assistant+tool_calls then role:tool), no user fold', async () => { + const seen: ToolLoopMessage[][] = [] + const streamTurn = async function* (messages: ToolLoopMessage[]): AsyncIterable { + seen.push(messages) + if (!messages.some((m) => m.role === 'tool')) { + yield { type: 'text_delta', text: 'Routing. ' } + yield { type: 'tool_call', toolName: 'submit_proposal', toolCallId: 'p1', args: { a: 1 } } + return + } + yield { type: 'text_delta', text: 'Done.' } + } + for await (const _ of streamToolLoop({ + systemPrompt: 's', + userMessage: 'u', + streamTurn, + ...seams, + executeToolCall: async () => ({ ok: true, result: {} }), + })) { + // drain + } + const second = seen[1] ?? [] + expect(second.find((m) => m.role === 'assistant' && m.tool_calls)).toMatchObject({ + tool_calls: [{ id: 'p1', type: 'function', function: { name: 'submit_proposal' } }], + }) + expect(second.find((m) => m.role === 'tool')).toMatchObject({ + role: 'tool', + tool_call_id: 'p1', + }) + expect( + second.some((m) => m.role === 'user' && (m.content ?? '').includes('Tool results')), + ).toBe(false) + }) + it('emits one capped signal with backstop stopReason when the model never stops', async () => { const streamTurn = async function* (): AsyncIterable { yield { type: 'tool_call', toolName: 'submit_proposal', args: {} } @@ -499,10 +638,8 @@ describe('streamToolLoop', () => { it('emits a failure-recovery decision point without changing streamed yields', async () => { const points: RuntimeDecisionPoint[] = [] - const streamTurn = async function* ( - messages: Array<{ role: string; content: string }>, - ): AsyncIterable { - if (!messages.some((m) => m.content.includes('Tool results'))) { + const streamTurn = async function* (messages: ToolLoopMessage[]): AsyncIterable { + if (!messages.some((m) => m.role === 'tool')) { yield { type: 'text_delta', text: 'Trying. ' } yield { type: 'tool_call', toolName: 'submit_proposal', toolCallId: 'p1', args: {} } return diff --git a/src/tool-loop.ts b/src/tool-loop.ts index 0f0d86e1..71daa2f9 100644 --- a/src/tool-loop.ts +++ b/src/tool-loop.ts @@ -41,7 +41,62 @@ const FAILURE_RECOVERY_ACTIONS = ['retry', 'verify', 'continue', 'stop'] * stuck-loop detection. The window resets on any different call. */ const STUCK_LOOP_THRESHOLD = 3 -export type ToolLoopMessage = { role: string; content: string } +/** One OpenAI-shaped tool-call entry carried on an assistant message. */ +export interface ToolLoopAssistantToolCall { + id: string + type: 'function' + function: { name: string; arguments: string } +} + +/** + * A message in the running conversation the loop sends to `streamTurn`. + * + * The base `{ role, content }` covers `system` / `user` / plain `assistant` + * turns. Two optional fields carry the OpenAI function-calling contract so a + * strict model (Claude, and any OpenAI-compatible provider that validates tool + * history) reads its own tool use back instead of re-issuing the same call: + * + * - an assistant turn that emitted tool calls carries `tool_calls`, and its + * `content` is `null` when the turn was tool-only; + * - each tool result is its own `{ role: 'tool', tool_call_id, content }` + * message keyed to the call that produced it. + * + * Widening is additive: a `streamTurn` that reads only `role` + `content` still + * works; one that forwards the whole message to an OpenAI-compatible endpoint + * now sends correct tool history. + */ +export type ToolLoopMessage = { + role: string + content: string | null + tool_calls?: ToolLoopAssistantToolCall[] + tool_call_id?: string +} + +/** A tool-call id is required to key a `role: 'tool'` result back to its call. + * When the model omitted one, derive a stable id from the tool name so the + * assistant `tool_calls` entry and its `tool` result still match. */ +function toolCallId(call: ToolLoopCall): string { + return call.toolCallId ?? `call_${call.toolName}` +} + +/** The assistant turn that emitted `pending`, in OpenAI shape: text content + * (null when the turn was tool-only) plus its `tool_calls` array. */ +function assistantToolCallMessage(turnText: string, pending: ToolLoopCall[]): ToolLoopMessage { + return { + role: 'assistant', + content: turnText.trim() || null, + tool_calls: pending.map((call) => ({ + id: toolCallId(call), + type: 'function', + function: { name: call.toolName, arguments: JSON.stringify(call.args) }, + })), + } +} + +/** One `role: 'tool'` result message keyed to its call by `tool_call_id`. */ +function toolResultMessage(call: ToolLoopCall, content: string): ToolLoopMessage { + return { role: 'tool', tool_call_id: toolCallId(call), content } +} function defaultRender(label: string, outcome: ToolCallOutcome): string { if (outcome.ok) return `- ${label} → ok: ${JSON.stringify(outcome.result)}` @@ -158,8 +213,9 @@ export async function runToolLoop(opts: RunToolLoopOptions): Promise= opts.maxCostUsd) { const label = labelFor(call) toolResults.push({ call, label, outcome }) + messages.push(toolResultMessage(call, render(label, outcome))) observer.toolCallAfter(toolTurn, callEventId, call, outcome) observer.turnAfter(toolTurn, turnEventId, { pendingToolCalls: pending.length, @@ -211,8 +268,9 @@ export async function runToolLoop(opts: RunToolLoopOptions): Promise !item.outcome.ok).length, }) - messages.push({ role: 'user', content: `Tool results:\n${lines.join('\n')}` }) } observer.loopAfter({ turns, toolResults: toolResults.length, stopReason: 'completed' }) return { finalText, toolResults, turns, stopReason: 'completed', cappedOut: false } @@ -329,8 +386,9 @@ export async function* streamToolLoop( return } - if (turnText.trim()) messages.push({ role: 'assistant', content: turnText }) - const lines: string[] = [] + // The assistant turn that emitted the calls, carrying its tool_calls array, + // so a strict model reads its own tool use back in OpenAI shape. + messages.push(assistantToolCallMessage(turnText, pending)) const outcomes: ExecutedToolCall[] = [] for (const [callIndex, call] of pending.entries()) { // Stuck-loop detection. @@ -375,6 +433,7 @@ export async function* streamToolLoop( label, outcome, } + messages.push(toolResultMessage(call, render(label, outcome))) observer.toolCallAfter(toolTurn, callEventId, call, outcome) observer.turnAfter(toolTurn, turnEventId, { pendingToolCalls: pending.length, @@ -395,8 +454,9 @@ export async function* streamToolLoop( outcome, } const rendered = render(label, outcome) - lines.push(rendered) outcomes.push({ call, label, outcome, rendered }) + // One role:'tool' message per result, keyed to its call by tool_call_id. + messages.push(toolResultMessage(call, rendered)) observer.toolCallAfter(toolTurn, callEventId, call, outcome) } observer.failureRecovery({ @@ -414,7 +474,6 @@ export async function* streamToolLoop( })), failedToolCalls: outcomes.filter((item) => !item.outcome.ok).length, }) - messages.push({ role: 'user', content: `Tool results:\n${lines.join('\n')}` }) } } @@ -663,7 +722,7 @@ function renderDecisionContext( turnText: string, outcomes: ExecutedToolCall[], ): string { - const recent = messages.slice(-6).map((message) => `[${message.role}]\n${message.content}`) + const recent = messages.slice(-6).map((message) => `[${message.role}]\n${message.content ?? ''}`) const assistant = turnText.trim() ? [`[assistant]\n${turnText}`] : [] const toolResults = [`[tool results]\n${outcomes.map((item) => item.rendered).join('\n')}`] return trimText(