diff --git a/.changeset/large-session-tool-pruning.md b/.changeset/large-session-tool-pruning.md new file mode 100644 index 00000000000..187f985c2ea --- /dev/null +++ b/.changeset/large-session-tool-pruning.md @@ -0,0 +1,5 @@ +--- +"kilo-code": patch +--- + +Reduce delays in long sessions by compacting older tool outputs when the model request becomes too large. diff --git a/packages/opencode/src/session/compaction.ts b/packages/opencode/src/session/compaction.ts index 91b0ef4b863..e96332bcbfb 100644 --- a/packages/opencode/src/session/compaction.ts +++ b/packages/opencode/src/session/compaction.ts @@ -92,6 +92,10 @@ type CompletedCompaction = { summary: string | undefined } +// kilocode_change start - allow safe pruning at cache-invalidating boundaries +export type PruneReason = "normal" | "post-compaction" | "payload-limit" +// kilocode_change end + function summaryText(message: MessageV2.WithParts) { const text = message.parts .filter((part): part is MessageV2.TextPart => part.type === "text") @@ -188,7 +192,7 @@ export interface Interface { tokens: MessageV2.Assistant["tokens"] model: Provider.Model }) => Effect.Effect - readonly prune: (input: { sessionID: SessionID }) => Effect.Effect + readonly prune: (input: { sessionID: SessionID; reason?: PruneReason }) => Effect.Effect // kilocode_change readonly process: (input: { parentID: MessageID messages: MessageV2.WithParts[] @@ -296,10 +300,13 @@ export const layer: Layer.Layer< // goes backwards through parts until there are PRUNE_PROTECT tokens worth of tool // calls, then erases output of older tool calls to free context space - const prune = Effect.fn("SessionCompaction.prune")(function* (input: { sessionID: SessionID }) { + // kilocode_change start - preserve normal opt-in pruning, but allow payload/compaction cleanup by default + const prune = Effect.fn("SessionCompaction.prune")(function* (input: { sessionID: SessionID; reason?: PruneReason }) { const cfg = yield* config.get() - if (!cfg.compaction?.prune) return - log.info("pruning") + const reason = input.reason ?? "normal" + if (cfg.compaction?.prune === false) return + if (reason === "normal" && cfg.compaction?.prune !== true) return + log.info("pruning", { reason }) const msgs = yield* session .messages({ sessionID: input.sessionID }) @@ -338,9 +345,10 @@ export const layer: Layer.Layer< yield* session.updatePart(part) } } - log.info("pruned", { count: toPrune.length }) + log.info("pruned", { reason, count: toPrune.length }) } }) + // kilocode_change end const processCompaction = Effect.fn("SessionCompaction.process")(function* (input: { parentID: MessageID @@ -556,8 +564,13 @@ export const layer: Layer.Layer< } } + // kilocode_change start - compaction already invalidates cache, so collapse stale tool outputs too if (processor.message.error) return "stop" - if (result === "continue") yield* bus.publish(Event.Compacted, { sessionID: input.sessionID }) + if (result === "continue") { + yield* prune({ sessionID: input.sessionID, reason: "post-compaction" }) + yield* bus.publish(Event.Compacted, { sessionID: input.sessionID }) + } + // kilocode_change end return result }) @@ -612,11 +625,11 @@ export const defaultLayer = Layer.suspend(() => const { runPromise } = makeRuntime(Service, defaultLayer) -export async function isOverflow(input: { tokens: MessageV2.Assistant["tokens"]; model: Provider.Model }) { +export async function isOverflow(input: { tokens: MessageV2.Assistant["tokens"]; model: Provider.Model }) { // kilocode_change return runPromise((svc) => svc.isOverflow(input)) } -export async function prune(input: { sessionID: SessionID }) { +export async function prune(input: { sessionID: SessionID; reason?: PruneReason }) { // kilocode_change return runPromise((svc) => svc.prune(input)) } diff --git a/packages/opencode/src/session/prompt.ts b/packages/opencode/src/session/prompt.ts index 69fa84bcac7..1f679142639 100644 --- a/packages/opencode/src/session/prompt.ts +++ b/packages/opencode/src/session/prompt.ts @@ -78,6 +78,10 @@ const STRUCTURED_OUTPUT_SYSTEM_PROMPT = `IMPORTANT: The user has requested struc // kilocode_change export const shouldAskPlanFollowup = KiloSessionPrompt.shouldAskPlanFollowup +// kilocode_change start - persistent tool-output pruning when payload is already large +const REQUEST_PRUNE_BYTES = 1_250_000 +// kilocode_change end + const log = Log.create({ service: "session.prompt" }) const elog = EffectLogger.create({ service: "session.prompt" }) @@ -1333,7 +1337,6 @@ NOTE: At any point in time through this workflow you should feel free to ask the // kilocode_change end }, ) - // kilocode_change end const lastAssistant = Effect.fnUntraced(function* (sessionID: SessionID) { // kilocode_change start - retry when cancel races before shellImpl writes messages @@ -1587,12 +1590,27 @@ NOTE: At any point in time through this workflow you should feel free to ask the msgs = KiloSessionPrompt.maybeStripHistoricalMedia(msgs) // kilocode_change end - const [skills, env, instructions, modelMsgs] = yield* Effect.all([ + // kilocode_change start - persistently prune stale tool outputs when payload is already large + const [skills, env, instructions] = yield* Effect.all([ sys.skills(agent), sys.environment(model, lastUser.editorContext), // kilocode_change instruction.system().pipe(Effect.orDie), - MessageV2.toModelMessagesEffect(msgs, model), ]) + let modelMsgs = yield* MessageV2.toModelMessagesEffect(msgs, model) + const size = Buffer.byteLength(JSON.stringify(modelMsgs)) + if (size > REQUEST_PRUNE_BYTES) { + yield* compaction.prune({ sessionID, reason: "payload-limit" }) + msgs = yield* MessageV2.filterCompactedEffect(sessionID) + msgs = KiloSessionPromptQueue.scope(sessionID, msgs) + msgs = KiloSessionPrompt.trimBeforeLastSummary(msgs) + yield* plugin.trigger("experimental.chat.messages.transform", {}, { messages: msgs }) + KiloSessionPrompt.injectEditorContext({ msgs, lastUser, sessionID, cache: envCache }) + msgs = KiloSessionPrompt.maybeStripHistoricalMedia(msgs) + modelMsgs = yield* MessageV2.toModelMessagesEffect(msgs, model) + const nextSize = Buffer.byteLength(JSON.stringify(modelMsgs)) + if (nextSize > REQUEST_PRUNE_BYTES) log.warn("payload still large after pruning", { size: nextSize }) + } + // kilocode_change end const system = [...env, ...instructions, ...(skills ? [skills] : [])] const format = lastUser.format ?? { type: "text" as const } if (format.type === "json_schema") system.push(STRUCTURED_OUTPUT_SYSTEM_PROMPT) // kilocode_change @@ -1698,7 +1716,7 @@ NOTE: At any point in time through this workflow you should feel free to ask the continue } - yield* compaction.prune({ sessionID }).pipe(Effect.ignore, Effect.forkIn(scope)) + yield* compaction.prune({ sessionID, reason: "normal" }).pipe(Effect.ignore, Effect.forkIn(scope)) return yield* lastAssistant(sessionID) }, )