diff --git a/packages/core/src/services/chatCompressionService.test.ts b/packages/core/src/services/chatCompressionService.test.ts
index 1317106817c..d5dbc83a0b9 100644
--- a/packages/core/src/services/chatCompressionService.test.ts
+++ b/packages/core/src/services/chatCompressionService.test.ts
@@ -7,6 +7,7 @@
 import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
 import {
   ChatCompressionService,
+  COMPACT_MAX_OUTPUT_TOKENS,
   computeThresholds,
   MAX_CONSECUTIVE_FAILURES,
   MAX_HOOK_INSTRUCTIONS_CHARS,
@@ -914,7 +915,108 @@ describe('ChatCompressionService', () => {
     expect(result.newHistory).toBeNull();
   });
 
-  it('should return FAILED if usage metadata is missing', async () => {
+  it('should use estimated token count if usage metadata is missing', async () => {
+    const largeMessage = 'x'.repeat(4_000);
+    const history: Content[] = [
+      { role: 'user', parts: [{ text: largeMessage }] },
+      { role: 'model', parts: [{ text: largeMessage }] },
+      { role: 'user', parts: [{ text: largeMessage }] },
+      { role: 'model', parts: [{ text: largeMessage }] },
+    ];
+    vi.mocked(mockChat.getHistory).mockReturnValue(history);
+    vi.mocked(uiTelemetryService.getLastPromptTokenCount).mockReturnValue(
+      5_000,
+    );
+    vi.mocked(mockConfig.getContentGeneratorConfig).mockReturnValue({
+      model: 'gemini-pro',
+      contextWindowSize: 6_000,
+    } as unknown as ReturnType<typeof mockConfig.getContentGeneratorConfig>);
+    const debug = vi.fn();
+    (
+      mockConfig as unknown as {
+        getDebugLogger: () => {
+          warn: ReturnType<typeof vi.fn>;
+          debug: typeof debug;
+        };
+      }
+    ).getDebugLogger = () => ({
+      warn: vi.fn(),
+      debug,
+    });
+
+    const mockGenerateContent = vi.fn().mockResolvedValue({
+      text: 'Summary',
+      // Some OpenAI-compatible providers (for example MiniMax-2.7) may omit
+      // usage on the compression side-query even when they return a summary.
+      usage: undefined,
+    });
+    vi.mocked(mockConfig.getBaseLlmClient).mockReturnValue({
+      generateText: mockGenerateContent,
+    } as unknown as BaseLlmClient);
+
+    const result = await service.compress(mockChat, {
+      promptId: mockPromptId,
+      force: true,
+      model: mockModel,
+      config: mockConfig,
+      consecutiveFailures: 0,
+      originalTokenCount: uiTelemetryService.getLastPromptTokenCount(),
+    });
+
+    expect(result.info.compressionStatus).toBe(CompressionStatus.COMPRESSED);
+    expect(result.info.originalTokenCount).toBe(5_000);
+    expect(result.info.newTokenCount).toBeGreaterThan(1_000);
+    expect(result.info.newTokenCount).toBeLessThan(1_100);
+    expect(result.newHistory).not.toBeNull();
+    expect(result.newHistory![0].parts![0].text).toContain('Summary');
+    expect(debug).toHaveBeenCalledWith(
+      expect.stringContaining('usage metadata missing'),
+    );
+    expect(debug).toHaveBeenCalledWith(
+      expect.stringContaining('API-reported non-visible remainder (1000)'),
+    );
+  });
+
+  it('should reject inflated local delta if usage metadata is missing', async () => {
+    const history: Content[] = [
+      { role: 'user', parts: [{ text: 'short user message' }] },
+      { role: 'model', parts: [{ text: 'short model response' }] },
+      { role: 'user', parts: [{ text: 'another short user message' }] },
+      { role: 'model', parts: [{ text: 'another short model response' }] },
+    ];
+    vi.mocked(mockChat.getHistory).mockReturnValue(history);
+    vi.mocked(uiTelemetryService.getLastPromptTokenCount).mockReturnValue(800);
+    vi.mocked(mockConfig.getContentGeneratorConfig).mockReturnValue({
+      model: 'gemini-pro',
+      contextWindowSize: 6_000,
+    } as unknown as ReturnType<typeof mockConfig.getContentGeneratorConfig>);
+
+    const mockGenerateContent = vi.fn().mockResolvedValue({
+      text: 'x'.repeat(40_000),
+      usage: undefined,
+    });
+    vi.mocked(mockConfig.getBaseLlmClient).mockReturnValue({
+      generateText: mockGenerateContent,
+    } as unknown as BaseLlmClient);
+
+    const result = await service.compress(mockChat, {
+      promptId: mockPromptId,
+      force: true,
+      model: mockModel,
+      config: mockConfig,
+      consecutiveFailures: 0,
+      originalTokenCount: uiTelemetryService.getLastPromptTokenCount(),
+    });
+
+    expect(result.info.compressionStatus).toBe(
+      CompressionStatus.COMPRESSION_FAILED_INFLATED_TOKEN_COUNT,
+    );
+    expect(result.info.originalTokenCount).toBe(800);
+    expect(result.info.newTokenCount).toBeGreaterThan(800);
+    expect(result.newHistory).toBeNull();
+  });
+
+  it('should reject cap-sized summaries even if usage metadata is missing', async () => {
     const history: Content[] = [
       { role: 'user', parts: [{ text: 'msg1' }] },
       { role: 'model', parts: [{ text: 'msg2' }] },
@@ -922,15 +1024,28 @@ describe('ChatCompressionService', () => {
       { role: 'model', parts: [{ text: 'msg4' }] },
     ];
     vi.mocked(mockChat.getHistory).mockReturnValue(history);
-    vi.mocked(uiTelemetryService.getLastPromptTokenCount).mockReturnValue(800);
+    vi.mocked(uiTelemetryService.getLastPromptTokenCount).mockReturnValue(
+      180_000,
+    );
     vi.mocked(mockConfig.getContentGeneratorConfig).mockReturnValue({
       model: 'gemini-pro',
-      contextWindowSize: 1000,
+      contextWindowSize: 200_000,
     } as unknown as ReturnType<typeof mockConfig.getContentGeneratorConfig>);
 
+    const warn = vi.fn();
+    (
+      mockConfig as unknown as {
+        getDebugLogger: () => {
+          warn: typeof warn;
+          debug: ReturnType<typeof vi.fn>;
+        };
+      }
+    ).getDebugLogger = () => ({
+      warn,
+      debug: vi.fn(),
+    });
     const mockGenerateContent = vi.fn().mockResolvedValue({
-      text: 'Summary',
-      // No usage -> keep original token count
+      text: 'x'.repeat(COMPACT_MAX_OUTPUT_TOKENS * 4),
       usage: undefined,
     });
     vi.mocked(mockConfig.getBaseLlmClient).mockReturnValue({
@@ -947,11 +1062,72 @@ describe('ChatCompressionService', () => {
     });
 
     expect(result.info.compressionStatus).toBe(
-      CompressionStatus.COMPRESSION_FAILED_TOKEN_COUNT_ERROR,
+      CompressionStatus.COMPRESSION_FAILED_OUTPUT_TRUNCATED,
+    );
+    expect(result.newHistory).toBeNull();
+    expect(warn).toHaveBeenCalledWith(
+      expect.stringContaining('local estimate'),
+    );
+    expect(warn).toHaveBeenCalledWith(
+      expect.stringContaining('COMPACT_MAX_OUTPUT_TOKENS'),
+    );
+  });
+
+  it('should reject CJK cap-sized summaries when usage metadata is missing', async () => {
+    const history: Content[] = [
+      { role: 'user', parts: [{ text: 'msg1' }] },
+      { role: 'model', parts: [{ text: 'msg2' }] },
+      { role: 'user', parts: [{ text: 'msg3' }] },
+      { role: 'model', parts: [{ text: 'msg4' }] },
+    ];
+    vi.mocked(mockChat.getHistory).mockReturnValue(history);
+    vi.mocked(uiTelemetryService.getLastPromptTokenCount).mockReturnValue(
+      180_000,
+    );
+    vi.mocked(mockConfig.getContentGeneratorConfig).mockReturnValue({
+      model: 'gemini-pro',
+      contextWindowSize: 200_000,
+    } as unknown as ReturnType<typeof mockConfig.getContentGeneratorConfig>);
+
+    const warn = vi.fn();
+    (
+      mockConfig as unknown as {
+        getDebugLogger: () => {
+          warn: typeof warn;
+          debug: ReturnType<typeof vi.fn>;
+        };
+      }
+    ).getDebugLogger = () => ({
+      warn,
+      debug: vi.fn(),
+    });
+    const mockGenerateContent = vi.fn().mockResolvedValue({
+      text: '\u4e00'.repeat(Math.ceil(COMPACT_MAX_OUTPUT_TOKENS / 1.5)),
+      usage: undefined,
+    });
+    vi.mocked(mockConfig.getBaseLlmClient).mockReturnValue({
+      generateText: mockGenerateContent,
+    } as unknown as BaseLlmClient);
+
+    const result = await service.compress(mockChat, {
+      promptId: mockPromptId,
+      force: false,
+      model: mockModel,
+      config: mockConfig,
+      consecutiveFailures: 0,
+      originalTokenCount: uiTelemetryService.getLastPromptTokenCount(),
+    });
+
+    expect(result.info.compressionStatus).toBe(
+      CompressionStatus.COMPRESSION_FAILED_OUTPUT_TRUNCATED,
     );
-    expect(result.info.originalTokenCount).toBe(800);
-    expect(result.info.newTokenCount).toBe(800);
     expect(result.newHistory).toBeNull();
+    expect(warn).toHaveBeenCalledWith(
+      expect.stringContaining('local estimate'),
+    );
+    expect(warn).toHaveBeenCalledWith(
+      expect.stringContaining('COMPACT_MAX_OUTPUT_TOKENS'),
+    );
   });
 
   it('should return FAILED if summary is empty string', async () => {
diff --git a/packages/core/src/services/chatCompressionService.ts b/packages/core/src/services/chatCompressionService.ts
index cd59820a96e..b9706f6245d 100644
--- a/packages/core/src/services/chatCompressionService.ts
+++ b/packages/core/src/services/chatCompressionService.ts
@@ -25,7 +25,11 @@ import {
   resolveSlimmingConfig,
   slimCompactionInput,
 } from './compactionInputSlimming.js';
-import { CHARS_PER_TOKEN, estimatePromptTokens } from './tokenEstimation.js';
+import {
+  CHARS_PER_TOKEN,
+  estimateContentTokens,
+  estimatePromptTokens,
+} from './tokenEstimation.js';
 import {
   buildStateReminderParts,
   composePostCompactHistory,
@@ -91,6 +95,30 @@ export const HARD_BUFFER = 3_000;
  */
 export const MAX_CONSECUTIVE_FAILURES = 3;
 
+const CJK_CHAR_TOKEN_MULTIPLIER = 1.5;
+const CJK_CHAR_PATTERN =
+  /[\u3040-\u30ff\u3400-\u9fff\uf900-\ufaff\uac00-\ud7af]/g;
+
+function estimateSummaryOutputTokens(
+  summary: string,
+  imageTokenEstimate: number,
+): number {
+  const genericEstimate = estimateContentTokens(
+    [{ role: 'model', parts: [{ text: summary }] }],
+    imageTokenEstimate,
+  );
+  const cjkCharCount = summary.match(CJK_CHAR_PATTERN)?.length ?? 0;
+  if (cjkCharCount === 0) {
+    return genericEstimate;
+  }
+
+  const nonCjkCharCount = Math.max(0, summary.length - cjkCharCount);
+  const cjkAwareEstimate =
+    Math.ceil(nonCjkCharCount / CHARS_PER_TOKEN) +
+    Math.ceil(cjkCharCount * CJK_CHAR_TOKEN_MULTIPLIER);
+  return Math.max(genericEstimate, cjkAwareEstimate);
+}
+
 /**
  * Hard cap on the PreCompact hook's `additionalContext` once it is merged
  * into the side-query system prompt. The user-supplied `/compress` text is
@@ -497,6 +525,19 @@ export class ChatCompressionService {
         compressionUsageMetadata.totalTokenCount - compressionInputTokenCount,
       );
     }
+    if (compressionOutputTokenCount === undefined && !isSummaryEmpty) {
+      compressionOutputTokenCount = estimateSummaryOutputTokens(
+        summary,
+        slimmingConfig.imageTokenEstimate,
+      );
+      config
+        .getDebugLogger()
+        .warn(
+          `[chat-compression] compression side-query omitted usage metadata; ` +
+            `using local estimate for summary output token count ` +
+            `(${compressionOutputTokenCount}).`,
+        );
+    }
 
     // Defensive guard: if the side-query hit COMPACT_MAX_OUTPUT_TOKENS, the
     // summary is likely truncated mid-content and unsafe to persist. Drop it
@@ -624,7 +665,12 @@ export class ChatCompressionService {
         ];
       }
 
-      // Best-effort token math using *only* model-reported token counts.
+      // Best-effort token math using model-reported token counts when
+      // available. Some OpenAI-compatible providers omit usage for the
+      // compression side-query; in that case, fall back to the same local
+      // content estimator used by the auto-compaction gate so a valid summary
+      // can still shrink the history instead of failing with a token-count
+      // error.
       //
       // Note: compressionInputTokenCount includes the entire compression
       // system prompt (the instructions, ~900 tokens) PLUS
@@ -654,12 +700,9 @@ export class ChatCompressionService {
         // The composer injects file-restoration blocks (up to
         // maxRecentFiles × 5K tokens) and an image-restoration block (up to
         // maxRecentImages images) that are NOT in
-        // compressionOutputTokenCount. Estimate their
-        // cost locally so the inflation guard below
-        // (newTokenCount > originalTokenCount) actually fires when
-        // attachments dominate the post-compact size, and so
-        // `lastPromptTokenCount` doesn't under-report the next auto-
-        // compaction cheap-gate input (Finding 1).
+        // compressionOutputTokenCount. Estimate their cost locally so the
+        // inflation guard below fires when attachments dominate the
+        // post-compact size.
         const restorationChars = extraHistory
           .slice(2) // skip [summary, model ack]
           .reduce(
@@ -668,6 +711,41 @@ export class ChatCompressionService {
             0,
           );
         newTokenCount += Math.ceil(restorationChars / CHARS_PER_TOKEN);
+      } else {
+        const estimatedOriginalVisibleTokenCount = estimateContentTokens(
+          curatedHistory,
+          slimmingConfig.imageTokenEstimate,
+        );
+        const estimatedNewVisibleTokenCount = estimateContentTokens(
+          extraHistory,
+          slimmingConfig.imageTokenEstimate,
+        );
+        if (
+          estimatedOriginalVisibleTokenCount > 0 &&
+          estimatedNewVisibleTokenCount > 0
+        ) {
+          const estimatedNonVisibleTokenCount = Math.max(
+            0,
+            originalTokenCount - estimatedOriginalVisibleTokenCount,
+          );
+          // Keep the API-reported system/tool/prompt remainder intact. The
+          // local estimator is only used for the visible conversation delta, so
+          // missing usage metadata cannot replace the authoritative total with
+          // a much smaller visible-history-only estimate.
+          newTokenCount =
+            estimatedNonVisibleTokenCount + estimatedNewVisibleTokenCount;
+          canCalculateNewTokenCount = true;
+          config
+            .getDebugLogger()
+            .debug(
+              `[chat-compression] usage metadata missing; estimated ` +
+                `post-compression token count by preserving the ` +
+                `API-reported non-visible remainder ` +
+                `(${estimatedNonVisibleTokenCount}) and replacing the ` +
+                `visible-history estimate (${estimatedOriginalVisibleTokenCount} -> ` +
+                `${estimatedNewVisibleTokenCount}).`,
+            );
+        }
       }
     }
 