Skip to content
Merged
192 changes: 184 additions & 8 deletions packages/core/src/services/chatCompressionService.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
import {
ChatCompressionService,
COMPACT_MAX_OUTPUT_TOKENS,
computeThresholds,
MAX_CONSECUTIVE_FAILURES,
MAX_HOOK_INSTRUCTIONS_CHARS,
Expand Down Expand Up @@ -914,23 +915,137 @@ describe('ChatCompressionService', () => {
expect(result.newHistory).toBeNull();
});

it('should return FAILED if usage metadata is missing', async () => {
it('should use estimated token count if usage metadata is missing', async () => {
  // Four 4,000-character turns, alternating user/model.
  const turnText = 'x'.repeat(4_000);
  const turnRoles = ['user', 'model', 'user', 'model'];
  const history: Content[] = turnRoles.map((role) => ({
    role,
    parts: [{ text: turnText }],
  }));
  vi.mocked(mockChat.getHistory).mockReturnValue(history);
  vi.mocked(uiTelemetryService.getLastPromptTokenCount).mockReturnValue(
    5_000,
  );
  vi.mocked(mockConfig.getContentGeneratorConfig).mockReturnValue({
    model: 'gemini-pro',
    contextWindowSize: 6_000,
  } as unknown as ReturnType<typeof mockConfig.getContentGeneratorConfig>);

  // Install a logger whose debug channel can be inspected by the
  // assertions at the end of the test.
  const debug = vi.fn();
  type SpyLogger = {
    warn: ReturnType<typeof vi.fn>;
    debug: typeof debug;
  };
  const configWithLogger = mockConfig as unknown as {
    getDebugLogger: () => SpyLogger;
  };
  configWithLogger.getDebugLogger = () => ({ warn: vi.fn(), debug });

  // Some OpenAI-compatible providers (for example MiniMax-2.7) may omit
  // usage on the compression side-query even when they return a summary.
  const sideQueryResponse = { text: 'Summary', usage: undefined };
  const mockGenerateContent = vi.fn().mockResolvedValue(sideQueryResponse);
  vi.mocked(mockConfig.getBaseLlmClient).mockReturnValue({
    generateText: mockGenerateContent,
  } as unknown as BaseLlmClient);

  const result = await service.compress(mockChat, {
    promptId: mockPromptId,
    force: true,
    model: mockModel,
    config: mockConfig,
    consecutiveFailures: 0,
    originalTokenCount: uiTelemetryService.getLastPromptTokenCount(),
  });

  const { info, newHistory } = result;
  expect(info.compressionStatus).toBe(CompressionStatus.COMPRESSED);
  expect(info.originalTokenCount).toBe(5_000);
  expect(info.newTokenCount).toBeGreaterThan(1_000);
  expect(info.newTokenCount).toBeLessThan(1_100);
  expect(newHistory).not.toBeNull();
  expect(newHistory![0].parts![0].text).toContain('Summary');

  // Both fragments must show up in the debug output.
  for (const fragment of [
    'usage metadata missing',
    'API-reported non-visible remainder (1000)',
  ]) {
    expect(debug).toHaveBeenCalledWith(expect.stringContaining(fragment));
  }
});

it('should reject inflated local delta if usage metadata is missing', async () => {
  // A short four-turn conversation, listed as [role, text] pairs.
  const turns: Array<[string, string]> = [
    ['user', 'short user message'],
    ['model', 'short model response'],
    ['user', 'another short user message'],
    ['model', 'another short model response'],
  ];
  const history: Content[] = turns.map(([role, text]) => ({
    role,
    parts: [{ text }],
  }));
  vi.mocked(mockChat.getHistory).mockReturnValue(history);
  vi.mocked(uiTelemetryService.getLastPromptTokenCount).mockReturnValue(800);
  vi.mocked(mockConfig.getContentGeneratorConfig).mockReturnValue({
    model: 'gemini-pro',
    contextWindowSize: 6_000,
  } as unknown as ReturnType<typeof mockConfig.getContentGeneratorConfig>);

  // The side-query answers with a 40,000-character summary and no usage
  // metadata.
  const verboseSummary = 'x'.repeat(40_000);
  const mockGenerateContent = vi.fn().mockResolvedValue({
    text: verboseSummary,
    usage: undefined,
  });
  vi.mocked(mockConfig.getBaseLlmClient).mockReturnValue({
    generateText: mockGenerateContent,
  } as unknown as BaseLlmClient);

  const result = await service.compress(mockChat, {
    promptId: mockPromptId,
    force: true,
    model: mockModel,
    config: mockConfig,
    consecutiveFailures: 0,
    originalTokenCount: uiTelemetryService.getLastPromptTokenCount(),
  });

  const { info, newHistory } = result;
  expect(info.compressionStatus).toBe(
    CompressionStatus.COMPRESSION_FAILED_INFLATED_TOKEN_COUNT,
  );
  expect(info.originalTokenCount).toBe(800);
  expect(info.newTokenCount).toBeGreaterThan(800);
  expect(newHistory).toBeNull();
});

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[Suggestion] Missing test coverage for the delta-based estimation path producing COMPRESSION_FAILED_INFLATED_TOKEN_COUNT. All existing INFLATED tests (lines 959 and 1171) use explicit promptTokenCount/candidatesTokenCount in mock usage, exercising the primary formula. No test covers the scenario where delta-based estimation (estimatedNewContentTokenCount > estimatedOriginalContentTokenCount) triggers the INFLATED guard — e.g., when the side-query returns a verbose summary larger than the compressed input in estimated terms. This branch at chatCompressionService.ts:723 is reachable via the delta path but untested.

Consider adding a test that constructs a small historyForSplit with a verbose summary (e.g., text: 'x'.repeat(40_000)) and usage: undefined, asserting COMPRESSION_FAILED_INFLATED_TOKEN_COUNT and newHistory === null.

— qwen3.7-max via Qwen Code /review

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Covered in the current branch by the test `should reject inflated local delta if usage metadata is missing`. That test sets `usage: undefined`, exercises the local delta-estimation path, and asserts `COMPRESSION_FAILED_INFLATED_TOKEN_COUNT` with `newHistory === null` when the estimated post-compression count is inflated.

Validated:

  • npm run test --workspace=packages/core -- src/services/chatCompressionService.test.ts
  • npx prettier --check packages/core/src/services/chatCompressionService.ts packages/core/src/services/chatCompressionService.test.ts
  • npx eslint packages/core/src/services/chatCompressionService.ts packages/core/src/services/chatCompressionService.test.ts


it('should reject cap-sized summaries even if usage metadata is missing', async () => {
const history: Content[] = [
{ role: 'user', parts: [{ text: 'msg1' }] },
{ role: 'model', parts: [{ text: 'msg2' }] },
{ role: 'user', parts: [{ text: 'msg3' }] },
{ role: 'model', parts: [{ text: 'msg4' }] },
];
vi.mocked(mockChat.getHistory).mockReturnValue(history);
vi.mocked(uiTelemetryService.getLastPromptTokenCount).mockReturnValue(800);
vi.mocked(uiTelemetryService.getLastPromptTokenCount).mockReturnValue(
180_000,
);
vi.mocked(mockConfig.getContentGeneratorConfig).mockReturnValue({
model: 'gemini-pro',
contextWindowSize: 1000,
contextWindowSize: 200_000,
} as unknown as ReturnType<typeof mockConfig.getContentGeneratorConfig>);

const warn = vi.fn();
(
mockConfig as unknown as {
getDebugLogger: () => {
warn: typeof warn;
debug: ReturnType<typeof vi.fn>;
};
}
).getDebugLogger = () => ({
warn,
debug: vi.fn(),
});
const mockGenerateContent = vi.fn().mockResolvedValue({
text: 'Summary',
// No usage -> keep original token count
text: 'x'.repeat(COMPACT_MAX_OUTPUT_TOKENS * 4),
usage: undefined,
});
vi.mocked(mockConfig.getBaseLlmClient).mockReturnValue({
Expand All @@ -947,11 +1062,72 @@ describe('ChatCompressionService', () => {
});

expect(result.info.compressionStatus).toBe(
CompressionStatus.COMPRESSION_FAILED_TOKEN_COUNT_ERROR,
CompressionStatus.COMPRESSION_FAILED_OUTPUT_TRUNCATED,
);
expect(result.newHistory).toBeNull();
expect(warn).toHaveBeenCalledWith(
expect.stringContaining('local estimate'),
);
expect(warn).toHaveBeenCalledWith(
expect.stringContaining('COMPACT_MAX_OUTPUT_TOKENS'),
);
});

it('should reject CJK cap-sized summaries when usage metadata is missing', async () => {
  // Minimal four-turn history; the size that matters here is the summary
  // returned by the side-query, not the conversation itself.
  const history: Content[] = [
    { role: 'user', parts: [{ text: 'msg1' }] },
    { role: 'model', parts: [{ text: 'msg2' }] },
    { role: 'user', parts: [{ text: 'msg3' }] },
    { role: 'model', parts: [{ text: 'msg4' }] },
  ];
  vi.mocked(mockChat.getHistory).mockReturnValue(history);
  vi.mocked(uiTelemetryService.getLastPromptTokenCount).mockReturnValue(
    180_000,
  );
  vi.mocked(mockConfig.getContentGeneratorConfig).mockReturnValue({
    model: 'gemini-pro',
    contextWindowSize: 200_000,
  } as unknown as ReturnType<typeof mockConfig.getContentGeneratorConfig>);

  // Replace the debug logger so the warnings can be asserted on below.
  const warn = vi.fn();
  (
    mockConfig as unknown as {
      getDebugLogger: () => {
        warn: typeof warn;
        debug: ReturnType<typeof vi.fn>;
      };
    }
  ).getDebugLogger = () => ({
    warn,
    debug: vi.fn(),
  });
  const mockGenerateContent = vi.fn().mockResolvedValue({
    // Enough U+4E00 characters that a 1.5-tokens-per-character estimate
    // is at least COMPACT_MAX_OUTPUT_TOKENS.
    text: '\u4e00'.repeat(Math.ceil(COMPACT_MAX_OUTPUT_TOKENS / 1.5)),
    usage: undefined,
  });
  vi.mocked(mockConfig.getBaseLlmClient).mockReturnValue({
    generateText: mockGenerateContent,
  } as unknown as BaseLlmClient);

  const result = await service.compress(mockChat, {
    promptId: mockPromptId,
    force: false,
    model: mockModel,
    config: mockConfig,
    consecutiveFailures: 0,
    originalTokenCount: uiTelemetryService.getLastPromptTokenCount(),
  });

  // The stale `toBe(800)` token-count assertions were dropped: this test
  // mocks the prompt token count as 180_000, so they belonged to the
  // removed side of the diff and contradicted the setup above.
  expect(result.info.compressionStatus).toBe(
    CompressionStatus.COMPRESSION_FAILED_OUTPUT_TRUNCATED,
  );
  expect(result.newHistory).toBeNull();
  expect(warn).toHaveBeenCalledWith(
    expect.stringContaining('local estimate'),
  );
  expect(warn).toHaveBeenCalledWith(
    expect.stringContaining('COMPACT_MAX_OUTPUT_TOKENS'),
  );
});

it('should return FAILED if summary is empty string', async () => {
Expand Down
94 changes: 86 additions & 8 deletions packages/core/src/services/chatCompressionService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,11 @@ import {
resolveSlimmingConfig,
slimCompactionInput,
} from './compactionInputSlimming.js';
import { CHARS_PER_TOKEN, estimatePromptTokens } from './tokenEstimation.js';
import {
CHARS_PER_TOKEN,
estimateContentTokens,
estimatePromptTokens,
} from './tokenEstimation.js';
import {
buildStateReminderParts,
composePostCompactHistory,
Expand Down Expand Up @@ -91,6 +95,30 @@ export const HARD_BUFFER = 3_000;
*/
export const MAX_CONSECUTIVE_FAILURES = 3;

// Estimated tokens counted per character matched by CJK_CHAR_PATTERN when a
// summary's output size is estimated locally.
const CJK_CHAR_TOKEN_MULTIPLIER = 1.5;
// Matches single UTF-16 code units in U+3040–U+30FF, U+3400–U+9FFF,
// U+F900–U+FAFF and U+AC00–U+D7AF. The `g` flag makes
// String.prototype.match return every occurrence so they can be counted.
// NOTE(review): all ranges are in the Basic Multilingual Plane and there is
// no `u` flag, so characters encoded as surrogate pairs are not matched.
const CJK_CHAR_PATTERN =
/[\u3040-\u30ff\u3400-\u9fff\uf900-\ufaff\uac00-\ud7af]/g;

/**
 * Locally estimates the output-token count of a compression summary.
 *
 * The summary is first sized with `estimateContentTokens` as a single
 * model-role text part. When the text contains characters matched by
 * `CJK_CHAR_PATTERN`, a second estimate is computed that weights those
 * characters by `CJK_CHAR_TOKEN_MULTIPLIER` and the remaining characters by
 * `CHARS_PER_TOKEN`; the larger of the two estimates is returned.
 *
 * @param summary - Summary text returned by the compression side-query.
 * @param imageTokenEstimate - Forwarded unchanged to `estimateContentTokens`.
 * @returns The estimated number of tokens in `summary`.
 */
function estimateSummaryOutputTokens(
  summary: string,
  imageTokenEstimate: number,
): number {
  const baselineEstimate = estimateContentTokens(
    [{ role: 'model', parts: [{ text: summary }] }],
    imageTokenEstimate,
  );

  // With the global flag, match() yields null when nothing matches and a
  // non-empty array otherwise.
  const cjkMatches = summary.match(CJK_CHAR_PATTERN);
  if (cjkMatches === null) {
    return baselineEstimate;
  }

  const cjkCount = cjkMatches.length;
  const otherCount = Math.max(0, summary.length - cjkCount);
  const otherTokens = Math.ceil(otherCount / CHARS_PER_TOKEN);
  const cjkTokens = Math.ceil(cjkCount * CJK_CHAR_TOKEN_MULTIPLIER);

  // Never report less than the generic estimate.
  return Math.max(baselineEstimate, otherTokens + cjkTokens);
}

/**
* Hard cap on the PreCompact hook's `additionalContext` once it is merged
* into the side-query system prompt. The user-supplied `/compress` text is
Expand Down Expand Up @@ -497,6 +525,19 @@ export class ChatCompressionService {
compressionUsageMetadata.totalTokenCount - compressionInputTokenCount,
);
}
if (compressionOutputTokenCount === undefined && !isSummaryEmpty) {
compressionOutputTokenCount = estimateSummaryOutputTokens(
summary,
slimmingConfig.imageTokenEstimate,

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

estimateContentTokens uses a fixed ~4 chars/token ratio (CHARS_PER_TOKEN). For CJK text the actual ratio is closer to ~0.67 chars/token (each CJK character ≈ 1.5 tokens). A CJK summary truncated at COMPACT_MAX_OUTPUT_TOKENS tokens would have only ~COMPACT_MAX_OUTPUT_TOKENS × 0.67 characters, yielding an estimate of ~COMPACT_MAX_OUTPUT_TOKENS × 0.17 — well below the threshold, so the truncation guard on line 413 would not fire.

This only affects providers that omit usage metadata AND produce CJK output, but for those providers a truncated summary could silently corrupt the compressed history.

One possible mitigation: supplement the char-based estimate with a length check against COMPACT_MAX_OUTPUT_TOKENS × CHARS_PER_TOKEN chars (the maximum the model could have written), or apply a safety margin.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Addressed in the current branch. The output-cap guard now uses a CJK-aware local estimate for summary output when provider usage is missing, so CJK summaries near the output-token cap are rejected as COMPRESSION_FAILED_OUTPUT_TRUNCATED instead of passing through via char/4 underestimation.

Coverage:

  • should reject CJK cap-sized summaries when usage metadata is missing

I also pushed 93b07ea08 to write that fixture as \u4e00, avoiding non-ASCII display/encoding noise while keeping the same CJK token-estimation case.

Validated:

  • npm run test --workspace=packages/core -- src/services/chatCompressionService.test.ts
  • npx prettier --check packages/core/src/services/chatCompressionService.ts packages/core/src/services/chatCompressionService.test.ts
  • npx eslint packages/core/src/services/chatCompressionService.ts packages/core/src/services/chatCompressionService.test.ts

);
config
.getDebugLogger()
.warn(
`[chat-compression] compression side-query omitted usage metadata; ` +
`using local estimate for summary output token count ` +
`(${compressionOutputTokenCount}).`,
);
}

// Defensive guard: if the side-query hit COMPACT_MAX_OUTPUT_TOKENS, the
// summary is likely truncated mid-content and unsafe to persist. Drop it
Expand Down Expand Up @@ -624,7 +665,12 @@ export class ChatCompressionService {
];
}

// Best-effort token math using *only* model-reported token counts.
// Best-effort token math using model-reported token counts when
// available. Some OpenAI-compatible providers omit usage for the
// compression side-query; in that case, fall back to the same local
// content estimator used by the auto-compaction gate so a valid summary
// can still shrink the history instead of failing with a token-count
// error.
//
// Note: compressionInputTokenCount includes the entire compression
// system prompt (the <state_snapshot> instructions, ~900 tokens) PLUS
Expand Down Expand Up @@ -654,12 +700,9 @@ export class ChatCompressionService {
// The composer injects file-restoration blocks (up to
// maxRecentFiles × 5K tokens) and an image-restoration block (up to
// maxRecentImages images) that are NOT in
// compressionOutputTokenCount. Estimate their
// cost locally so the inflation guard below
// (newTokenCount > originalTokenCount) actually fires when
// attachments dominate the post-compact size, and so
// `lastPromptTokenCount` doesn't under-report the next auto-
// compaction cheap-gate input (Finding 1).
// compressionOutputTokenCount. Estimate their cost locally so the
// inflation guard below fires when attachments dominate the
// post-compact size.
const restorationChars = extraHistory
.slice(2) // skip [summary, model ack]
.reduce(
Expand All @@ -668,6 +711,41 @@ export class ChatCompressionService {
0,
);
newTokenCount += Math.ceil(restorationChars / CHARS_PER_TOKEN);
} else {

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[Critical] estimateContentTokens(extraHistory) produces a char/4 lower-bound of only the visible post-compression messages (summary + ack + historyToKeep). But originalTokenCount is API-reported and includes system prompt + tool definitions (~15–25K tokens of invisible context). These are on fundamentally different scales.

Downstream, geminiChat.ts:1362 stores newTokenCount as lastPromptTokenCount. On the next send, estimatePromptTokens() takes the lastPromptTokenCount > 0 branch and returns lastPromptTokenCount + estimateContentTokens([userMessage]) — missing the system-prompt-and-tools delta entirely. The auto-compaction gate then undercounts by 15–25K tokens until the next API response corrects it. For providers that consistently omit usage, the undercount persists across compression cycles.

Additionally, the newTokenCount > originalTokenCount (INFLATED_TOKEN_COUNT) guard becomes effectively dead code: the char/4 estimate of visible content will almost always be less than the API-reported full-prompt count, so even a pathological summary that makes the history larger would pass this check.

Suggested fix — estimate the delta rather than the absolute post-compression size, so newTokenCount stays on the same scale as originalTokenCount:

} else {
  // Preserve the API-reported baseline (system prompt + tools) by
  // estimating the reduction from content replacement rather than
  // the absolute post-compression size.
  const estimatedRemoved = estimateContentTokens(
    historyToCompress,
    slimmingConfig.imageTokenEstimate,
  );
  const syntheticMessages = extraHistory.slice(
    0,
    extraHistory.length - historyToKeep.length,
  );
  const estimatedAdded = estimateContentTokens(
    syntheticMessages,
    slimmingConfig.imageTokenEstimate,
  );
  newTokenCount = Math.max(
    0,
    originalTokenCount - estimatedRemoved + estimatedAdded,
  );
  canCalculateNewTokenCount = newTokenCount > 0;
}

This mirrors the primary path's structure (originalTokenCount - removed + added) and keeps both sides of the INFLATED guard comparison on the same measurement scale.

— qwen3.7-max via Qwen Code /review

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Addressed in the current branch. The missing-usage path no longer replaces the API-reported total with a visible-history-only estimate. It computes the visible delta only, preserves the API-reported non-visible remainder (originalTokenCount - estimatedOriginalVisibleTokenCount), and adds the estimated post-compression visible history back on top.

Coverage:

  • should use estimated token count if usage metadata is missing

Validated:

  • npm run test --workspace=packages/core -- src/services/chatCompressionService.test.ts
  • npx prettier --check packages/core/src/services/chatCompressionService.ts packages/core/src/services/chatCompressionService.test.ts
  • npx eslint packages/core/src/services/chatCompressionService.ts packages/core/src/services/chatCompressionService.test.ts

const estimatedOriginalVisibleTokenCount = estimateContentTokens(
curatedHistory,
slimmingConfig.imageTokenEstimate,
);
const estimatedNewVisibleTokenCount = estimateContentTokens(
extraHistory,
slimmingConfig.imageTokenEstimate,
);
if (
estimatedOriginalVisibleTokenCount > 0 &&
estimatedNewVisibleTokenCount > 0
) {
const estimatedNonVisibleTokenCount = Math.max(
0,
originalTokenCount - estimatedOriginalVisibleTokenCount,
);
// Keep the API-reported system/tool/prompt remainder intact. The
// local estimator is only used for the visible conversation delta, so
// missing usage metadata cannot replace the authoritative total with
// a much smaller visible-history-only estimate.
newTokenCount =
estimatedNonVisibleTokenCount + estimatedNewVisibleTokenCount;
canCalculateNewTokenCount = true;
config
.getDebugLogger()
.debug(
`[chat-compression] usage metadata missing; estimated ` +
`post-compression token count by preserving the ` +
`API-reported non-visible remainder ` +
`(${estimatedNonVisibleTokenCount}) and replacing the ` +
`visible-history estimate (${estimatedOriginalVisibleTokenCount} -> ` +
`${estimatedNewVisibleTokenCount}).`,
);
}
}
}

Expand Down
Loading