diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go index 5aed62fe1066..e3a237d80276 100644 --- a/core/http/endpoints/openai/chat.go +++ b/core/http/endpoints/openai/chat.go @@ -86,7 +86,11 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator // Prefer pre-parsed chat deltas from C++ autoparser when available if tokenUsage.HasChatDeltaContent() { - reasoningDelta, contentDelta = tokenUsage.ChatDeltaReasoningAndContent() + rawReasoning, cd := tokenUsage.ChatDeltaReasoningAndContent() + contentDelta = cd + // Strip reasoning tags (e.g. <|channel>thought / ) that + // the C++ autoparser includes as part of reasoning content. + reasoningDelta = extractor.ProcessChatDeltaReasoning(rawReasoning) // Keep extractor state consistent for fallback extractor.ProcessToken(s) } else { @@ -149,7 +153,11 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator // Prefer pre-parsed chat deltas from C++ autoparser when available if usage.HasChatDeltaContent() { - reasoningDelta, contentDelta = usage.ChatDeltaReasoningAndContent() + rawReasoning, cd := usage.ChatDeltaReasoningAndContent() + contentDelta = cd + // Strip reasoning tags (e.g. <|channel>thought / ) that + // the C++ autoparser includes as part of reasoning content. + reasoningDelta = extractor.ProcessChatDeltaReasoning(rawReasoning) // Keep extractor state consistent for fallback extractor.ProcessToken(s) } else { diff --git a/core/http/endpoints/openresponses/responses.go b/core/http/endpoints/openresponses/responses.go index 962b6bb7244e..565582627a08 100644 --- a/core/http/endpoints/openresponses/responses.go +++ b/core/http/endpoints/openresponses/responses.go @@ -1823,7 +1823,9 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6 var reasoningDelta, contentDelta string // Prefer pre-parsed chat deltas from C++ autoparser when available if tokenUsage.HasChatDeltaContent() { - reasoningDelta, contentDelta = tokenUsage.ChatDeltaReasoningAndContent() + rawReasoning, cd := tokenUsage.ChatDeltaReasoningAndContent() + contentDelta = cd + reasoningDelta = extractor.ProcessChatDeltaReasoning(rawReasoning) extractor.ProcessToken(token) // keep state consistent } else { reasoningDelta, contentDelta = extractor.ProcessToken(token) @@ -2350,7 +2352,9 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6 var reasoningDelta, contentDelta string // Prefer pre-parsed chat deltas from C++ autoparser when available if tokenUsage.HasChatDeltaContent() { - reasoningDelta, contentDelta = tokenUsage.ChatDeltaReasoningAndContent() + rawReasoning, cd := tokenUsage.ChatDeltaReasoningAndContent() + contentDelta = cd + reasoningDelta = extractor.ProcessChatDeltaReasoning(rawReasoning) extractor.ProcessToken(token) // keep state consistent } else { reasoningDelta, contentDelta = extractor.ProcessToken(token) diff --git a/pkg/reasoning/extractor.go b/pkg/reasoning/extractor.go index f5b5c6c82835..fb5103e7dcff 100644 --- a/pkg/reasoning/extractor.go +++ b/pkg/reasoning/extractor.go @@ -21,6 +21,12 @@ type ReasoningExtractor struct { lastReasoning string lastCleaned string suppressReasoning bool + + // ChatDelta reasoning accumulator — used by ProcessChatDeltaReasoning + // to strip reasoning tags (e.g. <|channel>thought, ) that + // the C++ autoparser includes in reasoning_content deltas. + cdReasoningAccum string + cdLastStrippedReasoning string } // NewReasoningExtractor creates a new extractor for the given thinking token and config. @@ -64,6 +70,61 @@ func (e *ReasoningExtractor) ProcessToken(token string) (reasoningDelta, content return reasoningDelta, contentDelta } +// ProcessChatDeltaReasoning accumulates raw reasoning text from C++ autoparser +// ChatDeltas, strips any embedded reasoning tags (e.g. <|channel>thought … +// for Gemma 4), and returns only the new stripped delta. +// This prevents tag tokens from leaking into the reasoning field of SSE chunks. +// +// When the C++ autoparser already strips tags (e.g. models), the text +// passes through unchanged — ExtractReasoning finds no tags so we use the raw text. +func (e *ReasoningExtractor) ProcessChatDeltaReasoning(rawDelta string) string { + if rawDelta == "" { + return "" + } + e.cdReasoningAccum += rawDelta + + // Try to strip reasoning tags from accumulated ChatDelta reasoning. + stripped, cleaned := ExtractReasoning(e.cdReasoningAccum, &e.config) + + if stripped == "" { + // ExtractReasoning found no reasoning content. This happens when: + // a) A complete start tag was found but has no content after it yet + // (cleaned == "" because everything is inside the unclosed tag) + // → keep buffering + // b) We're accumulating a partial multi-token start tag + // (e.g. "<|channel>" before "thought" arrives) + // → keep buffering + // c) No tags at all — C++ already stripped them + // → pass through the raw text as-is + if cleaned == "" && strings.TrimSpace(e.cdReasoningAccum) != "" { + // Case (a): tag found, unclosed, no content yet + stripped = "" + } else if e.thinkingStartToken != "" && + len(strings.TrimSpace(e.cdReasoningAccum)) < len(e.thinkingStartToken) && + strings.HasPrefix(e.thinkingStartToken, strings.TrimSpace(e.cdReasoningAccum)) { + // Case (b): partial start tag prefix + stripped = "" + } else { + // Case (c): no tags found — text is already clean from C++ + stripped = e.cdReasoningAccum + } + } + + // Compute delta from stripped reasoning + var delta string + if len(stripped) > len(e.cdLastStrippedReasoning) && strings.HasPrefix(stripped, e.cdLastStrippedReasoning) { + delta = stripped[len(e.cdLastStrippedReasoning):] + } else if stripped != e.cdLastStrippedReasoning && stripped != "" { + delta = stripped + } + e.cdLastStrippedReasoning = stripped + + if e.suppressReasoning { + return "" + } + return delta +} + // Reasoning returns the total accumulated reasoning after streaming. func (e *ReasoningExtractor) Reasoning() string { return e.lastReasoning @@ -84,6 +145,8 @@ func (e *ReasoningExtractor) Reset() { e.accumulated = "" e.lastReasoning = "" e.lastCleaned = "" + e.cdReasoningAccum = "" + e.cdLastStrippedReasoning = "" } // ResetAndSuppressReasoning clears state and suppresses future reasoning deltas. @@ -95,6 +158,8 @@ func (e *ReasoningExtractor) ResetAndSuppressReasoning() { e.accumulated = "" e.lastReasoning = "" e.lastCleaned = "" + e.cdReasoningAccum = "" + e.cdLastStrippedReasoning = "" e.suppressReasoning = true } diff --git a/pkg/reasoning/extractor_test.go b/pkg/reasoning/extractor_test.go index 854f59cf06a6..f6ccf514a0f5 100644 --- a/pkg/reasoning/extractor_test.go +++ b/pkg/reasoning/extractor_test.go @@ -195,4 +195,91 @@ var _ = Describe("ReasoningExtractor", func() { Expect(ext.CleanedContent()).To(Equal("visible content")) }) }) + + Context("ProcessChatDeltaReasoning with Gemma 4 tags", func() { + It("should strip <|channel>thought and tags from streaming deltas", func() { + ext := NewReasoningExtractor("<|channel>thought", Config{}) + + // Simulate C++ autoparser sending tag tokens as reasoning + d1 := ext.ProcessChatDeltaReasoning("<|channel>") + Expect(d1).To(BeEmpty(), "start tag prefix should be buffered, not emitted") + + d2 := ext.ProcessChatDeltaReasoning("thought") + Expect(d2).To(BeEmpty(), "start tag suffix should be buffered, not emitted") + + d3 := ext.ProcessChatDeltaReasoning("\n") + Expect(d3).To(BeEmpty(), "newline after start tag should not emit yet") + + d4 := ext.ProcessChatDeltaReasoning("The") + Expect(d4).To(Equal("The")) + + d5 := ext.ProcessChatDeltaReasoning(" user") + Expect(d5).To(Equal(" user")) + + d6 := ext.ProcessChatDeltaReasoning(" asks") + Expect(d6).To(Equal(" asks")) + + // Trailing newline gets TrimSpaced by ExtractReasoning, + // so it appears delayed with the next non-whitespace token + d7 := ext.ProcessChatDeltaReasoning("\n") + Expect(d7).To(BeEmpty(), "trailing newline is buffered by TrimSpace") + + d8 := ext.ProcessChatDeltaReasoning("2+2=4") + Expect(d8).To(Equal("\n2+2=4"), "delayed newline emitted with next content") + + d9 := ext.ProcessChatDeltaReasoning("") + Expect(d9).To(BeEmpty(), "close tag should be consumed, not emitted") + }) + + It("should handle empty deltas", func() { + ext := NewReasoningExtractor("<|channel>thought", Config{}) + d := ext.ProcessChatDeltaReasoning("") + Expect(d).To(BeEmpty()) + }) + + It("should pass through reasoning without tags unchanged", func() { + ext := NewReasoningExtractor("", Config{}) + + // When C++ autoparser already strips tags (e.g. models), + // reasoning arrives clean — just pass it through. + d1 := ext.ProcessChatDeltaReasoning("I need to") + Expect(d1).To(Equal("I need to")) + + d2 := ext.ProcessChatDeltaReasoning(" think carefully") + Expect(d2).To(Equal(" think carefully")) + }) + + It("should strip tags if C++ autoparser includes them", func() { + ext := NewReasoningExtractor("", Config{}) + + d1 := ext.ProcessChatDeltaReasoning("") + Expect(d1).To(BeEmpty()) + + d2 := ext.ProcessChatDeltaReasoning("reasoning") + Expect(d2).To(Equal("reasoning")) + + d3 := ext.ProcessChatDeltaReasoning("") + Expect(d3).To(BeEmpty()) + }) + + It("should respect suppressReasoning", func() { + ext := NewReasoningExtractor("<|channel>thought", Config{}) + ext.ResetAndSuppressReasoning() + + d := ext.ProcessChatDeltaReasoning("some reasoning") + Expect(d).To(BeEmpty()) + }) + + It("should reset ChatDelta state on Reset", func() { + ext := NewReasoningExtractor("<|channel>thought", Config{}) + + ext.ProcessChatDeltaReasoning("<|channel>thought") + ext.ProcessChatDeltaReasoning("\nfirst reasoning") + ext.Reset() + + // After reset, should start fresh + d := ext.ProcessChatDeltaReasoning("clean reasoning") + Expect(d).To(Equal("clean reasoning")) + }) + }) })