Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions core/http/endpoints/openai/chat.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,11 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator

// Prefer pre-parsed chat deltas from C++ autoparser when available
if tokenUsage.HasChatDeltaContent() {
reasoningDelta, contentDelta = tokenUsage.ChatDeltaReasoningAndContent()
rawReasoning, cd := tokenUsage.ChatDeltaReasoningAndContent()
contentDelta = cd
// Strip reasoning tags (e.g. <|channel>thought / <channel|>) that
// the C++ autoparser includes as part of reasoning content.
reasoningDelta = extractor.ProcessChatDeltaReasoning(rawReasoning)
// Keep extractor state consistent for fallback
extractor.ProcessToken(s)
} else {
Expand Down Expand Up @@ -149,7 +153,11 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator

// Prefer pre-parsed chat deltas from C++ autoparser when available
if usage.HasChatDeltaContent() {
reasoningDelta, contentDelta = usage.ChatDeltaReasoningAndContent()
rawReasoning, cd := usage.ChatDeltaReasoningAndContent()
contentDelta = cd
// Strip reasoning tags (e.g. <|channel>thought / <channel|>) that
// the C++ autoparser includes as part of reasoning content.
reasoningDelta = extractor.ProcessChatDeltaReasoning(rawReasoning)
// Keep extractor state consistent for fallback
extractor.ProcessToken(s)
} else {
Expand Down
8 changes: 6 additions & 2 deletions core/http/endpoints/openresponses/responses.go
Original file line number Diff line number Diff line change
Expand Up @@ -1823,7 +1823,9 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
var reasoningDelta, contentDelta string
// Prefer pre-parsed chat deltas from C++ autoparser when available
if tokenUsage.HasChatDeltaContent() {
reasoningDelta, contentDelta = tokenUsage.ChatDeltaReasoningAndContent()
rawReasoning, cd := tokenUsage.ChatDeltaReasoningAndContent()
contentDelta = cd
reasoningDelta = extractor.ProcessChatDeltaReasoning(rawReasoning)
extractor.ProcessToken(token) // keep state consistent
} else {
reasoningDelta, contentDelta = extractor.ProcessToken(token)
Expand Down Expand Up @@ -2350,7 +2352,9 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
var reasoningDelta, contentDelta string
// Prefer pre-parsed chat deltas from C++ autoparser when available
if tokenUsage.HasChatDeltaContent() {
reasoningDelta, contentDelta = tokenUsage.ChatDeltaReasoningAndContent()
rawReasoning, cd := tokenUsage.ChatDeltaReasoningAndContent()
contentDelta = cd
reasoningDelta = extractor.ProcessChatDeltaReasoning(rawReasoning)
extractor.ProcessToken(token) // keep state consistent
} else {
reasoningDelta, contentDelta = extractor.ProcessToken(token)
Expand Down
65 changes: 65 additions & 0 deletions pkg/reasoning/extractor.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,12 @@ type ReasoningExtractor struct {
lastReasoning string
lastCleaned string
suppressReasoning bool

// ChatDelta reasoning accumulator — used by ProcessChatDeltaReasoning
// to strip reasoning tags (e.g. <|channel>thought, <channel|>) that
// the C++ autoparser includes in reasoning_content deltas.
cdReasoningAccum string
cdLastStrippedReasoning string
}

// NewReasoningExtractor creates a new extractor for the given thinking token and config.
Expand Down Expand Up @@ -64,6 +70,61 @@ func (e *ReasoningExtractor) ProcessToken(token string) (reasoningDelta, content
return reasoningDelta, contentDelta
}

// ProcessChatDeltaReasoning consumes one raw reasoning delta taken from a C++
// autoparser ChatDelta and returns only the tag-free reasoning text that has
// not been returned by an earlier call.
//
// Every non-empty rawDelta is appended to an internal accumulator, and the
// whole accumulator is re-run through ExtractReasoning so that reasoning tags
// split across several deltas (e.g. "<|channel>" followed by "thought" for
// Gemma 4) are recognised once they are complete. Tag tokens therefore never
// leak into the reasoning field of SSE chunks.
//
// If ExtractReasoning reports no reasoning and the accumulator is neither an
// unclosed tag nor a prefix of the thinking start token, the autoparser is
// taken to have stripped the tags already (e.g. <think> models) and the
// accumulated text is used as-is.
//
// An empty rawDelta returns "" without touching any state. When reasoning is
// suppressed the internal state is still updated, but "" is returned.
func (e *ReasoningExtractor) ProcessChatDeltaReasoning(rawDelta string) string {
	if rawDelta == "" {
		return ""
	}
	e.cdReasoningAccum += rawDelta

	// Re-extract from the full accumulated text on every call.
	reasoning, remainder := ExtractReasoning(e.cdReasoningAccum, &e.config)
	if reasoning == "" {
		pending := strings.TrimSpace(e.cdReasoningAccum)
		switch {
		case remainder == "" && pending != "":
			// A start tag was recognised but nothing follows it yet:
			// everything so far sits inside the unclosed tag. Keep buffering.
		case e.thinkingStartToken != "" &&
			len(pending) < len(e.thinkingStartToken) &&
			strings.HasPrefix(e.thinkingStartToken, pending):
			// Only a strict prefix of a multi-token start tag has arrived
			// (e.g. "<|channel>" before "thought"). Keep buffering.
		default:
			// No tags present at all: the text is already clean, so pass
			// the raw accumulated text through.
			reasoning = e.cdReasoningAccum
		}
	}

	// Remember the new stripped text before deciding what to return, so the
	// bookkeeping stays current even while reasoning is suppressed.
	previous := e.cdLastStrippedReasoning
	e.cdLastStrippedReasoning = reasoning

	if e.suppressReasoning {
		return ""
	}

	switch {
	case len(reasoning) > len(previous) && strings.HasPrefix(reasoning, previous):
		// Normal case: the stripped text grew, so return just the new suffix.
		return reasoning[len(previous):]
	case reasoning != previous && reasoning != "":
		// The stripped text changed in a way that is not a pure extension;
		// return it in full.
		return reasoning
	}
	return ""
}

// Reasoning returns the total accumulated reasoning after streaming.
func (e *ReasoningExtractor) Reasoning() string {
return e.lastReasoning
Expand All @@ -84,6 +145,8 @@ func (e *ReasoningExtractor) Reset() {
e.accumulated = ""
e.lastReasoning = ""
e.lastCleaned = ""
e.cdReasoningAccum = ""
e.cdLastStrippedReasoning = ""
}

// ResetAndSuppressReasoning clears state and suppresses future reasoning deltas.
Expand All @@ -95,6 +158,8 @@ func (e *ReasoningExtractor) ResetAndSuppressReasoning() {
e.accumulated = ""
e.lastReasoning = ""
e.lastCleaned = ""
e.cdReasoningAccum = ""
e.cdLastStrippedReasoning = ""
e.suppressReasoning = true
}

Expand Down
87 changes: 87 additions & 0 deletions pkg/reasoning/extractor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -195,4 +195,91 @@ var _ = Describe("ReasoningExtractor", func() {
Expect(ext.CleanedContent()).To(Equal("visible content"))
})
})

// Specs for ProcessChatDeltaReasoning: the ChatDelta path that accumulates
// raw reasoning deltas, strips reasoning tags, and returns only the newly
// available stripped text on each call.
Context("ProcessChatDeltaReasoning with Gemma 4 tags", func() {
// The start token "<|channel>thought" arrives split across two deltas, so
// nothing may be emitted until real reasoning text follows the complete tag.
It("should strip <|channel>thought and <channel|> tags from streaming deltas", func() {
ext := NewReasoningExtractor("<|channel>thought", Config{})

// Simulate C++ autoparser sending tag tokens as reasoning
d1 := ext.ProcessChatDeltaReasoning("<|channel>")
Expect(d1).To(BeEmpty(), "start tag prefix should be buffered, not emitted")

d2 := ext.ProcessChatDeltaReasoning("thought")
Expect(d2).To(BeEmpty(), "start tag suffix should be buffered, not emitted")

d3 := ext.ProcessChatDeltaReasoning("\n")
Expect(d3).To(BeEmpty(), "newline after start tag should not emit yet")

d4 := ext.ProcessChatDeltaReasoning("The")
Expect(d4).To(Equal("The"))

d5 := ext.ProcessChatDeltaReasoning(" user")
Expect(d5).To(Equal(" user"))

d6 := ext.ProcessChatDeltaReasoning(" asks")
Expect(d6).To(Equal(" asks"))

// Trailing newline gets TrimSpaced by ExtractReasoning,
// so it appears delayed with the next non-whitespace token
d7 := ext.ProcessChatDeltaReasoning("\n")
Expect(d7).To(BeEmpty(), "trailing newline is buffered by TrimSpace")

d8 := ext.ProcessChatDeltaReasoning("2+2=4")
Expect(d8).To(Equal("\n2+2=4"), "delayed newline emitted with next content")

d9 := ext.ProcessChatDeltaReasoning("<channel|>")
Expect(d9).To(BeEmpty(), "close tag should be consumed, not emitted")
})

// An empty delta is expected to yield an empty result.
It("should handle empty deltas", func() {
ext := NewReasoningExtractor("<|channel>thought", Config{})
d := ext.ProcessChatDeltaReasoning("")
Expect(d).To(BeEmpty())
})

It("should pass through reasoning without tags unchanged", func() {
ext := NewReasoningExtractor("<think>", Config{})

// When C++ autoparser already strips tags (e.g. <think> models),
// reasoning arrives clean — just pass it through.
d1 := ext.ProcessChatDeltaReasoning("I need to")
Expect(d1).To(Equal("I need to"))

d2 := ext.ProcessChatDeltaReasoning(" think carefully")
Expect(d2).To(Equal(" think carefully"))
})

// Here the open and close tags each arrive whole in a single delta; neither
// may appear in the returned reasoning.
It("should strip <think> tags if C++ autoparser includes them", func() {
ext := NewReasoningExtractor("<think>", Config{})

d1 := ext.ProcessChatDeltaReasoning("<think>")
Expect(d1).To(BeEmpty())

d2 := ext.ProcessChatDeltaReasoning("reasoning")
Expect(d2).To(Equal("reasoning"))

d3 := ext.ProcessChatDeltaReasoning("</think>")
Expect(d3).To(BeEmpty())
})

// After ResetAndSuppressReasoning, reasoning deltas must not be emitted.
It("should respect suppressReasoning", func() {
ext := NewReasoningExtractor("<|channel>thought", Config{})
ext.ResetAndSuppressReasoning()

d := ext.ProcessChatDeltaReasoning("some reasoning")
Expect(d).To(BeEmpty())
})

// Reset must discard the ChatDelta accumulator so text seen before the reset
// does not influence the deltas returned afterwards.
It("should reset ChatDelta state on Reset", func() {
ext := NewReasoningExtractor("<|channel>thought", Config{})

ext.ProcessChatDeltaReasoning("<|channel>thought")
ext.ProcessChatDeltaReasoning("\nfirst reasoning")
ext.Reset()

// After reset, should start fresh
d := ext.ProcessChatDeltaReasoning("clean reasoning")
Expect(d).To(Equal("clean reasoning"))
})
})
})
Loading