7 changes: 7 additions & 0 deletions backend/backend.proto
@@ -32,6 +32,8 @@ service Backend {
rpc GetMetrics(MetricsRequest) returns (MetricsResponse);

rpc VAD(VADRequest) returns (VADResponse) {}

rpc ModelMetadata(ModelOptions) returns (ModelMetadataResponse) {}
}

// Define the empty request
@@ -410,3 +412,8 @@ message Detection {
message DetectResponse {
repeated Detection Detections = 1;
}

message ModelMetadataResponse {
bool supports_thinking = 1;
string rendered_template = 2; // The rendered chat template with enable_thinking=true (empty if not applicable)
}
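
For illustration, a minimal sketch of how a Go caller could exercise the new RPC through the generated stub once the proto is regenerated. The address, timeout, and empty ModelOptions are placeholder assumptions; in LocalAI itself the call goes through the model loader and pkg/grpc client rather than dialing the backend directly.

package main

import (
	"context"
	"fmt"
	"log"
	"time"

	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"
)

func main() {
	// Placeholder address of a running llama-cpp backend gRPC server.
	conn, err := grpc.Dial("127.0.0.1:50051",
		grpc.WithTransportCredentials(insecure.NewCredentials()))
	if err != nil {
		log.Fatal(err)
	}
	defer conn.Close()

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	// Query the new ModelMetadata RPC; the request reuses ModelOptions,
	// so no new request message is needed.
	res, err := pb.NewBackendClient(conn).ModelMetadata(ctx, &pb.ModelOptions{})
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("supports_thinking:", res.SupportsThinking)
	fmt.Println("rendered_template:", res.RenderedTemplate)
}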
41 changes: 41 additions & 0 deletions backend/cpp/llama-cpp/grpc-server.cpp
@@ -2476,6 +2476,47 @@ class BackendServiceImpl final : public backend::Backend::Service {
response->set_prompt_tokens_processed(res_metrics->n_prompt_tokens_processed_total);


return grpc::Status::OK;
}

grpc::Status ModelMetadata(ServerContext* /*context*/, const backend::ModelOptions* /*request*/, backend::ModelMetadataResponse* response) override {
// Check if model is loaded
if (params_base.model.path.empty()) {
return grpc::Status(grpc::StatusCode::FAILED_PRECONDITION, "Model not loaded");
}

// Check if chat templates are initialized
if (ctx_server.impl->chat_params.tmpls == nullptr) {
// If templates are not initialized, we can't detect thinking support
// Return false as default
response->set_supports_thinking(false);
response->set_rendered_template("");
return grpc::Status::OK;
}

// Detect thinking support using llama.cpp's function
bool supports_thinking = common_chat_templates_support_enable_thinking(ctx_server.impl->chat_params.tmpls.get());
response->set_supports_thinking(supports_thinking);

// Render the template with enable_thinking=true so Go code can detect thinking tokens
// This allows reusing existing detection functions in Go
std::string rendered_template = "";
if (params_base.use_jinja) {
// Render the template with enable_thinking=true to see what the actual prompt looks like
common_chat_templates_inputs dummy_inputs;
common_chat_msg msg;
msg.role = "user";
msg.content = "test";
dummy_inputs.messages = {msg};
dummy_inputs.enable_thinking = true;
dummy_inputs.use_jinja = params_base.use_jinja;

const auto rendered = common_chat_templates_apply(ctx_server.impl->chat_params.tmpls.get(), dummy_inputs);
rendered_template = rendered.prompt;
}

response->set_rendered_template(rendered_template);

return grpc::Status::OK;
}
};
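
To make the handler's contract concrete, here is a hedged Go-side sketch of how a caller might distinguish the two degraded paths it defines: a FAILED_PRECONDITION status when no model is loaded, and an OK response with supports_thinking=false and an empty rendered_template when chat templates are uninitialized or use_jinja is off. The package and helper names are assumptions for illustration; the client is assumed to be already connected.

package metadataexample

import (
	"context"
	"fmt"

	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
)

// QueryThinkingMetadata is a hypothetical helper; client is assumed to be an
// already-connected pb.BackendClient for a llama-cpp backend.
func QueryThinkingMetadata(ctx context.Context, client pb.BackendClient) (bool, string, error) {
	res, err := client.ModelMetadata(ctx, &pb.ModelOptions{})
	if err != nil {
		if s, ok := status.FromError(err); ok && s.Code() == codes.FailedPrecondition {
			// The handler returns FAILED_PRECONDITION ("Model not loaded")
			// when called before a model has been loaded.
			return false, "", fmt.Errorf("load a model first: %s", s.Message())
		}
		return false, "", err
	}
	// When chat templates are not initialized (or use_jinja is disabled), the
	// handler still answers OK, but rendered_template is empty and only the
	// supports_thinking flag is meaningful.
	return res.SupportsThinking, res.RenderedTemplate, nil
}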
12 changes: 12 additions & 0 deletions core/backend/llm.go
@@ -61,6 +61,18 @@ func ModelInference(ctx context.Context, s string, messages schema.Messages, ima
return nil, err
}

// Detect thinking support after model load (only if not already detected)
// This needs to happen after LoadModel succeeds so the backend can render templates
if (c.ReasoningConfig.DisableReasoning == nil && c.ReasoningConfig.DisableReasoningTagPrefill == nil) && c.TemplateConfig.UseTokenizerTemplate {
modelOpts := grpcModelOpts(*c, o.SystemState.Model.ModelsPath)
config.DetectThinkingSupportFromBackend(ctx, c, inferenceModel, modelOpts)
// Update the config in the loader so it persists for future requests
cl.UpdateModelConfig(c.Name, func(cfg *config.ModelConfig) {
cfg.ReasoningConfig.DisableReasoning = c.ReasoningConfig.DisableReasoning
cfg.ReasoningConfig.DisableReasoningTagPrefill = c.ReasoningConfig.DisableReasoningTagPrefill
})
}

var protoMessages []*proto.Message
// if we are using the tokenizer template, we need to convert the messages to proto messages
// unless the prompt has already been tokenized (non-chat endpoints + functions)
52 changes: 52 additions & 0 deletions core/config/gguf.go
@@ -1,10 +1,16 @@
package config

import (
"context"

"github.com/mudler/LocalAI/pkg/grpc"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
"github.com/mudler/LocalAI/pkg/reasoning"
"github.com/mudler/LocalAI/pkg/xsysinfo"
"github.com/mudler/xlog"

gguf "github.com/gpustack/gguf-parser-go"
"github.com/gpustack/gguf-parser-go/util/ptr"
)

const (
@@ -71,6 +77,8 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) {
cfg.modelTemplate = chatTemplate.ValueString()
}

// Thinking support detection is done after model load via DetectThinkingSupportFromBackend

// template estimations
if cfg.HasTemplate() {
// nothing to guess here
@@ -92,3 +100,47 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) {
cfg.KnownUsecaseStrings = append(cfg.KnownUsecaseStrings, "FLAG_CHAT")

}

// DetectThinkingSupportFromBackend calls the ModelMetadata gRPC method to detect
// if the model supports thinking mode and if the template ends with a thinking start token.
// This should be called after the model is loaded.
// The results are stored in cfg.SupportsThinking and cfg.ThinkingForcedOpen.
func DetectThinkingSupportFromBackend(ctx context.Context, cfg *ModelConfig, backendClient grpc.Backend, modelOptions *pb.ModelOptions) {
if backendClient == nil {
xlog.Debug("[gguf] DetectThinkingSupportFromBackend: backend client is nil, skipping detection")
return
}

if modelOptions == nil {
xlog.Debug("[gguf] DetectThinkingSupportFromBackend: model options is nil, skipping detection")
return
}

// Only detect for llama-cpp backend when using tokenizer templates
if cfg.Backend != "llama-cpp" || !cfg.TemplateConfig.UseTokenizerTemplate {
xlog.Debug("[gguf] DetectThinkingSupportFromBackend: skipping detection", "backend", cfg.Backend, "useTokenizerTemplate", cfg.TemplateConfig.UseTokenizerTemplate)
return
}

metadata, err := backendClient.ModelMetadata(ctx, modelOptions)
if err != nil {
xlog.Warn("[gguf] DetectThinkingSupportFromBackend: failed to get model metadata", "error", err)
return
}

if metadata != nil {
cfg.ReasoningConfig.DisableReasoning = ptr.To(!metadata.SupportsThinking)

// Use the rendered template to detect if thinking token is at the end
// This reuses the existing DetectThinkingStartToken function
if metadata.RenderedTemplate != "" {
thinkingStartToken := reasoning.DetectThinkingStartToken(metadata.RenderedTemplate, &cfg.ReasoningConfig)
thinkingForcedOpen := thinkingStartToken != ""
cfg.ReasoningConfig.DisableReasoningTagPrefill = ptr.To(!thinkingForcedOpen)
xlog.Debug("[gguf] DetectThinkingSupportFromBackend: thinking support detected", "supports_thinking", metadata.SupportsThinking, "thinking_forced_open", thinkingForcedOpen, "thinking_start_token", thinkingStartToken)
} else {
cfg.ReasoningConfig.DisableReasoningTagPrefill = ptr.To(true)
xlog.Debug("[gguf] DetectThinkingSupportFromBackend: thinking support detected", "supports_thinking", metadata.SupportsThinking, "thinking_forced_open", false)
}
}
}
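
As a rough sketch of how the rendered template feeds the existing detection helper, the snippet below runs reasoning.DetectThinkingStartToken on an illustrative Qwen-style prompt that ends with an open <think> tag. The template string and the zero-value ModelConfig are assumptions; real callers go through DetectThinkingSupportFromBackend as above, and the detected tokens depend on the model's actual chat template and ReasoningConfig.

package main

import (
	"fmt"

	"github.com/mudler/LocalAI/core/config"
	"github.com/mudler/LocalAI/pkg/reasoning"
)

func main() {
	// Illustrative rendered prompt: the exact tokens depend entirely on the
	// model's chat template; an open "<think>" tag at the end is the case
	// where thinking is "forced open".
	rendered := "<|im_start|>user\ntest<|im_end|>\n<|im_start|>assistant\n<think>\n"

	var cfg config.ModelConfig
	token := reasoning.DetectThinkingStartToken(rendered, &cfg.ReasoningConfig)

	// A non-empty token means the template already opens a thinking block,
	// which is what DetectThinkingSupportFromBackend uses to set
	// DisableReasoningTagPrefill to false.
	fmt.Println("thinking start token:", token, "forced open:", token != "")
}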
11 changes: 11 additions & 0 deletions core/config/model_config_loader.go
@@ -246,6 +246,17 @@ func (bcl *ModelConfigLoader) RemoveModelConfig(m string) {
delete(bcl.configs, m)
}

// UpdateModelConfig updates an existing model config in the loader.
// This is useful for updating runtime-detected properties like thinking support.
func (bcl *ModelConfigLoader) UpdateModelConfig(m string, updater func(*ModelConfig)) {
bcl.Lock()
defer bcl.Unlock()
if cfg, exists := bcl.configs[m]; exists {
updater(&cfg)
bcl.configs[m] = cfg
}
}

// Preload prepare models if they are not local but url or huggingface repositories
func (bcl *ModelConfigLoader) Preload(modelPath string) error {
bcl.Lock()
2 changes: 2 additions & 0 deletions pkg/grpc/backend.go
@@ -57,4 +57,6 @@ type Backend interface {
GetTokenMetrics(ctx context.Context, in *pb.MetricsRequest, opts ...grpc.CallOption) (*pb.MetricsResponse, error)

VAD(ctx context.Context, in *pb.VADRequest, opts ...grpc.CallOption) (*pb.VADResponse, error)

ModelMetadata(ctx context.Context, in *pb.ModelOptions, opts ...grpc.CallOption) (*pb.ModelMetadataResponse, error)
}
4 changes: 4 additions & 0 deletions pkg/grpc/base/base.go
@@ -77,6 +77,10 @@ func (llm *Base) TokenizeString(opts *pb.PredictOptions) (pb.TokenizationRespons
return pb.TokenizationResponse{}, fmt.Errorf("unimplemented")
}

func (llm *Base) ModelMetadata(opts *pb.ModelOptions) (*pb.ModelMetadataResponse, error) {
return nil, fmt.Errorf("unimplemented")
}

// backends may wish to call this to capture the gopsutil info, then enhance with additional memory usage details?
func (llm *Base) Status() (pb.StatusResponse, error) {
return pb.StatusResponse{
22 changes: 22 additions & 0 deletions pkg/grpc/client.go
@@ -537,3 +537,25 @@ func (c *Client) Detect(ctx context.Context, in *pb.DetectOptions, opts ...grpc.
client := pb.NewBackendClient(conn)
return client.Detect(ctx, in, opts...)
}

func (c *Client) ModelMetadata(ctx context.Context, in *pb.ModelOptions, opts ...grpc.CallOption) (*pb.ModelMetadataResponse, error) {
if !c.parallel {
c.opMutex.Lock()
defer c.opMutex.Unlock()
}
c.setBusy(true)
defer c.setBusy(false)
c.wdMark()
defer c.wdUnMark()
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()),
grpc.WithDefaultCallOptions(
grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB
grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB
))
if err != nil {
return nil, err
}
defer conn.Close()
client := pb.NewBackendClient(conn)
return client.ModelMetadata(ctx, in, opts...)
}
4 changes: 4 additions & 0 deletions pkg/grpc/embed.go
@@ -99,6 +99,10 @@ func (e *embedBackend) VAD(ctx context.Context, in *pb.VADRequest, opts ...grpc.
return e.s.VAD(ctx, in)
}

func (e *embedBackend) ModelMetadata(ctx context.Context, in *pb.ModelOptions, opts ...grpc.CallOption) (*pb.ModelMetadataResponse, error) {
return e.s.ModelMetadata(ctx, in)
}

func (e *embedBackend) GetTokenMetrics(ctx context.Context, in *pb.MetricsRequest, opts ...grpc.CallOption) (*pb.MetricsResponse, error) {
return e.s.GetMetrics(ctx, in)
}
2 changes: 2 additions & 0 deletions pkg/grpc/interface.go
@@ -28,6 +28,8 @@ type AIModel interface {
StoresFind(*pb.StoresFindOptions) (pb.StoresFindResult, error)

VAD(*pb.VADRequest) (pb.VADResponse, error)

ModelMetadata(*pb.ModelOptions) (*pb.ModelMetadataResponse, error)
}

func newReply(s string) *pb.Reply {
12 changes: 12 additions & 0 deletions pkg/grpc/server.go
@@ -263,6 +263,18 @@ func (s *server) VAD(ctx context.Context, in *pb.VADRequest) (*pb.VADResponse, e
return &res, nil
}

func (s *server) ModelMetadata(ctx context.Context, in *pb.ModelOptions) (*pb.ModelMetadataResponse, error) {
if s.llm.Locking() {
s.llm.Lock()
defer s.llm.Unlock()
}
res, err := s.llm.ModelMetadata(in)
if err != nil {
return nil, err
}
return res, nil
}

func StartServer(address string, model AIModel) error {
lis, err := net.Listen("tcp", address)
if err != nil {