# Send a single non-streaming chat completion request ("stream": false) to the
# OpenAI-compatible endpoint on localhost:8081, giving up after 60 seconds (-m 60).
# The trailing backslashes keep the headers and body part of the same command.
# NOTE(review): the requested model "Qwen3.5-4B-Q4_K_M.gguf" differs from the
# configured model name "qwen_qwen3.5-4b" -- confirm which one is intended.
curl -m 60 http://localhost:8081/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "Qwen3.5-4B-Q4_K_M.gguf", "messages": [{"role": "user", "content": "say hello"}], "stream": false}'
{"created":1773284146,"object":"chat.completion","id":"2c5092f8-db39-4c15-aedf-a4ef2c6c8f15","model":"Qwen3.5-4B-Q4_K_M.gguf","choices":[{"index":0,"finish_reason":"stop","message":{"role":"assistant","content":"","reasoning":"(the response is a confirmation of the following statement: "The response is in a system response. The assistant is an AI assistant, and the user is referring to the assistant in a system response. The user is asking about the relationship between the assistant and the user in a system response. The assistant is a system response, and the user is referring to the assistant in a system response. The user is asking about the relationship between the assistant and the user in a system response. The assistant is a system response, and the user is referring to the assistant in a system response. The user is asking about the relationship between the assistant and the user in a system response. The assistant is a system response, and the user is referring to the assistant in a system response." in Chinese."}}],"usage":{"prompt_tokens":12,"completion_tokens":158,"total_tokens":170}}
---
# LocalAI model definition for the Qwen3.5 4B GGUF build (llama-cpp backend).
backend: llama-cpp
description: Imported from https://huggingface.co/bartowski/Qwen_Qwen3.5-4B-GGUF
function:
  grammar:
    # Presumably turns off grammar-constrained output for function calls;
    # verify against the LocalAI model configuration reference.
    disable: true
known_usecases:
  - chat
# Multimodal projector file that accompanies the model weights.
mmproj: llama-cpp/mmproj/mmproj-Qwen_Qwen3.5-4B-f16.gguf
name: qwen_qwen3.5-4b
options:
  # Backend option in "key:value" form; there is deliberately no space after
  # the colon, so YAML reads the whole entry as one string.
  - use_jinja:true
parameters:
  model: llama-cpp/models/Qwen_Qwen3.5-4B-Q4_K_M.gguf
template:
  # Presumably selects the chat template shipped with the model's tokenizer
  # metadata instead of a hand-written one -- TODO confirm.
  use_tokenizer_template: true
---
# Docker Compose definition for one LocalAI service ("api") running the
# Intel GPU image, with the host DRI devices passed through to the container.
version: "3.9"
services:
  api:
    # image: localai/localai:latest-gpu-intel
    image: localai/localai:master-gpu-intel
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8080/readyz"]
      interval: 1m
      # NOTE(review): a single probe may run for up to 20 minutes, far longer
      # than the 1 minute interval -- confirm this is intentional.
      timeout: 20m
      retries: 5
    ports:
      # Host port 8081 -> container port 8080. Quoted so the mapping is always
      # read as a string rather than an implicitly typed scalar.
      - "8081:8080"
    shm_size: '8gb'
    devices:
      - /dev/dri/card1:/dev/dri/card1
      - /dev/dri/renderD129:/dev/dri/renderD129
    group_add:
      # Numeric group ID, quoted so it stays a string.
      # Presumably the host group owning the /dev/dri nodes -- verify.
      - "993"
    environment:
      - CONTEXT_SIZE=4096
      - GGML_SYCL_TARGET=INTEL
      - OLLAMA_COMPAT=true
      # Single thread is best for GPU offload
      - THREADS=1
      - DEBUG=true
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - /opt/localai/models:/models:rw
      - /opt/localai/backends:/backends:rw
      - /opt/localai/config:/configuration:rw
# Send a single non-streaming chat completion request ("stream": false) to the
# OpenAI-compatible endpoint on localhost:8081, giving up after 60 seconds (-m 60).
# The trailing backslashes keep the headers and body part of the same command.
curl -m 60 http://localhost:8081/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "Qwen3.5-4B-Q4_K_M.gguf", "messages": [{"role": "user", "content": "say hello"}], "stream": false}'
{"created":1773284146,"object":"chat.completion","id":"2c5092f8-db39-4c15-aedf-a4ef2c6c8f15","model":"Qwen3.5-4B-Q4_K_M.gguf","choices":[{"index":0,"finish_reason":"stop","message":{"role":"assistant","content":"","reasoning":"(the response is a confirmation of the following statement: "The response is in a system response. The assistant is an AI assistant, and the user is referring to the assistant in a system response. The user is asking about the relationship between the assistant and the user in a system response. The assistant is a system response, and the user is referring to the assistant in a system response. The user is asking about the relationship between the assistant and the user in a system response. The assistant is a system response, and the user is referring to the assistant in a system response. The user is asking about the relationship between the assistant and the user in a system response. The assistant is a system response, and the user is referring to the assistant in a system response." in Chinese."}}],"usage":{"prompt_tokens":12,"completion_tokens":158,"total_tokens":170}}