package main

import (
	"context"
	"encoding/json"
	"flag"
	"fmt"
	"log"
	"math/rand"
	"net/http"
	"strings"
	"time"
	"unsafe"

	"makarna/pkg/backend/cpu"
	"makarna/pkg/backend/cuda"
	"makarna/pkg/backend/device"
	"makarna/pkg/chat"
	"makarna/pkg/engine"
	"makarna/pkg/kvcache"
	"makarna/pkg/sample"
	"makarna/pkg/tensor"
	"makarna/pkg/tokenizer"
)

// chatCompletionRequest mirrors the OpenAI /v1/chat/completions request body.
type chatCompletionRequest struct {
	Model            string                  `json:"model"`
	Messages         []chatCompletionMessage `json:"messages"`
	Tools            []any                   `json:"tools,omitempty"`
	Stream           bool                    `json:"stream,omitempty"`
	MaxTokens        int                     `json:"max_tokens,omitempty"`
	Temperature      float64                 `json:"temperature,omitempty"`
	TopP             float64                 `json:"top_p,omitempty"`
	TopK             int                     `json:"top_k,omitempty"`
	PresencePenalty  float64                 `json:"presence_penalty,omitempty"`
	FrequencyPenalty float64                 `json:"frequency_penalty,omitempty"`
}

type chatCompletionMessage struct {
	Role       string `json:"role"`
	Content    string `json:"content"`
	Name       string `json:"name,omitempty"`
	ToolCallID string `json:"tool_call_id,omitempty"`
}

type chatCompletionResponse struct {
	ID      string                 `json:"id"`
	Object  string                 `json:"object"`
	Created int64                  `json:"created"`
	Model   string                 `json:"model"`
	Usage   chatCompletionUsage    `json:"usage"`
	Choices []chatCompletionChoice `json:"choices"`
}

type chatCompletionUsage struct {
	PromptTokens     int `json:"prompt_tokens"`
	CompletionTokens int `json:"completion_tokens"`
	TotalTokens      int `json:"total_tokens"`
}

type chatCompletionChoice struct {
	Index        int                  `json:"index"`
	Message      chatCompletionOutMsg `json:"message"`
	FinishReason string               `json:"finish_reason"`
}

type chatCompletionOutMsg struct {
	Role      string           `json:"role"`
	Content   string           `json:"content"`
	ToolCalls []openAIToolCall `json:"tool_calls,omitempty"`
}

type openAIToolCall struct {
	ID       string             `json:"id"`
	Type     string             `json:"type"`
	Function openAIFunctionCall `json:"function"`
}

type openAIFunctionCall struct {
	Name      string `json:"name"`
	Arguments string `json:"arguments"`
}

// server holds the loaded engine, tokenizer, and generation limits shared by all handlers.
type server struct {
	eng       *engine.Engine
	tok       *tokenizer.Tokenizer
	arch      string
	maxSeqLen int
	blockSize int
}

func main() {
	listen := flag.String("listen", ":8080", "listen address")
	modelPath := flag.String("model", "model.mak", "Path to .mak model file")
	maxSeq := flag.Int("max-seq-len", 8192, "Maximum sequence length to reserve in KV cache")
	blockSize := flag.Int("block-size", 32, "KV cache block size")
	nGPULayers := flag.Int("n-gpu-layers", -1, "Number of layers to offload to GPU (-1=auto, 0=CPU only)")
	gpuBudget := flag.Float64("gpu-budget", 0.9, "Fraction of GPU memory to use (0.0-1.0)")
	flag.Parse()

	cfg := engine.Config{GPULayers: *nGPULayers, GPUBudget: *gpuBudget}
	eng, err := engine.Load(*modelPath, cfg)
	if err != nil {
		log.Fatalf("load model: %v", err)
	}
	defer eng.Close()
	md := eng.Model().Config()

	// The tokenizer is expected to be embedded in the .mak model file.
	var tok *tokenizer.Tokenizer
	tokData, err := eng.Loader().GetTokenizerData()
	if err == nil && len(tokData) > 0 {
		tok, err = tokenizer.LoadFromBytes(tokData)
		if err != nil {
			log.Printf("warning: load embedded tokenizer: %v", err)
		}
	}
	if tok == nil {
		log.Fatalf("tokenizer not found in model file")
	}

	s := &server{eng: eng, tok: tok, arch: md.Architecture, maxSeqLen: *maxSeq, blockSize: *blockSize}

	h := http.NewServeMux()
	h.HandleFunc("/v1/chat/completions", s.handleChatCompletions)
	h.HandleFunc("/v1/models", s.handleModels)
	log.Printf("listening on %s (arch=%s, cuda=%v)", *listen, s.arch, device.CUDAAvailable())
	log.Fatal(http.ListenAndServe(*listen, h))
}
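
// Example invocation (a sketch; the binary name and model path are
// placeholders, the flags are the ones defined in main above). The server
// then exposes OpenAI-compatible endpoints at /v1/chat/completions and
// /v1/models:
//
//	./openai-server -model /path/to/model.mak -listen :8080 -n-gpu-layers -1 -gpu-budget 0.9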

func (s *server) handleModels(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodGet {
		http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
		return
	}
	resp := map[string]any{
		"object": "list",
		"data":   []any{map[string]any{"id": "local", "object": "model"}},
	}
	writeJSON(w, resp)
}

func (s *server) handleChatCompletions(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodPost {
		http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
		return
	}
	var req chatCompletionRequest
	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
		http.Error(w, "bad json", http.StatusBadRequest)
		return
	}
	if req.Stream {
		http.Error(w, "stream not implemented", http.StatusNotImplemented)
		return
	}
	if len(req.Messages) == 0 {
		http.Error(w, "messages required", http.StatusBadRequest)
		return
	}

	// Render the chat messages into the model architecture's prompt format.
	msgs := make([]chat.Message, 0, len(req.Messages))
	for _, m := range req.Messages {
		role := strings.ToLower(m.Role)
		msgs = append(msgs, chat.Message{Role: role, Content: m.Content})
	}
	prompt, err := chat.RenderForArchitecture(s.arch, msgs, chat.Options{
		AddGenerationPrompt: true,
		EnableThinking:      true,
		Tools:               req.Tools,
	})
	if err != nil {
		http.Error(w, fmt.Sprintf("render prompt: %v", err), http.StatusInternalServerError)
		return
	}
	promptTokens := len(s.tok.Encode(prompt))

	// Apply defaults when sampling parameters are zero/unset; an explicit 0 is
	// treated the same as "not provided".
	maxTokens := req.MaxTokens
	if maxTokens <= 0 {
		maxTokens = 128
	}
	temp := req.Temperature
	if temp == 0 {
		temp = 0.7
	}
	topP := req.TopP
	if topP == 0 {
		topP = 0.9
	}
	topK := req.TopK
	if topK == 0 {
		topK = 40
	}

	outText, err := s.generate(r.Context(), prompt, maxTokens, temp, topP, topK)
	if err != nil {
		http.Error(w, fmt.Sprintf("generate: %v", err), http.StatusInternalServerError)
		return
	}
	completionTokens := len(s.tok.Encode(outText))

	// Strip the thinking block and extract any tool calls from the output.
	_, content := chat.StripThinking(outText)
	content, calls, err := chat.ExtractToolCalls(content)
	if err != nil {
		http.Error(w, fmt.Sprintf("parse tool_calls: %v", err), http.StatusInternalServerError)
		return
	}
	content = strings.TrimSpace(content)

	var outCalls []openAIToolCall
	for i, c := range calls {
		outCalls = append(outCalls, openAIToolCall{
			ID:   fmt.Sprintf("call_%d_%d", time.Now().UnixNano(), i),
			Type: "function",
			Function: openAIFunctionCall{
				Name:      c.Name,
				Arguments: string(bytesOrEmptyObject(c.Arguments)),
			},
		})
	}
	finish := "stop"
	if len(outCalls) > 0 {
		finish = "tool_calls"
	}

	resp := chatCompletionResponse{
		ID:      fmt.Sprintf("chatcmpl_%d", rand.Int63()),
		Object:  "chat.completion",
		Created: time.Now().Unix(),
		Model:   req.Model,
		Usage: chatCompletionUsage{
			PromptTokens:     promptTokens,
			CompletionTokens: completionTokens,
			TotalTokens:      promptTokens + completionTokens,
		},
		Choices: []chatCompletionChoice{{
			Index: 0,
			Message: chatCompletionOutMsg{
				Role:      "assistant",
				Content:   content,
				ToolCalls: outCalls,
			},
			FinishReason: finish,
		}},
	}
	writeJSON(w, resp)
}

// bytesOrEmptyObject substitutes "{}" for empty tool-call arguments so the
// response always carries a valid JSON object.
func bytesOrEmptyObject(b []byte) []byte {
	if len(b) == 0 {
		return []byte("{}")
	}
	return b
}

func writeJSON(w http.ResponseWriter, v any) {
	w.Header().Set("Content-Type", "application/json")
	enc := json.NewEncoder(w)
	enc.SetEscapeHTML(false)
	_ = enc.Encode(v)
}
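
// Example exchange with handleChatCompletions (a sketch; the host, port, and
// all values are illustrative, only the field names come from the
// request/response structs above):
//
//	curl -s http://localhost:8080/v1/chat/completions \
//	  -H "Content-Type: application/json" \
//	  -d '{"model":"local","messages":[{"role":"user","content":"Hello"}],"max_tokens":64}'
//
//	{"id":"chatcmpl_123","object":"chat.completion","created":1700000000,"model":"local",
//	 "usage":{"prompt_tokens":9,"completion_tokens":3,"total_tokens":12},
//	 "choices":[{"index":0,"message":{"role":"assistant","content":"Hi!"},"finish_reason":"stop"}]}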

// generate runs a prefill pass over the prompt followed by a token-by-token
// decode loop, returning the decoded completion text.
func (s *server) generate(ctx context.Context, prompt string, maxTokens int, temperature float64, topP float64, topK int) (string, error) {
	ids := s.tok.Encode(prompt)
	if len(ids) == 0 {
		return "", fmt.Errorf("empty prompt after tokenization")
	}

	modelCfg := s.eng.Model().Config()
	placements := make([]tensor.DevicePlacement, modelCfg.NumLayers)
	if s.eng.Dispatcher() != nil {
		for i := 0; i < modelCfg.NumLayers; i++ {
			placements[i] = s.eng.Dispatcher().LayerPlacement(i)
		}
	}

	// Enable mixed per-layer KV cache when any layer is on GPU.
	kvDevice := tensor.CPU
	if device.CUDAAvailable() {
		for i := 0; i < modelCfg.NumLayers && i < len(placements); i++ {
			if placements[i].Normalize().Type == tensor.CUDA {
				kvDevice = tensor.CUDA
				break
			}
		}
	}

	// Reserve ceil(maxSeqLen/blockSize) KV blocks up front.
	pool, err := kvcache.NewBlockPool(kvcache.BlockPoolConfig{
		NumLayers:  modelCfg.NumLayers,
		NumKVHeads: modelCfg.NumKVHeads,
		HeadDim:    modelCfg.HeadDim,
		BlockSize:  s.blockSize,
		NumBlocks:  (s.maxSeqLen + s.blockSize - 1) / s.blockSize,
		Device:     kvDevice,
		GPU:        0,
		LayerPlacements: func() []tensor.DevicePlacement {
			if kvDevice != tensor.CUDA || len(placements) != modelCfg.NumLayers {
				return nil
			}
			out := make([]tensor.DevicePlacement, modelCfg.NumLayers)
			for i := 0; i < modelCfg.NumLayers; i++ {
				out[i] = placements[i].Normalize()
			}
			return out
		}(),
		Preallocate: kvDevice == tensor.CUDA,
	})
	if err != nil {
		return "", err
	}

	cache := kvcache.NewPagedKVCache(pool, kvcache.PagedCacheConfig{
		NumLayers:  modelCfg.NumLayers,
		NumKVHeads: modelCfg.NumKVHeads,
		HeadDim:    modelCfg.HeadDim,
		BlockSize:  s.blockSize,
		MaxSeqLen:  s.maxSeqLen,
		Device:     kvDevice,
		GPU:        0,
	}, "cmd-openai")
	if _, err := cache.AllocateForTokens(ids); err != nil {
		cache.Free()
		return "", err
	}
	defer cache.Free()

	sampler := sample.New(sample.Config{
		Temperature:       float32(temperature),
		TopK:              topK,
		TopP:              float32(topP),
		RepetitionPenalty: 1.1,
		Seed:              -1,
	})

	// Prefill: run the whole prompt through the model in one forward pass.
	input := createInputTensor(ids)
	positions := createPositionTensor(0, len(ids))
	logits, err := s.eng.Forward(ctx, input, positions, cache)
	if err != nil {
		return "", err
	}

	// Sample the first token from the logits of the last prompt position.
	var nextToken int
	if logitsCPU := getLogitsRowCPU(logits, len(ids)-1); logitsCPU != nil {
		nextToken = sampler.Sample(logitsCPU, ids)
	} else {
		gpuLogits := logits.(*cuda.Tensor)
		vocabSize := gpuLogits.Shape()[1]
		row := len(ids) - 1
		view, err := gpuLogits.ViewAt(tensor.Shape{vocabSize}, uintptr(row*vocabSize*4))
		if err != nil {
			return "", err
		}
		host := make([]float32, vocabSize)
		if err := view.CopyToHost(host); err != nil {
			return "", err
		}
		nextToken = sampler.Sample(host, ids)
	}
	ids = append(ids, nextToken)

	var sb strings.Builder
	sb.WriteString(s.tok.Decode([]int{nextToken}))
	eosID := s.tok.EosID()

	// Decode loop: feed back one token at a time until EOS or maxTokens.
	for i := 1; i < maxTokens; i++ {
		if nextToken == eosID {
			break
		}
		select {
		case <-ctx.Done():
			return "", ctx.Err()
		default:
		}

		input = createInputTensor([]int{nextToken})
		currentPos := len(ids) - 1
		positions = createPositionTensor(currentPos, 1)
		logits, err = s.eng.Forward(ctx, input, positions, cache)
		if err != nil {
			return "", err
		}

		// Apply the repetition penalty over the most recent 64 tokens only.
		recent := ids
		if len(recent) > 64 {
			recent = recent[len(recent)-64:]
		}
		if logitsCPU := getLogitsRowCPU(logits, 0); logitsCPU != nil {
			nextToken = sampler.Sample(logitsCPU, recent)
		} else {
			gpuLogits := logits.(*cuda.Tensor)
			vocabSize := gpuLogits.Shape()[1]
			view, err := gpuLogits.ViewAt(tensor.Shape{vocabSize}, 0)
			if err != nil {
				return "", err
			}
			host := make([]float32, vocabSize)
			if err := view.CopyToHost(host); err != nil {
				return "", err
			}
			nextToken = sampler.Sample(host, recent)
		}
		ids = append(ids, nextToken)
		sb.WriteString(s.tok.Decode([]int{nextToken}))
	}
	return sb.String(), nil
}

// createInputTensor packs token IDs into a 1-D float32 CPU tensor.
func createInputTensor(ids []int) tensor.Tensor {
	t := cpu.NewTensor(tensor.Shape{len(ids)}, nil)
	data := t.DataFloat32()
	for i, id := range ids {
		data[i] = float32(id)
	}
	return t
}

// createPositionTensor fills a 1-D float32 CPU tensor with positions start..start+count-1.
func createPositionTensor(start, count int) tensor.Tensor {
	t := cpu.NewTensor(tensor.Shape{count}, nil)
	data := t.DataFloat32()
	for i := 0; i < count; i++ {
		data[i] = float32(start + i)
	}
	return t
}
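
// Position bookkeeping, as a worked example (illustrative numbers): for a
// 5-token prompt, the prefill pass in generate calls createPositionTensor(0, 5)
// and processes positions 0..4. After the first sampled token is appended,
// each decode step feeds a single token at position len(ids)-1, i.e. 5, then 6,
// and so on.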

// getLogitsRowCPU returns row `row` of a CPU logits tensor as a float32 slice,
// or nil when the logits live on the GPU and must be copied to host instead.
func getLogitsRowCPU(logits tensor.Tensor, row int) []float32 {
	if _, ok := logits.(*cpu.Tensor); !ok {
		return nil
	}
	data := logits.Data().(unsafe.Pointer)
	shape := logits.Shape()
	vocabSize := shape[1]
	slice := unsafe.Slice((*float32)(data), shape.NumElements())
	return slice[row*vocabSize : (row+1)*vocabSize]
}
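
// Indexing note (assumes row-major [seqLen, vocabSize] float32 logits, which is
// what the code above relies on): row r occupies elements
// [r*vocabSize, (r+1)*vocabSize) of the flattened buffer, which is why the GPU
// path in generate views the same row at byte offset r*vocabSize*4.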