package main

import (
	"context"
	"encoding/json"
	"flag"
	"fmt"
	"log"
	"math/rand"
	"net/http"
	"strings"
	"time"
	"unsafe"

	"makarna/pkg/backend/cpu"
	"makarna/pkg/backend/cuda"
	"makarna/pkg/backend/device"
	"makarna/pkg/chat"
	"makarna/pkg/engine"
	"makarna/pkg/kvcache"
	"makarna/pkg/sample"
	"makarna/pkg/tensor"
	"makarna/pkg/tokenizer"
)
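// chatCompletionRequest mirrors the subset of the OpenAI
// /v1/chat/completions request body that this server understands.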
type chatCompletionRequest struct {
	Model            string                  `json:"model"`
	Messages         []chatCompletionMessage `json:"messages"`
	Tools            []any                   `json:"tools,omitempty"`
	Stream           bool                    `json:"stream,omitempty"`
	MaxTokens        int                     `json:"max_tokens,omitempty"`
	Temperature      float64                 `json:"temperature,omitempty"`
	TopP             float64                 `json:"top_p,omitempty"`
	TopK             int                     `json:"top_k,omitempty"`
	PresencePenalty  float64                 `json:"presence_penalty,omitempty"`
	FrequencyPenalty float64                 `json:"frequency_penalty,omitempty"`
}

type chatCompletionMessage struct {
	Role       string `json:"role"`
	Content    string `json:"content"`
	Name       string `json:"name,omitempty"`
	ToolCallID string `json:"tool_call_id,omitempty"`
}

type chatCompletionResponse struct {
	ID      string                 `json:"id"`
	Object  string                 `json:"object"`
	Created int64                  `json:"created"`
	Model   string                 `json:"model"`
	Usage   chatCompletionUsage    `json:"usage"`
	Choices []chatCompletionChoice `json:"choices"`
}

type chatCompletionUsage struct {
	PromptTokens     int `json:"prompt_tokens"`
	CompletionTokens int `json:"completion_tokens"`
	TotalTokens      int `json:"total_tokens"`
}

type chatCompletionChoice struct {
	Index        int                  `json:"index"`
	Message      chatCompletionOutMsg `json:"message"`
	FinishReason string               `json:"finish_reason"`
}

type chatCompletionOutMsg struct {
	Role      string           `json:"role"`
	Content   string           `json:"content"`
	ToolCalls []openAIToolCall `json:"tool_calls,omitempty"`
}

type openAIToolCall struct {
	ID       string             `json:"id"`
	Type     string             `json:"type"`
	Function openAIFunctionCall `json:"function"`
}

type openAIFunctionCall struct {
	Name      string `json:"name"`
	Arguments string `json:"arguments"`
}
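// server bundles the loaded engine, tokenizer, and generation limits that
// the HTTP handlers share.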
type server struct {
	eng       *engine.Engine
	tok       *tokenizer.Tokenizer
	arch      string
	maxSeqLen int
	blockSize int
}
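// main loads the .mak model and its embedded tokenizer, then serves the
// OpenAI-compatible HTTP API.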
func main() {
	listen := flag.String("listen", ":8080", "listen address")
	modelPath := flag.String("model", "model.mak", "Path to .mak model file")
	maxSeq := flag.Int("max-seq-len", 8192, "Maximum sequence length to reserve in KV cache")
	blockSize := flag.Int("block-size", 32, "KV cache block size")
	nGPULayers := flag.Int("n-gpu-layers", -1, "Number of layers to offload to GPU (-1=auto, 0=CPU only)")
	gpuBudget := flag.Float64("gpu-budget", 0.9, "Fraction of GPU memory to use (0.0-1.0)")
	flag.Parse()

	cfg := engine.Config{GPULayers: *nGPULayers, GPUBudget: *gpuBudget}
	eng, err := engine.Load(*modelPath, cfg)
	if err != nil {
		log.Fatalf("load model: %v", err)
	}
	defer eng.Close()

	md := eng.Model().Config()

	var tok *tokenizer.Tokenizer
	tokData, err := eng.Loader().GetTokenizerData()
	if err == nil && len(tokData) > 0 {
		tok, err = tokenizer.LoadFromBytes(tokData)
		if err != nil {
			log.Printf("warning: load embedded tokenizer: %v", err)
		}
	}
	if tok == nil {
		log.Fatalf("tokenizer not found in model file")
	}

	s := &server{eng: eng, tok: tok, arch: md.Architecture, maxSeqLen: *maxSeq, blockSize: *blockSize}

	h := http.NewServeMux()
	h.HandleFunc("/v1/chat/completions", s.handleChatCompletions)
	h.HandleFunc("/v1/models", s.handleModels)

	log.Printf("listening on %s (arch=%s, cuda=%v)", *listen, s.arch, device.CUDAAvailable())
	log.Fatal(http.ListenAndServe(*listen, h))
}
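// handleModels serves GET /v1/models with a single static entry so that
// OpenAI-compatible clients can discover the locally loaded model.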
func (s *server) handleModels(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodGet {
		http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
		return
	}
	resp := map[string]any{
		"object": "list",
		"data":   []any{map[string]any{"id": "local", "object": "model"}},
	}
	writeJSON(w, resp)
}
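// handleChatCompletions implements a non-streaming POST /v1/chat/completions:
// it renders the chat template for the model architecture, generates a
// completion, and reports any parsed tool calls in OpenAI format.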
func (s *server) handleChatCompletions(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodPost {
		http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
		return
	}
	var req chatCompletionRequest
	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
		http.Error(w, "bad json", http.StatusBadRequest)
		return
	}
	if req.Stream {
		http.Error(w, "stream not implemented", http.StatusNotImplemented)
		return
	}
	if len(req.Messages) == 0 {
		http.Error(w, "messages required", http.StatusBadRequest)
		return
	}

	// Convert the OpenAI messages into the internal chat format and render
	// the prompt with the architecture-specific chat template.
	msgs := make([]chat.Message, 0, len(req.Messages))
	for _, m := range req.Messages {
		role := strings.ToLower(m.Role)
		msgs = append(msgs, chat.Message{Role: role, Content: m.Content})
	}
	prompt, err := chat.RenderForArchitecture(s.arch, msgs, chat.Options{
		AddGenerationPrompt: true,
		EnableThinking:      true,
		Tools:               req.Tools,
	})
	if err != nil {
		http.Error(w, fmt.Sprintf("render prompt: %v", err), http.StatusInternalServerError)
		return
	}
	promptTokens := len(s.tok.Encode(prompt))

	// Zero values mean "unset"; fall back to server-side sampling defaults.
	maxTokens := req.MaxTokens
	if maxTokens <= 0 {
		maxTokens = 128
	}
	temp := req.Temperature
	if temp == 0 {
		temp = 0.7
	}
	topP := req.TopP
	if topP == 0 {
		topP = 0.9
	}
	topK := req.TopK
	if topK == 0 {
		topK = 40
	}

	outText, err := s.generate(r.Context(), prompt, maxTokens, temp, topP, topK)
	if err != nil {
		http.Error(w, fmt.Sprintf("generate: %v", err), http.StatusInternalServerError)
		return
	}
	completionTokens := len(s.tok.Encode(outText))

	// Drop any thinking block, then pull structured tool calls out of the
	// remaining text.
	_, content := chat.StripThinking(outText)
	content, calls, err := chat.ExtractToolCalls(content)
	if err != nil {
		http.Error(w, fmt.Sprintf("parse tool_calls: %v", err), http.StatusInternalServerError)
		return
	}
	content = strings.TrimSpace(content)

	var outCalls []openAIToolCall
	for i, c := range calls {
		outCalls = append(outCalls, openAIToolCall{
			ID:   fmt.Sprintf("call_%d_%d", time.Now().UnixNano(), i),
			Type: "function",
			Function: openAIFunctionCall{
				Name:      c.Name,
				Arguments: string(bytesOrEmptyObject(c.Arguments)),
			},
		})
	}
	finish := "stop"
	if len(outCalls) > 0 {
		finish = "tool_calls"
	}

	resp := chatCompletionResponse{
		ID:      fmt.Sprintf("chatcmpl_%d", rand.Int63()),
		Object:  "chat.completion",
		Created: time.Now().Unix(),
		Model:   req.Model,
		Usage: chatCompletionUsage{
			PromptTokens:     promptTokens,
			CompletionTokens: completionTokens,
			TotalTokens:      promptTokens + completionTokens,
		},
		Choices: []chatCompletionChoice{{
			Index: 0,
			Message: chatCompletionOutMsg{
				Role:      "assistant",
				Content:   content,
				ToolCalls: outCalls,
			},
			FinishReason: finish,
		}},
	}
	writeJSON(w, resp)
}
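// bytesOrEmptyObject guards against tool calls whose argument payload is
// empty, so the OpenAI "arguments" field is always valid JSON.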
func bytesOrEmptyObject(b []byte) []byte {
	if len(b) == 0 {
		return []byte("{}")
	}
	return b
}
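// writeJSON sets the JSON Content-Type header and encodes v without HTML
// escaping.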
func writeJSON(w http.ResponseWriter, v any) {
	w.Header().Set("Content-Type", "application/json")
	enc := json.NewEncoder(w)
	enc.SetEscapeHTML(false)
	_ = enc.Encode(v)
}
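// generate runs a single prompt through the engine with its own paged KV
// cache: one prefill pass over the prompt, then token-by-token decoding
// until EOS or maxTokens. Logits are sampled on the CPU, copying the last
// row back from the GPU when the output tensor lives there.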
func (s *server) generate(ctx context.Context, prompt string, maxTokens int, temperature float64, topP float64, topK int) (string, error) {
	ids := s.tok.Encode(prompt)
	if len(ids) == 0 {
		return "", fmt.Errorf("empty prompt after tokenization")
	}

	modelCfg := s.eng.Model().Config()
	placements := make([]tensor.DevicePlacement, modelCfg.NumLayers)
	if s.eng.Dispatcher() != nil {
		for i := 0; i < modelCfg.NumLayers; i++ {
			placements[i] = s.eng.Dispatcher().LayerPlacement(i)
		}
	}

	// Enable mixed per-layer KV cache when any layer is on GPU.
	kvDevice := tensor.CPU
	if device.CUDAAvailable() {
		for i := 0; i < modelCfg.NumLayers && i < len(placements); i++ {
			if placements[i].Normalize().Type == tensor.CUDA {
				kvDevice = tensor.CUDA
				break
			}
		}
	}
	pool, err := kvcache.NewBlockPool(kvcache.BlockPoolConfig{
		NumLayers:  modelCfg.NumLayers,
		NumKVHeads: modelCfg.NumKVHeads,
		HeadDim:    modelCfg.HeadDim,
		BlockSize:  s.blockSize,
		NumBlocks:  (s.maxSeqLen + s.blockSize - 1) / s.blockSize,
		Device:     kvDevice,
		GPU:        0,
		LayerPlacements: func() []tensor.DevicePlacement {
			if kvDevice != tensor.CUDA || len(placements) != modelCfg.NumLayers {
				return nil
			}
			out := make([]tensor.DevicePlacement, modelCfg.NumLayers)
			for i := 0; i < modelCfg.NumLayers; i++ {
				out[i] = placements[i].Normalize()
			}
			return out
		}(),
		Preallocate: kvDevice == tensor.CUDA,
	})
	if err != nil {
		return "", err
	}
	cache := kvcache.NewPagedKVCache(pool, kvcache.PagedCacheConfig{
		NumLayers:  modelCfg.NumLayers,
		NumKVHeads: modelCfg.NumKVHeads,
		HeadDim:    modelCfg.HeadDim,
		BlockSize:  s.blockSize,
		MaxSeqLen:  s.maxSeqLen,
		Device:     kvDevice,
		GPU:        0,
	}, "cmd-openai")
	if _, err := cache.AllocateForTokens(ids); err != nil {
		cache.Free()
		return "", err
	}
	defer cache.Free()
	sampler := sample.New(sample.Config{
		Temperature:       float32(temperature),
		TopK:              topK,
		TopP:              float32(topP),
		RepetitionPenalty: 1.1,
		Seed:              -1,
	})

	// Prefill: run the whole prompt through the model in one forward pass.
	input := createInputTensor(ids)
	positions := createPositionTensor(0, len(ids))
	logits, err := s.eng.Forward(ctx, input, positions, cache)
	if err != nil {
		return "", err
	}

	// Sample the first token from the logits row of the last prompt position.
	var nextToken int
	if logitsCPU := getLogitsRowCPU(logits, len(ids)-1); logitsCPU != nil {
		nextToken = sampler.Sample(logitsCPU, ids)
	} else {
		gpuLogits := logits.(*cuda.Tensor)
		vocabSize := gpuLogits.Shape()[1]
		row := len(ids) - 1
		view, err := gpuLogits.ViewAt(tensor.Shape{vocabSize}, uintptr(row*vocabSize*4))
		if err != nil {
			return "", err
		}
		host := make([]float32, vocabSize)
		if err := view.CopyToHost(host); err != nil {
			return "", err
		}
		nextToken = sampler.Sample(host, ids)
	}
	ids = append(ids, nextToken)

	var sb strings.Builder
	sb.WriteString(s.tok.Decode([]int{nextToken}))
	eosID := s.tok.EosID()
	// Decode loop: feed back one token at a time until EOS or the token budget.
	for i := 1; i < maxTokens; i++ {
		if nextToken == eosID {
			break
		}
		select {
		case <-ctx.Done():
			return "", ctx.Err()
		default:
		}
		input = createInputTensor([]int{nextToken})
		currentPos := len(ids) - 1
		positions = createPositionTensor(currentPos, 1)
		logits, err = s.eng.Forward(ctx, input, positions, cache)
		if err != nil {
			return "", err
		}
		// Cap the context handed to the sampler at the last 64 tokens.
		recent := ids
		if len(recent) > 64 {
			recent = recent[len(recent)-64:]
		}
		if logitsCPU := getLogitsRowCPU(logits, 0); logitsCPU != nil {
			nextToken = sampler.Sample(logitsCPU, recent)
		} else {
			gpuLogits := logits.(*cuda.Tensor)
			vocabSize := gpuLogits.Shape()[1]
			view, err := gpuLogits.ViewAt(tensor.Shape{vocabSize}, 0)
			if err != nil {
				return "", err
			}
			host := make([]float32, vocabSize)
			if err := view.CopyToHost(host); err != nil {
				return "", err
			}
			nextToken = sampler.Sample(host, recent)
		}
		ids = append(ids, nextToken)
		sb.WriteString(s.tok.Decode([]int{nextToken}))
	}
	return sb.String(), nil
}
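// createInputTensor packs token IDs into a 1-D CPU float32 tensor for the
// engine's forward pass.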
func createInputTensor(ids []int) tensor.Tensor {
	t := cpu.NewTensor(tensor.Shape{len(ids)}, nil)
	data := t.DataFloat32()
	for i, id := range ids {
		data[i] = float32(id)
	}
	return t
}
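// createPositionTensor builds a 1-D CPU tensor holding the absolute
// positions [start, start+count) for the tokens in the current step.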
func createPositionTensor(start, count int) tensor.Tensor {
	t := cpu.NewTensor(tensor.Shape{count}, nil)
	data := t.DataFloat32()
	for i := 0; i < count; i++ {
		data[i] = float32(start + i)
	}
	return t
}
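// getLogitsRowCPU returns the given row of a CPU logits tensor as a
// float32 slice without copying, or nil when the tensor is not on the CPU
// (the caller then falls back to a device-to-host copy).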
func getLogitsRowCPU(logits tensor.Tensor, row int) []float32 {
	if _, ok := logits.(*cpu.Tensor); !ok {
		return nil
	}
	data := logits.Data().(unsafe.Pointer)
	shape := logits.Shape()
	vocabSize := shape[1]
	slice := unsafe.Slice((*float32)(data), shape.NumElements())
	return slice[row*vocabSize : (row+1)*vocabSize]
}
|