package main

import (
	"context"
	"encoding/json"
	"flag"
	"fmt"
	"log"
	"net"
	"path/filepath"
	"sort"
	"strconv"
	"strings"
	"time"
	"unsafe"

	"makarna/pkg/backend/cpu"
	"makarna/pkg/backend/cuda"
	"makarna/pkg/backend/device"
	"makarna/pkg/chat"
	"makarna/pkg/compute"
	"makarna/pkg/engine"
	"makarna/pkg/kvcache"
	"makarna/pkg/loader"
	"makarna/pkg/model"
	"makarna/pkg/openai"
	"makarna/pkg/profile"
	"makarna/pkg/sample"
	"makarna/pkg/tensor"
	"makarna/pkg/tokenizer"

	kimi_linear "makarna/pkg/model/models/kimi_linear" // Register KimiLinear model
	_ "makarna/pkg/model/models/qwen3"                 // Register Qwen3 model
)

func main() {
	modelPath := flag.String("model", "model.mak", "Path to .mak model file")
	prompt := flag.String("prompt", "Hello world", "Prompt to generate")
	steps := flag.Int("steps", 10, "Number of tokens to generate")
	useChat := flag.Bool("chat", false, "Use chat format for prompt")
	serverMode := flag.Bool("server", false, "Run OpenAI-compatible HTTP server")
	listen := flag.String("listen", "", "Server listen address (e.g. :8080, 0.0.0.0:8080). If set, implies --server")
	host := flag.String("host", "127.0.0.1", "Server host (used when --listen is empty)")
	port := flag.Int("port", 8080, "Server port (used when --listen is empty)")
	temperature := flag.Float64("temp", 0.7, "Sampling temperature (0 = greedy)")
	topK := flag.Int("top-k", 40, "Top-K sampling (0 = disabled)")
	topP := flag.Float64("top-p", 0.9, "Top-P nucleus sampling (1.0 = disabled)")
	repPenalty := flag.Float64("rep-penalty", 1.1, "Repetition penalty (1.0 = disabled)")
	threads := flag.Int("threads", -1, "Number of CPU threads to use (default: 90% of cores)")
	listTensors := flag.Bool("list-tensors", false, "List tensors in model and exit")
	useMmap := flag.Bool("mmap", false, "Use mmap for model weights (default: false)")

	// Device placement flags - llama.cpp style
	nGPULayers := flag.Int("n-gpu-layers", -1, "Number of layers to offload to GPU (-1=auto, 0=CPU only)")
	gpuBudget := flag.Float64("gpu-budget", 0.9, "Fraction of GPU memory to use (0.0-1.0)")
	gpuDevicesFlag := flag.String("gpu-devices", "", "Comma-separated GPU device ordinals to use (e.g. 0 or 0,1)")
	layerMap := flag.String("layer-map", "", "Advanced: layer placement map, e.g. 0-9:gpu0,10-19:gpu1,20-:cpu")
	cpuMoE := flag.Bool("cpu-moe", false, "Keep MoE expert weights on CPU (saves GPU memory for large MoE models)")
	blockSize := flag.Int("block-size", 16, "KV cache block size")
	maxSeq := flag.Int("max-seq-len", 2048, "Maximum sequence length to reserve in KV cache")
	kvCacheCPU := flag.Bool("kv-cache-cpu", false, "Force KV cache to CPU (default: GPU when available)")
	prefillChunkSize := flag.Int("prefill-chunk-size", 512, "Prompt prefill chunk size (llama.cpp eval batch size analogue)")
	maxConcurrent := flag.Int("max-concurrent", 1, "Server mode: max concurrent sequences to reserve KV/scratch for")

	// Profiling flags
	profileOn := flag.Bool("profile", false, "Enable profiling summary report (alias for -profile-log=report)")
	profileLog := flag.String("profile-log", "", "Enable profiling: 'true'=realtime screen, 'report'=summary only, or file path")

	flag.Parse()

	// Initialize profiling
	if *profileOn && *profileLog == "" {
		*profileLog = "report"
	}
	if *profileLog != "" {
		profile.Enable()
		switch strings.ToLower(*profileLog) {
		case "true", "1", "realtime":
			// Realtime output to stderr
			profile.SetRealtime(true)
			fmt.Println("Profiling enabled: realtime output to stderr")
		case "report", "summary":
			// Summary only at the end
			profile.SetRealtime(false)
			fmt.Println("Profiling enabled: summary report at end")
		default:
			// File path specified
			if err := profile.SetLogFile(*profileLog); err != nil {
				log.Fatalf("Failed to open profile log file: %v", err)
			}
			profile.SetRealtime(true)
			fmt.Printf("Profiling enabled: logging to %s\n", *profileLog)
		}
		defer func() {
			profile.Report()
			profile.Close()
		}()
	}

	cpu.SetMaxThreads(*threads)

	var gpuDevices []int
	if strings.TrimSpace(*gpuDevicesFlag) != "" {
		for _, part := range strings.Split(*gpuDevicesFlag, ",") {
			part = strings.TrimSpace(part)
			if part == "" {
				continue
			}
			id, err := strconv.Atoi(part)
			if err != nil {
				log.Fatalf("invalid --gpu-devices entry %q: %v", part, err)
			}
			gpuDevices = append(gpuDevices, id)
		}
		if len(gpuDevices) == 0 {
			log.Fatalf("invalid --gpu-devices: no devices parsed")
		}
	}

	// Determine engine config
	cfg := engine.Config{
		GPULayers:  *nGPULayers,
		GPUBudget:  *gpuBudget,
		GPUDevices: gpuDevices,
		UseMmap:    *useMmap,
		CPUMoE:     *cpuMoE,
	}

	// Load Model
	fmt.Printf("Loading model from %s...\n", *modelPath)
	if *listTensors {
		md, err := loader.LoadWithOptions(*modelPath, loader.LoadOptions{UseMmap: *useMmap})
		if err != nil {
			log.Fatalf("Failed to load model: %v", err)
		}
		defer md.Close()
		names := make([]string, 0, len(md.Metadata.Tensors))
		for name := range md.Metadata.Tensors {
			names = append(names, name)
		}
		sort.Strings(names)
		for _, name := range names {
			info := md.Metadata.Tensors[name]
			fmt.Printf("%s\t%s\t%v\t%d\n", name, info.DType.String(), info.Shape, info.Size)
		}
		return
	}

	eng, err := engine.Load(*modelPath, cfg)
	if err != nil {
		log.Fatalf("Failed to load model: %v", err)
	}
	defer eng.Close()

	// Show device info
	if device.CUDAAvailable() {
		fmt.Println("CUDA available: yes")
	} else {
		fmt.Println("CUDA available: no (CPU only)")
	}

	modelConfig := eng.Model().Config()

	// If layer-map is specified, use that for KV cache placement
	var placements []tensor.DevicePlacement
	if *layerMap != "" {
		placements = parseLayerMap(modelConfig.NumLayers, *layerMap)
	} else if eng.Dispatcher() != nil {
		// Use dispatcher's placements
		placements = make([]tensor.DevicePlacement, modelConfig.NumLayers)
		for i := 0; i < modelConfig.NumLayers; i++ {
			placements[i] = eng.Dispatcher().LayerPlacement(i)
		}
	}

	fmt.Println("Model loaded successfully!")

	// Load Tokenizer
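	// Tokenizer resolution order: prefer tokenizer bytes embedded in the .mak file,
	// then fall back to tokenizer.json next to the model. If neither loads, the
	// prompt is replaced by placeholder token ids below so the forward path can
	// still be exercised.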
	var tok *tokenizer.Tokenizer
	tokData, err := eng.Loader().GetTokenizerData()
	if err == nil && len(tokData) > 0 {
		fmt.Println("Found embedded tokenizer in model file.")
		tok, err = tokenizer.LoadFromBytes(tokData)
		if err != nil {
			log.Printf("Warning: failed to load embedded tokenizer: %v", err)
		}
	}
	if tok == nil {
		modelDir := filepath.Dir(*modelPath)
		tokPath := filepath.Join(modelDir, "tokenizer.json")
		fmt.Printf("Loading tokenizer from %s...\n", tokPath)
		tok, err = tokenizer.LoadFromJSON(tokPath)
		if err != nil {
			log.Printf("Warning: failed to load tokenizer: %v", err)
		}
	}

	// Format prompt (optionally with chat template)
	finalPrompt := *prompt
	if *useChat {
		messages := []chat.Message{{Role: "user", Content: *prompt}}
		formatted, err := chat.RenderForArchitecture(modelConfig.Architecture, messages, chat.Options{
			AddGenerationPrompt: true,
			EnableThinking:      true,
		})
		if err != nil {
			log.Fatalf("format prompt failed: %v", err)
		}
		finalPrompt = formatted
		fmt.Printf("Formatted prompt:\n%s\n", finalPrompt)
	}

	// Server mode: start HTTP server and block.
	// We do this after model+tokenizer are loaded so all flags (GPU, KV cache sizes, etc.) apply.
	if *listen != "" {
		*serverMode = true
	}
	if *serverMode {
		addr := *listen
		if addr == "" {
			addr = net.JoinHostPort(*host, strconv.Itoa(*port))
		}
		// Ensure JSON is linked in this binary (avoid unused import when tags change)
		_ = json.Valid
		err := openai.Serve(eng, tok, modelConfig.Architecture, openai.Config{
			Listen:           addr,
			MaxSeqLen:        *maxSeq,
			BlockSize:        *blockSize,
			KVCacheCPU:       *kvCacheCPU,
			EnableThinking:   false,
			PrefillChunkSize: *prefillChunkSize,
			MaxConcurrent:    *maxConcurrent,
		})
		if err != nil {
			log.Fatalf("server failed: %v", err)
		}
		return
	}

	// Tokenize prompt
	var ids []int
	if tok != nil {
		ids = tok.Encode(finalPrompt)
		fmt.Printf("Tokens: %v\n", ids)
	} else {
		ids = []int{1, 2, 3}
	}

	// Initialize KV Cache
	var kv model.KVCache
	var pagedCache *kvcache.PagedKVCache
	if modelConfig.Architecture == "KimiLinearForCausalLM" {
		params := modelConfig.Params
		lacRaw := params["linear_attn_config"]
		lac, _ := lacRaw.(map[string]any)
		kdaNumHeads := int(lac["num_heads"].(float64))
		kdaHeadDim := int(lac["head_dim"].(float64))
		kdaKernel := int(lac["short_conv_kernel_size"].(float64))
		mlaNumHeads := int(params["num_attention_heads"].(float64))
		qkNope := int(params["qk_nope_head_dim"].(float64))
		qkRope := int(params["qk_rope_head_dim"].(float64))
		vDim := int(params["v_head_dim"].(float64))
		kimiCache, err := kimi_linear.NewKimiCache(modelConfig.NumLayers, kdaNumHeads, kdaHeadDim, kdaKernel, mlaNumHeads, qkNope+qkRope, vDim)
		if err != nil {
			log.Fatalf("KimiCache alloc failed: %v", err)
		}
		kv = kimiCache
		fmt.Println("KV cache: KimiCache (CPU)")
	} else {
		// Default: enable GPU KV per-layer when ANY layer is on GPU (mixed offload supported),
		// unless --kv-cache-cpu is specified.
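		// Sketch of the intent (exact block-pool behavior lives in pkg/kvcache): if any
		// layer placement below normalizes to CUDA, kvDevice flips to CUDA and the
		// per-layer placements are forwarded to the block pool so a mixed offload can
		// keep each layer's KV blocks on the layer's device; --kv-cache-cpu overrides this.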
		kvDevice := tensor.CPU
		if !*kvCacheCPU && device.CUDAAvailable() {
			for i := 0; i < modelConfig.NumLayers && i < len(placements); i++ {
				if placements[i].Normalize().Type == tensor.CUDA {
					kvDevice = tensor.CUDA
					break
				}
			}
		}
		switch kvDevice {
		case tensor.CUDA:
			fmt.Println("KV cache: mixed (per-layer)")
		default:
			fmt.Println("KV cache: CPU")
		}

		pool, err := kvcache.NewBlockPool(kvcache.BlockPoolConfig{
			NumLayers:  modelConfig.NumLayers,
			NumKVHeads: modelConfig.NumKVHeads,
			HeadDim:    modelConfig.HeadDim,
			BlockSize:  *blockSize,
			NumBlocks:  (*maxSeq + *blockSize - 1) / (*blockSize),
			Device:     kvDevice,
			GPU:        0,
			LayerPlacements: func() []tensor.DevicePlacement {
				if kvDevice != tensor.CUDA || len(placements) != modelConfig.NumLayers {
					return nil
				}
				out := make([]tensor.DevicePlacement, modelConfig.NumLayers)
				for i := 0; i < modelConfig.NumLayers; i++ {
					out[i] = placements[i].Normalize()
				}
				return out
			}(),
			Preallocate: kvDevice == tensor.CUDA,
		})
		if err != nil {
			log.Fatalf("NewBlockPool failed: %v", err)
		}
		pagedCache = kvcache.NewPagedKVCache(pool, kvcache.PagedCacheConfig{
			NumLayers:  modelConfig.NumLayers,
			NumKVHeads: modelConfig.NumKVHeads,
			HeadDim:    modelConfig.HeadDim,
			BlockSize:  *blockSize,
			MaxSeqLen:  *maxSeq,
			Device:     kvDevice,
			GPU:        0,
		}, "run-model")
		if _, err := pagedCache.AllocateForTokens(ids); err != nil {
			pagedCache.Free()
			log.Fatalf("PagedKVCache alloc failed: %v", err)
		}
		defer pagedCache.Free()
		kv = pagedCache
	}

	// Preallocate scratch buffers once so prefill doesn't hit cudaMalloc churn.
	runCtx := context.Background()
	needWarmup := false
	if device.CUDAAvailable() && eng.Dispatcher() != nil && cuda.Available() {
		gpuSeen := make(map[int]struct{})
		var gpus []int
		for i := 0; i < modelConfig.NumLayers; i++ {
			p := eng.Dispatcher().LayerPlacement(i).Normalize()
			if p.Type != tensor.CUDA || p.GPU < 0 {
				continue
			}
			if _, ok := gpuSeen[p.GPU]; ok {
				continue
			}
			gpuSeen[p.GPU] = struct{}{}
			gpus = append(gpus, p.GPU)
		}
		if len(gpus) > 0 {
			const minScratchBytes = 8 << 20
			var (
				ss          *compute.ScratchSet
				scratchErr  error
				scratchSize = compute.DefaultScratchBytes
			)
			for scratchSize >= minScratchBytes {
				var err error
				ss, err = compute.NewScratchSet(gpus, scratchSize)
				if err == nil {
					break
				}
				scratchErr = err
				ss = nil
				scratchSize /= 2
			}
			if ss != nil {
				defer ss.Free()
				runCtx = compute.WithScratchSet(runCtx, ss)
				runCtx = compute.WithScratch(runCtx, ss.Scratch(gpus[0]))
				needWarmup = true
				log.Printf("scratch: gpus=%v bytes=%d", gpus, scratchSize)
			} else if scratchErr != nil {
				log.Printf("scratch disabled (alloc failed): %v", scratchErr)
			}
		}
	}
	if needWarmup {
		if _, err := eng.Forward(runCtx, createInputTensor([]int{0}), createPositionTensor(0, 1), nil); err != nil {
			log.Fatalf("warmup forward failed: %v", err)
		}
		compute.LogWeightCacheSummary()
	}

	// Initialize Sampler
	sampler := sample.New(sample.Config{
		Temperature:       float32(*temperature),
		TopK:              *topK,
		TopP:              float32(*topP),
		RepetitionPenalty: float32(*repPenalty),
		Seed:              -1,
	})

	// Prefill prompt in chunks (llama.cpp eval batch size analogue).
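	// Worked example (hypothetical numbers): with --prefill-chunk-size 512, a
	// 1200-token prompt runs three prefill forwards of 512, 512, and 176 tokens;
	// the first sampled token then comes from the last logits row of the final
	// chunk (rowIdx = 175 below).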
	chunk := *prefillChunkSize
	if chunk <= 0 {
		chunk = 512
	}
	var logits tensor.Tensor
	for start := 0; start < len(ids); start += chunk {
		end := start + chunk
		if end > len(ids) {
			end = len(ids)
		}
		part := ids[start:end]
		input := createInputTensor(part)
		positions := createPositionTensor(kv.SeqLen(), len(part))
		before := kv.SeqLen()
		profile.Start("Prefill/Forward")
		out, err := eng.Forward(runCtx, input, positions, kv)
		profile.End("Prefill/Forward")
		if err != nil {
			log.Fatalf("Prefill forward failed: %v", err)
		}
		logits = out
		if kv.SeqLen() == before {
			kv.Commit(len(part))
		}
	}
	if logits == nil {
		log.Fatalf("prefill produced nil logits")
	}

	// Sample first generated token
	lastPartLen := len(ids) % chunk
	if lastPartLen == 0 {
		if chunk < len(ids) {
			lastPartLen = chunk
		} else {
			lastPartLen = len(ids)
		}
	}
	rowIdx := lastPartLen - 1
	logitsSlice := getLogitsRowCPU(logits, rowIdx)
	var nextToken int
	if logitsSlice != nil {
		profile.Start("Prefill/Sample")
		nextToken = sampler.Sample(logitsSlice, ids)
		profile.End("Prefill/Sample")
	} else {
		// CUDA path: take top-k (or argmax) from GPU, copy only small candidate list.
		gpuLogits := logits.(*cuda.Tensor)
		vocabSize := gpuLogits.Shape()[1]
		view, err := gpuLogits.ViewAt(tensor.Shape{vocabSize}, uintptr(rowIdx*vocabSize*4))
		if err != nil {
			log.Fatalf("logits view failed: %v", err)
		}
		k := *topK
		if *temperature == 0 {
			k = 1
		}
		if k <= 0 {
			// Semantics-preserving fallback: copy full logits row to CPU and use existing sampler.
			host := make([]float32, vocabSize)
			profile.Start("Prefill/LogitsD2H")
			if err := view.CopyToHost(host); err != nil {
				log.Fatalf("logits D2H failed: %v", err)
			}
			profile.End("Prefill/LogitsD2H")
			profile.Start("Prefill/Sample")
			nextToken = sampler.Sample(host, ids)
			profile.End("Prefill/Sample")
			goto sampledPrefill
		}
		recent := ids
		if len(recent) > 64 {
			recent = recent[len(recent)-64:]
		}
		repIDs := make([]int32, len(recent))
		for i, t := range recent {
			repIDs[i] = int32(t)
		}
		profile.Start("Prefill/TopK")
		allIDs, allScores, blocks, err := cuda.TopKLogitsF32(view.Data().(unsafe.Pointer), vocabSize, repIDs, float32(*repPenalty), k, gpuLogits.GPU())
		profile.End("Prefill/TopK")
		if err != nil {
			log.Fatalf("cuda topk failed: %v", err)
		}
		// Merge per-block candidates on CPU to get global top-k
		cands := make([]struct {
			id    int32
			score float32
		}, 0, blocks*k)
		for i := 0; i < blocks*k; i++ {
			if allIDs[i] < 0 {
				continue
			}
			cands = append(cands, struct {
				id    int32
				score float32
			}{id: allIDs[i], score: allScores[i]})
		}
		sort.Slice(cands, func(i, j int) bool { return cands[i].score > cands[j].score })
		if len(cands) > k {
			cands = cands[:k]
		}
		finalIDs := make([]int32, len(cands))
		finalScores := make([]float32, len(cands))
		for i := range cands {
			finalIDs[i] = cands[i].id
			finalScores[i] = cands[i].score
		}
		profile.Start("Prefill/Sample")
		nextToken = sampler.SampleFromTopK(finalIDs, finalScores)
		profile.End("Prefill/Sample")
	}
sampledPrefill:
	if tok != nil {
		fmt.Print(tok.Decode([]int{nextToken}))
	}
	ids = append(ids, nextToken)
	if pagedCache != nil {
		pagedCache.AppendToken(nextToken)
	}

	// Autoregressive generation with KV Cache
	eosID := 151645 // <|im_end|>
	if tok != nil {
		eosID = tok.EosID()
	}
	startGen := time.Now()
	genTokens := 0
	for i := 1; i < *steps; i++ {
		profile.TokenStart()
		// Check for EOS
		if nextToken == eosID {
			profile.TokenEnd()
			break
		}
		// Prepare single token input
		input := createInputTensor([]int{nextToken})
		currentPos := len(ids) - 1
		positions := createPositionTensor(currentPos, 1)

		profile.Start("Decode/Forward")
		logits, err = eng.Forward(runCtx, input, positions, kv)
		profile.End("Decode/Forward")
		if err != nil {
			log.Fatalf("Forward failed: %v", err)
		}

		// Sample with recent context for repetition penalty
		logitsSlice = getLogitsRowCPU(logits, 0)
		recentTokens := ids
		if len(ids) > 64 {
			recentTokens = ids[len(ids)-64:]
		}
		if logitsSlice != nil {
			profile.Start("Decode/Sample")
			nextToken = sampler.Sample(logitsSlice, recentTokens)
			profile.End("Decode/Sample")
		} else {
			gpuLogits := logits.(*cuda.Tensor)
			vocabSize := gpuLogits.Shape()[1]
			view, err := gpuLogits.ViewAt(tensor.Shape{vocabSize}, 0)
			if err != nil {
				log.Fatalf("logits view failed: %v", err)
			}
			k := *topK
			if *temperature == 0 {
				k = 1
			}
			if k <= 0 {
				host := make([]float32, vocabSize)
				profile.Start("Decode/LogitsD2H")
				if err := view.CopyToHost(host); err != nil {
					log.Fatalf("logits D2H failed: %v", err)
				}
				profile.End("Decode/LogitsD2H")
				profile.Start("Decode/Sample")
				nextToken = sampler.Sample(host, recentTokens)
				profile.End("Decode/Sample")
				goto sampledDecode
			}
			repIDs := make([]int32, len(recentTokens))
			for i, t := range recentTokens {
				repIDs[i] = int32(t)
			}
			profile.Start("Decode/TopK")
			allIDs, allScores, blocks, err := cuda.TopKLogitsF32(view.Data().(unsafe.Pointer), vocabSize, repIDs, float32(*repPenalty), k, gpuLogits.GPU())
			profile.End("Decode/TopK")
			if err != nil {
				log.Fatalf("cuda topk failed: %v", err)
			}
			cands := make([]struct {
				id    int32
				score float32
			}, 0, blocks*k)
			for i := 0; i < blocks*k; i++ {
				if allIDs[i] < 0 {
					continue
				}
				cands = append(cands, struct {
					id    int32
					score float32
				}{id: allIDs[i], score: allScores[i]})
			}
			sort.Slice(cands, func(i, j int) bool { return cands[i].score > cands[j].score })
			if len(cands) > k {
				cands = cands[:k]
			}
			finalIDs := make([]int32, len(cands))
			finalScores := make([]float32, len(cands))
			for i := range cands {
				finalIDs[i] = cands[i].id
				finalScores[i] = cands[i].score
			}
			profile.Start("Decode/Sample")
			nextToken = sampler.SampleFromTopK(finalIDs, finalScores)
			profile.End("Decode/Sample")
		}
	sampledDecode:
		if tok != nil {
			fmt.Print(tok.Decode([]int{nextToken}))
		}
		ids = append(ids, nextToken)
		if pagedCache != nil {
			pagedCache.AppendToken(nextToken)
		}
		genTokens++
		profile.TokenEnd()
	}

	duration := time.Since(startGen)
	fmt.Printf("\n\nDone. Generated %d tokens in %v (%.2f tok/s)\n", genTokens, duration, float64(genTokens)/duration.Seconds())
}

func createInputTensor(ids []int) tensor.Tensor {
	t := cpu.NewTensor(tensor.Shape{len(ids)}, nil)
	data := t.DataFloat32()
	for i, id := range ids {
		data[i] = float32(id)
	}
	return t
}

func createPositionTensor(start, count int) tensor.Tensor {
	t := cpu.NewTensor(tensor.Shape{count}, nil)
	data := t.DataFloat32()
	for i := 0; i < count; i++ {
		data[i] = float32(start + i)
	}
	return t
}

func getLogitsRowCPU(logits tensor.Tensor, row int) []float32 {
	if _, ok := logits.(*cpu.Tensor); !ok {
		return nil
	}
	data := logits.Data().(unsafe.Pointer)
	shape := logits.Shape()
	vocabSize := shape[1]
	slice := unsafe.Slice((*float32)(data), shape.NumElements())
	return slice[row*vocabSize : (row+1)*vocabSize]
}

// parseLayerMap parses a comma-separated placement string like
// "0-9:gpu0,10-19:gpu1,20-:cpu" and returns per-layer placements.
func parseLayerMap(numLayers int, spec string) []tensor.DevicePlacement {
	placements := make([]tensor.DevicePlacement, numLayers)
	for i := range placements {
		placements[i] = tensor.DevicePlacement{Type: tensor.CPU, GPU: -1}
	}
	if spec == "" {
		return placements
	}
	entries := strings.Split(spec, ",")
	for _, entry := range entries {
		entry = strings.TrimSpace(entry)
		if entry == "" {
			continue
		}
		parts := strings.Split(entry, ":")
		if len(parts) != 2 {
			log.Printf("invalid layer-map entry %q, skipping", entry)
			continue
		}
		rng, target := strings.TrimSpace(parts[0]), strings.TrimSpace(parts[1])
		start, end := 0, numLayers-1
		if rng != "" {
			if strings.Contains(rng, "-") {
				rp := strings.SplitN(rng, "-", 2)
				if rp[0] != "" {
					if v, err := strconv.Atoi(rp[0]); err == nil {
						start = v
					}
				}
				if rp[1] != "" {
					if v, err := strconv.Atoi(rp[1]); err == nil {
						end = v
					}
				}
			} else if v, err := strconv.Atoi(rng); err == nil {
				start, end = v, v
			}
		}
		if start < 0 {
			start = 0
		}
		if end >= numLayers {
			end = numLayers - 1
		}
		var placement tensor.DevicePlacement
		switch {
		case strings.HasPrefix(strings.ToLower(target), "gpu"):
			idStr := strings.TrimPrefix(strings.ToLower(target), "gpu")
			id := 0
			if idStr != "" {
				if v, err := strconv.Atoi(idStr); err == nil {
					id = v
				}
			}
			placement = tensor.DevicePlacement{Type: tensor.CUDA, GPU: id}.Normalize()
		case strings.ToLower(target) == "cpu":
			placement = tensor.DevicePlacement{Type: tensor.CPU, GPU: -1}
		default:
			log.Printf("unknown target %q, defaulting to CPU", target)
			placement = tensor.DevicePlacement{Type: tensor.CPU, GPU: -1}
		}
		for i := start; i <= end && i < numLayers; i++ {
			placements[i] = placement
		}
	}
	return placements
}
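
// exampleLayerMapUsage is an illustrative sketch and is not called anywhere.
// The spec and the 6-layer count are made up for the example; with a CUDA-enabled
// build, layers 0..3 should normalize to CUDA device 0 and layers 4..5 stay on CPU.
// Open-ended ranges such as "4-" run to the last layer, and a bare index like "2"
// targets a single layer.
func exampleLayerMapUsage() {
	placements := parseLayerMap(6, "0-3:gpu0,4-:cpu")
	for i, p := range placements {
		fmt.Printf("layer %d -> %+v\n", i, p)
	}
}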