package main

import (
	"context"
	"encoding/json"
	"flag"
	"fmt"
	"log"
	"net"
	"path/filepath"
	"sort"
	"strconv"
	"strings"
	"time"
	"unsafe"

	"makarna/pkg/backend/cpu"
	"makarna/pkg/backend/cuda"
	"makarna/pkg/backend/device"
	"makarna/pkg/chat"
	"makarna/pkg/compute"
	"makarna/pkg/engine"
	"makarna/pkg/kvcache"
	"makarna/pkg/loader"
	"makarna/pkg/model"
	kimi_linear "makarna/pkg/model/models/kimi_linear" // Register KimiLinear model
	_ "makarna/pkg/model/models/qwen3"                 // Register Qwen3 model
	"makarna/pkg/openai"
	"makarna/pkg/profile"
	"makarna/pkg/sample"
	"makarna/pkg/tensor"
	"makarna/pkg/tokenizer"
)
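
// main loads a .mak model and then either answers a single prompt on the CLI
// or serves an OpenAI-compatible HTTP API, depending on the flags below.
//
// Illustrative invocations (binary name and model paths are placeholders):
//
//	run-model -model qwen3.mak -chat -prompt "Hello" -steps 64
//	run-model -model qwen3.mak -listen :8080 -n-gpu-layers -1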
func main() {
	modelPath := flag.String("model", "model.mak", "Path to .mak model file")
	prompt := flag.String("prompt", "Hello world", "Prompt to generate")
	steps := flag.Int("steps", 10, "Number of tokens to generate")
	useChat := flag.Bool("chat", false, "Use chat format for prompt")
	serverMode := flag.Bool("server", false, "Run OpenAI-compatible HTTP server")
	listen := flag.String("listen", "", "Server listen address (e.g. :8080, 0.0.0.0:8080). If set, implies --server")
	host := flag.String("host", "127.0.0.1", "Server host (used when --listen is empty)")
	port := flag.Int("port", 8080, "Server port (used when --listen is empty)")
	temperature := flag.Float64("temp", 0.7, "Sampling temperature (0 = greedy)")
	topK := flag.Int("top-k", 40, "Top-K sampling (0 = disabled)")
	topP := flag.Float64("top-p", 0.9, "Top-P nucleus sampling (1.0 = disabled)")
	repPenalty := flag.Float64("rep-penalty", 1.1, "Repetition penalty (1.0 = disabled)")
	threads := flag.Int("threads", -1, "Number of CPU threads to use (default: 90% of cores)")
	listTensors := flag.Bool("list-tensors", false, "List tensors in model and exit")
	useMmap := flag.Bool("mmap", false, "Use mmap for model weights (default: false)")

	// Device placement flags - llama.cpp style
	nGPULayers := flag.Int("n-gpu-layers", -1, "Number of layers to offload to GPU (-1=auto, 0=CPU only)")
	gpuBudget := flag.Float64("gpu-budget", 0.9, "Fraction of GPU memory to use (0.0-1.0)")
	gpuDevicesFlag := flag.String("gpu-devices", "", "Comma-separated GPU device ordinals to use (e.g. 0 or 0,1)")
	layerMap := flag.String("layer-map", "", "Advanced: layer placement map, e.g. 0-9:gpu0,10-19:gpu1,20-:cpu")
	cpuMoE := flag.Bool("cpu-moe", false, "Keep MoE expert weights on CPU (saves GPU memory for large MoE models)")

	blockSize := flag.Int("block-size", 16, "KV cache block size")
	maxSeq := flag.Int("max-seq-len", 2048, "Maximum sequence length to reserve in KV cache")
	kvCacheCPU := flag.Bool("kv-cache-cpu", false, "Force KV cache to CPU (default: GPU when available)")
	prefillChunkSize := flag.Int("prefill-chunk-size", 512, "Prompt prefill chunk size (llama.cpp eval batch size analogue)")
	maxConcurrent := flag.Int("max-concurrent", 1, "Server mode: max concurrent sequences to reserve KV/scratch for")

	// Profiling flags
	profileOn := flag.Bool("profile", false, "Enable profiling summary report (alias for -profile-log=report)")
	profileLog := flag.String("profile-log", "", "Enable profiling: 'true'=realtime screen, 'report'=summary only, or file path")
	flag.Parse()
	// Initialize profiling
	if *profileOn && *profileLog == "" {
		*profileLog = "report"
	}
	if *profileLog != "" {
		profile.Enable()
		switch strings.ToLower(*profileLog) {
		case "true", "1", "realtime":
			// Realtime output to stderr
			profile.SetRealtime(true)
			fmt.Println("Profiling enabled: realtime output to stderr")
		case "report", "summary":
			// Summary only at the end
			profile.SetRealtime(false)
			fmt.Println("Profiling enabled: summary report at end")
		default:
			// File path specified
			if err := profile.SetLogFile(*profileLog); err != nil {
				log.Fatalf("Failed to open profile log file: %v", err)
			}
			profile.SetRealtime(true)
			fmt.Printf("Profiling enabled: logging to %s\n", *profileLog)
		}
		defer func() {
			profile.Report()
			profile.Close()
		}()
	}
	cpu.SetMaxThreads(*threads)

	var gpuDevices []int
	if strings.TrimSpace(*gpuDevicesFlag) != "" {
		for _, part := range strings.Split(*gpuDevicesFlag, ",") {
			part = strings.TrimSpace(part)
			if part == "" {
				continue
			}
			id, err := strconv.Atoi(part)
			if err != nil {
				log.Fatalf("invalid --gpu-devices entry %q: %v", part, err)
			}
			gpuDevices = append(gpuDevices, id)
		}
		if len(gpuDevices) == 0 {
			log.Fatalf("invalid --gpu-devices: no devices parsed")
		}
	}
	// Determine engine config
	cfg := engine.Config{
		GPULayers:  *nGPULayers,
		GPUBudget:  *gpuBudget,
		GPUDevices: gpuDevices,
		UseMmap:    *useMmap,
		CPUMoE:     *cpuMoE,
	}
	// Load Model
	fmt.Printf("Loading model from %s...\n", *modelPath)
	if *listTensors {
		md, err := loader.LoadWithOptions(*modelPath, loader.LoadOptions{UseMmap: *useMmap})
		if err != nil {
			log.Fatalf("Failed to load model: %v", err)
		}
		defer md.Close()
		names := make([]string, 0, len(md.Metadata.Tensors))
		for name := range md.Metadata.Tensors {
			names = append(names, name)
		}
		sort.Strings(names)
		for _, name := range names {
			info := md.Metadata.Tensors[name]
			fmt.Printf("%s\t%s\t%v\t%d\n", name, info.DType.String(), info.Shape, info.Size)
		}
		return
	}
	eng, err := engine.Load(*modelPath, cfg)
	if err != nil {
		log.Fatalf("Failed to load model: %v", err)
	}
	defer eng.Close()

	// Show device info
	if device.CUDAAvailable() {
		fmt.Println("CUDA available: yes")
	} else {
		fmt.Println("CUDA available: no (CPU only)")
	}

	modelConfig := eng.Model().Config()

	// If layer-map is specified, use that for KV cache placement
	var placements []tensor.DevicePlacement
	if *layerMap != "" {
		placements = parseLayerMap(modelConfig.NumLayers, *layerMap)
	} else if eng.Dispatcher() != nil {
		// Use dispatcher's placements
		placements = make([]tensor.DevicePlacement, modelConfig.NumLayers)
		for i := 0; i < modelConfig.NumLayers; i++ {
			placements[i] = eng.Dispatcher().LayerPlacement(i)
		}
	}
- fmt.Println("Model loaded successfully!")
- // Load Tokenizer
- var tok *tokenizer.Tokenizer
- tokData, err := eng.Loader().GetTokenizerData()
- if err == nil && len(tokData) > 0 {
- fmt.Println("Found embedded tokenizer in model file.")
- tok, err = tokenizer.LoadFromBytes(tokData)
- if err != nil {
- log.Printf("Warning: failed to load embedded tokenizer: %v", err)
- }
- }
- if tok == nil {
- modelDir := filepath.Dir(*modelPath)
- tokPath := filepath.Join(modelDir, "tokenizer.json")
- fmt.Printf("Loading tokenizer from %s...\n", tokPath)
- tok, err = tokenizer.LoadFromJSON(tokPath)
- if err != nil {
- log.Printf("Warning: failed to load tokenizer: %v", err)
- }
- }
	// Format prompt (optionally with chat template)
	finalPrompt := *prompt
	if *useChat {
		messages := []chat.Message{{Role: "user", Content: *prompt}}
		formatted, err := chat.RenderForArchitecture(modelConfig.Architecture, messages, chat.Options{
			AddGenerationPrompt: true,
			EnableThinking:      true,
		})
		if err != nil {
			log.Fatalf("format prompt failed: %v", err)
		}
		finalPrompt = formatted
		fmt.Printf("Formatted prompt:\n%s\n", finalPrompt)
	}
	// Server mode: start HTTP server and block.
	// We do this after model+tokenizer are loaded so all flags (GPU, KV cache sizes, etc.) apply.
	if *listen != "" {
		*serverMode = true
	}
	if *serverMode {
		addr := *listen
		if addr == "" {
			addr = net.JoinHostPort(*host, strconv.Itoa(*port))
		}
		// Ensure JSON is linked in this binary (avoid unused import when tags change)
		_ = json.Valid
		err := openai.Serve(eng, tok, modelConfig.Architecture, openai.Config{
			Listen:           addr,
			MaxSeqLen:        *maxSeq,
			BlockSize:        *blockSize,
			KVCacheCPU:       *kvCacheCPU,
			EnableThinking:   false,
			PrefillChunkSize: *prefillChunkSize,
			MaxConcurrent:    *maxConcurrent,
		})
		if err != nil {
			log.Fatalf("server failed: %v", err)
		}
		return
	}
	// Tokenize prompt
	var ids []int
	if tok != nil {
		ids = tok.Encode(finalPrompt)
		fmt.Printf("Tokens: %v\n", ids)
	} else {
		ids = []int{1, 2, 3}
	}
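
	// The KV cache flavor depends on the architecture: KimiLinear keeps its own
	// recurrent/conv state in a CPU-side KimiCache, while standard attention
	// models get a paged KV cache whose blocks can live on CPU or GPU.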
	// Initialize KV Cache
	var kv model.KVCache
	var pagedCache *kvcache.PagedKVCache
	if modelConfig.Architecture == "KimiLinearForCausalLM" {
		params := modelConfig.Params
		lacRaw := params["linear_attn_config"]
		lac, _ := lacRaw.(map[string]any)
		kdaNumHeads := int(lac["num_heads"].(float64))
		kdaHeadDim := int(lac["head_dim"].(float64))
		kdaKernel := int(lac["short_conv_kernel_size"].(float64))
		mlaNumHeads := int(params["num_attention_heads"].(float64))
		qkNope := int(params["qk_nope_head_dim"].(float64))
		qkRope := int(params["qk_rope_head_dim"].(float64))
		vDim := int(params["v_head_dim"].(float64))
		kimiCache, err := kimi_linear.NewKimiCache(modelConfig.NumLayers, kdaNumHeads, kdaHeadDim, kdaKernel, mlaNumHeads, qkNope+qkRope, vDim)
		if err != nil {
			log.Fatalf("KimiCache alloc failed: %v", err)
		}
		kv = kimiCache
		fmt.Println("KV cache: KimiCache (CPU)")
	} else {
		// Default: enable GPU KV per-layer when ANY layer is on GPU (mixed offload supported),
		// unless --kv-cache-cpu is specified.
		kvDevice := tensor.CPU
		if !*kvCacheCPU && device.CUDAAvailable() {
			for i := 0; i < modelConfig.NumLayers && i < len(placements); i++ {
				if placements[i].Normalize().Type == tensor.CUDA {
					kvDevice = tensor.CUDA
					break
				}
			}
		}
		switch kvDevice {
		case tensor.CUDA:
			fmt.Println("KV cache: mixed (per-layer)")
		default:
			fmt.Println("KV cache: CPU")
		}
		pool, err := kvcache.NewBlockPool(kvcache.BlockPoolConfig{
			NumLayers:  modelConfig.NumLayers,
			NumKVHeads: modelConfig.NumKVHeads,
			HeadDim:    modelConfig.HeadDim,
			BlockSize:  *blockSize,
			NumBlocks:  (*maxSeq + *blockSize - 1) / (*blockSize),
			Device:     kvDevice,
			GPU:        0,
			LayerPlacements: func() []tensor.DevicePlacement {
				if kvDevice != tensor.CUDA || len(placements) != modelConfig.NumLayers {
					return nil
				}
				out := make([]tensor.DevicePlacement, modelConfig.NumLayers)
				for i := 0; i < modelConfig.NumLayers; i++ {
					out[i] = placements[i].Normalize()
				}
				return out
			}(),
			Preallocate: kvDevice == tensor.CUDA,
		})
		if err != nil {
			log.Fatalf("NewBlockPool failed: %v", err)
		}
		pagedCache = kvcache.NewPagedKVCache(pool, kvcache.PagedCacheConfig{
			NumLayers:  modelConfig.NumLayers,
			NumKVHeads: modelConfig.NumKVHeads,
			HeadDim:    modelConfig.HeadDim,
			BlockSize:  *blockSize,
			MaxSeqLen:  *maxSeq,
			Device:     kvDevice,
			GPU:        0,
		}, "run-model")
		if _, err := pagedCache.AllocateForTokens(ids); err != nil {
			pagedCache.Free()
			log.Fatalf("PagedKVCache alloc failed: %v", err)
		}
		defer pagedCache.Free()
		kv = pagedCache
	}
	// Preallocate scratch buffers once so prefill doesn't hit cudaMalloc churn.
	runCtx := context.Background()
	needWarmup := false
	if device.CUDAAvailable() && eng.Dispatcher() != nil && cuda.Available() {
		gpuSeen := make(map[int]struct{})
		var gpus []int
		for i := 0; i < modelConfig.NumLayers; i++ {
			p := eng.Dispatcher().LayerPlacement(i).Normalize()
			if p.Type != tensor.CUDA || p.GPU < 0 {
				continue
			}
			if _, ok := gpuSeen[p.GPU]; ok {
				continue
			}
			gpuSeen[p.GPU] = struct{}{}
			gpus = append(gpus, p.GPU)
		}
		if len(gpus) > 0 {
			const minScratchBytes = 8 << 20
			var (
				ss          *compute.ScratchSet
				scratchErr  error
				scratchSize = compute.DefaultScratchBytes
			)
			for scratchSize >= minScratchBytes {
				var err error
				ss, err = compute.NewScratchSet(gpus, scratchSize)
				if err == nil {
					break
				}
				scratchErr = err
				ss = nil
				scratchSize /= 2
			}
			if ss != nil {
				defer ss.Free()
				runCtx = compute.WithScratchSet(runCtx, ss)
				runCtx = compute.WithScratch(runCtx, ss.Scratch(gpus[0]))
				needWarmup = true
				log.Printf("scratch: gpus=%v bytes=%d", gpus, scratchSize)
			} else if scratchErr != nil {
				log.Printf("scratch disabled (alloc failed): %v", scratchErr)
			}
		}
	}
	if needWarmup {
		if _, err := eng.Forward(runCtx, createInputTensor([]int{0}), createPositionTensor(0, 1), nil); err != nil {
			log.Fatalf("warmup forward failed: %v", err)
		}
		compute.LogWeightCacheSummary()
	}
	// Initialize Sampler
	sampler := sample.New(sample.Config{
		Temperature:       float32(*temperature),
		TopK:              *topK,
		TopP:              float32(*topP),
		RepetitionPenalty: float32(*repPenalty),
		Seed:              -1,
	})
	// Prefill prompt in chunks (llama.cpp eval batch size analogue).
	chunk := *prefillChunkSize
	if chunk <= 0 {
		chunk = 512
	}
	var logits tensor.Tensor
	for start := 0; start < len(ids); start += chunk {
		end := start + chunk
		if end > len(ids) {
			end = len(ids)
		}
		part := ids[start:end]
		input := createInputTensor(part)
		positions := createPositionTensor(kv.SeqLen(), len(part))
		before := kv.SeqLen()
		profile.Start("Prefill/Forward")
		out, err := eng.Forward(runCtx, input, positions, kv)
		profile.End("Prefill/Forward")
		if err != nil {
			log.Fatalf("Prefill forward failed: %v", err)
		}
		logits = out
		if kv.SeqLen() == before {
			kv.Commit(len(part))
		}
	}
	if logits == nil {
		log.Fatalf("prefill produced nil logits")
	}
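
	// The logits from the final Forward call cover only the last prefill chunk,
	// so the row holding the next-token prediction is the last row of that
	// chunk, not of the whole prompt; lastPartLen below recovers that length.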
	// Sample first generated token
	lastPartLen := len(ids) % chunk
	if lastPartLen == 0 {
		if chunk < len(ids) {
			lastPartLen = chunk
		} else {
			lastPartLen = len(ids)
		}
	}
	rowIdx := lastPartLen - 1
	logitsSlice := getLogitsRowCPU(logits, rowIdx)
	var nextToken int
	if logitsSlice != nil {
		profile.Start("Prefill/Sample")
		nextToken = sampler.Sample(logitsSlice, ids)
		profile.End("Prefill/Sample")
	} else {
		// CUDA path: take top-k (or argmax) from GPU, copy only small candidate list.
		gpuLogits := logits.(*cuda.Tensor)
		vocabSize := gpuLogits.Shape()[1]
		view, err := gpuLogits.ViewAt(tensor.Shape{vocabSize}, uintptr(rowIdx*vocabSize*4))
		if err != nil {
			log.Fatalf("logits view failed: %v", err)
		}
		k := *topK
		if *temperature == 0 {
			k = 1
		}
		if k <= 0 {
			// Semantics-preserving fallback: copy full logits row to CPU and use existing sampler.
			host := make([]float32, vocabSize)
			profile.Start("Prefill/LogitsD2H")
			if err := view.CopyToHost(host); err != nil {
				log.Fatalf("logits D2H failed: %v", err)
			}
			profile.End("Prefill/LogitsD2H")
			profile.Start("Prefill/Sample")
			nextToken = sampler.Sample(host, ids)
			profile.End("Prefill/Sample")
			goto sampledPrefill
		}
		recent := ids
		if len(recent) > 64 {
			recent = recent[len(recent)-64:]
		}
		repIDs := make([]int32, len(recent))
		for i, t := range recent {
			repIDs[i] = int32(t)
		}
		profile.Start("Prefill/TopK")
		allIDs, allScores, blocks, err := cuda.TopKLogitsF32(view.Data().(unsafe.Pointer), vocabSize, repIDs, float32(*repPenalty), k, gpuLogits.GPU())
		profile.End("Prefill/TopK")
		if err != nil {
			log.Fatalf("cuda topk failed: %v", err)
		}
		// Merge per-block candidates on CPU to get global top-k
		cands := make([]struct {
			id    int32
			score float32
		}, 0, blocks*k)
		for i := 0; i < blocks*k; i++ {
			if allIDs[i] < 0 {
				continue
			}
			cands = append(cands, struct {
				id    int32
				score float32
			}{id: allIDs[i], score: allScores[i]})
		}
		sort.Slice(cands, func(i, j int) bool { return cands[i].score > cands[j].score })
		if len(cands) > k {
			cands = cands[:k]
		}
		finalIDs := make([]int32, len(cands))
		finalScores := make([]float32, len(cands))
		for i := range cands {
			finalIDs[i] = cands[i].id
			finalScores[i] = cands[i].score
		}
		profile.Start("Prefill/Sample")
		nextToken = sampler.SampleFromTopK(finalIDs, finalScores)
		profile.End("Prefill/Sample")
	}

sampledPrefill:
	if tok != nil {
		fmt.Print(tok.Decode([]int{nextToken}))
	}
	ids = append(ids, nextToken)
	if pagedCache != nil {
		pagedCache.AppendToken(nextToken)
	}
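
	// Each decode step below feeds only the newly sampled token back into the
	// model; all earlier context comes from the KV cache, so per-step cost
	// stays roughly constant as the sequence grows.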
	// Autoregressive generation with KV Cache
	eosID := 151645 // <|im_end|>
	if tok != nil {
		eosID = tok.EosID()
	}
	startGen := time.Now()
	genTokens := 0
	for i := 1; i < *steps; i++ {
		profile.TokenStart()
		// Check for EOS
		if nextToken == eosID {
			profile.TokenEnd()
			break
		}
		// Prepare single token input
		input := createInputTensor([]int{nextToken})
		currentPos := len(ids) - 1
		positions := createPositionTensor(currentPos, 1)
		profile.Start("Decode/Forward")
		logits, err = eng.Forward(runCtx, input, positions, kv)
		profile.End("Decode/Forward")
		if err != nil {
			log.Fatalf("Forward failed: %v", err)
		}
		// Sample with recent context for repetition penalty
		logitsSlice = getLogitsRowCPU(logits, 0)
		recentTokens := ids
		if len(ids) > 64 {
			recentTokens = ids[len(ids)-64:]
		}
		if logitsSlice != nil {
			profile.Start("Decode/Sample")
			nextToken = sampler.Sample(logitsSlice, recentTokens)
			profile.End("Decode/Sample")
		} else {
			gpuLogits := logits.(*cuda.Tensor)
			vocabSize := gpuLogits.Shape()[1]
			view, err := gpuLogits.ViewAt(tensor.Shape{vocabSize}, 0)
			if err != nil {
				log.Fatalf("logits view failed: %v", err)
			}
			k := *topK
			if *temperature == 0 {
				k = 1
			}
			if k <= 0 {
				host := make([]float32, vocabSize)
				profile.Start("Decode/LogitsD2H")
				if err := view.CopyToHost(host); err != nil {
					log.Fatalf("logits D2H failed: %v", err)
				}
				profile.End("Decode/LogitsD2H")
				profile.Start("Decode/Sample")
				nextToken = sampler.Sample(host, recentTokens)
				profile.End("Decode/Sample")
				goto sampledDecode
			}
			repIDs := make([]int32, len(recentTokens))
			for i, t := range recentTokens {
				repIDs[i] = int32(t)
			}
			profile.Start("Decode/TopK")
			allIDs, allScores, blocks, err := cuda.TopKLogitsF32(view.Data().(unsafe.Pointer), vocabSize, repIDs, float32(*repPenalty), k, gpuLogits.GPU())
			profile.End("Decode/TopK")
			if err != nil {
				log.Fatalf("cuda topk failed: %v", err)
			}
			cands := make([]struct {
				id    int32
				score float32
			}, 0, blocks*k)
			for i := 0; i < blocks*k; i++ {
				if allIDs[i] < 0 {
					continue
				}
				cands = append(cands, struct {
					id    int32
					score float32
				}{id: allIDs[i], score: allScores[i]})
			}
			sort.Slice(cands, func(i, j int) bool { return cands[i].score > cands[j].score })
			if len(cands) > k {
				cands = cands[:k]
			}
			finalIDs := make([]int32, len(cands))
			finalScores := make([]float32, len(cands))
			for i := range cands {
				finalIDs[i] = cands[i].id
				finalScores[i] = cands[i].score
			}
			profile.Start("Decode/Sample")
			nextToken = sampler.SampleFromTopK(finalIDs, finalScores)
			profile.End("Decode/Sample")
		}

	sampledDecode:
		if tok != nil {
			fmt.Print(tok.Decode([]int{nextToken}))
		}
		ids = append(ids, nextToken)
		if pagedCache != nil {
			pagedCache.AppendToken(nextToken)
		}
		genTokens++
		profile.TokenEnd()
	}

	duration := time.Since(startGen)
	fmt.Printf("\n\nDone. Generated %d tokens in %v (%.2f tok/s)\n", genTokens, duration, float64(genTokens)/duration.Seconds())
}
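
// createInputTensor packs token IDs into a 1-D CPU float32 tensor, the input
// layout the engine's Forward path consumes in this program.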
func createInputTensor(ids []int) tensor.Tensor {
	t := cpu.NewTensor(tensor.Shape{len(ids)}, nil)
	data := t.DataFloat32()
	for i, id := range ids {
		data[i] = float32(id)
	}
	return t
}
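
// createPositionTensor fills a 1-D CPU float32 tensor with the consecutive
// position indices start, start+1, ..., start+count-1.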
func createPositionTensor(start, count int) tensor.Tensor {
	t := cpu.NewTensor(tensor.Shape{count}, nil)
	data := t.DataFloat32()
	for i := 0; i < count; i++ {
		data[i] = float32(start + i)
	}
	return t
}
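
// getLogitsRowCPU returns one row of a [tokens, vocab] logits tensor as a
// float32 slice when the tensor lives on the CPU, and nil otherwise so callers
// take the GPU top-k path instead.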
func getLogitsRowCPU(logits tensor.Tensor, row int) []float32 {
	if _, ok := logits.(*cpu.Tensor); !ok {
		return nil
	}
	data := logits.Data().(unsafe.Pointer)
	shape := logits.Shape()
	vocabSize := shape[1]
	slice := unsafe.Slice((*float32)(data), shape.NumElements())
	return slice[row*vocabSize : (row+1)*vocabSize]
}
// parseLayerMap parses a comma-separated placement string like
// "0-9:gpu0,10-19:gpu1,20-:cpu" and returns per-layer placements.
func parseLayerMap(numLayers int, spec string) []tensor.DevicePlacement {
	placements := make([]tensor.DevicePlacement, numLayers)
	for i := range placements {
		placements[i] = tensor.DevicePlacement{Type: tensor.CPU, GPU: -1}
	}
	if spec == "" {
		return placements
	}
	entries := strings.Split(spec, ",")
	for _, entry := range entries {
		entry = strings.TrimSpace(entry)
		if entry == "" {
			continue
		}
		parts := strings.Split(entry, ":")
		if len(parts) != 2 {
			log.Printf("invalid layer-map entry %q, skipping", entry)
			continue
		}
		rng, target := strings.TrimSpace(parts[0]), strings.TrimSpace(parts[1])
		start, end := 0, numLayers-1
		if rng != "" {
			if strings.Contains(rng, "-") {
				rp := strings.SplitN(rng, "-", 2)
				if rp[0] != "" {
					if v, err := strconv.Atoi(rp[0]); err == nil {
						start = v
					}
				}
				if rp[1] != "" {
					if v, err := strconv.Atoi(rp[1]); err == nil {
						end = v
					}
				}
			} else if v, err := strconv.Atoi(rng); err == nil {
				start, end = v, v
			}
		}
		if start < 0 {
			start = 0
		}
		if end >= numLayers {
			end = numLayers - 1
		}
		var placement tensor.DevicePlacement
		switch {
		case strings.HasPrefix(strings.ToLower(target), "gpu"):
			idStr := strings.TrimPrefix(strings.ToLower(target), "gpu")
			id := 0
			if idStr != "" {
				if v, err := strconv.Atoi(idStr); err == nil {
					id = v
				}
			}
			placement = tensor.DevicePlacement{Type: tensor.CUDA, GPU: id}.Normalize()
		case strings.ToLower(target) == "cpu":
			placement = tensor.DevicePlacement{Type: tensor.CPU, GPU: -1}
		default:
			log.Printf("unknown target %q, defaulting to CPU", target)
			placement = tensor.DevicePlacement{Type: tensor.CPU, GPU: -1}
		}
		for i := start; i <= end && i < numLayers; i++ {
			placements[i] = placement
		}
	}
	return placements
}