//go:build cuda

package cuda

/*
#cgo CFLAGS: -I${SRCDIR}
#cgo LDFLAGS: -L${SRCDIR}/../../../build/cuda -Wl,-Bstatic -lmakarna_cuda -Wl,-Bdynamic
#cgo LDFLAGS: -L/usr/local/cuda/lib64 -lcudart -lstdc++ -lm
#cgo LDFLAGS: -Wl,-rpath,${SRCDIR}/../../../build/cuda -Wl,-rpath,/usr/local/cuda/lib64
#include "kernels.h"
*/
import "C"

import (
    "errors"
    "fmt"
    "runtime"
    "time"
    "unsafe"

    "makarna/pkg/profile"
    "makarna/pkg/tensor"
)

func syncIfProfiling(gpu int) error {
    if !profile.Enabled() {
        return nil
    }
    return Synchronize(gpu)
}

// Ensure interface compliance.
var _ tensor.Tensor = (*Tensor)(nil)

// Storage holds the underlying GPU memory with reference counting.
// Multiple Tensors can share the same Storage (e.g., views, reshapes).
// Memory is freed only when all references are gone.
type Storage struct {
    ptr unsafe.Pointer
    gpu int
    // Note: We rely on Go's GC and SetFinalizer for ref counting.
    // Each Tensor that shares this storage keeps a reference to it.
    // When the last Tensor is GC'd, the Storage becomes unreachable,
    // and its finalizer frees the GPU memory.
}

// newStorage creates a new Storage and sets up its finalizer.
func newStorage(ptr unsafe.Pointer, gpu int) *Storage {
    s := &Storage{ptr: ptr, gpu: gpu}
    runtime.SetFinalizer(s, func(st *Storage) {
        _ = C.cuda_set_device(C.int(st.gpu))
        C.cuda_free(st.ptr)
    })
    return s
}

type Tensor struct {
    shape   tensor.Shape
    dtype   tensor.DType
    storage *Storage       // Shared storage with ref counting
    ptr     unsafe.Pointer // Pointer into storage (may be offset for slices)
    gpu     int
    // ownsStorage indicates whether this Tensor is responsible for explicitly
    // freeing the underlying CUDA allocation.
    // Views/reshapes must not free shared storage because they may outlive the
    // base tensor (e.g. scratch-buffer views).
    ownsStorage bool
}

// NewTensor allocates memory on the GPU.
func NewTensor(shape tensor.Shape, dtype tensor.DType, gpu int) (*Tensor, error) {
    if dtype != tensor.Float32 && dtype != tensor.Float16 && dtype != tensor.BFloat16 {
        return nil, errors.New("unsupported dtype on CUDA")
    }
    if gpu < 0 {
        gpu = 0
    }
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return nil, errors.New("failed to set cuda device")
    }
    size := shape.NumElements() * dtype.Size()
    ptr := C.cuda_malloc(C.size_t(size))
    if ptr == nil {
        return nil, errors.New("cuda malloc failed")
    }
    storage := newStorage(ptr, gpu)
    t := &Tensor{
        shape:       shape,
        dtype:       dtype,
        storage:     storage,
        ptr:         ptr,
        gpu:         gpu,
        ownsStorage: true,
    }
    return t, nil
}
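
// A minimal allocation/free round trip, as a sketch (shape and sizes are
// illustrative; error handling elided):
//
//     t, err := NewTensor(tensor.Shape{4, 8}, tensor.Float32, 0)
//     if err != nil {
//         // handle allocation failure
//     }
//     defer t.Free() // release eagerly instead of waiting for the finalizer
//     host := make([]float32, 4*8)
//     _ = t.CopyFrom(host)   // H2D upload
//     _ = t.CopyToHost(host) // D2H download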

func (t *Tensor) Shape() tensor.Shape {
    return t.shape
}

func (t *Tensor) DType() tensor.DType {
    return t.dtype
}

func (t *Tensor) Device() tensor.DeviceType {
    return tensor.CUDA
}

// GPU returns the device ordinal.
func (t *Tensor) GPU() int {
    return t.gpu
}

func (t *Tensor) Placement() tensor.DevicePlacement {
    return tensor.DevicePlacement{Type: tensor.CUDA, GPU: t.gpu}
}

func (t *Tensor) Data() interface{} {
    return t.ptr
}

// Free explicitly frees the GPU memory associated with the tensor.
// Use this for temporary tensors to avoid OOM due to delayed GC.
func (t *Tensor) Free() {
    if t == nil {
        return
    }
    // Only the allocating tensor should explicitly free the CUDA allocation.
    // Views/reshapes share storage and must not free it.
    if t.storage != nil && t.ownsStorage {
        // Clear the finalizer so it doesn't run later.
        runtime.SetFinalizer(t.storage, nil)
        _ = C.cuda_set_device(C.int(t.gpu))
        C.cuda_free(t.storage.ptr)
    }
    t.storage = nil
    t.ptr = nil
}

func (t *Tensor) Add(other tensor.Tensor) error {
    o, ok := other.(*Tensor)
    if !ok {
        return errors.New("other must be CUDA tensor")
    }
    if t.dtype != tensor.Float32 || o.dtype != tensor.Float32 {
        return errors.New("Add only supports Float32")
    }
    if t.shape.NumElements() != o.shape.NumElements() {
        return errors.New("shape mismatch")
    }
    // Calls in-place add: t += o.
    if ret := C.cuda_set_device(C.int(t.gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    ret := C.cuda_add_f32((*C.float)(t.ptr), (*C.float)(o.ptr), C.size_t(t.shape.NumElements()))
    if ret != 0 {
        return errors.New("cuda add failed")
    }
    return nil
}

func PagedAttentionBatch(Q, kBlocksFlatDev, vBlocksFlatDev, blockOffsetsDev, kvLensDev, queryPosDev, out unsafe.Pointer, numTokens, numHeads, numKVHeads, headDim, blockSize int, scale float32, maxKvLen int, gpu int) error {
    if err := syncIfProfiling(gpu); err != nil {
        return err
    }
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    ret := C.cuda_paged_attention_batch_f32(
        (*C.float)(Q),
        (**C.float)(kBlocksFlatDev),
        (**C.float)(vBlocksFlatDev),
        (*C.int)(blockOffsetsDev),
        (*C.int)(kvLensDev),
        (*C.int)(queryPosDev),
        (*C.float)(out),
        C.int(numTokens),
        C.int(numHeads), C.int(numKVHeads), C.int(headDim),
        C.int(blockSize),
        C.float(scale),
        C.int(maxKvLen),
    )
    if ret != 0 {
        return errors.New("cuda paged attention batch failed")
    }
    if err := syncIfProfiling(gpu); err != nil {
        return err
    }
    return nil
}

func (t *Tensor) Mul(other tensor.Tensor) error {
    o, ok := other.(*Tensor)
    if !ok {
        return errors.New("other must be CUDA tensor")
    }
    if t.dtype != tensor.Float32 || o.dtype != tensor.Float32 {
        return errors.New("Mul only supports Float32")
    }
    if t.shape.NumElements() != o.shape.NumElements() {
        return errors.New("shape mismatch")
    }
    if ret := C.cuda_set_device(C.int(t.gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    ret := C.cuda_mul_f32((*C.float)(t.ptr), (*C.float)(o.ptr), C.size_t(t.shape.NumElements()))
    if ret != 0 {
        return errors.New("cuda mul failed")
    }
    return nil
}

func (t *Tensor) MatMul(other tensor.Tensor, out tensor.Tensor) error {
    B, ok := other.(*Tensor)
    if !ok {
        return errors.New("other must be CUDA tensor")
    }
    C_out, ok := out.(*Tensor)
    if !ok {
        return errors.New("out must be CUDA tensor")
    }
    if t.dtype != tensor.Float32 || B.dtype != tensor.Float32 || C_out.dtype != tensor.Float32 {
        return errors.New("MatMul only supports Float32")
    }
    if len(t.shape) != 2 || len(B.shape) != 2 || len(C_out.shape) != 2 {
        return errors.New("only 2D matmul")
    }
    M := t.shape[0]
    K := t.shape[1]
    // We use NT matmul (A @ B^T), so B is expected to be [N, K].
    N := B.shape[0]
    K2 := B.shape[1]
    if K != K2 {
        return fmt.Errorf("k dimension mismatch: A[%d,%d] vs B[%d,%d]", M, K, N, K2)
    }
    if C_out.shape[0] != M || C_out.shape[1] != N {
        return fmt.Errorf("out shape mismatch: expected [%d,%d], got [%d,%d]", M, N, C_out.shape[0], C_out.shape[1])
    }
    if ret := C.cuda_set_device(C.int(t.gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    ret := C.cuda_matmul_f32_nt(
        (*C.float)(t.ptr),
        (*C.float)(B.ptr),
        (*C.float)(C_out.ptr),
        C.int(M), C.int(K), C.int(N),
    )
    if ret != 0 {
        return errors.New("cuda matmul failed")
    }
    return nil
}
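
// Sketch of the NT shape convention (values are illustrative; error handling
// elided). A row-major weight stored as [out_dim, in_dim] multiplies
// activations without an explicit transpose:
//
//     a, _ := NewTensor(tensor.Shape{2, 4}, tensor.Float32, 0)   // A: [M=2, K=4]
//     b, _ := NewTensor(tensor.Shape{3, 4}, tensor.Float32, 0)   // B: [N=3, K=4]
//     out, _ := NewTensor(tensor.Shape{2, 3}, tensor.Float32, 0) // out: [M, N]
//     _ = a.MatMul(b, out) // out = A @ B^T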

// Reshape creates a view (shared storage) with new shape.
// The new tensor shares the same underlying Storage, so memory
// is only freed when all tensors sharing this storage are GC'd.
func (t *Tensor) Reshape(shape tensor.Shape) (tensor.Tensor, error) {
    if shape.NumElements() != t.shape.NumElements() {
        return nil, errors.New("num elements mismatch")
    }
    // Share the same storage - Go's GC handles ref counting for us.
    return &Tensor{
        shape:       shape,
        dtype:       t.dtype,
        storage:     t.storage, // Shared reference
        ptr:         t.ptr,
        gpu:         t.gpu,
        ownsStorage: false,
    }, nil
}

// ViewAt returns a view into the tensor starting at the given byte offset.
// The returned tensor shares storage and does not allocate.
func (t *Tensor) ViewAt(shape tensor.Shape, offsetBytes uintptr) (*Tensor, error) {
    if t == nil {
        return nil, errors.New("nil tensor")
    }
    if offsetBytes%uintptr(t.dtype.Size()) != 0 {
        return nil, fmt.Errorf("offset %d not aligned to dtype size %d", offsetBytes, t.dtype.Size())
    }
    newPtr := unsafe.Pointer(uintptr(t.ptr) + offsetBytes)
    return &Tensor{
        shape:       shape,
        dtype:       t.dtype,
        storage:     t.storage,
        ptr:         newPtr,
        gpu:         t.gpu,
        ownsStorage: false,
    }, nil
}
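
// Sketch: carving two views out of one scratch allocation (offsets are
// hypothetical; views share storage, so Free the owning tensor, not the
// views):
//
//     scratch, _ := NewTensor(tensor.Shape{2048}, tensor.Float32, 0)
//     q, _ := scratch.ViewAt(tensor.Shape{1024}, 0)
//     k, _ := scratch.ViewAt(tensor.Shape{1024}, 1024*4) // byte offset: 1024 float32s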

func (t *Tensor) View(shape tensor.Shape) (tensor.Tensor, error) {
    return t.Reshape(shape)
}

func (t *Tensor) ToDevice(device tensor.DeviceType) (tensor.Tensor, error) {
    if device == tensor.CUDA {
        return t, nil
    }
    if device == tensor.CPU {
        // TODO: implement CUDA -> CPU by allocating a CPU tensor and doing a
        // D2H memcpy. Importing makarna/pkg/backend/cpu here would not create
        // an import cycle (both packages depend only on makarna/pkg/tensor),
        // but for now callers should use a helper for the copy-back.
        return nil, errors.New("ToDevice(CPU) not implemented here yet, use helper")
    }
    return nil, errors.New("unknown device")
}

func (t *Tensor) CopyFrom(data interface{}) error {
    if t.dtype != tensor.Float32 {
        return errors.New("CopyFrom only supports Float32")
    }
    // Assuming data is []float32 on the host.
    src, ok := data.([]float32)
    if !ok {
        return errors.New("data must be []float32")
    }
    size := len(src) * 4
    if size != t.shape.NumElements()*t.dtype.Size() {
        return errors.New("size mismatch")
    }
    if ret := C.cuda_set_device(C.int(t.gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    start := time.Now()
    ret := C.cuda_memcpy_h2d(t.ptr, unsafe.Pointer(&src[0]), C.size_t(size))
    if ret != 0 {
        runtime.KeepAlive(src)
        runtime.KeepAlive(t)
        return errors.New("cuda memcpy failed")
    }
    profile.RecordTransfer("CopyFrom/H2D", profile.EventH2D, int64(size), time.Since(start), t.gpu)
    runtime.KeepAlive(src)
    runtime.KeepAlive(t)
    return nil
}

// Helper to copy back to host.
func (t *Tensor) CopyToHost(dst []float32) error {
    if t.dtype != tensor.Float32 {
        return errors.New("CopyToHost only supports Float32")
    }
    size := len(dst) * 4
    if size != t.shape.NumElements()*4 {
        return errors.New("size mismatch")
    }
    if ret := C.cuda_set_device(C.int(t.gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    start := time.Now()
    ret := C.cuda_memcpy_d2h(unsafe.Pointer(&dst[0]), t.ptr, C.size_t(size))
    if ret != 0 {
        runtime.KeepAlive(dst)
        runtime.KeepAlive(t)
        return errors.New("cuda memcpy d2h failed")
    }
    profile.RecordTransfer("CopyToHost/D2H", profile.EventD2H, int64(size), time.Since(start), t.gpu)
    runtime.KeepAlive(dst)
    runtime.KeepAlive(t)
    return nil
}

func (t *Tensor) CopyToInt32(dst []int32) error {
    if t.dtype != tensor.Int32 {
        return errors.New("CopyToInt32 only supports Int32")
    }
    size := len(dst) * 4
    if size != t.shape.NumElements()*4 {
        return errors.New("size mismatch")
    }
    if ret := C.cuda_set_device(C.int(t.gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    ret := C.cuda_memcpy_d2h(unsafe.Pointer(&dst[0]), t.ptr, C.size_t(size))
    if ret != 0 {
        return errors.New("cuda memcpy d2h failed")
    }
    runtime.KeepAlive(dst)
    runtime.KeepAlive(t)
    return nil
}

// CopyPartialFrom copies a portion of host data to the tensor at a given offset.
// dstOffset: offset in float32 elements from the start of the tensor.
// src: source data to copy from host.
func (t *Tensor) CopyPartialFrom(dstOffset int, src []float32) error {
    if t.dtype != tensor.Float32 {
        return errors.New("CopyPartialFrom only supports Float32")
    }
    if dstOffset+len(src) > t.shape.NumElements() {
        return errors.New("partial copy would exceed tensor bounds")
    }
    if len(src) == 0 {
        return nil
    }
    if ret := C.cuda_set_device(C.int(t.gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    // Calculate the destination pointer with the element offset applied.
    dstPtr := unsafe.Pointer(uintptr(t.ptr) + uintptr(dstOffset*4))
    size := len(src) * 4
    start := time.Now()
    ret := C.cuda_memcpy_h2d(dstPtr, unsafe.Pointer(&src[0]), C.size_t(size))
    if ret != 0 {
        runtime.KeepAlive(src)
        runtime.KeepAlive(t)
        return errors.New("cuda memcpy partial failed")
    }
    profile.RecordTransfer("CopyPartialFrom/H2D", profile.EventH2D, int64(size), time.Since(start), t.gpu)
    runtime.KeepAlive(src)
    runtime.KeepAlive(t)
    return nil
}

// CopyPartialFromDevice copies a portion from another CUDA tensor into this tensor.
// Offsets and length are in float32 elements.
func (t *Tensor) CopyPartialFromDevice(dstOffset int, src *Tensor, srcOffset int, length int) error {
    if t.dtype != src.dtype {
        return errors.New("dtype mismatch")
    }
    if dstOffset+length > t.shape.NumElements() {
        return errors.New("dst offset/length exceed tensor bounds")
    }
    if srcOffset+length > src.shape.NumElements() {
        return errors.New("src offset/length exceed tensor bounds")
    }
    if length == 0 {
        return nil
    }
    if ret := C.cuda_set_device(C.int(t.gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    start := time.Now()
    eltSize := t.dtype.Size()
    dstPtr := unsafe.Pointer(uintptr(t.ptr) + uintptr(dstOffset*eltSize))
    srcPtr := unsafe.Pointer(uintptr(src.ptr) + uintptr(srcOffset*eltSize))
    size := C.size_t(length * eltSize)
    if ret := C.cuda_memcpy_d2d(dstPtr, srcPtr, size); ret != 0 {
        runtime.KeepAlive(src)
        runtime.KeepAlive(t)
        return errors.New("cuda memcpy d2d failed")
    }
    profile.RecordTransfer("CopyPartialFromDevice/D2D", profile.EventD2D, int64(length*eltSize), time.Since(start), t.gpu)
    runtime.KeepAlive(src)
    runtime.KeepAlive(t)
    return nil
}
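
// Sketch: staging one token's vector into a larger preallocated buffer
// (the cache/src tensors, pos, and dim are hypothetical):
//
//     // host -> device at element offset pos*dim
//     _ = cache.CopyPartialFrom(pos*dim, hostVec)
//     // device -> device: copy dim elements from the start of src
//     _ = cache.CopyPartialFromDevice(pos*dim, src, 0, dim)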

func CastF32ToF16(srcF32, dstF16 unsafe.Pointer, n int, gpu int) error {
    if n <= 0 {
        return nil
    }
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    if ret := C.cuda_cast_f32_to_f16((*C.float)(srcF32), (*C.ushort)(dstF16), C.int(n)); ret != 0 {
        return errors.New("cuda cast f32->f16 failed")
    }
    return nil
}

func PagedAttentionF32F16KV(Q, kBlocksDev, vBlocksDev, out unsafe.Pointer, seqLen, kvLen, numHeads, numKVHeads, headDim, blockSize int, scale float32, startPos int, gpu int) error {
    if err := syncIfProfiling(gpu); err != nil {
        return err
    }
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    ret := C.cuda_paged_attention_f32_f16kv(
        (*C.float)(Q),
        (**C.ushort)(kBlocksDev),
        (**C.ushort)(vBlocksDev),
        (*C.float)(out),
        C.int(seqLen), C.int(kvLen),
        C.int(numHeads), C.int(numKVHeads), C.int(headDim),
        C.int(blockSize),
        C.float(scale), C.int(startPos),
    )
    if ret != 0 {
        return errors.New("cuda paged attention f16kv failed")
    }
    if err := syncIfProfiling(gpu); err != nil {
        return err
    }
    return nil
}

func PagedAttentionBatchF32F16KV(Q, kBlocksFlatDev, vBlocksFlatDev, blockOffsetsDev, kvLensDev, queryPosDev, out unsafe.Pointer, numTokens, numHeads, numKVHeads, headDim, blockSize int, scale float32, maxKvLen int, gpu int) error {
    if err := syncIfProfiling(gpu); err != nil {
        return err
    }
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    ret := C.cuda_paged_attention_batch_f32_f16kv(
        (*C.float)(Q),
        (**C.ushort)(kBlocksFlatDev),
        (**C.ushort)(vBlocksFlatDev),
        (*C.int)(blockOffsetsDev),
        (*C.int)(kvLensDev),
        (*C.int)(queryPosDev),
        (*C.float)(out),
        C.int(numTokens),
        C.int(numHeads), C.int(numKVHeads), C.int(headDim),
        C.int(blockSize),
        C.float(scale),
        C.int(maxKvLen),
    )
    if ret != 0 {
        return errors.New("cuda paged attention batch f16kv failed")
    }
    if err := syncIfProfiling(gpu); err != nil {
        return err
    }
    return nil
}

// PagedAttentionRoPEF32F16KV runs paged attention with fused RoPE inside the kernel.
// Expects un-rotated Q and un-rotated K blocks; V blocks are unchanged.
func PagedAttentionRoPEF32F16KV(Q, kBlocksDev, vBlocksDev, out unsafe.Pointer, seqLen, kvLen, numHeads, numKVHeads, headDim, blockSize int, scale float32, startPos int, theta float32, gpu int) error {
    if err := syncIfProfiling(gpu); err != nil {
        return err
    }
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    ret := C.cuda_paged_attention_rope_f32_f16kv(
        (*C.float)(Q),
        (**C.ushort)(kBlocksDev),
        (**C.ushort)(vBlocksDev),
        (*C.float)(out),
        C.int(seqLen), C.int(kvLen),
        C.int(numHeads), C.int(numKVHeads), C.int(headDim),
        C.int(blockSize),
        C.float(scale), C.int(startPos),
        C.float(theta),
    )
    if ret != 0 {
        return errors.New("cuda paged attention rope f16kv failed")
    }
    if err := syncIfProfiling(gpu); err != nil {
        return err
    }
    return nil
}

// PagedAttentionBatchRoPEF32F16KV runs batched paged attention with fused RoPE inside the kernel.
// Expects un-rotated Q and un-rotated K blocks; V blocks are unchanged.
func PagedAttentionBatchRoPEF32F16KV(Q, kBlocksFlatDev, vBlocksFlatDev, blockOffsetsDev, kvLensDev, queryPosDev, out unsafe.Pointer, numTokens, numHeads, numKVHeads, headDim, blockSize int, scale float32, maxKvLen int, theta float32, gpu int) error {
    if err := syncIfProfiling(gpu); err != nil {
        return err
    }
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    ret := C.cuda_paged_attention_rope_batch_f32_f16kv(
        (*C.float)(Q),
        (**C.ushort)(kBlocksFlatDev),
        (**C.ushort)(vBlocksFlatDev),
        (*C.int)(blockOffsetsDev),
        (*C.int)(kvLensDev),
        (*C.int)(queryPosDev),
        (*C.float)(out),
        C.int(numTokens),
        C.int(numHeads), C.int(numKVHeads), C.int(headDim),
        C.int(blockSize),
        C.float(scale),
        C.int(maxKvLen),
        C.float(theta),
    )
    if ret != 0 {
        return errors.New("cuda paged attention batch rope f16kv failed")
    }
    if err := syncIfProfiling(gpu); err != nil {
        return err
    }
    return nil
}

// Available returns whether CUDA is available.
func Available() bool {
    return true
}

// MemoryInfo returns (total, free) bytes for the current CUDA device.
func MemoryInfo() (total uint64, free uint64, err error) {
    var cFree, cTotal C.size_t
    ret := C.cuda_mem_info(&cFree, &cTotal)
    if ret != 0 {
        return 0, 0, errors.New("cuda_mem_info failed")
    }
    return uint64(cTotal), uint64(cFree), nil
}

// MemoryInfoDevice returns (total, free) bytes for the given CUDA device.
func MemoryInfoDevice(gpu int) (total uint64, free uint64, err error) {
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return 0, 0, errors.New("failed to set cuda device")
    }
    var cFree, cTotal C.size_t
    ret := C.cuda_mem_info(&cFree, &cTotal)
    if ret != 0 {
        return 0, 0, errors.New("cuda_mem_info failed")
    }
    return uint64(cTotal), uint64(cFree), nil
}

// DeviceCount returns the number of visible CUDA devices.
func DeviceCount() (int, error) {
    var cCount C.int
    ret := C.cuda_device_count(&cCount)
    if ret != 0 {
        return 0, errors.New("cuda_device_count failed")
    }
    if cCount < 0 {
        return 0, errors.New("cuda_device_count returned negative")
    }
    return int(cCount), nil
}

// Synchronize waits for all queued work on the given GPU.
// Use when explicit host/device coordination is required.
func Synchronize(gpu int) error {
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    if ret := C.cuda_synchronize(); ret != 0 {
        return errors.New("cuda synchronize failed")
    }
    return nil
}
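
// Sketch: host-side timing needs an explicit sync because kernel launches
// return immediately (gpu 0 assumed; error handling elided):
//
//     start := time.Now()
//     _ = SiLU(x, n, 0)
//     _ = Synchronize(0) // wait for the kernel before reading the clock
//     elapsed := time.Since(start)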

// ============================================================
// Neural Network Operations
// ============================================================

// RMSNorm applies RMS normalization in-place on GPU.
// x: [seqLen, dim] device pointer, w: [dim] device pointer.
func RMSNorm(x, w unsafe.Pointer, seqLen, dim int, eps float32, gpu int) error {
    if err := syncIfProfiling(gpu); err != nil {
        return err
    }
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    ret := C.cuda_rmsnorm_f32((*C.float)(x), (*C.float)(w), C.int(seqLen), C.int(dim), C.float(eps))
    if ret != 0 {
        return errors.New("cuda rmsnorm failed")
    }
    if err := syncIfProfiling(gpu); err != nil {
        return err
    }
    return nil
}

// RoPE applies rotary positional embeddings in-place.
// x: [seqLen, numHeads * headDim] device pointer.
// positions: [seqLen] device pointer (int32).
func RoPE(x, positions unsafe.Pointer, seqLen, numHeads, headDim int, theta float32, gpu int) error {
    if err := syncIfProfiling(gpu); err != nil {
        return err
    }
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    ret := C.cuda_rope_f32((*C.float)(x), (*C.int)(positions), C.int(seqLen), C.int(numHeads), C.int(headDim), C.float(theta))
    if ret != 0 {
        return errors.New("cuda rope failed")
    }
    if err := syncIfProfiling(gpu); err != nil {
        return err
    }
    return nil
}

// RoPESingle runs RoPE for a single token at a specific position.
func RoPESingle(x unsafe.Pointer, pos, numHeads, headDim int, theta float32, gpu int) error {
    if err := syncIfProfiling(gpu); err != nil {
        return err
    }
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    ret := C.cuda_rope_f32_single((*C.float)(x), C.int(pos), C.int(numHeads), C.int(headDim), C.float(theta))
    if ret != 0 {
        return errors.New("cuda rope single failed")
    }
    if err := syncIfProfiling(gpu); err != nil {
        return err
    }
    return nil
}

// Softmax applies softmax along the last dimension in-place.
func Softmax(x unsafe.Pointer, rows, cols int, gpu int) error {
    if err := syncIfProfiling(gpu); err != nil {
        return err
    }
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    ret := C.cuda_softmax_f32((*C.float)(x), C.int(rows), C.int(cols))
    if ret != 0 {
        return errors.New("cuda softmax failed")
    }
    if err := syncIfProfiling(gpu); err != nil {
        return err
    }
    return nil
}

// SiLU applies SiLU activation in-place: x = x * sigmoid(x).
func SiLU(x unsafe.Pointer, n int, gpu int) error {
    if err := syncIfProfiling(gpu); err != nil {
        return err
    }
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    ret := C.cuda_silu_f32((*C.float)(x), C.size_t(n))
    if ret != 0 {
        return errors.New("cuda silu failed")
    }
    if err := syncIfProfiling(gpu); err != nil {
        return err
    }
    return nil
}

// MulInplace performs element-wise a = a * b.
func MulInplace(a, b unsafe.Pointer, n int, gpu int) error {
    if err := syncIfProfiling(gpu); err != nil {
        return err
    }
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    ret := C.cuda_mul_inplace_f32((*C.float)(a), (*C.float)(b), C.size_t(n))
    if ret != 0 {
        return errors.New("cuda mul inplace failed")
    }
    if err := syncIfProfiling(gpu); err != nil {
        return err
    }
    return nil
}

// Copy copies GPU memory: dst = src.
func Copy(dst, src unsafe.Pointer, n int, gpu int) error {
    if err := syncIfProfiling(gpu); err != nil {
        return err
    }
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    ret := C.cuda_copy_f32((*C.float)(dst), (*C.float)(src), C.size_t(n))
    if ret != 0 {
        return errors.New("cuda copy failed")
    }
    if err := syncIfProfiling(gpu); err != nil {
        return err
    }
    return nil
}

func KDACausalShortConv1D(x, state, w unsafe.Pointer, tokens, projSize, kernel int, gpu int) error {
    if err := syncIfProfiling(gpu); err != nil {
        return err
    }
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    ret := C.cuda_kda_causal_short_conv1d_f32(
        (*C.float)(x),
        (*C.float)(state),
        (*C.float)(w),
        C.int(tokens),
        C.int(projSize),
        C.int(kernel),
    )
    if ret != 0 {
        return errors.New("cuda kda causal short conv1d failed")
    }
    if err := syncIfProfiling(gpu); err != nil {
        return err
    }
    return nil
}

func L2NormHeads(q, k unsafe.Pointer, tokens, numHeads, headDim int, eps float32, gpu int) error {
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    ret := C.cuda_l2norm_heads_f32((*C.float)(q), (*C.float)(k), C.int(tokens), C.int(numHeads), C.int(headDim), C.float(eps))
    if ret != 0 {
        return errors.New("cuda l2norm heads failed")
    }
    return nil
}

func KDAGate(g, aLog, dtBias, out unsafe.Pointer, tokens, numHeads, headDim int, gpu int) error {
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    ret := C.cuda_kda_gate_f32((*C.float)(g), (*C.float)(aLog), (*C.float)(dtBias), (*C.float)(out), C.int(tokens), C.int(numHeads), C.int(headDim))
    if ret != 0 {
        return errors.New("cuda kda gate failed")
    }
    return nil
}

func KDARecurrent(q, k, v, g, beta, state unsafe.Pointer, tokens, numHeads, headDim int, gpu int) error {
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    ret := C.cuda_kda_recurrent_f32((*C.float)(q), (*C.float)(k), (*C.float)(v), (*C.float)(g), (*C.float)(beta), (*C.float)(state), C.int(tokens), C.int(numHeads), C.int(headDim))
    if ret != 0 {
        return errors.New("cuda kda recurrent failed")
    }
    return nil
}

func RMSNormGated(out, g, weight unsafe.Pointer, n, headDim int, eps float32, gpu int) error {
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    ret := C.cuda_rmsnorm_gated_f32((*C.float)(out), (*C.float)(g), (*C.float)(weight), C.int(n), C.int(headDim), C.float(eps))
    if ret != 0 {
        return errors.New("cuda rmsnorm gated failed")
    }
    return nil
}

func Sigmoid(x unsafe.Pointer, n int, gpu int) error {
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    ret := C.cuda_sigmoid_f32((*C.float)(x), C.int(n))
    if ret != 0 {
        return errors.New("cuda sigmoid failed")
    }
    return nil
}

func SoftmaxRows(x unsafe.Pointer, rows, cols int, gpu int) error {
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    ret := C.cuda_softmax_rows_f32((*C.float)(x), C.int(rows), C.int(cols))
    if ret != 0 {
        return errors.New("cuda softmax rows failed")
    }
    return nil
}

func TopKPerRow(scores unsafe.Pointer, indices unsafe.Pointer, values unsafe.Pointer, rows, cols, k int, gpu int) error {
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    ret := C.cuda_topk_per_row_f32((*C.float)(scores), (*C.int)(indices), (*C.float)(values), C.int(rows), C.int(cols), C.int(k))
    if ret != 0 {
        return errors.New("cuda topk per row failed")
    }
    return nil
}

// Attention computes full causal attention on GPU.
// Q: [seqLen, numHeads * headDim]
// K, V: [kvLen, numKVHeads * headDim]
// out: [seqLen, numHeads * headDim]
func Attention(Q, K, V, out unsafe.Pointer, seqLen, kvLen, numHeads, numKVHeads, headDim int, scale float32, startPos int, gpu int) error {
    if err := syncIfProfiling(gpu); err != nil {
        return err
    }
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    ret := C.cuda_attention_f32(
        (*C.float)(Q), (*C.float)(K), (*C.float)(V), (*C.float)(out),
        C.int(seqLen), C.int(kvLen), C.int(numHeads), C.int(numKVHeads), C.int(headDim),
        C.float(scale), C.int(startPos),
    )
    if ret != 0 {
        return errors.New("cuda attention failed")
    }
    if err := syncIfProfiling(gpu); err != nil {
        return err
    }
    return nil
}

func PagedAttention(Q, kBlocksDev, vBlocksDev, out unsafe.Pointer, seqLen, kvLen, numHeads, numKVHeads, headDim, blockSize int, scale float32, startPos int, gpu int) error {
    if err := syncIfProfiling(gpu); err != nil {
        return err
    }
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    ret := C.cuda_paged_attention_f32(
        (*C.float)(Q),
        (**C.float)(kBlocksDev),
        (**C.float)(vBlocksDev),
        (*C.float)(out),
        C.int(seqLen), C.int(kvLen), C.int(numHeads), C.int(numKVHeads), C.int(headDim),
        C.int(blockSize),
        C.float(scale), C.int(startPos),
    )
    if ret != 0 {
        return errors.New("cuda paged attention failed")
    }
    if err := syncIfProfiling(gpu); err != nil {
        return err
    }
    return nil
}

// AttentionTimed runs attention and returns kernel time in milliseconds.
// Intended for profiling/debugging only (it synchronizes internally).
func AttentionTimed(Q, K, V, out unsafe.Pointer, seqLen, kvLen, numHeads, numKVHeads, headDim int, scale float32, startPos int, gpu int) (float32, error) {
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return 0, errors.New("failed to set cuda device")
    }
    var ms C.float
    ret := C.cuda_attention_f32_timed(
        (*C.float)(Q), (*C.float)(K), (*C.float)(V), (*C.float)(out),
        C.int(seqLen), C.int(kvLen), C.int(numHeads), C.int(numKVHeads), C.int(headDim),
        C.float(scale), C.int(startPos), &ms,
    )
    if ret != 0 {
        return 0, errors.New("cuda attention timed failed")
    }
    return float32(ms), nil
}

// AddInplace performs element-wise a = a + b.
func AddInplace(a, b unsafe.Pointer, n int, gpu int) error {
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    ret := C.cuda_add_f32((*C.float)(a), (*C.float)(b), C.size_t(n))
    if ret != 0 {
        return errors.New("cuda add failed")
    }
    return nil
}

// ============================================================
// Dequantization Operations
// ============================================================

// DequantQ8K dequantizes Q8_K blocks on GPU.
// blocks: device pointer to Q8_K data.
// out: device pointer to output float32 (numBlocks * 256 elements).
func DequantQ8K(blocks unsafe.Pointer, out unsafe.Pointer, numBlocks int, gpu int) error {
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    ret := C.cuda_dequant_q8k(blocks, (*C.float)(out), C.int(numBlocks))
    if ret != 0 {
        return errors.New("cuda dequant q8k failed")
    }
    return nil
}

func DequantQ4K(blocks unsafe.Pointer, out unsafe.Pointer, numBlocks int, gpu int) error {
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    ret := C.cuda_dequant_q4k(blocks, (*C.float)(out), C.int(numBlocks))
    if ret != 0 {
        return errors.New("cuda dequant q4k failed")
    }
    return nil
}

func DequantQ5K(blocks unsafe.Pointer, out unsafe.Pointer, numBlocks int, gpu int) error {
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    ret := C.cuda_dequant_q5k(blocks, (*C.float)(out), C.int(numBlocks))
    if ret != 0 {
        return errors.New("cuda dequant q5k failed")
    }
    return nil
}

// DequantQ6K dequantizes Q6_K blocks on GPU.
func DequantQ6K(blocks unsafe.Pointer, out unsafe.Pointer, numBlocks int, gpu int) error {
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    ret := C.cuda_dequant_q6k(blocks, (*C.float)(out), C.int(numBlocks))
    if ret != 0 {
        return errors.New("cuda dequant q6k failed")
    }
    return nil
}

func DequantQ3K(blocks unsafe.Pointer, out unsafe.Pointer, numBlocks int, gpu int) error {
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    ret := C.cuda_dequant_q3k(blocks, (*C.float)(out), C.int(numBlocks))
    if ret != 0 {
        return errors.New("cuda dequant q3k failed")
    }
    return nil
}

func DequantQ2K(blocks unsafe.Pointer, out unsafe.Pointer, numBlocks int, gpu int) error {
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    ret := C.cuda_dequant_q2k(blocks, (*C.float)(out), C.int(numBlocks))
    if ret != 0 {
        return errors.New("cuda dequant q2k failed")
    }
    return nil
}

// MatMulQ8K performs C = A @ dequant(B) where B is Q8_K quantized.
// The output parameter is named Cptr so it does not shadow the cgo "C"
// package (naming it C would make C.cuda_set_device fail to compile).
func MatMulQ8K(A unsafe.Pointer, B unsafe.Pointer, Cptr unsafe.Pointer, M, K, N, gpu int) error {
    if err := syncIfProfiling(gpu); err != nil {
        return err
    }
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    ret := C.cuda_matmul_f32_q8k((*C.float)(A), B, (*C.float)(Cptr), C.int(M), C.int(K), C.int(N))
    if ret != 0 {
        return errors.New("cuda matmul q8k failed")
    }
    if err := syncIfProfiling(gpu); err != nil {
        return err
    }
    return nil
}

func MatMulQ5K(A unsafe.Pointer, B unsafe.Pointer, Cptr unsafe.Pointer, M, K, N, gpu int) error {
    if err := syncIfProfiling(gpu); err != nil {
        return err
    }
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    ret := C.cuda_matmul_f32_q5k((*C.float)(A), B, (*C.float)(Cptr), C.int(M), C.int(K), C.int(N))
    if ret != 0 {
        return errors.New("cuda matmul q5k failed")
    }
    if err := syncIfProfiling(gpu); err != nil {
        return err
    }
    return nil
}

func MatMulQ4K(A unsafe.Pointer, B unsafe.Pointer, Cptr unsafe.Pointer, M, K, N, gpu int) error {
    if err := syncIfProfiling(gpu); err != nil {
        return err
    }
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    ret := C.cuda_matmul_f32_q4k((*C.float)(A), B, (*C.float)(Cptr), C.int(M), C.int(K), C.int(N))
    if ret != 0 {
        return errors.New("cuda matmul q4k failed")
    }
    if err := syncIfProfiling(gpu); err != nil {
        return err
    }
    return nil
}

func MatMulQ2K(aPtr, bPtr, cPtr unsafe.Pointer, m, k, n int, gpu int) error {
    if err := syncIfProfiling(gpu); err != nil {
        return err
    }
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    if k%256 != 0 {
        return fmt.Errorf("MatMulQ2K: K must be multiple of 256, got %d", k)
    }
    ret := C.cuda_matmul_f32_q2k((*C.float)(aPtr), bPtr, (*C.float)(cPtr), C.int(m), C.int(k), C.int(n))
    if ret != 0 {
        return errors.New("cuda matmul q2k failed")
    }
    if err := syncIfProfiling(gpu); err != nil {
        return err
    }
    return nil
}

func MatMulQ3K(aPtr, bPtr, cPtr unsafe.Pointer, m, k, n int, gpu int) error {
    if err := syncIfProfiling(gpu); err != nil {
        return err
    }
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    if k%256 != 0 {
        return fmt.Errorf("MatMulQ3K: K must be multiple of 256, got %d", k)
    }
    ret := C.cuda_matmul_f32_q3k((*C.float)(aPtr), bPtr, (*C.float)(cPtr), C.int(m), C.int(k), C.int(n))
    if ret != 0 {
        return errors.New("cuda matmul q3k failed")
    }
    if err := syncIfProfiling(gpu); err != nil {
        return err
    }
    return nil
}

func MatMulQ6K(aPtr, bPtr, cPtr unsafe.Pointer, m, k, n int, gpu int) error {
    if err := syncIfProfiling(gpu); err != nil {
        return err
    }
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    if k%256 != 0 {
        return fmt.Errorf("MatMulQ6K: K must be multiple of 256, got %d", k)
    }
    ret := C.cuda_matmul_f32_q6k((*C.float)(aPtr), bPtr, (*C.float)(cPtr), C.int(m), C.int(k), C.int(n))
    if ret != 0 {
        return errors.New("cuda matmul q6k failed")
    }
    if err := syncIfProfiling(gpu); err != nil {
        return err
    }
    return nil
}
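
// Note: the Q2K/Q3K/Q6K wrappers above reject K values that are not
// multiples of 256, which matches the 256-element super-block layout of
// K-quants: each quantized row must be a whole number of super-blocks.
// For example, K=4096 gives 4096/256 = 16 super-blocks per row, while
// K=4000 cannot be dispatched.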

func MatMulF32(A, B, Cptr unsafe.Pointer, M, K, N, gpu int) error {
    if err := syncIfProfiling(gpu); err != nil {
        return err
    }
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    ret := C.cuda_matmul_f32_nt(
        (*C.float)(A),
        (*C.float)(B),
        (*C.float)(Cptr),
        C.int(M), C.int(K), C.int(N),
    )
    if ret != 0 {
        return errors.New("cuda matmul f32 failed")
    }
    if err := syncIfProfiling(gpu); err != nil {
        return err
    }
    return nil
}

// MatMulF16 performs C = A @ B^T where A and B are float16 (stored as uint16),
// and C is float32 output.
func MatMulF16(A, B, Cptr unsafe.Pointer, M, K, N, gpu int) error {
    if err := syncIfProfiling(gpu); err != nil {
        return err
    }
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    ret := C.cuda_matmul_f16_nt(
        (*C.ushort)(A),
        (*C.ushort)(B),
        (*C.float)(Cptr),
        C.int(M), C.int(K), C.int(N),
    )
    if ret != 0 {
        return errors.New("cuda matmul f16 failed")
    }
    if err := syncIfProfiling(gpu); err != nil {
        return err
    }
    return nil
}

// FP16-input MatMul variants: halving the activation width doubles the
// effective memory bandwidth for activations.
// A is FP16, B is quantized, C is FP32 output.
func MatMulF16Q8K(A, B, Cptr unsafe.Pointer, M, K, N, gpu int) error {
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    ret := C.cuda_matmul_f16_q8k(A, B, (*C.float)(Cptr), C.int(M), C.int(K), C.int(N))
    if ret != 0 {
        return errors.New("cuda matmul f16 q8k failed")
    }
    return nil
}

func MatMulF16Q4K(A, B, Cptr unsafe.Pointer, M, K, N, gpu int) error {
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    ret := C.cuda_matmul_f16_q4k(A, B, (*C.float)(Cptr), C.int(M), C.int(K), C.int(N))
    if ret != 0 {
        return errors.New("cuda matmul f16 q4k failed")
    }
    return nil
}

func MatMulF16Q5K(A, B, Cptr unsafe.Pointer, M, K, N, gpu int) error {
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    ret := C.cuda_matmul_f16_q5k(A, B, (*C.float)(Cptr), C.int(M), C.int(K), C.int(N))
    if ret != 0 {
        return errors.New("cuda matmul f16 q5k failed")
    }
    return nil
}

func MatMulF16Q2K(A, B, Cptr unsafe.Pointer, M, K, N, gpu int) error {
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    ret := C.cuda_matmul_f16_q2k(A, B, (*C.float)(Cptr), C.int(M), C.int(K), C.int(N))
    if ret != 0 {
        return errors.New("cuda matmul f16 q2k failed")
    }
    return nil
}

func MatMulF16Q3K(A, B, Cptr unsafe.Pointer, M, K, N, gpu int) error {
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    ret := C.cuda_matmul_f16_q3k(A, B, (*C.float)(Cptr), C.int(M), C.int(K), C.int(N))
    if ret != 0 {
        return errors.New("cuda matmul f16 q3k failed")
    }
    return nil
}

func MatMulF16Q6K(A, B, Cptr unsafe.Pointer, M, K, N, gpu int) error {
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    ret := C.cuda_matmul_f16_q6k(A, B, (*C.float)(Cptr), C.int(M), C.int(K), C.int(N))
    if ret != 0 {
        return errors.New("cuda matmul f16 q6k failed")
    }
    return nil
}

// UploadQ8K uploads Q8_K blocks from host to GPU.
func UploadQ8K(hostData []byte, numBlocks int, gpu int) (unsafe.Pointer, error) {
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return nil, errors.New("failed to set cuda device")
    }
    size := len(hostData)
    ptr := C.cuda_malloc(C.size_t(size))
    if ptr == nil {
        return nil, errors.New("cuda malloc failed for Q8K")
    }
    ret := C.cuda_memcpy_h2d(ptr, unsafe.Pointer(&hostData[0]), C.size_t(size))
    if ret != 0 {
        C.cuda_free(ptr)
        return nil, errors.New("cuda memcpy h2d failed for Q8K")
    }
    return ptr, nil
}

// AllocAndCopyPtrTable uploads a host table of raw device addresses (e.g. one
// per KV block) and returns the device address of the table.
func AllocAndCopyPtrTable(ptrs []uintptr, gpu int) (unsafe.Pointer, error) {
    if len(ptrs) == 0 {
        return nil, errors.New("empty ptr table")
    }
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return nil, errors.New("failed to set cuda device")
    }
    size := len(ptrs) * int(unsafe.Sizeof(uintptr(0)))
    ptr := C.cuda_malloc(C.size_t(size))
    if ptr == nil {
        return nil, errors.New("cuda malloc failed for ptr table")
    }
    ret := C.cuda_memcpy_h2d(ptr, unsafe.Pointer(&ptrs[0]), C.size_t(size))
    if ret != 0 {
        C.cuda_free(ptr)
        return nil, errors.New("cuda memcpy h2d failed for ptr table")
    }
    return ptr, nil
}
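
// Sketch: building the device-side pointer table consumed by the paged
// attention kernels. Each entry is the raw device address of one KV block
// (the blocks slice and its DevPtr field are hypothetical):
//
//     ptrs := make([]uintptr, len(blocks))
//     for i, b := range blocks {
//         ptrs[i] = uintptr(b.DevPtr)
//     }
//     table, err := AllocAndCopyPtrTable(ptrs, gpu)
//     if err != nil { /* handle */ }
//     defer FreeDevicePtr(table)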

func UploadQ5K(hostData []byte, numBlocks int, gpu int) (unsafe.Pointer, error) {
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return nil, errors.New("failed to set cuda device")
    }
    size := len(hostData)
    ptr := C.cuda_malloc(C.size_t(size))
    if ptr == nil {
        return nil, errors.New("cuda malloc failed for Q5K")
    }
    ret := C.cuda_memcpy_h2d(ptr, unsafe.Pointer(&hostData[0]), C.size_t(size))
    if ret != 0 {
        C.cuda_free(ptr)
        return nil, errors.New("cuda memcpy h2d failed for Q5K")
    }
    return ptr, nil
}

// UploadQ4K uploads Q4_K blocks from host to GPU.
func UploadQ4K(hostData []byte, numBlocks int, gpu int) (unsafe.Pointer, error) {
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return nil, errors.New("failed to set cuda device")
    }
    size := len(hostData)
    ptr := C.cuda_malloc(C.size_t(size))
    if ptr == nil {
        return nil, errors.New("cuda malloc failed for Q4K")
    }
    ret := C.cuda_memcpy_h2d(ptr, unsafe.Pointer(&hostData[0]), C.size_t(size))
    if ret != 0 {
        C.cuda_free(ptr)
        return nil, errors.New("cuda memcpy h2d failed for Q4K")
    }
    return ptr, nil
}

// UploadQ2K uploads Q2_K blocks from host to GPU.
func UploadQ2K(hostData []byte, numBlocks int, gpu int) (unsafe.Pointer, error) {
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return nil, errors.New("failed to set cuda device")
    }
    size := len(hostData)
    ptr := C.cuda_malloc(C.size_t(size))
    if ptr == nil {
        return nil, errors.New("cuda malloc failed for Q2K")
    }
    ret := C.cuda_memcpy_h2d(ptr, unsafe.Pointer(&hostData[0]), C.size_t(size))
    if ret != 0 {
        C.cuda_free(ptr)
        return nil, errors.New("cuda memcpy h2d failed for Q2K")
    }
    return ptr, nil
}

// UploadQ3K uploads Q3_K blocks from host to GPU.
func UploadQ3K(hostData []byte, numBlocks int, gpu int) (unsafe.Pointer, error) {
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return nil, errors.New("failed to set cuda device")
    }
    size := len(hostData)
    ptr := C.cuda_malloc(C.size_t(size))
    if ptr == nil {
        return nil, errors.New("cuda malloc failed for Q3K")
    }
    ret := C.cuda_memcpy_h2d(ptr, unsafe.Pointer(&hostData[0]), C.size_t(size))
    if ret != 0 {
        C.cuda_free(ptr)
        return nil, errors.New("cuda memcpy h2d failed for Q3K")
    }
    return ptr, nil
}

// UploadQ6K uploads Q6_K blocks from host to GPU.
func UploadQ6K(hostData []byte, numBlocks int, gpu int) (unsafe.Pointer, error) {
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return nil, errors.New("failed to set cuda device")
    }
    size := len(hostData)
    ptr := C.cuda_malloc(C.size_t(size))
    if ptr == nil {
        return nil, errors.New("cuda malloc failed for Q6K")
    }
    ret := C.cuda_memcpy_h2d(ptr, unsafe.Pointer(&hostData[0]), C.size_t(size))
    if ret != 0 {
        C.cuda_free(ptr)
        return nil, errors.New("cuda memcpy h2d failed for Q6K")
    }
    return ptr, nil
}

// MemcpyH2D copies data from host to a device pointer.
// dst: device pointer
// src: host data (unsafe.Pointer to the first element)
// size: number of bytes
// gpu: device ordinal; it is selected via cuda_set_device before the copy
func MemcpyH2D(dst, src unsafe.Pointer, size uintptr, gpu int) error {
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    ret := C.cuda_memcpy_h2d(dst, src, C.size_t(size))
    if ret != 0 {
        return errors.New("cuda memcpy h2d failed")
    }
    return nil
}

// MemcpyD2H copies data from a device pointer to a host pointer.
func MemcpyD2H(dst, src unsafe.Pointer, size uintptr, gpu int) error {
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    ret := C.cuda_memcpy_d2h(dst, src, C.size_t(size))
    if ret != 0 {
        return errors.New("cuda memcpy d2h failed")
    }
    return nil
}

// MemcpyD2D copies between two device pointers on the given GPU.
func MemcpyD2D(dst, src unsafe.Pointer, size uintptr, gpu int) error {
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return errors.New("failed to set cuda device")
    }
    ret := C.cuda_memcpy_d2d(dst, src, C.size_t(size))
    if ret != 0 {
        return errors.New("cuda memcpy d2d failed")
    }
    return nil
}

// TopKLogitsF32 computes per-block top-k on GPU (with repetition penalty applied)
// and returns the concatenated candidate list on host (caller does final global top-k).
func TopKLogitsF32(logits unsafe.Pointer, vocab int, repIDs []int32, repPenalty float32, k int, gpu int) ([]int32, []float32, int, error) {
    if k <= 0 {
        return nil, nil, 0, nil
    }
    if k > 64 {
        return nil, nil, 0, fmt.Errorf("TopKLogitsF32: k too large: %d", k)
    }
    // Select the device before allocating scratch buffers so they land on the
    // requested GPU.
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return nil, nil, 0, errors.New("failed to set cuda device")
    }
    blocks := (vocab + 2048 - 1) / 2048
    if blocks <= 0 {
        blocks = 1
    }
    count := blocks * k
    var repPtr unsafe.Pointer
    if len(repIDs) > 0 {
        p, err := AllocAndCopyInt32(repIDs, gpu)
        if err != nil {
            return nil, nil, 0, err
        }
        repPtr = p
        defer FreeDevicePtr(repPtr)
    }
    // Device outputs.
    outIDsPtr := C.cuda_malloc(C.size_t(count * 4))
    if outIDsPtr == nil {
        return nil, nil, 0, errors.New("TopKLogitsF32: cuda malloc failed for outIDs")
    }
    defer C.cuda_free(outIDsPtr)
    outScoresPtr := C.cuda_malloc(C.size_t(count * 4))
    if outScoresPtr == nil {
        return nil, nil, 0, errors.New("TopKLogitsF32: cuda malloc failed for outScores")
    }
    defer C.cuda_free(outScoresPtr)
    ret := C.cuda_topk_logits_f32(
        (*C.float)(logits),
        C.int(vocab),
        (*C.int)(repPtr),
        C.int(len(repIDs)),
        C.float(repPenalty),
        C.int(k),
        (*C.int)(outIDsPtr),
        (*C.float)(outScoresPtr),
    )
    if ret != 0 {
        return nil, nil, 0, errors.New("cuda topk logits failed")
    }
    ids := make([]int32, count)
    scores := make([]float32, count)
    if err := MemcpyD2H(unsafe.Pointer(&ids[0]), unsafe.Pointer(outIDsPtr), uintptr(count*4), gpu); err != nil {
        return nil, nil, 0, err
    }
    if err := MemcpyD2H(unsafe.Pointer(&scores[0]), unsafe.Pointer(outScoresPtr), uintptr(count*4), gpu); err != nil {
        return nil, nil, 0, err
    }
    return ids, scores, blocks, nil
}
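
// Sketch of the caller-side reduction: the kernel returns blocks*k candidates
// (k per 2048-logit block), and the host keeps the best k overall. Values are
// illustrative; the final selection loop is elided:
//
//     ids, scores, blocks, err := TopKLogitsF32(logits, vocab, repIDs, 1.1, 40, 0)
//     if err != nil { /* handle */ }
//     // sort the blocks*k (ids, scores) pairs by score and keep the top 40
//     _ = ids; _ = scores; _ = blocks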

// FreeDevicePtr frees a device pointer.
func FreeDevicePtr(ptr unsafe.Pointer) {
    if ptr != nil {
        C.cuda_free(ptr)
    }
}

// Free is an alias for FreeDevicePtr for convenience.
func Free(ptr unsafe.Pointer) {
    FreeDevicePtr(ptr)
}

// AllocAndCopyInt32 allocates GPU memory and copies int32 data to it.
// Returns a raw device pointer (caller must Free it).
func AllocAndCopyInt32(data []int32, gpu int) (unsafe.Pointer, error) {
    if len(data) == 0 {
        return nil, errors.New("empty data")
    }
    if ret := C.cuda_set_device(C.int(gpu)); ret != 0 {
        return nil, errors.New("failed to set cuda device")
    }
    size := len(data) * 4 // 4 bytes per int32
    ptr := C.cuda_malloc(C.size_t(size))
    if ptr == nil {
        return nil, errors.New("cuda malloc failed for int32 data")
    }
    ret := C.cuda_memcpy_h2d(ptr, unsafe.Pointer(&data[0]), C.size_t(size))
    if ret != 0 {
        C.cuda_free(ptr)
        return nil, errors.New("cuda memcpy h2d failed for int32 data")
    }
    return ptr, nil
}