# cturan/makarna
# Mirror of https://github.com/cturan/makarna

							.PHONY: all test test-gen clean build build-cuda cuda-lib quantize

# CUDA Configuration
# Every variable here is assigned with `?=`, so a value supplied by the
# environment or on the make command line takes precedence.
# CUDA_HOME: root directory of the CUDA toolkit.
CUDA_HOME ?= /usr/local/cuda
# NVCC: CUDA compiler driver used to compile the kernels and link the shared library.
NVCC ?= $(CUDA_HOME)/bin/nvcc
# CUDA_LIB_PATH: directory passed to the linker (-L) and added to
# LD_LIBRARY_PATH so libcudart can be found.
CUDA_LIB_PATH ?= $(CUDA_HOME)/lib64

# Flags given to every `go build` in this file: -trimpath removes local file
# system paths from the binary; -ldflags="-s -w" omits the symbol table and
# DWARF debug information.
GO_BUILD_FLAGS ?= -trimpath -ldflags="-s -w"

# Build directories
# Kernel sources are read from CUDA_SRC_DIR; objects and libraries built
# from them are written to CUDA_BUILD_DIR.
CUDA_SRC_DIR = pkg/backend/cuda
CUDA_BUILD_DIR = build/cuda

# Base names of the kernel translation units (one .cu file each).
cuda_kernel_units = \
	cuda_memory \
	cuda_elementwise \
	cuda_dequant_q8k \
	cuda_dequant_q4k \
	cuda_dequant_q5k \
	cuda_dequant_other \
	cuda_matmul \
	cuda_nn

# Object files, one per unit, located in the build directory.
CUDA_OBJ = $(patsubst %,$(CUDA_BUILD_DIR)/%.o,$(cuda_kernel_units))

# Library outputs: the static archive linked into the Go binaries and the
# legacy shared object.
CUDA_STATIC_LIB = $(CUDA_BUILD_DIR)/libmakarna_cuda.a
CUDA_SHARED_LIB = $(CUDA_BUILD_DIR)/libmakarna_cuda.so

# Default goal: a bare `make` builds the CPU-only binaries.
all: build

# Build CPU-only binaries
# Produces bin/makarna (from ./cmd/run-model), bin/quantize and bin/convert.
# No build tags are passed, so the CUDA backend is not compiled in.
build:
	go build $(GO_BUILD_FLAGS) -o bin/makarna ./cmd/run-model
	go build $(GO_BUILD_FLAGS) -o bin/quantize ./cmd/quantize
	go build $(GO_BUILD_FLAGS) -o bin/convert ./cmd/convert
	

# Build CUDA-enabled binaries with static linking of our code
# libmakarna_cuda is linked statically (-Wl,-Bstatic); the CUDA runtime
# (libcudart) and libstdc++ are linked dynamically (-Wl,-Bdynamic).
# The two CGO_* assignments are joined to the first `go build` by line
# continuations, so they apply only to the `-tags cuda` build that produces
# bin/makarna-cuda. bin/quantize and bin/convert are built without the tag.
build-cuda: cuda-static-lib
	CGO_LDFLAGS="-L$(CURDIR)/$(CUDA_BUILD_DIR) -L$(CUDA_LIB_PATH) -Wl,-Bstatic -lmakarna_cuda -Wl,-Bdynamic -lcudart -lstdc++" \
	CGO_CFLAGS="-I$(CURDIR)/$(CUDA_SRC_DIR)" \
	go build $(GO_BUILD_FLAGS) -tags cuda -o bin/makarna-cuda ./cmd/run-model
	go build $(GO_BUILD_FLAGS) -o bin/quantize ./cmd/quantize
	go build $(GO_BUILD_FLAGS) -o bin/convert ./cmd/convert
	@echo "CUDA build complete. Run with: ./bin/makarna-cuda"

# Compile CUDA kernels into static library
# `cuda-static-lib` is the convenience name; the archive is the real file
# target, so it is rebuilt only when one of the objects is newer than it.
cuda-static-lib: $(CUDA_STATIC_LIB)

# The archive is deleted before it is recreated: `ar r` only adds or replaces
# members, so an object that has been dropped from CUDA_OBJ would otherwise
# stay inside an existing library.
$(CUDA_STATIC_LIB): $(CUDA_OBJ)
	@echo "Creating static library..."
	rm -f $@
	ar rcs $@ $^
	@echo "Static library built: $@"


# Create the object directory on demand. The object rule lists it as an
# order-only prerequisite (after `|`), so the directory must exist but its
# timestamp never triggers a recompile.
$(CUDA_BUILD_DIR):
	mkdir -p $@

# Compile one kernel source, $(CUDA_SRC_DIR)/<name>.cu, into
# $(CUDA_BUILD_DIR)/<name>.o ($< is the .cu file, $@ the object).
# Every object also depends on kernels.h and cuda_common.cuh, so editing
# either header recompiles all kernels.
# -Xcompiler forwards -fPIC, -O3 and -DNDEBUG to the host compiler.
# The -gencode entries request machine code for sm_75, sm_80, sm_86 and sm_89.
# NOTE(review): -arch=sm_75 overlaps with the first -gencode entry — confirm
# that both are needed.
$(CUDA_BUILD_DIR)/%.o: $(CUDA_SRC_DIR)/%.cu $(CUDA_SRC_DIR)/kernels.h $(CUDA_SRC_DIR)/cuda_common.cuh | $(CUDA_BUILD_DIR)
	@echo "Compiling CUDA kernels..."
	$(NVCC) -c -Xcompiler -fPIC -Xcompiler -O3 -Xcompiler -DNDEBUG \
		-O3 \
		--use_fast_math \
		--expt-relaxed-constexpr \
		-std=c++17 \
		-arch=sm_75 \
		-gencode=arch=compute_75,code=sm_75 \
		-gencode=arch=compute_80,code=sm_80 \
		-gencode=arch=compute_86,code=sm_86 \
		-gencode=arch=compute_89,code=sm_89 \
		-o $@ $<

# Legacy: shared library (kept for compatibility)
# Links the same objects that go into the static archive into
# $(CUDA_SHARED_LIB) using nvcc. The target name is not the output file and
# is declared phony, so the link step runs on every `make cuda-lib`.
cuda-lib: $(CUDA_OBJ)
	@echo "Building CUDA shared library..."
	$(NVCC) -shared -Xcompiler -fPIC \
		-O3 \
		--use_fast_math \
		-arch=sm_75 \
		-o $(CUDA_SHARED_LIB) $(CUDA_OBJ)
# Build the CUDA binary, then run it with MODEL, PROMPT, STEPS and GPU_LAYERS.
# LD_LIBRARY_PATH is set for this one command only: the build directory and
# CUDA_LIB_PATH are placed in front of the caller's value (`$$` passes a
# literal `$` through to the shell).
run-cuda: build-cuda
	LD_LIBRARY_PATH=$(CURDIR)/$(CUDA_BUILD_DIR):$(CUDA_LIB_PATH):$$LD_LIBRARY_PATH \
	./bin/makarna-cuda -model $(MODEL) -prompt "$(PROMPT)" -chat -steps $(STEPS) -n-gpu-layers $(GPU_LAYERS)

# Default values for run-cuda
# Each can be overridden on the command line, e.g.
#   make run-cuda MODEL=path PROMPT='text' STEPS=n GPU_LAYERS=n
# MODEL: model file passed to -model. The default is an absolute path on one
# particular machine, so it normally has to be overridden.
MODEL ?= /home/ai/llama/quants/qwen3-q8.mak
# PROMPT: text passed to -prompt. The value is stored without quotes because
# the run-cuda recipe already wraps the expansion in double quotes; quotes
# inside the value would end that quoting early and split a multi-word prompt.
PROMPT ?= Hello
# STEPS: value passed to -steps.
STEPS ?= 10
# GPU_LAYERS: value passed to -n-gpu-layers.
GPU_LAYERS ?= 28

# Python interpreter used to run the scripts under scripts/.
PYTHON ?= python3

# Regenerate the golden test data with scripts/gen_test_data.py (run with the
# repository root on PYTHONPATH), then run the Go tests under ./tests and ./pkg.
test-gen:
	@echo "Generating golden test data..."
	PYTHONPATH=. $(PYTHON) scripts/gen_test_data.py
	@echo "Running tests..."
	go test -v ./tests/... ./pkg/...

# Run the Go tests under ./pkg without the `cuda` build tag.
test-cpu:
	@echo "Running CPU tests..."
	go test -v ./pkg/...

# Run the CUDA backend tests (./pkg/backend/cuda/...) with the `cuda` build tag.
# The link flags request libmakarna_cuda statically (-Wl,-Bstatic), so the
# prerequisite is the static archive rather than the legacy shared library.
# -lstdc++ is passed to match the link line used by build-cuda.
# The whole recipe after the echo is one shell command: both environment
# assignments apply only to this `go test` invocation.
test-cuda: cuda-static-lib
	@echo "Running CUDA tests..."
	CGO_LDFLAGS="-L$(CURDIR)/$(CUDA_BUILD_DIR) -L$(CUDA_LIB_PATH) -Wl,-Bstatic -lmakarna_cuda -Wl,-Bdynamic -lcudart -lstdc++ -Wl,-rpath,$(CURDIR)/$(CUDA_BUILD_DIR) -Wl,-rpath,$(CUDA_LIB_PATH)" \
	LD_LIBRARY_PATH=$(CURDIR)/$(CUDA_BUILD_DIR):$(CUDA_LIB_PATH):$$LD_LIBRARY_PATH \
	go test -tags cuda -v ./pkg/backend/cuda/...

# Run only the Go tests of the quantization package tree (./pkg/quant/...).
test-quant:
	@echo "Testing quantization functions..."
	go test -v ./pkg/quant/...

# Run every benchmark in ./pkg/quant/ (-bench=. matches all benchmark names).
bench-quant:
	@echo "Benchmarking quantization..."
	go test -bench=. ./pkg/quant/

# Remove all build artifacts: the binaries in bin/, the whole build/ tree and
# the .bin files in tests/data (presumably the golden data written by
# test-gen — confirm). The CUDA libraries are named explicitly as well, so
# they are removed even when CUDA_BUILD_DIR is overridden to a location
# outside build/.
clean:
	rm -rf bin/
	rm -rf build/
	rm -f tests/data/*.bin
	rm -f $(CUDA_STATIC_LIB) $(CUDA_SHARED_LIB)

# Remove only the CUDA libraries (static archive and legacy shared object).
# Compiled kernel objects are kept, so the next build just re-archives them.
clean-cuda:
	rm -f $(CUDA_STATIC_LIB) $(CUDA_SHARED_LIB)

# Convenience targets for model conversion
# convert-f32 runs scripts/convert_fast.py (with scripts/ on PYTHONPATH),
# passing MODEL and OUTPUT as its two arguments. OUTPUT has no default in
# this Makefile and must be supplied: make convert-f32 MODEL=... OUTPUT=...
convert-f32:
	PYTHONPATH=scripts $(PYTHON) scripts/convert_fast.py $(MODEL) $(OUTPUT)

# Quantization shortcuts, e.g. `make quantize-q4k INPUT=... OUTPUT=...`.
# The three targets share one recipe. The stem after `quantize-` (q4k, q6k or
# q8k) is rewritten to the format name handed to ./bin/quantize (q4_k, q6_k
# or q8_k). INPUT and OUTPUT have no defaults and must be supplied.
quantize-q4k quantize-q6k quantize-q8k: quantize-%:
	./bin/quantize $(INPUT) $(OUTPUT) $(patsubst q%k,q%_k,$*)

# Help
# Prints a summary of the user-facing targets, grouped by purpose. Each line
# is a silenced echo, so only the text itself is shown.
help:
	@echo "Makarna - Inference Engine"
	@echo ""
	@echo "Build targets:"
	@echo "  make build        - Build CPU-only binaries"
	@echo "  make build-cuda   - Build CUDA-enabled binaries"
	@echo "  make cuda-lib     - Build CUDA kernel library only"
	@echo ""
	@echo "Run targets:"
	@echo "  make run-cuda MODEL=path PROMPT='text' STEPS=n GPU_LAYERS=n"
	@echo ""
	@echo "Test targets:"
	@echo "  make test-gen     - Generate golden test data, then run the Go tests"
	@echo "  make test-cpu     - Run CPU tests"
	@echo "  make test-cuda    - Run CUDA tests"
	@echo "  make test-quant   - Run quantization tests"
	@echo "  make bench-quant  - Run quantization benchmarks"
	@echo ""
	@echo "Conversion targets:"
	@echo "  make convert-f32 MODEL=path OUTPUT=path"
	@echo "  make quantize-q4k INPUT=path OUTPUT=path   (also quantize-q6k, quantize-q8k)"
	@echo ""
	@echo "Clean targets:"
	@echo "  make clean        - Remove all build artifacts"
	@echo "  make clean-cuda   - Remove CUDA library only"