2 years ago · 11f3ca06b8
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -67,7 +67,9 @@ endif()
 
				 option(LLAMA_ACCELERATE                      "llama: enable Accelerate framework"               ON)
			
 
				 option(LLAMA_BLAS                            "llama: use BLAS"                                  OFF)
			
 
				 set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
			
 
				-option(LLAMA_CUBLAS                          "llama: use cuBLAS"                                OFF)
			
 
				+option(LLAMA_CUBLAS                          "llama: use CUDA"                                  OFF)
			
 
				+option(LLAMA_CUDA_CUBLAS                     "llama: use cuBLAS for prompt processing"          OFF)
			
 
				+set(LLAMA_CUDA_MMQ_Y       "64" CACHE STRING "llama: y tile size for mmq CUDA kernels")
			
 
				 option(LLAMA_CUDA_FORCE_DMMV                 "llama: use dmmv instead of mmvq CUDA kernels"     OFF)
			
 
				 set(LLAMA_CUDA_DMMV_X      "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
			
 
				 set(LLAMA_CUDA_MMV_Y        "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
			
@@ -251,6 +253,10 @@ if (LLAMA_CUBLAS)
 
				         set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)
			
 
				 
			
 
				         add_compile_definitions(GGML_USE_CUBLAS)
			
 
				+        if (LLAMA_CUDA_CUBLAS)
			
 
				+            add_compile_definitions(GGML_CUDA_CUBLAS)
			
 
				+        endif()
			
 
				+        add_compile_definitions(GGML_CUDA_MMQ_Y=${LLAMA_CUDA_MMQ_Y})
			
 
				         if (LLAMA_CUDA_FORCE_DMMV)
			
 
				             add_compile_definitions(GGML_CUDA_FORCE_DMMV)
			
 
				         endif()
			
--- a/Makefile
+++ b/Makefile
@@ -194,7 +194,7 @@ ifdef LLAMA_CUBLAS
 
				 	CXXFLAGS  += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
			
 
				 	LDFLAGS   += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
			
 
				 	OBJS      += ggml-cuda.o
			
 
				-	NVCCFLAGS = --forward-unknown-to-host-compiler
			
 
				+	NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math
			
 
				 ifdef LLAMA_CUDA_NVCC
			
 
				 	NVCC = $(LLAMA_CUDA_NVCC)
			
 
				 else
			
@@ -220,14 +220,25 @@ else ifdef LLAMA_CUDA_DMMV_Y
 
				 else
			
 
				 	NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
			
 
				 endif # LLAMA_CUDA_MMV_Y
			
 
				+ifdef LLAMA_CUDA_F16
			
 
				+	NVCCFLAGS += -DGGML_CUDA_F16
			
 
				+endif # LLAMA_CUDA_F16
			
 
				 ifdef LLAMA_CUDA_DMMV_F16
			
 
				-	NVCCFLAGS += -DGGML_CUDA_DMMV_F16
			
 
				+	NVCCFLAGS += -DGGML_CUDA_F16
			
 
				 endif # LLAMA_CUDA_DMMV_F16
			
 
				 ifdef LLAMA_CUDA_KQUANTS_ITER
			
 
				 	NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
			
 
				 else
			
 
				 	NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
			
 
				 endif
			
 
				+ifdef LLAMA_CUDA_MMQ_Y
			
 
				+	NVCCFLAGS += -DGGML_CUDA_MMQ_Y=$(LLAMA_CUDA_MMQ_Y)
			
 
				+else
			
 
				+	NVCCFLAGS += -DGGML_CUDA_MMQ_Y=64
			
 
				+endif # LLAMA_CUDA_MMQ_Y
			
 
				+ifdef LLAMA_CUDA_CUBLAS
			
 
				+	NVCCFLAGS += -DGGML_CUDA_CUBLAS
			
 
				+endif # LLAMA_CUDA_CUBLAS
			
 
				 ifdef LLAMA_CUDA_CCBIN
			
 
				 	NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
			
 
				 endif
			
--- a/README.md
+++ b/README.md
@@ -402,10 +402,12 @@ Building the program with BLAS support may lead to some performance improvements
 
				 
			
 
				   | Option                  | Legal values           | Default | Description |
			
 
				   |-------------------------|------------------------|---------|-------------|
			
 
				+  | LLAMA_CUDA_CUBLAS       | Boolean                |   false | Use cuBLAS instead of custom CUDA kernels for prompt processing. Faster for all quantization formats except q4_0 and q8_0; the speedup is largest for k-quants. Increases VRAM usage (700 MiB for 7b, 970 MiB for 13b, 1430 MiB for 33b). |
			
 
				+  | LLAMA_CUDA_MMQ_Y        | Positive integer >= 32 |      64 | Tile size in y direction when using the custom CUDA kernels for prompt processing. Higher values can be faster depending on the amount of shared memory available. Power of 2 heavily recommended. |
			
 
				   | LLAMA_CUDA_FORCE_DMMV   | Boolean                |   false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
			
 
				   | LLAMA_CUDA_DMMV_X       | Positive integer >= 32 |      32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
			
 
				-  | LLAMA_CUDA_MMV_Y       | Positive integer       |       1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
			
 
				-  | LLAMA_CUDA_DMMV_F16     | Boolean                |   false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels. Can improve performance on relatively recent GPUs. |
			
 
				+  | LLAMA_CUDA_MMV_Y        | Positive integer       |       1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
			
 
				+  | LLAMA_CUDA_F16          | Boolean                |   false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix-matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
			
 
				   | LLAMA_CUDA_KQUANTS_ITER | 1 or 2                 |       2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
			
 
				 
			
 
				 - #### CLBlast
			
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu