2 years ago · c63bb1d16a
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -387,11 +387,11 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 
				 #else
			
 
				             fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
			
 
				 #endif // GGML_USE_CUBLAS
			
 
				-        } else if (arg == "--mul-mat-q" || arg == "-mmq") {
			
 
				+        } else if (arg == "--no-mul-mat-q" || arg == "-nommq") {
			
 
				 #ifdef GGML_USE_CUBLAS
			
 
				-            params.mul_mat_q = true;
			
 
				+            params.mul_mat_q = false;
			
 
				 #else
			
 
				-            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to use mul_mat_q kernels.\n");
			
 
				+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n");
			
 
				 #endif // GGML_USE_CUBLAS
			
 
				         } else if (arg == "--low-vram" || arg == "-lv") {
			
 
				 #ifdef GGML_USE_CUBLAS
			
@@ -599,11 +599,11 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 
				     fprintf(stdout, "                        number of layers to store in VRAM\n");
			
 
				     fprintf(stdout, "  -ts SPLIT --tensor-split SPLIT\n");
			
 
				     fprintf(stdout, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
			
 
				-    fprintf(stdout, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n" );
			
 
				-    fprintf(stdout, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n" );
			
 
				-    fprintf(stdout, "  -mmq, --mul-mat-q     use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n" );
			
 
				-    fprintf(stdout, "                        Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n" );
			
 
				-    fprintf(stdout, "                        is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n" );
			
 
				+    fprintf(stdout, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
			
 
				+    fprintf(stdout, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n");
			
 
				+    fprintf(stdout, "  -nommq, --no-mul-mat-q\n");
			
 
				+    fprintf(stdout, "                        use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
			
 
				+    fprintf(stdout, "                        Not recommended since this is both slower and uses more VRAM.\n");
			
 
				 #endif
			
 
				     fprintf(stdout, "  --mtest               compute maximum memory usage\n");
			
 
				     fprintf(stdout, "  --export              export the computation graph to 'llama.ggml'\n");
			
--- a/common/common.h
+++ b/common/common.h
@@ -68,7 +68,7 @@ struct gpt_params {
 
				     size_t hellaswag_tasks = 400;   // number of tasks to use when computing the HellaSwag score
			
 
				 
			
 
				     bool low_vram          = false; // if true, reduce VRAM usage at the cost of performance
			
 
				-    bool mul_mat_q         = false; // if true, use experimental mul_mat_q kernels
			
 
				+    bool mul_mat_q         = true;  // if true, use mul_mat_q kernels instead of cuBLAS
			
 
				     bool memory_f16        = true;  // use f16 instead of f32 for memory kv
			
 
				     bool random_prompt     = false; // do not randomize prompt if none provided
			
 
				     bool use_color         = false; // use color to distinguish generations and inputs
			
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -671,12 +671,11 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
 
				     fprintf(stdout, "                        number of layers to store in VRAM\n");
			
 
				     fprintf(stdout, "  -ts SPLIT --tensor-split SPLIT\n");
			
 
				     fprintf(stdout, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
			
 
				-    fprintf(stdout, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
			
 
				     fprintf(stdout, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
			
 
				     fprintf(stdout, "  -lv, --low-vram don't allocate VRAM scratch buffer\n");
			
 
				-    fprintf(stdout, "  -mmq, --mul-mat-q     use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n" );
			
 
				-    fprintf(stdout, "                        Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n" );
			
 
				-    fprintf(stdout, "                        is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n" );
			
 
				+    fprintf(stdout, "  -nommq, --no-mul-mat-q\n");
			
 
				+    fprintf(stdout, "                        use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
			
 
				+    fprintf(stdout, "                        Not recommended since this is both slower and uses more VRAM.\n");
			
 
				 #endif
			
 
				     fprintf(stdout, "  -m FNAME, --model FNAME\n");
			
 
				     fprintf(stdout, "                        model path (default: %s)\n", params.model.c_str());
			
@@ -867,12 +866,12 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
 
				             LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n", {});
			
 
				 #endif // GGML_USE_CUBLAS
			
 
				         }
			
 
				-        else if (arg == "--mul-mat-q" || arg == "-mmq")
			
 
				+        else if (arg == "--no-mul-mat-q" || arg == "-nommq")
			
 
				         {
			
 
				 #ifdef GGML_USE_CUBLAS
			
 
				-            params.mul_mat_q = true;
			
 
				+            params.mul_mat_q = false;
			
 
				 #else
			
 
				-            LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to use mul_mat_q kernels.\n", {});
			
 
				+            LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n", {});
			
 
				 #endif // GGML_USE_CUBLAS
			
 
				         }
			
 
				         else if (arg == "--main-gpu" || arg == "-mg")
			
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -287,7 +287,7 @@ static int g_device_count = -1;
 
				 static int g_main_device = 0;
			
 
				 static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
			
 
				 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
			
 
				-static bool g_mul_mat_q = false;
			
 
				+static bool g_mul_mat_q = true;
			
 
				 
			
 
				 static void * g_scratch_buffer = nullptr;
			
 
				 static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default