2 anni fa · 84e723653c
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -374,6 +374,17 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 
				 #else
			
 
				             fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
			
 
				             fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
			
 
				+#endif
			
 
				+        } else if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") {
			
 
				+            if (++i >= argc) {
			
 
				+                invalid_param = true;
			
 
				+                break;
			
 
				+            }
			
 
				+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
			
 
				+            params.n_gpu_layers_draft = std::stoi(argv[i]);
			
 
				+#else
			
 
				+            fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
			
 
				+            fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
			
 
				 #endif
			
 
				         } else if (arg == "--main-gpu" || arg == "-mg") {
			
 
				             if (++i >= argc) {
			
@@ -664,6 +675,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 
				 #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
			
 
				     printf("  -ngl N, --n-gpu-layers N\n");
			
 
				     printf("                        number of layers to store in VRAM\n");
			
 
				+    printf("  -ngld N, --n-gpu-layers-draft N\n");
			
 
				+    printf("                        number of layers to store in VRAM for the draft model\n");
			
 
				     printf("  -ts SPLIT --tensor-split SPLIT\n");
			
 
				     printf("                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
			
 
				     printf("  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
			
--- a/common/common.h
+++ b/common/common.h
@@ -38,6 +38,7 @@ struct gpt_params {
 
				     int32_t n_draft                         = 16;   // number of tokens to draft during speculative decoding
			
 
				     int32_t n_chunks                        = -1;   // max number of chunks to process (-1 = unlimited)
			
 
				     int32_t n_gpu_layers                    = -1;   // number of layers to store in VRAM (-1 - use default)
			
 
				+    int32_t n_gpu_layers_draft              = -1;   // number of layers to store in VRAM for the draft model (-1 - use default)
			
 
				     int32_t main_gpu                        = 0;    // the GPU that is used for scratch and small tensors
			
 
				     float   tensor_split[LLAMA_MAX_DEVICES] = {0};  // how split tensors should be distributed across GPUs
			
 
				     int32_t n_probs                         = 0;    // if greater than 0, output the probabilities of top n_probs tokens.
			
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -42,6 +42,7 @@ int main(int argc, char ** argv) {
 
				 
			
 
				     // load the draft model
			
 
				     params.model = params.model_draft;
			
 
				+    params.n_gpu_layers = params.n_gpu_layers_draft;
			
 
				     std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);
			
 
				 
			
 
				     // tokenize the prompt