|
|
@@ -374,6 +374,17 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
|
|
#else
|
|
|
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
|
|
|
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
|
|
+#endif
|
|
|
+ } else if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") {
|
|
|
+ if (++i >= argc) {
|
|
|
+ invalid_param = true;
|
|
|
+ break;
|
|
|
+ }
|
|
|
+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
|
|
|
+ params.n_gpu_layers_draft = std::stoi(argv[i]);
|
|
|
+#else
|
|
|
+ fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
|
|
|
+ fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
|
|
#endif
|
|
|
} else if (arg == "--main-gpu" || arg == "-mg") {
|
|
|
if (++i >= argc) {
|
|
|
@@ -664,6 +675,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
|
|
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
|
|
|
printf(" -ngl N, --n-gpu-layers N\n");
|
|
|
printf(" number of layers to store in VRAM\n");
|
|
|
+ printf(" -ngld N, --n-gpu-layers-draft N\n");
|
|
|
+ printf(" number of layers to store in VRAM for the draft model\n");
|
|
|
printf(" -ts SPLIT --tensor-split SPLIT\n");
|
|
|
printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
|
|
|
printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
|