|
|
@@ -2571,5 +2571,43 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
}
|
|
|
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
|
|
|
|
|
+ add_opt(common_arg(
|
|
|
+ {"--fim-qwen-7b-spec"},
|
|
|
+ string_format("use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
|
|
|
+ [](common_params & params) {
|
|
|
+ params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
|
|
|
+ params.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
|
|
|
+ params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
|
|
|
+ params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
|
|
|
+ params.speculative.n_gpu_layers = 99;
|
|
|
+ params.port = 8012;
|
|
|
+ params.n_gpu_layers = 99;
|
|
|
+ params.flash_attn = true;
|
|
|
+ params.n_ubatch = 1024;
|
|
|
+ params.n_batch = 1024;
|
|
|
+ params.n_ctx = 0;
|
|
|
+ params.n_cache_reuse = 256;
|
|
|
+ }
|
|
|
+ ).set_examples({LLAMA_EXAMPLE_SERVER}));
|
|
|
+
|
|
|
+ add_opt(common_arg(
|
|
|
+ {"--fim-qwen-14b-spec"},
|
|
|
+ string_format("use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
|
|
|
+ [](common_params & params) {
|
|
|
+ params.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF";
|
|
|
+ params.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
|
|
|
+ params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
|
|
|
+ params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
|
|
|
+ params.speculative.n_gpu_layers = 99;
|
|
|
+ params.port = 8012;
|
|
|
+ params.n_gpu_layers = 99;
|
|
|
+ params.flash_attn = true;
|
|
|
+ params.n_ubatch = 1024;
|
|
|
+ params.n_batch = 1024;
|
|
|
+ params.n_ctx = 0;
|
|
|
+ params.n_cache_reuse = 256;
|
|
|
+ }
|
|
|
+ ).set_examples({LLAMA_EXAMPLE_SERVER}));
|
|
|
+
|
|
|
return ctx_arg;
|
|
|
}
|