1 gadu atpakaļ · 42c76d1358
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -251,6 +251,57 @@ int32_t cpu_get_num_math() {
 
															     return cpu_get_num_physical_cores();
														
 
															 }
														
 
															+// Helper for setting process priority
														
 
															+
														
 
															+#if defined(_WIN32)
														
 
															+
														
 
															+bool set_process_priority(enum ggml_sched_priority prio) {
														
 
															+    if (prio == GGML_SCHED_PRIO_NORMAL) {
														
 
															+        return true;
														
 
															+    }
														
 
															+
														
 
															+    DWORD p = NORMAL_PRIORITY_CLASS;
														
 
															+    switch (prio) {
														
 
															+        case GGML_SCHED_PRIO_NORMAL:   p = NORMAL_PRIORITY_CLASS;       break;
														
 
															+        case GGML_SCHED_PRIO_MEDIUM:   p = ABOVE_NORMAL_PRIORITY_CLASS; break;
														
 
															+        case GGML_SCHED_PRIO_HIGH:     p = HIGH_PRIORITY_CLASS;         break;
														
 
															+        case GGML_SCHED_PRIO_REALTIME: p = REALTIME_PRIORITY_CLASS;     break;
														
 
															+    }
														
 
															+
														
 
															+    if (!SetPriorityClass(GetCurrentProcess(), p)) {
														
 
															+        fprintf(stderr, "warn: failed to set process priority class %d : (%d)\n", prio, (int) GetLastError());
														
 
															+        return false;
														
 
															+    }
														
 
															+
														
 
															+    return true;
														
 
															+}
														
 
															+
														
 
															+#else // MacOS and POSIX
														
 
															+#include <sys/types.h>
														
 
															+#include <sys/resource.h>
														
 
															+
														
 
															+bool set_process_priority(enum ggml_sched_priority prio) {
														
 
															+    if (prio == GGML_SCHED_PRIO_NORMAL) {
														
 
															+        return true;
														
 
															+    }
														
 
															+
														
 
															+    int p = 0;
														
 
															+    switch (prio) {
														
 
															+        case GGML_SCHED_PRIO_NORMAL:   p =  0;  break;
														
 
															+        case GGML_SCHED_PRIO_MEDIUM:   p = -5;  break;
														
 
															+        case GGML_SCHED_PRIO_HIGH:     p = -10; break;
														
 
															+        case GGML_SCHED_PRIO_REALTIME: p = -20; break;
														
 
															+    }
														
 
															+
														
 
															+    if (!setpriority(PRIO_PROCESS, 0, p)) {
														
 
															+        fprintf(stderr, "warn: failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
														
 
															+        return false;
														
 
															+    }
														
 
															+    return true;
														
 
															+}
														
 
															+
														
 
															+#endif
														
 
															+
														
 
															 //
														
 
															 // CLI argument parsing
														
 
															 //
														
@@ -277,6 +328,30 @@ void gpt_params_handle_model_default(gpt_params & params) {
 
															     }
														
 
															 }
														
 
															+void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) {
														
 
															+    int32_t n_set = 0;
														
 
															+
														
 
															+    if (cpuparams.n_threads < 0) {
														
 
															+        // Assuming everything about cpuparams is invalid
														
 
															+        if (role_model != nullptr) {
														
 
															+            cpuparams = *role_model;
														
 
															+        } else {
														
 
															+            cpuparams.n_threads = cpu_get_num_math();
														
 
															+        }
														
 
															+    }
														
 
															+
														
 
															+    for (int32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
														
 
															+        if (cpuparams.cpumask[i]) {
														
 
															+            n_set++;
														
 
															+        }
														
 
															+    }
														
 
															+
														
 
															+    if (n_set && n_set < cpuparams.n_threads) {
														
 
															+        // Not enough set bits, may experience performance issues.
														
 
															+        fprintf(stderr, "warn: Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
														
 
															+    }
														
 
															+}
														
 
															+
														
 
															 bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
														
 
															     bool invalid_param = false;
														
 
															     std::string arg;
														
@@ -296,6 +371,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
 
															         }
														
 
															     }
														
 
															+    postprocess_cpu_params(params.cpuparams, nullptr);
														
 
															+    postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams);
														
 
															+    postprocess_cpu_params(params.draft_cpuparams, &params.cpuparams);
														
 
															+    postprocess_cpu_params(params.draft_cpuparams_batch, &params.cpuparams_batch);
														
 
															+
														
 
															     if (params.prompt_cache_all && (params.interactive || params.interactive_first)) {
														
 
															         throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
														
 
															     }
														
@@ -331,7 +411,7 @@ void gpt_params_parse_from_env(gpt_params & params) {
 
															     get_env("LLAMA_ARG_MODEL_ALIAS",      params.model_alias);
														
 
															     get_env("LLAMA_ARG_HF_REPO",          params.hf_repo);
														
 
															     get_env("LLAMA_ARG_HF_FILE",          params.hf_file);
														
 
															-    get_env("LLAMA_ARG_THREADS",          params.n_threads);
														
 
															+    get_env("LLAMA_ARG_THREADS",          params.cpuparams.n_threads);
														
 
															     get_env("LLAMA_ARG_CTX_SIZE",         params.n_ctx);
														
 
															     get_env("LLAMA_ARG_N_PARALLEL",       params.n_parallel);
														
 
															     get_env("LLAMA_ARG_BATCH",            params.n_batch);
														
@@ -368,6 +448,79 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 
															     return true;
														
 
															 }
														
 
															+bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) {
														
 
															+    size_t dash_loc = range.find('-');
														
 
															+    if (dash_loc == std::string::npos) {
														
 
															+        fprintf(stderr, "Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
														
 
															+        return false;
														
 
															+    }
														
 
															+
														
 
															+    size_t start_i;
														
 
															+    size_t end_i;
														
 
															+
														
 
															+    if (dash_loc == 0) {
														
 
															+        start_i = 0;
														
 
															+    } else {
														
 
															+        start_i = std::stoull(range.substr(0, dash_loc));
														
 
															+        if (start_i >= GGML_MAX_N_THREADS) {
														
 
															+            fprintf(stderr, "Start index out of bounds!\n");
														
 
															+            return false;
														
 
															+        }
														
 
															+    }
														
 
															+
														
 
															+    if (dash_loc == range.length() - 1) {
														
 
															+        end_i = GGML_MAX_N_THREADS - 1;
														
 
															+    } else {
														
 
															+        end_i = std::stoull(range.substr(dash_loc + 1));
														
 
															+        if (end_i >= GGML_MAX_N_THREADS) {
														
 
															+            fprintf(stderr, "End index out of bounds!\n");
														
 
															+            return false;
														
 
															+        }
														
 
															+    }
														
 
															+
														
 
															+    for (size_t i = start_i; i <= end_i; i++) {
														
 
															+        boolmask[i] = true;
														
 
															+    }
														
 
															+
														
 
															+    return true;
														
 
															+}
														
 
															+
														
 
															+bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREADS]) {
														
 
															+    // Discard potential 0x prefix
														
 
															+    size_t start_i = 0;
														
 
															+    if (mask.length() >= 2 && mask.substr(0, 2) == "0x") {
														
 
															+        start_i = 2;
														
 
															+    }
														
 
															+
														
 
															+    size_t num_digits = mask.length() - start_i;
														
 
															+    if (num_digits > 128) num_digits = 128;
														
 
															+
														
 
															+    size_t end_i = num_digits + start_i;
														
 
															+
														
 
															+    for (size_t i = start_i, n = (num_digits*4 - 1); i < end_i; i++, n-=4) {
														
 
															+        char c = mask.at(i);
														
 
															+        int8_t id = c;
														
 
															+
														
 
															+        if ((c >= '0' && c <= '9')) {
														
 
															+            id -= '0';
														
 
															+        } else if (c >= 'a' && c <= 'f') {
														
 
															+            id -= 'a' - 10;
														
 
															+        } else if (c >= 'A' && c <= 'F') {
														
 
															+            id -= 'A' - 10;
														
 
															+        } else {
														
 
															+            fprintf(stderr, "Invalid hex character '%c' at position %d\n", c, int32_t(i));
														
 
															+            return false;
														
 
															+        }
														
 
															+
														
 
															+        boolmask[  n  ] = boolmask[  n  ] || ((id & 8) != 0);
														
 
															+        boolmask[n - 1] = boolmask[n - 1] || ((id & 4) != 0);
														
 
															+        boolmask[n - 2] = boolmask[n - 2] || ((id & 2) != 0);
														
 
															+        boolmask[n - 3] = boolmask[n - 3] || ((id & 1) != 0);
														
 
															+    }
														
 
															+
														
 
															+    return true;
														
 
															+}
														
 
															+
														
 
															 #define CHECK_ARG if (++i >= argc) { invalid_param = true; return true; }
														
 
															 bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
														
@@ -384,36 +537,142 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
 
															     }
														
 
															     if (arg == "-t" || arg == "--threads") {
														
 
															         CHECK_ARG
														
 
															-        params.n_threads = std::stoi(argv[i]);
														
 
															-        if (params.n_threads <= 0) {
														
 
															-            params.n_threads = std::thread::hardware_concurrency();
														
 
															+        params.cpuparams.n_threads = std::stoi(argv[i]);
														
 
															+        if (params.cpuparams.n_threads <= 0) {
														
 
															+            params.cpuparams.n_threads = std::thread::hardware_concurrency();
														
 
															         }
														
 
															         return true;
														
 
															     }
														
 
															+    if (arg == "-C" || arg == "--cpu-mask") {
														
 
															+        CHECK_ARG
														
 
															+        std::string mask = argv[i];
														
 
															+        params.cpuparams.mask_valid = true;
														
 
															+        invalid_param = !parse_cpu_mask(mask, params.cpuparams.cpumask);
														
 
															+        return true;
														
 
															+    }
														
 
															+    if (arg == "-Cr" || arg == "--cpu-range") {
														
 
															+        CHECK_ARG
														
 
															+        std::string range = argv[i];
														
 
															+        params.cpuparams.mask_valid = true;
														
 
															+        invalid_param = !parse_cpu_range(range, params.cpuparams.cpumask);
														
 
															+        return true;
														
 
															+    }
														
 
															+    if (arg == "--prio") {
														
 
															+        CHECK_ARG
														
 
															+        params.cpuparams.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
														
 
															+        return true;
														
 
															+    }
														
 
															+    if (arg == "--cpu-strict") {
														
 
															+        CHECK_ARG
														
 
															+        params.cpuparams.strict_cpu = std::stoul(argv[i]);
														
 
															+        return true;
														
 
															+    }
														
 
															+    if (arg == "--poll") {
														
 
															+        CHECK_ARG
														
 
															+        params.cpuparams.poll = std::stoul(argv[i]);
														
 
															+        return true;
														
 
															+    }
														
 
															     if (arg == "-tb" || arg == "--threads-batch") {
														
 
															         CHECK_ARG
														
 
															-        params.n_threads_batch = std::stoi(argv[i]);
														
 
															-        if (params.n_threads_batch <= 0) {
														
 
															-            params.n_threads_batch = std::thread::hardware_concurrency();
														
 
															+        params.cpuparams_batch.n_threads = std::stoi(argv[i]);
														
 
															+        if (params.cpuparams_batch.n_threads <= 0) {
														
 
															+            params.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
														
 
															         }
														
 
															         return true;
														
 
															     }
														
 
															+    if (arg == "-Cb" || arg == "--cpu-mask-batch") {
														
 
															+        CHECK_ARG
														
 
															+        std::string mask = argv[i];
														
 
															+        params.cpuparams_batch.mask_valid = true;
														
 
															+        invalid_param = !parse_cpu_mask(mask, params.cpuparams_batch.cpumask);
														
 
															+        return true;
														
 
															+    }
														
 
															+    if (arg == "-Crb" || arg == "--cpu-range_batch") {
														
 
															+        CHECK_ARG
														
 
															+        std::string range = argv[i];
														
 
															+        params.cpuparams_batch.mask_valid = true;
														
 
															+        invalid_param = !parse_cpu_range(range, params.cpuparams_batch.cpumask);
														
 
															+        return true;
														
 
															+    }
														
 
															+    if (arg == "--prio-batch") {
														
 
															+        CHECK_ARG
														
 
															+        params.cpuparams_batch.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
														
 
															+        return true;
														
 
															+    }
														
 
															+    if (arg == "--cpu-strict-batch") {
														
 
															+        params.cpuparams_batch.strict_cpu = true;
														
 
															+        return true;
														
 
															+    }
														
 
															+    if (arg == "--poll-batch") {
														
 
															+        CHECK_ARG
														
 
															+        params.cpuparams_batch.poll = std::stoul(argv[i]);
														
 
															+        return true;
														
 
															+    }
														
 
															     if (arg == "-td" || arg == "--threads-draft") {
														
 
															         CHECK_ARG
														
 
															-        params.n_threads_draft = std::stoi(argv[i]);
														
 
															-        if (params.n_threads_draft <= 0) {
														
 
															-            params.n_threads_draft = std::thread::hardware_concurrency();
														
 
															+        params.draft_cpuparams.n_threads = std::stoi(argv[i]);
														
 
															+        if (params.draft_cpuparams.n_threads <= 0) {
														
 
															+            params.draft_cpuparams.n_threads = std::thread::hardware_concurrency();
														
 
															         }
														
 
															         return true;
														
 
															+    }
														
 
															+        if (arg == "-Cd" || arg == "--cpu-mask-draft") {
														
 
															+        CHECK_ARG
														
 
															+        std::string mask = argv[i];
														
 
															+        params.draft_cpuparams.mask_valid = true;
														
 
															+        invalid_param = !parse_cpu_mask(mask, params.draft_cpuparams.cpumask);
														
 
															+        return true;
														
 
															+    }
														
 
															+    if (arg == "-Crd" || arg == "--cpu-range-draft") {
														
 
															+        CHECK_ARG
														
 
															+        std::string range = argv[i];
														
 
															+        params.draft_cpuparams.mask_valid = true;
														
 
															+        invalid_param = !parse_cpu_range(range, params.draft_cpuparams.cpumask);
														
 
															+        return true;
														
 
															+    }
														
 
															+    if (arg == "--prio-draft") {
														
 
															+        CHECK_ARG
														
 
															+        params.draft_cpuparams.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
														
 
															+        return true;
														
 
															+    }
														
 
															+    if (arg == "--cpu-strict-draft") {
														
 
															+        params.draft_cpuparams.strict_cpu = true;
														
 
															+        return true;
														
 
															+    }
														
 
															+    if (arg == "--poll-draft") {
														
 
															+        CHECK_ARG
														
 
															+        params.draft_cpuparams.poll = std::stoul(argv[i]);
														
 
															+        return true;
														
 
															     }
														
 
															     if (arg == "-tbd" || arg == "--threads-batch-draft") {
														
 
															         CHECK_ARG
														
 
															-        params.n_threads_batch_draft = std::stoi(argv[i]);
														
 
															-        if (params.n_threads_batch_draft <= 0) {
														
 
															-            params.n_threads_batch_draft = std::thread::hardware_concurrency();
														
 
															+        params.draft_cpuparams_batch.n_threads = std::stoi(argv[i]);
														
 
															+        if (params.draft_cpuparams_batch.n_threads <= 0) {
														
 
															+            params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency();
														
 
															         }
														
 
															         return true;
														
 
															     }
														
 
															+    if (arg == "-Crbd" || arg == "--cpu-range-batch-draft") {
														
 
															+        CHECK_ARG
														
 
															+        std::string range = argv[i];
														
 
															+        params.draft_cpuparams_batch.mask_valid = true;
														
 
															+        invalid_param = !parse_cpu_range(range, params.draft_cpuparams_batch.cpumask);
														
 
															+        return true;
														
 
															+    }
														
 
															+    if (arg == "--prio-batch-draft") {
														
 
															+        CHECK_ARG
														
 
															+        params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
														
 
															+        return true;
														
 
															+    }
														
 
															+    if (arg == "--cpu-strict-batch-draft") {
														
 
															+        params.draft_cpuparams_batch.strict_cpu = true;
														
 
															+        return true;
														
 
															+    }
														
 
															+    if (arg == "--poll-batch-draft") {
														
 
															+        CHECK_ARG
														
 
															+        params.draft_cpuparams_batch.poll = std::stoul(argv[i]);
														
 
															+        return true;
														
 
															+    }
														
 
															     if (arg == "-p" || arg == "--prompt") {
														
 
															         CHECK_ARG
														
 
															         params.prompt = argv[i];
														
@@ -1498,11 +1757,40 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
 
															     options.push_back({ "*",           "       --no-display-prompt",    "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" });
														
 
															     options.push_back({ "*",           "-co,   --color",                "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" });
														
 
															     options.push_back({ "*",           "-s,    --seed SEED",            "RNG seed (default: %d, use random seed for < 0)", params.seed });
														
 
															-    options.push_back({ "*",           "-t,    --threads N",            "number of threads to use during generation (default: %d)", params.n_threads });
														
 
															+    options.push_back({ "*",           "-t,    --threads N",            "number of threads to use during generation (default: %d)", params.cpuparams.n_threads });
														
 
															     options.push_back({ "*",           "-tb,   --threads-batch N",      "number of threads to use during batch and prompt processing (default: same as --threads)" });
														
 
															     options.push_back({ "speculative", "-td,   --threads-draft N",      "number of threads to use during generation (default: same as --threads)" });
														
 
															-    options.push_back({ "speculative", "-tbd,  --threads-batch-draft N",
														
 
															-                                                                        "number of threads to use during batch and prompt processing (default: same as --threads-draft)" });
														
 
															+    options.push_back({ "speculative", "-tbd,  --threads-batch-draft N","number of threads to use during batch and prompt processing (default: same as --threads-draft)" });
														
 
															+
														
 
															+#ifndef GGML_USE_OPENMP
														
 
															+    // these options are available only with the internal threadpool
														
 
															+    options.push_back({ "*",           "-C,    --cpu-mask M",            "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")"});
														
 
															+    options.push_back({ "*",           "-Cr,   --cpu-range lo-hi",       "range of CPUs for affinity. Complements --cpu-mask"});
														
 
															+    options.push_back({ "*",           "       --cpu-strict <0|1>",      "use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu});
														
 
															+    options.push_back({ "*",           "       --priority N",            "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority});
														
 
															+    options.push_back({ "*",           "       --poll <0...100>",        "use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll});
														
 
															+
														
 
															+    options.push_back({ "*",           "-Cb,   --cpu-mask-batch M",      "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)"});
														
 
															+    options.push_back({ "*",           "-Crb,  --cpu-range-batch lo-hi", "ranges of CPUs for affinity. Complements --cpu-mask-batch"});
														
 
															+    options.push_back({ "*",           "       --cpu-strict-batch <0|1>","use strict CPU placement (default: same as --cpu-strict)"});
														
 
															+    options.push_back({ "*",           "       --priority-batch N",      "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --priority)"});
														
 
															+    options.push_back({ "*",           "       --poll-batch <0|1>",      "use polling to wait for work (default: same as --poll"});
														
 
															+
														
 
															+    options.push_back({ "speculative", "-Cd,   --cpu-mask-draft M",      "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)"});
														
 
															+    options.push_back({ "speculative", "-Crd,  --cpu-range-draft lo-hi", "Ranges of CPUs for affinity. Complements --cpu-mask-draft"});
														
 
															+    options.push_back({ "speculative", "       --cpu-strict-draft <0|1>","Use strict CPU placement for draft model (default: same as --cpu-strict)"});
														
 
															+    options.push_back({ "speculative", "       --priority-draft N",      "Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: same as --priority)"});
														
 
															+    options.push_back({ "speculative", "       --poll-draft <0|1>",      "Use polling to wait for draft model work (default: same as --poll])"});
														
 
															+
														
 
															+    options.push_back({ "speculative", "-Cbd,  --cpu-mask-batch-draft M","Draft model CPU affinity mask. Complements cpu-range-draft-batch (default: same as --cpu-mask-draft)"});
														
 
															+    options.push_back({ "speculative", "-Crbd, --cpu-range-batch-draft lo-hi",
														
 
															+                                                                         "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)"});
														
 
															+    options.push_back({ "speculative", "       --cpu-strict-batch-draft <0|1>",
														
 
															+                                                                         "Use strict CPU placement for draft model (default: --cpu-strict-draft)"});
														
 
															+    options.push_back({ "speculative", "       --priority-batch-draft N","Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --priority-draft)"});
														
 
															+    options.push_back({ "speculative", "       --poll-batch-draft <0|1>","Use polling to wait for draft model work (default: --poll-draft)"});
														
 
															+#endif // GGML_USE_OPENMP
														
 
															+
														
 
															     options.push_back({ "speculative", "       --draft N",              "number of tokens to draft for speculative decoding (default: %d)", params.n_draft });
														
 
															     options.push_back({ "speculative", "-ps,   --p-split N",            "speculative decoding split probability (default: %.1f)", (double)params.p_split });
														
 
															     options.push_back({ "*",           "-lcs,  --lookup-cache-static FNAME",
														
@@ -1774,7 +2062,6 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
 
															     options.push_back({ "export-lora", "-m,    --model",                "model path from which to load base model (default '%s')", params.model.c_str() });
														
 
															     options.push_back({ "export-lora", "       --lora FNAME",           "path to LoRA adapter  (can be repeated to use multiple adapters)" });
														
 
															     options.push_back({ "export-lora", "       --lora-scaled FNAME S",  "path to LoRA adapter with user defined scaling S  (can be repeated to use multiple adapters)" });
														
 
															-    options.push_back({ "*",           "-t,    --threads N",            "number of threads to use during computation (default: %d)", params.n_threads });
														
 
															     options.push_back({ "export-lora", "-o,    --output FNAME",         "output file (default: '%s')", params.lora_outfile.c_str() });
														
 
															     printf("usage: %s [options]\n", argv[0]);
														
@@ -1806,9 +2093,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
 
															 std::string gpt_params_get_system_info(const gpt_params & params) {
														
 
															     std::ostringstream os;
														
 
															-    os << "system_info: n_threads = " << params.n_threads;
														
 
															-    if (params.n_threads_batch != -1) {
														
 
															-        os << " (n_threads_batch = " << params.n_threads_batch << ")";
														
 
															+    os << "system_info: n_threads = " << params.cpuparams.n_threads;
														
 
															+    if (params.cpuparams_batch.n_threads != -1) {
														
 
															+        os << " (n_threads_batch = " << params.cpuparams_batch.n_threads << ")";
														
 
															     }
														
 
															 #if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
														
 
															     // TODO: windows + arm64 + mingw64
														
@@ -2332,8 +2619,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
 
															     cparams.n_seq_max         = params.n_parallel;
														
 
															     cparams.n_batch           = params.n_batch;
														
 
															     cparams.n_ubatch          = params.n_ubatch;
														
 
															-    cparams.n_threads         = params.n_threads;
														
 
															-    cparams.n_threads_batch   = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
														
 
															+    cparams.n_threads         = params.cpuparams.n_threads;
														
 
															+    cparams.n_threads_batch   = params.cpuparams_batch.n_threads == -1 ?
														
 
															+                                    params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
														
 
															     cparams.seed              = params.seed;
														
 
															     cparams.logits_all        = params.logits_all;
														
 
															     cparams.embeddings        = params.embedding;
														
@@ -2359,6 +2647,22 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
 
															     return cparams;
														
 
															 }
														
 
															+struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params) { // Builds a ggml_threadpool_params value from a cpu_params value and returns it by value.
														
 
															+    struct ggml_threadpool_params tpp;
														
 
															+    // Initialize tpp for params.n_threads first; individual fields are overridden below.
														
 
															+    ggml_threadpool_params_init(&tpp, params.n_threads); // setup the defaults
														
 
															+    // The affinity mask is copied only when mask_valid is set; otherwise tpp.cpumask keeps whatever the init call above put there.
														
 
															+    if (params.mask_valid) {
														
 
															+        std::memcpy(&tpp.cpumask, &params.cpumask, GGML_MAX_N_THREADS); // NOTE(review): the byte count is the element count, so this relies on sizeof(bool) == 1; sizeof(params.cpumask) would not — confirm tpp.cpumask has the same layout
														
 
															+    }
														
 
															+    // Priority, polling level and strict placement are always taken from params, regardless of mask_valid.
														
 
															+    tpp.prio       = params.priority;
														
 
															+    tpp.poll       = params.poll;
														
 
															+    tpp.strict_cpu = params.strict_cpu;
														
 
															+
														
 
															+    return tpp;
														
 
															+}
														
 
															+
														
 
															 #ifdef LLAMA_USE_CURL
														
 
															 static bool starts_with(const std::string & str, const std::string & prefix) {
														
@@ -3348,7 +3652,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
 
															     yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);
														
 
															     fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
														
 
															-    fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
														
 
															+    fprintf(stream, "threads: %d # default: %u\n", params.cpuparams.n_threads, std::thread::hardware_concurrency());
														
 
															     fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
														
 
															     fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
														
 
															     fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
														
--- a/common/common.h
+++ b/common/common.h
@@ -67,13 +67,18 @@ enum dimre_method {
 
															     DIMRE_METHOD_MEAN,
														
 
															 };
														
 
															+struct cpu_params { // CPU settings (thread count, affinity, priority, polling); gpt_params holds one instance each for cpuparams, cpuparams_batch, draft_cpuparams and draft_cpuparams_batch.
														
 
															+    int      n_threads                   = -1; // Number of threads. For cpuparams_batch, -1 makes llama_context_params_from_gpt_params() fall back to cpuparams.n_threads.
														
 
															+    bool     cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask. Copied into the threadpool params only when mask_valid is true.
														
 
															+    bool     mask_valid                  = false;   // Whether cpumask should be applied. Default (false): any CPU
														
 
															+    enum ggml_sched_priority  priority   = GGML_SCHED_PRIO_NORMAL;  // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
														
 
															+    bool     strict_cpu                  = false;   // Use strict CPU placement
														
 
															+    uint32_t poll                        = 50;      // Polling (busywait) level (0 - no polling, 100 - mostly polling)
														
 
															+};
														
 
															+
														
 
															 struct gpt_params {
														
 
															     uint32_t seed                 = LLAMA_DEFAULT_SEED; // RNG seed
														
 
															-    int32_t n_threads             = cpu_get_num_math();
														
 
															-    int32_t n_threads_draft       =    -1;
														
 
															-    int32_t n_threads_batch       =    -1; // number of threads to use for batch processing (-1 = use n_threads)
														
 
															-    int32_t n_threads_batch_draft =    -1;
														
 
															     int32_t n_predict             =    -1; // new tokens to predict
														
 
															     int32_t n_ctx                 =     0; // context size
														
 
															     int32_t n_batch               =  2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
														
@@ -100,6 +105,11 @@ struct gpt_params {
 
															     int32_t yarn_orig_ctx         =     0; // YaRN original context length
														
 
															     float   defrag_thold          = -1.0f; // KV cache defragmentation threshold
														
 
															+    struct cpu_params cpuparams;
														
 
															+    struct cpu_params cpuparams_batch;
														
 
															+    struct cpu_params draft_cpuparams;
														
 
															+    struct cpu_params draft_cpuparams_batch;
														
 
															+
														
 
															     ggml_backend_sched_eval_callback cb_eval = nullptr;
														
 
															     void * cb_eval_user_data                 = nullptr;
														
@@ -204,7 +214,7 @@ struct gpt_params {
 
															     int32_t port           = 8080;         // server listens on this network port
														
 
															     int32_t timeout_read   = 600;          // http read timeout in seconds
														
 
															     int32_t timeout_write  = timeout_read; // http write timeout in seconds
														
 
															-    int32_t n_threads_http = -1;           // number of threads to process HTTP requests
														
 
															+    int     n_threads_http = -1;           // number of threads to process HTTP requests (TODO: support threadpool)
														
 
															     std::string hostname      = "127.0.0.1";
														
 
															     std::string public_path   = "";
														
@@ -277,6 +287,11 @@ void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);
 
															 std::string gpt_params_get_system_info(const gpt_params & params);
														
 
															+bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
														
 
															+bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
														
 
															+void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr);
														
 
															+bool set_process_priority(enum ggml_sched_priority prio);
														
 
															+
														
 
															 //
														
 
															 // String utils
														
 
															 //
														
@@ -327,8 +342,9 @@ struct llama_init_result {
 
															 struct llama_init_result    llama_init_from_gpt_params(gpt_params & params);
														
 
															-struct llama_model_params   llama_model_params_from_gpt_params  (const gpt_params & params);
														
 
															-struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
														
 
															+struct llama_model_params     llama_model_params_from_gpt_params    (const gpt_params & params);
														
 
															+struct llama_context_params   llama_context_params_from_gpt_params  (const gpt_params & params);
														
 
															+struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
														
 
															 struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
														
 
															 struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
														
--- a/examples/baby-llama/baby-llama.cpp
+++ b/examples/baby-llama/baby-llama.cpp
@@ -18,7 +18,7 @@ constexpr float rms_norm_eps = 5e-6f;
 
															 #endif
														
 
															 static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
														
 
															-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
														
 
															+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
														
 
															     if (plan.work_size > 0) {
														
 
															         buf.resize(plan.work_size);
														
--- a/examples/benchmark/benchmark-matmult.cpp
+++ b/examples/benchmark/benchmark-matmult.cpp
@@ -21,7 +21,7 @@
 
															 #endif
														
 
															 static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
														
 
															-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
														
 
															+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
														
 
															     if (plan.work_size > 0) {
														
 
															         buf.resize(plan.work_size);
														
@@ -54,7 +54,7 @@ static void tensor_dump(const ggml_tensor * tensor, const char * name) {
 
															 #define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor)
														
 
															 struct benchmark_params_struct {
														
 
															-    int32_t n_threads     = 1;
														
 
															+    int     n_threads     = 1;
														
 
															     int32_t n_iterations  = 10;
														
 
															 };
														
--- a/examples/cvector-generator/cvector-generator.cpp
+++ b/examples/cvector-generator/cvector-generator.cpp
@@ -486,8 +486,8 @@ int main(int argc, char ** argv) {
 
															     if (use_pca) {
														
 
															         // run PCA
														
 
															         PCA::pca_params pca_params;
														
 
															-        pca_params.n_threads = params.n_threads;
														
 
															-        pca_params.n_batch = params.n_pca_batch;
														
 
															+        pca_params.n_threads    = params.cpuparams.n_threads;
														
 
															+        pca_params.n_batch      = params.n_pca_batch;
														
 
															         pca_params.n_iterations = params.n_pca_iterations;
														
 
															         PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
														
 
															     } else {
														
--- a/examples/export-lora/export-lora.cpp
+++ b/examples/export-lora/export-lora.cpp
@@ -410,7 +410,7 @@ int main(int argc, char ** argv) {
 
															     g_verbose = (params.verbosity == 1);
														
 
															     try {
														
 
															-        lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.n_threads);
														
 
															+        lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads);
														
 
															         ctx.run_merge();
														
 
															     } catch (const std::exception & err) {
														
 
															         fprintf(stderr, "%s\n", err.what());
														
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -16,6 +16,7 @@
 
															 #include <sstream>
														
 
															 #include <string>
														
 
															 #include <vector>
														
 
															+#include <thread>
														
 
															 #include "ggml.h"
														
 
															 #include "llama.h"
														
@@ -225,6 +226,9 @@ struct cmd_params {
 
															     std::vector<ggml_type> type_k;
														
 
															     std::vector<ggml_type> type_v;
														
 
															     std::vector<int> n_threads;
														
 
															+    std::vector<std::string> cpu_mask;
														
 
															+    std::vector<bool> cpu_strict;
														
 
															+    std::vector<int> poll;
														
 
															     std::vector<int> n_gpu_layers;
														
 
															     std::vector<std::string> rpc_servers;
														
 
															     std::vector<llama_split_mode> split_mode;
														
@@ -236,6 +240,8 @@ struct cmd_params {
 
															     std::vector<bool> embeddings;
														
 
															     ggml_numa_strategy numa;
														
 
															     int reps;
														
 
															+    ggml_sched_priority prio;
														
 
															+    int delay;
														
 
															     bool verbose;
														
 
															     output_formats output_format;
														
 
															     output_formats output_format_stderr;
														
@@ -251,6 +257,9 @@ static const cmd_params cmd_params_defaults = {
 
															     /* type_k               */ {GGML_TYPE_F16},
														
 
															     /* type_v               */ {GGML_TYPE_F16},
														
 
															     /* n_threads            */ {cpu_get_num_math()},
														
 
															+    /* cpu_mask             */ {"0x0"},
														
 
															+    /* cpu_strict           */ {false},
														
 
															+    /* poll                 */ {50},
														
 
															     /* n_gpu_layers         */ {99},
														
 
															     /* rpc_servers          */ {""},
														
 
															     /* split_mode           */ {LLAMA_SPLIT_MODE_LAYER},
														
@@ -262,6 +271,8 @@ static const cmd_params cmd_params_defaults = {
 
															     /* embeddings           */ {false},
														
 
															     /* numa                 */ GGML_NUMA_STRATEGY_DISABLED,
														
 
															     /* reps                 */ 5,
														
 
															+    /* prio                 */ GGML_SCHED_PRIO_NORMAL,
														
 
															+    /* delay                */ 0,
														
 
															     /* verbose              */ false,
														
 
															     /* output_format        */ MARKDOWN,
														
 
															     /* output_format_stderr */ NONE,
														
@@ -281,6 +292,9 @@ static void print_usage(int /* argc */, char ** argv) {
 
															     printf("  -ctk, --cache-type-k <t>            (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
														
 
															     printf("  -ctv, --cache-type-v <t>            (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
														
 
															     printf("  -t, --threads <n>                   (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
														
 
															+    printf("  -C, --cpu-mask <hex,hex>            (default: %s)\n", join(cmd_params_defaults.cpu_mask, ",").c_str());
														
 
															+    printf("  --cpu-strict <0|1>                  (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str());
														
 
															+    printf("  --poll <0...100>                    (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
														
 
															     printf("  -ngl, --n-gpu-layers <n>            (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
														
 
															     printf("  -rpc, --rpc <rpc_servers>           (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
														
 
															     printf("  -sm, --split-mode <none|layer|row>  (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
														
@@ -292,6 +306,8 @@ static void print_usage(int /* argc */, char ** argv) {
 
															     printf("  -embd, --embeddings <0|1>           (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
														
 
															     printf("  -ts, --tensor-split <ts0/ts1/..>    (default: 0)\n");
														
 
															     printf("  -r, --repetitions <n>               (default: %d)\n", cmd_params_defaults.reps);
														
 
															+    printf("  --prio <0|1|2|3>                    (default: %d)\n", cmd_params_defaults.prio);
														
 
															+    printf("  --delay <0...N> (seconds)           (default: %d)\n", cmd_params_defaults.delay);
														
 
															     printf("  -o, --output <csv|json|md|sql>      (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
														
 
															     printf("  -oe, --output-err <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
														
 
															     printf("  -v, --verbose                       (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
														
@@ -338,6 +354,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
 
															     params.output_format_stderr = cmd_params_defaults.output_format_stderr;
														
 
															     params.reps = cmd_params_defaults.reps;
														
 
															     params.numa = cmd_params_defaults.numa;
														
 
															+    params.prio = cmd_params_defaults.prio;
														
 
															+    params.delay = cmd_params_defaults.delay;
														
 
															     for (int i = 1; i < argc; i++) {
														
 
															         arg = argv[i];
														
@@ -433,6 +451,27 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
 
															             }
														
 
															             auto p = string_split<int>(argv[i], split_delim);
														
 
															             params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
														
 
															+        } else if (arg == "-C" || arg == "--cpu-mask") {
														
 
															+            if (++i >= argc) {
														
 
															+                invalid_param = true;
														
 
															+                break;
														
 
															+            }
														
 
															+            auto p = string_split<std::string>(argv[i], split_delim);
														
 
															+            params.cpu_mask.insert(params.cpu_mask.end(), p.begin(), p.end());
														
 
															+        } else if (arg == "--cpu-strict") {
														
 
															+            if (++i >= argc) {
														
 
															+                invalid_param = true;
														
 
															+                break;
														
 
															+            }
														
 
															+            auto p = string_split<bool>(argv[i], split_delim);
														
 
															+            params.cpu_strict.insert(params.cpu_strict.end(), p.begin(), p.end());
														
 
															+        } else if (arg == "--poll") {
														
 
															+            if (++i >= argc) {
														
 
															+                invalid_param = true;
														
 
															+                break;
														
 
															+            }
														
 
															+            auto p = string_split<int>(argv[i], split_delim);
														
 
															+            params.poll.insert(params.poll.end(), p.begin(), p.end());
														
 
															         } else if (arg == "-ngl" || arg == "--n-gpu-layers") {
														
 
															             if (++i >= argc) {
														
 
															                 invalid_param = true;
														
@@ -541,6 +580,18 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
 
															                 break;
														
 
															             }
														
 
															             params.reps = std::stoi(argv[i]);
														
 
															+        } else if (arg == "--prio") {
														
 
															+            if (++i >= argc) {
														
 
															+                invalid_param = true;
														
 
															+                break;
														
 
															+            }
														
 
															+            params.prio = (enum ggml_sched_priority) std::stoi(argv[i]);
														
 
															+        } else if (arg == "--delay") {
														
 
															+            if (++i >= argc) {
														
 
															+                invalid_param = true;
														
 
															+                break;
														
 
															+            }
														
 
															+            params.delay = std::stoi(argv[i]);
														
 
															         } else if (arg == "-o" || arg == "--output") {
														
 
															             if (++i >= argc) {
														
 
															                 invalid_param = true;
														
@@ -585,6 +636,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
 
															     if (params.use_mmap.empty())     { params.use_mmap = cmd_params_defaults.use_mmap; }
														
 
															     if (params.embeddings.empty())   { params.embeddings = cmd_params_defaults.embeddings; }
														
 
															     if (params.n_threads.empty())    { params.n_threads = cmd_params_defaults.n_threads; }
														
 
															+    if (params.cpu_mask.empty())     { params.cpu_mask  = cmd_params_defaults.cpu_mask;  }
														
 
															+    if (params.cpu_strict.empty())   { params.cpu_strict = cmd_params_defaults.cpu_strict; }
														
 
															+    if (params.poll.empty())         { params.poll = cmd_params_defaults.poll; }
														
 
															     return params;
														
 
															 }
														
@@ -598,6 +652,9 @@ struct cmd_params_instance {
 
															     ggml_type type_k;
														
 
															     ggml_type type_v;
														
 
															     int n_threads;
														
 
															+    std::string cpu_mask;
														
 
															+    bool cpu_strict;
														
 
															+    int poll;
														
 
															     int n_gpu_layers;
														
 
															     std::string rpc_servers;
														
 
															     llama_split_mode split_mode;
														
@@ -667,7 +724,10 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
 
															     for (const auto & tv : params.type_v)
														
 
															     for (const auto & nkvo : params.no_kv_offload)
														
 
															     for (const auto & fa : params.flash_attn)
														
 
															-    for (const auto & nt : params.n_threads) {
														
 
															+    for (const auto & nt : params.n_threads)
														
 
															+    for (const auto & cm : params.cpu_mask)
														
 
															+    for (const auto & cs : params.cpu_strict)
														
 
															+    for (const auto & pl : params.poll) {
														
 
															         for (const auto & n_prompt : params.n_prompt) {
														
 
															             if (n_prompt == 0) {
														
 
															                 continue;
														
@@ -681,6 +741,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
 
															                 /* .type_k       = */ tk,
														
 
															                 /* .type_v       = */ tv,
														
 
															                 /* .n_threads    = */ nt,
														
 
															+                /* .cpu_mask     = */ cm,
														
 
															+                /* .cpu_strict   = */ cs,
														
 
															+                /* .poll         = */ pl,
														
 
															                 /* .n_gpu_layers = */ nl,
														
 
															                 /* .rpc_servers  = */ rpc,
														
 
															                 /* .split_mode   = */ sm,
														
@@ -707,6 +770,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
 
															                 /* .type_k       = */ tk,
														
 
															                 /* .type_v       = */ tv,
														
 
															                 /* .n_threads    = */ nt,
														
 
															+                /* .cpu_mask     = */ cm,
														
 
															+                /* .cpu_strict   = */ cs,
														
 
															+                /* .poll         = */ pl,
														
 
															                 /* .n_gpu_layers = */ nl,
														
 
															                 /* .rpc_servers  = */ rpc,
														
 
															                 /* .split_mode   = */ sm,
														
@@ -733,6 +799,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
 
															                 /* .type_k       = */ tk,
														
 
															                 /* .type_v       = */ tv,
														
 
															                 /* .n_threads    = */ nt,
														
 
															+                /* .cpu_mask     = */ cm,
														
 
															+                /* .cpu_strict   = */ cs,
														
 
															+                /* .poll         = */ pl,
														
 
															                 /* .n_gpu_layers = */ nl,
														
 
															                 /* .rpc_servers  = */ rpc,
														
 
															                 /* .split_mode   = */ sm,
														
@@ -769,6 +838,9 @@ struct test {
 
															     int n_batch;
														
 
															     int n_ubatch;
														
 
															     int n_threads;
														
 
															+    std::string cpu_mask;
														
 
															+    bool cpu_strict;
														
 
															+    int poll;
														
 
															     bool has_rpc;
														
 
															     ggml_type type_k;
														
 
															     ggml_type type_v;
														
@@ -795,6 +867,9 @@ struct test {
 
															         n_batch = inst.n_batch;
														
 
															         n_ubatch = inst.n_ubatch;
														
 
															         n_threads = inst.n_threads;
														
 
															+        cpu_mask = inst.cpu_mask;
														
 
															+        cpu_strict = inst.cpu_strict;
														
 
															+        poll = inst.poll;
														
 
															         has_rpc = !inst.rpc_servers.empty();
														
 
															         type_k = inst.type_k;
														
 
															         type_v = inst.type_v;
														
@@ -872,13 +947,14 @@ struct test {
 
															             "cpu_info", "gpu_info",
														
 
															             "model_filename", "model_type", "model_size", "model_n_params",
														
 
															             "n_batch", "n_ubatch",
														
 
															-            "n_threads", "type_k", "type_v",
														
 
															+            "n_threads", "cpu_mask", "cpu_strict", "poll",
														
 
															+            "type_k", "type_v",
														
 
															             "n_gpu_layers", "split_mode",
														
 
															             "main_gpu", "no_kv_offload", "flash_attn",
														
 
															             "tensor_split", "use_mmap", "embeddings",
														
 
															             "n_prompt", "n_gen", "test_time",
														
 
															             "avg_ns", "stddev_ns",
														
 
															-            "avg_ts", "stddev_ts"
														
 
															+            "avg_ts", "stddev_ts",
														
 
															         };
														
 
															         return fields;
														
 
															     }
														
@@ -887,7 +963,7 @@ struct test {
 
															     static field_type get_field_type(const std::string & field) {
														
 
															         if (field == "build_number" || field == "n_batch" || field == "n_ubatch" ||
														
 
															-            field == "n_threads" ||
														
 
															+            field == "n_threads" || field == "poll" ||
														
 
															             field == "model_size" || field == "model_n_params" ||
														
 
															             field == "n_gpu_layers" || field == "main_gpu" ||
														
 
															             field == "n_prompt" || field == "n_gen" ||
														
@@ -896,6 +972,7 @@ struct test {
 
															         }
														
 
															         if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
														
 
															             field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
														
 
															+            field == "cpu_strict" ||
														
 
															             field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
														
 
															             return BOOL;
														
 
															         }
														
@@ -928,7 +1005,8 @@ struct test {
 
															             cpu_info, gpu_info,
														
 
															             model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
														
 
															             std::to_string(n_batch), std::to_string(n_ubatch),
														
 
															-            std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
														
 
															+            std::to_string(n_threads), cpu_mask, std::to_string(cpu_strict), std::to_string(poll),
														
 
															+            ggml_type_name(type_k), ggml_type_name(type_v),
														
 
															             std::to_string(n_gpu_layers), split_mode_str(split_mode),
														
 
															             std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
														
 
															             tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
														
@@ -1067,7 +1145,7 @@ struct markdown_printer : public printer {
 
															             return -30;
														
 
															         }
														
 
															         if (field == "t/s") {
														
 
															-            return 16;
														
 
															+            return 20;
														
 
															         }
														
 
															         if (field == "size" || field == "params") {
														
 
															             return 10;
														
@@ -1149,6 +1227,15 @@ struct markdown_printer : public printer {
 
															         if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
														
 
															             fields.emplace_back("n_threads");
														
 
															         }
														
 
															+        if (params.cpu_mask.size() > 1 || params.cpu_mask != cmd_params_defaults.cpu_mask) {
														
 
															+            fields.emplace_back("cpu_mask");
														
 
															+        }
														
 
															+        if (params.cpu_strict.size() > 1 || params.cpu_strict != cmd_params_defaults.cpu_strict) {
														
 
															+            fields.emplace_back("cpu_strict");
														
 
															+        }
														
 
															+        if (params.poll.size() > 1 || params.poll != cmd_params_defaults.poll) {
														
 
															+            fields.emplace_back("poll");
														
 
															+        }
														
 
															         if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
														
 
															             fields.emplace_back("n_batch");
														
 
															         }
														
@@ -1383,6 +1470,8 @@ int main(int argc, char ** argv) {
 
															     llama_backend_init();
														
 
															     llama_numa_init(params.numa);
														
 
															+    set_process_priority(params.prio);
														
 
															+
														
 
															     // initialize printer
														
 
															     std::unique_ptr<printer> p = create_printer(params.output_format);
														
 
															     std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr);
														
@@ -1428,6 +1517,28 @@ int main(int argc, char ** argv) {
 
															         llama_kv_cache_clear(ctx);
														
 
															+        // cool off before the test
														
 
															+        if (params.delay) {
														
 
															+            std::this_thread::sleep_for(std::chrono::seconds(params.delay));
														
 
															+        }
														
 
															+
														
 
															+        struct ggml_threadpool_params tpp = ggml_threadpool_params_default(t.n_threads);
														
 
															+        if (!parse_cpu_mask(t.cpu_mask, tpp.cpumask)) {
														
 
															+            LOG_TEE("%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str());
														
 
															+            exit(1);
														
 
															+        }
														
 
															+        tpp.strict_cpu = t.cpu_strict;
														
 
															+        tpp.poll       = t.poll;
														
 
															+        tpp.prio       = params.prio;
														
 
															+
														
 
															+        struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
														
 
															+        if (!threadpool) {
														
 
															+            LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
														
 
															+            exit(1);
														
 
															+        }
														
 
															+
														
 
															+        llama_attach_threadpool(ctx, threadpool, NULL);
														
 
															+
														
 
															         // warmup run
														
 
															         if (t.n_prompt > 0) {
														
 
															             //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
														
@@ -1466,6 +1577,8 @@ int main(int argc, char ** argv) {
 
															         llama_print_timings(ctx);
														
 
															         llama_free(ctx);
														
 
															+
														
 
															+        ggml_threadpool_free(threadpool);
														
 
															     }
														
 
															     llama_free_model(lmodel);
														
--- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@@ -71,8 +71,8 @@ actor LlamaContext {
 
															         var ctx_params = llama_context_default_params()
														
 
															         ctx_params.seed  = 1234
														
 
															         ctx_params.n_ctx = 2048
														
 
															-        ctx_params.n_threads       = UInt32(n_threads)
														
 
															-        ctx_params.n_threads_batch = UInt32(n_threads)
														
 
															+        ctx_params.n_threads       = Int32(n_threads)
														
 
															+        ctx_params.n_threads_batch = Int32(n_threads)
														
 
															         let context = llama_new_context_with_model(model, ctx_params)
														
 
															         guard let context else {
														
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -129,14 +129,14 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para
 
															         if (!params->image.empty()) {
														
 
															             LOG_TEE("using base64 encoded image instead of command line image path\n");
														
 
															         }
														
 
															-        embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt);
														
 
															+        embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
														
 
															         if (!embed) {
														
 
															             LOG_TEE("%s: can't load image from prompt\n", __func__);
														
 
															             return NULL;
														
 
															         }
														
 
															         params->prompt = remove_image_from_prompt(prompt);
														
 
															     } else {
														
 
															-        embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, fname.c_str());
														
 
															+        embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->cpuparams.n_threads, fname.c_str());
														
 
															         if (!embed) {
														
 
															             fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
														
 
															             return NULL;
														
--- a/examples/llava/minicpmv-cli.cpp
+++ b/examples/llava/minicpmv-cli.cpp
@@ -180,7 +180,7 @@ static const char * sample(struct llama_sampling_context * ctx_sampling,
 
															 static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){
														
 
															     auto ctx_clip = clip_init_context(params);
														
 
															-    auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->n_threads, fname.c_str());
														
 
															+    auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str());
														
 
															     if (!embeds) {
														
 
															         std::cerr << "error: failed to load image " << fname << ". Terminating\n\n";
														
 
															         return NULL;
														
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -221,6 +221,40 @@ int main(int argc, char ** argv) {
 
															         return 1;
														
 
															     }
														
 
															+    LOG("%s: llama threadpool init = n_threads = %d\n",
														
 
															+        __func__,
														
 
															+        (int) params.cpuparams.n_threads
														
 
															+    );
														
 
															+    struct ggml_threadpool_params tpp_batch =
														
 
															+            ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
														
 
															+    struct ggml_threadpool_params tpp =
														
 
															+            ggml_threadpool_params_from_cpu_params(params.cpuparams);
														
 
															+
														
 
															+    set_process_priority(params.cpuparams.priority);
														
 
															+
														
 
															+    struct ggml_threadpool * threadpool_batch = NULL;
														
 
															+    if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
														
 
															+        threadpool_batch = ggml_threadpool_new(&tpp_batch);
														
 
															+        if (!threadpool_batch) {
														
 
															+            LOG_TEE("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
														
 
															+            exit(1);
														
 
															+        }
														
 
															+
														
 
															+        // Start the non-batch threadpool in the paused state
														
 
															+        tpp.paused = true;
														
 
															+    }
														
 
															+
														
 
															+    struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
														
 
															+    if (!threadpool) {
														
 
															+        LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
														
 
															+        exit(1);
														
 
															+    }
														
 
															+
														
 
															+    llama_attach_threadpool(ctx, threadpool, threadpool_batch);
														
 
															+    if (ctx_guidance) {
														
 
															+        llama_attach_threadpool(ctx_guidance, threadpool, threadpool_batch);
														
 
															+    }
														
 
															+
														
 
															     const int n_ctx_train = llama_n_ctx_train(model);
														
 
															     const int n_ctx = llama_n_ctx(ctx);
														
 
															     LOG("n_ctx: %d\n", n_ctx);
														
@@ -989,6 +1023,9 @@ int main(int argc, char ** argv) {
 
															     llama_sampling_free(ctx_sampling);
														
 
															     llama_backend_free();
														
 
															+    ggml_threadpool_free(threadpool);
														
 
															+    ggml_threadpool_free(threadpool_batch);
														
 
															+
														
 
															 #ifndef LOG_DISABLE_LOGS
														
 
															     LOG_TEE("Log end\n");
														
 
															 #endif // LOG_DISABLE_LOGS
														
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2534,8 +2534,8 @@ int main(int argc, char ** argv) {
 
															     });
														
 
															     LOG_INFO("system info", {
														
 
															-        {"n_threads",       params.n_threads},
														
 
															-        {"n_threads_batch", params.n_threads_batch},
														
 
															+        {"n_threads",       params.cpuparams.n_threads},
														
 
															+        {"n_threads_batch", params.cpuparams_batch.n_threads},
														
 
															         {"total_threads",   std::thread::hardware_concurrency()},
														
 
															         {"system_info",     llama_print_system_info()},
														
 
															     });
														
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -73,10 +73,11 @@ int main(int argc, char ** argv) {
 
															     // load the draft model
														
 
															     params.model = params.model_draft;
														
 
															     params.n_gpu_layers = params.n_gpu_layers_draft;
														
 
															-    if (params.n_threads_draft > 0) {
														
 
															-        params.n_threads = params.n_threads_draft;
														
 
															+    if (params.draft_cpuparams.n_threads > 0) {
														
 
															+        params.cpuparams.n_threads = params.draft_cpuparams.n_threads;
														
 
															     }
														
 
															-    params.n_threads_batch = params.n_threads_batch_draft;
														
 
															+
														
 
															+    params.cpuparams_batch.n_threads = params.draft_cpuparams_batch.n_threads;
														
 
															     llama_init_result llama_init_dft = llama_init_from_gpt_params(params);
														
 
															     model_dft = llama_init_dft.model;
														
 
															     ctx_dft = llama_init_dft.context;
														
--- a/ggml/include/ggml-alloc.h
+++ b/ggml/include/ggml-alloc.h
@@ -7,8 +7,8 @@ extern "C" {
 
															 #endif
														
 
															 typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
														
 
															-typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
														
 
															-typedef struct ggml_backend * ggml_backend_t;
														
 
															+typedef struct      ggml_backend_buffer * ggml_backend_buffer_t;
														
 
															+typedef struct             ggml_backend * ggml_backend_t;
														
 
															 // Tensor allocator
														
 
															 struct ggml_tallocr {
														
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -103,6 +103,7 @@ extern "C" {
 
															     GGML_API GGML_CALL bool ggml_backend_is_cpu                (ggml_backend_t backend);
														
 
															     GGML_API           void ggml_backend_cpu_set_n_threads     (ggml_backend_t backend_cpu, int n_threads);
														
 
															+    GGML_API           void ggml_backend_cpu_set_threadpool    (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
														
 
															     GGML_API           void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
														
 
															     // Create a backend buffer from an existing pointer
														
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -231,6 +231,8 @@
 
															 #define GGML_MAX_SRC            10
														
 
															 #ifndef GGML_MAX_NAME
														
 
															 #define GGML_MAX_NAME           64
														
 
															 #endif
														
 
															+// max threads per threadpool (sizes ggml_threadpool_params::cpumask); defined outside the GGML_MAX_NAME guard so it exists even when GGML_MAX_NAME is pre-defined by the build
														
 
															+#define GGML_MAX_N_THREADS      512
														
 
															 #define GGML_MAX_OP_PARAMS      64
														
 
															 #define GGML_DEFAULT_N_THREADS  4
														
@@ -628,6 +630,29 @@ extern "C" {
 
															     // If it returns true, the computation is aborted
														
 
															     typedef bool (*ggml_abort_callback)(void * data);
														
 
															+    // Scheduling priorities
														
 
															+    enum ggml_sched_priority { // ordered from lowest to highest requested priority
														
 
															+        GGML_SCHED_PRIO_NORMAL,   // default; Windows branch of set_process_priority() returns early without changing the process
														
 
															+        GGML_SCHED_PRIO_MEDIUM,   // Windows: ABOVE_NORMAL_PRIORITY_CLASS
														
 
															+        GGML_SCHED_PRIO_HIGH,     // Windows: HIGH_PRIORITY_CLASS
														
 
															+        GGML_SCHED_PRIO_REALTIME  // Windows: REALTIME_PRIORITY_CLASS
														
 
															+    };
														
 
															+
														
 
															+    // Threadpool params
														
 
															+    // Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
														
 
															+    struct ggml_threadpool_params { // configuration consumed by ggml_threadpool_new()
														
 
															+        bool                cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores, up to GGML_MAX_N_THREADS entries (all-zeros means use default affinity settings)
														
 
															+        int                 n_threads;                   // number of threads in the pool
														
 
															+        enum ggml_sched_priority prio;                   // thread scheduling priority (see enum ggml_sched_priority)
														
 
															+        uint32_t            poll;                        // polling level (0 - no polling, 100 - aggressive polling)
														
 
															+        bool                strict_cpu;                  // strict cpu placement
														
 
															+        bool                paused;                      // start in paused state; see ggml_threadpool_pause() / ggml_threadpool_resume()
														
 
															+    };
														
 
															+
														
 
															+    struct ggml_threadpool;     // forward declaration, see ggml.c
														
 
															+
														
 
															+    typedef struct  ggml_threadpool * ggml_threadpool_t;
														
 
															+
														
 
															     // the compute plan that needs to be prepared for ggml_graph_compute()
														
 
															     // since https://github.com/ggerganov/ggml/issues/287
														
 
															     struct ggml_cplan {
														
@@ -635,6 +660,7 @@ extern "C" {
 
															         uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
														
 
															         int n_threads;
														
 
															+        struct ggml_threadpool * threadpool;
														
 
															         // abort ggml_graph_compute when true
														
 
															         ggml_abort_callback abort_callback;
														
@@ -2057,10 +2083,23 @@ extern "C" {
 
															     GGML_API size_t ggml_graph_overhead(void);
														
 
															     GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
														
 
															+    GGML_API struct ggml_threadpool_params   ggml_threadpool_params_default(int n_threads);
														
 
															+    GGML_API void                            ggml_threadpool_params_init  (struct ggml_threadpool_params *p, int n_threads);
														
 
															+    GGML_API bool                            ggml_threadpool_params_match (const struct ggml_threadpool_params *p0, const struct ggml_threadpool_params *p1);
														
 
															+    GGML_API struct ggml_threadpool*         ggml_threadpool_new          (struct ggml_threadpool_params  * params);
														
 
															+    GGML_API void                            ggml_threadpool_free         (struct ggml_threadpool * threadpool);
														
 
															+    GGML_API int                             ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
														
 
															+    GGML_API void                            ggml_threadpool_pause        (struct ggml_threadpool * threadpool);
														
 
															+    GGML_API void                            ggml_threadpool_resume       (struct ggml_threadpool * threadpool);
														
 
															+
														
 
															     // ggml_graph_plan() has to be called before ggml_graph_compute()
														
 
															     // when plan.work_size > 0, caller must allocate memory for plan.work_data
														
 
															-    GGML_API struct ggml_cplan ggml_graph_plan   (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
														
 
															-    GGML_API enum ggml_status  ggml_graph_compute(      struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
														
 
															+    GGML_API struct ggml_cplan ggml_graph_plan(
														
 
															+                  const struct ggml_cgraph * cgraph,
														
 
															+                                       int   n_threads, /* = GGML_DEFAULT_N_THREADS */
														
 
															+                    struct ggml_threadpool * threadpool /* = NULL */ );
														
 
															+    GGML_API enum ggml_status  ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
														
 
															+
														
 
															     // same as ggml_graph_compute() but the work data is allocated as a part of the context
														
 
															     // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
														
 
															     GGML_API enum ggml_status  ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
														
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -1247,7 +1247,7 @@ endif()
 
															 # Data types, macros and functions related to controlling CPU affinity and
														
 
															 # some memory allocation are available on Linux through GNU extensions in libc
														
 
															-if (CMAKE_SYSTEM_NAME MATCHES "Linux")
														
 
															+if (CMAKE_SYSTEM_NAME MATCHES "Linux" OR CMAKE_SYSTEM_NAME MATCHES "Android")
														
 
															     add_compile_definitions(_GNU_SOURCE)
														
 
															 endif()
														
--- a/ggml/src/ggml-backend.c
+++ b/ggml/src/ggml-backend.c
@@ -722,9 +722,11 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
 
															 #endif
														
 
															 struct ggml_backend_cpu_context {
														
 
															-    int n_threads;
														
 
															-    void * work_data;
														
 
															-    size_t work_size;
														
 
															+    int                 n_threads;
														
 
															+    ggml_threadpool_t   threadpool;
														
 
															+
														
 
															+    void *              work_data;
														
 
															+    size_t              work_size;
														
 
															     ggml_abort_callback abort_callback;
														
 
															     void *              abort_callback_data;
														
@@ -759,7 +761,7 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
 
															     struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
														
 
															-    cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
														
 
															+    cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
														
 
															     cpu_plan->cgraph = *cgraph; // FIXME: deep copy
														
 
															     if (cpu_plan->cplan.work_size > 0) {
														
@@ -796,7 +798,7 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backe
 
															 GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
														
 
															     struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
														
 
															-    struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
														
 
															+    struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
														
 
															     if (cpu_ctx->work_size < cplan.work_size) {
														
 
															         free(cpu_ctx->work_data);
														
@@ -873,6 +875,7 @@ ggml_backend_t ggml_backend_cpu_init(void) {
 
															     }
														
 
															     ctx->n_threads           = GGML_DEFAULT_N_THREADS;
														
 
															+    ctx->threadpool          = NULL;
														
 
															     ctx->work_data           = NULL;
														
 
															     ctx->work_size           = 0;
														
 
															     ctx->abort_callback      = NULL;
														
@@ -903,6 +906,18 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
 
															     ctx->n_threads = n_threads;
														
 
															 }
														
 
															+void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) { // selects the threadpool this CPU backend passes to ggml_graph_plan(); a previously set, different pool is paused, not freed
														
 
															+    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu)); // only meaningful for the CPU backend
														
 
															+
														
 
															+    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
														
 
															+
														
 
															+    if (ctx->threadpool && ctx->threadpool != threadpool) {
														
 
															+        // a different threadpool was attached earlier: pause it before switching so it is idle while the new one is in use
														
 
															+        ggml_threadpool_pause(ctx->threadpool);
														
 
															+    }
														
 
															+    ctx->threadpool = threadpool; // stored as-is, including NULL (the value ggml_backend_cpu_init() starts with)
														
 
															+}
														
 
															+
														
 
															 void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
														
 
															     GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
														
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
--- a/include/llama.h
+++ b/include/llama.h
@@ -304,8 +304,8 @@ extern "C" {
 
															         uint32_t n_batch;           // logical maximum batch size that can be submitted to llama_decode
														
 
															         uint32_t n_ubatch;          // physical maximum batch size
														
 
															         uint32_t n_seq_max;         // max number of sequences (i.e. distinct states for recurrent models)
														
 
															-        uint32_t n_threads;         // number of threads to use for generation
														
 
															-        uint32_t n_threads_batch;   // number of threads to use for batch processing
														
 
															+        int32_t  n_threads;         // number of threads to use for generation
														
 
															+        int32_t  n_threads_batch;   // number of threads to use for batch processing
														
 
															         enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
														
 
															         enum llama_pooling_type      pooling_type;      // whether to pool (sum) embedding results by sequence id
														
@@ -428,6 +428,13 @@ extern "C" {
 
															     //optional:
														
 
															     LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
														
 
															+    // Optional: an auto threadpool gets created in ggml if not passed explicitly
														
 
															+    LLAMA_API void llama_attach_threadpool(
														
 
															+               struct   llama_context * ctx,
														
 
															+            ggml_threadpool_t   threadpool,
														
 
															+            ggml_threadpool_t   threadpool_batch);
														
 
															+    LLAMA_API void llama_detach_threadpool(struct llama_context * ctx);
														
 
															+
														
 
															     // Call once at the end of the program - currently only used for MPI
														
 
															     LLAMA_API void llama_backend_free(void);
														
@@ -837,13 +844,13 @@ extern "C" {
 
															     // Set the number of threads used for decoding
														
 
															     // n_threads is the number of threads used for generation (single token)
														
 
															     // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
														
 
															-    LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
														
 
															+    LLAMA_API void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch);
														
 
															     // Get the number of threads used for generation of a single token.
														
 
															-    LLAMA_API uint32_t llama_n_threads(struct llama_context * ctx);
														
 
															+    LLAMA_API int32_t llama_n_threads(struct llama_context * ctx);
														
 
															     // Get the number of threads used for prompt and batch processing (multiple token).
														
 
															-    LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);
														
 
															+    LLAMA_API int32_t llama_n_threads_batch(struct llama_context * ctx);
														
 
															     // Set whether the model is in embeddings mode or not
														
 
															     // If true, embeddings will be returned but logits will not
														
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2373,8 +2373,8 @@ struct llama_cparams {
 
															     uint32_t n_batch;
														
 
															     uint32_t n_ubatch;
														
 
															     uint32_t n_seq_max;
														
 
															-    uint32_t n_threads;       // number of threads to use for generation
														
 
															-    uint32_t n_threads_batch; // number of threads to use for batch processing
														
 
															+    int      n_threads;       // number of threads to use for generation
														
 
															+    int      n_threads_batch; // number of threads to use for batch processing
														
 
															     float rope_freq_base;
														
 
															     float rope_freq_scale;
														
@@ -3091,6 +3091,9 @@ struct llama_context {
 
															 #endif
														
 
															     ggml_backend_t backend_cpu = nullptr;
														
 
															+    ggml_threadpool_t threadpool       = nullptr;
														
 
															+    ggml_threadpool_t threadpool_batch = nullptr;
														
 
															+
														
 
															     bool has_evaluated_once = false;
														
 
															     int64_t t_start_us;
														
@@ -15494,9 +15497,10 @@ static void llama_output_reorder(struct llama_context * ctx) {
 
															 }
														
 
															 static void llama_graph_compute(
														
 
															-        llama_context & lctx,
														
 
															-          ggml_cgraph * gf,
														
 
															-                  int   n_threads) {
														
 
															+          llama_context & lctx,
														
 
															+            ggml_cgraph * gf,
														
 
															+                    int   n_threads,
														
 
															+        ggml_threadpool * threadpool) {
														
 
															 #ifdef GGML_USE_METAL
														
 
															     if (ggml_backend_is_metal(lctx.backend_metal)) {
														
 
															         ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
														
@@ -15505,6 +15509,7 @@ static void llama_graph_compute(
 
															     if (lctx.backend_cpu != nullptr) {
														
 
															         ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
														
 
															+        ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool);
														
 
															         ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
														
 
															     }
														
 
															 #ifdef GGML_USE_BLAS
														
@@ -15625,6 +15630,8 @@ static int llama_decode_internal(
 
															         }
														
 
															         int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
														
 
															+        ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;
														
 
															+
														
 
															         GGML_ASSERT(n_threads > 0);
														
 
															         // non-causal masks do not use the KV cache
														
@@ -15686,7 +15693,7 @@ static int llama_decode_internal(
 
															         llama_set_inputs(lctx, ubatch);
														
 
															-        llama_graph_compute(lctx, gf, n_threads);
														
 
															+        llama_graph_compute(lctx, gf, n_threads, threadpool);
														
 
															         // update the kv ring buffer
														
 
															         {
														
@@ -15863,7 +15870,9 @@ static int llama_encode_internal(
 
															     lctx.inp_embd_enc = NULL;
														
 
															     lctx.n_outputs = n_tokens;
														
 
															-    const int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
														
 
															+    int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
														
 
															+    ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;
														
 
															+
														
 
															     GGML_ASSERT(n_threads > 0);
														
 
															     ggml_backend_sched_reset(lctx.sched);
														
@@ -15895,7 +15904,7 @@ static int llama_encode_internal(
 
															     llama_set_inputs(lctx, ubatch);
														
 
															-    llama_graph_compute(lctx, gf, n_threads);
														
 
															+    llama_graph_compute(lctx, gf, n_threads, threadpool);
														
 
															     // extract embeddings
														
 
															     if (embd) {
														
@@ -16177,7 +16186,7 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
 
															     ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
														
 
															-    llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
														
 
															+    llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
														
 
															 #endif
														
 
															     //const int64_t t_end = ggml_time_us();
														
@@ -16203,7 +16212,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
 
															             llama_set_k_shift(lctx);
														
 
															-            llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
														
 
															+            llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
														
 
															             need_reserve = true;
														
 
															         }
														
@@ -17451,6 +17460,19 @@ void llama_numa_init(enum ggml_numa_strategy numa) {
 
															     }
														
 
															 }
														
 
															+void llama_attach_threadpool(
														
 
															+             struct llama_context * ctx,
														
 
															+        ggml_threadpool_t   threadpool,
														
 
															+        ggml_threadpool_t   threadpool_batch) {
														
 
															+    ctx->threadpool       = threadpool;
														
 
															+    ctx->threadpool_batch = threadpool_batch ? threadpool_batch : threadpool;
														
 
															+}
														
 
															+
														
 
															+void llama_detach_threadpool(struct llama_context * ctx) {
														
 
															+    ctx->threadpool       = nullptr;
														
 
															+    ctx->threadpool_batch = nullptr;
														
 
															+}
														
 
															+
														
 
															 void llama_backend_free(void) {
														
 
															     ggml_quantize_free();
														
 
															 }
														
@@ -19367,16 +19389,16 @@ size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepa
 
															     }
														
 
															 }
														
 
															-void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
														
 
															+void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch) {
														
 
															     ctx->cparams.n_threads       = n_threads;
														
 
															     ctx->cparams.n_threads_batch = n_threads_batch;
														
 
															 }
														
 
															-uint32_t llama_n_threads(struct llama_context * ctx) {
														
 
															+int32_t llama_n_threads(struct llama_context * ctx) {
														
 
															     return ctx->cparams.n_threads;
														
 
															 }
														
 
															-uint32_t llama_n_threads_batch(struct llama_context * ctx) {
														
 
															+int32_t llama_n_threads_batch(struct llama_context * ctx) {
														
 
															     return ctx->cparams.n_threads_batch;
														
 
															 }
														
--- a/tests/test-rope.cpp
+++ b/tests/test-rope.cpp
@@ -113,7 +113,7 @@ static struct ggml_tensor * get_random_tensor_f32(
 
															 }
														
 
															 static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
														
 
															-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
														
 
															+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
														
 
															     if (plan.work_size > 0) {
														
 
															         buf.resize(plan.work_size);