
llama : add pipeline parallelism support (#6017)

* llama : add pipeline parallelism support for batch processing with multiple CUDA GPUs

ggml-ci

* server : add -ub, --ubatch-size parameter

* fix server embedding test

* llama : fix Mamba inference for pipeline parallelism

Tested to work correctly with both `main` and `parallel` examples.

* llama : limit max batch size to n_batch

* add LLAMA_SCHED_MAX_COPIES to configure the number of input copies for pipeline parallelism
the default is increased to 4 (from 2)

changing this value may improve performance on some systems, but it increases memory usage

* fix hip build

* fix sycl build (disable cpy_tensor_async)

* fix hip build

* llama : limit n_batch and n_ubatch to n_ctx during context creation

* llama : fix norm backend

* batched-bench : sync after decode

* swiftui : sync after decode

* ggml : allow ggml_get_rows to use multiple threads if they are available

* check n_ubatch >= n_tokens with non-causal attention

* llama : do not limit n_batch to n_ctx with non-causal attn

* server : construct batch with size of llama_n_batch

* ggml_backend_cpu_graph_compute : fix return value when alloc fails

* llama : better n_batch and n_ubatch comment

* fix merge

* small fix

* reduce default n_batch to 2048

---------

Co-authored-by: Francis Couture-Harpin <git@compilade.net>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
slaren committed 1 year ago
commit f30ea47a87

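For readers new to the split this commit introduces: `-b/--batch-size` (`n_batch`) is now the logical maximum batch size that `llama_decode` accepts, while `-ub/--ubatch-size` (`n_ubatch`) is the physical maximum batch size actually submitted to the backends, which pipeline parallelism spreads across the GPUs. A minimal sketch of setting both through the public API follows; the model path and the numeric values are placeholders, not part of this commit.

    // Sketch: configure logical (n_batch) and physical (n_ubatch) batch sizes.
    // The model file name is hypothetical; error handling is omitted for brevity.
    #include "llama.h"

    int main() {
        llama_backend_init();

        llama_model_params mparams = llama_model_default_params();
        mparams.n_gpu_layers = 99; // offload all layers so pipeline parallelism can be used

        llama_model * model = llama_load_model_from_file("model.gguf", mparams);

        llama_context_params cparams = llama_context_default_params();
        cparams.n_ctx    = 4096;
        cparams.n_batch  = 2048; // logical maximum: largest batch llama_decode will accept
        cparams.n_ubatch = 512;  // physical maximum: largest chunk submitted to the backends

        llama_context * ctx = llama_new_context_with_model(model, cparams);

        // ... decode batches of up to n_batch tokens here ...

        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }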
+ 3 - 0
CMakeLists.txt

@@ -118,6 +118,7 @@ option(LLAMA_SYCL                            "llama: use SYCL"
 option(LLAMA_SYCL_F16                        "llama: use 16 bit floats for sycl calculations"   OFF)
 set(LLAMA_SYCL_TARGET   "INTEL" CACHE STRING "llama: sycl target device")
 option(LLAMA_CPU_HBM                         "llama: use memkind for CPU HBM"                   OFF)
+set(LLAMA_SCHED_MAX_COPIES  "4" CACHE STRING "llama: max input copies for pipeline parallelism")
 
 option(LLAMA_BUILD_TESTS                     "llama: build tests"    ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES                  "llama: build examples" ${LLAMA_STANDALONE})
@@ -147,6 +148,8 @@ set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)
 include(CheckCXXCompilerFlag)
 
+add_compile_definitions(GGML_SCHED_MAX_COPIES=${LLAMA_SCHED_MAX_COPIES})
+
 # enable libstdc++ assertions for debug builds
 if (CMAKE_SYSTEM_NAME MATCHES "Linux")
     add_compile_definitions($<$<CONFIG:Debug>:_GLIBCXX_ASSERTIONS>)

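The CMake hunk above and the Makefile hunk below expose LLAMA_SCHED_MAX_COPIES (for example `cmake -DLLAMA_SCHED_MAX_COPIES=2` or `make LLAMA_SCHED_MAX_COPIES=2`) and forward it to the compiler as GGML_SCHED_MAX_COPIES. The sketch below only illustrates the general pattern of a compile-time copy count with a default; the struct and helper names are hypothetical and are not copied from ggml-backend.c.

    // Illustrative only: how a compile-time maximum copy count is typically consumed.
    // GGML_SCHED_MAX_COPIES comes from the build files in this commit; everything
    // else here is an assumption for the sake of the example.
    #ifndef GGML_SCHED_MAX_COPIES
    #define GGML_SCHED_MAX_COPIES 4 // default when the build system does not define it
    #endif

    struct sched_input_slot {
        // one staging buffer per in-flight copy of the graph inputs
        void * input_copies[GGML_SCHED_MAX_COPIES];
        int    cur_copy; // index of the copy used by the next graph
    };

    static int next_copy(int cur_copy) {
        // round-robin over the available copies; more copies allow more graphs
        // in flight at once, at the cost of extra input buffer memory
        return (cur_copy + 1) % GGML_SCHED_MAX_COPIES;
    }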
+ 4 - 0
Makefile

@@ -167,6 +167,10 @@ ifeq ($(UNAME_S),OpenBSD)
 	MK_CPPFLAGS += -D_BSD_SOURCE
 endif
 
+ifdef LLAMA_SCHED_MAX_COPIES
+	MK_CPPFLAGS += -DGGML_SCHED_MAX_COPIES=$(LLAMA_SCHED_MAX_COPIES)
+endif
+
 ifdef LLAMA_DEBUG
 	MK_CFLAGS   += -O0 -g
 	MK_CXXFLAGS += -O0 -g

+ 12 - 2
common/common.cpp

@@ -483,6 +483,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.n_batch = std::stoi(argv[i]);
+        } else if (arg == "-ub" || arg == "--ubatch-size") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_ubatch = std::stoi(argv[i]);
         } else if (arg == "--keep") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -977,7 +983,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("                        binary file containing multiple choice tasks.\n");
     printf("  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
     printf("  -c N, --ctx-size N    size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
-    printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
+    printf("  -b N, --batch-size N  logical maximum batch size (default: %d)\n", params.n_batch);
+    printf("  -ub N, --ubatch-size N\n");
+    printf("                        physical maximum batch size (default: %d)\n", params.n_ubatch);
     printf("  --samplers            samplers that will be used for generation in the order, separated by \';\'\n");
     printf("                        (default: %s)\n", sampler_type_names.c_str());
     printf("  --sampling-seq        simplified sequence for samplers that will be used (default: %s)\n", sampler_type_chars.c_str());
@@ -1287,8 +1295,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     auto cparams = llama_context_default_params();
 
     cparams.n_ctx             = params.n_ctx;
-    cparams.n_batch           = params.n_batch;
     cparams.n_seq_max         = params.n_parallel;
+    cparams.n_batch           = params.n_batch;
+    cparams.n_ubatch          = params.n_ubatch;
     cparams.n_threads         = params.n_threads;
     cparams.n_threads_batch   = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
     cparams.seed              = params.seed;
@@ -1379,6 +1388,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
         llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
         llama_kv_cache_clear(lctx);
+        llama_synchronize(lctx);
         llama_reset_timings(lctx);
     }
 

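The llama_synchronize(lctx) call added to the warmup above matters because, with pipeline parallelism, llama_decode can return before the backends have finished the submitted work; timings or timer resets taken right after it would otherwise be misleading. Below is a hedged sketch of the decode-then-synchronize pattern that the benchmark changes in this commit follow; the function itself and the token values are placeholders.

    // Sketch: measure prompt-processing time correctly when decode is asynchronous.
    // Assumes an already created llama_context * ctx and its llama_model * model.
    #include "llama.h"
    #include <vector>

    static double time_prompt_eval(llama_context * ctx, const llama_model * model, int n_tokens) {
        std::vector<llama_token> tokens(n_tokens, llama_token_bos(model));

        const int64_t t_start = ggml_time_us();

        // submit the whole prompt as one batch (up to n_batch tokens)
        llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens, 0, 0));

        // wait for all backends to finish before reading the clock
        llama_synchronize(ctx);

        return (ggml_time_us() - t_start) / 1e6;
    }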
+ 2 - 1
common/common.h

@@ -51,7 +51,8 @@ struct gpt_params {
     int32_t n_threads_batch_draft = -1;
     int32_t n_predict             = -1;    // new tokens to predict
     int32_t n_ctx                 = 512;   // context size
-    int32_t n_batch               = 512;   // batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_batch               = 2048;  // logical batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_ubatch              = 512;   // physical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep                = 0;     // number of tokens to keep from initial prompt
     int32_t n_draft               = 5;     // number of tokens to draft during speculative decoding
     int32_t n_chunks              = -1;    // max number of chunks to process (-1 = unlimited)

+ 2 - 0
examples/batched-bench/batched-bench.cpp

@@ -138,6 +138,8 @@ int main(int argc, char ** argv) {
                 LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
                 return false;
             }
+
+            llama_synchronize(ctx);
         }
 
         return true;

+ 1 - 1
examples/embedding/embedding.cpp

@@ -107,7 +107,7 @@ int main(int argc, char ** argv) {
 
     // max batch size
     const uint64_t n_batch = params.n_batch;
-    GGML_ASSERT(params.n_batch == params.n_ctx);
+    GGML_ASSERT(params.n_batch >= params.n_ctx);
 
     // tokenize the prompts and trim
     std::vector<std::vector<int32_t>> inputs;

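The relaxed assert above ties in with the "check n_ubatch >= n_tokens with non-causal attention" item from the commit message: an embedding model processes each prompt in a single physical batch, so a prompt longer than n_ubatch cannot be evaluated. A small hedged sketch of guarding for this from application code; the helper name is made up, only the llama.cpp calls are real.

    // Sketch: skip prompts that cannot fit in one physical batch of a non-causal
    // (embedding) model. The context and token list are assumed to exist.
    #include "llama.h"
    #include <cstdio>
    #include <vector>

    bool can_embed(llama_context * ctx, const std::vector<llama_token> & tokens) {
        const uint32_t n_ubatch = llama_n_ubatch(ctx); // physical batch size of this context

        if (tokens.size() > n_ubatch) {
            fprintf(stderr, "prompt of %zu tokens exceeds n_ubatch = %u, increase -ub\n",
                    tokens.size(), n_ubatch);
            return false;
        }
        return true;
    }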
+ 43 - 10
examples/llama-bench/llama-bench.cpp

@@ -164,6 +164,7 @@ struct cmd_params {
     std::vector<int> n_prompt;
     std::vector<int> n_gen;
     std::vector<int> n_batch;
+    std::vector<int> n_ubatch;
     std::vector<ggml_type> type_k;
     std::vector<ggml_type> type_v;
     std::vector<int> n_threads;
@@ -183,7 +184,8 @@ static const cmd_params cmd_params_defaults = {
     /* model         */ {"models/7B/ggml-model-q4_0.gguf"},
     /* n_prompt      */ {512},
     /* n_gen         */ {128},
-    /* n_batch       */ {512},
+    /* n_batch       */ {2048},
+    /* n_ubatch      */ {512},
     /* type_k        */ {GGML_TYPE_F16},
     /* type_v        */ {GGML_TYPE_F16},
     /* n_threads     */ {get_num_physical_cores()},
@@ -208,6 +210,7 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  -p, --n-prompt <n>                  (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
     printf("  -n, --n-gen <n>                     (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
     printf("  -b, --batch-size <n>                (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
+    printf("  -ub N, --ubatch-size <n>            (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
     printf("  -ctk <t>, --cache-type-k <t>        (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
     printf("  -ctv <t>, --cache-type-v <t>        (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
     printf("  -t, --threads <n>                   (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
@@ -217,7 +220,7 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  -nkvo, --no-kv-offload <0|1>        (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
     printf("  -mmp, --mmap <0|1>                  (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
     printf("  -embd, --embeddings <0|1>           (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
-    printf("  -ts, --tensor_split <ts0/ts1/..>    (default: 0)\n");
+    printf("  -ts, --tensor-split <ts0/ts1/..>    (default: 0)\n");
     printf("  -r, --repetitions <n>               (default: %d)\n", cmd_params_defaults.reps);
     printf("  -o, --output <csv|json|md|sql>      (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
     printf("  -v, --verbose                       (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
@@ -297,6 +300,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
             }
             auto p = split<int>(argv[i], split_delim);
             params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
+        } else if (arg == "-ub" || arg == "--ubatch-size") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = split<int>(argv[i], split_delim);
+            params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end());
         } else if (arg == "-ctk" || arg == "--cache-type-k") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -455,6 +465,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.n_prompt.empty())     { params.n_prompt = cmd_params_defaults.n_prompt; }
     if (params.n_gen.empty())        { params.n_gen = cmd_params_defaults.n_gen; }
     if (params.n_batch.empty())      { params.n_batch = cmd_params_defaults.n_batch; }
+    if (params.n_ubatch.empty())     { params.n_ubatch = cmd_params_defaults.n_ubatch; }
     if (params.type_k.empty())       { params.type_k = cmd_params_defaults.type_k; }
     if (params.type_v.empty())       { params.type_v = cmd_params_defaults.type_v; }
     if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
@@ -474,6 +485,7 @@ struct cmd_params_instance {
     int n_prompt;
     int n_gen;
     int n_batch;
+    int n_ubatch;
     ggml_type type_k;
     ggml_type type_v;
     int n_threads;
@@ -511,6 +523,7 @@ struct cmd_params_instance {
 
         cparams.n_ctx = n_prompt + n_gen;
         cparams.n_batch = n_batch;
+        cparams.n_ubatch = n_ubatch;
         cparams.type_k = type_k;
         cparams.type_v = type_v;
         cparams.offload_kqv = !no_kv_offload;
@@ -532,6 +545,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     for (const auto & mmp : params.use_mmap)
     for (const auto & embd : params.embeddings)
     for (const auto & nb : params.n_batch)
+    for (const auto & nub : params.n_ubatch)
     for (const auto & tk : params.type_k)
     for (const auto & tv : params.type_v)
     for (const auto & nkvo : params.no_kv_offload)
@@ -545,6 +559,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .n_prompt     = */ n_prompt,
                 /* .n_gen        = */ 0,
                 /* .n_batch      = */ nb,
+                /* .n_ubatch     = */ nub,
                 /* .type_k       = */ tk,
                 /* .type_v       = */ tv,
                 /* .n_threads    = */ nt,
@@ -568,6 +583,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .n_prompt     = */ 0,
                 /* .n_gen        = */ n_gen,
                 /* .n_batch      = */ nb,
+                /* .n_ubatch     = */ nub,
                 /* .type_k       = */ tk,
                 /* .type_v       = */ tv,
                 /* .n_threads    = */ nt,
@@ -604,6 +620,7 @@ struct test {
     uint64_t model_size;
     uint64_t model_n_params;
     int n_batch;
+    int n_ubatch;
     int n_threads;
     ggml_type type_k;
     ggml_type type_v;
@@ -627,6 +644,7 @@ struct test {
         model_size = llama_model_size(lmodel);
         model_n_params = llama_model_n_params(lmodel);
         n_batch = inst.n_batch;
+        n_ubatch = inst.n_ubatch;
         n_threads = inst.n_threads;
         type_k = inst.type_k;
         type_v = inst.type_v;
@@ -705,7 +723,8 @@ struct test {
             "cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas",
             "cpu_info", "gpu_info",
             "model_filename", "model_type", "model_size", "model_n_params",
-            "n_batch", "n_threads", "type_k", "type_v",
+            "n_batch", "n_ubatch",
+            "n_threads", "type_k", "type_v",
             "n_gpu_layers", "split_mode",
             "main_gpu", "no_kv_offload",
             "tensor_split", "use_mmap", "embeddings",
@@ -719,7 +738,8 @@ struct test {
     enum field_type {STRING, BOOL, INT, FLOAT};
 
     static field_type get_field_type(const std::string & field) {
-        if (field == "build_number" || field == "n_batch" || field == "n_threads" ||
+        if (field == "build_number" || field == "n_batch" || field == "n_ubatch" ||
+            field == "n_threads" ||
             field == "model_size" || field == "model_n_params" ||
             field == "n_gpu_layers" || field == "main_gpu" ||
             field == "n_prompt" || field == "n_gen" ||
@@ -759,7 +779,8 @@ struct test {
             std::to_string(metal), std::to_string(sycl), std::to_string(gpu_blas), std::to_string(blas),
             cpu_info, gpu_info,
             model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
-            std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
+            std::to_string(n_batch), std::to_string(n_ubatch),
+            std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
             std::to_string(n_gpu_layers), split_mode_str(split_mode),
             std::to_string(main_gpu), std::to_string(no_kv_offload),
             tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
@@ -957,6 +978,9 @@ struct markdown_printer : public printer {
         if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
             fields.emplace_back("n_batch");
         }
+        if (params.n_ubatch.size() > 1 || params.n_ubatch != cmd_params_defaults.n_ubatch) {
+            fields.emplace_back("n_ubatch");
+        }
         if (params.type_k.size() > 1 || params.type_k != cmd_params_defaults.type_k) {
             fields.emplace_back("type_k");
         }
@@ -1096,25 +1120,32 @@ struct sql_printer : public printer {
 };
 
 static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) {
+    llama_set_n_threads(ctx, n_threads, n_threads);
+
+    //std::vector<llama_token> tokens(n_prompt, llama_token_bos(llama_get_model(ctx)));
+    //llama_decode(ctx, llama_batch_get_one(tokens.data(), n_prompt, n_past, 0));
+    //GGML_UNUSED(n_batch);
+
     std::vector<llama_token> tokens(n_batch, llama_token_bos(llama_get_model(ctx)));
     int n_processed = 0;
 
-    llama_set_n_threads(ctx, n_threads, n_threads);
-
     while (n_processed < n_prompt) {
         int n_tokens = std::min(n_prompt - n_processed, n_batch);
         llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens, n_past + n_processed, 0));
         n_processed += n_tokens;
     }
+
+    llama_synchronize(ctx);
 }
 
 static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
-    llama_token token = llama_token_bos(llama_get_model(ctx));
-
     llama_set_n_threads(ctx, n_threads, n_threads);
 
+    llama_token token = llama_token_bos(llama_get_model(ctx));
+
     for (int i = 0; i < n_gen; i++) {
         llama_decode(ctx, llama_batch_get_one(&token, 1, n_past + i, 0));
+        llama_synchronize(ctx);
     }
 }
 
@@ -1203,7 +1234,8 @@ int main(int argc, char ** argv) {
 
         // warmup run
         if (t.n_prompt > 0) {
-            test_prompt(ctx, std::min(2, t.n_batch), 0, t.n_batch, t.n_threads);
+            //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
+            test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
         }
         if (t.n_gen > 0) {
            test_gen(ctx, 1, 0, t.n_threads);
@@ -1219,6 +1251,7 @@ int main(int argc, char ** argv) {
             if (t.n_gen > 0) {
                 test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads);
             }
+
             uint64_t t_ns = get_time_ns() - t_start;
             t.samples_ns.push_back(t_ns);
         }

+ 2 - 0
examples/llama.swiftui/llama.cpp.swift/LibLlama.swift

@@ -221,6 +221,7 @@ actor LlamaContext {
             if llama_decode(context, batch) != 0 {
                 print("llama_decode() failed during prompt")
             }
+            llama_synchronize(context)
 
             let t_pp_end = ggml_time_us()
 
@@ -240,6 +241,7 @@ actor LlamaContext {
                 if llama_decode(context, batch) != 0 {
                     print("llama_decode() failed during text generation")
                 }
+                llama_synchronize(context)
             }
 
             let t_tg_end = ggml_time_us()

+ 2 - 1
examples/perplexity/perplexity.cpp

@@ -589,9 +589,10 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
             }
         }
 
-        const auto t_end = std::chrono::high_resolution_clock::now();
 
         if (i == 0) {
+            llama_synchronize(ctx);
+            const auto t_end = std::chrono::high_resolution_clock::now();
             const float t_total = std::chrono::duration<float>(t_end - t_start).count();
             fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
             int total_seconds = (int)(t_total*n_chunk/n_seq);

+ 24 - 8
examples/server/server.cpp

@@ -147,7 +147,7 @@ struct server_slot {
     int32_t n_decoded   = 0;
     int32_t n_remaining = -1;
     int32_t i_batch     = -1;
-    int32_t n_predict   = -1;
+    int32_t n_predict   = -1; // TODO: disambiguate from params.n_predict
 
     int32_t n_prompt_tokens           = 0;
     int32_t n_prompt_tokens_processed = 0;
@@ -739,7 +739,13 @@ struct server_context {
         default_generation_settings_for_props = get_formated_generation(slots.front());
         default_generation_settings_for_props["seed"] = -1;
 
-        batch = llama_batch_init(n_ctx, 0, params.n_parallel);
+        // the update_slots() logic will always submit a maximum of n_batch tokens
+        // note that n_batch can be > n_ctx (e.g. for non-causal attention models such as BERT where the KV cache is not used)
+        {
+            const int32_t n_batch = llama_n_batch(ctx);
+
+            batch = llama_batch_init(n_batch, 0, params.n_parallel);
+        }
 
         metrics.init();
     }
@@ -1036,8 +1042,10 @@ struct server_context {
                 llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
             }
 
-            for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += params.n_batch) {
-                const int32_t n_tokens = std::min(params.n_batch, (int32_t) (batch.n_tokens - i));
+            const int32_t n_batch = llama_n_batch(ctx);
+
+            for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
+                const int32_t n_tokens = std::min(params.n_batch, batch.n_tokens - i);
                 llama_batch batch_view = {
                     n_tokens,
                     batch.token    + i,
@@ -1226,7 +1234,7 @@ struct server_context {
             {"mirostat_eta",              slot.sparams.mirostat_eta},
             {"penalize_nl",               slot.sparams.penalize_nl},
             {"stop",                      slot.params.antiprompt},
-            {"n_predict",                 slot.params.n_predict},
+            {"n_predict",                 slot.params.n_predict}, // TODO: fix duplicate key n_predict
             {"n_keep",                    params.n_keep},
             {"ignore_eos",                ignore_eos},
             {"stream",                    slot.params.stream},
@@ -1738,7 +1746,8 @@ struct server_context {
         }
 
         // process in chunks of params.n_batch
-        int32_t n_batch = params.n_batch;
+        int32_t n_batch = llama_n_batch(ctx);
+        int32_t n_ubatch = llama_n_ubatch(ctx);
 
         // next, batch any pending prompts without exceeding n_batch
         if (params.cont_batching || batch.n_tokens == 0) {
@@ -1811,7 +1820,7 @@ struct server_context {
 
                         if (slot.embedding) {
                             // this prompt is too large to process - discard it
-                            if (slot.n_prompt_tokens > n_batch) {
+                            if (slot.n_prompt_tokens > n_ubatch) {
                                 slot.state = SLOT_STATE_PROCESSING;
                                 slot.command = SLOT_COMMAND_NONE;
                                 slot.release();
@@ -2157,7 +2166,8 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
     printf("  --pooling {none,mean,cls} pooling type for embeddings, use model default if unspecified\n");
     printf("  -dt N, --defrag-thold N\n");
     printf("                            KV cache defragmentation threshold (default: %.1f, < 0 - disabled)\n", params.defrag_thold);
-    printf("  -b N, --batch-size N      batch size for prompt processing (default: %d)\n", params.n_batch);
+    printf("  -b N, --batch-size N      logical maximum batch size (default: %d)\n", params.n_batch);
+    printf("  -ub N, --ubatch-size N    physical maximum batch size (default: %d)\n", params.n_ubatch);
     printf("  --memory-f32              use f32 instead of f16 for memory key+value (default: disabled)\n");
     printf("                            not recommended: doubles context memory required and no measurable increase in quality\n");
     if (llama_supports_mlock()) {
@@ -2424,6 +2434,12 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
                 break;
             }
             params.n_batch = std::stoi(argv[i]);
+        } else if (arg == "-ub" || arg == "--ubatch-size") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_ubatch = std::stoi(argv[i]);
        } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
            if (++i >= argc) {
                invalid_param = true;

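The server hunks above stop assuming n_ctx when sizing the shared batch and instead ask the context for its logical and physical limits. A reduced sketch of that pattern outside of server.cpp, with placeholder names for everything that is not llama.cpp API:

    // Sketch: size a llama_batch from the context's logical batch limit and
    // reject embedding prompts by the physical limit, as the updated server does.
    #include "llama.h"

    void process(llama_context * ctx, int32_t n_parallel) {
        const int32_t n_batch  = llama_n_batch(ctx);  // logical maximum per llama_decode call
        const int32_t n_ubatch = llama_n_ubatch(ctx); // physical maximum per backend submission

        // the batch only ever holds up to n_batch tokens, so size it accordingly
        llama_batch batch = llama_batch_init(n_batch, 0, n_parallel);

        // ... fill the batch; for embedding slots, discard prompts longer than n_ubatch ...

        llama_batch_free(batch);
        (void) n_ubatch;
    }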
+ 1 - 0
examples/server/tests/features/embeddings.feature

@@ -9,6 +9,7 @@ Feature: llama.cpp server
     And   42 as server seed
     And   2 slots
     And   1024 as batch size
+    And   1024 as ubatch size
     And   2048 KV cache size
     And   embeddings extraction
     Then  the server is starting

+ 8 - 0
examples/server/tests/features/steps/steps.py

@@ -33,6 +33,7 @@ def step_server_config(context, server_fqdn, server_port):
 
 
     context.model_alias = None
     context.n_batch = None
+    context.n_ubatch = None
     context.n_ctx = None
     context.n_ga = None
     context.n_ga_w = None
@@ -278,6 +279,11 @@ def step_n_batch(context, n_batch):
     context.n_batch = n_batch
 
 
+@step('{n_ubatch:d} as ubatch size')
+def step_n_ubatch(context, n_ubatch):
+    context.n_ubatch = n_ubatch
+
+
 @step('{seed:d} as seed')
 def step_seed(context, seed):
     context.seed = seed
@@ -1029,6 +1035,8 @@ def start_server_background(context):
     ]
     if context.n_batch:
         server_args.extend(['--batch-size', context.n_batch])
+    if context.n_ubatch:
+        server_args.extend(['--ubatch-size', context.n_ubatch])
     if context.n_gpu_layer:
         server_args.extend(['--n-gpu-layers', context.n_gpu_layer])
     if context.server_continuous_batching:

+ 45 - 64
ggml-alloc.c

@@ -61,7 +61,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
     }
     }
 }
 }
 
 
-// TODO: GGML_PAD ?
 static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
 static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
     assert(alignment && !(alignment & (alignment - 1))); // power of 2
     assert(alignment && !(alignment & (alignment - 1))); // power of 2
     size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment;
     size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment;
@@ -69,25 +68,14 @@ static size_t aligned_offset(const void * buffer, size_t offset, size_t alignmen
 }
 }
 
 
 // tallocr
 // tallocr
-struct ggml_tallocr {
-    ggml_backend_buffer_t buffer;
-    void * base;
-    size_t alignment;
-    size_t offset;
-};
-
-ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer) {
-    ggml_tallocr_t talloc = malloc(sizeof(struct ggml_tallocr));
-    if (talloc == NULL) {
-        return NULL;
-    }
 
 
+struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer) {
     void * base = ggml_backend_buffer_get_base(buffer);
     void * base = ggml_backend_buffer_get_base(buffer);
     size_t align = ggml_backend_buffer_get_alignment(buffer);
     size_t align = ggml_backend_buffer_get_alignment(buffer);
 
 
     assert(align && !(align & (align - 1))); // power of 2
     assert(align && !(align & (align - 1))); // power of 2
 
 
-    *talloc = (struct ggml_tallocr) {
+    struct ggml_tallocr talloc = (struct ggml_tallocr) {
         /*.buffer    = */ buffer,
         /*.buffer    = */ buffer,
         /*.base      = */ base,
         /*.base      = */ base,
         /*.alignment = */ align,
         /*.alignment = */ align,
@@ -96,11 +84,7 @@ ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer) {
     return talloc;
     return talloc;
 }
 }
 
 
-void ggml_tallocr_free(ggml_tallocr_t talloc) {
-    free(talloc);
-}
-
-void ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor) {
+void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor) {
     size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor);
     size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor);
     size = GGML_PAD(size, talloc->alignment);
     size = GGML_PAD(size, talloc->alignment);
 
 
@@ -354,12 +338,16 @@ struct hash_node {
     bool allocated;
     bool allocated;
 };
 };
 
 
-//
 struct tensor_alloc {
 struct tensor_alloc {
     size_t offset;
     size_t offset;
     size_t size_max; // 0 = pre-allocated, unused, or view
     size_t size_max; // 0 = pre-allocated, unused, or view
 };
 };
 
 
+struct leaf_alloc {
+    int buffer_id;
+    struct tensor_alloc leaf;
+};
+
 struct node_alloc {
 struct node_alloc {
     int buffer_id;
     int buffer_id;
     struct tensor_alloc dst;
     struct tensor_alloc dst;
@@ -378,7 +366,7 @@ struct ggml_gallocr {
     struct node_alloc * node_allocs; // [n_nodes]
     struct node_alloc * node_allocs; // [n_nodes]
     int n_nodes;
     int n_nodes;
 
 
-    struct tensor_alloc * leaf_allocs; // [n_leafs]
+    struct leaf_alloc * leaf_allocs; // [n_leafs]
     int n_leafs;
     int n_leafs;
 };
 };
 
 
@@ -543,13 +531,20 @@ static int get_node_buffer_id(const int * node_buffer_ids, int i) {
     return node_buffer_ids ? node_buffer_ids[i] : 0;
     return node_buffer_ids ? node_buffer_ids[i] : 0;
 }
 }
 
 
-static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
+static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
     // clear hash tables
     // clear hash tables
     memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
     memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
     memset(galloc->hash_values,   0, galloc->hash_set.size * sizeof(struct hash_node));
     memset(galloc->hash_values,   0, galloc->hash_set.size * sizeof(struct hash_node));
 
 
+    // allocate leafs
+    // these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        ggml_gallocr_allocate_node(galloc, leaf, get_node_buffer_id(leaf_buffer_ids, i));
+    }
+
     // count number of children and views
     // count number of children and views
-    // allocate all graph inputs and leafs first to avoid overwriting them
+    // allocate other graph inputs and leafs first to avoid overwriting them
     for (int i = 0; i < graph->n_nodes; i++) {
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         struct ggml_tensor * node = graph->nodes[i];
 
 
@@ -577,19 +572,6 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
         }
         }
     }
     }
 
 
-    // allocate the remaining leafs that are unused on the graph
-    // these are effectively static tensors that the application is not using in the graph, but may still want to allocate for other purposes
-    for (int i = 0; i < graph->n_leafs; i++) {
-        struct ggml_tensor * leaf = graph->leafs[i];
-        struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
-
-        if (hn->n_children == 0) {
-            assert(!hn->allocated);
-            // since buffer ids are only given for nodes, these leafs are always allocated in the first buffer
-            ggml_gallocr_allocate_node(galloc, leaf, 0);
-        }
-    }
-
     // allocate tensors
     // allocate tensors
     for (int i = 0; i < graph->n_nodes; i++) {
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         struct ggml_tensor * node = graph->nodes[i];
@@ -652,7 +634,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
     }
     }
 }
 }
 
 
-bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
+bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
     size_t hash_size = graph->visited_hash_table.size;
     size_t hash_size = graph->visited_hash_table.size;
 
 
     // initialize hash table
     // initialize hash table
@@ -676,7 +658,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
     }
     }
 
 
     // allocate in hash table
     // allocate in hash table
-    ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids);
+    ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids);
 
 
     // set the node_allocs from the hash table
     // set the node_allocs from the hash table
     if (galloc->n_nodes < graph->n_nodes) {
     if (galloc->n_nodes < graph->n_nodes) {
@@ -711,15 +693,16 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
     }
     }
     if (galloc->n_leafs < graph->n_leafs) {
     if (galloc->n_leafs < graph->n_leafs) {
         free(galloc->leaf_allocs);
         free(galloc->leaf_allocs);
-        galloc->leaf_allocs = calloc(sizeof(struct tensor_alloc), graph->n_leafs);
+        galloc->leaf_allocs = calloc(sizeof(galloc->leaf_allocs[0]), graph->n_leafs);
         GGML_ASSERT(galloc->leaf_allocs != NULL);
         GGML_ASSERT(galloc->leaf_allocs != NULL);
     }
     }
     galloc->n_leafs = graph->n_leafs;
     galloc->n_leafs = graph->n_leafs;
     for (int i = 0; i < graph->n_leafs; i++) {
     for (int i = 0; i < graph->n_leafs; i++) {
         struct ggml_tensor * leaf = graph->leafs[i];
         struct ggml_tensor * leaf = graph->leafs[i];
         struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
         struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
-        galloc->leaf_allocs[i].offset = hn->offset;
-        galloc->leaf_allocs[i].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
+        galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
+        galloc->leaf_allocs[i].leaf.offset = hn->offset;
+        galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
     }
     }
 
 
     // reallocate buffers if needed
     // reallocate buffers if needed
@@ -727,7 +710,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
         size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
         size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
         size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
         size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
 
 
-        if (new_size > cur_size) {
+        // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
+        if (new_size > cur_size || galloc->buffers[i] == NULL) {
 #ifndef NDEBUG
 #ifndef NDEBUG
             fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
             fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
 #endif
 #endif
@@ -744,30 +728,30 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
 }
 }
 
 
 bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
 bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
-    return ggml_gallocr_reserve_n(galloc, graph, NULL);
+    return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
 }
 }
 
 
-static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, struct tensor_alloc * tensor_alloc) {
-    assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
+static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, int buffer_id, struct tensor_alloc * tensor_alloc) {
+    assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
 
 
-    if (node->view_src != NULL) {
-        if (node->buffer == NULL) {
+    if (tensor->view_src != NULL) {
+        if (tensor->buffer == NULL) {
             assert(tensor_alloc->offset == SIZE_MAX);
             assert(tensor_alloc->offset == SIZE_MAX);
-            if (node->view_src->buffer == NULL) {
+            if (tensor->view_src->buffer == NULL) {
                 // this tensor was allocated without ggml-backend
                 // this tensor was allocated without ggml-backend
                 return;
                 return;
             }
             }
-            ggml_backend_view_init(galloc->buffers[buffer_id], node);
+            ggml_backend_view_init(galloc->buffers[buffer_id], tensor);
         }
         }
     } else {
     } else {
-        if (node->data == NULL) {
+        if (tensor->data == NULL) {
             assert(tensor_alloc->offset != SIZE_MAX);
             assert(tensor_alloc->offset != SIZE_MAX);
-            assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
+            assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
             void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
             void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
             void * addr = (char *)base + tensor_alloc->offset;
             void * addr = (char *)base + tensor_alloc->offset;
-            ggml_backend_tensor_alloc(galloc->buffers[buffer_id], node, addr);
+            ggml_backend_tensor_alloc(galloc->buffers[buffer_id], tensor, addr);
         } else {
         } else {
-            if (node->buffer == NULL) {
+            if (tensor->buffer == NULL) {
                 // this tensor was allocated without ggml-backend
                 // this tensor was allocated without ggml-backend
                 return;
                 return;
             }
             }
@@ -843,13 +827,18 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
 
 
     // reset buffers
     // reset buffers
     for (int i = 0; i < galloc->n_buffers; i++) {
     for (int i = 0; i < galloc->n_buffers; i++) {
-        // zero size buffers are not allocated
         if (galloc->buffers[i] != NULL) {
         if (galloc->buffers[i] != NULL) {
             ggml_backend_buffer_reset(galloc->buffers[i]);
             ggml_backend_buffer_reset(galloc->buffers[i]);
         }
         }
     }
     }
 
 
     // allocate the graph tensors from the previous assignments
     // allocate the graph tensors from the previous assignments
+    // leafs
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i];
+        ggml_gallocr_init_tensor(galloc, leaf, leaf_alloc->buffer_id, &leaf_alloc->leaf);
+    }
     // nodes
     // nodes
     for (int i = 0; i < graph->n_nodes; i++) {
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         struct ggml_tensor * node = graph->nodes[i];
@@ -863,12 +852,6 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
         }
         }
         ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
         ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
     }
     }
-    // leafs
-    for (int i = 0; i < graph->n_leafs; i++) {
-        struct ggml_tensor * leaf = graph->leafs[i];
-        struct tensor_alloc * leaf_alloc = &galloc->leaf_allocs[i];
-        ggml_gallocr_init_tensor(galloc, leaf, 0, leaf_alloc);
-    }
 
 
     return true;
     return true;
 }
 }
@@ -900,12 +883,12 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
         return false;
         return false;
     }
     }
 
 
-    struct ggml_tallocr * tallocr = ggml_tallocr_new(buffer);
+    struct ggml_tallocr tallocr = ggml_tallocr_new(buffer);
 
 
     for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
     for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
         if (t->data == NULL) {
         if (t->data == NULL) {
             if (t->view_src == NULL) {
             if (t->view_src == NULL) {
-                ggml_tallocr_alloc(tallocr, t);
+                ggml_tallocr_alloc(&tallocr, t);
             } else if (t->buffer == NULL) {
             } else if (t->buffer == NULL) {
                 ggml_backend_view_init(buffer, t);
                 ggml_backend_view_init(buffer, t);
             }
             }
@@ -917,8 +900,6 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
         }
         }
     }
     }
 
 
-    ggml_tallocr_free(tallocr);
-
     *buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
     *buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
     (*buffers)[(*n_buffers)++] = buffer;
     (*buffers)[(*n_buffers)++] = buffer;
 
 

+ 13 - 5
ggml-alloc.h

@@ -11,11 +11,15 @@ typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
 typedef struct ggml_backend * ggml_backend_t;
 
 // Tensor allocator
-typedef struct ggml_tallocr * ggml_tallocr_t;
+struct ggml_tallocr {
+    ggml_backend_buffer_t buffer;
+    void * base;
+    size_t alignment;
+    size_t offset;
+};
 
-GGML_API ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer);
-GGML_API void           ggml_tallocr_free(ggml_tallocr_t talloc);
-GGML_API void           ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor);
+GGML_API struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer);
+GGML_API void                ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor);
 
 // Graph allocator
 /*
@@ -50,7 +54,11 @@ GGML_API void           ggml_gallocr_free(ggml_gallocr_t galloc);
 // not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
 // returns false if the buffer allocation failed
 GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
-GGML_API bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids);
+GGML_API bool ggml_gallocr_reserve_n(
+    ggml_gallocr_t galloc,
+    struct ggml_cgraph * graph,
+    const int * node_buffer_ids,
+    const int * leaf_buffer_ids);
 
 // automatic reallocation if the topology changes when using a single buffer
 // returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers)

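With the header change above, ggml_tallocr is now a plain value type: ggml_tallocr_new returns the struct directly and there is no ggml_tallocr_free anymore. A minimal usage sketch mirroring the updated alloc_tensor_range call site in ggml-alloc.c; the buffer and tensor are assumed to already exist.

    // Sketch: allocate a tensor from a backend buffer with the new by-value API.
    #include "ggml-alloc.h"
    #include "ggml-backend.h"

    void place_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * t) {
        // the allocator lives on the stack now; nothing to free afterwards
        struct ggml_tallocr talloc = ggml_tallocr_new(buffer);

        ggml_tallocr_alloc(&talloc, t);
    }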
+ 14 - 3
ggml-backend-impl.h

@@ -86,12 +86,12 @@ extern "C" {
         // (optional) asynchronous tensor data access
         // (optional) asynchronous tensor data access
         void (*GGML_CALL set_tensor_async)(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
         void (*GGML_CALL set_tensor_async)(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
         void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
         void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-        bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst);
+        bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
 
 
         // (optional) complete all pending operations
         // (optional) complete all pending operations
         void (*GGML_CALL synchronize)(ggml_backend_t backend);
         void (*GGML_CALL synchronize)(ggml_backend_t backend);
 
 
-        // create a plan for ggml_cgraph and free it
+        // compute graph with a plan (not used currently)
         ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
         ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
         void                      (*GGML_CALL graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
         void                      (*GGML_CALL graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
 
 
@@ -102,16 +102,27 @@ extern "C" {
 
 
         // check if the backend supports an operation
         // check if the backend supports an operation
         bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
         bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+
+        // (optional) event synchronization
+        ggml_backend_event_t (*GGML_CALL event_new)         (ggml_backend_t backend);
+        void                 (*GGML_CALL event_free)        (ggml_backend_event_t event);
+        void                 (*GGML_CALL event_record)      (ggml_backend_event_t event);
+        void                 (*GGML_CALL event_wait)        (ggml_backend_t backend, ggml_backend_event_t event);
+        void                 (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
     };
     };
 
 
     struct ggml_backend {
     struct ggml_backend {
         ggml_guid_t guid;
         ggml_guid_t guid;
 
 
         struct ggml_backend_i iface;
         struct ggml_backend_i iface;
-
         ggml_backend_context_t context;
         ggml_backend_context_t context;
     };
     };
 
 
+    struct ggml_backend_event {
+        ggml_backend_t backend;
+        void * context;
+    };
+
     //
     //
     // Backend registry
     // Backend registry
     //
     //
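
The event members of ggml_backend_i are optional: backends that do not implement them leave the pointers NULL, and the public wrappers added in ggml-backend.c below handle that case. A hedged sketch of using those wrappers to order work between two backends, falling back to a blocking synchronize when events are unavailable (the helper name is illustrative):

    #include "ggml-backend.h"

    // `event` is assumed to have been created once with ggml_backend_event_new(producer);
    // it is NULL when the producer backend does not implement events
    static void wait_for_producer(ggml_backend_t producer, ggml_backend_t consumer, ggml_backend_event_t event) {
        if (event != NULL) {
            ggml_backend_event_record(event);         // capture the current tail of the producer's queue
            ggml_backend_event_wait(consumer, event); // the consumer waits on the device, the host is not blocked
        } else {
            ggml_backend_synchronize(producer);       // fallback: blocking host-side synchronization
        }
    }
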

+ 358 - 135
ggml-backend.c

@@ -221,29 +221,29 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten
 GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
 GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
 
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(buf != NULL && "tensor buffer not set");
     GGML_ASSERT(buf != NULL && "tensor buffer not set");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
 
 
     if (!size) {
     if (!size) {
         return;
         return;
     }
     }
 
 
-    tensor->buffer->iface.set_tensor(buf, tensor, data, offset, size);
+    buf->iface.set_tensor(buf, tensor, data, offset, size);
 }
 }
 
 
 GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
 GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
 
+    GGML_ASSERT(buf != NULL && "tensor buffer not set");
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-    GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
 
 
     if (!size) {
     if (!size) {
         return;
         return;
     }
     }
 
 
-    tensor->buffer->iface.get_tensor(buf, tensor, data, offset, size);
+    buf->iface.get_tensor(buf, tensor, data, offset, size);
 }
 }
 
 
 void ggml_backend_synchronize(ggml_backend_t backend) {
 void ggml_backend_synchronize(ggml_backend_t backend) {
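
For context on why the buffer is resolved through view_src above: writing through a view must reach the buffer that actually owns the storage. A small sketch, assuming `base` is an F32 tensor with at least 256 elements that is already allocated in a backend buffer:

    #include "ggml.h"
    #include "ggml-backend.h"

    // the view shares the storage of `base`, so the write lands in base's buffer
    static void set_through_view(struct ggml_context * ctx, struct ggml_tensor * base) {
        struct ggml_tensor * row = ggml_view_1d(ctx, base, 256, 0);
        float data[256] = {0};
        ggml_backend_tensor_set(row, data, 0, sizeof(data));
    }
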
@@ -255,18 +255,30 @@ void ggml_backend_synchronize(ggml_backend_t backend) {
 }
 }
 
 
 ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
 ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    GGML_ASSERT(backend->iface.graph_plan_create != NULL);
+
     return backend->iface.graph_plan_create(backend, cgraph);
     return backend->iface.graph_plan_create(backend, cgraph);
 }
 }
 
 
 void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
 void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    GGML_ASSERT(backend->iface.graph_plan_free != NULL);
+
     backend->iface.graph_plan_free(backend, plan);
     backend->iface.graph_plan_free(backend, plan);
 }
 }
 
 
 enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
 enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    GGML_ASSERT(backend->iface.graph_plan_compute != NULL);
+
     return backend->iface.graph_plan_compute(backend, plan);
     return backend->iface.graph_plan_compute(backend, plan);
 }
 }
 
 
 enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
 enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph);
+    ggml_backend_synchronize(backend);
+    return err;
+}
+
+enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     return backend->iface.graph_compute(backend, cgraph);
     return backend->iface.graph_compute(backend, cgraph);
 }
 }
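
ggml_backend_graph_compute() keeps its blocking behaviour by delegating to the new asynchronous variant and then synchronizing. Callers that want to overlap host work with device compute can use the split directly; a sketch (the helper name is illustrative):

    #include "ggml-backend.h"

    static enum ggml_status run_async(ggml_backend_t backend, struct ggml_cgraph * graph) {
        enum ggml_status status = ggml_backend_graph_compute_async(backend, graph);
        if (status == GGML_STATUS_SUCCESS) {
            // ... unrelated host-side work can run here while the backend computes ...
            ggml_backend_synchronize(backend); // results are only valid after this point
        }
        return status;
    }
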
 
 
@@ -314,34 +326,68 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
     }
     }
 }
 }
 
 
-void ggml_backend_tensor_copy_async(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
+void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst) {
     GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
     GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
 
 
     if (src == dst) {
     if (src == dst) {
         return;
         return;
     }
     }
 
 
-    if (ggml_backend_buft_supports_backend(src->buffer->buft, backend) && ggml_backend_buft_supports_backend(dst->buffer->buft, backend)) {
-        if (backend->iface.cpy_tensor_async != NULL) {
-            if (backend->iface.cpy_tensor_async(backend, src, dst)) {
-                return;
-            }
+    if (backend_dst->iface.cpy_tensor_async != NULL) {
+        if (backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) {
+            return;
         }
         }
     }
     }
 
 
-    size_t nbytes = ggml_nbytes(src);
+    // an async copy would normally happen after all the queued operations on both backends are completed
+    // sync src, set_async dst
     if (ggml_backend_buffer_is_host(src->buffer)) {
     if (ggml_backend_buffer_is_host(src->buffer)) {
-        ggml_backend_tensor_set_async(backend, dst, src->data, 0, nbytes);
-    }
-    else {
+        ggml_backend_synchronize(backend_src);
+        ggml_backend_tensor_set_async(backend_dst, dst, src->data, 0, ggml_nbytes(src));
+    } else {
+        ggml_backend_synchronize(backend_src);
         ggml_backend_tensor_copy(src, dst);
         ggml_backend_tensor_copy(src, dst);
+        ggml_backend_synchronize(backend_dst);
+    }
+}
+
+// events
+
+ggml_backend_event_t ggml_backend_event_new(ggml_backend_t backend) {
+    if (backend->iface.event_new == NULL) {
+        return NULL;
+    }
+    return backend->iface.event_new(backend);
+}
+
+void ggml_backend_event_free(ggml_backend_event_t event) {
+    if (event == NULL) {
+        return;
     }
     }
+    event->backend->iface.event_free(event);
+}
+
+void ggml_backend_event_record(ggml_backend_event_t event) {
+    GGML_ASSERT(event->backend->iface.event_record != NULL);
+
+    event->backend->iface.event_record(event);
+}
+
+void ggml_backend_event_synchronize(ggml_backend_event_t event) {
+    GGML_ASSERT(event->backend->iface.event_synchronize != NULL);
+
+    event->backend->iface.event_synchronize(event);
 }
 }
 
 
+void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
+    GGML_ASSERT(backend->iface.event_wait != NULL);
+
+    backend->iface.event_wait(backend, event);
+}
 
 
 // backend registry
 // backend registry
 
 
-#define GGML_MAX_BACKENDS_REG 16
+#define GGML_REG_MAX_BACKENDS 16
 
 
 struct ggml_backend_reg {
 struct ggml_backend_reg {
     char name[128];
     char name[128];
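
With the signature change above, an asynchronous copy names both endpoints explicitly, and ggml_backend_tensor_copy_async() falls back to a synchronized copy when the destination backend does not implement cpy_tensor_async. A sketch, assuming src and dst have the same layout and are already allocated on their respective backends:

    #include "ggml-backend.h"

    static void copy_between_backends(ggml_backend_t backend_src, ggml_backend_t backend_dst,
                                      struct ggml_tensor * src, struct ggml_tensor * dst) {
        ggml_backend_tensor_copy_async(backend_src, backend_dst, src, dst); // queued, returns immediately
        ggml_backend_synchronize(backend_dst);                              // wait only when dst is actually needed
    }
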
@@ -350,7 +396,7 @@ struct ggml_backend_reg {
     void * user_data;
     void * user_data;
 };
 };
 
 
-static struct ggml_backend_reg ggml_backend_registry[GGML_MAX_BACKENDS_REG];
+static struct ggml_backend_reg ggml_backend_registry[GGML_REG_MAX_BACKENDS];
 static size_t ggml_backend_registry_count = 0;
 static size_t ggml_backend_registry_count = 0;
 
 
 GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
 GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
@@ -395,7 +441,7 @@ GGML_CALL static void ggml_backend_registry_init(void) {
 }
 }
 
 
 GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
 GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
-    GGML_ASSERT(ggml_backend_registry_count < GGML_MAX_BACKENDS_REG);
+    GGML_ASSERT(ggml_backend_registry_count < GGML_REG_MAX_BACKENDS);
 
 
     size_t id = ggml_backend_registry_count;
     size_t id = ggml_backend_registry_count;
 
 
@@ -746,8 +792,12 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t
     struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
     struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
 
 
     if (cpu_ctx->work_size < cplan.work_size) {
     if (cpu_ctx->work_size < cplan.work_size) {
-        // TODO: may be faster to free and use malloc to avoid the copy
-        cpu_ctx->work_data = realloc(cpu_ctx->work_data, cplan.work_size);
+        free(cpu_ctx->work_data);
+        cpu_ctx->work_data = malloc(cplan.work_size);
+        if (cpu_ctx->work_data == NULL) {
+            cpu_ctx->work_size = 0;
+            return GGML_STATUS_ALLOC_FAILED;
+        }
         cpu_ctx->work_size = cplan.work_size;
         cpu_ctx->work_size = cplan.work_size;
     }
     }
     cplan.work_data = cpu_ctx->work_data;
     cplan.work_data = cpu_ctx->work_data;
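
work_data only holds scratch space, so its old contents never need to be preserved; replacing realloc() with free()+malloc() avoids that copy, and a failed allocation is now reported instead of being dereferenced later. A sketch of checking for it at the call site (the helper name is illustrative):

    #include <stdbool.h>
    #include <stdio.h>
    #include "ggml-backend.h"

    static bool compute_checked(ggml_backend_t backend, struct ggml_cgraph * graph) {
        enum ggml_status status = ggml_backend_graph_compute(backend, graph);
        if (status == GGML_STATUS_ALLOC_FAILED) {
            fprintf(stderr, "compute failed: could not allocate the work buffer\n");
            return false;
        }
        return status == GGML_STATUS_SUCCESS;
    }
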
@@ -784,6 +834,11 @@ static struct ggml_backend_i cpu_backend_i = {
     /* .graph_plan_compute      = */ ggml_backend_cpu_graph_plan_compute,
     /* .graph_plan_compute      = */ ggml_backend_cpu_graph_plan_compute,
     /* .graph_compute           = */ ggml_backend_cpu_graph_compute,
     /* .graph_compute           = */ ggml_backend_cpu_graph_compute,
     /* .supports_op             = */ ggml_backend_cpu_supports_op,
     /* .supports_op             = */ ggml_backend_cpu_supports_op,
+    /* .event_new               = */ NULL,
+    /* .event_free              = */ NULL,
+    /* .event_record            = */ NULL,
+    /* .event_wait              = */ NULL,
+    /* .event_synchronize       = */ NULL,
 };
 };
 
 
 static ggml_guid_t ggml_backend_cpu_guid(void) {
 static ggml_guid_t ggml_backend_cpu_guid(void) {
@@ -939,15 +994,27 @@ static bool ggml_is_view_op(enum ggml_op op) {
 
 
 // scheduler
 // scheduler
 
 
-#define GGML_MAX_BACKENDS 16
-#define GGML_MAX_SPLITS 256
-#define GGML_MAX_SPLIT_INPUTS 16
+#ifndef GGML_SCHED_MAX_BACKENDS
+#define GGML_SCHED_MAX_BACKENDS 16
+#endif
+
+#ifndef GGML_SCHED_MAX_SPLITS
+#define GGML_SCHED_MAX_SPLITS 256
+#endif
+
+#ifndef GGML_SCHED_MAX_SPLIT_INPUTS
+#define GGML_SCHED_MAX_SPLIT_INPUTS 16
+#endif
+
+#ifndef GGML_SCHED_MAX_COPIES
+#define GGML_SCHED_MAX_COPIES 4
+#endif
 
 
 struct ggml_backend_sched_split {
 struct ggml_backend_sched_split {
     int backend_id;
     int backend_id;
     int i_start;
     int i_start;
     int i_end;
     int i_end;
-    struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS];
+    struct ggml_tensor * inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
     int n_inputs;
     int n_inputs;
     // graph view of this split
     // graph view of this split
     struct ggml_cgraph graph;
     struct ggml_cgraph graph;
@@ -955,45 +1022,53 @@ struct ggml_backend_sched_split {
 
 
 struct ggml_backend_sched {
 struct ggml_backend_sched {
     bool is_reset; // true if the scheduler has been reset since the last graph split
     bool is_reset; // true if the scheduler has been reset since the last graph split
+    bool is_alloc;
 
 
     int n_backends;
     int n_backends;
-    ggml_backend_t backends[GGML_MAX_BACKENDS];
-    ggml_backend_buffer_type_t bufts[GGML_MAX_BACKENDS];
 
 
+    ggml_backend_t backends[GGML_SCHED_MAX_BACKENDS];
+    ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS];
     ggml_gallocr_t galloc;
     ggml_gallocr_t galloc;
 
 
     // hash keys of the nodes in the graph
     // hash keys of the nodes in the graph
     struct ggml_hash_set    hash_set;
     struct ggml_hash_set    hash_set;
     // hash values
     // hash values
     int * tensor_backend_id;
     int * tensor_backend_id;
-    struct ggml_tensor * (* tensor_copies)[GGML_MAX_BACKENDS];
+    struct ggml_tensor * (* tensor_copies)[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
 
 
-    int * node_backend_ids; // [n_nodes]
-    int n_nodes;
+    int * node_backend_ids; // [graph_size]
+    int * leaf_backend_ids; // [graph_size]
 
 
     // copy of the graph with modified inputs
     // copy of the graph with modified inputs
     struct ggml_cgraph * graph;
     struct ggml_cgraph * graph;
 
 
-    struct ggml_backend_sched_split splits[GGML_MAX_SPLITS];
+    // graph splits
+    struct ggml_backend_sched_split splits[GGML_SCHED_MAX_SPLITS];
     int n_splits;
     int n_splits;
 
 
+    // pipeline parallelism support
+    int n_copies;
+    int cur_copy;
+    ggml_backend_event_t events[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
+    struct ggml_tensor * graph_inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
+    int n_graph_inputs;
+
     struct ggml_context * ctx;
     struct ggml_context * ctx;
 
 
     ggml_backend_sched_eval_callback callback_eval;
     ggml_backend_sched_eval_callback callback_eval;
     void * callback_eval_user_data;
     void * callback_eval_user_data;
 
 
     // align context_buffer to GGML_MEM_ALIGN
     // align context_buffer to GGML_MEM_ALIGN
-    #ifdef _MSC_VER
+#ifdef _MSC_VER
     __declspec(align(GGML_MEM_ALIGN))
     __declspec(align(GGML_MEM_ALIGN))
-    #else
+#else
     __attribute__((aligned(GGML_MEM_ALIGN)))
     __attribute__((aligned(GGML_MEM_ALIGN)))
-    #endif
-    char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
+#endif
+    char context_buffer[GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
 };
 };
 
 
-#define hash_id(node) ggml_hash_find_or_insert(sched->hash_set, node)
-#define tensor_backend_id(node) sched->tensor_backend_id[hash_id(node)]
-#define tensor_backend(node) (tensor_backend_id(node) == -1 ? NULL : sched->backends[tensor_backend_id(node)])
+#define hash_id(tensor) ggml_hash_find_or_insert(sched->hash_set, tensor)
+#define tensor_backend_id(tensor) sched->tensor_backend_id[hash_id(tensor)]
 
 
 // returns the priority of the backend, lower id is higher priority
 // returns the priority of the backend, lower id is higher priority
 static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
 static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
@@ -1005,7 +1080,8 @@ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backen
     return -1;
     return -1;
 }
 }
 
 
-static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) {
+static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor) {
+    ggml_backend_buffer_t buffer = tensor->buffer;
     if (buffer == NULL) {
     if (buffer == NULL) {
         return -1;
         return -1;
     }
     }
@@ -1016,12 +1092,16 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, gg
             return i;
             return i;
         }
         }
     }
     }
-    GGML_ASSERT(false && "tensor buffer type not supported by any backend");
-    return -1; // silence warning
+
+    fprintf(stderr, "%s: error: no backend supports buffer type %s used in tensor %s\n",
+        __func__, ggml_backend_buffer_name(buffer), tensor->name);
+    GGML_ASSERT(false);
+
+    return -1;
 }
 }
 
 
 #if 0
 #if 0
-static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug only
+static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
 #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
 #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
 #define GET_CAUSE(node) causes[hash_id(node)]
 #define GET_CAUSE(node) causes[hash_id(node)]
 #else
 #else
@@ -1035,19 +1115,28 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
 
 
     // assign pre-allocated nodes to their backend
     // assign pre-allocated nodes to their backend
     // dst
     // dst
-    int cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->buffer);
+    int cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor);
     if (cur_backend != -1) {
     if (cur_backend != -1) {
-        SET_CAUSE(node, "1.dst");
+        SET_CAUSE(tensor, "1.dst");
         return cur_backend;
         return cur_backend;
     }
     }
+
     // view_src
     // view_src
     if (tensor->view_src != NULL) {
     if (tensor->view_src != NULL) {
-        cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src->buffer);
+        cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
         if (cur_backend != -1) {
         if (cur_backend != -1) {
-            SET_CAUSE(node, "1.vsrc");
+            SET_CAUSE(tensor, "1.vsrc");
             return cur_backend;
             return cur_backend;
         }
         }
     }
     }
+
+    // input
+    if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
+        cur_backend = sched->n_backends - 1; // last backend (assumed CPU)
+        SET_CAUSE(tensor, "1.inp");
+        return cur_backend;
+    }
+
     // assign nodes that use weights to the backend of the weights
     // assign nodes that use weights to the backend of the weights
     for (int i = 0; i < GGML_MAX_SRC; i++) {
     for (int i = 0; i < GGML_MAX_SRC; i++) {
         const struct ggml_tensor * src = tensor->src[i];
         const struct ggml_tensor * src = tensor->src[i];
@@ -1055,9 +1144,9 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
             continue;
             continue;
         }
         }
         if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
         if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
-            int src_backend = ggml_backend_sched_backend_from_buffer(sched, src->buffer);
+            int src_backend = ggml_backend_sched_backend_from_buffer(sched, src);
             // operations with weights are always run on the same backend as the weights
             // operations with weights are always run on the same backend as the weights
-            SET_CAUSE(node, "1.wgt%d", i);
+            SET_CAUSE(tensor, "1.wgt%d", i);
             return src_backend;
             return src_backend;
         }
         }
     }
     }
@@ -1093,7 +1182,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
         if (ggml_is_view_op(node->op)) {
         if (ggml_is_view_op(node->op)) {
             continue;
             continue;
         }
         }
-        ggml_backend_t tensor_backend = tensor_backend(node);
+        ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
         fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
         fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
             fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
             fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
         for (int j = 0; j < GGML_MAX_SRC; j++) {
         for (int j = 0; j < GGML_MAX_SRC; j++) {
@@ -1101,7 +1190,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
             if (src == NULL) {
             if (src == NULL) {
                 continue;
                 continue;
             }
             }
-            ggml_backend_t src_backend = tensor_backend(src);
+            ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
             fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
             fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
                 fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
                 fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
         }
         }
@@ -1118,6 +1207,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
 static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
 static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
     // reset splits
     // reset splits
     sched->n_splits = 0;
     sched->n_splits = 0;
+    sched->n_graph_inputs = 0;
     sched->is_reset = false;
     sched->is_reset = false;
 
 
     struct ggml_init_params params = {
     struct ggml_init_params params = {
@@ -1163,7 +1253,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         }
         }
     }
     }
 #ifdef DEBUG_PASS1
 #ifdef DEBUG_PASS1
-    fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+    fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
 #endif
 #endif
 
 
     // pass 2: expand current backend assignments
     // pass 2: expand current backend assignments
@@ -1171,10 +1261,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
     // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
     // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
     // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
     // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
 
 
-    // pass 2.1 expand gpu up
+
+    // pass 2.2 expand gpu down
     {
     {
         int cur_backend_id = -1;
         int cur_backend_id = -1;
-        for (int i = graph->n_nodes - 1; i >= 0; i--) {
+        for (int i = 0; i < graph->n_nodes; i++) {
             struct ggml_tensor * node = graph->nodes[i];
             struct ggml_tensor * node = graph->nodes[i];
             if (ggml_is_view_op(node->op)) {
             if (ggml_is_view_op(node->op)) {
                 continue;
                 continue;
@@ -1189,15 +1280,15 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 }
                 }
             } else {
             } else {
                 tensor_backend_id(node) = cur_backend_id;
                 tensor_backend_id(node) = cur_backend_id;
-                SET_CAUSE(node, "2.1");
+                SET_CAUSE(node, "2.2");
             }
             }
         }
         }
     }
     }
 
 
-    // pass 2.2 expand gpu down
+    // pass 2.1 expand gpu up
     {
     {
         int cur_backend_id = -1;
         int cur_backend_id = -1;
-        for (int i = 0; i < graph->n_nodes; i++) {
+        for (int i = graph->n_nodes - 1; i >= 0; i--) {
             struct ggml_tensor * node = graph->nodes[i];
             struct ggml_tensor * node = graph->nodes[i];
             if (ggml_is_view_op(node->op)) {
             if (ggml_is_view_op(node->op)) {
                 continue;
                 continue;
@@ -1212,15 +1303,16 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 }
                 }
             } else {
             } else {
                 tensor_backend_id(node) = cur_backend_id;
                 tensor_backend_id(node) = cur_backend_id;
-                SET_CAUSE(node, "2.2");
+                SET_CAUSE(node, "2.1");
             }
             }
         }
         }
     }
     }
 
 
-    // pass 2.3 expand rest up
+
+    // pass 2.4 expand rest down
     {
     {
         int cur_backend_id = -1;
         int cur_backend_id = -1;
-        for (int i = graph->n_nodes - 1; i >= 0; i--) {
+        for (int i = 0; i < graph->n_nodes; i++) {
             struct ggml_tensor * node = graph->nodes[i];
             struct ggml_tensor * node = graph->nodes[i];
             if (ggml_is_view_op(node->op)) {
             if (ggml_is_view_op(node->op)) {
                 continue;
                 continue;
@@ -1230,15 +1322,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 cur_backend_id = tensor_backend_id;
                 cur_backend_id = tensor_backend_id;
             } else {
             } else {
                 tensor_backend_id(node) = cur_backend_id;
                 tensor_backend_id(node) = cur_backend_id;
-                SET_CAUSE(node, "2.3");
+                SET_CAUSE(node, "2.4");
             }
             }
         }
         }
     }
     }
-
-    // pass 2.4 expand rest down
+        // pass 2.3 expand rest up
     {
     {
         int cur_backend_id = -1;
         int cur_backend_id = -1;
-        for (int i = 0; i < graph->n_nodes; i++) {
+        for (int i = graph->n_nodes - 1; i >= 0; i--) {
             struct ggml_tensor * node = graph->nodes[i];
             struct ggml_tensor * node = graph->nodes[i];
             if (ggml_is_view_op(node->op)) {
             if (ggml_is_view_op(node->op)) {
                 continue;
                 continue;
@@ -1248,12 +1339,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 cur_backend_id = tensor_backend_id;
                 cur_backend_id = tensor_backend_id;
             } else {
             } else {
                 tensor_backend_id(node) = cur_backend_id;
                 tensor_backend_id(node) = cur_backend_id;
-                SET_CAUSE(node, "2.4");
+                SET_CAUSE(node, "2.3");
             }
             }
         }
         }
     }
     }
+
 #ifdef DEBUG_PASS2
 #ifdef DEBUG_PASS2
-    fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+    fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
 #endif
 #endif
 
 
     // pass 3: assign backends to remaining src from dst and view_src
     // pass 3: assign backends to remaining src from dst and view_src
@@ -1283,7 +1375,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         }
         }
     }
     }
 #ifdef DEBUG_PASS3
 #ifdef DEBUG_PASS3
-    fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+    fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
 #endif
 #endif
 
 
     // pass 4: split graph, find tensors that need to be copied
     // pass 4: split graph, find tensors that need to be copied
@@ -1315,7 +1407,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (tensor_backend_id != cur_backend_id) {
             if (tensor_backend_id != cur_backend_id) {
                 sched->splits[cur_split].i_end = i;
                 sched->splits[cur_split].i_end = i;
                 cur_split++;
                 cur_split++;
-                GGML_ASSERT(cur_split < GGML_MAX_SPLITS);
+                GGML_ASSERT(cur_split < GGML_SCHED_MAX_SPLITS);
                 sched->splits[cur_split].backend_id = tensor_backend_id;
                 sched->splits[cur_split].backend_id = tensor_backend_id;
                 sched->splits[cur_split].i_start = i;
                 sched->splits[cur_split].i_start = i;
                 sched->splits[cur_split].n_inputs = 0;
                 sched->splits[cur_split].n_inputs = 0;
@@ -1328,25 +1420,57 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 if (src == NULL) {
                 if (src == NULL) {
                     continue;
                     continue;
                 }
                 }
+
                 int src_backend_id = tensor_backend_id(src);
                 int src_backend_id = tensor_backend_id(src);
                 assert(src_backend_id != -1); // all inputs should be assigned by now
                 assert(src_backend_id != -1); // all inputs should be assigned by now
+
+                if (src->flags & GGML_TENSOR_FLAG_INPUT)  {
+                    size_t id = hash_id(src);
+                    if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
+                        ggml_backend_t backend = sched->backends[src_backend_id];
+                        for (int c = 0; c < sched->n_copies; c++) {
+                            struct ggml_tensor * tensor_copy;
+                            if (c == sched->cur_copy) {
+                                tensor_copy = src; // use the original tensor as the current copy
+                            } else {
+                                tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
+                                ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
+                            }
+                            if (sched->n_copies > 1) {
+                                ggml_set_input(tensor_copy);
+                                ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
+                            }
+                            sched->tensor_copies[id][src_backend_id][c] = tensor_copy;
+                            tensor_backend_id(tensor_copy) = src_backend_id;
+                            SET_CAUSE(tensor_copy, "4.cpy");
+                        }
+                        int n_graph_inputs = sched->n_graph_inputs++;
+                        GGML_ASSERT(n_graph_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
+                        sched->graph_inputs[n_graph_inputs] = src;
+                    }
+                }
+
                 if (src_backend_id != tensor_backend_id) {
                 if (src_backend_id != tensor_backend_id) {
                     // create a copy of the input in the split's backend
                     // create a copy of the input in the split's backend
                     size_t id = hash_id(src);
                     size_t id = hash_id(src);
-                    if (sched->tensor_copies[id][cur_backend_id] == NULL) {
+                    if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
                         ggml_backend_t backend = sched->backends[cur_backend_id];
                         ggml_backend_t backend = sched->backends[cur_backend_id];
-                        struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
-                        ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
-
-                        sched->tensor_copies[id][cur_backend_id] = tensor_copy;
-                        tensor_backend_id(tensor_copy) = cur_backend_id;
-                        SET_CAUSE(tensor_copy, "4.cpy");
-
+                        for (int c = 0; c < sched->n_copies; c++) {
+                            struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
+                            ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
+                            if (sched->n_copies > 1) {
+                                ggml_set_input(tensor_copy);
+                                ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
+                            }
+                            sched->tensor_copies[id][cur_backend_id][c] = tensor_copy;
+                            tensor_backend_id(tensor_copy) = cur_backend_id;
+                            SET_CAUSE(tensor_copy, "4.cpy");
+                        }
                         int n_inputs = sched->splits[cur_split].n_inputs++;
                         int n_inputs = sched->splits[cur_split].n_inputs++;
-                        GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
+                        GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
                         sched->splits[cur_split].inputs[n_inputs] = src;
                         sched->splits[cur_split].inputs[n_inputs] = src;
                     }
                     }
-                    node->src[j] = sched->tensor_copies[id][cur_backend_id];
+                    node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy];
                 }
                 }
             }
             }
         }
         }
@@ -1354,37 +1478,39 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         sched->n_splits = cur_split + 1;
         sched->n_splits = cur_split + 1;
     }
     }
 #ifdef DEBUG_PASS4
 #ifdef DEBUG_PASS4
-    fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+    fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
 #endif
 #endif
 
 
 #ifndef NDEBUG
 #ifndef NDEBUG
     // sanity check: all sources should have the same backend as the node
     // sanity check: all sources should have the same backend as the node
     for (int i = 0; i < graph->n_nodes; i++) {
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         struct ggml_tensor * node = graph->nodes[i];
-        ggml_backend_t tensor_backend = tensor_backend(node);
+        ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
         if (tensor_backend == NULL) {
         if (tensor_backend == NULL) {
             fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
             fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
         }
         }
-        if (node->view_src != NULL && tensor_backend != tensor_backend(node->view_src)) {
+        if (node->view_src != NULL && tensor_backend != ggml_backend_sched_get_tensor_backend(sched, node->view_src)) {
             fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
             fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
                 node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
                 node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
-                node->view_src->name, tensor_backend(node->view_src) ? ggml_backend_name(tensor_backend(node->view_src)) : "NULL");
+                node->view_src->name, ggml_backend_sched_get_tensor_backend(sched, node->view_src) ?
+                    ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, node->view_src)) : "NULL");
         }
         }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
             if (src == NULL) {
                 continue;
                 continue;
             }
             }
-            ggml_backend_t src_backend = tensor_backend(src);
+            ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
             if (src_backend != tensor_backend /* && src_backend != NULL */) {
             if (src_backend != tensor_backend /* && src_backend != NULL */) {
                 fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
                 fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
                     node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
                     node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
                     j, src->name, src_backend ? ggml_backend_name(src_backend) : "NULL");
                     j, src->name, src_backend ? ggml_backend_name(src_backend) : "NULL");
             }
             }
-            if (src->view_src != NULL && src_backend != tensor_backend(src->view_src)) {
+            if (src->view_src != NULL && src_backend != ggml_backend_sched_get_tensor_backend(sched, src->view_src)) {
                 fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
                 fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
                     src->name, src_backend ? ggml_backend_name(src_backend) : "NULL",
                     src->name, src_backend ? ggml_backend_name(src_backend) : "NULL",
-                    src->view_src->name, tensor_backend(src->view_src) ? ggml_backend_name(tensor_backend(src->view_src)) : "NULL");
+                    src->view_src->name, ggml_backend_sched_get_tensor_backend(sched, src->view_src) ?
+                        ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, src->view_src)) : "NULL");
             }
             }
         }
         }
     }
     }
@@ -1392,18 +1518,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
 #endif
 #endif
 
 
     // create copies of the graph for each split
     // create copies of the graph for each split
-    // FIXME: avoid this copy, pass split inputs to ggml_gallocr_alloc_graph_n in some other way
-    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_MAX_SPLIT_INPUTS, false);
+    // TODO: avoid this copy
+    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS, false);
     for (int i = 0; i < sched->n_splits; i++) {
     for (int i = 0; i < sched->n_splits; i++) {
         struct ggml_backend_sched_split * split = &sched->splits[i];
         struct ggml_backend_sched_split * split = &sched->splits[i];
         split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
         split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
 
 
+        // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
         for (int j = 0; j < split->n_inputs; j++) {
         for (int j = 0; j < split->n_inputs; j++) {
             struct ggml_tensor * input = split->inputs[j];
             struct ggml_tensor * input = split->inputs[j];
-            struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id];
+            struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id][sched->cur_copy];
 
 
             // add a dependency to the input source so that it is not freed before the copy is done
             // add a dependency to the input source so that it is not freed before the copy is done
             struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
             struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
+            input_dep->src[0] = input;
             sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(input);
             sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(input);
             graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
             graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
 
 
@@ -1417,18 +1545,56 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
             graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
         }
         }
     }
     }
+
+    if (sched->n_copies > 1) {
+        // add input copies as leafs so that they are allocated first
+        for (int i = 0; i < sched->n_graph_inputs; i++) {
+            struct ggml_tensor * input = sched->graph_inputs[i];
+            size_t id = hash_id(input);
+            int backend_id = tensor_backend_id(input);
+            for (int c = 0; c < sched->n_copies; c++) {
+                struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c];
+                sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
+                graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
+            }
+        }
+
+        for (int i = 0; i < sched->n_splits; i++) {
+            struct ggml_backend_sched_split * split = &sched->splits[i];
+            int backend_id = split->backend_id;
+            for (int j = 0; j < split->n_inputs; j++) {
+                struct ggml_tensor * input = split->inputs[j];
+                size_t id = hash_id(input);
+                for (int c = 0; c < sched->n_copies; c++) {
+                    struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c];
+                    sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
+                    graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
+                }
+            }
+        }
+    }
+
+    // add leafs from the original graph
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
+        graph_copy->leafs[graph_copy->n_leafs++] = leaf;
+    }
+
     sched->graph = graph_copy;
     sched->graph = graph_copy;
 }
 }
 
 
 static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
 static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
-    // ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids);
+    // allocate graph
     if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
     if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
+        // the re-allocation may cause the split inputs to be moved to a different address
+        ggml_backend_sched_synchronize(sched);
 #ifndef NDEBUG
 #ifndef NDEBUG
-        fprintf(stderr, "ggml_backend_sched: failed to allocate graph, reserving\n");
+        fprintf(stderr, "%s: failed to allocate graph, reserving\n", __func__);
 #endif
 #endif
-        ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids);
+        ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
         if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
         if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
-            fprintf(stderr, "ggml_backend_sched: failed to allocate graph\n");
+            fprintf(stderr, "%s: failed to allocate graph\n", __func__);
             return false;
             return false;
         }
         }
     }
     }
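
Note that the failure path above has to synchronize all backends first, because re-reserving may move the split inputs while copies are still in flight. Callers that know a worst-case graph up front can avoid this path entirely; a sketch using the public scheduler API (the worst-case graph is assumed to be built by the caller):

    // reserve once with the largest graph that will ever be evaluated, so later
    // allocations fit without triggering the re-reserve path above
    static bool prepare_sched(ggml_backend_sched_t sched, struct ggml_cgraph * worst_case_graph) {
        return ggml_backend_sched_reserve(sched, worst_case_graph);
    }
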
@@ -1437,9 +1603,6 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
 }
 }
 
 
 static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
 static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
-    uint64_t copy_us[GGML_MAX_BACKENDS] = {0};
-    uint64_t compute_us[GGML_MAX_BACKENDS] = {0};
-
     struct ggml_backend_sched_split * splits = sched->splits;
     struct ggml_backend_sched_split * splits = sched->splits;
 
 
     for (int i = 0; i < sched->n_splits; i++) {
     for (int i = 0; i < sched->n_splits; i++) {
@@ -1448,34 +1611,36 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
         ggml_backend_t split_backend = sched->backends[split_backend_id];
         ggml_backend_t split_backend = sched->backends[split_backend_id];
 
 
         // copy the input tensors to the split backend
         // copy the input tensors to the split backend
-        uint64_t copy_start_us = ggml_time_us();
         for (int j = 0; j < split->n_inputs; j++) {
         for (int j = 0; j < split->n_inputs; j++) {
+            ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
             struct ggml_tensor * input = split->inputs[j];
             struct ggml_tensor * input = split->inputs[j];
-            struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id];
+            struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id][sched->cur_copy];
 
 
-            GGML_ASSERT(input->buffer != NULL);
-            GGML_ASSERT(input_cpy->buffer != NULL);
+            if (input->flags & GGML_TENSOR_FLAG_INPUT) {
+                // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
+                if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+                    ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
+                } else {
+                    ggml_backend_synchronize(split_backend);
+                }
+                ggml_backend_tensor_copy(input, input_cpy);
+            } else {
+                if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+                    ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
+                } else {
+                    ggml_backend_synchronize(split_backend);
+                    ggml_backend_synchronize(input_backend);
+                }
 
 
-            ggml_backend_tensor_copy_async(split_backend, input, input_cpy);
+                ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
+            }
         }
         }
-        //ggml_backend_synchronize(split_backend); // necessary to measure copy time
-        int64_t copy_end_us = ggml_time_us();
-        copy_us[split_backend_id] += copy_end_us - copy_start_us;
 
 
-#if 0
-        char split_filename[GGML_MAX_NAME];
-        snprintf(split_filename, GGML_MAX_NAME, "split_%i_%s.dot", i, ggml_backend_name(split_backend));
-        ggml_graph_dump_dot(split->graph, NULL, split_filename);
-#endif
-
-
-        uint64_t compute_start_us = ggml_time_us();
         if (!sched->callback_eval) {
         if (!sched->callback_eval) {
-            enum ggml_status ec = ggml_backend_graph_compute(split_backend, &split->graph);
+            enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
             if (ec != GGML_STATUS_SUCCESS) {
             if (ec != GGML_STATUS_SUCCESS) {
                 return ec;
                 return ec;
             }
             }
-            //ggml_backend_synchronize(split_backend); // necessary to measure compute time
         } else {
         } else {
             // similar to ggml_backend_compare_graph_backend
             // similar to ggml_backend_compare_graph_backend
             for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
             for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
@@ -1494,11 +1659,14 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
 
 
                 struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
                 struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
 
 
-                enum ggml_status ec = ggml_backend_graph_compute(split_backend, &gv);
+                enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv);
                 if (ec != GGML_STATUS_SUCCESS) {
                 if (ec != GGML_STATUS_SUCCESS) {
                     return ec;
                     return ec;
                 }
                 }
 
 
+                // TODO: pass backend to the callback, then the user can decide if they want to synchronize
+                ggml_backend_synchronize(split_backend);
+
                 if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
                 if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
                     break;
                     break;
                 }
                 }
@@ -1506,39 +1674,54 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                 j0 = j1;
                 j0 = j1;
             }
             }
         }
         }
-        uint64_t compute_end_us = ggml_time_us();
-        compute_us[split_backend_id] += compute_end_us - compute_start_us;
-    }
 
 
-#if 0
-    // per-backend timings
-    fprintf(stderr, "sched_compute_splits times (%d splits):\n", sched->n_splits);
-    for (int i = 0; i < sched->n_backends; i++) {
-        if (copy_us[i] > 0 || compute_us[i] > 0) {
-            fprintf(stderr, "\t%5.5s: %lu us copy, %lu us compute\n", ggml_backend_name(sched->backends[i]), copy_us[i], compute_us[i]);
+        // record the event of this copy
+        if (split->n_inputs > 0) {
+            if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+                ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy]);
+            }
         }
         }
     }
     }
-#endif
+
+    sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
 
 
     return GGML_STATUS_SUCCESS;
     return GGML_STATUS_SUCCESS;
 }
 }
 
 
-ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) {
+ggml_backend_sched_t ggml_backend_sched_new(
+        ggml_backend_t * backends,
+        ggml_backend_buffer_type_t * bufts,
+        int n_backends,
+        size_t graph_size,
+        bool parallel) {
     GGML_ASSERT(n_backends > 0);
     GGML_ASSERT(n_backends > 0);
-    GGML_ASSERT(n_backends <= GGML_MAX_BACKENDS);
+    GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
+    GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
 
 
     struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
     struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
 
 
     // initialize hash table
     // initialize hash table
-    sched->hash_set          = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
+    sched->hash_set          = ggml_hash_set_new(graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS);
     sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
     sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
     sched->tensor_copies     = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
     sched->tensor_copies     = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
     sched->node_backend_ids  = calloc(sizeof(sched->node_backend_ids[0]), graph_size);
     sched->node_backend_ids  = calloc(sizeof(sched->node_backend_ids[0]), graph_size);
+    sched->leaf_backend_ids  = calloc(sizeof(sched->leaf_backend_ids[0]), graph_size);
 
 
     sched->n_backends = n_backends;
     sched->n_backends = n_backends;
-    for (int i = 0; i < n_backends; i++) {
-        sched->backends[i] = backends[i];
-        sched->bufts[i] = bufts ? bufts[i] : ggml_backend_get_default_buffer_type(backends[i]);
+
+    sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
+
+    GGML_ASSERT(sched->n_copies <= GGML_SCHED_MAX_COPIES);
+
+    for (int b = 0; b < n_backends; b++) {
+        sched->backends[b] = backends[b];
+        sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
+        GGML_ASSERT(ggml_backend_buft_supports_backend(sched->bufts[b], backends[b]));
+        if (sched->n_copies > 1) {
+            for (int c = 0; c < sched->n_copies; c++) {
+                sched->events[b][c] = ggml_backend_event_new(backends[b]);
+            }
+        }
     }
     }
 
 
     sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
     sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
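
ggml_backend_sched_new() now takes a `parallel` flag that enables the extra input copies and the per-backend, per-copy events, and it requires the CPU backend to be the last entry. A hedged construction sketch (the backend handles are assumed to have been created elsewhere):

    #include "ggml-backend.h"

    // two GPU backends plus the mandatory CPU backend as the last entry
    static ggml_backend_sched_t make_sched(ggml_backend_t gpu0, ggml_backend_t gpu1, ggml_backend_t cpu) {
        ggml_backend_t backends[] = { gpu0, gpu1, cpu };
        // NULL buffer types -> each backend's default buffer type;
        // `true` enables pipeline parallelism (up to GGML_SCHED_MAX_COPIES input copies)
        return ggml_backend_sched_new(backends, NULL, 3, 2048, true);
    }
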
@@ -1552,12 +1735,18 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
     if (sched == NULL) {
     if (sched == NULL) {
         return;
         return;
     }
     }
+    for (int b = 0; b < sched->n_backends; b++) {
+        for (int c = 0; c < sched->n_copies; c++) {
+            ggml_backend_event_free(sched->events[b][c]);
+        }
+    }
     ggml_gallocr_free(sched->galloc);
     ggml_gallocr_free(sched->galloc);
     ggml_free(sched->ctx);
     ggml_free(sched->ctx);
     free(sched->hash_set.keys);
     free(sched->hash_set.keys);
     free(sched->tensor_backend_id);
     free(sched->tensor_backend_id);
     free(sched->tensor_copies);
     free(sched->tensor_copies);
     free(sched->node_backend_ids);
     free(sched->node_backend_ids);
+    free(sched->leaf_backend_ids);
     free(sched);
     free(sched);
 }
 }
 
 
@@ -1569,34 +1758,63 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
     memset(sched->tensor_copies,      0, sizeof(sched->tensor_copies[0])     * hash_size);
     memset(sched->tensor_copies,      0, sizeof(sched->tensor_copies[0])     * hash_size);
 
 
     sched->is_reset = true;
     sched->is_reset = true;
+    sched->is_alloc = false;
 }
 }
 
 
 bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
 bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
     ggml_backend_sched_split_graph(sched, measure_graph);
     ggml_backend_sched_split_graph(sched, measure_graph);
 
 
-    if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids)) {
+    // TODO: extract this to a separate function
+    if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
         return false;
         return false;
     }
     }
 
 
     ggml_backend_sched_reset(sched);
     ggml_backend_sched_reset(sched);
+    ggml_backend_sched_synchronize(sched);
+
+    return true;
+}
+
+bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS);
+
+    ggml_backend_sched_split_graph(sched, graph);
+
+    if (!ggml_backend_sched_alloc_splits(sched)) {
+        return false;
+    }
+
+    sched->is_alloc = true;
+
     return true;
     return true;
 }
 }
 
 
 enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
 enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
-    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
+    enum ggml_status err = ggml_backend_sched_graph_compute_async(sched, graph);
+    ggml_backend_sched_synchronize(sched);
+    return err;
+}
 
 
-    if (!sched->is_reset) {
+enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+    if (!sched->is_reset && !sched->is_alloc) {
         ggml_backend_sched_reset(sched);
         ggml_backend_sched_reset(sched);
     }
     }
 
 
-    ggml_backend_sched_split_graph(sched, graph);
-    if (!ggml_backend_sched_alloc_splits(sched)) {
-        return GGML_STATUS_ALLOC_FAILED;
+    if (!sched->is_alloc) {
+        if (!ggml_backend_sched_alloc_graph(sched, graph)) {
+            return GGML_STATUS_ALLOC_FAILED;
+        }
     }
     }
 
 
     return ggml_backend_sched_compute_splits(sched);
     return ggml_backend_sched_compute_splits(sched);
 }
 }
 
 
+void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
+    for (int i = 0; i < sched->n_backends; i++) {
+        ggml_backend_synchronize(sched->backends[i]);
+    }
+}
+
 void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
 void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
     sched->callback_eval = callback;
     sched->callback_eval = callback;
     sched->callback_eval_user_data = user_data;
     sched->callback_eval_user_data = user_data;
@@ -1606,19 +1824,24 @@ int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
     return sched->n_splits;
     return sched->n_splits;
 }
 }
 
 
+int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) {
+    return sched->n_copies;
+}
+
 size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
 size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
     int backend_index = ggml_backend_sched_backend_id(sched, backend);
     int backend_index = ggml_backend_sched_backend_id(sched, backend);
     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
+
     return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
     return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
 }
 }
 
 
-void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
+void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
     int backend_index = ggml_backend_sched_backend_id(sched, backend);
     int backend_index = ggml_backend_sched_backend_id(sched, backend);
     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
     tensor_backend_id(node) = backend_index;
     tensor_backend_id(node) = backend_index;
 }
 }
 
 
-ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
+ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
     int backend_index = tensor_backend_id(node);
     int backend_index = tensor_backend_id(node);
     if (backend_index == -1) {
     if (backend_index == -1) {
         return NULL;
         return NULL;
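
The scheduler now creates one event per backend and per input copy, and graph computation is split into an asynchronous submit (ggml_backend_sched_graph_compute_async) plus an explicit ggml_backend_sched_synchronize. The sketch below illustrates the underlying record/wait pattern using the ggml_backend_event_* API added in this commit; it is only an illustration of the pattern, not the scheduler internals, and handoff_and_compute with its tensor/graph parameters is a made-up helper.

    #include "ggml.h"
    #include "ggml-backend.h"

    // sketch: hand a tensor produced on backend_src to backend_dst without blocking the host
    static void handoff_and_compute(
            ggml_backend_t backend_src, ggml_backend_t backend_dst,
            struct ggml_tensor * src, struct ggml_tensor * dst,
            struct ggml_cgraph * graph_dst) {
        // in the scheduler the events are created once, at initialization
        ggml_backend_event_t event = ggml_backend_event_new(backend_src);

        // queue the copy behind the work already queued on backend_src
        ggml_backend_tensor_copy_async(backend_src, backend_dst, src, dst);

        // mark the point in backend_src's queue that backend_dst has to wait for
        ggml_backend_event_record(event);

        // backend_dst waits asynchronously, then starts its part of the graph
        ggml_backend_event_wait(backend_dst, event);
        ggml_backend_graph_compute_async(backend_dst, graph_dst);

        // block the host only when the result is actually needed
        ggml_backend_synchronize(backend_dst);

        ggml_backend_event_free(event);
    }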

+ 41 - 17
ggml-backend.h

@@ -9,6 +9,7 @@ extern "C" {
 
 
     typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
     typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
     typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
     typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
+    typedef struct ggml_backend_event * ggml_backend_event_t;
     typedef struct ggml_backend * ggml_backend_t;
     typedef struct ggml_backend * ggml_backend_t;
     typedef void * ggml_backend_graph_plan_t;
     typedef void * ggml_backend_graph_plan_t;
 
 
@@ -72,11 +73,24 @@ extern "C" {
     GGML_API enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
     GGML_API enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
     GGML_API enum ggml_status ggml_backend_graph_compute     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
     GGML_API enum ggml_status ggml_backend_graph_compute     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
 
 
+    GGML_API bool ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
     GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
     GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
 
 
     // tensor copy between different backends
     // tensor copy between different backends
     GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
     GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
-    GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); // automatic fallback to sync copy
+
+    // asynchronous copy
+    // the copy is performed after all the currently queued operations in backend_src
+    // backend_dst will wait for the copy to complete before performing other operations
+    // automatic fallback to sync copy if async is not supported
+    GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);
+
+    // events
+    GGML_API ggml_backend_event_t   ggml_backend_event_new        (ggml_backend_t backend);
+    GGML_API void                   ggml_backend_event_free       (ggml_backend_event_t event);
+    GGML_API void                   ggml_backend_event_record     (ggml_backend_event_t event);
+    GGML_API void                   ggml_backend_event_synchronize(ggml_backend_event_t event);
+    GGML_API void                   ggml_backend_event_wait       (ggml_backend_t backend, ggml_backend_event_t event); // wait async on event
 
 
     //
     //
     // CPU backend
     // CPU backend
@@ -123,27 +137,31 @@ extern "C" {
     /*
     /*
       Example usage:
       Example usage:
 
 
-        sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, num_backends);
-        // sched is initialized with measure allocators and cannot be used until allocated with a measure graph
+        // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
+        // preferably to run on the same backend as the buffer
+        ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
 
 
-        // initialize buffers from a measure graph
-        measure_graph = build_graph(sched); // use the allocr to allocate inputs as needed
+        sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false);
 
 
-        // in build_graph:
-        build_graph(...) {
-            // manually assign nodes to a backend (optional, should not be needed in most cases)
-            struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
-            ggml_backend_sched_set_node_backend(sched, node, backend_gpu);
-        }
+        // initialize buffers from a max size graph (optional)
+        reserve_graph = build_graph(sched, max_batch_size);
 
 
-        // allocate backend buffers from measure graph
-        ggml_backend_sched_init_measure(sched, measure_graph);
+        // manually assign nodes to a backend (optional, should not be needed in most cases)
+        struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
+        ggml_backend_sched_set_tensor_backend(sched, node, backend_gpu);
 
 
-        // the scheduler is now ready to compute graphs
+        ggml_backend_sched_reserve(sched, reserve_graph);
 
 
         // compute
         // compute
         graph = build_graph(sched);
         graph = build_graph(sched);
         ggml_backend_sched_graph_compute(sched, graph);
         ggml_backend_sched_graph_compute(sched, graph);
+
+        // if there are graph inputs:
+        ggml_backend_sched_reset(sched);
+        ggml_backend_sched_alloc_graph(sched, graph);
+        ggml_backend_tensor_set(input_tensor, ...);
+        ggml_backend_sched_graph_compute(sched, graph);
+    }
     */
     */
 
 
     struct ggml_backend_sched;
     struct ggml_backend_sched;
@@ -158,20 +176,26 @@ extern "C" {
     typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
     typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
 
 
     // Initialize a backend scheduler
     // Initialize a backend scheduler
-    GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
+    GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
     GGML_API void                 ggml_backend_sched_free(ggml_backend_sched_t sched);
     GGML_API void                 ggml_backend_sched_free(ggml_backend_sched_t sched);
+
     // Initialize backend buffers from a measure graph
     // Initialize backend buffers from a measure graph
     GGML_API bool                 ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
     GGML_API bool                 ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
+
     // Get the number of splits of the last graph
     // Get the number of splits of the last graph
     GGML_API int                  ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
     GGML_API int                  ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
+    GGML_API int                  ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
 
 
     GGML_API size_t               ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
     GGML_API size_t               ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
 
 
-    GGML_API void                 ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
-    GGML_API ggml_backend_t       ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
+    GGML_API void                 ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
+    GGML_API ggml_backend_t       ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
 
 
     // Allocate and compute graph on the backend scheduler
     // Allocate and compute graph on the backend scheduler
+    GGML_API bool                 ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
     GGML_API enum ggml_status     ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
     GGML_API enum ggml_status     ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
+    GGML_API enum ggml_status     ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
+    GGML_API void                 ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
 
 
     // Reset all assignments and allocators - must be called before changing the node backends
     // Reset all assignments and allocators - must be called before changing the node backends
     GGML_API void                 ggml_backend_sched_reset(ggml_backend_sched_t sched);
     GGML_API void                 ggml_backend_sched_reset(ggml_backend_sched_t sched);
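
Putting the updated scheduler API together, a caller-side sketch might look like the following. run_pipeline is a made-up wrapper, and the backends array and build_graph callback are placeholders for application code; parallel=true is what enables the GGML_SCHED_MAX_COPIES input copies used for pipeline parallelism.

    #include "ggml.h"
    #include "ggml-backend.h"

    static void run_pipeline(ggml_backend_t * backends, int n_backends,
                             struct ggml_cgraph * (*build_graph)(void * user), void * user) {
        // parallel=true enables multiple input copies (pipeline parallelism)
        ggml_backend_sched_t sched = ggml_backend_sched_new(
                backends, NULL, n_backends, GGML_DEFAULT_GRAPH_SIZE, true);

        // reserve backend buffers once, using a worst-case graph
        ggml_backend_sched_reserve(sched, build_graph(user));

        for (int i = 0; i < 4; ++i) {
            struct ggml_cgraph * graph = build_graph(user);

            // allocate the graph, set its inputs, then submit without blocking the host
            ggml_backend_sched_reset(sched);
            ggml_backend_sched_alloc_graph(sched, graph);
            // ... ggml_backend_tensor_set(input_tensor, ...) for each graph input ...
            ggml_backend_sched_graph_compute_async(sched, graph);
        }

        // wait for all queued graphs before reading results or freeing the scheduler
        ggml_backend_sched_synchronize(sched);
        ggml_backend_sched_free(sched);
    }

This mirrors the usage in the llama.cpp decode path, where synchronization is deferred until the computation results are actually read.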

+ 146 - 29
ggml-cuda.cu

@@ -72,6 +72,7 @@
 #define cudaEventCreateWithFlags hipEventCreateWithFlags
 #define cudaEventCreateWithFlags hipEventCreateWithFlags
 #define cudaEventDisableTiming hipEventDisableTiming
 #define cudaEventDisableTiming hipEventDisableTiming
 #define cudaEventRecord hipEventRecord
 #define cudaEventRecord hipEventRecord
+#define cudaEventSynchronize hipEventSynchronize
 #define cudaEvent_t hipEvent_t
 #define cudaEvent_t hipEvent_t
 #define cudaEventDestroy hipEventDestroy
 #define cudaEventDestroy hipEventDestroy
 #define cudaFree hipFree
 #define cudaFree hipFree
@@ -81,6 +82,7 @@
 #define cudaGetDeviceProperties hipGetDeviceProperties
 #define cudaGetDeviceProperties hipGetDeviceProperties
 #define cudaGetErrorString hipGetErrorString
 #define cudaGetErrorString hipGetErrorString
 #define cudaGetLastError hipGetLastError
 #define cudaGetLastError hipGetLastError
+#define cudaLaunchHostFunc hipLaunchHostFunc
 #ifdef GGML_HIP_UMA
 #ifdef GGML_HIP_UMA
 #define cudaMalloc hipMallocManaged
 #define cudaMalloc hipMallocManaged
 #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
 #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
@@ -104,6 +106,7 @@
 #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
 #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
 #define cudaStreamFireAndForget hipStreamFireAndForget
 #define cudaStreamFireAndForget hipStreamFireAndForget
 #define cudaStreamNonBlocking hipStreamNonBlocking
 #define cudaStreamNonBlocking hipStreamNonBlocking
+#define cudaStreamPerThread hipStreamPerThread
 #define cudaStreamSynchronize hipStreamSynchronize
 #define cudaStreamSynchronize hipStreamSynchronize
 #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
 #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
 #define cudaStream_t hipStream_t
 #define cudaStream_t hipStream_t
@@ -10641,8 +10644,20 @@ GGML_CALL void ggml_cuda_get_device_description(int device, char * description,
 #define UNUSED GGML_UNUSED
 #define UNUSED GGML_UNUSED
 
 
 struct ggml_backend_cuda_context {
 struct ggml_backend_cuda_context {
+    explicit ggml_backend_cuda_context(int device) :
+        device(device),
+        name(GGML_CUDA_NAME + std::to_string(device)) {
+    }
+
+    ~ggml_backend_cuda_context() {
+        if (copy_event != nullptr) {
+            CUDA_CHECK(cudaEventDestroy(copy_event));
+        }
+    }
+
     int device;
     int device;
     std::string name;
     std::string name;
+    cudaEvent_t copy_event = nullptr;
 };
 };
 
 
 // cuda buffer
 // cuda buffer
@@ -10732,9 +10747,8 @@ GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
 
 
     ggml_cuda_set_device(ctx->device);
     ggml_cuda_set_device(ctx->device);
-    CUDA_CHECK(cudaDeviceSynchronize());
-    CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaDeviceSynchronize());
+    CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread));
+    CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
 }
 }
 
 
 GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
 GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
@@ -10743,26 +10757,25 @@ GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
 
 
     ggml_cuda_set_device(ctx->device);
     ggml_cuda_set_device(ctx->device);
-    CUDA_CHECK(cudaDeviceSynchronize());
-    CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaDeviceSynchronize());
+    CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
+    CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
 }
 }
 
 
 GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
 GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
     if (ggml_backend_buffer_is_cuda(src->buffer)) {
     if (ggml_backend_buffer_is_cuda(src->buffer)) {
         ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
         ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
-        ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
-
-        ggml_cuda_set_device(src_ctx->device);
-        CUDA_CHECK(cudaDeviceSynchronize());
-        ggml_cuda_set_device(dst_ctx->device);
-        CUDA_CHECK(cudaDeviceSynchronize());
-        CUDA_CHECK(cudaMemcpy((char *)dst->data, (const char *)src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice));
-        CUDA_CHECK(cudaDeviceSynchronize());
-
+        ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)dst->buffer->context;
+        if (src_ctx->device == dst_ctx->device) {
+            CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice, cudaStreamPerThread));
+        } else {
+            CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, dst_ctx->device, src->data, src_ctx->device, ggml_nbytes(src), cudaStreamPerThread));
+        }
+        CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
         return true;
         return true;
     }
     }
     return false;
     return false;
+
+    UNUSED(buffer);
 }
 }
 
 
 GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
 GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
@@ -11007,7 +11020,11 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buf
         }
         }
 
 
         const char * buf_host = (const char *)data + offset_split;
         const char * buf_host = (const char *)data + offset_split;
-        CUDA_CHECK(cudaMemcpy(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice));
+        CUDA_CHECK(cudaMemcpyAsync(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice, cudaStreamPerThread));
+    }
+
+    for (int id = 0; id < g_device_count; ++id) {
+        CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
     }
     }
 }
 }
 
 
@@ -11041,7 +11058,11 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buf
         }
         }
 
 
         char * buf_host = (char *)data + offset_split;
         char * buf_host = (char *)data + offset_split;
-        CUDA_CHECK(cudaMemcpy(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost));
+        CUDA_CHECK(cudaMemcpyAsync(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
+    }
+
+    for (int id = 0; id < g_device_count; ++id) {
+        CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
     }
     }
 }
 }
 
 
@@ -11220,6 +11241,10 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
     return &ggml_backend_cuda_buffer_type_host;
     return &ggml_backend_cuda_buffer_type_host;
 }
 }
 
 
+//static bool ggml_backend_buffer_is_cuda_host(ggml_backend_buffer_t buffer) {
+//    return buffer->buft->iface.get_name == ggml_backend_cuda_host_buffer_type_name;
+//}
+
 // backend
 // backend
 
 
 GGML_CALL static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
 GGML_CALL static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
@@ -11243,8 +11268,9 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer
 
 
 GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
 GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
 
-    GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
+    GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
     GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
     GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
 
 
     CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
     CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
@@ -11252,22 +11278,61 @@ GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend,
 
 
 GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
 GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
 
-    GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
+    GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
     GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
     GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
 
 
     CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
     CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
 }
 }
 
 
-GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
-    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
+    GGML_ASSERT(ggml_backend_is_cuda(backend_src) || ggml_backend_is_cuda(backend_dst));
 
 
-    if (dst->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && ggml_backend_buffer_is_cuda(src->buffer)) {
-        CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx->device][0]));
-        return true;
+    ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
+    ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
+
+    if (!ggml_backend_buffer_is_cuda(src->buffer)) {
+        return false;
     }
     }
 
 
-    return false;
+    if (!ggml_backend_buffer_is_cuda(dst->buffer)) {
+        return false;
+    }
+
+    // device -> device
+    ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context;
+    ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;
+
+    if (backend_src != backend_dst) {
+        ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
+        ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
+
+        GGML_ASSERT(cuda_ctx_src->device == buf_ctx_src->device);
+        GGML_ASSERT(cuda_ctx_dst->device == buf_ctx_dst->device);
+
+        if (!cuda_ctx_src->copy_event) {
+            ggml_cuda_set_device(cuda_ctx_src->device);
+            CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming));
+        }
+
+        // copy on src stream
+        if (cuda_ctx_src->device == cuda_ctx_dst->device) {
+            CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx_dst->device][0]));
+        } else {
+            CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), g_cudaStreams[cuda_ctx_src->device][0]));
+        }
+
+        // record event on src stream
+        CUDA_CHECK(cudaEventRecord(cuda_ctx_src->copy_event, g_cudaStreams[cuda_ctx_src->device][0]));
+
+        // wait on dst stream for the copy to complete
+        CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[cuda_ctx_dst->device][0], cuda_ctx_src->copy_event, 0));
+    } else {
+        // src and dst are on the same backend
+        CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx_dst->device][0]));
+    }
+    return true;
 }
 }
 
 
 GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
 GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
@@ -11444,6 +11509,52 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
     UNUSED(backend);
     UNUSED(backend);
 }
 }
 
 
+static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+
+    ggml_cuda_set_device(cuda_ctx->device);
+
+    cudaEvent_t event;
+    CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
+
+    return new ggml_backend_event {
+        /* .backend = */ backend,
+        /* .context = */ event,
+    };
+}
+
+static void ggml_backend_cuda_event_free(ggml_backend_event_t event) {
+    CUDA_CHECK(cudaEventDestroy((cudaEvent_t)event->context));
+
+    delete event;
+}
+
+static void ggml_backend_cuda_event_record(ggml_backend_event_t event) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)event->backend->context;
+
+    CUDA_CHECK(cudaEventRecord((cudaEvent_t)event->context, g_cudaStreams[cuda_ctx->device][0]));
+}
+
+static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+
+    if (ggml_backend_is_cuda(event->backend)) {
+        CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[cuda_ctx->device][0], (cudaEvent_t)event->context, 0));
+    } else {
+        // untested
+        auto wait_fn = [](void * user_data) {
+            ggml_backend_event_t event = (ggml_backend_event_t)user_data;
+            ggml_backend_event_synchronize(event);
+        };
+
+        CUDA_CHECK(cudaLaunchHostFunc(g_cudaStreams[cuda_ctx->device][0], wait_fn, event));
+    }
+}
+
+static void ggml_backend_cuda_event_synchronize(ggml_backend_event_t event) {
+    CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
+}
+
 static ggml_backend_i ggml_backend_cuda_interface = {
 static ggml_backend_i ggml_backend_cuda_interface = {
     /* .get_name                = */ ggml_backend_cuda_name,
     /* .get_name                = */ ggml_backend_cuda_name,
     /* .free                    = */ ggml_backend_cuda_free,
     /* .free                    = */ ggml_backend_cuda_free,
@@ -11457,6 +11568,11 @@ static ggml_backend_i ggml_backend_cuda_interface = {
     /* .graph_plan_compute      = */ NULL,
     /* .graph_plan_compute      = */ NULL,
     /* .graph_compute           = */ ggml_backend_cuda_graph_compute,
     /* .graph_compute           = */ ggml_backend_cuda_graph_compute,
     /* .supports_op             = */ ggml_backend_cuda_supports_op,
     /* .supports_op             = */ ggml_backend_cuda_supports_op,
+    /* .event_new               = */ ggml_backend_cuda_event_new,
+    /* .event_free              = */ ggml_backend_cuda_event_free,
+    /* .event_record            = */ ggml_backend_cuda_event_record,
+    /* .event_wait              = */ ggml_backend_cuda_event_wait,
+    /* .event_synchronize       = */ ggml_backend_cuda_event_synchronize,
 };
 };
 
 
 static ggml_guid_t ggml_backend_cuda_guid() {
 static ggml_guid_t ggml_backend_cuda_guid() {
@@ -11475,10 +11591,11 @@ GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
     // not strictly necessary, but it may reduce the overhead of the first graph_compute
     // not strictly necessary, but it may reduce the overhead of the first graph_compute
     ggml_cuda_set_main_device(device);
     ggml_cuda_set_main_device(device);
 
 
-    ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context {
-        /* .device = */ device,
-        /* .name   = */ GGML_CUDA_NAME + std::to_string(device),
-    };
+    ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
+    if (ctx == nullptr) {
+        fprintf(stderr, "%s: error: failed to allocate context\n", __func__);
+        return nullptr;
+    }
 
 
     ggml_backend_t cuda_backend = new ggml_backend {
     ggml_backend_t cuda_backend = new ggml_backend {
         /* .guid      = */ ggml_backend_cuda_guid(),
         /* .guid      = */ ggml_backend_cuda_guid(),
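
The cross-device copy path above relies on stream ordering with a CUDA event instead of device-wide synchronization: the copy is queued on the source stream, an event is recorded behind it, and the destination stream waits on that event. A minimal standalone sketch of that ordering follows; peer_copy_async is a made-up helper, and the device ids 0/1, pointers and byte count are placeholders.

    #include <cuda_runtime.h>

    // queue a device 0 -> device 1 copy and make stream_dst wait for it,
    // without blocking the host
    static void peer_copy_async(void * dst, const void * src, size_t nbytes,
                                cudaStream_t stream_src, cudaStream_t stream_dst) {
        cudaEvent_t event;
        cudaSetDevice(0);
        cudaEventCreateWithFlags(&event, cudaEventDisableTiming);

        // copy on the source device's stream
        cudaMemcpyPeerAsync(dst, /*dstDevice=*/1, src, /*srcDevice=*/0, nbytes, stream_src);

        // record the completion point on the source stream ...
        cudaEventRecord(event, stream_src);

        // ... and make the destination stream wait for it before running further work
        cudaSetDevice(1);
        cudaStreamWaitEvent(stream_dst, event, 0);

        cudaEventDestroy(event);
    }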

+ 5 - 0
ggml-kompute.cpp

@@ -1951,6 +1951,11 @@ static struct ggml_backend_i kompute_backend_i = {
     /* .graph_plan_compute      = */ NULL,
     /* .graph_plan_compute      = */ NULL,
     /* .graph_compute           = */ ggml_backend_kompute_graph_compute,
     /* .graph_compute           = */ ggml_backend_kompute_graph_compute,
     /* .supports_op             = */ ggml_backend_kompute_supports_op,
     /* .supports_op             = */ ggml_backend_kompute_supports_op,
+    /* .event_new               = */ NULL,
+    /* .event_free              = */ NULL,
+    /* .event_record            = */ NULL,
+    /* .event_wait              = */ NULL,
+    /* .event_synchronize       = */ NULL,
 };
 };
 
 
 static ggml_guid_t ggml_backend_kompute_guid() {
 static ggml_guid_t ggml_backend_kompute_guid() {

+ 5 - 0
ggml-metal.m

@@ -2820,6 +2820,11 @@ static struct ggml_backend_i ggml_backend_metal_i = {
     /* .graph_plan_compute      = */ NULL,
     /* .graph_plan_compute      = */ NULL,
     /* .graph_compute           = */ ggml_backend_metal_graph_compute,
     /* .graph_compute           = */ ggml_backend_metal_graph_compute,
     /* .supports_op             = */ ggml_backend_metal_supports_op,
     /* .supports_op             = */ ggml_backend_metal_supports_op,
+    /* .event_new               = */ NULL,
+    /* .event_free              = */ NULL,
+    /* .event_record            = */ NULL,
+    /* .event_wait              = */ NULL,
+    /* .event_synchronize       = */ NULL,
 };
 };
 
 
 void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {
 void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {

+ 6 - 1
ggml-sycl.cpp

@@ -17249,13 +17249,18 @@ static ggml_backend_i ggml_backend_sycl_interface = {
     /* .get_default_buffer_type = */ ggml_backend_sycl_get_default_buffer_type,
     /* .get_default_buffer_type = */ ggml_backend_sycl_get_default_buffer_type,
     /* .set_tensor_async        = */ ggml_backend_sycl_set_tensor_async,
     /* .set_tensor_async        = */ ggml_backend_sycl_set_tensor_async,
     /* .get_tensor_async        = */ ggml_backend_sycl_get_tensor_async,
     /* .get_tensor_async        = */ ggml_backend_sycl_get_tensor_async,
-    /* .cpy_tensor_async        = */ ggml_backend_sycl_cpy_tensor_async,
+    /* .cpy_tensor_async        = */ NULL, //ggml_backend_sycl_cpy_tensor_async, // TODO: update for the new interface
     /* .synchronize             = */ ggml_backend_sycl_synchronize,
     /* .synchronize             = */ ggml_backend_sycl_synchronize,
     /* .graph_plan_create       = */ NULL,
     /* .graph_plan_create       = */ NULL,
     /* .graph_plan_free         = */ NULL,
     /* .graph_plan_free         = */ NULL,
     /* .graph_plan_compute      = */ NULL,
     /* .graph_plan_compute      = */ NULL,
     /* .graph_compute           = */ ggml_backend_sycl_graph_compute,
     /* .graph_compute           = */ ggml_backend_sycl_graph_compute,
     /* .supports_op             = */ ggml_backend_sycl_supports_op,
     /* .supports_op             = */ ggml_backend_sycl_supports_op,
+    /* .event_new               = */ NULL,
+    /* .event_free              = */ NULL,
+    /* .event_record            = */ NULL,
+    /* .event_wait              = */ NULL,
+    /* .event_synchronize       = */ NULL,
 };
 };
 
 
 static ggml_guid_t ggml_backend_sycl_guid() {
 static ggml_guid_t ggml_backend_sycl_guid() {

+ 5 - 0
ggml-vulkan.cpp

@@ -5693,6 +5693,11 @@ static ggml_backend_i ggml_backend_vk_interface = {
     /* .graph_plan_compute      = */ NULL,
     /* .graph_plan_compute      = */ NULL,
     /* .graph_compute           = */ ggml_backend_vk_graph_compute,
     /* .graph_compute           = */ ggml_backend_vk_graph_compute,
     /* .supports_op             = */ ggml_backend_vk_supports_op,
     /* .supports_op             = */ ggml_backend_vk_supports_op,
+    /* .event_new               = */ NULL,
+    /* .event_free              = */ NULL,
+    /* .event_record            = */ NULL,
+    /* .event_wait              = */ NULL,
+    /* .event_synchronize       = */ NULL,
 };
 };
 
 
 static ggml_guid_t ggml_backend_vk_guid() {
 static ggml_guid_t ggml_backend_vk_guid() {

+ 68 - 45
ggml.c

@@ -11560,8 +11560,6 @@ static void ggml_compute_forward_get_rows_q(
     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * src1 = dst->src[1];
     const struct ggml_tensor * src1 = dst->src[1];
 
 
-    assert(params->ith == 0);
-
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
         return;
         return;
     }
     }
@@ -11569,7 +11567,7 @@ static void ggml_compute_forward_get_rows_q(
     GGML_TENSOR_BINARY_OP_LOCALS
     GGML_TENSOR_BINARY_OP_LOCALS
 
 
     const int64_t nc = ne00;
     const int64_t nc = ne00;
-    const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
+    const int64_t nr = ggml_nelements(src1);
 
 
     const enum ggml_type type = src0->type;
     const enum ggml_type type = src0->type;
     ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
     ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
@@ -11579,17 +11577,25 @@ static void ggml_compute_forward_get_rows_q(
     assert(nb00 == ggml_type_size(type));
     assert(nb00 == ggml_type_size(type));
     assert(ggml_nrows(dst) == nr);
     assert(ggml_nrows(dst) == nr);
 
 
-    // TODO: multi-thread
-    for (int64_t i12 = 0; i12 < ne12; ++i12) {
-        for (int64_t i11 = 0; i11 < ne11; ++i11) {
-            for (int64_t i10 = 0; i10 < ne10; ++i10) {
-                const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+    const int ith = params->ith;
+    const int nth = params->nth;
 
 
-                dequantize_row_q(
-                        (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
-                             (float *) ((char *)  dst->data + i10*nb1  + i11*nb2  + i12*nb3), nc);
-            }
-        }
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int64_t i = ir0; i < ir1; ++i) {
+        const int64_t i12 = i/(ne11*ne10);
+        const int64_t i11 = (i - i12*ne11*ne10)/ne10;
+        const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
+        const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+
+        dequantize_row_q(
+                (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
+                     (float *) ((char *)  dst->data + i10*nb1  + i11*nb2  + i12*nb3), nc);
     }
     }
 }
 }
 
 
@@ -11600,8 +11606,6 @@ static void ggml_compute_forward_get_rows_f16(
     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * src1 = dst->src[1];
     const struct ggml_tensor * src1 = dst->src[1];
 
 
-    assert(params->ith == 0);
-
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
         return;
         return;
     }
     }
@@ -11609,24 +11613,32 @@ static void ggml_compute_forward_get_rows_f16(
     GGML_TENSOR_BINARY_OP_LOCALS
     GGML_TENSOR_BINARY_OP_LOCALS
 
 
     const int64_t nc = ne00;
     const int64_t nc = ne00;
-    const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
+    const int64_t nr = ggml_nelements(src1);
 
 
     assert(ne0  == nc);
     assert(ne0  == nc);
     assert(ne02 == ne11);
     assert(ne02 == ne11);
     assert(nb00 == sizeof(ggml_fp16_t));
     assert(nb00 == sizeof(ggml_fp16_t));
     assert(ggml_nrows(dst) == nr);
     assert(ggml_nrows(dst) == nr);
 
 
-    // TODO: multi-thread
-    for (int64_t i12 = 0; i12 < ne12; ++i12) {
-        for (int64_t i11 = 0; i11 < ne11; ++i11) {
-            for (int64_t i10 = 0; i10 < ne10; ++i10) {
-                const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
 
 
-                ggml_fp16_to_fp32_row(
-                        (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
-                             (float *) ((char *)  dst->data + i10*nb1  + i11*nb2  + i12*nb3), nc);
-            }
-        }
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int64_t i = ir0; i < ir1; ++i) {
+        const int64_t i12 = i/(ne11*ne10);
+        const int64_t i11 = (i - i12*ne11*ne10)/ne10;
+        const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
+        const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+
+        ggml_fp16_to_fp32_row(
+                (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
+                     (float *) ((char *)  dst->data + i10*nb1  + i11*nb2  + i12*nb3), nc);
     }
     }
 }
 }
 
 
@@ -11637,8 +11649,6 @@ static void ggml_compute_forward_get_rows_f32(
     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * src1 = dst->src[1];
     const struct ggml_tensor * src1 = dst->src[1];
 
 
-    assert(params->ith == 0);
-
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
         return;
         return;
     }
     }
@@ -11646,24 +11656,32 @@ static void ggml_compute_forward_get_rows_f32(
     GGML_TENSOR_BINARY_OP_LOCALS
     GGML_TENSOR_BINARY_OP_LOCALS
 
 
     const int64_t nc = ne00;
     const int64_t nc = ne00;
-    const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
+    const int64_t nr = ggml_nelements(src1);
 
 
     assert(ne0  == nc);
     assert(ne0  == nc);
     assert(ne02 == ne11);
     assert(ne02 == ne11);
     assert(nb00 == sizeof(float));
     assert(nb00 == sizeof(float));
     assert(ggml_nrows(dst) == nr);
     assert(ggml_nrows(dst) == nr);
 
 
-    // TODO: multi-thread
-    for (int64_t i12 = 0; i12 < ne12; ++i12) {
-        for (int64_t i11 = 0; i11 < ne11; ++i11) {
-            for (int64_t i10 = 0; i10 < ne10; ++i10) {
-                const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+    const int ith = params->ith;
+    const int nth = params->nth;
 
 
-                ggml_vec_cpy_f32(nc,
-                        (float *) ((char *)  dst->data + i10*nb1  + i11*nb2  + i12*nb3),
-                        (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
-            }
-        }
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int64_t i = ir0; i < ir1; ++i) {
+        const int64_t i12 = i/(ne11*ne10);
+        const int64_t i11 = (i - i12*ne11*ne10)/ne10;
+        const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
+        const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+
+        ggml_vec_cpy_f32(nc,
+                (float *) ((char *)  dst->data + i10*nb1  + i11*nb2  + i12*nb3),
+                (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
     }
     }
 }
 }
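
All three get_rows paths now split the flattened (i12, i11, i10) index space evenly across the available threads instead of running on a single thread. Below is a tiny standalone sketch of the same partitioning and index decomposition; the tensor dimensions and thread count are arbitrary example values.

    #include <stdint.h>
    #include <stdio.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    int main(void) {
        const int64_t ne10 = 5, ne11 = 3, ne12 = 2;   // dimensions of the row-index tensor
        const int64_t nr   = ne10*ne11*ne12;          // total number of rows to gather
        const int     nth  = 4;                       // number of threads

        for (int ith = 0; ith < nth; ++ith) {
            const int64_t dr  = (nr + nth - 1)/nth;   // rows per thread, rounded up
            const int64_t ir0 = dr*ith;               // first row of this thread
            const int64_t ir1 = MIN(ir0 + dr, nr);    // one past its last row

            for (int64_t i = ir0; i < ir1; ++i) {
                // recover the original loop indices from the flat index
                const int64_t i12 = i/(ne11*ne10);
                const int64_t i11 = (i - i12*ne11*ne10)/ne10;
                const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
                printf("thread %d: row %3lld -> (i12=%lld, i11=%lld, i10=%lld)\n",
                       ith, (long long) i, (long long) i12, (long long) i11, (long long) i10);
            }
        }
        return 0;
    }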
 
 
@@ -17796,7 +17814,7 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const
     node->perf_time_us += time_us_cur;
     node->perf_time_us += time_us_cur;
 }
 }
 
 
-static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
+static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_threads) {
     int n_tasks = 0;
     int n_tasks = 0;
 
 
     switch (node->op) {
     switch (node->op) {
@@ -17877,6 +17895,12 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             {
             {
                 n_tasks = n_threads;
                 n_tasks = n_threads;
             } break;
             } break;
+        case GGML_OP_GET_ROWS:
+            {
+                // FIXME: the cost of launching additional threads decreases performance with GPU offloading
+                //n_tasks = MIN(n_threads, ggml_nelements(node->src[1]));
+                n_tasks = MIN(n_cur_threads, ggml_nelements(node->src[1]));
+            } break;
         case GGML_OP_SCALE:
         case GGML_OP_SCALE:
         case GGML_OP_SET:
         case GGML_OP_SET:
         case GGML_OP_CONT:
         case GGML_OP_CONT:
@@ -17884,7 +17908,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
         case GGML_OP_VIEW:
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:
         case GGML_OP_PERMUTE:
         case GGML_OP_TRANSPOSE:
         case GGML_OP_TRANSPOSE:
-        case GGML_OP_GET_ROWS:
         case GGML_OP_GET_ROWS_BACK:
         case GGML_OP_GET_ROWS_BACK:
         case GGML_OP_DIAG:
         case GGML_OP_DIAG:
             {
             {
@@ -18102,7 +18125,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                 /* FINALIZE */
                 /* FINALIZE */
                 struct ggml_tensor * node = cgraph->nodes[node_n];
                 struct ggml_tensor * node = cgraph->nodes[node_n];
                 if (GGML_OP_HAS_FINALIZE[node->op]) {
                 if (GGML_OP_HAS_FINALIZE[node->op]) {
-                    params.nth = ggml_get_n_tasks(node, n_threads);
+                    params.nth = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
                     ggml_compute_forward(&params, node);
                     ggml_compute_forward(&params, node);
                 }
                 }
                 ggml_graph_compute_perf_stats_node(node, state->shared);
                 ggml_graph_compute_perf_stats_node(node, state->shared);
@@ -18112,7 +18135,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
             while (++node_n < cgraph->n_nodes) {
             while (++node_n < cgraph->n_nodes) {
                 GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
                 GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
                 struct ggml_tensor * node = cgraph->nodes[node_n];
                 struct ggml_tensor * node = cgraph->nodes[node_n];
-                const int n_tasks = ggml_get_n_tasks(node, n_threads);
+                const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
 
 
                 state->shared->perf_node_start_cycles  = ggml_perf_cycles();
                 state->shared->perf_node_start_cycles  = ggml_perf_cycles();
                 state->shared->perf_node_start_time_us = ggml_perf_time_us();
                 state->shared->perf_node_start_time_us = ggml_perf_time_us();
@@ -18160,7 +18183,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
 
         /* INIT & COMPUTE */
         /* INIT & COMPUTE */
         struct ggml_tensor * node = cgraph->nodes[node_n];
         struct ggml_tensor * node = cgraph->nodes[node_n];
-        const int n_tasks = ggml_get_n_tasks(node, n_threads);
+        const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
 
 
         struct ggml_compute_params params = {
         struct ggml_compute_params params = {
             /*.type  =*/ GGML_TASK_TYPE_INIT,
             /*.type  =*/ GGML_TASK_TYPE_INIT,
@@ -18225,7 +18248,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
     for (int i = 0; i < cgraph->n_nodes; i++) {
     for (int i = 0; i < cgraph->n_nodes; i++) {
         struct ggml_tensor * node = cgraph->nodes[i];
         struct ggml_tensor * node = cgraph->nodes[i];
 
 
-        const int n_tasks = ggml_get_n_tasks(node, n_threads);
+        const int n_tasks = ggml_get_n_tasks(node, n_threads, 1);
 
 
         max_tasks = MAX(max_tasks, n_tasks);
         max_tasks = MAX(max_tasks, n_tasks);
 
 

The diff for this file has been suppressed because it is too large
+ 235 - 235
llama.cpp


+ 8 - 1
llama.h

@@ -234,7 +234,8 @@ extern "C" {
     struct llama_context_params {
     struct llama_context_params {
         uint32_t seed;              // RNG seed, -1 for random
         uint32_t seed;              // RNG seed, -1 for random
         uint32_t n_ctx;             // text context, 0 = from model
         uint32_t n_ctx;             // text context, 0 = from model
-        uint32_t n_batch;           // prompt processing maximum batch size
+        uint32_t n_batch;           // logical maximum batch size that can be submitted to llama_decode
+        uint32_t n_ubatch;          // physical maximum batch size
         uint32_t n_seq_max;         // max number of sequences (i.e. distinct states for recurrent models)
         uint32_t n_seq_max;         // max number of sequences (i.e. distinct states for recurrent models)
         uint32_t n_threads;         // number of threads to use for generation
         uint32_t n_threads;         // number of threads to use for generation
         uint32_t n_threads_batch;   // number of threads to use for batch processing
         uint32_t n_threads_batch;   // number of threads to use for batch processing
@@ -377,6 +378,7 @@ extern "C" {
 
 
     LLAMA_API uint32_t llama_n_ctx      (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_ctx      (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_batch    (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_batch    (const struct llama_context * ctx);
+    LLAMA_API uint32_t llama_n_ubatch   (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_seq_max  (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_seq_max  (const struct llama_context * ctx);
 
 
     LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
     LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
@@ -650,6 +652,11 @@ extern "C" {
     // Set abort callback
     // Set abort callback
     LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
     LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
 
 
+    // Wait until all computations are finished
+    // This is automatically done when using one of the functions below to obtain the computation results
+    // and is not necessary to call it explicitly in most cases
+    LLAMA_API void llama_synchronize(struct llama_context * ctx);
+
     // Token logits obtained from the last call to llama_decode()
     // Token logits obtained from the last call to llama_decode()
     // The logits for the last token are stored in the last row
     // The logits for the last token are stored in the last row
     // Logits for which llama_batch.logits[i] == 0 are undefined
     // Logits for which llama_batch.logits[i] == 0 are undefined
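
On the llama.cpp side, n_batch now only bounds how many tokens can be passed to a single llama_decode call, while n_ubatch bounds how many tokens are run per compute graph; llama_synchronize is only needed when the caller wants to wait explicitly (for example, for timing), since reading logits or embeddings synchronizes implicitly. A hedged sketch of the relevant context setup follows; make_ctx is a made-up helper and "model" is assumed to be already loaded.

    #include "llama.h"

    // up to 2048 tokens per llama_decode call, processed in chunks of 512 tokens
    static struct llama_context * make_ctx(struct llama_model * model) {
        struct llama_context_params cparams = llama_context_default_params();
        cparams.n_ctx    = 4096;
        cparams.n_batch  = 2048;   // logical maximum batch size
        cparams.n_ubatch = 512;    // physical maximum batch size (per graph)
        return llama_new_context_with_model(model, cparams);
    }

    // when timing a decode, wait for the backends explicitly:
    //     llama_decode(ctx, batch);
    //     llama_synchronize(ctx);   // otherwise the measured time may exclude pending GPU work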

Some files were not shown because too many files changed in this commit