Просмотр исходного кода

llama : refactor graph build code (#3837)

* llama : factor out ggml-alloc from graph graph build functions

ggml-ci

* metal : disable kernel load log

* llama : factor out tensor offloading outside the build call (wip)

ggml-ci

* llama : offload rest of the models

ggml-ci

* llama : update offload log messages to print node index

* llama : comments

* llama : support offloading result_norm + comments

* llama : factor graph input into a function

* llama : do tensor offload only with CUDA

* llama : fix res_norm offloading

* llama : try to optimize offloading code

* llama : fix non-CUDA build

* llama : try to fix build

* llama : move refact in correct place + optimize graph input

* llama : refactor tensor offloading as callback

* llama : add layer index to all tensor names

* llama : add functional header

* llama : comment

ggml-ci

* llama : remove obsolete map for layer counting

* llama : add llm_build helper functions (#3848)

* llama : add llm_build_norm helper function

ggml-ci

* llama : add llm_build_ffn helper function (#3849)

ggml-ci

* llama : add llm_build_k_shift helper

ggml-ci

* llama : fix offloading after recent changes

* llama : add llm_build_kv_store helper

ggml-ci

* llama : remove obsolete offload names

* llama : fix llm_build_k_shift to use n_head_kv instead of n_head

* llama : simplify falcon Q, K, V computation

* llama : remove obsolete comments in build graphs

* llama : add llm_build_kqv helper

ggml-ci

* llama : minor

* llama : add LLAMA_OFFLOAD_DEBUG + fix starcoder offloading

* llama : fix input allocation logic

* llama : update offload functions for KQ tensors

* llama : normalize tensor names

ggml-ci

* llama : enable warning about not offloaded tensors

* llama : remove extra ; + deduplicate gate_b logic

* llama : add llm_build_inp_embd helper
Georgi Gerganov 2 лет назад
Родитель
Сommit
71e3718abd
3 измененных файлов с 473 добавлено и 355 удалено
  1. 7 4
      ggml-metal.m
  2. 1 1
      ggml.h
  3. 465 350
      llama.cpp

+ 7 - 4
ggml-metal.m

@@ -238,14 +238,17 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
     // load kernels
     {
         NSError * error = nil;
-#define GGML_METAL_ADD_KERNEL(name) \
-        ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
-        ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
+
+        /*
         GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \
                 (int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \
                 (int) ctx->pipeline_##name.threadExecutionWidth); \
+        */
+#define GGML_METAL_ADD_KERNEL(name) \
+        ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
+        ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
         if (error) { \
-          GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
+            GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
             return NULL; \
         }
 

+ 1 - 1
ggml.h

@@ -709,7 +709,7 @@ extern "C" {
     // Context tensor enumeration and lookup
     GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
     GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
-    GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
+    GGML_API struct ggml_tensor * ggml_get_tensor      (struct ggml_context * ctx, const char * name);
 
     GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
     GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);

Разница между файлами не показана из-за своего большого размера
+ 465 - 350
llama.cpp


Некоторые файлы не были показаны из-за большого количества измененных файлов