1 år sedan · 95f57bb5d5
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -144,9 +144,6 @@ option(LLAMA_BUILD_SERVER                    "llama: build server example"
 
				 option(LLAMA_LASX                            "llama: enable lasx"                               ON)
			
 
				 option(LLAMA_LSX                             "llama: enable lsx"                                ON)
			
 
				 
			
 
				-# add perf arguments
			
 
				-option(LLAMA_PERF                            "llama: enable perf"                               OFF)
			
 
				-
			
 
				 # Required for relocatable CMake package
			
 
				 include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
			
 
				 
			
@@ -870,10 +867,6 @@ if (LLAMA_CPU_HBM)
 
				     target_link_libraries(ggml PUBLIC memkind)
			
 
				 endif()
			
 
				 
			
 
				-if (LLAMA_PERF)
			
 
				-    add_compile_definitions(GGML_PERF)
			
 
				-endif()
			
 
				-
			
 
				 function(get_flags CCID CCVER)
			
 
				     set(C_FLAGS "")
			
 
				     set(CXX_FLAGS "")
			
--- a/Makefile
+++ b/Makefile
@@ -344,9 +344,6 @@ ifdef LLAMA_GPROF
 
				 	MK_CFLAGS   += -pg
			
 
				 	MK_CXXFLAGS += -pg
			
 
				 endif
			
 
				-ifdef LLAMA_PERF
			
 
				-	MK_CPPFLAGS += -DGGML_PERF
			
 
				-endif
			
 
				 
			
 
				 # Architecture specific
			
 
				 # TODO: probably these flags need to be tweaked on some architectures
			
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -513,8 +513,8 @@ static size_t vk_skip_checks;
 
				 static size_t vk_output_tensor;
			
 
				 
			
 
				 static void ggml_vk_print_tensor(ggml_backend * ctx, const ggml_tensor * tensor, const char * name);
			
 
				-static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor);
			
 
				-static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor);
			
 
				+static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor * tensor);
			
 
				+static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor * tensor);
			
 
				 #endif
			
 
				 
			
 
				 typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
			
@@ -5644,7 +5644,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
 
				     }
			
 
				 }
			
 
				 
			
 
				-static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){
			
 
				+static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor){
			
 
				     ggml_tensor_extra_gpu * extra = nullptr;
			
 
				 
			
 
				     switch (tensor->op) {
			
@@ -5697,17 +5697,10 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
 
				         return false;
			
 
				     }
			
 
				 
			
 
				-    if (params->ith != 0) {
			
 
				-        return true;
			
 
				-    }
			
 
				-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
			
 
				-        return true;
			
 
				-    }
			
 
				-
			
 
				     VK_LOG_DEBUG("ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")");
			
 
				 
			
 
				 #ifdef GGML_VULKAN_CHECK_RESULTS
			
 
				-    ggml_vk_check_results_0(ctx, params, tensor);
			
 
				+    ggml_vk_check_results_0(ctx, tensor);
			
 
				 #endif
			
 
				 
			
 
				     vk_context& subctx = ctx->gc.contexts[extra->ctx_idx];
			
@@ -6214,9 +6207,6 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
 
				         ggml_vk_build_graph(ctx,cgraph->nodes[i], i == last_node);
			
 
				     }
			
 
				 
			
 
				-    ggml_compute_params params = {};
			
 
				-    params.type = GGML_TASK_TYPE_COMPUTE;
			
 
				-    params.ith = 0;
			
 
				     for (int i = 0; i < cgraph->n_nodes; i++) {
			
 
				         ggml_tensor * node = cgraph->nodes[i];
			
 
				 
			
@@ -6224,13 +6214,13 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
 
				             continue;
			
 
				         }
			
 
				 
			
 
				-        bool ok = ggml_vk_compute_forward(ctx, &params, node);
			
 
				+        bool ok = ggml_vk_compute_forward(ctx, node);
			
 
				         if (!ok) {
			
 
				             fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
			
 
				         }
			
 
				 #ifdef GGML_VULKAN_CHECK_RESULTS
			
 
				         else {
			
 
				-            ggml_vk_check_results_1(ctx, &params, node);
			
 
				+            ggml_vk_check_results_1(ctx, node);
			
 
				         }
			
 
				 #endif
			
 
				         GGML_ASSERT(ok);
			
@@ -6600,11 +6590,8 @@ void * comp_result;
 
				 size_t comp_size;
			
 
				 size_t comp_nb[GGML_MAX_DIMS];
			
 
				 size_t check_counter = 0;
			
 
				-static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor) {
			
 
				-    if (params->ith != 0) {
			
 
				-        return;
			
 
				-    }
			
 
				-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
			
 
				+static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor * tensor) {
			
 
				+        if (tensor->op == GGML_OP_TRANSPOSE) {
			
 
				         return;
			
 
				     }
			
 
				 
			
@@ -6908,11 +6895,8 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
 
				     ggml_free(ggml_ctx);
			
 
				 }
			
 
				 
			
 
				-static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor) {
			
 
				-    if (params->ith != 0) {
			
 
				-        return;
			
 
				-    }
			
 
				-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
			
 
				+static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor * tensor) {
			
 
				+    if (tensor->op == GGML_OP_TRANSPOSE) {
			
 
				         return;
			
 
				     }
			
 
				     if (!(vk_output_tensor > 0 && vk_output_tensor == check_counter) && check_counter <= vk_skip_checks) {
			
--- a/ggml.c
+++ b/ggml.c
--- a/ggml.h
+++ b/ggml.h
@@ -591,11 +591,7 @@ extern "C" {
 
				         struct ggml_tensor * grad;
			
 
				         struct ggml_tensor * src[GGML_MAX_SRC];
			
 
				 
			
 
				-        // performance
			
 
				-        int     perf_runs;
			
 
				-        int64_t perf_cycles;
			
 
				-        int64_t perf_time_us;
			
 
				-
			
 
				+        // source tensor and offset for views
			
 
				         struct ggml_tensor * view_src;
			
 
				         size_t               view_offs;
			
 
				 
			
@@ -605,7 +601,7 @@ extern "C" {
 
				 
			
 
				         void * extra; // extra things e.g. for ggml-cuda.cu
			
 
				 
			
 
				-        char padding[8];
			
 
				+        // char padding[4];
			
 
				     };
			
 
				 
			
 
				     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
			
@@ -652,11 +648,6 @@ extern "C" {
 
				         struct ggml_hash_set visited_hash_table;
			
 
				 
			
 
				         enum ggml_cgraph_eval_order order;
			
 
				-
			
 
				-        // performance
			
 
				-        int     perf_runs;
			
 
				-        int64_t perf_cycles;
			
 
				-        int64_t perf_time_us;
			
 
				     };
			
 
				 
			
 
				     // scratch buffer
			
@@ -673,28 +664,6 @@ extern "C" {
 
				         bool   no_alloc;   // don't allocate memory for the tensor data
			
 
				     };
			
 
				 
			
 
				-
			
 
				-    // compute types
			
 
				-
			
 
				-    // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
			
 
				-    // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
			
 
				-    enum ggml_task_type {
			
 
				-        GGML_TASK_TYPE_INIT = 0,
			
 
				-        GGML_TASK_TYPE_COMPUTE,
			
 
				-        GGML_TASK_TYPE_FINALIZE,
			
 
				-    };
			
 
				-
			
 
				-    struct ggml_compute_params {
			
 
				-        enum ggml_task_type type;
			
 
				-
			
 
				-        // ith = thread index, nth = number of threads
			
 
				-        int ith, nth;
			
 
				-
			
 
				-        // work buffer for all threads
			
 
				-        size_t wsize;
			
 
				-        void * wdata;
			
 
				-    };
			
 
				-
			
 
				     // numa strategies
			
 
				     enum ggml_numa_strategy {
			
 
				         GGML_NUMA_STRATEGY_DISABLED   = 0,
			
--- a/llama.cpp
+++ b/llama.cpp
@@ -12785,12 +12785,6 @@ static int llama_decode_internal(
 
				             }
			
 
				         }
			
 
				 
			
 
				-#ifdef GGML_PERF
			
 
				-        // print timing information per ggml operation (for debugging purposes)
			
 
				-        // requires GGML_PERF to be defined
			
 
				-        ggml_graph_print(gf);
			
 
				-#endif
			
 
				-
			
 
				         // plot the computation graph in dot format (for debugging purposes)
			
 
				         //if (n_past%100 == 0) {
			
 
				         //    ggml_graph_dump_dot(gf, NULL, "llama.dot");
			
--- a/sgemm.cpp
+++ b/sgemm.cpp
@@ -249,9 +249,8 @@ class tinyBLAS {
 
				         : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
			
 
				     }
			
 
				 
			
 
				-    void matmul(int64_t m, int64_t n, int task) {
			
 
				-        if (task == GGML_TASK_TYPE_COMPUTE)
			
 
				-            mnpack(0, m, 0, n);
			
 
				+    void matmul(int64_t m, int64_t n) {
			
 
				+        mnpack(0, m, 0, n);
			
 
				     }
			
 
				 
			
 
				   private:
			
@@ -458,9 +457,8 @@ class tinyBLAS_Q0_ARM {
 
				         : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
			
 
				     }
			
 
				 
			
 
				-    void matmul(int64_t m, int64_t n, int task) {
			
 
				-        if (task == GGML_TASK_TYPE_COMPUTE)
			
 
				-            mnpack(0, m, 0, n);
			
 
				+    void matmul(int64_t m, int64_t n) {
			
 
				+        mnpack(0, m, 0, n);
			
 
				     }
			
 
				 
			
 
				   private:
			
@@ -596,9 +594,8 @@ class tinyBLAS_Q0_AVX {
 
				         : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
			
 
				     }
			
 
				 
			
 
				-    void matmul(int64_t m, int64_t n, int task) {
			
 
				-        if (task == GGML_TASK_TYPE_COMPUTE)
			
 
				-            mnpack(0, m, 0, n);
			
 
				+    void matmul(int64_t m, int64_t n) {
			
 
				+        mnpack(0, m, 0, n);
			
 
				     }
			
 
				 
			
 
				   private:
			
@@ -829,7 +826,7 @@ class tinyBLAS_Q0_AVX {
 
				  * For example, for single-threaded single-precision GEMM you can say
			
 
				  *
			
 
				  *     llamafile_sgemm(m, n, k, A, lda, B, ldb, C, ldc,
			
 
				- *                     0, 1, GGML_TASK_TYPE_COMPUTE,
			
 
				+ *                     0, 1,
			
 
				  *                     GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32);
			
 
				  *
			
 
				  * @param m is rows in `A` and `C`
			
@@ -843,14 +840,13 @@ class tinyBLAS_Q0_AVX {
 
				  * @param ldc is row stride of `C`
			
 
				  * @param ith is thread id (must be less than `nth`)
			
 
				  * @param nth is number of threads (must be greater than zero)
			
 
				- * @param task is GGML task type
			
 
				  * @param Atype is GGML data type of `A`
			
 
				  * @param Btype is GGML data type of `B`
			
 
				  * @param Ctype is GGML data type of `C`
			
 
				  * @return true if this function was able to service the matmul request
			
 
				  */
			
 
				 bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda, const void *B, int64_t ldb, void *C,
			
 
				-                     int64_t ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype) {
			
 
				+                     int64_t ldc, int ith, int nth, int Atype, int Btype, int Ctype) {
			
 
				 
			
 
				     assert(m >= 0);
			
 
				     assert(n >= 0);
			
@@ -877,7 +873,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
 
				             (const float *)B, ldb,
			
 
				             (float *)C, ldc,
			
 
				             ith, nth};
			
 
				-        tb.matmul(m, n, task);
			
 
				+        tb.matmul(m, n);
			
 
				         return true;
			
 
				 #elif defined(__AVX__) || defined(__AVX2__)
			
 
				         if (k % 8)
			
@@ -887,7 +883,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
 
				             (const float *)B, ldb,
			
 
				             (float *)C, ldc,
			
 
				             ith, nth};
			
 
				-        tb.matmul(m, n, task);
			
 
				+        tb.matmul(m, n);
			
 
				         return true;
			
 
				 #elif defined(__ARM_NEON)
			
 
				         if (n < 4)
			
@@ -899,7 +895,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
 
				             (const float *)B, ldb,
			
 
				             (float *)C, ldc,
			
 
				             ith, nth};
			
 
				-        tb.matmul(m, n, task);
			
 
				+        tb.matmul(m, n);
			
 
				         return true;
			
 
				 #else
			
 
				         return false;
			
@@ -917,7 +913,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
 
				             (const float *)B, ldb,
			
 
				             (float *)C, ldc,
			
 
				             ith, nth};
			
 
				-        tb.matmul(m, n, task);
			
 
				+        tb.matmul(m, n);
			
 
				         return true;
			
 
				 #elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__)
			
 
				         if (k % 8)
			
@@ -929,7 +925,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
 
				             (const float *)B, ldb,
			
 
				             (float *)C, ldc,
			
 
				             ith, nth};
			
 
				-        tb.matmul(m, n, task);
			
 
				+        tb.matmul(m, n);
			
 
				         return true;
			
 
				 #elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
			
 
				         if (n < 8)
			
@@ -943,7 +939,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
 
				             (const ggml_fp16_t *)B, ldb,
			
 
				             (float *)C, ldc,
			
 
				             ith, nth};
			
 
				-        tb.matmul(m, n, task);
			
 
				+        tb.matmul(m, n);
			
 
				         return true;
			
 
				 #elif defined(__ARM_NEON) && !defined(_MSC_VER)
			
 
				         if (k % 4)
			
@@ -955,7 +951,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
 
				             (const float *)B, ldb,
			
 
				             (float *)C, ldc,
			
 
				             ith, nth};
			
 
				-        tb.matmul(m, n, task);
			
 
				+        tb.matmul(m, n);
			
 
				         return true;
			
 
				 #else
			
 
				         return false;
			
@@ -971,7 +967,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
 
				             (const block_q8_0 *)B, ldb,
			
 
				             (float *)C, ldc,
			
 
				             ith, nth};
			
 
				-        tb.matmul(m, n, task);
			
 
				+        tb.matmul(m, n);
			
 
				         return true;
			
 
				 #elif defined(__ARM_FEATURE_DOTPROD)
			
 
				         tinyBLAS_Q0_ARM<block_q8_0> tb{
			
@@ -979,7 +975,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
 
				             (const block_q8_0 *)B, ldb,
			
 
				             (float *)C, ldc,
			
 
				             ith, nth};
			
 
				-        tb.matmul(m, n, task);
			
 
				+        tb.matmul(m, n);
			
 
				         return true;
			
 
				 #else
			
 
				         return false;
			
@@ -995,7 +991,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
 
				             (const block_q8_0 *)B, ldb,
			
 
				             (float *)C, ldc,
			
 
				             ith, nth};
			
 
				-        tb.matmul(m, n, task);
			
 
				+        tb.matmul(m, n);
			
 
				         return true;
			
 
				 #elif defined(__ARM_FEATURE_DOTPROD)
			
 
				         tinyBLAS_Q0_ARM<block_q4_0> tb{
			
@@ -1003,7 +999,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
 
				             (const block_q8_0 *)B, ldb,
			
 
				             (float *)C, ldc,
			
 
				             ith, nth};
			
 
				-        tb.matmul(m, n, task);
			
 
				+        tb.matmul(m, n);
			
 
				         return true;
			
 
				 #else
			
 
				         return false;
			
@@ -1025,7 +1021,6 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
 
				     (void)ldc;
			
 
				     (void)ith;
			
 
				     (void)nth;
			
 
				-    (void)task;
			
 
				     (void)Atype;
			
 
				     (void)Btype;
			
 
				     (void)Ctype;
			
--- a/sgemm.h
+++ b/sgemm.h
@@ -7,7 +7,7 @@ extern "C" {
 
				 
			
 
				 bool llamafile_sgemm(int64_t, int64_t, int64_t, const void *, int64_t,
			
 
				                      const void *, int64_t, void *, int64_t, int, int,
			
 
				-                     int, int, int, int);
			
 
				+                     int, int, int);
			
 
				 
			
 
				 #ifdef __cplusplus
			
 
				 }