
common : more accurate sampling timing (#17382)

* common : more accurate sampling timing

* eval-callback : minor fixes

* cont : add time_meas impl

* cont : fix log msg [no ci]

* cont : fix multiple definitions of time_meas

* llama-cli : exclude chat template init from time measurement

* cont : print percentage of unaccounted time

* cont : do not reset timings
Georgi Gerganov, 1 month ago
parent commit 196f5083ef
7 changed files with 101 additions and 32 deletions
  1. common/common.cpp (+8 -1)
  2. common/common.h (+12 -5)
  3. common/sampling.cpp (+60 -6)
  4. examples/eval-callback/eval-callback.cpp (+9 -9)
  5. src/llama-impl.cpp (+3 -3)
  6. src/llama-sampling.cpp (+3 -6)
  7. tools/main/main.cpp (+6 -2)

+ 8 - 1
common/common.cpp

@@ -26,7 +26,6 @@
 #include <sstream>
 #include <string>
 #include <thread>
-#include <unordered_map>
 #include <unordered_set>
 #include <vector>
 
@@ -60,6 +59,14 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
+common_time_meas::common_time_meas(int64_t & t_acc, bool disable) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}
+
+common_time_meas::~common_time_meas() {
+    if (t_start_us >= 0) {
+        t_acc += ggml_time_us() - t_start_us;
+    }
+}
+
 //
 // CPU utils
 //

+ 12 - 5
common/common.h

@@ -2,17 +2,15 @@
 
 #pragma once
 
+#include "ggml-opt.h"
+#include "llama-cpp.h"
+
 #include <set>
 #include <sstream>
 #include <string>
 #include <string_view>
 #include <vector>
 #include <map>
-#include <sstream>
-#include <cmath>
-
-#include "ggml-opt.h"
-#include "llama-cpp.h"
 
 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'
@@ -30,6 +28,15 @@
 
 #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
 
+struct common_time_meas {
+    common_time_meas(int64_t & t_acc, bool disable = false);
+    ~common_time_meas();
+
+    const int64_t t_start_us;
+
+    int64_t & t_acc;
+};
+
 struct common_adapter_lora_info {
     std::string path;
     float scale;
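
The common.cpp and common.h hunks above introduce a small RAII timer: the constructor records a start timestamp (or -1 when measurement is disabled) and the destructor adds the elapsed microseconds to the referenced accumulator. A minimal standalone sketch of the same pattern, using a hypothetical `scoped_time_acc` with std::chrono in place of `ggml_time_us()` so it compiles outside the repo:

```cpp
// illustrative only: mirrors the common_time_meas pattern with a local clock
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <thread>

struct scoped_time_acc {
    scoped_time_acc(int64_t & t_acc, bool disable = false)
        : t_start_us(disable ? -1 : now_us()), t_acc(t_acc) {}

    ~scoped_time_acc() {
        if (t_start_us >= 0) {
            t_acc += now_us() - t_start_us; // accumulate elapsed time on scope exit
        }
    }

    static int64_t now_us() {
        using namespace std::chrono;
        return duration_cast<microseconds>(steady_clock::now().time_since_epoch()).count();
    }

    const int64_t t_start_us;
    int64_t &     t_acc;
};

int main() {
    int64_t t_total_us = 0;

    for (int i = 0; i < 3; ++i) {
        scoped_time_acc tm(t_total_us);                             // clock starts here (pass true to disable)
        std::this_thread::sleep_for(std::chrono::milliseconds(5));  // stand-in for sampling work
    }                                                               // destructor adds the elapsed time

    printf("accumulated: %.2f ms\n", 1e-3 * t_total_us);
    return 0;
}
```

Because the accumulation happens in the destructor, early returns inside the guarded scope are still counted, which is why the common/sampling.cpp hunks below only need a single `const auto tm = gsmpl->tm();` at the top of each measured function.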

+ 60 - 6
common/sampling.cpp

@@ -3,9 +3,10 @@
 #include "common.h"
 #include "log.h"
 
+#include <algorithm>
 #include <cmath>
+#include <cstring>
 #include <unordered_map>
-#include <algorithm>
 
 // the ring buffer works similarly to std::deque, but with a fixed capacity
 // TODO: deduplicate with llama-impl.h
@@ -112,6 +113,13 @@ struct common_sampler {
 
     llama_token_data_array cur_p;
 
+    void reset() {
+        prev.clear();
+
+        llama_sampler_reset(grmr);
+        llama_sampler_reset(chain);
+    }
+
     void set_logits(struct llama_context * ctx, int idx) {
         const auto * logits = llama_get_logits_ith(ctx, idx);
 
@@ -128,6 +136,12 @@ struct common_sampler {
 
         cur_p = { cur.data(), cur.size(), -1, false };
     }
+
+    common_time_meas tm() {
+        return common_time_meas(t_total_us, params.no_perf);
+    }
+
+    mutable int64_t t_total_us = 0;
 };
 
 std::string common_params_sampling::print() const {
@@ -298,6 +312,8 @@ void common_sampler_free(struct common_sampler * gsmpl) {
 }
 
 void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
+    const auto tm = gsmpl->tm();
+
     if (accept_grammar) {
         llama_sampler_accept(gsmpl->grmr, token);
     }
@@ -308,9 +324,7 @@ void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, boo
 }
 
 void common_sampler_reset(struct common_sampler * gsmpl) {
-    llama_sampler_reset(gsmpl->grmr);
-
-    llama_sampler_reset(gsmpl->chain);
+    gsmpl->reset();
 }
 
 struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
@@ -327,16 +341,54 @@ struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
 void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) {
     // TODO: measure grammar performance
 
+    const double t_sampling_ms = gsmpl ? 1e-3*gsmpl->t_total_us : 0;
+
+    llama_perf_sampler_data data_smpl;
+    llama_perf_context_data data_ctx;
+
+    memset(&data_smpl, 0, sizeof(data_smpl));
+    memset(&data_ctx,  0, sizeof(data_ctx));
+
     if (gsmpl) {
-        llama_perf_sampler_print(gsmpl->chain);
+        auto & data = data_smpl;
+
+        data = llama_perf_sampler(gsmpl->chain);
+
+        // note: the sampling time includes the samplers time + extra time spent in common/sampling
+        LOG_INF("%s:    sampling time = %10.2f ms\n", __func__, t_sampling_ms);
+        LOG_INF("%s:    samplers time = %10.2f ms / %5d tokens\n", __func__, data.t_sample_ms, data.n_sample);
     }
+
     if (ctx) {
-        llama_perf_context_print(ctx);
+        auto & data = data_ctx;
+
+        data = llama_perf_context(ctx);
+
+        const double t_end_ms = 1e-3 * ggml_time_us();
+
+        const double t_total_ms = t_end_ms - data.t_start_ms;
+        const double t_unacc_ms = t_total_ms - (t_sampling_ms + data.t_p_eval_ms + data.t_eval_ms);
+        const double t_unacc_pc = 100.0 * t_unacc_ms /  t_total_ms;
+
+        LOG_INF("%s:        load time = %10.2f ms\n", __func__, data.t_load_ms);
+        LOG_INF("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+                __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
+        LOG_INF("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+                __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
+        LOG_INF("%s:       total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
+        LOG_INF("%s: unaccounted time = %10.2f ms / %5.1f %%      (total - sampling - prompt eval - eval) / (total)\n", __func__, t_unacc_ms, t_unacc_pc);
+        LOG_INF("%s:    graphs reused = %10d\n", __func__, data.n_reused);
+
         llama_memory_breakdown_print(ctx);
     }
 }
 
 llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
+    llama_synchronize(ctx);
+
+    // start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations
+    const auto tm = gsmpl->tm();
+
     gsmpl->set_logits(ctx, idx);
 
     auto & grmr  = gsmpl->grmr;
@@ -428,6 +480,8 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
 // helpers
 
 llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort) {
+    const auto tm = gsmpl->tm();
+
     auto * res = &gsmpl->cur_p;
 
     if (do_sort && !res->sorted) {
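
With these hunks, the reported "sampling time" is the time accumulated by the scoped `tm()` guards in common/sampling (common_sampler_accept, common_sampler_sample, common_sampler_get_candidates), while "samplers time" remains what the llama sampler chain measures internally; whatever wall time is not covered by sampling, prompt eval, or eval is printed as "unaccounted". A small sketch of that bookkeeping with made-up numbers (the variable names mirror the fields used in common_perf_print above):

```cpp
// illustrative only: reproduces the "unaccounted time" arithmetic with hypothetical values
#include <cstdio>

int main() {
    // hypothetical measurements, in milliseconds
    const double t_total_ms    = 1000.0; // wall time since the last perf reset
    const double t_sampling_ms =   40.0; // accumulated in common_sampler::t_total_us
    const double t_p_eval_ms   =  300.0; // prompt processing
    const double t_eval_ms     =  600.0; // token generation

    const double t_unacc_ms = t_total_ms - (t_sampling_ms + t_p_eval_ms + t_eval_ms);
    const double t_unacc_pc = 100.0 * t_unacc_ms / t_total_ms;

    printf("unaccounted time = %.2f ms / %.1f %%\n", t_unacc_ms, t_unacc_pc);
    return 0;
}
```

With these inputs the sketch prints `unaccounted time = 60.00 ms / 6.0 %`.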

+ 9 - 9
examples/eval-callback/eval-callback.cpp

@@ -4,10 +4,10 @@
 #include "llama.h"
 #include "ggml.h"
 
+#include <cmath>
 #include <cstdio>
 #include <string>
 #include <vector>
-#include <numeric>
 
 /**
  * This the arbitrary data which will be passed to each callback.
@@ -37,23 +37,23 @@ static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
     return u.f;
 }
 
-static float ggml_get_float_value(uint8_t * data, ggml_type type, const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) {
+static float ggml_get_float_value(const uint8_t * data, ggml_type type, const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) {
     size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
     float v;
     if (type == GGML_TYPE_F16) {
-        v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
+        v = ggml_fp16_to_fp32(*(const ggml_fp16_t *) &data[i]);
     } else if (type == GGML_TYPE_F32) {
-        v = *(float *) &data[i];
+        v = *(const float *) &data[i];
     } else if (type == GGML_TYPE_I64) {
-        v = (float) *(int64_t *) &data[i];
+        v = (float) *(const int64_t *) &data[i];
     } else if (type == GGML_TYPE_I32) {
-        v = (float) *(int32_t *) &data[i];
+        v = (float) *(const int32_t *) &data[i];
     } else if (type == GGML_TYPE_I16) {
-        v = (float) *(int16_t *) &data[i];
+        v = (float) *(const int16_t *) &data[i];
     } else if (type == GGML_TYPE_I8) {
-        v = (float) *(int8_t *) &data[i];
+        v = (float) *(const int8_t *) &data[i];
     } else if (type == GGML_TYPE_BF16) {
-        v = ggml_compute_bf16_to_fp32(*(ggml_bf16_t *) &data[i]);
+        v = ggml_compute_bf16_to_fp32(*(const ggml_bf16_t *) &data[i]);
     } else {
         GGML_ABORT("fatal error");
     }

+ 3 - 3
src/llama-impl.cpp

@@ -20,10 +20,10 @@ static llama_logger_state g_logger_state;
 time_meas::time_meas(int64_t & t_acc, bool disable) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}
 
 time_meas::~time_meas() {
-        if (t_start_us >= 0) {
-            t_acc += ggml_time_us() - t_start_us;
-        }
+    if (t_start_us >= 0) {
+        t_acc += ggml_time_us() - t_start_us;
     }
+}
 
 void llama_log_set(ggml_log_callback log_callback, void * user_data) {
     ggml_log_set(log_callback, user_data);

+ 3 - 6
src/llama-sampling.cpp

@@ -472,9 +472,6 @@ static void llama_sampler_chain_reset(struct llama_sampler * smpl) {
     for (auto * smpl : chain->samplers) {
         llama_sampler_reset(smpl);
     }
-
-    chain->t_sample_us = 0;
-    chain->n_sample    = 0;
 }
 
 static struct llama_sampler * llama_sampler_chain_clone(const struct llama_sampler * smpl) {
@@ -2670,8 +2667,7 @@ struct llama_perf_sampler_data llama_perf_sampler(const struct llama_sampler * c
 void llama_perf_sampler_print(const struct llama_sampler * chain) {
     const auto data = llama_perf_sampler(chain);
 
-    LLAMA_LOG_INFO("%s:    sampling time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample);
+    LLAMA_LOG_INFO("%s:    samplers time = %10.2f ms / %5d runs\n", __func__, data.t_sample_ms, data.n_sample);
 }
 
 void llama_perf_sampler_reset(struct llama_sampler * chain) {
@@ -2681,5 +2677,6 @@ void llama_perf_sampler_reset(struct llama_sampler * chain) {
 
     auto * ctx = (struct llama_sampler_chain *) chain->ctx;
 
-    ctx->t_sample_us = ctx->n_sample = 0;
+    ctx->t_sample_us = 0;
+    ctx->n_sample    = 0;
 }

+ 6 - 2
tools/main/main.cpp

@@ -147,11 +147,15 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    auto * mem = llama_get_memory(ctx);
-
+    llama_memory_t mem = llama_get_memory(ctx);
     const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    // note: the time for chat template initialization is not negligible:
     auto chat_templates = common_chat_templates_init(model, params.chat_template);
 
+    // start measuring performance timings from here
+    llama_perf_context_reset(ctx);
+
     LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);
 
     auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
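
The main.cpp hunk above notes that chat-template initialization is not negligible and therefore calls `llama_perf_context_reset(ctx)` only after it, so the setup cost no longer shows up in the reported totals. A minimal standalone sketch of the same idea, using a hypothetical std::chrono-based clock instead of the llama_context perf state:

```cpp
// illustrative only: resetting the clock after expensive setup keeps that setup
// out of the measured "total time"
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <thread>

static int64_t now_us() {
    using namespace std::chrono;
    return duration_cast<microseconds>(steady_clock::now().time_since_epoch()).count();
}

int main() {
    int64_t t_start_us = now_us();

    // expensive one-time setup (stand-in for chat template initialization)
    std::this_thread::sleep_for(std::chrono::milliseconds(50));

    // "perf reset": restart the clock here so the setup above is excluded
    t_start_us = now_us();

    // measured work (stand-in for prompt processing and generation)
    std::this_thread::sleep_for(std::chrono::milliseconds(20));

    printf("total time = %.2f ms\n", 1e-3 * (now_us() - t_start_us)); // ~20 ms, not ~70 ms
    return 0;
}
```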