пре 2 година · 4f447a4833
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -5840,7 +5840,7 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
 
				         return ptr;
			
 
				     }
			
 
				 #ifdef DEBUG_CUDA_MALLOC
			
 
				-    fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
			
 
				+    fprintf(stderr, "%s: %d buffers, max_size = %u MiB, tot_size = %u MiB, requested %u MiB\n", __func__, nnz,
			
 
				             (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
			
 
				 #endif
			
 
				     void * ptr;
			
@@ -5978,7 +5978,7 @@ void * ggml_cuda_host_malloc(size_t size) {
 
				         // The allocation error can be bypassed. A null ptr will assigned out of this function.
			
 
				         // This can fixed the OOM error in WSL.
			
 
				         cudaGetLastError();
			
 
				-        fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
			
 
				+        fprintf(stderr, "WARNING: failed to allocate %.2f MiB of pinned memory: %s\n",
			
 
				             size/1024.0/1024.0, cudaGetErrorString(err));
			
 
				         return nullptr;
			
 
				     }
			
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -345,10 +345,10 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
 
				         }
			
 
				     }
			
 
				 
			
 
				-    GGML_METAL_LOG_INFO("%s: hasUnifiedMemory              = %s\n",       __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
			
 
				-    GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize  = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
			
 
				+    GGML_METAL_LOG_INFO("%s: hasUnifiedMemory              = %s\n",        __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
			
 
				+    GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize  = %8.2f MiB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
			
 
				     if (ctx->device.maxTransferRate != 0) {
			
 
				-        GGML_METAL_LOG_INFO("%s: maxTransferRate               = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
			
 
				+        GGML_METAL_LOG_INFO("%s: maxTransferRate               = %8.2f MiB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
			
 
				     } else {
			
 
				         GGML_METAL_LOG_INFO("%s: maxTransferRate               = built-in GPU\n", __func__);
			
 
				     }
			
@@ -541,11 +541,11 @@ bool ggml_metal_add_buffer(
 
				             ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
			
 
				 
			
 
				             if (ctx->buffers[ctx->n_buffers].metal == nil) {
			
 
				-                GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
			
 
				+                GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MiB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
			
 
				                 return false;
			
 
				             }
			
 
				 
			
 
				-            GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
			
 
				+            GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MiB", __func__, name, size_aligned / 1024.0 / 1024.0);
			
 
				 
			
 
				             ++ctx->n_buffers;
			
 
				         } else {
			
@@ -565,11 +565,11 @@ bool ggml_metal_add_buffer(
 
				                 ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
			
 
				 
			
 
				                 if (ctx->buffers[ctx->n_buffers].metal == nil) {
			
 
				-                    GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
			
 
				+                    GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MiB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
			
 
				                     return false;
			
 
				                 }
			
 
				 
			
 
				-                GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
			
 
				+                GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MiB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
			
 
				                 if (i + size_step < size) {
			
 
				                     GGML_METAL_LOG_INFO("\n");
			
 
				                 }
			
--- a/llama.cpp
+++ b/llama.cpp
@@ -1087,9 +1087,9 @@ enum e_model {
 
				     MODEL_70B,
			
 
				 };
			
 
				 
			
 
				-static const size_t kB = 1024;
			
 
				-static const size_t MB = 1024*kB;
			
 
				-static const size_t GB = 1024*MB;
			
 
				+static const size_t kiB = 1024;
			
 
				+static const size_t MiB = 1024*kiB;
			
 
				+static const size_t GiB = 1024*MiB;
			
 
				 
			
 
				 struct llama_hparams {
			
 
				     bool     vocab_only;
			
@@ -1488,7 +1488,7 @@ static bool llama_kv_cache_init(
 
				             vram_kv_cache += ggml_nbytes(cache.k);
			
 
				         }
			
 
				         if (vram_kv_cache > 0) {
			
 
				-            LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
			
 
				+            LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MiB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
			
 
				         }
			
 
				     }
			
 
				 #endif
			
@@ -2543,8 +2543,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
 
				     LLAMA_LOG_INFO("%s: model type       = %s\n",     __func__, llama_model_type_name(model.type));
			
 
				     LLAMA_LOG_INFO("%s: model ftype      = %s\n",     __func__, llama_model_ftype_name(model.ftype).c_str());
			
 
				     LLAMA_LOG_INFO("%s: model params     = %.2f B\n", __func__, ml.n_elements*1e-9);
			
 
				-    if (ml.n_bytes < GB) {
			
 
				-        LLAMA_LOG_INFO("%s: model size       = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
			
 
				+    if (ml.n_bytes < GiB) {
			
 
				+        LLAMA_LOG_INFO("%s: model size       = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0,        ml.n_bytes*8.0/ml.n_elements);
			
 
				     } else {
			
 
				         LLAMA_LOG_INFO("%s: model size       = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
			
 
				     }
			
@@ -2582,7 +2582,7 @@ static void llm_load_tensors(
 
				 
			
 
				     ml.calc_sizes(ctx_size, mmapped_size);
			
 
				 
			
 
				-    LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
			
 
				+    LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0);
			
 
				 
			
 
				     // create the ggml context
			
 
				     {
			
@@ -3231,7 +3231,7 @@ static void llm_load_tensors(
 
				             ctx_size +
			
 
				             mmapped_size - vram_weights; // weights in VRAM not in memory
			
 
				 
			
 
				-        LLAMA_LOG_INFO("%s: mem required  = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
			
 
				+        LLAMA_LOG_INFO("%s: mem required  = %7.2f MiB\n", __func__, mem_required / 1024.0 / 1024.0);
			
 
				 
			
 
				 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
			
 
				         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
			
@@ -3250,7 +3250,7 @@ static void llm_load_tensors(
 
				 #endif // GGML_USE_CUBLAS
			
 
				 
			
 
				         LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
			
 
				-        LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0);
			
 
				+        LLAMA_LOG_INFO("%s: VRAM used: %.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0);
			
 
				 #else
			
 
				         (void) n_gpu_layers;
			
 
				 #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
			
@@ -7962,7 +7962,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
				                 workers.clear();
			
 
				             }
			
 
				 
			
 
				-            LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
			
 
				+            LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
			
 
				             int64_t tot_count = 0;
			
 
				             for (size_t i = 0; i < hist_cur.size(); i++) {
			
 
				                 hist_all[i] += hist_cur[i];
			
@@ -8502,7 +8502,7 @@ struct llama_context * llama_new_context_with_model(
 
				 
			
 
				         {
			
 
				             const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
			
 
				-            LLAMA_LOG_INFO("%s: kv self size  = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
			
 
				+            LLAMA_LOG_INFO("%s: kv self size  = %7.2f MiB\n", __func__, memory_size / 1024.0 / 1024.0);
			
 
				         }
			
 
				 
			
 
				         // resized during inference
			
@@ -8547,7 +8547,7 @@ struct llama_context * llama_new_context_with_model(
 
				             // measure memory requirements for the graph
			
 
				             size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
			
 
				 
			
 
				-            LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
			
 
				+            LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
			
 
				 
			
 
				             // recreate allocator with exact memory requirements
			
 
				             ggml_allocr_free(ctx->alloc);
			
@@ -8561,7 +8561,7 @@ struct llama_context * llama_new_context_with_model(
 
				 #endif
			
 
				 #ifdef GGML_USE_CUBLAS
			
 
				             ggml_cuda_set_scratch_size(alloc_size);
			
 
				-            LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0);
			
 
				+            LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);
			
 
				 
			
 
				             // calculate total VRAM usage
			
 
				             auto add_tensor = [](const ggml_tensor * t, size_t & size) {
			
@@ -8581,10 +8581,10 @@ struct llama_context * llama_new_context_with_model(
 
				             size_t ctx_vram_size = alloc_size + kv_vram_size;
			
 
				             size_t total_vram_size = model_vram_size + ctx_vram_size;
			
 
				 
			
 
				-            LLAMA_LOG_INFO("%s: total VRAM used: %.2f MB (model: %.2f MB, context: %.2f MB)\n", __func__,
			
 
				+            LLAMA_LOG_INFO("%s: total VRAM used: %.2f MiB (model: %.2f MiB, context: %.2f MiB)\n", __func__,
			
 
				                     total_vram_size / 1024.0 / 1024.0,
			
 
				                     model_vram_size / 1024.0 / 1024.0,
			
 
				-                    ctx_vram_size / 1024.0 / 1024.0);
			
 
				+                    ctx_vram_size   / 1024.0 / 1024.0);
			
 
				 #endif
			
 
				         }
			
 
				 
			
@@ -8605,7 +8605,7 @@ struct llama_context * llama_new_context_with_model(
 
				 
			
 
				             const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
			
 
				 
			
 
				-            LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
			
 
				+            LLAMA_LOG_INFO("%s: max tensor size = %8.2f MiB\n", __func__, max_size/1024.0/1024.0);
			
 
				 
			
 
				 #define LLAMA_METAL_CHECK_BUF(result)                            \
			
 
				             if (!(result)) {                                             \