@@ -2372,7 +2372,8 @@ struct llama_model_loader {
         }
     }
 
-    void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const {
+    // Returns false if cancelled by progress_callback
+    bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const {
         size_t size_data = 0;
 
         for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
@@ -2404,7 +2405,9 @@ struct llama_model_loader {
             GGML_ASSERT(cur); // unused tensors should have been caught by load_data already
 
             if (progress_callback) {
-                progress_callback((float) size_done / size_data, progress_callback_user_data);
+                if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
+                    return false;
+                }
             }
 
             const size_t offs = file_offset(ggml_get_name(cur));
@@ -2466,8 +2469,11 @@ struct llama_model_loader {
         }
 
         if (progress_callback) {
-            progress_callback(1.0f, progress_callback_user_data);
+            // Even though the model is done loading, we still honor
+            // cancellation since we need to free allocations.
+            return progress_callback(1.0f, progress_callback_user_data);
         }
+        return true;
     }
 };
 
@@ -3044,7 +3050,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
 }
 
-static void llm_load_tensors(
+// Returns false if cancelled by progress_callback
+static bool llm_load_tensors(
         llama_model_loader & ml,
         llama_model & model,
         int n_gpu_layers,
@@ -3722,16 +3729,20 @@ static void llm_load_tensors(
         model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
     }
 
-    ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL);
+    if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL)) {
+        return false;
+    }
 
     model.mapping = std::move(ml.mapping);
 
     // loading time will be recalculate after the first eval, so
     // we take page faults deferred by mmap() into consideration
    model.t_load_us = ggml_time_us() - model.t_start_us;
+    return true;
 }
 
-static bool llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
+// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
+static int llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
     try {
         llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
 
@@ -3749,19 +3760,21 @@ static bool llama_model_load(const std::string & fname, llama_model & model, con
 
         if (params.vocab_only) {
             LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
-            return true;
+            return 0;
         }
 
-        llm_load_tensors(
+        if (!llm_load_tensors(
             ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock,
             params.progress_callback, params.progress_callback_user_data
-        );
+        )) {
+            return -2;
+        }
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
-        return false;
+        return -1;
     }
 
-    return true;
+    return 0;
 }
 
 //
@@ -9141,11 +9154,18 @@ struct llama_model * llama_load_model_from_file(
                     LLAMA_LOG_INFO("\n");
                 }
             }
+            return true;
         };
     }
 
-    if (!llama_model_load(path_model, *model, params)) {
-        LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
+    int status = llama_model_load(path_model, *model, params);
+    GGML_ASSERT(status <= 0);
+    if (status < 0) {
+        if (status == -1) {
+            LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
+        } else if (status == -2) {
+            LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
+        }
         delete model;
         return nullptr;
     }
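
For reference, a minimal caller-side sketch of the new cancellation path. It assumes the companion `llama.h` change (not shown in this diff) that makes `llama_progress_callback` return `bool` (true = continue, false = cancel). The `g_cancel_load` flag and the `main()` harness are illustrative, not part of the patch; only `llama_model_default_params`, `llama_load_model_from_file`, and `llama_free_model` are existing API.

```cpp
// Sketch only: assumes llama_progress_callback now returns bool,
// matching the loader changes above.
#include <atomic>
#include <cstdio>

#include "llama.h"

// Hypothetical flag an application might set from another thread (e.g. a UI).
static std::atomic<bool> g_cancel_load{false};

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <model.gguf>\n", argv[0]);
        return 1;
    }

    llama_model_params mparams = llama_model_default_params();
    mparams.progress_callback_user_data = nullptr;
    mparams.progress_callback = [](float progress, void * /*user_data*/) -> bool {
        fprintf(stderr, "\rloading: %3.0f%%", progress * 100.0f);
        // Returning false cancels the load; llama_load_model_from_file
        // then frees its allocations and returns nullptr.
        return !g_cancel_load.load();
    };

    llama_model * model = llama_load_model_from_file(argv[1], mparams);
    if (model == nullptr) {
        fprintf(stderr, "\nload failed or was cancelled\n");
        return 1;
    }

    llama_free_model(model);
    return 0;
}
```

Note that the final `progress_callback(1.0f, ...)` call is also allowed to cancel: per the comment in the diff, the load is reported as cancelled even after all tensors are read so that the caller still gets `nullptr` and the allocations are freed.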