@@ -2372,7 +2372,8 @@ struct llama_model_loader {
         }
     }
 
-    void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const {
+    // Returns false if cancelled by progress_callback
+    bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const {
         size_t size_data = 0;
 
         for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
@@ -2404,7 +2405,9 @@ struct llama_model_loader {
             GGML_ASSERT(cur); // unused tensors should have been caught by load_data already
 
             if (progress_callback) {
-                progress_callback((float) size_done / size_data, progress_callback_user_data);
+                if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
+                    return false;
+                }
             }
 
             const size_t offs = file_offset(ggml_get_name(cur));
@@ -2466,8 +2469,11 @@ struct llama_model_loader {
         }
 
         if (progress_callback) {
-            progress_callback(1.0f, progress_callback_user_data);
+            // Even though the model is done loading, we still honor
+            // cancellation since we need to free allocations.
+            return progress_callback(1.0f, progress_callback_user_data);
         }
+        return true;
     }
 };
 
@@ -3044,7 +3050,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
 }
 
-static void llm_load_tensors(
+// Returns false if cancelled by progress_callback
+static bool llm_load_tensors(
         llama_model_loader & ml,
         llama_model & model,
         int n_gpu_layers,
@@ -3722,16 +3729,20 @@ static void llm_load_tensors(
         model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
     }
 
-    ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL);
+    if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL)) {
+        return false;
+    }
 
     model.mapping = std::move(ml.mapping);
 
     // loading time will be recalculate after the first eval, so
     // we take page faults deferred by mmap() into consideration
    model.t_load_us = ggml_time_us() - model.t_start_us;
+    return true;
 }
 
-static bool llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
+// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
+static int llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
     try {
         llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
 
@@ -3749,19 +3760,21 @@ static bool llama_model_load(const std::string & fname, llama_model & model, con
 
         if (params.vocab_only) {
             LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
-            return true;
+            return 0;
         }
 
-        llm_load_tensors(
+        if (!llm_load_tensors(
             ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock,
             params.progress_callback, params.progress_callback_user_data
-        );
+        )) {
+            return -2;
+        }
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
-        return false;
+        return -1;
     }
 
-    return true;
+    return 0;
 }
 
 //
@@ -9141,11 +9154,18 @@ struct llama_model * llama_load_model_from_file(
                     LLAMA_LOG_INFO("\n");
                 }
             }
+            return true;
         };
     }
 
-    if (!llama_model_load(path_model, *model, params)) {
-        LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
+    int status = llama_model_load(path_model, *model, params);
+    GGML_ASSERT(status <= 0);
+    if (status < 0) {
+        if (status == -1) {
+            LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
+        } else if (status == -2) {
+            LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
+        }
         delete model;
         return nullptr;
     }
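
For reference, a minimal caller-side sketch of the new cancellation path. It assumes the companion `llama.h` change (not shown in this diff) that makes `llama_progress_callback` return `bool` (true = continue, false = cancel). The `g_cancel_load` flag and the `main()` harness are illustrative, not part of the patch; only `llama_model_default_params`, `llama_load_model_from_file`, and `llama_free_model` are existing API.

```cpp
// Sketch only: assumes llama_progress_callback now returns bool,
// matching the loader changes above.
#include <atomic>
#include <cstdio>

#include "llama.h"

// Hypothetical flag an application might set from another thread (e.g. a UI).
static std::atomic<bool> g_cancel_load{false};

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <model.gguf>\n", argv[0]);
        return 1;
    }

    llama_model_params mparams = llama_model_default_params();
    mparams.progress_callback_user_data = nullptr;
    mparams.progress_callback = [](float progress, void * /*user_data*/) -> bool {
        fprintf(stderr, "\rloading: %3.0f%%", progress * 100.0f);
        // Returning false cancels the load; llama_load_model_from_file
        // then frees its allocations and returns nullptr.
        return !g_cancel_load.load();
    };

    llama_model * model = llama_load_model_from_file(argv[1], mparams);
    if (model == nullptr) {
        fprintf(stderr, "\nload failed or was cancelled\n");
        return 1;
    }

    llama_free_model(model);
    return 0;
}
```

Note that the final `progress_callback(1.0f, ...)` call is also allowed to cancel: per the comment in the diff, the load is reported as cancelled even after all tensors are read so that the caller still gets `nullptr` and the allocations are freed.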