@@ -140,6 +140,10 @@ enum layer_fraction_t {
 };
 // this enum is only used in llama_params_fit_impl but needs to be defined outside of it to fix a Windows compilation issue
 
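+// thrown by llama_params_fit_impl when the requested params cannot be fit; llama_params_fit() maps it to LLAMA_PARAMS_FIT_STATUS_FAILURE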
+class llama_params_fit_exception : public std::runtime_error {
+    using std::runtime_error::runtime_error;
+};
+
 static void llama_params_fit_impl(
         const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
         float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
@@ -281,28 +285,28 @@ static void llama_params_fit_impl(
     }
 
     if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
-        throw std::runtime_error("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
+        throw llama_params_fit_exception("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
     }
     if (nd > 1) {
         if (!tensor_split) {
-            throw std::runtime_error("did not provide a buffer to write the tensor_split to, abort");
+            throw llama_params_fit_exception("did not provide a buffer to write the tensor_split to, abort");
         }
         if (mparams->tensor_split) {
             for (size_t id = 0; id < nd; id++) {
                 if (mparams->tensor_split[id] != 0.0f) {
-                    throw std::runtime_error("model_params::tensor_split already set by user, abort");
+                    throw llama_params_fit_exception("model_params::tensor_split already set by user, abort");
                 }
             }
         }
         if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) {
-            throw std::runtime_error("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
+            throw llama_params_fit_exception("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
        }
     }
     if (!tensor_buft_overrides) {
-        throw std::runtime_error("did not provide buffer to set tensor_buft_overrides, abort");
+        throw llama_params_fit_exception("did not provide buffer to set tensor_buft_overrides, abort");
     }
     if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) {
-        throw std::runtime_error("model_params::tensor_buft_overrides already set by user, abort");
+        throw llama_params_fit_exception("model_params::tensor_buft_overrides already set by user, abort");
     }
 
     // step 3: iteratively fill the back to front with "dense" layers
@@ -385,7 +389,7 @@ static void llama_params_fit_impl(
                 tensor_buft_overrides[itbo].buft = nullptr;
                 itbo++;
                 mparams.tensor_buft_overrides = tensor_buft_overrides;
-                throw std::runtime_error("llama_params_fit_n_tensor_buft_overrides() == "
+                throw llama_params_fit_exception("llama_params_fit_n_tensor_buft_overrides() == "
                     + std::to_string(ntbo) + " is insufficient for model\n");
             }
             tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE);
@@ -683,22 +687,25 @@ static void llama_params_fit_impl(
     set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
 }
 
-bool llama_params_fit(
+enum llama_params_fit_status llama_params_fit(
         const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
         float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
         size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
     const int64_t t0_us = llama_time_us();
-    bool ok = true;
+    llama_params_fit_status status = LLAMA_PARAMS_FIT_STATUS_SUCCESS;
     try {
         llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margin_s, n_ctx_min, log_level);
         LLAMA_LOG_INFO("%s: successfully fit params to free device memory\n", __func__);
-    } catch (const std::runtime_error & e) {
+    } catch (const llama_params_fit_exception & e) {
         LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
-        ok = false;
+        status = LLAMA_PARAMS_FIT_STATUS_FAILURE;
+    } catch (const std::runtime_error & e) {
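+        // any other runtime_error indicates an unexpected internal error rather than a failure to fit the params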
+        LLAMA_LOG_ERROR("%s: encountered an error while trying to fit params to free device memory: %s\n", __func__, e.what());
+        status = LLAMA_PARAMS_FIT_STATUS_ERROR;
     }
     const int64_t t1_us = llama_time_us();
     LLAMA_LOG_INFO("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
-    return ok;
+    return status;
 }
 
 struct llama_sampler_chain_params llama_sampler_chain_default_params() {