|
@@ -71,28 +71,27 @@ extern "C" {
|
|
|
|
|
|
|
|
// Progress callback for long-running operations (e.g. model loading).
// `progress` is in [0, 1]; `ctx` is the user pointer supplied alongside the callback.
typedef void (*llama_progress_callback)(float progress, void *ctx);
|
|
|
|
|
|
|
|
- struct llama_context_params {
|
|
|
|
|
|
|
+ struct llama_context_params {
|
|
|
|
|
+ int seed; // RNG seed, -1 for random
|
|
|
int n_ctx; // text context
|
|
int n_ctx; // text context
|
|
|
int n_batch; // prompt processing batch size
|
|
int n_batch; // prompt processing batch size
|
|
|
int n_gpu_layers; // number of layers to store in VRAM
|
|
int n_gpu_layers; // number of layers to store in VRAM
|
|
|
int main_gpu; // the GPU that is used for scratch and small tensors
|
|
int main_gpu; // the GPU that is used for scratch and small tensors
|
|
|
float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
|
|
float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
|
|
|
- bool low_vram; // if true, reduce VRAM usage at the cost of performance
|
|
|
|
|
- int seed; // RNG seed, -1 for random
|
|
|
|
|
|
|
+ // called with a progress value between 0 and 1, pass NULL to disable
|
|
|
|
|
+ llama_progress_callback progress_callback;
|
|
|
|
|
+ // context pointer passed to the progress callback
|
|
|
|
|
+ void * progress_callback_user_data;
|
|
|
|
|
|
|
|
|
|
+ // Keep the booleans together to avoid misalignment during copy-by-value.
|
|
|
|
|
+ bool low_vram; // if true, reduce VRAM usage at the cost of performance
|
|
|
bool f16_kv; // use fp16 for KV cache
|
|
bool f16_kv; // use fp16 for KV cache
|
|
|
bool logits_all; // the llama_eval() call computes all logits, not just the last one
|
|
bool logits_all; // the llama_eval() call computes all logits, not just the last one
|
|
|
bool vocab_only; // only load the vocabulary, no weights
|
|
bool vocab_only; // only load the vocabulary, no weights
|
|
|
bool use_mmap; // use mmap if possible
|
|
bool use_mmap; // use mmap if possible
|
|
|
bool use_mlock; // force system to keep model in RAM
|
|
bool use_mlock; // force system to keep model in RAM
|
|
|
bool embedding; // embedding mode only
|
|
bool embedding; // embedding mode only
|
|
|
-
|
|
|
|
|
- // called with a progress value between 0 and 1, pass NULL to disable
|
|
|
|
|
- llama_progress_callback progress_callback;
|
|
|
|
|
- // context pointer passed to the progress callback
|
|
|
|
|
- void * progress_callback_user_data;
|
|
|
|
|
};
|
|
};
|
|
|
-
|
|
|
|
|
// model file types
|
|
// model file types
|
|
|
enum llama_ftype {
|
|
enum llama_ftype {
|
|
|
LLAMA_FTYPE_ALL_F32 = 0,
|
|
LLAMA_FTYPE_ALL_F32 = 0,
|