@@ -1,6 +1,7 @@
 // Defines fileno on msys:
 #ifndef _GNU_SOURCE
 #define _GNU_SOURCE
+#include <cstddef>
 #include <cstdint>
 #include <cstdio>
 #endif
@@ -645,7 +646,7 @@ struct llama_model_loader {
         }
     }
 
-    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne) {
+    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
         auto it = tensors_map.name_to_idx.find(name);
         if (it == tensors_map.name_to_idx.end()) {
             throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
@@ -656,10 +657,10 @@ struct llama_model_loader {
                          name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
         }
 
-        return get_tensor_for(lt);
+        return get_tensor_for(lt, backend);
     }
 
-    struct ggml_tensor * get_tensor_for(llama_load_tensor & lt) {
+    struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
         struct ggml_tensor * tensor;
         if (lt.ne.size() == 2) {
             tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
@@ -669,6 +670,7 @@ struct llama_model_loader {
         }
         ggml_set_name(tensor, lt.name.c_str());
         LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
+        tensor->backend = backend;
         lt.ggml_tensor = tensor;
         num_ggml_tensors_created++;
         return tensor;
@@ -682,12 +684,16 @@ struct llama_model_loader {
 
     void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
         size_t data_size = 0;
+        size_t prefetch_size = 0;
         for (const llama_load_tensor & lt : tensors_map.tensors) {
             data_size += lt.size;
+            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+                prefetch_size += lt.size;
+            }
         }
 
         if (use_mmap) {
-            mapping.reset(new llama_mmap(&file_loaders.at(0)->file));
+            mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
             if (!lmlock) {
                 // Don't call the callback since the actual loading will be lazy
                 // and we can't measure it.
@@ -700,6 +706,9 @@ struct llama_model_loader {
 
         size_t done_size = 0;
         for (llama_load_tensor & lt : tensors_map.tensors) {
+            if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
+                continue;
+            }
             if (progress_callback) {
                 progress_callback((float) done_size / data_size, progress_callback_user_data);
             }
@@ -712,9 +721,6 @@ struct llama_model_loader {
                 lmlock->grow_to(done_size);
             }
         }
-        if (progress_callback) {
-            progress_callback(1.0f, progress_callback_user_data);
-        }
     }
 
     void load_data_for(llama_load_tensor & lt) {
@@ -969,27 +975,7 @@ static void llama_model_load_internal(
     size_t ctx_size;
     size_t mmapped_size;
     ml->calc_sizes(&ctx_size, &mmapped_size);
-    fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/1024.0/1024.0);
-
-    // print memory requirements
-    {
-        const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
-
-        // this is the total memory required to run the inference
-        const size_t mem_required =
-            ctx_size +
-            mmapped_size +
-            MEM_REQ_SCRATCH0().at(model.type) +
-            MEM_REQ_SCRATCH1().at(model.type) +
-            MEM_REQ_EVAL().at(model.type);
-
-        // this is the memory required by one llama_state
-        const size_t mem_required_state =
-            scale*MEM_REQ_KV_SELF().at(model.type);
-
-        fprintf(stderr, "%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__,
-                mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
-    }
+    fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
 
     // create the ggml context
     {
@@ -1011,7 +997,14 @@ static void llama_model_load_internal(
         }
     }
 
+#ifdef GGML_USE_CUBLAS
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CUDA
+#else
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
+#endif
+
     // prepare memory for the weights
+    size_t vram_total = 0;
     {
         const uint32_t n_embd  = hparams.n_embd;
         const uint32_t n_layer = hparams.n_layer;
@@ -1019,70 +1012,122 @@ static void llama_model_load_internal(
 
         ml->ggml_ctx = ctx;
 
-        model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
-        model.norm   = ml->get_tensor("norm.weight",   {n_embd});
-        model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});
+        model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
+        model.norm           = ml->get_tensor("norm.weight",           {n_embd},          GGML_BACKEND_CPU);
+
+        // "output" tensor
+        {
+            ggml_backend backend_output;
+            if (n_gpu_layers > int(n_layer)) { // NOLINT
+                backend_output = LLAMA_BACKEND_OFFLOAD;
+            } else {
+                backend_output = GGML_BACKEND_CPU;
+            }
+
+            model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
+        }
+
+        const int i_gpu_start = n_layer - n_gpu_layers;
 
         model.layers.resize(n_layer);
         for (uint32_t i = 0; i < n_layer; ++i) {
+            const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+
             auto & layer = model.layers[i];
 
             std::string layers_i = "layers." + std::to_string(i);
 
-            layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd});
+            layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
 
-            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd});
-            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd});
-            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd});
-            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd});
+            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend);
+            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend);
+            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend);
+            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend);
 
-            layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd});
+            layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);
 
-            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff});
-            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd});
-            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff});
+            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend);
+            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend);
+            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend);
+
+            if (backend == GGML_BACKEND_CUDA) {
+                vram_total +=
+                    ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+                    ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
+                    ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
+            }
         }
     }
 
     ml->done_getting_tensors();
 
-    // populate `tensors_by_name`
-    for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-        model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
-    }
-
-    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
-
-    model.mapping = std::move(ml->mapping);
-
+    // print memory requirements
+    {
+        const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
+
+        // this is the total memory required to run the inference
+        const size_t mem_required =
+            ctx_size +
+            mmapped_size - vram_total + // weights in VRAM not in memory
+            MEM_REQ_SCRATCH0().at(model.type) +
+            MEM_REQ_SCRATCH1().at(model.type) +
+            MEM_REQ_EVAL().at(model.type);
+
+        // this is the memory required by one llama_state
+        const size_t mem_required_state =
+            scale*MEM_REQ_KV_SELF().at(model.type);
+
+        fprintf(stderr, "%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__,
+                mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
+
 #ifdef GGML_USE_CUBLAS
-    {
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
         fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
-
-        size_t vram_total = 0;
-
-        for (int i = 0; i < n_gpu; ++i) {
-            const auto & layer = model.layers[i];
-
-            ggml_cuda_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
-            ggml_cuda_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
-            ggml_cuda_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
-            ggml_cuda_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
-            ggml_cuda_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
-            ggml_cuda_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
-            ggml_cuda_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
+        if (n_gpu_layers > (int) hparams.n_layer) {
+            fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
+        }
+        fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+#else
+        (void) n_gpu_layers;
+#endif
+    }
+
+    // populate `tensors_by_name`
+    for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+        model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
+    }
+
+    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
+
+#ifdef GGML_USE_CUBLAS
+    {
+        size_t done_size = 0;
+        size_t data_size = 0;
+        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+            data_size += lt.size;
+            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+                done_size += lt.size;
+            }
         }
-        if (n_gpu_layers > (int) hparams.n_layer) {
-            fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
-            ggml_cuda_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
+        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+            if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
+                continue;
+            }
+            if (progress_callback) {
+                progress_callback((float) done_size / data_size, progress_callback_user_data);
+            }
+            ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
+            done_size += lt.size;
         }
-        fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+    }
+#endif // GGML_USE_CUBLAS
+
+    if (progress_callback) {
+        progress_callback(1.0f, progress_callback_user_data);
     }
-#else
-    (void) n_gpu_layers;
-#endif
+
+    model.mapping = std::move(ml->mapping);
 
     // loading time will be recalculate after the first eval, so
     // we take page faults deferred by mmap() into consideration
@@ -1181,10 +1226,8 @@ static bool llama_eval_internal(
         {
             cur = ggml_rms_norm(ctx0, inpL);
 
-            // cur = attention_norm*cur
-            cur = ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model.layers[il].attention_norm, cur),
-                        cur);
+            // cur = cur*attention_norm(broadcasted)
+            cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
         }
 
         // self-attention
@@ -1291,10 +1334,8 @@ static bool llama_eval_internal(
         {
             cur = ggml_rms_norm(ctx0, inpFF);
 
-            // cur = ffn_norm*cur
-            cur = ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].ffn_norm, cur),
-                    cur);
+            // cur = cur*ffn_norm(broadcasted)
+            cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
         }
 
         struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
@@ -1331,10 +1372,8 @@ static bool llama_eval_internal(
     {
         inpL = ggml_rms_norm(ctx0, inpL);
 
-        // inpL = norm*inpL
-        inpL = ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model.norm, inpL),
-                    inpL);
+        // inpL = inpL*norm(broadcasted)
+        inpL = ggml_mul(ctx0, inpL, model.norm);
 
         embeddings = inpL;
     }
@@ -2158,7 +2197,7 @@ struct llama_context * llama_init_from_file(
             unsigned * cur_percentage_p = (unsigned *) ctx;
             unsigned percentage = (unsigned) (100 * progress);
             while (percentage > *cur_percentage_p) {
-                ++*cur_percentage_p;
+                *cur_percentage_p = percentage;
                 fprintf(stderr, ".");
                 fflush(stderr);
                 if (percentage >= 100) {
@@ -2315,7 +2354,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
 
         // maybe this should in llama_model_loader
         if (model_loader->use_mmap) {
-            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false));
+            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0));
         }
     }
 
@@ -2408,7 +2447,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
             }
             size_t idx = model_loader->tensors_map.name_to_idx[base_name];
             llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
-            base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
+            base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
             lt.data = (uint8_t *) lt.ggml_tensor->data;
             model_loader->load_data_for(lt);
             lt.ggml_tensor->data = lt.data;
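Taken together, these hunks replace the post-hoc `ggml_cuda_transform_tensor` pass with a `ggml_backend` tag assigned when each tensor is created: CPU-tagged tensors keep the existing mmap/prefetch path, while CUDA-tagged tensors are read straight from the model file via `ggml_cuda_load_data`. A minimal standalone sketch of the layer-to-backend split implied by `i_gpu_start = n_layer - n_gpu_layers` follows; the `demo_*` names and the example sizes are illustrative only, not part of the patch.

```cpp
#include <cstdio>

// Sketch of the mapping used above: with n_gpu_layers = N, the *last* N
// repeating layers are offloaded (indices >= i_gpu_start), and the output
// tensor is offloaded only when n_gpu_layers exceeds n_layer.
enum demo_backend { DEMO_BACKEND_CPU, DEMO_BACKEND_CUDA };

int main() {
    const int n_layer      = 32; // e.g. a 7B LLaMA model
    const int n_gpu_layers = 20; // user-requested offload count

    const int i_gpu_start = n_layer - n_gpu_layers;

    int n_cuda = 0;
    for (int i = 0; i < n_layer; ++i) {
        const demo_backend backend = i < i_gpu_start ? DEMO_BACKEND_CPU : DEMO_BACKEND_CUDA;
        if (backend == DEMO_BACKEND_CUDA) {
            n_cuda++;
        }
    }

    const bool offload_output = n_gpu_layers > n_layer;

    printf("layers on GPU: %d of %d, output tensor on GPU: %s\n",
           n_cuda, n_layer, offload_output ? "yes" : "no");
    return 0;
}
```

This also explains why the extra "[cublas] offloading output layer to GPU" message is printed only when `n_gpu_layers` is greater than the layer count, and why the reported "mem required" now subtracts `vram_total`: weights resident in VRAM are no longer counted against host memory.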