2 years ago · 527b6fba1d
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -536,7 +536,7 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
 
				     return res;
			
 
				 }
			
 
				 
			
 
				-struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
			
 
				+std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params) {
			
 
				     auto lparams = llama_context_default_params();
			
 
				 
			
 
				     lparams.n_ctx        = params.n_ctx;
			
@@ -552,25 +552,33 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
 
				     lparams.logits_all   = params.perplexity;
			
 
				     lparams.embedding    = params.embedding;
			
 
				 
			
 
				-    llama_context * lctx = llama_init_from_file(params.model.c_str(), lparams);
			
 
				+    llama_model * model  = llama_load_model_from_file(params.model.c_str(), lparams);
			
 
				+    if (model == NULL) {
			
 
				+        fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
			
 
				+        return std::make_tuple(nullptr, nullptr);
			
 
				+    }
			
 
				 
			
 
				+    llama_context * lctx = llama_new_context_with_model(model, lparams);
			
 
				     if (lctx == NULL) {
			
 
				-        fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
			
 
				-        return NULL;
			
 
				+        fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
			
 
				+        llama_free_model(model);
			
 
				+        return std::make_tuple(nullptr, nullptr);
			
 
				     }
			
 
				 
			
 
				     if (!params.lora_adapter.empty()) {
			
 
				-        int err = llama_apply_lora_from_file(lctx,
			
 
				+        int err = llama_model_apply_lora_from_file(model,
			
 
				                                              params.lora_adapter.c_str(),
			
 
				                                              params.lora_base.empty() ? NULL : params.lora_base.c_str(),
			
 
				                                              params.n_threads);
			
 
				         if (err != 0) {
			
 
				             fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
			
 
				-            return NULL;
			
 
				+            llama_free(lctx);
			
 
				+            llama_free_model(model);
			
 
				+            return std::make_tuple(nullptr, nullptr);
			
 
				         }
			
 
				     }
			
 
				 
			
 
				-    return lctx;
			
 
				+    return std::make_tuple(model, lctx);
			
 
				 }
			
 
				 
			
 
				 void console_init(console_state & con_st) {
			
--- a/examples/common.h
+++ b/examples/common.h
@@ -9,6 +9,7 @@
 
				 #include <random>
			
 
				 #include <thread>
			
 
				 #include <unordered_map>
			
 
				+#include <tuple>
			
 
				 
			
 
				 #if !defined (_WIN32)
			
 
				 #include <stdio.h>
			
@@ -95,7 +96,7 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
 
				 // Model utils
			
 
				 //
			
 
				 
			
 
				-struct llama_context * llama_init_from_gpt_params(const gpt_params & params);
			
 
				+std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params);
			
 
				 
			
 
				 //
			
 
				 // Console utils
			
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -37,11 +37,12 @@ int main(int argc, char ** argv) {
 
				 
			
 
				     llama_init_backend();
			
 
				 
			
 
				+    llama_model * model;
			
 
				     llama_context * ctx;
			
 
				 
			
 
				     // load the model
			
 
				-    ctx = llama_init_from_gpt_params(params);
			
 
				-    if (ctx == NULL) {
			
 
				+    std::tie(model, ctx) = llama_init_from_gpt_params(params);
			
 
				+    if (model == NULL) {
			
 
				         fprintf(stderr, "%s: error: unable to load model\n", __func__);
			
 
				         return 1;
			
 
				     }
			
@@ -90,6 +91,7 @@ int main(int argc, char ** argv) {
 
				 
			
 
				     llama_print_timings(ctx);
			
 
				     llama_free(ctx);
			
 
				+    llama_free_model(model);
			
 
				 
			
 
				     return 0;
			
 
				 }
			
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -107,12 +107,13 @@ int main(int argc, char ** argv) {
 
				 
			
 
				     llama_init_backend();
			
 
				 
			
 
				+    llama_model * model;
			
 
				     llama_context * ctx;
			
 
				     g_ctx = &ctx;
			
 
				 
			
 
				     // load the model and apply lora adapter, if any
			
 
				-    ctx = llama_init_from_gpt_params(params);
			
 
				-    if (ctx == NULL) {
			
 
				+    std::tie(model, ctx) = llama_init_from_gpt_params(params);
			
 
				+    if (model == NULL) {
			
 
				         fprintf(stderr, "%s: error: unable to load model\n", __func__);
			
 
				         return 1;
			
 
				     }
			
@@ -139,6 +140,7 @@ int main(int argc, char ** argv) {
 
				 
			
 
				         llama_print_timings(ctx);
			
 
				         llama_free(ctx);
			
 
				+        llama_free_model(model);
			
 
				 
			
 
				         return 0;
			
 
				     }
			
@@ -147,6 +149,7 @@ int main(int argc, char ** argv) {
 
				     if (params.export_cgraph) {
			
 
				         llama_eval_export(ctx, "llama.ggml");
			
 
				         llama_free(ctx);
			
 
				+        llama_free_model(model);
			
 
				 
			
 
				         return 0;
			
 
				     }
			
@@ -666,6 +669,7 @@ int main(int argc, char ** argv) {
 
				 
			
 
				     llama_print_timings(ctx);
			
 
				     llama_free(ctx);
			
 
				+    llama_free_model(model);
			
 
				 
			
 
				     return 0;
			
 
				 }
			
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -149,11 +149,12 @@ int main(int argc, char ** argv) {
 
				 
			
 
				     llama_init_backend();
			
 
				 
			
 
				+    llama_model * model;
			
 
				     llama_context * ctx;
			
 
				 
			
 
				     // load the model and apply lora adapter, if any
			
 
				-    ctx = llama_init_from_gpt_params(params);
			
 
				-    if (ctx == NULL) {
			
 
				+    std::tie(model, ctx) = llama_init_from_gpt_params(params);
			
 
				+    if (model == NULL) {
			
 
				         fprintf(stderr, "%s: error: unable to load model\n", __func__);
			
 
				         return 1;
			
 
				     }
			
@@ -169,6 +170,7 @@ int main(int argc, char ** argv) {
 
				 
			
 
				     llama_print_timings(ctx);
			
 
				     llama_free(ctx);
			
 
				+    llama_free_model(model);
			
 
				 
			
 
				     return 0;
			
 
				 }
			
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -320,6 +320,7 @@ int main(int argc, char ** argv) {
 
				     fprintf(stderr, "Loading model\n");
			
 
				 
			
 
				     const int64_t t_main_start_us = ggml_time_us();
			
 
				+    llama_model * model;
			
 
				     llama_context * ctx;
			
 
				 
			
 
				     {
			
@@ -330,12 +331,20 @@ int main(int argc, char ** argv) {
 
				         lparams.f16_kv     = false;
			
 
				         lparams.use_mlock  = false;
			
 
				 
			
 
				-        ctx = llama_init_from_file(params.model.c_str(), lparams);
			
 
				+        model = llama_load_model_from_file(params.model.c_str(), lparams);
			
 
				 
			
 
				-        if (ctx == NULL) {
			
 
				+        if (model == NULL) {
			
 
				             fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
			
 
				             return 1;
			
 
				         }
			
 
				+
			
 
				+        ctx = llama_new_context_with_model(model, lparams);
			
 
				+
			
 
				+        if (ctx == NULL) {
			
 
				+            fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
			
 
				+            llama_free_model(model);
			
 
				+            return 1;
			
 
				+        }
			
 
				     }
			
 
				 
			
 
				     const auto &tensors = llama_internal_get_tensor_map(ctx);
			
@@ -357,6 +366,7 @@ int main(int argc, char ** argv) {
 
				             fprintf(stderr, "%s: error: Quantization should be tested with a float model, "
			
 
				                 "this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type);
			
 
				             llama_free(ctx);
			
 
				+            llama_free_model(model);
			
 
				             return 1;
			
 
				         }
			
 
				         included_layers++;
			
@@ -415,6 +425,7 @@ int main(int argc, char ** argv) {
 
				 
			
 
				 
			
 
				     llama_free(ctx);
			
 
				+    llama_free_model(model);
			
 
				     // report timing
			
 
				     {
			
 
				         const int64_t t_main_end_us = ggml_time_us();
			
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -35,12 +35,22 @@ int main(int argc, char ** argv) {
 
				     auto last_n_tokens_data = std::vector<llama_token>(params.repeat_last_n, 0);
			
 
				 
			
 
				     // init
			
 
				-    auto ctx = llama_init_from_file(params.model.c_str(), lparams);
			
 
				+    auto model = llama_load_model_from_file(params.model.c_str(), lparams);
			
 
				+    if (model == nullptr) {
			
 
				+        return 1;
			
 
				+    }
			
 
				+    auto ctx = llama_new_context_with_model(model, lparams);
			
 
				+    if (ctx == nullptr) {
			
 
				+        llama_free_model(model);
			
 
				+        return 1;
			
 
				+    }
			
 
				     auto tokens = std::vector<llama_token>(params.n_ctx);
			
 
				     auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), int(tokens.size()), true);
			
 
				 
			
 
				     if (n_prompt_tokens < 1) {
			
 
				         fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
			
 
				+        llama_free(ctx);
			
 
				+        llama_free_model(model);
			
 
				         return 1;
			
 
				     }
			
 
				 
			
@@ -84,6 +94,8 @@ int main(int argc, char ** argv) {
 
				         printf("%s", next_token_str);
			
 
				         if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
			
 
				             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
			
 
				+            llama_free(ctx);
			
 
				+            llama_free_model(model);
			
 
				             return 1;
			
 
				         }
			
 
				         n_past += 1;
			
@@ -91,23 +103,27 @@ int main(int argc, char ** argv) {
 
				 
			
 
				     printf("\n\n");
			
 
				 
			
 
				-    // free old model
			
 
				+    // free old context
			
 
				     llama_free(ctx);
			
 
				 
			
 
				-    // load new model
			
 
				-    auto ctx2 = llama_init_from_file(params.model.c_str(), lparams);
			
 
				+    // make new context
			
 
				+    auto ctx2 = llama_new_context_with_model(model, lparams);
			
 
				 
			
 
				     // Load state (rng, logits, embedding and kv_cache) from file
			
 
				     {
			
 
				         FILE *fp_read = fopen("dump_state.bin", "rb");
			
 
				         if (state_size != llama_get_state_size(ctx2)) {
			
 
				             fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
			
 
				+            llama_free(ctx2);
			
 
				+            llama_free_model(model);
			
 
				             return 1;
			
 
				         }
			
 
				 
			
 
				         const size_t ret = fread(state_mem, 1, state_size, fp_read);
			
 
				         if (ret != state_size) {
			
 
				             fprintf(stderr, "\n%s : failed to read state\n", __func__);
			
 
				+            llama_free(ctx2);
			
 
				+            llama_free_model(model);
			
 
				             return 1;
			
 
				         }
			
 
				 
			
@@ -138,6 +154,8 @@ int main(int argc, char ** argv) {
 
				         printf("%s", next_token_str);
			
 
				         if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
			
 
				             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
			
 
				+            llama_free(ctx2);
			
 
				+            llama_free_model(model);
			
 
				             return 1;
			
 
				         }
			
 
				         n_past += 1;
			
@@ -145,5 +163,8 @@ int main(int argc, char ** argv) {
 
				 
			
 
				     printf("\n\n");
			
 
				 
			
 
				+    llama_free(ctx2);
			
 
				+    llama_free_model(model);
			
 
				+
			
 
				     return 0;
			
 
				 }
			
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -115,6 +115,7 @@ struct llama_server_context {
 
				     std::vector<llama_token> embd;
			
 
				     std::vector<llama_token> last_n_tokens;
			
 
				 
			
 
				+    llama_model * model = nullptr;
			
 
				     llama_context * ctx = nullptr;
			
 
				     gpt_params params;
			
 
				 
			
@@ -130,6 +131,10 @@ struct llama_server_context {
 
				             llama_free(ctx);
			
 
				             ctx = nullptr;
			
 
				         }
			
 
				+        if (model) {
			
 
				+            llama_free_model(model);
			
 
				+            model = nullptr;
			
 
				+        }
			
 
				     }
			
 
				 
			
 
				     void rewind() {
			
@@ -150,8 +155,8 @@ struct llama_server_context {
 
				 
			
 
				     bool loadModel(const gpt_params & params_) {
			
 
				         params = params_;
			
 
				-        ctx = llama_init_from_gpt_params(params);
			
 
				-        if (ctx == nullptr) {
			
 
				+        std::tie(model, ctx) = llama_init_from_gpt_params(params);
			
 
				+        if (model == nullptr) {
			
 
				             LOG_ERROR("unable to load model", { { "model", params_.model } });
			
 
				             return false;
			
 
				         }
			
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -68,11 +68,12 @@ int main(int argc, char ** argv)
 
				 
			
 
				     llama_init_backend();
			
 
				 
			
 
				-    llama_context * ctx ;
			
 
				+    llama_model * model;
			
 
				+    llama_context * ctx;
			
 
				 
			
 
				-    ctx = llama_init_from_gpt_params( params );
			
 
				+    std::tie(model, ctx) = llama_init_from_gpt_params( params );
			
 
				 
			
 
				-    if ( ctx == NULL )
			
 
				+    if ( model == NULL )
			
 
				     {
			
 
				         fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
			
 
				         return 1;
			
@@ -170,6 +171,7 @@ int main(int argc, char ** argv)
 
				     } // wend of main loop
			
 
				 
			
 
				     llama_free( ctx );
			
 
				+    llama_free_model( model );
			
 
				 
			
 
				     return 0;
			
 
				 }
			
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -3054,7 +3054,8 @@ int main(int argc, char ** argv) {
 
				     struct llama_context_params llama_params = llama_context_default_params();
			
 
				     llama_params.vocab_only = true;
			
 
				 
			
 
				-    struct llama_context * lctx = llama_init_from_file(params.fn_vocab_model, llama_params);
			
 
				+    struct llama_model * lmodel = llama_load_model_from_file(params.fn_vocab_model, llama_params);
			
 
				+    struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);
			
 
				 
			
 
				     struct llama_vocab vocab;
			
 
				     {
			
@@ -3395,6 +3396,8 @@ int main(int argc, char ** argv) {
 
				     delete[] compute_addr;
			
 
				     delete[] compute_buf_0;
			
 
				     delete[] compute_buf_1;
			
 
				+    llama_free(lctx);
			
 
				+    llama_free_model(lmodel);
			
 
				     ggml_free(model.ctx);
			
 
				 
			
 
				     return 0;
			
--- a/llama.cpp
+++ b/llama.cpp
@@ -182,6 +182,19 @@ struct llama_kv_cache {
 
				     }
			
 
				 };
			
 
				 
			
 
				+// Vocabulary tables: lookups between token text and integer token ids.
				+struct llama_vocab {
				+    using id    = int32_t;     // integer token identifier
				+    using token = std::string; // token text
				+
				+    // One vocabulary entry: the token text together with its float score.
				+    struct token_score {
				+        token tok;
				+        float score;
				+    };
				+
				+    // Lookup from token text to its id.
				+    std::unordered_map<token, id> token_to_id;
				+    // Vocabulary entries; presumably indexed by token id (TODO confirm against the loader).
				+    std::vector<token_score> id_to_token;
				+};
			
 
				+
			
 
				 struct llama_model {
			
 
				     e_model type = MODEL_UNKNOWN;
			
 
				 
			
@@ -198,10 +211,6 @@ struct llama_model {
 
				     // context
			
 
				     struct ggml_context * ctx = NULL;
			
 
				 
			
 
				-    // key + value cache for the self attention
			
 
				-    // TODO: move to llama_state
			
 
				-    struct llama_kv_cache kv_self;
			
 
				-
			
 
				     // the model memory buffer
			
 
				     llama_ctx_buffer buf;
			
 
				 
			
@@ -215,6 +224,11 @@ struct llama_model {
 
				     // for quantize-stats only
			
 
				     std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
			
 
				 
			
 
				+    int64_t t_load_us = 0;
			
 
				+    int64_t t_start_us = 0;
			
 
				+
			
 
				+    llama_vocab vocab;
			
 
				+
			
 
				     ~llama_model() {
			
 
				         if (ctx) {
			
 
				             ggml_free(ctx);
			
@@ -233,24 +247,11 @@ struct llama_model {
 
				     }
			
 
				 };
			
 
				 
			
 
				-struct llama_vocab {
			
 
				-    using id    = int32_t;
			
 
				-    using token = std::string;
			
 
				-
			
 
				-    struct token_score {
			
 
				-        token tok;
			
 
				-        float score;
			
 
				-    };
			
 
				-
			
 
				-    std::unordered_map<token, id> token_to_id;
			
 
				-    std::vector<token_score> id_to_token;
			
 
				-};
			
 
				-
			
 
				 struct llama_context {
			
 
				+    llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
			
 
				+
			
 
				     std::mt19937 rng;
			
 
				 
			
 
				-    int64_t t_load_us = 0;
			
 
				-    int64_t t_start_us = 0;
			
 
				     bool has_evaluated_once = false;
			
 
				 
			
 
				     int64_t t_sample_us = 0;
			
@@ -261,8 +262,16 @@ struct llama_context {
 
				     int32_t n_eval   = 0; // number of eval calls
			
 
				     int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
			
 
				 
			
 
				-    llama_model model;
			
 
				-    llama_vocab vocab;
			
 
				+    const llama_model & model;
			
 
				+    const llama_vocab & vocab;
			
 
				+
			
 
				+    bool model_owner = false;
			
 
				+
			
 
				+    int64_t t_load_us;
			
 
				+    int64_t t_start_us;
			
 
				+
			
 
				+    // key + value cache for the self attention
			
 
				+    struct llama_kv_cache kv_self;
			
 
				 
			
 
				     size_t mem_per_token = 0;
			
 
				 
			
@@ -1033,7 +1042,8 @@ static const char *llama_model_type_name(e_model type) {
 
				 
			
 
				 static void llama_model_load_internal(
			
 
				         const std::string & fname,
			
 
				-        llama_context & lctx,
			
 
				+        llama_model & model,
			
 
				+        llama_vocab & vocab,
			
 
				         int n_ctx,
			
 
				         int n_batch,
			
 
				         int n_gpu_layers,
			
@@ -1047,12 +1057,11 @@ static void llama_model_load_internal(
 
				         llama_progress_callback progress_callback,
			
 
				         void * progress_callback_user_data) {
			
 
				 
			
 
				-    lctx.t_start_us = ggml_time_us();
			
 
				+    model.t_start_us = ggml_time_us();
			
 
				 
			
 
				     std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only));
			
 
				 
			
 
				-    lctx.vocab = std::move(ml->file_loaders.at(0)->vocab);
			
 
				-    auto & model = lctx.model;
			
 
				+    vocab = std::move(ml->file_loaders.at(0)->vocab);
			
 
				     model.hparams = ml->file_loaders.at(0)->hparams;
			
 
				     model.n_gpu_layers = n_gpu_layers;
			
 
				     llama_file_version file_version = ml->file_loaders.at(0)->file_version;
			
@@ -1122,15 +1131,15 @@ static void llama_model_load_internal(
 
				 
			
 
				     // create the ggml context
			
 
				     {
			
 
				-        lctx.model.buf.resize(ctx_size);
			
 
				+        model.buf.resize(ctx_size);
			
 
				         if (use_mlock) {
			
 
				-            lctx.model.mlock_buf.init(lctx.model.buf.addr);
			
 
				-            lctx.model.mlock_buf.grow_to(lctx.model.buf.size);
			
 
				+            model.mlock_buf.init(model.buf.addr);
			
 
				+            model.mlock_buf.grow_to(model.buf.size);
			
 
				         }
			
 
				 
			
 
				         struct ggml_init_params params = {
			
 
				-            /*.mem_size   =*/ lctx.model.buf.size,
			
 
				-            /*.mem_buffer =*/ lctx.model.buf.addr,
			
 
				+            /*.mem_size   =*/ model.buf.size,
			
 
				+            /*.mem_buffer =*/ model.buf.addr,
			
 
				             /*.no_alloc   =*/ ml->use_mmap,
			
 
				         };
			
 
				 
			
@@ -1311,7 +1320,7 @@ static void llama_model_load_internal(
 
				     }
			
 
				 #endif
			
 
				 
			
 
				-    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
			
 
				+    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
			
 
				 
			
 
				     if (progress_callback) {
			
 
				         progress_callback(1.0f, progress_callback_user_data);
			
@@ -1321,12 +1330,13 @@ static void llama_model_load_internal(
 
				 
			
 
				     // loading time will be recalculate after the first eval, so
			
 
				     // we take page faults deferred by mmap() into consideration
			
 
				-    lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
			
 
				+    model.t_load_us = ggml_time_us() - model.t_start_us;
			
 
				 }
			
 
				 
			
 
				 static bool llama_model_load(
			
 
				         const std::string & fname,
			
 
				-        llama_context & lctx,
			
 
				+        llama_model & model,
			
 
				+        llama_vocab & vocab,
			
 
				         int n_ctx,
			
 
				         int n_batch,
			
 
				         int n_gpu_layers,
			
@@ -1340,7 +1350,7 @@ static bool llama_model_load(
 
				         llama_progress_callback progress_callback,
			
 
				         void *progress_callback_user_data) {
			
 
				     try {
			
 
				-        llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
			
 
				+        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
			
 
				                                   use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
			
 
				         return true;
			
 
				     } catch (const std::exception & err) {
			
@@ -1378,7 +1388,7 @@ static bool llama_eval_internal(
 
				     const auto & model   = lctx.model;
			
 
				     const auto & hparams = model.hparams;
			
 
				 
			
 
				-    const auto & kv_self = model.kv_self;
			
 
				+    const auto & kv_self = lctx.kv_self;
			
 
				 
			
 
				     LLAMA_ASSERT(!!kv_self.ctx);
			
 
				 
			
@@ -1726,7 +1736,7 @@ static bool llama_eval_internal(
 
				     //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
			
 
				 
			
 
				     // update kv token count
			
 
				-    lctx.model.kv_self.n = n_past + N;
			
 
				+    lctx.kv_self.n = n_past + N;
			
 
				 
			
 
				     // extract logits
			
 
				     {
			
@@ -2634,12 +2644,39 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
				 // interface implementation
			
 
				 //
			
 
				 
			
 
				-struct llama_context * llama_init_from_file(
			
 
				+// Loads a model from `path_model` into a newly allocated llama_model.
				+// The vocabulary is loaded into model->vocab by the same llama_model_load() call.
				+// Returns the new model on success. On failure the partially built model is
				+// deleted, a message is printed to stderr and nullptr is returned.
				+struct llama_model * llama_load_model_from_file(
				                              const char * path_model,
				             struct llama_context_params   params) {
				     ggml_time_init();
				 
				-    llama_context * ctx = new llama_context;
				+    llama_model * model = new llama_model;
				+
				+    // f16_kv selects F16 over F32 as the memory type handed to the loader.
				+    ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
				+
				+    if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
				+                params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
				+                params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
				+        delete model;
				+        fprintf(stderr, "%s: failed to load model\n", __func__);
				+        return nullptr;
				+    }
				+
				+    return model;
				+}
			
 
				+
			
 
				+// Destroys `model` with `delete`. A null pointer is accepted, since deleting
				+// a null pointer is a no-op.
				+void llama_free_model(struct llama_model * model) {
				+    delete model;
				+}
			
 
				+
			
 
				+struct llama_context * llama_new_context_with_model(
			
 
				+                             struct llama_model * model,
			
 
				+            struct llama_context_params   params) {
			
 
				+
			
 
				+    if (!model) {
			
 
				+        return nullptr;
			
 
				+    }
			
 
				+
			
 
				+    llama_context * ctx = new llama_context(*model, model->vocab);
			
 
				 
			
 
				     if (params.seed < 0) {
			
 
				         params.seed = time(NULL);
			
@@ -2667,24 +2704,16 @@ struct llama_context * llama_init_from_file(
 
				 
			
 
				     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
			
 
				 
			
 
				-    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu,
			
 
				-                params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
			
 
				-                params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
			
 
				-        fprintf(stderr, "%s: failed to load model\n", __func__);
			
 
				-        llama_free(ctx);
			
 
				-        return nullptr;
			
 
				-    }
			
 
				-
			
 
				     // reserve memory for context buffers
			
 
				     if (!params.vocab_only) {
			
 
				-        if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
			
 
				+        if (!kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
			
 
				             fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
			
 
				             llama_free(ctx);
			
 
				             return nullptr;
			
 
				         }
			
 
				 
			
 
				         {
			
 
				-            const size_t memory_size = ggml_nbytes(ctx->model.kv_self.k) + ggml_nbytes(ctx->model.kv_self.v);
			
 
				+            const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
			
 
				             fprintf(stderr, "%s: kv self size  = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
			
 
				         }
			
 
				 
			
@@ -2736,8 +2765,8 @@ struct llama_context * llama_init_from_file(
 
				 
			
 
				         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
			
 
				 
			
 
				-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr,       ctx->buf_compute.size,       0));
			
 
				-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size, 0));
			
 
				+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
			
 
				+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0));
			
 
				 
			
 
				         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
			
 
				         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
			
@@ -2748,7 +2777,23 @@ struct llama_context * llama_init_from_file(
 
				     return ctx;
			
 
				 }
			
 
				 
			
 
				+// One-shot entry point: loads the model from `path_model` and creates a context
				+// for it. The returned context is flagged with model_owner = true, which makes
				+// llama_free() delete the model together with the context.
				+// Returns nullptr if either the model load or the context creation fails.
				+struct llama_context * llama_init_from_file(
				+                             const char * path_model,
				+            struct llama_context_params   params) {
				+
				+    struct llama_model * model = llama_load_model_from_file(path_model, params);
				+    if (!model) {
				+        return nullptr;
				+    }
				+    struct llama_context * ctx = llama_new_context_with_model(model, params);
				+    if (!ctx) {
				+        // Context creation failed: nothing owns the model yet, so release it
				+        // here instead of dereferencing a null context and leaking the model.
				+        llama_free_model(model);
				+        return nullptr;
				+    }
				+    ctx->model_owner = true;
				+    return ctx;
				+}
			
 
				+
			
 
				 void llama_free(struct llama_context * ctx) {
			
 
				+    if (ctx->model_owner) {
			
 
				+        delete &ctx->model;
			
 
				+    }
			
 
				     delete ctx;
			
 
				 }
			
 
				 
			
@@ -2765,11 +2810,9 @@ int llama_model_quantize(
 
				     }
			
 
				 }
			
 
				 
			
 
				-int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
			
 
				+int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) {
			
 
				     fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
			
 
				 
			
 
				-    auto & model = ctx->model;
			
 
				-
			
 
				     const int64_t t_start_lora_us = ggml_time_us();
			
 
				 
			
 
				     auto fin = std::ifstream(path_lora, std::ios::binary);
			
@@ -3012,7 +3055,16 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
 
				 
			
 
				+// Applies the LoRA adapter at `path_lora` to the model referenced by `ctx`.
				+// Returns the result of llama_apply_lora_from_file_internal(), or 1 when a
				+// std::exception is thrown (its message is printed to stderr).
				 int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
				     try {
				-        return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
				+        return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads);
				+    } catch (const std::exception & err) {
				+        fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
				+        return 1;
				+    }
				+}
			
 
				+
			
 
				+int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, const char * path_base_model, int n_threads) {
			
 
				+    try {
			
 
				+        return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads);
			
 
				     } catch (const std::exception & err) {
			
 
				         fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
			
 
				         return 1;
			
@@ -3020,7 +3072,7 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
 
				 }
			
 
				 
			
 
				+// Returns the token count `n` stored in the context's self-attention KV cache.
				 int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
				-    return ctx->model.kv_self.n;
				+    return ctx->kv_self.n;
				 }
			
 
				 
			
 
				 #define LLAMA_MAX_RNG_STATE (64*1024)
			
@@ -3045,7 +3097,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
 
				     const size_t s_embedding       = ctx->embedding.size() * sizeof(float);
			
 
				     const size_t s_kv_size         = sizeof(size_t);
			
 
				     const size_t s_kv_ntok         = sizeof(int);
			
 
				-    const size_t s_kv              = ctx->model.kv_self.buf.size;
			
 
				+    const size_t s_kv              = ctx->kv_self.buf.size;
			
 
				 
			
 
				     const size_t s_total = (
			
 
				         + s_rng_size
			
@@ -3111,7 +3163,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 
				 
			
 
				     // copy kv cache
			
 
				     {
			
 
				-        const auto & kv_self = ctx->model.kv_self;
			
 
				+        const auto & kv_self = ctx->kv_self;
			
 
				         const auto & hparams = ctx->model.hparams;
			
 
				         const int    n_layer = hparams.n_layer;
			
 
				         const int    n_embd  = hparams.n_embd;
			
@@ -3215,7 +3267,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
				 
			
 
				     // set kv cache
			
 
				     {
			
 
				-        const auto & kv_self = ctx->model.kv_self;
			
 
				+        const auto & kv_self = ctx->kv_self;
			
 
				         const auto & hparams = ctx->model.hparams;
			
 
				         const int    n_layer = hparams.n_layer;
			
 
				         const int    n_embd  = hparams.n_embd;
			
@@ -3259,7 +3311,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
				             ggml_free(cpy_ctx);
			
 
				         }
			
 
				 
			
 
				-        ctx->model.kv_self.n = kv_ntok;
			
 
				+        ctx->kv_self.n = kv_ntok;
			
 
				     }
			
 
				 
			
 
				     const size_t nread    = inp - src;
			
@@ -3506,6 +3558,6 @@ const char * llama_print_system_info(void) {
 
				 }
			
 
				 
			
 
				 // For internal test use
			
 
				-std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
			
 
				+const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
			
 
				     return ctx->model.tensors_by_name;
			
 
				 }
			
--- a/llama.h
+++ b/llama.h
@@ -26,6 +26,14 @@
 
				 #    define LLAMA_API
			
 
				 #endif
			
 
				 
			
 
				+#ifdef __GNUC__
			
 
				+#    define DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
			
 
				+#elif defined(_MSC_VER)
			
 
				+#    define DEPRECATED(func, hint) __declspec(deprecated(hint)) func
			
 
				+#else
			
 
				+#    define DEPRECATED(func, hint) func
			
 
				+#endif
			
 
				+
			
 
				 #define LLAMA_FILE_MAGIC_GGJT        0x67676a74u // 'ggjt'
			
 
				 #define LLAMA_FILE_MAGIC_GGLA        0x67676c61u // 'ggla'
			
 
				 #define LLAMA_FILE_MAGIC_GGMF        0x67676d66u // 'ggmf'
			
@@ -53,6 +61,7 @@ extern "C" {
 
				     // TODO: show sample usage
			
 
				     //
			
 
				 
			
 
				+    struct llama_model;
			
 
				     struct llama_context;
			
 
				 
			
 
				     typedef int llama_token;
			
@@ -136,12 +145,23 @@ extern "C" {
 
				 
			
 
				     LLAMA_API int64_t llama_time_us();
			
 
				 
			
 
				+    LLAMA_API struct llama_model * llama_load_model_from_file(
			
 
				+                             const char * path_model,
			
 
				+            struct llama_context_params   params);
			
 
				+
			
 
				+    LLAMA_API void llama_free_model(struct llama_model * model);
			
 
				+
			
 
				+    LLAMA_API struct llama_context * llama_new_context_with_model(
			
 
				+                     struct llama_model * model,
			
 
				+            struct llama_context_params   params);
			
 
				+
			
 
				     // Various functions for loading a ggml llama model.
			
 
				     // Allocate (almost) all memory needed for the model.
			
 
				     // Return NULL on failure
			
 
				-    LLAMA_API struct llama_context * llama_init_from_file(
			
 
				+    LLAMA_API DEPRECATED(struct llama_context * llama_init_from_file(
			
 
				                              const char * path_model,
			
 
				-            struct llama_context_params   params);
			
 
				+            struct llama_context_params   params),
			
 
				+            "please use llama_load_model_from_file combined with llama_new_context_with_model instead");
			
 
				 
			
 
				     // Frees all allocated memory
			
 
				     LLAMA_API void llama_free(struct llama_context * ctx);
			
@@ -158,8 +178,15 @@ extern "C" {
 
				     // The model needs to be reloaded before applying a new adapter, otherwise the adapter
			
 
				     // will be applied on top of the previous one
			
 
				     // Returns 0 on success
			
 
				-    LLAMA_API int llama_apply_lora_from_file(
			
 
				+    LLAMA_API DEPRECATED(int llama_apply_lora_from_file(
			
 
				             struct llama_context * ctx,
			
 
				+                      const char * path_lora,
			
 
				+                      const char * path_base_model,
			
 
				+                             int   n_threads),
			
 
				+            "please use llama_model_apply_lora_from_file instead");
			
 
				+
			
 
				+    LLAMA_API int llama_model_apply_lora_from_file(
			
 
				+            const struct llama_model * model,
			
 
				                       const char * path_lora,
			
 
				                       const char * path_base_model,
			
 
				                              int   n_threads);
			
@@ -310,7 +337,7 @@ extern "C" {
 
				 #include <string>
			
 
				 struct ggml_tensor;
			
 
				 
			
 
				-std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
			
 
				+const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
			
 
				 
			
 
				 #endif
			
 
				 
			
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0.cpp
@@ -28,6 +28,7 @@ int main(int argc, char **argv) {
 
				 
			
 
				     fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
			
 
				 
			
 
				+    llama_model * model;
			
 
				     llama_context * ctx;
			
 
				 
			
 
				     // load the vocab
			
@@ -36,10 +37,18 @@ int main(int argc, char **argv) {
 
				 
			
 
				         lparams.vocab_only = true;
			
 
				 
			
 
				-        ctx = llama_init_from_file(fname.c_str(), lparams);
			
 
				+        model = llama_load_model_from_file(fname.c_str(), lparams);
			
 
				+
			
 
				+        if (model == NULL) {
			
 
				+            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
			
 
				+            return 1;
			
 
				+        }
			
 
				+
			
 
				+        ctx = llama_new_context_with_model(model, lparams);
			
 
				 
			
 
				         if (ctx == NULL) {
			
 
				             fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
			
 
				+            llama_free_model(model);
			
 
				             return 1;
			
 
				         }
			
 
				     }
			
@@ -48,6 +57,8 @@ int main(int argc, char **argv) {
 
				 
			
 
				     if (n_vocab != 32000) {
			
 
				         fprintf(stderr, "%s : expected 32000 tokens, got %d\n", __func__, n_vocab);
			
 
				+        llama_free_model(model);
			
 
				+        llama_free(ctx);
			
 
				         return 2;
			
 
				     }
			
 
				 
			
@@ -77,10 +88,13 @@ int main(int argc, char **argv) {
 
				             }
			
 
				             fprintf(stderr, "\n");
			
 
				 
			
 
				+            llama_free_model(model);
			
 
				+            llama_free(ctx);
			
 
				             return 3;
			
 
				         }
			
 
				     }
			
 
				 
			
 
				+    llama_free_model(model);
			
 
				     llama_free(ctx);
			
 
				 
			
 
				     return 0;