1 năm trước cách đây · 1442677f92
--- a/common/common.cpp
+++ b/common/common.cpp
--- a/common/common.h
+++ b/common/common.h
@@ -60,7 +60,7 @@ struct gpt_params {
 
				     int32_t n_threads_batch       = -1;    // number of threads to use for batch processing (-1 = use n_threads)
			
 
				     int32_t n_threads_batch_draft = -1;
			
 
				     int32_t n_predict             = -1;    // new tokens to predict
			
 
				-    int32_t n_ctx                 = 512;   // context size
			
 
				+    int32_t n_ctx                 = 0;     // context size
			
 
				     int32_t n_batch               = 2048;  // logical batch size for prompt processing (must be >=32 to use BLAS)
			
 
				     int32_t n_ubatch              = 512;   // physical batch size for prompt processing (must be >=32 to use BLAS)
			
 
				     int32_t n_keep                = 0;     // number of tokens to keep from initial prompt
			
@@ -99,23 +99,23 @@ struct gpt_params {
 
				     // // sampling parameters
			
 
				     struct llama_sampling_params sparams;
			
 
				 
			
 
				-    std::string model                = "";  // model path
			
 
				-    std::string model_draft          = "";  // draft model for speculative decoding
			
 
				+    std::string model                = ""; // model path
			
 
				+    std::string model_draft          = ""; // draft model for speculative decoding
			
 
				     std::string model_alias          = "unknown"; // model alias
			
 
				-    std::string model_url            = "";  // model url to download
			
 
				-    std::string hf_repo              = "";  // HF repo
			
 
				-    std::string hf_file              = "";  // HF file
			
 
				+    std::string model_url            = ""; // model url to download
			
 
				+    std::string hf_repo              = ""; // HF repo
			
 
				+    std::string hf_file              = ""; // HF file
			
 
				     std::string prompt               = "";
			
 
				-    std::string prompt_file          = "";  // store the external prompt file name
			
 
				-    std::string path_prompt_cache    = "";  // path to file for saving/loading prompt eval state
			
 
				-    std::string input_prefix         = "";  // string to prefix user inputs with
			
 
				-    std::string input_suffix         = "";  // string to suffix user inputs with
			
 
				-    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
			
 
				-    std::string logdir               = "";  // directory in which to save YAML log files
			
 
				+    std::string prompt_file          = ""; // store the external prompt file name
			
 
				+    std::string path_prompt_cache    = ""; // path to file for saving/loading prompt eval state
			
 
				+    std::string input_prefix         = ""; // string to prefix user inputs with
			
 
				+    std::string input_suffix         = ""; // string to suffix user inputs with
			
 
				+    std::string logdir               = ""; // directory in which to save YAML log files
			
 
				     std::string lookup_cache_static  = ""; // path of static ngram cache file for lookup decoding
			
 
				     std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
			
 
				-    std::string logits_file          = "";  // file for saving *all* logits
			
 
				+    std::string logits_file          = ""; // file for saving *all* logits
			
 
				 
			
 
				+    std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
			
 
				     std::vector<llama_model_kv_override> kv_overrides;
			
 
				 
			
 
				     // TODO: avoid tuple, use struct
			
@@ -127,8 +127,8 @@ struct gpt_params {
 
				     int32_t control_vector_layer_start = -1; // layer range for control vector
			
 
				     int32_t control_vector_layer_end   = -1; // layer range for control vector
			
 
				 
			
 
				-    int  ppl_stride        = 0;     // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
			
 
				-    int  ppl_output_type   = 0;     // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
			
 
				+    int32_t ppl_stride      = 0;    // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
			
 
				+    int32_t ppl_output_type = 0;    // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
			
 
				                                     //                                       (which is more convenient to use for plotting)
			
 
				                                     //
			
 
				     bool   hellaswag       = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
			
@@ -142,19 +142,17 @@ struct gpt_params {
 
				 
			
 
				     bool   kl_divergence   = false; // compute KL divergence
			
 
				 
			
 
				-    bool random_prompt     = false; // do not randomize prompt if none provided
			
 
				+    bool usage             = false; // print usage
			
 
				     bool use_color         = false; // use color to distinguish generations and inputs
			
 
				-    bool interactive       = false; // interactive mode
			
 
				-    bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
			
 
				     bool special           = false; // enable special token output
			
 
				+    bool interactive       = false; // interactive mode
			
 
				+    bool interactive_first = false; // wait for user input immediately
			
 
				     bool conversation      = false; // conversation mode (does not print special tokens and suffix/prefix)
			
 
				-    bool chatml            = false; // chatml mode (used for models trained on chatml syntax)
			
 
				     bool prompt_cache_all  = false; // save user input and generations to prompt cache
			
 
				     bool prompt_cache_ro   = false; // open the prompt cache read-only and do not update it
			
 
				 
			
 
				     bool embedding         = false; // get only sentence embedding
			
 
				-    bool escape            = false; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
			
 
				-    bool interactive_first = false; // wait for user input immediately
			
 
				+    bool escape            = true;  // escape "\n", "\r", "\t", "\'", "\"", and "\\"
			
 
				     bool multiline_input   = false; // reverse the usage of `\`
			
 
				     bool simple_io         = false; // improves compatibility with subprocesses and limited consoles
			
 
				     bool cont_batching     = true;  // insert new sequences for decoding on-the-fly
			
@@ -162,10 +160,10 @@ struct gpt_params {
 
				 
			
 
				     bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
			
 
				     bool ignore_eos        = false; // ignore generated EOS tokens
			
 
				-    bool instruct          = false; // instruction mode (used for Alpaca models)
			
 
				     bool logits_all        = false; // return logits for all tokens in the batch
			
 
				     bool use_mmap          = true;  // use mmap for faster loads
			
 
				     bool use_mlock         = false; // use mlock to keep model in memory
			
 
				+    bool verbose           = false;
			
 
				     bool verbose_prompt    = false; // print prompt tokens before generation
			
 
				     bool display_prompt    = true;  // print prompt before generation
			
 
				     bool infill            = false; // use infill mode
			
@@ -180,6 +178,47 @@ struct gpt_params {
 
				     // multimodal models (see examples/llava)
			
 
				     std::string mmproj = "";        // path to multimodal projector
			
 
				     std::vector<std::string> image; // path to image file(s)
			
 
				+
			
 
				+    // server params
			
 
				+    int32_t port           = 8080;
			
 
				+    int32_t timeout_read   = 600;
			
 
				+    int32_t timeout_write  = timeout_read;
			
 
				+    int32_t n_threads_http = -1;
			
 
				+
			
 
				+    std::string hostname      = "127.0.0.1";
			
 
				+    std::string public_path   = "";
			
 
				+    std::string chat_template = "";
			
 
				+    std::string system_prompt = "";
			
 
				+
			
 
				+    std::vector<std::string> api_keys;
			
 
				+
			
 
				+    std::string ssl_file_key  = "";
			
 
				+    std::string ssl_file_cert = "";
			
 
				+
			
 
				+    bool endpoint_slots   = true;
			
 
				+    bool endpoint_metrics = false;
			
 
				+
			
 
				+    bool log_json = false;
			
 
				+
			
 
				+    std::string slot_save_path;
			
 
				+
			
 
				+    // batched-bench params
			
 
				+    bool is_pp_shared = false;
			
 
				+
			
 
				+    std::vector<int32_t> n_pp;
			
 
				+    std::vector<int32_t> n_tg;
			
 
				+    std::vector<int32_t> n_pl;
			
 
				+
			
 
				+    // retrieval params
			
 
				+    std::vector<std::string> context_files; // context files to embed
			
 
				+
			
 
				+    int32_t chunk_size = 64; // chunk size for context embedding
			
 
				+
			
 
				+    std::string chunk_separator = "\n"; // chunk separator for context embedding
			
 
				+
			
 
				+    // passkey params
			
 
				+    int32_t n_junk = 250; // number of times to repeat the junk text
			
 
				+    int32_t i_pos  = -1;  // position of the passkey in the junk text
			
 
				 };
			
 
				 
			
 
				 void gpt_params_handle_model_default(gpt_params & params);
			
@@ -199,7 +238,20 @@ std::vector<std::string> string_split(std::string input, char separator);
 
				 
			
 
				 std::string string_strip(const std::string & str);
			
 
				 std::string string_get_sortable_timestamp();
			
 
				-std::string string_random_prompt(std::mt19937 & rng);
			
 
				+
			
 
				+template<class T>
			
 
				+static std::vector<T> string_split(const std::string & str, char delim) {
			
 
				+    std::vector<T> values;
			
 
				+    std::istringstream str_stream(str);
			
 
				+    std::string token;
			
 
				+    while (std::getline(str_stream, token, delim)) {
			
 
				+        T value;
			
 
				+        std::istringstream token_stream(token);
			
 
				+        token_stream >> value;
			
 
				+        values.push_back(value);
			
 
				+    }
			
 
				+    return values;
			
 
				+}
			
 
				 
			
 
				 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
			
 
				 void string_process_escapes(std::string & input);
			
@@ -282,6 +334,13 @@ std::string llama_detokenize_bpe(
 
				 // defaults to true when model type is SPM, otherwise false.
			
 
				 bool llama_should_add_bos_token(const llama_model * model);
			
 
				 
			
 
				+//
			
 
				+// Chat template utils
			
 
				+//
			
 
				+
			
 
				+// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
			
 
				+bool llama_chat_verify_template(const std::string & tmpl);
			
 
				+
			
 
				 //
			
 
				 // KV cache utils
			
 
				 //
			
--- a/examples/batched-bench/README.md
+++ b/examples/batched-bench/README.md
@@ -10,16 +10,16 @@ There are 2 modes of operation:
 
				 - `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)
			
 
				 
			
 
				 ```bash
			
 
				-./batched-bench MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>
			
 
				+./batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]
			
 
				 
			
 
				 # LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
			
 
				-./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 2048 512 0 99
			
 
				+./batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99
			
 
				 
			
 
				 # LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
			
 
				-./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 2048 512 1 99
			
 
				+./batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps
			
 
				 
			
 
				 # custom set of batches
			
 
				-./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 512 512 0 999 0 128,256,512 128,256 1,2,4,8,16,32
			
 
				+./batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32
			
 
				 ```
			
 
				 
			
 
				 ## Sample results
			
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -28,67 +28,27 @@ static std::vector<int> parse_list(char * p) {
 
				     return ret;
			
 
				 }
			
 
				 
			
 
				-int main(int argc, char ** argv) {
			
 
				-    gpt_params params;
			
 
				-
			
 
				-    if (argc == 1 || argv[1][0] == '-') {
			
 
				-        printf("usage: %s MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [FATTN] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]);
			
 
				-        printf("  <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n");
			
 
				-        printf("  example: %s ggml-model-f16.gguf 2048 2048 512 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
			
 
				-        return 1 ;
			
 
				-    }
			
 
				-
			
 
				-    int n_kv_max     = 2048;
			
 
				-    int n_batch      = 2048;
			
 
				-    int n_ubatch     = 512;
			
 
				-    bool flash_attn  = false;
			
 
				-    int is_pp_shared = 0;
			
 
				-    int n_gpu_layers = 0;
			
 
				-
			
 
				-    std::vector<int> n_pp = { 128, 256, 512, 1024, 2048, 3584, 7680, };
			
 
				-    std::vector<int> n_tg = { 128, 256, };
			
 
				-    std::vector<int> n_pl = { 1, 2, 4, 8, 16, 32, };
			
 
				-    //std::vector<int> n_pl = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 32, };
			
 
				-
			
 
				-    if (argc >= 2) {
			
 
				-        params.model = argv[1];
			
 
				-    }
			
 
				-
			
 
				-    if (argc >= 3) {
			
 
				-        n_kv_max = std::atoi(argv[2]);
			
 
				-    }
			
 
				-
			
 
				-    if (argc >= 4) {
			
 
				-        n_batch = std::atoi(argv[3]);
			
 
				-    }
			
 
				-
			
 
				-    if (argc >= 5) {
			
 
				-        n_ubatch = std::atoi(argv[4]);
			
 
				-    }
			
 
				-
			
 
				-    if (argc >= 6) {
			
 
				-        flash_attn = std::atoi(argv[5]);
			
 
				-    }
			
 
				+static void print_usage(int argc, char ** argv, const gpt_params & params) {
			
 
				+    gpt_params_print_usage(argc, argv, params);
			
 
				 
			
 
				-    if (argc >= 7) {
			
 
				-        is_pp_shared = std::atoi(argv[6]);
			
 
				-    }
			
 
				+    LOG_TEE("\nexample usage:\n");
			
 
				+    LOG_TEE("\n    %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
			
 
				+    LOG_TEE("\n");
			
 
				+}
			
 
				 
			
 
				-    if (argc >= 8) {
			
 
				-        n_gpu_layers = std::atoi(argv[7]);
			
 
				-    }
			
 
				+int main(int argc, char ** argv) {
			
 
				+    gpt_params params;
			
 
				 
			
 
				-    if (argc >= 9) {
			
 
				-        n_pp = parse_list(argv[8]);
			
 
				+    if (!gpt_params_parse(argc, argv, params)) {
			
 
				+        print_usage(argc, argv, params);
			
 
				+        return 1;
			
 
				     }
			
 
				 
			
 
				-    if (argc >= 10) {
			
 
				-        n_tg = parse_list(argv[9]);
			
 
				-    }
			
 
				+    int is_pp_shared = params.is_pp_shared;
			
 
				 
			
 
				-    if (argc >= 11) {
			
 
				-        n_pl = parse_list(argv[10]);
			
 
				-    }
			
 
				+    std::vector<int> n_pp = params.n_pp;
			
 
				+    std::vector<int> n_tg = params.n_tg;
			
 
				+    std::vector<int> n_pl = params.n_pl;
			
 
				 
			
 
				     // init LLM
			
 
				 
			
@@ -97,12 +57,7 @@ int main(int argc, char ** argv) {
 
				 
			
 
				     // initialize the model
			
 
				 
			
 
				-    llama_model_params model_params = llama_model_default_params();
			
 
				-
			
 
				-    const std::vector<float> t_split(llama_max_devices(), 0.0f);
			
 
				-
			
 
				-    model_params.n_gpu_layers = n_gpu_layers;
			
 
				-    model_params.tensor_split = t_split.data();
			
 
				+    llama_model_params model_params = llama_model_params_from_gpt_params(params);
			
 
				 
			
 
				     llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
			
 
				 
			
@@ -111,16 +66,7 @@ int main(int argc, char ** argv) {
 
				         return 1;
			
 
				     }
			
 
				 
			
 
				-    llama_context_params ctx_params = llama_context_default_params();
			
 
				-
			
 
				-    ctx_params.seed       = 1234;
			
 
				-    ctx_params.n_ctx      = n_kv_max;
			
 
				-    ctx_params.n_batch    = n_batch;
			
 
				-    ctx_params.n_ubatch   = n_ubatch;
			
 
				-    ctx_params.flash_attn = flash_attn;
			
 
				-
			
 
				-    ctx_params.n_threads       = params.n_threads;
			
 
				-    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
			
 
				+    llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
			
 
				 
			
 
				     // ensure enough sequences are available
			
 
				     ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end());
			
@@ -132,6 +78,8 @@ int main(int argc, char ** argv) {
 
				         return 1;
			
 
				     }
			
 
				 
			
 
				+    const int32_t n_kv_max = llama_n_ctx(ctx);
			
 
				+
			
 
				     llama_batch batch = llama_batch_init(n_kv_max, 0, 1);
			
 
				 
			
 
				     // decode in batches of ctx_params.n_batch tokens
			
@@ -175,7 +123,7 @@ int main(int argc, char ** argv) {
 
				     }
			
 
				 
			
 
				     LOG_TEE("\n");
			
 
				-    LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, n_batch, n_ubatch, flash_attn, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
			
 
				+    LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
			
 
				     LOG_TEE("\n");
			
 
				 
			
 
				     LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP",     "TG",     "B",    "N_KV",     "T_PP s",   "S_PP t/s", "T_TG s",   "S_TG t/s", "T s",      "S t/s");
			
--- a/examples/batched/README.md
+++ b/examples/batched/README.md
@@ -3,7 +3,7 @@
 
				 The example demonstrates batched generation from a given prompt
			
 
				 
			
 
				 ```bash
			
 
				-./batched ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is" 4
			
 
				+./batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4
			
 
				 
			
 
				 ...
			
 
				 
			
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -7,48 +7,31 @@
 
				 #include <string>
			
 
				 #include <vector>
			
 
				 
			
 
				-int main(int argc, char ** argv) {
			
 
				-    gpt_params params;
			
 
				-
			
 
				-    if (argc == 1 || argv[1][0] == '-') {
			
 
				-        printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL] [LEN] [NGL]\n" , argv[0]);
			
 
				-        return 1 ;
			
 
				-    }
			
 
				-
			
 
				-    // number of parallel batches
			
 
				-    int n_parallel = 1;
			
 
				+static void print_usage(int argc, char ** argv, const gpt_params & params) {
			
 
				+    gpt_params_print_usage(argc, argv, params);
			
 
				 
			
 
				-    // total length of the sequences including the prompt
			
 
				-    int n_len = 32;
			
 
				-
			
 
				-    // number of layers to offload to the GPU
			
 
				-    int n_gpu_layers = 0;
			
 
				-
			
 
				-    if (argc >= 2) {
			
 
				-        params.model = argv[1];
			
 
				-    }
			
 
				+    LOG_TEE("\nexample usage:\n");
			
 
				+    LOG_TEE("\n    %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
			
 
				+    LOG_TEE("\n");
			
 
				+}
			
 
				 
			
 
				-    if (argc >= 3) {
			
 
				-        params.prompt = argv[2];
			
 
				-    }
			
 
				+int main(int argc, char ** argv) {
			
 
				+    gpt_params params;
			
 
				 
			
 
				-    if (argc >= 4) {
			
 
				-        n_parallel = std::atoi(argv[3]);
			
 
				-    }
			
 
				+    params.prompt = "Hello my name is";
			
 
				+    params.n_predict = 32;
			
 
				 
			
 
				-    if (argc >= 5) {
			
 
				-        n_len = std::atoi(argv[4]);
			
 
				+    if (!gpt_params_parse(argc, argv, params)) {
			
 
				+        print_usage(argc, argv, params);
			
 
				+        return 1;
			
 
				     }
			
 
				 
			
 
				-    if (argc >= 6) {
			
 
				-        n_gpu_layers = std::atoi(argv[5]);
			
 
				-    }
			
 
				 
			
 
				-    if (params.prompt.empty()) {
			
 
				-        params.prompt = "Hello my name is";
			
 
				-    }
			
 
				+    // number of parallel batches
			
 
				+    int n_parallel = params.n_parallel;
			
 
				 
			
 
				-    string_process_escapes(params.prompt);
			
 
				+    // total length of the sequences including the prompt
			
 
				+    int n_predict = 32;
			
 
				 
			
 
				     // init LLM
			
 
				 
			
@@ -57,9 +40,7 @@ int main(int argc, char ** argv) {
 
				 
			
 
				     // initialize the model
			
 
				 
			
 
				-    llama_model_params model_params = llama_model_default_params();
			
 
				-
			
 
				-    model_params.n_gpu_layers = n_gpu_layers;
			
 
				+    llama_model_params model_params = llama_model_params_from_gpt_params(params);
			
 
				 
			
 
				     llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
			
 
				 
			
@@ -73,18 +54,14 @@ int main(int argc, char ** argv) {
 
				     std::vector<llama_token> tokens_list;
			
 
				     tokens_list = ::llama_tokenize(model, params.prompt, true);
			
 
				 
			
 
				-    const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size())*n_parallel;
			
 
				+    const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size())*n_parallel;
			
 
				 
			
 
				     // initialize the context
			
 
				 
			
 
				-    llama_context_params ctx_params = llama_context_default_params();
			
 
				+    llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
			
 
				 
			
 
				-    ctx_params.seed  = 1234;
			
 
				     ctx_params.n_ctx   = n_kv_req;
			
 
				-    ctx_params.n_batch = std::max(n_len, n_parallel);
			
 
				-    ctx_params.n_seq_max       = n_parallel;
			
 
				-    ctx_params.n_threads       = params.n_threads;
			
 
				-    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
			
 
				+    ctx_params.n_batch = std::max(n_predict, n_parallel);
			
 
				 
			
 
				     llama_context * ctx = llama_new_context_with_model(model, ctx_params);
			
 
				 
			
@@ -93,9 +70,9 @@ int main(int argc, char ** argv) {
 
				         return 1;
			
 
				     }
			
 
				 
			
 
				-    const int n_ctx    = llama_n_ctx(ctx);
			
 
				+    const int n_ctx = llama_n_ctx(ctx);
			
 
				 
			
 
				-    LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
			
 
				+    LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
			
 
				 
			
 
				     // make sure the KV cache is big enough to hold all the prompt and generated tokens
			
 
				     if (n_kv_req > n_ctx) {
			
@@ -156,7 +133,7 @@ int main(int argc, char ** argv) {
 
				 
			
 
				     const auto t_main_start = ggml_time_us();
			
 
				 
			
 
				-    while (n_cur <= n_len) {
			
 
				+    while (n_cur <= n_predict) {
			
 
				         // prepare the next batch
			
 
				         llama_batch_clear(batch);
			
 
				 
			
@@ -192,7 +169,7 @@ int main(int argc, char ** argv) {
 
				             //const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
			
 
				 
			
 
				             // is it an end of generation? -> mark the stream as finished
			
 
				-            if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
			
 
				+            if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
			
 
				                 i_batch[i] = -1;
			
 
				                 LOG_TEE("\n");
			
 
				                 if (n_parallel > 1) {
			
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -63,6 +63,7 @@ int main(int argc, char ** argv) {
 
				     gpt_params params;
			
 
				 
			
 
				     if (!gpt_params_parse(argc, argv, params)) {
			
 
				+        gpt_params_print_usage(argc, argv, params);
			
 
				         return 1;
			
 
				     }
			
 
				 
			
@@ -79,9 +80,6 @@ int main(int argc, char ** argv) {
 
				     fprintf(stderr, "%s: seed  = %u\n", __func__, params.seed);
			
 
				 
			
 
				     std::mt19937 rng(params.seed);
			
 
				-    if (params.random_prompt) {
			
 
				-        params.prompt = string_random_prompt(rng);
			
 
				-    }
			
 
				 
			
 
				     llama_backend_init();
			
 
				     llama_numa_init(params.numa);
			
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@@ -140,20 +140,18 @@ static bool run(llama_context * ctx, const gpt_params & params) {
 
				 }
			
 
				 
			
 
				 int main(int argc, char ** argv) {
			
 
				-
			
 
				     callback_data cb_data;
			
 
				 
			
 
				     gpt_params params;
			
 
				+
			
 
				     if (!gpt_params_parse(argc, argv, params)) {
			
 
				+        gpt_params_print_usage(argc, argv, params);
			
 
				         return 1;
			
 
				     }
			
 
				 
			
 
				     print_build_info();
			
 
				 
			
 
				     std::mt19937 rng(params.seed);
			
 
				-    if (params.random_prompt) {
			
 
				-        params.prompt = string_random_prompt(rng);
			
 
				-    }
			
 
				 
			
 
				     llama_backend_init();
			
 
				     llama_numa_init(params.numa);
			
--- a/examples/gguf-split/tests.sh
+++ b/examples/gguf-split/tests.sh
@@ -41,7 +41,7 @@ echo PASS
 
				 echo
			
 
				 
			
 
				 # 2b. Test the sharded model is loading properly
			
 
				-$MAIN --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --random-prompt --n-predict 32
			
 
				+$MAIN --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --n-predict 32
			
 
				 echo PASS
			
 
				 echo
			
 
				 
			
@@ -51,7 +51,7 @@ echo PASS
 
				 echo
			
 
				 
			
 
				 # 3b. Test the merged model is loading properly
			
 
				-$MAIN --model $WORK_PATH/ggml-model-merge.gguf --random-prompt --n-predict 32
			
 
				+$MAIN --model $WORK_PATH/ggml-model-merge.gguf --n-predict 32
			
 
				 echo PASS
			
 
				 echo
			
 
				 
			
@@ -61,7 +61,7 @@ echo PASS
 
				 echo
			
 
				 
			
 
				 # 4b. Test the sharded model is loading properly
			
 
				-$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --random-prompt --n-predict 32
			
 
				+$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --n-predict 32
			
 
				 echo PASS
			
 
				 echo
			
 
				 
			
@@ -71,7 +71,7 @@ echo
 
				 #echo
			
 
				 
			
 
				 # 5b. Test the merged model is loading properly
			
 
				-#$MAIN --model $WORK_PATH/ggml-model-merge-2.gguf --random-prompt --n-predict 32
			
 
				+#$MAIN --model $WORK_PATH/ggml-model-merge-2.gguf --n-predict 32
			
 
				 #echo PASS
			
 
				 #echo
			
 
				 
			
@@ -81,7 +81,7 @@ echo PASS
 
				 echo
			
 
				 
			
 
				 # 6b. Test the sharded model is loading properly
			
 
				-$MAIN --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --random-prompt --n-predict 32
			
 
				+$MAIN --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --n-predict 32
			
 
				 echo PASS
			
 
				 echo
			
 
				 
			
--- a/examples/gritlm/gritlm.cpp
+++ b/examples/gritlm/gritlm.cpp
@@ -153,7 +153,9 @@ static std::string gritlm_instruction(const std::string & instruction) {
 
				 
			
 
				 int main(int argc, char * argv[]) {
			
 
				     gpt_params params;
			
 
				+
			
 
				     if (!gpt_params_parse(argc, argv, params)) {
			
 
				+        gpt_params_print_usage(argc, argv, params);
			
 
				         return 1;
			
 
				     }
			
 
				 
			
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -533,7 +533,6 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
 
				 }
			
 
				 
			
 
				 int main(int argc, char ** argv) {
			
 
				-
			
 
				     StatParams sparams;
			
 
				     std::string prev_result_file;
			
 
				     std::string combine_files;
			
@@ -581,7 +580,9 @@ int main(int argc, char ** argv) {
 
				 
			
 
				     gpt_params params;
			
 
				     params.n_batch = 512;
			
 
				-    if (!gpt_params_parse(args.size(), args.data(), params)) {
			
 
				+
			
 
				+    if (!gpt_params_parse(argc, argv, params)) {
			
 
				+        gpt_params_print_usage(argc, argv, params);
			
 
				         return 1;
			
 
				     }
			
 
				 
			
@@ -597,9 +598,6 @@ int main(int argc, char ** argv) {
 
				     fprintf(stderr, "%s: seed  = %u\n", __func__, params.seed);
			
 
				 
			
 
				     std::mt19937 rng(params.seed);
			
 
				-    if (params.random_prompt) {
			
 
				-        params.prompt = string_random_prompt(rng);
			
 
				-    }
			
 
				 
			
 
				     sparams.dataset = params.prompt_file;
			
 
				     g_collector.set_parameters(std::move(sparams));
			
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -107,6 +107,7 @@ int main(int argc, char ** argv) {
 
				     g_params = &params;
			
 
				 
			
 
				     if (!gpt_params_parse(argc, argv, params)) {
			
 
				+        gpt_params_print_usage(argc, argv, params);
			
 
				         return 1;
			
 
				     }
			
 
				 
			
@@ -139,27 +140,6 @@ int main(int argc, char ** argv) {
 
				         LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
			
 
				         params.n_ctx = 8;
			
 
				     }
			
 
				-    if (params.instruct) {
			
 
				-        printf("\n************\n");
			
 
				-        printf("%s: please use the 'main' tool for instruct mode\n", __func__);
			
 
				-        printf("************\n\n");
			
 
				-
			
 
				-        return 0;
			
 
				-    }
			
 
				-    if (params.chatml) {
			
 
				-        printf("\n************\n");
			
 
				-        printf("%s: please use the 'main' tool for chatml mode\n", __func__);
			
 
				-        printf("************\n\n");
			
 
				-
			
 
				-        return 0;
			
 
				-    }
			
 
				-    if (!params.antiprompt.empty()) {
			
 
				-        printf("\n************\n");
			
 
				-        printf("%s: please use the 'main' tool for antiprompt mode\n", __func__);
			
 
				-        printf("************\n\n");
			
 
				-
			
 
				-        return 0;
			
 
				-    }
			
 
				     if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
			
 
				         printf("\n************\n");
			
 
				         printf("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
			
@@ -167,20 +147,6 @@ int main(int argc, char ** argv) {
 
				 
			
 
				         return 0;
			
 
				     }
			
 
				-    if (params.random_prompt) {
			
 
				-        printf("\n************\n");
			
 
				-        printf("%s: please use the 'main' tool for random prompt mode\n", __func__);
			
 
				-        printf("************\n\n");
			
 
				-
			
 
				-        return 0;
			
 
				-    }
			
 
				-    if (!params.path_prompt_cache.empty()) {
			
 
				-        printf("\n************\n");
			
 
				-        printf("%s: infill does not support prompt caching\n", __func__);
			
 
				-        printf("************\n\n");
			
 
				-
			
 
				-        return 0;
			
 
				-    }
			
 
				 
			
 
				     if (params.rope_freq_base != 0.0) {
			
 
				         LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
			
@@ -207,17 +173,13 @@ int main(int argc, char ** argv) {
 
				 
			
 
				     llama_model * model;
			
 
				     llama_context * ctx;
			
 
				-    llama_context * ctx_guidance = NULL;
			
 
				+
			
 
				     g_model = &model;
			
 
				     g_ctx = &ctx;
			
 
				 
			
 
				     // load the model and apply lora adapter, if any
			
 
				     LOG("%s: load the model and apply lora adapter, if any\n", __func__);
			
 
				     std::tie(model, ctx) = llama_init_from_gpt_params(params);
			
 
				-    if (sparams.cfg_scale > 1.f) {
			
 
				-        struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
			
 
				-        ctx_guidance = llama_new_context_with_model(model, lparams);
			
 
				-    }
			
 
				 
			
 
				     if (model == NULL) {
			
 
				         LOG_TEE("%s: error: unable to load model\n", __func__);
			
@@ -273,25 +235,6 @@ int main(int argc, char ** argv) {
 
				         LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
			
 
				     }
			
 
				 
			
 
				-    // Tokenize negative prompt
			
 
				-    std::vector<llama_token> guidance_inp;
			
 
				-    int guidance_offset = 0;
			
 
				-    int original_prompt_len = 0;
			
 
				-    if (ctx_guidance) {
			
 
				-        LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
			
 
				-
			
 
				-        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true);
			
 
				-        LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
			
 
				-
			
 
				-        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
			
 
				-        LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
			
 
				-
			
 
				-        original_prompt_len = original_inp.size();
			
 
				-        guidance_offset = (int)guidance_inp.size() - original_prompt_len;
			
 
				-        LOG("original_prompt_len: %s", log_tostr(original_prompt_len));
			
 
				-        LOG("guidance_offset:     %s", log_tostr(guidance_offset));
			
 
				-    }
			
 
				-
			
 
				     if ((int) embd_inp.size() > n_ctx - 4) {
			
 
				         LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
			
 
				         return 1;
			
@@ -319,15 +262,6 @@ int main(int argc, char ** argv) {
 
				             LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
			
 
				         }
			
 
				 
			
 
				-        if (ctx_guidance) {
			
 
				-            LOG_TEE("\n");
			
 
				-            LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
			
 
				-            LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
			
 
				-            for (int i = 0; i < (int) guidance_inp.size(); i++) {
			
 
				-                LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
			
 
				-            }
			
 
				-        }
			
 
				-
			
 
				         if (params.n_keep > 0) {
			
 
				         LOG_TEE("%s: static prompt based on n_keep: '", __func__);
			
 
				             for (int i = 0; i < params.n_keep; i++) {
			
@@ -395,12 +329,11 @@ int main(int argc, char ** argv) {
 
				         is_interacting = params.interactive_first;
			
 
				     }
			
 
				 
			
 
				-    bool input_echo           = true;
			
 
				+    bool input_echo = true;
			
 
				 
			
 
				-    int n_past             = 0;
			
 
				-    int n_remain           = params.n_predict;
			
 
				-    int n_consumed         = 0;
			
 
				-    int n_past_guidance    = 0;
			
 
				+    int n_past     = 0;
			
 
				+    int n_remain   = params.n_predict;
			
 
				+    int n_consumed = 0;
			
 
				 
			
 
				     std::vector<int>   input_tokens;  g_input_tokens  = &input_tokens;
			
 
				     std::vector<int>   output_tokens; g_output_tokens = &output_tokens;
			
@@ -410,7 +343,6 @@ int main(int argc, char ** argv) {
 
				     console::set_display(console::prompt);
			
 
				 
			
 
				     std::vector<llama_token> embd;
			
 
				-    std::vector<llama_token> embd_guidance;
			
 
				 
			
 
				     struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
			
 
				 
			
@@ -436,7 +368,7 @@ int main(int argc, char ** argv) {
 
				             // if we run out of context:
			
 
				             // - take the n_keep first tokens from the original prompt (via n_past)
			
 
				             // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
			
 
				-            if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
			
 
				+            if (n_past + (int) embd.size() > n_ctx) {
			
 
				                 if (params.n_predict == -2) {
			
 
				                     LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
			
 
				                     break;
			
@@ -453,11 +385,7 @@ int main(int argc, char ** argv) {
 
				 
			
 
				                 n_past -= n_discard;
			
 
				 
			
 
				-                if (ctx_guidance) {
			
 
				-                    n_past_guidance -= n_discard;
			
 
				-                }
			
 
				-
			
 
				-                LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
			
 
				+                LOG("after swap: n_past = %d\n", n_past);
			
 
				 
			
 
				                 LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
			
 
				 
			
@@ -465,45 +393,6 @@ int main(int argc, char ** argv) {
 
				 
			
 
				             // evaluate tokens in batches
			
 
				             // embd is typically prepared beforehand to fit within a batch, but not always
			
 
				-
			
 
				-            if (ctx_guidance) {
			
 
				-                int input_size = 0;
			
 
				-                llama_token * input_buf = NULL;
			
 
				-
			
 
				-                if (n_past_guidance < (int) guidance_inp.size()) {
			
 
				-                    // Guidance context should have the same data with these modifications:
			
 
				-                    //
			
 
				-                    // * Replace the initial prompt
			
 
				-                    // * Shift everything by guidance_offset
			
 
				-                    embd_guidance = guidance_inp;
			
 
				-                    if (embd.begin() + original_prompt_len < embd.end()) {
			
 
				-                        embd_guidance.insert(
			
 
				-                            embd_guidance.end(),
			
 
				-                            embd.begin() + original_prompt_len,
			
 
				-                            embd.end()
			
 
				-                        );
			
 
				-                    }
			
 
				-
			
 
				-                    input_buf  = embd_guidance.data();
			
 
				-                    input_size = embd_guidance.size();
			
 
				-
			
 
				-                    LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str());
			
 
				-                } else {
			
 
				-                    input_buf  = embd.data();
			
 
				-                    input_size = embd.size();
			
 
				-                }
			
 
				-
			
 
				-                for (int i = 0; i < input_size; i += params.n_batch) {
			
 
				-                    int n_eval = std::min(input_size - i, params.n_batch);
			
 
				-                    if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) {
			
 
				-                        LOG_TEE("%s : failed to eval\n", __func__);
			
 
				-                        return 1;
			
 
				-                    }
			
 
				-
			
 
				-                    n_past_guidance += n_eval;
			
 
				-                }
			
 
				-            }
			
 
				-
			
 
				             for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
			
 
				                 int n_eval = (int) embd.size() - i;
			
 
				                 if (n_eval > params.n_batch) {
			
@@ -525,11 +414,9 @@ int main(int argc, char ** argv) {
 
				         }
			
 
				 
			
 
				         embd.clear();
			
 
				-        embd_guidance.clear();
			
 
				 
			
 
				         if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
			
 
				-
			
 
				-            const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
			
 
				+            const llama_token id = llama_sampling_sample(ctx_sampling, ctx, nullptr);
			
 
				 
			
 
				             llama_sampling_accept(ctx_sampling, ctx, id, true);
			
 
				 
			
@@ -583,7 +470,6 @@ int main(int argc, char ** argv) {
 
				 
			
 
				         // if not currently processing queued inputs;
			
 
				         if ((int) embd_inp.size() <= n_consumed) {
			
 
				-
			
 
				             // deal with eot token in infill mode
			
 
				             if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){
			
 
				                 if (is_interacting && !params.interactive_first) {
			
@@ -644,7 +530,6 @@ int main(int argc, char ** argv) {
 
				                 embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
			
 
				                 embd_inp.push_back(llama_token_middle(model));
			
 
				                 embd.clear();
			
 
				-                embd_guidance.clear();
			
 
				                 n_remain = params.n_predict;
			
 
				                 n_past = 0;
			
 
				                 n_consumed = 0;
			
@@ -751,7 +636,6 @@ int main(int argc, char ** argv) {
 
				     llama_print_timings(ctx);
			
 
				     write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
			
 
				 
			
 
				-    if (ctx_guidance) { llama_free(ctx_guidance); }
			
 
				     llama_free(ctx);
			
 
				     llama_free_model(model);
			
 
				 
			
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -41,20 +41,6 @@ static std::string join(const std::vector<T> & values, const std::string & delim
 
				     return str.str();
			
 
				 }
			
 
				 
			
 
				-template<class T>
			
 
				-static std::vector<T> split(const std::string & str, char delim) {
			
 
				-    std::vector<T> values;
			
 
				-    std::istringstream str_stream(str);
			
 
				-    std::string token;
			
 
				-    while (std::getline(str_stream, token, delim)) {
			
 
				-        T value;
			
 
				-        std::istringstream token_stream(token);
			
 
				-        token_stream >> value;
			
 
				-        values.push_back(value);
			
 
				-    }
			
 
				-    return values;
			
 
				-}
			
 
				-
			
 
				 template<typename T, typename F>
			
 
				 static std::vector<std::string> transform_to_str(const std::vector<T> & values, F f) {
			
 
				     std::vector<std::string> str_values;
			
@@ -322,28 +308,28 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
 
				                 invalid_param = true;
			
 
				                 break;
			
 
				             }
			
 
				-            auto p = split<std::string>(argv[i], split_delim);
			
 
				+            auto p = string_split<std::string>(argv[i], split_delim);
			
 
				             params.model.insert(params.model.end(), p.begin(), p.end());
			
 
				         } else if (arg == "-p" || arg == "--n-prompt") {
			
 
				             if (++i >= argc) {
			
 
				                 invalid_param = true;
			
 
				                 break;
			
 
				             }
			
 
				-            auto p = split<int>(argv[i], split_delim);
			
 
				+            auto p = string_split<int>(argv[i], split_delim);
			
 
				             params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end());
			
 
				         } else if (arg == "-n" || arg == "--n-gen") {
			
 
				             if (++i >= argc) {
			
 
				                 invalid_param = true;
			
 
				                 break;
			
 
				             }
			
 
				-            auto p = split<int>(argv[i], split_delim);
			
 
				+            auto p = string_split<int>(argv[i], split_delim);
			
 
				             params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
			
 
				         } else if (arg == "-pg") {
			
 
				             if (++i >= argc) {
			
 
				                 invalid_param = true;
			
 
				                 break;
			
 
				             }
			
 
				-            auto p = split<std::string>(argv[i], ',');
			
 
				+            auto p = string_split<std::string>(argv[i], ',');
			
 
				             if (p.size() != 2) {
			
 
				                 invalid_param = true;
			
 
				                 break;
			
@@ -354,21 +340,21 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
 
				                 invalid_param = true;
			
 
				                 break;
			
 
				             }
			
 
				-            auto p = split<int>(argv[i], split_delim);
			
 
				+            auto p = string_split<int>(argv[i], split_delim);
			
 
				             params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
			
 
				         } else if (arg == "-ub" || arg == "--ubatch-size") {
			
 
				             if (++i >= argc) {
			
 
				                 invalid_param = true;
			
 
				                 break;
			
 
				             }
			
 
				-            auto p = split<int>(argv[i], split_delim);
			
 
				+            auto p = string_split<int>(argv[i], split_delim);
			
 
				             params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end());
			
 
				         } else if (arg == "-ctk" || arg == "--cache-type-k") {
			
 
				             if (++i >= argc) {
			
 
				                 invalid_param = true;
			
 
				                 break;
			
 
				             }
			
 
				-            auto p = split<std::string>(argv[i], split_delim);
			
 
				+            auto p = string_split<std::string>(argv[i], split_delim);
			
 
				             std::vector<ggml_type> types;
			
 
				             for (const auto & t : p) {
			
 
				                 ggml_type gt = ggml_type_from_name(t);
			
@@ -384,7 +370,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
 
				                 invalid_param = true;
			
 
				                 break;
			
 
				             }
			
 
				-            auto p = split<std::string>(argv[i], split_delim);
			
 
				+            auto p = string_split<std::string>(argv[i], split_delim);
			
 
				             std::vector<ggml_type> types;
			
 
				             for (const auto & t : p) {
			
 
				                 ggml_type gt = ggml_type_from_name(t);
			
@@ -400,14 +386,14 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
 
				                 invalid_param = true;
			
 
				                 break;
			
 
				             }
			
 
				-            auto p = split<int>(argv[i], split_delim);
			
 
				+            auto p = string_split<int>(argv[i], split_delim);
			
 
				             params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
			
 
				         } else if (arg == "-ngl" || arg == "--n-gpu-layers") {
			
 
				             if (++i >= argc) {
			
 
				                 invalid_param = true;
			
 
				                 break;
			
 
				             }
			
 
				-            auto p = split<int>(argv[i], split_delim);
			
 
				+            auto p = string_split<int>(argv[i], split_delim);
			
 
				             params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
			
 
				         } else if (arg == "-rpc" || arg == "--rpc") {
			
 
				             if (++i >= argc) {
			
@@ -420,7 +406,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
 
				                 invalid_param = true;
			
 
				                 break;
			
 
				             }
			
 
				-            auto p = split<std::string>(argv[i], split_delim);
			
 
				+            auto p = string_split<std::string>(argv[i], split_delim);
			
 
				             std::vector<llama_split_mode> modes;
			
 
				             for (const auto & m : p) {
			
 
				                 llama_split_mode mode;
			
@@ -442,13 +428,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
 
				                 invalid_param = true;
			
 
				                 break;
			
 
				             }
			
 
				-            params.main_gpu = split<int>(argv[i], split_delim);
			
 
				+            params.main_gpu = string_split<int>(argv[i], split_delim);
			
 
				         } else if (arg == "-nkvo" || arg == "--no-kv-offload") {
			
 
				             if (++i >= argc) {
			
 
				                 invalid_param = true;
			
 
				                 break;
			
 
				             }
			
 
				-            auto p = split<bool>(argv[i], split_delim);
			
 
				+            auto p = string_split<bool>(argv[i], split_delim);
			
 
				             params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
			
 
				         } else if (arg == "--numa") {
			
 
				             if (++i >= argc) {
			
@@ -466,28 +452,28 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
 
				                 invalid_param = true;
			
 
				                 break;
			
 
				             }
			
 
				-            auto p = split<bool>(argv[i], split_delim);
			
 
				+            auto p = string_split<bool>(argv[i], split_delim);
			
 
				             params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end());
			
 
				         } else if (arg == "-mmp" || arg == "--mmap") {
			
 
				             if (++i >= argc) {
			
 
				                 invalid_param = true;
			
 
				                 break;
			
 
				             }
			
 
				-            auto p = split<bool>(argv[i], split_delim);
			
 
				+            auto p = string_split<bool>(argv[i], split_delim);
			
 
				             params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
			
 
				         } else if (arg == "-embd" || arg == "--embeddings") {
			
 
				             if (++i >= argc) {
			
 
				                 invalid_param = true;
			
 
				                 break;
			
 
				             }
			
 
				-            auto p = split<bool>(argv[i], split_delim);
			
 
				+            auto p = string_split<bool>(argv[i], split_delim);
			
 
				             params.embeddings.insert(params.embeddings.end(), p.begin(), p.end());
			
 
				         } else if (arg == "-ts" || arg == "--tensor-split") {
			
 
				             if (++i >= argc) {
			
 
				                 invalid_param = true;
			
 
				                 break;
			
 
				             }
			
 
				-            for (auto ts : split<std::string>(argv[i], split_delim)) {
			
 
				+            for (auto ts : string_split<std::string>(argv[i], split_delim)) {
			
 
				                 // split string by ; and /
			
 
				                 const std::regex regex{R"([;/]+)"};
			
 
				                 std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1};
			
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -112,9 +112,12 @@ struct llava_context {
 
				     struct llama_model * model = NULL;
			
 
				 };
			
 
				 
			
 
				-static void show_additional_info(int /*argc*/, char ** argv) {
			
 
				-    LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
			
 
				-    LOG_TEE("  note: a lower temperature value like 0.1 is recommended for better quality.\n");
			
 
				+static void print_usage(int argc, char ** argv, const gpt_params & params) {
			
 
				+    gpt_params_print_usage(argc, argv, params);
			
 
				+
			
 
				+    LOG_TEE("\n example usage:\n");
			
 
				+    LOG_TEE("\n     %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
			
 
				+    LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
			
 
				 }
			
 
				 
			
 
				 static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) {
			
@@ -278,7 +281,7 @@ int main(int argc, char ** argv) {
 
				     gpt_params params;
			
 
				 
			
 
				     if (!gpt_params_parse(argc, argv, params)) {
			
 
				-        show_additional_info(argc, argv);
			
 
				+        print_usage(argc, argv, params);
			
 
				         return 1;
			
 
				     }
			
 
				 
			
@@ -290,8 +293,7 @@ int main(int argc, char ** argv) {
 
				 #endif // LOG_DISABLE_LOGS
			
 
				 
			
 
				     if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
			
 
				-        gpt_params_print_usage(argc, argv, params);
			
 
				-        show_additional_info(argc, argv);
			
 
				+        print_usage(argc, argv, {});
			
 
				         return 1;
			
 
				     }
			
 
				     auto model = llava_init(&params);
			
--- a/examples/lookahead/lookahead.cpp
+++ b/examples/lookahead/lookahead.cpp
@@ -37,7 +37,8 @@ struct ngram_container {
 
				 int main(int argc, char ** argv) {
			
 
				     gpt_params params;
			
 
				 
			
 
				-    if (gpt_params_parse(argc, argv, params) == false) {
			
 
				+    if (!gpt_params_parse(argc, argv, params)) {
			
 
				+        gpt_params_print_usage(argc, argv, params);
			
 
				         return 1;
			
 
				     }
			
 
				 
			
--- a/examples/lookup/lookup-create.cpp
+++ b/examples/lookup/lookup-create.cpp
@@ -14,8 +14,10 @@ int main(int argc, char ** argv){
 
				     gpt_params params;
			
 
				 
			
 
				     if (!gpt_params_parse(argc, argv, params)) {
			
 
				+        gpt_params_print_usage(argc, argv, params);
			
 
				         return 1;
			
 
				     }
			
 
				+
			
 
				     // init llama.cpp
			
 
				     llama_backend_init();
			
 
				     llama_numa_init(params.numa);
			
--- a/examples/lookup/lookup-stats.cpp
+++ b/examples/lookup/lookup-stats.cpp
@@ -16,6 +16,7 @@ int main(int argc, char ** argv){
 
				     gpt_params params;
			
 
				 
			
 
				     if (!gpt_params_parse(argc, argv, params)) {
			
 
				+        gpt_params_print_usage(argc, argv, params);
			
 
				         return 1;
			
 
				     }
			
 
				 
			
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@@ -15,6 +15,7 @@ int main(int argc, char ** argv){
 
				     gpt_params params;
			
 
				 
			
 
				     if (!gpt_params_parse(argc, argv, params)) {
			
 
				+        gpt_params_print_usage(argc, argv, params);
			
 
				         return 1;
			
 
				     }
			
 
				 
			
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -53,13 +53,13 @@ The following command generates "infinite" text from a starting prompt (you can
 
				 #### Unix-based systems (Linux, macOS, etc.):
			
 
				 
			
 
				 ```bash
			
 
				-./main -m models/7B/ggml-model.bin --ignore-eos -n -1 --random-prompt
			
 
				+./main -m models/7B/ggml-model.bin --ignore-eos -n -1
			
 
				 ```
			
 
				 
			
 
				 #### Windows:
			
 
				 
			
 
				 ```powershell
			
 
				-main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 --random-prompt
			
 
				+main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1
			
 
				 ```
			
 
				 
			
 
				 ## Common Options
			
@@ -80,7 +80,6 @@ The `main` program provides several ways to interact with the LLaMA models using
 
				 -   `--prompt PROMPT`: Provide a prompt directly as a command-line option.
			
 
				 -   `--file FNAME`: Provide a file containing a prompt or multiple prompts.
			
 
				 -   `--interactive-first`: Run the program in interactive mode and wait for input right away. (More on this below.)
			
 
				--   `--random-prompt`: Start with a randomized prompt.
			
 
				 
			
 
				 ## Interaction
			
 
				 
			
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -122,8 +122,10 @@ int main(int argc, char ** argv) {
 
				     g_params = &params;
			
 
				 
			
 
				     if (!gpt_params_parse(argc, argv, params)) {
			
 
				+        gpt_params_print_usage(argc, argv, params);
			
 
				         return 1;
			
 
				     }
			
 
				+
			
 
				     llama_sampling_params & sparams = params.sparams;
			
 
				 
			
 
				 #ifndef LOG_DISABLE_LOGS
			
@@ -180,9 +182,6 @@ int main(int argc, char ** argv) {
 
				     LOG_TEE("%s: seed  = %u\n", __func__, params.seed);
			
 
				 
			
 
				     std::mt19937 rng(params.seed);
			
 
				-    if (params.random_prompt) {
			
 
				-        params.prompt = string_random_prompt(rng);
			
 
				-    }
			
 
				 
			
 
				     LOG("%s: llama backend init\n", __func__);
			
 
				     llama_backend_init();
			
@@ -250,11 +249,8 @@ int main(int argc, char ** argv) {
 
				 
			
 
				     std::vector<llama_token> embd_inp;
			
 
				 
			
 
				-    if (params.interactive_first || params.instruct || params.chatml || !params.prompt.empty() || session_tokens.empty()) {
			
 
				+    if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
			
 
				         LOG("tokenize the prompt\n");
			
 
				-        if (params.chatml) {
			
 
				-            params.prompt = "<|im_start|>system\n" + params.prompt + "<|im_end|>";
			
 
				-        }
			
 
				         embd_inp = ::llama_tokenize(ctx, params.prompt, true, true);
			
 
				     } else {
			
 
				         LOG("use session tokens\n");
			
@@ -332,37 +328,13 @@ int main(int argc, char ** argv) {
 
				     }
			
 
				 
			
 
				     // number of tokens to keep when resetting context
			
 
				-    if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct || params.chatml) {
			
 
				+    if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size()) {
			
 
				         params.n_keep = (int)embd_inp.size();
			
 
				     } else {
			
 
				         params.n_keep += add_bos; // always keep the BOS token
			
 
				     }
			
 
				 
			
 
				-    // prefix & suffix for instruct mode
			
 
				-    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true,  true);
			
 
				-    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n",    false, true);
			
 
				-
			
 
				-    LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
			
 
				-    LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
			
 
				-
			
 
				-    // chatml prefix & suffix
			
 
				-    const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", true, true);
			
 
				-    const auto cml_sfx = ::llama_tokenize(ctx, "<|im_end|>\n<|im_start|>assistant\n", false, true);
			
 
				-
			
 
				-    LOG("cml_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_pfx).c_str());
			
 
				-    LOG("cml_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_sfx).c_str());
			
 
				-
			
 
				-    // in instruct mode, we inject a prefix and a suffix to each input by the user
			
 
				-    if (params.instruct) {
			
 
				-        params.interactive_first = true;
			
 
				-        params.antiprompt.emplace_back("### Instruction:\n\n");
			
 
				-    }
			
 
				-    // similar for chatml mode
			
 
				-    else if (params.chatml) {
			
 
				-        params.interactive_first = true;
			
 
				-        params.antiprompt.emplace_back("<|im_start|>user\n");
			
 
				-    }
			
 
				-    else if (params.conversation) {
			
 
				+    if (params.conversation) {
			
 
				         params.interactive_first = true;
			
 
				     }
			
 
				 
			
@@ -823,15 +795,13 @@ int main(int argc, char ** argv) {
 
				 
			
 
				                     is_interacting = true;
			
 
				                     printf("\n");
			
 
				-                } else if (params.instruct || params.chatml) {
			
 
				-                    is_interacting = true;
			
 
				                 }
			
 
				             }
			
 
				 
			
 
				             if (n_past > 0 && is_interacting) {
			
 
				                 LOG("waiting for user input\n");
			
 
				 
			
 
				-                if (params.conversation || params.instruct || params.chatml) {
			
 
				+                if (params.conversation) {
			
 
				                     printf("\n> ");
			
 
				                 }
			
 
				 
			
@@ -874,24 +844,12 @@ int main(int argc, char ** argv) {
 
				 
			
 
				                     const size_t original_size = embd_inp.size();
			
 
				 
			
 
				-                    // instruct mode: insert instruction prefix
			
 
				-                    if (params.instruct && !is_antiprompt) {
			
 
				-                        LOG("inserting instruction prefix\n");
			
 
				-                        n_consumed = embd_inp.size();
			
 
				-                        embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
			
 
				-                    }
			
 
				-                    // chatml mode: insert user chat prefix
			
 
				-                    if (params.chatml && !is_antiprompt) {
			
 
				-                        LOG("inserting chatml prefix\n");
			
 
				-                        n_consumed = embd_inp.size();
			
 
				-                        embd_inp.insert(embd_inp.end(), cml_pfx.begin(), cml_pfx.end());
			
 
				-                    }
			
 
				                     if (params.escape) {
			
 
				                         string_process_escapes(buffer);
			
 
				                     }
			
 
				 
			
 
				                     const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
			
 
				-                    const auto line_inp = ::llama_tokenize(ctx, buffer,              false, params.interactive_specials);
			
 
				+                    const auto line_inp = ::llama_tokenize(ctx, buffer,              false, false);
			
 
				                     const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
			
 
				 
			
 
				                     LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
			
@@ -900,17 +858,6 @@ int main(int argc, char ** argv) {
 
				                     embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
			
 
				                     embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end());
			
 
				 
			
 
				-                    // instruct mode: insert response suffix
			
 
				-                    if (params.instruct) {
			
 
				-                        LOG("inserting instruction suffix\n");
			
 
				-                        embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
			
 
				-                    }
			
 
				-                    // chatml mode: insert assistant chat suffix
			
 
				-                    if (params.chatml) {
			
 
				-                        LOG("inserting chatml suffix\n");
			
 
				-                        embd_inp.insert(embd_inp.end(), cml_sfx.begin(), cml_sfx.end());
			
 
				-                    }
			
 
				-
			
 
				                     for (size_t i = original_size; i < embd_inp.size(); ++i) {
			
 
				                         const llama_token token = embd_inp[i];
			
 
				                         output_tokens.push_back(token);
			
@@ -935,7 +882,7 @@ int main(int argc, char ** argv) {
 
				         }
			
 
				 
			
 
				         // end of generation
			
 
				-        if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.instruct || params.interactive || params.chatml)) {
			
 
				+        if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.interactive)) {
			
 
				             LOG_TEE(" [end of text]\n");
			
 
				             break;
			
 
				         }
			
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -100,7 +100,8 @@ int main(int argc, char ** argv) {
 
				 
			
 
				     gpt_params params;
			
 
				 
			
 
				-    if (gpt_params_parse(argc, argv, params) == false) {
			
 
				+    if (!gpt_params_parse(argc, argv, params)) {
			
 
				+        gpt_params_print_usage(argc, argv, params);
			
 
				         return 1;
			
 
				     }
			
 
				 
			
--- a/examples/passkey/README.md
+++ b/examples/passkey/README.md
@@ -8,5 +8,5 @@ See the following PRs for more info:
 
				 ### Usage
			
 
				 
			
 
				 ```bash
			
 
				-make -j && ./passkey ./models/llama-7b-v2/ggml-model-f16.gguf 250
			
 
				+make -j && ./passkey -m ./models/llama-7b-v2/ggml-model-f16.gguf --junk 250
			
 
				 ```
			
--- a/examples/passkey/passkey.cpp
+++ b/examples/passkey/passkey.cpp
@@ -6,46 +6,32 @@
 
				 #include <string>
			
 
				 #include <vector>
			
 
				 
			
 
				-int main(int argc, char ** argv) {
			
 
				-    gpt_params params;
			
 
				-
			
 
				-    if (argc == 1 || argv[1][0] == '-') {
			
 
				-        printf("usage: %s MODEL_PATH N_JUNK N_GRP I_POS SEED\n" , argv[0]);
			
 
				-        return 1 ;
			
 
				-    }
			
 
				-
			
 
				-    int seed = -1;
			
 
				+static void print_usage(int argc, char ** argv, const gpt_params & params) {
			
 
				+    gpt_params_print_usage(argc, argv, params);
			
 
				 
			
 
				-    int n_junk = 250; // number of times to repeat the junk text
			
 
				-    int n_keep = 32;  // number of tokens in the prompt prefix
			
 
				-    int n_grp  = 1;   // if more than 1 - perform LongLM SelfExtend
			
 
				-    int i_pos  = -1;  // position of the passkey in the junk text
			
 
				-
			
 
				-    if (argc >= 2) {
			
 
				-        params.model = argv[1];
			
 
				-    }
			
 
				-
			
 
				-    if (argc >= 3) {
			
 
				-        n_junk = std::stoi(argv[2]);
			
 
				-    }
			
 
				+    LOG_TEE("\nexample usage:\n");
			
 
				+    LOG_TEE("\n    %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]);
			
 
				+    LOG_TEE("\n");
			
 
				+}
			
 
				 
			
 
				-    if (argc >= 4) {
			
 
				-        n_grp = std::stoi(argv[3]);
			
 
				-    }
			
 
				+int main(int argc, char ** argv) {
			
 
				+    gpt_params params;
			
 
				 
			
 
				-    if (argc >= 5) {
			
 
				-        i_pos = std::stoi(argv[4]);
			
 
				-    }
			
 
				+    params.n_junk = 250;
			
 
				+    params.n_keep = 32;
			
 
				+    params.i_pos  = -1;
			
 
				 
			
 
				-    if (argc >= 6) {
			
 
				-        seed = std::stoi(argv[5]);
			
 
				+    if (!gpt_params_parse(argc, argv, params)) {
			
 
				+        print_usage(argc, argv, params);
			
 
				+        return 1;
			
 
				     }
			
 
				 
			
 
				-    if (seed == -1) {
			
 
				-        seed = time(NULL);
			
 
				-    }
			
 
				+    srand(params.seed == LLAMA_DEFAULT_SEED ? time(NULL) : params.seed);
			
 
				 
			
 
				-    srand(seed);
			
 
				+    int n_junk = params.n_junk;
			
 
				+    int n_keep = params.n_keep;
			
 
				+    int n_grp  = params.grp_attn_n;
			
 
				+    int i_pos  = params.i_pos;
			
 
				 
			
 
				     if (i_pos == -1) {
			
 
				         i_pos = rand() % n_junk;
			
@@ -76,9 +62,7 @@ int main(int argc, char ** argv) {
 
				 
			
 
				     // initialize the model
			
 
				 
			
 
				-    llama_model_params model_params = llama_model_default_params();
			
 
				-
			
 
				-    model_params.n_gpu_layers = 99; // offload all layers to the GPU
			
 
				+    llama_model_params model_params = llama_model_params_from_gpt_params(params);
			
 
				 
			
 
				     llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
			
 
				 
			
@@ -89,13 +73,9 @@ int main(int argc, char ** argv) {
 
				 
			
 
				     // initialize the context
			
 
				 
			
 
				-    llama_context_params ctx_params = llama_context_default_params();
			
 
				+    llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
			
 
				 
			
 
				-    ctx_params.seed    = seed;
			
 
				-    ctx_params.n_ctx   = llama_n_ctx_train(model)*n_grp + n_keep;
			
 
				-    ctx_params.n_batch = 512;
			
 
				-    ctx_params.n_threads       = params.n_threads;
			
 
				-    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
			
 
				+    ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep;
			
 
				 
			
 
				     GGML_ASSERT(ctx_params.n_batch % n_grp == 0 && "n_batch must be divisible by n_grp");
			
 
				 
			
@@ -135,7 +115,7 @@ int main(int argc, char ** argv) {
 
				     LOG_TEE("prompt tokens: %d\n", n_tokens_all);
			
 
				     //LOG_TEE("prompt: %s\n", params.prompt.c_str());
			
 
				 
			
 
				-    llama_batch batch = llama_batch_init(512, 0, 1);
			
 
				+    llama_batch batch = llama_batch_init(params.n_batch, 0, 1);
			
 
				 
			
 
				     int n_past = 0;
			
 
				 
			
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -1032,7 +1032,7 @@ struct winogrande_entry {
 
				     std::vector<llama_token> seq_tokens[2];
			
 
				 };
			
 
				 
			
 
				-static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string& prompt) {
			
 
				+static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string & prompt) {
			
 
				     std::vector<winogrande_entry> result;
			
 
				     std::istringstream in(prompt);
			
 
				     std::string line;
			
@@ -1964,12 +1964,14 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
 
				 int main(int argc, char ** argv) {
			
 
				     gpt_params params;
			
 
				 
			
 
				+    params.n_ctx = 512;
			
 
				+    params.logits_all = true;
			
 
				+
			
 
				     if (!gpt_params_parse(argc, argv, params)) {
			
 
				+        gpt_params_print_usage(argc, argv, params);
			
 
				         return 1;
			
 
				     }
			
 
				 
			
 
				-    params.logits_all = true;
			
 
				-
			
 
				     const int32_t n_ctx = params.n_ctx;
			
 
				 
			
 
				     if (n_ctx <= 0) {
			
@@ -2006,9 +2008,6 @@ int main(int argc, char ** argv) {
 
				     fprintf(stderr, "%s: seed  = %u\n", __func__, params.seed);
			
 
				 
			
 
				     std::mt19937 rng(params.seed);
			
 
				-    if (params.random_prompt) {
			
 
				-        params.prompt = string_random_prompt(rng);
			
 
				-    }
			
 
				 
			
 
				     llama_backend_init();
			
 
				     llama_numa_init(params.numa);
			
@@ -2027,6 +2026,7 @@ int main(int argc, char ** argv) {
 
				     }
			
 
				 
			
 
				     const int n_ctx_train = llama_n_ctx_train(model);
			
 
				+
			
 
				     if (params.n_ctx > n_ctx_train) {
			
 
				         fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
			
 
				                 __func__, n_ctx_train, params.n_ctx);
			
--- a/examples/quantize/tests.sh
+++ b/examples/quantize/tests.sh
@@ -47,7 +47,7 @@ echo PASS
 
				 echo
			
 
				 
			
 
				 # 3a. Test the requanted model is loading properly
			
 
				-$MAIN --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --random-prompt --n-predict 32
			
 
				+$MAIN --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --n-predict 32
			
 
				 echo PASS
			
 
				 echo
			
 
				 
			
@@ -57,7 +57,7 @@ echo PASS
 
				 echo
			
 
				 
			
 
				 # 4b. Test the requanted model is loading properly
			
 
				-$MAIN --model $WORK_PATH/ggml-model-requant-merge.gguf --random-prompt --n-predict 32
			
 
				+$MAIN --model $WORK_PATH/ggml-model-requant-merge.gguf --n-predict 32
			
 
				 echo PASS
			
 
				 echo
			
 
				 
			
--- a/examples/retrieval/retrieval.cpp
+++ b/examples/retrieval/retrieval.cpp
@@ -4,72 +4,12 @@
 
				 #include <algorithm>
			
 
				 #include <fstream>
			
 
				 
			
 
				-struct retrieval_params {
			
 
				-    std::vector<std::string> context_files; // context files to embed
			
 
				-    int32_t chunk_size            = 64;     // chunk size for context embedding
			
 
				-    std::string chunk_separator   = "\n";   // chunk separator for context embedding
			
 
				-};
			
 
				+static void print_usage(int argc, char ** argv, const gpt_params & params) {
			
 
				+    gpt_params_print_usage(argc, argv, params);
			
 
				 
			
 
				-static void retrieval_params_print_usage(int argc, char ** argv, gpt_params & gpt_params, retrieval_params & params) {
			
 
				-    gpt_params_print_usage(argc, argv, gpt_params);
			
 
				-    printf("retrieval options:\n");
			
 
				-    printf("  --context-file FNAME  file containing context to embed.\n");
			
 
				-    printf("                        specify multiple files by providing --context-file option multiple times.\n");
			
 
				-    printf("  --chunk-size N        minimum length of embedded text chunk (default:%d)\n", params.chunk_size);
			
 
				-    printf("  --chunk-separator STRING\n");
			
 
				-    printf("                        string to separate chunks (default: \"\\n\")\n");
			
 
				-    printf("\n");
			
 
				-}
			
 
				-
			
 
				-static void retrieval_params_parse(int argc, char ** argv, gpt_params & gpt_params, retrieval_params & retrieval_params) {
			
 
				-    int i = 1;
			
 
				-    std::string arg;
			
 
				-    while (i < argc) {
			
 
				-        arg = argv[i];
			
 
				-        bool invalid_gpt_param = false;
			
 
				-        if(gpt_params_find_arg(argc, argv, argv[i], gpt_params, i, invalid_gpt_param)) {
			
 
				-            if (invalid_gpt_param) {
			
 
				-                fprintf(stderr, "error: invalid argument: %s\n", arg.c_str());
			
 
				-                retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
			
 
				-                exit(1);
			
 
				-            }
			
 
				-            // option was parsed by gpt_params_find_arg
			
 
				-        } else if (arg == "--context-file") {
			
 
				-            if (++i >= argc) {
			
 
				-                fprintf(stderr, "error: missing argument for --context-file\n");
			
 
				-                retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
			
 
				-                exit(1);
			
 
				-            }
			
 
				-            std::ifstream file(argv[i]);
			
 
				-            if (!file) {
			
 
				-                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
			
 
				-                retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
			
 
				-                exit(1);
			
 
				-            }
			
 
				-            // store the external file name in params
			
 
				-            retrieval_params.context_files.push_back(argv[i]);
			
 
				-        } else if (arg == "--chunk-size") {
			
 
				-            if (++i >= argc) {
			
 
				-                fprintf(stderr, "error: missing argument for --chunk-size\n");
			
 
				-                retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
			
 
				-                exit(1);
			
 
				-            }
			
 
				-            retrieval_params.chunk_size = std::stoi(argv[i]);
			
 
				-        } else if (arg == "--chunk-separator") {
			
 
				-            if (++i >= argc) {
			
 
				-                fprintf(stderr, "error: missing argument for --chunk-separator\n");
			
 
				-                retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
			
 
				-                exit(1);
			
 
				-            }
			
 
				-            retrieval_params.chunk_separator = argv[i];
			
 
				-        } else {
			
 
				-            // unknown argument
			
 
				-            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
			
 
				-            retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
			
 
				-            exit(1);
			
 
				-        }
			
 
				-        i++;
			
 
				-    }
			
 
				+    LOG_TEE("\nexample usage:\n");
			
 
				+    LOG_TEE("\n    %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]);
			
 
				+    LOG_TEE("\n");
			
 
				 }
			
 
				 
			
 
				 struct chunk {
			
@@ -171,33 +111,35 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
 
				 
			
 
				 int main(int argc, char ** argv) {
			
 
				     gpt_params params;
			
 
				-    retrieval_params retrieval_params;
			
 
				 
			
 
				-    retrieval_params_parse(argc, argv, params, retrieval_params);
			
 
				+    if (!gpt_params_parse(argc, argv, params)) {
			
 
				+        print_usage(argc, argv, params);
			
 
				+        return 1;
			
 
				+    }
			
 
				 
			
 
				     // For BERT models, batch size must be equal to ubatch size
			
 
				     params.n_ubatch = params.n_batch;
			
 
				+    params.embedding = true;
			
 
				 
			
 
				-    if (retrieval_params.chunk_size <= 0) {
			
 
				+    if (params.chunk_size <= 0) {
			
 
				         fprintf(stderr, "chunk_size must be positive\n");
			
 
				         return 1;
			
 
				     }
			
 
				-    if (retrieval_params.context_files.empty()) {
			
 
				+    if (params.context_files.empty()) {
			
 
				         fprintf(stderr, "context_files must be specified\n");
			
 
				         return 1;
			
 
				     }
			
 
				-    params.embedding = true;
			
 
				 
			
 
				     print_build_info();
			
 
				 
			
 
				     printf("processing files:\n");
			
 
				-    for (auto & context_file : retrieval_params.context_files) {
			
 
				+    for (auto & context_file : params.context_files) {
			
 
				         printf("%s\n", context_file.c_str());
			
 
				     }
			
 
				 
			
 
				     std::vector<chunk> chunks;
			
 
				-    for (auto & context_file : retrieval_params.context_files) {
			
 
				-        std::vector<chunk> file_chunk = chunk_file(context_file, retrieval_params.chunk_size, retrieval_params.chunk_separator);
			
 
				+    for (auto & context_file : params.context_files) {
			
 
				+        std::vector<chunk> file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator);
			
 
				         chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end());
			
 
				     }
			
 
				     printf("Number of chunks: %ld\n", chunks.size());
			
@@ -242,7 +184,7 @@ int main(int argc, char ** argv) {
 
				             return 1;
			
 
				         }
			
 
				         // add eos if not present
			
 
				-        if (inp.empty() || inp.back() != llama_token_eos(model)) {
			
 
				+        if (llama_token_eos(model) >= 0 && (inp.empty() || inp.back() != llama_token_eos(model))) {
			
 
				             inp.push_back(llama_token_eos(model));
			
 
				         }
			
 
				         chunk.tokens = inp;
			
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -11,6 +11,7 @@ int main(int argc, char ** argv) {
 
				     params.prompt = "The quick brown fox";
			
 
				 
			
 
				     if (!gpt_params_parse(argc, argv, params)) {
			
 
				+        gpt_params_print_usage(argc, argv, params);
			
 
				         return 1;
			
 
				     }
			
 
				 
			
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -123,29 +123,6 @@ struct slot_params {
 
				     json input_suffix;
			
 
				 };
			
 
				 
			
 
				-struct server_params {
			
 
				-    int32_t port           = 8080;
			
 
				-    int32_t read_timeout   = 600;
			
 
				-    int32_t write_timeout  = 600;
			
 
				-    int32_t n_threads_http = -1;
			
 
				-
			
 
				-    std::string hostname      = "127.0.0.1";
			
 
				-    std::string public_path   = "";
			
 
				-    std::string chat_template = "";
			
 
				-    std::string system_prompt = "";
			
 
				-
			
 
				-    std::vector<std::string> api_keys;
			
 
				-
			
 
				-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
			
 
				-    std::string ssl_key_file = "";
			
 
				-    std::string ssl_cert_file = "";
			
 
				-#endif
			
 
				-
			
 
				-    bool slots_endpoint   = true;
			
 
				-    bool metrics_endpoint = false;
			
 
				-    std::string slot_save_path;
			
 
				-};
			
 
				-
			
 
				 struct server_slot {
			
 
				     int id;
			
 
				     int id_task = -1;
			
@@ -1261,7 +1238,7 @@ struct server_context {
 
				     }
			
 
				 
			
 
				     json get_formated_generation(const server_slot & slot) const {
			
 
				-        const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
			
 
				+        const auto eos_bias   =             slot.sparams.logit_bias.find(llama_token_eos(model));
			
 
				         const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() && eos_bias->second < 0.0f && std::isinf(eos_bias->second);
			
 
				 
			
 
				         std::vector<std::string> samplers_sequence;
			
@@ -2334,561 +2311,6 @@ struct server_context {
 
				     }
			
 
				 };
			
 
				 
			
 
				-static void server_print_usage(const char * argv0, const gpt_params & params, const server_params & sparams) {
			
 
				-    printf("usage: %s [options]\n", argv0);
			
 
				-    printf("\n");
			
 
				-    printf("options:\n");
			
 
				-    printf("  -h, --help                show this help message and exit\n");
			
 
				-    printf("  -v, --verbose             verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
			
 
				-    printf("  -t N, --threads N         number of threads to use during computation (default: %d)\n", params.n_threads);
			
 
				-    printf("  -tb N, --threads-batch N  number of threads to use during batch and prompt processing (default: same as --threads)\n");
			
 
				-    printf("  --threads-http N          number of threads in the http server pool to process requests (default: max(hardware concurrency - 1, --parallel N + 2))\n");
			
 
				-    printf("  -c N, --ctx-size N        size of the prompt context (default: %d)\n", params.n_ctx);
			
 
				-    printf("  --rope-scaling {none,linear,yarn}\n");
			
 
				-    printf("                            RoPE frequency scaling method, defaults to linear unless specified by the model\n");
			
 
				-    printf("  --rope-freq-base N        RoPE base frequency (default: loaded from model)\n");
			
 
				-    printf("  --rope-freq-scale N       RoPE frequency scaling factor, expands context by a factor of 1/N\n");
			
 
				-    printf("  --yarn-ext-factor N       YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n");
			
 
				-    printf("  --yarn-attn-factor N      YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n");
			
 
				-    printf("  --yarn-beta-slow N        YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow);
			
 
				-    printf("  --yarn-beta-fast N        YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
			
 
				-    printf("  --pooling {none,mean,cls} pooling type for embeddings, use model default if unspecified\n");
			
 
				-    printf("  -dt N, --defrag-thold N\n");
			
 
				-    printf("                            KV cache defragmentation threshold (default: %.1f, < 0 - disabled)\n", params.defrag_thold);
			
 
				-    printf("  -b N, --batch-size N      logical maximum batch size (default: %d)\n", params.n_batch);
			
 
				-    printf("  -ub N, --ubatch-size N    physical maximum batch size (default: %d)\n", params.n_ubatch);
			
 
				-    if (llama_supports_mlock()) {
			
 
				-        printf("  --mlock                   force system to keep model in RAM rather than swapping or compressing\n");
			
 
				-    }
			
 
				-    if (llama_supports_mmap()) {
			
 
				-        printf("  --no-mmap                 do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
			
 
				-    }
			
 
				-    printf("  --numa TYPE               attempt optimizations that help on some NUMA systems\n");
			
 
				-    printf("                              - distribute: spread execution evenly over all nodes\n");
			
 
				-    printf("                              - isolate: only spawn threads on CPUs on the node that execution started on\n");
			
 
				-    printf("                              - numactl: use the CPU map provided my numactl\n");
			
 
				-    if (llama_supports_gpu_offload()) {
			
 
				-        printf("  -ngl N, --n-gpu-layers N\n");
			
 
				-        printf("                            number of layers to store in VRAM\n");
			
 
				-        printf("  -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
			
 
				-        printf("                            how to split the model across multiple GPUs, one of:\n");
			
 
				-        printf("                              - none: use one GPU only\n");
			
 
				-        printf("                              - layer (default): split layers and KV across GPUs\n");
			
 
				-        printf("                              - row: split rows across GPUs\n");
			
 
				-        printf("  -ts SPLIT --tensor-split SPLIT\n");
			
 
				-        printf("                            fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
			
 
				-        printf("  -mg i, --main-gpu i       the GPU to use for the model (with split-mode = none),\n");
			
 
				-        printf("                            or for intermediate results and KV (with split-mode = row)\n");
			
 
				-        printf("  -nkvo, --no-kv-offload\n");
			
 
				-        printf("                            disable KV offload\n");
			
 
				-    }
			
 
				-    printf("  -m FNAME, --model FNAME\n");
			
 
				-    printf("                            model path (default: models/$filename with filename from --hf-file or --model-url if set, otherwise %s)\n", DEFAULT_MODEL_PATH);
			
 
				-    printf("  -mu MODEL_URL, --model-url MODEL_URL\n");
			
 
				-    printf("                            model download url (default: unused)\n");
			
 
				-    printf("  -hfr REPO, --hf-repo REPO\n");
			
 
				-    printf("                            Hugging Face model repository (default: unused)\n");
			
 
				-    printf("  -hff FILE, --hf-file FILE\n");
			
 
				-    printf("                            Hugging Face model file (default: unused)\n");
			
 
				-    printf("  -a ALIAS, --alias ALIAS\n");
			
 
				-    printf("                            set an alias for the model, will be added as `model` field in completion response\n");
			
 
				-    printf("  --lora FNAME              apply LoRA adapter (implies --no-mmap)\n");
			
 
				-    printf("  --lora-base FNAME         optional model to use as a base for the layers modified by the LoRA adapter\n");
			
 
				-    printf("  --host                    ip address to listen (default  (default: %s)\n", sparams.hostname.c_str());
			
 
				-    printf("  --port PORT               port to listen (default  (default: %d)\n", sparams.port);
			
 
				-    printf("  --rpc SERVERS             comma separated list of RPC servers\n");
			
 
				-    printf("  --path PUBLIC_PATH        path from which to serve static files (default: disabled)\n");
			
 
				-    printf("  --api-key API_KEY         optional api key to enhance server security. If set, requests must include this key for access.\n");
			
 
				-    printf("  --api-key-file FNAME      path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access.\n");
			
 
				-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
			
 
				-    printf("  --ssl-key-file FNAME      path to file a PEM-encoded SSL private key\n");
			
 
				-    printf("  --ssl-cert-file FNAME     path to file a PEM-encoded SSL certificate\n");
			
 
				-#endif
			
 
				-    printf("  -to N, --timeout N        server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
			
 
				-    printf("  --embeddings              enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
			
 
				-    printf("  -np N, --parallel N       number of slots for process requests (default: %d)\n", params.n_parallel);
			
 
				-    printf("  -cb, --cont-batching      enable continuous batching (a.k.a dynamic batching) (default: enabled)\n");
			
 
				-    printf("  -fa, --flash-attn         enable Flash Attention (default: %s)\n", params.flash_attn ? "enabled" : "disabled");
			
 
				-    printf("  -spf FNAME, --system-prompt-file FNAME\n");
			
 
				-    printf("                            set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
			
 
				-    printf("  -ctk TYPE, --cache-type-k TYPE\n");
			
 
				-    printf("                            KV cache data type for K (default: f16)\n");
			
 
				-    printf("  -ctv TYPE, --cache-type-v TYPE\n");
			
 
				-    printf("                            KV cache data type for V (default: f16)\n");
			
 
				-    printf("  --log-format              log output format: json or text (default: json)\n");
			
 
				-    printf("  --log-disable             disables logging to a file.\n");
			
 
				-    printf("  --slots-endpoint-disable  disables slots monitoring endpoint.\n");
			
 
				-    printf("  --metrics                 enable prometheus compatible metrics endpoint (default: %s).\n", sparams.metrics_endpoint ? "enabled" : "disabled");
			
 
				-    printf("  --slot-save-path PATH     path to save slot kv cache (default: disabled)\n");
			
 
				-    printf("\n");
			
 
				-    printf("  -n, --n-predict           maximum tokens to predict (default: %d)\n", params.n_predict);
			
 
				-    printf("  --override-kv KEY=TYPE:VALUE\n");
			
 
				-    printf("                            advanced option to override model metadata by key. may be specified multiple times.\n");
			
 
				-    printf("                            types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
			
 
				-    printf("  -gan N, --grp-attn-n N    set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`\n");
			
 
				-    printf("  -gaw N, --grp-attn-w N    set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`\n");
			
 
				-    printf("  --chat-template JINJA_TEMPLATE\n");
			
 
				-    printf("                            set custom jinja chat template (default: template taken from model's metadata)\n");
			
 
				-    printf("                            only commonly used templates are accepted:\n");
			
 
				-    printf("                            https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template\n");
			
 
				-    printf("\n");
			
 
				-}
			
 
				-
			
 
				-static void server_params_parse(int argc, char ** argv, server_params & sparams, gpt_params & params) {
			
 
				-    gpt_params    default_params;
			
 
				-    server_params default_sparams;
			
 
				-
			
 
				-    std::string arg;
			
 
				-    bool invalid_param = false;
			
 
				-
			
 
				-    for (int i = 1; i < argc; i++) {
			
 
				-        arg = argv[i];
			
 
				-        if (arg == "--port") {
			
 
				-            if (++i >= argc) {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-            sparams.port = std::stoi(argv[i]);
			
 
				-        } else if (arg == "--rpc") {
			
 
				-            if (++i >= argc) {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-            params.rpc_servers = argv[i];
			
 
				-        } else if (arg == "--host") {
			
 
				-            if (++i >= argc) {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-            sparams.hostname = argv[i];
			
 
				-        } else if (arg == "--path") {
			
 
				-            if (++i >= argc) {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-            sparams.public_path = argv[i];
			
 
				-        } else if (arg == "--api-key") {
			
 
				-            if (++i >= argc) {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-            sparams.api_keys.push_back(argv[i]);
			
 
				-        } else if (arg == "--api-key-file") {
			
 
				-            if (++i >= argc) {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-            std::ifstream key_file(argv[i]);
			
 
				-            if (!key_file) {
			
 
				-                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-            std::string key;
			
 
				-            while (std::getline(key_file, key)) {
			
 
				-               if (key.size() > 0) {
			
 
				-                   sparams.api_keys.push_back(key);
			
 
				-               }
			
 
				-            }
			
 
				-            key_file.close();
			
 
				-
			
 
				-        }
			
 
				-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
			
 
				-        else if (arg == "--ssl-key-file") {
			
 
				-            if (++i >= argc) {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-            sparams.ssl_key_file = argv[i];
			
 
				-        } else if (arg == "--ssl-cert-file") {
			
 
				-            if (++i >= argc) {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-            sparams.ssl_cert_file = argv[i];
			
 
				-        }
			
 
				-#endif
			
 
				-        else if (arg == "--timeout" || arg == "-to") {
			
 
				-            if (++i >= argc) {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-            sparams.read_timeout = std::stoi(argv[i]);
			
 
				-            sparams.write_timeout = std::stoi(argv[i]);
			
 
				-        } else if (arg == "-m" || arg == "--model") {
			
 
				-            if (++i >= argc) {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-            params.model = argv[i];
			
 
				-        } else if (arg == "-mu" || arg == "--model-url") {
			
 
				-            if (++i >= argc) {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-            params.model_url = argv[i];
			
 
				-        } else if (arg == "-hfr" || arg == "--hf-repo") {
			
 
				-            if (++i >= argc) {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-            params.hf_repo = argv[i];
			
 
				-        } else if (arg == "-hff" || arg == "--hf-file") {
			
 
				-            if (++i >= argc) {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-            params.hf_file = argv[i];
			
 
				-        } else if (arg == "-a" || arg == "--alias") {
			
 
				-            if (++i >= argc) {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-            params.model_alias = argv[i];
			
 
				-        } else if (arg == "-h" || arg == "--help") {
			
 
				-            server_print_usage(argv[0], default_params, default_sparams);
			
 
				-            exit(0);
			
 
				-        } else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size") {
			
 
				-            if (++i >= argc) {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-            params.n_ctx = std::stoi(argv[i]);
			
 
				-        } else if (arg == "--rope-scaling") {
			
 
				-            if (++i >= argc) {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-            std::string value(argv[i]);
			
 
				-            /**/ if (value == "none")   { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
			
 
				-            else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
			
 
				-            else if (value == "yarn")   { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
			
 
				-            else { invalid_param = true; break; }
			
 
				-        } else if (arg == "--rope-freq-base") {
			
 
				-            if (++i >= argc) {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-            params.rope_freq_base = std::stof(argv[i]);
			
 
				-        } else if (arg == "--rope-freq-scale") {
			
 
				-            if (++i >= argc) {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-            params.rope_freq_scale = std::stof(argv[i]);
			
 
				-        } else if (arg == "--yarn-ext-factor") {
			
 
				-            if (++i >= argc) {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-            params.yarn_ext_factor = std::stof(argv[i]);
			
 
				-        }
			
 
				-        else if (arg == "--yarn-attn-factor") {
			
 
				-            if (++i >= argc) {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-            params.yarn_attn_factor = std::stof(argv[i]);
			
 
				-        } else if (arg == "--yarn-beta-fast") {
			
 
				-            if (++i >= argc) {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-            params.yarn_beta_fast = std::stof(argv[i]);
			
 
				-        } else if (arg == "--yarn-beta-slow") {
			
 
				-            if (++i >= argc) {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-            params.yarn_beta_slow = std::stof(argv[i]);
			
 
				-        } else if (arg == "--pooling") {
			
 
				-            if (++i >= argc) {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-            std::string value(argv[i]);
			
 
				-            /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
			
 
				-            else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
			
 
				-            else if (value == "cls")  { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
			
 
				-            else { invalid_param = true; break; }
			
 
				-        } else if (arg == "--defrag-thold" || arg == "-dt") {
			
 
				-            if (++i >= argc) {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-            params.defrag_thold = std::stof(argv[i]);
			
 
				-        } else if (arg == "--threads" || arg == "-t") {
			
 
				-            if (++i >= argc)
			
 
				-            {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-            params.n_threads = std::stoi(argv[i]);
			
 
				-        } else if (arg == "--grp-attn-n" || arg == "-gan") {
			
 
				-            if (++i >= argc) {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-
			
 
				-            params.grp_attn_n = std::stoi(argv[i]);
			
 
				-        } else if (arg == "--grp-attn-w" || arg == "-gaw") {
			
 
				-            if (++i >= argc) {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-
			
 
				-            params.grp_attn_w = std::stoi(argv[i]);
			
 
				-        } else if (arg == "--threads-batch" || arg == "-tb") {
			
 
				-            if (++i >= argc) {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-            params.n_threads_batch = std::stoi(argv[i]);
			
 
				-        } else if (arg == "--threads-http") {
			
 
				-            if (++i >= argc) {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-            sparams.n_threads_http = std::stoi(argv[i]);
			
 
				-        } else if (arg == "-b" || arg == "--batch-size") {
			
 
				-            if (++i >= argc) {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-            params.n_batch = std::stoi(argv[i]);
			
 
				-        } else if (arg == "-ub" || arg == "--ubatch-size") {
			
 
				-            if (++i >= argc) {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-            params.n_ubatch = std::stoi(argv[i]);
			
 
				-        } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
			
 
				-            if (++i >= argc) {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-            if (llama_supports_gpu_offload()) {
			
 
				-                params.n_gpu_layers = std::stoi(argv[i]);
			
 
				-            } else {
			
 
				-                LOG_WARNING(
			
 
				-                    "Not compiled with GPU offload support, --n-gpu-layers option will be ignored. "
			
 
				-                    "See main README.md for information on enabling GPU BLAS support",
			
 
				-                    {{"n_gpu_layers", params.n_gpu_layers}});
			
 
				-            }
			
 
				-        } else if (arg == "-nkvo" || arg == "--no-kv-offload") {
			
 
				-            params.no_kv_offload = true;
			
 
				-        } else if (arg == "--split-mode" || arg == "-sm") {
			
 
				-            if (++i >= argc) {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-            std::string arg_next = argv[i];
			
 
				-            if (arg_next == "none") {
			
 
				-                params.split_mode = LLAMA_SPLIT_MODE_NONE;
			
 
				-            } else if (arg_next == "layer") {
			
 
				-                params.split_mode = LLAMA_SPLIT_MODE_LAYER;
			
 
				-            } else if (arg_next == "row") {
			
 
				-                params.split_mode = LLAMA_SPLIT_MODE_ROW;
			
 
				-            } else {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-#ifndef GGML_USE_CUDA
			
 
				-            fprintf(stderr, "warning: llama.cpp was compiled without CUDA. Setting the split mode has no effect.\n");
			
 
				-#endif // GGML_USE_CUDA
			
 
				-        } else if (arg == "--tensor-split" || arg == "-ts") {
			
 
				-            if (++i >= argc) {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
			
 
				-            std::string arg_next = argv[i];
			
 
				-
			
 
				-            // split string by , and /
			
 
				-            const std::regex regex{R"([,/]+)"};
			
 
				-            std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
			
 
				-            std::vector<std::string> split_arg{it, {}};
			
 
				-            GGML_ASSERT(split_arg.size() <= llama_max_devices());
			
 
				-
			
 
				-            for (size_t i_device = 0; i_device < llama_max_devices(); ++i_device) {
			
 
				-                if (i_device < split_arg.size()) {
			
 
				-                    params.tensor_split[i_device] = std::stof(split_arg[i_device]);
			
 
				-                } else {
			
 
				-                    params.tensor_split[i_device] = 0.0f;
			
 
				-                }
			
 
				-            }
			
 
				-#else
			
 
				-            LOG_WARNING("llama.cpp was compiled without CUDA. It is not possible to set a tensor split.\n", {});
			
 
				-#endif // GGML_USE_CUDA
			
 
				-        } else if (arg == "--main-gpu" || arg == "-mg") {
			
 
				-            if (++i >= argc) {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
			
 
				-            params.main_gpu = std::stoi(argv[i]);
			
 
				-#else
			
 
				-            LOG_WARNING("llama.cpp was compiled without CUDA. It is not possible to set a main GPU.", {});
			
 
				-#endif
			
 
				-        } else if (arg == "--lora") {
			
 
				-            if (++i >= argc) {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-            params.lora_adapter.emplace_back(argv[i], 1.0f);
			
 
				-            params.use_mmap = false;
			
 
				-        } else if (arg == "--lora-scaled") {
			
 
				-            if (++i >= argc) {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-            const char * lora_adapter = argv[i];
			
 
				-            if (++i >= argc) {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-            params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
			
 
				-            params.use_mmap = false;
			
 
				-        } else if (arg == "--lora-base") {
			
 
				-            if (++i >= argc) {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-            params.lora_base = argv[i];
			
 
				-        } else if (arg == "-v" || arg == "--verbose") {
			
 
				-#if SERVER_VERBOSE != 1
			
 
				-            LOG_WARNING("server.cpp is not built with verbose logging.", {});
			
 
				-#else
			
 
				-            server_verbose = true;
			
 
				-#endif
			
 
				-        } else if (arg == "--mlock") {
			
 
				-            params.use_mlock = true;
			
 
				-        } else if (arg == "--no-mmap") {
			
 
				-            params.use_mmap = false;
			
 
				-        } else if (arg == "--numa") {
			
 
				-            if (++i >= argc) {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            } else {
			
 
				-                std::string value(argv[i]);
			
 
				-                /**/ if (value == "distribute" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
			
 
				-                else if (value == "isolate")                    { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
			
 
				-                else if (value == "numactl")                    { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
			
 
				-                else { invalid_param = true; break; }
			
 
				-            }
			
 
				-        } else if (arg == "--embedding" || arg == "--embeddings") {
			
 
				-            params.embedding = true;
			
 
				-        } else if (arg == "-cb" || arg == "--cont-batching") {
			
 
				-            params.cont_batching = true;
			
 
				-        } else if (arg == "-fa" || arg == "--flash-attn") {
			
 
				-            params.flash_attn = true;
			
 
				-        } else if (arg == "-np" || arg == "--parallel") {
			
 
				-            if (++i >= argc) {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-            params.n_parallel = std::stoi(argv[i]);
			
 
				-        } else if (arg == "-n" || arg == "--n-predict") {
			
 
				-            if (++i >= argc) {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-            params.n_predict = std::stoi(argv[i]);
			
 
				-        } else if (arg == "-spf" || arg == "--system-prompt-file") {
			
 
				-            if (++i >= argc) {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-            std::ifstream file(argv[i]);
			
 
				-            if (!file) {
			
 
				-                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-            std::string system_prompt;
			
 
				-            std::copy(
			
 
				-                std::istreambuf_iterator<char>(file),
			
 
				-                std::istreambuf_iterator<char>(),
			
 
				-                std::back_inserter(system_prompt)
			
 
				-            );
			
 
				-            sparams.system_prompt = system_prompt;
			
 
				-        } else if (arg == "-ctk" || arg == "--cache-type-k") {
			
 
				-            params.cache_type_k = argv[++i];
			
 
				-        } else if (arg == "-ctv" || arg == "--cache-type-v") {
			
 
				-            params.cache_type_v = argv[++i];
			
 
				-        } else if (arg == "--log-format") {
			
 
				-            if (++i >= argc) {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-            if (std::strcmp(argv[i], "json") == 0) {
			
 
				-                server_log_json = true;
			
 
				-            } else if (std::strcmp(argv[i], "text") == 0) {
			
 
				-                server_log_json = false;
			
 
				-            } else {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-        } else if (arg == "--log-disable") {
			
 
				-            log_set_target(stdout);
			
 
				-            LOG_INFO("logging to file is disabled.", {});
			
 
				-        } else if (arg == "--slots-endpoint-disable") {
			
 
				-            sparams.slots_endpoint = false;
			
 
				-        } else if (arg == "--metrics") {
			
 
				-            sparams.metrics_endpoint = true;
			
 
				-        } else if (arg == "--slot-save-path") {
			
 
				-            if (++i >= argc) {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-            sparams.slot_save_path = argv[i];
			
 
				-            // if doesn't end with DIRECTORY_SEPARATOR, add it
			
 
				-            if (!sparams.slot_save_path.empty() && sparams.slot_save_path[sparams.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) {
			
 
				-                sparams.slot_save_path += DIRECTORY_SEPARATOR;
			
 
				-            }
			
 
				-        } else if (arg == "--chat-template") {
			
 
				-            if (++i >= argc) {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-            if (!verify_custom_template(argv[i])) {
			
 
				-                fprintf(stderr, "error: the supplied chat template is not supported: %s\n", argv[i]);
			
 
				-                fprintf(stderr, "note: llama.cpp does not use jinja parser, we only support commonly used templates\n");
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-            sparams.chat_template = argv[i];
			
 
				-        } else if (arg == "--override-kv") {
			
 
				-            if (++i >= argc) {
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-            if (!string_parse_kv_override(argv[i], params.kv_overrides)) {
			
 
				-                fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
			
 
				-                invalid_param = true;
			
 
				-                break;
			
 
				-            }
			
 
				-        } else {
			
 
				-            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
			
 
				-            server_print_usage(argv[0], default_params, default_sparams);
			
 
				-            exit(1);
			
 
				-        }
			
 
				-    }
			
 
				-
			
 
				-    gpt_params_handle_model_default(params);
			
 
				-
			
 
				-    if (!params.kv_overrides.empty()) {
			
 
				-        params.kv_overrides.emplace_back();
			
 
				-        params.kv_overrides.back().key[0] = 0;
			
 
				-    }
			
 
				-
			
 
				-    if (invalid_param) {
			
 
				-        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
			
 
				-        server_print_usage(argv[0], default_params, default_sparams);
			
 
				-        exit(1);
			
 
				-    }
			
 
				-}
			
 
				-
			
 
				 static void log_server_request(const httplib::Request & req, const httplib::Response & res) {
			
 
				     // skip GH copilot requests when using default port
			
 
				     if (req.path == "/v1/health" || req.path == "/v1/completions") {
			
@@ -2929,16 +2351,22 @@ int main(int argc, char ** argv) {
 
				     log_disable();
			
 
				 #endif
			
 
				     // own arguments required by this example
			
 
				-    gpt_params    params;
			
 
				-    server_params sparams;
			
 
				+    gpt_params params;
			
 
				+
			
 
				+    if (!gpt_params_parse(argc, argv, params)) {
			
 
				+        gpt_params_print_usage(argc, argv, params);
			
 
				+        return 1;
			
 
				+    }
			
 
				+
			
 
				+    // TODO: not great to use extern vars
			
 
				+    server_log_json = params.log_json;
			
 
				+    server_verbose = params.verbose;
			
 
				 
			
 
				     // struct that contains llama context and inference
			
 
				     server_context ctx_server;
			
 
				 
			
 
				-    server_params_parse(argc, argv, sparams, params);
			
 
				-
			
 
				-    if (!sparams.system_prompt.empty()) {
			
 
				-        ctx_server.system_prompt_set(sparams.system_prompt);
			
 
				+    if (!params.system_prompt.empty()) {
			
 
				+        ctx_server.system_prompt_set(params.system_prompt);
			
 
				     }
			
 
				 
			
 
				     if (params.model_alias == "unknown") {
			
@@ -2962,10 +2390,10 @@ int main(int argc, char ** argv) {
 
				 
			
 
				     std::unique_ptr<httplib::Server> svr;
			
 
				 #ifdef CPPHTTPLIB_OPENSSL_SUPPORT
			
 
				-    if (sparams.ssl_key_file != "" && sparams.ssl_cert_file != "") {
			
 
				-        LOG_INFO("Running with SSL", {{"key", sparams.ssl_key_file}, {"cert", sparams.ssl_cert_file}});
			
 
				+    if (params.ssl_file_key != "" && params.ssl_file_cert != "") {
			
 
				+        LOG_INFO("Running with SSL", {{"key", params.ssl_file_key}, {"cert", params.ssl_file_cert}});
			
 
				         svr.reset(
			
 
				-            new httplib::SSLServer(sparams.ssl_cert_file.c_str(), sparams.ssl_key_file.c_str())
			
 
				+            new httplib::SSLServer(params.ssl_file_cert.c_str(), params.ssl_file_key.c_str())
			
 
				         );
			
 
				     } else {
			
 
				         LOG_INFO("Running without SSL", {});
			
@@ -3019,24 +2447,24 @@ int main(int argc, char ** argv) {
 
				     });
			
 
				 
			
 
				     // set timeouts and change hostname and port
			
 
				-    svr->set_read_timeout (sparams.read_timeout);
			
 
				-    svr->set_write_timeout(sparams.write_timeout);
			
 
				+    svr->set_read_timeout (params.timeout_read);
			
 
				+    svr->set_write_timeout(params.timeout_write);
			
 
				 
			
 
				-    if (!svr->bind_to_port(sparams.hostname, sparams.port)) {
			
 
				-        fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n", sparams.hostname.c_str(), sparams.port);
			
 
				+    if (!svr->bind_to_port(params.hostname, params.port)) {
			
 
				+        fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n", params.hostname.c_str(), params.port);
			
 
				         return 1;
			
 
				     }
			
 
				 
			
 
				     std::unordered_map<std::string, std::string> log_data;
			
 
				 
			
 
				-    log_data["hostname"] = sparams.hostname;
			
 
				-    log_data["port"]     = std::to_string(sparams.port);
			
 
				+    log_data["hostname"] = params.hostname;
			
 
				+    log_data["port"]     = std::to_string(params.port);
			
 
				 
			
 
				-    if (sparams.api_keys.size() == 1) {
			
 
				-        auto key = sparams.api_keys[0];
			
 
				+    if (params.api_keys.size() == 1) {
			
 
				+        auto key = params.api_keys[0];
			
 
				         log_data["api_key"] = "api_key: ****" + key.substr(std::max((int)(key.length() - 4), 0));
			
 
				-    } else if (sparams.api_keys.size() > 1) {
			
 
				-        log_data["api_key"] = "api_key: " + std::to_string(sparams.api_keys.size()) + " keys loaded";
			
 
				+    } else if (params.api_keys.size() > 1) {
			
 
				+        log_data["api_key"] = "api_key: " + std::to_string(params.api_keys.size()) + " keys loaded";
			
 
				     }
			
 
				 
			
 
				     // load the model
			
@@ -3053,10 +2481,10 @@ int main(int argc, char ** argv) {
 
				     const auto model_meta = ctx_server.model_meta();
			
 
				 
			
 
				     // if a custom chat template is not supplied, we will use the one that comes with the model (if any)
			
 
				-    if (sparams.chat_template.empty()) {
			
 
				+    if (params.chat_template.empty()) {
			
 
				         if (!ctx_server.validate_model_chat_template()) {
			
 
				             LOG_ERROR("The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
			
 
				-            sparams.chat_template = "chatml";
			
 
				+            params.chat_template = "chatml";
			
 
				         }
			
 
				     }
			
 
				 
			
@@ -3068,11 +2496,11 @@ int main(int argc, char ** argv) {
 
				         chat.push_back({{"role", "assistant"}, {"content", "Hi there"}});
			
 
				         chat.push_back({{"role", "user"},      {"content", "How are you?"}});
			
 
				 
			
 
				-        const std::string chat_example = format_chat(ctx_server.model, sparams.chat_template, chat);
			
 
				+        const std::string chat_example = format_chat(ctx_server.model, params.chat_template, chat);
			
 
				 
			
 
				         LOG_INFO("chat template", {
			
 
				             {"chat_example", chat_example},
			
 
				-            {"built_in", sparams.chat_template.empty()},
			
 
				+            {"built_in", params.chat_template.empty()},
			
 
				         });
			
 
				     }
			
 
				 
			
@@ -3080,7 +2508,7 @@ int main(int argc, char ** argv) {
 
				     // Middlewares
			
 
				     //
			
 
				 
			
 
				-    auto middleware_validate_api_key = [&sparams, &res_error](const httplib::Request & req, httplib::Response & res) {
			
 
				+    auto middleware_validate_api_key = [&params, &res_error](const httplib::Request & req, httplib::Response & res) {
			
 
				         // TODO: should we apply API key to all endpoints, including "/health" and "/models"?
			
 
				         static const std::set<std::string> protected_endpoints = {
			
 
				             "/props",
			
@@ -3098,7 +2526,7 @@ int main(int argc, char ** argv) {
 
				         };
			
 
				 
			
 
				         // If API key is not set, skip validation
			
 
				-        if (sparams.api_keys.empty()) {
			
 
				+        if (params.api_keys.empty()) {
			
 
				             return true;
			
 
				         }
			
 
				 
			
@@ -3113,7 +2541,7 @@ int main(int argc, char ** argv) {
 
				         std::string prefix = "Bearer ";
			
 
				         if (auth_header.substr(0, prefix.size()) == prefix) {
			
 
				             std::string received_api_key = auth_header.substr(prefix.size());
			
 
				-            if (std::find(sparams.api_keys.begin(), sparams.api_keys.end(), received_api_key) != sparams.api_keys.end()) {
			
 
				+            if (std::find(params.api_keys.begin(), params.api_keys.end(), received_api_key) != params.api_keys.end()) {
			
 
				                 return true; // API key is valid
			
 
				             }
			
 
				         }
			
@@ -3168,7 +2596,7 @@ int main(int argc, char ** argv) {
 
				                     };
			
 
				 
			
 
				                     res.status = 200; // HTTP OK
			
 
				-                    if (sparams.slots_endpoint && req.has_param("include_slots")) {
			
 
				+                    if (params.endpoint_slots && req.has_param("include_slots")) {
			
 
				                         health["slots"] = result.data.at("slots");
			
 
				                     }
			
 
				 
			
@@ -3194,7 +2622,7 @@ int main(int argc, char ** argv) {
 
				     };
			
 
				 
			
 
				     const auto handle_slots = [&](const httplib::Request &, httplib::Response & res) {
			
 
				-        if (!sparams.slots_endpoint) {
			
 
				+        if (!params.endpoint_slots) {
			
 
				             res_error(res, format_error_response("This server does not support slots endpoint.", ERROR_TYPE_NOT_SUPPORTED));
			
 
				             return;
			
 
				         }
			
@@ -3218,7 +2646,7 @@ int main(int argc, char ** argv) {
 
				     };
			
 
				 
			
 
				     const auto handle_metrics = [&](const httplib::Request &, httplib::Response & res) {
			
 
				-        if (!sparams.metrics_endpoint) {
			
 
				+        if (!params.endpoint_metrics) {
			
 
				             res_error(res, format_error_response("This server does not support metrics endpoint.", ERROR_TYPE_NOT_SUPPORTED));
			
 
				             return;
			
 
				         }
			
@@ -3318,14 +2746,14 @@ int main(int argc, char ** argv) {
 
				         res.status = 200; // HTTP OK
			
 
				     };
			
 
				 
			
 
				-    const auto handle_slots_save = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
			
 
				+    const auto handle_slots_save = [&ctx_server, &res_error, &params](const httplib::Request & req, httplib::Response & res, int id_slot) {
			
 
				         json request_data = json::parse(req.body);
			
 
				         std::string filename = request_data.at("filename");
			
 
				         if (!fs_validate_filename(filename)) {
			
 
				             res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
			
 
				             return;
			
 
				         }
			
 
				-        std::string filepath = sparams.slot_save_path + filename;
			
 
				+        std::string filepath = params.slot_save_path + filename;
			
 
				 
			
 
				         server_task task;
			
 
				         task.type = SERVER_TASK_TYPE_SLOT_SAVE;
			
@@ -3348,14 +2776,14 @@ int main(int argc, char ** argv) {
 
				         }
			
 
				     };
			
 
				 
			
 
				-    const auto handle_slots_restore = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
			
 
				+    const auto handle_slots_restore = [&ctx_server, &res_error, &params](const httplib::Request & req, httplib::Response & res, int id_slot) {
			
 
				         json request_data = json::parse(req.body);
			
 
				         std::string filename = request_data.at("filename");
			
 
				         if (!fs_validate_filename(filename)) {
			
 
				             res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
			
 
				             return;
			
 
				         }
			
 
				-        std::string filepath = sparams.slot_save_path + filename;
			
 
				+        std::string filepath = params.slot_save_path + filename;
			
 
				 
			
 
				         server_task task;
			
 
				         task.type = SERVER_TASK_TYPE_SLOT_RESTORE;
			
@@ -3530,9 +2958,9 @@ int main(int argc, char ** argv) {
 
				         res.set_content(models.dump(), "application/json; charset=utf-8");
			
 
				     };
			
 
				 
			
 
				-    const auto handle_chat_completions = [&ctx_server, &sparams, &res_error](const httplib::Request & req, httplib::Response & res) {
			
 
				+    const auto handle_chat_completions = [&ctx_server, &params, &res_error](const httplib::Request & req, httplib::Response & res) {
			
 
				         res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
			
 
				-        json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), sparams.chat_template);
			
 
				+        json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template);
			
 
				 
			
 
				         const int id_task = ctx_server.queue_tasks.get_new_id();
			
 
				 
			
@@ -3757,29 +3185,29 @@ int main(int argc, char ** argv) {
 
				     //
			
 
				 
			
 
				     // register static assets routes
			
 
				-    if (!sparams.public_path.empty()) {
			
 
				+    if (!params.public_path.empty()) {
			
 
				         // Set the base directory for serving static files
			
 
				-        svr->set_base_dir(sparams.public_path);
			
 
				+        svr->set_base_dir(params.public_path);
			
 
				     }
			
 
				+
			
 
				     // using embedded static files
			
 
				-    svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8"));
			
 
				-    svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8"));
			
 
				-    svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8"));
			
 
				-    svr->Get("/json-schema-to-grammar.mjs", handle_static_file(
			
 
				-      json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8"));
			
 
				+    svr->Get("/",                           handle_static_file(index_html, index_html_len, "text/html; charset=utf-8"));
			
 
				+    svr->Get("/index.js",                   handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8"));
			
 
				+    svr->Get("/completion.js",              handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8"));
			
 
				+    svr->Get("/json-schema-to-grammar.mjs", handle_static_file(json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8"));
			
 
				 
			
 
				     // add new-ui files
			
 
				-    svr->Get("/colorthemes.css", handle_static_file(colorthemes_css, colorthemes_css_len, "text/css; charset=utf-8"));
			
 
				-    svr->Get("/style.css", handle_static_file(style_css, style_css_len, "text/css; charset=utf-8"));
			
 
				+    svr->Get("/colorthemes.css",       handle_static_file(colorthemes_css, colorthemes_css_len, "text/css; charset=utf-8"));
			
 
				+    svr->Get("/style.css",             handle_static_file(style_css, style_css_len, "text/css; charset=utf-8"));
			
 
				     svr->Get("/theme-beeninorder.css", handle_static_file(theme_beeninorder_css, theme_beeninorder_css_len, "text/css; charset=utf-8"));
			
 
				-    svr->Get("/theme-ketivah.css", handle_static_file(theme_ketivah_css, theme_ketivah_css_len, "text/css; charset=utf-8"));
			
 
				-    svr->Get("/theme-mangotango.css", handle_static_file(theme_mangotango_css, theme_mangotango_css_len, "text/css; charset=utf-8"));
			
 
				-    svr->Get("/theme-playground.css", handle_static_file(theme_playground_css, theme_playground_css_len, "text/css; charset=utf-8"));
			
 
				-    svr->Get("/theme-polarnight.css", handle_static_file(theme_polarnight_css, theme_polarnight_css_len, "text/css; charset=utf-8"));
			
 
				-    svr->Get("/theme-snowstorm.css", handle_static_file(theme_snowstorm_css, theme_snowstorm_css_len, "text/css; charset=utf-8"));
			
 
				-    svr->Get("/index-new.html", handle_static_file(index_new_html, index_new_html_len, "text/html; charset=utf-8"));
			
 
				-    svr->Get("/system-prompts.js", handle_static_file(system_prompts_js, system_prompts_js_len, "text/javascript; charset=utf-8"));
			
 
				-    svr->Get("/prompt-formats.js", handle_static_file(prompt_formats_js, prompt_formats_js_len, "text/javascript; charset=utf-8"));
			
 
				+    svr->Get("/theme-ketivah.css",     handle_static_file(theme_ketivah_css, theme_ketivah_css_len, "text/css; charset=utf-8"));
			
 
				+    svr->Get("/theme-mangotango.css",  handle_static_file(theme_mangotango_css, theme_mangotango_css_len, "text/css; charset=utf-8"));
			
 
				+    svr->Get("/theme-playground.css",  handle_static_file(theme_playground_css, theme_playground_css_len, "text/css; charset=utf-8"));
			
 
				+    svr->Get("/theme-polarnight.css",  handle_static_file(theme_polarnight_css, theme_polarnight_css_len, "text/css; charset=utf-8"));
			
 
				+    svr->Get("/theme-snowstorm.css",   handle_static_file(theme_snowstorm_css, theme_snowstorm_css_len, "text/css; charset=utf-8"));
			
 
				+    svr->Get("/index-new.html",        handle_static_file(index_new_html, index_new_html_len, "text/html; charset=utf-8"));
			
 
				+    svr->Get("/system-prompts.js",     handle_static_file(system_prompts_js, system_prompts_js_len, "text/javascript; charset=utf-8"));
			
 
				+    svr->Get("/prompt-formats.js",     handle_static_file(prompt_formats_js, prompt_formats_js_len, "text/javascript; charset=utf-8"));
			
 
				 
			
 
				     // register API routes
			
 
				     svr->Get ("/health",              handle_health);
			
@@ -3798,7 +3226,7 @@ int main(int argc, char ** argv) {
 
				     svr->Post("/v1/embeddings",       handle_embeddings);
			
 
				     svr->Post("/tokenize",            handle_tokenize);
			
 
				     svr->Post("/detokenize",          handle_detokenize);
			
 
				-    if (!sparams.slot_save_path.empty()) {
			
 
				+    if (!params.slot_save_path.empty()) {
			
 
				         // only enable slot endpoints if slot_save_path is set
			
 
				         svr->Post("/slots/:id_slot",  handle_slots_action);
			
 
				     }
			
@@ -3806,12 +3234,12 @@ int main(int argc, char ** argv) {
 
				     //
			
 
				     // Start the server
			
 
				     //
			
 
				-    if (sparams.n_threads_http < 1) {
			
 
				+    if (params.n_threads_http < 1) {
			
 
				         // +2 threads for monitoring endpoints
			
 
				-        sparams.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
			
 
				+        params.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
			
 
				     }
			
 
				-    log_data["n_threads_http"] =  std::to_string(sparams.n_threads_http);
			
 
				-    svr->new_task_queue = [&sparams] { return new httplib::ThreadPool(sparams.n_threads_http); };
			
 
				+    log_data["n_threads_http"] =  std::to_string(params.n_threads_http);
			
 
				+    svr->new_task_queue = [&params] { return new httplib::ThreadPool(params.n_threads_http); };
			
 
				 
			
 
				     LOG_INFO("HTTP server listening", log_data);
			
 
				 
			
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -116,13 +116,6 @@ static inline void server_log(const char * level, const char * function, int lin
 
				 // chat template utils
			
 
				 //
			
 
				 
			
 
				-// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
			
 
				-inline bool verify_custom_template(const std::string & tmpl) {
			
 
				-    llama_chat_message chat[] = {{"user", "test"}};
			
 
				-    int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
			
 
				-    return res >= 0;
			
 
				-}
			
 
				-
			
 
				 // Format given chat. If tmpl is empty, we take the template from model metadata
			
 
				 inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
			
 
				     size_t alloc_size = 0;
			
--- a/examples/simple/README.md
+++ b/examples/simple/README.md
@@ -3,7 +3,7 @@
 
				 The purpose of this example is to demonstrate a minimal usage of llama.cpp for generating text with a given prompt.
			
 
				 
			
 
				 ```bash
			
 
				-./simple ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is"
			
 
				+./simple -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is"
			
 
				 
			
 
				 ...
			
 
				 
			
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -6,28 +6,27 @@
 
				 #include <string>
			
 
				 #include <vector>
			
 
				 
			
 
				-int main(int argc, char ** argv) {
			
 
				-    gpt_params params;
			
 
				+static void print_usage(int argc, char ** argv, const gpt_params & params) {
			
 
				+    gpt_params_print_usage(argc, argv, params);
			
 
				 
			
 
				-    if (argc == 1 || argv[1][0] == '-') {
			
 
				-        printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]);
			
 
				-        return 1 ;
			
 
				-    }
			
 
				+    LOG_TEE("\nexample usage:\n");
			
 
				+    LOG_TEE("\n    %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]);
			
 
				+    LOG_TEE("\n");
			
 
				+}
			
 
				 
			
 
				-    if (argc >= 2) {
			
 
				-        params.model = argv[1];
			
 
				-    }
			
 
				+int main(int argc, char ** argv) {
			
 
				+    gpt_params params;
			
 
				 
			
 
				-    if (argc >= 3) {
			
 
				-        params.prompt = argv[2];
			
 
				-    }
			
 
				+    params.prompt = "Hello my name is";
			
 
				+    params.n_predict = 32;
			
 
				 
			
 
				-    if (params.prompt.empty()) {
			
 
				-        params.prompt = "Hello my name is";
			
 
				+    if (!gpt_params_parse(argc, argv, params)) {
			
 
				+        print_usage(argc, argv, params);
			
 
				+        return 1;
			
 
				     }
			
 
				 
			
 
				     // total length of the sequence including the prompt
			
 
				-    const int n_len = 32;
			
 
				+    const int n_predict = params.n_predict;
			
 
				 
			
 
				     // init LLM
			
 
				 
			
@@ -36,9 +35,7 @@ int main(int argc, char ** argv) {
 
				 
			
 
				     // initialize the model
			
 
				 
			
 
				-    llama_model_params model_params = llama_model_default_params();
			
 
				-
			
 
				-    // model_params.n_gpu_layers = 99; // offload all layers to the GPU
			
 
				+    llama_model_params model_params = llama_model_params_from_gpt_params(params);
			
 
				 
			
 
				     llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
			
 
				 
			
@@ -49,12 +46,7 @@ int main(int argc, char ** argv) {
 
				 
			
 
				     // initialize the context
			
 
				 
			
 
				-    llama_context_params ctx_params = llama_context_default_params();
			
 
				-
			
 
				-    ctx_params.seed  = 1234;
			
 
				-    ctx_params.n_ctx = 2048;
			
 
				-    ctx_params.n_threads = params.n_threads;
			
 
				-    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
			
 
				+    llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
			
 
				 
			
 
				     llama_context * ctx = llama_new_context_with_model(model, ctx_params);
			
 
				 
			
@@ -69,14 +61,14 @@ int main(int argc, char ** argv) {
 
				     tokens_list = ::llama_tokenize(ctx, params.prompt, true);
			
 
				 
			
 
				     const int n_ctx    = llama_n_ctx(ctx);
			
 
				-    const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size());
			
 
				+    const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size());
			
 
				 
			
 
				-    LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, n_kv_req);
			
 
				+    LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req);
			
 
				 
			
 
				     // make sure the KV cache is big enough to hold all the prompt and generated tokens
			
 
				     if (n_kv_req > n_ctx) {
			
 
				         LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
			
 
				-        LOG_TEE("%s:        either reduce n_len or increase n_ctx\n", __func__);
			
 
				+        LOG_TEE("%s:        either reduce n_predict or increase n_ctx\n", __func__);
			
 
				         return 1;
			
 
				     }
			
 
				 
			
@@ -115,7 +107,7 @@ int main(int argc, char ** argv) {
 
				 
			
 
				     const auto t_main_start = ggml_time_us();
			
 
				 
			
 
				-    while (n_cur <= n_len) {
			
 
				+    while (n_cur <= n_predict) {
			
 
				         // sample the next token
			
 
				         {
			
 
				             auto   n_vocab = llama_n_vocab(model);
			
@@ -134,7 +126,7 @@ int main(int argc, char ** argv) {
 
				             const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
			
 
				 
			
 
				             // is it an end of generation?
			
 
				-            if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
			
 
				+            if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
			
 
				                 LOG_TEE("\n");
			
 
				 
			
 
				                 break;
			
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -27,7 +27,8 @@ struct seq_draft {
 
				 int main(int argc, char ** argv) {
			
 
				     gpt_params params;
			
 
				 
			
 
				-    if (gpt_params_parse(argc, argv, params) == false) {
			
 
				+    if (!gpt_params_parse(argc, argv, params)) {
			
 
				+        gpt_params_print_usage(argc, argv, params);
			
 
				         return 1;
			
 
				     }
			
 
				 
			
--- a/llama.cpp
+++ b/llama.cpp
@@ -108,7 +108,7 @@
 
				 //
			
 
				 
			
 
				 LLAMA_ATTRIBUTE_FORMAT(2, 3)
			
 
				-static void llama_log_internal        (ggml_log_level level, const char* format, ...);
			
 
				+static void llama_log_internal        (ggml_log_level level, const char * format, ...);
			
 
				 static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
			
 
				 
			
 
				 #define LLAMA_LOG_INFO(...)  llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
			
--- a/scripts/run-with-preset.py
+++ b/scripts/run-with-preset.py
@@ -13,12 +13,12 @@ logger = logging.getLogger("run-with-preset")
 
				 CLI_ARGS_MAIN_PERPLEXITY = [
			
 
				     "batch-size", "cfg-negative-prompt", "cfg-scale", "chunks", "color", "ctx-size", "escape",
			
 
				     "export", "file", "frequency-penalty", "grammar", "grammar-file", "hellaswag",
			
 
				-    "hellaswag-tasks", "ignore-eos", "in-prefix", "in-prefix-bos", "in-suffix", "instruct",
			
 
				+    "hellaswag-tasks", "ignore-eos", "in-prefix", "in-prefix-bos", "in-suffix",
			
 
				     "interactive", "interactive-first", "keep", "logdir", "logit-bias", "lora", "lora-base",
			
 
				     "low-vram", "main-gpu", "memory-f32", "mirostat", "mirostat-ent", "mirostat-lr", "mlock",
			
 
				     "model", "multiline-input", "n-gpu-layers", "n-predict", "no-mmap", "no-mul-mat-q",
			
 
				     "np-penalize-nl", "numa", "ppl-output-type", "ppl-stride", "presence-penalty", "prompt",
			
 
				-    "prompt-cache", "prompt-cache-all", "prompt-cache-ro", "random-prompt", "repeat-last-n",
			
 
				+    "prompt-cache", "prompt-cache-all", "prompt-cache-ro", "repeat-last-n",
			
 
				     "repeat-penalty", "reverse-prompt", "rope-freq-base", "rope-freq-scale", "rope-scale", "seed",
			
 
				     "simple-io", "tensor-split", "threads", "temp", "tfs", "top-k", "top-p", "typical",
			
 
				     "verbose-prompt"