@@ -20,6 +20,7 @@
 #include <sstream>
 #include <string>
 #include <vector>
+#include <memory>
 
 #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
 
@@ -40,17 +41,6 @@ using json = nlohmann::ordered_json;
 #define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
 #define QUE_DBG(fmt, ...) LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
 
-// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
-enum error_type {
-    ERROR_TYPE_INVALID_REQUEST,
-    ERROR_TYPE_AUTHENTICATION,
-    ERROR_TYPE_SERVER,
-    ERROR_TYPE_NOT_FOUND,
-    ERROR_TYPE_PERMISSION,
-    ERROR_TYPE_UNAVAILABLE, // custom error
-    ERROR_TYPE_NOT_SUPPORTED, // custom error
-};
-
 template <typename T>
 static T json_value(const json & body, const std::string & key, const T & default_value) {
     // Fallback null to default value
@@ -485,48 +475,11 @@ static std::string tokens_to_output_formatted_string(const llama_context * ctx,
     return out;
 }
 
-struct completion_token_output {
-    llama_token tok;
-    std::string text_to_send;
-
-    struct token_prob {
-        llama_token tok;
-        float prob;
-    };
-
-    std::vector<token_prob> probs;
-};
-
-// convert a vector of completion_token_output to json
-static json probs_vector_to_json(const llama_context * ctx, const std::vector<completion_token_output> & probs) {
-    json out = json::array();
-
-    for (const auto & prob : probs) {
-        json probs_for_token = json::array();
-
-        for (const auto & p : prob.probs) {
-            const std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok);
-            probs_for_token.push_back(json {
-                {"tok_str", tok_str},
-                {"prob", p.prob},
-            });
-        }
-
-        const std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok);
-        out.push_back(json {
-            {"content", tok_str},
-            {"probs", probs_for_token},
-        });
-    }
-
-    return out;
-}
-
 static bool server_sent_event(httplib::DataSink & sink, const char * event, const json & data) {
     const std::string str =
         std::string(event) + ": " +
         data.dump(-1, ' ', false, json::error_handler_t::replace) +
-        "\n\n"; // note: these newlines are important (not sure why though, if you know, add a comment to explain)
+        "\n\n"; // required by RFC 8895 - A message is terminated by a blank line (two line terminators in a row).
 
     LOG_DBG("data stream, to_send: %s", str.c_str());
 
@@ -604,164 +557,6 @@ static json oaicompat_completion_params_parse(
     return llama_params;
 }
 
-static json format_final_response_oaicompat(const json & request, const json & result, const std::string & completion_id, bool streaming = false, bool verbose = false) {
-    bool stopped_word = result.count("stopped_word") != 0;
-    bool stopped_eos = json_value(result, "stopped_eos", false);
-    int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
-    int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
-    std::string content = json_value(result, "content", std::string(""));
-
-    std::string finish_reason = "length";
-    if (stopped_word || stopped_eos) {
-        finish_reason = "stop";
-    }
-
-    json choices =
-        streaming ? json::array({json{{"finish_reason", finish_reason},
-                                        {"index", 0},
-                                        {"delta", json::object()}}})
-                  : json::array({json{{"finish_reason", finish_reason},
-                                        {"index", 0},
-                                        {"message", json{{"content", content},
-                                                         {"role", "assistant"}}}}});
-
-    std::time_t t = std::time(0);
-
-    json res = json {
-        {"choices", choices},
-        {"created", t},
-        {"model",
-            json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
-        {"object", streaming ? "chat.completion.chunk" : "chat.completion"},
-        {"usage", json {
-            {"completion_tokens", num_tokens_predicted},
-            {"prompt_tokens", num_prompt_tokens},
-            {"total_tokens", num_tokens_predicted + num_prompt_tokens}
-        }},
-        {"id", completion_id}
-    };
-
-    // extra fields for debugging purposes
-    if (verbose) {
-        res["__verbose"] = result;
-    }
-
-    if (result.contains("completion_probabilities")) {
-        res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
-    }
-
-    if (result.contains("timings")) {
-        res.push_back({"timings", json_value(result, "timings", json::object())});
-    }
-
-    return res;
-}
-
-// return value is vector as there is one case where we might need to generate two responses
-static std::vector<json> format_partial_response_oaicompat(const json & result, const std::string & completion_id) {
-    if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
-        return std::vector<json>({result});
-    }
-
-    bool first = json_value(result, "oaicompat_token_ctr", 0) == 0;
-    std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
-
-    bool stopped_word = json_value(result, "stopped_word", false);
-    bool stopped_eos = json_value(result, "stopped_eos", false);
-    bool stopped_limit = json_value(result, "stopped_limit", false);
-    std::string content = json_value(result, "content", std::string(""));
-
-    std::string finish_reason;
-    if (stopped_word || stopped_eos) {
-        finish_reason = "stop";
-    }
-    if (stopped_limit) {
-        finish_reason = "length";
-    }
-
-    std::time_t t = std::time(0);
-
-    json choices;
-
-    if (!finish_reason.empty()) {
-        choices = json::array({json{{"finish_reason", finish_reason},
-                                    {"index", 0},
-                                    {"delta", json::object()}}});
-    } else {
-        if (first) {
-            if (content.empty()) {
-                choices = json::array({json{{"finish_reason", nullptr},
-                                            {"index", 0},
-                                            {"delta", json{{"role", "assistant"}}}}});
-            } else {
-                // We have to send this as two updates to conform to openai behavior
-                json initial_ret = json{{"choices", json::array({json{
-                                        {"finish_reason", nullptr},
-                                        {"index", 0},
-                                        {"delta", json{
-                                            {"role", "assistant"}
-                                        }}}})},
-                            {"created", t},
-                            {"id", completion_id},
-                            {"model", modelname},
-                            {"object", "chat.completion.chunk"}};
-
-                json second_ret = json{
-                            {"choices", json::array({json{{"finish_reason", nullptr},
-                                                            {"index", 0},
-                                                            {"delta", json{
-                                                            {"content", content}}}
-                                                            }})},
-                            {"created", t},
-                            {"id", completion_id},
-                            {"model", modelname},
-                            {"object", "chat.completion.chunk"}};
-
-                return std::vector<json>({initial_ret, second_ret});
-            }
-        } else {
-            // Some idiosyncrasy in task processing logic makes several trailing calls
-            // with empty content, we ignore these at the calee site.
-            if (content.empty()) {
-                return std::vector<json>({json::object()});
-            }
-
-            choices = json::array({json{
-                {"finish_reason", nullptr},
-                {"index", 0},
-                {"delta",
-                    json{
-                        {"content", content},
-                    }},
-            }});
-        }
-    }
-
-    json ret = json {
-        {"choices", choices},
-        {"created", t},
-        {"id", completion_id},
-        {"model", modelname},
-        {"object", "chat.completion.chunk"}
-    };
-
-    if (result.contains("timings")) {
-        ret.push_back({"timings", json_value(result, "timings", json::object())});
-    }
-
-    if (!finish_reason.empty()) {
-        int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
-        int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
-        ret.push_back({"usage", json {
-            {"completion_tokens", num_tokens_predicted},
-            {"prompt_tokens", num_prompt_tokens},
-            {"total_tokens", num_tokens_predicted + num_prompt_tokens}
-        }});
-    }
-
-    return std::vector<json>({ret});
-}
-
 static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) {
     json data = json::array();
     int i = 0;
@@ -853,43 +648,3 @@ static json format_detokenized_response(const std::string & content) {
         {"content", content}
     };
 }
-
-static json format_error_response(const std::string & message, const enum error_type type) {
-    std::string type_str;
-    int code = 500;
-    switch (type) {
-        case ERROR_TYPE_INVALID_REQUEST:
-            type_str = "invalid_request_error";
-            code = 400;
-            break;
-        case ERROR_TYPE_AUTHENTICATION:
-            type_str = "authentication_error";
-            code = 401;
-            break;
-        case ERROR_TYPE_NOT_FOUND:
-            type_str = "not_found_error";
-            code = 404;
-            break;
-        case ERROR_TYPE_SERVER:
-            type_str = "server_error";
-            code = 500;
-            break;
-        case ERROR_TYPE_PERMISSION:
-            type_str = "permission_error";
-            code = 403;
-            break;
-        case ERROR_TYPE_NOT_SUPPORTED:
-            type_str = "not_supported_error";
-            code = 501;
-            break;
-        case ERROR_TYPE_UNAVAILABLE:
-            type_str = "unavailable_error";
-            code = 503;
-            break;
-    }
-    return json {
-        {"code", code},
-        {"message", message},
-        {"type", type_str},
-    };
-}