@@ -20,6 +20,7 @@
 #include <sstream>
 #include <string>
 #include <vector>
+#include <memory>
 
 #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
 
@@ -40,17 +41,6 @@ using json = nlohmann::ordered_json;
 #define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
 #define QUE_DBG(fmt, ...) LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
 
-// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
-enum error_type {
-    ERROR_TYPE_INVALID_REQUEST,
-    ERROR_TYPE_AUTHENTICATION,
-    ERROR_TYPE_SERVER,
-    ERROR_TYPE_NOT_FOUND,
-    ERROR_TYPE_PERMISSION,
-    ERROR_TYPE_UNAVAILABLE, // custom error
-    ERROR_TYPE_NOT_SUPPORTED, // custom error
-};
-
 template <typename T>
 static T json_value(const json & body, const std::string & key, const T & default_value) {
     // Fallback null to default value
@@ -485,48 +475,11 @@ static std::string tokens_to_output_formatted_string(const llama_context * ctx,
     return out;
 }
 
-struct completion_token_output {
-    llama_token tok;
-    std::string text_to_send;
-
-    struct token_prob {
-        llama_token tok;
-        float prob;
-    };
-
-    std::vector<token_prob> probs;
-};
-
-// convert a vector of completion_token_output to json
-static json probs_vector_to_json(const llama_context * ctx, const std::vector<completion_token_output> & probs) {
-    json out = json::array();
-
-    for (const auto & prob : probs) {
-        json probs_for_token = json::array();
-
-        for (const auto & p : prob.probs) {
-            const std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok);
-            probs_for_token.push_back(json {
-                {"tok_str", tok_str},
-                {"prob", p.prob},
-            });
-        }
-
-        const std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok);
-        out.push_back(json {
-            {"content", tok_str},
-            {"probs", probs_for_token},
-        });
-    }
-
-    return out;
-}
-
 static bool server_sent_event(httplib::DataSink & sink, const char * event, const json & data) {
     const std::string str =
         std::string(event) + ": " +
         data.dump(-1, ' ', false, json::error_handler_t::replace) +
-        "\n\n"; // note: these newlines are important (not sure why though, if you know, add a comment to explain)
+        "\n\n"; // required by RFC 8895 - A message is terminated by a blank line (two line terminators in a row).
 
     LOG_DBG("data stream, to_send: %s", str.c_str());
 
@@ -604,164 +557,6 @@ static json oaicompat_completion_params_parse(
     return llama_params;
 }
 
-static json format_final_response_oaicompat(const json & request, const json & result, const std::string & completion_id, bool streaming = false, bool verbose = false) {
-    bool stopped_word = result.count("stopped_word") != 0;
-    bool stopped_eos = json_value(result, "stopped_eos", false);
-    int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
-    int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
-    std::string content = json_value(result, "content", std::string(""));
-
-    std::string finish_reason = "length";
-    if (stopped_word || stopped_eos) {
-        finish_reason = "stop";
-    }
-
-    json choices =
-        streaming ? json::array({json{{"finish_reason", finish_reason},
-                                        {"index", 0},
-                                        {"delta", json::object()}}})
-                  : json::array({json{{"finish_reason", finish_reason},
-                                        {"index", 0},
-                                        {"message", json{{"content", content},
-                                                         {"role", "assistant"}}}}});
-
-    std::time_t t = std::time(0);
-
-    json res = json {
-        {"choices", choices},
-        {"created", t},
-        {"model",
-            json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
-        {"object", streaming ? "chat.completion.chunk" : "chat.completion"},
-        {"usage", json {
-            {"completion_tokens", num_tokens_predicted},
-            {"prompt_tokens", num_prompt_tokens},
-            {"total_tokens", num_tokens_predicted + num_prompt_tokens}
-        }},
-        {"id", completion_id}
-    };
-
-    // extra fields for debugging purposes
-    if (verbose) {
-        res["__verbose"] = result;
-    }
-
-    if (result.contains("completion_probabilities")) {
-        res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
-    }
-
-    if (result.contains("timings")) {
-        res.push_back({"timings", json_value(result, "timings", json::object())});
-    }
-
-    return res;
-}
-
-// return value is vector as there is one case where we might need to generate two responses
-static std::vector<json> format_partial_response_oaicompat(const json & result, const std::string & completion_id) {
-    if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
-        return std::vector<json>({result});
-    }
-
-    bool first = json_value(result, "oaicompat_token_ctr", 0) == 0;
-    std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
-
-    bool stopped_word = json_value(result, "stopped_word", false);
-    bool stopped_eos = json_value(result, "stopped_eos", false);
-    bool stopped_limit = json_value(result, "stopped_limit", false);
-    std::string content = json_value(result, "content", std::string(""));
-
-    std::string finish_reason;
-    if (stopped_word || stopped_eos) {
-        finish_reason = "stop";
-    }
-    if (stopped_limit) {
-        finish_reason = "length";
-    }
-
-    std::time_t t = std::time(0);
-
-    json choices;
-
-    if (!finish_reason.empty()) {
-        choices = json::array({json{{"finish_reason", finish_reason},
-                                    {"index", 0},
-                                    {"delta", json::object()}}});
-    } else {
-        if (first) {
-            if (content.empty()) {
-                choices = json::array({json{{"finish_reason", nullptr},
-                                            {"index", 0},
-                                            {"delta", json{{"role", "assistant"}}}}});
-            } else {
-                // We have to send this as two updates to conform to openai behavior
-                json initial_ret = json{{"choices", json::array({json{
-                                        {"finish_reason", nullptr},
-                                        {"index", 0},
-                                        {"delta", json{
-                                            {"role", "assistant"}
-                                        }}}})},
-                            {"created", t},
-                            {"id", completion_id},
-                            {"model", modelname},
-                            {"object", "chat.completion.chunk"}};
-
-                json second_ret = json{
-                            {"choices", json::array({json{{"finish_reason", nullptr},
-                                                            {"index", 0},
-                                                            {"delta", json{
-                                                            {"content", content}}}
-                                                            }})},
-                            {"created", t},
-                            {"id", completion_id},
-                            {"model", modelname},
-                            {"object", "chat.completion.chunk"}};
-
-                return std::vector<json>({initial_ret, second_ret});
-            }
-        } else {
-            // Some idiosyncrasy in task processing logic makes several trailing calls
-            // with empty content, we ignore these at the calee site.
-            if (content.empty()) {
-                return std::vector<json>({json::object()});
-            }
-
-            choices = json::array({json{
-                {"finish_reason", nullptr},
-                {"index", 0},
-                {"delta",
-                    json{
-                        {"content", content},
-                    }},
-            }});
-        }
-    }
-
-    json ret = json {
-        {"choices", choices},
-        {"created", t},
-        {"id", completion_id},
-        {"model", modelname},
-        {"object", "chat.completion.chunk"}
-    };
-
-    if (result.contains("timings")) {
-        ret.push_back({"timings", json_value(result, "timings", json::object())});
-    }
-
-    if (!finish_reason.empty()) {
-        int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
-        int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
-        ret.push_back({"usage", json {
-            {"completion_tokens", num_tokens_predicted},
-            {"prompt_tokens", num_prompt_tokens},
-            {"total_tokens", num_tokens_predicted + num_prompt_tokens}
-        }});
-    }
-
-    return std::vector<json>({ret});
-}
-
 static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) {
     json data = json::array();
     int i = 0;
@@ -853,43 +648,3 @@ static json format_detokenized_response(const std::string & content) {
         {"content", content}
     };
 }
-
-static json format_error_response(const std::string & message, const enum error_type type) {
-    std::string type_str;
-    int code = 500;
-    switch (type) {
-        case ERROR_TYPE_INVALID_REQUEST:
-            type_str = "invalid_request_error";
-            code = 400;
-            break;
-        case ERROR_TYPE_AUTHENTICATION:
-            type_str = "authentication_error";
-            code = 401;
-            break;
-        case ERROR_TYPE_NOT_FOUND:
-            type_str = "not_found_error";
-            code = 404;
-            break;
-        case ERROR_TYPE_SERVER:
-            type_str = "server_error";
-            code = 500;
-            break;
-        case ERROR_TYPE_PERMISSION:
-            type_str = "permission_error";
-            code = 403;
-            break;
-        case ERROR_TYPE_NOT_SUPPORTED:
-            type_str = "not_supported_error";
-            code = 501;
-            break;
-        case ERROR_TYPE_UNAVAILABLE:
-            type_str = "unavailable_error";
-            code = 503;
-            break;
-    }
-    return json {
-        {"code", code},
-        {"message", message},
-        {"type", type_str},
-    };
-}