#include "common.h"
#include "arg.h"
#include "console.h"
// #include "log.h"
#include "server-context.h"
#include "server-task.h"

#include <atomic>
#include <fstream>
#include <thread>

#include <signal.h>

#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#  define NOMINMAX
#endif
#include <windows.h>
#endif

const char * LLAMA_ASCII_LOGO = R"(
 ▄▄ ▄▄
 ██ ██
 ██ ██  ▀▀█▄ ███▄███▄  ▀▀█▄    ▄████ ████▄ ████▄
 ██ ██ ▄█▀██ ██ ██ ██ ▄█▀██    ██    ██ ██ ██ ██
 ██ ██ ▀█▄██ ██ ██ ██ ▀█▄██ ██ ▀████ ████▀ ████▀
                                     ██    ██
                                     ▀▀    ▀▀
)";

static std::atomic<bool> g_is_interrupted = false;

static bool should_stop() {
    return g_is_interrupted.load();
}

#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
static void signal_handler(int) {
    if (g_is_interrupted.load()) {
        // second Ctrl+C - exit immediately
        // make sure to clear colors before exiting (not using LOG or console.cpp here to avoid deadlock)
        fprintf(stdout, "\033[0m\n");
        fflush(stdout);
        std::exit(130);
    }
    g_is_interrupted.store(true);
}
#endif
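
// bundles the server context with the per-session chat state:
// message history, attached media buffers, and default task parameters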
struct cli_context {
    server_context ctx_server;

    json messages = json::array();
    std::vector<raw_buffer> input_files;
    task_params defaults;

    // flag controlling the "loading" spinner animation
    std::atomic<bool> loading_show;

    cli_context(const common_params & params) {
        defaults.sampling    = params.sampling;
        defaults.speculative = params.speculative;
        defaults.n_keep      = params.n_keep;
        defaults.n_predict   = params.n_predict;
        defaults.antiprompt  = params.antiprompt;
        defaults.stream            = true; // make sure we always use streaming mode
        defaults.timings_per_token = true; // in order to get timings even when we cancel mid-way
        // defaults.return_progress = true; // TODO: show progress
        defaults.oaicompat_chat_syntax.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
    }
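
    // run one completion over the current chat history; streams tokens to the
    // console as they arrive and returns the accumulated assistant text.
    // out_timings receives the timings from the last result that reported them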
    std::string generate_completion(result_timings & out_timings) {
        server_response_reader rd = ctx_server.get_response_reader();
        {
            // TODO: reduce some copies here in the future
            server_task task = server_task(SERVER_TASK_TYPE_COMPLETION);
            task.id        = rd.get_new_id();
            task.index     = 0;
            task.params    = defaults;    // copy
            task.cli_input = messages;    // copy
            task.cli_files = input_files; // copy
            rd.post_task({std::move(task)});
        }

        // wait for first result
        console::spinner::start();
        server_task_result_ptr result = rd.next(should_stop);
        console::spinner::stop();

        std::string curr_content;
        bool is_thinking = false;

        while (result) {
            if (should_stop()) {
                break;
            }
            if (result->is_error()) {
                json err_data = result->to_json();
                if (err_data.contains("message")) {
                    console::error("Error: %s\n", err_data["message"].get<std::string>().c_str());
                } else {
                    console::error("Error: %s\n", err_data.dump().c_str());
                }
                return curr_content;
            }

            auto res_partial = dynamic_cast<server_task_result_cmpl_partial *>(result.get());
            if (res_partial) {
                out_timings = std::move(res_partial->timings);
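                // each diff carries an incremental content delta and/or a reasoning
                // ("thinking") delta; the two are rendered with different display styles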
                for (const auto & diff : res_partial->oaicompat_msg_diffs) {
                    if (!diff.content_delta.empty()) {
                        if (is_thinking) {
                            console::log("\n[End thinking]\n\n");
                            console::set_display(DISPLAY_TYPE_RESET);
                            is_thinking = false;
                        }
                        curr_content += diff.content_delta;
                        console::log("%s", diff.content_delta.c_str());
                        console::flush();
                    }
                    if (!diff.reasoning_content_delta.empty()) {
                        console::set_display(DISPLAY_TYPE_REASONING);
                        if (!is_thinking) {
                            console::log("[Start thinking]\n");
                        }
                        is_thinking = true;
                        console::log("%s", diff.reasoning_content_delta.c_str());
                        console::flush();
                    }
                }
            }

            auto res_final = dynamic_cast<server_task_result_cmpl_final *>(result.get());
            if (res_final) {
                out_timings = std::move(res_final->timings);
                break;
            }

            result = rd.next(should_stop);
        }
        g_is_interrupted.store(false);

        // server_response_reader automatically cancels pending tasks upon destruction
        return curr_content;
    }

    // TODO: support remote files in the future (http, https, etc)
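    // read a local file; for media, the raw bytes are queued in input_files and the
    // multimodal marker is returned so it can be embedded in the message text.
    // returns an empty string when the file cannot be opened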
    std::string load_input_file(const std::string & fname, bool is_media) {
        std::ifstream file(fname, std::ios::binary);
        if (!file) {
            return "";
        }
        if (is_media) {
            raw_buffer buf;
            buf.assign((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
            input_files.push_back(std::move(buf));
            return mtmd_default_marker();
        } else {
            std::string content((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
            return content;
        }
    }
};

int main(int argc, char ** argv) {
    common_params params;
    params.verbosity = LOG_LEVEL_ERROR; // by default, less verbose logs

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CLI)) {
        return 1;
    }

    // TODO: maybe support it later?
    if (params.conversation_mode == COMMON_CONVERSATION_MODE_DISABLED) {
        console::error("--no-conversation is not supported by llama-cli\n");
        console::error("please use llama-completion instead\n");
        return 1;
    }

    common_init();

    // struct that contains llama context and inference
    cli_context ctx_cli(params);

    llama_backend_init();
    llama_numa_init(params.numa);

    // TODO: avoid using atexit() here by making `console` a singleton
    console::init(params.simple_io, params.use_color);
    atexit([]() { console::cleanup(); });
    console::set_display(DISPLAY_TYPE_RESET);

#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
    struct sigaction sigint_action;
    sigint_action.sa_handler = signal_handler;
    sigemptyset(&sigint_action.sa_mask);
    sigint_action.sa_flags = 0;
    sigaction(SIGINT,  &sigint_action, NULL);
    sigaction(SIGTERM, &sigint_action, NULL);
#elif defined (_WIN32)
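    // Windows has no sigaction; translate console Ctrl+C events to the same handler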
    auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
        return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
    };
    SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
#endif
- console::log("\nLoading model... "); // followed by loading animation
- console::spinner::start();
- if (!ctx_cli.ctx_server.load_model(params)) {
- console::spinner::stop();
- console::error("\nFailed to load the model\n");
- return 1;
- }
- ctx_cli.ctx_server.init();
- console::spinner::stop();
- console::log("\n");
    std::thread inference_thread([&ctx_cli]() {
        ctx_cli.ctx_server.start_loop();
    });

    auto inf = ctx_cli.ctx_server.get_info();
    std::string modalities = "text";
    if (inf.has_inp_image) {
        modalities += ", vision";
    }
    if (inf.has_inp_audio) {
        modalities += ", audio";
    }
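
    // seed the conversation with the system prompt, if one was provided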
    if (!params.system_prompt.empty()) {
        ctx_cli.messages.push_back({
            {"role",    "system"},
            {"content", params.system_prompt}
        });
    }

    console::log("\n");
    console::log("%s\n", LLAMA_ASCII_LOGO);
    console::log("build      : %s\n", inf.build_info.c_str());
    console::log("model      : %s\n", inf.model_name.c_str());
    console::log("modalities : %s\n", modalities.c_str());
    if (!params.system_prompt.empty()) {
        console::log("using custom system prompt\n");
    }
    console::log("\n");
    console::log("available commands:\n");
    console::log("  /exit or Ctrl+C    stop or exit\n");
    console::log("  /regen             regenerate the last response\n");
    console::log("  /clear             clear the chat history\n");
    console::log("  /read <file>       add a text file\n");
    if (inf.has_inp_image) {
        console::log("  /image <file>      add an image file\n");
    }
    if (inf.has_inp_audio) {
        console::log("  /audio <file>      add an audio file\n");
    }
    console::log("\n");

    // interactive loop
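    // cur_msg accumulates loaded file markers and typed text until the message is submitted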
    std::string cur_msg;
    while (true) {
        std::string buffer;
        console::set_display(DISPLAY_TYPE_USER_INPUT);
        if (params.prompt.empty()) {
            console::log("\n> ");
            std::string line;
            bool another_line = true;
            do {
                another_line = console::readline(line, params.multiline_input);
                buffer += line;
            } while (another_line);
        } else {
            // process input prompt from args
            for (auto & fname : params.image) {
                std::string marker = ctx_cli.load_input_file(fname, true);
                if (marker.empty()) {
                    console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str());
                    break;
                }
                console::log("Loaded media from '%s'\n", fname.c_str());
                cur_msg += marker;
            }
            buffer = params.prompt;
            if (buffer.size() > 500) {
                console::log("\n> %s ... (truncated)\n", buffer.substr(0, 500).c_str());
            } else {
                console::log("\n> %s\n", buffer.c_str());
            }
            params.prompt.clear(); // only use it once
        }
        console::set_display(DISPLAY_TYPE_RESET);
        console::log("\n");

        if (should_stop()) {
            g_is_interrupted.store(false);
            break;
        }

        // remove trailing newline
        if (!buffer.empty() && buffer.back() == '\n') {
            buffer.pop_back();
        }

        // skip empty messages
        if (buffer.empty()) {
            continue;
        }

        bool add_user_msg = true;

        // process commands
        if (string_starts_with(buffer, "/exit")) {
            break;
        } else if (string_starts_with(buffer, "/regen")) {
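            // drop the last (assistant) message and regenerate from the existing history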
            if (ctx_cli.messages.size() >= 2) {
                size_t last_idx = ctx_cli.messages.size() - 1;
                ctx_cli.messages.erase(last_idx);
                add_user_msg = false;
            } else {
                console::error("No message to regenerate.\n");
                continue;
            }
        } else if (string_starts_with(buffer, "/clear")) {
            ctx_cli.messages.clear();
            ctx_cli.input_files.clear();
            console::log("Chat history cleared.\n");
            continue;
        } else if (
            (string_starts_with(buffer, "/image ") && inf.has_inp_image) ||
            (string_starts_with(buffer, "/audio ") && inf.has_inp_audio)) {
            // just in case (bad copy-paste for example), we strip all trailing/leading spaces
            std::string fname  = string_strip(buffer.substr(7));
            std::string marker = ctx_cli.load_input_file(fname, true);
            if (marker.empty()) {
                console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str());
                continue;
            }
            cur_msg += marker;
            console::log("Loaded media from '%s'\n", fname.c_str());
            continue;
        } else if (string_starts_with(buffer, "/read ")) {
            std::string fname  = string_strip(buffer.substr(6));
            std::string marker = ctx_cli.load_input_file(fname, false);
            if (marker.empty()) {
                console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str());
                continue;
            }
            cur_msg += marker;
            console::log("Loaded text from '%s'\n", fname.c_str());
            continue;
        } else {
            // not a command
            cur_msg += buffer;
        }

        // generate response
        if (add_user_msg) {
            ctx_cli.messages.push_back({
                {"role",    "user"},
                {"content", cur_msg}
            });
            cur_msg.clear();
        }

        result_timings timings;
        std::string assistant_content = ctx_cli.generate_completion(timings);
        ctx_cli.messages.push_back({
            {"role",    "assistant"},
            {"content", assistant_content}
        });
        console::log("\n");

        if (params.show_timings) {
            console::set_display(DISPLAY_TYPE_INFO);
            console::log("\n");
            console::log("[ Prompt: %.1f t/s | Generation: %.1f t/s ]\n", timings.prompt_per_second, timings.predicted_per_second);
            console::set_display(DISPLAY_TYPE_RESET);
        }

        if (params.single_turn) {
            break;
        }
    }

    console::set_display(DISPLAY_TYPE_RESET);
    console::log("\nExiting...\n");
    ctx_cli.ctx_server.terminate();
    inference_thread.join();

    // bump the log level to display timings
    common_log_set_verbosity_thold(LOG_LEVEL_INFO);
    llama_memory_breakdown_print(ctx_cli.ctx_server.get_llama_context());

    return 0;
}