
main : don't print special tokens with --grammar (#6923)

* main : don't print special tokens with --grammar

The CLI interface was recently changed to print special control tokens,
such as the </s> stop token. This token shouldn't be printed if the
grammar flag was passed, unless the grammar specifies it, because that
breaks shell-scriptability (see the sketch after this list).

* main: use separate stream for control characters

* main: use dprintf and add --ctrl-token-no-out and --ctrl-token-fd-out

* main: dprintf isn't part of the IEEE POSIX standard. Just use write().

* main: remove --ctrl-token-fd-out in favor of fcntl() based detection

* common.cpp: restore accidentally removed --interactive-first

* main: only merge stdout and control token if not in conversation or grammar mode

* main: rejig control token descriptor handling

* main: must check pipe status at the very top of the program

* main: rename --ctrl-token-no-out to --no-special and other refactoring

* main: refactor ctrl_token_no_out --> no_special

* llama: rename llama_token_is_control_token() to llama_token_is_control()

* main: remove special token file descriptor feature (#5)
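
For reference, a minimal sketch of the printing rule described in the first item above. It is not part of the commit; the helper name is hypothetical, while the llama_token_is_control() API and the no_special/conversation flags come from the diff below:

    #include "llama.h"

    // Hypothetical helper: decide whether a sampled token should be echoed to stdout.
    // Ordinary tokens are always printed; control tokens (e.g. </s>) are printed
    // only when neither --no-special nor conversation mode suppresses them.
    static bool should_print_token(const llama_model * model, llama_token id,
                                   bool no_special, bool conversation) {
        if (!llama_token_is_control(model, id)) {
            return true;                      // renderable text token
        }
        return !no_special && !conversation;  // control token: print only if allowed
    }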

---------

Co-authored-by: Brian <mofosyne@gmail.com>
Justine Tunney 1 year ago
parent
commit
00c6390793
5 changed files with 30 additions and 3 deletions
  1. common/common.cpp (+5, -0)
  2. common/common.h (+1, -0)
  3. examples/main/main.cpp (+17, -3)
  4. llama.cpp (+4, -0)
  5. llama.h (+3, -0)

+ 5 - 0
common/common.cpp

@@ -904,6 +904,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.interactive_specials = true;
         return true;
     }
+    if (arg == "--no-special") {
+        params.no_special = true;
+        return true;
+    }
     if (arg == "--embedding") {
         params.embedding = true;
         return true;
@@ -1364,6 +1368,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     printf("  -i, --interactive     run in interactive mode\n");
     printf("  --interactive-specials allow special tokens in user text, in interactive mode\n");
     printf("  --interactive-first   run in interactive mode and wait for input right away\n");
+    printf("  --no-special          control tokens output disabled\n");
     printf("  -cnv, --conversation  run in conversation mode (does not print special tokens and suffix/prefix)\n");
     printf("  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
     printf("  -cml, --chatml        run in chatml mode (use with ChatML-compatible models)\n");

+ 1 - 0
common/common.h

@@ -146,6 +146,7 @@ struct gpt_params {
     bool use_color         = false; // use color to distinguish generations and inputs
     bool interactive       = false; // interactive mode
     bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
+    bool no_special        = false; // disable control token output
     bool conversation      = false; // conversation mode (does not print special tokens and suffix/prefix)
     bool chatml            = false; // chatml mode (used for models trained on chatml syntax)
     bool prompt_cache_all  = false; // save user input and generations to prompt cache
+ 17 - 3
examples/main/main.cpp

@@ -740,18 +740,32 @@ int main(int argc, char ** argv) {
         // display text
         if (input_echo && display) {
             for (auto id : embd) {
-                const std::string token_str = llama_token_to_piece(ctx, id, !params.conversation);
-                printf("%s", token_str.c_str());
+                const std::string token_str = llama_token_to_piece(ctx, id);
+
+                // Console/Stream Output
+                if (!llama_token_is_control(llama_get_model(ctx), id)) {
+                    // Stream Output Token To Standard Output
+                    fprintf(stdout, "%s", token_str.c_str());
+                } else if (!params.no_special && !params.conversation) {
+                    // Stream Control Token To Standard Output Stream
+                    fprintf(stdout, "%s", token_str.c_str());
+                }
 
+                // Record Displayed Tokens To Log
+                // Note: Generated tokens are created one by one hence this check
                 if (embd.size() > 1) {
+                    // Incoming Requested Tokens
                     input_tokens.push_back(id);
                 } else {
+                    // Outgoing Generated Tokens
                     output_tokens.push_back(id);
                     output_ss << token_str;
                 }
+
+                fflush(stdout);
             }
-            fflush(stdout);
         }
+
         // reset color to default if there is no pending user input
         if (input_echo && (int) embd_inp.size() == n_consumed) {
             console::set_display(console::reset);

+ 4 - 0
llama.cpp

@@ -17861,6 +17861,10 @@ bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
     );
 }
 
+bool llama_token_is_control(const struct llama_model * model, llama_token token) {
+    return llama_is_control_token(model->vocab, token);
+}
+
 llama_token llama_token_bos(const struct llama_model * model) {
     return model->vocab.special_bos_id;
 }

+ 3 - 0
llama.h

@@ -823,6 +823,9 @@ extern "C" {
     // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
     LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token);
 
+    // Identify if Token Id is a control token or a render-able token
+    LLAMA_API bool llama_token_is_control(const struct llama_model * model, llama_token token);
+
     // Special tokens
     LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
     LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
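
For context, a hedged usage example (not from this commit; the function name is made up) of how application code might combine the new llama_token_is_control() API with the llama_token_to_piece() helper from common/common.h when streaming output:

    #include <cstdio>
    #include <string>
    #include "common.h"   // llama_token_to_piece() helper
    #include "llama.h"

    // Print a sampled token, skipping control tokens such as </s> when the
    // caller asked for clean, shell-scriptable output (the --no-special case).
    static void print_token(llama_context * ctx, llama_token id, bool no_special) {
        if (no_special && llama_token_is_control(llama_get_model(ctx), id)) {
            return; // suppress control tokens
        }
        const std::string piece = llama_token_to_piece(ctx, id);
        fprintf(stdout, "%s", piece.c_str());
        fflush(stdout);
    }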