
tokenize : add --show-count (token) option (#8299)

This commit adds a new option to the tokenize example, --show-count.
When this option is set, the total number of tokens is printed to stdout.
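
To illustrate the pattern in isolation, here is a minimal standalone
sketch (hypothetical names and simplified argument handling, not the
actual code in examples/tokenize/tokenize.cpp): the count is printed
only when the flag is passed, so the default output stays the same.

    // Minimal sketch of flag-guarded count output (hypothetical names).
    #include <cstdio>
    #include <cstring>
    #include <vector>

    int main(int argc, char ** argv) {
        bool show_token_count = false;
        for (int i = 1; i < argc; i++) {
            if (std::strcmp(argv[i], "--show-count") == 0) {
                show_token_count = true;
            }
        }

        // Stand-in for the real tokenization result.
        std::vector<int> tokens = {1, 15043, 3186};

        // The existing per-token output is unchanged; the count is opt-in.
        for (int tok : tokens) {
            printf("%d\n", tok);
        }
        if (show_token_count) {
            // %zu matches the size_t returned by std::vector::size().
            printf("Total number of tokens: %zu\n", tokens.size());
        }
        return 0;
    }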

This was added as an option because I was concerned that there might be
scripts that use the output of this program, so it seemed better not to
print this information by default.

The motivation for this is that it can be useful to find out how many
tokens a file contains, for example when trying to determine prompt
input file sizes for testing.

Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>
Daniel Bevenius 1 year ago
commit 6f63d646c1
1 changed file with 8 additions and 0 deletions

examples/tokenize/tokenize.cpp (+8, -0)

@@ -30,6 +30,7 @@ static void print_usage_information(const char * argv0, FILE * stream) {
     fprintf(stream, "    --stdin                              read prompt from standard input.\n");
     fprintf(stream, "    --no-bos                             do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
     fprintf(stream, "    --log-disable                        disable logs. Makes stderr quiet when loading the model.\n");
+    fprintf(stream, "    --show-count                         print the total number of tokens.\n");
 }
 
 static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data) {
@@ -195,6 +196,7 @@ int main(int raw_argc, char ** raw_argv) {
     bool printing_ids = false;
     bool no_bos = false;
     bool disable_logging = false;
+    bool show_token_count = false;
     const char * model_path = NULL;
     const char * prompt_path = NULL;
     const char * prompt_arg = NULL;
@@ -249,6 +251,9 @@ int main(int raw_argc, char ** raw_argv) {
         else if (arg == "--log-disable") {
             disable_logging = true;
         }
+        else if (arg == "--show-count") {
+            show_token_count = true;
+        }
         else {
             fprintf(stderr, "Error: unknown option '%s'\n", argv[iarg].c_str());
             return 1;
@@ -384,6 +389,9 @@ int main(int raw_argc, char ** raw_argv) {
         printf("]\n");
         printf("]\n");
     }
     }
 
 
+    if (show_token_count) {
+        printf("Total number of tokens: %ld\n", tokens.size());
+    }
     // silence valgrind
     // silence valgrind
     llama_free(ctx);
     llama_free(ctx);
     llama_free_model(model);
     llama_free_model(model);