Don't tell users to use a bad number of threads (#243)

The README tells users to pass the command-line option "-t 8", which starts
8 threads. On systems with fewer than 8 cores this leads to a significant
slowdown. Remove the option from the example command lines and instead use
/proc/cpuinfo on Linux to determine a sensible default.
Stephan Walter, 2 years ago
commit 367946c668
5 changed files with 19 additions and 11 deletions:
  1. .devops/tools.sh  (+1 -1)
  2. README.md         (+5 -5)
  3. ggml.c            (+0 -4)
  4. utils.cpp         (+12 -0)
  5. utils.h           (+1 -1)

+ 1 - 1
.devops/tools.sh

@@ -34,7 +34,7 @@ else
     echo "Unknown command: $arg1"
     echo "Available commands: "
     echo "  --run (-r): Run a model previously converted into ggml"
-    echo "              ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -t 8 -n 512"
+    echo "              ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
     echo "  --convert (-c): Convert a llama model into ggml"
     echo "              ex: \"/models/7B/\" 1"
     echo "  --quantize (-q): Optimize with quantization process ggml"

+ 5 - 5
README.md

@@ -39,7 +39,7 @@ Supported platforms:
 Here is a typical run using LLaMA-7B:
 
 ```java
-make -j && ./main -m ./models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512
+make -j && ./main -m ./models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
 I llama.cpp build info:
 I UNAME_S:  Darwin
 I UNAME_P:  arm
@@ -150,7 +150,7 @@ python3 convert-pth-to-ggml.py models/7B/ 1
 ./quantize.sh 7B
 
 # run the inference
-./main -m ./models/7B/ggml-model-q4_0.bin -t 8 -n 128
+./main -m ./models/7B/ggml-model-q4_0.bin -n 128
 ```
 
 When running the larger models, make sure you have enough disk space to store all the intermediate files.
@@ -164,7 +164,7 @@ In this mode, you can always interrupt generation by pressing Ctrl+C and enter o
 
 Here is an example few-shot interaction, invoked with the command
 ```
-./main -m ./models/13B/ggml-model-q4_0.bin -t 8 -n 256 --repeat_penalty 1.0 --color -i -r "User:" \
+./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" \
                                            -p \
 "Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
 
@@ -218,13 +218,13 @@ docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-on
 On complete, you are ready to play!
 
 ```bash
-docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512
+docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
 ```
 
 or with light image:
 
 ```bash
-docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512
+docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
 ```
 
 ## Limitations

+ 0 - 4
ggml.c

@@ -9318,10 +9318,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 }
 
 void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
-    if (cgraph->n_threads <= 0) {
-        cgraph->n_threads = 8;
-    }
-
     const int n_threads = cgraph->n_threads;
 
     struct ggml_compute_state_shared state_shared = {
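
With the hard-coded fallback removed, ggml_graph_compute no longer substitutes 8 threads when cgraph->n_threads is zero or negative, so the caller is responsible for setting a positive thread count before computing. Below is a rough caller-side sketch; ggml_graph_compute's signature is taken from the hunk above, while ggml_build_forward and the helper name compute_with_threads are assumptions for illustration, not code from this commit.

```cpp
// Hedged sketch of the caller-side implication of this change, not repo code.
#include "ggml.h"

void compute_with_threads(struct ggml_context * ctx, struct ggml_tensor * out, int n_threads) {
    struct ggml_cgraph gf = ggml_build_forward(out); // assumed graph-building helper
    gf.n_threads = n_threads;                        // must be > 0 now, e.g. params.n_threads
    ggml_graph_compute(ctx, &gf);                    // no default is applied anymore
}
```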

+ 12 - 0
utils.cpp

@@ -16,6 +16,18 @@
 #endif
 
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
+    // determine sensible default number of threads.
+    // std::thread::hardware_concurrency may not be equal to the number of cores, or may return 0.
+#ifdef __linux__
+    std::ifstream cpuinfo("/proc/cpuinfo");
+    params.n_threads = std::count(std::istream_iterator<std::string>(cpuinfo),
+                                  std::istream_iterator<std::string>(),
+                                  std::string("processor"));
+#endif
+    if (params.n_threads == 0) {
+        params.n_threads = std::max(1, (int32_t) std::thread::hardware_concurrency());
+    }
+
     for (int i = 1; i < argc; i++) {
         std::string arg = argv[i];
 

+ 1 - 1
utils.h

@@ -14,7 +14,7 @@
 
 struct gpt_params {
     int32_t seed      = -1; // RNG seed
-    int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
+    int32_t n_threads;
     int32_t n_predict = 128; // new tokens to predict
     int32_t repeat_last_n = 64;  // last n tokens to penalize
     int32_t n_ctx = 512; //context size
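
For reference, here is a minimal, self-contained sketch of the default-thread detection that utils.cpp now performs inside gpt_params_parse: on Linux it counts the "processor" entries in /proc/cpuinfo, and otherwise (or if that count is 0) it falls back to std::thread::hardware_concurrency(), which is only a hint and may itself return 0. The helper name default_n_threads and the main driver are illustrative, not part of the commit.

```cpp
#include <algorithm>
#include <cstdio>
#include <fstream>
#include <iterator>
#include <string>
#include <thread>

static int default_n_threads() {
    int n_threads = 0;
#ifdef __linux__
    // each CPU listed in /proc/cpuinfo has its own "processor : N" line,
    // so counting whitespace-separated "processor" tokens gives their number
    std::ifstream cpuinfo("/proc/cpuinfo");
    n_threads = (int) std::count(std::istream_iterator<std::string>(cpuinfo),
                                 std::istream_iterator<std::string>(),
                                 std::string("processor"));
#endif
    if (n_threads == 0) {
        // hardware_concurrency() may return 0 when it cannot determine a value
        n_threads = std::max(1, (int) std::thread::hardware_concurrency());
    }
    return n_threads;
}

int main() {
    std::printf("default n_threads = %d\n", default_n_threads());
    return 0;
}
```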