hace 2 años · e8b8d32e86
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -504,9 +504,11 @@ struct llama_server_context
 
				                                            });
			
 
				         }
			
 
				 
			
 
				+        bool tg = true;
			
 
				         while (n_past < embd.size())
			
 
				         {
			
 
				             int n_eval = (int)embd.size() - n_past;
			
 
				+            tg = n_eval == 1;
			
 
				             if (n_eval > params.n_batch)
			
 
				             {
			
 
				                 n_eval = params.n_batch;
			
@@ -633,7 +635,9 @@ struct llama_server_context
 
				 
			
 
				             last_n_tokens.erase(last_n_tokens.begin());
			
 
				             last_n_tokens.push_back(result.tok);
			
 
				-            num_tokens_predicted++;
			
 
				+            if (tg) {
			
 
				+                num_tokens_predicted++;
			
 
				+            }
			
 
				         }
			
 
				 
			
 
				         // add it to the context
			
@@ -1124,8 +1128,6 @@ static json format_timings(llama_server_context &llama)
 
				 {
			
 
				     const auto timings = llama_get_timings(llama.ctx);
			
 
				 
			
 
				-    assert(timings.n_eval == ptrdiff_t(llama.num_tokens_predicted));
			
 
				-
			
 
				     return json{
			
 
				         {"prompt_n", timings.n_p_eval},
			
 
				         {"prompt_ms", timings.t_p_eval_ms},