2 года назад · 0df7d63e5b
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -10,10 +10,10 @@ on:
 
				   push:
			
 
				     branches:
			
 
				       - master
			
 
				-    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']
			
 
				+    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp']
			
 
				   pull_request:
			
 
				     types: [opened, synchronize, reopened]
			
 
				-    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']
			
 
				+    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp']
			
 
				 
			
 
				 env:
			
 
				  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
			
@@ -157,15 +157,15 @@ jobs:
 
				       matrix:
			
 
				         include:
			
 
				           - build: 'avx2'
			
 
				-            defines: ''
			
 
				+            defines: '-DLLAMA_BUILD_SERVER=ON'
			
 
				           - build: 'avx'
			
 
				-            defines: '-DLLAMA_AVX2=OFF'
			
 
				+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF'
			
 
				           - build: 'avx512'
			
 
				-            defines: '-DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
			
 
				+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
			
 
				           - build: 'clblast'
			
 
				-            defines: '-DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
			
 
				+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
			
 
				           - build: 'openblas'
			
 
				-            defines: '-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
			
 
				+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
			
 
				 
			
 
				     steps:
			
 
				       - name: Clone
			
@@ -292,7 +292,7 @@ jobs:
 
				         run: |
			
 
				           mkdir build
			
 
				           cd build
			
 
				-          cmake .. -DLLAMA_CUBLAS=ON
			
 
				+          cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON
			
 
				           cmake --build . --config Release
			
 
				 
			
 
				       - name: Get commit hash
			
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,11 @@
 
				 # Define the default target now so that it is always the first target
			
 
				-default: main quantize quantize-stats perplexity embedding vdot
			
 
				+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot
			
 
				+
			
 
				+ifdef LLAMA_BUILD_SERVER
			
 
				+	BUILD_TARGETS += server
			
 
				+endif
			
 
				+
			
 
				+default: $(BUILD_TARGETS)
			
 
				 
			
 
				 ifndef UNAME_S
			
 
				 UNAME_S := $(shell uname -s)
			
@@ -210,7 +216,7 @@ libllama.so: llama.o ggml.o $(OBJS)
 
				 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
			
 
				 
			
 
				 clean:
			
 
				-	rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state build-info.h
			
 
				+	rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server vdot build-info.h
			
 
				 
			
 
				 #
			
 
				 # Examples
			
@@ -237,6 +243,9 @@ embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o
 
				 save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
			
 
				 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
			
 
				 
			
 
				+server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
			
 
				+	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
			
 
				+
			
 
				 build-info.h: $(wildcard .git/index) scripts/build-info.sh
			
 
				 	@sh scripts/build-info.sh > $@.tmp
			
 
				 	@if ! cmp -s $@.tmp $@; then \
			
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -61,7 +61,7 @@ struct llama_server_context
 
				     std::vector<llama_token> prompt_tokens = ::llama_tokenize(ctx, params.prompt, true);
			
 
				     // compare the evaluated prompt with the new prompt
			
 
				     int new_prompt_len = 0;
			
 
				-    for (int i = 0;i < prompt_tokens.size(); i++) {
			
 
				+    for (size_t i = 0; i < prompt_tokens.size(); i++) {
			
 
				       if (i < processed_tokens.size() &&
			
 
				         processed_tokens[i] == prompt_tokens[i])
			
 
				       {
			
@@ -71,7 +71,7 @@ struct llama_server_context
 
				       {
			
 
				         embd_inp.push_back(prompt_tokens[i]);
			
 
				         if(new_prompt_len == 0) {
			
 
				-          if(i - 1 < n_past) {
			
 
				+          if(int32_t(i) - 1 < n_past) {
			
 
				             processed_tokens.erase(processed_tokens.begin() + i, processed_tokens.end());
			
 
				           }
			
 
				           // Evaluate the new fragment prompt from the last token processed.
			
@@ -136,7 +136,7 @@ struct llama_server_context
 
				     {
			
 
				       // out of user input, sample next token
			
 
				       const float temp = params.temp;
			
 
				-      const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
			
 
				+      // const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
			
 
				       const float top_p = params.top_p;
			
 
				       const float tfs_z = params.tfs_z;
			
 
				       const float typical_p = params.typical_p;
			
@@ -306,12 +306,12 @@ struct llama_server_context
 
				     // Avoid adding the no-show words to the response
			
 
				     for (std::vector<llama_token> word_tokens : no_show_words)
			
 
				     {
			
 
				-      int match_token = 1;
			
 
				+      size_t match_token = 1;
			
 
				       if (tokens_predicted.front() == word_tokens.front())
			
 
				       {
			
 
				         bool execute_matching = true;
			
 
				         if (tokens_predicted.size() > 1) { // if previous tokens had been tested
			
 
				-          for (int i = 1; i < word_tokens.size(); i++)
			
 
				+          for (size_t i = 1; i < word_tokens.size(); i++)
			
 
				           {
			
 
				             if (i >= tokens_predicted.size()) {
			
 
				               match_token = i;
			
@@ -601,7 +601,7 @@ int main(int argc, char **argv)
 
				 
			
 
				   Server svr;
			
 
				 
			
 
				-  svr.Get("/", [](const Request &req, Response &res)
			
 
				+  svr.Get("/", [](const Request &, Response &res)
			
 
				           { res.set_content("<h1>llama.cpp server works</h1>", "text/html"); });
			
 
				 
			
 
				   svr.Post("/completion", [&llama](const Request &req, Response &res)
			
@@ -649,7 +649,7 @@ int main(int argc, char **argv)
 
				                       {"tokens_predicted", llama.num_tokens_predicted}};
			
 
				                   return res.set_content(data.dump(), "application/json");
			
 
				                 }
			
 
				-                catch (json::exception e)
			
 
				+                catch (const json::exception &e)
			
 
				                 {
			
 
				                   // Some tokens have bad UTF-8 strings, the json parser is very sensitive
			
 
				                   json data = {
			
@@ -701,7 +701,7 @@ int main(int argc, char **argv)
 
				                         {"content", result },
			
 
				                         {"stop", !llama.has_next_token }};
			
 
				               return res.set_content(data.dump(), "application/json");
			
 
				-            } catch (json::exception e) {
			
 
				+            } catch (const json::exception &e) {
			
 
				               // Some tokens have bad UTF-8 strings, the json parser is very sensitive
			
 
				               json data = {
			
 
				                         {"content", "" },