преди 1 месец · fb644247de
--- a/.devops/cann.Dockerfile
+++ b/.devops/cann.Dockerfile
@@ -107,7 +107,7 @@ ENTRYPOINT ["/app/tools.sh"]
 
															 # ENTRYPOINT ["/app/llama-server"]
														
 
															 ### Target: light
														
 
															-# Lightweight image containing only llama-cli
														
 
															+# Lightweight image containing only llama-cli and llama-completion
														
 
															 # ==============================================================================
														
 
															 FROM base AS light
														
--- a/.devops/llama-cli-cann.Dockerfile
+++ b/.devops/llama-cli-cann.Dockerfile
@@ -23,11 +23,12 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
 
															 RUN echo "Building with static libs" && \
														
 
															     source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
														
 
															     cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_TESTS=OFF  && \
														
 
															-    cmake --build build --config Release --target llama-cli
														
 
															+    cmake --build build --config Release --target llama-cli && \
														
 
															+    cmake --build build --config Release --target llama-completion
														
 
															 # TODO: use image with NNRT
														
 
															 FROM ascendai/cann:$ASCEND_VERSION AS runtime
														
 
															-COPY --from=build /app/build/bin/llama-cli /llama-cli
														
 
															+COPY --from=build /app/build/bin/llama-cli /app/build/bin/llama-completion /
														
 
															 ENV LC_ALL=C.utf8
														
--- a/.devops/llama-cpp-cuda.srpm.spec
+++ b/.devops/llama-cpp-cuda.srpm.spec
@@ -37,6 +37,7 @@ make -j GGML_CUDA=1
 
															 %install
														
 
															 mkdir -p %{buildroot}%{_bindir}/
														
 
															 cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli
														
 
															+cp -p llama-completion %{buildroot}%{_bindir}/llama-cuda-completion
														
 
															 cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server
														
 
															 cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple
														
@@ -68,6 +69,7 @@ rm -rf %{_builddir}/*
 
															 %files
														
 
															 %{_bindir}/llama-cuda-cli
														
 
															+%{_bindir}/llama-cuda-completion
														
 
															 %{_bindir}/llama-cuda-server
														
 
															 %{_bindir}/llama-cuda-simple
														
 
															 /usr/lib/systemd/system/llamacuda.service
														
--- a/.devops/llama-cpp.srpm.spec
+++ b/.devops/llama-cpp.srpm.spec
@@ -39,6 +39,7 @@ make -j
 
															 %install
														
 
															 mkdir -p %{buildroot}%{_bindir}/
														
 
															 cp -p llama-cli %{buildroot}%{_bindir}/llama-cli
														
 
															+cp -p llama-completion %{buildroot}%{_bindir}/llama-completion
														
 
															 cp -p llama-server %{buildroot}%{_bindir}/llama-server
														
 
															 cp -p llama-simple %{buildroot}%{_bindir}/llama-simple
														
@@ -70,6 +71,7 @@ rm -rf %{_builddir}/*
 
															 %files
														
 
															 %{_bindir}/llama-cli
														
 
															+%{_bindir}/llama-completion
														
 
															 %{_bindir}/llama-server
														
 
															 %{_bindir}/llama-simple
														
 
															 /usr/lib/systemd/system/llama.service
														
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -7,9 +7,9 @@
 
															 ## Images
														
 
															 We have three Docker images available for this project:
														
 
															-1. `ghcr.io/ggml-org/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
														
 
															-2. `ghcr.io/ggml-org/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
														
 
															-3. `ghcr.io/ggml-org/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
														
 
															+1. `ghcr.io/ggml-org/llama.cpp:full`: This image includes both the `llama-cli` and `llama-completion` executables and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
														
 
															+2. `ghcr.io/ggml-org/llama.cpp:light`: This image only includes the `llama-cli` and `llama-completion` executables. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
														
 
															+3. `ghcr.io/ggml-org/llama.cpp:server`: This image only includes the `llama-server` executable. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
														
 
															 Additionally, there are the following images, similar to the above:
														
@@ -44,13 +44,15 @@ docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --all-in-o
 
															 On completion, you are ready to play!
														
 
															 ```bash
														
 
															-docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
														
 
															+docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf
														
 
															+docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --run-legacy -m /models/32B/ggml-model-q8_0.gguf -no-cnv -p "Building a mobile app can be done in 15 steps:" -n 512
														
 
															 ```
														
 
															 or with a light image:
														
 
															 ```bash
														
 
															-docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
														
 
															+docker run -v /path/to/models:/models --entrypoint /app/llama-cli ghcr.io/ggml-org/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf
														
 
															+docker run -v /path/to/models:/models --entrypoint /app/llama-completion ghcr.io/ggml-org/llama.cpp:light -m /models/32B/ggml-model-q8_0.gguf -no-cnv -p "Building a mobile app can be done in 15 steps:" -n 512
														
 
															 ```
														
 
															 or with a server image:
														
@@ -59,6 +61,8 @@ or with a server image:
 
															 docker run -v /path/to/models:/models -p 8080:8080 ghcr.io/ggml-org/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8080 --host 0.0.0.0 -n 512
														
 
															 ```
														
 
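															+In the light image examples above, `--entrypoint /app/llama-cli` is specified for clarity, but you can safely omit it since it's the default entrypoint in that container. To run `llama-completion` instead, `--entrypoint /app/llama-completion` must be specified explicitly.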
														
 
															+
														
 
															 ## Docker With CUDA
														
 
															 Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) properly installed on Linux, or is using a GPU enabled cloud, `cuBLAS` should be accessible inside the container.
														
@@ -80,9 +84,9 @@ The defaults are:
 
															 The resulting images are essentially the same as the non-CUDA images:
														
 
															-1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
														
 
															-2. `local/llama.cpp:light-cuda`: This image only includes the main executable file.
														
 
															-3. `local/llama.cpp:server-cuda`: This image only includes the server executable file.
														
 
															+1. `local/llama.cpp:full-cuda`: This image includes both the `llama-cli` and `llama-completion` executables and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
														
 
															+2. `local/llama.cpp:light-cuda`: This image only includes the `llama-cli` and `llama-completion` executables.
														
 
															+3. `local/llama.cpp:server-cuda`: This image only includes the `llama-server` executable.
														
 
															 ## Usage
														
@@ -114,9 +118,9 @@ The defaults are:
 
															 The resulting images are essentially the same as the non-MUSA images:
														
 
															-1. `local/llama.cpp:full-musa`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
														
 
															-2. `local/llama.cpp:light-musa`: This image only includes the main executable file.
														
 
															-3. `local/llama.cpp:server-musa`: This image only includes the server executable file.
														
 
															+1. `local/llama.cpp:full-musa`: This image includes both the `llama-cli` and `llama-completion` executables and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
														
 
															+2. `local/llama.cpp:light-musa`: This image only includes the `llama-cli` and `llama-completion` executables.
														
 
															+3. `local/llama.cpp:server-musa`: This image only includes the `llama-server` executable.
														
 
															 ## Usage