
CLI: fixed adding cli and completion into docker containers, improved docs (#18003)

Co-authored-by: Andrew Aladjev <andrew.aladjev@gmail.com>
Andrew Aladjev 1 month ago
parent commit fb644247de

+ 1 - 1
.devops/cann.Dockerfile

@@ -107,7 +107,7 @@ ENTRYPOINT ["/app/tools.sh"]
 # ENTRYPOINT ["/app/llama-server"]
 
 ### Target: light
-# Lightweight image containing only llama-cli
+# Lightweight image containing only llama-cli and llama-completion
 # ==============================================================================
 FROM base AS light
 

+ 3 - 2
.devops/llama-cli-cann.Dockerfile

@@ -23,11 +23,12 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
 RUN echo "Building with static libs" && \
     source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
     cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_TESTS=OFF  && \
-    cmake --build build --config Release --target llama-cli
+    cmake --build build --config Release --target llama-cli && \
+    cmake --build build --config Release --target llama-completion
 
 # TODO: use image with NNRT
 FROM ascendai/cann:$ASCEND_VERSION AS runtime
-COPY --from=build /app/build/bin/llama-cli /llama-cli
+COPY --from=build /app/build/bin/llama-cli /app/build/bin/llama-completion /
 
 ENV LC_ALL=C.utf8
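
Because `COPY` now has multiple sources, Docker requires the destination to be a directory, hence the bare `/`; both binaries land at the image root. A hedged way to confirm this after building (the image tag is a placeholder):

```bash
# Build the image and list the binaries copied to /.
docker build -f .devops/llama-cli-cann.Dockerfile -t llama-cli-cann .
docker run --rm --entrypoint ls llama-cli-cann -l /llama-cli /llama-completion
```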
 

+ 2 - 0
.devops/llama-cpp-cuda.srpm.spec

@@ -37,6 +37,7 @@ make -j GGML_CUDA=1
 %install
 mkdir -p %{buildroot}%{_bindir}/
 cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli
+cp -p llama-completion %{buildroot}%{_bindir}/llama-cuda-completion
 cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server
 cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple
 
@@ -68,6 +69,7 @@ rm -rf %{_builddir}/*
 
 %files
 %{_bindir}/llama-cuda-cli
+%{_bindir}/llama-cuda-completion
 %{_bindir}/llama-cuda-server
 %{_bindir}/llama-cuda-simple
 /usr/lib/systemd/system/llamacuda.service
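
With the spec updated, a rebuild should package the new binary alongside the existing ones. A hedged check, assuming the default `~/rpmbuild` layout and an x86_64 build:

```bash
# Build the binary RPM from the spec and confirm the new file is listed.
rpmbuild -bb .devops/llama-cpp-cuda.srpm.spec
rpm -qlp ~/rpmbuild/RPMS/x86_64/llama-cpp-cuda-*.rpm | grep llama-cuda-completion
```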

+ 2 - 0
.devops/llama-cpp.srpm.spec

@@ -39,6 +39,7 @@ make -j
 %install
 mkdir -p %{buildroot}%{_bindir}/
 cp -p llama-cli %{buildroot}%{_bindir}/llama-cli
+cp -p llama-completion %{buildroot}%{_bindir}/llama-completion
 cp -p llama-server %{buildroot}%{_bindir}/llama-server
 cp -p llama-simple %{buildroot}%{_bindir}/llama-simple
 
@@ -70,6 +71,7 @@ rm -rf %{_builddir}/*
 
 %files
 %{_bindir}/llama-cli
+%{_bindir}/llama-completion
 %{_bindir}/llama-server
 %{_bindir}/llama-simple
 /usr/lib/systemd/system/llama.service
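
The non-CUDA spec mirrors the same change. After installing the rebuilt package, both tools should resolve on `PATH`; a quick hedged check:

```bash
# Verify the packaged CLI tools are installed and on PATH.
command -v llama-cli llama-completion llama-server
```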

+ 15 - 11
docs/docker.md

@@ -7,9 +7,9 @@
 ## Images
 We have three Docker images available for this project:
 
-1. `ghcr.io/ggml-org/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
-2. `ghcr.io/ggml-org/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
-3. `ghcr.io/ggml-org/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
+1. `ghcr.io/ggml-org/llama.cpp:full`: This image includes both the `llama-cli` and `llama-completion` executables and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
+2. `ghcr.io/ggml-org/llama.cpp:light`: This image only includes the `llama-cli` and `llama-completion` executables. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
+3. `ghcr.io/ggml-org/llama.cpp:server`: This image only includes the `llama-server` executable. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
 
 Additionally, there are the following images, similar to the above:
 
@@ -44,13 +44,15 @@ docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --all-in-o
 On completion, you are ready to play!
 
 ```bash
-docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
+docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf
+docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --run-legacy -m /models/32B/ggml-model-q8_0.gguf -no-cnv -p "Building a mobile app can be done in 15 steps:" -n 512
 ```
 
 or with a light image:
 
 ```bash
-docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
+docker run -v /path/to/models:/models --entrypoint /app/llama-cli ghcr.io/ggml-org/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf
+docker run -v /path/to/models:/models --entrypoint /app/llama-completion ghcr.io/ggml-org/llama.cpp:light -m /models/32B/ggml-model-q8_0.gguf -no-cnv -p "Building a mobile app can be done in 15 steps:" -n 512
 ```
 
 or with a server image:
@@ -59,6 +61,8 @@ or with a server image:
 docker run -v /path/to/models:/models -p 8080:8080 ghcr.io/ggml-org/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8080 --host 0.0.0.0 -n 512
 ```
 
+In the above examples, `--entrypoint /app/llama-cli` is specified for clarity, but you can safely omit it since it's the default entrypoint in the container.
+
 ## Docker With CUDA
 
 Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) properly installed on Linux, or is using a GPU enabled cloud, `cuBLAS` should be accessible inside the container.
@@ -80,9 +84,9 @@ The defaults are:
 
 The resulting images are essentially the same as the non-CUDA images:
 
-1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
-2. `local/llama.cpp:light-cuda`: This image only includes the main executable file.
-3. `local/llama.cpp:server-cuda`: This image only includes the server executable file.
+1. `local/llama.cpp:full-cuda`: This image includes both the `llama-cli` and `llama-completion` executables and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
+2. `local/llama.cpp:light-cuda`: This image only includes the `llama-cli` and `llama-completion` executables.
+3. `local/llama.cpp:server-cuda`: This image only includes the `llama-server` executable.
 
 ## Usage
 
@@ -114,9 +118,9 @@ The defaults are:
 
 The resulting images are essentially the same as the non-MUSA images:
 
-1. `local/llama.cpp:full-musa`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
-2. `local/llama.cpp:light-musa`: This image only includes the main executable file.
-3. `local/llama.cpp:server-musa`: This image only includes the server executable file.
+1. `local/llama.cpp:full-musa`: This image includes both the `llama-cli` and `llama-completion` executables and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
+2. `local/llama.cpp:light-musa`: This image only includes the `llama-cli` and `llama-completion` executables.
+3. `local/llama.cpp:server-musa`: This image only includes the `llama-server` executable.
 
 ## Usage
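
The `## Usage` sections themselves are unchanged by this commit. For completeness, a hedged CUDA invocation mirroring the updated light-image examples above; `--gpus all` assumes the nvidia-container-toolkit setup described earlier in the docs, and `-ngl 99` (offload all layers) is illustrative:

```bash
# Run llama-completion from the local CUDA light image; flags mirror the
# updated docs examples, -ngl 99 is an illustrative offload setting.
docker run --gpus all -v /path/to/models:/models --entrypoint /app/llama-completion \
    local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -ngl 99 -no-cnv \
    -p "Building a website can be done in 10 simple steps:" -n 512
```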