
docker : add server-first container images (#5157)

* feat: add Dockerfiles for each platform that use ./server instead of ./main

* feat: update .github/workflows/docker.yml to build server-first docker containers

* doc: add information about running the server with Docker to README.md

* doc: add information about running with docker to the server README

* doc: update n-gpu-layers to show correct GPU usage

* fix(doc): update container tag from `server` to `server-cuda` for README example on running server container with CUDA
Kyle Mistele committed 2 years ago (commit 39baaf55a1)

+ 32 - 0
.devops/server-cuda.Dockerfile

@@ -0,0 +1,32 @@
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG CUDA_VERSION=11.7.1
+# Target the CUDA build image
+ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+# Target the CUDA runtime image
+ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_CUDA_DEV_CONTAINER} as build
+
+# Unless otherwise specified, we make a fat build.
+ARG CUDA_DOCKER_ARCH=all
+
+RUN apt-get update && \
+    apt-get install -y build-essential git
+
+WORKDIR /app
+
+COPY . .
+
+# Set nvcc architecture
+ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
+# Enable cuBLAS
+ENV LLAMA_CUBLAS=1
+
+RUN make
+
+FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
+
+COPY --from=build /app/server /server
+
+ENTRYPOINT [ "/server" ]

+ 25 - 0
.devops/server-intel.Dockerfile

@@ -0,0 +1,25 @@
+ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
+ARG UBUNTU_VERSION=22.04
+
+FROM intel/hpckit:$ONEAPI_VERSION as build
+
+RUN apt-get update && \
+    apt-get install -y git
+
+WORKDIR /app
+
+COPY . .
+
+# For some reason, "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DLLAMA_NATIVE=ON" gives worse performance
+RUN mkdir build && \
+    cd build && \
+    cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx && \
+    cmake --build . --config Release --target main server
+
+FROM ubuntu:$UBUNTU_VERSION as runtime
+
+COPY --from=build /app/build/bin/server /server
+
+ENV LC_ALL=C.utf8
+
+ENTRYPOINT [ "/server" ]

+ 45 - 0
.devops/server-rocm.Dockerfile

@@ -0,0 +1,45 @@
+ARG UBUNTU_VERSION=22.04
+
+# This needs to generally match the container host's environment.
+ARG ROCM_VERSION=5.6
+
+# Target the ROCm build image
+ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
+
+FROM ${BASE_ROCM_DEV_CONTAINER} as build
+
+# Unless otherwise specified, we make a fat build.
+# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
+# This is mostly tied to rocBLAS supported archs.
+ARG ROCM_DOCKER_ARCH=\
+    gfx803 \
+    gfx900 \
+    gfx906 \
+    gfx908 \
+    gfx90a \
+    gfx1010 \
+    gfx1030 \
+    gfx1100 \
+    gfx1101 \
+    gfx1102
+
+COPY requirements.txt   requirements.txt
+COPY requirements       requirements
+
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt
+
+WORKDIR /app
+
+COPY . .
+
+# Set ROCm GPU architecture targets
+ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
+# Enable ROCm
+ENV LLAMA_HIPBLAS=1
+ENV CC=/opt/rocm/llvm/bin/clang
+ENV CXX=/opt/rocm/llvm/bin/clang++
+
+RUN make
+
+ENTRYPOINT [ "/app/server" ]

+ 20 - 0
.devops/server.Dockerfile

@@ -0,0 +1,20 @@
+ARG UBUNTU_VERSION=22.04
+
+FROM ubuntu:$UBUNTU_VERSION as build
+
+RUN apt-get update && \
+    apt-get install -y build-essential git
+
+WORKDIR /app
+
+COPY . .
+
+RUN make
+
+FROM ubuntu:$UBUNTU_VERSION as runtime
+
+COPY --from=build /app/server /server
+
+ENV LC_ALL=C.utf8
+
+ENTRYPOINT [ "/server" ]

+ 4 - 0
.github/workflows/docker.yml

@@ -28,14 +28,18 @@ jobs:
        config:
          - { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          - { tag: "server", dockerfile: ".devops/server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          # NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
          #                     have disabled them for now until the reason why
          #                     is understood.
          - { tag: "light-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platforms: "linux/amd64" }
          - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "server-cuda", dockerfile: ".devops/server-cuda.Dockerfile", platforms: "linux/amd64" }
          - { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          - { tag: "server-rocm", dockerfile: ".devops/server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          - { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
    steps:
      - name: Check out the repo
        uses: actions/checkout@v3

+ 13 - 1
README.md

@@ -931,17 +931,20 @@ Place your desired model into the `~/llama.cpp/models/` directory and execute th
 * Create a folder to store big models & intermediate files (ex. /llama/models)

 #### Images
-We have two Docker images available for this project:
+We have three Docker images available for this project:

 1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
 2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
+3. `ghcr.io/ggerganov/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`)

 Additionally, there are the following images, similar to the above:

 - `ghcr.io/ggerganov/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`)
 - `ghcr.io/ggerganov/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`)
+- `ghcr.io/ggerganov/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`)
 - `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
 - `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
+- `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)

 The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library), you'll need to build the images locally for now.

@@ -967,6 +970,12 @@ or with a light image:
 docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
 ```

+or with a server image:
+
+```bash
+docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggerganov/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512
+```
+
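
Once the server container from the example above is running, a quick way to confirm it is responding is a POST to the `/completion` endpoint. This curl call is an editorial check, not part of the diff; it assumes the container is publishing port 8000 as configured above.

```bash
# Ask the containerized server for a short completion.
curl --request POST --url http://localhost:8000/completion \
    --header "Content-Type: application/json" \
    --data '{"prompt": "Building a website can be done in 10 simple steps:", "n_predict": 64}'
```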
 ### Docker With CUDA

 Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) properly installed on Linux, or is using a GPU enabled cloud, `cuBLAS` should be accessible inside the container.
@@ -976,6 +985,7 @@ Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia
 ```bash
 docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
 docker build -t local/llama.cpp:light-cuda -f .devops/main-cuda.Dockerfile .
+docker build -t local/llama.cpp:server-cuda -f .devops/server-cuda.Dockerfile .
 ```

 You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture.

@@ -989,6 +999,7 @@ The resulting images, are essentially the same as the non-CUDA images:

 1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
 2. `local/llama.cpp:light-cuda`: This image only includes the main executable file.
+3. `local/llama.cpp:server-cuda`: This image only includes the server executable file.

 #### Usage

@@ -997,6 +1008,7 @@ After building locally, Usage is similar to the non-CUDA examples, but you'll ne
 ```bash
 docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
 docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
+docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
 ```

 ### Contributing

+ 8 - 0
examples/server/README.md

@@ -66,6 +66,14 @@ server.exe -m models\7B\ggml-model.gguf -c 2048
 The above command will start a server that by default listens on `127.0.0.1:8080`.
 You can consume the endpoints with Postman, or with NodeJS using the axios library. You can visit the web front end at the same URL.

+### Docker:
+```bash
+docker run -p 8080:8080 -v /path/to/models:/models ggerganov/llama.cpp:server -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080
+
+# or, with CUDA:
+docker run -p 8080:8080 -v /path/to/models:/models --gpus all ggerganov/llama.cpp:server-cuda -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080 --n-gpu-layers 99
+```
+
 ## Testing with CURL

 Using [curl](https://curl.se/). On Windows `curl.exe` should be available in the base OS.