@@ -1,22 +1,15 @@
# llama.cpp for SYCL

-[Background](#background)
-
-[OS](#os)
-
-[Intel GPU](#intel-gpu)
-
-[Linux](#linux)
-
-[Windows](#windows)
-
-[Environment Variable](#environment-variable)
-
-[Known Issue](#known-issue)
-
-[Q&A](#q&a)
-
-[Todo](#todo)
+- [Background](#background)
+- [OS](#os)
+- [Intel GPU](#intel-gpu)
+- [Docker](#docker)
+- [Linux](#linux)
+- [Windows](#windows)
+- [Environment Variable](#environment-variable)
+- [Known Issue](#known-issue)
+- [Q&A](#qa)
+- [Todo](#todo)

## Background

@@ -36,7 +29,7 @@ For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).

|OS|Status|Verified|
|-|-|-|
-|Linux|Support|Ubuntu 22.04|
+|Linux|Support|Ubuntu 22.04, Fedora Silverblue 39|
|Windows|Support|Windows 11|

@@ -50,7 +43,7 @@ For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
|Intel Data Center Flex Series| Support| Flex 170|
|Intel Arc Series| Support| Arc 770, 730M|
|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
-|Intel iGPU| Support| iGPU in i5-1250P, i7-1165G7|
+|Intel iGPU| Support| iGPU in i5-1250P, i7-1260P, i7-1165G7|

Note: If the iGPU has fewer than 80 EUs (Execution Units), the inference speed will be too slow for practical use.

@@ -64,6 +57,38 @@ For iGPU, please make sure the shared memory from host memory is enough. For lla

For dGPU, please make sure there is enough device memory. For llama-2-7b.Q4_0, 4GB+ of device memory is recommended.

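+A quick way to check the EU count and device memory is the OpenCL device info (a suggested check, assuming `clinfo` from the [Linux](#linux) setup below; on Intel GPUs, "Max compute units" corresponds to the EU count):
+
+```sh
+# Show the name, EU count, and memory size of each OpenCL device:
+clinfo | grep -iE "device name|max compute units|global memory size"
+```
+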
+## Docker
+
+Note:
+- Only Docker on Linux is tested. Docker on WSL may not work.
+- You may need to install the Intel GPU driver on the host machine (see the [Linux](#linux) section for instructions).
+
+### Build the image
+
+You can choose between the **F16** and **F32** builds. F16 is faster for long-prompt inference.
+
+```sh
+# For F16:
+#docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/main-intel.Dockerfile .
+
+# Or, for F32:
+docker build -t llama-cpp-sycl -f .devops/main-intel.Dockerfile .
+
+# Note: you can also use the ".devops/main-server.Dockerfile", which compiles the "server" example
+```
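+
+To confirm the image was created (plain Docker CLI, just a quick check):
+
+```sh
+docker images llama-cpp-sycl
+```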
+
+### Run
+
+```sh
+# First, list the DRI devices:
+ls -la /dev/dri
+# Then pick the card you want to use, for example "/dev/dri/card1":
+docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-sycl -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
+```
+
## Linux

### Setup Environment
@@ -76,7 +101,7 @@ Note: for iGPU, please install the client GPU driver.

b. Add the user to the video and render groups.

-```
+```sh
sudo usermod -aG render username
sudo usermod -aG video username
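+
+# After re-login (see the note below), verify the membership (a suggested check, "groups" is standard coreutils):
+groups username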
```
@@ -85,7 +110,7 @@ Note: re-login to enable it.

c. Check

-```
+```sh
sudo apt install clinfo
sudo clinfo -l
```
@@ -103,7 +128,6 @@ Platform #0: Intel(R) OpenCL HD Graphics

2. Install Intel® oneAPI Base toolkit.

-
a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).

It is recommended to install to the default folder: **/opt/intel/oneapi**.
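+
+One possible route on Ubuntu is Intel's apt repository (a sketch; the repository URL and package name are assumptions based on Intel's public docs, so prefer the guide linked above):
+
+```sh
+# Add Intel's signing key and the oneAPI apt repository:
+wget -qO- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
+  | sudo gpg --dearmor -o /usr/share/keyrings/oneapi-archive-keyring.gpg
+echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" \
+  | sudo tee /etc/apt/sources.list.d/oneAPI.list
+# Install the Base Toolkit:
+sudo apt update && sudo apt install -y intel-basekit
+```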
@@ -112,7 +136,7 @@ Following guide use the default folder as example. If you use other folder, plea

b. Check

-```
+```sh
source /opt/intel/oneapi/setvars.sh

sycl-ls
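+
+# Optionally, narrow the listing to Level Zero devices (SYCL_DEVICE_FILTER is a oneAPI runtime variable, not llama.cpp-specific):
+SYCL_DEVICE_FILTER=level_zero sycl-ls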
@@ -131,21 +155,25 @@ Output (example):

2. Build locally:

-```
+Note:
+- You can choose between the **F16** and **F32** builds. F16 is faster for long-prompt inference.
+- By default, all binaries are built, which takes longer. To reduce the build time, we recommend building **example/main** only.
+
+```sh
mkdir -p build
cd build
source /opt/intel/oneapi/setvars.sh

-#for FP16
-#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference
+# For FP16:
+#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON

-#for FP32
+# Or, for FP32:
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx

-#build example/main only
+# Build example/main only:
#cmake --build . --config Release --target main

-#build all binary
+# Or, build all binaries:
cmake --build . --config Release -v
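+
+# Optionally parallelize with CMake's standard -j flag (a suggested addition, not in the original script):
+#cmake --build . --config Release -v -j "$(nproc)"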

cd ..
@@ -153,14 +181,10 @@ cd ..

or

-```
+```sh
./examples/sycl/build.sh
```

-Note:
-
-- By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only.
-
### Run

1. Put the model file in the folder **models**
@@ -177,10 +201,10 @@ source /opt/intel/oneapi/setvars.sh

Run without parameters:

-```
+```sh
./build/bin/ls-sycl-device

-or
+# or run the "main" executable and check the device list in its output log:

./build/bin/main
```
@@ -209,13 +233,13 @@ found 4 SYCL devices:

Set device ID = 0 with **GGML_SYCL_DEVICE=0**

-```
+```sh
GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
```
or run by script:

-```
-./examples/sycl/run-llama2.sh
+```sh
+./examples/sycl/run_llama2.sh
```

Note: