@@ -0,0 +1,391 @@
+#!/bin/bash
+#
+# Helper script for deploying llama.cpp server with a single Bash command
+#
+# - Works on Linux and macOS
+# - Supports: CPU, CUDA, Metal, OpenCL
+# - Can run all GGUF models from HuggingFace
+# - Can serve requests in parallel
+# - Always builds latest llama.cpp from GitHub
+#
+# Limitations
+#
+# - Chat templates are poorly supported (base models recommended)
+# - Might be unstable!
+#
+# Usage:
+#   ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose]
+#
+#   --port: port number, default is 8888
+#   --repo: path to a repo containing GGUF model files
+#   --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input
+#   --backend: cpu, cuda, metal, opencl, depends on the OS
+#   --gpu-id: gpu id, default is 0
+#   --n-parallel: number of parallel requests, default is 8
+#   --n-kv: KV cache size, default is 4096
+#   --verbose: verbose output
+#
+# Example:
+#
+#   bash -c "$(curl -s https://ggml.ai/server-llm.sh)"
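+#
+#   or, with explicit options (illustrative values, not defaults):
+#
+#   ./server-llm.sh --port 8080 --repo https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF --backend cuda --n-parallel 4 --n-kv 2048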
+#
+
+set -e
+
+# required utils: curl, git, make
+if ! command -v curl &> /dev/null; then
+    printf "[-] curl not found\n"
+    exit 1
+fi
+if ! command -v git &> /dev/null; then
+    printf "[-] git not found\n"
+    exit 1
+fi
+if ! command -v make &> /dev/null; then
+    printf "[-] make not found\n"
+    exit 1
+fi
+
+# parse arguments
+port=8888
+repo=""
+wtype=""
+backend="cpu"
+
+# default backend: Metal on macOS, CUDA if nvcc is available, otherwise CPU
+if [[ "$OSTYPE" == "darwin"* ]]; then
+    backend="metal"
+elif command -v nvcc &> /dev/null; then
+    backend="cuda"
+fi
+
+gpu_id=0
+n_parallel=8
+n_kv=4096
+verbose=0
+
+function print_usage {
+    printf "Usage:\n"
+    printf "  ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose]\n\n"
+    printf "  --port: port number, default is 8888\n"
+    printf "  --repo: path to a repo containing GGUF model files\n"
+    printf "  --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input\n"
+    printf "  --backend: cpu, cuda, metal, opencl, depends on the OS\n"
+    printf "  --gpu-id: gpu id, default is 0\n"
+    printf "  --n-parallel: number of parallel requests, default is 8\n"
+    printf "  --n-kv: KV cache size, default is 4096\n"
+    printf "  --verbose: verbose output\n\n"
+    printf "Example:\n\n"
+    printf '  bash -c "$(curl -s https://ggml.ai/server-llm.sh)"\n\n'
+}
+
+while [[ $# -gt 0 ]]; do
+    key="$1"
+    case $key in
+        --port)
+            port="$2"
+            shift
+            shift
+            ;;
+        --repo)
+            repo="$2"
+            shift
+            shift
+            ;;
+        --wtype)
+            wtype="$2"
+            shift
+            shift
+            ;;
+        --backend)
+            backend="$2"
+            shift
+            shift
+            ;;
+        --gpu-id)
+            gpu_id="$2"
+            shift
+            shift
+            ;;
+        --n-parallel)
+            n_parallel="$2"
+            shift
+            shift
+            ;;
+        --n-kv)
+            n_kv="$2"
+            shift
+            shift
+            ;;
+        --verbose)
+            verbose=1
+            shift
+            ;;
+        --help)
+            print_usage
+            exit 0
+            ;;
+        *)
+            echo "Unknown argument: $key"
+            print_usage
+            exit 1
+            ;;
+    esac
+done
+
+# available weight types (GGUF quantization schemes)
+wtypes=("F16" "Q8_0" "Q4_0" "Q4_1" "Q5_0" "Q5_1" "Q6_K" "Q5_K_M" "Q5_K_S" "Q4_K_M" "Q4_K_S" "Q3_K_L" "Q3_K_M" "Q3_K_S" "Q2_K")
+
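+# wfiles is kept parallel to wtypes: wfiles[i] will hold the repo file name that
+# matches wtypes[i], filled in below when the repo's file list is scanned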
+wfiles=()
+for wt in "${wtypes[@]}"; do
+    wfiles+=("")
+done
+
+# sample repos
+repos=(
+    "https://huggingface.co/TheBloke/Llama-2-7B-GGUF"
+    "https://huggingface.co/TheBloke/Llama-2-13B-GGUF"
+    "https://huggingface.co/TheBloke/Llama-2-70B-GGUF"
+    "https://huggingface.co/TheBloke/CodeLlama-7B-GGUF"
+    "https://huggingface.co/TheBloke/CodeLlama-13B-GGUF"
+    "https://huggingface.co/TheBloke/CodeLlama-34B-GGUF"
+    "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF"
+    "https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF"
+    "https://huggingface.co/TheBloke/OpenHermes-2-Mistral-7B-GGUF"
+    "https://huggingface.co/TheBloke/CausalLM-7B-GGUF"
+)
+
+printf "\n"
+printf "[I] This is a helper script for deploying llama.cpp's server on this machine.\n\n"
+printf "    Based on the options that follow, the script might download a model file\n"
+printf "    from the internet, which can be a few GBs in size. The script will also\n"
+printf "    build the latest llama.cpp source code from GitHub, which can be unstable.\n"
+printf "\n"
+printf "    Upon success, an HTTP server will be started and it will serve the selected\n"
+printf "    model using llama.cpp for demonstration purposes.\n"
+printf "\n"
+printf "    Please note:\n"
+printf "\n"
+printf "    - All new data will be stored in the current folder\n"
+printf "    - The server will be listening on all network interfaces\n"
+printf "    - The server will run with default settings which are not always optimal\n"
+printf "    - Do not judge the quality of a model based on the results from this script\n"
+printf "    - Do not use this script to benchmark llama.cpp\n"
+printf "    - Do not use this script in production\n"
+printf "    - This script is only for demonstration purposes\n"
+printf "\n"
+printf "    If you don't know what you are doing, please press Ctrl-C to abort now\n"
+printf "\n"
+printf "    Press Enter to continue ...\n\n"
+
+read
+
+if [[ -z "$repo" ]]; then
+    printf "[+] No repo provided from the command line\n"
+    printf "    Please select a number from the list below or enter a URL:\n\n"
+
+    is=0
+    for r in "${repos[@]}"; do
+        printf "    %2d) %s\n" $is "$r"
+        is=$((is+1))
+    done
+
+    # ask for a repo until the index of a sample repo or a URL is provided
+    while [[ -z "$repo" ]]; do
+        printf "\n    Or choose one from: https://huggingface.co/models?sort=trending&search=gguf\n\n"
+        read -p "[+] Select repo: " repo
+
+        # check if the input is a number
+        if [[ "$repo" =~ ^[0-9]+$ ]]; then
+            if [[ "$repo" -ge 0 && "$repo" -lt ${#repos[@]} ]]; then
+                repo="${repos[$repo]}"
+            else
+                printf "[-] Invalid repo index: %s\n" "$repo"
+                repo=""
+            fi
+        elif [[ "$repo" =~ ^https?:// ]]; then
+            repo="$repo"
+        else
+            printf "[-] Invalid repo URL: %s\n" "$repo"
+            repo=""
+        fi
+    done
+fi
+
+# remove the /tree/main suffix if present
+repo=$(echo "$repo" | sed -E 's/\/tree\/main$//g')
+
+printf "[+] Checking for GGUF model files in %s\n" "$repo"
+
+# find GGUF files in the source
+# TODO: better logic
+model_tree="${repo%/}/tree/main"
+model_files=$(curl -s "$model_tree" | grep -i "\\.gguf</span>" | sed -E 's/.*<span class="truncate group-hover:underline">(.*)<\/span><\/a>/\1/g')
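+# (The line above scrapes the HTML of the repo's /tree/main page, so it breaks
+#  whenever Hugging Face changes its markup. A sketch of a sturdier alternative,
+#  assuming the Hugging Face JSON API at /api/models/<owner>/<repo>/tree/main and
+#  python3 being available:
+#
+#      api_url=$(echo "$repo" | sed 's#huggingface.co/#huggingface.co/api/models/#')/tree/main
+#      model_files=$(curl -s "$api_url" | python3 -c 'import json,sys; print("\n".join(f["path"] for f in json.load(sys.stdin) if f["path"].endswith(".gguf")))')
+# )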
+
+# list all GGUF model files found in the repo
+printf "[+] Model files:\n\n"
+for file in $model_files; do
+    # determine iw - the index in wtypes that matches the quantization suffix in the filename
+    iw=-1
+    is=0
+    for wt in "${wtypes[@]}"; do
+        # uppercase
+        ufile=$(echo "$file" | tr '[:lower:]' '[:upper:]')
+        if [[ "$ufile" =~ "$wt" ]]; then
+            iw=$is
+            break
+        fi
+        is=$((is+1))
+    done
+
+    if [[ $iw -eq -1 ]]; then
+        continue
+    fi
+
+    wfiles[$iw]="$file"
+
+    have=" "
+    if [[ -f "$file" ]]; then
+        have="*"
+    fi
+
+    printf "    %2d) %s %s\n" $iw "$have" "$file"
+done
+
+# ask for weights type until provided and available
+while [[ -z "$wtype" ]]; do
+    printf "\n"
+    read -p "[+] Select weight type: " wtype
+    wfile="${wfiles[$wtype]}"
+
+    if [[ -z "$wfile" ]]; then
+        printf "[-] Invalid weight type: %s\n" "$wtype"
+        wtype=""
+    fi
+done
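+# Note: the prompt above expects the numeric index printed next to each model file.
+# If --wtype was passed on the command line, the loop is skipped and $wfile stays
+# empty; a minimal lookup sketch for that case (an assumption, not in the original
+# flow), matching the flag value against the wtypes list case-insensitively:
+#
+#     if [[ -n "$wtype" && -z "$wfile" ]]; then
+#         u=$(echo "$wtype" | tr '[:lower:]' '[:upper:]')
+#         is=0
+#         for wt in "${wtypes[@]}"; do
+#             if [[ "$wt" == "$u" ]]; then wfile="${wfiles[$is]}"; fi
+#             is=$((is+1))
+#         done
+#     fi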
+
+printf "[+] Selected weight type: %s (%s)\n" "$wtype" "$wfile"
+
+url="${repo%/}/resolve/main/$wfile"
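+# the resolve/main URL is what Hugging Face serves raw files from, e.g. (illustrative)
+# https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF/resolve/main/<file>.Q4_K_M.gguf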
+
+# marker file used to tell whether the model has already been downloaded
+chk="$wfile.chk"
+
+# check if we should download the file
+# - if $wfile does not exist
+# - if $wfile exists but $chk does not exist
+# - if $wfile exists and $chk exists but $wfile is newer than $chk
+# TODO: better logic using git lfs info
+
+do_download=0
+
+if [[ ! -f "$wfile" ]]; then
+    do_download=1
+elif [[ ! -f "$chk" ]]; then
+    do_download=1
+elif [[ "$wfile" -nt "$chk" ]]; then
+    do_download=1
+fi
+
+if [[ $do_download -eq 1 ]]; then
+    printf "[+] Downloading weights from %s\n" "$url"
+
+    # download the weights file
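+    # -L follows the redirect from Hugging Face to the actual LFS object and -# shows
+    # a progress bar; note that with "set -e" a failed download aborts the script
+    # before the success check below is reached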
+    curl -o "$wfile" -# -L "$url"
+
+    # create a check file if successful
+    if [[ $? -eq 0 ]]; then
+        printf "[+] Creating check file %s\n" "$chk"
+        touch "$chk"
+    fi
+else
+    printf "[+] Using cached weights %s\n" "$wfile"
+fi
+
+# get latest llama.cpp and build
+
+printf "[+] Downloading latest llama.cpp\n"
+
+llama_cpp_dir="__llama_cpp_port_${port}__"
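+# the checkout lives in a per-port directory, so several instances of this script
+# (one per port) can run side by side without clobbering each other's build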
+
+if [[ -d "$llama_cpp_dir" && ! -f "$llama_cpp_dir/__ggml_script__" ]]; then
+    # if the dir exists and there isn't a file "__ggml_script__" in it, abort
+    printf "[-] Directory %s already exists\n" "$llama_cpp_dir"
+    printf "[-] Please remove it and try again\n"
+    exit 1
+elif [[ -d "$llama_cpp_dir" ]]; then
+    printf "[+] Directory %s already exists\n" "$llama_cpp_dir"
+    printf "[+] Using cached llama.cpp\n"
+
+    cd "$llama_cpp_dir"
+    git reset --hard
+    git fetch
+    git checkout origin/master
+
+    cd ..
+else
+    printf "[+] Cloning llama.cpp\n"
+
+    git clone https://github.com/ggerganov/llama.cpp "$llama_cpp_dir"
+fi
+
+# mark that the directory was created by this script
+touch "$llama_cpp_dir/__ggml_script__"
+
+if [[ $verbose -eq 1 ]]; then
+    set -x
+fi
+
+# build
+cd "$llama_cpp_dir"
+
+make clean
+
+log="--silent"
+if [[ $verbose -eq 1 ]]; then
+    log=""
+fi
+
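+# LLAMA_CUBLAS and LLAMA_CLBLAST are flags understood by llama.cpp's Makefile; the
+# Metal branch needs no extra flag since the default macOS build already enables Metal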
+if [[ "$backend" == "cuda" ]]; then
+    printf "[+] Building with CUDA backend\n"
+    LLAMA_CUBLAS=1 make -j server $log
+elif [[ "$backend" == "cpu" ]]; then
+    printf "[+] Building with CPU backend\n"
+    make -j server $log
+elif [[ "$backend" == "metal" ]]; then
+    printf "[+] Building with Metal backend\n"
+    make -j server $log
+elif [[ "$backend" == "opencl" ]]; then
+    printf "[+] Building with OpenCL backend\n"
+    LLAMA_CLBLAST=1 make -j server $log
+else
+    printf "[-] Unknown backend: %s\n" "$backend"
+    exit 1
+fi
+
+# run the server
+
+printf "[+] Running server\n"
+
+args=""
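+# -ngl sets the number of model layers offloaded to the GPU: 999 offloads everything,
+# 0 keeps the whole model on the CPU; CUDA_VISIBLE_DEVICES restricts CUDA to the chosen GPU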
+if [[ "$backend" == "cuda" ]]; then
+    export CUDA_VISIBLE_DEVICES=$gpu_id
+    args="-ngl 999"
+elif [[ "$backend" == "cpu" ]]; then
+    args="-ngl 0"
+elif [[ "$backend" == "metal" ]]; then
+    args="-ngl 999"
+elif [[ "$backend" == "opencl" ]]; then
+    args="-ngl 999"
+else
+    printf "[-] Unknown backend: %s\n" "$backend"
+    exit 1
+fi
+
+if [[ $verbose -eq 1 ]]; then
+    args="$args --verbose"
+fi
+
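+# -m points at the downloaded weights (one directory up, since we cd'd into the
+# llama.cpp checkout), --host 0.0.0.0 listens on all interfaces, -c is the total
+# context (KV cache) size in tokens shared by the parallel slots, and -np is the
+# number of parallel request slots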
+./server -m "../$wfile" --host 0.0.0.0 --port "$port" -c $n_kv -np "$n_parallel" $args
+
+exit 0