#!/bin/bash
#
# Helper script for deploying llama.cpp server with a single Bash command
#
# - Works on Linux and macOS
# - Supports: CPU, CUDA, Metal, OpenCL
# - Can run all GGUF models from HuggingFace
# - Can serve requests in parallel
# - Always builds latest llama.cpp from GitHub
#
# Limitations
#
# - Chat templates are poorly supported (base models recommended)
# - Might be unstable!
#
# Usage:
#   ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose]
#
#   --port:       port number, default is 8888
#   --repo:       path to a repo containing GGUF model files
#   --wtype:      weights type (f16, q8_0, q4_0, q4_1), default is user-input
#   --backend:    cpu, cuda, metal, opencl, depends on the OS
#   --gpu-id:     gpu id, default is 0
#   --n-parallel: number of parallel requests, default is 8
#   --n-kv:       KV cache size, default is 4096
#   --verbose:    verbose output
#
# Example:
#
#   bash -c "$(curl -s https://ggml.ai/server-llm.sh)"
#
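# For reference, a local run with explicit flags might look like the following
# (the flag names come from the option list above; the values are purely
# illustrative, not recommendations):
#
#   ./server-llm.sh --port 8889 --repo https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF \
#       --wtype q4_0 --backend cuda --gpu-id 0 --n-parallel 4 --n-kv 2048
#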
set -e

# required utils: curl, git, make
if ! command -v curl &> /dev/null; then
    printf "[-] curl not found\n"
    exit 1
fi

if ! command -v git &> /dev/null; then
    printf "[-] git not found\n"
    exit 1
fi

if ! command -v make &> /dev/null; then
    printf "[-] make not found\n"
    exit 1
fi
# parse arguments
port=8888
repo=""
wtype=""
backend="cpu"

# default to the Metal backend on macOS, or to CUDA if nvcc is available
if [[ "$OSTYPE" == "darwin"* ]]; then
    backend="metal"
elif command -v nvcc &> /dev/null; then
    backend="cuda"
fi

gpu_id=0
n_parallel=8
n_kv=4096
verbose=0
function print_usage {
    printf "Usage:\n"
    printf "  ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose]\n\n"
    printf "  --port:       port number, default is 8888\n"
    printf "  --repo:       path to a repo containing GGUF model files\n"
    printf "  --wtype:      weights type (f16, q8_0, q4_0, q4_1), default is user-input\n"
    printf "  --backend:    cpu, cuda, metal, opencl, depends on the OS\n"
    printf "  --gpu-id:     gpu id, default is 0\n"
    printf "  --n-parallel: number of parallel requests, default is 8\n"
    printf "  --n-kv:       KV cache size, default is 4096\n"
    printf "  --verbose:    verbose output\n\n"
    printf "Example:\n\n"
    printf '  bash -c "$(curl -s https://ggml.ai/server-llm.sh)"\n\n'
}
while [[ $# -gt 0 ]]; do
    key="$1"
    case $key in
        --port)
            port="$2"
            shift
            shift
            ;;
        --repo)
            repo="$2"
            shift
            shift
            ;;
        --wtype)
            wtype="$2"
            shift
            shift
            ;;
        --backend)
            backend="$2"
            shift
            shift
            ;;
        --gpu-id)
            gpu_id="$2"
            shift
            shift
            ;;
        --n-parallel)
            n_parallel="$2"
            shift
            shift
            ;;
        --n-kv)
            n_kv="$2"
            shift
            shift
            ;;
        --verbose)
            verbose=1
            shift
            ;;
        --help)
            print_usage
            exit 0
            ;;
        *)
            echo "Unknown argument: $key"
            print_usage
            exit 1
            ;;
    esac
done
# available weights types
wtypes=("F16" "Q8_0" "Q4_0" "Q4_1" "Q5_0" "Q5_1" "Q6_K" "Q5_K_M" "Q5_K_S" "Q4_K_M" "Q4_K_S" "Q3_K_L" "Q3_K_M" "Q3_K_S" "Q2_K")

wfiles=()
for wt in "${wtypes[@]}"; do
    wfiles+=("")
done

# sample repos
repos=(
    "https://huggingface.co/TheBloke/Llama-2-7B-GGUF"
    "https://huggingface.co/TheBloke/Llama-2-13B-GGUF"
    "https://huggingface.co/TheBloke/Llama-2-70B-GGUF"
    "https://huggingface.co/TheBloke/CodeLlama-7B-GGUF"
    "https://huggingface.co/TheBloke/CodeLlama-13B-GGUF"
    "https://huggingface.co/TheBloke/CodeLlama-34B-GGUF"
    "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF"
    "https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF"
    "https://huggingface.co/TheBloke/OpenHermes-2-Mistral-7B-GGUF"
    "https://huggingface.co/TheBloke/CausalLM-7B-GGUF"
)
printf "\n"
printf "[I] This is a helper script for deploying llama.cpp's server on this machine.\n\n"
printf "    Based on the options that follow, the script might download a model file\n"
printf "    from the internet, which can be a few GBs in size. The script will also\n"
printf "    build the latest llama.cpp source code from GitHub, which can be unstable.\n"
printf "\n"
printf "    Upon success, an HTTP server will be started and it will serve the selected\n"
printf "    model using llama.cpp for demonstration purposes.\n"
printf "\n"
printf "    Please note:\n"
printf "\n"
printf "    - All new data will be stored in the current folder\n"
printf "    - The server will be listening on all network interfaces\n"
printf "    - The server will run with default settings which are not always optimal\n"
printf "    - Do not judge the quality of a model based on the results from this script\n"
printf "    - Do not use this script to benchmark llama.cpp\n"
printf "    - Do not use this script in production\n"
printf "    - This script is only for demonstration purposes\n"
printf "\n"
printf "    If you don't know what you are doing, please press Ctrl-C to abort now\n"
printf "\n"
printf "    Press Enter to continue ...\n\n"

read
if [[ -z "$repo" ]]; then
    printf "[+] No repo provided from the command line\n"
    printf "    Please select a number from the list below or enter a URL:\n\n"

    is=0
    for r in "${repos[@]}"; do
        printf "    %2d) %s\n" $is "$r"
        is=$((is+1))
    done

    # ask for a repo until a sample-repo index or a URL is provided
    while [[ -z "$repo" ]]; do
        printf "\n    Or choose one from: https://huggingface.co/models?sort=trending&search=gguf\n\n"
        read -p "[+] Select repo: " repo

        # check if the input is a number
        if [[ "$repo" =~ ^[0-9]+$ ]]; then
            if [[ "$repo" -ge 0 && "$repo" -lt ${#repos[@]} ]]; then
                repo="${repos[$repo]}"
            else
                printf "[-] Invalid repo index: %s\n" "$repo"
                repo=""
            fi
        elif [[ "$repo" =~ ^https?:// ]]; then
            repo="$repo"
        else
            printf "[-] Invalid repo URL: %s\n" "$repo"
            repo=""
        fi
    done
fi
# remove the /tree/main suffix if present
repo=$(echo "$repo" | sed -E 's/\/tree\/main$//g')

printf "[+] Checking for GGUF model files in %s\n" "$repo"

# find GGUF files in the source
# TODO: better logic
model_tree="${repo%/}/tree/main"
model_files=$(curl -s "$model_tree" | grep -i "\\.gguf</span>" | sed -E 's/.*<span class="truncate group-hover:underline">(.*)<\/span><\/a>/\1/g')

# list the GGUF model files found in the repo
printf "[+] Model files:\n\n"

for file in $model_files; do
    # determine iw (index into wtypes) by matching the filename against the known weight types
    iw=-1
    is=0
    for wt in "${wtypes[@]}"; do
        # uppercase
        ufile=$(echo "$file" | tr '[:lower:]' '[:upper:]')

        if [[ "$ufile" =~ "$wt" ]]; then
            iw=$is
            break
        fi

        is=$((is+1))
    done

    if [[ $iw -eq -1 ]]; then
        continue
    fi

    wfiles[$iw]="$file"

    have=" "
    if [[ -f "$file" ]]; then
        have="*"
    fi

    printf "    %2d) %s %s\n" $iw "$have" "$file"
done
# ask for a weight type until a valid one, available in the repo, is selected
while [[ -z "$wtype" ]]; do
    printf "\n"
    read -p "[+] Select weight type: " wtype
    wfile="${wfiles[$wtype]}"

    if [[ -z "$wfile" ]]; then
        printf "[-] Invalid weight type: %s\n" "$wtype"
        wtype=""
    fi
done

printf "[+] Selected weight type: %s (%s)\n" "$wtype" "$wfile"
url="${repo%/}/resolve/main/$wfile"

# the check file marks whether the model has already been downloaded
chk="$wfile.chk"

# check if we should download the file:
# - if $wfile does not exist
# - if $wfile exists but $chk does not exist
# - if $wfile exists and $chk exists but $wfile is newer than $chk
# TODO: better logic using git lfs info
do_download=0
if [[ ! -f "$wfile" ]]; then
    do_download=1
elif [[ ! -f "$chk" ]]; then
    do_download=1
elif [[ "$wfile" -nt "$chk" ]]; then
    do_download=1
fi

if [[ $do_download -eq 1 ]]; then
    printf "[+] Downloading weights from %s\n" "$url"

    # download the weights file
    curl -o "$wfile" -# -L "$url"

    # create a check file if successful
    if [[ $? -eq 0 ]]; then
        printf "[+] Creating check file %s\n" "$chk"
        touch "$chk"
    fi
else
    printf "[+] Using cached weights %s\n" "$wfile"
fi
# get the latest llama.cpp and build it
printf "[+] Downloading latest llama.cpp\n"

llama_cpp_dir="__llama_cpp_port_${port}__"

if [[ -d "$llama_cpp_dir" && ! -f "$llama_cpp_dir/__ggml_script__" ]]; then
    # if the dir exists and there isn't a file "__ggml_script__" in it, abort
    printf "[-] Directory %s already exists\n" "$llama_cpp_dir"
    printf "[-] Please remove it and try again\n"
    exit 1
elif [[ -d "$llama_cpp_dir" ]]; then
    printf "[+] Directory %s already exists\n" "$llama_cpp_dir"
    printf "[+] Using cached llama.cpp\n"

    cd "$llama_cpp_dir"
    git reset --hard
    git fetch
    git checkout origin/master

    cd ..
else
    printf "[+] Cloning llama.cpp\n"

    git clone https://github.com/ggerganov/llama.cpp "$llama_cpp_dir"
fi

# mark that the directory was created by this script
touch "$llama_cpp_dir/__ggml_script__"

if [[ $verbose -eq 1 ]]; then
    set -x
fi
# build
cd "$llama_cpp_dir"

make clean

log="--silent"
if [[ $verbose -eq 1 ]]; then
    log=""
fi

if [[ "$backend" == "cuda" ]]; then
    printf "[+] Building with CUDA backend\n"
    LLAMA_CUBLAS=1 make -j server $log
elif [[ "$backend" == "cpu" ]]; then
    printf "[+] Building with CPU backend\n"
    make -j server $log
elif [[ "$backend" == "metal" ]]; then
    printf "[+] Building with Metal backend\n"
    make -j server $log
elif [[ "$backend" == "opencl" ]]; then
    printf "[+] Building with OpenCL backend\n"
    LLAMA_CLBLAST=1 make -j server $log
else
    printf "[-] Unknown backend: %s\n" "$backend"
    exit 1
fi
# run the server
printf "[+] Running server\n"

args=""
if [[ "$backend" == "cuda" ]]; then
    export CUDA_VISIBLE_DEVICES=$gpu_id
    args="-ngl 999"
elif [[ "$backend" == "cpu" ]]; then
    args="-ngl 0"
elif [[ "$backend" == "metal" ]]; then
    args="-ngl 999"
elif [[ "$backend" == "opencl" ]]; then
    args="-ngl 999"
else
    printf "[-] Unknown backend: %s\n" "$backend"
    exit 1
fi

if [[ $verbose -eq 1 ]]; then
    args="$args --verbose"
fi
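
# Once the server reports that it is listening, it can be queried over HTTP from
# another shell. The request below follows the llama.cpp server example's
# /completion endpoint; the prompt, n_predict value and port (default 8888, or
# whatever was passed via --port) are illustrative:
#
#   curl -s http://127.0.0.1:8888/completion \
#       -H "Content-Type: application/json" \
#       -d '{"prompt": "Building a website can be done in 10 simple steps:", "n_predict": 64}'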
./server -m "../$wfile" --host 0.0.0.0 --port "$port" -c $n_kv -np "$n_parallel" $args

exit 0