#!/bin/bash
#
# Helper script for deploying llama.cpp server with a single Bash command
#
# - Works on Linux and macOS
# - Supports: CPU, CUDA, Metal, OpenCL
# - Can run all GGUF models from HuggingFace
# - Can serve requests in parallel
# - Always builds latest llama.cpp from GitHub
#
# Limitations
#
# - Chat templates are poorly supported (base models recommended)
# - Might be unstable!
#
# Usage:
#   ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose] [--non-interactive]
#
#   --port:            port number, default is 8888
#   --repo:            path to a repo containing GGUF model files
#   --wtype:           weights type (f16, q8_0, q4_0, q4_1), default is user-input
#   --backend:         cpu, cuda, metal, opencl, depends on the OS
#   --gpu-id:          gpu id, default is 0
#   --n-parallel:      number of parallel requests, default is 8
#   --n-kv:            KV cache size, default is 4096
#   --verbose:         verbose output
#   --non-interactive: run without asking for confirmation
#
# Example:
#
#   bash -c "$(curl -s https://ggml.ai/server-llm.sh)"
#
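# A more explicit invocation, using the flags documented above and one of the
# sample repos listed later in this script:
#
#   ./server-llm.sh --port 8889 --repo https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF --wtype q4_0 --backend cuda --n-parallel 4
#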
set -e

# required utils: curl, git, make
if ! command -v curl &> /dev/null; then
    printf "[-] curl not found\n"
    exit 1
fi

if ! command -v git &> /dev/null; then
    printf "[-] git not found\n"
    exit 1
fi

if ! command -v make &> /dev/null; then
    printf "[-] make not found\n"
    exit 1
fi

# parse arguments
is_interactive=1
port=8888
repo=""
wtype=""
backend="cpu"

# prefer Metal on macOS, CUDA when nvcc is available
if [[ "$OSTYPE" == "darwin"* ]]; then
    backend="metal"
elif command -v nvcc &> /dev/null; then
    backend="cuda"
fi

gpu_id=0
n_parallel=8
n_kv=4096
verbose=0
function print_usage {
    printf "Usage:\n"
    printf "  ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose] [--non-interactive]\n\n"
    printf "  --port:            port number, default is 8888\n"
    printf "  --repo:            path to a repo containing GGUF model files\n"
    printf "  --wtype:           weights type (f16, q8_0, q4_0, q4_1), default is user-input\n"
    printf "  --backend:         cpu, cuda, metal, opencl, depends on the OS\n"
    printf "  --gpu-id:          gpu id, default is 0\n"
    printf "  --n-parallel:      number of parallel requests, default is 8\n"
    printf "  --n-kv:            KV cache size, default is 4096\n"
    printf "  --verbose:         verbose output\n"
    printf "  --non-interactive: run without asking for confirmation\n\n"
    printf "Example:\n\n"
    printf '  bash -c "$(curl -s https://ggml.ai/server-llm.sh)"\n\n'
}
while [[ $# -gt 0 ]]; do
    key="$1"
    case $key in
        --non-interactive)
            is_interactive=0
            shift
            ;;
        --port)
            port="$2"
            shift
            shift
            ;;
        --repo)
            repo="$2"
            shift
            shift
            ;;
        --wtype)
            wtype="$2"
            shift
            shift
            ;;
        --backend)
            backend="$2"
            shift
            shift
            ;;
        --gpu-id)
            gpu_id="$2"
            shift
            shift
            ;;
        --n-parallel)
            n_parallel="$2"
            shift
            shift
            ;;
        --n-kv)
            n_kv="$2"
            shift
            shift
            ;;
        --verbose)
            verbose=1
            shift
            ;;
        --help)
            print_usage
            exit 0
            ;;
        *)
            echo "Unknown argument: $key"
            print_usage
            exit 1
            ;;
    esac
done
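# (optional, not in the original script) a minimal sanity check for the parsed
# arguments could go here, e.g. rejecting non-numeric ports:
#
#   if ! [[ "$port" =~ ^[0-9]+$ ]]; then
#       printf "[-] Invalid port: %s\n" "$port"
#       exit 1
#   fi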
# available weights types
wtypes=("F16" "Q8_0" "Q4_0" "Q4_1" "Q5_0" "Q5_1" "Q6_K" "Q5_K_M" "Q5_K_S" "Q4_K_M" "Q4_K_S" "Q3_K_L" "Q3_K_M" "Q3_K_S" "Q2_K")

wfiles=()
for wt in "${wtypes[@]}"; do
    wfiles+=("")
done

# map the wtype input to an index into the wtypes array
if [[ ! -z "$wtype" ]]; then
    # uppercase the user input so that lowercase types (e.g. "q4_0") also match
    wtype=$(echo "$wtype" | tr '[:lower:]' '[:upper:]')

    iw=-1
    is=0
    for wt in "${wtypes[@]}"; do
        if [[ "$wt" == "$wtype" ]]; then
            iw=$is
            break
        fi

        is=$((is+1))
    done

    if [[ $iw -eq -1 ]]; then
        printf "[-] Invalid weight type: %s\n" "$wtype"
        exit 1
    fi

    wtype="$iw"
fi
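# from here on, $wtype is either empty (the user will be prompted later) or a
# numeric index into the wtypes/wfiles arrays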
# sample repos
repos=(
    "https://huggingface.co/TheBloke/Llama-2-7B-GGUF"
    "https://huggingface.co/TheBloke/Llama-2-13B-GGUF"
    "https://huggingface.co/TheBloke/Llama-2-70B-GGUF"
    "https://huggingface.co/TheBloke/CodeLlama-7B-GGUF"
    "https://huggingface.co/TheBloke/CodeLlama-13B-GGUF"
    "https://huggingface.co/TheBloke/CodeLlama-34B-GGUF"
    "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF"
    "https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF"
    "https://huggingface.co/TheBloke/OpenHermes-2-Mistral-7B-GGUF"
    "https://huggingface.co/TheBloke/CausalLM-7B-GGUF"
)

if [ $is_interactive -eq 1 ]; then
    printf "\n"
    printf "[I] This is a helper script for deploying llama.cpp's server on this machine.\n\n"
    printf "    Based on the options that follow, the script might download a model file\n"
    printf "    from the internet, which can be a few GBs in size. The script will also\n"
    printf "    build the latest llama.cpp source code from GitHub, which can be unstable.\n"
    printf "\n"
    printf "    Upon success, an HTTP server will be started and it will serve the selected\n"
    printf "    model using llama.cpp for demonstration purposes.\n"
    printf "\n"
    printf "    Please note:\n"
    printf "\n"
    printf "    - All new data will be stored in the current folder\n"
    printf "    - The server will be listening on all network interfaces\n"
    printf "    - The server will run with default settings which are not always optimal\n"
    printf "    - Do not judge the quality of a model based on the results from this script\n"
    printf "    - Do not use this script to benchmark llama.cpp\n"
    printf "    - Do not use this script in production\n"
    printf "    - This script is only for demonstration purposes\n"
    printf "\n"
    printf "    If you don't know what you are doing, please press Ctrl-C to abort now\n"
    printf "\n"
    printf "    Press Enter to continue ...\n\n"

    read
fi
if [[ -z "$repo" ]]; then
    printf "[+] No repo provided from the command line\n"
    printf "    Please select a number from the list below or enter a URL:\n\n"

    is=0
    for r in "${repos[@]}"; do
        printf "    %2d) %s\n" $is "$r"
        is=$((is+1))
    done

    # keep asking until the index of a sample repo or a URL is provided
    while [[ -z "$repo" ]]; do
        printf "\n    Or choose one from: https://huggingface.co/models?sort=trending&search=gguf\n\n"
        read -p "[+] Select repo: " repo

        # check if the input is a number
        if [[ "$repo" =~ ^[0-9]+$ ]]; then
            if [[ "$repo" -ge 0 && "$repo" -lt ${#repos[@]} ]]; then
                repo="${repos[$repo]}"
            else
                printf "[-] Invalid repo index: %s\n" "$repo"
                repo=""
            fi
        elif [[ ! "$repo" =~ ^https?:// ]]; then
            printf "[-] Invalid repo index or URL: %s\n" "$repo"
            repo=""
        fi
    done
fi
# strip a trailing "/tree/main" from the repo URL, if present
repo=$(echo "$repo" | sed -E 's/\/tree\/main$//g')

printf "[+] Checking for GGUF model files in %s\n" "$repo"

# find GGUF files by scraping the repo's HTML file listing
# TODO: better logic
model_tree="${repo%/}/tree/main"
model_files=$(curl -s "$model_tree" | grep -i "\\.gguf</span>" | sed -E 's/.*<span class="truncate group-hover:underline">(.*)<\/span><\/a>/\1/g')
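# note: scraping the HTML listing above is brittle -- it breaks whenever
# HuggingFace changes its page markup. An untested alternative would be the
# Hub API, which returns JSON instead:
#
#   model_files=$(curl -s "https://huggingface.co/api/models/${repo#https://huggingface.co/}/tree/main" |
#       grep -oE '"path":"[^"]*\.gguf"' | sed -E 's/"path":"(.*)"/\1/')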
# list the GGUF files found in the repo, marking the ones already downloaded with "*"
printf "[+] Model files:\n\n"

for file in $model_files; do
    # determine iw by matching the uppercased filename against wtypes
    ufile=$(echo "$file" | tr '[:lower:]' '[:upper:]')

    iw=-1
    is=0
    for wt in "${wtypes[@]}"; do
        if [[ "$ufile" =~ "$wt" ]]; then
            iw=$is
            break
        fi

        is=$((is+1))
    done

    if [[ $iw -eq -1 ]]; then
        continue
    fi

    wfiles[$iw]="$file"

    have=" "
    if [[ -f "$file" ]]; then
        have="*"
    fi

    printf "    %2d) %s %s\n" $iw "$have" "$file"
done

# pre-select the file only when --wtype was given; with an empty $wtype the
# expression "${wfiles[$wtype]}" would silently resolve to index 0
wfile=""
if [[ ! -z "$wtype" ]]; then
    wfile="${wfiles[$wtype]}"
fi

# otherwise, ask for the weight type until a valid one is provided
while [[ -z "$wfile" ]]; do
    printf "\n"
    read -p "[+] Select weight type: " wtype
    wfile="${wfiles[$wtype]}"

    if [[ -z "$wfile" ]]; then
        printf "[-] Invalid weight type: %s\n" "$wtype"
        wtype=""
    fi
done

printf "[+] Selected weight type: %s (%s)\n" "$wtype" "$wfile"
url="${repo%/}/resolve/main/$wfile"

# check file used to detect whether the model has been downloaded before
chk="$wfile.chk"

# download the file:
# - if $wfile does not exist
# - if $wfile exists but $chk does not exist
# - if $wfile exists and $chk exists but $wfile is newer than $chk
# TODO: better logic using git lfs info
do_download=0

if [[ ! -f "$wfile" ]]; then
    do_download=1
elif [[ ! -f "$chk" ]]; then
    do_download=1
elif [[ "$wfile" -nt "$chk" ]]; then
    do_download=1
fi

if [[ $do_download -eq 1 ]]; then
    printf "[+] Downloading weights from %s\n" "$url"

    # download the weights file; thanks to "set -e" a failed download aborts
    # the script, so reaching the next line implies curl succeeded
    curl -o "$wfile" -# -L "$url"

    printf "[+] Creating check file %s\n" "$chk"
    touch "$chk"
else
    printf "[+] Using cached weights %s\n" "$wfile"
fi
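# a possible improvement (untested, not part of the original script): GGUF
# files on HuggingFace are stored via git-lfs, and "$repo/raw/main/$wfile"
# is a small LFS pointer file that lists the expected sha256, so the download
# could be verified with something like:
#
#   lfs_sha=$(curl -s "${repo%/}/raw/main/$wfile" | grep '^oid sha256:' | cut -d: -f2)
#   echo "$lfs_sha  $wfile" | sha256sum -c -   # on macOS: shasum -a 256 -c -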
# get the latest llama.cpp and build it
printf "[+] Downloading latest llama.cpp\n"

llama_cpp_dir="__llama_cpp_port_${port}__"

if [[ -d "$llama_cpp_dir" && ! -f "$llama_cpp_dir/__ggml_script__" ]]; then
    # the dir exists but was not created by this script (no "__ggml_script__" marker) - abort
    printf "[-] Directory %s already exists\n" "$llama_cpp_dir"
    printf "[-] Please remove it and try again\n"
    exit 1
elif [[ -d "$llama_cpp_dir" ]]; then
    printf "[+] Directory %s already exists\n" "$llama_cpp_dir"
    printf "[+] Using cached llama.cpp\n"

    cd "$llama_cpp_dir"
    git reset --hard
    git fetch
    git checkout origin/master

    cd ..
else
    printf "[+] Cloning llama.cpp\n"

    git clone https://github.com/ggerganov/llama.cpp "$llama_cpp_dir"
fi

# mark that the directory was created by this script
touch "$llama_cpp_dir/__ggml_script__"

if [[ $verbose -eq 1 ]]; then
    set -x
fi
# build
cd "$llama_cpp_dir"

make clean

log="--silent"
if [[ $verbose -eq 1 ]]; then
    log=""
fi

if [[ "$backend" == "cuda" ]]; then
    printf "[+] Building with CUDA backend\n"
    LLAMA_CUBLAS=1 make -j server $log
elif [[ "$backend" == "cpu" ]]; then
    printf "[+] Building with CPU backend\n"
    make -j server $log
elif [[ "$backend" == "metal" ]]; then
    # on macOS the Makefile enables Metal by default, so no extra flag is needed
    printf "[+] Building with Metal backend\n"
    make -j server $log
elif [[ "$backend" == "opencl" ]]; then
    printf "[+] Building with OpenCL backend\n"
    LLAMA_CLBLAST=1 make -j server $log
else
    printf "[-] Unknown backend: %s\n" "$backend"
    exit 1
fi
# run the server
printf "[+] Running server\n"

# -ngl sets how many model layers to offload to the GPU:
# 999 offloads everything, 0 keeps the whole model on the CPU
args=""
if [[ "$backend" == "cuda" ]]; then
    # restrict llama.cpp to the selected GPU
    export CUDA_VISIBLE_DEVICES=$gpu_id
    args="-ngl 999"
elif [[ "$backend" == "cpu" ]]; then
    args="-ngl 0"
elif [[ "$backend" == "metal" ]]; then
    args="-ngl 999"
elif [[ "$backend" == "opencl" ]]; then
    args="-ngl 999"
else
    printf "[-] Unknown backend: %s\n" "$backend"
    exit 1
fi

if [[ $verbose -eq 1 ]]; then
    args="$args --verbose"
fi
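# once the server is up, it can be queried over HTTP from another terminal,
# e.g. via llama.cpp's /completion endpoint (adjust the port if changed):
#
#   curl http://127.0.0.1:8888/completion -d '{"prompt": "Building a website can be done in 10 simple steps:", "n_predict": 64}'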
./server -m "../$wfile" --host 0.0.0.0 --port "$port" -c "$n_kv" -np "$n_parallel" $args

exit 0