#!/bin/bash
#
# Helper script for deploying llama.cpp server with a single Bash command
#
# - Works on Linux and macOS
# - Supports: CPU, CUDA, Metal
# - Can run all GGUF models from HuggingFace
# - Can serve requests in parallel
# - Always builds latest llama.cpp from GitHub
#
# Limitations
#
# - Chat templates are poorly supported (base models recommended)
# - Might be unstable!
#
# Usage:
#   ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose] [--non-interactive]
#
#   --port:            port number, default is 8888
#   --repo:            path to a repo containing GGUF model files
#   --wtype:           weights type (f16, q8_0, q4_0, q4_1), default is user-input
#   --backend:         cpu, cuda, metal, depends on the OS
#   --gpu-id:          gpu id, default is 0
#   --n-parallel:      number of parallel requests, default is 8
#   --n-kv:            KV cache size, default is 4096
#   --verbose:         verbose output
#   --non-interactive: run without asking for permission to run
#
# Example:
#
#   bash -c "$(curl -s https://ggml.ai/server-llm.sh)"
#
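# Or, if the script is already downloaded, it can be run locally with explicit
# options (the values below are only an illustration):
#
#   ./server-llm.sh --port 8080 --repo https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF --wtype q4_0 --n-parallel 4
#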
set -e

# required utils: curl, git, make
if ! command -v curl &> /dev/null; then
    printf "[-] curl not found\n"
    exit 1
fi

if ! command -v git &> /dev/null; then
    printf "[-] git not found\n"
    exit 1
fi

if ! command -v make &> /dev/null; then
    printf "[-] make not found\n"
    exit 1
fi
# parse arguments
is_interactive=1
port=8888
repo=""
wtype=""
backend="cpu"

# if macOS, use metal backend by default
if [[ "$OSTYPE" == "darwin"* ]]; then
    backend="metal"
elif command -v nvcc &> /dev/null; then
    backend="cuda"
fi

gpu_id=0
n_parallel=8
n_kv=4096
verbose=0
function print_usage {
    printf "Usage:\n"
    printf "  ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose] [--non-interactive]\n\n"
    printf "  --port:            port number, default is 8888\n"
    printf "  --repo:            path to a repo containing GGUF model files\n"
    printf "  --wtype:           weights type (f16, q8_0, q4_0, q4_1), default is user-input\n"
    printf "  --backend:         cpu, cuda, metal, depends on the OS\n"
    printf "  --gpu-id:          gpu id, default is 0\n"
    printf "  --n-parallel:      number of parallel requests, default is 8\n"
    printf "  --n-kv:            KV cache size, default is 4096\n"
    printf "  --verbose:         verbose output\n\n"
    printf "  --non-interactive: run without asking for permission to run\n"
    printf "Example:\n\n"
    printf '  bash -c "$(curl -s https://ggml.ai/server-llm.sh)"\n\n'
}
while [[ $# -gt 0 ]]; do
    key="$1"
    case $key in
        --non-interactive)
            is_interactive=0
            shift
            ;;
        --port)
            port="$2"
            shift
            shift
            ;;
        --repo)
            repo="$2"
            shift
            shift
            ;;
        --wtype)
            wtype="$2"
            shift
            shift
            ;;
        --backend)
            backend="$2"
            shift
            shift
            ;;
        --gpu-id)
            gpu_id="$2"
            shift
            shift
            ;;
        --n-parallel)
            n_parallel="$2"
            shift
            shift
            ;;
        --n-kv)
            n_kv="$2"
            shift
            shift
            ;;
        --verbose)
            verbose=1
            shift
            ;;
        --help)
            print_usage
            exit 0
            ;;
        *)
            echo "Unknown argument: $key"
            print_usage
            exit 1
            ;;
    esac
done
# available weights types
wtypes=("F16" "Q8_0" "Q4_0" "Q4_1" "Q5_0" "Q5_1" "Q6_K" "Q5_K_M" "Q5_K_S" "Q4_K_M" "Q4_K_S" "Q3_K_L" "Q3_K_M" "Q3_K_S" "Q2_K")

wfiles=()
for wt in "${wtypes[@]}"; do
    wfiles+=("")
done

# map wtype input to index
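# e.g. "--wtype q4_0" is uppercased and matched against the wtypes array above,
# so wtype ends up holding the numeric index 2 ("Q4_0")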
if [[ ! -z "$wtype" ]]; then
    # uppercase the user input so lowercase types like "q4_0" also match
    wtype=$(echo "$wtype" | tr '[:lower:]' '[:upper:]')

    iw=-1
    is=0
    for wt in "${wtypes[@]}"; do
        if [[ "$wt" == "$wtype" ]]; then
            iw=$is
            break
        fi
        is=$((is+1))
    done

    if [[ $iw -eq -1 ]]; then
        printf "[-] Invalid weight type: %s\n" "$wtype"
        exit 1
    fi

    wtype="$iw"
fi
# sample repos
repos=(
    "https://huggingface.co/TheBloke/Llama-2-7B-GGUF"
    "https://huggingface.co/TheBloke/Llama-2-13B-GGUF"
    "https://huggingface.co/TheBloke/Llama-2-70B-GGUF"
    "https://huggingface.co/TheBloke/CodeLlama-7B-GGUF"
    "https://huggingface.co/TheBloke/CodeLlama-13B-GGUF"
    "https://huggingface.co/TheBloke/CodeLlama-34B-GGUF"
    "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF"
    "https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF"
    "https://huggingface.co/TheBloke/OpenHermes-2-Mistral-7B-GGUF"
    "https://huggingface.co/TheBloke/CausalLM-7B-GGUF"
)
if [ $is_interactive -eq 1 ]; then
    printf "\n"
    printf "[I] This is a helper script for deploying llama.cpp's server on this machine.\n\n"
    printf "    Based on the options that follow, the script might download a model file\n"
    printf "    from the internet, which can be a few GBs in size. The script will also\n"
    printf "    build the latest llama.cpp source code from GitHub, which can be unstable.\n"
    printf "\n"
    printf "    Upon success, an HTTP server will be started and it will serve the selected\n"
    printf "    model using llama.cpp for demonstration purposes.\n"
    printf "\n"
    printf "    Please note:\n"
    printf "\n"
    printf "    - All new data will be stored in the current folder\n"
    printf "    - The server will be listening on all network interfaces\n"
    printf "    - The server will run with default settings which are not always optimal\n"
    printf "    - Do not judge the quality of a model based on the results from this script\n"
    printf "    - Do not use this script to benchmark llama.cpp\n"
    printf "    - Do not use this script in production\n"
    printf "    - This script is only for demonstration purposes\n"
    printf "\n"
    printf "    If you don't know what you are doing, please press Ctrl-C to abort now\n"
    printf "\n"
    printf "    Press Enter to continue ...\n\n"
    read
fi
if [[ -z "$repo" ]]; then
    printf "[+] No repo provided from the command line\n"
    printf "    Please select a number from the list below or enter a URL:\n\n"

    is=0
    for r in "${repos[@]}"; do
        printf "    %2d) %s\n" $is "$r"
        is=$((is+1))
    done

    # ask for a repo until an index from the list above or a URL is provided
    while [[ -z "$repo" ]]; do
        printf "\n    Or choose one from: https://huggingface.co/models?sort=trending&search=gguf\n\n"
        read -p "[+] Select repo: " repo

        # check if the input is a number
        if [[ "$repo" =~ ^[0-9]+$ ]]; then
            if [[ "$repo" -ge 0 && "$repo" -lt ${#repos[@]} ]]; then
                repo="${repos[$repo]}"
            else
                printf "[-] Invalid repo index: %s\n" "$repo"
                repo=""
            fi
        elif [[ "$repo" =~ ^https?:// ]]; then
            repo="$repo"
        else
            printf "[-] Invalid repo URL: %s\n" "$repo"
            repo=""
        fi
    done
fi
# normalize the repo URL: strip a trailing /tree/main if present
repo=$(echo "$repo" | sed -E 's/\/tree\/main$//g')

printf "[+] Checking for GGUF model files in %s\n" "$repo"

# find GGUF files in the source
# TODO: better logic
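# note: this scrapes the repo's HuggingFace "tree/main" HTML page and extracts the
# filenames from the <span class="truncate group-hover:underline"> elements, so it
# will break if HuggingFace changes that markup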
model_tree="${repo%/}/tree/main"
model_files=$(curl -s "$model_tree" | grep -i "\\.gguf</span>" | sed -E 's/.*<span class="truncate group-hover:underline">(.*)<\/span><\/a>/\1/g')

# list all files in the provided git repo
printf "[+] Model files:\n\n"

for file in $model_files; do
    # determine iw by grepping the filename with wtypes
    iw=-1
    is=0
    for wt in "${wtypes[@]}"; do
        # uppercase
        ufile=$(echo "$file" | tr '[:lower:]' '[:upper:]')
        if [[ "$ufile" =~ "$wt" ]]; then
            iw=$is
            break
        fi
        is=$((is+1))
    done

    if [[ $iw -eq -1 ]]; then
        continue
    fi

    wfiles[$iw]="$file"

    have=" "
    if [[ -f "$file" ]]; then
        have="*"
    fi

    printf "    %2d) %s %s\n" $iw "$have" "$file"
done
wfile="${wfiles[$wtype]}"

# ask for weights type until provided and available
while [[ -z "$wfile" ]]; do
    printf "\n"
    read -p "[+] Select weight type: " wtype

    wfile="${wfiles[$wtype]}"

    if [[ -z "$wfile" ]]; then
        printf "[-] Invalid weight type: %s\n" "$wtype"
        wtype=""
    fi
done

printf "[+] Selected weight type: %s (%s)\n" "$wtype" "$wfile"
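# raw model files are served from the repo's /resolve/main/<filename> path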
url="${repo%/}/resolve/main/$wfile"

# the check file records whether the model has already been downloaded successfully
chk="$wfile.chk"

# check if we should download the file
# - if $wfile does not exist
# - if $wfile exists but $chk does not exist
# - if $wfile exists and $chk exists but $wfile is newer than $chk
# TODO: better logic using git lfs info

do_download=0

if [[ ! -f "$wfile" ]]; then
    do_download=1
elif [[ ! -f "$chk" ]]; then
    do_download=1
elif [[ "$wfile" -nt "$chk" ]]; then
    do_download=1
fi

if [[ $do_download -eq 1 ]]; then
    printf "[+] Downloading weights from %s\n" "$url"

    # download the weights file
    curl -o "$wfile" -# -L "$url"

    # create a check file if successful
    if [[ $? -eq 0 ]]; then
        printf "[+] Creating check file %s\n" "$chk"
        touch "$chk"
    fi
else
    printf "[+] Using cached weights %s\n" "$wfile"
fi

# get latest llama.cpp and build
printf "[+] Downloading latest llama.cpp\n"
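# use a separate working directory per port, so multiple instances started by this script can coexist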
llama_cpp_dir="__llama_cpp_port_${port}__"

if [[ -d "$llama_cpp_dir" && ! -f "$llama_cpp_dir/__ggml_script__" ]]; then
    # if the dir exists and there isn't a file "__ggml_script__" in it, abort
    printf "[-] Directory %s already exists\n" "$llama_cpp_dir"
    printf "[-] Please remove it and try again\n"
    exit 1
elif [[ -d "$llama_cpp_dir" ]]; then
    printf "[+] Directory %s already exists\n" "$llama_cpp_dir"
    printf "[+] Using cached llama.cpp\n"

    cd "$llama_cpp_dir"
    git reset --hard
    git fetch
    git checkout origin/master

    cd ..
else
    printf "[+] Cloning llama.cpp\n"

    git clone https://github.com/ggerganov/llama.cpp "$llama_cpp_dir"
fi

# mark that the directory was created by this script
touch "$llama_cpp_dir/__ggml_script__"

if [[ $verbose -eq 1 ]]; then
    set -x
fi

# build
cd "$llama_cpp_dir"

make clean

log="--silent"
if [[ $verbose -eq 1 ]]; then
    log=""
fi
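# LLAMA_CUDA=1 enables the CUDA code path in the llama.cpp Makefile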
if [[ "$backend" == "cuda" ]]; then
    printf "[+] Building with CUDA backend\n"
    LLAMA_CUDA=1 make -j llama-server $log
elif [[ "$backend" == "cpu" ]]; then
    printf "[+] Building with CPU backend\n"
    make -j llama-server $log
elif [[ "$backend" == "metal" ]]; then
    printf "[+] Building with Metal backend\n"
    make -j llama-server $log
else
    printf "[-] Unknown backend: %s\n" "$backend"
    exit 1
fi

# run the server
printf "[+] Running server\n"

args=""
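# -ngl sets the number of model layers to offload to the GPU; 999 effectively means "all layers"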
if [[ "$backend" == "cuda" ]]; then
    export CUDA_VISIBLE_DEVICES=$gpu_id
    args="-ngl 999"
elif [[ "$backend" == "cpu" ]]; then
    args="-ngl 0"
elif [[ "$backend" == "metal" ]]; then
    args="-ngl 999"
else
    printf "[-] Unknown backend: %s\n" "$backend"
    exit 1
fi

if [[ $verbose -eq 1 ]]; then
    args="$args --verbose"
fi
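# once the server is up, it can be tested with, for example:
#
#   curl http://localhost:8888/completion -d '{"prompt": "Hello", "n_predict": 16}'
#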
./llama-server -m "../$wfile" --host 0.0.0.0 --port "$port" -c $n_kv -np "$n_parallel" $args

exit 0