pod-llama.sh

#!/bin/bash
#
# Use this script only on fresh pods (runpod.io)!
# Otherwise, it can break your environment!
#

if [ -z "$1" ]; then
    echo "Usage: $0 <data>"
    echo " 0: no models"
    echo " 1: tinyllama-1b"
    echo " 2: codellama-7b"
    echo " 3: codellama-13b"
    echo " 4: codellama-34b"
    echo " 5: codellama-7b-instruct"
    echo " 6: codellama-13b-instruct"
    echo " 7: codellama-34b-instruct"
    exit 1
fi

set -x

# setup deps
apt-get update
apt-get install -y git-lfs cmake cmake-curses-gui vim ruby

git-lfs install

if [ ! -d "/workspace" ]; then
    ln -sfn "$(pwd)" /workspace
fi

# download data
cd /workspace

# this is useful to git clone repos without doubling the disk size due to .git
git clone https://github.com/iboB/git-lfs-download
ln -sfn /workspace/git-lfs-download/git-lfs-download /usr/local/bin/git-lfs-download
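# git-lfs-download fetches a repo's files (including LFS content) without keeping the
# full .git history; the invocations below also pass its --without pattern option to
# skip files that are not needed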

# llama.cpp
cd /workspace
git clone https://github.com/ggerganov/llama.cpp

cd llama.cpp

LLAMA_CUDA=1 make -j

ln -sfn /workspace/TinyLlama-1.1B-Chat-v0.3 ./models/tinyllama-1b
ln -sfn /workspace/CodeLlama-7b-hf ./models/codellama-7b
ln -sfn /workspace/CodeLlama-13b-hf ./models/codellama-13b
ln -sfn /workspace/CodeLlama-34b-hf ./models/codellama-34b
ln -sfn /workspace/CodeLlama-7b-Instruct-hf ./models/codellama-7b-instruct
ln -sfn /workspace/CodeLlama-13b-Instruct-hf ./models/codellama-13b-instruct
ln -sfn /workspace/CodeLlama-34b-Instruct-hf ./models/codellama-34b-instruct
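# the symlinks above let llama.cpp find the HF checkouts that the selected model
# section below places in /workspace (they dangle until that download has run)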

pip install -r requirements.txt

# cmake
cd /workspace/llama.cpp
mkdir build-cublas
cd build-cublas
cmake -DLLAMA_CUDA=1 ../
make -j
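# note: this leaves two CUDA-enabled builds - the Makefile build in the repo root
# (used by the ./batched, ./batched-bench and ./parallel runs below) and this CMake
# build in build-cublas (used for the perf sweep and ./bin/perplexity)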

if [ "$1" -eq "0" ]; then
    exit 0
fi

# more models
if [ "$1" -eq "1" ]; then
    cd /workspace

    git-lfs-download https://huggingface.co/PY007/TinyLlama-1.1B-Chat-v0.3

    cd /workspace/llama.cpp
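    # convert the HF checkout to an f16 GGUF, then produce quantized variants;
    # the q4_k alias should correspond to Q4_K_M in llama.cpp's quantize tool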
    python3 convert.py ./models/tinyllama-1b --outfile ./models/tinyllama-1b/ggml-model-f16.gguf --outtype f16

    ./quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q4_0.gguf q4_0
    ./quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q4_k.gguf q4_k
    ./quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q8_0.gguf q8_0
fi

if [ "$1" -eq "2" ]; then
    cd /workspace
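    # the HF repo ships both .safetensors and PyTorch .bin weights, so the .safetensors
    # copies are skipped (and any stragglers removed) to avoid downloading the weights
    # twice; the same download + convert + quantize pattern repeats for each CodeLlama
    # variant below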
    git-lfs-download https://huggingface.co/codellama/CodeLlama-7b-hf --without "*safetensors*"
    rm -v ./CodeLlama-7b-hf/*safetensors*

    cd /workspace/llama.cpp

    python3 convert.py ./models/codellama-7b --outfile ./models/codellama-7b/ggml-model-f16.gguf --outtype f16

    ./quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q4_0.gguf q4_0
    ./quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q4_k.gguf q4_k
    ./quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q8_0.gguf q8_0
fi

if [ "$1" -eq "3" ]; then
    cd /workspace

    git-lfs-download https://huggingface.co/codellama/CodeLlama-13b-hf --without "*safetensors*"
    rm -v ./CodeLlama-13b-hf/*safetensors*

    cd /workspace/llama.cpp

    python3 convert.py ./models/codellama-13b --outfile ./models/codellama-13b/ggml-model-f16.gguf --outtype f16

    ./quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q4_0.gguf q4_0
    ./quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q4_k.gguf q4_k
    ./quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q8_0.gguf q8_0
fi

if [ "$1" -eq "4" ]; then
    cd /workspace

    git-lfs-download https://huggingface.co/codellama/CodeLlama-34b-hf --without "*safetensors*"
    rm -v ./CodeLlama-34b-hf/*safetensors*

    cd /workspace/llama.cpp

    python3 convert.py ./models/codellama-34b --outfile ./models/codellama-34b/ggml-model-f16.gguf --outtype f16

    ./quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q4_0.gguf q4_0
    ./quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q4_k.gguf q4_k
    ./quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q8_0.gguf q8_0
fi

if [ "$1" -eq "5" ]; then
    cd /workspace

    git-lfs-download https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf --without "*safetensors*"
    rm -v ./CodeLlama-7b-Instruct-hf/*safetensors*

    cd /workspace/llama.cpp

    python3 convert.py ./models/codellama-7b-instruct --outfile ./models/codellama-7b-instruct/ggml-model-f16.gguf --outtype f16

    ./quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q4_0.gguf q4_0
    ./quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q4_k.gguf q4_k
    ./quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q8_0.gguf q8_0
fi

if [ "$1" -eq "6" ]; then
    cd /workspace

    git-lfs-download https://huggingface.co/codellama/CodeLlama-13b-Instruct-hf --without "*safetensors*"
    rm -v ./CodeLlama-13b-Instruct-hf/*safetensors*

    cd /workspace/llama.cpp

    python3 convert.py ./models/codellama-13b-instruct --outfile ./models/codellama-13b-instruct/ggml-model-f16.gguf --outtype f16

    ./quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q4_0.gguf q4_0
    ./quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q4_k.gguf q4_k
    ./quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q8_0.gguf q8_0
fi

if [ "$1" -eq "7" ]; then
    cd /workspace

    git-lfs-download https://huggingface.co/codellama/CodeLlama-34b-Instruct-hf --without "*safetensors*"
    rm -v ./CodeLlama-34b-Instruct-hf/*safetensors*

    cd /workspace/llama.cpp

    python3 convert.py ./models/codellama-34b-instruct --outfile ./models/codellama-34b-instruct/ggml-model-f16.gguf --outtype f16

    ./quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q4_0.gguf q4_0
    ./quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q4_k.gguf q4_k
    ./quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q8_0.gguf q8_0
fi

if [ "$1" -eq "1" ]; then
    # perf + perplexity
    cd /workspace/llama.cpp/build-cublas
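    # run-all-perf.sh sweeps the f16 tinyllama model over the listed prompt sizes
    # (-p 1..2048) generating 128 tokens per run (-n 128), fully offloaded to the GPU
    # (-ngl 99); the quoted flag string is presumably forwarded to the bench binaries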
    make -j && ../scripts/run-all-perf.sh tinyllama-1b "f16" "-ngl 99 -t 1 -p 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,32,64,128,256,512,1024,2048 -n 128"

    ../scripts/get-wikitext-2.sh
    unzip wikitext-2-raw-v1.zip
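    # perplexity over the first 32 chunks (--chunks 32) of the wikitext-2 test set,
    # with all layers on the GPU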
    make -j && ./bin/perplexity -m ../models/tinyllama-1b/ggml-model-f16.gguf -f ./wikitext-2-raw/wiki.test.raw -ngl 100 --chunks 32

    # batched
    cd /workspace/llama.cpp

    LLAMA_CUDA=1 make -j && ./batched ./models/tinyllama-1b/ggml-model-f16.gguf "Hello, my name is" 8 128 999
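    # the positional args to ./batched appear to be: parallel sequences (8), tokens to
    # generate (128) and GPU layers (999) - check the example's usage string to confirm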

    # batched-bench
    cd /workspace/llama.cpp

    LLAMA_CUDA=1 make -j && ./batched-bench ./models/tinyllama-1b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32
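    # batched-bench measures prompt-processing and generation speed at the listed batch
    # sizes (1..32); the numeric args configure the KV cache size, GPU layers and the
    # prompt/generation lengths - see examples/batched-bench for the exact order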

    # parallel
    cd /workspace/llama.cpp
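    # should simulate 8 parallel clients (-np 8) serving 128 sequences (-ns 128) with
    # continuous batching enabled (-cb)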
    LLAMA_CUDA=1 make -j && ./parallel -m ./models/tinyllama-1b/ggml-model-f16.gguf -t 1 -ngl 100 -c 4096 -b 512 -s 1 -np 8 -ns 128 -n 100 -cb
fi

# speculative
#if [ "$1" -eq "7" ]; then
#    cd /workspace/llama.cpp
#
#    LLAMA_CUDA=1 make -j && ./speculative -m ./models/codellama-34b-instruct/ggml-model-f16.gguf -md ./models/codellama-7b-instruct/ggml-model-q4_0.gguf -p "# Dijkstra's shortest path algorithm in Python (4 spaces indentation) + complexity analysis:\n\n" -e -ngl 999 -ngld 999 -t 4 -n 512 -c 4096 -s 21 --draft 16 -np 1 --temp 0.0
#fi
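# if enabled, the speculative run above would use the 7B instruct q4_0 model as a
# draft (-md) for the 34B instruct f16 target, proposing up to 16 tokens per step
# (--draft 16)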

# more benches
#LLAMA_CUDA=1 make -j && ./batched-bench ./models/codellama-7b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1
#LLAMA_CUDA=1 make -j && ./batched-bench ./models/codellama-13b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1