# This script downloads the tokenizer models of the specified models from Hugging Face and
# generates the get_vocab_base_pre() function for convert-hf-to-gguf.py
#
# This is necessary in order to analyze the type of pre-tokenizer used by the model and
# provide the necessary information to llama.cpp via the GGUF header in order to implement
# the same pre-tokenizer.
#
# ref: https://github.com/ggerganov/llama.cpp/pull/6920
#
# Instructions:
#
# - Add a new model to the "models" list
# - Run the script with your Hugging Face token:
#
#   python3 convert-hf-to-gguf-update.py <huggingface_token>
#
# - Copy-paste the generated get_vocab_base_pre() function into convert-hf-to-gguf.py
# - Update llama.cpp with the new pre-tokenizer if necessary
#
# TODO: generate tokenizer tests for llama.cpp
# TODO: automate the update of convert-hf-to-gguf.py
#
import os
import requests
import sys
import json

from hashlib import sha256
from enum import IntEnum, auto

class TOKENIZER_TYPE(IntEnum):
    SPM = auto()  # SentencePiece
    BPE = auto()  # byte-pair encoding
    WPM = auto()  # WordPiece

# TODO: this string has to exercise as much pre-tokenizer functionality as possible
#       will be updated with time - contributions welcome
chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
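# the check string above mixes whitespace runs, multi-codepoint emoji, repeated digits, Khmer, Chinese,
# Cyrillic text and apostrophe contractions so that different pre-tokenizer split rules are exercised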

if len(sys.argv) == 2:
    token = sys.argv[1]
else:
    print("Usage: python3 convert-hf-to-gguf-update.py <huggingface_token>")
    sys.exit(1)

# TODO: add models here, base models preferred
models = [
    { "name": "llama-spm",      "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
    { "name": "llama-bpe",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
    { "name": "phi-3",          "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
    { "name": "deepseek-llm",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
    { "name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
    { "name": "falcon",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
    { "name": "bert-bge",       "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
    { "name": "mpt",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
    { "name": "starcoder",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
    { "name": "gpt-2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
]
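
# NOTE: some of these repos are gated on Hugging Face (e.g. the meta-llama ones) - the token passed on the
# command line must belong to an account that has been granted access, otherwise those downloads will fail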

# make directory "models/tokenizers" if it doesn't exist
if not os.path.exists("models/tokenizers"):
    os.makedirs("models/tokenizers")


def download_file_with_auth(url, token, save_path):
    headers = {"Authorization": f"Bearer {token}"}
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        with open(save_path, 'wb') as f:
            f.write(response.content)
        print(f"File {save_path} downloaded successfully")
    else:
        print(f"Failed to download file. Status code: {response.status_code}")
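
# NOTE: download_file_with_auth() does not raise on failure - a failed download only prints a message here
#       and shows up later as a missing tokenizer file, so keep an eye on the output of this step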

# download the tokenizer models
for model in models:
    name = model["name"]
    repo = model["repo"]
    tokt = model["tokt"]

    if not os.path.exists(f"models/tokenizers/{name}"):
        os.makedirs(f"models/tokenizers/{name}")
    else:
        print(f"Directory models/tokenizers/{name} already exists - skipping")
        continue

    print(f"Downloading {name} to models/tokenizers/{name}")

    url = f"{repo}/raw/main/config.json"
    save_path = f"models/tokenizers/{name}/config.json"
    download_file_with_auth(url, token, save_path)

    url = f"{repo}/raw/main/tokenizer.json"
    save_path = f"models/tokenizers/{name}/tokenizer.json"
    download_file_with_auth(url, token, save_path)

    # sentencepiece-based tokenizers also need the raw tokenizer.model file
    if tokt == TOKENIZER_TYPE.SPM:
        url = f"{repo}/resolve/main/tokenizer.model"
        save_path = f"models/tokenizers/{name}/tokenizer.model"
        download_file_with_auth(url, token, save_path)

    url = f"{repo}/raw/main/tokenizer_config.json"
    save_path = f"models/tokenizers/{name}/tokenizer_config.json"
    download_file_with_auth(url, token, save_path)

# generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function:
# TODO: auto-update convert-hf-to-gguf.py with the generated function

src_ifs = ""
for model in models:
    name = model["name"]
    tokt = model["tokt"]

    if tokt == TOKENIZER_TYPE.SPM:
        continue

    # create the tokenizer
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")

    chktok = tokenizer.encode(chktxt)
    chkhsh = sha256(str(chktok).encode()).hexdigest()

    print(f"model: {name}")
    print(f"tokt: {tokt}")
    print(f"repo: {model['repo']}")
    print(f"chktok: {chktok}")
    print(f"chkhsh: {chkhsh}")

    # print the "pre_tokenizer" content from the tokenizer.json
    with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
        cfg = json.load(f)
        pre_tokenizer = cfg["pre_tokenizer"]
        print("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))

    print(f"\n")

    src_ifs += f"        if chkhsh == \"{chkhsh}\":\n"
    src_ifs += f"            # ref: {model['repo']}\n"
    src_ifs += f"            res = \"{name}\"\n"

src_func = ""
src_func += "    def get_vocab_base_pre(self, tokenizer) -> str:\n"
src_func += "        # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that\n"
src_func += "        # is specific for the BPE pre-tokenizer used by the model\n"
src_func += "        # we will use this unique identifier to write a \"tokenizer.ggml.pre\" entry in the GGUF file which we can\n"
src_func += "        # use in llama.cpp to implement the same pre-tokenizer\n"
src_func += "\n"
src_func += f"        chktxt = {repr(chktxt)}\n"
src_func += "\n"
src_func += "        chktok = tokenizer.encode(chktxt)\n"
src_func += "        chkhsh = sha256(str(chktok).encode()).hexdigest()\n"
src_func += "\n"
src_func += "        print(f\"chktok: {chktok}\")\n"
src_func += "        print(f\"chkhsh: {chkhsh}\")\n"
src_func += "\n"
src_func += "        res = None\n"
src_func += "\n"
src_func += "        # NOTE: if you get an error here, you need to add the model to the if-elif chain below\n"
src_func += "        # don't do this manually - use the convert-hf-to-gguf-update.py script!\n"
src_func += f"{src_ifs}\n"
src_func += "        if res is None:\n"
src_func += "            print(\"\\n\")\n"
src_func += "            print(\"**************************************************************************************\")\n"
src_func += "            print(\"** WARNING: The BPE pre-tokenizer was not recognized!\")\n"
src_func += "            print(\"** This means that it was not added yet or you are using an older version.\")\n"
src_func += "            print(\"** Check convert-hf-to-gguf-update.py and update it accordingly.\")\n"
src_func += "            print(\"**\")\n"
src_func += "            print(f\"** chkhsh: {chkhsh}\")\n"
src_func += "            print(\"**************************************************************************************\")\n"
src_func += "            print(\"\\n\")\n"
src_func += "            raise NotImplementedError(\"BPE pre-tokenizer was not recognized - update get_vocab_base_pre()\")\n"
src_func += "\n"
src_func += "        print(f\"tokenizer.ggml.pre: {res}\")\n"
src_func += "        print(f\"chkhsh: {chkhsh}\")\n"
src_func += "\n"
src_func += "        return res\n"

print(src_func)

print("\n")
print("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
print("\n")

# generate tests for each tokenizer model
tests = [
    "",
    " ",
    "  ",
    "   ",
    "\t",
    "\n",
    "\n\n",
    "\n\n\n",
    "\t\n",
    "Hello world",
    " Hello world",
    "Hello World",
    " Hello World",
    " Hello World!",
    "Hello, world!",
    " Hello, world!",
    " this is 🦙.cpp",
    "w048 7tuijk dsdfhu",
    "нещо на Български",
    "កាន់តែពិសេសអាចខលចេញ",
    "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
    "Hello",
    " Hello",
    "  Hello",
    "   Hello",
    "    Hello",
    "    Hello\n    Hello",
    " (",
    "\n =",
    "' era",
    "Hello, y'all! How are you 😁 ?我想在apple工作1314151天~",
    "3",
    "33",
    "333",
    "3333",
    "33333",
    "333333",
    "3333333",
    "33333333",
    "333333333",
    chktxt,
]

# write the tests to ./models/ggml-vocab-{name}.gguf.inp
# the format is:
#
#   test0
#   __ggml_vocab_test__
#   test1
#   __ggml_vocab_test__
#   ...
#
# with each model, encode all tests and write the results in ./models/ggml-vocab-{name}.gguf.out
# for each test, write the resulting tokens on a separate line
for model in models:
    name = model["name"]
    tokt = model["tokt"]

    # create the tokenizer
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")

    with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f:
        for text in tests:
            f.write(f"{text}")
            f.write("\n__ggml_vocab_test__\n")

    with open(f"models/ggml-vocab-{name}.gguf.out", "w", encoding="utf-8") as f:
        for text in tests:
            res = tokenizer.encode(text, add_special_tokens=False)
            for r in res:
                f.write(f" {r}")
            f.write("\n")

    print(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*")

# generate commands for creating vocab files
print("\nRun the following commands to generate the vocab files for testing:\n")

for model in models:
    name = model["name"]
    print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only")

print("\n")
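
# the loop above prints one command per model, e.g. for the "llama-bpe" entry:
#
#   python3 convert-hf-to-gguf.py models/tokenizers/llama-bpe/ --outfile models/ggml-vocab-llama-bpe.gguf --vocab-only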