# Test libllama tokenizer == AutoTokenizer.
# Brute force random words/text generation.
#
# Sample usage:
#
#   python3 tests/test-tokenizer-random.py ./models/ggml-vocab-llama-bpe.gguf ./models/tokenizers/llama-bpe
#

import time
import logging
import argparse
import subprocess
import random
import unicodedata

from typing import Iterator

import cffi
from transformers import AutoTokenizer


logger = logging.getLogger("test-tokenizer-random")
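

# Prerequisites assumed by the defaults below: a shared libllama built with
# BUILD_SHARED_LIBS=ON (see LibLlama.DEFAULT_PATH_LIBLLAMA), plus the Python
# packages `cffi` (for the C bindings) and `transformers` (for the HuggingFace
# ground-truth tokenizer).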


class LibLlama:

    DEFAULT_PATH_LLAMA_H = "./include/llama.h"
    DEFAULT_PATH_INCLUDES = ["./ggml/include/", "./include/"]
    DEFAULT_PATH_LIBLLAMA = "./build/src/libllama.so"  # CMakeLists.txt: BUILD_SHARED_LIBS ON

    def __init__(self, path_llama_h: str = None, path_includes: list[str] = [], path_libllama: str = None):
        path_llama_h = path_llama_h or self.DEFAULT_PATH_LLAMA_H
        path_includes = path_includes or self.DEFAULT_PATH_INCLUDES
        path_libllama = path_libllama or self.DEFAULT_PATH_LIBLLAMA
        (self.ffi, self.lib) = self._load_libllama_cffi(path_llama_h, path_includes, path_libllama)
        self.lib.llama_backend_init()

    def _load_libllama_cffi(self, path_llama_h: str, path_includes: list[str], path_libllama: str):
        cmd = ["gcc", "-E", "-P", "-D__restrict=", "-D__attribute__(x)=", "-D__asm__(x)="]
        cmd += ["-I" + path for path in path_includes] + [path_llama_h]
        res = subprocess.run(cmd, stdout=subprocess.PIPE)
        assert (res.returncode == 0)
        source = res.stdout.decode()
        ffi = cffi.FFI()
        if True:  # workarounds for pycparser
            source = "typedef struct { } __builtin_va_list;" + "\n" + source
            source = source.replace("sizeof (int)",    str(ffi.sizeof("int")))
            source = source.replace("sizeof (void *)", str(ffi.sizeof("void*")))
            source = source.replace("sizeof (size_t)", str(ffi.sizeof("size_t")))
            source = source.replace("sizeof(int32_t)", str(ffi.sizeof("int32_t")))
        ffi.cdef(source, override=True)
        lib = ffi.dlopen(path_libllama)
        return (ffi, lib)

    def model_default_params(self, **kwargs):
        mparams = self.lib.llama_model_default_params()
        for k, v in kwargs.items():
            setattr(mparams, k, v)
        return mparams

    def context_default_params(self, **kwargs):
        cparams = self.lib.llama_context_default_params()
        for k, v in kwargs.items():
            setattr(cparams, k, v)
        return cparams
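

# NOTE: LibLlama above binds libllama.so via cffi by running llama.h through
# `gcc -E` first, since cffi's cdef() parser (pycparser) cannot handle
# preprocessor directives or GCC extensions. A minimal usage sketch of the
# resulting wrapper (paths are the defaults this script assumes):
#
#     libllama = LibLlama()                      # loads ./build/src/libllama.so
#     mparams  = libllama.model_default_params(vocab_only=True)
#     cparams  = libllama.context_default_params(n_ctx=4096)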


class LibLlamaModel:

    def __init__(self, libllama: LibLlama, path_model: str, mparams={}, cparams={}):
        self.lib = libllama.lib
        self.ffi = libllama.ffi
        if isinstance(mparams, dict):
            mparams = libllama.model_default_params(**mparams)
        self.model = self.lib.llama_load_model_from_file(path_model.encode(), mparams)
        if not self.model:
            raise RuntimeError("error: failed to load model '%s'" % path_model)
        if isinstance(cparams, dict):
            cparams = libllama.context_default_params(**cparams)
        self.ctx = self.lib.llama_new_context_with_model(self.model, cparams)
        if not self.ctx:
            raise RuntimeError("error: failed to create context for model '%s'" % path_model)
        n_tokens_max = self.lib.llama_n_ctx(self.ctx)
        self.token_ids = self.ffi.new("llama_token[]", n_tokens_max)
        self.text_buff = self.ffi.new("uint8_t[]", 1024)

    def free(self):
        if self.ctx:
            self.lib.llama_free(self.ctx)
        if self.model:
            self.lib.llama_free_model(self.model)
        self.ctx = None
        self.model = None
        self.lib = None

    def tokenize(self, text: str, add_special: bool = False, parse_special: bool = False) -> list[int]:
        text = text.encode("utf-8")
        num = self.lib.llama_tokenize(self.model, text, len(text), self.token_ids, len(self.token_ids), add_special, parse_special)
        while num < 0 and len(self.token_ids) < (16 << 20):
            self.token_ids = self.ffi.new("llama_token[]", -2 * num)
            num = self.lib.llama_tokenize(self.model, text, len(text), self.token_ids, len(self.token_ids), add_special, parse_special)
        return list(self.token_ids[0:num])

    def detokenize(self, ids: list[int], remove_special: bool = False, unparse_special: bool = False) -> str:
        if len(self.token_ids) < len(ids):
            self.token_ids = self.ffi.new("llama_token[]", 2 * len(ids))
        for i, id in enumerate(ids):
            self.token_ids[i] = id
        num = self.lib.llama_detokenize(self.model, self.token_ids, len(ids), self.text_buff, len(self.text_buff), remove_special, unparse_special)
        while num < 0 and len(self.text_buff) < (16 << 20):
            self.text_buff = self.ffi.new("uint8_t[]", -2 * num)
            num = self.lib.llama_detokenize(self.model, self.token_ids, len(ids), self.text_buff, len(self.text_buff), remove_special, unparse_special)
        return str(self.ffi.buffer(self.text_buff, num), encoding="utf-8", errors="replace")  # replace errors with '\uFFFD'
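

# NOTE on the retry loops in tokenize()/detokenize() above: when the output
# buffer is too small, llama_tokenize()/llama_detokenize() return a negative
# value whose magnitude indicates the required size, so the buffer is regrown
# to twice that (-2 * num) and the call is retried, capped at 16 Mi entries
# (16 << 20).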


class Tokenizer:

    def encode(self, text: str) -> list[int]:
        raise NotImplementedError

    def decode(self, ids: list[int]) -> str:
        raise NotImplementedError


class TokenizerGroundtruth (Tokenizer):

    def __init__(self, dir_tokenizer: str):
        self.model = AutoTokenizer.from_pretrained(dir_tokenizer)
        # guess BOS and EOS
        ids = self.encode("a")
        assert 1 <= len(ids) <= 3
        add_bos_token = len(ids) > 1 and self.model.bos_token_id == ids[0]
        add_eos_token = len(ids) > 1 and self.model.eos_token_id == ids[-1]
        self.add_bos_token = getattr(self.model, "add_bos_token", add_bos_token)
        self.add_eos_token = getattr(self.model, "add_eos_token", add_eos_token)
        # build vocab
        tokens = list(self.model.get_vocab().values())
        self.vocab = self.model.batch_decode(tokens, skip_special_tokens=True)
        self.vocab = list(sorted(self.vocab))
        # tokens and lists
        self.special_tokens = list(self.model.all_special_tokens)
        self.added_tokens = list(self.model.added_tokens_encoder)
        self.bos_token = self.model.bos_token
        self.eos_token = self.model.eos_token

    def encode(self, text: str) -> list[int]:
        return self.model.encode(text, add_special_tokens=True)

    def decode(self, ids: list[int]) -> str:
        return self.model.decode(ids, skip_special_tokens=False)


class TokenizerLlamaCpp (Tokenizer):

    libllama: LibLlama = None

    def __init__(self, vocab_file: str):
        if not self.libllama:
            self.libllama = LibLlama()
        self.model = LibLlamaModel(self.libllama, vocab_file, mparams=dict(vocab_only=True), cparams=dict(n_ctx=4096))

    def encode(self, text: str) -> list[int]:
        return self.model.tokenize(text, add_special=True, parse_special=True)

    def decode(self, ids: list[int]) -> str:
        return self.model.detokenize(ids, remove_special=False, unparse_special=True)
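

# A minimal usage sketch of the two wrappers above (the paths match the sample
# usage at the top of this file and are only an assumption about the local setup):
#
#     tokenizer1 = TokenizerGroundtruth("./models/tokenizers/llama-bpe")
#     tokenizer2 = TokenizerLlamaCpp("./models/ggml-vocab-llama-bpe.gguf")
#     assert tokenizer1.encode("Hello world") == tokenizer2.encode("Hello world")
#     tokenizer2.model.free()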


def generator_custom_text() -> Iterator[str]:
    """General tests"""
    yield from [
        "",
        " ",
        "  ",
        "   ",
        "\t",
        "\n",
        "\n\n",
        "\n\n\n",
        "\t\n",
        "Hello world",
        " Hello world",
        "Hello World",
        " Hello World",
        " Hello World!",
        "Hello, world!",
        " Hello, world!",
        " this is 🦙.cpp",
        "w048 7tuijk dsdfhu",
        "нещо на Български",
        "កាន់តែពិសេសអាចខលចេញ",
        "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
        "Hello",
        " Hello",
        "  Hello",
        "   Hello",
        "    Hello",
        "    Hello\n    Hello",
        " (",
        "\n =",
        "' era",
        "Hello, y'all! How are you 😁 ?我想在apple工作1314151天~",
        "3",
        "33",
        "333",
        "3333",
        "33333",
        "333333",
        "3333333",
        "33333333",
        "333333333",
    ]


def generator_custom_text_edge_cases() -> Iterator[str]:
    """Edge cases found while debugging"""
    yield from [
        '\x1f-a',     # unicode_ranges_control, {0x00001C, 0x00001F}
        '¼-a',        # unicode_ranges_digit, 0x00BC
        '½-a',        # unicode_ranges_digit, 0x00BD
        '¾-a',        # unicode_ranges_digit, 0x00BE
        'a 〇b',      # unicode_ranges_digit, 0x3007
        'Ⅵ-a',       # unicode_ranges_digit, {0x00002150, 0x0000218F} // Number Forms
        '\uFEFF//',   # unicode_ranges_control, 0xFEFF (BOM)
        'Cửa Việt',   # llama-3, ignore_merges = true
        '<s>a',       # Phi-3 fail
        '<unk><|endoftext|><s>',  # Phi-3 fail
        'a\na',            # bert fail
        '"`',              # falcon
        ' \u2e4e',         # falcon
        'a\xa0\xa0\x00b',  # jina-v2-es
        'one <mask>',      # jina-v2-es <mask> lstrip=true
        'a </s> b',        # rstrip phi-3
        'a <mask> b',      # lstrip jina-v2
        '\xa0aC',          # deepseek
        '\u2029 \uA3E4',   # deepseek-llm
        "a ?",
        'å',               # mpt
        '\U000ac517',      # utf-8 encode error, falcon
        '\U000522f4',      # utf-8 encode error, starcoder
        "<s><s><unk><s>a<s>b<s>c<unk>d<unk></s>",
        "<s> <s> <unk><s>a<s>b<s>c<unk>d<unk></s>",
    ]


def generator_vocab_words(tokenizer: TokenizerGroundtruth) -> Iterator[str]:
    """Brute force check all vocab words"""
    yield from tokenizer.vocab


def generator_ascii_lr_strip() -> Iterator[str]:
    WHITESPACES = ["", " ", "  "]
    CHARACTERS = list(chr(i) for i in range(1, 0x80)) + [""]
    for char1 in CHARACTERS:
        for char2 in CHARACTERS:
            for lstrip in WHITESPACES:
                for rstrip in WHITESPACES:
                    yield lstrip + char1 + char2 + rstrip
                    yield lstrip + char1 + rstrip + char2
                    yield char1 + lstrip + char2 + rstrip


def generator_apostrophe() -> Iterator[str]:
    WHITESPACES = ["", " ", "  "]
    CHARACTERS = list(chr(i) for i in range(1, 0x80)) + [""]
    for char1 in CHARACTERS:
        for char2 in CHARACTERS:
            for lstrip in WHITESPACES:
                for rstrip in WHITESPACES:
                    yield char1 + lstrip + "'" + rstrip + char2
                    yield char1 + char2 + lstrip + "'" + rstrip + "z"
                    yield "a" + lstrip + "'" + rstrip + char1 + char2


def generator_added_lr_strip(tokenizer: TokenizerGroundtruth) -> Iterator[str]:
    WHITESPACES = ["", " ", "  ", "\n", "\r\n", "\n\n", "\t", "\t\t"]
    all_tokens = list(sorted(set(tokenizer.special_tokens + tokenizer.added_tokens)))
    for token in all_tokens:
        for lstrip in WHITESPACES:
            for rstrip in WHITESPACES:
                yield lstrip + token + rstrip
                yield "a" + lstrip + token + rstrip
                yield lstrip + token + rstrip + "z"
                yield "a" + lstrip + token + rstrip + "z"


def generator_random_added_tokens(tokenizer: TokenizerGroundtruth, iterations=100) -> Iterator[str]:
    separations = [" ", "\n", "\t", "-", "!", "one", "1", "<s>", "</s>"]
    all_tokens = list(sorted(set(tokenizer.special_tokens + tokenizer.added_tokens + separations)))
    rand = random.Random()
    for m in range(iterations):
        rand.seed(m)
        words = rand.choices(all_tokens, k=500)
        if words and words[0] == tokenizer.bos_token:  # skip spam warning of double BOS
            while len(words) > 1 and words[1] == tokenizer.bos_token:  # leave one starting BOS
                words.pop(0)
            if tokenizer.add_bos_token:  # drop all starting BOS
                words.pop(0)
        if words and words[-1] == tokenizer.eos_token:  # skip spam warning of double EOS
            while len(words) > 1 and words[-2] == tokenizer.eos_token:  # leave one trailing EOS
                words.pop(-1)
            if tokenizer.add_eos_token:  # drop all trailing EOS
                words.pop(-1)
        yield "".join(words)


def generator_random_chars(iterations=100) -> Iterator[str]:
    """Brute force random text with simple characters"""

    NUM_WORDS = 400
    WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5)
    CHARS = list(sorted(set("""
        ABCDEFGHIJKLMNOPQRSTUVWXYZ
        abcdefghijklmnopqrstuvwxyz
        ÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÄËÏÖÜ
        áéíóúàèìòùâêîôûäëïöü
        .-,*/-+ª!"·$%&/()=?¿[]{}<>\\|@#~½¬~;:_
    """)))

    rand = random.Random()
    for m in range(iterations):
        rand.seed(m)
        text = []
        for _ in range(NUM_WORDS):
            k = rand.randint(1, 7)
            word = rand.choices(CHARS, k=k)
            word.append(rand.choice(WHITESPACES))
            text.append("".join(word))
        yield "".join(text)


def generator_unicodes() -> Iterator[str]:
    """Iterate unicode characters"""

    MAX_CODEPOINTS = 0x30000  # 0x110000

    def _valid(cpt):
        if cpt >= 0x30000:  # unassigned and supplementary
            return False
        # if cpt == 0x2029:  # deepseek-llm
        #     return False
        if unicodedata.category(chr(cpt)) in ("Cn", "Cs", "Co"):  # undefined, surrogates, private
            return False
        return True

    characters = [chr(cpt) for cpt in range(0, MAX_CODEPOINTS) if _valid(cpt)]

    yield from characters
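

# NOTE: generator_unicodes() above caps the scan at U+2FFFF (planes 0-2: the BMP
# plus the Supplementary Multilingual and Ideographic planes); raising
# MAX_CODEPOINTS to 0x110000 would cover the full Unicode range at the cost of
# a much longer run. Codepoints in categories Cn, Cs and Co (unassigned,
# surrogates, private use) are skipped.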


def generator_random_unicodes(iterations=100) -> Iterator[str]:
    """Brute force random text with unicode characters"""

    NUM_WORDS = 200
    WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5)

    characters = list(generator_unicodes())

    rand = random.Random()
    for m in range(iterations):
        rand.seed(m)
        text = []
        for _ in range(NUM_WORDS):
            k = rand.randint(1, 7)
            word = rand.choices(characters, k=k)
            word.append(rand.choice(WHITESPACES))
            text.append("".join(word))
        yield "".join(text)


def generator_random_vocab_chars(tokenizer: TokenizerGroundtruth, iterations=100) -> Iterator[str]:
    """Brute force random text with vocab characters"""

    vocab_chars = set()
    for word in tokenizer.vocab:
        vocab_chars.update(word)
    vocab_chars = list(sorted(vocab_chars))

    rand = random.Random()
    for m in range(iterations):
        rand.seed(m)
        text = rand.choices(vocab_chars, k=1024)
        yield "".join(text)


def generator_random_vocab_words(tokenizer: TokenizerGroundtruth, iterations=100) -> Iterator[str]:
    """Brute force random text from vocab words"""

    vocab = [w.strip() for w in tokenizer.vocab]
    yield from vocab

    rand = random.Random()
    for m in range(iterations):
        rand.seed(m)
        text = []
        num_words = rand.randint(300, 400)
        for i in range(num_words):
            k = rand.randint(1, 3)
            words = rand.choices(vocab, k=k)
            sep = rand.choice(" \n\r\t")
            text.append("".join(words) + sep)
        yield "".join(text)


def compare_tokenizers(tokenizer1: TokenizerGroundtruth, tokenizer2: TokenizerLlamaCpp, generator: Iterator[str]):

    def find_first_mismatch(ids1: list[int], ids2: list[int]):
        for i, (a, b) in enumerate(zip(ids1, ids2)):
            if a != b:
                return i
        if len(ids1) == len(ids2):
            return -1
        return min(len(ids1), len(ids2))
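
    # Examples of the mismatch index convention used above (hypothetical inputs):
    #   find_first_mismatch([1, 2, 3], [1, 2, 3])    -> -1 (sequences are equal)
    #   find_first_mismatch([1, 2, 3], [1, 9, 3])    ->  1 (first differing position)
    #   find_first_mismatch([1, 2, 3], [1, 2, 3, 4]) ->  3 (shorter sequence is a prefix)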

    def check_detokenizer(text: str, text1: str, text2: str) -> bool:
        if text1 == text2:  # equal to TokenizerGroundtruth?
            return True
        # equal to source text?
        if tokenizer1.add_bos_token:  # remove BOS
            if text2.startswith(tokenizer1.bos_token):
                text2 = text2[len(tokenizer1.bos_token):]
        if tokenizer1.add_eos_token:  # remove EOS
            if text2.endswith(tokenizer1.eos_token):
                text2 = text2[:-len(tokenizer1.eos_token)]
        return text == text2

    t_encode1 = 0
    t_encode2 = 0
    t_decode1 = 0
    t_decode2 = 0
    t_start = time.perf_counter()
    encode_errors = 0
    decode_errors = 0
    MAX_ERRORS = 10

    logger.info("%s: %s" % (generator.__name__, "ini"))
    for text in generator:
        # print(repr(text), text.encode())
        # print(repr(text), hex(ord(text[0])), text.encode())
        t0 = time.perf_counter()
        ids1 = tokenizer1.encode(text)
        t1 = time.perf_counter()
        ids2 = tokenizer2.encode(text)
        t2 = time.perf_counter()
        text1 = tokenizer1.decode(ids1)
        t3 = time.perf_counter()
        text2 = tokenizer2.decode(ids1)
        t4 = time.perf_counter()
        t_encode1 += t1 - t0
        t_encode2 += t2 - t1
        t_decode1 += t3 - t2
        t_decode2 += t4 - t3
        if encode_errors < MAX_ERRORS and ids1 != ids2:
            i = find_first_mismatch(ids1, ids2)
            ids1 = list(ids1)[max(0, i - 2) : i + 5 + 1]
            ids2 = list(ids2)[max(0, i - 2) : i + 5 + 1]
            logger.error(" Expected: " + str(ids1))
            logger.error(" Result: " + str(ids2))
            encode_errors += 1
            logger.error(f" {encode_errors=}")
        if decode_errors < MAX_ERRORS and not check_detokenizer(text, text1, text2):
            i = find_first_mismatch(text1, text2)
            text1 = list(text1[max(0, i - 2) : i + 5 + 1])
            text2 = list(text2[max(0, i - 2) : i + 5 + 1])
            logger.error(" Expected: " + " ".join(hex(ord(x)) for x in text1))
            logger.error(" Result: " + " ".join(hex(ord(x)) for x in text2))
            decode_errors += 1
            logger.error(f" {decode_errors=}")
        if encode_errors >= MAX_ERRORS and decode_errors >= MAX_ERRORS:
            logger.error(f" EXIT: {encode_errors=} {decode_errors=}")
            # raise Exception()
            break

    t_total = time.perf_counter() - t_start
    logger.info(f"{generator.__name__}: end, {t_encode1=:.3f} {t_encode2=:.3f} {t_decode1=:.3f} {t_decode2=:.3f} {t_total=:.3f}")
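

# NOTE on compare_tokenizers() above: encoding is compared directly
# (ids1 vs ids2), while detokenization feeds the ground-truth ids (ids1) to
# both tokenizers and accepts tokenizer2's output if it matches either the
# AutoTokenizer text or the original input (modulo BOS/EOS stripping). The
# t_encode*/t_decode* accumulators only report rough per-generator timings.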


def main(argv: list[str] = None):
    parser = argparse.ArgumentParser()
    parser.add_argument("vocab_file", help="path to vocab 'gguf' file")
    parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
    parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
    args = parser.parse_args(argv)

    logging.basicConfig(level = logging.DEBUG if args.verbose else logging.INFO)
    logger.info(f"VOCABFILE: '{args.vocab_file}'")

    tokenizer1 = TokenizerGroundtruth(args.dir_tokenizer)
    tokenizer2 = TokenizerLlamaCpp(args.vocab_file)

    # compare_tokenizers(tokenizer1, tokenizer2, generator_custom_text())
    # compare_tokenizers(tokenizer1, tokenizer2, generator_custom_text_edge_cases())
    compare_tokenizers(tokenizer1, tokenizer2, generator_ascii_lr_strip())
    compare_tokenizers(tokenizer1, tokenizer2, generator_apostrophe())
    compare_tokenizers(tokenizer1, tokenizer2, generator_unicodes())
    compare_tokenizers(tokenizer1, tokenizer2, generator_vocab_words(tokenizer1))
    compare_tokenizers(tokenizer1, tokenizer2, generator_added_lr_strip(tokenizer1))
    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_added_tokens(tokenizer1, 10_000))
    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_chars(10_000))
    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_unicodes(10_000))
    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_vocab_chars(tokenizer1, 10_000))
    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_vocab_words(tokenizer1, 5_000))

    tokenizer2.model.free()


if __name__ == "__main__":
    # main()

    if True:
        logging.basicConfig(
            level    = logging.DEBUG,
            format   = "%(asctime)s.%(msecs)03d %(name)s %(levelname)s %(message)s",
            datefmt  = "%Y-%m-%d %H:%M:%S",
            filename = logger.name + ".log",
            filemode = "a"
        )
        logging.basicConfig(
            level  = logging.DEBUG,
            format = "%(levelname)s %(message)s",
        )
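
        # NOTE: logging.basicConfig() does nothing once the root logger already
        # has handlers, so this second call only takes effect if the file-logging
        # call above is removed (or force=True is passed).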

    path_tokenizers   = "./models/tokenizers/"
    path_vocab_format = "./models/ggml-vocab-%s.gguf"

    tokenizers = [
        "llama-spm",      # SPM
        "phi-3",          # SPM
        "gemma",          # SPM
        "gemma-2",        # SPM
        "baichuan",       # SPM
        "bert-bge",       # WPM
        "jina-v2-en",     # WPM
        "llama-bpe",      # BPE
        "phi-2",          # BPE
        "deepseek-llm",   # BPE
        "deepseek-coder", # BPE
        "falcon",         # BPE
        "mpt",            # BPE
        "starcoder",      # BPE
        "gpt-2",          # BPE
        "stablelm2",      # BPE
        "refact",         # BPE
        "qwen2",          # BPE
        "olmo",           # BPE
        "jina-v2-es",     # BPE
        "jina-v2-de",     # BPE
        "smaug-bpe",      # BPE
        "poro-chat",      # BPE
        "jina-v2-code",   # BPE
        "viking",         # BPE
        "jais",           # BPE
    ]

    logger.info("=" * 50)
    for tokenizer in tokenizers:
        logger.info("-" * 50)
        logger.info(f"TOKENIZER: '{tokenizer}'")
        vocab_file = path_vocab_format % tokenizer
        dir_tokenizer = path_tokenizers + "/" + tokenizer
        main([vocab_file, dir_tokenizer, "--verbose"])