# Test libllama tokenizer == AutoTokenizer.
# Brute force random words/text generation.
#
# Sample usage:
#
#   python3 tests/test-tokenizer-random.py ./models/ggml-vocab-llama-bpe.gguf ./models/tokenizers/llama-bpe
#
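# Prerequisites (assumed from the default paths below): libllama built as a
# shared library (CMake BUILD_SHARED_LIBS=ON) so that ./build/src/libllama.so
# exists, and the matching HuggingFace tokenizer files downloaded into the
# directory passed as the second argument.
#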

from __future__ import annotations

import time
import logging
import argparse
import subprocess
import random
import unicodedata

from pathlib import Path
from typing import Any, Iterator, cast
from typing_extensions import Buffer

import cffi
from transformers import AutoTokenizer


logger = logging.getLogger("test-tokenizer-random")


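# Thin cffi wrapper around libllama: llama.h is preprocessed with `gcc -E` so
# that ffi.cdef() can parse the declarations, then the shared library is
# dlopen'ed and the llama backend initialized.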
class LibLlama:

    DEFAULT_PATH_LLAMA_H  = "./include/llama.h"
    DEFAULT_PATH_INCLUDES = ["./ggml/include/", "./include/"]
    DEFAULT_PATH_LIBLLAMA = "./build/src/libllama.so"  # CMakeLists.txt: BUILD_SHARED_LIBS ON

    def __init__(self, path_llama_h: str | None = None, path_includes: list[str] = [], path_libllama: str | None = None):
        path_llama_h  = path_llama_h  or self.DEFAULT_PATH_LLAMA_H
        path_includes = path_includes or self.DEFAULT_PATH_INCLUDES
        path_libllama = path_libllama or self.DEFAULT_PATH_LIBLLAMA
        (self.ffi, self.lib) = self._load_libllama_cffi(path_llama_h, path_includes, path_libllama)
        self.lib.llama_backend_init()

    def _load_libllama_cffi(self, path_llama_h: str, path_includes: list[str], path_libllama: str) -> tuple[cffi.FFI, Any]:
        cmd = ["gcc", "-O0", "-E", "-P", "-D__restrict=", "-D__attribute__(x)=", "-D__asm__(x)="]
        cmd += ["-I" + path for path in path_includes] + [path_llama_h]
        res = subprocess.run(cmd, stdout=subprocess.PIPE)
        assert (res.returncode == 0)
        source = res.stdout.decode()
        ffi = cffi.FFI()
        if True:  # workarounds for pycparser
            source = "typedef struct { } __builtin_va_list;" + "\n" + source
            source = source.replace("sizeof (int)",    str(ffi.sizeof("int")))
            source = source.replace("sizeof (void *)", str(ffi.sizeof("void*")))
            source = source.replace("sizeof (size_t)", str(ffi.sizeof("size_t")))
            source = source.replace("sizeof(int32_t)", str(ffi.sizeof("int32_t")))
        ffi.cdef(source, override=True)
        lib = ffi.dlopen(path_libllama)
        return (ffi, lib)

    def model_default_params(self, **kwargs):
        mparams = self.lib.llama_model_default_params()
        for k, v in kwargs.items():
            setattr(mparams, k, v)
        return mparams

    def context_default_params(self, **kwargs):
        cparams = self.lib.llama_context_default_params()
        for k, v in kwargs.items():
            setattr(cparams, k, v)
        return cparams


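# Vocab-only model plus context; tokenize()/detokenize() retry with a larger
# C buffer whenever llama_tokenize/llama_detokenize report a negative
# (insufficient-size) result.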
class LibLlamaModel:

    def __init__(self, libllama: LibLlama, path_model: str, mparams={}, cparams={}):
        self.lib: Any = libllama.lib
        self.ffi = libllama.ffi
        if isinstance(mparams, dict):
            mparams = libllama.model_default_params(**mparams)
        self.model = self.lib.llama_load_model_from_file(path_model.encode(), mparams)
        if not self.model:
            raise RuntimeError("error: failed to load model '%s'" % path_model)
        if isinstance(cparams, dict):
            cparams = libllama.context_default_params(**cparams)
        self.ctx = self.lib.llama_new_context_with_model(self.model, cparams)
        if not self.ctx:
            raise RuntimeError("error: failed to create context for model '%s'" % path_model)
        n_tokens_max = self.lib.llama_n_ctx(self.ctx)
        self.token_ids = self.ffi.new("llama_token[]", n_tokens_max)
        self.text_buff = self.ffi.new("uint8_t[]", 1024)

    def free(self):
        if self.ctx:
            self.lib.llama_free(self.ctx)
        if self.model:
            self.lib.llama_free_model(self.model)
        self.ctx = None
        self.model = None
        self.lib = None

    def tokenize(self, text: str, add_special: bool = False, parse_special: bool = False) -> list[int]:
        encoded_text: bytes = text.encode("utf-8")
        num = self.lib.llama_tokenize(self.model, encoded_text, len(encoded_text), self.token_ids, len(self.token_ids), add_special, parse_special)
        while num < 0 and len(self.token_ids) < (16 << 20):
            self.token_ids = self.ffi.new("llama_token[]", -2 * num)
            num = self.lib.llama_tokenize(self.model, encoded_text, len(encoded_text), self.token_ids, len(self.token_ids), add_special, parse_special)
        return list(self.token_ids[0:num])

    def detokenize(self, ids: list[int], remove_special: bool = False, unparse_special: bool = False) -> str:
        if len(self.token_ids) < len(ids):
            self.token_ids = self.ffi.new("llama_token[]", 2 * len(ids))
        for i, id in enumerate(ids):
            self.token_ids[i] = id
        num = self.lib.llama_detokenize(self.model, self.token_ids, len(ids), self.text_buff, len(self.text_buff), remove_special, unparse_special)
        while num < 0 and len(self.text_buff) < (16 << 20):
            self.text_buff = self.ffi.new("uint8_t[]", -2 * num)
            num = self.lib.llama_detokenize(self.model, self.token_ids, len(ids), self.text_buff, len(self.text_buff), remove_special, unparse_special)
        return str(cast(Buffer, self.ffi.buffer(self.text_buff, num)), encoding="utf-8", errors="replace")  # replace errors with '\uFFFD'


class Tokenizer:

    def encode(self, text: str) -> list[int]:
        raise NotImplementedError

    def decode(self, ids: list[int]) -> str:
        raise NotImplementedError


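# Reference tokenizer backed by a HuggingFace AutoTokenizer; when the model
# does not expose add_bos_token/add_eos_token, the BOS/EOS behavior is guessed
# by encoding "a".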
class TokenizerGroundtruth (Tokenizer):

    def __init__(self, dir_tokenizer: str):
        self.model = AutoTokenizer.from_pretrained(dir_tokenizer)
        # guess BOS and EOS
        ids = self.encode("a")
        assert 1 <= len(ids) <= 3
        add_bos_token = len(ids) > 1 and self.model.bos_token_id == ids[0]
        add_eos_token = len(ids) > 1 and self.model.eos_token_id == ids[-1]
        self.add_bos_token = getattr(self.model, "add_bos_token", add_bos_token)
        self.add_eos_token = getattr(self.model, "add_eos_token", add_eos_token)
        # build vocab
        tokens = list(self.model.get_vocab().values())
        self.vocab = self.model.batch_decode(tokens, skip_special_tokens=True)
        self.vocab = list(sorted(self.vocab))
        # tokens and lists
        self.special_tokens = list(self.model.all_special_tokens)
        self.added_tokens   = list(self.model.added_tokens_encoder)
        self.bos_token = self.model.bos_token
        self.eos_token = self.model.eos_token

    def encode(self, text: str) -> list[int]:
        return self.model.encode(text, add_special_tokens=True)

    def decode(self, ids: list[int]) -> str:
        return self.model.decode(ids, skip_special_tokens=False)


class TokenizerLlamaCpp (Tokenizer):

    libllama: LibLlama | None = None

    def __init__(self, vocab_file: str):
        if not self.libllama:
            self.libllama = LibLlama()
        self.model = LibLlamaModel(self.libllama, vocab_file, mparams=dict(vocab_only=True), cparams=dict(n_ctx=4096))

    def encode(self, text: str) -> list[int]:
        return self.model.tokenize(text, add_special=True, parse_special=True)

    def decode(self, ids: list[int]) -> str:
        return self.model.detokenize(ids, remove_special=False, unparse_special=True)


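# Text generators: each yields test strings for compare_tokenizers(). Some are
# fixed lists of known tricky inputs, the rest brute-force random text from a
# seeded random.Random so that runs are reproducible.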
def generator_custom_text() -> Iterator[str]:
    """General tests"""
    yield from [
        "",
        " ",
        "  ",
        "   ",
        "\t",
        "\n",
        "\n\n",
        "\n\n\n",
        "\t\n",
        "Hello world",
        " Hello world",
        "Hello World",
        " Hello World",
        " Hello World!",
        "Hello, world!",
        " Hello, world!",
        " this is 🦙.cpp",
        "w048 7tuijk dsdfhu",
        "нещо на Български",
        "កាន់តែពិសេសអាចខលចេញ",
        "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
        "Hello",
        " Hello",
        "  Hello",
        "   Hello",
        "    Hello",
        "    Hello\n    Hello",
        " (",
        "\n =",
        "' era",
        "Hello, y'all! How are you 😁 ?我想在apple工作1314151天~",
        "3",
        "33",
        "333",
        "3333",
        "33333",
        "333333",
        "3333333",
        "33333333",
        "333333333",
    ]


def generator_custom_text_edge_cases() -> Iterator[str]:
    """Edge cases found while debugging"""
    yield from [
        '\x1f-a',     # unicode_ranges_control, {0x00001C, 0x00001F}
        '¼-a',        # unicode_ranges_digit, 0x00BC
        '½-a',        # unicode_ranges_digit, 0x00BD
        '¾-a',        # unicode_ranges_digit, 0x00BE
        'a 〇b',      # unicode_ranges_digit, 0x3007
        'Ⅵ-a',       # unicode_ranges_digit, {0x00002150, 0x0000218F} // Number Forms
        '\uFEFF//',   # unicode_ranges_control, 0xFEFF (BOM)
        'Cửa Việt',   # llama-3, ignore_merges = true
        '<s>a',       # Phi-3 fail
        '<unk><|endoftext|><s>',  # Phi-3 fail
        'a\na',            # bert fail
        '"`',              # falcon
        ' \u2e4e',         # falcon
        'a\xa0\xa0\x00b',  # jina-v2-es
        'one <mask>',      # jina-v2-es <mask> lstrip=true
        'a </s> b',        # rstrip phi-3
        'a <mask> b',      # lstrip jina-v2
        '\xa0aC',          # deepseek
        '\u2029 \uA3E4',   # deepseek-llm
        "a ?",
        'å',               # mpt
        '\U000ac517',      # utf-8 encode error, falcon
        '\U000522f4',      # utf-8 encode error, starcoder
        "<s><s><unk><s>a<s>b<s>c<unk>d<unk></s>",
        "<s> <s> <unk><s>a<s>b<s>c<unk>d<unk></s>",
    ]


def generator_vocab_words(tokenizer: TokenizerGroundtruth) -> Iterator[str]:
    """Brute force check all vocab words"""
    yield from tokenizer.vocab


def generator_ascii_lr_strip() -> Iterator[str]:
    WHITESPACES = ["", " ", "  "]
    CHARACTERS = list(chr(i) for i in range(1, 0x80)) + [""]
    for char1 in CHARACTERS:
        for char2 in CHARACTERS:
            for lstrip in WHITESPACES:
                for rstrip in WHITESPACES:
                    yield lstrip + char1 + char2 + rstrip
                    yield lstrip + char1 + rstrip + char2
                    yield char1 + lstrip + char2 + rstrip


def generator_apostrophe() -> Iterator[str]:
    WHITESPACES = ["", " ", "  "]
    CHARACTERS = list(chr(i) for i in range(1, 0x80)) + [""]
    for char1 in CHARACTERS:
        for char2 in CHARACTERS:
            for lstrip in WHITESPACES:
                for rstrip in WHITESPACES:
                    yield char1 + lstrip + "'" + rstrip + char2
                    yield char1 + char2 + lstrip + "'" + rstrip + "z"
                    yield "a" + lstrip + "'" + rstrip + char1 + char2


def generator_added_lr_strip(tokenizer: TokenizerGroundtruth) -> Iterator[str]:
    WHITESPACES = ["", " ", "  ", "\n", "\r\n", "\n\n", "\t", "\t\t"]
    all_tokens = list(sorted(set(tokenizer.special_tokens + tokenizer.added_tokens)))
    for token in all_tokens:
        for lstrip in WHITESPACES:
            for rstrip in WHITESPACES:
                yield lstrip + token + rstrip
                yield "a" + lstrip + token + rstrip
                yield lstrip + token + rstrip + "z"
                yield "a" + lstrip + token + rstrip + "z"


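# Random sequences of special/added tokens; duplicated leading BOS and trailing
# EOS are stripped so the comparison is not flooded with the tokenizer's
# double-BOS/EOS warnings.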
def generator_random_added_tokens(tokenizer: TokenizerGroundtruth, iterations=100) -> Iterator[str]:
    separations = [" ", "\n", "\t", "-", "!", "one", "1", "<s>", "</s>"]
    all_tokens = list(sorted(set(tokenizer.special_tokens + tokenizer.added_tokens + separations)))
    rand = random.Random()
    for m in range(iterations):
        rand.seed(m)
        words = rand.choices(all_tokens, k=500)
        if words and words[0] == tokenizer.bos_token:  # skip spam warning of double BOS
            while len(words) > 1 and words[1] == tokenizer.bos_token:  # leave one starting BOS
                words.pop(0)
            if tokenizer.add_bos_token:  # drop all starting BOS
                words.pop(0)
        if words and words[-1] == tokenizer.eos_token:  # skip spam warning of double EOS
            while len(words) > 1 and words[-2] == tokenizer.eos_token:  # leave one trailing EOS
                words.pop(-1)
            if tokenizer.add_eos_token:  # drop all trailing EOS
                words.pop(-1)
        yield "".join(words)


def generator_random_chars(iterations=100) -> Iterator[str]:
    """Brute force random text with simple characters"""

    NUM_WORDS = 400
    WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5)
    CHARS = list(sorted(set("""
        ABCDEFGHIJKLMNOPQRSTUVWXYZ
        abcdefghijklmnopqrstuvwxyz
        ÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÄËÏÖÜ
        áéíóúàèìòùâêîôûäëïöü
        .-,*/-+ª!"·$%&/()=?¿[]{}<>\\|@#~½¬~;:_
    """)))

    rand = random.Random()
    for m in range(iterations):
        rand.seed(m)
        text = []
        for _ in range(NUM_WORDS):
            k = rand.randint(1, 7)
            word = rand.choices(CHARS, k=k)
            word.append(rand.choice(WHITESPACES))
            text.append("".join(word))
        yield "".join(text)


def generator_unicodes() -> Iterator[str]:
    """Iterate unicode characters"""

    MAX_CODEPOINTS = 0x30000  # 0x110000

    def _valid(cpt):
        if cpt >= 0x30000:  # unassigned and supplementary
            return False
        # if cpt == 0x2029:  # deepseek-llm
        #     return False
        if unicodedata.category(chr(cpt)) in ("Cn", "Cs", "Co"):  # undefined, surrogates, private
            return False
        return True

    characters = [chr(cpt) for cpt in range(0, MAX_CODEPOINTS) if _valid(cpt)]

    yield from characters


def generator_random_unicodes(iterations=100) -> Iterator[str]:
    """Brute force random text with unicode characters"""

    NUM_WORDS = 200
    WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5)

    characters = list(generator_unicodes())

    rand = random.Random()
    for m in range(iterations):
        rand.seed(m)
        text = []
        for _ in range(NUM_WORDS):
            k = rand.randint(1, 7)
            word = rand.choices(characters, k=k)
            word.append(rand.choice(WHITESPACES))
            text.append("".join(word))
        yield "".join(text)


def generator_random_vocab_chars(tokenizer: TokenizerGroundtruth, iterations=100) -> Iterator[str]:
    """Brute force random text with vocab characters"""

    vocab_chars = set()
    for word in tokenizer.vocab:
        vocab_chars.update(word)
    vocab_chars = list(sorted(vocab_chars))

    rand = random.Random()
    for m in range(iterations):
        rand.seed(m)
        text = rand.choices(vocab_chars, k=1024)
        yield "".join(text)


def generator_random_vocab_words(tokenizer: TokenizerGroundtruth, iterations=100) -> Iterator[str]:
    """Brute force random text from vocab words"""

    vocab = [w.strip() for w in tokenizer.vocab]
    yield from vocab

    rand = random.Random()
    for m in range(iterations):
        rand.seed(m)
        text = []
        num_words = rand.randint(300, 400)
        for i in range(num_words):
            k = rand.randint(1, 3)
            words = rand.choices(vocab, k=k)
            sep = rand.choice(" \n\r\t")
            text.append("".join(words) + sep)
        yield "".join(text)


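# Core check: encode each generated text with both tokenizers and compare token
# ids; decode the ground-truth ids with both detokenizers and compare the text
# (accepting a match with the original input modulo BOS/EOS). Logging stops
# after MAX_ERRORS mismatches of each kind.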
def compare_tokenizers(tokenizer1: TokenizerGroundtruth, tokenizer2: TokenizerLlamaCpp, generator: Iterator[str]):

    def find_first_mismatch(ids1: list[int] | str, ids2: list[int] | str):
        for i, (a, b) in enumerate(zip(ids1, ids2)):
            if a != b:
                return i
        if len(ids1) == len(ids2):
            return -1
        return min(len(ids1), len(ids2))

    def check_detokenizer(text: str, text1: str, text2: str) -> bool:
        if text1 == text2:  # equal to TokenizerGroundtruth?
            return True
        # equal to source text?
        if tokenizer1.add_bos_token:  # remove BOS
            if text2.startswith(tokenizer1.bos_token):
                text2 = text2[len(tokenizer1.bos_token):]
        if tokenizer1.add_eos_token:  # remove EOS
            if text2.endswith(tokenizer1.eos_token):
                text2 = text2[:-len(tokenizer1.eos_token)]
        return text == text2

    t_encode1 = 0
    t_encode2 = 0
    t_decode1 = 0
    t_decode2 = 0
    t_start = time.perf_counter()
    encode_errors = 0
    decode_errors = 0
    MAX_ERRORS = 10

    logger.info("%s: %s" % (generator.__qualname__, "ini"))
    for text in generator:
        # print(repr(text), text.encode())
        # print(repr(text), hex(ord(text[0])), text.encode())
        t0 = time.perf_counter()
        ids1 = tokenizer1.encode(text)
        t1 = time.perf_counter()
        ids2 = tokenizer2.encode(text)
        t2 = time.perf_counter()
        text1 = tokenizer1.decode(ids1)
        t3 = time.perf_counter()
        text2 = tokenizer2.decode(ids1)
        t4 = time.perf_counter()
        t_encode1 += t1 - t0
        t_encode2 += t2 - t1
        t_decode1 += t3 - t2
        t_decode2 += t4 - t3
        if encode_errors < MAX_ERRORS and ids1 != ids2:
            i = find_first_mismatch(ids1, ids2)
            ids1 = list(ids1)[max(0, i - 2) : i + 5 + 1]
            ids2 = list(ids2)[max(0, i - 2) : i + 5 + 1]
            logger.error(" Expected: " + str(ids1))
            logger.error("   Result: " + str(ids2))
            encode_errors += 1
            logger.error(f" {encode_errors=}")
        if decode_errors < MAX_ERRORS and not check_detokenizer(text, text1, text2):
            i = find_first_mismatch(text1, text2)
            text1 = list(text1[max(0, i - 2) : i + 5 + 1])
            text2 = list(text2[max(0, i - 2) : i + 5 + 1])
            logger.error(" Expected: " + " ".join(hex(ord(x)) for x in text1))
            logger.error("   Result: " + " ".join(hex(ord(x)) for x in text2))
            decode_errors += 1
            logger.error(f" {decode_errors=}")
        if encode_errors >= MAX_ERRORS and decode_errors >= MAX_ERRORS:
            logger.error(f" EXIT: {encode_errors=} {decode_errors=}")
            # raise Exception()
            break

    t_total = time.perf_counter() - t_start
    logger.info(f"{generator.__qualname__}: end, {t_encode1=:.3f} {t_encode2=:.3f} {t_decode1=:.3f} {t_decode2=:.3f} {t_total=:.3f}")


def main(argv: list[str] | None = None):
    parser = argparse.ArgumentParser()
    parser.add_argument("vocab_file", type=str, help="path to vocab 'gguf' file")
    parser.add_argument("dir_tokenizer", type=str, help="directory containing 'tokenizer.model' file")
    parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
    args = parser.parse_args(argv)

    logging.basicConfig(level = logging.DEBUG if args.verbose else logging.INFO)
    logger.info(f"VOCABFILE: '{args.vocab_file}'")

    tokenizer1 = TokenizerGroundtruth(args.dir_tokenizer)
    tokenizer2 = TokenizerLlamaCpp(args.vocab_file)

    # compare_tokenizers(tokenizer1, tokenizer2, generator_custom_text())
    # compare_tokenizers(tokenizer1, tokenizer2, generator_custom_text_edge_cases())
    compare_tokenizers(tokenizer1, tokenizer2, generator_ascii_lr_strip())
    compare_tokenizers(tokenizer1, tokenizer2, generator_apostrophe())
    compare_tokenizers(tokenizer1, tokenizer2, generator_unicodes())
    compare_tokenizers(tokenizer1, tokenizer2, generator_vocab_words(tokenizer1))
    compare_tokenizers(tokenizer1, tokenizer2, generator_added_lr_strip(tokenizer1))
    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_added_tokens(tokenizer1, 10_000))
    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_chars(10_000))
    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_unicodes(10_000))
    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_vocab_chars(tokenizer1, 10_000))
    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_vocab_words(tokenizer1, 5_000))

    tokenizer2.model.free()


if __name__ == "__main__":
    # main()

    if True:
        logging.basicConfig(
            level    = logging.DEBUG,
            format   = "%(asctime)s.%(msecs)03d %(name)s %(levelname)s %(message)s",
            datefmt  = "%Y-%m-%d %H:%M:%S",
            filename = logger.name + ".log",
            filemode = "a"
        )
    logging.basicConfig(
        level  = logging.DEBUG,
        format = "%(levelname)s %(message)s",
    )

    path_tokenizers   = Path("./models/tokenizers/")
    path_vocab_format = "./models/ggml-vocab-%s.gguf"

    tokenizers = [
        "llama-spm",       # SPM
        "phi-3",           # SPM
        "gemma",           # SPM
        "gemma-2",         # SPM
        "baichuan",        # SPM
        "bert-bge",        # WPM
        "jina-v2-en",      # WPM
        "llama-bpe",       # BPE
        "phi-2",           # BPE
        "deepseek-llm",    # BPE
        "deepseek-coder",  # BPE
        "falcon",          # BPE
        "mpt",             # BPE
        "starcoder",       # BPE
        "gpt-2",           # BPE
        "stablelm2",       # BPE
        "refact",          # BPE
        "qwen2",           # BPE
        "olmo",            # BPE
        "jina-v2-es",      # BPE
        "jina-v2-de",      # BPE
        "smaug-bpe",       # BPE
        "poro-chat",       # BPE
        "jina-v2-code",    # BPE
        "viking",          # BPE
        "jais",            # BPE
    ]

    logger.info("=" * 50)
    for tokenizer in tokenizers:
        logger.info("-" * 50)
        logger.info(f"TOKENIZER: '{tokenizer}'")
        vocab_file = Path(path_vocab_format % tokenizer)
        dir_tokenizer = path_tokenizers / tokenizer
        main([str(vocab_file), str(dir_tokenizer), "--verbose"])