# tests/test-tokenizer-random.py

# Test libllama tokenizer == AutoTokenizer.
# Brute force random words/text generation.
#
# Sample usage:
#
#   python3 tests/test-tokenizer-random.py ./models/ggml-vocab-llama-bpe.gguf ./models/tokenizers/llama-bpe
#

from __future__ import annotations

import time
import logging
import argparse
import subprocess
import random
import unicodedata

from pathlib import Path
from typing import Any, Iterator, cast
from typing_extensions import Buffer

import cffi
from transformers import AutoTokenizer, PreTrainedTokenizer


logger = logging.getLogger("test-tokenizer-random")
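
# Note: besides the standard library this test needs `cffi`, `typing_extensions`
# and `transformers`, plus a shared libllama build (see LibLlama.DEFAULT_PATH_LIBLLAMA
# below, produced with BUILD_SHARED_LIBS=ON).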


class LibLlama:

    DEFAULT_PATH_LLAMA_H = "./include/llama.h"
    DEFAULT_PATH_INCLUDES = ["./ggml/include/", "./include/"]
    DEFAULT_PATH_LIBLLAMA = "./build/src/libllama.so"  # CMakeLists.txt: BUILD_SHARED_LIBS ON

    def __init__(self, path_llama_h: str | None = None, path_includes: list[str] = [], path_libllama: str | None = None):
        path_llama_h = path_llama_h or self.DEFAULT_PATH_LLAMA_H
        path_includes = path_includes or self.DEFAULT_PATH_INCLUDES
        path_libllama = path_libllama or self.DEFAULT_PATH_LIBLLAMA
        (self.ffi, self.lib) = self._load_libllama_cffi(path_llama_h, path_includes, path_libllama)
        self.lib.llama_backend_init()

    def _load_libllama_cffi(self, path_llama_h: str, path_includes: list[str], path_libllama: str) -> tuple[cffi.FFI, Any]:
        cmd = ["gcc", "-O0", "-E", "-P", "-D__restrict=", "-D__attribute__(x)=", "-D__asm__(x)="]
        cmd += ["-I" + path for path in path_includes] + [path_llama_h]
        res = subprocess.run(cmd, stdout=subprocess.PIPE)
        assert (res.returncode == 0)
        source = res.stdout.decode()
        ffi = cffi.FFI()
        if True:  # workarounds for pycparser
            source = "typedef struct { } __builtin_va_list;" + "\n" + source
            source = source.replace("sizeof (int)",    str(ffi.sizeof("int")))
            source = source.replace("sizeof (void *)", str(ffi.sizeof("void*")))
            source = source.replace("sizeof (size_t)", str(ffi.sizeof("size_t")))
            source = source.replace("sizeof(int32_t)", str(ffi.sizeof("int32_t")))
        ffi.cdef(source, override=True)
        lib = ffi.dlopen(path_libllama)
        return (ffi, lib)

    def model_default_params(self, **kwargs):
        mparams = self.lib.llama_model_default_params()
        for k, v in kwargs.items():
            setattr(mparams, k, v)
        return mparams

    def context_default_params(self, **kwargs):
        cparams = self.lib.llama_context_default_params()
        for k, v in kwargs.items():
            setattr(cparams, k, v)
        return cparams
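
# Illustrative only (not part of the test flow): LibLlama preprocesses llama.h with
# gcc, feeds the result to cffi, and dlopens libllama.so, so the C API is reachable
# from Python, e.g.:
#
#   libllama = LibLlama()
#   mparams = libllama.model_default_params(vocab_only=True)
#   cparams = libllama.context_default_params(n_ctx=4096)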


class LibLlamaModel:

    def __init__(self, libllama: LibLlama, path_model: str, mparams={}, cparams={}):
        self.lib: Any = libllama.lib
        self.ffi = libllama.ffi
        if isinstance(mparams, dict):
            mparams = libllama.model_default_params(**mparams)
        self.model = self.lib.llama_load_model_from_file(path_model.encode(), mparams)
        if not self.model:
            raise RuntimeError("error: failed to load model '%s'" % path_model)
        if isinstance(cparams, dict):
            cparams = libllama.context_default_params(**cparams)
        self.ctx = self.lib.llama_new_context_with_model(self.model, cparams)
        if not self.ctx:
            raise RuntimeError("error: failed to create context for model '%s'" % path_model)
        n_tokens_max = self.lib.llama_n_ctx(self.ctx)
        self.token_ids = self.ffi.new("llama_token[]", n_tokens_max)
        self.text_buff = self.ffi.new("uint8_t[]", 1024)

    def free(self):
        if self.ctx:
            self.lib.llama_free(self.ctx)
        if self.model:
            self.lib.llama_free_model(self.model)
        self.ctx = None
        self.model = None
        self.lib = None

    def tokenize(self, text: str, add_special: bool = False, parse_special: bool = False) -> list[int]:
        encoded_text: bytes = text.encode("utf-8")
        num = self.lib.llama_tokenize(self.model, encoded_text, len(encoded_text), self.token_ids, len(self.token_ids), add_special, parse_special)
        while num < 0 and len(self.token_ids) < (16 << 20):
            # negative result: buffer too small, -num is the required size; grow (x2) and retry
            self.token_ids = self.ffi.new("llama_token[]", -2 * num)
            num = self.lib.llama_tokenize(self.model, encoded_text, len(encoded_text), self.token_ids, len(self.token_ids), add_special, parse_special)
        return list(self.token_ids[0:num])

    def detokenize(self, ids: list[int], remove_special: bool = False, unparse_special: bool = False) -> str:
        if len(self.token_ids) < len(ids):
            self.token_ids = self.ffi.new("llama_token[]", 2 * len(ids))
        for i, id in enumerate(ids):
            self.token_ids[i] = id
        num = self.lib.llama_detokenize(self.model, self.token_ids, len(ids), self.text_buff, len(self.text_buff), remove_special, unparse_special)
        while num < 0 and len(self.text_buff) < (16 << 20):
            # negative result: buffer too small, -num is the required size; grow (x2) and retry
            self.text_buff = self.ffi.new("uint8_t[]", -2 * num)
            num = self.lib.llama_detokenize(self.model, self.token_ids, len(ids), self.text_buff, len(self.text_buff), remove_special, unparse_special)
        return str(cast(Buffer, self.ffi.buffer(self.text_buff, num)), encoding="utf-8", errors="replace")  # replace errors with '\uFFFD'
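
# Illustrative only: a rough round-trip through LibLlamaModel, mirroring how
# TokenizerLlamaCpp below uses it (the GGUF path matches the sample usage above):
#
#   model = LibLlamaModel(LibLlama(), "./models/ggml-vocab-llama-bpe.gguf",
#                         mparams=dict(vocab_only=True), cparams=dict(n_ctx=4096))
#   ids  = model.tokenize("Hello world", add_special=True, parse_special=True)
#   text = model.detokenize(ids, remove_special=False, unparse_special=True)
#   model.free()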


class Tokenizer:

    def encode(self, text: str) -> list[int]:
        raise NotImplementedError

    def decode(self, ids: list[int]) -> str:
        raise NotImplementedError


class TokenizerGroundtruth (Tokenizer):

    def __init__(self, dir_tokenizer: str):
        self.model: PreTrainedTokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)
        # guess BOS and EOS
        ids = self.encode("a")
        assert 1 <= len(ids) <= 3
        add_bos_token = len(ids) > 1 and self.model.bos_token_id == ids[0]
        add_eos_token = len(ids) > 1 and self.model.eos_token_id == ids[-1]
        self.add_bos_token = getattr(self.model, "add_bos_token", add_bos_token)
        self.add_eos_token = getattr(self.model, "add_eos_token", add_eos_token)
        # build vocab
        tokens = list(self.model.get_vocab().values())
        self.vocab = self.model.batch_decode(tokens, skip_special_tokens=True)
        self.vocab = list(sorted(self.vocab))
        # tokens and lists
        self.special_tokens = list(self.model.all_special_tokens)
        self.added_tokens = self.model.batch_decode(self.model.added_tokens_encoder.values(), skip_special_tokens=False)
        self.bos_token = self.model.bos_token
        self.eos_token = self.model.eos_token

    def encode(self, text: str) -> list[int]:
        return self.model.encode(text, add_special_tokens=True)

    def decode(self, ids: list[int]) -> str:
        return self.model.decode(ids, skip_special_tokens=False)


class TokenizerLlamaCpp (Tokenizer):

    libllama: LibLlama | None = None

    def __init__(self, vocab_file: str):
        if not self.libllama:
            self.libllama = LibLlama()
        self.model = LibLlamaModel(self.libllama, vocab_file, mparams=dict(vocab_only=True), cparams=dict(n_ctx=4096))

    def encode(self, text: str) -> list[int]:
        return self.model.tokenize(text, add_special=True, parse_special=True)

    def decode(self, ids: list[int]) -> str:
        return self.model.detokenize(ids, remove_special=False, unparse_special=True)
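
# Illustrative only: the two wrappers expose the same encode()/decode() interface,
# so a single string can be checked by hand (paths mirror the sample usage above):
#
#   tokenizer1 = TokenizerGroundtruth("./models/tokenizers/llama-bpe")
#   tokenizer2 = TokenizerLlamaCpp("./models/ggml-vocab-llama-bpe.gguf")
#   assert tokenizer1.encode("Hello world") == tokenizer2.encode("Hello world")  # expected for a correct conversion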


def generator_custom_text() -> Iterator[str]:
    """General tests"""
    yield from [
        "",
        " ",
        "  ",
        "   ",
        "\t",
        "\n",
        "\n\n",
        "\n\n\n",
        "\t\n",
        "Hello world",
        " Hello world",
        "Hello World",
        " Hello World",
        " Hello World!",
        "Hello, world!",
        " Hello, world!",
        " this is 🦙.cpp",
        "w048 7tuijk dsdfhu",
        "нещо на Български",
        "កាន់តែពិសេសអាចខលចេញ",
        "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
        "Hello",
        " Hello",
        "  Hello",
        "   Hello",
        "    Hello",
        "    Hello\n    Hello",
        " (",
        "\n =",
        "' era",
        "Hello, y'all! How are you 😁 ?我想在apple工作1314151天~",
        "3",
        "33",
        "333",
        "3333",
        "33333",
        "333333",
        "3333333",
        "33333333",
        "333333333",
    ]


def generator_custom_text_edge_cases() -> Iterator[str]:
    """Edge cases found while debugging"""
    yield from [
        '\x1f-a',      # unicode_ranges_control, {0x00001C, 0x00001F}
        '¼-a',         # unicode_ranges_digit, 0x00BC
        '½-a',         # unicode_ranges_digit, 0x00BD
        '¾-a',         # unicode_ranges_digit, 0x00BE
        'a 〇b',       # unicode_ranges_digit, 0x3007
        'Ⅵ-a',        # unicode_ranges_digit, {0x00002150, 0x0000218F} // Number Forms
        '\uFEFF//',    # unicode_ranges_control, 0xFEFF (BOM)
        'Cửa Việt',    # llama-3, ignore_merges = true
        '<s>a',        # Phi-3 fail
        '<unk><|endoftext|><s>',  # Phi-3 fail
        'a\na',        # bert fail
        '"`',          # falcon
        ' \u2e4e',     # falcon
        '\n\x0b ',     # falcon
        'a\xa0\xa0\x00b',         # jina-v2-es
        'one <mask>',  # jina-v2-es <mask> lstrip=true
        'a </s> b',    # rstrip phi-3
        'a <mask> b',  # lstrip jina-v2
        '\xa0aC',      # deepseek
        '\u2029 \uA3E4',          # deepseek-llm
        "a ?",
        'å',           # mpt
        '\U000ac517',  # utf-8 encode error, falcon
        '\U000522f4',  # utf-8 encode error, starcoder
        "<s><s><unk><s>a<s>b<s>c<unk>d<unk></s>",
        "<s> <s> <unk><s>a<s>b<s>c<unk>d<unk></s>",
    ]


def generator_vocab_words(tokenizer: TokenizerGroundtruth) -> Iterator[str]:
    """Brute force check all vocab words"""
    yield from tokenizer.vocab


def generator_ascii_lr_strip() -> Iterator[str]:
    WHITESPACES = ["", " ", "  "]
    CHARACTERS = list(chr(i) for i in range(1, 0x80)) + [""]
    for char1 in CHARACTERS:
        for char2 in CHARACTERS:
            for lstrip in WHITESPACES:
                for rstrip in WHITESPACES:
                    yield lstrip + char1 + char2 + rstrip
                    yield lstrip + char1 + rstrip + char2
                    yield char1 + lstrip + char2 + rstrip


def generator_apostrophe() -> Iterator[str]:
    WHITESPACES = ["", " ", "  "]
    CHARACTERS = list(chr(i) for i in range(1, 0x80)) + [""]
    for char1 in CHARACTERS:
        for char2 in CHARACTERS:
            for lstrip in WHITESPACES:
                for rstrip in WHITESPACES:
                    yield char1 + lstrip + "'" + rstrip + char2
                    yield char1 + char2 + lstrip + "'" + rstrip + "z"
                    yield "a" + lstrip + "'" + rstrip + char1 + char2


def generator_added_lr_strip(tokenizer: TokenizerGroundtruth) -> Iterator[str]:
    WHITESPACES = ["", " ", "  ", "\n", "\r\n", "\n\n", "\t", "\t\t"]
    all_tokens = list(sorted(set(tokenizer.special_tokens + tokenizer.added_tokens)))
    for token in all_tokens:
        for lstrip in WHITESPACES:
            for rstrip in WHITESPACES:
                yield lstrip + token + rstrip
                yield "a" + lstrip + token + rstrip
                yield lstrip + token + rstrip + "z"
                yield "a" + lstrip + token + rstrip + "z"


def generator_random_added_tokens(tokenizer: TokenizerGroundtruth, iterations=100) -> Iterator[str]:
    separations = [" ", "\n", "\t", "-", "!", "one", "1", "<s>", "</s>"]
    all_tokens = list(sorted(set(tokenizer.special_tokens + tokenizer.added_tokens + separations)))
    rand = random.Random()
    for m in range(iterations):
        rand.seed(m)
        words = rand.choices(all_tokens, k=500)
        if words and words[0] == tokenizer.bos_token:  # skip spam warning of double BOS
            while len(words) > 1 and words[1] == tokenizer.bos_token:  # leave one starting BOS
                words.pop(0)
            if tokenizer.add_bos_token:  # drop all starting BOS
                words.pop(0)
        if words and words[-1] == tokenizer.eos_token:  # skip spam warning of double EOS
            while len(words) > 1 and words[-2] == tokenizer.eos_token:  # leave one trailing EOS
                words.pop(-1)
            if tokenizer.add_eos_token:  # drop all trailing EOS
                words.pop(-1)
        yield "".join(words)


def generator_random_chars(iterations=100) -> Iterator[str]:
    """Brute force random text with simple characters"""
    NUM_WORDS = 400
    WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5)
    CHARS = list(sorted(set("""
        ABCDEFGHIJKLMNOPQRSTUVWXYZ
        abcdefghijklmnopqrstuvwxyz
        ÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÄËÏÖÜ
        áéíóúàèìòùâêîôûäëïöü
        .-,*/-+ª!"·$%&/()=?¿[]{}<>\\|@#~½¬~;:_
    """)))
    rand = random.Random()
    for m in range(iterations):
        rand.seed(m)
        text = []
        for _ in range(NUM_WORDS):
            k = rand.randint(1, 7)
            word = rand.choices(CHARS, k=k)
            word.append(rand.choice(WHITESPACES))
            text.append("".join(word))
        yield "".join(text)


def generator_unicodes() -> Iterator[str]:
    """Iterate unicode characters"""

    MAX_CODEPOINTS = 0x30000  # 0x110000

    def _valid(cpt):
        if cpt >= 0x30000:  # unassigned and supplementary
            return False
        # if cpt == 0x2029:  # deepseek-llm
        #     return False
        if unicodedata.category(chr(cpt)) in ("Cn", "Cs", "Co"):  # undefined, surrogates, private
            return False
        return True

    characters = [chr(cpt) for cpt in range(0, MAX_CODEPOINTS) if _valid(cpt)]

    yield from characters


def generator_random_unicodes(iterations=100) -> Iterator[str]:
    """Brute force random text with unicode characters"""
    NUM_WORDS = 200
    WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5)
    characters = list(generator_unicodes())
    rand = random.Random()
    for m in range(iterations):
        rand.seed(m)
        text = []
        for _ in range(NUM_WORDS):
            k = rand.randint(1, 7)
            word = rand.choices(characters, k=k)
            word.append(rand.choice(WHITESPACES))
            text.append("".join(word))
        yield "".join(text)


def generator_random_vocab_chars(tokenizer: TokenizerGroundtruth, iterations=100) -> Iterator[str]:
    """Brute force random text with vocab characters"""
    vocab_chars = set()
    for word in tokenizer.vocab:
        vocab_chars.update(word)
    vocab_chars = list(sorted(vocab_chars))
    rand = random.Random()
    for m in range(iterations):
        rand.seed(m)
        text = rand.choices(vocab_chars, k=1024)
        yield "".join(text)


def generator_random_vocab_words(tokenizer: TokenizerGroundtruth, iterations=100) -> Iterator[str]:
    """Brute force random text from vocab words"""
    vocab = [w.strip() for w in tokenizer.vocab]
    yield from vocab
    rand = random.Random()
    for m in range(iterations):
        rand.seed(m)
        text = []
        num_words = rand.randint(300, 400)
        for i in range(num_words):
            k = rand.randint(1, 3)
            words = rand.choices(vocab, k=k)
            sep = rand.choice(" \n\r\t")
            text.append("".join(words) + sep)
        yield "".join(text)


def compare_tokenizers(tokenizer1: TokenizerGroundtruth, tokenizer2: TokenizerLlamaCpp, generator: Iterator[str]):

    def find_first_mismatch(ids1: list[int] | str, ids2: list[int] | str):
        for i, (a, b) in enumerate(zip(ids1, ids2)):
            if a != b:
                return i
        if len(ids1) == len(ids2):
            return -1
        return min(len(ids1), len(ids2))

    def check_detokenizer(text: str, text1: str, text2: str) -> bool:
        if text1 == text2:  # equal to TokenizerGroundtruth?
            return True
        # equal to source text?
        if tokenizer1.add_bos_token:  # remove BOS
            if text2.startswith(tokenizer1.bos_token):
                text2 = text2[len(tokenizer1.bos_token):]
        if tokenizer1.add_eos_token:  # remove EOS
            if text2.endswith(tokenizer1.eos_token):
                text2 = text2[:-len(tokenizer1.eos_token)]
        return text == text2

    t_encode1 = 0
    t_encode2 = 0
    t_decode1 = 0
    t_decode2 = 0
    t_start = time.perf_counter()
    encode_errors = 0
    decode_errors = 0
    MAX_ERRORS = 10

    logger.info("%s: %s" % (generator.__qualname__, "ini"))
    for text in generator:
        # print(repr(text), text.encode())
        # print(repr(text), hex(ord(text[0])), text.encode())
        t0 = time.perf_counter()
        ids1 = tokenizer1.encode(text)
        t1 = time.perf_counter()
        ids2 = tokenizer2.encode(text)
        t2 = time.perf_counter()
        text1 = tokenizer1.decode(ids1)
        t3 = time.perf_counter()
        text2 = tokenizer2.decode(ids1)  # both detokenizers get the same ground-truth ids
        t4 = time.perf_counter()
        t_encode1 += t1 - t0
        t_encode2 += t2 - t1
        t_decode1 += t3 - t2
        t_decode2 += t4 - t3
        if encode_errors < MAX_ERRORS and ids1 != ids2:
            i = find_first_mismatch(ids1, ids2)
            ids1 = list(ids1)[max(0, i - 2) : i + 5 + 1]
            ids2 = list(ids2)[max(0, i - 2) : i + 5 + 1]
            logger.error(" Expected: " + str(ids1))
            logger.error("   Result: " + str(ids2))
            encode_errors += 1
            logger.error(f" {encode_errors=}")
        if decode_errors < MAX_ERRORS and not check_detokenizer(text, text1, text2):
            i = find_first_mismatch(text1, text2)
            text1 = list(text1[max(0, i - 2) : i + 5 + 1])
            text2 = list(text2[max(0, i - 2) : i + 5 + 1])
            logger.error(" Expected: " + " ".join(hex(ord(x)) for x in text1))
            logger.error("   Result: " + " ".join(hex(ord(x)) for x in text2))
            decode_errors += 1
            logger.error(f" {decode_errors=}")
        if encode_errors >= MAX_ERRORS and decode_errors >= MAX_ERRORS:
            logger.error(f" EXIT: {encode_errors=} {decode_errors=}")
            # raise Exception()
            break

    t_total = time.perf_counter() - t_start
    logger.info(f"{generator.__qualname__}: end, {t_encode1=:.3f} {t_encode2=:.3f} {t_decode1=:.3f} {t_decode2=:.3f} {t_total=:.3f}")
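
# Illustrative only: compare_tokenizers() consumes any of the generators above, e.g.
#
#   compare_tokenizers(tokenizer1, tokenizer2, generator_custom_text())
#   compare_tokenizers(tokenizer1, tokenizer2, generator_random_vocab_words(tokenizer1, 5_000))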


def main(argv: list[str] | None = None):
    parser = argparse.ArgumentParser()
    parser.add_argument("vocab_file", type=str, help="path to vocab 'gguf' file")
    parser.add_argument("dir_tokenizer", type=str, help="directory containing 'tokenizer.model' file")
    parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
    args = parser.parse_args(argv)

    logging.basicConfig(level = logging.DEBUG if args.verbose else logging.INFO)
    logger.info(f"VOCABFILE: '{args.vocab_file}'")

    tokenizer1 = TokenizerGroundtruth(args.dir_tokenizer)
    tokenizer2 = TokenizerLlamaCpp(args.vocab_file)

    # compare_tokenizers(tokenizer1, tokenizer2, generator_custom_text())
    # compare_tokenizers(tokenizer1, tokenizer2, generator_custom_text_edge_cases())
    compare_tokenizers(tokenizer1, tokenizer2, generator_ascii_lr_strip())
    compare_tokenizers(tokenizer1, tokenizer2, generator_apostrophe())
    compare_tokenizers(tokenizer1, tokenizer2, generator_unicodes())
    compare_tokenizers(tokenizer1, tokenizer2, generator_vocab_words(tokenizer1))
    compare_tokenizers(tokenizer1, tokenizer2, generator_added_lr_strip(tokenizer1))
    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_added_tokens(tokenizer1, 10_000))
    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_chars(10_000))
    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_unicodes(10_000))
    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_vocab_chars(tokenizer1, 10_000))
    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_vocab_words(tokenizer1, 5_000))

    tokenizer2.model.free()


if __name__ == "__main__":
    # main()

    if True:
        logging.basicConfig(
            level    = logging.DEBUG,
            format   = "%(asctime)s.%(msecs)03d %(name)s %(levelname)s %(message)s",
            datefmt  = "%Y-%m-%d %H:%M:%S",
            filename = logger.name + ".log",
            filemode = "a"
        )
    # console fallback: basicConfig is a no-op if the file handler above was already installed
    logging.basicConfig(
        level  = logging.DEBUG,
        format = "%(levelname)s %(message)s",
    )

    path_tokenizers   = Path("./models/tokenizers/")
    path_vocab_format = "./models/ggml-vocab-%s.gguf"

    tokenizers = [
        "llama-spm",       # SPM
        "phi-3",           # SPM
        "gemma",           # SPM
        "gemma-2",         # SPM
        "baichuan",        # SPM
        "bert-bge",        # WPM
        "jina-v2-en",      # WPM
        "llama-bpe",       # BPE
        "phi-2",           # BPE
        "deepseek-llm",    # BPE
        "deepseek-coder",  # BPE
        "falcon",          # BPE
        "mpt",             # BPE
        "starcoder",       # BPE
        "gpt-2",           # BPE
        "stablelm2",       # BPE
        "refact",          # BPE
        "qwen2",           # BPE
        "olmo",            # BPE
        "jina-v2-es",      # BPE
        "jina-v2-de",      # BPE
        "smaug-bpe",       # BPE
        "poro-chat",       # BPE
        "jina-v2-code",    # BPE
        "viking",          # BPE
        "jais",            # BPE
    ]

    logger.info("=" * 50)

    for tokenizer in tokenizers:
        logger.info("-" * 50)
        logger.info(f"TOKENIZER: '{tokenizer}'")
        vocab_file = Path(path_vocab_format % tokenizer)
        dir_tokenizer = path_tokenizers / tokenizer
        main([str(vocab_file), str(dir_tokenizer), "--verbose"])