Browse Source

fix(gguf-py): special tokens are no longer skipped when add_<token>_token is set to false (#5487)

* fix(gguf-py): special tokens are no longer skipped when add_<token>_token is set to false

* fix(gguf-py): added missing cls and mask token ids to the gguf metadata
Michaël de Vries 1 year ago
Parent
Commit
73122473ff
3 changed files with 11 additions and 5 deletions
  1. gguf-py/gguf/constants.py (+4 -0)
  2. gguf-py/gguf/gguf_writer.py (+6 -0)
  3. gguf-py/gguf/vocab.py (+1 -5)

+ 4 - 0
gguf-py/gguf/constants.py

@@ -73,6 +73,8 @@ class Keys:
         UNK_ID           = "tokenizer.ggml.unknown_token_id"
         SEP_ID           = "tokenizer.ggml.seperator_token_id"
         PAD_ID           = "tokenizer.ggml.padding_token_id"
+        CLS_ID           = "tokenizer.ggml.cls_token_id"
+        MASK_ID          = "tokenizer.ggml.mask_token_id"
         ADD_BOS          = "tokenizer.ggml.add_bos_token"
         ADD_EOS          = "tokenizer.ggml.add_eos_token"
         ADD_PREFIX       = "tokenizer.ggml.add_space_prefix"
@@ -685,5 +687,7 @@ KEY_TOKENIZER_EOS_ID     = Keys.Tokenizer.EOS_ID
 KEY_TOKENIZER_UNK_ID     = Keys.Tokenizer.UNK_ID
 KEY_TOKENIZER_SEP_ID     = Keys.Tokenizer.SEP_ID
 KEY_TOKENIZER_PAD_ID     = Keys.Tokenizer.PAD_ID
+KEY_TOKENIZER_CLS_ID     = Keys.Tokenizer.CLS_ID
+KEY_TOKENIZER_MASK_ID    = Keys.Tokenizer.MASK_ID
 KEY_TOKENIZER_HF_JSON    = Keys.Tokenizer.HF_JSON
 KEY_TOKENIZER_RWKV       = Keys.Tokenizer.RWKV
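
With these keys defined, a conversion script can record the CLS and MASK token ids and downstream tooling can read them back out of a model's metadata. A minimal read-back sketch, assuming the gguf-py package from this tree is importable; the file name is illustrative:

    from gguf import GGUFReader
    from gguf.constants import KEY_TOKENIZER_CLS_ID, KEY_TOKENIZER_MASK_ID

    reader = GGUFReader("model.gguf")  # hypothetical file path
    for key in (KEY_TOKENIZER_CLS_ID, KEY_TOKENIZER_MASK_ID):
        field = reader.get_field(key)  # None if the model predates these keys
        if field is not None:
            # Scalar fields store their value at parts[data[0]].
            print(key, int(field.parts[field.data[0]][0]))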

+ 6 - 0
gguf-py/gguf/gguf_writer.py

@@ -414,6 +414,12 @@ class GGUFWriter:
     def add_pad_token_id(self, id: int) -> None:
         self.add_uint32(Keys.Tokenizer.PAD_ID, id)
 
+    def add_cls_token_id(self, id: int) -> None:
+        self.add_uint32(Keys.Tokenizer.CLS_ID, id)
+
+    def add_mask_token_id(self, id: int) -> None:
+        self.add_uint32(Keys.Tokenizer.MASK_ID, id)
+
     def add_add_bos_token(self, value: bool) -> None:
         self.add_bool(Keys.Tokenizer.ADD_BOS, value)
 

+ 1 - 5
gguf-py/gguf/vocab.py

@@ -29,7 +29,7 @@ class SpecialVocab:
         if special_token_types is not None:
             self.special_token_types = special_token_types
         else:
-            self.special_token_types = ('bos', 'eos', 'unk', 'sep', 'pad')
+            self.special_token_types = ('bos', 'eos', 'unk', 'sep', 'pad', 'cls', 'mask')
         self._load(Path(path))
 
     def __repr__(self) -> str:
@@ -152,10 +152,6 @@ class SpecialVocab:
             add_entry = tokenizer_config.get(f'add_{typ}_token')
             if isinstance(add_entry, bool):
                 self.add_special_token[typ] = add_entry
-            if not added_tokens:
-                # We will need this to get the content for the token, so if it's empty
-                # may as well just give up.
-                continue
             entry = tokenizer_config.get(f'{typ}_token')
             if isinstance(entry, str):
                 tc_content = entry
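
Taken together: the default special token types now include cls and mask, and the loop no longer gives up before reading the <typ>_token entry from tokenizer_config.json when added_tokens is empty, so per the commit message a special token whose add_<token>_token flag is false is no longer skipped. A minimal end-to-end sketch, with a hypothetical model directory and output path:

    from gguf import GGUFWriter
    from gguf.vocab import SpecialVocab

    writer = GGUFWriter("out.gguf", "bert")  # hypothetical path and arch
    # Resolves bos/eos/unk/sep/pad/cls/mask from the model's tokenizer files
    # and records add_<token>_token flags even when they are False.
    special_vocab = SpecialVocab("path/to/model")  # hypothetical directory
    special_vocab.add_to_gguf(writer)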