Browse Source

convert : fix encoding of WPM vocab for BERT models (#18500)

* convert: avoid token collision when stripping ## prefix

* convert: use token types for BERT special tokens check

* Update convert_hf_to_gguf.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
o7si 3 weeks ago
parent
commit
2b2afade9f
1 changed file with 4 additions and 3 deletions
  1. 4 3
      convert_hf_to_gguf.py

+ 4 - 3
convert_hf_to_gguf.py

@@ -5287,13 +5287,14 @@ class BertModel(TextModel):
         self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
 
         # convert to phantom space vocab
-        def phantom(tok):
-            if tok.startswith("[") and tok.endswith("]"):
+        def phantom(tok, toktype):
+            if toktype == gguf.TokenType.CONTROL:
                 return tok
             if tok.startswith("##"):
                 return tok[2:]
             return "\u2581" + tok
-        tokens = list(map(phantom, tokens))
+        assert len(tokens) == len(toktypes)
+        tokens = list(map(phantom, tokens, toktypes))
 
         # add vocab to gguf
         self.gguf_writer.add_tokenizer_model("bert")