|
|
@@ -5287,13 +5287,14 @@ class BertModel(TextModel):
|
|
|
self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
|
|
|
|
|
|
# convert to phantom space vocab
|
|
|
- def phantom(tok):
|
|
|
- if tok.startswith("[") and tok.endswith("]"):
|
|
|
+ def phantom(tok, toktype):
|
|
|
+ if toktype == gguf.TokenType.CONTROL:
|
|
|
return tok
|
|
|
if tok.startswith("##"):
|
|
|
return tok[2:]
|
|
|
return "\u2581" + tok
|
|
|
- tokens = list(map(phantom, tokens))
|
|
|
+ assert len(tokens) == len(toktypes)
|
|
|
+ tokens = list(map(phantom, tokens, toktypes))
|
|
|
|
|
|
# add vocab to gguf
|
|
|
self.gguf_writer.add_tokenizer_model("bert")
|