|
|
@@ -122,8 +122,30 @@ class SpecialVocab:
|
|
|
tokenizer = json.load(f)
|
|
|
if self.load_merges:
|
|
|
merges = tokenizer.get('model', {}).get('merges')
|
|
|
- if isinstance(merges, list) and merges and isinstance(merges[0], str):
|
|
|
- self.merges = merges
|
|
|
+ if isinstance(merges, list) and merges:
|
|
|
+ if isinstance(merges[0], str):
|
|
|
+ self.merges = merges
|
|
|
+ elif isinstance(merges[0], list) and len(merges[0]) == 2 and isinstance(merges[0][0], str):
|
|
|
+ # New format since transformers 4.45 to support spaces in merges
|
|
|
+ # ref: https://github.com/ggerganov/llama.cpp/issues/9692
|
|
|
+ # TODO: internally store as the new format instead of converting to old
|
|
|
+ if any(' ' in s for pair in merges for s in pair):
|
|
|
+ logger.warning(f'Spaces in merges detected, encoding as {chr(ord(" ") + 256)!r}')
|
|
|
+ self.merges = [
|
|
|
+ ' '.join(
|
|
|
+ [
|
|
|
+ # ensure the spaces are properly encoded
|
|
|
+ ''.join(
|
|
|
+ chr(ord(c) + 256) if c == ' ' else c
|
|
|
+ for c in part
|
|
|
+ )
|
|
|
+ for part in pair
|
|
|
+ ]
|
|
|
+ )
|
|
|
+ for pair in merges
|
|
|
+ ]
|
|
|
+ else:
|
|
|
+ raise ValueError("Unknown tokenizer merges format")
|
|
|
added_tokens = tokenizer.get('added_tokens', {})
|
|
|
else:
|
|
|
added_tokens = {}
|