@@ -5524,6 +5524,12 @@ static void llm_load_vocab(
             } else if (
                 tokenizer_pre == "jais") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
+            } else if (
+                tokenizer_pre == "tekken") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_TEKKEN;
+                vocab.tokenizer_clean_spaces = false;
+                vocab.tokenizer_ignore_merges = true;
+                vocab.tokenizer_add_bos = true;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
@@ -15585,6 +15591,13 @@ struct llm_tokenizer_bpe {
                     "\\p{N}",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_TEKKEN:
+                // original regex from tokenizer.json
+                // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+                regex_exprs = {
+                    "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 regex_exprs = {