|
|
@@ -415,6 +415,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
|
|
"'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
|
|
|
};
|
|
|
break;
|
|
|
+ case LLAMA_VOCAB_PRE_TYPE_SEED_CODER:
|
|
|
+ regex_exprs = {
|
|
|
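+ // original regex from tokenizer.json, kept for reference; the active pattern after it expands the case-insensitive (?i:...) contraction group into explicit [sS]-style character classes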
|
|
|
+ // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\r\n]+|\\s*[\r\n]+|\\s+(?!\\S)|\\s+"
|
|
|
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
|
|
+ };
|
|
|
+ break;
|
|
|
default:
|
|
|
// default regex for BPE tokenization pre-processing
|
|
|
regex_exprs = {
|
|
|
@@ -1634,6 +1641,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|
|
tokenizer_pre == "bailingmoe") {
|
|
|
pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
|
|
|
clean_spaces = false;
|
|
|
+ } else if (
|
|
|
+ tokenizer_pre == "seed-coder") {
|
|
|
+ pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER;
|
|
|
+ clean_spaces = false;
|
|
|
} else {
|
|
|
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
|
|
}
|