@@ -359,6 +359,7 @@ enum llm_kv {
     LLM_KV_TOKENIZER_SUFFIX_ID,
     LLM_KV_TOKENIZER_MIDDLE_ID,
     LLM_KV_TOKENIZER_EOT_ID,
+    LLM_KV_TOKENIZER_EOM_ID,
 
     LLM_KV_ADAPTER_TYPE,
     LLM_KV_ADAPTER_LORA_ALPHA,
@@ -456,6 +457,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
     { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" },
     { LLM_KV_TOKENIZER_EOT_ID,    "tokenizer.ggml.eot_token_id"    },
+    { LLM_KV_TOKENIZER_EOM_ID,    "tokenizer.ggml.eom_token_id"    },
 
     { LLM_KV_ADAPTER_TYPE,       "adapter.type"       },
     { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
@@ -5583,6 +5585,7 @@ static void llm_load_vocab(
             { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_suffix_id },
             { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
             { LLM_KV_TOKENIZER_EOT_ID,    vocab.special_eot_id    },
+            { LLM_KV_TOKENIZER_EOM_ID,    vocab.special_eom_id    },
         };
 
         for (const auto & it : special_token_types) {
@@ -5635,6 +5638,17 @@ static void llm_load_vocab(
                 }
             }
         }
+
+        // find EOM token: "<|eom_id|>"
+        //
+        // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOM_ID
+        //       for now, we apply this workaround to find the EOM token based on its text
+        if (vocab.special_eom_id == -1) {
+            const auto & t = vocab.token_to_id.find("<|eom_id|>");
+            if (t != vocab.token_to_id.end()) {
+                vocab.special_eom_id = t->second;
+            }
+        }
     }
 
     // build special tokens cache
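
For reference, below is a minimal standalone sketch of the text-based fallback added in the last hunk, assuming only the C++ standard library. The names vocab_sketch and resolve_eom_fallback are illustrative stand-ins for llama.cpp's internal vocab state, not part of this diff or the public API; the lookup logic mirrors the added code.

#include <cstdint>
#include <cstdio>
#include <string>
#include <unordered_map>

// Simplified stand-in for the relevant llama.cpp vocab fields.
struct vocab_sketch {
    std::unordered_map<std::string, int32_t> token_to_id;
    int32_t special_eom_id = -1; // -1: tokenizer.ggml.eom_token_id absent from metadata
};

// Mirrors the fallback in the last hunk: if the GGUF metadata did not
// provide the EOM token id, resolve it from the token's text form.
static void resolve_eom_fallback(vocab_sketch & vocab) {
    if (vocab.special_eom_id == -1) {
        const auto t = vocab.token_to_id.find("<|eom_id|>");
        if (t != vocab.token_to_id.end()) {
            vocab.special_eom_id = t->second;
        }
    }
}

int main() {
    vocab_sketch vocab;
    vocab.token_to_id["<|eom_id|>"] = 128008; // Llama 3.1's id for <|eom_id|>
    resolve_eom_fallback(vocab);
    std::printf("special_eom_id = %d\n", vocab.special_eom_id); // prints 128008
    return 0;
}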