@@ -31,12 +31,14 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_17M: return "17M";
         case LLM_TYPE_22M: return "22M";
         case LLM_TYPE_33M: return "33M";
+        case LLM_TYPE_47M: return "47M";
         case LLM_TYPE_60M: return "60M";
         case LLM_TYPE_70M: return "70M";
         case LLM_TYPE_80M: return "80M";
         case LLM_TYPE_109M: return "109M";
         case LLM_TYPE_137M: return "137M";
         case LLM_TYPE_140M: return "140M";
+        case LLM_TYPE_149M: return "149M";
         case LLM_TYPE_160M: return "160M";
         case LLM_TYPE_190M: return "190M";
         case LLM_TYPE_220M: return "220M";
@@ -46,6 +48,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_335M: return "335M";
         case LLM_TYPE_350M: return "350M";
         case LLM_TYPE_360M: return "360M";
+        case LLM_TYPE_395M: return "395M";
         case LLM_TYPE_410M: return "410M";
         case LLM_TYPE_450M: return "450M";
         case LLM_TYPE_475M: return "475M";
@@ -875,6 +878,34 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_MODERN_BERT:
+            {
+                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+                if (found_swa && hparams.n_swa > 0) {
+                    uint32_t swa_period = 3;
+                    hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
+
+                    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
+                    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+                    hparams.set_swa_pattern(swa_period);
+                } else {
+                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+                }
+
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
+
+                switch (hparams.n_layer) {
+                    case 12:
+                        type = LLM_TYPE_47M; break; // granite-embedding-small
+                    case 22:
+                        type = LLM_TYPE_149M; break; // modern-bert-base
+                    case 28:
+                        type = LLM_TYPE_395M; break; // modern-bert-large
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_JINA_BERT_V2:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
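
Note on the sliding-window handling above: hparams.set_swa_pattern(swa_period) encodes ModernBERT's alternating attention, where with the default period of 3 two out of every three layers attend only within the sliding window and the remaining layer attends globally. The following self-contained sketch illustrates that layout; it is not the llama.cpp implementation, and which slot in the period is the global layer depends on set_swa_pattern's convention, which this hunk does not show.

    // Standalone sketch of the every-Nth-layer-global layout implied by swa_period.
    // Assumption: the last layer of each period is the global (full-attention) one.
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    static std::vector<bool> swa_layout(uint32_t n_layer, uint32_t swa_period) {
        std::vector<bool> is_swa(n_layer, false);
        for (uint32_t il = 0; il < n_layer; ++il) {
            is_swa[il] = swa_period > 0 && (il % swa_period) < (swa_period - 1);
        }
        return is_swa;
    }

    int main() {
        // 22 layers as in modern-bert-base (LLM_TYPE_149M above)
        const auto layout = swa_layout(22, 3);
        for (size_t il = 0; il < layout.size(); ++il) {
            std::printf("layer %2zu: %s\n", il, layout[il] ? "sliding-window" : "global");
        }
        return 0;
    }
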
@@ -3155,6 +3186,37 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
                 }
             } break;
+        case LLM_ARCH_MODERN_BERT:
+            {
+                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+                tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
+
+                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+
+                for (int i = 0; i < n_layer; ++i) {
+                    auto & layer = layers[i];
+
+                    if (i != 0) {
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                    } else {
+                        // layer 0 uses an identity attention norm, so this tensor is optional
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+                    }
+
+
+                    layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3 * n_embd}, 0);
+                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+                    layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, 2 * n_ff}, 0);
+                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+                    layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                }
+
+                cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
+                cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+                cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+
+            } break;
         case LLM_ARCH_NEO_BERT:
             {
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
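
The shapes above also explain the size labels picked in load_hparams: the fused QKV weight is {n_embd, 3 * n_embd} and the gated FFN up-projection is {n_embd, 2 * n_ff}. The sketch below plugs ModernBERT-base's published configuration into those shapes (22 layers, n_embd = 768, n_ff = 1152, vocabulary 50368; assumed values, not taken from this diff) and lands close to the 149M label, ignoring biases and norm weights.

    // Rough parameter count from the tensor shapes declared for LLM_ARCH_MODERN_BERT.
    // Config values are ModernBERT-base's published ones (assumed, not from the diff).
    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint64_t n_layer = 22;
        const uint64_t n_embd  = 768;
        const uint64_t n_ff    = 1152;
        const uint64_t n_vocab = 50368;

        const uint64_t tok_embd  = n_embd * n_vocab;       // token_embd.weight
        const uint64_t per_layer =
            n_embd * 3 * n_embd +                          // attn_qkv.weight
            n_embd * n_embd +                              // attn_output.weight
            n_embd * 2 * n_ff +                            // ffn_up.weight (gate + up, fused)
            n_ff * n_embd;                                 // ffn_down.weight

        const uint64_t total = tok_embd + n_layer * per_layer; // norms/biases omitted
        std::printf("approx. params: %.1fM\n", total / 1e6);   // ~149M for these values
        return 0;
    }
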
@@ -7089,6 +7151,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
         case LLM_ARCH_NOMIC_BERT_MOE:
         case LLM_ARCH_NEO_BERT:
         case LLM_ARCH_WAVTOKENIZER_DEC:
+        case LLM_ARCH_MODERN_BERT:
         case LLM_ARCH_GEMMA_EMBEDDING:
         case LLM_ARCH_DREAM:
         case LLM_ARCH_LLADA:
@@ -7248,6 +7311,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             {
                 llm = std::make_unique<llm_build_bert>(*this, params);
             } break;
+        case LLM_ARCH_MODERN_BERT:
+            {
+                llm = std::make_unique<llm_build_modern_bert<true>>(*this, params);
+            } break;
         case LLM_ARCH_NEO_BERT:
             {
                 llm = std::make_unique<llm_build_neo_bert>(*this, params);
@@ -7816,6 +7883,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_DBRX:
         case LLM_ARCH_BERT:
         case LLM_ARCH_JINA_BERT_V3:
+        case LLM_ARCH_MODERN_BERT:
         case LLM_ARCH_NOMIC_BERT:
         case LLM_ARCH_NOMIC_BERT_MOE:
         case LLM_ARCH_STABLELM:
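
With the architecture wired into create_memory, build_graph, and the RoPE table, a converted ModernBERT GGUF goes through the ordinary embedding path. The rough usage sketch below is not part of this patch: the model path is hypothetical and the function names assume a recent llama.h, where older releases spell some of them differently.

    // Minimal embedding round-trip for a converted ModernBERT GGUF (sketch, assumptions
    // noted above). Error handling is omitted for brevity.
    #include "llama.h"
    #include <cstdio>
    #include <cstring>
    #include <vector>

    int main() {
        llama_backend_init();

        llama_model_params mparams = llama_model_default_params();
        llama_model * model = llama_model_load_from_file("modernbert-base.gguf", mparams); // hypothetical path

        llama_context_params cparams = llama_context_default_params();
        cparams.embeddings   = true;                     // request embedding output
        cparams.pooling_type = LLAMA_POOLING_TYPE_MEAN;  // or the pooling stored in the GGUF

        llama_context * ctx = llama_init_from_model(model, cparams);

        // tokenize a short input
        const llama_vocab * vocab = llama_model_get_vocab(model);
        const char * text = "hello world";
        std::vector<llama_token> tokens(64);
        const int n_tok = llama_tokenize(vocab, text, (int32_t) std::strlen(text),
                                         tokens.data(), (int32_t) tokens.size(),
                                         /*add_special*/ true, /*parse_special*/ false);
        tokens.resize(n_tok);

        // encoder-only models are evaluated with llama_encode()
        llama_batch batch = llama_batch_get_one(tokens.data(), (int32_t) tokens.size());
        llama_encode(ctx, batch);

        const float * emb = llama_get_embeddings_seq(ctx, 0);
        std::printf("first embedding value: %f (dim = %d)\n", emb[0], llama_model_n_embd(model));

        llama_free(ctx);
        llama_model_free(model);
        llama_backend_free();
        return 0;
    }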