|
@@ -130,6 +130,7 @@ const char * llm_type_name(llm_type type) {
|
|
|
case LLM_TYPE_230B_A10B: return "230B.A10B";
|
|
case LLM_TYPE_230B_A10B: return "230B.A10B";
|
|
|
case LLM_TYPE_235B_A22B: return "235B.A22B";
|
|
case LLM_TYPE_235B_A22B: return "235B.A22B";
|
|
|
case LLM_TYPE_300B_A47B: return "300B.A47B";
|
|
case LLM_TYPE_300B_A47B: return "300B.A47B";
|
|
|
|
|
+ case LLM_TYPE_310B_A15B: return "310B.A15B";
|
|
|
case LLM_TYPE_355B_A32B: return "355B.A32B";
|
|
case LLM_TYPE_355B_A32B: return "355B.A32B";
|
|
|
case LLM_TYPE_E2B: return "E2B";
|
|
case LLM_TYPE_E2B: return "E2B";
|
|
|
case LLM_TYPE_E4B: return "E4B";
|
|
case LLM_TYPE_E4B: return "E4B";
|
|
@@ -2339,6 +2340,22 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
default: type = LLM_TYPE_UNKNOWN;
|
|
default: type = LLM_TYPE_UNKNOWN;
|
|
|
}
|
|
}
|
|
|
} break;
|
|
} break;
|
|
|
|
|
+ case LLM_ARCH_MIMO2:
|
|
|
|
|
+ {
|
|
|
|
|
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
|
|
|
+
|
|
|
|
|
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
|
|
|
|
+
|
|
|
|
|
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
|
|
|
|
+ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
|
|
|
|
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
|
|
|
|
|
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
|
|
|
|
|
+
|
|
|
|
|
+ switch (hparams.n_layer) {
|
|
|
|
|
+ case 48: type = LLM_TYPE_310B_A15B; break;
|
|
|
|
|
+ default: type = LLM_TYPE_UNKNOWN;
|
|
|
|
|
+ }
|
|
|
|
|
+ } break;
|
|
|
default: throw std::runtime_error("unsupported model architecture");
|
|
default: throw std::runtime_error("unsupported model architecture");
|
|
|
}
|
|
}
|
|
|
|
|
|
|
@@ -6648,6 +6665,44 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { hparams.n_ff_shexp, n_embd }, 0);
|
|
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { hparams.n_ff_shexp, n_embd }, 0);
|
|
|
}
|
|
}
|
|
|
} break;
|
|
} break;
|
|
|
|
|
+ case LLM_ARCH_MIMO2:
|
|
|
|
|
+ {
|
|
|
|
|
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
|
|
|
+
|
|
|
|
|
+ // output
|
|
|
|
|
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
|
|
|
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
|
|
|
|
|
+
|
|
|
|
|
+ for (int i = 0; i < n_layer; ++i) {
|
|
|
|
|
+ auto & layer = layers[i];
|
|
|
|
|
+ uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
|
|
|
|
|
+ uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
|
|
|
|
|
+ uint32_t n_head = hparams.n_head(i);
|
|
|
|
|
+
|
|
|
|
|
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
|
|
|
|
|
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
|
|
|
|
|
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
|
|
|
|
|
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_v * n_head, n_embd }, 0);
|
|
|
|
|
+
|
|
|
|
|
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
|
|
|
+ layer.attn_sinks = create_tensor(tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, TENSOR_NOT_REQUIRED);
|
|
|
|
|
+
|
|
|
|
|
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
|
|
|
+
|
|
|
|
|
+ // non-MoE branch
|
|
|
|
|
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
|
|
|
|
|
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, TENSOR_NOT_REQUIRED);
|
|
|
|
|
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
|
|
|
|
|
+
|
|
|
|
|
+ // MoE branch
|
|
|
|
|
+ int64_t n_ff_exp = hparams.n_ff_exp;
|
|
|
|
|
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
|
|
|
|
|
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
|
|
|
|
|
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, TENSOR_NOT_REQUIRED);
|
|
|
|
|
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
|
|
|
|
|
+ layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
|
|
|
|
|
+ }
|
|
|
|
|
+ } break;
|
|
|
default:
|
|
default:
|
|
|
throw std::runtime_error("unknown architecture");
|
|
throw std::runtime_error("unknown architecture");
|
|
|
}
|
|
}
|
|
@@ -7710,6 +7765,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
|
|
{
|
|
{
|
|
|
llm = std::make_unique<llm_build_mistral3>(*this, params);
|
|
llm = std::make_unique<llm_build_mistral3>(*this, params);
|
|
|
} break;
|
|
} break;
|
|
|
|
|
+ case LLM_ARCH_MIMO2:
|
|
|
|
|
+ {
|
|
|
|
|
+ llm = std::make_unique<llm_build_mimo2_iswa>(*this, params);
|
|
|
|
|
+ } break;
|
|
|
default:
|
|
default:
|
|
|
GGML_ABORT("fatal error");
|
|
GGML_ABORT("fatal error");
|
|
|
}
|
|
}
|
|
@@ -7940,6 +7999,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
|
|
case LLM_ARCH_PANGU_EMBED:
|
|
case LLM_ARCH_PANGU_EMBED:
|
|
|
case LLM_ARCH_AFMOE:
|
|
case LLM_ARCH_AFMOE:
|
|
|
case LLM_ARCH_QWEN3NEXT:
|
|
case LLM_ARCH_QWEN3NEXT:
|
|
|
|
|
+ case LLM_ARCH_MIMO2:
|
|
|
return LLAMA_ROPE_TYPE_NEOX;
|
|
return LLAMA_ROPE_TYPE_NEOX;
|
|
|
|
|
|
|
|
case LLM_ARCH_QWEN2VL:
|
|
case LLM_ARCH_QWEN2VL:
|