@@ -14,6 +14,7 @@ class TensorNameMap:
             "transformer.word_embeddings",               # falcon
             "word_embeddings",                           # bloom
             "model.embed_tokens",                        # llama-hf nemotron olmoe olmo2 rwkv6qwen2 glm4-0414 plamo2 granite-hybrid
+            "embed_tokens",                              # embeddinggemma
             "tok_embeddings",                            # llama-pth
             "embeddings.word_embeddings",                # bert nomic-bert
             "language_model.embedding.word_embeddings",  # persimmon
@@ -141,6 +142,7 @@ class TensorNameMap:
             "rwkv.blocks.{bid}.ln1",                     # rwkv6
             "model.layers.{bid}.ln1",                    # rwkv7
             "model.layers.{bid}.input_layernorm",        # llama4
+            "layers.{bid}.input_layernorm",              # embeddinggemma
             "transformer_encoder.{bid}.attention_norm",  # neobert
             "model.layers.{bid}.operator_norm",          # lfm2
             "model.transformer.blocks.{bid}.attn_norm",  # llada
@@ -179,6 +181,7 @@ class TensorNameMap:
         # Attention query
         MODEL_TENSOR.ATTN_Q: (
             "model.layers.{bid}.self_attn.q_proj",          # llama-hf nemotron olmoe olmo2 phimoe
+            "layers.{bid}.self_attn.q_proj",                # embeddinggemma
             "model.layers.{bid}.self_attn.q_proj_no_perm",  # llama-custom
             "layers.{bid}.attention.wq",                    # llama-pth
             "encoder.layer.{bid}.attention.self.query",     # bert
@@ -197,6 +200,7 @@ class TensorNameMap:
         # Attention key
         MODEL_TENSOR.ATTN_K: (
             "model.layers.{bid}.self_attn.k_proj",          # llama-hf nemotron olmoe olmo2 phimoe
+            "layers.{bid}.self_attn.k_proj",                # embeddinggemma
             "model.layers.{bid}.self_attn.k_proj_no_perm",  # llama-custom
             "layers.{bid}.attention.wk",                    # llama-pth
             "encoder.layer.{bid}.attention.self.key",       # bert
@@ -216,6 +220,7 @@ class TensorNameMap:
         # Attention value
         MODEL_TENSOR.ATTN_V: (
             "model.layers.{bid}.self_attn.v_proj",       # llama-hf nemotron olmoe olmo2 phimoe
+            "layers.{bid}.self_attn.v_proj",             # embeddinggemma
             "layers.{bid}.attention.wv",                 # llama-pth
             "encoder.layer.{bid}.attention.self.value",  # bert
             "transformer.layer.{bid}.attention.v_lin",   # distillbert
@@ -239,6 +244,7 @@ class TensorNameMap:
             "transformer.h.{bid}.self_attention.dense",  # falcon
             "h.{bid}.self_attention.dense",              # bloom
             "model.layers.{bid}.self_attn.o_proj",       # llama-hf nemotron olmoe olmo2 phimoe
+            "layers.{bid}.self_attn.o_proj",             # embeddinggemma
             "model.layers.{bid}.self_attn.out_proj",     # lfm2
             "model.layers.{bid}.self_attn.linear_attn",  # deci
             "layers.{bid}.attention.wo",                 # llama-pth
@@ -277,6 +283,7 @@ class TensorNameMap:

         MODEL_TENSOR.ATTN_POST_NORM: (
             "model.layers.{bid}.post_attention_layernorm",       # gemma2 olmo2 # ge
+            "layers.{bid}.post_attention_layernorm",             # embeddinggemma
             "model.layers.{bid}.post_self_attn_layernorm",       # glm-4-0414
             "model.layers.layers.{bid}.post_mixer_norm.weight",  # plamo2
         ),
@@ -320,12 +327,14 @@ class TensorNameMap:
         # Post feed-forward norm
         MODEL_TENSOR.FFN_PRE_NORM: (
             "model.layers.{bid}.pre_feedforward_layernorm",  # gemma2
+            "layers.{bid}.pre_feedforward_layernorm",        # embeddinggemma
             "model.layers.{bid}.pre_ff_layernorm.weight",
         ),

         # Post feed-forward norm
         MODEL_TENSOR.FFN_POST_NORM: (
             "model.layers.{bid}.post_feedforward_layernorm",    # gemma2 olmo2
+            "layers.{bid}.post_feedforward_layernorm",          # embeddinggemma
             "model.layers.{bid}.post_mlp_layernorm",            # glm-4-0414
             "model.layers.layers.{bid}.post_mlp_norm.weight",   # plamo2
             "model.layers.{bid}.feed_forward.up_proj",
@@ -362,6 +371,7 @@ class TensorNameMap:
             "transformer.h.{bid}.mlp.dense_h_to_4h",   # falcon
             "h.{bid}.mlp.dense_h_to_4h",               # bloom
             "model.layers.{bid}.mlp.up_proj",          # llama-hf refact nemotron olmo2
+            "layers.{bid}.mlp.up_proj",                # embeddinggemma
             "layers.{bid}.feed_forward.w3",            # llama-pth
             "encoder.layer.{bid}.intermediate.dense",  # bert
             "transformer.layer.{bid}.ffn.lin1",        # distillbert
@@ -421,6 +431,7 @@ class TensorNameMap:
         # Feed-forward gate
         MODEL_TENSOR.FFN_GATE: (
             "model.layers.{bid}.mlp.gate_proj",  # llama-hf refact olmo2
+            "layers.{bid}.mlp.gate_proj",        # embeddinggemma
             "layers.{bid}.feed_forward.w1",      # llama-pth
             "transformer.h.{bid}.mlp.w2",        # qwen
             "transformer.h.{bid}.mlp.c_fc2",     # jais
@@ -461,6 +472,7 @@ class TensorNameMap:
             "transformer.h.{bid}.mlp.dense_4h_to_h",  # falcon
             "h.{bid}.mlp.dense_4h_to_h",              # bloom
             "model.layers.{bid}.mlp.down_proj",       # llama-hf nemotron olmo2
+            "layers.{bid}.mlp.down_proj",             # embeddinggemma
             "layers.{bid}.feed_forward.w2",           # llama-pth
             "encoder.layer.{bid}.output.dense",       # bert
             "transformer.layer.{bid}.ffn.lin2",       # distillbert
@@ -513,6 +525,7 @@ class TensorNameMap:
             "model.layers.{bid}.self_attn.q_layernorm",         # persimmon
             "model.layers.{bid}.self_attn.query_layernorm",     # hunyuan
             "model.layers.{bid}.self_attn.q_norm",              # cohere olmoe chameleon olmo2
+            "layers.{bid}.self_attn.q_norm",                    # embeddinggemma
             "transformer.blocks.{bid}.attn.q_ln",               # sea-lion
             "encoder.layer.{bid}.attention.self.layer_norm_q",  # jina-bert-v2
             "transformer.layers.{bid}.attn.q_norm",             # openelm
@@ -525,6 +538,7 @@ class TensorNameMap:
             "model.layers.{bid}.self_attn.k_layernorm",         # persimmon
             "model.layers.{bid}.self_attn.key_layernorm",       # hunyuan
             "model.layers.{bid}.self_attn.k_norm",              # cohere olmoe chameleon olmo2
+            "layers.{bid}.self_attn.k_norm",                    # embeddinggemma
             "transformer.blocks.{bid}.attn.k_ln",               # sea-lion
             "encoder.layer.{bid}.attention.self.layer_norm_k",  # jina-bert-v2
             "transformer.layers.{bid}.attn.k_norm",             # openelm