@@ -17,6 +17,7 @@ class TensorNameMap:
             "embed_tokens",                              # embeddinggemma
             "tok_embeddings",                            # llama-pth
             "embeddings.word_embeddings",                # bert nomic-bert
+            "embeddings.tok_embeddings",                 # modern-bert
             "language_model.embedding.word_embeddings",  # persimmon
             "wte",                                       # gpt2
             "transformer.embd.wte",                      # phi2
@@ -46,6 +47,7 @@ class TensorNameMap:
         MODEL_TENSOR.TOKEN_EMBD_NORM: (
             "word_embeddings_layernorm",  # bloom
             "embeddings.LayerNorm",       # bert
+            "embeddings.norm",            # modern-bert
             "emb_ln",                     # nomic-bert
             "transformer.norm",           # openelm
             "rwkv.blocks.0.pre_ln",       # rwkv
@@ -75,6 +77,7 @@ class TensorNameMap:
             "head.out",                  # wavtokenizer
             "lm_head",                   # llama4
             "model.transformer.ff_out",  # llada
+            "head.decoder",              # modern-bert
         ),
         MODEL_TENSOR.DENSE_2_OUT: (
             "dense_2_out",  # embeddinggemma
@@ -104,6 +107,7 @@ class TensorNameMap:
             "backbone.final_layer_norm",  # wavtokenizer
             "model.norm",                 # llama4
             "model.transformer.ln_f",     # llada
+            "final_norm",                 # modern-bert
             "model.norm",                 # cogvlm
         ),

@@ -151,6 +155,7 @@ class TensorNameMap:
             "model.layers.{bid}.input_layernorm",        # llama4
             "layers.{bid}.input_layernorm",              # embeddinggemma
             "transformer_encoder.{bid}.attention_norm",  # neobert
+            "layers.{bid}.attn_norm",                    # modern-bert
             "model.layers.{bid}.operator_norm",          # lfm2
             "model.transformer.blocks.{bid}.attn_norm",  # llada
             "layers.{bid}.input_layernorm",              # qwen3-embedding
@@ -187,6 +192,7 @@ class TensorNameMap:
             "encoder.layers.{bid}.self_attention.query_key_value",           # chatglm
             "transformer.layers.{bid}.attn.qkv_proj",                        # openelm
             "transformer_encoder.{bid}.qkv",                                 # neobert
+            "layers.{bid}.attn.Wqkv",                                        # modern-bert
             "model.layers.{bid}.self_attn.language_expert_query_key_value",  # cogvlm
         ),

@@ -261,6 +267,7 @@ class TensorNameMap:
             "model.layers.{bid}.self_attn.linear_attn",                  # deci
             "layers.{bid}.attention.wo",                                 # llama-pth
             "encoder.layer.{bid}.attention.output.dense",                # bert
+            "layers.{bid}.attn.Wo",                                      # modern-bert
             "transformer.layer.{bid}.attention.out_lin",                 # distillbert
             "transformer.h.{bid}.attn.out_proj",                         # gpt-j
             "language_model.encoder.layers.{bid}.self_attention.dense",  # persimmon
@@ -344,6 +351,7 @@ class TensorNameMap:
             "layers.{bid}.post_attention_layernorm",     # qwen3-embedding
             "model.layers.{bid}.feedforward_layernorm",  # apertus
             "model.layers.{bid}.pre_mlp_layernorm",      # kormo
+            "layers.{bid}.mlp_norm",                     # modern-bert
         ),

         # Pre feed-forward norm
@@ -407,6 +415,7 @@ class TensorNameMap:
             "layers.{bid}.mlp.up_proj",                # embeddinggemma
             "layers.{bid}.feed_forward.w3",            # llama-pth
             "encoder.layer.{bid}.intermediate.dense",  # bert
+            "layers.{bid}.mlp.Wi",                     # modern-bert
             "transformer.layer.{bid}.ffn.lin1",        # distillbert
             "transformer.h.{bid}.mlp.fc_in",           # gpt-j
             "transformer.h.{bid}.mlp.linear_3",        # refact
@@ -521,6 +530,7 @@ class TensorNameMap:
             "layers.{bid}.mlp.down_proj",                             # embeddinggemma
             "layers.{bid}.feed_forward.w2",                           # llama-pth
             "encoder.layer.{bid}.output.dense",                       # bert
+            "layers.{bid}.mlp.Wo",                                    # modern-bert
             "transformer.layer.{bid}.ffn.lin2",                       # distillbert
             "transformer.h.{bid}.mlp.fc_out",                         # gpt-j
             "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h",  # persimmon
@@ -1122,6 +1132,7 @@ class TensorNameMap:
             "classifier.dense",  # roberta
             "pre_classifier",    # distillbert
             "dense",             # neobert
+            "head.dense",        # modern-bert
         ),

         MODEL_TENSOR.CLS_OUT: (
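
Note (not part of the diff): these tables are consumed by gguf-py's TensorNameMap during model conversion. Each "{bid}" entry is expanded once per block index when the map is built, so a source tensor name resolves directly to its canonical GGUF name. Below is a minimal sketch of that lookup, assuming an installed gguf-py; MODEL_ARCH.BERT stands in for the modern-bert architecture enum, which is outside this diff.

# Minimal sketch: resolving a checkpoint tensor name through TensorNameMap.
# MODEL_ARCH.BERT is used because the modern-bert arch enum is not shown above.
from gguf.constants import MODEL_ARCH
from gguf.tensor_mapping import get_tensor_name_map

tmap = get_tensor_name_map(MODEL_ARCH.BERT, n_blocks=12)

# "encoder.layer.{bid}.attention.output.dense" (a bert entry in the tables
# above) was expanded for blocks 0..11 at construction, so block 0 resolves
# directly; try_suffixes carries the ".weight"/".bias" suffix to the result.
name = tmap.get_name("encoder.layer.0.attention.output.dense.weight",
                     try_suffixes=(".weight", ".bias"))
print(name)  # -> "blk.0.attn_output.weight"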