@@ -713,6 +713,9 @@ class ModelBase:
         if "llm_config" in config:
             # rename for InternVL
             config["text_config"] = config["llm_config"]
+        if "lm_config" in config:
+            # rename for GlmASR
+            config["text_config"] = config["lm_config"]
         if "thinker_config" in config:
             # rename for Qwen2.5-Omni
             config["text_config"] = config["thinker_config"]["text_config"]
@@ -1529,6 +1532,24 @@ class TextModel(ModelBase):
                 raise NotImplementedError("Only MEAN, CLS, and LAST pooling types supported")
             self.gguf_writer.add_pooling_type(pooling_type)
 
+    def _set_vocab_glmedge(self):
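+        # GLM tokenizers are GPT-2 style BPE, so reuse the standard BPE export
+        # path and only pin the GLM-specific special tokens below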
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        tokens, toktypes, tokpre = self.get_vocab_base()
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
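+        # <|endoftext|> doubles as BOS/EOS/UNK; <|user|> acts as end-of-turn (EOT)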
+        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
+        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab.add_to_gguf(self.gguf_writer)
+
     def _set_vocab_interns1(self):
         tokens: list[str] = []
         toktypes: list[int] = []
@@ -1658,7 +1679,8 @@ class MmprojModel(ModelBase):
     preprocessor_config: dict[str, Any]
     global_config: dict[str, Any]
 
-    n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"]
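+    # "encoder_layers" is the block-count key used by whisper-style audio configs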
+    n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth", "encoder_layers"]
 
     has_vision_encoder: bool = True  # by default
     has_audio_encoder: bool = False
@@ -1734,7 +1756,9 @@ class MmprojModel(ModelBase):
         return self.global_config.get(config_name)
 
     def get_audio_config(self) -> dict[str, Any] | None:
-        return self.global_config.get("audio_config")
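+        # GlmASR nests its audio encoder config under "whisper_config" rather than "audio_config"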
+        mm_config_key = "whisper_config" if "whisper_config" in self.hparams else "audio_config"
+        return self.global_config.get(mm_config_key)
 
     def set_type(self):
         self.gguf_writer.add_type(gguf.GGUFType.MMPROJ)
@@ -2372,8 +2396,16 @@ class LlamaModel(TextModel):
         # fix for SmolVLM2, missing `num_attention_heads` in config.json
         if self.hf_arch == "VLlama3ForCausalLM":
             self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)
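+        # load_hparams() may have remapped hparams to a nested text_config, so
+        # re-read the original config to recover the top-level architecture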
+        hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
+        self.origin_hf_arch = hparams.get("architectures", [None])[0]
 
     def set_vocab(self):
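+        # GlmASR pairs a Llama-style decoder with a GLM tokenizer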
+        if self.origin_hf_arch == "GlmasrModel":
+            return self._set_vocab_glmedge()
+
         if self.is_mistral_format:
             return self._set_vocab_mistral()
 
@@ -2444,6 +2476,8 @@ class LlamaModel(TextModel):
             "vision_language_adapter.",
             "patch_merger.",
             "pre_mm_projector_norm",
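+            # GlmASR audio tower, exported separately via --mmproj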
+ "audio_encoder.",
|
|
|
]
|
|
|
|
|
|
is_multimodal_tensor = "vision_tower" in name \
|
|
|
@@ -8846,6 +8880,69 @@ class UltravoxModel(TextModel):
         raise NotImplementedError("Ultravox does not have text decoder. Instead, it uses Llama or other models for text. If you want to get the audio encoder, please use --mmproj argument")
 
 
+@ModelBase.register("GlmasrModel")
+class GlmASRWhisperEncoderModel(MmprojModel):
+    has_vision_encoder = False
+    has_audio_encoder = True
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
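+        # whisper-style configs use d_model / encoder_ffn_dim / encoder_attention_heads;
+        # mirror them onto the generic keys the base class expects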
+ if "hidden_size" not in self.hparams and "intermediate_size" not in self.hparams:
|
|
|
+ self.hparams["hidden_size"] = self.hparams["d_model"]
|
|
|
+ self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"]
|
|
|
+ self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"]
|
|
|
+
|
|
|
+ def set_gguf_parameters(self):
|
|
|
+ super().set_gguf_parameters()
|
|
|
+ self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLMA)
|
|
|
+ self.gguf_writer.add_audio_num_mel_bins(self.hparams["num_mel_bins"])
|
|
|
+ self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5))
|
|
|
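+        # merge_factor = number of consecutive encoder frames stacked before projection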
+        self.gguf_writer.add_audio_stack_factor(self.global_config["merge_factor"])
+
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
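+        # conv kernels are tiny; keep them in F16 instead of quantizing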
+ if ".conv" in name and ".weight" in name:
|
|
|
+ return gguf.GGMLQuantizationType.F16
|
|
|
+ return super().tensor_force_quant(name, new_name, bid, n_dims)
|
|
|
+
|
|
|
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
|
|
+ del bid # unused
|
|
|
+
|
|
|
+ if name.startswith("model.") or name.startswith("lm_head."):
|
|
|
+ # skip language model tensors
|
|
|
+ return []
|
|
|
+
|
|
|
+ if name.startswith("audio_encoder.whisper."):
|
|
|
+ name = name.replace("audio_encoder.whisper.","audio_tower.")
|
|
|
+ if "audio_encoder.layer_norm." in name or "audio_encoder.proj." in name:
|
|
|
+ name = name.replace("audio_encoder.", "audio_encoder.adapting.")
|
|
|
+
|
|
|
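+        # two stacked embeddings: row 0 is the audio BOS token, row 1 the audio EOS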
+ if name.startswith("audio_encoder.audio_bos_eos_token."):
|
|
|
+ return [(self.map_tensor_name("model.vision.boi"), data_torch[0]), (self.map_tensor_name("model.vision.eoi"), data_torch[1])]
|
|
|
+
|
|
|
+ if name.startswith("audio_encoder.adapting."):
|
|
|
+ name = name.replace("audio_encoder.adapting.","audio.multi_modal_projector.")
|
|
|
+ if ".layer_norm." in name:
|
|
|
+ name = name.replace(".layer_norm.", ".ln_pre.")
|
|
|
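+            # .0 and .2 come from an nn.Sequential adapter: its two linear projections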
+ if ".0." in name:
|
|
|
+ name = name.replace(".0.", ".linear_1.")
|
|
|
+ if ".2." in name:
|
|
|
+ name = name.replace(".2.", ".linear_2.")
|
|
|
+ if ".proj." in name:
|
|
|
+ return []
|
|
|
+
|
|
|
+ if "conv1.bias" in name or "conv2.bias" in name:
|
|
|
+ # transpose conv1 and conv2 bias
|
|
|
+ data_torch = data_torch.unsqueeze(-1)
|
|
|
+
|
|
|
+ return [(self.map_tensor_name(name), data_torch)]
|
|
|
+
|
|
|
+
|
|
|
@ModelBase.register("Qwen2AudioForConditionalGeneration")
|
|
|
class WhisperEncoderModel(MmprojModel):
|
|
|
has_vision_encoder = False # no vision encoder
|