8 miesięcy temu · c104023994
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -879,9 +879,15 @@ struct clip_graph {
 
				         // add CLS token
			
 
				         inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
			
 
				 
			
 
				+        // The larger models use a different ViT, which uses RMS norm instead of layer norm
			
 
				+        // ref: https://github.com/ggml-org/llama.cpp/pull/13443#issuecomment-2869786188
			
 
				+        norm_type norm_t = (hparams.n_embd == 3200 && hparams.n_layer == 45)
			
 
				+            ? NORM_TYPE_RMS // 6B ViT (Used by InternVL 2.5/3 - 26B, 38B, 78B)
			
 
				+            : NORM_TYPE_NORMAL; // 300M ViT (Used by all smaller InternVL models)
			
 
				+
			
 
				         ggml_tensor * cur = build_vit(
			
 
				                                 inp, n_pos,
			
 
				-                                NORM_TYPE_NORMAL,
			
 
				+                                norm_t,
			
 
				                                 hparams.ffn_op,
			
 
				                                 model.position_embeddings,
			
 
				                                 nullptr);