10 месяцев назад · 108e53c2f1
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -180,7 +180,8 @@ class Model:
 
				             extra = sorted(tensor_names_from_parts.difference(self.tensor_names))
			
 
				             missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map))
			
 
				             if len(extra) == 0 and len(missing_files) > 0:
			
 
				-                raise ValueError(f"Missing or incomplete model files: {missing_files}")
			
 
				+                raise ValueError(f"Missing or incomplete model files: {missing_files}\n"
			
 
				+                                 f"Missing tensors: {missing}")
			
 
				             else:
			
 
				                 raise ValueError("Mismatch between weight map and model parts for tensor names:\n"
			
 
				                                  f"Missing tensors: {missing}\n"
			
@@ -1099,13 +1100,6 @@ class BloomModel(Model):
 
				 
			
 
				         tensors.append((self.map_tensor_name(name), data_torch))
			
 
				 
			
 
				-        if name == "word_embeddings.weight":
			
 
				-            assert self.tensor_names is not None
			
 
				-
			
 
				-            # TODO: tie them at runtime, don't duplicate in the model file
			
 
				-            if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")):
			
 
				-                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
			
 
				-
			
 
				         return tensors
			
 
				 
			
 
				 
			
@@ -2423,10 +2417,6 @@ class GPT2Model(Model):
 
				 
			
 
				         tensors.append((new_name, data_torch))
			
 
				 
			
 
				-        # note: GPT2 output is tied to (same as) wte in original model
			
 
				-        if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
			
 
				-            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
			
 
				-
			
 
				         return tensors
			
 
				 
			
 
				 
			
@@ -2756,21 +2746,26 @@ class CodeShellModel(Model):
 
				         self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
			
 
				         self.gguf_writer.add_rope_scaling_factor(1.0)
			
 
				 
			
 
				+    _has_tok_embd = False
			
 
				+
			
 
				     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
			
 
				         del bid  # unused
			
 
				 
			
 
				-        new_name = self.map_tensor_name(name)
			
 
				-
			
 
				-        tensors: list[tuple[str, Tensor]] = [(new_name, data_torch)]
			
 
				+        output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
			
 
				+        tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD)
			
 
				 
			
 
				-        if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
			
 
				-            assert self.tensor_names is not None
			
 
				+        new_name = self.map_tensor_name(name)
			
 
				 
			
 
				-            if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")):
			
 
				-                # copy tok_embd.weight to output.weight
			
 
				-                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
			
 
				+        # assuming token_embd.weight is seen before output.weight
			
 
				+        if not self._has_tok_embd and new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
			
 
				+            # even though the tensor file(s) does not contain the word embeddings they are still in the weight map
			
 
				+            if self.tensor_names and "transformer.wte.weight" in self.tensor_names:
			
 
				+                logger.debug(f"{tok_embd_name} not found before {output_name}, assuming they are tied")
			
 
				+                self.tensor_names.remove("transformer.wte.weight")
			
 
				+        elif new_name == tok_embd_name:
			
 
				+            self._has_tok_embd = True
			
 
				 
			
 
				-        return tensors
			
 
				+        return [(new_name, data_torch)]
			
 
				 
			
 
				 
			
 
				 @Model.register("InternLM2ForCausalLM")
			
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -2020,7 +2020,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
				                     // output
			
 
				                     output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
			
 
				                     output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
			
 
				-                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
			
 
				+                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
			
 
				+
			
 
				+                    // if output is NULL, init from the input tok embed
			
 
				+                    if (output == NULL) {
			
 
				+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
			
 
				+                    }
			
 
				 
			
 
				                     for (int i = 0; i < n_layer; ++i) {
			
 
				                         auto & layer = layers[i];
			
@@ -2381,7 +2386,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
				                     // output
			
 
				                     output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
			
 
				                     output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
			
 
				-                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
			
 
				+                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
			
 
				+
			
 
				+                    // if output is NULL, init from the input tok embed
			
 
				+                    if (output == NULL) {
			
 
				+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
			
 
				+                    }
			
 
				 
			
 
				                     for (int i = 0; i < n_layer; ++i) {
			
 
				                         auto & layer = layers[i];
			
@@ -2407,7 +2417,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
				                 } break;
			
 
				             case LLM_ARCH_CODESHELL:
			
 
				                 {
			
 
				-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
			
 
				+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
			
 
				+
			
 
				+                    // if tok embd is NULL, init from output
			
 
				+                    if (tok_embd == NULL) {
			
 
				+                        tok_embd = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
			
 
				+                    }
			
 
				 
			
 
				                     // output
			
 
				                     output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);