
Fix some state regressions... still wip

Piotr Wilkin, 3 months ago
Parent
Commit
8ddaf251ae

+ 1 - 1
examples/eval-callback/eval-callback.cpp

@@ -154,7 +154,7 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
 
     if (!ggml_is_quantized(t->type)) {
         uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
-        ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
+        ggml_print_tensor(data, t->type, t->ne, t->nb, 8);
     }
 
     return true;
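In eval-callback, the last argument of ggml_print_tensor is raised from 3 to 8, which (assuming the helper's usual role in this example) widens how many leading/trailing values per dimension the debug callback prints. A rough Python analogue of that kind of summary print, purely illustrative and not the actual C++ helper:

# Illustrative only: print the first and last `n` values of a row,
# which is roughly what a larger per-dimension count buys in the debug output.
import numpy as np

def print_row_summary(row: np.ndarray, n: int = 8) -> None:
    # Show the first n and last n entries with an ellipsis in between.
    head = ", ".join(f"{v:.4f}" for v in row[:n])
    tail = ", ".join(f"{v:.4f}" for v in row[-n:])
    print(f"[{head}, ..., {tail}]")

print_row_summary(np.linspace(0.0, 1.0, 64), n=8)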

+ 3 - 2
examples/model-conversion/scripts/causal/run-org-model.py

@@ -185,15 +185,16 @@ model_name = os.path.basename(model_path)
 # of using AutoModelForCausalLM.
 print(f"Model class: {model.__class__.__name__}")
 
+device = next(model.parameters()).device
 prompt = "Hello, my name is"
-input_ids = tokenizer(prompt, return_tensors="pt").input_ids
+input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
 
 print(f"Input tokens: {input_ids}")
 print(f"Input text: {repr(prompt)}")
 print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}")
 
 with torch.no_grad():
-    outputs = model(input_ids.to("cuda"))
+    outputs = model(input_ids)
     logits = outputs.logits
 
     # Extract logits for the last token (next token prediction)
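The conversion script no longer hard-codes .to("cuda") at call time; it reads the device from the model's parameters and moves the input ids there once, so the same script also runs on CPU-only or other backends. A minimal sketch of the same device-agnostic pattern (the checkpoint name is a placeholder, not taken from the commit):

# Minimal sketch of the device-agnostic pattern from the diff above.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "gpt2"  # placeholder: any causal LM checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

device = next(model.parameters()).device  # wherever the weights live
input_ids = tokenizer("Hello, my name is", return_tensors="pt").input_ids.to(device)

with torch.no_grad():
    logits = model(input_ids).logits  # no hard-coded "cuda" needed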

+ 3 - 4
src/models/llm_build_qwen3next.cpp

@@ -528,10 +528,9 @@ ggml_tensor * llm_build_qwen3next::build_qwen3next_linear_attn_layer(llm_graph_i
                                                         (conv_kernel_size - 1) * ggml_element_size(conv_output));
     cb(conv_output_no_padding, "conv_output_no_padding", il);
 
-    // Take only the last n_seq_tokens values
-    ggml_tensor * conv_output_proper = ggml_view_4d(ctx0, conv_output_no_padding, n_seq_tokens, conv_output_no_padding->ne[1], 
-        conv_output_no_padding->ne[2], conv_output_no_padding->ne[3], conv_output_no_padding->nb[1], 
-        conv_output_no_padding->nb[2], conv_output_no_padding->nb[3], (conv_output_no_padding->ne[0] - n_seq_tokens) * ggml_element_size(conv_output_no_padding));
+    // Take only the first n_seq_tokens values
+    ggml_tensor * conv_output_proper = ggml_view_4d(ctx0, conv_output_no_padding, n_seq_tokens, conv_output_no_padding->ne[1], conv_output_no_padding->ne[2], conv_output_no_padding->ne[3],
+                                                        conv_output_no_padding->nb[1], conv_output_no_padding->nb[2], conv_output_no_padding->nb[3], 0);
     cb(conv_output_proper, "conv_output_proper", il);
 
     conv_output_proper = ggml_permute(ctx0, conv_output_proper, 0, 1, 3, 2);
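The updated view starts at byte offset 0, so it keeps the first n_seq_tokens positions along dimension 0 of conv_output_no_padding instead of the last ones, and the comment now matches the code. A PyTorch analogue of the old and new slices (an illustration only, not the ggml API; the tensor shape is made up):

# PyTorch analogue: slicing along a "token" dimension of the conv output.
import torch

conv_out = torch.randn(4, 16)  # hypothetical (channels, padded_tokens) layout
n_seq_tokens = 8

# Old view: trailing n_seq_tokens positions (non-zero byte offset in ggml terms).
last_n = conv_out.narrow(1, conv_out.shape[1] - n_seq_tokens, n_seq_tokens)
# New view: leading n_seq_tokens positions (offset 0).
first_n = conv_out.narrow(1, 0, n_seq_tokens)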