Piotr Wilkin 3 месяца назад
Родитель
Commit
5417f3294b

+ 8 - 0
comp.sh

@@ -0,0 +1,8 @@
+#!/bin/bash
+echo "Running converted model."  # NOTE(review): no `set -e` here, so a failing step does not stop the later ones
+llama-cli -no-cnv -m reference/qwen3_ntl/qwen3_ntl.gguf -p "Once upon a time" -n 30 --temp 0 &> data/tinylong-30-tok.txt  # GGUF run; `&>` sends stdout and stderr to the same file; data/ is not created by this script
+echo "Running original model."
+python examples/model-conversion/scripts/causal/run-org-model-multi-token.py --model-path reference/qwen3_ntl --num-tokens 30 --prompt "Once upon a time" &> data/tinylong-30-tok-org.txt  # reference run with the same prompt and token count as the GGUF run
+echo "Running tensor comparison."
+python reference/compare_tensors.py 30 16 &> data/tinylong-30-compare.txt  # the two numbers appear to be <tokens> <layers>: the generated report starts with "Tokens: 30, Layers: 16"
+echo "Done."

+ 66 - 77
examples/model-conversion/scripts/causal/run-org-model-multi-token.py

@@ -43,84 +43,19 @@ token_counter = {}
 layer_counter = {}
 num_model_layers = 0
 
-def summarize(tensor: torch.Tensor, name: str, max_seq: int = 3, max_vals: int = 3):
+
+def summarize(tensor: torch.Tensor, name: str, max_seq: int = 4):
+    torch.set_printoptions(precision = 6, edgeitems = max_seq, linewidth = 160, sci_mode = False, threshold = 50)
     global num_model_layers, layer_counter, token_counter
     """
     Print a tensor in llama.cpp debug style.
-
-    Supports:
-    - 2D tensors (seq, hidden)
-    - 3D tensors (batch, seq, hidden)
-    - 4D tensors (batch, seq, heads, dim_per_head) via flattening heads × dim_per_head
-    - 5D tensors
-
     Shows first and last max_vals of each vector per sequence position.
     """
     t = tensor.detach().to(torch.float32).cpu()
-    ten_shape = t.shape
-    while t.ndim > 4:
-        t = t.squeeze(0)
-
-    # Determine dimensions
-    if t.ndim == 3:
-        _, s, _ = t.shape
-    elif t.ndim == 2:
-        _, s = 1, t.shape[0]
-        t = t.unsqueeze(0)
-    elif t.ndim == 4:
-        _, s, _, _ = t.shape
-
-    else:
-        print(f"Skipping tensor due to unsupported dimensions: {t.ndim}")
-        return
-
-    print(f"ggml_debug: {name} = (f32)  ... = {{{ten_shape}}}")
-    print("                                     [")
-    print("                                      [")
-
-    # Determine indices for first and last sequences
-    first_indices = list(range(min(s, max_seq)))
-    last_indices = list(range(max(0, s - max_seq), s))
 
-    # Check if there's an overlap between first and last indices or if we're at the edge case of s = 2 * max_seq
-    has_overlap = bool(set(first_indices) & set(last_indices)) or (max_seq * 2 == s)
-
-    # Combine indices
-    if has_overlap:
-        # If there's overlap, just use the combined unique indices
-        indices = sorted(list(set(first_indices + last_indices)))
-        separator_index = None
-    else:
-        # If no overlap, we'll add a separator between first and last sequences
-        indices = first_indices + last_indices
-        separator_index = len(first_indices)
-
-    for i, si in enumerate(indices):
-        # Add separator if needed
-        if separator_index is not None and i == separator_index:
-            print("                                       ...")
-
-        # Extract appropriate slice
-        vec = t[0, si]
-        if vec.ndim == 2:  # 4D case: flatten heads × dim_per_head
-            flat = vec.flatten().tolist()
-        else:  # 2D or 3D case
-            flat = vec.tolist()
-
-        # First and last slices
-        first = flat[:max_vals]
-        last = flat[-max_vals:] if len(flat) >= 2 * max_vals else flat
-        first_str = ", ".join(f"{v:12.4f}" for v in first)
-        last_str = ", ".join(f"{v:12.4f}" for v in last)
-
-        if len(flat) >= 2 * max_vals:
-            print(f"                                       [{first_str}, ..., {last_str}]")
-        else:
-            print(f"                                       [{last_str}]")
-
-    print("                                      ],")
-    print("                                     ]")
-    print(f"                                     sum = {t.sum().item():.6f}\n")
+    print(f"ggml_debug: {name} = (f32)  ... = {{{t.shape}}}\n")
+    print(t)
+    print(f"\n                                     sum = {t.sum().item():.6f}\n")
 
     indexed_patterns = [ r"model\.layers\.[0-9]+_out", r"recurrent_cache_[0-9]+" ]
     non_indexed_patterns = [ r"k_pad", r"v_pad", r"q_scaled" ]
@@ -146,11 +81,41 @@ def summarize(tensor: torch.Tensor, name: str, max_seq: int = 3, max_vals: int =
                 layer_counter[name] = layer_counter[name] + 1  # skip attention layers
         save_tensor(t, f"reference/tensors/org/{name}_{layer_counter[name]}_{token_counter[name]}.bin")
 
-from transformers.models.qwen3_next.modeling_qwen3_next import torch_causal_conv1d_update, apply_rotary_pos_emb, l2norm  # noqa: E402
+from transformers.models.qwen3_next.modeling_qwen3_next import torch_causal_conv1d_update, apply_rotary_pos_emb, l2norm, repeat_kv  # noqa: E402
+from transformers.processing_utils import Unpack # noqa: E402
+from transformers.utils.generic import TransformersKwargs # noqa: E402
 orig_conv1d_update = torch_causal_conv1d_update
 orig_rope = apply_rotary_pos_emb
 import torch.nn.functional as F  # noqa: E402
 import typing  # noqa: E402
+from torch import nn # noqa: E402
+
+def patched_eager_attention_forward(  # eager attention that additionally logs the scaling and dumps attn_output through summarize(); assigned to qwen_mod.eager_attention_forward further down
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: typing.Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs: Unpack[TransformersKwargs],  # accepted for signature compatibility; not read in this body
+):
+    print(f"\nAttention scaling: {scaling}\n")  # emitted on every call so the scale factor shows up in the debug log
+    key_states = repeat_kv(key, module.num_key_value_groups)  # NOTE(review): presumably expands KV heads to match the query head count — confirm against repeat_kv
+    value_states = repeat_kv(value, module.num_key_value_groups)
+
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling  # scaled query-key scores
+    if attention_mask is not None:
+        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]  # trim the mask's last dim to key_states.shape[-2]
+        attn_weights = attn_weights + causal_mask  # mask is applied additively, before the softmax
+
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)  # softmax in float32, then cast back to the query dtype
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)  # dropout is gated by module.training
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()  # swap dims 1 and 2 and make the result contiguous
+    summarize(attn_output, "attn_output")  # print/record the result under the name "attn_output"
+
+    return attn_output, attn_weights  # both the attention output and the post-softmax weights are returned
 
 def patched_torch_causal_conv1d_update(
     hidden_states,
@@ -343,7 +308,13 @@ def patched_torch_chunk_gated_delta_rule(
         summarize(k_i, f"k_i_chunk_{i}")
         summarize(v_i, f"v_i_chunk_{i}")
 
-        attn = (q_i @ k_i.transpose(-1, -2) * decay_mask[:, :, i]).masked_fill_(mask, 0)
+        q_k_trans = q_i @ k_i.transpose(-1, -2)
+        summarize(q_k_trans, f"q_k_trans_{i}")
+
+        q_k_trans_decay = q_k_trans * decay_mask[:, :, i]
+        summarize(q_k_trans_decay, f"q_k_trans_decay_{i}")
+
+        attn = q_k_trans_decay.masked_fill_(mask, 0)
         summarize(attn, f"attn_chunk_{i}")
 
         v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state
@@ -362,16 +333,31 @@ def patched_torch_chunk_gated_delta_rule(
         summarize(g_last, f"g_last_chunk_{i}")
 
         g_diff_exp = (g[:, :, i, -1, None] - g[:, :, i]).exp()
-        last_recurrent_state = (
-            last_recurrent_state * g_last
-            + (k_i * g_diff_exp[..., None]).transpose(-1, -2) @ v_new
-        )
+        summarize(g_diff_exp, f"g_diff_exp_chunk_{i}")
+
+        state_g_last = last_recurrent_state * g_last
+        summarize(state_g_last, f"state_g_last_{i}")
+
+        k_g_diffexp = (k_i * g_diff_exp[..., None])
+        summarize(k_g_diffexp, f"k_g_diffexp_{i}")
+        
+        k_g_diffexp_T = k_g_diffexp.transpose(-1, -2)
+        summarize(k_g_diffexp_T, f"k_g_diffexp_T_{i}")  # dump the transposed tensor so the recorded data matches its "_T" label (previously re-dumped k_g_diffexp)
+
+        kgd_mul_vnew = k_g_diffexp_T @ v_new
+        summarize(kgd_mul_vnew, f"kgd_mul_vnew_{i}")
+
+        last_recurrent_state = state_g_last + kgd_mul_vnew
         summarize(last_recurrent_state, f"updated_state_chunk_{i}")
 
     if not output_final_state:
         last_recurrent_state = None
     core_attn_out = core_attn_out.reshape(core_attn_out.shape[0], core_attn_out.shape[1], -1, core_attn_out.shape[-1])
+    summarize(core_attn_out, "attn_out_reshaped")
+
     core_attn_out = core_attn_out[:, :, :num_heads]
+    summarize(core_attn_out, "attn_out_truncated")
+
     core_attn_out = core_attn_out.transpose(1, 2).contiguous().to(initial_dtype)
     summarize(core_attn_out, "attn_out")
 
@@ -451,6 +437,7 @@ qwen_mod.torch_chunk_gated_delta_rule = patched_torch_chunk_gated_delta_rule
 qwen_mod.torch_causal_conv1d_update = patched_torch_causal_conv1d_update
 qwen_mod.apply_rotary_pos_emb = patched_apply_rope
 qwen_mod.torch_recurrent_gated_delta_rule = patched_torch_recurrent_gated_delta_rule
+qwen_mod.eager_attention_forward = patched_eager_attention_forward
 
 # Store original functions for patching
 original_functions = {}
@@ -736,6 +723,8 @@ data_dir.mkdir(exist_ok=True)
 all_generated_tokens = []
 all_logits = []
 
+model.config._attn_implementation = "eager"
+
 with torch.no_grad():
     # Initial forward pass
     print(f"\n=== Initial Forward Pass ===")

+ 1 - 1
ggml/src/ggml-cpu/ops.cpp

@@ -11406,7 +11406,7 @@ void ggml_compute_forward_delta_net_recurrent_f32(const ggml_compute_params * pa
                         *(state_ptr(seq, head, i, j)) = temp_state[state_idx];
                         
                         // Store the final state for this head and sequence (for output)
-                        int64_t final_state_idx = i + j * S_v + head * (S_v * S_v) + seq * (S_v * S_v * H_v);
+                        int64_t final_state_idx = j + i * S_v + head * (S_v * S_v) + seq * (S_v * S_v * H_v);
                         final_state[final_state_idx] = temp_state[state_idx];
                     }
                 }

+ 1 - 1
src/llama-graph.cpp

@@ -1532,7 +1532,7 @@ ggml_tensor * llm_graph_context::build_attn(
 
     if (wo) {
         cur = build_lora_mm(wo, cur);
-        if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_QWEN3NEXT) {
+        if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) {
             // GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
             ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
         }

+ 10393 - 0
tinylong-30-compare.txt

@@ -0,0 +1,10393 @@
+Comparing tensors between original and converted GGML models...
+Tokens: 30, Layers: 16
+================================================================================
+
+================================================================================
+Comparing model.layers.out tensors...
+================================================================================
+
+Layer 0, Token 1 (model.layers.out comparison):
+  Original tensor sum: -109.202682
+  Converted tensor sum: -109.202667
+  Original tensor mean: -3.412584
+  Converted tensor mean: -3.412583
+ Mean difference: 0.00000112
+ Maximum pointwise difference: 0.00000358
+ Max difference location: (0, 3, 2)
+  Values at max diff - Original: -3.23131371, Converted: -3.23131013
+ Biggest difference in row (0, 3), sum -70.622650 vs -70.622643
+Original tensor: 
+
+[[[  0.53282046   0.45114386   2.2156353   -0.5117184   -1.6482054
+     4.6376505   -2.9421384   -3.4354253 ]
+  [ -8.487997    -5.323722    -4.790135    -8.482631     4.4259453
+    -0.7649012   -5.2080426   -3.5365663 ]
+  [ -2.8659308   -0.7302124    3.4494972   -0.7121358   -4.4744496
+     1.4391303   -1.05655     -0.76109344]
+  [-10.8983     -11.325392    -3.2313137  -11.594204   -13.007862
+    -6.099822   -13.027901    -1.4378595 ]]]
+
+Converted tensor: 
+
+[[[  0.53281975   0.45114377   2.215636    -0.5117179   -1.6482062
+     4.6376514   -2.942138    -3.4354265 ]
+  [ -8.487997    -5.323724    -4.7901373   -8.48263      4.425948
+    -0.7649009   -5.208041    -3.5365672 ]
+  [ -2.8659306   -0.7302135    3.4494982   -0.7121362   -4.474449
+     1.4391313   -1.0565499   -0.7610918 ]
+  [-10.898299   -11.325391    -3.2313101  -11.594204   -13.00786
+    -6.099819   -13.027899    -1.437861  ]]]
+
+
+
+Layer 1, Token 1 (model.layers.out comparison):
+  Original tensor sum: -132.672058
+  Converted tensor sum: -132.672043
+  Original tensor mean: -4.146002
+  Converted tensor mean: -4.146001
+ Mean difference: 0.00000322
+ Maximum pointwise difference: 0.00000763
+ Max difference location: (0, 1, 0)
+  Values at max diff - Original: -8.45331192, Converted: -8.45330429
+ Biggest difference in row (0, 2), sum 8.045303 vs 8.045274
+
+Layer 2, Token 1 (model.layers.out comparison):
+  Original tensor sum: -123.594589
+  Converted tensor sum: -123.594765
+  Original tensor mean: -3.862331
+  Converted tensor mean: -3.862336
+ Mean difference: 0.00001101
+ Maximum pointwise difference: 0.00005722
+ Max difference location: (0, 3, 0)
+  Values at max diff - Original: -14.73531914, Converted: -14.73537636
+ Biggest difference in row (0, 3), sum -100.578644 vs -100.578781
+
+Layer 3, Token 1 (model.layers.out comparison):
+  Original tensor sum: -1014.197754
+  Converted tensor sum: -1014.208618
+  Original tensor mean: -31.693680
+  Converted tensor mean: -31.694019
+ Mean difference: 0.00261304
+ Maximum pointwise difference: 0.00854874
+ Max difference location: (0, 3, 4)
+  Values at max diff - Original: -47.59802246, Converted: -47.60657120
+ Biggest difference in row (0, 3), sum -413.478455 vs -413.514832
+
+Layer 4, Token 1 (model.layers.out comparison):
+  Original tensor sum: -974.648987
+  Converted tensor sum: -974.659424
+  Original tensor mean: -30.457781
+  Converted tensor mean: -30.458107
+ Mean difference: 0.00296569
+ Maximum pointwise difference: 0.00885773
+ Max difference location: (0, 3, 4)
+  Values at max diff - Original: -45.65669632, Converted: -45.66555405
+ Biggest difference in row (0, 3), sum -380.904694 vs -380.942291
+
+Layer 5, Token 1 (model.layers.out comparison):
+  Original tensor sum: -842.923950
+  Converted tensor sum: -842.923950
+  Original tensor mean: -26.341373
+  Converted tensor mean: -26.341373
+ Mean difference: 0.00327585
+ Maximum pointwise difference: 0.00857162
+ Max difference location: (0, 3, 4)
+  Values at max diff - Original: -47.09656525, Converted: -47.10513687
+ Biggest difference in row (0, 3), sum -366.704346 vs -366.739746
+
+Layer 6, Token 1 (model.layers.out comparison):
+  Original tensor sum: -940.556580
+  Converted tensor sum: -940.507812
+  Original tensor mean: -29.392393
+  Converted tensor mean: -29.390869
+ Mean difference: 0.00368834
+ Maximum pointwise difference: 0.00840378
+ Max difference location: (0, 3, 4)
+  Values at max diff - Original: -49.34116364, Converted: -49.34956741
+ Biggest difference in row (0, 2), sum -130.006729 vs -129.970612
+
+Layer 7, Token 1 (model.layers.out comparison):
+  Original tensor sum: -1838.171143
+  Converted tensor sum: -1838.228271
+  Original tensor mean: -57.442848
+  Converted tensor mean: -57.444633
+ Mean difference: 0.00574541
+ Maximum pointwise difference: 0.01725769
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: -91.24589539, Converted: -91.26315308
+ Biggest difference in row (0, 0), sum -622.551270 vs -622.626587
+
+Layer 8, Token 1 (model.layers.out comparison):
+  Original tensor sum: -1890.751709
+  Converted tensor sum: -1890.670898
+  Original tensor mean: -59.085991
+  Converted tensor mean: -59.083466
+ Mean difference: 0.01148558
+ Maximum pointwise difference: 0.05082703
+ Max difference location: (0, 2, 6)
+  Values at max diff - Original: -49.12084961, Converted: -49.07002258
+ Biggest difference in row (0, 2), sum -356.818451 vs -356.663208
+
+Layer 9, Token 1 (model.layers.out comparison):
+  Original tensor sum: -1949.811523
+  Converted tensor sum: -1949.711426
+  Original tensor mean: -60.931610
+  Converted tensor mean: -60.928482
+ Mean difference: 0.01115143
+ Maximum pointwise difference: 0.04758072
+ Max difference location: (0, 2, 6)
+  Values at max diff - Original: -49.22105789, Converted: -49.17347717
+ Biggest difference in row (0, 2), sum -367.878845 vs -367.720154
+
+Layer 10, Token 1 (model.layers.out comparison):
+  Original tensor sum: -1955.402832
+  Converted tensor sum: -1955.281250
+  Original tensor mean: -61.106339
+  Converted tensor mean: -61.102539
+ Mean difference: 0.01230341
+ Maximum pointwise difference: 0.04833603
+ Max difference location: (0, 2, 6)
+  Values at max diff - Original: -43.91606140, Converted: -43.86772537
+ Biggest difference in row (0, 2), sum -370.409668 vs -370.259583
+
+Layer 11, Token 1 (model.layers.out comparison):
+  Original tensor sum: -3642.472900
+  Converted tensor sum: -3642.428711
+  Original tensor mean: -113.827278
+  Converted tensor mean: -113.825897
+ Mean difference: 0.01628518
+ Maximum pointwise difference: 0.05126190
+ Max difference location: (0, 2, 6)
+  Values at max diff - Original: -94.39852142, Converted: -94.34725952
+ Biggest difference in row (0, 2), sum -786.509460 vs -786.331726
+
+Layer 12, Token 1 (model.layers.out comparison):
+  Original tensor sum: -3739.976807
+  Converted tensor sum: -3739.936035
+  Original tensor mean: -116.874275
+  Converted tensor mean: -116.873001
+ Mean difference: 0.01711488
+ Maximum pointwise difference: 0.05059052
+ Max difference location: (0, 2, 6)
+  Values at max diff - Original: -95.09668732, Converted: -95.04609680
+ Biggest difference in row (0, 2), sum -816.550781 vs -816.352295
+
+Layer 13, Token 1 (model.layers.out comparison):
+  Original tensor sum: -3821.749268
+  Converted tensor sum: -3821.721680
+  Original tensor mean: -119.429665
+  Converted tensor mean: -119.428802
+ Mean difference: 0.01747012
+ Maximum pointwise difference: 0.05052948
+ Max difference location: (0, 2, 7)
+  Values at max diff - Original: -79.35634613, Converted: -79.30581665
+ Biggest difference in row (0, 2), sum -840.805908 vs -840.616699
+
+Layer 14, Token 1 (model.layers.out comparison):
+  Original tensor sum: -4057.451904
+  Converted tensor sum: -4057.284668
+  Original tensor mean: -126.795372
+  Converted tensor mean: -126.790146
+ Mean difference: 0.01935625
+ Maximum pointwise difference: 0.07952881
+ Max difference location: (0, 2, 6)
+  Values at max diff - Original: -97.11465454, Converted: -97.03512573
+ Biggest difference in row (0, 2), sum -917.124573 vs -916.826172
+Error processing model.layers.out layer 15, token 1: cannot reshape array of size 8 into shape (1,4,8)
+
+Layer 0, Token 2 (model.layers.out comparison):
+  Original tensor sum: -7.280505
+  Converted tensor sum: -7.280507
+  Original tensor mean: -0.910063
+  Converted tensor mean: -0.910063
+ Mean difference: 0.00000097
+ Maximum pointwise difference: 0.00000179
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: -1.49786282, Converted: -1.49786103
+ Biggest difference in row (0, 0), sum -7.280505 vs -7.280507
+
+Layer 1, Token 2 (model.layers.out comparison):
+  Original tensor sum: -7.318125
+  Converted tensor sum: -7.318151
+  Original tensor mean: -0.914766
+  Converted tensor mean: -0.914769
+ Mean difference: 0.00000331
+ Maximum pointwise difference: 0.00000930
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: -3.41128922, Converted: -3.41129851
+ Biggest difference in row (0, 0), sum -7.318125 vs -7.318151
+
+Layer 2, Token 2 (model.layers.out comparison):
+  Original tensor sum: 14.344932
+  Converted tensor sum: 14.344961
+  Original tensor mean: 1.793116
+  Converted tensor mean: 1.793120
+ Mean difference: 0.00000746
+ Maximum pointwise difference: 0.00003266
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: 3.23243976, Converted: 3.23247242
+ Biggest difference in row (0, 0), sum 14.344932 vs 14.344961
+
+Layer 3, Token 2 (model.layers.out comparison):
+  Original tensor sum: 46.801067
+  Converted tensor sum: 46.811996
+  Original tensor mean: 5.850133
+  Converted tensor mean: 5.851500
+ Mean difference: 0.00141515
+ Maximum pointwise difference: 0.00275421
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: 6.65637064, Converted: 6.65912485
+ Biggest difference in row (0, 0), sum 46.801067 vs 46.811996
+
+Layer 4, Token 2 (model.layers.out comparison):
+  Original tensor sum: 47.891678
+  Converted tensor sum: 47.901840
+  Original tensor mean: 5.986460
+  Converted tensor mean: 5.987730
+ Mean difference: 0.00131346
+ Maximum pointwise difference: 0.00296640
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: 7.13961887, Converted: 7.14258528
+ Biggest difference in row (0, 0), sum 47.891678 vs 47.901840
+
+Layer 5, Token 2 (model.layers.out comparison):
+  Original tensor sum: 45.815926
+  Converted tensor sum: 45.826260
+  Original tensor mean: 5.726991
+  Converted tensor mean: 5.728282
+ Mean difference: 0.00137006
+ Maximum pointwise difference: 0.00332642
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: 7.94661283, Converted: 7.94993925
+ Biggest difference in row (0, 0), sum 45.815926 vs 45.826260
+
+Layer 6, Token 2 (model.layers.out comparison):
+  Original tensor sum: 40.223167
+  Converted tensor sum: 40.231720
+  Original tensor mean: 5.027896
+  Converted tensor mean: 5.028965
+ Mean difference: 0.00155937
+ Maximum pointwise difference: 0.00270462
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: 7.07846451, Converted: 7.08116913
+ Biggest difference in row (0, 0), sum 40.223167 vs 40.231720
+
+Layer 7, Token 2 (model.layers.out comparison):
+  Original tensor sum: 84.588196
+  Converted tensor sum: 84.602402
+  Original tensor mean: 10.573524
+  Converted tensor mean: 10.575300
+ Mean difference: 0.00185513
+ Maximum pointwise difference: 0.00356102
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: 13.31151009, Converted: 13.31507111
+ Biggest difference in row (0, 0), sum 84.588196 vs 84.602402
+
+Layer 8, Token 2 (model.layers.out comparison):
+  Original tensor sum: 85.737823
+  Converted tensor sum: 85.749390
+  Original tensor mean: 10.717228
+  Converted tensor mean: 10.718674
+ Mean difference: 0.00189817
+ Maximum pointwise difference: 0.00350094
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: 13.90340519, Converted: 13.90690613
+ Biggest difference in row (0, 0), sum 85.737823 vs 85.749390
+
+Layer 9, Token 2 (model.layers.out comparison):
+  Original tensor sum: 83.069107
+  Converted tensor sum: 83.078979
+  Original tensor mean: 10.383638
+  Converted tensor mean: 10.384872
+ Mean difference: 0.00177890
+ Maximum pointwise difference: 0.00335407
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: 13.79222488, Converted: 13.79557896
+ Biggest difference in row (0, 0), sum 83.069107 vs 83.078979
+
+Layer 10, Token 2 (model.layers.out comparison):
+  Original tensor sum: 80.782455
+  Converted tensor sum: 80.791588
+  Original tensor mean: 10.097807
+  Converted tensor mean: 10.098948
+ Mean difference: 0.00190949
+ Maximum pointwise difference: 0.00329256
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 6.64920282, Converted: 6.65249538
+ Biggest difference in row (0, 0), sum 80.782455 vs 80.791588
+
+Layer 11, Token 2 (model.layers.out comparison):
+  Original tensor sum: 124.938332
+  Converted tensor sum: 124.953712
+  Original tensor mean: 15.617291
+  Converted tensor mean: 15.619214
+ Mean difference: 0.00253391
+ Maximum pointwise difference: 0.00420666
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 12.86635590, Converted: 12.87056255
+ Biggest difference in row (0, 0), sum 124.938332 vs 124.953712
+
+Layer 12, Token 2 (model.layers.out comparison):
+  Original tensor sum: 124.466995
+  Converted tensor sum: 124.483871
+  Original tensor mean: 15.558374
+  Converted tensor mean: 15.560484
+ Mean difference: 0.00271881
+ Maximum pointwise difference: 0.00506878
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 12.41438103, Converted: 12.41944981
+ Biggest difference in row (0, 0), sum 124.466995 vs 124.483871
+
+Layer 13, Token 2 (model.layers.out comparison):
+  Original tensor sum: 121.646957
+  Converted tensor sum: 121.660385
+  Original tensor mean: 15.205870
+  Converted tensor mean: 15.207548
+ Mean difference: 0.00218880
+ Maximum pointwise difference: 0.00470448
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 12.02227020, Converted: 12.02697468
+ Biggest difference in row (0, 0), sum 121.646957 vs 121.660385
+
+Layer 14, Token 2 (model.layers.out comparison):
+  Original tensor sum: 116.636169
+  Converted tensor sum: 116.658142
+  Original tensor mean: 14.579521
+  Converted tensor mean: 14.582268
+ Mean difference: 0.00299489
+ Maximum pointwise difference: 0.00521469
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: 18.26870537, Converted: 18.27392006
+ Biggest difference in row (0, 0), sum 116.636169 vs 116.658142
+
+Layer 15, Token 2 (model.layers.out comparison):
+  Original tensor sum: 201.843384
+  Converted tensor sum: 201.865143
+  Original tensor mean: 25.230423
+  Converted tensor mean: 25.233143
+ Mean difference: 0.00346577
+ Maximum pointwise difference: 0.00746727
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: 30.94509888, Converted: 30.95256615
+ Biggest difference in row (0, 0), sum 201.843384 vs 201.865143
+
+Layer 0, Token 3 (model.layers.out comparison):
+  Original tensor sum: 18.698099
+  Converted tensor sum: 18.475292
+  Original tensor mean: 2.337262
+  Converted tensor mean: 2.309412
+ Mean difference: 2.67848086
+ Maximum pointwise difference: 4.89963531
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: 2.51813841, Converted: 7.41777372
+ Biggest difference in row (0, 0), sum 18.698099 vs 18.475292
+
+Layer 1, Token 3 (model.layers.out comparison):
+  Original tensor sum: 13.937105
+  Converted tensor sum: 11.538675
+  Original tensor mean: 1.742138
+  Converted tensor mean: 1.442334
+ Mean difference: 2.56903791
+ Maximum pointwise difference: 5.56039190
+ Max difference location: (0, 0, 7)
+  Values at max diff - Original: 5.86116695, Converted: 0.30077514
+ Biggest difference in row (0, 0), sum 13.937105 vs 11.538675
+
+Layer 2, Token 3 (model.layers.out comparison):
+  Original tensor sum: 17.835873
+  Converted tensor sum: 9.065081
+  Original tensor mean: 2.229484
+  Converted tensor mean: 1.133135
+ Mean difference: 2.48439741
+ Maximum pointwise difference: 7.80053854
+ Max difference location: (0, 0, 7)
+  Values at max diff - Original: 7.08156919, Converted: -0.71896935
+ Biggest difference in row (0, 0), sum 17.835873 vs 9.065081
+
+Layer 3, Token 3 (model.layers.out comparison):
+  Original tensor sum: 19.733971
+  Converted tensor sum: 0.388454
+  Original tensor mean: 2.466746
+  Converted tensor mean: 0.048557
+ Mean difference: 2.74538827
+ Maximum pointwise difference: 8.14173889
+ Max difference location: (0, 0, 7)
+  Values at max diff - Original: 7.32600927, Converted: -0.81572962
+ Biggest difference in row (0, 0), sum 19.733971 vs 0.388454
+
+Layer 4, Token 3 (model.layers.out comparison):
+  Original tensor sum: 17.522738
+  Converted tensor sum: 7.885162
+  Original tensor mean: 2.190342
+  Converted tensor mean: 0.985645
+ Mean difference: 4.25575876
+ Maximum pointwise difference: 7.97597837
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 3.98348713, Converted: -3.99249125
+ Biggest difference in row (0, 0), sum 17.522738 vs 7.885162
+
+Layer 5, Token 3 (model.layers.out comparison):
+  Original tensor sum: 21.460897
+  Converted tensor sum: 15.969997
+  Original tensor mean: 2.682612
+  Converted tensor mean: 1.996250
+ Mean difference: 4.34595299
+ Maximum pointwise difference: 8.46822739
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 6.38704681, Converted: -2.08118057
+ Biggest difference in row (0, 0), sum 21.460897 vs 15.969997
+
+Layer 6, Token 3 (model.layers.out comparison):
+  Original tensor sum: 18.336536
+  Converted tensor sum: 9.128950
+  Original tensor mean: 2.292067
+  Converted tensor mean: 1.141119
+ Mean difference: 3.42625880
+ Maximum pointwise difference: 9.18005276
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 6.15963268, Converted: -3.02042007
+ Biggest difference in row (0, 0), sum 18.336536 vs 9.128950
+
+Layer 7, Token 3 (model.layers.out comparison):
+  Original tensor sum: 27.127436
+  Converted tensor sum: -91.853516
+  Original tensor mean: 3.390929
+  Converted tensor mean: -11.481689
+ Mean difference: 14.87261772
+ Maximum pointwise difference: 25.04354668
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 6.62252426, Converted: -18.42102242
+ Biggest difference in row (0, 0), sum 27.127436 vs -91.853516
+
+Layer 8, Token 3 (model.layers.out comparison):
+  Original tensor sum: 22.795490
+  Converted tensor sum: -94.016220
+  Original tensor mean: 2.849436
+  Converted tensor mean: -11.752028
+ Mean difference: 14.60146332
+ Maximum pointwise difference: 26.14372826
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 0.59730154, Converted: -25.54642677
+ Biggest difference in row (0, 0), sum 22.795490 vs -94.016220
+
+Layer 9, Token 3 (model.layers.out comparison):
+  Original tensor sum: 16.367466
+  Converted tensor sum: -159.747223
+  Original tensor mean: 2.045933
+  Converted tensor mean: -19.968403
+ Mean difference: 22.01433563
+ Maximum pointwise difference: 34.04418182
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: -0.55563742, Converted: -34.59981918
+ Biggest difference in row (0, 0), sum 16.367466 vs -159.747223
+
+Layer 10, Token 3 (model.layers.out comparison):
+  Original tensor sum: 22.587862
+  Converted tensor sum: -171.457092
+  Original tensor mean: 2.823483
+  Converted tensor mean: -21.432137
+ Mean difference: 24.25561905
+ Maximum pointwise difference: 40.39982224
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: -0.52963394, Converted: -40.92945480
+ Biggest difference in row (0, 0), sum 22.587862 vs -171.457092
+
+Layer 11, Token 3 (model.layers.out comparison):
+  Original tensor sum: 98.501198
+  Converted tensor sum: -580.205811
+  Original tensor mean: 12.312650
+  Converted tensor mean: -72.525726
+ Mean difference: 84.83837128
+ Maximum pointwise difference: 107.93860626
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 9.91925716, Converted: -98.01934814
+ Biggest difference in row (0, 0), sum 98.501198 vs -580.205811
+
+Layer 12, Token 3 (model.layers.out comparison):
+  Original tensor sum: 96.017456
+  Converted tensor sum: -599.130005
+  Original tensor mean: 12.002182
+  Converted tensor mean: -74.891251
+ Mean difference: 86.89343262
+ Maximum pointwise difference: 107.37790680
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 10.14877987, Converted: -97.22912598
+ Biggest difference in row (0, 0), sum 96.017456 vs -599.130005
+
+Layer 13, Token 3 (model.layers.out comparison):
+  Original tensor sum: 93.969711
+  Converted tensor sum: -604.221680
+  Original tensor mean: 11.746214
+  Converted tensor mean: -75.527710
+ Mean difference: 87.27392578
+ Maximum pointwise difference: 107.42771149
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 9.00540733, Converted: -98.42230225
+ Biggest difference in row (0, 0), sum 93.969711 vs -604.221680
+
+Layer 14, Token 3 (model.layers.out comparison):
+  Original tensor sum: 96.193565
+  Converted tensor sum: -675.267456
+  Original tensor mean: 12.024196
+  Converted tensor mean: -84.408432
+ Mean difference: 96.43263245
+ Maximum pointwise difference: 115.43507385
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 9.17813015, Converted: -106.25694275
+ Biggest difference in row (0, 0), sum 96.193565 vs -675.267456
+
+Layer 15, Token 3 (model.layers.out comparison):
+  Original tensor sum: 203.967834
+  Converted tensor sum: -1113.465820
+  Original tensor mean: 25.495979
+  Converted tensor mean: -139.183228
+ Mean difference: 164.67919922
+ Maximum pointwise difference: 181.33709717
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: 23.16110420, Converted: -158.17599487
+ Biggest difference in row (0, 0), sum 203.967834 vs -1113.465820
+
+Layer 0, Token 4 (model.layers.out comparison):
+  Original tensor sum: 1.060196
+  Converted tensor sum: -6.035928
+  Original tensor mean: 0.132525
+  Converted tensor mean: -0.754491
+ Mean difference: 1.11038423
+ Maximum pointwise difference: 2.90589857
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 1.75988472, Converted: -1.14601374
+ Biggest difference in row (0, 0), sum 1.060196 vs -6.035928
+
+Layer 1, Token 4 (model.layers.out comparison):
+  Original tensor sum: -45.922947
+  Converted tensor sum: -53.028908
+  Original tensor mean: -5.740368
+  Converted tensor mean: -6.628613
+ Mean difference: 1.58238363
+ Maximum pointwise difference: 3.98315811
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: -7.45665455, Converted: -11.43981266
+ Biggest difference in row (0, 0), sum -45.922947 vs -53.028908
+
+Layer 2, Token 4 (model.layers.out comparison):
+  Original tensor sum: -47.965603
+  Converted tensor sum: -68.008888
+  Original tensor mean: -5.995700
+  Converted tensor mean: -8.501111
+ Mean difference: 4.45314884
+ Maximum pointwise difference: 12.72673607
+ Max difference location: (0, 0, 7)
+  Values at max diff - Original: 10.93319416, Converted: -1.79354143
+ Biggest difference in row (0, 0), sum -47.965603 vs -68.008888
+
+Layer 3, Token 4 (model.layers.out comparison):
+  Original tensor sum: -224.689087
+  Converted tensor sum: -313.872162
+  Original tensor mean: -28.086136
+  Converted tensor mean: -39.234020
+ Mean difference: 11.14788437
+ Maximum pointwise difference: 20.76882172
+ Max difference location: (0, 0, 7)
+  Values at max diff - Original: -9.44140339, Converted: -30.21022415
+ Biggest difference in row (0, 0), sum -224.689087 vs -313.872162
+
+Layer 4, Token 4 (model.layers.out comparison):
+  Original tensor sum: -207.206879
+  Converted tensor sum: -293.960205
+  Original tensor mean: -25.900860
+  Converted tensor mean: -36.745026
+ Mean difference: 10.84416676
+ Maximum pointwise difference: 23.99023056
+ Max difference location: (0, 0, 7)
+  Values at max diff - Original: -13.00386524, Converted: -36.99409485
+ Biggest difference in row (0, 0), sum -207.206879 vs -293.960205
+
+Layer 5, Token 4 (model.layers.out comparison):
+  Original tensor sum: -185.594986
+  Converted tensor sum: -298.454895
+  Original tensor mean: -23.199373
+  Converted tensor mean: -37.306862
+ Mean difference: 14.10748863
+ Maximum pointwise difference: 27.16260529
+ Max difference location: (0, 0, 7)
+  Values at max diff - Original: -13.13538361, Converted: -40.29798889
+ Biggest difference in row (0, 0), sum -185.594986 vs -298.454895
+
+Layer 6, Token 4 (model.layers.out comparison):
+  Original tensor sum: -226.913589
+  Converted tensor sum: -341.315369
+  Original tensor mean: -28.364199
+  Converted tensor mean: -42.664421
+ Mean difference: 14.30021858
+ Maximum pointwise difference: 27.83255386
+ Max difference location: (0, 0, 7)
+  Values at max diff - Original: -20.03028870, Converted: -47.86284256
+ Biggest difference in row (0, 0), sum -226.913589 vs -341.315369
+
+Layer 7, Token 4 (model.layers.out comparison):
+  Original tensor sum: -431.530212
+  Converted tensor sum: -553.909912
+  Original tensor mean: -53.941277
+  Converted tensor mean: -69.238739
+ Mean difference: 15.29746723
+ Maximum pointwise difference: 28.98126602
+ Max difference location: (0, 0, 7)
+  Values at max diff - Original: -44.07294083, Converted: -73.05420685
+ Biggest difference in row (0, 0), sum -431.530212 vs -553.909912
+
+Layer 8, Token 4 (model.layers.out comparison):
+  Original tensor sum: -442.906403
+  Converted tensor sum: -577.351807
+  Original tensor mean: -55.363300
+  Converted tensor mean: -72.168976
+ Mean difference: 16.80567932
+ Maximum pointwise difference: 24.00010681
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: -44.65782166, Converted: -68.65792847
+ Biggest difference in row (0, 0), sum -442.906403 vs -577.351807
+
+Layer 9, Token 4 (model.layers.out comparison):
+  Original tensor sum: -457.224976
+  Converted tensor sum: -606.660400
+  Original tensor mean: -57.153122
+  Converted tensor mean: -75.832550
+ Mean difference: 18.67943192
+ Maximum pointwise difference: 31.74385834
+ Max difference location: (0, 0, 7)
+  Values at max diff - Original: -37.91560364, Converted: -69.65946198
+ Biggest difference in row (0, 0), sum -457.224976 vs -606.660400
+
+Layer 10, Token 4 (model.layers.out comparison):
+  Original tensor sum: -464.368622
+  Converted tensor sum: -617.020081
+  Original tensor mean: -58.046078
+  Converted tensor mean: -77.127510
+ Mean difference: 19.08143044
+ Maximum pointwise difference: 31.15077591
+ Max difference location: (0, 0, 7)
+  Values at max diff - Original: -40.76456833, Converted: -71.91534424
+ Biggest difference in row (0, 0), sum -464.368622 vs -617.020081
+
+Layer 11, Token 4 (model.layers.out comparison):
+  Original tensor sum: -848.365112
+  Converted tensor sum: -1029.810791
+  Original tensor mean: -106.045639
+  Converted tensor mean: -128.726349
+ Mean difference: 22.68070793
+ Maximum pointwise difference: 33.58893585
+ Max difference location: (0, 0, 7)
+  Values at max diff - Original: -79.47626495, Converted: -113.06520081
+ Biggest difference in row (0, 0), sum -848.365112 vs -1029.810791
+
+Layer 12, Token 4 (model.layers.out comparison):
+  Original tensor sum: -856.364807
+  Converted tensor sum: -1034.875244
+  Original tensor mean: -107.045601
+  Converted tensor mean: -129.359406
+ Mean difference: 22.31380081
+ Maximum pointwise difference: 34.47047424
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: -94.66131592, Converted: -129.13179016
+ Biggest difference in row (0, 0), sum -856.364807 vs -1034.875244
+
+Layer 13, Token 4 (model.layers.out comparison):
+  Original tensor sum: -876.941895
+  Converted tensor sum: -1070.547119
+  Original tensor mean: -109.617737
+  Converted tensor mean: -133.818390
+ Mean difference: 24.20065689
+ Maximum pointwise difference: 35.39904022
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: -93.12728119, Converted: -128.52632141
+ Biggest difference in row (0, 0), sum -876.941895 vs -1070.547119
+
+Layer 14, Token 4 (model.layers.out comparison):
+  Original tensor sum: -914.061707
+  Converted tensor sum: -1087.587036
+  Original tensor mean: -114.257713
+  Converted tensor mean: -135.948380
+ Mean difference: 21.69067001
+ Maximum pointwise difference: 38.16375732
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: -116.85905457, Converted: -155.02281189
+ Biggest difference in row (0, 0), sum -914.061707 vs -1087.587036
+
+Layer 15, Token 4 (model.layers.out comparison):
+  Original tensor sum: -1341.588623
+  Converted tensor sum: -1530.308838
+  Original tensor mean: -167.698578
+  Converted tensor mean: -191.288605
+ Mean difference: 23.59002495
+ Maximum pointwise difference: 40.53677368
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: -178.01094055, Converted: -218.54771423
+ Biggest difference in row (0, 0), sum -1341.588623 vs -1530.308838
+
+Layer 0, Token 5 (model.layers.out comparison):
+  Original tensor sum: 12.113814
+  Converted tensor sum: 1.907211
+  Original tensor mean: 1.514227
+  Converted tensor mean: 0.238401
+ Mean difference: 1.41127276
+ Maximum pointwise difference: 3.03878593
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: 1.65080941, Converted: -1.38797641
+ Biggest difference in row (0, 0), sum 12.113814 vs 1.907211
+
+Layer 1, Token 5 (model.layers.out comparison):
+  Original tensor sum: 3.328269
+  Converted tensor sum: 7.141708
+  Original tensor mean: 0.416034
+  Converted tensor mean: 0.892714
+ Mean difference: 1.73651075
+ Maximum pointwise difference: 4.59446096
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: -0.09795946, Converted: 4.49650145
+ Biggest difference in row (0, 0), sum 3.328269 vs 7.141708
+
+Layer 2, Token 5 (model.layers.out comparison):
+  Original tensor sum: -7.901872
+  Converted tensor sum: 12.052417
+  Original tensor mean: -0.987734
+  Converted tensor mean: 1.506552
+ Mean difference: 2.91872406
+ Maximum pointwise difference: 6.22109556
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: -1.61789608, Converted: 4.60319948
+ Biggest difference in row (0, 0), sum -7.901872 vs 12.052417
+
+Layer 3, Token 5 (model.layers.out comparison):
+  Original tensor sum: -206.706451
+  Converted tensor sum: 38.517872
+  Original tensor mean: -25.838306
+  Converted tensor mean: 4.814734
+ Mean difference: 30.65304184
+ Maximum pointwise difference: 36.99858475
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: -30.05084610, Converted: 6.94773912
+ Biggest difference in row (0, 0), sum -206.706451 vs 38.517872
+
+Layer 4, Token 5 (model.layers.out comparison):
+  Original tensor sum: -190.520950
+  Converted tensor sum: 37.683086
+  Original tensor mean: -23.815119
+  Converted tensor mean: 4.710386
+ Mean difference: 28.52550507
+ Maximum pointwise difference: 36.21773911
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: -29.42410278, Converted: 6.79363585
+ Biggest difference in row (0, 0), sum -190.520950 vs 37.683086
+
+Layer 5, Token 5 (model.layers.out comparison):
+  Original tensor sum: -129.615097
+  Converted tensor sum: 37.492149
+  Original tensor mean: -16.201887
+  Converted tensor mean: 4.686519
+ Mean difference: 20.88840675
+ Maximum pointwise difference: 30.11524200
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: -22.47561646, Converted: 7.63962507
+ Biggest difference in row (0, 0), sum -129.615097 vs 37.492149
+
+Layer 6, Token 5 (model.layers.out comparison):
+  Original tensor sum: -168.733810
+  Converted tensor sum: 40.467735
+  Original tensor mean: -21.091726
+  Converted tensor mean: 5.058467
+ Mean difference: 26.15019226
+ Maximum pointwise difference: 35.40680313
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: -27.34041214, Converted: 8.06639194
+ Biggest difference in row (0, 0), sum -168.733810 vs 40.467735
+
+Layer 7, Token 5 (model.layers.out comparison):
+  Original tensor sum: -375.952911
+  Converted tensor sum: 84.494781
+  Original tensor mean: -46.994114
+  Converted tensor mean: 10.561848
+ Mean difference: 57.55596161
+ Maximum pointwise difference: 65.51675415
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: -54.48764038, Converted: 11.02911663
+ Biggest difference in row (0, 0), sum -375.952911 vs 84.494781
+
+Layer 8, Token 5 (model.layers.out comparison):
+  Original tensor sum: -386.335632
+  Converted tensor sum: 90.464653
+  Original tensor mean: -48.291954
+  Converted tensor mean: 11.308082
+ Mean difference: 59.60003662
+ Maximum pointwise difference: 70.12364197
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: -54.78602219, Converted: 15.33761883
+ Biggest difference in row (0, 0), sum -386.335632 vs 90.464653
+
+Layer 9, Token 5 (model.layers.out comparison):
+  Original tensor sum: -407.643036
+  Converted tensor sum: 83.872604
+  Original tensor mean: -50.955379
+  Converted tensor mean: 10.484076
+ Mean difference: 61.43945694
+ Maximum pointwise difference: 73.87419128
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: -62.38755035, Converted: 11.48663712
+ Biggest difference in row (0, 0), sum -407.643036 vs 83.872604
+
+Layer 10, Token 5 (model.layers.out comparison):
+  Original tensor sum: -398.133545
+  Converted tensor sum: 83.310257
+  Original tensor mean: -49.766693
+  Converted tensor mean: 10.413782
+ Mean difference: 60.18047714
+ Maximum pointwise difference: 71.93079376
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: -61.05200958, Converted: 10.87878418
+ Biggest difference in row (0, 0), sum -398.133545 vs 83.310257
+
+Layer 11, Token 5 (model.layers.out comparison):
+  Original tensor sum: -795.896240
+  Converted tensor sum: 161.559113
+  Original tensor mean: -99.487030
+  Converted tensor mean: 20.194889
+ Mean difference: 119.68191528
+ Maximum pointwise difference: 136.52630615
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: -112.33381653, Converted: 24.19249153
+ Biggest difference in row (0, 0), sum -795.896240 vs 161.559113
+
+Layer 12, Token 5 (model.layers.out comparison):
+  Original tensor sum: -795.492065
+  Converted tensor sum: 157.049652
+  Original tensor mean: -99.436508
+  Converted tensor mean: 19.631207
+ Mean difference: 119.06771088
+ Maximum pointwise difference: 138.69142151
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: -115.85614014, Converted: 22.83527946
+ Biggest difference in row (0, 0), sum -795.492065 vs 157.049652
+
+Layer 13, Token 5 (model.layers.out comparison):
+  Original tensor sum: -816.679565
+  Converted tensor sum: 152.172302
+  Original tensor mean: -102.084946
+  Converted tensor mean: 19.021538
+ Mean difference: 121.10647583
+ Maximum pointwise difference: 142.45770264
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: -120.28170013, Converted: 22.17600250
+ Biggest difference in row (0, 0), sum -816.679565 vs 152.172302
+
+Layer 14, Token 5 (model.layers.out comparison):
+  Original tensor sum: -858.712524
+  Converted tensor sum: 152.386047
+  Original tensor mean: -107.339066
+  Converted tensor mean: 19.048256
+ Mean difference: 126.38732147
+ Maximum pointwise difference: 150.80645752
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: -129.48748779, Converted: 21.31897736
+ Biggest difference in row (0, 0), sum -858.712524 vs 152.386047
+
+Layer 15, Token 5 (model.layers.out comparison):
+  Original tensor sum: -1291.953247
+  Converted tensor sum: 244.354996
+  Original tensor mean: -161.494156
+  Converted tensor mean: 30.544374
+ Mean difference: 192.03852844
+ Maximum pointwise difference: 220.75814819
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: -189.25143433, Converted: 31.50671959
+ Biggest difference in row (0, 0), sum -1291.953247 vs 244.354996
+
+Layer 0, Token 6 (model.layers.out comparison):
+  Original tensor sum: 4.713745
+  Converted tensor sum: 11.404326
+  Original tensor mean: 0.589218
+  Converted tensor mean: 1.425541
+ Mean difference: 1.39658785
+ Maximum pointwise difference: 3.99744058
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: -2.16165113, Converted: 1.83578944
+ Biggest difference in row (0, 0), sum 4.713745 vs 11.404326
+
+Layer 1, Token 6 (model.layers.out comparison):
+  Original tensor sum: 2.484277
+  Converted tensor sum: 9.422175
+  Original tensor mean: 0.310535
+  Converted tensor mean: 1.177772
+ Mean difference: 1.56714785
+ Maximum pointwise difference: 3.13825679
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: -2.85257578, Converted: 0.28568110
+ Biggest difference in row (0, 0), sum 2.484277 vs 9.422175
+
+Layer 2, Token 6 (model.layers.out comparison):
+  Original tensor sum: -4.950438
+  Converted tensor sum: -1.357174
+  Original tensor mean: -0.618805
+  Converted tensor mean: -0.169647
+ Mean difference: 1.71385837
+ Maximum pointwise difference: 3.88516402
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: 3.02349472, Converted: -0.86166936
+ Biggest difference in row (0, 0), sum -4.950438 vs -1.357174
+
+Layer 3, Token 6 (model.layers.out comparison):
+  Original tensor sum: -125.927612
+  Converted tensor sum: -106.782318
+  Original tensor mean: -15.740952
+  Converted tensor mean: -13.347790
+ Mean difference: 3.11209679
+ Maximum pointwise difference: 4.75263119
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: -17.46803665, Converted: -12.71540546
+ Biggest difference in row (0, 0), sum -125.927612 vs -106.782318
+
+Layer 4, Token 6 (model.layers.out comparison):
+  Original tensor sum: -139.830460
+  Converted tensor sum: -126.311844
+  Original tensor mean: -17.478807
+  Converted tensor mean: -15.788980
+ Mean difference: 3.15184307
+ Maximum pointwise difference: 5.99608994
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: -25.84107971, Converted: -19.84498978
+ Biggest difference in row (0, 0), sum -139.830460 vs -126.311844
+
+Layer 5, Token 6 (model.layers.out comparison):
+  Original tensor sum: -142.974274
+  Converted tensor sum: -73.637054
+  Original tensor mean: -17.871784
+  Converted tensor mean: -9.204632
+ Mean difference: 10.37221718
+ Maximum pointwise difference: 16.99522591
+ Max difference location: (0, 0, 5)
+  Values at max diff - Original: -22.96857643, Converted: -5.97335052
+ Biggest difference in row (0, 0), sum -142.974274 vs -73.637054
+
+Layer 6, Token 6 (model.layers.out comparison):
+  Original tensor sum: -180.967728
+  Converted tensor sum: -69.754128
+  Original tensor mean: -22.620966
+  Converted tensor mean: -8.719266
+ Mean difference: 14.33841133
+ Maximum pointwise difference: 25.72810745
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: -36.46190262, Converted: -10.73379517
+ Biggest difference in row (0, 0), sum -180.967728 vs -69.754128
+
+Layer 7, Token 6 (model.layers.out comparison):
+  Original tensor sum: -390.468323
+  Converted tensor sum: -284.137634
+  Original tensor mean: -48.808540
+  Converted tensor mean: -35.517204
+ Mean difference: 14.31795502
+ Maximum pointwise difference: 25.91625977
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: -61.98001099, Converted: -36.06375122
+ Biggest difference in row (0, 0), sum -390.468323 vs -284.137634
+
+Layer 8, Token 6 (model.layers.out comparison):
+  Original tensor sum: -325.042450
+  Converted tensor sum: -284.328186
+  Original tensor mean: -40.630306
+  Converted tensor mean: -35.541023
+ Mean difference: 6.66226053
+ Maximum pointwise difference: 16.25393486
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: -47.66500854, Converted: -31.41107368
+ Biggest difference in row (0, 0), sum -325.042450 vs -284.328186
+
+Layer 9, Token 6 (model.layers.out comparison):
+  Original tensor sum: -350.015503
+  Converted tensor sum: -313.897308
+  Original tensor mean: -43.751938
+  Converted tensor mean: -39.237164
+ Mean difference: 9.32056522
+ Maximum pointwise difference: 23.60877037
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: -54.44406891, Converted: -30.83529854
+ Biggest difference in row (0, 0), sum -350.015503 vs -313.897308
+
+Layer 10, Token 6 (model.layers.out comparison):
+  Original tensor sum: -375.606720
+  Converted tensor sum: -330.646790
+  Original tensor mean: -46.950840
+  Converted tensor mean: -41.330849
+ Mean difference: 8.38710022
+ Maximum pointwise difference: 27.84555435
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: -60.66308594, Converted: -32.81753159
+ Biggest difference in row (0, 0), sum -375.606720 vs -330.646790
+
+Layer 11, Token 6 (model.layers.out comparison):
+  Original tensor sum: -764.285278
+  Converted tensor sum: -730.992798
+  Original tensor mean: -95.535660
+  Converted tensor mean: -91.374100
+ Mean difference: 7.89588118
+ Maximum pointwise difference: 26.59626007
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: -118.78226471, Converted: -92.18600464
+ Biggest difference in row (0, 0), sum -764.285278 vs -730.992798
+
+Layer 12, Token 6 (model.layers.out comparison):
+  Original tensor sum: -777.147827
+  Converted tensor sum: -765.448669
+  Original tensor mean: -97.143478
+  Converted tensor mean: -95.681084
+ Mean difference: 6.33593750
+ Maximum pointwise difference: 19.02982330
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: -119.55146790, Converted: -100.52164459
+ Biggest difference in row (0, 0), sum -777.147827 vs -765.448669
+
+Layer 13, Token 6 (model.layers.out comparison):
+  Original tensor sum: -787.772400
+  Converted tensor sum: -777.362915
+  Original tensor mean: -98.471550
+  Converted tensor mean: -97.170364
+ Mean difference: 7.69482183
+ Maximum pointwise difference: 19.15751648
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: -120.39152527, Converted: -101.23400879
+ Biggest difference in row (0, 0), sum -787.772400 vs -777.362915
+
+Layer 14, Token 6 (model.layers.out comparison):
+  Original tensor sum: -883.013428
+  Converted tensor sum: -881.301514
+  Original tensor mean: -110.376678
+  Converted tensor mean: -110.162689
+ Mean difference: 12.85068035
+ Maximum pointwise difference: 28.13771820
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: -129.54022217, Converted: -101.40250397
+ Biggest difference in row (0, 0), sum -883.013428 vs -881.301514
+
+Layer 15, Token 6 (model.layers.out comparison):
+  Original tensor sum: -1324.892822
+  Converted tensor sum: -1316.172363
+  Original tensor mean: -165.611603
+  Converted tensor mean: -164.521545
+ Mean difference: 12.77940941
+ Maximum pointwise difference: 29.43301392
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: -192.78923035, Converted: -163.35621643
+ Biggest difference in row (0, 0), sum -1324.892822 vs -1316.172363
+
+Layer 0, Token 7 (model.layers.out comparison):
+  Original tensor sum: 16.302702
+  Converted tensor sum: 6.534010
+  Original tensor mean: 2.037838
+  Converted tensor mean: 0.816751
+ Mean difference: 1.39780235
+ Maximum pointwise difference: 4.86297131
+ Max difference location: (0, 0, 5)
+  Values at max diff - Original: 4.45225191, Converted: -0.41071916
+ Biggest difference in row (0, 0), sum 16.302702 vs 6.534010
+
+Layer 1, Token 7 (model.layers.out comparison):
+  Original tensor sum: 7.949856
+  Converted tensor sum: 10.515163
+  Original tensor mean: 0.993732
+  Converted tensor mean: 1.314395
+ Mean difference: 1.91308641
+ Maximum pointwise difference: 3.92083621
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: 1.42750001, Converted: 5.34833622
+ Biggest difference in row (0, 0), sum 7.949856 vs 10.515163
+
+Layer 2, Token 7 (model.layers.out comparison):
+  Original tensor sum: 5.224671
+  Converted tensor sum: 8.502550
+  Original tensor mean: 0.653084
+  Converted tensor mean: 1.062819
+ Mean difference: 2.38619947
+ Maximum pointwise difference: 6.21067238
+ Max difference location: (0, 0, 5)
+  Values at max diff - Original: 4.76728964, Converted: -1.44338274
+ Biggest difference in row (0, 0), sum 5.224671 vs 8.502550
+
+Layer 3, Token 7 (model.layers.out comparison):
+  Original tensor sum: 13.283526
+  Converted tensor sum: 35.439297
+  Original tensor mean: 1.660441
+  Converted tensor mean: 4.429912
+ Mean difference: 3.47373605
+ Maximum pointwise difference: 5.22519779
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 1.58731771, Converted: 6.81251574
+ Biggest difference in row (0, 0), sum 13.283526 vs 35.439297
+
+Layer 4, Token 7 (model.layers.out comparison):
+  Original tensor sum: 17.744591
+  Converted tensor sum: 31.593395
+  Original tensor mean: 2.218074
+  Converted tensor mean: 3.949174
+ Mean difference: 2.68589926
+ Maximum pointwise difference: 4.57245827
+ Max difference location: (0, 0, 7)
+  Values at max diff - Original: -2.52367592, Converted: 2.04878211
+ Biggest difference in row (0, 0), sum 17.744591 vs 31.593395
+
+Layer 5, Token 7 (model.layers.out comparison):
+  Original tensor sum: 23.343349
+  Converted tensor sum: 33.269924
+  Original tensor mean: 2.917919
+  Converted tensor mean: 4.158741
+ Mean difference: 2.63248682
+ Maximum pointwise difference: 5.37845278
+ Max difference location: (0, 0, 5)
+  Values at max diff - Original: 5.39788294, Converted: 0.01943016
+ Biggest difference in row (0, 0), sum 23.343349 vs 33.269924
+
+Layer 6, Token 7 (model.layers.out comparison):
+  Original tensor sum: 23.346264
+  Converted tensor sum: 35.443920
+  Original tensor mean: 2.918283
+  Converted tensor mean: 4.430490
+ Mean difference: 2.67119837
+ Maximum pointwise difference: 4.63596630
+ Max difference location: (0, 0, 5)
+  Values at max diff - Original: 6.03884697, Converted: 1.40288091
+ Biggest difference in row (0, 0), sum 23.346264 vs 35.443920
+
+Layer 7, Token 7 (model.layers.out comparison):
+  Original tensor sum: 64.039200
+  Converted tensor sum: 91.760284
+  Original tensor mean: 8.004900
+  Converted tensor mean: 11.470036
+ Mean difference: 4.01984978
+ Maximum pointwise difference: 7.18059826
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: 4.17877483, Converted: 11.35937309
+ Biggest difference in row (0, 0), sum 64.039200 vs 91.760284
+
+Layer 8, Token 7 (model.layers.out comparison):
+  Original tensor sum: 72.276039
+  Converted tensor sum: 93.156998
+  Original tensor mean: 9.034505
+  Converted tensor mean: 11.644625
+ Mean difference: 3.85819149
+ Maximum pointwise difference: 7.09706306
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 6.97290230, Converted: 14.06996536
+ Biggest difference in row (0, 0), sum 72.276039 vs 93.156998
+
+Layer 9, Token 7 (model.layers.out comparison):
+  Original tensor sum: 77.303429
+  Converted tensor sum: 87.750015
+  Original tensor mean: 9.662929
+  Converted tensor mean: 10.968752
+ Mean difference: 3.21908855
+ Maximum pointwise difference: 7.22212887
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 7.19689465, Converted: 14.41902351
+ Biggest difference in row (0, 0), sum 77.303429 vs 87.750015
+
+Layer 10, Token 7 (model.layers.out comparison):
+  Original tensor sum: 75.555130
+  Converted tensor sum: 87.081650
+  Original tensor mean: 9.444391
+  Converted tensor mean: 10.885206
+ Mean difference: 3.37582994
+ Maximum pointwise difference: 7.74006128
+ Max difference location: (0, 0, 5)
+  Values at max diff - Original: 13.60124302, Converted: 5.86118174
+ Biggest difference in row (0, 0), sum 75.555130 vs 87.081650
+
+Layer 11, Token 7 (model.layers.out comparison):
+  Original tensor sum: 156.940781
+  Converted tensor sum: 159.013306
+  Original tensor mean: 19.617598
+  Converted tensor mean: 19.876663
+ Mean difference: 3.38565111
+ Maximum pointwise difference: 8.84408474
+ Max difference location: (0, 0, 5)
+  Values at max diff - Original: 24.20116806, Converted: 15.35708332
+ Biggest difference in row (0, 0), sum 156.940781 vs 159.013306
+
+Layer 12, Token 7 (model.layers.out comparison):
+  Original tensor sum: 154.763428
+  Converted tensor sum: 153.900482
+  Original tensor mean: 19.345428
+  Converted tensor mean: 19.237560
+ Mean difference: 3.46122217
+ Maximum pointwise difference: 9.50335789
+ Max difference location: (0, 0, 5)
+  Values at max diff - Original: 24.17844582, Converted: 14.67508793
+ Biggest difference in row (0, 0), sum 154.763428 vs 153.900482
+
+Layer 13, Token 7 (model.layers.out comparison):
+  Original tensor sum: 153.990646
+  Converted tensor sum: 150.608353
+  Original tensor mean: 19.248831
+  Converted tensor mean: 18.826044
+ Mean difference: 3.53592730
+ Maximum pointwise difference: 9.36601925
+ Max difference location: (0, 0, 5)
+  Values at max diff - Original: 23.90514946, Converted: 14.53913021
+ Biggest difference in row (0, 0), sum 153.990646 vs 150.608353
+
+Layer 14, Token 7 (model.layers.out comparison):
+  Original tensor sum: 153.169525
+  Converted tensor sum: 133.618896
+  Original tensor mean: 19.146191
+  Converted tensor mean: 16.702362
+ Mean difference: 4.84187126
+ Maximum pointwise difference: 11.02708149
+ Max difference location: (0, 0, 5)
+  Values at max diff - Original: 24.07042313, Converted: 13.04334164
+ Biggest difference in row (0, 0), sum 153.169525 vs 133.618896
+
+Layer 15, Token 7 (model.layers.out comparison):
+  Original tensor sum: 256.612762
+  Converted tensor sum: 236.694611
+  Original tensor mean: 32.076595
+  Converted tensor mean: 29.586826
+ Mean difference: 4.89619875
+ Maximum pointwise difference: 11.06676292
+ Max difference location: (0, 0, 5)
+  Values at max diff - Original: 34.29892731, Converted: 23.23216438
+ Biggest difference in row (0, 0), sum 256.612762 vs 236.694611
+
+Layer 0, Token 8 (model.layers.out comparison):
+  Original tensor sum: 4.551975
+  Converted tensor sum: 1.348729
+  Original tensor mean: 0.568997
+  Converted tensor mean: 0.168591
+ Mean difference: 2.05911183
+ Maximum pointwise difference: 5.11385345
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 1.91795087, Converted: -3.19590235
+ Biggest difference in row (0, 0), sum 4.551975 vs 1.348729
+
+Layer 1, Token 8 (model.layers.out comparison):
+  Original tensor sum: -10.499850
+  Converted tensor sum: -11.510830
+  Original tensor mean: -1.312481
+  Converted tensor mean: -1.438854
+ Mean difference: 3.72058988
+ Maximum pointwise difference: 7.12741280
+ Max difference location: (0, 0, 5)
+  Values at max diff - Original: -4.90886450, Converted: 2.21854830
+ Biggest difference in row (0, 0), sum -10.499850 vs -11.510830
+
+Layer 2, Token 8 (model.layers.out comparison):
+  Original tensor sum: 21.469618
+  Converted tensor sum: 13.045154
+  Original tensor mean: 2.683702
+  Converted tensor mean: 1.630644
+ Mean difference: 4.73055506
+ Maximum pointwise difference: 11.87027359
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 6.33750200, Converted: -5.53277111
+ Biggest difference in row (0, 0), sum 21.469618 vs 13.045154
+
+Layer 3, Token 8 (model.layers.out comparison):
+  Original tensor sum: 56.933716
+  Converted tensor sum: 65.067757
+  Original tensor mean: 7.116714
+  Converted tensor mean: 8.133470
+ Mean difference: 5.21158791
+ Maximum pointwise difference: 10.06817722
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 10.73284817, Converted: 0.66467106
+ Biggest difference in row (0, 0), sum 56.933716 vs 65.067757
+
+Layer 4, Token 8 (model.layers.out comparison):
+  Original tensor sum: 54.841175
+  Converted tensor sum: 58.977600
+  Original tensor mean: 6.855147
+  Converted tensor mean: 7.372200
+ Mean difference: 5.39579868
+ Maximum pointwise difference: 10.23285866
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 10.38635254, Converted: 0.15349340
+ Biggest difference in row (0, 0), sum 54.841175 vs 58.977600
+
+Layer 5, Token 8 (model.layers.out comparison):
+  Original tensor sum: 59.439285
+  Converted tensor sum: 59.979446
+  Original tensor mean: 7.429911
+  Converted tensor mean: 7.497431
+ Mean difference: 5.44655371
+ Maximum pointwise difference: 11.05043030
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 9.77372646, Converted: -1.27670395
+ Biggest difference in row (0, 0), sum 59.439285 vs 59.979446
+
+Layer 6, Token 8 (model.layers.out comparison):
+  Original tensor sum: 57.398651
+  Converted tensor sum: 56.296188
+  Original tensor mean: 7.174831
+  Converted tensor mean: 7.037024
+ Mean difference: 5.29393005
+ Maximum pointwise difference: 9.82726002
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 9.26543045, Converted: -0.56182986
+ Biggest difference in row (0, 0), sum 57.398651 vs 56.296188
+
+Layer 7, Token 8 (model.layers.out comparison):
+  Original tensor sum: 108.492706
+  Converted tensor sum: 119.552338
+  Original tensor mean: 13.561588
+  Converted tensor mean: 14.944042
+ Mean difference: 5.49957895
+ Maximum pointwise difference: 11.73512173
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: 10.45698833, Converted: 22.19211006
+ Biggest difference in row (0, 0), sum 108.492706 vs 119.552338
+
+Layer 8, Token 8 (model.layers.out comparison):
+  Original tensor sum: 106.563354
+  Converted tensor sum: 119.608925
+  Original tensor mean: 13.320419
+  Converted tensor mean: 14.951116
+ Mean difference: 4.46781254
+ Maximum pointwise difference: 10.82487202
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: 9.85585308, Converted: 20.68072510
+ Biggest difference in row (0, 0), sum 106.563354 vs 119.608925
+
+Layer 9, Token 8 (model.layers.out comparison):
+  Original tensor sum: 111.512817
+  Converted tensor sum: 109.928528
+  Original tensor mean: 13.939102
+  Converted tensor mean: 13.741066
+ Mean difference: 4.52381039
+ Maximum pointwise difference: 8.89503384
+ Max difference location: (0, 0, 5)
+  Values at max diff - Original: 12.07329845, Converted: 20.96833229
+ Biggest difference in row (0, 0), sum 111.512817 vs 109.928528
+
+Layer 10, Token 8 (model.layers.out comparison):
+  Original tensor sum: 111.241730
+  Converted tensor sum: 103.886688
+  Original tensor mean: 13.905216
+  Converted tensor mean: 12.985836
+ Mean difference: 4.59785748
+ Maximum pointwise difference: 8.55565834
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: 11.01864815, Converted: 19.57430649
+ Biggest difference in row (0, 0), sum 111.241730 vs 103.886688
+
+Layer 11, Token 8 (model.layers.out comparison):
+  Original tensor sum: 194.094177
+  Converted tensor sum: 193.564484
+  Original tensor mean: 24.261772
+  Converted tensor mean: 24.195560
+ Mean difference: 4.49120235
+ Maximum pointwise difference: 9.88864136
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: 22.25957108, Converted: 32.14821243
+ Biggest difference in row (0, 0), sum 194.094177 vs 193.564484
+
+Layer 12, Token 8 (model.layers.out comparison):
+  Original tensor sum: 196.658234
+  Converted tensor sum: 189.827057
+  Original tensor mean: 24.582279
+  Converted tensor mean: 23.728382
+ Mean difference: 5.10350180
+ Maximum pointwise difference: 9.80338287
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 30.23954582, Converted: 20.43616295
+ Biggest difference in row (0, 0), sum 196.658234 vs 189.827057
+
+Layer 13, Token 8 (model.layers.out comparison):
+  Original tensor sum: 193.237976
+  Converted tensor sum: 184.223190
+  Original tensor mean: 24.154747
+  Converted tensor mean: 23.027899
+ Mean difference: 5.11390686
+ Maximum pointwise difference: 10.04300690
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 30.05261230, Converted: 20.00960541
+ Biggest difference in row (0, 0), sum 193.237976 vs 184.223190
+
+Layer 14, Token 8 (model.layers.out comparison):
+  Original tensor sum: 183.582977
+  Converted tensor sum: 183.402130
+  Original tensor mean: 22.947872
+  Converted tensor mean: 22.925266
+ Mean difference: 5.41123581
+ Maximum pointwise difference: 10.28223228
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: 19.26763725, Converted: 29.54986954
+ Biggest difference in row (0, 0), sum 183.582977 vs 183.402130
+
+Layer 15, Token 8 (model.layers.out comparison):
+  Original tensor sum: 297.650543
+  Converted tensor sum: 301.053558
+  Original tensor mean: 37.206318
+  Converted tensor mean: 37.631695
+ Mean difference: 5.31624222
+ Maximum pointwise difference: 10.28567123
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: 36.54620743, Converted: 46.83187866
+ Biggest difference in row (0, 0), sum 297.650543 vs 301.053558
+
+Layer 0, Token 9 (model.layers.out comparison):
+  Original tensor sum: 27.724323
+  Converted tensor sum: 7.010333
+  Original tensor mean: 3.465540
+  Converted tensor mean: 0.876292
+ Mean difference: 3.55158758
+ Maximum pointwise difference: 7.14975357
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 4.03241444, Converted: -3.11733937
+ Biggest difference in row (0, 0), sum 27.724323 vs 7.010333
+
+Layer 1, Token 9 (model.layers.out comparison):
+  Original tensor sum: 17.384836
+  Converted tensor sum: 7.348456
+  Original tensor mean: 2.173105
+  Converted tensor mean: 0.918557
+ Mean difference: 3.79201698
+ Maximum pointwise difference: 8.55698013
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: -0.55471849, Converted: 8.00226116
+ Biggest difference in row (0, 0), sum 17.384836 vs 7.348456
+
+Layer 2, Token 9 (model.layers.out comparison):
+  Original tensor sum: 20.318661
+  Converted tensor sum: 28.392349
+  Original tensor mean: 2.539833
+  Converted tensor mean: 3.549044
+ Mean difference: 2.94842267
+ Maximum pointwise difference: 9.89197159
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: -1.05586541, Converted: 8.83610630
+ Biggest difference in row (0, 0), sum 20.318661 vs 28.392349
+
+Layer 3, Token 9 (model.layers.out comparison):
+  Original tensor sum: 65.513725
+  Converted tensor sum: 84.414536
+  Original tensor mean: 8.189216
+  Converted tensor mean: 10.551817
+ Mean difference: 4.41447163
+ Maximum pointwise difference: 10.74111176
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: 6.86948347, Converted: 17.61059570
+ Biggest difference in row (0, 0), sum 65.513725 vs 84.414536
+
+Layer 4, Token 9 (model.layers.out comparison):
+  Original tensor sum: 61.603691
+  Converted tensor sum: 72.172562
+  Original tensor mean: 7.700461
+  Converted tensor mean: 9.021570
+ Mean difference: 4.32150173
+ Maximum pointwise difference: 10.51774502
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: 4.71584320, Converted: 15.23358822
+ Biggest difference in row (0, 0), sum 61.603691 vs 72.172562
+
+Layer 5, Token 9 (model.layers.out comparison):
+  Original tensor sum: 61.554985
+  Converted tensor sum: 60.684212
+  Original tensor mean: 7.694373
+  Converted tensor mean: 7.585526
+ Mean difference: 4.84910297
+ Maximum pointwise difference: 9.77899742
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: 5.03849173, Converted: 14.81748962
+ Biggest difference in row (0, 0), sum 61.554985 vs 60.684212
+
+Layer 6, Token 9 (model.layers.out comparison):
+  Original tensor sum: 60.121288
+  Converted tensor sum: 61.323517
+  Original tensor mean: 7.515161
+  Converted tensor mean: 7.665440
+ Mean difference: 4.61501122
+ Maximum pointwise difference: 10.19813538
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: 4.46036100, Converted: 14.65849590
+ Biggest difference in row (0, 0), sum 60.121288 vs 61.323517
+
+Layer 7, Token 9 (model.layers.out comparison):
+  Original tensor sum: 120.854408
+  Converted tensor sum: 122.564323
+  Original tensor mean: 15.106801
+  Converted tensor mean: 15.320540
+ Mean difference: 4.58281326
+ Maximum pointwise difference: 10.81363106
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: 11.19677925, Converted: 22.01041031
+ Biggest difference in row (0, 0), sum 120.854408 vs 122.564323
+
+Layer 8, Token 9 (model.layers.out comparison):
+  Original tensor sum: 111.411377
+  Converted tensor sum: 113.878586
+  Original tensor mean: 13.926422
+  Converted tensor mean: 14.234823
+ Mean difference: 4.80341482
+ Maximum pointwise difference: 8.54869747
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: 10.95728207, Converted: 19.50597954
+ Biggest difference in row (0, 0), sum 111.411377 vs 113.878586
+
+Layer 9, Token 9 (model.layers.out comparison):
+  Original tensor sum: 104.621353
+  Converted tensor sum: 99.551331
+  Original tensor mean: 13.077669
+  Converted tensor mean: 12.443916
+ Mean difference: 4.94641495
+ Maximum pointwise difference: 7.18619919
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: 10.17811966, Converted: 17.36431885
+ Biggest difference in row (0, 0), sum 104.621353 vs 99.551331
+
+Layer 10, Token 9 (model.layers.out comparison):
+  Original tensor sum: 105.495895
+  Converted tensor sum: 90.669807
+  Original tensor mean: 13.186987
+  Converted tensor mean: 11.333726
+ Mean difference: 4.88313580
+ Maximum pointwise difference: 8.44397736
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: 11.21555328, Converted: 2.77157593
+ Biggest difference in row (0, 0), sum 105.495895 vs 90.669807
+
+Layer 11, Token 9 (model.layers.out comparison):
+  Original tensor sum: 198.914932
+  Converted tensor sum: 187.657013
+  Original tensor mean: 24.864367
+  Converted tensor mean: 23.457127
+ Mean difference: 4.87979174
+ Maximum pointwise difference: 8.17332649
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: 22.94329643, Converted: 14.76996994
+ Biggest difference in row (0, 0), sum 198.914932 vs 187.657013
+
+Layer 12, Token 9 (model.layers.out comparison):
+  Original tensor sum: 197.781982
+  Converted tensor sum: 182.248840
+  Original tensor mean: 24.722748
+  Converted tensor mean: 22.781105
+ Mean difference: 5.16355371
+ Maximum pointwise difference: 9.60578632
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: 23.29119301, Converted: 13.68540668
+ Biggest difference in row (0, 0), sum 197.781982 vs 182.248840
+
+Layer 13, Token 9 (model.layers.out comparison):
+  Original tensor sum: 191.909027
+  Converted tensor sum: 177.667252
+  Original tensor mean: 23.988628
+  Converted tensor mean: 22.208406
+ Mean difference: 5.14386559
+ Maximum pointwise difference: 9.20664406
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: 22.88940430, Converted: 13.68276024
+ Biggest difference in row (0, 0), sum 191.909027 vs 177.667252
+
+Layer 14, Token 9 (model.layers.out comparison):
+  Original tensor sum: 193.112854
+  Converted tensor sum: 170.826324
+  Original tensor mean: 24.139107
+  Converted tensor mean: 21.353291
+ Mean difference: 5.67996836
+ Maximum pointwise difference: 10.54143143
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 29.37781715, Converted: 18.83638573
+ Biggest difference in row (0, 0), sum 193.112854 vs 170.826324
+
+Layer 15, Token 9 (model.layers.out comparison):
+  Original tensor sum: 310.393738
+  Converted tensor sum: 295.392517
+  Original tensor mean: 38.799217
+  Converted tensor mean: 36.924065
+ Mean difference: 5.11053467
+ Maximum pointwise difference: 9.09804153
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 43.17533493, Converted: 34.07729340
+ Biggest difference in row (0, 0), sum 310.393738 vs 295.392517
+
+Layer 0, Token 10 (model.layers.out comparison):
+  Original tensor sum: 11.304202
+  Converted tensor sum: 14.919886
+  Original tensor mean: 1.413025
+  Converted tensor mean: 1.864986
+ Mean difference: 1.20558476
+ Maximum pointwise difference: 2.02042794
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: -0.23466866, Converted: 1.78575933
+ Biggest difference in row (0, 0), sum 11.304202 vs 14.919886
+
+Layer 1, Token 10 (model.layers.out comparison):
+  Original tensor sum: 4.380467
+  Converted tensor sum: 1.448399
+  Original tensor mean: 0.547558
+  Converted tensor mean: 0.181050
+ Mean difference: 1.55803418
+ Maximum pointwise difference: 3.08950615
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: 3.13031340, Converted: 0.04080731
+ Biggest difference in row (0, 0), sum 4.380467 vs 1.448399
+
+Layer 2, Token 10 (model.layers.out comparison):
+  Original tensor sum: 21.641123
+  Converted tensor sum: 18.135971
+  Original tensor mean: 2.705140
+  Converted tensor mean: 2.266996
+ Mean difference: 2.29236317
+ Maximum pointwise difference: 5.34974813
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: 4.73606253, Converted: -0.61368543
+ Biggest difference in row (0, 0), sum 21.641123 vs 18.135971
+
+Layer 3, Token 10 (model.layers.out comparison):
+  Original tensor sum: 84.183029
+  Converted tensor sum: 75.554764
+  Original tensor mean: 10.522879
+  Converted tensor mean: 9.444345
+ Mean difference: 2.50477004
+ Maximum pointwise difference: 7.33609867
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: 11.12465191, Converted: 3.78855324
+ Biggest difference in row (0, 0), sum 84.183029 vs 75.554764
+
+Layer 4, Token 10 (model.layers.out comparison):
+  Original tensor sum: 75.952011
+  Converted tensor sum: 63.684746
+  Original tensor mean: 9.494001
+  Converted tensor mean: 7.960593
+ Mean difference: 2.89978528
+ Maximum pointwise difference: 6.58637476
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: 10.07624340, Converted: 3.48986864
+ Biggest difference in row (0, 0), sum 75.952011 vs 63.684746
+
+Layer 5, Token 10 (model.layers.out comparison):
+  Original tensor sum: 67.380692
+  Converted tensor sum: 51.477894
+  Original tensor mean: 8.422586
+  Converted tensor mean: 6.434737
+ Mean difference: 2.92978549
+ Maximum pointwise difference: 6.54403639
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 10.47875404, Converted: 3.93471766
+ Biggest difference in row (0, 0), sum 67.380692 vs 51.477894
+
+Layer 6, Token 10 (model.layers.out comparison):
+  Original tensor sum: 64.356155
+  Converted tensor sum: 44.292259
+  Original tensor mean: 8.044519
+  Converted tensor mean: 5.536532
+ Mean difference: 3.18394947
+ Maximum pointwise difference: 7.18761826
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: 9.33854580, Converted: 2.15092754
+ Biggest difference in row (0, 0), sum 64.356155 vs 44.292259
+
+Layer 7, Token 10 (model.layers.out comparison):
+  Original tensor sum: 124.955261
+  Converted tensor sum: 105.713638
+  Original tensor mean: 15.619408
+  Converted tensor mean: 13.214205
+ Mean difference: 3.17375469
+ Maximum pointwise difference: 7.15706635
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: 16.18268585, Converted: 9.02561951
+ Biggest difference in row (0, 0), sum 124.955261 vs 105.713638
+
+Layer 8, Token 10 (model.layers.out comparison):
+  Original tensor sum: 105.275124
+  Converted tensor sum: 92.354050
+  Original tensor mean: 13.159390
+  Converted tensor mean: 11.544256
+ Mean difference: 2.89860010
+ Maximum pointwise difference: 6.96542978
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: 12.28927994, Converted: 5.32385015
+ Biggest difference in row (0, 0), sum 105.275124 vs 92.354050
+
+Layer 9, Token 10 (model.layers.out comparison):
+  Original tensor sum: 89.282066
+  Converted tensor sum: 75.157639
+  Original tensor mean: 11.160258
+  Converted tensor mean: 9.394705
+ Mean difference: 2.89608860
+ Maximum pointwise difference: 7.40043926
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: 9.69557953, Converted: 2.29514027
+ Biggest difference in row (0, 0), sum 89.282066 vs 75.157639
+
+Layer 10, Token 10 (model.layers.out comparison):
+  Original tensor sum: 87.814186
+  Converted tensor sum: 68.457840
+  Original tensor mean: 10.976773
+  Converted tensor mean: 8.557230
+ Mean difference: 3.06474447
+ Maximum pointwise difference: 8.03616142
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: 10.20811844, Converted: 2.17195749
+ Biggest difference in row (0, 0), sum 87.814186 vs 68.457840
+
+Layer 11, Token 10 (model.layers.out comparison):
+  Original tensor sum: 184.781067
+  Converted tensor sum: 170.778610
+  Original tensor mean: 23.097633
+  Converted tensor mean: 21.347326
+ Mean difference: 2.85195446
+ Maximum pointwise difference: 6.81012630
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: 22.35528374, Converted: 15.54515743
+ Biggest difference in row (0, 0), sum 184.781067 vs 170.778610
+
+Layer 12, Token 10 (model.layers.out comparison):
+  Original tensor sum: 187.157104
+  Converted tensor sum: 166.325562
+  Original tensor mean: 23.394638
+  Converted tensor mean: 20.790695
+ Mean difference: 3.00816154
+ Maximum pointwise difference: 8.29628849
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: 23.84814453, Converted: 15.55185604
+ Biggest difference in row (0, 0), sum 187.157104 vs 166.325562
+
+Layer 13, Token 10 (model.layers.out comparison):
+  Original tensor sum: 180.577179
+  Converted tensor sum: 161.409668
+  Original tensor mean: 22.572147
+  Converted tensor mean: 20.176208
+ Mean difference: 3.22855854
+ Maximum pointwise difference: 8.27111149
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: 22.88647652, Converted: 14.61536503
+ Biggest difference in row (0, 0), sum 180.577179 vs 161.409668
+
+Layer 14, Token 10 (model.layers.out comparison):
+  Original tensor sum: 176.409912
+  Converted tensor sum: 155.317413
+  Original tensor mean: 22.051239
+  Converted tensor mean: 19.414677
+ Mean difference: 3.30306578
+ Maximum pointwise difference: 8.51622581
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: 21.49407005, Converted: 12.97784424
+ Biggest difference in row (0, 0), sum 176.409912 vs 155.317413
+
+Layer 15, Token 10 (model.layers.out comparison):
+  Original tensor sum: 303.652618
+  Converted tensor sum: 289.143890
+  Original tensor mean: 37.956577
+  Converted tensor mean: 36.142986
+ Mean difference: 3.20148277
+ Maximum pointwise difference: 7.65085030
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: 38.91091537, Converted: 31.26006508
+ Biggest difference in row (0, 0), sum 303.652618 vs 289.143890
+
+Layer 0, Token 11 (model.layers.out comparison):
+  Original tensor sum: 3.868190
+  Converted tensor sum: -4.365316
+  Original tensor mean: 0.483524
+  Converted tensor mean: -0.545665
+ Mean difference: 1.47696412
+ Maximum pointwise difference: 3.49379730
+ Max difference location: (0, 0, 5)
+  Values at max diff - Original: 1.60926533, Converted: -1.88453186
+ Biggest difference in row (0, 0), sum 3.868190 vs -4.365316
+
+Layer 1, Token 11 (model.layers.out comparison):
+  Original tensor sum: -4.763882
+  Converted tensor sum: -8.100720
+  Original tensor mean: -0.595485
+  Converted tensor mean: -1.012590
+ Mean difference: 2.60996270
+ Maximum pointwise difference: 4.04230022
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: -3.43199134, Converted: 0.61030883
+ Biggest difference in row (0, 0), sum -4.763882 vs -8.100720
+
+Layer 2, Token 11 (model.layers.out comparison):
+  Original tensor sum: -8.837991
+  Converted tensor sum: -17.355688
+  Original tensor mean: -1.104749
+  Converted tensor mean: -2.169461
+ Mean difference: 3.57004023
+ Maximum pointwise difference: 7.78442717
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: -2.88003159, Converted: 4.90439558
+ Biggest difference in row (0, 0), sum -8.837991 vs -17.355688
+
+Layer 3, Token 11 (model.layers.out comparison):
+  Original tensor sum: -151.825806
+  Converted tensor sum: -119.589157
+  Original tensor mean: -18.978226
+  Converted tensor mean: -14.948645
+ Mean difference: 4.57043171
+ Maximum pointwise difference: 10.22036552
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: -23.12115479, Converted: -12.90078926
+ Biggest difference in row (0, 0), sum -151.825806 vs -119.589157
+
+Layer 4, Token 11 (model.layers.out comparison):
+  Original tensor sum: -87.672623
+  Converted tensor sum: -49.333626
+  Original tensor mean: -10.959078
+  Converted tensor mean: -6.166703
+ Mean difference: 5.28691673
+ Maximum pointwise difference: 12.21502209
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: -14.83695984, Converted: -2.62193775
+ Biggest difference in row (0, 0), sum -87.672623 vs -49.333626
+
+Layer 5, Token 11 (model.layers.out comparison):
+  Original tensor sum: -19.529230
+  Converted tensor sum: 51.921982
+  Original tensor mean: -2.441154
+  Converted tensor mean: 6.490248
+ Mean difference: 8.93140125
+ Maximum pointwise difference: 17.95970917
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: 2.32367539, Converted: 20.28338432
+ Biggest difference in row (0, 0), sum -19.529230 vs 51.921982
+
+Layer 6, Token 11 (model.layers.out comparison):
+  Original tensor sum: -34.699642
+  Converted tensor sum: 49.364166
+  Original tensor mean: -4.337455
+  Converted tensor mean: 6.170521
+ Mean difference: 10.50797558
+ Maximum pointwise difference: 19.14058685
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: -4.33303738, Converted: 14.80754948
+ Biggest difference in row (0, 0), sum -34.699642 vs 49.364166
+
+Layer 7, Token 11 (model.layers.out comparison):
+  Original tensor sum: -174.093460
+  Converted tensor sum: 116.110802
+  Original tensor mean: -21.761683
+  Converted tensor mean: 14.513850
+ Mean difference: 36.27553177
+ Maximum pointwise difference: 45.40389252
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: -18.08833313, Converted: 27.31555748
+ Biggest difference in row (0, 0), sum -174.093460 vs 116.110802
+
+Layer 8, Token 11 (model.layers.out comparison):
+  Original tensor sum: -95.914619
+  Converted tensor sum: 104.116745
+  Original tensor mean: -11.989327
+  Converted tensor mean: 13.014593
+ Mean difference: 25.00392151
+ Maximum pointwise difference: 39.39223480
+ Max difference location: (0, 0, 7)
+  Values at max diff - Original: -25.33579826, Converted: 14.05643463
+ Biggest difference in row (0, 0), sum -95.914619 vs 104.116745
+
+Layer 9, Token 11 (model.layers.out comparison):
+  Original tensor sum: -76.038055
+  Converted tensor sum: 86.082336
+  Original tensor mean: -9.504757
+  Converted tensor mean: 10.760292
+ Mean difference: 20.92745209
+ Maximum pointwise difference: 40.40296555
+ Max difference location: (0, 0, 7)
+  Values at max diff - Original: -26.76908302, Converted: 13.63388157
+ Biggest difference in row (0, 0), sum -76.038055 vs 86.082336
+
+Layer 10, Token 11 (model.layers.out comparison):
+  Original tensor sum: -62.967239
+  Converted tensor sum: 79.332596
+  Original tensor mean: -7.870905
+  Converted tensor mean: 9.916574
+ Mean difference: 18.64283180
+ Maximum pointwise difference: 40.29864883
+ Max difference location: (0, 0, 7)
+  Values at max diff - Original: -27.55656052, Converted: 12.74208832
+ Biggest difference in row (0, 0), sum -62.967239 vs 79.332596
+
+Layer 11, Token 11 (model.layers.out comparison):
+  Original tensor sum: -348.172638
+  Converted tensor sum: 185.268341
+  Original tensor mean: -43.521580
+  Converted tensor mean: 23.158543
+ Mean difference: 66.68012238
+ Maximum pointwise difference: 90.25902557
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: -54.79597092, Converted: 35.46305466
+ Biggest difference in row (0, 0), sum -348.172638 vs 185.268341
+
+Layer 12, Token 11 (model.layers.out comparison):
+  Original tensor sum: -380.460999
+  Converted tensor sum: 184.850082
+  Original tensor mean: -47.557625
+  Converted tensor mean: 23.106260
+ Mean difference: 70.66388702
+ Maximum pointwise difference: 91.58323669
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: -56.39131927, Converted: 35.19191360
+ Biggest difference in row (0, 0), sum -380.460999 vs 184.850082
+
+Layer 13, Token 11 (model.layers.out comparison):
+  Original tensor sum: -387.549927
+  Converted tensor sum: 178.291550
+  Original tensor mean: -48.443741
+  Converted tensor mean: 22.286444
+ Mean difference: 70.73018646
+ Maximum pointwise difference: 92.60649109
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: -57.23683167, Converted: 35.36965561
+ Biggest difference in row (0, 0), sum -387.549927 vs 178.291550
+
+Layer 14, Token 11 (model.layers.out comparison):
+  Original tensor sum: -381.615417
+  Converted tensor sum: 175.841187
+  Original tensor mean: -47.701927
+  Converted tensor mean: 21.980148
+ Mean difference: 69.68207550
+ Maximum pointwise difference: 95.39483643
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: -61.00698853, Converted: 34.38784409
+ Biggest difference in row (0, 0), sum -381.615417 vs 175.841187
+
+Layer 15, Token 11 (model.layers.out comparison):
+  Original tensor sum: -791.898560
+  Converted tensor sum: 313.297852
+  Original tensor mean: -98.987320
+  Converted tensor mean: 39.162231
+ Mean difference: 138.14956665
+ Maximum pointwise difference: 174.31031799
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: -120.39865875, Converted: 53.91165924
+ Biggest difference in row (0, 0), sum -791.898560 vs 313.297852
+
+Layer 0, Token 12 (model.layers.out comparison):
+  Original tensor sum: 17.494652
+  Converted tensor sum: -39.301899
+  Original tensor mean: 2.186831
+  Converted tensor mean: -4.912737
+ Mean difference: 8.11834240
+ Maximum pointwise difference: 15.19715595
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 2.70196438, Converted: -12.49519157
+ Biggest difference in row (0, 0), sum 17.494652 vs -39.301899
+
+Layer 1, Token 12 (model.layers.out comparison):
+  Original tensor sum: 11.314701
+  Converted tensor sum: -35.014473
+  Original tensor mean: 1.414338
+  Converted tensor mean: -4.376809
+ Mean difference: 7.67025709
+ Maximum pointwise difference: 15.05980301
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 2.28716040, Converted: -12.77264309
+ Biggest difference in row (0, 0), sum 11.314701 vs -35.014473
+
+Layer 2, Token 12 (model.layers.out comparison):
+  Original tensor sum: 3.520873
+  Converted tensor sum: -23.351210
+  Original tensor mean: 0.440109
+  Converted tensor mean: -2.918901
+ Mean difference: 7.09708309
+ Maximum pointwise difference: 10.56869507
+ Max difference location: (0, 0, 7)
+  Values at max diff - Original: 0.29396084, Converted: -10.27473450
+ Biggest difference in row (0, 0), sum 3.520873 vs -23.351210
+
+Layer 3, Token 12 (model.layers.out comparison):
+  Original tensor sum: -38.507721
+  Converted tensor sum: -65.860725
+  Original tensor mean: -4.813465
+  Converted tensor mean: -8.232591
+ Mean difference: 6.29614639
+ Maximum pointwise difference: 10.23156357
+ Max difference location: (0, 0, 7)
+  Values at max diff - Original: -4.52744627, Converted: -14.75901031
+ Biggest difference in row (0, 0), sum -38.507721 vs -65.860725
+
+Layer 4, Token 12 (model.layers.out comparison):
+  Original tensor sum: -25.538549
+  Converted tensor sum: -16.346577
+  Original tensor mean: -3.192319
+  Converted tensor mean: -2.043322
+ Mean difference: 5.56114197
+ Maximum pointwise difference: 11.51591301
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: -9.10746288, Converted: 2.40844989
+ Biggest difference in row (0, 0), sum -25.538549 vs -16.346577
+
+Layer 5, Token 12 (model.layers.out comparison):
+  Original tensor sum: 5.103131
+  Converted tensor sum: -11.820143
+  Original tensor mean: 0.637891
+  Converted tensor mean: -1.477518
+ Mean difference: 6.80205250
+ Maximum pointwise difference: 11.26421928
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 5.06476593, Converted: -6.19945335
+ Biggest difference in row (0, 0), sum 5.103131 vs -11.820143
+
+Layer 6, Token 12 (model.layers.out comparison):
+  Original tensor sum: 1.231229
+  Converted tensor sum: -13.329983
+  Original tensor mean: 0.153904
+  Converted tensor mean: -1.666248
+ Mean difference: 7.36224794
+ Maximum pointwise difference: 11.85875893
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 5.86865807, Converted: -5.99010086
+ Biggest difference in row (0, 0), sum 1.231229 vs -13.329983
+
+Layer 7, Token 12 (model.layers.out comparison):
+  Original tensor sum: 49.883171
+  Converted tensor sum: -138.587738
+  Original tensor mean: 6.235396
+  Converted tensor mean: -17.323467
+ Mean difference: 23.55886269
+ Maximum pointwise difference: 38.93606567
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 13.31498432, Converted: -25.62108231
+ Biggest difference in row (0, 0), sum 49.883171 vs -138.587738
+
+Layer 8, Token 12 (model.layers.out comparison):
+  Original tensor sum: 32.997459
+  Converted tensor sum: -79.532417
+  Original tensor mean: 4.124682
+  Converted tensor mean: -9.941552
+ Mean difference: 15.04267120
+ Maximum pointwise difference: 28.15183258
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 10.99009418, Converted: -17.16173744
+ Biggest difference in row (0, 0), sum 32.997459 vs -79.532417
+
+Layer 9, Token 12 (model.layers.out comparison):
+  Original tensor sum: 30.462442
+  Converted tensor sum: -58.022911
+  Original tensor mean: 3.807805
+  Converted tensor mean: -7.252864
+ Mean difference: 13.06616974
+ Maximum pointwise difference: 26.93473625
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 10.51771736, Converted: -16.41701889
+ Biggest difference in row (0, 0), sum 30.462442 vs -58.022911
+
+Layer 10, Token 12 (model.layers.out comparison):
+  Original tensor sum: 31.758196
+  Converted tensor sum: -31.289818
+  Original tensor mean: 3.969774
+  Converted tensor mean: -3.911227
+ Mean difference: 11.64717674
+ Maximum pointwise difference: 25.19077682
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 10.60759830, Converted: -14.58317757
+ Biggest difference in row (0, 0), sum 31.758196 vs -31.289818
+
+Layer 11, Token 12 (model.layers.out comparison):
+  Original tensor sum: 64.195580
+  Converted tensor sum: -290.078918
+  Original tensor mean: 8.024447
+  Converted tensor mean: -36.259865
+ Mean difference: 44.28431320
+ Maximum pointwise difference: 58.32298279
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 14.01799965, Converted: -44.30498123
+ Biggest difference in row (0, 0), sum 64.195580 vs -290.078918
+
+Layer 12, Token 12 (model.layers.out comparison):
+  Original tensor sum: 65.652679
+  Converted tensor sum: -300.691650
+  Original tensor mean: 8.206585
+  Converted tensor mean: -37.586456
+ Mean difference: 45.79303741
+ Maximum pointwise difference: 64.50979614
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 13.89292812, Converted: -50.61687088
+ Biggest difference in row (0, 0), sum 65.652679 vs -300.691650
+
+Layer 13, Token 12 (model.layers.out comparison):
+  Original tensor sum: 64.880409
+  Converted tensor sum: -292.294403
+  Original tensor mean: 8.110051
+  Converted tensor mean: -36.536800
+ Mean difference: 44.64685059
+ Maximum pointwise difference: 61.03430176
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 8.34301949, Converted: -52.69128418
+ Biggest difference in row (0, 0), sum 64.880409 vs -292.294403
+
+Layer 14, Token 12 (model.layers.out comparison):
+  Original tensor sum: 55.352615
+  Converted tensor sum: -232.615005
+  Original tensor mean: 6.919077
+  Converted tensor mean: -29.076876
+ Mean difference: 35.99595261
+ Maximum pointwise difference: 69.32642365
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 7.07370424, Converted: -62.25271606
+ Biggest difference in row (0, 0), sum 55.352615 vs -232.615005
+
+Layer 15, Token 12 (model.layers.out comparison):
+  Original tensor sum: 191.366241
+  Converted tensor sum: -607.544556
+  Original tensor mean: 23.920780
+  Converted tensor mean: -75.943069
+ Mean difference: 99.86384583
+ Maximum pointwise difference: 121.99198914
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: 33.01739502, Converted: -88.97459412
+ Biggest difference in row (0, 0), sum 191.366241 vs -607.544556
+
+Layer 0, Token 13 (model.layers.out comparison):
+  Original tensor sum: 28.716766
+  Converted tensor sum: 24.262428
+  Original tensor mean: 3.589596
+  Converted tensor mean: 3.032804
+ Mean difference: 2.20962214
+ Maximum pointwise difference: 5.77315617
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: 4.62014198, Converted: -1.15301442
+ Biggest difference in row (0, 0), sum 28.716766 vs 24.262428
+
+Layer 1, Token 13 (model.layers.out comparison):
+  Original tensor sum: 18.283722
+  Converted tensor sum: 16.804958
+  Original tensor mean: 2.285465
+  Converted tensor mean: 2.100620
+ Mean difference: 2.44061017
+ Maximum pointwise difference: 5.48099232
+ Max difference location: (0, 0, 7)
+  Values at max diff - Original: -7.47550392, Converted: -1.99451160
+ Biggest difference in row (0, 0), sum 18.283722 vs 16.804958
+
+Layer 2, Token 13 (model.layers.out comparison):
+  Original tensor sum: 14.973861
+  Converted tensor sum: 10.670280
+  Original tensor mean: 1.871733
+  Converted tensor mean: 1.333785
+ Mean difference: 2.94856715
+ Maximum pointwise difference: 6.09164524
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: 2.11467242, Converted: -3.97697282
+ Biggest difference in row (0, 0), sum 14.973861 vs 10.670280
+
+Layer 3, Token 13 (model.layers.out comparison):
+  Original tensor sum: 62.116623
+  Converted tensor sum: 46.581398
+  Original tensor mean: 7.764578
+  Converted tensor mean: 5.822675
+ Mean difference: 3.59710693
+ Maximum pointwise difference: 6.89595842
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 11.14201260, Converted: 4.24605417
+ Biggest difference in row (0, 0), sum 62.116623 vs 46.581398
+
+Layer 4, Token 13 (model.layers.out comparison):
+  Original tensor sum: 65.792244
+  Converted tensor sum: 43.042854
+  Original tensor mean: 8.224030
+  Converted tensor mean: 5.380357
+ Mean difference: 3.63414001
+ Maximum pointwise difference: 8.06606770
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: 9.46925735, Converted: 1.40318930
+ Biggest difference in row (0, 0), sum 65.792244 vs 43.042854
+
+Layer 5, Token 13 (model.layers.out comparison):
+  Original tensor sum: 60.294563
+  Converted tensor sum: 38.709320
+  Original tensor mean: 7.536820
+  Converted tensor mean: 4.838665
+ Mean difference: 4.29471397
+ Maximum pointwise difference: 9.28423500
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: 9.56281090, Converted: 0.27857587
+ Biggest difference in row (0, 0), sum 60.294563 vs 38.709320
+
+Layer 6, Token 13 (model.layers.out comparison):
+  Original tensor sum: 60.864697
+  Converted tensor sum: 41.897995
+  Original tensor mean: 7.608087
+  Converted tensor mean: 5.237249
+ Mean difference: 4.15325356
+ Maximum pointwise difference: 7.30325747
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 11.01063633, Converted: 3.70737886
+ Biggest difference in row (0, 0), sum 60.864697 vs 41.897995
+
+Layer 7, Token 13 (model.layers.out comparison):
+  Original tensor sum: 124.166924
+  Converted tensor sum: 107.577675
+  Original tensor mean: 15.520865
+  Converted tensor mean: 13.447209
+ Mean difference: 4.08049011
+ Maximum pointwise difference: 7.30880928
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 17.63167572, Converted: 10.32286644
+ Biggest difference in row (0, 0), sum 124.166924 vs 107.577675
+
+Layer 8, Token 13 (model.layers.out comparison):
+  Original tensor sum: 114.534744
+  Converted tensor sum: 106.782104
+  Original tensor mean: 14.316843
+  Converted tensor mean: 13.347763
+ Mean difference: 3.79455638
+ Maximum pointwise difference: 8.56559753
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: 18.64526367, Converted: 10.07966614
+ Biggest difference in row (0, 0), sum 114.534744 vs 106.782104
+
+Layer 9, Token 13 (model.layers.out comparison):
+  Original tensor sum: 111.904816
+  Converted tensor sum: 90.398567
+  Original tensor mean: 13.988102
+  Converted tensor mean: 11.299821
+ Mean difference: 4.39770985
+ Maximum pointwise difference: 12.01837921
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: 18.37693977, Converted: 6.35856009
+ Biggest difference in row (0, 0), sum 111.904816 vs 90.398567
+
+Layer 10, Token 13 (model.layers.out comparison):
+  Original tensor sum: 106.496719
+  Converted tensor sum: 84.186646
+  Original tensor mean: 13.312090
+  Converted tensor mean: 10.523331
+ Mean difference: 4.35723734
+ Maximum pointwise difference: 11.76342964
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: 17.81115723, Converted: 6.04772711
+ Biggest difference in row (0, 0), sum 106.496719 vs 84.186646
+
+Layer 11, Token 13 (model.layers.out comparison):
+  Original tensor sum: 197.848022
+  Converted tensor sum: 191.943436
+  Original tensor mean: 24.731003
+  Converted tensor mean: 23.992929
+ Mean difference: 3.31890941
+ Maximum pointwise difference: 10.13029099
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: 27.00849915, Converted: 16.87820816
+ Biggest difference in row (0, 0), sum 197.848022 vs 191.943436
+
+Layer 12, Token 13 (model.layers.out comparison):
+  Original tensor sum: 197.513275
+  Converted tensor sum: 189.807312
+  Original tensor mean: 24.689159
+  Converted tensor mean: 23.725914
+ Mean difference: 3.50938702
+ Maximum pointwise difference: 10.66487598
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: 26.39979744, Converted: 15.73492146
+ Biggest difference in row (0, 0), sum 197.513275 vs 189.807312
+
+Layer 13, Token 13 (model.layers.out comparison):
+  Original tensor sum: 193.055618
+  Converted tensor sum: 185.801392
+  Original tensor mean: 24.131952
+  Converted tensor mean: 23.225174
+ Mean difference: 3.32275867
+ Maximum pointwise difference: 10.17280674
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: 25.57653046, Converted: 15.40372372
+ Biggest difference in row (0, 0), sum 193.055618 vs 185.801392
+
+Layer 14, Token 13 (model.layers.out comparison):
+  Original tensor sum: 190.084717
+  Converted tensor sum: 186.092697
+  Original tensor mean: 23.760590
+  Converted tensor mean: 23.261587
+ Mean difference: 3.19069362
+ Maximum pointwise difference: 9.42493057
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: 24.81001282, Converted: 15.38508224
+ Biggest difference in row (0, 0), sum 190.084717 vs 186.092697
+
+Layer 15, Token 13 (model.layers.out comparison):
+  Original tensor sum: 319.170319
+  Converted tensor sum: 323.837036
+  Original tensor mean: 39.896290
+  Converted tensor mean: 40.479630
+ Mean difference: 3.55193925
+ Maximum pointwise difference: 8.15688324
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: 46.74212265, Converted: 38.58523941
+ Biggest difference in row (0, 0), sum 319.170319 vs 323.837036
+
+Layer 0, Token 14 (model.layers.out comparison):
+  Original tensor sum: 60.062901
+  Converted tensor sum: 42.401054
+  Original tensor mean: 7.507863
+  Converted tensor mean: 5.300132
+ Mean difference: 2.97920632
+ Maximum pointwise difference: 7.75320148
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: 13.24933434, Converted: 5.49613285
+ Biggest difference in row (0, 0), sum 60.062901 vs 42.401054
+
+Layer 1, Token 14 (model.layers.out comparison):
+  Original tensor sum: 48.843086
+  Converted tensor sum: 34.002205
+  Original tensor mean: 6.105386
+  Converted tensor mean: 4.250276
+ Mean difference: 2.82561874
+ Maximum pointwise difference: 7.41196299
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: 12.44728756, Converted: 5.03532457
+ Biggest difference in row (0, 0), sum 48.843086 vs 34.002205
+
+Layer 2, Token 14 (model.layers.out comparison):
+  Original tensor sum: 49.100876
+  Converted tensor sum: 29.831078
+  Original tensor mean: 6.137609
+  Converted tensor mean: 3.728885
+ Mean difference: 3.44625640
+ Maximum pointwise difference: 8.00705624
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: 12.05760670, Converted: 4.05055046
+ Biggest difference in row (0, 0), sum 49.100876 vs 29.831078
+
+Layer 3, Token 14 (model.layers.out comparison):
+  Original tensor sum: 94.051392
+  Converted tensor sum: 85.936119
+  Original tensor mean: 11.756424
+  Converted tensor mean: 10.742015
+ Mean difference: 3.43988085
+ Maximum pointwise difference: 6.90394783
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: 18.44681168, Converted: 11.54286385
+ Biggest difference in row (0, 0), sum 94.051392 vs 85.936119
+
+Layer 4, Token 14 (model.layers.out comparison):
+  Original tensor sum: 90.357742
+  Converted tensor sum: 82.357994
+  Original tensor mean: 11.294718
+  Converted tensor mean: 10.294749
+ Mean difference: 3.55732656
+ Maximum pointwise difference: 7.83766174
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: 19.03264809, Converted: 11.19498634
+ Biggest difference in row (0, 0), sum 90.357742 vs 82.357994
+
+Layer 5, Token 14 (model.layers.out comparison):
+  Original tensor sum: 84.158882
+  Converted tensor sum: 72.302864
+  Original tensor mean: 10.519860
+  Converted tensor mean: 9.037858
+ Mean difference: 3.79493260
+ Maximum pointwise difference: 9.27737904
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: 18.81698799, Converted: 9.53960896
+ Biggest difference in row (0, 0), sum 84.158882 vs 72.302864
+
+Layer 6, Token 14 (model.layers.out comparison):
+  Original tensor sum: 82.342606
+  Converted tensor sum: 74.838448
+  Original tensor mean: 10.292826
+  Converted tensor mean: 9.354806
+ Mean difference: 3.72385550
+ Maximum pointwise difference: 8.27861023
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: 18.35614967, Converted: 10.07753944
+ Biggest difference in row (0, 0), sum 82.342606 vs 74.838448
+
+Layer 7, Token 14 (model.layers.out comparison):
+  Original tensor sum: 152.811584
+  Converted tensor sum: 143.282593
+  Original tensor mean: 19.101448
+  Converted tensor mean: 17.910324
+ Mean difference: 3.79641771
+ Maximum pointwise difference: 8.94160843
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: 28.97978973, Converted: 20.03818130
+ Biggest difference in row (0, 0), sum 152.811584 vs 143.282593
+
+Layer 8, Token 14 (model.layers.out comparison):
+  Original tensor sum: 134.962891
+  Converted tensor sum: 135.762573
+  Original tensor mean: 16.870361
+  Converted tensor mean: 16.970322
+ Mean difference: 3.42910838
+ Maximum pointwise difference: 6.22266769
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: 27.13297844, Converted: 20.91031075
+ Biggest difference in row (0, 0), sum 134.962891 vs 135.762573
+
+Layer 9, Token 14 (model.layers.out comparison):
+  Original tensor sum: 131.262939
+  Converted tensor sum: 130.663895
+  Original tensor mean: 16.407867
+  Converted tensor mean: 16.332987
+ Mean difference: 3.14643574
+ Maximum pointwise difference: 6.41224289
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: 25.90853310, Converted: 19.49629021
+ Biggest difference in row (0, 0), sum 131.262939 vs 130.663895
+
+Layer 10, Token 14 (model.layers.out comparison):
+  Original tensor sum: 130.994781
+  Converted tensor sum: 121.948547
+  Original tensor mean: 16.374348
+  Converted tensor mean: 15.243568
+ Mean difference: 3.14505911
+ Maximum pointwise difference: 6.92271805
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: 25.71545982, Converted: 18.79274178
+ Biggest difference in row (0, 0), sum 130.994781 vs 121.948547
+
+Layer 11, Token 14 (model.layers.out comparison):
+  Original tensor sum: 227.322296
+  Converted tensor sum: 221.945038
+  Original tensor mean: 28.415287
+  Converted tensor mean: 27.743130
+ Mean difference: 2.92038918
+ Maximum pointwise difference: 6.72454262
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: 35.09742355, Converted: 28.37288094
+ Biggest difference in row (0, 0), sum 227.322296 vs 221.945038
+
+Layer 12, Token 14 (model.layers.out comparison):
+  Original tensor sum: 226.411957
+  Converted tensor sum: 219.124207
+  Original tensor mean: 28.301495
+  Converted tensor mean: 27.390526
+ Mean difference: 3.00309324
+ Maximum pointwise difference: 5.31435776
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: 32.55270767, Converted: 27.23834991
+ Biggest difference in row (0, 0), sum 226.411957 vs 219.124207
+
+Layer 13, Token 14 (model.layers.out comparison):
+  Original tensor sum: 222.480804
+  Converted tensor sum: 215.029236
+  Original tensor mean: 27.810101
+  Converted tensor mean: 26.878654
+ Mean difference: 3.01644969
+ Maximum pointwise difference: 5.75550079
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: 32.34063721, Converted: 26.58513641
+ Biggest difference in row (0, 0), sum 222.480804 vs 215.029236
+
+Layer 14, Token 14 (model.layers.out comparison):
+  Original tensor sum: 217.584625
+  Converted tensor sum: 210.219940
+  Original tensor mean: 27.198078
+  Converted tensor mean: 26.277493
+ Mean difference: 3.42921877
+ Maximum pointwise difference: 5.59035873
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: 31.23370743, Converted: 25.64334869
+ Biggest difference in row (0, 0), sum 217.584625 vs 210.219940
+
+Layer 15, Token 14 (model.layers.out comparison):
+  Original tensor sum: 347.902100
+  Converted tensor sum: 344.275635
+  Original tensor mean: 43.487762
+  Converted tensor mean: 43.034454
+ Mean difference: 3.27294016
+ Maximum pointwise difference: 5.50515747
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: 41.73074341, Converted: 47.23590088
+ Biggest difference in row (0, 0), sum 347.902100 vs 344.275635
+
+Layer 0, Token 15 (model.layers.out comparison):
+  Original tensor sum: 2.268566
+  Converted tensor sum: -1.956201
+  Original tensor mean: 0.283571
+  Converted tensor mean: -0.244525
+ Mean difference: 1.30659735
+ Maximum pointwise difference: 3.65664506
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: 3.25675011, Converted: -0.39989486
+ Biggest difference in row (0, 0), sum 2.268566 vs -1.956201
+
+Layer 1, Token 15 (model.layers.out comparison):
+  Original tensor sum: -3.244995
+  Converted tensor sum: -0.596967
+  Original tensor mean: -0.405624
+  Converted tensor mean: -0.074621
+ Mean difference: 1.73462176
+ Maximum pointwise difference: 3.99903250
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: 2.07227492, Converted: -1.92675745
+ Biggest difference in row (0, 0), sum -3.244995 vs -0.596967
+
+Layer 2, Token 15 (model.layers.out comparison):
+  Original tensor sum: 18.643393
+  Converted tensor sum: -7.624215
+  Original tensor mean: 2.330424
+  Converted tensor mean: -0.953027
+ Mean difference: 3.99837518
+ Maximum pointwise difference: 9.85657215
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: 9.41628456, Converted: -0.44028741
+ Biggest difference in row (0, 0), sum 18.643393 vs -7.624215
+
+Layer 3, Token 15 (model.layers.out comparison):
+  Original tensor sum: 77.711205
+  Converted tensor sum: -115.602707
+  Original tensor mean: 9.713901
+  Converted tensor mean: -14.450338
+ Mean difference: 24.16423798
+ Maximum pointwise difference: 33.14313507
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: 17.84219551, Converted: -15.30093956
+ Biggest difference in row (0, 0), sum 77.711205 vs -115.602707
+
+Layer 4, Token 15 (model.layers.out comparison):
+  Original tensor sum: 71.264816
+  Converted tensor sum: -87.184593
+  Original tensor mean: 8.908102
+  Converted tensor mean: -10.898074
+ Mean difference: 19.80617714
+ Maximum pointwise difference: 27.60903931
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: 16.59056091, Converted: -11.01847839
+ Biggest difference in row (0, 0), sum 71.264816 vs -87.184593
+
+Layer 5, Token 15 (model.layers.out comparison):
+  Original tensor sum: 65.154488
+  Converted tensor sum: -20.586208
+  Original tensor mean: 8.144311
+  Converted tensor mean: -2.573276
+ Mean difference: 11.36003971
+ Maximum pointwise difference: 17.89420700
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: 3.50937057, Converted: -14.38483620
+ Biggest difference in row (0, 0), sum 65.154488 vs -20.586208
+
+Layer 6, Token 15 (model.layers.out comparison):
+  Original tensor sum: 62.447323
+  Converted tensor sum: -39.734089
+  Original tensor mean: 7.805915
+  Converted tensor mean: -4.966761
+ Mean difference: 12.77267647
+ Maximum pointwise difference: 22.75133705
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: 14.95188141, Converted: -7.79945612
+ Biggest difference in row (0, 0), sum 62.447323 vs -39.734089
+
+Layer 7, Token 15 (model.layers.out comparison):
+  Original tensor sum: 127.895920
+  Converted tensor sum: -184.804230
+  Original tensor mean: 15.986990
+  Converted tensor mean: -23.100529
+ Mean difference: 39.08751678
+ Maximum pointwise difference: 51.54846191
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: 16.32706261, Converted: -35.22139740
+ Biggest difference in row (0, 0), sum 127.895920 vs -184.804230
+
+Layer 8, Token 15 (model.layers.out comparison):
+  Original tensor sum: 109.946281
+  Converted tensor sum: -183.545380
+  Original tensor mean: 13.743285
+  Converted tensor mean: -22.943172
+ Mean difference: 36.68645859
+ Maximum pointwise difference: 44.14192963
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: 21.54407120, Converted: -22.59785843
+ Biggest difference in row (0, 0), sum 109.946281 vs -183.545380
+
+Layer 9, Token 15 (model.layers.out comparison):
+  Original tensor sum: 100.719040
+  Converted tensor sum: -189.035889
+  Original tensor mean: 12.589880
+  Converted tensor mean: -23.629486
+ Mean difference: 36.21936798
+ Maximum pointwise difference: 49.71876526
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: 20.62917519, Converted: -29.08958817
+ Biggest difference in row (0, 0), sum 100.719040 vs -189.035889
+
+Layer 10, Token 15 (model.layers.out comparison):
+  Original tensor sum: 94.437965
+  Converted tensor sum: -184.073608
+  Original tensor mean: 11.804746
+  Converted tensor mean: -23.009201
+ Mean difference: 34.81394577
+ Maximum pointwise difference: 49.50559998
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: 19.98403168, Converted: -29.52156830
+ Biggest difference in row (0, 0), sum 94.437965 vs -184.073608
+
+Layer 11, Token 15 (model.layers.out comparison):
+  Original tensor sum: 187.329086
+  Converted tensor sum: -525.129150
+  Original tensor mean: 23.416136
+  Converted tensor mean: -65.641144
+ Mean difference: 89.05728149
+ Maximum pointwise difference: 114.85643005
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: 32.40055466, Converted: -82.45587158
+ Biggest difference in row (0, 0), sum 187.329086 vs -525.129150
+
+Layer 12, Token 15 (model.layers.out comparison):
+  Original tensor sum: 189.391296
+  Converted tensor sum: -524.645203
+  Original tensor mean: 23.673912
+  Converted tensor mean: -65.580650
+ Mean difference: 89.25456238
+ Maximum pointwise difference: 119.02915955
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: 33.67853165, Converted: -85.35062408
+ Biggest difference in row (0, 0), sum 189.391296 vs -524.645203
+
+Layer 13, Token 15 (model.layers.out comparison):
+  Original tensor sum: 183.008652
+  Converted tensor sum: -545.134033
+  Original tensor mean: 22.876081
+  Converted tensor mean: -68.141754
+ Mean difference: 91.01783752
+ Maximum pointwise difference: 119.28398132
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: 33.81208420, Converted: -85.47189331
+ Biggest difference in row (0, 0), sum 183.008652 vs -545.134033
+
+Layer 14, Token 15 (model.layers.out comparison):
+  Original tensor sum: 179.184265
+  Converted tensor sum: -590.197998
+  Original tensor mean: 22.398033
+  Converted tensor mean: -73.774750
+ Mean difference: 96.17278290
+ Maximum pointwise difference: 126.14685059
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: 33.16656876, Converted: -92.98027802
+ Biggest difference in row (0, 0), sum 179.184265 vs -590.197998
+
+Layer 15, Token 15 (model.layers.out comparison):
+  Original tensor sum: 315.300140
+  Converted tensor sum: -976.074097
+  Original tensor mean: 39.412518
+  Converted tensor mean: -122.009262
+ Mean difference: 161.42178345
+ Maximum pointwise difference: 201.52458191
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: 52.86392212, Converted: -148.66065979
+ Biggest difference in row (0, 0), sum 315.300140 vs -976.074097
+
+Layer 0, Token 16 (model.layers.out comparison):
+  Original tensor sum: 12.044241
+  Converted tensor sum: 14.548074
+  Original tensor mean: 1.505530
+  Converted tensor mean: 1.818509
+ Mean difference: 3.51175261
+ Maximum pointwise difference: 7.44231224
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: -4.31869221, Converted: 3.12362027
+ Biggest difference in row (0, 0), sum 12.044241 vs 14.548074
+
+Layer 1, Token 16 (model.layers.out comparison):
+  Original tensor sum: 7.660315
+  Converted tensor sum: 1.425261
+  Original tensor mean: 0.957539
+  Converted tensor mean: 0.178158
+ Mean difference: 4.00331783
+ Maximum pointwise difference: 8.79326248
+ Max difference location: (0, 0, 7)
+  Values at max diff - Original: 3.55122566, Converted: -5.24203634
+ Biggest difference in row (0, 0), sum 7.660315 vs 1.425261
+
+Layer 2, Token 16 (model.layers.out comparison):
+  Original tensor sum: 5.985608
+  Converted tensor sum: -2.881522
+  Original tensor mean: 0.748201
+  Converted tensor mean: -0.360190
+ Mean difference: 6.00233269
+ Maximum pointwise difference: 9.75814056
+ Max difference location: (0, 0, 7)
+  Values at max diff - Original: 3.30634618, Converted: -6.45179462
+ Biggest difference in row (0, 0), sum 5.985608 vs -2.881522
+
+Layer 3, Token 16 (model.layers.out comparison):
+  Original tensor sum: 66.644623
+  Converted tensor sum: 38.471397
+  Original tensor mean: 8.330578
+  Converted tensor mean: 4.808925
+ Mean difference: 5.99987411
+ Maximum pointwise difference: 11.70975304
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: 11.37678432, Converted: -0.33296829
+ Biggest difference in row (0, 0), sum 66.644623 vs 38.471397
+
+Layer 4, Token 16 (model.layers.out comparison):
+  Original tensor sum: 55.084259
+  Converted tensor sum: 39.585022
+  Original tensor mean: 6.885532
+  Converted tensor mean: 4.948128
+ Mean difference: 5.54818344
+ Maximum pointwise difference: 10.42512989
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: 8.96806908, Converted: -1.45706093
+ Biggest difference in row (0, 0), sum 55.084259 vs 39.585022
+
+Layer 5, Token 16 (model.layers.out comparison):
+  Original tensor sum: 47.768257
+  Converted tensor sum: 29.551674
+  Original tensor mean: 5.971032
+  Converted tensor mean: 3.693959
+ Mean difference: 5.40017319
+ Maximum pointwise difference: 11.83149147
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: 9.62209320, Converted: -2.20939875
+ Biggest difference in row (0, 0), sum 47.768257 vs 29.551674
+
+Layer 6, Token 16 (model.layers.out comparison):
+  Original tensor sum: 47.378487
+  Converted tensor sum: 33.471664
+  Original tensor mean: 5.922311
+  Converted tensor mean: 4.183958
+ Mean difference: 5.35756683
+ Maximum pointwise difference: 11.70071220
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: 10.01993370, Converted: -1.68077850
+ Biggest difference in row (0, 0), sum 47.378487 vs 33.471664
+
+Layer 7, Token 16 (model.layers.out comparison):
+  Original tensor sum: 121.329849
+  Converted tensor sum: 101.072693
+  Original tensor mean: 15.166231
+  Converted tensor mean: 12.634087
+ Mean difference: 4.85845757
+ Maximum pointwise difference: 11.92098331
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: 18.39835739, Converted: 6.47737408
+ Biggest difference in row (0, 0), sum 121.329849 vs 101.072693
+
+Layer 8, Token 16 (model.layers.out comparison):
+  Original tensor sum: 105.626358
+  Converted tensor sum: 92.869370
+  Original tensor mean: 13.203295
+  Converted tensor mean: 11.608671
+ Mean difference: 5.01301622
+ Maximum pointwise difference: 11.09072685
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: 15.02331066, Converted: 3.93258405
+ Biggest difference in row (0, 0), sum 105.626358 vs 92.869370
+
+Layer 9, Token 16 (model.layers.out comparison):
+  Original tensor sum: 94.886589
+  Converted tensor sum: 86.461792
+  Original tensor mean: 11.860824
+  Converted tensor mean: 10.807724
+ Mean difference: 5.16425228
+ Maximum pointwise difference: 10.79585648
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 2.03169847, Converted: 12.82755470
+ Biggest difference in row (0, 0), sum 94.886589 vs 86.461792
+
+Layer 10, Token 16 (model.layers.out comparison):
+  Original tensor sum: 93.657555
+  Converted tensor sum: 77.932861
+  Original tensor mean: 11.707194
+  Converted tensor mean: 9.741608
+ Mean difference: 5.07010078
+ Maximum pointwise difference: 11.53797054
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: 13.33782196, Converted: 1.79985178
+ Biggest difference in row (0, 0), sum 93.657555 vs 77.932861
+
+Layer 11, Token 16 (model.layers.out comparison):
+  Original tensor sum: 186.086578
+  Converted tensor sum: 176.759811
+  Original tensor mean: 23.260822
+  Converted tensor mean: 22.094976
+ Mean difference: 4.87584686
+ Maximum pointwise difference: 10.12077332
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 14.05643463, Converted: 24.17720795
+ Biggest difference in row (0, 0), sum 186.086578 vs 176.759811
+
+Layer 12, Token 16 (model.layers.out comparison):
+  Original tensor sum: 188.253220
+  Converted tensor sum: 173.150467
+  Original tensor mean: 23.531652
+  Converted tensor mean: 21.643808
+ Mean difference: 5.08278847
+ Maximum pointwise difference: 9.91738033
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 14.99966526, Converted: 24.91704559
+ Biggest difference in row (0, 0), sum 188.253220 vs 173.150467
+
+Layer 13, Token 16 (model.layers.out comparison):
+  Original tensor sum: 181.761749
+  Converted tensor sum: 171.658249
+  Original tensor mean: 22.720219
+  Converted tensor mean: 21.457281
+ Mean difference: 4.79229736
+ Maximum pointwise difference: 9.82627106
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 14.16268539, Converted: 23.98895645
+ Biggest difference in row (0, 0), sum 181.761749 vs 171.658249
+
+Layer 14, Token 16 (model.layers.out comparison):
+  Original tensor sum: 176.198990
+  Converted tensor sum: 170.420898
+  Original tensor mean: 22.024874
+  Converted tensor mean: 21.302612
+ Mean difference: 4.28427029
+ Maximum pointwise difference: 9.05801964
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 13.50310326, Converted: 22.56112289
+ Biggest difference in row (0, 0), sum 176.198990 vs 170.420898
+
+Layer 15, Token 16 (model.layers.out comparison):
+  Original tensor sum: 314.888916
+  Converted tensor sum: 308.839905
+  Original tensor mean: 39.361115
+  Converted tensor mean: 38.604988
+ Mean difference: 4.36002254
+ Maximum pointwise difference: 9.44413185
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 25.14219856, Converted: 34.58633041
+ Biggest difference in row (0, 0), sum 314.888916 vs 308.839905
+
+Layer 0, Token 17 (model.layers.out comparison):
+  Original tensor sum: 6.615214
+  Converted tensor sum: -14.476066
+  Original tensor mean: 0.826902
+  Converted tensor mean: -1.809508
+ Mean difference: 4.01758480
+ Maximum pointwise difference: 12.95696259
+ Max difference location: (0, 0, 5)
+  Values at max diff - Original: 8.16467762, Converted: -4.79228544
+ Biggest difference in row (0, 0), sum 6.615214 vs -14.476066
+
+Layer 1, Token 17 (model.layers.out comparison):
+  Original tensor sum: 4.332821
+  Converted tensor sum: -48.476418
+  Original tensor mean: 0.541603
+  Converted tensor mean: -6.059552
+ Mean difference: 8.00736046
+ Maximum pointwise difference: 13.83443928
+ Max difference location: (0, 0, 5)
+  Values at max diff - Original: 7.88728952, Converted: -5.94714975
+ Biggest difference in row (0, 0), sum 4.332821 vs -48.476418
+
+Layer 2, Token 17 (model.layers.out comparison):
+  Original tensor sum: 13.631664
+  Converted tensor sum: -24.375608
+  Original tensor mean: 1.703958
+  Converted tensor mean: -3.046951
+ Mean difference: 9.48411465
+ Maximum pointwise difference: 15.28743267
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 2.43811703, Converted: -12.84931564
+ Biggest difference in row (0, 0), sum 13.631664 vs -24.375608
+
+Layer 3, Token 17 (model.layers.out comparison):
+  Original tensor sum: 59.143936
+  Converted tensor sum: -80.541725
+  Original tensor mean: 7.392992
+  Converted tensor mean: -10.067716
+ Mean difference: 17.46070862
+ Maximum pointwise difference: 28.83273697
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 9.60771275, Converted: -19.22502327
+ Biggest difference in row (0, 0), sum 59.143936 vs -80.541725
+
+Layer 4, Token 17 (model.layers.out comparison):
+  Original tensor sum: 51.750626
+  Converted tensor sum: -81.567123
+  Original tensor mean: 6.468828
+  Converted tensor mean: -10.195890
+ Mean difference: 17.13005066
+ Maximum pointwise difference: 30.73341751
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 9.45896626, Converted: -21.27445221
+ Biggest difference in row (0, 0), sum 51.750626 vs -81.567123
+
+Layer 5, Token 17 (model.layers.out comparison):
+  Original tensor sum: 33.377792
+  Converted tensor sum: -8.966677
+  Original tensor mean: 4.172224
+  Converted tensor mean: -1.120835
+ Mean difference: 11.87618256
+ Maximum pointwise difference: 19.17303848
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: 0.59302533, Converted: 19.76606369
+ Biggest difference in row (0, 0), sum 33.377792 vs -8.966677
+
+Layer 6, Token 17 (model.layers.out comparison):
+  Original tensor sum: 34.373646
+  Converted tensor sum: -17.893101
+  Original tensor mean: 4.296706
+  Converted tensor mean: -2.236638
+ Mean difference: 12.44108009
+ Maximum pointwise difference: 21.66391373
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 7.57746935, Converted: -14.08644485
+ Biggest difference in row (0, 0), sum 34.373646 vs -17.893101
+
+Layer 7, Token 17 (model.layers.out comparison):
+  Original tensor sum: 117.899002
+  Converted tensor sum: -60.493092
+  Original tensor mean: 14.737375
+  Converted tensor mean: -7.561636
+ Mean difference: 22.75322723
+ Maximum pointwise difference: 41.73314667
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 20.46781158, Converted: -21.26533699
+ Biggest difference in row (0, 0), sum 117.899002 vs -60.493092
+
+Layer 8, Token 17 (model.layers.out comparison):
+  Original tensor sum: 102.151550
+  Converted tensor sum: -53.178627
+  Original tensor mean: 12.768944
+  Converted tensor mean: -6.647328
+ Mean difference: 21.35518456
+ Maximum pointwise difference: 40.89769745
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 18.52126884, Converted: -22.37642860
+ Biggest difference in row (0, 0), sum 102.151550 vs -53.178627
+
+Layer 9, Token 17 (model.layers.out comparison):
+  Original tensor sum: 90.451920
+  Converted tensor sum: -34.497658
+  Original tensor mean: 11.306490
+  Converted tensor mean: -4.312207
+ Mean difference: 18.82321548
+ Maximum pointwise difference: 37.83747864
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 18.08675385, Converted: -19.75072479
+ Biggest difference in row (0, 0), sum 90.451920 vs -34.497658
+
+Layer 10, Token 17 (model.layers.out comparison):
+  Original tensor sum: 87.881783
+  Converted tensor sum: -25.459152
+  Original tensor mean: 10.985223
+  Converted tensor mean: -3.182394
+ Mean difference: 17.43336678
+ Maximum pointwise difference: 35.29803467
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 17.46567726, Converted: -17.83235931
+ Biggest difference in row (0, 0), sum 87.881783 vs -25.459152
+
+Layer 11, Token 17 (model.layers.out comparison):
+  Original tensor sum: 185.306732
+  Converted tensor sum: -264.026886
+  Original tensor mean: 23.163342
+  Converted tensor mean: -33.003361
+ Mean difference: 56.16670227
+ Maximum pointwise difference: 73.40274048
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 27.15820312, Converted: -46.24454117
+ Biggest difference in row (0, 0), sum 185.306732 vs -264.026886
+
+Layer 12, Token 17 (model.layers.out comparison):
+  Original tensor sum: 186.018799
+  Converted tensor sum: -238.738007
+  Original tensor mean: 23.252350
+  Converted tensor mean: -29.842251
+ Mean difference: 53.09460068
+ Maximum pointwise difference: 71.14258575
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 27.12987900, Converted: -44.01270676
+ Biggest difference in row (0, 0), sum 186.018799 vs -238.738007
+
+Layer 13, Token 17 (model.layers.out comparison):
+  Original tensor sum: 178.633179
+  Converted tensor sum: -250.662323
+  Original tensor mean: 22.329147
+  Converted tensor mean: -31.332790
+ Mean difference: 53.66194153
+ Maximum pointwise difference: 72.33184814
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: 22.06610680, Converted: -50.26573944
+ Biggest difference in row (0, 0), sum 178.633179 vs -250.662323
+
+Layer 14, Token 17 (model.layers.out comparison):
+  Original tensor sum: 171.761902
+  Converted tensor sum: -301.707916
+  Original tensor mean: 21.470238
+  Converted tensor mean: -37.713490
+ Mean difference: 59.18372726
+ Maximum pointwise difference: 84.33922577
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: 21.43586349, Converted: -62.90336227
+ Biggest difference in row (0, 0), sum 171.761902 vs -301.707916
+
+Layer 15, Token 17 (model.layers.out comparison):
+  Original tensor sum: 313.503632
+  Converted tensor sum: -672.745667
+  Original tensor mean: 39.187954
+  Converted tensor mean: -84.093208
+ Mean difference: 123.28115845
+ Maximum pointwise difference: 153.27690125
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: 38.26152039, Converted: -115.01538086
+ Biggest difference in row (0, 0), sum 313.503632 vs -672.745667
+
+Layer 0, Token 18 (model.layers.out comparison):
+  Original tensor sum: 37.370514
+  Converted tensor sum: 2.800200
+  Original tensor mean: 4.671314
+  Converted tensor mean: 0.350025
+ Mean difference: 5.54810905
+ Maximum pointwise difference: 9.22967339
+ Max difference location: (0, 0, 5)
+  Values at max diff - Original: 9.28797436, Converted: 0.05830121
+ Biggest difference in row (0, 0), sum 37.370514 vs 2.800200
+
+Layer 1, Token 18 (model.layers.out comparison):
+  Original tensor sum: 27.386568
+  Converted tensor sum: -8.815313
+  Original tensor mean: 3.423321
+  Converted tensor mean: -1.101914
+ Mean difference: 5.46173763
+ Maximum pointwise difference: 11.23313618
+ Max difference location: (0, 0, 5)
+  Values at max diff - Original: 7.72619963, Converted: -3.50693655
+ Biggest difference in row (0, 0), sum 27.386568 vs -8.815313
+
+Layer 2, Token 18 (model.layers.out comparison):
+  Original tensor sum: 22.950966
+  Converted tensor sum: -26.951405
+  Original tensor mean: 2.868871
+  Converted tensor mean: -3.368926
+ Mean difference: 7.41814232
+ Maximum pointwise difference: 14.15112782
+ Max difference location: (0, 0, 5)
+  Values at max diff - Original: 7.93941879, Converted: -6.21170902
+ Biggest difference in row (0, 0), sum 22.950966 vs -26.951405
+
+Layer 3, Token 18 (model.layers.out comparison):
+  Original tensor sum: 75.358887
+  Converted tensor sum: -194.584152
+  Original tensor mean: 9.419861
+  Converted tensor mean: -24.323019
+ Mean difference: 33.74287796
+ Maximum pointwise difference: 39.03241730
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 16.72500801, Converted: -22.30740929
+ Biggest difference in row (0, 0), sum 75.358887 vs -194.584152
+
+Layer 4, Token 18 (model.layers.out comparison):
+  Original tensor sum: 63.885963
+  Converted tensor sum: -193.801666
+  Original tensor mean: 7.985745
+  Converted tensor mean: -24.225208
+ Mean difference: 32.21095276
+ Maximum pointwise difference: 39.04253769
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 15.83776665, Converted: -23.20477104
+ Biggest difference in row (0, 0), sum 63.885963 vs -193.801666
+
+Layer 5, Token 18 (model.layers.out comparison):
+  Original tensor sum: 51.427219
+  Converted tensor sum: -189.920349
+  Original tensor mean: 6.428402
+  Converted tensor mean: -23.740044
+ Mean difference: 30.16844559
+ Maximum pointwise difference: 35.64602280
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: 8.10052967, Converted: -27.54549408
+ Biggest difference in row (0, 0), sum 51.427219 vs -189.920349
+
+Layer 6, Token 18 (model.layers.out comparison):
+  Original tensor sum: 52.837097
+  Converted tensor sum: -237.793671
+  Original tensor mean: 6.604637
+  Converted tensor mean: -29.724209
+ Mean difference: 36.32884598
+ Maximum pointwise difference: 41.40105438
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 7.84163952, Converted: -33.55941391
+ Biggest difference in row (0, 0), sum 52.837097 vs -237.793671
+
+Layer 7, Token 18 (model.layers.out comparison):
+  Original tensor sum: 129.848618
+  Converted tensor sum: -405.475128
+  Original tensor mean: 16.231077
+  Converted tensor mean: -50.684391
+ Mean difference: 66.91546631
+ Maximum pointwise difference: 75.46723938
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: 22.33297348, Converted: -53.13426590
+ Biggest difference in row (0, 0), sum 129.848618 vs -405.475128
+
+Layer 8, Token 18 (model.layers.out comparison):
+  Original tensor sum: 112.813950
+  Converted tensor sum: -388.213379
+  Original tensor mean: 14.101744
+  Converted tensor mean: -48.526672
+ Mean difference: 62.62841415
+ Maximum pointwise difference: 74.58121490
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: 20.05025291, Converted: -54.53096390
+ Biggest difference in row (0, 0), sum 112.813950 vs -388.213379
+
+Layer 9, Token 18 (model.layers.out comparison):
+  Original tensor sum: 98.625351
+  Converted tensor sum: -428.683411
+  Original tensor mean: 12.328169
+  Converted tensor mean: -53.585426
+ Mean difference: 65.91359711
+ Maximum pointwise difference: 78.76679230
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: 5.12599134, Converted: -73.64080048
+ Biggest difference in row (0, 0), sum 98.625351 vs -428.683411
+
+Layer 10, Token 18 (model.layers.out comparison):
+  Original tensor sum: 93.009445
+  Converted tensor sum: -432.554626
+  Original tensor mean: 11.626181
+  Converted tensor mean: -54.069328
+ Mean difference: 65.69551086
+ Maximum pointwise difference: 76.13760376
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: 4.86473036, Converted: -71.27287292
+ Biggest difference in row (0, 0), sum 93.009445 vs -432.554626
+
+Layer 11, Token 18 (model.layers.out comparison):
+  Original tensor sum: 188.645950
+  Converted tensor sum: -772.146790
+  Original tensor mean: 23.580744
+  Converted tensor mean: -96.518349
+ Mean difference: 120.09909058
+ Maximum pointwise difference: 140.57998657
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: 17.72886276, Converted: -122.85112000
+ Biggest difference in row (0, 0), sum 188.645950 vs -772.146790
+
+Layer 12, Token 18 (model.layers.out comparison):
+  Original tensor sum: 191.028870
+  Converted tensor sum: -781.472900
+  Original tensor mean: 23.878609
+  Converted tensor mean: -97.684113
+ Mean difference: 121.56272888
+ Maximum pointwise difference: 143.88111877
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: 18.69833946, Converted: -125.18278503
+ Biggest difference in row (0, 0), sum 191.028870 vs -781.472900
+
+Layer 13, Token 18 (model.layers.out comparison):
+  Original tensor sum: 183.829086
+  Converted tensor sum: -808.856689
+  Original tensor mean: 22.978636
+  Converted tensor mean: -101.107086
+ Mean difference: 124.08572388
+ Maximum pointwise difference: 147.60656738
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: 18.44003105, Converted: -129.16653442
+ Biggest difference in row (0, 0), sum 183.829086 vs -808.856689
+
+Layer 14, Token 18 (model.layers.out comparison):
+  Original tensor sum: 177.643005
+  Converted tensor sum: -844.687622
+  Original tensor mean: 22.205376
+  Converted tensor mean: -105.585953
+ Mean difference: 127.79132843
+ Maximum pointwise difference: 148.00994873
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: 17.69933319, Converted: -130.31060791
+ Biggest difference in row (0, 0), sum 177.643005 vs -844.687622
+
+Layer 15, Token 18 (model.layers.out comparison):
+  Original tensor sum: 320.725769
+  Converted tensor sum: -1234.242676
+  Original tensor mean: 40.090721
+  Converted tensor mean: -154.280334
+ Mean difference: 194.37104797
+ Maximum pointwise difference: 225.51652527
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: 39.54684830, Converted: -185.96968079
+ Biggest difference in row (0, 0), sum 320.725769 vs -1234.242676
+
+Layer 0, Token 19 (model.layers.out comparison):
+  Original tensor sum: -9.932329
+  Converted tensor sum: -1.418950
+  Original tensor mean: -1.241541
+  Converted tensor mean: -0.177369
+ Mean difference: 1.91613591
+ Maximum pointwise difference: 5.37744808
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: -4.92564631, Converted: 0.45180166
+ Biggest difference in row (0, 0), sum -9.932329 vs -1.418950
+
+Layer 1, Token 19 (model.layers.out comparison):
+  Original tensor sum: -28.079020
+  Converted tensor sum: 4.360578
+  Original tensor mean: -3.509877
+  Converted tensor mean: 0.545072
+ Mean difference: 4.81566954
+ Maximum pointwise difference: 12.93084526
+ Max difference location: (0, 0, 5)
+  Values at max diff - Original: -11.63085365, Converted: 1.29999185
+ Biggest difference in row (0, 0), sum -28.079020 vs 4.360578
+
+Layer 2, Token 19 (model.layers.out comparison):
+  Original tensor sum: -9.719646
+  Converted tensor sum: 14.192688
+  Original tensor mean: -1.214956
+  Converted tensor mean: 1.774086
+ Mean difference: 5.83081627
+ Maximum pointwise difference: 15.01737213
+ Max difference location: (0, 0, 5)
+  Values at max diff - Original: -10.13109303, Converted: 4.88627911
+ Biggest difference in row (0, 0), sum -9.719646 vs 14.192688
+
+Layer 3, Token 19 (model.layers.out comparison):
+  Original tensor sum: -78.071198
+  Converted tensor sum: 44.287003
+  Original tensor mean: -9.758900
+  Converted tensor mean: 5.535875
+ Mean difference: 15.29477501
+ Maximum pointwise difference: 25.90341759
+ Max difference location: (0, 0, 5)
+  Values at max diff - Original: -17.29398918, Converted: 8.60942841
+ Biggest difference in row (0, 0), sum -78.071198 vs 44.287003
+
+Layer 4, Token 19 (model.layers.out comparison):
+  Original tensor sum: -17.936802
+  Converted tensor sum: 43.255585
+  Original tensor mean: -2.242100
+  Converted tensor mean: 5.406948
+ Mean difference: 9.52408981
+ Maximum pointwise difference: 16.11044312
+ Max difference location: (0, 0, 5)
+  Values at max diff - Original: -8.10731792, Converted: 8.00312424
+ Biggest difference in row (0, 0), sum -17.936802 vs 43.255585
+
+Layer 5, Token 19 (model.layers.out comparison):
+  Original tensor sum: 14.270342
+  Converted tensor sum: 40.868690
+  Original tensor mean: 1.783793
+  Converted tensor mean: 5.108586
+ Mean difference: 6.39925480
+ Maximum pointwise difference: 13.00582123
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: -10.49264050, Converted: 2.51318097
+ Biggest difference in row (0, 0), sum 14.270342 vs 40.868690
+
+Layer 6, Token 19 (model.layers.out comparison):
+  Original tensor sum: 8.770991
+  Converted tensor sum: 44.250122
+  Original tensor mean: 1.096374
+  Converted tensor mean: 5.531265
+ Mean difference: 7.05475235
+ Maximum pointwise difference: 14.57606697
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: -11.80261707, Converted: 2.77344990
+ Biggest difference in row (0, 0), sum 8.770991 vs 44.250122
+
+Layer 7, Token 19 (model.layers.out comparison):
+  Original tensor sum: 27.567080
+  Converted tensor sum: 110.976578
+  Original tensor mean: 3.445885
+  Converted tensor mean: 13.872072
+ Mean difference: 11.90625381
+ Maximum pointwise difference: 20.18301392
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: -9.75880718, Converted: 10.42420769
+ Biggest difference in row (0, 0), sum 27.567080 vs 110.976578
+
+Layer 8, Token 19 (model.layers.out comparison):
+  Original tensor sum: 12.723747
+  Converted tensor sum: 112.570312
+  Original tensor mean: 1.590468
+  Converted tensor mean: 14.071289
+ Mean difference: 12.89592552
+ Maximum pointwise difference: 20.84409904
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: -12.16371441, Converted: 8.68038464
+ Biggest difference in row (0, 0), sum 12.723747 vs 112.570312
+
+Layer 9, Token 19 (model.layers.out comparison):
+  Original tensor sum: 10.056442
+  Converted tensor sum: 106.334442
+  Original tensor mean: 1.257055
+  Converted tensor mean: 13.291805
+ Mean difference: 12.47594643
+ Maximum pointwise difference: 22.08431053
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: -14.35114861, Converted: 7.73316193
+ Biggest difference in row (0, 0), sum 10.056442 vs 106.334442
+
+Layer 10, Token 19 (model.layers.out comparison):
+  Original tensor sum: -1.989794
+  Converted tensor sum: 99.182007
+  Original tensor mean: -0.248724
+  Converted tensor mean: 12.397751
+ Mean difference: 13.17310143
+ Maximum pointwise difference: 24.05181694
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: -17.15268326, Converted: 6.89913368
+ Biggest difference in row (0, 0), sum -1.989794 vs 99.182007
+
+Layer 11, Token 19 (model.layers.out comparison):
+  Original tensor sum: 67.349617
+  Converted tensor sum: 188.920929
+  Original tensor mean: 8.418702
+  Converted tensor mean: 23.615116
+ Mean difference: 15.19641399
+ Maximum pointwise difference: 29.17947769
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: -10.99394608, Converted: 18.18553162
+ Biggest difference in row (0, 0), sum 67.349617 vs 188.920929
+
+Layer 12, Token 19 (model.layers.out comparison):
+  Original tensor sum: 65.645859
+  Converted tensor sum: 187.996002
+  Original tensor mean: 8.205732
+  Converted tensor mean: 23.499500
+ Mean difference: 15.29376984
+ Maximum pointwise difference: 29.97419739
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: -11.02998257, Converted: 18.94421577
+ Biggest difference in row (0, 0), sum 65.645859 vs 187.996002
+
+Layer 13, Token 19 (model.layers.out comparison):
+  Original tensor sum: 62.775318
+  Converted tensor sum: 186.939407
+  Original tensor mean: 7.846915
+  Converted tensor mean: 23.367426
+ Mean difference: 15.52051163
+ Maximum pointwise difference: 30.78374863
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: -11.59408474, Converted: 19.18966293
+ Biggest difference in row (0, 0), sum 62.775318 vs 186.939407
+
+Layer 14, Token 19 (model.layers.out comparison):
+  Original tensor sum: 66.572449
+  Converted tensor sum: 192.538483
+  Original tensor mean: 8.321556
+  Converted tensor mean: 24.067310
+ Mean difference: 15.74575615
+ Maximum pointwise difference: 32.26174927
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: -11.68817997, Converted: 20.57357025
+ Biggest difference in row (0, 0), sum 66.572449 vs 192.538483
+
+Layer 15, Token 19 (model.layers.out comparison):
+  Original tensor sum: 224.145126
+  Converted tensor sum: 325.050964
+  Original tensor mean: 28.018141
+  Converted tensor mean: 40.631371
+ Mean difference: 13.45689964
+ Maximum pointwise difference: 28.69198799
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: 9.77963829, Converted: 38.47162628
+ Biggest difference in row (0, 0), sum 224.145126 vs 325.050964
+
+Layer 0, Token 20 (model.layers.out comparison):
+  Original tensor sum: -29.569780
+  Converted tensor sum: 10.794893
+  Original tensor mean: -3.696223
+  Converted tensor mean: 1.349362
+ Mean difference: 6.44896221
+ Maximum pointwise difference: 13.91718292
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: -9.61637592, Converted: 4.30080748
+ Biggest difference in row (0, 0), sum -29.569780 vs 10.794893
+
+Layer 1, Token 20 (model.layers.out comparison):
+  Original tensor sum: 1.025735
+  Converted tensor sum: 6.199029
+  Original tensor mean: 0.128217
+  Converted tensor mean: 0.774879
+ Mean difference: 7.59240437
+ Maximum pointwise difference: 14.00857544
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: -9.50434303, Converted: 4.50423241
+ Biggest difference in row (0, 0), sum 1.025735 vs 6.199029
+
+Layer 2, Token 20 (model.layers.out comparison):
+  Original tensor sum: 17.293440
+  Converted tensor sum: 7.479863
+  Original tensor mean: 2.161680
+  Converted tensor mean: 0.934983
+ Mean difference: 7.88275719
+ Maximum pointwise difference: 14.18584061
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: -8.63929176, Converted: 5.54654837
+ Biggest difference in row (0, 0), sum 17.293440 vs 7.479863
+
+Layer 3, Token 20 (model.layers.out comparison):
+  Original tensor sum: 36.610168
+  Converted tensor sum: 49.467545
+  Original tensor mean: 4.576271
+  Converted tensor mean: 6.183443
+ Mean difference: 6.93841553
+ Maximum pointwise difference: 18.18937302
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: -5.24265194, Converted: 12.94672108
+ Biggest difference in row (0, 0), sum 36.610168 vs 49.467545
+
+Layer 4, Token 20 (model.layers.out comparison):
+  Original tensor sum: 29.254171
+  Converted tensor sum: 47.750710
+  Original tensor mean: 3.656771
+  Converted tensor mean: 5.968839
+ Mean difference: 7.21544361
+ Maximum pointwise difference: 18.54884338
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: -5.63391066, Converted: 12.91493225
+ Biggest difference in row (0, 0), sum 29.254171 vs 47.750710
+
+Layer 5, Token 20 (model.layers.out comparison):
+  Original tensor sum: 35.151703
+  Converted tensor sum: 48.878067
+  Original tensor mean: 4.393963
+  Converted tensor mean: 6.109758
+ Mean difference: 6.99968100
+ Maximum pointwise difference: 14.96766090
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: -3.65913010, Converted: 11.30853081
+ Biggest difference in row (0, 0), sum 35.151703 vs 48.878067
+
+Layer 6, Token 20 (model.layers.out comparison):
+  Original tensor sum: 30.034544
+  Converted tensor sum: 47.318748
+  Original tensor mean: 3.754318
+  Converted tensor mean: 5.914844
+ Mean difference: 7.24886227
+ Maximum pointwise difference: 14.76261425
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: -3.74199128, Converted: 11.02062321
+ Biggest difference in row (0, 0), sum 30.034544 vs 47.318748
+
+Layer 7, Token 20 (model.layers.out comparison):
+  Original tensor sum: 93.501678
+  Converted tensor sum: 109.843590
+  Original tensor mean: 11.687710
+  Converted tensor mean: 13.730449
+ Mean difference: 6.95008612
+ Maximum pointwise difference: 15.00504684
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 6.03040743, Converted: 21.03545380
+ Biggest difference in row (0, 0), sum 93.501678 vs 109.843590
+
+Layer 8, Token 20 (model.layers.out comparison):
+  Original tensor sum: 79.472687
+  Converted tensor sum: 102.823357
+  Original tensor mean: 9.934086
+  Converted tensor mean: 12.852920
+ Mean difference: 7.54766369
+ Maximum pointwise difference: 16.02755737
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 5.07132435, Converted: 21.09888077
+ Biggest difference in row (0, 0), sum 79.472687 vs 102.823357
+
+Layer 9, Token 20 (model.layers.out comparison):
+  Original tensor sum: 67.706139
+  Converted tensor sum: 99.777931
+  Original tensor mean: 8.463267
+  Converted tensor mean: 12.472241
+ Mean difference: 8.86232471
+ Maximum pointwise difference: 16.78725052
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 4.42850208, Converted: 21.21575165
+ Biggest difference in row (0, 0), sum 67.706139 vs 99.777931
+
+Layer 10, Token 20 (model.layers.out comparison):
+  Original tensor sum: 63.760403
+  Converted tensor sum: 96.691109
+  Original tensor mean: 7.970050
+  Converted tensor mean: 12.086389
+ Mean difference: 9.02034378
+ Maximum pointwise difference: 16.24016762
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 4.14136124, Converted: 20.38152885
+ Biggest difference in row (0, 0), sum 63.760403 vs 96.691109
+
+Layer 11, Token 20 (model.layers.out comparison):
+  Original tensor sum: 158.635681
+  Converted tensor sum: 194.330322
+  Original tensor mean: 19.829460
+  Converted tensor mean: 24.291290
+ Mean difference: 8.75148964
+ Maximum pointwise difference: 16.25316620
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 15.08195591, Converted: 31.33512306
+ Biggest difference in row (0, 0), sum 158.635681 vs 194.330322
+
+Layer 12, Token 20 (model.layers.out comparison):
+  Original tensor sum: 159.106079
+  Converted tensor sum: 194.084503
+  Original tensor mean: 19.888260
+  Converted tensor mean: 24.260563
+ Mean difference: 8.75931835
+ Maximum pointwise difference: 16.29665756
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 14.28990650, Converted: 30.58656502
+ Biggest difference in row (0, 0), sum 159.106079 vs 194.084503
+
+Layer 13, Token 20 (model.layers.out comparison):
+  Original tensor sum: 153.442200
+  Converted tensor sum: 186.870270
+  Original tensor mean: 19.180275
+  Converted tensor mean: 23.358784
+ Mean difference: 8.66864204
+ Maximum pointwise difference: 15.29904747
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 14.04843903, Converted: 29.34748650
+ Biggest difference in row (0, 0), sum 153.442200 vs 186.870270
+
+Layer 14, Token 20 (model.layers.out comparison):
+  Original tensor sum: 147.691605
+  Converted tensor sum: 175.338470
+  Original tensor mean: 18.461451
+  Converted tensor mean: 21.917309
+ Mean difference: 8.84063625
+ Maximum pointwise difference: 15.63497734
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 13.52752876, Converted: 29.16250610
+ Biggest difference in row (0, 0), sum 147.691605 vs 175.338470
+
+Layer 15, Token 20 (model.layers.out comparison):
+  Original tensor sum: 294.136749
+  Converted tensor sum: 310.250946
+  Original tensor mean: 36.767094
+  Converted tensor mean: 38.781368
+ Mean difference: 9.18845367
+ Maximum pointwise difference: 14.23109627
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 30.77650642, Converted: 45.00760269
+ Biggest difference in row (0, 0), sum 294.136749 vs 310.250946
+
+Layer 0, Token 21 (model.layers.out comparison):
+  Original tensor sum: -18.838482
+  Converted tensor sum: -1.325968
+  Original tensor mean: -2.354810
+  Converted tensor mean: -0.165746
+ Mean difference: 2.79272628
+ Maximum pointwise difference: 6.95248222
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: -6.02015686, Converted: 0.93232512
+ Biggest difference in row (0, 0), sum -18.838482 vs -1.325968
+
+Layer 1, Token 21 (model.layers.out comparison):
+  Original tensor sum: -6.250936
+  Converted tensor sum: -2.277201
+  Original tensor mean: -0.781367
+  Converted tensor mean: -0.284650
+ Mean difference: 5.05594349
+ Maximum pointwise difference: 9.99544907
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: 6.23908186, Converted: -3.75636768
+ Biggest difference in row (0, 0), sum -6.250936 vs -2.277201
+
+Layer 2, Token 21 (model.layers.out comparison):
+  Original tensor sum: -2.587172
+  Converted tensor sum: 0.977817
+  Original tensor mean: -0.323396
+  Converted tensor mean: 0.122227
+ Mean difference: 3.66970563
+ Maximum pointwise difference: 9.02869225
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: -7.89160728, Converted: 1.13708520
+ Biggest difference in row (0, 0), sum -2.587172 vs 0.977817
+
+Layer 3, Token 21 (model.layers.out comparison):
+  Original tensor sum: -37.525734
+  Converted tensor sum: 5.221979
+  Original tensor mean: -4.690717
+  Converted tensor mean: 0.652747
+ Mean difference: 6.04651690
+ Maximum pointwise difference: 12.38726807
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: -10.43804359, Converted: 1.94922423
+ Biggest difference in row (0, 0), sum -37.525734 vs 5.221979
+
+Layer 4, Token 21 (model.layers.out comparison):
+  Original tensor sum: 4.066291
+  Converted tensor sum: 13.447447
+  Original tensor mean: 0.508286
+  Converted tensor mean: 1.680931
+ Mean difference: 5.62788296
+ Maximum pointwise difference: 14.04961491
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: -10.56051826, Converted: 3.48909688
+ Biggest difference in row (0, 0), sum 4.066291 vs 13.447447
+
+Layer 5, Token 21 (model.layers.out comparison):
+  Original tensor sum: 22.123846
+  Converted tensor sum: 14.835675
+  Original tensor mean: 2.765481
+  Converted tensor mean: 1.854459
+ Mean difference: 5.25254917
+ Maximum pointwise difference: 11.90699482
+ Max difference location: (0, 0, 5)
+  Values at max diff - Original: 7.93798828, Converted: -3.96900630
+ Biggest difference in row (0, 0), sum 22.123846 vs 14.835675
+
+Layer 6, Token 21 (model.layers.out comparison):
+  Original tensor sum: 22.319403
+  Converted tensor sum: 11.047790
+  Original tensor mean: 2.789925
+  Converted tensor mean: 1.380974
+ Mean difference: 5.50898457
+ Maximum pointwise difference: 13.00136471
+ Max difference location: (0, 0, 5)
+  Values at max diff - Original: 7.73285818, Converted: -5.26850653
+ Biggest difference in row (0, 0), sum 22.319403 vs 11.047790
+
+Layer 7, Token 21 (model.layers.out comparison):
+  Original tensor sum: 55.420013
+  Converted tensor sum: 74.081238
+  Original tensor mean: 6.927502
+  Converted tensor mean: 9.260155
+ Mean difference: 5.90270138
+ Maximum pointwise difference: 12.46957588
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: -1.65777194, Converted: 10.81180382
+ Biggest difference in row (0, 0), sum 55.420013 vs 74.081238
+
+Layer 8, Token 21 (model.layers.out comparison):
+  Original tensor sum: 39.922848
+  Converted tensor sum: 72.282196
+  Original tensor mean: 4.990356
+  Converted tensor mean: 9.035275
+ Mean difference: 6.12995577
+ Maximum pointwise difference: 13.29505730
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: -3.36732197, Converted: 9.92773533
+ Biggest difference in row (0, 0), sum 39.922848 vs 72.282196
+
+Layer 9, Token 21 (model.layers.out comparison):
+  Original tensor sum: 29.193859
+  Converted tensor sum: 64.425896
+  Original tensor mean: 3.649232
+  Converted tensor mean: 8.053237
+ Mean difference: 6.22422409
+ Maximum pointwise difference: 14.81417084
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: -3.98170996, Converted: 10.83246040
+ Biggest difference in row (0, 0), sum 29.193859 vs 64.425896
+
+Layer 10, Token 21 (model.layers.out comparison):
+  Original tensor sum: 23.706369
+  Converted tensor sum: 55.726307
+  Original tensor mean: 2.963296
+  Converted tensor mean: 6.965788
+ Mean difference: 6.04786444
+ Maximum pointwise difference: 14.39242363
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: -4.86538124, Converted: 9.52704239
+ Biggest difference in row (0, 0), sum 23.706369 vs 55.726307
+
+Layer 11, Token 21 (model.layers.out comparison):
+  Original tensor sum: 123.990646
+  Converted tensor sum: 150.405350
+  Original tensor mean: 15.498831
+  Converted tensor mean: 18.800669
+ Mean difference: 5.61389732
+ Maximum pointwise difference: 14.70817947
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 6.49463272, Converted: 21.20281219
+ Biggest difference in row (0, 0), sum 123.990646 vs 150.405350
+
+Layer 12, Token 21 (model.layers.out comparison):
+  Original tensor sum: 120.701889
+  Converted tensor sum: 144.158798
+  Original tensor mean: 15.087736
+  Converted tensor mean: 18.019850
+ Mean difference: 5.24121237
+ Maximum pointwise difference: 14.31963730
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 5.24581337, Converted: 19.56545067
+ Biggest difference in row (0, 0), sum 120.701889 vs 144.158798
+
+Layer 13, Token 21 (model.layers.out comparison):
+  Original tensor sum: 114.196152
+  Converted tensor sum: 142.528229
+  Original tensor mean: 14.274519
+  Converted tensor mean: 17.816029
+ Mean difference: 5.27994871
+ Maximum pointwise difference: 14.28137684
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 4.50468159, Converted: 18.78605843
+ Biggest difference in row (0, 0), sum 114.196152 vs 142.528229
+
+Layer 14, Token 21 (model.layers.out comparison):
+  Original tensor sum: 109.654587
+  Converted tensor sum: 141.504807
+  Original tensor mean: 13.706823
+  Converted tensor mean: 17.688101
+ Mean difference: 5.26909733
+ Maximum pointwise difference: 15.04267311
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 3.59476113, Converted: 18.63743401
+ Biggest difference in row (0, 0), sum 109.654587 vs 141.504807
+
+Layer 15, Token 21 (model.layers.out comparison):
+  Original tensor sum: 258.799988
+  Converted tensor sum: 280.546570
+  Original tensor mean: 32.349998
+  Converted tensor mean: 35.068321
+ Mean difference: 5.38046169
+ Maximum pointwise difference: 13.79010963
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 21.50290108, Converted: 35.29301071
+ Biggest difference in row (0, 0), sum 258.799988 vs 280.546570
+
+Layer 0, Token 22 (model.layers.out comparison):
+  Original tensor sum: 22.958118
+  Converted tensor sum: -3.202849
+  Original tensor mean: 2.869765
+  Converted tensor mean: -0.400356
+ Mean difference: 4.91125917
+ Maximum pointwise difference: 8.36230850
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 9.21100616, Converted: 0.84869760
+ Biggest difference in row (0, 0), sum 22.958118 vs -3.202849
+
+Layer 1, Token 22 (model.layers.out comparison):
+  Original tensor sum: 25.125549
+  Converted tensor sum: -10.143456
+  Original tensor mean: 3.140694
+  Converted tensor mean: -1.267932
+ Mean difference: 5.89313412
+ Maximum pointwise difference: 9.59223843
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 9.62790585, Converted: 0.03566782
+ Biggest difference in row (0, 0), sum 25.125549 vs -10.143456
+
+Layer 2, Token 22 (model.layers.out comparison):
+  Original tensor sum: 27.315422
+  Converted tensor sum: -15.748328
+  Original tensor mean: 3.414428
+  Converted tensor mean: -1.968541
+ Mean difference: 8.97875500
+ Maximum pointwise difference: 14.55634785
+ Max difference location: (0, 0, 7)
+  Values at max diff - Original: 2.85774899, Converted: -11.69859886
+ Biggest difference in row (0, 0), sum 27.315422 vs -15.748328
+
+Layer 3, Token 22 (model.layers.out comparison):
+  Original tensor sum: 65.650429
+  Converted tensor sum: -88.889626
+  Original tensor mean: 8.206304
+  Converted tensor mean: -11.111203
+ Mean difference: 19.31750679
+ Maximum pointwise difference: 27.03379250
+ Max difference location: (0, 0, 7)
+  Values at max diff - Original: 6.11478758, Converted: -20.91900444
+ Biggest difference in row (0, 0), sum 65.650429 vs -88.889626
+
+Layer 4, Token 22 (model.layers.out comparison):
+  Original tensor sum: 61.788639
+  Converted tensor sum: -42.131989
+  Original tensor mean: 7.723580
+  Converted tensor mean: -5.266499
+ Mean difference: 12.99007797
+ Maximum pointwise difference: 18.81860924
+ Max difference location: (0, 0, 5)
+  Values at max diff - Original: 14.83176613, Converted: -3.98684263
+ Biggest difference in row (0, 0), sum 61.788639 vs -42.131989
+
+Layer 5, Token 22 (model.layers.out comparison):
+  Original tensor sum: 57.004955
+  Converted tensor sum: 4.555844
+  Original tensor mean: 7.125619
+  Converted tensor mean: 0.569481
+ Mean difference: 8.63973427
+ Maximum pointwise difference: 19.13692093
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: 2.54869914, Converted: -16.58822250
+ Biggest difference in row (0, 0), sum 57.004955 vs 4.555844
+
+Layer 6, Token 22 (model.layers.out comparison):
+  Original tensor sum: 54.908669
+  Converted tensor sum: -0.669161
+  Original tensor mean: 6.863584
+  Converted tensor mean: -0.083645
+ Mean difference: 8.70907402
+ Maximum pointwise difference: 18.54141235
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: 2.42641473, Converted: -16.11499786
+ Biggest difference in row (0, 0), sum 54.908669 vs -0.669161
+
+Layer 7, Token 22 (model.layers.out comparison):
+  Original tensor sum: 125.605499
+  Converted tensor sum: -1.624224
+  Original tensor mean: 15.700687
+  Converted tensor mean: -0.203028
+ Mean difference: 15.90371513
+ Maximum pointwise difference: 27.27110672
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: 10.48501492, Converted: -16.78609276
+ Biggest difference in row (0, 0), sum 125.605499 vs -1.624224
+
+Layer 8, Token 22 (model.layers.out comparison):
+  Original tensor sum: 109.340508
+  Converted tensor sum: -1.809371
+  Original tensor mean: 13.667563
+  Converted tensor mean: -0.226171
+ Mean difference: 14.19305420
+ Maximum pointwise difference: 24.19651794
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: 6.68867207, Converted: -17.50784492
+ Biggest difference in row (0, 0), sum 109.340508 vs -1.809371
+
+Layer 9, Token 22 (model.layers.out comparison):
+  Original tensor sum: 93.036400
+  Converted tensor sum: -10.185041
+  Original tensor mean: 11.629550
+  Converted tensor mean: -1.273130
+ Mean difference: 13.22967815
+ Maximum pointwise difference: 22.35823822
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: 2.86865139, Converted: -19.48958588
+ Biggest difference in row (0, 0), sum 93.036400 vs -10.185041
+
+Layer 10, Token 22 (model.layers.out comparison):
+  Original tensor sum: 85.756668
+  Converted tensor sum: -2.302891
+  Original tensor mean: 10.719584
+  Converted tensor mean: -0.287861
+ Mean difference: 11.49190331
+ Maximum pointwise difference: 20.63401985
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 19.00829315, Converted: -1.62572634
+ Biggest difference in row (0, 0), sum 85.756668 vs -2.302891
+
+Layer 11, Token 22 (model.layers.out comparison):
+  Original tensor sum: 182.162292
+  Converted tensor sum: -8.586711
+  Original tensor mean: 22.770287
+  Converted tensor mean: -1.073339
+ Mean difference: 23.84362602
+ Maximum pointwise difference: 34.19173050
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: 13.77398682, Converted: -20.41774368
+ Biggest difference in row (0, 0), sum 182.162292 vs -8.586711
+
+Layer 12, Token 22 (model.layers.out comparison):
+  Original tensor sum: 182.096252
+  Converted tensor sum: -6.677206
+  Original tensor mean: 22.762032
+  Converted tensor mean: -0.834651
+ Mean difference: 23.59668159
+ Maximum pointwise difference: 35.20670319
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: 14.20073891, Converted: -21.00596619
+ Biggest difference in row (0, 0), sum 182.096252 vs -6.677206
+
+Layer 13, Token 22 (model.layers.out comparison):
+  Original tensor sum: 176.400360
+  Converted tensor sum: -0.142300
+  Original tensor mean: 22.050045
+  Converted tensor mean: -0.017787
+ Mean difference: 22.06783295
+ Maximum pointwise difference: 34.37791824
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: 13.55050278, Converted: -20.82741547
+ Biggest difference in row (0, 0), sum 176.400360 vs -0.142300
+
+Layer 14, Token 22 (model.layers.out comparison):
+  Original tensor sum: 169.308212
+  Converted tensor sum: 22.573196
+  Original tensor mean: 21.163527
+  Converted tensor mean: 2.821650
+ Mean difference: 18.34187508
+ Maximum pointwise difference: 31.75983810
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: 11.94197941, Converted: -19.81785965
+ Biggest difference in row (0, 0), sum 169.308212 vs 22.573196
+
+Layer 15, Token 22 (model.layers.out comparison):
+  Original tensor sum: 321.080658
+  Converted tensor sum: 136.787018
+  Original tensor mean: 40.135082
+  Converted tensor mean: 17.098377
+ Mean difference: 23.03670502
+ Maximum pointwise difference: 37.83760452
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: 31.89689064, Converted: -5.94071388
+ Biggest difference in row (0, 0), sum 321.080658 vs 136.787018
+
+Layer 0, Token 23 (model.layers.out comparison):
+  Original tensor sum: 3.588341
+  Converted tensor sum: 9.359616
+  Original tensor mean: 0.448543
+  Converted tensor mean: 1.169952
+ Mean difference: 3.70246077
+ Maximum pointwise difference: 5.65140629
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: -1.32952428, Converted: 4.32188225
+ Biggest difference in row (0, 0), sum 3.588341 vs 9.359616
+
+Layer 1, Token 23 (model.layers.out comparison):
+  Original tensor sum: -13.513486
+  Converted tensor sum: 3.000220
+  Original tensor mean: -1.689186
+  Converted tensor mean: 0.375028
+ Mean difference: 3.49640799
+ Maximum pointwise difference: 8.52665997
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: -4.16102409, Converted: 4.36563587
+ Biggest difference in row (0, 0), sum -13.513486 vs 3.000220
+
+Layer 2, Token 23 (model.layers.out comparison):
+  Original tensor sum: -19.782562
+  Converted tensor sum: 4.253428
+  Original tensor mean: -2.472820
+  Converted tensor mean: 0.531678
+ Mean difference: 5.22110939
+ Maximum pointwise difference: 11.62318039
+ Max difference location: (0, 0, 5)
+  Values at max diff - Original: -9.56802559, Converted: 2.05515456
+ Biggest difference in row (0, 0), sum -19.782562 vs 4.253428
+
+Layer 3, Token 23 (model.layers.out comparison):
+  Original tensor sum: -117.794266
+  Converted tensor sum: 14.072861
+  Original tensor mean: -14.724283
+  Converted tensor mean: 1.759108
+ Mean difference: 16.48339081
+ Maximum pointwise difference: 22.75023079
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: -16.32844543, Converted: 6.42178583
+ Biggest difference in row (0, 0), sum -117.794266 vs 14.072861
+
+Layer 4, Token 23 (model.layers.out comparison):
+  Original tensor sum: -73.092270
+  Converted tensor sum: 6.691208
+  Original tensor mean: -9.136534
+  Converted tensor mean: 0.836401
+ Mean difference: 10.72612858
+ Maximum pointwise difference: 19.55576324
+ Max difference location: (0, 0, 5)
+  Values at max diff - Original: -16.38935280, Converted: 3.16641092
+ Biggest difference in row (0, 0), sum -73.092270 vs 6.691208
+
+Layer 5, Token 23 (model.layers.out comparison):
+  Original tensor sum: -37.015450
+  Converted tensor sum: 8.681388
+  Original tensor mean: -4.626931
+  Converted tensor mean: 1.085173
+ Mean difference: 7.61082363
+ Maximum pointwise difference: 20.55440712
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: -13.93057537, Converted: 6.62383223
+ Biggest difference in row (0, 0), sum -37.015450 vs 8.681388
+
+Layer 6, Token 23 (model.layers.out comparison):
+  Original tensor sum: -90.333237
+  Converted tensor sum: 7.396842
+  Original tensor mean: -11.291655
+  Converted tensor mean: 0.924605
+ Mean difference: 12.82605839
+ Maximum pointwise difference: 34.69086456
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: -27.39507294, Converted: 7.29579258
+ Biggest difference in row (0, 0), sum -90.333237 vs 7.396842
+
+Layer 7, Token 23 (model.layers.out comparison):
+  Original tensor sum: -214.526337
+  Converted tensor sum: 60.269241
+  Original tensor mean: -26.815792
+  Converted tensor mean: 7.533655
+ Mean difference: 34.34944916
+ Maximum pointwise difference: 55.60475159
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: -42.02355576, Converted: 13.58119678
+ Biggest difference in row (0, 0), sum -214.526337 vs 60.269241
+
+Layer 8, Token 23 (model.layers.out comparison):
+  Original tensor sum: -138.238464
+  Converted tensor sum: 48.862061
+  Original tensor mean: -17.279808
+  Converted tensor mean: 6.107758
+ Mean difference: 23.38756561
+ Maximum pointwise difference: 37.09150314
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: -25.42422676, Converted: 11.66727638
+ Biggest difference in row (0, 0), sum -138.238464 vs 48.862061
+
+Layer 9, Token 23 (model.layers.out comparison):
+  Original tensor sum: -129.366013
+  Converted tensor sum: 32.791050
+  Original tensor mean: -16.170752
+  Converted tensor mean: 4.098881
+ Mean difference: 20.26963234
+ Maximum pointwise difference: 31.74017334
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: -22.80648041, Converted: 8.93369293
+ Biggest difference in row (0, 0), sum -129.366013 vs 32.791050
+
+Layer 10, Token 23 (model.layers.out comparison):
+  Original tensor sum: -112.076103
+  Converted tensor sum: 33.542336
+  Original tensor mean: -14.009513
+  Converted tensor mean: 4.192792
+ Mean difference: 18.20230484
+ Maximum pointwise difference: 30.71049118
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: -22.22323608, Converted: 8.48725605
+ Biggest difference in row (0, 0), sum -112.076103 vs 33.542336
+
+Layer 11, Token 23 (model.layers.out comparison):
+  Original tensor sum: -392.294312
+  Converted tensor sum: 130.177963
+  Original tensor mean: -49.036789
+  Converted tensor mean: 16.272245
+ Mean difference: 65.30903625
+ Maximum pointwise difference: 80.34357452
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: -59.32800293, Converted: 21.01557350
+ Biggest difference in row (0, 0), sum -392.294312 vs 130.177963
+
+Layer 12, Token 23 (model.layers.out comparison):
+  Original tensor sum: -416.741821
+  Converted tensor sum: 126.312363
+  Original tensor mean: -52.092728
+  Converted tensor mean: 15.789045
+ Mean difference: 67.88177490
+ Maximum pointwise difference: 87.29133606
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: -65.95119476, Converted: 21.34013939
+ Biggest difference in row (0, 0), sum -416.741821 vs 126.312363
+
+Layer 13, Token 23 (model.layers.out comparison):
+  Original tensor sum: -420.622223
+  Converted tensor sum: 122.472458
+  Original tensor mean: -52.577778
+  Converted tensor mean: 15.309057
+ Mean difference: 67.88684082
+ Maximum pointwise difference: 89.02587891
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: -68.22624969, Converted: 20.79962921
+ Biggest difference in row (0, 0), sum -420.622223 vs 122.472458
+
+Layer 14, Token 23 (model.layers.out comparison):
+  Original tensor sum: -398.408966
+  Converted tensor sum: 120.881279
+  Original tensor mean: -49.801121
+  Converted tensor mean: 15.110160
+ Mean difference: 64.91127777
+ Maximum pointwise difference: 91.32544708
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: -69.91543579, Converted: 21.41001320
+ Biggest difference in row (0, 0), sum -398.408966 vs 120.881279
+
+Layer 15, Token 23 (model.layers.out comparison):
+  Original tensor sum: -754.637085
+  Converted tensor sum: 262.993530
+  Original tensor mean: -94.329636
+  Converted tensor mean: 32.874191
+ Mean difference: 127.20383453
+ Maximum pointwise difference: 157.75305176
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: -119.83902740, Converted: 37.91403198
+ Biggest difference in row (0, 0), sum -754.637085 vs 262.993530
+
+Layer 0, Token 24 (model.layers.out comparison):
+  Original tensor sum: 14.859251
+  Converted tensor sum: 2.731961
+  Original tensor mean: 1.857406
+  Converted tensor mean: 0.341495
+ Mean difference: 4.21605587
+ Maximum pointwise difference: 9.80887794
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 3.86449504, Converted: -5.94438314
+ Biggest difference in row (0, 0), sum 14.859251 vs 2.731961
+
+Layer 1, Token 24 (model.layers.out comparison):
+  Original tensor sum: 13.986740
+  Converted tensor sum: -2.697716
+  Original tensor mean: 1.748343
+  Converted tensor mean: -0.337215
+ Mean difference: 4.70033360
+ Maximum pointwise difference: 10.86390495
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 3.54197407, Converted: -7.32193136
+ Biggest difference in row (0, 0), sum 13.986740 vs -2.697716
+
+Layer 2, Token 24 (model.layers.out comparison):
+  Original tensor sum: 13.856454
+  Converted tensor sum: -1.915652
+  Original tensor mean: 1.732057
+  Converted tensor mean: -0.239456
+ Mean difference: 5.10369968
+ Maximum pointwise difference: 13.13724899
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 3.00663447, Converted: -10.13061428
+ Biggest difference in row (0, 0), sum 13.856454 vs -1.915652
+
+Layer 3, Token 24 (model.layers.out comparison):
+  Original tensor sum: 63.979485
+  Converted tensor sum: -50.051231
+  Original tensor mean: 7.997436
+  Converted tensor mean: -6.256404
+ Mean difference: 14.25383949
+ Maximum pointwise difference: 25.59371948
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 9.45009327, Converted: -16.14362717
+ Biggest difference in row (0, 0), sum 63.979485 vs -50.051231
+
+Layer 4, Token 24 (model.layers.out comparison):
+  Original tensor sum: 60.174347
+  Converted tensor sum: -64.423790
+  Original tensor mean: 7.521793
+  Converted tensor mean: -8.052974
+ Mean difference: 15.57476616
+ Maximum pointwise difference: 27.42375755
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 8.99039078, Converted: -18.43336678
+ Biggest difference in row (0, 0), sum 60.174347 vs -64.423790
+
+Layer 5, Token 24 (model.layers.out comparison):
+  Original tensor sum: 53.195156
+  Converted tensor sum: -88.183350
+  Original tensor mean: 6.649395
+  Converted tensor mean: -11.022919
+ Mean difference: 17.67231369
+ Maximum pointwise difference: 35.18456650
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 7.48332596, Converted: -27.70124054
+ Biggest difference in row (0, 0), sum 53.195156 vs -88.183350
+
+Layer 6, Token 24 (model.layers.out comparison):
+  Original tensor sum: 55.262775
+  Converted tensor sum: -106.113434
+  Original tensor mean: 6.907847
+  Converted tensor mean: -13.264179
+ Mean difference: 20.17202759
+ Maximum pointwise difference: 40.46305084
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 8.41111183, Converted: -32.05193710
+ Biggest difference in row (0, 0), sum 55.262775 vs -106.113434
+
+Layer 7, Token 24 (model.layers.out comparison):
+  Original tensor sum: 120.454941
+  Converted tensor sum: -239.645325
+  Original tensor mean: 15.056868
+  Converted tensor mean: -29.955666
+ Mean difference: 45.01253128
+ Maximum pointwise difference: 65.79338074
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 14.88038158, Converted: -50.91299820
+ Biggest difference in row (0, 0), sum 120.454941 vs -239.645325
+
+Layer 8, Token 24 (model.layers.out comparison):
+  Original tensor sum: 103.648430
+  Converted tensor sum: -223.958084
+  Original tensor mean: 12.956054
+  Converted tensor mean: -27.994761
+ Mean difference: 40.95081329
+ Maximum pointwise difference: 71.07021332
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 13.01342583, Converted: -58.05678558
+ Biggest difference in row (0, 0), sum 103.648430 vs -223.958084
+
+Layer 9, Token 24 (model.layers.out comparison):
+  Original tensor sum: 90.361565
+  Converted tensor sum: -216.935654
+  Original tensor mean: 11.295196
+  Converted tensor mean: -27.116957
+ Mean difference: 38.41215515
+ Maximum pointwise difference: 69.46690369
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 9.60147953, Converted: -59.86542511
+ Biggest difference in row (0, 0), sum 90.361565 vs -216.935654
+
+Layer 10, Token 24 (model.layers.out comparison):
+  Original tensor sum: 83.880753
+  Converted tensor sum: -215.275970
+  Original tensor mean: 10.485094
+  Converted tensor mean: -26.909496
+ Mean difference: 37.39459229
+ Maximum pointwise difference: 70.35929108
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 8.32141781, Converted: -62.03787613
+ Biggest difference in row (0, 0), sum 83.880753 vs -215.275970
+
+Layer 11, Token 24 (model.layers.out comparison):
+  Original tensor sum: 169.893204
+  Converted tensor sum: -521.842712
+  Original tensor mean: 21.236650
+  Converted tensor mean: -65.230339
+ Mean difference: 86.46699524
+ Maximum pointwise difference: 124.57461548
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 19.84806633, Converted: -104.72654724
+ Biggest difference in row (0, 0), sum 169.893204 vs -521.842712
+
+Layer 12, Token 24 (model.layers.out comparison):
+  Original tensor sum: 170.650391
+  Converted tensor sum: -527.495605
+  Original tensor mean: 21.331299
+  Converted tensor mean: -65.936951
+ Mean difference: 87.26824951
+ Maximum pointwise difference: 124.01423645
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 20.41718483, Converted: -103.59705353
+ Biggest difference in row (0, 0), sum 170.650391 vs -527.495605
+
+Layer 13, Token 24 (model.layers.out comparison):
+  Original tensor sum: 167.707260
+  Converted tensor sum: -525.824341
+  Original tensor mean: 20.963408
+  Converted tensor mean: -65.728043
+ Mean difference: 86.69145203
+ Maximum pointwise difference: 120.31568909
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 18.97763062, Converted: -101.33805847
+ Biggest difference in row (0, 0), sum 167.707260 vs -525.824341
+
+Layer 14, Token 24 (model.layers.out comparison):
+  Original tensor sum: 160.910034
+  Converted tensor sum: -562.698975
+  Original tensor mean: 20.113754
+  Converted tensor mean: -70.337372
+ Mean difference: 90.45112610
+ Maximum pointwise difference: 127.80590057
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 17.37784767, Converted: -110.42805481
+ Biggest difference in row (0, 0), sum 160.910034 vs -562.698975
+
+Layer 15, Token 24 (model.layers.out comparison):
+  Original tensor sum: 306.123810
+  Converted tensor sum: -931.621094
+  Original tensor mean: 38.265476
+  Converted tensor mean: -116.452637
+ Mean difference: 154.71810913
+ Maximum pointwise difference: 176.81520081
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 29.99453545, Converted: -146.82066345
+ Biggest difference in row (0, 0), sum 306.123810 vs -931.621094
+
+Layer 0, Token 25 (model.layers.out comparison):
+  Original tensor sum: -6.641135
+  Converted tensor sum: -3.933383
+  Original tensor mean: -0.830142
+  Converted tensor mean: -0.491673
+ Mean difference: 3.03462601
+ Maximum pointwise difference: 5.75030708
+ Max difference location: (0, 0, 7)
+  Values at max diff - Original: -6.01051331, Converted: -0.26020634
+ Biggest difference in row (0, 0), sum -6.641135 vs -3.933383
+
+Layer 1, Token 25 (model.layers.out comparison):
+  Original tensor sum: -1.642994
+  Converted tensor sum: -11.347046
+  Original tensor mean: -0.205374
+  Converted tensor mean: -1.418381
+ Mean difference: 2.82665110
+ Maximum pointwise difference: 5.44076443
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: -3.67477202, Converted: -9.11553669
+ Biggest difference in row (0, 0), sum -1.642994 vs -11.347046
+
+Layer 2, Token 25 (model.layers.out comparison):
+  Original tensor sum: 6.404377
+  Converted tensor sum: -14.681939
+  Original tensor mean: 0.800547
+  Converted tensor mean: -1.835242
+ Mean difference: 3.35868859
+ Maximum pointwise difference: 7.97232580
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: 5.46229649, Converted: -2.51002932
+ Biggest difference in row (0, 0), sum 6.404377 vs -14.681939
+
+Layer 3, Token 25 (model.layers.out comparison):
+  Original tensor sum: 73.178505
+  Converted tensor sum: -57.235046
+  Original tensor mean: 9.147313
+  Converted tensor mean: -7.154381
+ Mean difference: 16.30169487
+ Maximum pointwise difference: 20.31940651
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: 6.96401119, Converted: -13.35539532
+ Biggest difference in row (0, 0), sum 73.178505 vs -57.235046
+
+Layer 4, Token 25 (model.layers.out comparison):
+  Original tensor sum: 65.662933
+  Converted tensor sum: -75.145912
+  Original tensor mean: 8.207867
+  Converted tensor mean: -9.393239
+ Mean difference: 17.60110474
+ Maximum pointwise difference: 25.96934509
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: 5.62515926, Converted: -20.34418488
+ Biggest difference in row (0, 0), sum 65.662933 vs -75.145912
+
+Layer 5, Token 25 (model.layers.out comparison):
+  Original tensor sum: 54.107101
+  Converted tensor sum: -105.733917
+  Original tensor mean: 6.763388
+  Converted tensor mean: -13.216740
+ Mean difference: 19.98012924
+ Maximum pointwise difference: 28.99731064
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: 10.49883652, Converted: -18.49847412
+ Biggest difference in row (0, 0), sum 54.107101 vs -105.733917
+
+Layer 6, Token 25 (model.layers.out comparison):
+  Original tensor sum: 48.177361
+  Converted tensor sum: -134.772308
+  Original tensor mean: 6.022170
+  Converted tensor mean: -16.846539
+ Mean difference: 22.86870766
+ Maximum pointwise difference: 36.34035110
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: 2.22752476, Converted: -34.11282730
+ Biggest difference in row (0, 0), sum 48.177361 vs -134.772308
+
+Layer 7, Token 25 (model.layers.out comparison):
+  Original tensor sum: 111.839172
+  Converted tensor sum: -277.301056
+  Original tensor mean: 13.979897
+  Converted tensor mean: -34.662632
+ Mean difference: 48.64252853
+ Maximum pointwise difference: 62.89208221
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: 9.78997040, Converted: -53.10211182
+ Biggest difference in row (0, 0), sum 111.839172 vs -277.301056
+
+Layer 8, Token 25 (model.layers.out comparison):
+  Original tensor sum: 104.861267
+  Converted tensor sum: -286.217560
+  Original tensor mean: 13.107658
+  Converted tensor mean: -35.777195
+ Mean difference: 48.88484955
+ Maximum pointwise difference: 65.30915833
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: 20.24993896, Converted: -45.05921936
+ Biggest difference in row (0, 0), sum 104.861267 vs -286.217560
+
+Layer 9, Token 25 (model.layers.out comparison):
+  Original tensor sum: 96.630295
+  Converted tensor sum: -313.393005
+  Original tensor mean: 12.078787
+  Converted tensor mean: -39.174126
+ Mean difference: 51.25291061
+ Maximum pointwise difference: 67.83577728
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: 19.77431297, Converted: -48.06146622
+ Biggest difference in row (0, 0), sum 96.630295 vs -313.393005
+
+Layer 10, Token 25 (model.layers.out comparison):
+  Original tensor sum: 89.098160
+  Converted tensor sum: -316.188721
+  Original tensor mean: 11.137270
+  Converted tensor mean: -39.523590
+ Mean difference: 50.66085815
+ Maximum pointwise difference: 63.01490784
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: 18.63522339, Converted: -44.37968445
+ Biggest difference in row (0, 0), sum 89.098160 vs -316.188721
+
+Layer 11, Token 25 (model.layers.out comparison):
+  Original tensor sum: 183.329193
+  Converted tensor sum: -640.859741
+  Original tensor mean: 22.916149
+  Converted tensor mean: -80.107468
+ Mean difference: 103.02362061
+ Maximum pointwise difference: 123.61917114
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: 28.08130074, Converted: -95.53787231
+ Biggest difference in row (0, 0), sum 183.329193 vs -640.859741
+
+Layer 12, Token 25 (model.layers.out comparison):
+  Original tensor sum: 183.012512
+  Converted tensor sum: -647.243774
+  Original tensor mean: 22.876564
+  Converted tensor mean: -80.905472
+ Mean difference: 103.78203583
+ Maximum pointwise difference: 121.95301819
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: 28.78862381, Converted: -93.16439056
+ Biggest difference in row (0, 0), sum 183.012512 vs -647.243774
+
+Layer 13, Token 25 (model.layers.out comparison):
+  Original tensor sum: 179.038055
+  Converted tensor sum: -675.284363
+  Original tensor mean: 22.379757
+  Converted tensor mean: -84.410545
+ Mean difference: 106.79029846
+ Maximum pointwise difference: 124.18766785
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: 29.24967384, Converted: -94.93799591
+ Biggest difference in row (0, 0), sum 179.038055 vs -675.284363
+
+Layer 14, Token 25 (model.layers.out comparison):
+  Original tensor sum: 177.600830
+  Converted tensor sum: -653.687622
+  Original tensor mean: 22.200104
+  Converted tensor mean: -81.710953
+ Mean difference: 103.91105652
+ Maximum pointwise difference: 120.82553864
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: 21.10656929, Converted: -99.71897125
+ Biggest difference in row (0, 0), sum 177.600830 vs -653.687622
+
+Layer 15, Token 25 (model.layers.out comparison):
+  Original tensor sum: 323.013031
+  Converted tensor sum: -1030.671143
+  Original tensor mean: 40.376629
+  Converted tensor mean: -128.833893
+ Mean difference: 169.21054077
+ Maximum pointwise difference: 193.25675964
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: 50.33515549, Converted: -142.92160034
+ Biggest difference in row (0, 0), sum 323.013031 vs -1030.671143
+
+Layer 0, Token 26 (model.layers.out comparison):
+  Original tensor sum: 65.941025
+  Converted tensor sum: -21.309677
+  Original tensor mean: 8.242628
+  Converted tensor mean: -2.663710
+ Mean difference: 10.92460823
+ Maximum pointwise difference: 22.60500336
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: 19.03843307, Converted: -3.56657028
+ Biggest difference in row (0, 0), sum 65.941025 vs -21.309677
+
+Layer 1, Token 26 (model.layers.out comparison):
+  Original tensor sum: 52.076649
+  Converted tensor sum: -57.925156
+  Original tensor mean: 6.509581
+  Converted tensor mean: -7.240644
+ Mean difference: 14.23825073
+ Maximum pointwise difference: 19.17949104
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: 5.37531137, Converted: -13.80417919
+ Biggest difference in row (0, 0), sum 52.076649 vs -57.925156
+
+Layer 2, Token 26 (model.layers.out comparison):
+  Original tensor sum: 51.231728
+  Converted tensor sum: -47.847797
+  Original tensor mean: 6.403966
+  Converted tensor mean: -5.980975
+ Mean difference: 12.38494110
+ Maximum pointwise difference: 23.60085297
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: 9.02445030, Converted: -14.57640362
+ Biggest difference in row (0, 0), sum 51.231728 vs -47.847797
+
+Layer 3, Token 26 (model.layers.out comparison):
+  Original tensor sum: 107.302612
+  Converted tensor sum: -173.292923
+  Original tensor mean: 13.412827
+  Converted tensor mean: -21.661615
+ Mean difference: 35.07444000
+ Maximum pointwise difference: 43.60850143
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: 14.85190392, Converted: -28.75659752
+ Biggest difference in row (0, 0), sum 107.302612 vs -173.292923
+
+Layer 4, Token 26 (model.layers.out comparison):
+  Original tensor sum: 97.273697
+  Converted tensor sum: -182.550171
+  Original tensor mean: 12.159212
+  Converted tensor mean: -22.818771
+ Mean difference: 34.97798157
+ Maximum pointwise difference: 46.59681320
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: 14.26772594, Converted: -32.32908630
+ Biggest difference in row (0, 0), sum 97.273697 vs -182.550171
+
+Layer 5, Token 26 (model.layers.out comparison):
+  Original tensor sum: 85.259064
+  Converted tensor sum: -172.859528
+  Original tensor mean: 10.657383
+  Converted tensor mean: -21.607441
+ Mean difference: 32.26482391
+ Maximum pointwise difference: 44.72983170
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: 13.95336819, Converted: -30.77646255
+ Biggest difference in row (0, 0), sum 85.259064 vs -172.859528
+
+Layer 6, Token 26 (model.layers.out comparison):
+  Original tensor sum: 87.096161
+  Converted tensor sum: -208.315033
+  Original tensor mean: 10.887020
+  Converted tensor mean: -26.039379
+ Mean difference: 36.92639923
+ Maximum pointwise difference: 45.54611206
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: 14.15797043, Converted: -31.38814354
+ Biggest difference in row (0, 0), sum 87.096161 vs -208.315033
+
+Layer 7, Token 26 (model.layers.out comparison):
+  Original tensor sum: 160.905060
+  Converted tensor sum: -356.607910
+  Original tensor mean: 20.113132
+  Converted tensor mean: -44.575989
+ Mean difference: 64.68911743
+ Maximum pointwise difference: 73.27433014
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: 19.29874229, Converted: -53.97558594
+ Biggest difference in row (0, 0), sum 160.905060 vs -356.607910
+
+Layer 8, Token 26 (model.layers.out comparison):
+  Original tensor sum: 147.546188
+  Converted tensor sum: -372.627655
+  Original tensor mean: 18.443274
+  Converted tensor mean: -46.578457
+ Mean difference: 65.02172852
+ Maximum pointwise difference: 75.06597900
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: 17.32047462, Converted: -57.74550629
+ Biggest difference in row (0, 0), sum 147.546188 vs -372.627655
+
+Layer 9, Token 26 (model.layers.out comparison):
+  Original tensor sum: 142.108231
+  Converted tensor sum: -384.533997
+  Original tensor mean: 17.763529
+  Converted tensor mean: -48.066750
+ Mean difference: 65.83027649
+ Maximum pointwise difference: 80.39822388
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: 9.57865334, Converted: -70.81957245
+ Biggest difference in row (0, 0), sum 142.108231 vs -384.533997
+
+Layer 10, Token 26 (model.layers.out comparison):
+  Original tensor sum: 136.597595
+  Converted tensor sum: -406.001617
+  Original tensor mean: 17.074699
+  Converted tensor mean: -50.750202
+ Mean difference: 67.82489777
+ Maximum pointwise difference: 83.06503296
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: 16.25280952, Converted: -66.81222534
+ Biggest difference in row (0, 0), sum 136.597595 vs -406.001617
+
+Layer 11, Token 26 (model.layers.out comparison):
+  Original tensor sum: 234.238876
+  Converted tensor sum: -719.742371
+  Original tensor mean: 29.279860
+  Converted tensor mean: -89.967796
+ Mean difference: 119.24765778
+ Maximum pointwise difference: 144.35720825
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: 22.54579163, Converted: -121.81141663
+ Biggest difference in row (0, 0), sum 234.238876 vs -719.742371
+
+Layer 12, Token 26 (model.layers.out comparison):
+  Original tensor sum: 230.967987
+  Converted tensor sum: -737.411499
+  Original tensor mean: 28.870998
+  Converted tensor mean: -92.176437
+ Mean difference: 121.04743958
+ Maximum pointwise difference: 145.76480103
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: 23.33647728, Converted: -122.42832184
+ Biggest difference in row (0, 0), sum 230.967987 vs -737.411499
+
+Layer 13, Token 26 (model.layers.out comparison):
+  Original tensor sum: 225.836136
+  Converted tensor sum: -743.471008
+  Original tensor mean: 28.229517
+  Converted tensor mean: -92.933876
+ Mean difference: 121.16339111
+ Maximum pointwise difference: 141.17944336
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: 23.16177559, Converted: -118.01766205
+ Biggest difference in row (0, 0), sum 225.836136 vs -743.471008
+
+Layer 14, Token 26 (model.layers.out comparison):
+  Original tensor sum: 222.057236
+  Converted tensor sum: -845.007874
+  Original tensor mean: 27.757154
+  Converted tensor mean: -105.625984
+ Mean difference: 133.38313293
+ Maximum pointwise difference: 164.57283020
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: 29.71310997, Converted: -134.85972595
+ Biggest difference in row (0, 0), sum 222.057236 vs -845.007874
+
+Layer 15, Token 26 (model.layers.out comparison):
+  Original tensor sum: 366.139526
+  Converted tensor sum: -1227.681152
+  Original tensor mean: 45.767441
+  Converted tensor mean: -153.460144
+ Mean difference: 199.22756958
+ Maximum pointwise difference: 235.55526733
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: 46.65935516, Converted: -188.89590454
+ Biggest difference in row (0, 0), sum 366.139526 vs -1227.681152
+
+Layer 0, Token 27 (model.layers.out comparison):
+  Original tensor sum: 0.538792
+  Converted tensor sum: -2.767126
+  Original tensor mean: 0.067349
+  Converted tensor mean: -0.345891
+ Mean difference: 1.04583490
+ Maximum pointwise difference: 4.03163290
+ Max difference location: (0, 0, 7)
+  Values at max diff - Original: 4.54428434, Converted: 0.51265144
+ Biggest difference in row (0, 0), sum 0.538792 vs -2.767126
+
+Layer 1, Token 27 (model.layers.out comparison):
+  Original tensor sum: -13.666726
+  Converted tensor sum: 4.859785
+  Original tensor mean: -1.708341
+  Converted tensor mean: 0.607473
+ Mean difference: 3.73808312
+ Maximum pointwise difference: 11.04657841
+ Max difference location: (0, 0, 5)
+  Values at max diff - Original: -6.84830761, Converted: 4.19827080
+ Biggest difference in row (0, 0), sum -13.666726 vs 4.859785
+
+Layer 2, Token 27 (model.layers.out comparison):
+  Original tensor sum: 19.892342
+  Converted tensor sum: 18.553621
+  Original tensor mean: 2.486543
+  Converted tensor mean: 2.319203
+ Mean difference: 3.86019540
+ Maximum pointwise difference: 12.85380554
+ Max difference location: (0, 0, 5)
+  Values at max diff - Original: -5.59446335, Converted: 7.25934219
+ Biggest difference in row (0, 0), sum 19.892342 vs 18.553621
+
+Layer 3, Token 27 (model.layers.out comparison):
+  Original tensor sum: 84.246483
+  Converted tensor sum: 49.827652
+  Original tensor mean: 10.530810
+  Converted tensor mean: 6.228456
+ Mean difference: 6.56024361
+ Maximum pointwise difference: 11.30776882
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 18.61387444, Converted: 7.30610561
+ Biggest difference in row (0, 0), sum 84.246483 vs 49.827652
+
+Layer 4, Token 27 (model.layers.out comparison):
+  Original tensor sum: 72.374397
+  Converted tensor sum: 50.589382
+  Original tensor mean: 9.046800
+  Converted tensor mean: 6.323673
+ Mean difference: 5.51325321
+ Maximum pointwise difference: 11.16050529
+ Max difference location: (0, 0, 5)
+  Values at max diff - Original: -2.93389368, Converted: 8.22661209
+ Biggest difference in row (0, 0), sum 72.374397 vs 50.589382
+
+Layer 5, Token 27 (model.layers.out comparison):
+  Original tensor sum: 68.200790
+  Converted tensor sum: 51.359711
+  Original tensor mean: 8.525099
+  Converted tensor mean: 6.419964
+ Mean difference: 4.32947350
+ Maximum pointwise difference: 8.89735222
+ Max difference location: (0, 0, 5)
+  Values at max diff - Original: -3.28547406, Converted: 5.61187792
+ Biggest difference in row (0, 0), sum 68.200790 vs 51.359711
+
+Layer 6, Token 27 (model.layers.out comparison):
+  Original tensor sum: 70.421684
+  Converted tensor sum: 41.851700
+  Original tensor mean: 8.802711
+  Converted tensor mean: 5.231462
+ Mean difference: 5.60544014
+ Maximum pointwise difference: 9.42855549
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: 15.64872551, Converted: 6.22017002
+ Biggest difference in row (0, 0), sum 70.421684 vs 41.851700
+
+Layer 7, Token 27 (model.layers.out comparison):
+  Original tensor sum: 138.012558
+  Converted tensor sum: 106.052734
+  Original tensor mean: 17.251570
+  Converted tensor mean: 13.256592
+ Mean difference: 5.83357430
+ Maximum pointwise difference: 9.46822166
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: 20.60037422, Converted: 11.13215256
+ Biggest difference in row (0, 0), sum 138.012558 vs 106.052734
+
+Layer 8, Token 27 (model.layers.out comparison):
+  Original tensor sum: 124.592545
+  Converted tensor sum: 109.657555
+  Original tensor mean: 15.574068
+  Converted tensor mean: 13.707194
+ Mean difference: 4.43112850
+ Maximum pointwise difference: 10.25702190
+ Max difference location: (0, 0, 5)
+  Values at max diff - Original: 3.63314724, Converted: 13.89016914
+ Biggest difference in row (0, 0), sum 124.592545 vs 109.657555
+
+Layer 9, Token 27 (model.layers.out comparison):
+  Original tensor sum: 110.794357
+  Converted tensor sum: 109.277565
+  Original tensor mean: 13.849295
+  Converted tensor mean: 13.659696
+ Mean difference: 4.23832560
+ Maximum pointwise difference: 11.81062031
+ Max difference location: (0, 0, 5)
+  Values at max diff - Original: 1.84443951, Converted: 13.65505981
+ Biggest difference in row (0, 0), sum 110.794357 vs 109.277565
+
+Layer 10, Token 27 (model.layers.out comparison):
+  Original tensor sum: 104.034340
+  Converted tensor sum: 104.158554
+  Original tensor mean: 13.004292
+  Converted tensor mean: 13.019819
+ Mean difference: 4.04881191
+ Maximum pointwise difference: 12.64350224
+ Max difference location: (0, 0, 5)
+  Values at max diff - Original: 0.44486341, Converted: 13.08836555
+ Biggest difference in row (0, 0), sum 104.034340 vs 104.158554
+
+Layer 11, Token 27 (model.layers.out comparison):
+  Original tensor sum: 194.747101
+  Converted tensor sum: 186.990341
+  Original tensor mean: 24.343388
+  Converted tensor mean: 23.373793
+ Mean difference: 4.42853832
+ Maximum pointwise difference: 11.92787266
+ Max difference location: (0, 0, 5)
+  Values at max diff - Original: 11.99839497, Converted: 23.92626762
+ Biggest difference in row (0, 0), sum 194.747101 vs 186.990341
+
+Layer 12, Token 27 (model.layers.out comparison):
+  Original tensor sum: 195.014465
+  Converted tensor sum: 185.515793
+  Original tensor mean: 24.376808
+  Converted tensor mean: 23.189474
+ Mean difference: 4.49333429
+ Maximum pointwise difference: 11.10862160
+ Max difference location: (0, 0, 5)
+  Values at max diff - Original: 11.92906380, Converted: 23.03768539
+ Biggest difference in row (0, 0), sum 195.014465 vs 185.515793
+
+Layer 13, Token 27 (model.layers.out comparison):
+  Original tensor sum: 187.897064
+  Converted tensor sum: 182.353088
+  Original tensor mean: 23.487133
+  Converted tensor mean: 22.794136
+ Mean difference: 4.64961338
+ Maximum pointwise difference: 12.63825989
+ Max difference location: (0, 0, 5)
+  Values at max diff - Original: 10.51706123, Converted: 23.15532112
+ Biggest difference in row (0, 0), sum 187.897064 vs 182.353088
+
+Layer 14, Token 27 (model.layers.out comparison):
+  Original tensor sum: 182.226410
+  Converted tensor sum: 180.585373
+  Original tensor mean: 22.778301
+  Converted tensor mean: 22.573172
+ Mean difference: 4.70111561
+ Maximum pointwise difference: 12.44419956
+ Max difference location: (0, 0, 5)
+  Values at max diff - Original: 11.31790829, Converted: 23.76210785
+ Biggest difference in row (0, 0), sum 182.226410 vs 180.585373
+
+Layer 15, Token 27 (model.layers.out comparison):
+  Original tensor sum: 333.560730
+  Converted tensor sum: 318.274811
+  Original tensor mean: 41.695091
+  Converted tensor mean: 39.784351
+ Mean difference: 4.67095470
+ Maximum pointwise difference: 11.04085732
+ Max difference location: (0, 0, 5)
+  Values at max diff - Original: 28.01206779, Converted: 39.05292511
+ Biggest difference in row (0, 0), sum 333.560730 vs 318.274811
+
+Layer 0, Token 28 (model.layers.out comparison):
+  Original tensor sum: -40.607262
+  Converted tensor sum: 42.743095
+  Original tensor mean: -5.075908
+  Converted tensor mean: 5.342887
+ Mean difference: 11.17178345
+ Maximum pointwise difference: 22.58385468
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: -14.17651558, Converted: 8.40733814
+ Biggest difference in row (0, 0), sum -40.607262 vs 42.743095
+
+Layer 1, Token 28 (model.layers.out comparison):
+  Original tensor sum: -43.333393
+  Converted tensor sum: 31.481144
+  Original tensor mean: -5.416674
+  Converted tensor mean: 3.935143
+ Mean difference: 11.11242485
+ Maximum pointwise difference: 18.85606575
+ Max difference location: (0, 0, 5)
+  Values at max diff - Original: -10.93557739, Converted: 7.92048883
+ Biggest difference in row (0, 0), sum -43.333393 vs 31.481144
+
+Layer 2, Token 28 (model.layers.out comparison):
+  Original tensor sum: -67.416214
+  Converted tensor sum: 33.172539
+  Original tensor mean: -8.427027
+  Converted tensor mean: 4.146567
+ Mean difference: 14.60656548
+ Maximum pointwise difference: 20.67273331
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: -13.93512535, Converted: 6.73760748
+ Biggest difference in row (0, 0), sum -67.416214 vs 33.172539
+
+Layer 3, Token 28 (model.layers.out comparison):
+  Original tensor sum: -199.361206
+  Converted tensor sum: 72.683899
+  Original tensor mean: -24.920151
+  Converted tensor mean: 9.085487
+ Mean difference: 34.00563812
+ Maximum pointwise difference: 41.37638092
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: -29.60864067, Converted: 11.76773930
+ Biggest difference in row (0, 0), sum -199.361206 vs 72.683899
+
+Layer 4, Token 28 (model.layers.out comparison):
+  Original tensor sum: -137.055893
+  Converted tensor sum: 63.596687
+  Original tensor mean: -17.131987
+  Converted tensor mean: 7.949586
+ Mean difference: 25.75262260
+ Maximum pointwise difference: 40.96822739
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: -29.79143906, Converted: 11.17678833
+ Biggest difference in row (0, 0), sum -137.055893 vs 63.596687
+
+Layer 5, Token 28 (model.layers.out comparison):
+  Original tensor sum: -73.715279
+  Converted tensor sum: 62.123581
+  Original tensor mean: -9.214410
+  Converted tensor mean: 7.765448
+ Mean difference: 17.84333420
+ Maximum pointwise difference: 31.90785027
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: -24.06629181, Converted: 7.84155846
+ Biggest difference in row (0, 0), sum -73.715279 vs 62.123581
+
+Layer 6, Token 28 (model.layers.out comparison):
+  Original tensor sum: -126.770874
+  Converted tensor sum: 61.464096
+  Original tensor mean: -15.846359
+  Converted tensor mean: 7.683012
+ Mean difference: 23.61796379
+ Maximum pointwise difference: 36.02120209
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: -27.60279655, Converted: 8.41840744
+ Biggest difference in row (0, 0), sum -126.770874 vs 61.464096
+
+Layer 7, Token 28 (model.layers.out comparison):
+  Original tensor sum: -254.607422
+  Converted tensor sum: 126.028885
+  Original tensor mean: -31.825928
+  Converted tensor mean: 15.753611
+ Mean difference: 47.57954025
+ Maximum pointwise difference: 61.42348480
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: -40.33562851, Converted: 21.08785439
+ Biggest difference in row (0, 0), sum -254.607422 vs 126.028885
+
+Layer 8, Token 28 (model.layers.out comparison):
+  Original tensor sum: -198.536194
+  Converted tensor sum: 120.381157
+  Original tensor mean: -24.817024
+  Converted tensor mean: 15.047645
+ Mean difference: 39.86466980
+ Maximum pointwise difference: 52.24274063
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: -30.55666733, Converted: 21.68607330
+ Biggest difference in row (0, 0), sum -198.536194 vs 120.381157
+
+Layer 9, Token 28 (model.layers.out comparison):
+  Original tensor sum: -203.318542
+  Converted tensor sum: 118.674896
+  Original tensor mean: -25.414818
+  Converted tensor mean: 14.834362
+ Mean difference: 40.24917984
+ Maximum pointwise difference: 51.74636078
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: -36.37534714, Converted: 15.37101555
+ Biggest difference in row (0, 0), sum -203.318542 vs 118.674896
+
+Layer 10, Token 28 (model.layers.out comparison):
+  Original tensor sum: -173.929123
+  Converted tensor sum: 115.971573
+  Original tensor mean: -21.741140
+  Converted tensor mean: 14.496447
+ Mean difference: 36.23758698
+ Maximum pointwise difference: 47.99763489
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: -32.96516418, Converted: 15.03247166
+ Biggest difference in row (0, 0), sum -173.929123 vs 115.971573
+
+Layer 11, Token 28 (model.layers.out comparison):
+  Original tensor sum: -450.842834
+  Converted tensor sum: 202.799988
+  Original tensor mean: -56.355354
+  Converted tensor mean: 25.349998
+ Mean difference: 81.70535278
+ Maximum pointwise difference: 92.55924988
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: -66.56226349, Converted: 25.99698830
+ Biggest difference in row (0, 0), sum -450.842834 vs 202.799988
+
+Layer 12, Token 28 (model.layers.out comparison):
+  Original tensor sum: -483.456177
+  Converted tensor sum: 204.607147
+  Original tensor mean: -60.432022
+  Converted tensor mean: 25.575893
+ Mean difference: 86.00791931
+ Maximum pointwise difference: 97.30514526
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: -70.58811951, Converted: 26.71702957
+ Biggest difference in row (0, 0), sum -483.456177 vs 204.607147
+
+Layer 13, Token 28 (model.layers.out comparison):
+  Original tensor sum: -487.978210
+  Converted tensor sum: 194.803741
+  Original tensor mean: -60.997276
+  Converted tensor mean: 24.350468
+ Mean difference: 85.34774780
+ Maximum pointwise difference: 97.00595093
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: -71.41757965, Converted: 25.58836937
+ Biggest difference in row (0, 0), sum -487.978210 vs 194.803741
+
+Layer 14, Token 28 (model.layers.out comparison):
+  Original tensor sum: -487.676697
+  Converted tensor sum: 192.080292
+  Original tensor mean: -60.959587
+  Converted tensor mean: 24.010036
+ Mean difference: 84.96962738
+ Maximum pointwise difference: 101.62533569
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: -75.57343292, Converted: 26.05190277
+ Biggest difference in row (0, 0), sum -487.676697 vs 192.080292
+
+Layer 15, Token 28 (model.layers.out comparison):
+  Original tensor sum: -826.685791
+  Converted tensor sum: 324.333130
+  Original tensor mean: -103.335724
+  Converted tensor mean: 40.541641
+ Mean difference: 143.87736511
+ Maximum pointwise difference: 160.84576416
+ Max difference location: (0, 0, 4)
+  Values at max diff - Original: -111.58706665, Converted: 49.25869751
+ Biggest difference in row (0, 0), sum -826.685791 vs 324.333130
+
+Layer 0, Token 29 (model.layers.out comparison):
+  Original tensor sum: -7.335809
+  Converted tensor sum: 5.924038
+  Original tensor mean: -0.916976
+  Converted tensor mean: 0.740505
+ Mean difference: 2.81220579
+ Maximum pointwise difference: 5.74731255
+ Max difference location: (0, 0, 2)
+  Values at max diff - Original: -3.16068745, Converted: 2.58662534
+ Biggest difference in row (0, 0), sum -7.335809 vs 5.924038
+
+Layer 1, Token 29 (model.layers.out comparison):
+  Original tensor sum: -4.554134
+  Converted tensor sum: 7.198357
+  Original tensor mean: -0.569267
+  Converted tensor mean: 0.899795
+ Mean difference: 4.59539890
+ Maximum pointwise difference: 12.22019768
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: -9.41592121, Converted: 2.80427670
+ Biggest difference in row (0, 0), sum -4.554134 vs 7.198357
+
+Layer 2, Token 29 (model.layers.out comparison):
+  Original tensor sum: 18.821238
+  Converted tensor sum: -2.444355
+  Original tensor mean: 2.352655
+  Converted tensor mean: -0.305544
+ Mean difference: 5.75418472
+ Maximum pointwise difference: 9.27616215
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: -6.88754845, Converted: 2.38861346
+ Biggest difference in row (0, 0), sum 18.821238 vs -2.444355
+
+Layer 3, Token 29 (model.layers.out comparison):
+  Original tensor sum: 70.965004
+  Converted tensor sum: -68.014175
+  Original tensor mean: 8.870625
+  Converted tensor mean: -8.501772
+ Mean difference: 17.37239647
+ Maximum pointwise difference: 24.10712433
+ Max difference location: (0, 0, 3)
+  Values at max diff - Original: 16.14313126, Converted: -7.96399307
+ Biggest difference in row (0, 0), sum 70.965004 vs -68.014175
+
+Layer 4, Token 29 (model.layers.out comparison):
+  Original tensor sum: 62.607174
+  Converted tensor sum: -17.623362
+  Original tensor mean: 7.825897
+  Converted tensor mean: -2.202920
+ Mean difference: 10.34164429
+ Maximum pointwise difference: 18.22177315
+ Max difference location: (0, 0, 0)
+  Values at max diff - Original: 8.43466568, Converted: -9.78710747
+ Biggest difference in row (0, 0), sum 62.607174 vs -17.623362
+
+Layer 5, Token 29 (model.layers.out comparison):
+  Original tensor sum: 52.727810
+  Converted tensor sum: 1.219590
+  Original tensor mean: 6.590976
+  Converted tensor mean: 0.152449
+ Mean difference: 7.65116024
+ Maximum pointwise difference: 18.62134933
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: 9.17143154, Converted: -9.44991875
+ Biggest difference in row (0, 0), sum 52.727810 vs 1.219590
+
+Layer 6, Token 29 (model.layers.out comparison):
+  Original tensor sum: 56.382370
+  Converted tensor sum: 4.153158
+  Original tensor mean: 7.047796
+  Converted tensor mean: 0.519145
+ Mean difference: 7.41536808
+ Maximum pointwise difference: 17.59628677
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: 10.29856682, Converted: -7.29772091
+ Biggest difference in row (0, 0), sum 56.382370 vs 4.153158
+
+Layer 7, Token 29 (model.layers.out comparison):
+  Original tensor sum: 136.310486
+  Converted tensor sum: 3.958838
+  Original tensor mean: 17.038811
+  Converted tensor mean: 0.494855
+ Mean difference: 16.54395676
+ Maximum pointwise difference: 26.66838837
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: 18.93185425, Converted: -7.73653507
+ Biggest difference in row (0, 0), sum 136.310486 vs 3.958838
+
+Layer 8, Token 29 (model.layers.out comparison):
+  Original tensor sum: 119.467941
+  Converted tensor sum: 9.372761
+  Original tensor mean: 14.933493
+  Converted tensor mean: 1.171595
+ Mean difference: 13.76189804
+ Maximum pointwise difference: 22.09118652
+ Max difference location: (0, 0, 7)
+  Values at max diff - Original: 20.88569450, Converted: -1.20549154
+ Biggest difference in row (0, 0), sum 119.467941 vs 9.372761
+
+Layer 9, Token 29 (model.layers.out comparison):
+  Original tensor sum: 111.468323
+  Converted tensor sum: 12.752249
+  Original tensor mean: 13.933540
+  Converted tensor mean: 1.594031
+ Mean difference: 12.36562347
+ Maximum pointwise difference: 19.99691391
+ Max difference location: (0, 0, 7)
+  Values at max diff - Original: 20.75084877, Converted: 0.75393468
+ Biggest difference in row (0, 0), sum 111.468323 vs 12.752249
+
+Layer 10, Token 29 (model.layers.out comparison):
+  Original tensor sum: 103.290207
+  Converted tensor sum: 4.031506
+  Original tensor mean: 12.911276
+  Converted tensor mean: 0.503938
+ Mean difference: 12.90593433
+ Maximum pointwise difference: 20.97147560
+ Max difference location: (0, 0, 7)
+  Values at max diff - Original: 20.37113762, Converted: -0.60033715
+ Biggest difference in row (0, 0), sum 103.290207 vs 4.031506
+
+Layer 11, Token 29 (model.layers.out comparison):
+  Original tensor sum: 195.291718
+  Converted tensor sum: 60.566498
+  Original tensor mean: 24.411465
+  Converted tensor mean: 7.570812
+ Mean difference: 16.84065247
+ Maximum pointwise difference: 26.14917755
+ Max difference location: (0, 0, 7)
+  Values at max diff - Original: 32.22053146, Converted: 6.07135296
+ Biggest difference in row (0, 0), sum 195.291718 vs 60.566498
+
+Layer 12, Token 29 (model.layers.out comparison):
+  Original tensor sum: 193.868057
+  Converted tensor sum: 56.865105
+  Original tensor mean: 24.233507
+  Converted tensor mean: 7.108138
+ Mean difference: 17.12537003
+ Maximum pointwise difference: 27.68391991
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: 25.67190361, Converted: -2.01201606
+ Biggest difference in row (0, 0), sum 193.868057 vs 56.865105
+
+Layer 13, Token 29 (model.layers.out comparison):
+  Original tensor sum: 191.697586
+  Converted tensor sum: 55.096077
+  Original tensor mean: 23.962198
+  Converted tensor mean: 6.887010
+ Mean difference: 17.07518768
+ Maximum pointwise difference: 27.05913162
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: 24.89521027, Converted: -2.16392159
+ Biggest difference in row (0, 0), sum 191.697586 vs 55.096077
+
+Layer 14, Token 29 (model.layers.out comparison):
+  Original tensor sum: 188.843628
+  Converted tensor sum: 53.397236
+  Original tensor mean: 23.605453
+  Converted tensor mean: 6.674654
+ Mean difference: 16.93079758
+ Maximum pointwise difference: 25.94162941
+ Max difference location: (0, 0, 6)
+  Values at max diff - Original: 23.64732933, Converted: -2.29430056
+ Biggest difference in row (0, 0), sum 188.843628 vs 53.397236
+
+Layer 15, Token 29 (model.layers.out comparison):
+  Original tensor sum: 336.074646
+  Converted tensor sum: 200.162903
+  Original tensor mean: 42.009331
+  Converted tensor mean: 25.020363
+ Mean difference: 16.98896790
+ Maximum pointwise difference: 25.90124702
+ Max difference location: (0, 0, 7)
+  Values at max diff - Original: 47.47709274, Converted: 21.57584572
+ Biggest difference in row (0, 0), sum 336.074646 vs 200.162903
+
+Layer 0, Token 30 (model.layers.out comparison):
+  Original tensor sum: 17.017063
+  Converted tensor sum: 23.545963
+  Original tensor mean: 2.127133
+  Converted tensor mean: 2.943245
+ Mean difference: 2.51119232
+ Maximum pointwise difference: 4.74783516
+ Max difference location: (0, 0, 7)
+  Values at max diff - Original: -3.57869840, Converted: 1.16913700
+ Biggest difference in row (0, 0), sum 17.017063 vs 23.545963
+
+Layer 1, Token 30 (model.layers.out comparison):
+  Original tensor sum: 20.432869
+  Converted tensor sum: 19.928423
+  Original tensor mean: 2.554109
+  Converted tensor mean: 2.491053
+ Mean difference: 3.21921587
+ Maximum pointwise difference: 5.61581087
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 4.57620192, Converted: -1.03960896
+ Biggest difference in row (0, 0), sum 20.432869 vs 19.928423
+
+Layer 2, Token 30 (model.layers.out comparison):
+  Original tensor sum: 28.017879
+  Converted tensor sum: 17.077301
+  Original tensor mean: 3.502235
+  Converted tensor mean: 2.134663
+ Mean difference: 3.63509035
+ Maximum pointwise difference: 8.41316605
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 6.66613007, Converted: -1.74703574
+ Biggest difference in row (0, 0), sum 28.017879 vs 17.077301
+
+Layer 3, Token 30 (model.layers.out comparison):
+  Original tensor sum: 85.620071
+  Converted tensor sum: 45.387245
+  Original tensor mean: 10.702509
+  Converted tensor mean: 5.673406
+ Mean difference: 5.25029612
+ Maximum pointwise difference: 14.27389336
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 17.08827591, Converted: 2.81438255
+ Biggest difference in row (0, 0), sum 85.620071 vs 45.387245
+
+Layer 4, Token 30 (model.layers.out comparison):
+  Original tensor sum: 76.943909
+  Converted tensor sum: 38.849068
+  Original tensor mean: 9.617989
+  Converted tensor mean: 4.856133
+ Mean difference: 5.60086536
+ Maximum pointwise difference: 14.69901657
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 17.17034531, Converted: 2.47132850
+ Biggest difference in row (0, 0), sum 76.943909 vs 38.849068
+
+Layer 5, Token 30 (model.layers.out comparison):
+  Original tensor sum: 59.381409
+  Converted tensor sum: 29.835991
+  Original tensor mean: 7.422676
+  Converted tensor mean: 3.729499
+ Mean difference: 4.96050739
+ Maximum pointwise difference: 12.96257687
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 14.77257729, Converted: 1.81000042
+ Biggest difference in row (0, 0), sum 59.381409 vs 29.835991
+
+Layer 6, Token 30 (model.layers.out comparison):
+  Original tensor sum: 59.339882
+  Converted tensor sum: 27.141592
+  Original tensor mean: 7.417485
+  Converted tensor mean: 3.392699
+ Mean difference: 4.87107563
+ Maximum pointwise difference: 14.00060558
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 15.80483246, Converted: 1.80422711
+ Biggest difference in row (0, 0), sum 59.339882 vs 27.141592
+
+Layer 7, Token 30 (model.layers.out comparison):
+  Original tensor sum: 131.503036
+  Converted tensor sum: 91.997757
+  Original tensor mean: 16.437880
+  Converted tensor mean: 11.499720
+ Mean difference: 5.33721828
+ Maximum pointwise difference: 14.37581253
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 27.27588463, Converted: 12.90007210
+ Biggest difference in row (0, 0), sum 131.503036 vs 91.997757
+
+Layer 8, Token 30 (model.layers.out comparison):
+  Original tensor sum: 123.886139
+  Converted tensor sum: 79.985909
+  Original tensor mean: 15.485767
+  Converted tensor mean: 9.998239
+ Mean difference: 6.03210974
+ Maximum pointwise difference: 16.31963348
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 26.14530563, Converted: 9.82567215
+ Biggest difference in row (0, 0), sum 123.886139 vs 79.985909
+
+Layer 9, Token 30 (model.layers.out comparison):
+  Original tensor sum: 118.487213
+  Converted tensor sum: 61.110474
+  Original tensor mean: 14.810902
+  Converted tensor mean: 7.638809
+ Mean difference: 7.17209244
+ Maximum pointwise difference: 17.08554077
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 26.00649452, Converted: 8.92095280
+ Biggest difference in row (0, 0), sum 118.487213 vs 61.110474
+
+Layer 10, Token 30 (model.layers.out comparison):
+  Original tensor sum: 110.301559
+  Converted tensor sum: 57.444092
+  Original tensor mean: 13.787695
+  Converted tensor mean: 7.180511
+ Mean difference: 6.69173956
+ Maximum pointwise difference: 18.18347359
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 26.85584831, Converted: 8.67237473
+ Biggest difference in row (0, 0), sum 110.301559 vs 57.444092
+
+Layer 11, Token 30 (model.layers.out comparison):
+  Original tensor sum: 209.603394
+  Converted tensor sum: 163.279968
+  Original tensor mean: 26.200424
+  Converted tensor mean: 20.409996
+ Mean difference: 6.23670197
+ Maximum pointwise difference: 18.06859207
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 37.88209915, Converted: 19.81350708
+ Biggest difference in row (0, 0), sum 209.603394 vs 163.279968
+
+Layer 12, Token 30 (model.layers.out comparison):
+  Original tensor sum: 210.341476
+  Converted tensor sum: 159.541199
+  Original tensor mean: 26.292685
+  Converted tensor mean: 19.942650
+ Mean difference: 6.62348843
+ Maximum pointwise difference: 17.79612160
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 37.21872330, Converted: 19.42260170
+ Biggest difference in row (0, 0), sum 210.341476 vs 159.541199
+
+Layer 13, Token 30 (model.layers.out comparison):
+  Original tensor sum: 206.045227
+  Converted tensor sum: 156.530212
+  Original tensor mean: 25.755653
+  Converted tensor mean: 19.566277
+ Mean difference: 6.46108055
+ Maximum pointwise difference: 17.11543655
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 36.85726547, Converted: 19.74182892
+ Biggest difference in row (0, 0), sum 206.045227 vs 156.530212
+
+Layer 14, Token 30 (model.layers.out comparison):
+  Original tensor sum: 204.884491
+  Converted tensor sum: 151.571396
+  Original tensor mean: 25.610561
+  Converted tensor mean: 18.946424
+ Mean difference: 6.66413498
+ Maximum pointwise difference: 18.41207695
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 37.42459488, Converted: 19.01251793
+ Biggest difference in row (0, 0), sum 204.884491 vs 151.571396
+
+Layer 15, Token 30 (model.layers.out comparison):
+  Original tensor sum: 358.352844
+  Converted tensor sum: 289.552582
+  Original tensor mean: 44.794106
+  Converted tensor mean: 36.194073
+ Mean difference: 8.60003757
+ Maximum pointwise difference: 19.94306946
+ Max difference location: (0, 0, 1)
+  Values at max diff - Original: 55.22903824, Converted: 35.28596878
+ Biggest difference in row (0, 0), sum 358.352844 vs 289.552582
+
+================================================================================
+Comparing recurrent cache tensors...
+================================================================================
+
+Layer 0, Token 1 (recurrent cache comparison):
+  Original tensor sum: -3.317356
+  Converted tensor sum: -3.317369
+  Original tensor mean: -0.001037
+  Converted tensor mean: -0.001037
+ Mean difference: 0.00000005
+ Maximum pointwise difference: 0.00000250
+ Max difference location: (0, 4, 8, 1)
+  Values at max diff - Original: -1.34675360, Converted: -1.34675610
+ Biggest difference in row (0, 4, 3), sum -1.531199 vs -1.531201
+Original tensor: 
+
+[[[[-0.01188182  0.00870434 -0.00525597 ...  0.01664828  0.0042294
+     0.01396134]
+   [-0.00601455  0.00372374  0.00119549 ... -0.00689575  0.00234476
+    -0.00023902]
+   [ 0.12993637 -0.07801484  0.03047845 ... -0.05703255 -0.06261977
+    -0.10933896]
+   ...
+   [-0.04649648  0.02312872 -0.00121024 ... -0.02114891  0.02579406
+     0.02258455]
+   [-0.04175662  0.02266306 -0.0035618  ... -0.0084533   0.02211451
+     0.02416236]
+   [ 0.02032246 -0.01281894  0.00930294 ... -0.02656155 -0.00984932
+    -0.02582185]]
+
+  [[ 0.01767723  0.01862493  0.00546727 ...  0.00556207  0.00562948
+     0.02792829]
+   [ 0.00329595  0.00522457  0.00275346 ...  0.00801896  0.0103077
+    -0.00079376]
+   [-0.15666749 -0.19953263 -0.06468897 ... -0.12443222 -0.10325672
+    -0.20960501]
+   ...
+   [ 0.04138051  0.06359718  0.02354327 ...  0.06241166  0.05219408
+     0.03928925]
+   [ 0.04164674  0.06036352  0.02137833 ...  0.05146553  0.04422566
+     0.0441802 ]
+   [-0.03129916 -0.03683262 -0.01027868 ... -0.01391416 -0.00729654
+    -0.0505065 ]]
+
+  [[-0.12362282  0.10214025 -0.01907291 ... -0.06202121 -0.10286148
+    -0.04492377]
+   [-0.0150543   0.08293391 -0.00673187 ... -0.00035791 -0.01116562
+    -0.00036771]
+   [-0.02004597  0.00927652 -0.00294111 ... -0.01171783 -0.01758975
+    -0.00819483]
+   ...
+   [ 0.00270219 -0.04824698  0.00360209 ... -0.00234267  0.00216798
+    -0.00194733]
+   [ 0.01524375 -0.03120736  0.00455077 ...  0.00138342  0.01178958
+     0.00394295]
+   [ 0.02191158 -0.03620601  0.00567079 ...  0.00745023  0.01862757
+     0.00703449]]
+
+  ...
+
+  [[ 0.00741537 -0.04865595 -0.00886576 ... -0.02448454  0.01194548
+    -0.00861733]
+   [-0.00134769  0.01334649  0.01967893 ...  0.02112496 -0.01624596
+     0.00516407]
+   [-0.0050677   0.02272661  0.01807955 ...  0.02094838 -0.01449073
+     0.00967227]
+   ...
+   [-0.02633221  0.05768563  0.01628287 ...  0.0149423  -0.00576269
+     0.04385136]
+   [-0.03326959  0.185886   -0.02219751 ...  0.04430137 -0.00146678
+     0.02707055]
+   [-0.00715611 -0.00657876 -0.10976178 ... -0.09874185  0.08591411
+    -0.00940268]]
+
+  [[-0.03609058 -0.07579004  0.01501239 ... -0.00192132 -0.01605882
+     0.00820769]
+   [-0.00521284 -0.03044076  0.01835437 ... -0.00124992 -0.01034386
+     0.00627647]
+   [ 0.02380822  0.16997556 -0.04292414 ...  0.01702266  0.04020631
+    -0.03895959]
+   ...
+   [-0.00115029 -0.0217499   0.00398471 ... -0.00293407 -0.00470166
+     0.00579625]
+   [-0.00415053 -0.03030142  0.02518196 ... -0.00043284 -0.01240897
+     0.00634339]
+   [ 0.00861687 -0.01112233 -0.03039085 ... -0.00862329  0.00705495
+     0.00750164]]
+
+  [[ 0.00614664 -0.01302179 -0.0609244  ... -0.05605923 -0.06379453
+     0.01912303]
+   [ 0.01061937 -0.00787821 -0.02997783 ... -0.03494435 -0.04587581
+     0.01142649]
+   [-0.04273459  0.08807568  0.18954179 ...  0.19141153  0.05976401
+    -0.01481191]
+   ...
+   [ 0.0059959  -0.01474381 -0.02677062 ... -0.02669823  0.00146604
+    -0.00064257]
+   [ 0.01313105 -0.0043188  -0.02868656 ... -0.03682106 -0.06574353
+     0.01620813]
+   [-0.00286384 -0.03923091 -0.03224784 ... -0.01919729  0.12107897
+    -0.03120236]]]]
+
+Converted tensor: 
+
+[[[[-0.01188182  0.00870434 -0.00525597 ...  0.0166483   0.0042294
+     0.01396135]
+   [-0.00601455  0.00372375  0.00119549 ... -0.00689576  0.00234477
+    -0.00023903]
+   [ 0.12993638 -0.07801486  0.03047847 ... -0.05703259 -0.06261978
+    -0.10933899]
+   ...
+   [-0.04649651  0.02312873 -0.00121024 ... -0.02114895  0.02579408
+     0.02258454]
+   [-0.04175663  0.02266307 -0.0035618  ... -0.00845332  0.02211452
+     0.02416236]
+   [ 0.02032245 -0.01281894  0.00930295 ... -0.02656158 -0.00984932
+    -0.02582186]]
+
+  [[ 0.01767723  0.01862492  0.00546727 ...  0.00556206  0.00562947
+     0.02792831]
+   [ 0.00329595  0.00522458  0.00275346 ...  0.00801897  0.01030772
+    -0.00079377]
+   [-0.15666753 -0.19953264 -0.06468898 ... -0.12443225 -0.10325674
+    -0.20960508]
+   ...
+   [ 0.04138052  0.06359721  0.02354329 ...  0.06241173  0.05219414
+     0.03928925]
+   [ 0.04164676  0.06036354  0.02137835 ...  0.05146557  0.04422571
+     0.0441802 ]
+   [-0.03129917 -0.03683261 -0.01027868 ... -0.01391415 -0.00729652
+    -0.05050653]]
+
+  [[-0.12362286  0.10214026 -0.01907291 ... -0.06202124 -0.10286151
+    -0.04492378]
+   [-0.01505431  0.08293395 -0.00673187 ... -0.00035791 -0.01116562
+    -0.00036771]
+   [-0.02004598  0.00927651 -0.00294111 ... -0.01171784 -0.01758976
+    -0.00819483]
+   ...
+   [ 0.00270219 -0.04824701  0.00360209 ... -0.00234266  0.00216798
+    -0.00194733]
+   [ 0.01524375 -0.03120738  0.00455077 ...  0.00138341  0.01178958
+     0.00394295]
+   [ 0.02191159 -0.03620601  0.00567079 ...  0.00745023  0.01862758
+     0.00703449]]
+
+  ...
+
+  [[ 0.00741537 -0.04865595 -0.00886576 ... -0.02448454  0.01194548
+    -0.00861733]
+   [-0.00134769  0.01334648  0.01967893 ...  0.02112496 -0.01624596
+     0.00516407]
+   [-0.0050677   0.02272661  0.01807955 ...  0.02094838 -0.01449073
+     0.00967227]
+   ...
+   [-0.02633222  0.05768563  0.01628287 ...  0.01494229 -0.00576268
+     0.04385137]
+   [-0.03326959  0.18588606 -0.02219752 ...  0.04430138 -0.00146678
+     0.02707056]
+   [-0.00715612 -0.00657868 -0.1097618  ... -0.09874186  0.08591412
+    -0.00940266]]
+
+  [[-0.03609059 -0.07579008  0.01501241 ... -0.00192132 -0.01605884
+     0.00820769]
+   [-0.00521284 -0.03044078  0.01835438 ... -0.00124992 -0.01034387
+     0.00627648]
+   [ 0.02380823  0.16997567 -0.04292417 ...  0.01702267  0.04020633
+    -0.03895961]
+   ...
+   [-0.00115029 -0.02174992  0.00398472 ... -0.00293407 -0.00470167
+     0.00579625]
+   [-0.00415053 -0.03030144  0.02518198 ... -0.00043284 -0.01240898
+     0.00634339]
+   [ 0.00861687 -0.01112236 -0.03039089 ... -0.0086233   0.00705496
+     0.00750165]]
+
+  [[ 0.00614664 -0.0130218  -0.06092443 ... -0.05605926 -0.06379459
+     0.01912304]
+   [ 0.01061938 -0.00787821 -0.02997785 ... -0.03494437 -0.04587585
+     0.0114265 ]
+   [-0.04273462  0.08807574  0.18954192 ...  0.19141163  0.05976404
+    -0.01481192]
+   ...
+   [ 0.0059959  -0.01474382 -0.02677064 ... -0.02669825  0.00146605
+    -0.00064257]
+   [ 0.01313106 -0.00431879 -0.02868656 ... -0.03682107 -0.06574361
+     0.01620815]
+   [-0.00286384 -0.03923097 -0.0322479  ... -0.01919733  0.12107915
+    -0.03120241]]]]
+
+
+
+Layer 1, Token 1 (recurrent cache comparison):
+  Original tensor sum: 5.922648
+  Converted tensor sum: 5.922640
+  Original tensor mean: 0.001851
+  Converted tensor mean: 0.001851
+ Mean difference: 0.00000005
+ Maximum pointwise difference: 0.00000155
+ Max difference location: (0, 24, 4, 5)
+  Values at max diff - Original: -0.26876855, Converted: -0.26877010
+ Biggest difference in row (0, 14, 3), sum -0.918731 vs -0.918733
+
+Layer 2, Token 1 (recurrent cache comparison):
+  Original tensor sum: 12.229185
+  Converted tensor sum: 12.229182
+  Original tensor mean: 0.003822
+  Converted tensor mean: 0.003822
+ Mean difference: 0.00000009
+ Maximum pointwise difference: 0.00000620
+ Max difference location: (0, 3, 6, 0)
+  Values at max diff - Original: 2.35518169, Converted: 2.35517550
+ Biggest difference in row (0, 3, 6), sum 3.961787 vs 3.961781
+
+Layer 4, Token 1 (recurrent cache comparison):
+  Original tensor sum: 4.260600
+  Converted tensor sum: 4.260149
+  Original tensor mean: 0.001331
+  Converted tensor mean: 0.001331
+ Mean difference: 0.00000526
+ Maximum pointwise difference: 0.00011003
+ Max difference location: (0, 25, 2, 4)
+  Values at max diff - Original: 0.21691340, Converted: 0.21702343
+ Biggest difference in row (0, 3, 1), sum -0.358275 vs -0.358136
+
+Layer 5, Token 1 (recurrent cache comparison):
+  Original tensor sum: 12.744413
+  Converted tensor sum: 12.744514
+  Original tensor mean: 0.003983
+  Converted tensor mean: 0.003983
+ Mean difference: 0.00000413
+ Maximum pointwise difference: 0.00011247
+ Max difference location: (0, 5, 2, 8)
+  Values at max diff - Original: 0.86490124, Converted: 0.86478877
+ Biggest difference in row (0, 5, 2), sum -0.456235 vs -0.456385
+
+Layer 6, Token 1 (recurrent cache comparison):
+  Original tensor sum: -14.490761
+  Converted tensor sum: -14.493523
+  Original tensor mean: -0.004528
+  Converted tensor mean: -0.004529
+ Mean difference: 0.00002331
+ Maximum pointwise difference: 0.00149512
+ Max difference location: (0, 28, 9, 8)
+  Values at max diff - Original: 2.97030377, Converted: 2.96880865
+ Biggest difference in row (0, 8, 5), sum 5.080033 vs 5.077976
+
+Layer 8, Token 1 (recurrent cache comparison):
+  Original tensor sum: -18.806082
+  Converted tensor sum: -18.808296
+  Original tensor mean: -0.005877
+  Converted tensor mean: -0.005878
+ Mean difference: 0.00002112
+ Maximum pointwise difference: 0.00074953
+ Max difference location: (0, 20, 1, 8)
+  Values at max diff - Original: 0.62514198, Converted: 0.62439245
+ Biggest difference in row (0, 25, 6), sum 1.048032 vs 1.047222
+
+Layer 9, Token 1 (recurrent cache comparison):
+  Original tensor sum: 16.764290
+  Converted tensor sum: 16.760258
+  Original tensor mean: 0.005239
+  Converted tensor mean: 0.005238
+ Mean difference: 0.00002129
+ Maximum pointwise difference: 0.00044209
+ Max difference location: (0, 21, 5, 8)
+  Values at max diff - Original: 0.85285813, Converted: 0.85241604
+ Biggest difference in row (0, 0, 1), sum -0.046629 vs -0.046069
+
+Layer 10, Token 1 (recurrent cache comparison):
+  Original tensor sum: 13.242327
+  Converted tensor sum: 13.242817
+  Original tensor mean: 0.004138
+  Converted tensor mean: 0.004138
+ Mean difference: 0.00002325
+ Maximum pointwise difference: 0.00070238
+ Max difference location: (0, 18, 5, 1)
+  Values at max diff - Original: 0.48423475, Converted: 0.48353237
+ Biggest difference in row (0, 10, 0), sum -0.502937 vs -0.502024
+
+Layer 12, Token 1 (recurrent cache comparison):
+  Original tensor sum: 14.374599
+  Converted tensor sum: 14.372844
+  Original tensor mean: 0.004492
+  Converted tensor mean: 0.004492
+ Mean difference: 0.00002070
+ Maximum pointwise difference: 0.00084567
+ Max difference location: (0, 0, 3, 1)
+  Values at max diff - Original: 1.31967652, Converted: 1.31883085
+ Biggest difference in row (0, 0, 5), sum -0.676982 vs -0.676066
+
+Layer 13, Token 1 (recurrent cache comparison):
+  Original tensor sum: 28.120127
+  Converted tensor sum: 28.128502
+  Original tensor mean: 0.008788
+  Converted tensor mean: 0.008790
+ Mean difference: 0.00001703
+ Maximum pointwise difference: 0.00037390
+ Max difference location: (0, 4, 2, 1)
+  Values at max diff - Original: -0.33164161, Converted: -0.33126771
+ Biggest difference in row (0, 24, 1), sum -0.030439 vs -0.029779
+
+Layer 14, Token 1 (recurrent cache comparison):
+  Original tensor sum: 27.012432
+  Converted tensor sum: 27.011541
+  Original tensor mean: 0.008441
+  Converted tensor mean: 0.008441
+ Mean difference: 0.00002248
+ Maximum pointwise difference: 0.00121775
+ Max difference location: (0, 18, 0, 1)
+  Values at max diff - Original: 0.37722895, Converted: 0.37844670
+ Biggest difference in row (0, 28, 1), sum -0.493242 vs -0.492468
+
+Layer 0, Token 2 (recurrent cache comparison):
+  Original tensor sum: 4.531467
+  Converted tensor sum: 4.531466
+  Original tensor mean: 0.001416
+  Converted tensor mean: 0.001416
+ Mean difference: 0.08359446
+ Maximum pointwise difference: 1.77142978
+ Max difference location: (0, 1, 3, 5)
+  Values at max diff - Original: -0.02699410, Converted: 1.74443567
+ Biggest difference in row (0, 25, 2), sum -0.057628 vs -2.844908
+
+Layer 1, Token 2 (recurrent cache comparison):
+  Original tensor sum: 11.008316
+  Converted tensor sum: 11.008326
+  Original tensor mean: 0.003440
+  Converted tensor mean: 0.003440
+ Mean difference: 0.06277661
+ Maximum pointwise difference: 0.71243107
+ Max difference location: (0, 10, 0, 2)
+  Values at max diff - Original: 0.01163737, Converted: 0.72406846
+ Biggest difference in row (0, 12, 3), sum 0.228652 vs -1.768667
+
+Layer 2, Token 2 (recurrent cache comparison):
+  Original tensor sum: 17.248280
+  Converted tensor sum: 17.248241
+  Original tensor mean: 0.005390
+  Converted tensor mean: 0.005390
+ Mean difference: 0.08558470
+ Maximum pointwise difference: 1.97508693
+ Max difference location: (0, 10, 7, 3)
+  Values at max diff - Original: 1.98190892, Converted: 0.00682194
+ Biggest difference in row (0, 27, 7), sum -0.594255 vs 3.191915
+
+Layer 4, Token 2 (recurrent cache comparison):
+  Original tensor sum: 7.984356
+  Converted tensor sum: 7.983810
+  Original tensor mean: 0.002495
+  Converted tensor mean: 0.002495
+ Mean difference: 0.07671142
+ Maximum pointwise difference: 1.85330796
+ Max difference location: (0, 20, 4, 6)
+  Values at max diff - Original: 0.01886898, Converted: 1.87217689
+ Biggest difference in row (0, 20, 6), sum 2.845701 vs -0.305152
+
+Layer 5, Token 2 (recurrent cache comparison):
+  Original tensor sum: 9.205366
+  Converted tensor sum: 9.205467
+  Original tensor mean: 0.002877
+  Converted tensor mean: 0.002877
+ Mean difference: 0.06804129
+ Maximum pointwise difference: 1.41803539
+ Max difference location: (0, 31, 6, 3)
+  Values at max diff - Original: 1.40662789, Converted: -0.01140754
+ Biggest difference in row (0, 24, 8), sum -0.372748 vs -2.656956
+
+Layer 6, Token 2 (recurrent cache comparison):
+  Original tensor sum: -7.876884
+  Converted tensor sum: -7.873561
+  Original tensor mean: -0.002462
+  Converted tensor mean: -0.002460
+ Mean difference: 0.10029175
+ Maximum pointwise difference: 2.66715860
+ Max difference location: (0, 28, 9, 8)
+  Values at max diff - Original: 2.59401202, Converted: -0.07314663
+ Biggest difference in row (0, 19, 4), sum 0.710449 vs -5.203167
+
+Layer 8, Token 2 (recurrent cache comparison):
+  Original tensor sum: -13.154655
+  Converted tensor sum: -13.156775
+  Original tensor mean: -0.004111
+  Converted tensor mean: -0.004111
+ Mean difference: 0.08601540
+ Maximum pointwise difference: 2.83156943
+ Max difference location: (0, 12, 4, 7)
+  Values at max diff - Original: 2.83582592, Converted: 0.00425647
+ Biggest difference in row (0, 30, 3), sum -0.848019 vs -4.754877
+
+Layer 9, Token 2 (recurrent cache comparison):
+  Original tensor sum: 13.187357
+  Converted tensor sum: 13.181618
+  Original tensor mean: 0.004121
+  Converted tensor mean: 0.004119
+ Mean difference: 0.05544823
+ Maximum pointwise difference: 0.65544760
+ Max difference location: (0, 21, 5, 8)
+  Values at max diff - Original: 0.71338689, Converted: 0.05793926
+ Biggest difference in row (0, 19, 9), sum -0.054951 vs -1.900613
+
+Layer 10, Token 2 (recurrent cache comparison):
+  Original tensor sum: 10.860550
+  Converted tensor sum: 10.860478
+  Original tensor mean: 0.003394
+  Converted tensor mean: 0.003394
+ Mean difference: 0.05739149
+ Maximum pointwise difference: 1.22302496
+ Max difference location: (0, 30, 4, 5)
+  Values at max diff - Original: -0.01519475, Converted: 1.20783019
+ Biggest difference in row (0, 23, 3), sum -0.221712 vs -3.841269
+
+Layer 12, Token 2 (recurrent cache comparison):
+  Original tensor sum: 3.134315
+  Converted tensor sum: 3.132089
+  Original tensor mean: 0.000979
+  Converted tensor mean: 0.000979
+ Mean difference: 0.07305207
+ Maximum pointwise difference: 2.30649829
+ Max difference location: (0, 5, 4, 5)
+  Values at max diff - Original: 2.33141446, Converted: 0.02491626
+ Biggest difference in row (0, 0, 1), sum -1.541565 vs -6.179572
+
+Layer 13, Token 2 (recurrent cache comparison):
+  Original tensor sum: 18.773312
+  Converted tensor sum: 18.779602
+  Original tensor mean: 0.005867
+  Converted tensor mean: 0.005869
+ Mean difference: 0.04688552
+ Maximum pointwise difference: 0.60163057
+ Max difference location: (0, 6, 1, 7)
+  Values at max diff - Original: 0.04654653, Converted: 0.64817709
+ Biggest difference in row (0, 4, 1), sum -0.566141 vs -2.623919
+
+Layer 14, Token 2 (recurrent cache comparison):
+  Original tensor sum: 13.960938
+  Converted tensor sum: 13.964265
+  Original tensor mean: 0.004363
+  Converted tensor mean: 0.004364
+ Mean difference: 0.06759205
+ Maximum pointwise difference: 1.25844812
+ Max difference location: (0, 15, 8, 4)
+  Values at max diff - Original: 1.26228178, Converted: 0.00383368
+ Biggest difference in row (0, 31, 3), sum -0.096068 vs -5.326997
+
+Layer 0, Token 3 (recurrent cache comparison):
+  Original tensor sum: 0.684784
+  Converted tensor sum: 0.422194
+  Original tensor mean: 0.000214
+  Converted tensor mean: 0.000132
+ Mean difference: 0.06314481
+ Maximum pointwise difference: 1.39332521
+ Max difference location: (0, 28, 5, 9)
+  Values at max diff - Original: -0.03651731, Converted: 1.35680795
+ Biggest difference in row (0, 4, 9), sum 2.498745 vs 0.335116
+
+Layer 1, Token 3 (recurrent cache comparison):
+  Original tensor sum: 3.526195
+  Converted tensor sum: 7.632782
+  Original tensor mean: 0.001102
+  Converted tensor mean: 0.002385
+ Mean difference: 0.04427468
+ Maximum pointwise difference: 0.98676205
+ Max difference location: (0, 12, 3, 7)
+  Values at max diff - Original: 0.92044085, Converted: -0.06632122
+ Biggest difference in row (0, 24, 2), sum 0.609889 vs -0.814516
+
+Layer 2, Token 3 (recurrent cache comparison):
+  Original tensor sum: 15.850447
+  Converted tensor sum: 14.785593
+  Original tensor mean: 0.004953
+  Converted tensor mean: 0.004620
+ Mean difference: 0.06092339
+ Maximum pointwise difference: 2.43390632
+ Max difference location: (0, 1, 0, 4)
+  Values at max diff - Original: 2.80371213, Converted: 0.36980587
+ Biggest difference in row (0, 1, 0), sum 4.370481 vs 0.423526
+
+Layer 4, Token 3 (recurrent cache comparison):
+  Original tensor sum: 19.856752
+  Converted tensor sum: 11.778177
+  Original tensor mean: 0.006205
+  Converted tensor mean: 0.003681
+ Mean difference: 0.07194611
+ Maximum pointwise difference: 2.48742008
+ Max difference location: (0, 14, 3, 9)
+  Values at max diff - Original: 2.47506452, Converted: -0.01235563
+ Biggest difference in row (0, 19, 2), sum 0.372920 vs -2.973422
+
+Layer 5, Token 3 (recurrent cache comparison):
+  Original tensor sum: 9.792118
+  Converted tensor sum: 9.138845
+  Original tensor mean: 0.003060
+  Converted tensor mean: 0.002856
+ Mean difference: 0.05089124
+ Maximum pointwise difference: 1.42593253
+ Max difference location: (0, 29, 0, 8)
+  Values at max diff - Original: 1.42089915, Converted: -0.00503340
+ Biggest difference in row (0, 29, 0), sum 2.198264 vs 0.182178
+
+Layer 6, Token 3 (recurrent cache comparison):
+  Original tensor sum: 39.415325
+  Converted tensor sum: 64.451355
+  Original tensor mean: 0.012317
+  Converted tensor mean: 0.020141
+ Mean difference: 0.08079723
+ Maximum pointwise difference: 4.89647627
+ Max difference location: (0, 15, 3, 6)
+  Values at max diff - Original: -0.18732879, Converted: 4.70914745
+ Biggest difference in row (0, 6, 0), sum 0.836966 vs 8.447909
+
+Layer 8, Token 3 (recurrent cache comparison):
+  Original tensor sum: 23.689789
+  Converted tensor sum: 12.321936
+  Original tensor mean: 0.007403
+  Converted tensor mean: 0.003851
+ Mean difference: 0.08749782
+ Maximum pointwise difference: 3.85876298
+ Max difference location: (0, 6, 4, 8)
+  Values at max diff - Original: 0.01438628, Converted: 3.87314916
+ Biggest difference in row (0, 6, 4), sum 0.119168 vs 5.376393
+
+Layer 9, Token 3 (recurrent cache comparison):
+  Original tensor sum: 8.901470
+  Converted tensor sum: 4.339914
+  Original tensor mean: 0.002782
+  Converted tensor mean: 0.001356
+ Mean difference: 0.06287189
+ Maximum pointwise difference: 1.40262556
+ Max difference location: (0, 4, 0, 5)
+  Values at max diff - Original: -0.00241524, Converted: 1.40021038
+ Biggest difference in row (0, 18, 1), sum 1.268483 vs -0.696936
+
+Layer 10, Token 3 (recurrent cache comparison):
+  Original tensor sum: 18.375820
+  Converted tensor sum: 3.348410
+  Original tensor mean: 0.005742
+  Converted tensor mean: 0.001046
+ Mean difference: 0.06042652
+ Maximum pointwise difference: 2.94567752
+ Max difference location: (0, 3, 8, 7)
+  Values at max diff - Original: -0.26693973, Converted: 2.67873788
+ Biggest difference in row (0, 3, 8), sum 0.004062 vs 2.562259
+
+Layer 12, Token 3 (recurrent cache comparison):
+  Original tensor sum: 15.322770
+  Converted tensor sum: 2.674777
+  Original tensor mean: 0.004788
+  Converted tensor mean: 0.000836
+ Mean difference: 0.07379209
+ Maximum pointwise difference: 2.73401403
+ Max difference location: (0, 30, 4, 0)
+  Values at max diff - Original: -0.02140745, Converted: 2.71260667
+ Biggest difference in row (0, 7, 6), sum -0.113145 vs 2.612027
+
+Layer 13, Token 3 (recurrent cache comparison):
+  Original tensor sum: 14.910538
+  Converted tensor sum: 8.724025
+  Original tensor mean: 0.004660
+  Converted tensor mean: 0.002726
+ Mean difference: 0.05616682
+ Maximum pointwise difference: 1.43021226
+ Max difference location: (0, 26, 5, 0)
+  Values at max diff - Original: -1.41802061, Converted: 0.01219167
+ Biggest difference in row (0, 3, 9), sum 0.002563 vs -3.030253
+
+Layer 14, Token 3 (recurrent cache comparison):
+  Original tensor sum: 59.583878
+  Converted tensor sum: -2.192444
+  Original tensor mean: 0.018620
+  Converted tensor mean: -0.000685
+ Mean difference: 0.10199536
+ Maximum pointwise difference: 2.77383018
+ Max difference location: (0, 2, 0, 2)
+  Values at max diff - Original: 2.72142744, Converted: -0.05240267
+ Biggest difference in row (0, 16, 6), sum 0.145663 vs -8.295967
+
+Layer 0, Token 4 (recurrent cache comparison):
+  Original tensor sum: 7.899137
+  Converted tensor sum: 4.783788
+  Original tensor mean: 0.002468
+  Converted tensor mean: 0.001495
+ Mean difference: 0.06620996
+ Maximum pointwise difference: 1.03156960
+ Max difference location: (0, 1, 3, 7)
+  Values at max diff - Original: -0.00703356, Converted: -1.03860319
+ Biggest difference in row (0, 21, 4), sum 0.038056 vs -1.875101
+
+Layer 1, Token 4 (recurrent cache comparison):
+  Original tensor sum: 11.224692
+  Converted tensor sum: 15.232712
+  Original tensor mean: 0.003508
+  Converted tensor mean: 0.004760
+ Mean difference: 0.06535107
+ Maximum pointwise difference: 1.53891993
+ Max difference location: (0, 28, 3, 7)
+  Values at max diff - Original: 0.04949531, Converted: 1.58841527
+ Biggest difference in row (0, 28, 3), sum 0.880954 vs 3.297761
+
+Layer 2, Token 4 (recurrent cache comparison):
+  Original tensor sum: 15.875578
+  Converted tensor sum: 5.908407
+  Original tensor mean: 0.004961
+  Converted tensor mean: 0.001846
+ Mean difference: 0.09298474
+ Maximum pointwise difference: 2.68871808
+ Max difference location: (0, 14, 3, 7)
+  Values at max diff - Original: 2.67706752, Converted: -0.01165051
+ Biggest difference in row (0, 27, 2), sum 3.910276 vs 0.204061
+
+Layer 4, Token 4 (recurrent cache comparison):
+  Original tensor sum: 34.602001
+  Converted tensor sum: 14.365917
+  Original tensor mean: 0.010813
+  Converted tensor mean: 0.004489
+ Mean difference: 0.10193390
+ Maximum pointwise difference: 3.22817802
+ Max difference location: (0, 26, 6, 5)
+  Values at max diff - Original: -0.04091755, Converted: 3.18726039
+ Biggest difference in row (0, 26, 6), sum 0.735282 vs 4.516615
+
+Layer 5, Token 4 (recurrent cache comparison):
+  Original tensor sum: 24.322514
+  Converted tensor sum: 18.418108
+  Original tensor mean: 0.007601
+  Converted tensor mean: 0.005756
+ Mean difference: 0.08364967
+ Maximum pointwise difference: 2.23648024
+ Max difference location: (0, 22, 6, 1)
+  Values at max diff - Original: 2.19508362, Converted: -0.04139667
+ Biggest difference in row (0, 3, 0), sum 3.424673 vs -0.163761
+
+Layer 6, Token 4 (recurrent cache comparison):
+  Original tensor sum: 34.762104
+  Converted tensor sum: 77.105461
+  Original tensor mean: 0.010863
+  Converted tensor mean: 0.024095
+ Mean difference: 0.12376648
+ Maximum pointwise difference: 3.91498804
+ Max difference location: (0, 12, 5, 4)
+  Values at max diff - Original: -0.17797241, Converted: 3.73701572
+ Biggest difference in row (0, 10, 4), sum -0.207436 vs 6.811815
+
+Layer 8, Token 4 (recurrent cache comparison):
+  Original tensor sum: 52.858780
+  Converted tensor sum: 5.570855
+  Original tensor mean: 0.016518
+  Converted tensor mean: 0.001741
+ Mean difference: 0.12005786
+ Maximum pointwise difference: 5.32569838
+ Max difference location: (0, 12, 3, 5)
+  Values at max diff - Original: 5.34859705, Converted: 0.02289869
+ Biggest difference in row (0, 20, 0), sum 8.008233 vs -0.003253
+
+Layer 9, Token 4 (recurrent cache comparison):
+  Original tensor sum: 20.435345
+  Converted tensor sum: -2.045311
+  Original tensor mean: 0.006386
+  Converted tensor mean: -0.000639
+ Mean difference: 0.08372314
+ Maximum pointwise difference: 2.78602862
+ Max difference location: (0, 28, 2, 0)
+  Values at max diff - Original: 2.71785426, Converted: -0.06817436
+ Biggest difference in row (0, 28, 2), sum 4.726543 vs 1.302800
+
+Layer 10, Token 4 (recurrent cache comparison):
+  Original tensor sum: 28.353613
+  Converted tensor sum: 12.385429
+  Original tensor mean: 0.008861
+  Converted tensor mean: 0.003870
+ Mean difference: 0.09276734
+ Maximum pointwise difference: 2.28980851
+ Max difference location: (0, 2, 9, 5)
+  Values at max diff - Original: -0.00412231, Converted: 2.28568625
+ Biggest difference in row (0, 13, 8), sum 3.624647 vs 0.020094
+
+Layer 12, Token 4 (recurrent cache comparison):
+  Original tensor sum: 70.502647
+  Converted tensor sum: -11.005323
+  Original tensor mean: 0.022032
+  Converted tensor mean: -0.003439
+ Mean difference: 0.13381547
+ Maximum pointwise difference: 3.57928109
+ Max difference location: (0, 30, 0, 4)
+  Values at max diff - Original: 3.95710707, Converted: 0.37782601
+ Biggest difference in row (0, 21, 9), sum -1.532540 vs -12.302475
+
+Layer 13, Token 4 (recurrent cache comparison):
+  Original tensor sum: 38.753532
+  Converted tensor sum: 5.437235
+  Original tensor mean: 0.012110
+  Converted tensor mean: 0.001699
+ Mean difference: 0.08178755
+ Maximum pointwise difference: 2.55715966
+ Max difference location: (0, 3, 4, 9)
+  Values at max diff - Original: 2.84962225, Converted: 0.29246253
+ Biggest difference in row (0, 3, 4), sum 4.266754 vs 0.847269
+
+Layer 14, Token 4 (recurrent cache comparison):
+  Original tensor sum: 141.714035
+  Converted tensor sum: 3.640444
+  Original tensor mean: 0.044286
+  Converted tensor mean: 0.001138
+ Mean difference: 0.14463389
+ Maximum pointwise difference: 5.68939066
+ Max difference location: (0, 16, 7, 6)
+  Values at max diff - Original: 5.55827475, Converted: -0.13111581
+ Biggest difference in row (0, 28, 1), sum 11.271111 vs 0.609705
+
+Layer 0, Token 5 (recurrent cache comparison):
+  Original tensor sum: 9.131315
+  Converted tensor sum: 12.396471
+  Original tensor mean: 0.002854
+  Converted tensor mean: 0.003874
+ Mean difference: 0.05539500
+ Maximum pointwise difference: 1.09641600
+ Max difference location: (0, 28, 9, 5)
+  Values at max diff - Original: 1.12920201, Converted: 0.03278603
+ Biggest difference in row (0, 4, 9), sum 1.258661 vs 0.175881
+
+Layer 1, Token 5 (recurrent cache comparison):
+  Original tensor sum: 24.366199
+  Converted tensor sum: 10.052802
+  Original tensor mean: 0.007614
+  Converted tensor mean: 0.003142
+ Mean difference: 0.05824861
+ Maximum pointwise difference: 2.14620328
+ Max difference location: (0, 14, 2, 5)
+  Values at max diff - Original: 0.00643282, Converted: 2.15263605
+ Biggest difference in row (0, 6, 4), sum 1.356658 vs -0.156208
+
+Layer 2, Token 5 (recurrent cache comparison):
+  Original tensor sum: 50.376324
+  Converted tensor sum: 20.166676
+  Original tensor mean: 0.015743
+  Converted tensor mean: 0.006302
+ Mean difference: 0.07966200
+ Maximum pointwise difference: 2.04463291
+ Max difference location: (0, 27, 4, 2)
+  Values at max diff - Original: 2.00972342, Converted: -0.03490951
+ Biggest difference in row (0, 27, 2), sum 5.745794 vs 1.959190
+
+Layer 4, Token 5 (recurrent cache comparison):
+  Original tensor sum: 44.478531
+  Converted tensor sum: 48.696777
+  Original tensor mean: 0.013900
+  Converted tensor mean: 0.015218
+ Mean difference: 0.09315307
+ Maximum pointwise difference: 2.43060613
+ Max difference location: (0, 26, 5, 6)
+  Values at max diff - Original: 0.46136302, Converted: 2.89196920
+ Biggest difference in row (0, 8, 6), sum 0.054414 vs 4.076869
+
+Layer 5, Token 5 (recurrent cache comparison):
+  Original tensor sum: 57.863758
+  Converted tensor sum: 66.390915
+  Original tensor mean: 0.018082
+  Converted tensor mean: 0.020747
+ Mean difference: 0.10497291
+ Maximum pointwise difference: 2.49356651
+ Max difference location: (0, 17, 3, 6)
+  Values at max diff - Original: 2.50974846, Converted: 0.01618202
+ Biggest difference in row (0, 28, 9), sum 3.771637 vs 0.053981
+
+Layer 6, Token 5 (recurrent cache comparison):
+  Original tensor sum: 39.502037
+  Converted tensor sum: 161.817169
+  Original tensor mean: 0.012344
+  Converted tensor mean: 0.050568
+ Mean difference: 0.14194940
+ Maximum pointwise difference: 3.58584666
+ Max difference location: (0, 26, 3, 9)
+  Values at max diff - Original: 3.40417242, Converted: -0.18167432
+ Biggest difference in row (0, 12, 4), sum 1.168972 vs 7.813907
+
+Layer 8, Token 5 (recurrent cache comparison):
+  Original tensor sum: 44.896149
+  Converted tensor sum: 38.246201
+  Original tensor mean: 0.014030
+  Converted tensor mean: 0.011952
+ Mean difference: 0.10806250
+ Maximum pointwise difference: 2.33007479
+ Max difference location: (0, 1, 6, 0)
+  Values at max diff - Original: 2.35027504, Converted: 0.02020025
+ Biggest difference in row (0, 1, 6), sum 5.246045 vs 0.247956
+
+Layer 9, Token 5 (recurrent cache comparison):
+  Original tensor sum: 20.569098
+  Converted tensor sum: 11.688971
+  Original tensor mean: 0.006428
+  Converted tensor mean: 0.003653
+ Mean difference: 0.08318320
+ Maximum pointwise difference: 1.79917610
+ Max difference location: (0, 28, 0, 3)
+  Values at max diff - Original: 1.69918346, Converted: -0.09999267
+ Biggest difference in row (0, 3, 4), sum 3.283048 vs 0.225886
+
+Layer 10, Token 5 (recurrent cache comparison):
+  Original tensor sum: 42.493145
+  Converted tensor sum: 26.750286
+  Original tensor mean: 0.013279
+  Converted tensor mean: 0.008359
+ Mean difference: 0.09709122
+ Maximum pointwise difference: 2.97919798
+ Max difference location: (0, 10, 0, 3)
+  Values at max diff - Original: 3.34914303, Converted: 0.36994517
+ Biggest difference in row (0, 10, 0), sum 5.613201 vs -0.079588
+
+Layer 12, Token 5 (recurrent cache comparison):
+  Original tensor sum: 91.460236
+  Converted tensor sum: 14.637827
+  Original tensor mean: 0.028581
+  Converted tensor mean: 0.004574
+ Mean difference: 0.12184902
+ Maximum pointwise difference: 4.17300320
+ Max difference location: (0, 23, 2, 9)
+  Values at max diff - Original: 3.98550677, Converted: -0.18749636
+ Biggest difference in row (0, 28, 5), sum 5.243108 vs -0.797499
+
+Layer 13, Token 5 (recurrent cache comparison):
+  Original tensor sum: 50.306297
+  Converted tensor sum: 16.367235
+  Original tensor mean: 0.015721
+  Converted tensor mean: 0.005115
+ Mean difference: 0.08688851
+ Maximum pointwise difference: 2.08200264
+ Max difference location: (0, 19, 9, 3)
+  Values at max diff - Original: -1.59057343, Converted: 0.49142930
+ Biggest difference in row (0, 19, 5), sum 3.595970 vs 0.049368
+
+Layer 14, Token 5 (recurrent cache comparison):
+  Original tensor sum: 120.273888
+  Converted tensor sum: 44.449192
+  Original tensor mean: 0.037586
+  Converted tensor mean: 0.013890
+ Mean difference: 0.13929905
+ Maximum pointwise difference: 4.73129654
+ Max difference location: (0, 18, 5, 9)
+  Values at max diff - Original: 4.35292673, Converted: -0.37836996
+ Biggest difference in row (0, 18, 5), sum 8.950241 vs -0.746074
+
+Layer 0, Token 6 (recurrent cache comparison):
+  Original tensor sum: 11.608546
+  Converted tensor sum: 10.627696
+  Original tensor mean: 0.003628
+  Converted tensor mean: 0.003321
+ Mean difference: 0.05484011
+ Maximum pointwise difference: 1.12371099
+ Max difference location: (0, 1, 2, 3)
+  Values at max diff - Original: 1.11502755, Converted: -0.00868344
+ Biggest difference in row (0, 28, 5), sum 0.118289 vs 2.332705
+
+Layer 1, Token 6 (recurrent cache comparison):
+  Original tensor sum: 92.219727
+  Converted tensor sum: 28.768579
+  Original tensor mean: 0.028819
+  Converted tensor mean: 0.008990
+ Mean difference: 0.08724788
+ Maximum pointwise difference: 1.51144505
+ Max difference location: (0, 23, 0, 4)
+  Values at max diff - Original: 1.55765891, Converted: 0.04621384
+ Biggest difference in row (0, 14, 0), sum 2.954077 vs -0.012181
+
+Layer 2, Token 6 (recurrent cache comparison):
+  Original tensor sum: 101.609215
+  Converted tensor sum: 93.242142
+  Original tensor mean: 0.031753
+  Converted tensor mean: 0.029138
+ Mean difference: 0.12457406
+ Maximum pointwise difference: 2.07845497
+ Max difference location: (0, 13, 1, 9)
+  Values at max diff - Original: 2.17026591, Converted: 0.09181103
+ Biggest difference in row (0, 5, 5), sum 4.805948 vs -0.569050
+
+Layer 4, Token 6 (recurrent cache comparison):
+  Original tensor sum: 13.856092
+  Converted tensor sum: 22.610188
+  Original tensor mean: 0.004330
+  Converted tensor mean: 0.007066
+ Mean difference: 0.09440003
+ Maximum pointwise difference: 2.37087321
+ Max difference location: (0, 19, 2, 6)
+  Values at max diff - Original: -0.02839734, Converted: 2.34247589
+ Biggest difference in row (0, 28, 1), sum -0.280756 vs 2.458031
+
+Layer 5, Token 6 (recurrent cache comparison):
+  Original tensor sum: 39.960052
+  Converted tensor sum: 41.437057
+  Original tensor mean: 0.012488
+  Converted tensor mean: 0.012949
+ Mean difference: 0.11209048
+ Maximum pointwise difference: 2.79378676
+ Max difference location: (0, 19, 8, 4)
+  Values at max diff - Original: 0.01245314, Converted: 2.80623984
+ Biggest difference in row (0, 13, 1), sum 6.005285 vs -0.085273
+
+Layer 6, Token 6 (recurrent cache comparison):
+  Original tensor sum: -2.419616
+  Converted tensor sum: 156.977676
+  Original tensor mean: -0.000756
+  Converted tensor mean: 0.049056
+ Mean difference: 0.13894926
+ Maximum pointwise difference: 6.69993019
+ Max difference location: (0, 10, 3, 1)
+  Values at max diff - Original: -1.12109971, Converted: 5.57883024
+ Biggest difference in row (0, 12, 1), sum -0.201558 vs 10.382487
+
+Layer 8, Token 6 (recurrent cache comparison):
+  Original tensor sum: 8.213539
+  Converted tensor sum: 18.368313
+  Original tensor mean: 0.002567
+  Converted tensor mean: 0.005740
+ Mean difference: 0.10382870
+ Maximum pointwise difference: 3.36055303
+ Max difference location: (0, 6, 4, 8)
+  Values at max diff - Original: 0.10355368, Converted: 3.46410680
+ Biggest difference in row (0, 6, 4), sum -0.613209 vs 4.409491
+
+Layer 9, Token 6 (recurrent cache comparison):
+  Original tensor sum: 12.889297
+  Converted tensor sum: -0.411069
+  Original tensor mean: 0.004028
+  Converted tensor mean: -0.000128
+ Mean difference: 0.08612256
+ Maximum pointwise difference: 1.89322448
+ Max difference location: (0, 6, 4, 1)
+  Values at max diff - Original: -0.52327746, Converted: 1.36994708
+ Biggest difference in row (0, 21, 7), sum 0.245074 vs -2.764518
+
+Layer 10, Token 6 (recurrent cache comparison):
+  Original tensor sum: 3.506564
+  Converted tensor sum: 11.408216
+  Original tensor mean: 0.001096
+  Converted tensor mean: 0.003565
+ Mean difference: 0.08594991
+ Maximum pointwise difference: 3.30037594
+ Max difference location: (0, 3, 8, 7)
+  Values at max diff - Original: -0.08371022, Converted: 3.21666574
+ Biggest difference in row (0, 0, 7), sum -0.426351 vs 3.218251
+
+Layer 12, Token 6 (recurrent cache comparison):
+  Original tensor sum: 30.742065
+  Converted tensor sum: 1.932971
+  Original tensor mean: 0.009607
+  Converted tensor mean: 0.000604
+ Mean difference: 0.10983281
+ Maximum pointwise difference: 3.31334734
+ Max difference location: (0, 29, 5, 6)
+  Values at max diff - Original: 3.34788132, Converted: 0.03453401
+ Biggest difference in row (0, 29, 5), sum 6.176572 vs 0.072738
+
+Layer 13, Token 6 (recurrent cache comparison):
+  Original tensor sum: 14.579787
+  Converted tensor sum: 9.630959
+  Original tensor mean: 0.004556
+  Converted tensor mean: 0.003010
+ Mean difference: 0.08181592
+ Maximum pointwise difference: 2.27647829
+ Max difference location: (0, 19, 1, 3)
+  Values at max diff - Original: 2.46903062, Converted: 0.19255245
+ Biggest difference in row (0, 19, 5), sum 2.241402 vs -0.017656
+
+Layer 14, Token 6 (recurrent cache comparison):
+  Original tensor sum: 42.673443
+  Converted tensor sum: 13.958614
+  Original tensor mean: 0.013335
+  Converted tensor mean: 0.004362
+ Mean difference: 0.12478559
+ Maximum pointwise difference: 3.53676820
+ Max difference location: (0, 15, 8, 4)
+  Values at max diff - Original: 3.58003521, Converted: 0.04326708
+ Biggest difference in row (0, 16, 6), sum 0.056718 vs -5.064022
+
+Layer 0, Token 7 (recurrent cache comparison):
+  Original tensor sum: 13.531075
+  Converted tensor sum: 7.895350
+  Original tensor mean: 0.004228
+  Converted tensor mean: 0.002467
+ Mean difference: 0.05525878
+ Maximum pointwise difference: 0.84158301
+ Max difference location: (0, 4, 1, 9)
+  Values at max diff - Original: -0.04387791, Converted: 0.79770511
+ Biggest difference in row (0, 11, 9), sum -0.221553 vs -1.606123
+
+Layer 1, Token 7 (recurrent cache comparison):
+  Original tensor sum: 106.468651
+  Converted tensor sum: 29.931305
+  Original tensor mean: 0.033271
+  Converted tensor mean: 0.009354
+ Mean difference: 0.07464606
+ Maximum pointwise difference: 1.52088320
+ Max difference location: (0, 24, 0, 1)
+  Values at max diff - Original: 1.28372872, Converted: -0.23715444
+ Biggest difference in row (0, 31, 9), sum 2.350637 vs -0.270012
+
+Layer 2, Token 7 (recurrent cache comparison):
+  Original tensor sum: 129.077255
+  Converted tensor sum: 124.290329
+  Original tensor mean: 0.040337
+  Converted tensor mean: 0.038841
+ Mean difference: 0.12615709
+ Maximum pointwise difference: 3.32020164
+ Max difference location: (0, 23, 3, 9)
+  Values at max diff - Original: 0.05276818, Converted: 3.37296987
+ Biggest difference in row (0, 5, 6), sum -1.585131 vs 2.877644
+
+Layer 4, Token 7 (recurrent cache comparison):
+  Original tensor sum: 12.337616
+  Converted tensor sum: 29.998875
+  Original tensor mean: 0.003856
+  Converted tensor mean: 0.009375
+ Mean difference: 0.08588156
+ Maximum pointwise difference: 1.48782670
+ Max difference location: (0, 19, 6, 2)
+  Values at max diff - Original: -0.00142645, Converted: 1.48640025
+ Biggest difference in row (0, 8, 3), sum -0.318221 vs 2.809558
+
+Layer 5, Token 7 (recurrent cache comparison):
+  Original tensor sum: 28.667000
+  Converted tensor sum: 37.180931
+  Original tensor mean: 0.008958
+  Converted tensor mean: 0.011619
+ Mean difference: 0.09552816
+ Maximum pointwise difference: 2.18750906
+ Max difference location: (0, 19, 4, 8)
+  Values at max diff - Original: 0.10599449, Converted: 2.29350352
+ Biggest difference in row (0, 28, 9), sum 2.464837 vs 0.175544
+
+Layer 6, Token 7 (recurrent cache comparison):
+  Original tensor sum: -5.179218
+  Converted tensor sum: 165.798248
+  Original tensor mean: -0.001619
+  Converted tensor mean: 0.051812
+ Mean difference: 0.12655024
+ Maximum pointwise difference: 4.26992130
+ Max difference location: (0, 10, 1, 3)
+  Values at max diff - Original: -0.81827015, Converted: 3.45165110
+ Biggest difference in row (0, 12, 6), sum 2.458921 vs 9.472747
+
+Layer 8, Token 7 (recurrent cache comparison):
+  Original tensor sum: 8.037577
+  Converted tensor sum: 36.050400
+  Original tensor mean: 0.002512
+  Converted tensor mean: 0.011266
+ Mean difference: 0.10181364
+ Maximum pointwise difference: 3.21224403
+ Max difference location: (0, 6, 8, 4)
+  Values at max diff - Original: 0.04581403, Converted: 3.25805807
+ Biggest difference in row (0, 6, 8), sum -0.710102 vs 2.858772
+
+Layer 9, Token 7 (recurrent cache comparison):
+  Original tensor sum: 10.771255
+  Converted tensor sum: 9.047117
+  Original tensor mean: 0.003366
+  Converted tensor mean: 0.002827
+ Mean difference: 0.07432807
+ Maximum pointwise difference: 1.92723787
+ Max difference location: (0, 18, 5, 2)
+  Values at max diff - Original: 0.10259621, Converted: 2.02983403
+ Biggest difference in row (0, 14, 2), sum 0.009283 vs 3.088803
+
+Layer 10, Token 7 (recurrent cache comparison):
+  Original tensor sum: 2.196672
+  Converted tensor sum: 31.273930
+  Original tensor mean: 0.000686
+  Converted tensor mean: 0.009773
+ Mean difference: 0.07749946
+ Maximum pointwise difference: 2.52166486
+ Max difference location: (0, 3, 7, 8)
+  Values at max diff - Original: 0.31132898, Converted: 2.83299375
+ Biggest difference in row (0, 20, 9), sum -0.957283 vs 1.748438
+
+Layer 12, Token 7 (recurrent cache comparison):
+  Original tensor sum: 18.589321
+  Converted tensor sum: 5.047585
+  Original tensor mean: 0.005809
+  Converted tensor mean: 0.001577
+ Mean difference: 0.10475901
+ Maximum pointwise difference: 2.85224462
+ Max difference location: (0, 29, 5, 6)
+  Values at max diff - Original: 2.91423106, Converted: 0.06198643
+ Biggest difference in row (0, 29, 5), sum 5.378224 vs 0.028987
+
+Layer 13, Token 7 (recurrent cache comparison):
+  Original tensor sum: 10.072084
+  Converted tensor sum: 22.447376
+  Original tensor mean: 0.003148
+  Converted tensor mean: 0.007015
+ Mean difference: 0.06809221
+ Maximum pointwise difference: 1.16759956
+ Max difference location: (0, 27, 3, 5)
+  Values at max diff - Original: -0.07106454, Converted: 1.09653497
+ Biggest difference in row (0, 27, 3), sum -0.724999 vs 1.414439
+
+Layer 14, Token 7 (recurrent cache comparison):
+  Original tensor sum: 24.727911
+  Converted tensor sum: 26.743217
+  Original tensor mean: 0.007727
+  Converted tensor mean: 0.008357
+ Mean difference: 0.11743267
+ Maximum pointwise difference: 2.98747468
+ Max difference location: (0, 18, 5, 1)
+  Values at max diff - Original: 2.95096135, Converted: -0.03651327
+ Biggest difference in row (0, 28, 1), sum -0.138044 vs 7.456189
+
+Layer 0, Token 8 (recurrent cache comparison):
+  Original tensor sum: 15.709320
+  Converted tensor sum: 12.209140
+  Original tensor mean: 0.004909
+  Converted tensor mean: 0.003815
+ Mean difference: 0.05364013
+ Maximum pointwise difference: 1.00742257
+ Max difference location: (0, 1, 3, 2)
+  Values at max diff - Original: 0.00399712, Converted: 1.01141965
+ Biggest difference in row (0, 28, 5), sum 0.102939 vs 1.531078
+
+Layer 1, Token 8 (recurrent cache comparison):
+  Original tensor sum: 188.393356
+  Converted tensor sum: 69.447678
+  Original tensor mean: 0.058873
+  Converted tensor mean: 0.021702
+ Mean difference: 0.10494157
+ Maximum pointwise difference: 2.06318974
+ Max difference location: (0, 24, 6, 8)
+  Values at max diff - Original: 2.06102371, Converted: -0.00216593
+ Biggest difference in row (0, 14, 0), sum 8.656445 vs 0.053197
+
+Layer 2, Token 8 (recurrent cache comparison):
+  Original tensor sum: 204.433716
+  Converted tensor sum: 228.728714
+  Original tensor mean: 0.063886
+  Converted tensor mean: 0.071478
+ Mean difference: 0.17672807
+ Maximum pointwise difference: 4.02747822
+ Max difference location: (0, 14, 7, 4)
+  Values at max diff - Original: -0.50838530, Converted: 3.51909280
+ Biggest difference in row (0, 14, 7), sum -0.459507 vs 8.282653
+
+Layer 4, Token 8 (recurrent cache comparison):
+  Original tensor sum: 27.791477
+  Converted tensor sum: 81.184990
+  Original tensor mean: 0.008685
+  Converted tensor mean: 0.025370
+ Mean difference: 0.10353857
+ Maximum pointwise difference: 2.46198463
+ Max difference location: (0, 20, 0, 0)
+  Values at max diff - Original: -0.22187454, Converted: 2.24011016
+ Biggest difference in row (0, 20, 0), sum 0.256525 vs 5.813072
+
+Layer 5, Token 8 (recurrent cache comparison):
+  Original tensor sum: 29.250452
+  Converted tensor sum: 93.253128
+  Original tensor mean: 0.009141
+  Converted tensor mean: 0.029142
+ Mean difference: 0.10660823
+ Maximum pointwise difference: 2.56040263
+ Max difference location: (0, 5, 9, 6)
+  Values at max diff - Original: 2.57331157, Converted: 0.01290902
+ Biggest difference in row (0, 6, 9), sum 0.078166 vs 4.415024
+
+Layer 6, Token 8 (recurrent cache comparison):
+  Original tensor sum: 27.846973
+  Converted tensor sum: 254.006149
+  Original tensor mean: 0.008702
+  Converted tensor mean: 0.079377
+ Mean difference: 0.15745334
+ Maximum pointwise difference: 4.78712130
+ Max difference location: (0, 6, 0, 1)
+  Values at max diff - Original: -0.02898185, Converted: 4.75813961
+ Biggest difference in row (0, 6, 0), sum 0.390611 vs 12.429944
+
+Layer 8, Token 8 (recurrent cache comparison):
+  Original tensor sum: 30.536982
+  Converted tensor sum: 101.827225
+  Original tensor mean: 0.009543
+  Converted tensor mean: 0.031821
+ Mean difference: 0.12039161
+ Maximum pointwise difference: 3.22662950
+ Max difference location: (0, 6, 4, 8)
+  Values at max diff - Original: 0.09277204, Converted: 3.31940150
+ Biggest difference in row (0, 6, 4), sum -0.525502 vs 4.532234
+
+Layer 9, Token 8 (recurrent cache comparison):
+  Original tensor sum: 16.682407
+  Converted tensor sum: 55.948948
+  Original tensor mean: 0.005213
+  Converted tensor mean: 0.017484
+ Mean difference: 0.08395444
+ Maximum pointwise difference: 2.21269536
+ Max difference location: (0, 2, 6, 8)
+  Values at max diff - Original: -0.01177103, Converted: 2.20092440
+ Biggest difference in row (0, 2, 6), sum 0.250594 vs 2.860795
+
+Layer 10, Token 8 (recurrent cache comparison):
+  Original tensor sum: 12.510189
+  Converted tensor sum: 82.301987
+  Original tensor mean: 0.003909
+  Converted tensor mean: 0.025719
+ Mean difference: 0.08603403
+ Maximum pointwise difference: 2.56086898
+ Max difference location: (0, 3, 8, 7)
+  Values at max diff - Original: -0.06791666, Converted: 2.49295235
+ Biggest difference in row (0, 27, 2), sum -0.661969 vs 2.579364
+
+Layer 12, Token 8 (recurrent cache comparison):
+  Original tensor sum: 32.357769
+  Converted tensor sum: 70.608459
+  Original tensor mean: 0.010112
+  Converted tensor mean: 0.022065
+ Mean difference: 0.11435273
+ Maximum pointwise difference: 2.54995298
+ Max difference location: (0, 29, 5, 6)
+  Values at max diff - Original: 2.57914209, Converted: 0.02918900
+ Biggest difference in row (0, 24, 2), sum -0.360438 vs 5.434034
+
+Layer 13, Token 8 (recurrent cache comparison):
+  Original tensor sum: 15.804648
+  Converted tensor sum: 72.853622
+  Original tensor mean: 0.004939
+  Converted tensor mean: 0.022767
+ Mean difference: 0.07997719
+ Maximum pointwise difference: 2.65385294
+ Max difference location: (0, 26, 0, 4)
+  Values at max diff - Original: -0.03116010, Converted: 2.62269282
+ Biggest difference in row (0, 26, 0), sum -1.206431 vs 2.459876
+
+Layer 14, Token 8 (recurrent cache comparison):
+  Original tensor sum: 69.455246
+  Converted tensor sum: 167.620041
+  Original tensor mean: 0.021705
+  Converted tensor mean: 0.052381
+ Mean difference: 0.15660757
+ Maximum pointwise difference: 2.87237978
+ Max difference location: (0, 29, 9, 1)
+  Values at max diff - Original: -0.04621891, Converted: 2.82616091
+ Biggest difference in row (0, 20, 4), sum -0.064347 vs 6.085094
+
+Layer 0, Token 9 (recurrent cache comparison):
+  Original tensor sum: 13.786104
+  Converted tensor sum: 5.261156
+  Original tensor mean: 0.004308
+  Converted tensor mean: 0.001644
+ Mean difference: 0.06277616
+ Maximum pointwise difference: 1.31032252
+ Max difference location: (0, 4, 1, 9)
+  Values at max diff - Original: -0.02821357, Converted: 1.28210890
+ Biggest difference in row (0, 11, 3), sum 0.289278 vs -0.836586
+
+Layer 1, Token 9 (recurrent cache comparison):
+  Original tensor sum: 203.497635
+  Converted tensor sum: 111.110443
+  Original tensor mean: 0.063593
+  Converted tensor mean: 0.034722
+ Mean difference: 0.10077493
+ Maximum pointwise difference: 1.97459030
+ Max difference location: (0, 24, 0, 1)
+  Values at max diff - Original: 1.88861251, Converted: -0.08597784
+ Biggest difference in row (0, 14, 0), sum 9.054160 vs 0.974541
+
+Layer 2, Token 9 (recurrent cache comparison):
+  Original tensor sum: 210.326843
+  Converted tensor sum: 237.847137
+  Original tensor mean: 0.065727
+  Converted tensor mean: 0.074327
+ Mean difference: 0.16504267
+ Maximum pointwise difference: 2.71314573
+ Max difference location: (0, 4, 8, 1)
+  Values at max diff - Original: -0.00067222, Converted: 2.71247363
+ Biggest difference in row (0, 1, 4), sum 2.414350 vs 7.828261
+
+Layer 4, Token 9 (recurrent cache comparison):
+  Original tensor sum: 76.020309
+  Converted tensor sum: 125.208931
+  Original tensor mean: 0.023756
+  Converted tensor mean: 0.039128
+ Mean difference: 0.11094213
+ Maximum pointwise difference: 3.67572975
+ Max difference location: (0, 27, 7, 5)
+  Values at max diff - Original: 3.66171432, Converted: -0.01401533
+ Biggest difference in row (0, 3, 0), sum 4.612147 vs 0.005273
+
+Layer 5, Token 9 (recurrent cache comparison):
+  Original tensor sum: 70.017532
+  Converted tensor sum: 128.789795
+  Original tensor mean: 0.021880
+  Converted tensor mean: 0.040247
+ Mean difference: 0.11726990
+ Maximum pointwise difference: 2.56784987
+ Max difference location: (0, 6, 7, 6)
+  Values at max diff - Original: 2.56954336, Converted: 0.00169344
+ Biggest difference in row (0, 6, 7), sum 5.224357 vs 0.050091
+
+Layer 6, Token 9 (recurrent cache comparison):
+  Original tensor sum: 97.678406
+  Converted tensor sum: 298.968506
+  Original tensor mean: 0.030525
+  Converted tensor mean: 0.093428
+ Mean difference: 0.16553456
+ Maximum pointwise difference: 4.22000217
+ Max difference location: (0, 14, 1, 7)
+  Values at max diff - Original: -0.10210184, Converted: 4.11790037
+ Biggest difference in row (0, 14, 1), sum -0.198166 vs 10.807201
+
+Layer 8, Token 9 (recurrent cache comparison):
+  Original tensor sum: 106.931870
+  Converted tensor sum: 173.151855
+  Original tensor mean: 0.033416
+  Converted tensor mean: 0.054110
+ Mean difference: 0.14065868
+ Maximum pointwise difference: 3.01797652
+ Max difference location: (0, 14, 9, 5)
+  Values at max diff - Original: -0.05490554, Converted: 2.96307087
+ Biggest difference in row (0, 20, 7), sum 0.154971 vs 7.357482
+
+Layer 9, Token 9 (recurrent cache comparison):
+  Original tensor sum: 64.670883
+  Converted tensor sum: 92.657562
+  Original tensor mean: 0.020210
+  Converted tensor mean: 0.028955
+ Mean difference: 0.09020478
+ Maximum pointwise difference: 3.22673941
+ Max difference location: (0, 18, 5, 2)
+  Values at max diff - Original: 0.18116489, Converted: 3.40790439
+ Biggest difference in row (0, 18, 2), sum 6.946761 vs 1.273814
+
+Layer 10, Token 9 (recurrent cache comparison):
+  Original tensor sum: 52.923912
+  Converted tensor sum: 104.621475
+  Original tensor mean: 0.016539
+  Converted tensor mean: 0.032694
+ Mean difference: 0.08354937
+ Maximum pointwise difference: 1.84956801
+ Max difference location: (0, 3, 7, 8)
+  Values at max diff - Original: 0.37758890, Converted: 2.22715688
+ Biggest difference in row (0, 20, 9), sum -1.298731 vs 2.479056
+
+Layer 12, Token 9 (recurrent cache comparison):
+  Original tensor sum: 87.343620
+  Converted tensor sum: 117.516281
+  Original tensor mean: 0.027295
+  Converted tensor mean: 0.036724
+ Mean difference: 0.12288742
+ Maximum pointwise difference: 3.19170189
+ Max difference location: (0, 13, 2, 4)
+  Values at max diff - Original: -0.11148589, Converted: 3.08021593
+ Biggest difference in row (0, 13, 2), sum 0.993775 vs 6.040417
+
+Layer 13, Token 9 (recurrent cache comparison):
+  Original tensor sum: 77.928635
+  Converted tensor sum: 116.695862
+  Original tensor mean: 0.024353
+  Converted tensor mean: 0.036467
+ Mean difference: 0.09447044
+ Maximum pointwise difference: 1.43028283
+ Max difference location: (0, 26, 0, 4)
+  Values at max diff - Original: -0.00879327, Converted: 1.42148960
+ Biggest difference in row (0, 25, 3), sum -0.128404 vs 3.423045
+
+Layer 14, Token 9 (recurrent cache comparison):
+  Original tensor sum: 162.069077
+  Converted tensor sum: 247.590637
+  Original tensor mean: 0.050647
+  Converted tensor mean: 0.077372
+ Mean difference: 0.17534283
+ Maximum pointwise difference: 3.21209598
+ Max difference location: (0, 28, 1, 9)
+  Values at max diff - Original: -0.25805441, Converted: 2.95404148
+ Biggest difference in row (0, 28, 1), sum 1.364790 vs 9.833094
+
+Layer 0, Token 10 (recurrent cache comparison):
+  Original tensor sum: 7.816267
+  Converted tensor sum: 1.466951
+  Original tensor mean: 0.002443
+  Converted tensor mean: 0.000458
+ Mean difference: 0.05842621
+ Maximum pointwise difference: 1.09208894
+ Max difference location: (0, 21, 4, 1)
+  Values at max diff - Original: 0.04324723, Converted: 1.13533616
+ Biggest difference in row (0, 28, 5), sum 0.301255 vs 2.364079
+
+Layer 1, Token 10 (recurrent cache comparison):
+  Original tensor sum: 223.526520
+  Converted tensor sum: 135.921234
+  Original tensor mean: 0.069852
+  Converted tensor mean: 0.042475
+ Mean difference: 0.10827781
+ Maximum pointwise difference: 1.68770814
+ Max difference location: (0, 16, 6, 1)
+  Values at max diff - Original: 2.02958679, Converted: 0.34187865
+ Biggest difference in row (0, 14, 0), sum 5.745544 vs -0.048143
+
+Layer 2, Token 10 (recurrent cache comparison):
+  Original tensor sum: 215.104584
+  Converted tensor sum: 227.212708
+  Original tensor mean: 0.067220
+  Converted tensor mean: 0.071004
+ Mean difference: 0.17289215
+ Maximum pointwise difference: 3.18850541
+ Max difference location: (0, 26, 3, 8)
+  Values at max diff - Original: 0.01985940, Converted: 3.20836473
+ Biggest difference in row (0, 12, 7), sum 8.279942 vs -0.264312
+
+Layer 4, Token 10 (recurrent cache comparison):
+  Original tensor sum: 185.702744
+  Converted tensor sum: 211.499130
+  Original tensor mean: 0.058032
+  Converted tensor mean: 0.066093
+ Mean difference: 0.12541530
+ Maximum pointwise difference: 2.52001357
+ Max difference location: (0, 27, 5, 8)
+  Values at max diff - Original: 0.05403204, Converted: 2.57404566
+ Biggest difference in row (0, 27, 5), sum 0.682007 vs 7.443546
+
+Layer 5, Token 10 (recurrent cache comparison):
+  Original tensor sum: 169.265594
+  Converted tensor sum: 227.449417
+  Original tensor mean: 0.052895
+  Converted tensor mean: 0.071078
+ Mean difference: 0.13289575
+ Maximum pointwise difference: 3.03736281
+ Max difference location: (0, 6, 2, 6)
+  Values at max diff - Original: 3.01727891, Converted: -0.02008397
+ Biggest difference in row (0, 6, 2), sum 9.659736 vs 0.153498
+
+Layer 6, Token 10 (recurrent cache comparison):
+  Original tensor sum: 230.247437
+  Converted tensor sum: 418.704895
+  Original tensor mean: 0.071952
+  Converted tensor mean: 0.130845
+ Mean difference: 0.17921637
+ Maximum pointwise difference: 4.08086109
+ Max difference location: (0, 6, 0, 1)
+  Values at max diff - Original: 0.00348123, Converted: 4.08434248
+ Biggest difference in row (0, 6, 0), sum 0.879897 vs 15.160538
+
+Layer 8, Token 10 (recurrent cache comparison):
+  Original tensor sum: 206.699799
+  Converted tensor sum: 283.296692
+  Original tensor mean: 0.064594
+  Converted tensor mean: 0.088530
+ Mean difference: 0.15303743
+ Maximum pointwise difference: 3.20992827
+ Max difference location: (0, 14, 4, 5)
+  Values at max diff - Original: 0.00341668, Converted: 3.21334505
+ Biggest difference in row (0, 2, 4), sum -0.470056 vs 8.175467
+
+Layer 9, Token 10 (recurrent cache comparison):
+  Original tensor sum: 155.765579
+  Converted tensor sum: 185.697693
+  Original tensor mean: 0.048677
+  Converted tensor mean: 0.058031
+ Mean difference: 0.09974226
+ Maximum pointwise difference: 2.01155925
+ Max difference location: (0, 14, 1, 8)
+  Values at max diff - Original: -0.00813468, Converted: 2.00342464
+ Biggest difference in row (0, 18, 3), sum -0.273577 vs 5.096995
+
+Layer 10, Token 10 (recurrent cache comparison):
+  Original tensor sum: 147.632782
+  Converted tensor sum: 177.473785
+  Original tensor mean: 0.046135
+  Converted tensor mean: 0.055461
+ Mean difference: 0.10073428
+ Maximum pointwise difference: 2.04938221
+ Max difference location: (0, 3, 8, 7)
+  Values at max diff - Original: -0.06264466, Converted: 1.98673749
+ Biggest difference in row (0, 24, 0), sum 0.061289 vs 4.106022
+
+Layer 12, Token 10 (recurrent cache comparison):
+  Original tensor sum: 189.647308
+  Converted tensor sum: 212.602402
+  Original tensor mean: 0.059265
+  Converted tensor mean: 0.066438
+ Mean difference: 0.12409261
+ Maximum pointwise difference: 3.06548572
+ Max difference location: (0, 14, 1, 8)
+  Values at max diff - Original: -0.05504636, Converted: 3.01043940
+ Biggest difference in row (0, 14, 1), sum -1.444618 vs 6.230721
+
+Layer 13, Token 10 (recurrent cache comparison):
+  Original tensor sum: 176.983215
+  Converted tensor sum: 204.426437
+  Original tensor mean: 0.055307
+  Converted tensor mean: 0.063883
+ Mean difference: 0.10065258
+ Maximum pointwise difference: 1.83688605
+ Max difference location: (0, 26, 0, 4)
+  Values at max diff - Original: -0.00286533, Converted: 1.83402073
+ Biggest difference in row (0, 17, 8), sum 4.395949 vs 0.724224
+
+Layer 14, Token 10 (recurrent cache comparison):
+  Original tensor sum: 362.967407
+  Converted tensor sum: 429.969727
+  Original tensor mean: 0.113427
+  Converted tensor mean: 0.134366
+ Mean difference: 0.20180641
+ Maximum pointwise difference: 3.78999281
+ Max difference location: (0, 8, 9, 2)
+  Values at max diff - Original: -0.03249586, Converted: 3.75749683
+ Biggest difference in row (0, 8, 9), sum 0.437254 vs 14.025442
+
+Layer 0, Token 11 (recurrent cache comparison):
+  Original tensor sum: 1.054740
+  Converted tensor sum: -4.912385
+  Original tensor mean: 0.000330
+  Converted tensor mean: -0.001535
+ Mean difference: 0.06330946
+ Maximum pointwise difference: 0.92195946
+ Max difference location: (0, 4, 9, 1)
+  Values at max diff - Original: 0.89514881, Converted: -0.02681063
+ Biggest difference in row (0, 4, 9), sum 1.999353 vs 0.163843
+
+Layer 1, Token 11 (recurrent cache comparison):
+  Original tensor sum: 229.025497
+  Converted tensor sum: 120.378685
+  Original tensor mean: 0.071570
+  Converted tensor mean: 0.037618
+ Mean difference: 0.11386316
+ Maximum pointwise difference: 2.45059752
+ Max difference location: (0, 14, 7, 2)
+  Values at max diff - Original: 2.53569841, Converted: 0.08510098
+ Biggest difference in row (0, 16, 6), sum 5.812350 vs -0.022719
+
+Layer 2, Token 11 (recurrent cache comparison):
+  Original tensor sum: 158.621384
+  Converted tensor sum: 133.457428
+  Original tensor mean: 0.049569
+  Converted tensor mean: 0.041705
+ Mean difference: 0.14393179
+ Maximum pointwise difference: 2.77776694
+ Max difference location: (0, 12, 7, 9)
+  Values at max diff - Original: 2.95237303, Converted: 0.17460610
+ Biggest difference in row (0, 12, 7), sum 8.065367 vs 1.687768
+
+Layer 4, Token 11 (recurrent cache comparison):
+  Original tensor sum: 216.897552
+  Converted tensor sum: 241.688950
+  Original tensor mean: 0.067780
+  Converted tensor mean: 0.075528
+ Mean difference: 0.14223064
+ Maximum pointwise difference: 3.88969064
+ Max difference location: (0, 19, 2, 0)
+  Values at max diff - Original: 0.01694401, Converted: 3.90663457
+ Biggest difference in row (0, 19, 2), sum 0.437507 vs 8.962053
+
+Layer 5, Token 11 (recurrent cache comparison):
+  Original tensor sum: 252.265610
+  Converted tensor sum: 322.771881
+  Original tensor mean: 0.078833
+  Converted tensor mean: 0.100866
+ Mean difference: 0.17598768
+ Maximum pointwise difference: 7.97533512
+ Max difference location: (0, 28, 6, 9)
+  Values at max diff - Original: 0.35858834, Converted: 8.33392334
+ Biggest difference in row (0, 28, 6), sum 5.014431 vs 26.334686
+
+Layer 6, Token 11 (recurrent cache comparison):
+  Original tensor sum: 291.508423
+  Converted tensor sum: 433.311768
+  Original tensor mean: 0.091096
+  Converted tensor mean: 0.135410
+ Mean difference: 0.17094433
+ Maximum pointwise difference: 3.41666508
+ Max difference location: (0, 6, 4, 5)
+  Values at max diff - Original: 0.27297387, Converted: 3.68963885
+ Biggest difference in row (0, 14, 1), sum -0.165701 vs 10.544808
+
+Layer 8, Token 11 (recurrent cache comparison):
+  Original tensor sum: 215.415359
+  Converted tensor sum: 351.092529
+  Original tensor mean: 0.067317
+  Converted tensor mean: 0.109716
+ Mean difference: 0.18807893
+ Maximum pointwise difference: 3.95769572
+ Max difference location: (0, 23, 4, 7)
+  Values at max diff - Original: 3.95293593, Converted: -0.00475990
+ Biggest difference in row (0, 2, 4), sum 0.017769 vs 8.146402
+
+Layer 9, Token 11 (recurrent cache comparison):
+  Original tensor sum: 230.947296
+  Converted tensor sum: 244.599213
+  Original tensor mean: 0.072171
+  Converted tensor mean: 0.076437
+ Mean difference: 0.13342199
+ Maximum pointwise difference: 2.90320230
+ Max difference location: (0, 18, 3, 2)
+  Values at max diff - Original: -0.01862744, Converted: 2.88457489
+ Biggest difference in row (0, 28, 7), sum 8.403417 vs 1.460527
+
+Layer 10, Token 11 (recurrent cache comparison):
+  Original tensor sum: 271.779785
+  Converted tensor sum: 241.771790
+  Original tensor mean: 0.084931
+  Converted tensor mean: 0.075554
+ Mean difference: 0.15158509
+ Maximum pointwise difference: 3.77889895
+ Max difference location: (0, 0, 3, 7)
+  Values at max diff - Original: 4.08713722, Converted: 0.30823818
+ Biggest difference in row (0, 10, 4), sum 7.732811 vs 0.603564
+
+Layer 12, Token 11 (recurrent cache comparison):
+  Original tensor sum: 274.425629
+  Converted tensor sum: 286.277039
+  Original tensor mean: 0.085758
+  Converted tensor mean: 0.089462
+ Mean difference: 0.16393411
+ Maximum pointwise difference: 3.90725374
+ Max difference location: (0, 14, 1, 8)
+  Values at max diff - Original: 0.01574333, Converted: 3.92299700
+ Biggest difference in row (0, 23, 2), sum 10.560888 vs 1.081235
+
+Layer 13, Token 11 (recurrent cache comparison):
+  Original tensor sum: 212.238953
+  Converted tensor sum: 260.726898
+  Original tensor mean: 0.066325
+  Converted tensor mean: 0.081477
+ Mean difference: 0.12856843
+ Maximum pointwise difference: 3.76317525
+ Max difference location: (0, 17, 8, 2)
+  Values at max diff - Original: 4.56109810, Converted: 0.79792279
+ Biggest difference in row (0, 19, 1), sum 10.229995 vs 2.908604
+
+Layer 14, Token 11 (recurrent cache comparison):
+  Original tensor sum: 502.973511
+  Converted tensor sum: 568.935181
+  Original tensor mean: 0.157179
+  Converted tensor mean: 0.177792
+ Mean difference: 0.27989930
+ Maximum pointwise difference: 4.54578638
+ Max difference location: (0, 16, 7, 6)
+  Values at max diff - Original: 4.27132416, Converted: -0.27446240
+ Biggest difference in row (0, 21, 5), sum -0.168386 vs 13.477350
+
+Layer 0, Token 12 (recurrent cache comparison):
+  Original tensor sum: 4.252830
+  Converted tensor sum: -0.731128
+  Original tensor mean: 0.001329
+  Converted tensor mean: -0.000228
+ Mean difference: 0.06294378
+ Maximum pointwise difference: 1.78251398
+ Max difference location: (0, 1, 3, 2)
+  Values at max diff - Original: -0.00792313, Converted: 1.77459085
+ Biggest difference in row (0, 28, 5), sum 0.238817 vs 2.175461
+
+Layer 1, Token 12 (recurrent cache comparison):
+  Original tensor sum: 242.003052
+  Converted tensor sum: 66.457909
+  Original tensor mean: 0.075626
+  Converted tensor mean: 0.020768
+ Mean difference: 0.11966369
+ Maximum pointwise difference: 2.80864978
+ Max difference location: (0, 24, 0, 1)
+  Values at max diff - Original: 2.71780372, Converted: -0.09084603
+ Biggest difference in row (0, 14, 0), sum 5.513966 vs -0.057299
+
+Layer 2, Token 12 (recurrent cache comparison):
+  Original tensor sum: 212.836731
+  Converted tensor sum: 76.092499
+  Original tensor mean: 0.066511
+  Converted tensor mean: 0.023779
+ Mean difference: 0.14941603
+ Maximum pointwise difference: 2.88118339
+ Max difference location: (0, 12, 7, 0)
+  Values at max diff - Original: 2.70842910, Converted: -0.17275429
+ Biggest difference in row (0, 12, 7), sum 7.969865 vs 0.167881
+
+Layer 4, Token 12 (recurrent cache comparison):
+  Original tensor sum: 128.756699
+  Converted tensor sum: 154.911957
+  Original tensor mean: 0.040236
+  Converted tensor mean: 0.048410
+ Mean difference: 0.10618121
+ Maximum pointwise difference: 2.31433964
+ Max difference location: (0, 8, 1, 6)
+  Values at max diff - Original: 2.26328707, Converted: -0.05105254
+ Biggest difference in row (0, 25, 7), sum 3.269817 vs -0.397900
+
+Layer 5, Token 12 (recurrent cache comparison):
+  Original tensor sum: 176.745117
+  Converted tensor sum: 232.734680
+  Original tensor mean: 0.055233
+  Converted tensor mean: 0.072730
+ Mean difference: 0.13117053
+ Maximum pointwise difference: 4.35398436
+ Max difference location: (0, 28, 6, 9)
+  Values at max diff - Original: 0.18738972, Converted: 4.54137421
+ Biggest difference in row (0, 28, 6), sum 3.095334 vs 9.516649
+
+Layer 6, Token 12 (recurrent cache comparison):
+  Original tensor sum: 259.031647
+  Converted tensor sum: 428.069794
+  Original tensor mean: 0.080947
+  Converted tensor mean: 0.133772
+ Mean difference: 0.16942802
+ Maximum pointwise difference: 5.44846153
+ Max difference location: (0, 26, 9, 3)
+  Values at max diff - Original: -0.01164311, Converted: 5.43681860
+ Biggest difference in row (0, 6, 0), sum 0.994667 vs 12.910238
+
+Layer 8, Token 12 (recurrent cache comparison):
+  Original tensor sum: 221.930222
+  Converted tensor sum: 262.522369
+  Original tensor mean: 0.069353
+  Converted tensor mean: 0.082038
+ Mean difference: 0.17785330
+ Maximum pointwise difference: 4.14597464
+ Max difference location: (0, 21, 9, 9)
+  Values at max diff - Original: -0.07410901, Converted: 4.07186556
+ Biggest difference in row (0, 21, 9), sum -0.204344 vs 10.075971
+
+Layer 9, Token 12 (recurrent cache comparison):
+  Original tensor sum: 189.028931
+  Converted tensor sum: 238.029388
+  Original tensor mean: 0.059072
+  Converted tensor mean: 0.074384
+ Mean difference: 0.14264640
+ Maximum pointwise difference: 2.92814064
+ Max difference location: (0, 14, 1, 2)
+  Values at max diff - Original: -0.88447762, Converted: 2.04366302
+ Biggest difference in row (0, 28, 0), sum 1.806244 vs 7.562672
+
+Layer 10, Token 12 (recurrent cache comparison):
+  Original tensor sum: 236.811234
+  Converted tensor sum: 260.771973
+  Original tensor mean: 0.074004
+  Converted tensor mean: 0.081491
+ Mean difference: 0.15943669
+ Maximum pointwise difference: 5.29651165
+ Max difference location: (0, 24, 0, 1)
+  Values at max diff - Original: 0.03258384, Converted: 5.32909536
+ Biggest difference in row (0, 24, 0), sum 0.082025 vs 10.949675
+
+Layer 12, Token 12 (recurrent cache comparison):
+  Original tensor sum: 244.807922
+  Converted tensor sum: 314.705444
+  Original tensor mean: 0.076502
+  Converted tensor mean: 0.098345
+ Mean difference: 0.16864727
+ Maximum pointwise difference: 4.38556862
+ Max difference location: (0, 20, 3, 2)
+  Values at max diff - Original: -0.00896719, Converted: 4.37660122
+ Biggest difference in row (0, 28, 3), sum 10.509099 vs 0.169576
+
+Layer 13, Token 12 (recurrent cache comparison):
+  Original tensor sum: 195.554291
+  Converted tensor sum: 222.348053
+  Original tensor mean: 0.061111
+  Converted tensor mean: 0.069484
+ Mean difference: 0.13128105
+ Maximum pointwise difference: 3.68478298
+ Max difference location: (0, 17, 2, 8)
+  Values at max diff - Original: 0.00859472, Converted: 3.69337773
+ Biggest difference in row (0, 17, 2), sum 0.146146 vs 8.692631
+
+Layer 14, Token 12 (recurrent cache comparison):
+  Original tensor sum: 483.896393
+  Converted tensor sum: 527.955566
+  Original tensor mean: 0.151218
+  Converted tensor mean: 0.164986
+ Mean difference: 0.27409020
+ Maximum pointwise difference: 4.70396519
+ Max difference location: (0, 25, 4, 1)
+  Values at max diff - Original: -0.42079771, Converted: 4.28316736
+ Biggest difference in row (0, 16, 6), sum -0.041328 vs 13.549324
+
+Layer 0, Token 13 (recurrent cache comparison):
+  Original tensor sum: 1.659033
+  Converted tensor sum: -7.970642
+  Original tensor mean: 0.000518
+  Converted tensor mean: -0.002491
+ Mean difference: 0.07536316
+ Maximum pointwise difference: 1.29645300
+ Max difference location: (0, 4, 9, 1)
+  Values at max diff - Original: 1.30392849, Converted: 0.00747545
+ Biggest difference in row (0, 26, 3), sum -0.329301 vs -3.374216
+
+Layer 1, Token 13 (recurrent cache comparison):
+  Original tensor sum: 239.724915
+  Converted tensor sum: 79.675636
+  Original tensor mean: 0.074914
+  Converted tensor mean: 0.024899
+ Mean difference: 0.12407961
+ Maximum pointwise difference: 2.50358605
+ Max difference location: (0, 24, 0, 1)
+  Values at max diff - Original: 2.48077655, Converted: -0.02280946
+ Biggest difference in row (0, 14, 0), sum 6.016558 vs 0.013054
+
+Layer 2, Token 13 (recurrent cache comparison):
+  Original tensor sum: 247.626099
+  Converted tensor sum: 106.589592
+  Original tensor mean: 0.077383
+  Converted tensor mean: 0.033309
+ Mean difference: 0.15574569
+ Maximum pointwise difference: 3.29841137
+ Max difference location: (0, 4, 2, 8)
+  Values at max diff - Original: 3.44825506, Converted: 0.14984375
+ Biggest difference in row (0, 12, 7), sum 7.714676 vs 0.758271
+
+Layer 4, Token 13 (recurrent cache comparison):
+  Original tensor sum: 123.371284
+  Converted tensor sum: 126.859177
+  Original tensor mean: 0.038554
+  Converted tensor mean: 0.039643
+ Mean difference: 0.08389783
+ Maximum pointwise difference: 2.97862935
+ Max difference location: (0, 28, 2, 3)
+  Values at max diff - Original: 3.17326093, Converted: 0.19463167
+ Biggest difference in row (0, 28, 2), sum 4.464350 vs 0.493919
+
+Layer 5, Token 13 (recurrent cache comparison):
+  Original tensor sum: 147.258102
+  Converted tensor sum: 184.070984
+  Original tensor mean: 0.046018
+  Converted tensor mean: 0.057522
+ Mean difference: 0.10195178
+ Maximum pointwise difference: 2.96551919
+ Max difference location: (0, 28, 6, 9)
+  Values at max diff - Original: 0.10930623, Converted: 3.07482553
+ Biggest difference in row (0, 28, 6), sum 1.825548 vs 9.674469
+
+Layer 6, Token 13 (recurrent cache comparison):
+  Original tensor sum: 283.270142
+  Converted tensor sum: 448.314880
+  Original tensor mean: 0.088522
+  Converted tensor mean: 0.140098
+ Mean difference: 0.15905625
+ Maximum pointwise difference: 3.46541429
+ Max difference location: (0, 24, 8, 2)
+  Values at max diff - Original: -0.00599505, Converted: 3.45941925
+ Biggest difference in row (0, 6, 4), sum 3.774855 vs 11.804656
+
+Layer 8, Token 13 (recurrent cache comparison):
+  Original tensor sum: 241.112183
+  Converted tensor sum: 274.130127
+  Original tensor mean: 0.075348
+  Converted tensor mean: 0.085666
+ Mean difference: 0.10883617
+ Maximum pointwise difference: 4.01715469
+ Max difference location: (0, 21, 9, 9)
+  Values at max diff - Original: -0.09944591, Converted: 3.91770887
+ Biggest difference in row (0, 20, 9), sum -0.234172 vs 5.020240
+
+Layer 9, Token 13 (recurrent cache comparison):
+  Original tensor sum: 173.039688
+  Converted tensor sum: 214.021088
+  Original tensor mean: 0.054075
+  Converted tensor mean: 0.066882
+ Mean difference: 0.09634628
+ Maximum pointwise difference: 1.72028887
+ Max difference location: (0, 18, 6, 2)
+  Values at max diff - Original: 0.10816531, Converted: 1.82845414
+ Biggest difference in row (0, 18, 6), sum 0.705206 vs 4.642780
+
+Layer 10, Token 13 (recurrent cache comparison):
+  Original tensor sum: 213.873550
+  Converted tensor sum: 242.753281
+  Original tensor mean: 0.066835
+  Converted tensor mean: 0.075860
+ Mean difference: 0.09029815
+ Maximum pointwise difference: 1.41950274
+ Max difference location: (0, 11, 2, 6)
+  Values at max diff - Original: 0.20659086, Converted: 1.62609363
+ Biggest difference in row (0, 23, 5), sum 0.448223 vs 3.806486
+
+Layer 12, Token 13 (recurrent cache comparison):
+  Original tensor sum: 233.552292
+  Converted tensor sum: 296.583405
+  Original tensor mean: 0.072985
+  Converted tensor mean: 0.092682
+ Mean difference: 0.08977944
+ Maximum pointwise difference: 1.59837830
+ Max difference location: (0, 19, 7, 7)
+  Values at max diff - Original: 0.86349380, Converted: 2.46187210
+ Biggest difference in row (0, 4, 3), sum 5.997213 vs 0.558758
+
+Layer 13, Token 13 (recurrent cache comparison):
+  Original tensor sum: 172.634430
+  Converted tensor sum: 190.998459
+  Original tensor mean: 0.053948
+  Converted tensor mean: 0.059687
+ Mean difference: 0.07964972
+ Maximum pointwise difference: 2.45006180
+ Max difference location: (0, 26, 4, 0)
+  Values at max diff - Original: 2.51385903, Converted: 0.06379732
+ Biggest difference in row (0, 26, 4), sum 5.078406 vs 0.298857
+
+Layer 14, Token 13 (recurrent cache comparison):
+  Original tensor sum: 516.640808
+  Converted tensor sum: 514.890991
+  Original tensor mean: 0.161450
+  Converted tensor mean: 0.160903
+ Mean difference: 0.14294353
+ Maximum pointwise difference: 2.38266706
+ Max difference location: (0, 8, 9, 3)
+  Values at max diff - Original: 0.05516699, Converted: 2.43783402
+ Biggest difference in row (0, 8, 9), sum -0.157885 vs 10.688316
+
+Layer 0, Token 14 (recurrent cache comparison):
+  Original tensor sum: 5.983342
+  Converted tensor sum: -8.715725
+  Original tensor mean: 0.001870
+  Converted tensor mean: -0.002724
+ Mean difference: 0.07516728
+ Maximum pointwise difference: 1.55751526
+ Max difference location: (0, 25, 8, 2)
+  Values at max diff - Original: 1.57396424, Converted: 0.01644893
+ Biggest difference in row (0, 7, 3), sum 0.124509 vs -1.539357
+
+Layer 1, Token 14 (recurrent cache comparison):
+  Original tensor sum: 229.783936
+  Converted tensor sum: 87.531807
+  Original tensor mean: 0.071807
+  Converted tensor mean: 0.027354
+ Mean difference: 0.11559690
+ Maximum pointwise difference: 2.09234738
+ Max difference location: (0, 24, 0, 1)
+  Values at max diff - Original: 2.22317505, Converted: 0.13082767
+ Biggest difference in row (0, 14, 0), sum 5.592927 vs -0.441425
+
+Layer 2, Token 14 (recurrent cache comparison):
+  Original tensor sum: 268.945923
+  Converted tensor sum: 133.786499
+  Original tensor mean: 0.084046
+  Converted tensor mean: 0.041808
+ Mean difference: 0.16352382
+ Maximum pointwise difference: 2.87041712
+ Max difference location: (0, 11, 9, 6)
+  Values at max diff - Original: 2.94916487, Converted: 0.07874785
+ Biggest difference in row (0, 23, 9), sum 8.145676 vs 0.297307
+
+Layer 4, Token 14 (recurrent cache comparison):
+  Original tensor sum: 117.762733
+  Converted tensor sum: 118.469772
+  Original tensor mean: 0.036801
+  Converted tensor mean: 0.037022
+ Mean difference: 0.09650213
+ Maximum pointwise difference: 1.50842690
+ Max difference location: (0, 20, 6, 4)
+  Values at max diff - Original: 2.04159784, Converted: 0.53317100
+ Biggest difference in row (0, 20, 6), sum 4.022114 vs 0.957074
+
+Layer 5, Token 14 (recurrent cache comparison):
+  Original tensor sum: 128.612335
+  Converted tensor sum: 157.030731
+  Original tensor mean: 0.040191
+  Converted tensor mean: 0.049072
+ Mean difference: 0.10371025
+ Maximum pointwise difference: 2.24814534
+ Max difference location: (0, 8, 5, 9)
+  Values at max diff - Original: -0.03905072, Converted: 2.20909452
+ Biggest difference in row (0, 2, 5), sum 3.689715 vs 0.515908
+
+Layer 6, Token 14 (recurrent cache comparison):
+  Original tensor sum: 284.314667
+  Converted tensor sum: 446.866150
+  Original tensor mean: 0.088848
+  Converted tensor mean: 0.139646
+ Mean difference: 0.16138166
+ Maximum pointwise difference: 3.92217135
+ Max difference location: (0, 26, 9, 3)
+  Values at max diff - Original: -0.00824802, Converted: 3.91392326
+ Biggest difference in row (0, 6, 0), sum 1.467430 vs 14.063056
+
+Layer 8, Token 14 (recurrent cache comparison):
+  Original tensor sum: 268.420227
+  Converted tensor sum: 298.094666
+  Original tensor mean: 0.083881
+  Converted tensor mean: 0.093155
+ Mean difference: 0.17274044
+ Maximum pointwise difference: 3.57632637
+ Max difference location: (0, 21, 9, 9)
+  Values at max diff - Original: -0.07158025, Converted: 3.50474620
+ Biggest difference in row (0, 21, 9), sum -0.176140 vs 9.883745
+
+Layer 9, Token 14 (recurrent cache comparison):
+  Original tensor sum: 153.733398
+  Converted tensor sum: 197.629532
+  Original tensor mean: 0.048042
+  Converted tensor mean: 0.061759
+ Mean difference: 0.11230749
+ Maximum pointwise difference: 2.07441854
+ Max difference location: (0, 2, 6, 8)
+  Values at max diff - Original: -0.01318651, Converted: 2.06123209
+ Biggest difference in row (0, 28, 0), sum 1.515908 vs 6.081204
+
+Layer 10, Token 14 (recurrent cache comparison):
+  Original tensor sum: 196.466980
+  Converted tensor sum: 228.325546
+  Original tensor mean: 0.061396
+  Converted tensor mean: 0.071352
+ Mean difference: 0.11859564
+ Maximum pointwise difference: 4.73182058
+ Max difference location: (0, 24, 0, 1)
+  Values at max diff - Original: 0.02647224, Converted: 4.75829268
+ Biggest difference in row (0, 24, 0), sum 0.182875 vs 9.743350
+
+Layer 12, Token 14 (recurrent cache comparison):
+  Original tensor sum: 235.148682
+  Converted tensor sum: 279.831421
+  Original tensor mean: 0.073484
+  Converted tensor mean: 0.087447
+ Mean difference: 0.14844361
+ Maximum pointwise difference: 3.64688230
+ Max difference location: (0, 28, 4, 2)
+  Values at max diff - Original: 0.01143306, Converted: 3.65831542
+ Biggest difference in row (0, 28, 2), sum 9.410328 vs 0.407452
+
+Layer 13, Token 14 (recurrent cache comparison):
+  Original tensor sum: 165.774078
+  Converted tensor sum: 179.691483
+  Original tensor mean: 0.051804
+  Converted tensor mean: 0.056154
+ Mean difference: 0.09881324
+ Maximum pointwise difference: 3.03563190
+ Max difference location: (0, 11, 0, 4)
+  Values at max diff - Original: -0.10383722, Converted: 2.93179464
+ Biggest difference in row (0, 11, 0), sum 0.092786 vs 5.614193
+
+Layer 14, Token 14 (recurrent cache comparison):
+  Original tensor sum: 519.787109
+  Converted tensor sum: 539.567444
+  Original tensor mean: 0.162433
+  Converted tensor mean: 0.168615
+ Mean difference: 0.25360039
+ Maximum pointwise difference: 4.24835634
+ Max difference location: (0, 15, 8, 2)
+  Values at max diff - Original: -0.01945496, Converted: 4.22890139
+ Biggest difference in row (0, 16, 6), sum 0.069426 vs 10.617959
+
+Layer 0, Token 15 (recurrent cache comparison):
+  Original tensor sum: 1.143128
+  Converted tensor sum: 1.955431
+  Original tensor mean: 0.000357
+  Converted tensor mean: 0.000611
+ Mean difference: 0.06554744
+ Maximum pointwise difference: 1.62353444
+ Max difference location: (0, 1, 3, 2)
+  Values at max diff - Original: -0.04374466, Converted: 1.57978976
+ Biggest difference in row (0, 28, 5), sum 0.256206 vs 2.700654
+
+Layer 1, Token 15 (recurrent cache comparison):
+  Original tensor sum: 237.576813
+  Converted tensor sum: 84.227829
+  Original tensor mean: 0.074243
+  Converted tensor mean: 0.026321
+ Mean difference: 0.12017149
+ Maximum pointwise difference: 2.73136139
+ Max difference location: (0, 17, 6, 0)
+  Values at max diff - Original: 2.89759755, Converted: 0.16623622
+ Biggest difference in row (0, 16, 6), sum 6.557743 vs 0.035282
+
+Layer 2, Token 15 (recurrent cache comparison):
+  Original tensor sum: 311.619568
+  Converted tensor sum: 155.972748
+  Original tensor mean: 0.097381
+  Converted tensor mean: 0.048741
+ Mean difference: 0.17912415
+ Maximum pointwise difference: 3.15524197
+ Max difference location: (0, 12, 0, 0)
+  Values at max diff - Original: 3.30613947, Converted: 0.15089758
+ Biggest difference in row (0, 12, 0), sum 9.937962 vs 2.002455
+
+Layer 4, Token 15 (recurrent cache comparison):
+  Original tensor sum: 167.504608
+  Converted tensor sum: 97.213791
+  Original tensor mean: 0.052345
+  Converted tensor mean: 0.030379
+ Mean difference: 0.11675335
+ Maximum pointwise difference: 2.34569263
+ Max difference location: (0, 28, 2, 3)
+  Values at max diff - Original: 2.36823630, Converted: 0.02254373
+ Biggest difference in row (0, 24, 1), sum 4.970531 vs 0.552202
+
+Layer 5, Token 15 (recurrent cache comparison):
+  Original tensor sum: 165.098206
+  Converted tensor sum: 106.835938
+  Original tensor mean: 0.051593
+  Converted tensor mean: 0.033386
+ Mean difference: 0.11981978
+ Maximum pointwise difference: 3.00254560
+ Max difference location: (0, 19, 0, 4)
+  Values at max diff - Original: -0.04975805, Converted: 2.95278764
+ Biggest difference in row (0, 6, 7), sum 6.529483 vs 0.743666
+
+Layer 6, Token 15 (recurrent cache comparison):
+  Original tensor sum: 328.690277
+  Converted tensor sum: 322.121643
+  Original tensor mean: 0.102716
+  Converted tensor mean: 0.100663
+ Mean difference: 0.17106648
+ Maximum pointwise difference: 3.49930573
+ Max difference location: (0, 10, 4, 0)
+  Values at max diff - Original: -0.02206346, Converted: 3.47724223
+ Biggest difference in row (0, 10, 4), sum 0.599032 vs 10.579692
+
+Layer 8, Token 15 (recurrent cache comparison):
+  Original tensor sum: 317.911224
+  Converted tensor sum: 119.034622
+  Original tensor mean: 0.099347
+  Converted tensor mean: 0.037198
+ Mean difference: 0.17545381
+ Maximum pointwise difference: 5.58166885
+ Max difference location: (0, 12, 5, 9)
+  Values at max diff - Original: -0.00163084, Converted: 5.58003807
+ Biggest difference in row (0, 12, 5), sum -0.115539 vs 9.864284
+
+Layer 9, Token 15 (recurrent cache comparison):
+  Original tensor sum: 190.170853
+  Converted tensor sum: 99.272003
+  Original tensor mean: 0.059428
+  Converted tensor mean: 0.031023
+ Mean difference: 0.10875368
+ Maximum pointwise difference: 2.41038036
+ Max difference location: (0, 18, 2, 3)
+  Values at max diff - Original: 2.75146770, Converted: 0.34108725
+ Biggest difference in row (0, 18, 2), sum 8.039729 vs 1.441757
+
+Layer 10, Token 15 (recurrent cache comparison):
+  Original tensor sum: 224.608826
+  Converted tensor sum: 140.291000
+  Original tensor mean: 0.070190
+  Converted tensor mean: 0.043841
+ Mean difference: 0.13173704
+ Maximum pointwise difference: 3.69921541
+ Max difference location: (0, 0, 7, 3)
+  Values at max diff - Original: -0.00459916, Converted: 3.69461632
+ Biggest difference in row (0, 18, 3), sum 0.045224 vs 5.212623
+
+Layer 12, Token 15 (recurrent cache comparison):
+  Original tensor sum: 284.485657
+  Converted tensor sum: 158.051971
+  Original tensor mean: 0.088902
+  Converted tensor mean: 0.049391
+ Mean difference: 0.16240636
+ Maximum pointwise difference: 3.41311693
+ Max difference location: (0, 30, 4, 0)
+  Values at max diff - Original: -0.00378206, Converted: 3.40933490
+ Biggest difference in row (0, 28, 3), sum 10.288229 vs 0.646799
+
+Layer 13, Token 15 (recurrent cache comparison):
+  Original tensor sum: 217.891571
+  Converted tensor sum: 114.440430
+  Original tensor mean: 0.068091
+  Converted tensor mean: 0.035763
+ Mean difference: 0.11250080
+ Maximum pointwise difference: 2.57714581
+ Max difference location: (0, 3, 9, 4)
+  Values at max diff - Original: -0.00322327, Converted: 2.57392263
+ Biggest difference in row (0, 8, 1), sum 5.657505 vs 0.025426
+
+Layer 14, Token 15 (recurrent cache comparison):
+  Original tensor sum: 613.393188
+  Converted tensor sum: 259.209320
+  Original tensor mean: 0.191685
+  Converted tensor mean: 0.081003
+ Mean difference: 0.25669345
+ Maximum pointwise difference: 4.67302513
+ Max difference location: (0, 16, 6, 7)
+  Values at max diff - Original: 0.00394140, Converted: 4.67696667
+ Biggest difference in row (0, 16, 6), sum 0.113854 vs 11.389561
+
+Layer 0, Token 16 (recurrent cache comparison):
+  Original tensor sum: -7.241831
+  Converted tensor sum: 6.292229
+  Original tensor mean: -0.002263
+  Converted tensor mean: 0.001966
+ Mean difference: 0.07260455
+ Maximum pointwise difference: 1.56294525
+ Max difference location: (0, 4, 9, 1)
+  Values at max diff - Original: 1.55768764, Converted: -0.00525762
+ Biggest difference in row (0, 4, 9), sum 3.422554 vs -0.082252
+
+Layer 1, Token 16 (recurrent cache comparison):
+  Original tensor sum: 208.371277
+  Converted tensor sum: 121.900169
+  Original tensor mean: 0.065116
+  Converted tensor mean: 0.038094
+ Mean difference: 0.10988435
+ Maximum pointwise difference: 2.54077005
+ Max difference location: (0, 16, 6, 1)
+  Values at max diff - Original: 2.44506192, Converted: -0.09570823
+ Biggest difference in row (0, 16, 6), sum 5.495286 vs 0.369152
+
+Layer 2, Token 16 (recurrent cache comparison):
+  Original tensor sum: 271.274109
+  Converted tensor sum: 250.062592
+  Original tensor mean: 0.084773
+  Converted tensor mean: 0.078145
+ Mean difference: 0.18668148
+ Maximum pointwise difference: 3.97749329
+ Max difference location: (0, 4, 8, 2)
+  Values at max diff - Original: 0.00367373, Converted: 3.98116708
+ Biggest difference in row (0, 4, 8), sum 0.084576 vs 8.366636
+
+Layer 4, Token 16 (recurrent cache comparison):
+  Original tensor sum: 245.506393
+  Converted tensor sum: 128.042282
+  Original tensor mean: 0.076721
+  Converted tensor mean: 0.040013
+ Mean difference: 0.13813969
+ Maximum pointwise difference: 2.50754499
+ Max difference location: (0, 27, 2, 5)
+  Values at max diff - Original: 2.48510361, Converted: -0.02244142
+ Biggest difference in row (0, 30, 3), sum 6.143555 vs -0.003137
+
+Layer 5, Token 16 (recurrent cache comparison):
+  Original tensor sum: 252.541031
+  Converted tensor sum: 153.491074
+  Original tensor mean: 0.078919
+  Converted tensor mean: 0.047966
+ Mean difference: 0.13783714
+ Maximum pointwise difference: 4.67899084
+ Max difference location: (0, 6, 2, 9)
+  Values at max diff - Original: 4.74959278, Converted: 0.07060210
+ Biggest difference in row (0, 6, 2), sum 15.435174 vs 0.669571
+
+Layer 6, Token 16 (recurrent cache comparison):
+  Original tensor sum: 417.031616
+  Converted tensor sum: 302.490662
+  Original tensor mean: 0.130322
+  Converted tensor mean: 0.094528
+ Mean difference: 0.18095936
+ Maximum pointwise difference: 3.41091108
+ Max difference location: (0, 1, 9, 8)
+  Values at max diff - Original: 3.94837856, Converted: 0.53746736
+ Biggest difference in row (0, 17, 7), sum 10.598001 vs 1.553886
+
+Layer 8, Token 16 (recurrent cache comparison):
+  Original tensor sum: 360.497803
+  Converted tensor sum: 167.798264
+  Original tensor mean: 0.112656
+  Converted tensor mean: 0.052437
+ Mean difference: 0.18179806
+ Maximum pointwise difference: 4.85258770
+ Max difference location: (0, 20, 6, 7)
+  Values at max diff - Original: 3.78496194, Converted: -1.06762552
+ Biggest difference in row (0, 20, 6), sum 7.293591 vs -2.448533
+
+Layer 9, Token 16 (recurrent cache comparison):
+  Original tensor sum: 231.574097
+  Converted tensor sum: 117.788071
+  Original tensor mean: 0.072367
+  Converted tensor mean: 0.036809
+ Mean difference: 0.12296900
+ Maximum pointwise difference: 1.94617844
+ Max difference location: (0, 18, 2, 3)
+  Values at max diff - Original: 2.51620770, Converted: 0.57002932
+ Biggest difference in row (0, 18, 2), sum 7.408888 vs 2.509162
+
+Layer 10, Token 16 (recurrent cache comparison):
+  Original tensor sum: 251.412247
+  Converted tensor sum: 167.548752
+  Original tensor mean: 0.078566
+  Converted tensor mean: 0.052359
+ Mean difference: 0.13002089
+ Maximum pointwise difference: 2.56599689
+ Max difference location: (0, 24, 1, 0)
+  Values at max diff - Original: 4.14129448, Converted: 1.57529759
+ Biggest difference in row (0, 14, 2), sum 5.702995 vs 0.022515
+
+Layer 12, Token 16 (recurrent cache comparison):
+  Original tensor sum: 309.263367
+  Converted tensor sum: 172.743027
+  Original tensor mean: 0.096645
+  Converted tensor mean: 0.053982
+ Mean difference: 0.16015999
+ Maximum pointwise difference: 4.55992699
+ Max difference location: (0, 28, 3, 4)
+  Values at max diff - Original: 3.40088701, Converted: -1.15903974
+ Biggest difference in row (0, 28, 3), sum 10.782799 vs -1.738761
+
+Layer 13, Token 16 (recurrent cache comparison):
+  Original tensor sum: 245.305267
+  Converted tensor sum: 135.343552
+  Original tensor mean: 0.076658
+  Converted tensor mean: 0.042295
+ Mean difference: 0.11650297
+ Maximum pointwise difference: 2.94789600
+ Max difference location: (0, 11, 4, 0)
+  Values at max diff - Original: 3.34942126, Converted: 0.40152529
+ Biggest difference in row (0, 27, 4), sum 6.619488 vs 0.377767
+
+Layer 14, Token 16 (recurrent cache comparison):
+  Original tensor sum: 677.616821
+  Converted tensor sum: 309.657593
+  Original tensor mean: 0.211755
+  Converted tensor mean: 0.096768
+ Mean difference: 0.25261062
+ Maximum pointwise difference: 4.12457132
+ Max difference location: (0, 21, 3, 5)
+  Values at max diff - Original: 4.07018948, Converted: -0.05438172
+ Biggest difference in row (0, 21, 3), sum 12.550769 vs -0.320660
+
+Layer 0, Token 17 (recurrent cache comparison):
+  Original tensor sum: 8.814422
+  Converted tensor sum: 2.569008
+  Original tensor mean: 0.002755
+  Converted tensor mean: 0.000803
+ Mean difference: 0.07054429
+ Maximum pointwise difference: 2.09221244
+ Max difference location: (0, 1, 2, 3)
+  Values at max diff - Original: 2.03968024, Converted: -0.05253213
+ Biggest difference in row (0, 17, 2), sum 2.854507 vs 0.425217
+
+Layer 1, Token 17 (recurrent cache comparison):
+  Original tensor sum: 202.785217
+  Converted tensor sum: 127.821655
+  Original tensor mean: 0.063370
+  Converted tensor mean: 0.039944
+ Mean difference: 0.11817915
+ Maximum pointwise difference: 2.18196273
+ Max difference location: (0, 23, 4, 0)
+  Values at max diff - Original: 0.00466894, Converted: 2.18663168
+ Biggest difference in row (0, 23, 4), sum 1.189118 vs 6.664180
+
+Layer 2, Token 17 (recurrent cache comparison):
+  Original tensor sum: 269.547241
+  Converted tensor sum: 202.949875
+  Original tensor mean: 0.084234
+  Converted tensor mean: 0.063422
+ Mean difference: 0.17686243
+ Maximum pointwise difference: 3.38580871
+ Max difference location: (0, 30, 3, 9)
+  Values at max diff - Original: -0.03989490, Converted: 3.34591389
+ Biggest difference in row (0, 23, 4), sum 0.959554 vs 6.602069
+
+Layer 4, Token 17 (recurrent cache comparison):
+  Original tensor sum: 285.057709
+  Converted tensor sum: 90.890617
+  Original tensor mean: 0.089081
+  Converted tensor mean: 0.028403
+ Mean difference: 0.14633463
+ Maximum pointwise difference: 3.59569287
+ Max difference location: (0, 19, 2, 9)
+  Values at max diff - Original: 0.11129396, Converted: 3.70698690
+ Biggest difference in row (0, 24, 1), sum 6.665072 vs 0.069785
+
+Layer 5, Token 17 (recurrent cache comparison):
+  Original tensor sum: 305.935303
+  Converted tensor sum: 101.421249
+  Original tensor mean: 0.095605
+  Converted tensor mean: 0.031694
+ Mean difference: 0.15904053
+ Maximum pointwise difference: 2.52599096
+ Max difference location: (0, 6, 2, 9)
+  Values at max diff - Original: 2.51262259, Converted: -0.01336834
+ Biggest difference in row (0, 6, 2), sum 10.206850 vs 1.778209
+
+Layer 6, Token 17 (recurrent cache comparison):
+  Original tensor sum: 411.833740
+  Converted tensor sum: 250.492935
+  Original tensor mean: 0.128698
+  Converted tensor mean: 0.078279
+ Mean difference: 0.18581259
+ Maximum pointwise difference: 4.02491474
+ Max difference location: (0, 17, 7, 5)
+  Values at max diff - Original: 4.38884020, Converted: 0.36392546
+ Biggest difference in row (0, 17, 7), sum 11.349621 vs 0.846145
+
+Layer 8, Token 17 (recurrent cache comparison):
+  Original tensor sum: 373.165680
+  Converted tensor sum: 136.027786
+  Original tensor mean: 0.116614
+  Converted tensor mean: 0.042509
+ Mean difference: 0.18740444
+ Maximum pointwise difference: 4.54259586
+ Max difference location: (0, 20, 0, 7)
+  Values at max diff - Original: 4.54873943, Converted: 0.00614343
+ Biggest difference in row (0, 7, 2), sum 8.000880 vs 0.043800
+
+Layer 9, Token 17 (recurrent cache comparison):
+  Original tensor sum: 239.737335
+  Converted tensor sum: 91.044197
+  Original tensor mean: 0.074918
+  Converted tensor mean: 0.028451
+ Mean difference: 0.11736859
+ Maximum pointwise difference: 1.98427892
+ Max difference location: (0, 14, 2, 1)
+  Values at max diff - Original: 1.90727878, Converted: -0.07700008
+ Biggest difference in row (0, 28, 7), sum 5.596577 vs -0.058259
+
+Layer 10, Token 17 (recurrent cache comparison):
+  Original tensor sum: 260.470673
+  Converted tensor sum: 162.895706
+  Original tensor mean: 0.081397
+  Converted tensor mean: 0.050905
+ Mean difference: 0.14167482
+ Maximum pointwise difference: 3.23060656
+ Max difference location: (0, 24, 1, 0)
+  Values at max diff - Original: 4.23022413, Converted: 0.99961770
+ Biggest difference in row (0, 24, 0), sum 0.125982 vs 9.195232
+
+Layer 12, Token 17 (recurrent cache comparison):
+  Original tensor sum: 321.268158
+  Converted tensor sum: 134.452438
+  Original tensor mean: 0.100396
+  Converted tensor mean: 0.042016
+ Mean difference: 0.17344666
+ Maximum pointwise difference: 4.15682602
+ Max difference location: (0, 14, 8, 1)
+  Values at max diff - Original: 4.38615370, Converted: 0.22932746
+ Biggest difference in row (0, 28, 3), sum 11.304427 vs 0.427086
+
+Layer 13, Token 17 (recurrent cache comparison):
+  Original tensor sum: 255.942596
+  Converted tensor sum: 107.501419
+  Original tensor mean: 0.079982
+  Converted tensor mean: 0.033594
+ Mean difference: 0.11964211
+ Maximum pointwise difference: 2.72310257
+ Max difference location: (0, 11, 4, 0)
+  Values at max diff - Original: 3.71963763, Converted: 0.99653512
+ Biggest difference in row (0, 27, 4), sum 6.949797 vs 0.279431
+
+Layer 14, Token 17 (recurrent cache comparison):
+  Original tensor sum: 718.971008
+  Converted tensor sum: 252.775909
+  Original tensor mean: 0.224678
+  Converted tensor mean: 0.078992
+ Mean difference: 0.28457019
+ Maximum pointwise difference: 4.54859781
+ Max difference location: (0, 5, 8, 9)
+  Values at max diff - Original: -0.00566958, Converted: 4.54292822
+ Biggest difference in row (0, 6, 1), sum 11.820190 vs 0.083275
+
+Layer 0, Token 18 (recurrent cache comparison):
+  Original tensor sum: -2.745796
+  Converted tensor sum: 11.596529
+  Original tensor mean: -0.000858
+  Converted tensor mean: 0.003624
+ Mean difference: 0.06698289
+ Maximum pointwise difference: 1.30398095
+ Max difference location: (0, 1, 2, 3)
+  Values at max diff - Original: 1.23424304, Converted: -0.06973789
+ Biggest difference in row (0, 25, 7), sum -1.491066 vs 0.107394
+
+Layer 1, Token 18 (recurrent cache comparison):
+  Original tensor sum: 196.680084
+  Converted tensor sum: 112.820984
+  Original tensor mean: 0.061463
+  Converted tensor mean: 0.035257
+ Mean difference: 0.11424790
+ Maximum pointwise difference: 1.90677047
+ Max difference location: (0, 14, 2, 3)
+  Values at max diff - Original: 0.10773923, Converted: 2.01450968
+ Biggest difference in row (0, 24, 3), sum 5.388914 vs 0.084538
+
+Layer 2, Token 18 (recurrent cache comparison):
+  Original tensor sum: 269.808228
+  Converted tensor sum: 106.268402
+  Original tensor mean: 0.084315
+  Converted tensor mean: 0.033209
+ Mean difference: 0.16576965
+ Maximum pointwise difference: 2.41004586
+ Max difference location: (0, 12, 0, 0)
+  Values at max diff - Original: 2.62151933, Converted: 0.21147355
+ Biggest difference in row (0, 12, 0), sum 7.396654 vs 0.148190
+
+Layer 4, Token 18 (recurrent cache comparison):
+  Original tensor sum: 299.541138
+  Converted tensor sum: 34.684372
+  Original tensor mean: 0.093607
+  Converted tensor mean: 0.010839
+ Mean difference: 0.15344296
+ Maximum pointwise difference: 4.97097397
+ Max difference location: (0, 27, 8, 5)
+  Values at max diff - Original: 4.93650246, Converted: -0.03447145
+ Biggest difference in row (0, 27, 8), sum 10.168988 vs 0.095367
+
+Layer 5, Token 18 (recurrent cache comparison):
+  Original tensor sum: 322.520721
+  Converted tensor sum: 32.353989
+  Original tensor mean: 0.100788
+  Converted tensor mean: 0.010111
+ Mean difference: 0.15857503
+ Maximum pointwise difference: 3.27807403
+ Max difference location: (0, 28, 9, 6)
+  Values at max diff - Original: 3.39260817, Converted: 0.11453414
+ Biggest difference in row (0, 31, 7), sum 9.317598 vs 0.060667
+
+Layer 6, Token 18 (recurrent cache comparison):
+  Original tensor sum: 404.272705
+  Converted tensor sum: 105.430817
+  Original tensor mean: 0.126335
+  Converted tensor mean: 0.032947
+ Mean difference: 0.18362552
+ Maximum pointwise difference: 4.36808205
+ Max difference location: (0, 6, 5, 4)
+  Values at max diff - Original: 4.63004971, Converted: 0.26196742
+ Biggest difference in row (0, 30, 4), sum 12.429064 vs 1.549177
+
+Layer 8, Token 18 (recurrent cache comparison):
+  Original tensor sum: 379.120117
+  Converted tensor sum: 49.316475
+  Original tensor mean: 0.118475
+  Converted tensor mean: 0.015411
+ Mean difference: 0.18690227
+ Maximum pointwise difference: 4.34863997
+ Max difference location: (0, 20, 0, 7)
+  Values at max diff - Original: 4.50196075, Converted: 0.15332088
+ Biggest difference in row (0, 7, 2), sum 8.701149 vs -1.880803
+
+Layer 9, Token 18 (recurrent cache comparison):
+  Original tensor sum: 247.687454
+  Converted tensor sum: 31.604210
+  Original tensor mean: 0.077402
+  Converted tensor mean: 0.009876
+ Mean difference: 0.12334745
+ Maximum pointwise difference: 2.89748645
+ Max difference location: (0, 14, 2, 1)
+  Values at max diff - Original: 2.54342103, Converted: -0.35406536
+ Biggest difference in row (0, 9, 8), sum 5.984664 vs -0.341670
+
+Layer 10, Token 18 (recurrent cache comparison):
+  Original tensor sum: 262.752014
+  Converted tensor sum: 52.628201
+  Original tensor mean: 0.082110
+  Converted tensor mean: 0.016446
+ Mean difference: 0.13161205
+ Maximum pointwise difference: 2.92723370
+ Max difference location: (0, 24, 1, 0)
+  Values at max diff - Original: 4.35996389, Converted: 1.43273032
+ Biggest difference in row (0, 11, 6), sum 6.418620 vs 0.589213
+
+Layer 12, Token 18 (recurrent cache comparison):
+  Original tensor sum: 326.667419
+  Converted tensor sum: 31.792521
+  Original tensor mean: 0.102084
+  Converted tensor mean: 0.009935
+ Mean difference: 0.17550385
+ Maximum pointwise difference: 4.50774860
+ Max difference location: (0, 14, 8, 1)
+  Values at max diff - Original: 4.50715399, Converted: -0.00059444
+ Biggest difference in row (0, 21, 9), sum 0.167931 vs -15.009873
+
+Layer 13, Token 18 (recurrent cache comparison):
+  Original tensor sum: 261.870972
+  Converted tensor sum: 53.651596
+  Original tensor mean: 0.081835
+  Converted tensor mean: 0.016766
+ Mean difference: 0.12234001
+ Maximum pointwise difference: 4.01087809
+ Max difference location: (0, 11, 4, 0)
+  Values at max diff - Original: 4.08570528, Converted: 0.07482710
+ Biggest difference in row (0, 20, 0), sum 2.293484 vs -4.637159
+
+Layer 14, Token 18 (recurrent cache comparison):
+  Original tensor sum: 740.518921
+  Converted tensor sum: 84.074921
+  Original tensor mean: 0.231412
+  Converted tensor mean: 0.026273
+ Mean difference: 0.28319737
+ Maximum pointwise difference: 4.63366222
+ Max difference location: (0, 1, 4, 6)
+  Values at max diff - Original: -0.00748948, Converted: 4.62617254
+ Biggest difference in row (0, 6, 1), sum 12.889781 vs -0.226667
+
+Layer 0, Token 19 (recurrent cache comparison):
+  Original tensor sum: 0.866719
+  Converted tensor sum: 13.915674
+  Original tensor mean: 0.000271
+  Converted tensor mean: 0.004349
+ Mean difference: 0.05563419
+ Maximum pointwise difference: 1.58602941
+ Max difference location: (0, 1, 5, 3)
+  Values at max diff - Original: 1.50699055, Converted: -0.07903884
+ Biggest difference in row (0, 28, 5), sum 0.233465 vs 1.374955
+
+Layer 1, Token 19 (recurrent cache comparison):
+  Original tensor sum: 143.055450
+  Converted tensor sum: 84.285873
+  Original tensor mean: 0.044705
+  Converted tensor mean: 0.026339
+ Mean difference: 0.11436888
+ Maximum pointwise difference: 2.11188436
+ Max difference location: (0, 15, 8, 5)
+  Values at max diff - Original: -0.06675819, Converted: 2.04512620
+ Biggest difference in row (0, 23, 4), sum 0.445206 vs 4.313503
+
+Layer 2, Token 19 (recurrent cache comparison):
+  Original tensor sum: 206.674835
+  Converted tensor sum: 69.739983
+  Original tensor mean: 0.064586
+  Converted tensor mean: 0.021794
+ Mean difference: 0.14624587
+ Maximum pointwise difference: 2.46052098
+ Max difference location: (0, 5, 4, 5)
+  Values at max diff - Original: 2.46177387, Converted: 0.00125289
+ Biggest difference in row (0, 23, 9), sum 5.872013 vs -0.147400
+
+Layer 4, Token 19 (recurrent cache comparison):
+  Original tensor sum: 223.180557
+  Converted tensor sum: 57.034431
+  Original tensor mean: 0.069744
+  Converted tensor mean: 0.017823
+ Mean difference: 0.13700224
+ Maximum pointwise difference: 4.09037542
+ Max difference location: (0, 25, 1, 9)
+  Values at max diff - Original: 3.97389102, Converted: -0.11648450
+ Biggest difference in row (0, 24, 1), sum 6.574383 vs 0.271665
+
+Layer 5, Token 19 (recurrent cache comparison):
+  Original tensor sum: 315.655853
+  Converted tensor sum: 48.647461
+  Original tensor mean: 0.098642
+  Converted tensor mean: 0.015202
+ Mean difference: 0.15315701
+ Maximum pointwise difference: 6.10414743
+ Max difference location: (0, 28, 9, 6)
+  Values at max diff - Original: 6.29615974, Converted: 0.19201221
+ Biggest difference in row (0, 28, 9), sum 11.702868 vs 0.168917
+
+Layer 6, Token 19 (recurrent cache comparison):
+  Original tensor sum: 358.473572
+  Converted tensor sum: 101.158226
+  Original tensor mean: 0.112023
+  Converted tensor mean: 0.031612
+ Mean difference: 0.16535039
+ Maximum pointwise difference: 3.82374835
+ Max difference location: (0, 18, 1, 7)
+  Values at max diff - Original: 3.88149524, Converted: 0.05774695
+ Biggest difference in row (0, 20, 9), sum 9.851446 vs -0.224849
+
+Layer 8, Token 19 (recurrent cache comparison):
+  Original tensor sum: 346.821899
+  Converted tensor sum: 80.751968
+  Original tensor mean: 0.108382
+  Converted tensor mean: 0.025235
+ Mean difference: 0.17743167
+ Maximum pointwise difference: 3.78403044
+ Max difference location: (0, 20, 0, 7)
+  Values at max diff - Original: 3.89911222, Converted: 0.11508182
+ Biggest difference in row (0, 7, 2), sum 9.025558 vs -0.167117
+
+Layer 9, Token 19 (recurrent cache comparison):
+  Original tensor sum: 249.268311
+  Converted tensor sum: 43.202286
+  Original tensor mean: 0.077896
+  Converted tensor mean: 0.013501
+ Mean difference: 0.12318792
+ Maximum pointwise difference: 2.83834696
+ Max difference location: (0, 9, 8, 6)
+  Values at max diff - Original: 2.71989083, Converted: -0.11845621
+ Biggest difference in row (0, 9, 8), sum 8.573050 vs -0.169431
+
+Layer 10, Token 19 (recurrent cache comparison):
+  Original tensor sum: 291.462646
+  Converted tensor sum: 66.798782
+  Original tensor mean: 0.091082
+  Converted tensor mean: 0.020875
+ Mean difference: 0.14087133
+ Maximum pointwise difference: 3.38042760
+ Max difference location: (0, 25, 1, 9)
+  Values at max diff - Original: 3.51948309, Converted: 0.13905543
+ Biggest difference in row (0, 25, 1), sum 8.942734 vs 0.065733
+
+Layer 12, Token 19 (recurrent cache comparison):
+  Original tensor sum: 342.570038
+  Converted tensor sum: 48.484200
+  Original tensor mean: 0.107053
+  Converted tensor mean: 0.015151
+ Mean difference: 0.17410682
+ Maximum pointwise difference: 4.36208725
+ Max difference location: (0, 14, 8, 1)
+  Values at max diff - Original: 4.34435558, Converted: -0.01773176
+ Biggest difference in row (0, 28, 3), sum 11.628893 vs -0.798577
+
+Layer 13, Token 19 (recurrent cache comparison):
+  Original tensor sum: 270.129211
+  Converted tensor sum: 52.121815
+  Original tensor mean: 0.084415
+  Converted tensor mean: 0.016288
+ Mean difference: 0.12223634
+ Maximum pointwise difference: 3.81266069
+ Max difference location: (0, 11, 4, 0)
+  Values at max diff - Original: 4.31173086, Converted: 0.49907014
+ Biggest difference in row (0, 27, 4), sum 5.975472 vs -0.026263
+
+Layer 14, Token 19 (recurrent cache comparison):
+  Original tensor sum: 772.850342
+  Converted tensor sum: 107.083702
+  Original tensor mean: 0.241516
+  Converted tensor mean: 0.033464
+ Mean difference: 0.28851181
+ Maximum pointwise difference: 4.31482410
+ Max difference location: (0, 28, 4, 1)
+  Values at max diff - Original: 4.32322884, Converted: 0.00840468
+ Biggest difference in row (0, 14, 2), sum 14.072536 vs 0.377507
+
+Layer 0, Token 20 (recurrent cache comparison):
+  Original tensor sum: 2.008890
+  Converted tensor sum: 12.614055
+  Original tensor mean: 0.000628
+  Converted tensor mean: 0.003942
+ Mean difference: 0.05576663
+ Maximum pointwise difference: 1.29991353
+ Max difference location: (0, 1, 5, 3)
+  Values at max diff - Original: 1.36800277, Converted: 0.06808926
+ Biggest difference in row (0, 23, 6), sum 0.260241 vs -1.370477
+
+Layer 1, Token 20 (recurrent cache comparison):
+  Original tensor sum: 58.587276
+  Converted tensor sum: 76.507767
+  Original tensor mean: 0.018309
+  Converted tensor mean: 0.023909
+ Mean difference: 0.10026859
+ Maximum pointwise difference: 2.19443369
+ Max difference location: (0, 14, 2, 2)
+  Values at max diff - Original: -0.11835258, Converted: 2.07608104
+ Biggest difference in row (0, 14, 2), sum -0.449485 vs 3.433519
+
+Layer 2, Token 20 (recurrent cache comparison):
+  Original tensor sum: 165.744568
+  Converted tensor sum: 64.695602
+  Original tensor mean: 0.051795
+  Converted tensor mean: 0.020217
+ Mean difference: 0.14529096
+ Maximum pointwise difference: 2.04155922
+ Max difference location: (0, 5, 4, 5)
+  Values at max diff - Original: 2.00637627, Converted: -0.03518293
+ Biggest difference in row (0, 5, 4), sum 4.793974 vs -0.065828
+
+Layer 4, Token 20 (recurrent cache comparison):
+  Original tensor sum: 212.915298
+  Converted tensor sum: 76.568939
+  Original tensor mean: 0.066536
+  Converted tensor mean: 0.023928
+ Mean difference: 0.12807344
+ Maximum pointwise difference: 3.65112019
+ Max difference location: (0, 25, 1, 9)
+  Values at max diff - Original: 3.82295465, Converted: 0.17183457
+ Biggest difference in row (0, 20, 4), sum 0.743454 vs 6.637871
+
+Layer 5, Token 20 (recurrent cache comparison):
+  Original tensor sum: 258.077209
+  Converted tensor sum: 49.652397
+  Original tensor mean: 0.080649
+  Converted tensor mean: 0.015516
+ Mean difference: 0.13090378
+ Maximum pointwise difference: 2.72355151
+ Max difference location: (0, 28, 9, 6)
+  Values at max diff - Original: 2.71506453, Converted: -0.00848696
+ Biggest difference in row (0, 28, 9), sum 6.250334 vs 0.160866
+
+Layer 6, Token 20 (recurrent cache comparison):
+  Original tensor sum: 336.431519
+  Converted tensor sum: 141.819733
+  Original tensor mean: 0.105135
+  Converted tensor mean: 0.044319
+ Mean difference: 0.16430938
+ Maximum pointwise difference: 3.65949225
+ Max difference location: (0, 6, 5, 4)
+  Values at max diff - Original: 3.87317371, Converted: 0.21368141
+ Biggest difference in row (0, 12, 1), sum 12.053196 vs 2.254734
+
+Layer 8, Token 20 (recurrent cache comparison):
+  Original tensor sum: 345.424561
+  Converted tensor sum: 112.814018
+  Original tensor mean: 0.107945
+  Converted tensor mean: 0.035254
+ Mean difference: 0.17631440
+ Maximum pointwise difference: 3.36074710
+ Max difference location: (0, 20, 0, 7)
+  Values at max diff - Original: 3.50376892, Converted: 0.14302187
+ Biggest difference in row (0, 21, 0), sum 8.499396 vs 0.480686
+
+Layer 9, Token 20 (recurrent cache comparison):
+  Original tensor sum: 261.041870
+  Converted tensor sum: 41.182373
+  Original tensor mean: 0.081576
+  Converted tensor mean: 0.012869
+ Mean difference: 0.12376894
+ Maximum pointwise difference: 2.65249300
+ Max difference location: (0, 18, 2, 3)
+  Values at max diff - Original: 2.77233696, Converted: 0.11984408
+ Biggest difference in row (0, 9, 8), sum 7.305106 vs -0.383589
+
+Layer 10, Token 20 (recurrent cache comparison):
+  Original tensor sum: 276.296692
+  Converted tensor sum: 75.410934
+  Original tensor mean: 0.086343
+  Converted tensor mean: 0.023566
+ Mean difference: 0.12437831
+ Maximum pointwise difference: 2.84117389
+ Max difference location: (0, 25, 1, 9)
+  Values at max diff - Original: 3.23575449, Converted: 0.39458057
+ Biggest difference in row (0, 25, 1), sum 8.240932 vs 1.044036
+
+Layer 12, Token 20 (recurrent cache comparison):
+  Original tensor sum: 345.097260
+  Converted tensor sum: 53.731094
+  Original tensor mean: 0.107843
+  Converted tensor mean: 0.016791
+ Mean difference: 0.17168441
+ Maximum pointwise difference: 4.60863352
+ Max difference location: (0, 14, 8, 1)
+  Values at max diff - Original: 4.63144016, Converted: 0.02280665
+ Biggest difference in row (0, 28, 3), sum 11.591027 vs 0.333645
+
+Layer 13, Token 20 (recurrent cache comparison):
+  Original tensor sum: 253.047394
+  Converted tensor sum: 40.628811
+  Original tensor mean: 0.079077
+  Converted tensor mean: 0.012697
+ Mean difference: 0.11498150
+ Maximum pointwise difference: 4.22373772
+ Max difference location: (0, 11, 4, 0)
+  Values at max diff - Original: 4.22381639, Converted: 0.00007845
+ Biggest difference in row (0, 27, 4), sum 6.304989 vs 0.143700
+
+Layer 14, Token 20 (recurrent cache comparison):
+  Original tensor sum: 769.098083
+  Converted tensor sum: 130.283981
+  Original tensor mean: 0.240343
+  Converted tensor mean: 0.040714
+ Mean difference: 0.28381503
+ Maximum pointwise difference: 4.93393469
+ Max difference location: (0, 28, 4, 1)
+  Values at max diff - Original: 4.91371727, Converted: -0.02021729
+ Biggest difference in row (0, 6, 1), sum 14.151162 vs 0.315893
+
+Layer 0, Token 21 (recurrent cache comparison):
+  Original tensor sum: 1.077594
+  Converted tensor sum: 15.438447
+  Original tensor mean: 0.000337
+  Converted tensor mean: 0.004825
+ Mean difference: 0.05193665
+ Maximum pointwise difference: 0.74260694
+ Max difference location: (0, 28, 8, 5)
+  Values at max diff - Original: 0.72446448, Converted: -0.01814246
+ Biggest difference in row (0, 23, 8), sum -0.100890 vs -1.090759
+
+Layer 1, Token 21 (recurrent cache comparison):
+  Original tensor sum: 28.594997
+  Converted tensor sum: 89.290833
+  Original tensor mean: 0.008936
+  Converted tensor mean: 0.027903
+ Mean difference: 0.10794319
+ Maximum pointwise difference: 1.59959590
+ Max difference location: (0, 20, 2, 0)
+  Values at max diff - Original: 0.00296844, Converted: 1.60256433
+ Biggest difference in row (0, 20, 2), sum 0.038832 vs 3.326198
+
+Layer 2, Token 21 (recurrent cache comparison):
+  Original tensor sum: 146.744446
+  Converted tensor sum: 85.128494
+  Original tensor mean: 0.045858
+  Converted tensor mean: 0.026603
+ Mean difference: 0.15625563
+ Maximum pointwise difference: 3.40082598
+ Max difference location: (0, 4, 2, 4)
+  Values at max diff - Original: 3.41796732, Converted: 0.01714140
+ Biggest difference in row (0, 4, 2), sum 7.012363 vs 0.098989
+
+Layer 4, Token 21 (recurrent cache comparison):
+  Original tensor sum: 120.798615
+  Converted tensor sum: 143.282379
+  Original tensor mean: 0.037750
+  Converted tensor mean: 0.044776
+ Mean difference: 0.13425863
+ Maximum pointwise difference: 2.73616052
+ Max difference location: (0, 24, 9, 1)
+  Values at max diff - Original: 0.23530871, Converted: 2.97146916
+ Biggest difference in row (0, 30, 3), sum 1.329738 vs 6.360154
+
+Layer 5, Token 21 (recurrent cache comparison):
+  Original tensor sum: 222.583710
+  Converted tensor sum: 86.326241
+  Original tensor mean: 0.069557
+  Converted tensor mean: 0.026977
+ Mean difference: 0.13234577
+ Maximum pointwise difference: 2.64859867
+ Max difference location: (0, 28, 7, 6)
+  Values at max diff - Original: 2.67573905, Converted: 0.02714031
+ Biggest difference in row (0, 26, 8), sum 3.963463 vs -0.648591
+
+Layer 6, Token 21 (recurrent cache comparison):
+  Original tensor sum: 317.078064
+  Converted tensor sum: 162.595886
+  Original tensor mean: 0.099087
+  Converted tensor mean: 0.050811
+ Mean difference: 0.15550284
+ Maximum pointwise difference: 3.79531074
+ Max difference location: (0, 17, 7, 5)
+  Values at max diff - Original: 3.63465667, Converted: -0.16065404
+ Biggest difference in row (0, 20, 9), sum 8.606161 vs 0.369012
+
+Layer 8, Token 21 (recurrent cache comparison):
+  Original tensor sum: 345.257385
+  Converted tensor sum: 184.546997
+  Original tensor mean: 0.107893
+  Converted tensor mean: 0.057671
+ Mean difference: 0.18574484
+ Maximum pointwise difference: 3.21210074
+ Max difference location: (0, 7, 2, 9)
+  Values at max diff - Original: 3.22117043, Converted: 0.00906963
+ Biggest difference in row (0, 21, 0), sum 7.649475 vs -0.057539
+
+Layer 9, Token 21 (recurrent cache comparison):
+  Original tensor sum: 268.515228
+  Converted tensor sum: 95.449539
+  Original tensor mean: 0.083911
+  Converted tensor mean: 0.029828
+ Mean difference: 0.13116649
+ Maximum pointwise difference: 3.19655538
+ Max difference location: (0, 18, 2, 3)
+  Values at max diff - Original: 3.71445417, Converted: 0.51789874
+ Biggest difference in row (0, 9, 5), sum 9.129113 vs 0.079633
+
+Layer 10, Token 21 (recurrent cache comparison):
+  Original tensor sum: 259.887024
+  Converted tensor sum: 104.823151
+  Original tensor mean: 0.081215
+  Converted tensor mean: 0.032757
+ Mean difference: 0.12396878
+ Maximum pointwise difference: 3.03640962
+ Max difference location: (0, 25, 1, 9)
+  Values at max diff - Original: 3.24910450, Converted: 0.21269491
+ Biggest difference in row (0, 25, 1), sum 8.045052 vs 0.162466
+
+Layer 12, Token 21 (recurrent cache comparison):
+  Original tensor sum: 340.602814
+  Converted tensor sum: 113.082108
+  Original tensor mean: 0.106438
+  Converted tensor mean: 0.035338
+ Mean difference: 0.17276871
+ Maximum pointwise difference: 4.99602270
+ Max difference location: (0, 14, 8, 1)
+  Values at max diff - Original: 4.72621298, Converted: -0.26980966
+ Biggest difference in row (0, 28, 3), sum 11.259501 vs -0.695297
+
+Layer 13, Token 21 (recurrent cache comparison):
+  Original tensor sum: 236.875137
+  Converted tensor sum: 95.429146
+  Original tensor mean: 0.074023
+  Converted tensor mean: 0.029822
+ Mean difference: 0.11990514
+ Maximum pointwise difference: 3.69410872
+ Max difference location: (0, 11, 4, 0)
+  Values at max diff - Original: 4.10646772, Converted: 0.41235897
+ Biggest difference in row (0, 23, 6), sum 5.276991 vs -0.187177
+
+Layer 14, Token 21 (recurrent cache comparison):
+  Original tensor sum: 782.813049
+  Converted tensor sum: 216.654602
+  Original tensor mean: 0.244629
+  Converted tensor mean: 0.067705
+ Mean difference: 0.29570371
+ Maximum pointwise difference: 5.94400930
+ Max difference location: (0, 28, 4, 1)
+  Values at max diff - Original: 5.97852135, Converted: 0.03451204
+ Biggest difference in row (0, 6, 1), sum 14.360078 vs 0.533817
+
+Layer 0, Token 22 (recurrent cache comparison):
+  Original tensor sum: 2.700914
+  Converted tensor sum: 8.066211
+  Original tensor mean: 0.000844
+  Converted tensor mean: 0.002521
+ Mean difference: 0.06021541
+ Maximum pointwise difference: 1.02617574
+ Max difference location: (0, 28, 9, 5)
+  Values at max diff - Original: 1.17021942, Converted: 0.14404365
+ Biggest difference in row (0, 4, 9), sum 1.758845 vs -0.049155
+
+Layer 1, Token 22 (recurrent cache comparison):
+  Original tensor sum: 9.402251
+  Converted tensor sum: 79.292084
+  Original tensor mean: 0.002938
+  Converted tensor mean: 0.024779
+ Mean difference: 0.09312414
+ Maximum pointwise difference: 1.86848283
+ Max difference location: (0, 14, 2, 2)
+  Values at max diff - Original: -0.01261259, Converted: 1.85587025
+ Biggest difference in row (0, 1, 8), sum -0.144765 vs 2.729439
+
+Layer 2, Token 22 (recurrent cache comparison):
+  Original tensor sum: 150.273865
+  Converted tensor sum: 102.280075
+  Original tensor mean: 0.046961
+  Converted tensor mean: 0.031963
+ Mean difference: 0.15655471
+ Maximum pointwise difference: 2.95679903
+ Max difference location: (0, 4, 8, 6)
+  Values at max diff - Original: -0.15384272, Converted: 2.80295634
+ Biggest difference in row (0, 10, 6), sum -0.449118 vs 3.435276
+
+Layer 4, Token 22 (recurrent cache comparison):
+  Original tensor sum: 109.346573
+  Converted tensor sum: 167.629913
+  Original tensor mean: 0.034171
+  Converted tensor mean: 0.052384
+ Mean difference: 0.12662907
+ Maximum pointwise difference: 2.69411635
+ Max difference location: (0, 19, 2, 0)
+  Values at max diff - Original: 0.00617844, Converted: 2.70029473
+ Biggest difference in row (0, 19, 2), sum -0.222631 vs 5.908413
+
+Layer 5, Token 22 (recurrent cache comparison):
+  Original tensor sum: 191.832321
+  Converted tensor sum: 202.874756
+  Original tensor mean: 0.059948
+  Converted tensor mean: 0.063398
+ Mean difference: 0.15467224
+ Maximum pointwise difference: 6.38972092
+ Max difference location: (0, 28, 6, 9)
+  Values at max diff - Original: 0.04361831, Converted: 6.43333912
+ Biggest difference in row (0, 28, 6), sum 0.738313 vs 17.286346
+
+Layer 6, Token 22 (recurrent cache comparison):
+  Original tensor sum: 304.042816
+  Converted tensor sum: 238.043579
+  Original tensor mean: 0.095013
+  Converted tensor mean: 0.074389
+ Mean difference: 0.15846148
+ Maximum pointwise difference: 3.40163994
+ Max difference location: (0, 12, 2, 1)
+  Values at max diff - Original: 1.29805720, Converted: 4.69969702
+ Biggest difference in row (0, 17, 7), sum 7.862279 vs 0.254134
+
+Layer 8, Token 22 (recurrent cache comparison):
+  Original tensor sum: 352.235718
+  Converted tensor sum: 277.930298
+  Original tensor mean: 0.110074
+  Converted tensor mean: 0.086853
+ Mean difference: 0.19249398
+ Maximum pointwise difference: 3.61912727
+ Max difference location: (0, 7, 2, 9)
+  Values at max diff - Original: 3.80060625, Converted: 0.18147889
+ Biggest difference in row (0, 21, 0), sum 9.710941 vs 0.797433
+
+Layer 9, Token 22 (recurrent cache comparison):
+  Original tensor sum: 273.245667
+  Converted tensor sum: 226.375031
+  Original tensor mean: 0.085389
+  Converted tensor mean: 0.070742
+ Mean difference: 0.14207596
+ Maximum pointwise difference: 2.82711124
+ Max difference location: (0, 14, 1, 2)
+  Values at max diff - Original: 0.05765805, Converted: 2.88476920
+ Biggest difference in row (0, 9, 5), sum 9.348074 vs 1.880102
+
+Layer 10, Token 22 (recurrent cache comparison):
+  Original tensor sum: 239.880463
+  Converted tensor sum: 275.399414
+  Original tensor mean: 0.074963
+  Converted tensor mean: 0.086062
+ Mean difference: 0.15027112
+ Maximum pointwise difference: 3.59689593
+ Max difference location: (0, 0, 7, 8)
+  Values at max diff - Original: -0.00771881, Converted: 3.58917713
+ Biggest difference in row (0, 24, 0), sum 0.303092 vs 7.643524
+
+Layer 12, Token 22 (recurrent cache comparison):
+  Original tensor sum: 327.704742
+  Converted tensor sum: 271.485931
+  Original tensor mean: 0.102408
+  Converted tensor mean: 0.084839
+ Mean difference: 0.17104822
+ Maximum pointwise difference: 4.17193794
+ Max difference location: (0, 14, 8, 1)
+  Values at max diff - Original: 4.78667879, Converted: 0.61474097
+ Biggest difference in row (0, 28, 3), sum 10.929213 vs 0.205626
+
+Layer 13, Token 22 (recurrent cache comparison):
+  Original tensor sum: 231.619003
+  Converted tensor sum: 232.506165
+  Original tensor mean: 0.072381
+  Converted tensor mean: 0.072658
+ Mean difference: 0.13752523
+ Maximum pointwise difference: 4.03583384
+ Max difference location: (0, 11, 4, 0)
+  Values at max diff - Original: 3.99545026, Converted: -0.04038341
+ Biggest difference in row (0, 11, 0), sum -0.083207 vs 6.010875
+
+Layer 14, Token 22 (recurrent cache comparison):
+  Original tensor sum: 772.479431
+  Converted tensor sum: 607.419800
+  Original tensor mean: 0.241400
+  Converted tensor mean: 0.189819
+ Mean difference: 0.31881297
+ Maximum pointwise difference: 5.76619625
+ Max difference location: (0, 28, 4, 1)
+  Values at max diff - Original: 6.25043201, Converted: 0.48423576
+ Biggest difference in row (0, 6, 1), sum 14.585131 vs 0.928486
+
+Layer 0, Token 23 (recurrent cache comparison):
+  Original tensor sum: 4.463778
+  Converted tensor sum: 4.492921
+  Original tensor mean: 0.001395
+  Converted tensor mean: 0.001404
+ Mean difference: 0.06506675
+ Maximum pointwise difference: 1.83452773
+ Max difference location: (0, 1, 3, 5)
+  Values at max diff - Original: -0.04470510, Converted: 1.78982258
+ Biggest difference in row (0, 8, 9), sum 0.088014 vs -1.806111
+
+Layer 1, Token 23 (recurrent cache comparison):
+  Original tensor sum: 16.812580
+  Converted tensor sum: 109.310081
+  Original tensor mean: 0.005254
+  Converted tensor mean: 0.034159
+ Mean difference: 0.09598633
+ Maximum pointwise difference: 1.58349574
+ Max difference location: (0, 14, 2, 2)
+  Values at max diff - Original: -0.00151580, Converted: 1.58197999
+ Biggest difference in row (0, 31, 9), sum 0.029068 vs 3.659988
+
+Layer 2, Token 23 (recurrent cache comparison):
+  Original tensor sum: 75.151047
+  Converted tensor sum: 119.211670
+  Original tensor mean: 0.023485
+  Converted tensor mean: 0.037254
+ Mean difference: 0.13861641
+ Maximum pointwise difference: 2.43731642
+ Max difference location: (0, 1, 3, 2)
+  Values at max diff - Original: 0.08128840, Converted: 2.51860476
+ Biggest difference in row (0, 1, 3), sum 0.598150 vs 6.365501
+
+Layer 4, Token 23 (recurrent cache comparison):
+  Original tensor sum: 76.628754
+  Converted tensor sum: 155.459259
+  Original tensor mean: 0.023946
+  Converted tensor mean: 0.048581
+ Mean difference: 0.11704257
+ Maximum pointwise difference: 2.73834753
+ Max difference location: (0, 19, 9, 2)
+  Values at max diff - Original: 4.03167677, Converted: 1.29332936
+ Biggest difference in row (0, 8, 6), sum 0.016739 vs 3.865431
+
+Layer 5, Token 23 (recurrent cache comparison):
+  Original tensor sum: 150.354111
+  Converted tensor sum: 169.511383
+  Original tensor mean: 0.046986
+  Converted tensor mean: 0.052972
+ Mean difference: 0.12414169
+ Maximum pointwise difference: 4.08761406
+ Max difference location: (0, 28, 8, 6)
+  Values at max diff - Original: 3.60962462, Converted: -0.47798958
+ Biggest difference in row (0, 28, 6), sum 0.154782 vs 5.205485
+
+Layer 6, Token 23 (recurrent cache comparison):
+  Original tensor sum: 225.564255
+  Converted tensor sum: 262.560272
+  Original tensor mean: 0.070489
+  Converted tensor mean: 0.082050
+ Mean difference: 0.14410818
+ Maximum pointwise difference: 5.85085487
+ Max difference location: (0, 12, 6, 1)
+  Values at max diff - Original: 6.56323051, Converted: 0.71237558
+ Biggest difference in row (0, 12, 6), sum 15.410420 vs 3.070242
+
+Layer 8, Token 23 (recurrent cache comparison):
+  Original tensor sum: 216.401703
+  Converted tensor sum: 306.942932
+  Original tensor mean: 0.067626
+  Converted tensor mean: 0.095920
+ Mean difference: 0.14416558
+ Maximum pointwise difference: 3.46720839
+ Max difference location: (0, 22, 4, 7)
+  Values at max diff - Original: 3.77501345, Converted: 0.30780506
+ Biggest difference in row (0, 22, 4), sum 7.765969 vs 0.982070
+
+Layer 9, Token 23 (recurrent cache comparison):
+  Original tensor sum: 247.185196
+  Converted tensor sum: 250.177109
+  Original tensor mean: 0.077245
+  Converted tensor mean: 0.078180
+ Mean difference: 0.11487159
+ Maximum pointwise difference: 2.80121279
+ Max difference location: (0, 14, 2, 1)
+  Values at max diff - Original: 3.97450233, Converted: 1.17328954
+ Biggest difference in row (0, 28, 2), sum 8.960711 vs 2.242082
+
+Layer 10, Token 23 (recurrent cache comparison):
+  Original tensor sum: 193.715546
+  Converted tensor sum: 271.413574
+  Original tensor mean: 0.060536
+  Converted tensor mean: 0.084817
+ Mean difference: 0.13006650
+ Maximum pointwise difference: 3.03568482
+ Max difference location: (0, 0, 3, 7)
+  Values at max diff - Original: 3.30636239, Converted: 0.27067760
+ Biggest difference in row (0, 23, 3), sum 6.103652 vs 1.165035
+
+Layer 12, Token 23 (recurrent cache comparison):
+  Original tensor sum: 277.550171
+  Converted tensor sum: 296.251099
+  Original tensor mean: 0.086734
+  Converted tensor mean: 0.092578
+ Mean difference: 0.12569407
+ Maximum pointwise difference: 2.70571613
+ Max difference location: (0, 20, 2, 3)
+  Values at max diff - Original: 3.96422935, Converted: 1.25851309
+ Biggest difference in row (0, 30, 9), sum 8.374757 vs 2.683706
+
+Layer 13, Token 23 (recurrent cache comparison):
+  Original tensor sum: 189.736130
+  Converted tensor sum: 235.426422
+  Original tensor mean: 0.059293
+  Converted tensor mean: 0.073571
+ Mean difference: 0.09623930
+ Maximum pointwise difference: 3.40506268
+ Max difference location: (0, 17, 8, 2)
+  Values at max diff - Original: 4.38167763, Converted: 0.97661489
+ Biggest difference in row (0, 17, 8), sum 7.328513 vs 2.010616
+
+Layer 14, Token 23 (recurrent cache comparison):
+  Original tensor sum: 508.593140
+  Converted tensor sum: 650.881714
+  Original tensor mean: 0.158935
+  Converted tensor mean: 0.203401
+ Mean difference: 0.21357311
+ Maximum pointwise difference: 4.58951044
+ Max difference location: (0, 28, 4, 1)
+  Values at max diff - Original: 0.01861674, Converted: 4.60812712
+ Biggest difference in row (0, 27, 6), sum -2.170214 vs 13.869398
+
+Layer 0, Token 24 (recurrent cache comparison):
+  Original tensor sum: 0.801011
+  Converted tensor sum: -1.634871
+  Original tensor mean: 0.000250
+  Converted tensor mean: -0.000511
+ Mean difference: 0.07332502
+ Maximum pointwise difference: 1.81247604
+ Max difference location: (0, 1, 3, 5)
+  Values at max diff - Original: -0.04395379, Converted: 1.76852226
+ Biggest difference in row (0, 25, 2), sum 0.205085 vs -3.009443
+
+Layer 1, Token 24 (recurrent cache comparison):
+  Original tensor sum: 23.350971
+  Converted tensor sum: 88.090744
+  Original tensor mean: 0.007297
+  Converted tensor mean: 0.027528
+ Mean difference: 0.08751559
+ Maximum pointwise difference: 1.07916749
+ Max difference location: (0, 20, 7, 8)
+  Values at max diff - Original: 1.04420257, Converted: -0.03496487
+ Biggest difference in row (0, 31, 5), sum -0.311075 vs 1.779173
+
+Layer 2, Token 24 (recurrent cache comparison):
+  Original tensor sum: 108.804047
+  Converted tensor sum: 87.620453
+  Original tensor mean: 0.034001
+  Converted tensor mean: 0.027381
+ Mean difference: 0.12934437
+ Maximum pointwise difference: 2.40617442
+ Max difference location: (0, 1, 2, 3)
+  Values at max diff - Original: 0.02315997, Converted: 2.42933440
+ Biggest difference in row (0, 27, 2), sum 3.832137 vs 0.454090
+
+Layer 4, Token 24 (recurrent cache comparison):
+  Original tensor sum: 89.705452
+  Converted tensor sum: 61.452301
+  Original tensor mean: 0.028033
+  Converted tensor mean: 0.019204
+ Mean difference: 0.11625614
+ Maximum pointwise difference: 3.20758009
+ Max difference location: (0, 19, 2, 9)
+  Values at max diff - Original: -0.01131610, Converted: 3.19626403
+ Biggest difference in row (0, 19, 9), sum 3.560462 vs 0.155535
+
+Layer 5, Token 24 (recurrent cache comparison):
+  Original tensor sum: 153.870117
+  Converted tensor sum: 79.160019
+  Original tensor mean: 0.048084
+  Converted tensor mean: 0.024738
+ Mean difference: 0.12364670
+ Maximum pointwise difference: 2.68913746
+ Max difference location: (0, 28, 3, 6)
+  Values at max diff - Original: 2.79144502, Converted: 0.10230768
+ Biggest difference in row (0, 28, 6), sum 0.099721 vs 5.709799
+
+Layer 6, Token 24 (recurrent cache comparison):
+  Original tensor sum: 230.254852
+  Converted tensor sum: 174.787750
+  Original tensor mean: 0.071955
+  Converted tensor mean: 0.054621
+ Mean difference: 0.15056056
+ Maximum pointwise difference: 6.34924650
+ Max difference location: (0, 12, 6, 1)
+  Values at max diff - Original: 6.46217585, Converted: 0.11292921
+ Biggest difference in row (0, 12, 6), sum 15.171618 vs 0.722292
+
+Layer 8, Token 24 (recurrent cache comparison):
+  Original tensor sum: 235.891174
+  Converted tensor sum: 145.097076
+  Original tensor mean: 0.073716
+  Converted tensor mean: 0.045343
+ Mean difference: 0.16653843
+ Maximum pointwise difference: 3.68727565
+ Max difference location: (0, 21, 9, 7)
+  Values at max diff - Original: -0.03629338, Converted: 3.65098238
+ Biggest difference in row (0, 22, 4), sum 6.845831 vs -0.173057
+
+Layer 9, Token 24 (recurrent cache comparison):
+  Original tensor sum: 230.641953
+  Converted tensor sum: 158.276245
+  Original tensor mean: 0.072076
+  Converted tensor mean: 0.049461
+ Mean difference: 0.13344021
+ Maximum pointwise difference: 2.99997020
+ Max difference location: (0, 28, 7, 0)
+  Values at max diff - Original: 3.18566871, Converted: 0.18569851
+ Biggest difference in row (0, 28, 7), sum 10.468034 vs 1.164585
+
+Layer 10, Token 24 (recurrent cache comparison):
+  Original tensor sum: 196.116974
+  Converted tensor sum: 120.883209
+  Original tensor mean: 0.061287
+  Converted tensor mean: 0.037776
+ Mean difference: 0.14279810
+ Maximum pointwise difference: 3.15166354
+ Max difference location: (0, 24, 0, 1)
+  Values at max diff - Original: 0.01040818, Converted: 3.16207170
+ Biggest difference in row (0, 24, 0), sum 0.920592 vs 9.820712
+
+Layer 12, Token 24 (recurrent cache comparison):
+  Original tensor sum: 263.522400
+  Converted tensor sum: 204.364563
+  Original tensor mean: 0.082351
+  Converted tensor mean: 0.063864
+ Mean difference: 0.15161198
+ Maximum pointwise difference: 3.57106376
+ Max difference location: (0, 30, 4, 9)
+  Values at max diff - Original: 0.28180352, Converted: 3.85286736
+ Biggest difference in row (0, 28, 3), sum 8.790596 vs 0.101635
+
+Layer 13, Token 24 (recurrent cache comparison):
+  Original tensor sum: 174.369919
+  Converted tensor sum: 135.311646
+  Original tensor mean: 0.054491
+  Converted tensor mean: 0.042285
+ Mean difference: 0.11190581
+ Maximum pointwise difference: 3.24499154
+ Max difference location: (0, 9, 2, 1)
+  Values at max diff - Original: -0.00551485, Converted: 3.23947668
+ Biggest difference in row (0, 9, 2), sum -0.041496 vs 5.267887
+
+Layer 14, Token 24 (recurrent cache comparison):
+  Original tensor sum: 507.494324
+  Converted tensor sum: 360.428650
+  Original tensor mean: 0.158592
+  Converted tensor mean: 0.112634
+ Mean difference: 0.25558040
+ Maximum pointwise difference: 5.38855457
+ Max difference location: (0, 28, 6, 1)
+  Values at max diff - Original: 5.42326450, Converted: 0.03470971
+ Biggest difference in row (0, 28, 6), sum 14.975449 vs 0.420049
+
+Layer 0, Token 25 (recurrent cache comparison):
+  Original tensor sum: 3.754472
+  Converted tensor sum: -0.036336
+  Original tensor mean: 0.001173
+  Converted tensor mean: -0.000011
+ Mean difference: 0.07934358
+ Maximum pointwise difference: 1.86529565
+ Max difference location: (0, 1, 2, 3)
+  Values at max diff - Original: 1.82291889, Converted: -0.04237675
+ Biggest difference in row (0, 26, 9), sum -0.049344 vs -1.991895
+
+Layer 1, Token 25 (recurrent cache comparison):
+  Original tensor sum: 69.339890
+  Converted tensor sum: 74.604774
+  Original tensor mean: 0.021669
+  Converted tensor mean: 0.023314
+ Mean difference: 0.08100989
+ Maximum pointwise difference: 1.22147357
+ Max difference location: (0, 23, 0, 4)
+  Values at max diff - Original: 1.23978972, Converted: 0.01831620
+ Biggest difference in row (0, 20, 8), sum 2.595490 vs 0.385527
+
+Layer 2, Token 25 (recurrent cache comparison):
+  Original tensor sum: 122.554489
+  Converted tensor sum: 59.594086
+  Original tensor mean: 0.038298
+  Converted tensor mean: 0.018623
+ Mean difference: 0.14621988
+ Maximum pointwise difference: 3.03828931
+ Max difference location: (0, 8, 9, 3)
+  Values at max diff - Original: 3.01308012, Converted: -0.02520920
+ Biggest difference in row (0, 6, 7), sum 4.544618 vs 0.330778
+
+Layer 4, Token 25 (recurrent cache comparison):
+  Original tensor sum: 135.021027
+  Converted tensor sum: 31.374174
+  Original tensor mean: 0.042194
+  Converted tensor mean: 0.009804
+ Mean difference: 0.11780138
+ Maximum pointwise difference: 2.41319752
+ Max difference location: (0, 26, 6, 5)
+  Values at max diff - Original: -0.06945831, Converted: 2.34373927
+ Biggest difference in row (0, 20, 0), sum 5.919655 vs -0.125531
+
+Layer 5, Token 25 (recurrent cache comparison):
+  Original tensor sum: 151.868256
+  Converted tensor sum: 37.756584
+  Original tensor mean: 0.047459
+  Converted tensor mean: 0.011799
+ Mean difference: 0.11239365
+ Maximum pointwise difference: 2.04250264
+ Max difference location: (0, 28, 8, 6)
+  Values at max diff - Original: 1.64249492, Converted: -0.40000769
+ Biggest difference in row (0, 23, 0), sum 3.497306 vs 0.162423
+
+Layer 6, Token 25 (recurrent cache comparison):
+  Original tensor sum: 251.935211
+  Converted tensor sum: 40.890175
+  Original tensor mean: 0.078730
+  Converted tensor mean: 0.012778
+ Mean difference: 0.15190262
+ Maximum pointwise difference: 5.74138451
+ Max difference location: (0, 12, 6, 1)
+  Values at max diff - Original: 5.98834372, Converted: 0.24695921
+ Biggest difference in row (0, 12, 6), sum 13.863525 vs 0.418773
+
+Layer 8, Token 25 (recurrent cache comparison):
+  Original tensor sum: 253.027832
+  Converted tensor sum: 38.795532
+  Original tensor mean: 0.079071
+  Converted tensor mean: 0.012124
+ Mean difference: 0.15110740
+ Maximum pointwise difference: 2.77147269
+ Max difference location: (0, 21, 8, 9)
+  Values at max diff - Original: 2.86136007, Converted: 0.08988741
+ Biggest difference in row (0, 6, 2), sum 5.609079 vs -2.170572
+
+Layer 9, Token 25 (recurrent cache comparison):
+  Original tensor sum: 207.731750
+  Converted tensor sum: 52.985756
+  Original tensor mean: 0.064916
+  Converted tensor mean: 0.016558
+ Mean difference: 0.11516394
+ Maximum pointwise difference: 2.72221398
+ Max difference location: (0, 28, 7, 0)
+  Values at max diff - Original: 2.76798820, Converted: 0.04577418
+ Biggest difference in row (0, 28, 7), sum 9.256445 vs 0.484987
+
+Layer 10, Token 25 (recurrent cache comparison):
+  Original tensor sum: 196.952515
+  Converted tensor sum: 54.152390
+  Original tensor mean: 0.061548
+  Converted tensor mean: 0.016923
+ Mean difference: 0.12454510
+ Maximum pointwise difference: 2.34993958
+ Max difference location: (0, 10, 3, 5)
+  Values at max diff - Original: -0.00316075, Converted: 2.34677887
+ Biggest difference in row (0, 11, 6), sum 5.878725 vs 0.250239
+
+Layer 12, Token 25 (recurrent cache comparison):
+  Original tensor sum: 255.808289
+  Converted tensor sum: 65.224335
+  Original tensor mean: 0.079940
+  Converted tensor mean: 0.020383
+ Mean difference: 0.14238897
+ Maximum pointwise difference: 2.58750200
+ Max difference location: (0, 30, 8, 9)
+  Values at max diff - Original: -0.02865839, Converted: 2.55884361
+ Biggest difference in row (0, 28, 3), sum 8.769258 vs 0.354862
+
+Layer 13, Token 25 (recurrent cache comparison):
+  Original tensor sum: 166.242828
+  Converted tensor sum: 63.081795
+  Original tensor mean: 0.051951
+  Converted tensor mean: 0.019713
+ Mean difference: 0.10068022
+ Maximum pointwise difference: 2.70444345
+ Max difference location: (0, 26, 4, 0)
+  Values at max diff - Original: 2.70685434, Converted: 0.00241077
+ Biggest difference in row (0, 26, 4), sum 5.351704 vs -0.105821
+
+Layer 14, Token 25 (recurrent cache comparison):
+  Original tensor sum: 542.257324
+  Converted tensor sum: 126.161835
+  Original tensor mean: 0.169455
+  Converted tensor mean: 0.039426
+ Mean difference: 0.22693451
+ Maximum pointwise difference: 4.91657877
+ Max difference location: (0, 28, 6, 1)
+  Values at max diff - Original: 5.17964792, Converted: 0.26306900
+ Biggest difference in row (0, 28, 6), sum 14.244452 vs 1.160758
+
+Layer 0, Token 26 (recurrent cache comparison):
+  Original tensor sum: 2.494154
+  Converted tensor sum: -0.022610
+  Original tensor mean: 0.000779
+  Converted tensor mean: -0.000007
+ Mean difference: 0.07249723
+ Maximum pointwise difference: 1.12537110
+ Max difference location: (0, 23, 8, 6)
+  Values at max diff - Original: -0.77736998, Converted: 0.34800115
+ Biggest difference in row (0, 25, 2), sum 0.139047 vs -2.260486
+
+Layer 1, Token 26 (recurrent cache comparison):
+  Original tensor sum: 89.948196
+  Converted tensor sum: 28.472143
+  Original tensor mean: 0.028109
+  Converted tensor mean: 0.008898
+ Mean difference: 0.08773426
+ Maximum pointwise difference: 1.21594334
+ Max difference location: (0, 31, 9, 5)
+  Values at max diff - Original: 1.12476408, Converted: -0.09117921
+ Biggest difference in row (0, 3, 0), sum 2.359989 vs -0.070505
+
+Layer 2, Token 26 (recurrent cache comparison):
+  Original tensor sum: 129.416809
+  Converted tensor sum: 41.503624
+  Original tensor mean: 0.040443
+  Converted tensor mean: 0.012970
+ Mean difference: 0.15461735
+ Maximum pointwise difference: 2.68493867
+ Max difference location: (0, 8, 8, 3)
+  Values at max diff - Original: 2.36720443, Converted: -0.31773427
+ Biggest difference in row (0, 27, 9), sum 4.510338 vs -0.361951
+
+Layer 4, Token 26 (recurrent cache comparison):
+  Original tensor sum: 167.357330
+  Converted tensor sum: 22.416847
+  Original tensor mean: 0.052299
+  Converted tensor mean: 0.007005
+ Mean difference: 0.12134697
+ Maximum pointwise difference: 2.10167456
+ Max difference location: (0, 27, 2, 5)
+  Values at max diff - Original: 2.16418123, Converted: 0.06250665
+ Biggest difference in row (0, 20, 0), sum 5.742605 vs 0.074519
+
+Layer 5, Token 26 (recurrent cache comparison):
+  Original tensor sum: 163.754578
+  Converted tensor sum: 25.965012
+  Original tensor mean: 0.051173
+  Converted tensor mean: 0.008114
+ Mean difference: 0.12282242
+ Maximum pointwise difference: 1.71204209
+ Max difference location: (0, 6, 7, 6)
+  Values at max diff - Original: 1.87962317, Converted: 0.16758111
+ Biggest difference in row (0, 6, 7), sum 6.898893 vs 0.817218
+
+Layer 6, Token 26 (recurrent cache comparison):
+  Original tensor sum: 280.407990
+  Converted tensor sum: 7.497489
+  Original tensor mean: 0.087628
+  Converted tensor mean: 0.002343
+ Mean difference: 0.16469882
+ Maximum pointwise difference: 5.09109163
+ Max difference location: (0, 12, 6, 1)
+  Values at max diff - Original: 5.84504795, Converted: 0.75395638
+ Biggest difference in row (0, 12, 6), sum 13.522006 vs 2.690509
+
+Layer 8, Token 26 (recurrent cache comparison):
+  Original tensor sum: 290.931335
+  Converted tensor sum: 24.817287
+  Original tensor mean: 0.090916
+  Converted tensor mean: 0.007755
+ Mean difference: 0.16735801
+ Maximum pointwise difference: 2.96624160
+ Max difference location: (0, 12, 7, 4)
+  Values at max diff - Original: 0.00615764, Converted: 2.97239923
+ Biggest difference in row (0, 6, 2), sum 5.174712 vs -3.075627
+
+Layer 9, Token 26 (recurrent cache comparison):
+  Original tensor sum: 196.708160
+  Converted tensor sum: 30.441196
+  Original tensor mean: 0.061471
+  Converted tensor mean: 0.009513
+ Mean difference: 0.11019707
+ Maximum pointwise difference: 2.66847897
+ Max difference location: (0, 28, 7, 0)
+  Values at max diff - Original: 2.53971243, Converted: -0.12876646
+ Biggest difference in row (0, 28, 7), sum 8.254028 vs 0.381486
+
+Layer 10, Token 26 (recurrent cache comparison):
+  Original tensor sum: 199.032516
+  Converted tensor sum: 15.679170
+  Original tensor mean: 0.062198
+  Converted tensor mean: 0.004900
+ Mean difference: 0.11978843
+ Maximum pointwise difference: 2.87448788
+ Max difference location: (0, 24, 1, 0)
+  Values at max diff - Original: 3.14507675, Converted: 0.27058893
+ Biggest difference in row (0, 25, 1), sum 5.510708 vs 0.187406
+
+Layer 12, Token 26 (recurrent cache comparison):
+  Original tensor sum: 260.372742
+  Converted tensor sum: 27.850517
+  Original tensor mean: 0.081366
+  Converted tensor mean: 0.008703
+ Mean difference: 0.15131992
+ Maximum pointwise difference: 3.11937833
+ Max difference location: (0, 29, 6, 5)
+  Values at max diff - Original: -0.00478183, Converted: 3.11459661
+ Biggest difference in row (0, 28, 3), sum 8.629121 vs -0.241569
+
+Layer 13, Token 26 (recurrent cache comparison):
+  Original tensor sum: 175.842209
+  Converted tensor sum: 31.150665
+  Original tensor mean: 0.054951
+  Converted tensor mean: 0.009735
+ Mean difference: 0.10132494
+ Maximum pointwise difference: 2.68282986
+ Max difference location: (0, 26, 4, 0)
+  Values at max diff - Original: 2.69746804, Converted: 0.01463811
+ Biggest difference in row (0, 26, 4), sum 5.839348 vs 0.118608
+
+Layer 14, Token 26 (recurrent cache comparison):
+  Original tensor sum: 549.098877
+  Converted tensor sum: 57.239769
+  Original tensor mean: 0.171593
+  Converted tensor mean: 0.017887
+ Mean difference: 0.23359555
+ Maximum pointwise difference: 4.78898478
+ Max difference location: (0, 28, 6, 1)
+  Values at max diff - Original: 4.82380438, Converted: 0.03481963
+ Biggest difference in row (0, 28, 6), sum 13.322067 vs -0.096704
+
+Layer 0, Token 27 (recurrent cache comparison):
+  Original tensor sum: 1.918821
+  Converted tensor sum: 4.296852
+  Original tensor mean: 0.000600
+  Converted tensor mean: 0.001343
+ Mean difference: 0.06445935
+ Maximum pointwise difference: 1.46873963
+ Max difference location: (0, 1, 3, 2)
+  Values at max diff - Original: -0.01301772, Converted: 1.45572186
+ Biggest difference in row (0, 28, 5), sum 0.223120 vs 1.905128
+
+Layer 1, Token 27 (recurrent cache comparison):
+  Original tensor sum: 160.952576
+  Converted tensor sum: 15.469984
+  Original tensor mean: 0.050298
+  Converted tensor mean: 0.004834
+ Mean difference: 0.10194612
+ Maximum pointwise difference: 1.58813882
+ Max difference location: (0, 10, 6, 8)
+  Values at max diff - Original: 1.63966167, Converted: 0.05152279
+ Biggest difference in row (0, 16, 1), sum 4.988435 vs 0.628698
+
+Layer 2, Token 27 (recurrent cache comparison):
+  Original tensor sum: 195.883148
+  Converted tensor sum: 23.802681
+  Original tensor mean: 0.061213
+  Converted tensor mean: 0.007438
+ Mean difference: 0.16412406
+ Maximum pointwise difference: 3.51121449
+ Max difference location: (0, 18, 2, 1)
+  Values at max diff - Original: 0.00709479, Converted: 3.51830935
+ Biggest difference in row (0, 0, 2), sum 7.858056 vs -0.148840
+
+Layer 4, Token 27 (recurrent cache comparison):
+  Original tensor sum: 233.660095
+  Converted tensor sum: 13.142452
+  Original tensor mean: 0.073019
+  Converted tensor mean: 0.004107
+ Mean difference: 0.12733760
+ Maximum pointwise difference: 2.84240961
+ Max difference location: (0, 27, 8, 5)
+  Values at max diff - Original: 2.76694965, Converted: -0.07546007
+ Biggest difference in row (0, 24, 1), sum 6.535775 vs 0.658166
+
+Layer 5, Token 27 (recurrent cache comparison):
+  Original tensor sum: 251.330231
+  Converted tensor sum: 21.526363
+  Original tensor mean: 0.078541
+  Converted tensor mean: 0.006727
+ Mean difference: 0.13129665
+ Maximum pointwise difference: 2.36431837
+ Max difference location: (0, 6, 2, 8)
+  Values at max diff - Original: 2.37356281, Converted: 0.00924453
+ Biggest difference in row (0, 6, 2), sum 8.637090 vs 0.102351
+
+Layer 6, Token 27 (recurrent cache comparison):
+  Original tensor sum: 362.387848
+  Converted tensor sum: -2.171665
+  Original tensor mean: 0.113246
+  Converted tensor mean: -0.000679
+ Mean difference: 0.18160143
+ Maximum pointwise difference: 5.93641853
+ Max difference location: (0, 12, 6, 1)
+  Values at max diff - Original: 5.75199318, Converted: -0.18442529
+ Biggest difference in row (0, 12, 6), sum 13.466440 vs 2.236503
+
+Layer 8, Token 27 (recurrent cache comparison):
+  Original tensor sum: 350.323914
+  Converted tensor sum: 19.725079
+  Original tensor mean: 0.109476
+  Converted tensor mean: 0.006164
+ Mean difference: 0.17721944
+ Maximum pointwise difference: 3.75930500
+ Max difference location: (0, 20, 0, 7)
+  Values at max diff - Original: 3.75676632, Converted: -0.00253879
+ Biggest difference in row (0, 13, 8), sum 9.584435 vs 0.181711
+
+Layer 9, Token 27 (recurrent cache comparison):
+  Original tensor sum: 240.779663
+  Converted tensor sum: 24.165503
+  Original tensor mean: 0.075244
+  Converted tensor mean: 0.007552
+ Mean difference: 0.11309086
+ Maximum pointwise difference: 2.43383050
+ Max difference location: (0, 28, 7, 0)
+  Values at max diff - Original: 2.44759488, Converted: 0.01376434
+ Biggest difference in row (0, 28, 7), sum 8.022928 vs 0.225877
+
+Layer 10, Token 27 (recurrent cache comparison):
+  Original tensor sum: 244.469070
+  Converted tensor sum: 12.286395
+  Original tensor mean: 0.076397
+  Converted tensor mean: 0.003839
+ Mean difference: 0.11746948
+ Maximum pointwise difference: 2.32974362
+ Max difference location: (0, 24, 1, 0)
+  Values at max diff - Original: 3.20926118, Converted: 0.87951756
+ Biggest difference in row (0, 11, 6), sum 6.942329 vs -0.007718
+
+Layer 12, Token 27 (recurrent cache comparison):
+  Original tensor sum: 306.749817
+  Converted tensor sum: 12.790400
+  Original tensor mean: 0.095859
+  Converted tensor mean: 0.003997
+ Mean difference: 0.15706061
+ Maximum pointwise difference: 3.82620597
+ Max difference location: (0, 14, 8, 1)
+  Values at max diff - Original: 3.90818167, Converted: 0.08197562
+ Biggest difference in row (0, 28, 3), sum 9.139596 vs 0.006271
+
+Layer 13, Token 27 (recurrent cache comparison):
+  Original tensor sum: 231.223206
+  Converted tensor sum: 21.992476
+  Original tensor mean: 0.072257
+  Converted tensor mean: 0.006873
+ Mean difference: 0.10150776
+ Maximum pointwise difference: 2.88272619
+ Max difference location: (0, 11, 4, 0)
+  Values at max diff - Original: 2.93226290, Converted: 0.04953665
+ Biggest difference in row (0, 26, 4), sum 5.809074 vs 0.116277
+
+Layer 14, Token 27 (recurrent cache comparison):
+  Original tensor sum: 648.596985
+  Converted tensor sum: 37.038162
+  Original tensor mean: 0.202687
+  Converted tensor mean: 0.011574
+ Mean difference: 0.25362208
+ Maximum pointwise difference: 4.70936871
+ Max difference location: (0, 28, 6, 1)
+  Values at max diff - Original: 4.71021414, Converted: 0.00084528
+ Biggest difference in row (0, 27, 4), sum 11.931866 vs 0.099372
+
+Layer 0, Token 28 (recurrent cache comparison):
+  Original tensor sum: 3.825253
+  Converted tensor sum: 10.656538
+  Original tensor mean: 0.001195
+  Converted tensor mean: 0.003330
+ Mean difference: 0.06744900
+ Maximum pointwise difference: 1.23786223
+ Max difference location: (0, 1, 5, 3)
+  Values at max diff - Original: 1.16935611, Converted: -0.06850608
+ Biggest difference in row (0, 23, 7), sum -1.495719 vs 0.880324
+
+Layer 1, Token 28 (recurrent cache comparison):
+  Original tensor sum: 64.976830
+  Converted tensor sum: 30.582441
+  Original tensor mean: 0.020305
+  Converted tensor mean: 0.009557
+ Mean difference: 0.08607832
+ Maximum pointwise difference: 1.46864974
+ Max difference location: (0, 16, 2, 9)
+  Values at max diff - Original: 1.56847525, Converted: 0.09982550
+ Biggest difference in row (0, 1, 3), sum 2.236484 vs -0.154611
+
+Layer 2, Token 28 (recurrent cache comparison):
+  Original tensor sum: 104.630646
+  Converted tensor sum: 53.524834
+  Original tensor mean: 0.032697
+  Converted tensor mean: 0.016727
+ Mean difference: 0.14054969
+ Maximum pointwise difference: 2.87744927
+ Max difference location: (0, 13, 1, 7)
+  Values at max diff - Original: 0.03716344, Converted: 2.91461277
+ Biggest difference in row (0, 23, 4), sum 0.081307 vs 4.303990
+
+Layer 4, Token 28 (recurrent cache comparison):
+  Original tensor sum: 192.219788
+  Converted tensor sum: 29.228979
+  Original tensor mean: 0.060069
+  Converted tensor mean: 0.009134
+ Mean difference: 0.12325959
+ Maximum pointwise difference: 4.08833027
+ Max difference location: (0, 19, 0, 2)
+  Values at max diff - Original: 4.01820278, Converted: -0.07012761
+ Biggest difference in row (0, 19, 0), sum 6.219261 vs -0.327518
+
+Layer 5, Token 28 (recurrent cache comparison):
+  Original tensor sum: 243.385864
+  Converted tensor sum: 23.409119
+  Original tensor mean: 0.076058
+  Converted tensor mean: 0.007315
+ Mean difference: 0.14056823
+ Maximum pointwise difference: 5.76254559
+ Max difference location: (0, 28, 9, 6)
+  Values at max diff - Original: 6.02726078, Converted: 0.26471528
+ Biggest difference in row (0, 28, 9), sum 9.890844 vs 0.406699
+
+Layer 6, Token 28 (recurrent cache comparison):
+  Original tensor sum: 283.616272
+  Converted tensor sum: 40.143700
+  Original tensor mean: 0.088630
+  Converted tensor mean: 0.012545
+ Mean difference: 0.16413040
+ Maximum pointwise difference: 4.72735071
+ Max difference location: (0, 12, 1, 2)
+  Values at max diff - Original: 4.75247860, Converted: 0.02512792
+ Biggest difference in row (0, 12, 1), sum 13.120539 vs 0.133712
+
+Layer 8, Token 28 (recurrent cache comparison):
+  Original tensor sum: 228.649261
+  Converted tensor sum: 44.837063
+  Original tensor mean: 0.071453
+  Converted tensor mean: 0.014012
+ Mean difference: 0.15679255
+ Maximum pointwise difference: 3.82907844
+ Max difference location: (0, 23, 4, 7)
+  Values at max diff - Original: 3.84108162, Converted: 0.01200324
+ Biggest difference in row (0, 1, 4), sum 8.901268 vs 0.416754
+
+Layer 9, Token 28 (recurrent cache comparison):
+  Original tensor sum: 212.272324
+  Converted tensor sum: 21.536671
+  Original tensor mean: 0.066335
+  Converted tensor mean: 0.006730
+ Mean difference: 0.11465029
+ Maximum pointwise difference: 2.85586047
+ Max difference location: (0, 15, 2, 3)
+  Values at max diff - Original: 2.84589958, Converted: -0.00996090
+ Biggest difference in row (0, 15, 2), sum 8.293229 vs 0.275981
+
+Layer 10, Token 28 (recurrent cache comparison):
+  Original tensor sum: 212.098206
+  Converted tensor sum: 19.835695
+  Original tensor mean: 0.066281
+  Converted tensor mean: 0.006199
+ Mean difference: 0.14262109
+ Maximum pointwise difference: 4.31178093
+ Max difference location: (0, 24, 1, 0)
+  Values at max diff - Original: 4.53196430, Converted: 0.22018313
+ Biggest difference in row (0, 10, 4), sum 9.766387 vs -0.072625
+
+Layer 12, Token 28 (recurrent cache comparison):
+  Original tensor sum: 280.744019
+  Converted tensor sum: 26.187149
+  Original tensor mean: 0.087733
+  Converted tensor mean: 0.008183
+ Mean difference: 0.15264840
+ Maximum pointwise difference: 4.41812420
+ Max difference location: (0, 21, 2, 4)
+  Values at max diff - Original: 4.41481018, Converted: -0.00331383
+ Biggest difference in row (0, 23, 2), sum 10.581321 vs 0.608111
+
+Layer 13, Token 28 (recurrent cache comparison):
+  Original tensor sum: 220.357834
+  Converted tensor sum: 20.228846
+  Original tensor mean: 0.068862
+  Converted tensor mean: 0.006322
+ Mean difference: 0.11583474
+ Maximum pointwise difference: 4.72553635
+ Max difference location: (0, 17, 8, 2)
+  Values at max diff - Original: 4.72810841, Converted: 0.00257226
+ Biggest difference in row (0, 19, 1), sum 9.879478 vs 0.388081
+
+Layer 14, Token 28 (recurrent cache comparison):
+  Original tensor sum: 515.894897
+  Converted tensor sum: 74.440948
+  Original tensor mean: 0.161217
+  Converted tensor mean: 0.023263
+ Mean difference: 0.23548929
+ Maximum pointwise difference: 4.93366051
+ Max difference location: (0, 16, 7, 6)
+  Values at max diff - Original: 4.92017603, Converted: -0.01348470
+ Biggest difference in row (0, 28, 6), sum 14.032580 vs -0.061767
+
+Layer 0, Token 29 (recurrent cache comparison):
+  Original tensor sum: 7.490709
+  Converted tensor sum: 13.732031
+  Original tensor mean: 0.002341
+  Converted tensor mean: 0.004291
+ Mean difference: 0.06082471
+ Maximum pointwise difference: 1.43740010
+ Max difference location: (0, 1, 3, 3)
+  Values at max diff - Original: 1.39118814, Converted: -0.04621201
+ Biggest difference in row (0, 23, 1), sum -0.646684 vs 1.039518
+
+Layer 1, Token 29 (recurrent cache comparison):
+  Original tensor sum: 34.789967
+  Converted tensor sum: 32.546562
+  Original tensor mean: 0.010872
+  Converted tensor mean: 0.010171
+ Mean difference: 0.08757141
+ Maximum pointwise difference: 1.04371011
+ Max difference location: (0, 6, 1, 2)
+  Values at max diff - Original: 0.99249512, Converted: -0.05121503
+ Biggest difference in row (0, 3, 8), sum -0.976319 vs 2.533029
+
+Layer 2, Token 29 (recurrent cache comparison):
+  Original tensor sum: 81.188293
+  Converted tensor sum: 110.873352
+  Original tensor mean: 0.025371
+  Converted tensor mean: 0.034648
+ Mean difference: 0.13966069
+ Maximum pointwise difference: 2.45380425
+ Max difference location: (0, 13, 7, 1)
+  Values at max diff - Original: 0.05114410, Converted: 2.50494838
+ Biggest difference in row (0, 12, 1), sum 5.281791 vs 0.538119
+
+Layer 4, Token 29 (recurrent cache comparison):
+  Original tensor sum: 188.945206
+  Converted tensor sum: 82.802734
+  Original tensor mean: 0.059045
+  Converted tensor mean: 0.025876
+ Mean difference: 0.13653603
+ Maximum pointwise difference: 2.89840102
+ Max difference location: (0, 19, 0, 2)
+  Values at max diff - Original: 2.89151430, Converted: -0.00688672
+ Biggest difference in row (0, 19, 0), sum 4.444302 vs -0.202434
+
+Layer 5, Token 29 (recurrent cache comparison):
+  Original tensor sum: 234.074219
+  Converted tensor sum: 65.914871
+  Original tensor mean: 0.073148
+  Converted tensor mean: 0.020598
+ Mean difference: 0.14784601
+ Maximum pointwise difference: 3.25614643
+ Max difference location: (0, 28, 9, 6)
+  Values at max diff - Original: 3.74669981, Converted: 0.49055350
+ Biggest difference in row (0, 28, 9), sum 6.683680 vs 1.130066
+
+Layer 6, Token 29 (recurrent cache comparison):
+  Original tensor sum: 312.478729
+  Converted tensor sum: 136.998260
+  Original tensor mean: 0.097650
+  Converted tensor mean: 0.042812
+ Mean difference: 0.19563875
+ Maximum pointwise difference: 4.93519068
+ Max difference location: (0, 12, 6, 2)
+  Values at max diff - Original: 4.85506201, Converted: -0.08012870
+ Biggest difference in row (0, 12, 6), sum 14.484787 vs 2.203152
+
+Layer 8, Token 29 (recurrent cache comparison):
+  Original tensor sum: 249.388092
+  Converted tensor sum: 124.562820
+  Original tensor mean: 0.077934
+  Converted tensor mean: 0.038926
+ Mean difference: 0.18382950
+ Maximum pointwise difference: 3.92004848
+ Max difference location: (0, 20, 7, 0)
+  Values at max diff - Original: 0.21650003, Converted: 4.13654852
+ Biggest difference in row (0, 23, 4), sum 6.951686 vs -0.318011
+
+Layer 9, Token 29 (recurrent cache comparison):
+  Original tensor sum: 200.171021
+  Converted tensor sum: 82.927864
+  Original tensor mean: 0.062553
+  Converted tensor mean: 0.025915
+ Mean difference: 0.12187681
+ Maximum pointwise difference: 2.69074798
+ Max difference location: (0, 15, 2, 3)
+  Values at max diff - Original: 2.69794440, Converted: 0.00719635
+ Biggest difference in row (0, 15, 2), sum 7.941767 vs 0.050363
+
+Layer 10, Token 29 (recurrent cache comparison):
+  Original tensor sum: 213.368591
+  Converted tensor sum: 77.427185
+  Original tensor mean: 0.066678
+  Converted tensor mean: 0.024196
+ Mean difference: 0.13651104
+ Maximum pointwise difference: 3.13308334
+ Max difference location: (0, 24, 1, 0)
+  Values at max diff - Original: 3.58547378, Converted: 0.45239034
+ Biggest difference in row (0, 10, 4), sum 6.818930 vs -0.155169
+
+Layer 12, Token 29 (recurrent cache comparison):
+  Original tensor sum: 263.786377
+  Converted tensor sum: 92.682205
+  Original tensor mean: 0.082433
+  Converted tensor mean: 0.028963
+ Mean difference: 0.15690672
+ Maximum pointwise difference: 3.50486374
+ Max difference location: (0, 23, 2, 9)
+  Values at max diff - Original: 3.44645429, Converted: -0.05840937
+ Biggest difference in row (0, 23, 2), sum 9.830493 vs -0.231371
+
+Layer 13, Token 29 (recurrent cache comparison):
+  Original tensor sum: 193.539474
+  Converted tensor sum: 79.679726
+  Original tensor mean: 0.060481
+  Converted tensor mean: 0.024900
+ Mean difference: 0.11795644
+ Maximum pointwise difference: 3.62266445
+ Max difference location: (0, 11, 4, 0)
+  Values at max diff - Original: 3.49508691, Converted: -0.12757748
+ Biggest difference in row (0, 18, 1), sum 5.632851 vs -0.122056
+
+Layer 14, Token 29 (recurrent cache comparison):
+  Original tensor sum: 525.021179
+  Converted tensor sum: 197.845932
+  Original tensor mean: 0.164069
+  Converted tensor mean: 0.061827
+ Mean difference: 0.25022614
+ Maximum pointwise difference: 4.42602730
+ Max difference location: (0, 15, 2, 8)
+  Values at max diff - Original: 4.48904753, Converted: 0.06302036
+ Biggest difference in row (0, 28, 6), sum 13.769245 vs 2.199155
+
+Layer 0, Token 30 (recurrent cache comparison):
+  Original tensor sum: 4.659326
+  Converted tensor sum: 10.953376
+  Original tensor mean: 0.001456
+  Converted tensor mean: 0.003423
+ Mean difference: 0.06142937
+ Maximum pointwise difference: 1.06926394
+ Max difference location: (0, 28, 5, 9)
+  Values at max diff - Original: -0.05087389, Converted: 1.01839006
+ Biggest difference in row (0, 4, 9), sum 2.534327 vs -0.105926
+
+Layer 1, Token 30 (recurrent cache comparison):
+  Original tensor sum: 24.136578
+  Converted tensor sum: 96.968475
+  Original tensor mean: 0.007543
+  Converted tensor mean: 0.030303
+ Mean difference: 0.08820312
+ Maximum pointwise difference: 1.49761820
+ Max difference location: (0, 6, 4, 4)
+  Values at max diff - Original: 0.06953955, Converted: 1.56715775
+ Biggest difference in row (0, 14, 2), sum 0.115400 vs 3.481205
+
+Layer 2, Token 30 (recurrent cache comparison):
+  Original tensor sum: 64.494400
+  Converted tensor sum: 246.552582
+  Original tensor mean: 0.020155
+  Converted tensor mean: 0.077048
+ Mean difference: 0.16151237
+ Maximum pointwise difference: 3.98919630
+ Max difference location: (0, 4, 8, 4)
+  Values at max diff - Original: -0.10013573, Converted: 3.88906050
+ Biggest difference in row (0, 23, 4), sum -0.108707 vs 7.892229
+
+Layer 4, Token 30 (recurrent cache comparison):
+  Original tensor sum: 190.921097
+  Converted tensor sum: 126.537048
+  Original tensor mean: 0.059663
+  Converted tensor mean: 0.039543
+ Mean difference: 0.13220279
+ Maximum pointwise difference: 2.87259126
+ Max difference location: (0, 8, 6, 5)
+  Values at max diff - Original: 0.00449362, Converted: 2.87708497
+ Biggest difference in row (0, 17, 9), sum 0.710816 vs 6.274773
+
+Layer 5, Token 30 (recurrent cache comparison):
+  Original tensor sum: 222.353195
+  Converted tensor sum: 164.720016
+  Original tensor mean: 0.069485
+  Converted tensor mean: 0.051475
+ Mean difference: 0.15598193
+ Maximum pointwise difference: 2.88562417
+ Max difference location: (0, 28, 9, 6)
+  Values at max diff - Original: 3.18444014, Converted: 0.29881600
+ Biggest difference in row (0, 30, 2), sum 0.004416 vs 6.153850
+
+Layer 6, Token 30 (recurrent cache comparison):
+  Original tensor sum: 339.244141
+  Converted tensor sum: 317.588440
+  Original tensor mean: 0.106014
+  Converted tensor mean: 0.099246
+ Mean difference: 0.21152201
+ Maximum pointwise difference: 4.30255318
+ Max difference location: (0, 6, 4, 8)
+  Values at max diff - Original: -0.19493943, Converted: 4.10761356
+ Biggest difference in row (0, 12, 6), sum 13.503227 vs 2.285058
+
+Layer 8, Token 30 (recurrent cache comparison):
+  Original tensor sum: 261.308044
+  Converted tensor sum: 204.488892
+  Original tensor mean: 0.081659
+  Converted tensor mean: 0.063903
+ Mean difference: 0.18225618
+ Maximum pointwise difference: 3.88148618
+ Max difference location: (0, 21, 7, 9)
+  Values at max diff - Original: 3.48627377, Converted: -0.39521238
+ Biggest difference in row (0, 2, 4), sum -0.009086 vs 6.555274
+
+Layer 9, Token 30 (recurrent cache comparison):
+  Original tensor sum: 187.010895
+  Converted tensor sum: 173.659409
+  Original tensor mean: 0.058441
+  Converted tensor mean: 0.054269
+ Mean difference: 0.12517925
+ Maximum pointwise difference: 2.68900180
+ Max difference location: (0, 15, 2, 3)
+  Values at max diff - Original: 2.59999108, Converted: -0.08901066
+ Biggest difference in row (0, 15, 2), sum 7.543541 vs 0.209705
+
+Layer 10, Token 30 (recurrent cache comparison):
+  Original tensor sum: 206.371735
+  Converted tensor sum: 145.950043
+  Original tensor mean: 0.064491
+  Converted tensor mean: 0.045609
+ Mean difference: 0.12893555
+ Maximum pointwise difference: 2.97875929
+ Max difference location: (0, 24, 1, 0)
+  Values at max diff - Original: 3.54119730, Converted: 0.56243801
+ Biggest difference in row (0, 11, 6), sum 5.982455 vs 0.632388
+
+Layer 12, Token 30 (recurrent cache comparison):
+  Original tensor sum: 251.250732
+  Converted tensor sum: 193.503662
+  Original tensor mean: 0.078516
+  Converted tensor mean: 0.060470
+ Mean difference: 0.14629500
+ Maximum pointwise difference: 3.24942660
+ Max difference location: (0, 28, 2, 4)
+  Values at max diff - Original: 3.09908056, Converted: -0.15034601
+ Biggest difference in row (0, 28, 3), sum 9.363594 vs -0.017764
+
+Layer 13, Token 30 (recurrent cache comparison):
+  Original tensor sum: 176.694855
+  Converted tensor sum: 165.849930
+  Original tensor mean: 0.055217
+  Converted tensor mean: 0.051828
+ Mean difference: 0.11395165
+ Maximum pointwise difference: 3.52955794
+ Max difference location: (0, 11, 4, 0)
+  Values at max diff - Original: 3.33610535, Converted: -0.19345257
+ Biggest difference in row (0, 8, 7), sum -0.009830 vs 4.540796
+
+Layer 14, Token 30 (recurrent cache comparison):
+  Original tensor sum: 562.166748
+  Converted tensor sum: 408.797607
+  Original tensor mean: 0.175677
+  Converted tensor mean: 0.127749
+ Mean difference: 0.25758758
+ Maximum pointwise difference: 4.45499659
+ Max difference location: (0, 15, 2, 8)
+  Values at max diff - Original: 4.37386942, Converted: -0.08112720
+ Biggest difference in row (0, 28, 6), sum 13.013643 vs -0.161676
+
+================================================================================
+Comparing q padded tensors...
+================================================================================
+
+Layer 0, Token 1 (q padded comparison):
+  Original tensor sum: 7.958682
+  Converted tensor sum: 7.958661
+  Original tensor mean: 0.000389
+  Converted tensor mean: 0.000389
+ Mean difference: 0.00000000
+ Maximum pointwise difference: 0.00000076
+ Max difference location: (0, 0, 0, 6)
+  Values at max diff - Original: -0.22316068, Converted: -0.22316144
+ Biggest difference in row (0, 0, 0), sum -0.570113 vs -0.570115
+Original tensor: 
+
+[[[[ 1.97370015e-02 -7.89398551e-02  2.40650475e-02 ... -3.46655026e-02
+    -1.84459373e-01  1.35031175e-02]
+   [-3.90069596e-02 -6.45441562e-02 -9.85123310e-03 ... -7.10528418e-02
+     2.86484748e-01 -4.78143468e-02]
+   [-3.32845971e-02  8.48600932e-04 -1.83281749e-02 ... -3.60261202e-02
+     1.16759300e-01 -3.79200131e-02]
+   ...
+   [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
+     0.00000000e+00  0.00000000e+00]
+   [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
+     0.00000000e+00  0.00000000e+00]
+   [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
+     0.00000000e+00  0.00000000e+00]]
+
+  [[ 1.97370015e-02 -7.89398551e-02  2.40650475e-02 ... -3.46655026e-02
+    -1.84459373e-01  1.35031175e-02]
+   [-3.90069596e-02 -6.45441562e-02 -9.85123310e-03 ... -7.10528418e-02
+     2.86484748e-01 -4.78143468e-02]
+   [-3.32845971e-02  8.48600932e-04 -1.83281749e-02 ... -3.60261202e-02
+     1.16759300e-01 -3.79200131e-02]
+   ...
+   [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
+     0.00000000e+00  0.00000000e+00]
+   [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
+     0.00000000e+00  0.00000000e+00]
+   [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
+     0.00000000e+00  0.00000000e+00]]
+
+  [[-6.58135489e-02  2.45508045e-01 -1.13810226e-02 ...  2.54544546e-03
+     2.51089204e-02  2.86987983e-04]
+   [-1.25565156e-01 -7.94792548e-02 -9.97955501e-02 ...  7.12259486e-02
+     9.36590508e-02 -1.65728614e-01]
+   [-1.35633466e-03 -9.60636213e-02 -8.94494876e-02 ...  1.94221988e-01
+    -4.70091067e-02 -9.31773186e-02]
+   ...
+   [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
+     0.00000000e+00  0.00000000e+00]
+   [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
+     0.00000000e+00  0.00000000e+00]
+   [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
+     0.00000000e+00  0.00000000e+00]]
+
+  ...
+
+  [[-1.80936769e-01  2.09823474e-02 -1.53481111e-01 ... -6.53458312e-02
+     9.94268879e-02  8.78875237e-03]
+   [-1.07081555e-01  1.26294538e-01 -9.78934765e-02 ... -5.38439713e-02
+    -5.59990015e-03  1.52285740e-01]
+   [ 2.60844707e-01  8.11591521e-02  1.12913184e-01 ... -1.86833683e-02
+    -1.93844642e-02 -7.96004198e-03]
+   ...
+   [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
+     0.00000000e+00  0.00000000e+00]
+   [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
+     0.00000000e+00  0.00000000e+00]
+   [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
+     0.00000000e+00  0.00000000e+00]]
+
+  [[ 5.65589257e-02 -7.43661374e-02 -2.00723484e-01 ...  1.52545767e-02
+     1.50462063e-02  2.35310309e-02]
+   [ 7.43804872e-02 -1.34884328e-01  2.01406017e-01 ... -9.13856328e-02
+    -5.48248030e-02  8.11865740e-03]
+   [ 1.52915101e-02 -1.20854350e-02  2.73873240e-01 ... -3.24299149e-02
+    -6.92289770e-02 -1.53110905e-05]
+   ...
+   [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
+     0.00000000e+00  0.00000000e+00]
+   [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
+     0.00000000e+00  0.00000000e+00]
+   [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
+     0.00000000e+00  0.00000000e+00]]
+
+  [[ 5.65589257e-02 -7.43661374e-02 -2.00723484e-01 ...  1.52545767e-02
+     1.50462063e-02  2.35310309e-02]
+   [ 7.43804872e-02 -1.34884328e-01  2.01406017e-01 ... -9.13856328e-02
+    -5.48248030e-02  8.11865740e-03]
+   [ 1.52915101e-02 -1.20854350e-02  2.73873240e-01 ... -3.24299149e-02
+    -6.92289770e-02 -1.53110905e-05]
+   ...
+   [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
+     0.00000000e+00  0.00000000e+00]
+   [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
+     0.00000000e+00  0.00000000e+00]
+   [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
+     0.00000000e+00  0.00000000e+00]]]]
+
+Converted tensor: 
+
+[[[[ 1.97370723e-02 -7.89401382e-02  2.40651332e-02 ... -3.46656255e-02
+    -1.84460029e-01  1.35031650e-02]
+   [-3.90069783e-02 -6.45441785e-02 -9.85123683e-03 ... -7.10528716e-02
+     2.86484867e-01 -4.78143729e-02]
+   [-3.32845971e-02  8.48600990e-04 -1.83281731e-02 ... -3.60261202e-02
+     1.16759300e-01 -3.79200131e-02]
+   ...
+   [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
+     0.00000000e+00  0.00000000e+00]
+   [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
+     0.00000000e+00  0.00000000e+00]
+   [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
+     0.00000000e+00  0.00000000e+00]]
+
+  [[ 1.97370723e-02 -7.89401382e-02  2.40651332e-02 ... -3.46656255e-02
+    -1.84460029e-01  1.35031650e-02]
+   [-3.90069783e-02 -6.45441785e-02 -9.85123683e-03 ... -7.10528716e-02
+     2.86484867e-01 -4.78143729e-02]
+   [-3.32845971e-02  8.48600990e-04 -1.83281731e-02 ... -3.60261202e-02
+     1.16759300e-01 -3.79200131e-02]
+   ...
+   [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
+     0.00000000e+00  0.00000000e+00]
+   [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
+     0.00000000e+00  0.00000000e+00]
+   [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
+     0.00000000e+00  0.00000000e+00]]
+
+  [[-6.58135936e-02  2.45508194e-01 -1.13810301e-02 ...  2.54544709e-03
+     2.51089353e-02  2.86988186e-04]
+   [-1.25565395e-01 -7.94794038e-02 -9.97957364e-02 ...  7.12260827e-02
+     9.36592296e-02 -1.65728927e-01]
+   [-1.35633559e-03 -9.60636735e-02 -8.94495398e-02 ...  1.94222078e-01
+    -4.70091291e-02 -9.31773633e-02]
+   ...
+   [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
+     0.00000000e+00  0.00000000e+00]
+   [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
+     0.00000000e+00  0.00000000e+00]
+   [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
+     0.00000000e+00  0.00000000e+00]]
+
+  ...
+
+  [[-1.80937156e-01  2.09823940e-02 -1.53481439e-01 ... -6.53459728e-02
+     9.94271040e-02  8.78877100e-03]
+   [-1.07081644e-01  1.26294628e-01 -9.78935510e-02 ... -5.38440198e-02
+    -5.59990434e-03  1.52285874e-01]
+   [ 2.60844767e-01  8.11591670e-02  1.12913206e-01 ... -1.86833721e-02
+    -1.93844680e-02 -7.96004292e-03]
+   ...
+   [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
+     0.00000000e+00  0.00000000e+00]
+   [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
+     0.00000000e+00  0.00000000e+00]
+   [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
+     0.00000000e+00  0.00000000e+00]]
+
+  [[ 5.65591007e-02 -7.43663609e-02 -2.00724110e-01 ...  1.52546223e-02
+     1.50462529e-02  2.35311035e-02]
+   [ 7.43805990e-02 -1.34884506e-01  2.01406300e-01 ... -9.13857669e-02
+    -5.48248850e-02  8.11866950e-03]
+   [ 1.52915157e-02 -1.20854378e-02  2.73873329e-01 ... -3.24299261e-02
+    -6.92289993e-02 -1.53110959e-05]
+   ...
+   [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
+     0.00000000e+00  0.00000000e+00]
+   [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
+     0.00000000e+00  0.00000000e+00]
+   [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
+     0.00000000e+00  0.00000000e+00]]
+
+  [[ 5.65591007e-02 -7.43663609e-02 -2.00724110e-01 ...  1.52546223e-02
+     1.50462529e-02  2.35311035e-02]
+   [ 7.43805990e-02 -1.34884506e-01  2.01406300e-01 ... -9.13857669e-02
+    -5.48248850e-02  8.11866950e-03]
+   [ 1.52915157e-02 -1.20854378e-02  2.73873329e-01 ... -3.24299261e-02
+    -6.92289993e-02 -1.53110959e-05]
+   ...
+   [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
+     0.00000000e+00  0.00000000e+00]
+   [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
+     0.00000000e+00  0.00000000e+00]
+   [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
+     0.00000000e+00  0.00000000e+00]]]]
+
+
+
+Layer 1, Token 1 (q padded comparison):
+  Original tensor sum: 8.938188
+  Converted tensor sum: 8.938201
+  Original tensor mean: 0.000436
+  Converted tensor mean: 0.000436
+ Mean difference: 0.00000001
+ Maximum pointwise difference: 0.00000305
+ Max difference location: (0, 16, 0, 8)
+  Values at max diff - Original: 0.24272950, Converted: 0.24273255
+ Biggest difference in row (0, 16, 0), sum 0.509919 vs 0.509925
+
+Layer 2, Token 1 (q padded comparison):
+  Original tensor sum: -2.985352
+  Converted tensor sum: -2.985393
+  Original tensor mean: -0.000146
+  Converted tensor mean: -0.000146
+ Mean difference: 0.00000001
+ Maximum pointwise difference: 0.00000104
+ Max difference location: (0, 12, 3, 1)
+  Values at max diff - Original: -0.02719286, Converted: -0.02719390
+ Biggest difference in row (0, 12, 3), sum -0.530951 vs -0.530954
+
+Layer 4, Token 1 (q padded comparison):
+  Original tensor sum: -31.644516
+  Converted tensor sum: -31.643524
+  Original tensor mean: -0.001545
+  Converted tensor mean: -0.001545
+ Mean difference: 0.00000028
+ Maximum pointwise difference: 0.00006898
+ Max difference location: (0, 6, 3, 7)
+  Values at max diff - Original: 0.07510993, Converted: 0.07517891
+ Biggest difference in row (0, 6, 3), sum -0.645874 vs -0.645761
+
+Layer 5, Token 1 (q padded comparison):
+  Original tensor sum: -30.684572
+  Converted tensor sum: -30.685047
+  Original tensor mean: -0.001498
+  Converted tensor mean: -0.001498
+ Mean difference: 0.00000021
+ Maximum pointwise difference: 0.00003881
+ Max difference location: (0, 30, 3, 0)
+  Values at max diff - Original: 0.03456598, Converted: 0.03452717
+ Biggest difference in row (0, 30, 3), sum -0.428461 vs -0.428590
+
+Layer 6, Token 1 (q padded comparison):
+  Original tensor sum: -10.008605
+  Converted tensor sum: -10.014137
+  Original tensor mean: -0.000489
+  Converted tensor mean: -0.000489
+ Mean difference: 0.00000105
+ Maximum pointwise difference: 0.00017181
+ Max difference location: (0, 6, 2, 7)
+  Values at max diff - Original: 0.01523990, Converted: 0.01506809
+ Biggest difference in row (0, 2, 1), sum -0.388271 vs -0.388545
+
+Layer 8, Token 1 (q padded comparison):
+  Original tensor sum: -36.801449
+  Converted tensor sum: -36.801811
+  Original tensor mean: -0.001797
+  Converted tensor mean: -0.001797
+ Mean difference: 0.00000098
+ Maximum pointwise difference: 0.00025206
+ Max difference location: (0, 20, 3, 1)
+  Values at max diff - Original: 0.04204723, Converted: 0.04179518
+ Biggest difference in row (0, 2, 0), sum -0.275884 vs -0.275609
+
+Layer 9, Token 1 (q padded comparison):
+  Original tensor sum: -37.401527
+  Converted tensor sum: -37.397404
+  Original tensor mean: -0.001826
+  Converted tensor mean: -0.001826
+ Mean difference: 0.00000135
+ Maximum pointwise difference: 0.00026937
+ Max difference location: (0, 20, 2, 2)
+  Values at max diff - Original: 0.14496517, Converted: 0.14469580
+ Biggest difference in row (0, 20, 3), sum -0.264264 vs -0.264851
+
+Layer 10, Token 1 (q padded comparison):
+  Original tensor sum: -43.546944
+  Converted tensor sum: -43.543182
+  Original tensor mean: -0.002126
+  Converted tensor mean: -0.002126
+ Mean difference: 0.00000175
+ Maximum pointwise difference: 0.00031144
+ Max difference location: (0, 0, 2, 5)
+  Values at max diff - Original: -0.03211254, Converted: -0.03180110
+ Biggest difference in row (0, 24, 3), sum -0.476393 vs -0.475955
+
+Layer 12, Token 1 (q padded comparison):
+  Original tensor sum: -19.226507
+  Converted tensor sum: -19.226831
+  Original tensor mean: -0.000939
+  Converted tensor mean: -0.000939
+ Mean difference: 0.00000116
+ Maximum pointwise difference: 0.00020705
+ Max difference location: (0, 28, 2, 7)
+  Values at max diff - Original: 0.06080329, Converted: 0.06101035
+ Biggest difference in row (0, 14, 3), sum -0.455543 vs -0.455054
+
+Layer 13, Token 1 (q padded comparison):
+  Original tensor sum: -36.510368
+  Converted tensor sum: -36.510063
+  Original tensor mean: -0.001783
+  Converted tensor mean: -0.001783
+ Mean difference: 0.00000135
+ Maximum pointwise difference: 0.00022900
+ Max difference location: (0, 16, 2, 1)
+  Values at max diff - Original: -0.03357363, Converted: -0.03334463
+ Biggest difference in row (0, 18, 2), sum -0.183418 vs -0.183802
+
+Layer 14, Token 1 (q padded comparison):
+  Original tensor sum: -15.543186
+  Converted tensor sum: -15.543753
+  Original tensor mean: -0.000759
+  Converted tensor mean: -0.000759
+ Mean difference: 0.00000116
+ Maximum pointwise difference: 0.00036725
+ Max difference location: (0, 4, 2, 2)
+  Values at max diff - Original: 0.05589651, Converted: 0.05552926
+ Biggest difference in row (0, 18, 1), sum -0.470654 vs -0.470283
+
+================================================================================
+Comparing k padded tensors...
+================================================================================
+
+Layer 0, Token 1 (k padded comparison):
+  Original tensor sum: -12.851240
+  Converted tensor sum: -12.851334
+  Original tensor mean: -0.000628
+  Converted tensor mean: -0.000628
+ Mean difference: 0.00000002
+ Maximum pointwise difference: 0.00000304
+ Max difference location: (0, 24, 0, 7)
+  Values at max diff - Original: -0.57623452, Converted: -0.57623756
+ Biggest difference in row (0, 24, 0), sum -1.467058 vs -1.467066
+Original tensor: 
+
+[[[[-0.0023386   0.00352692 -0.13370702 ... -0.18872206  0.09370422
+    -0.04139194]
+   [ 0.09375711  0.09519143  0.04368615 ... -0.17057192 -0.09237721
+     0.09026651]
+   [ 0.19408916 -0.1052211  -0.5198605  ... -0.35431755 -0.18219906
+    -0.31666332]
+   ...
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]]
+
+  [[-0.0023386   0.00352692 -0.13370702 ... -0.18872206  0.09370422
+    -0.04139194]
+   [ 0.09375711  0.09519143  0.04368615 ... -0.17057192 -0.09237721
+     0.09026651]
+   [ 0.19408916 -0.1052211  -0.5198605  ... -0.35431755 -0.18219906
+    -0.31666332]
+   ...
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]]
+
+  [[-0.52186674 -0.28046784 -0.03100401 ...  0.12330638 -0.17640771
+    -0.10358577]
+   [-0.4391339  -0.25189647  0.12411524 ... -0.04670377  0.4796994
+     0.13396528]
+   [ 0.80941254  0.33414015  0.10742755 ... -0.17197518 -0.16508798
+    -0.20685418]
+   ...
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]]
+
+  ...
+
+  [[-0.02867949  0.05648347  0.01508509 ...  0.7403576  -0.30081272
+     0.31962797]
+   [ 0.07382206 -0.05249733  0.05087741 ...  0.8205082  -0.03774351
+     0.4122186 ]
+   [-0.10616651 -0.07183579 -0.02862857 ...  0.13253474  0.73543155
+     0.63596827]
+   ...
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]]
+
+  [[ 0.949689   -0.00939775 -0.0047697  ... -0.04689857 -0.0884609
+    -0.20121996]
+   [ 0.9700847  -0.03739532 -0.04046015 ... -0.0640891  -0.11664858
+    -0.14288443]
+   [-0.20942387 -0.21343033 -0.00624497 ...  0.05516734 -0.33565474
+     0.75833493]
+   ...
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]]
+
+  [[ 0.949689   -0.00939775 -0.0047697  ... -0.04689857 -0.0884609
+    -0.20121996]
+   [ 0.9700847  -0.03739532 -0.04046015 ... -0.0640891  -0.11664858
+    -0.14288443]
+   [-0.20942387 -0.21343033 -0.00624497 ...  0.05516734 -0.33565474
+     0.75833493]
+   ...
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]]]]
+
+Converted tensor: 
+
+[[[[-0.0023386   0.00352692 -0.13370706 ... -0.18872213  0.09370426
+    -0.04139195]
+   [ 0.09375713  0.09519145  0.04368616 ... -0.17057195 -0.09237722
+     0.09026653]
+   [ 0.19408953 -0.10522129 -0.5198614  ... -0.3543182  -0.18219939
+    -0.31666392]
+   ...
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]]
+
+  [[-0.0023386   0.00352692 -0.13370706 ... -0.18872213  0.09370426
+    -0.04139195]
+   [ 0.09375713  0.09519145  0.04368616 ... -0.17057195 -0.09237722
+     0.09026653]
+   [ 0.19408953 -0.10522129 -0.5198614  ... -0.3543182  -0.18219939
+    -0.31666392]
+   ...
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]]
+
+  [[-0.5218679  -0.28046846 -0.03100408 ...  0.12330665 -0.1764081
+    -0.10358601]
+   [-0.43913472 -0.25189692  0.12411546 ... -0.04670386  0.47970027
+     0.1339655 ]
+   [ 0.80941284  0.33414027  0.10742759 ... -0.17197524 -0.16508804
+    -0.20685425]
+   ...
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]]
+
+  ...
+
+  [[-0.02867951  0.0564835   0.0150851  ...  0.74035805 -0.30081287
+     0.31962818]
+   [ 0.07382207 -0.05249734  0.05087743 ...  0.82050836 -0.03774352
+     0.41221875]
+   [-0.10616651 -0.07183579 -0.02862857 ...  0.13253474  0.73543155
+     0.6359683 ]
+   ...
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]]
+
+  [[ 0.94968927 -0.00939775 -0.0047697  ... -0.04689858 -0.08846093
+    -0.20122004]
+   [ 0.97008485 -0.03739532 -0.04046015 ... -0.0640891  -0.11664858
+    -0.14288445]
+   [-0.20942406 -0.21343052 -0.00624497 ...  0.05516739 -0.33565506
+     0.7583357 ]
+   ...
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]]
+
+  [[ 0.94968927 -0.00939775 -0.0047697  ... -0.04689858 -0.08846093
+    -0.20122004]
+   [ 0.97008485 -0.03739532 -0.04046015 ... -0.0640891  -0.11664858
+    -0.14288445]
+   [-0.20942406 -0.21343052 -0.00624497 ...  0.05516739 -0.33565506
+     0.7583357 ]
+   ...
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]]]]
+
+
+
+Layer 1, Token 1 (k padded comparison):
+  Original tensor sum: 46.146324
+  Converted tensor sum: 46.146336
+  Original tensor mean: 0.002253
+  Converted tensor mean: 0.002253
+ Mean difference: 0.00000002
+ Maximum pointwise difference: 0.00001496
+ Max difference location: (0, 24, 0, 4)
+  Values at max diff - Original: -0.75322348, Converted: -0.75323844
+ Biggest difference in row (0, 4, 0), sum -1.893247 vs -1.893263
+
+Layer 2, Token 1 (k padded comparison):
+  Original tensor sum: 38.402348
+  Converted tensor sum: 38.402321
+  Original tensor mean: 0.001875
+  Converted tensor mean: 0.001875
+ Mean difference: 0.00000002
+ Maximum pointwise difference: 0.00000370
+ Max difference location: (0, 4, 0, 1)
+  Values at max diff - Original: 0.75365573, Converted: 0.75365943
+ Biggest difference in row (0, 8, 0), sum -1.569355 vs -1.569358
+
+Layer 4, Token 1 (k padded comparison):
+  Original tensor sum: -80.321693
+  Converted tensor sum: -80.319084
+  Original tensor mean: -0.003922
+  Converted tensor mean: -0.003922
+ Mean difference: 0.00000094
+ Maximum pointwise difference: 0.00016582
+ Max difference location: (0, 12, 1, 2)
+  Values at max diff - Original: 0.42303348, Converted: 0.42286766
+ Biggest difference in row (0, 4, 3), sum -0.373179 vs -0.372919
+
+Layer 5, Token 1 (k padded comparison):
+  Original tensor sum: -101.494308
+  Converted tensor sum: -101.496490
+  Original tensor mean: -0.004956
+  Converted tensor mean: -0.004956
+ Mean difference: 0.00000073
+ Maximum pointwise difference: 0.00011382
+ Max difference location: (0, 8, 3, 4)
+  Values at max diff - Original: -0.06280152, Converted: -0.06291535
+ Biggest difference in row (0, 24, 2), sum -1.003613 vs -1.003973
+
+Layer 6, Token 1 (k padded comparison):
+  Original tensor sum: -60.378914
+  Converted tensor sum: -60.399891
+  Original tensor mean: -0.002948
+  Converted tensor mean: -0.002949
+ Mean difference: 0.00000342
+ Maximum pointwise difference: 0.00096719
+ Max difference location: (0, 8, 1, 5)
+  Values at max diff - Original: 0.19049226, Converted: 0.19145945
+ Biggest difference in row (0, 20, 0), sum -1.118855 vs -1.120621
+
+Layer 8, Token 1 (k padded comparison):
+  Original tensor sum: -61.474350
+  Converted tensor sum: -61.483994
+  Original tensor mean: -0.003002
+  Converted tensor mean: -0.003002
+ Mean difference: 0.00000346
+ Maximum pointwise difference: 0.00061786
+ Max difference location: (0, 8, 2, 7)
+  Values at max diff - Original: 0.35214049, Converted: 0.35275835
+ Biggest difference in row (0, 20, 3), sum -0.407597 vs -0.408426
+
+Layer 9, Token 1 (k padded comparison):
+  Original tensor sum: -110.836624
+  Converted tensor sum: -110.841522
+  Original tensor mean: -0.005412
+  Converted tensor mean: -0.005412
+ Mean difference: 0.00000378
+ Maximum pointwise difference: 0.00051466
+ Max difference location: (0, 18, 1, 8)
+  Values at max diff - Original: 0.40876523, Converted: 0.40927988
+ Biggest difference in row (0, 28, 3), sum -0.911474 vs -0.910520
+
+Layer 10, Token 1 (k padded comparison):
+  Original tensor sum: -90.985107
+  Converted tensor sum: -90.978966
+  Original tensor mean: -0.004443
+  Converted tensor mean: -0.004442
+ Mean difference: 0.00000465
+ Maximum pointwise difference: 0.00078443
+ Max difference location: (0, 18, 3, 6)
+  Values at max diff - Original: 0.38864151, Converted: 0.38785708
+ Biggest difference in row (0, 18, 3), sum -0.245571 vs -0.247415
+
+Layer 12, Token 1 (k padded comparison):
+  Original tensor sum: -80.152397
+  Converted tensor sum: -80.143387
+  Original tensor mean: -0.003914
+  Converted tensor mean: -0.003913
+ Mean difference: 0.00000377
+ Maximum pointwise difference: 0.00053528
+ Max difference location: (0, 4, 2, 6)
+  Values at max diff - Original: 0.33732986, Converted: 0.33786514
+ Biggest difference in row (0, 26, 2), sum -2.083733 vs -2.084640
+
+Layer 13, Token 1 (k padded comparison):
+  Original tensor sum: -149.692871
+  Converted tensor sum: -149.699692
+  Original tensor mean: -0.007309
+  Converted tensor mean: -0.007310
+ Mean difference: 0.00000382
+ Maximum pointwise difference: 0.00069700
+ Max difference location: (0, 24, 2, 1)
+  Values at max diff - Original: 0.03209215, Converted: 0.03139514
+ Biggest difference in row (0, 18, 3), sum -1.337807 vs -1.338803
+
+Layer 14, Token 1 (k padded comparison):
+  Original tensor sum: -158.503815
+  Converted tensor sum: -158.505280
+  Original tensor mean: -0.007739
+  Converted tensor mean: -0.007740
+ Mean difference: 0.00000406
+ Maximum pointwise difference: 0.00088650
+ Max difference location: (0, 18, 3, 0)
+  Values at max diff - Original: 0.31103787, Converted: 0.31192437
+ Biggest difference in row (0, 24, 2), sum -2.245067 vs -2.246189
+
+================================================================================
+Comparing v padded tensors...
+================================================================================
+
+Layer 0, Token 1 (v padded comparison):
+  Original tensor sum: 43.396095
+  Converted tensor sum: 43.396103
+  Original tensor mean: 0.002119
+  Converted tensor mean: 0.002119
+ Mean difference: 0.00000000
+ Maximum pointwise difference: 0.00000024
+ Max difference location: (0, 4, 3, 1)
+  Values at max diff - Original: 3.02466559, Converted: 3.02466583
+ Biggest difference in row (0, 4, 3), sum 4.080367 vs 4.080368
+Original tensor: 
+
+[[[[ 0.29945952  0.07364164  0.00633647 ... -0.03352018 -0.13518293
+    -0.24422395]
+   [-0.06384649  0.34527305  0.05128174 ...  0.10202903 -0.27791512
+    -0.26350227]
+   [ 0.32036152 -0.10731668 -0.13258429 ...  0.7373227  -0.21349299
+     0.09487297]
+   ...
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]]
+
+  [[ 0.09463742  0.3331761   0.04175158 ... -0.16139531  0.14495076
+    -0.23538315]
+   [ 0.0059099  -0.22937416 -0.01920018 ... -0.2725759   0.3779854
+    -0.25018957]
+   [-0.02874102 -0.1163442  -0.06129871 ... -0.24273473 -0.2218994
+     0.09502672]
+   ...
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]]
+
+  [[-0.01040334 -0.16231607 -0.19213551 ...  0.26839197 -0.14292948
+    -0.0833158 ]
+   [-0.22485131 -0.26889268 -0.03555897 ... -0.26755306 -0.27845183
+    -0.15565467]
+   [-0.27764964  2.820727   -0.24290419 ...  0.12924032 -0.22718066
+     0.06345078]
+   ...
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]]
+
+  ...
+
+  [[ 0.05224958 -0.27178496  0.02280007 ... -0.17813048 -0.00848302
+     0.3436797 ]
+   [-0.23870829  0.0102903   0.09486482 ... -0.17058551  0.10059616
+     0.45001176]
+   [-0.24846101  1.1912329  -0.26268318 ...  0.148858    0.10272522
+     0.21719539]
+   ...
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]]
+
+  [[-0.20595089 -0.02217443  0.01070492 ...  0.00675152  0.02506094
+    -0.0267982 ]
+   [-0.21499586 -0.25627282 -0.07001566 ...  0.00795406 -0.02202371
+    -0.01158573]
+   [ 0.04917984 -0.27141818 -0.26334    ... -0.09943416  0.03347556
+     0.10718762]
+   ...
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]]
+
+  [[ 0.1791143  -0.0034847   0.9858279  ...  0.19559488 -0.0804936
+    -0.01883564]
+   [-0.17319466  0.07188834 -0.26032022 ... -0.04845351 -0.24498041
+     0.12539098]
+   [ 0.00640415 -0.22212675 -0.22916575 ... -0.170733    0.5452839
+    -0.14139794]
+   ...
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]]]]
+
+Converted tensor: 
+
+[[[[ 0.29945952  0.07364164  0.00633647 ... -0.03352018 -0.13518293
+    -0.24422395]
+   [-0.06384649  0.34527302  0.05128174 ...  0.10202905 -0.27791512
+    -0.26350227]
+   [ 0.3203615  -0.10731667 -0.13258429 ...  0.7373226  -0.213493
+     0.09487297]
+   ...
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]]
+
+  [[ 0.09463742  0.33317608  0.04175158 ... -0.16139533  0.14495076
+    -0.23538315]
+   [ 0.0059099  -0.22937416 -0.01920018 ... -0.27257589  0.3779854
+    -0.25018957]
+   [-0.02874102 -0.11634421 -0.06129871 ... -0.24273473 -0.22189939
+     0.09502671]
+   ...
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]]
+
+  [[-0.01040334 -0.16231604 -0.19213554 ...  0.268392   -0.14292948
+    -0.0833158 ]
+   [-0.22485131 -0.26889268 -0.03555898 ... -0.26755306 -0.27845183
+    -0.15565467]
+   [-0.27764964  2.820727   -0.24290417 ...  0.12924033 -0.22718067
+     0.06345078]
+   ...
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]]
+
+  ...
+
+  [[ 0.05224958 -0.27178493  0.02280007 ... -0.17813048 -0.00848302
+     0.34367973]
+   [-0.23870829  0.0102903   0.09486482 ... -0.17058551  0.10059617
+     0.45001176]
+   [-0.248461    1.1912329  -0.26268318 ...  0.148858    0.10272522
+     0.21719539]
+   ...
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]]
+
+  [[-0.20595089 -0.02217443  0.01070492 ...  0.00675152  0.02506094
+    -0.0267982 ]
+   [-0.21499586 -0.2562728  -0.07001566 ...  0.00795406 -0.02202371
+    -0.01158573]
+   [ 0.04917984 -0.27141815 -0.26334    ... -0.09943416  0.03347556
+     0.10718761]
+   ...
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]]
+
+  [[ 0.1791143  -0.0034847   0.985828   ...  0.19559486 -0.08049361
+    -0.01883564]
+   [-0.17319466  0.07188834 -0.2603202  ... -0.04845351 -0.24498038
+     0.12539098]
+   [ 0.00640414 -0.22212675 -0.22916573 ... -0.17073299  0.5452839
+    -0.14139794]
+   ...
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]
+   [ 0.          0.          0.         ...  0.          0.
+     0.        ]]]]
+
+
+
+Layer 1, Token 1 (v padded comparison):
+  Original tensor sum: 64.583611
+  Converted tensor sum: 64.583618
+  Original tensor mean: 0.003153
+  Converted tensor mean: 0.003153
+ Mean difference: 0.00000000
+ Maximum pointwise difference: 0.00000083
+ Max difference location: (0, 31, 2, 0)
+  Values at max diff - Original: 1.71371531, Converted: 1.71371615
+ Biggest difference in row (0, 14, 2), sum 3.047640 vs 3.047641
+
+Layer 2, Token 1 (v padded comparison):
+  Original tensor sum: 79.718636
+  Converted tensor sum: 79.718628
+  Original tensor mean: 0.003893
+  Converted tensor mean: 0.003893
+ Mean difference: 0.00000002
+ Maximum pointwise difference: 0.00000691
+ Max difference location: (0, 3, 3, 0)
+  Values at max diff - Original: 3.08589840, Converted: 3.08589149
+ Biggest difference in row (0, 3, 3), sum 5.127280 vs 5.127275
+
+Layer 4, Token 1 (v padded comparison):
+  Original tensor sum: -6.421658
+  Converted tensor sum: -6.417439
+  Original tensor mean: -0.000314
+  Converted tensor mean: -0.000313
+ Mean difference: 0.00000083
+ Maximum pointwise difference: 0.00020146
+ Max difference location: (0, 3, 3, 9)
+  Values at max diff - Original: 0.71459866, Converted: 0.71439719
+ Biggest difference in row (0, 2, 2), sum 1.330729 vs 1.330986
+
+Layer 5, Token 1 (v padded comparison):
+  Original tensor sum: -22.732481
+  Converted tensor sum: -22.732681
+  Original tensor mean: -0.001110
+  Converted tensor mean: -0.001110
+ Mean difference: 0.00000057
+ Maximum pointwise difference: 0.00014561
+ Max difference location: (0, 5, 2, 8)
+  Values at max diff - Original: 0.86213899, Converted: 0.86199337
+ Biggest difference in row (0, 5, 2), sum 0.321165 vs 0.320951
+
+Layer 6, Token 1 (v padded comparison):
+  Original tensor sum: 79.420486
+  Converted tensor sum: 79.392494
+  Original tensor mean: 0.003878
+  Converted tensor mean: 0.003877
+ Mean difference: 0.00000437
+ Maximum pointwise difference: 0.00160646
+ Max difference location: (0, 28, 3, 8)
+  Values at max diff - Original: 3.32436800, Converted: 3.32276154
+ Biggest difference in row (0, 8, 2), sum 5.307434 vs 5.305095
+
+Layer 8, Token 1 (v padded comparison):
+  Original tensor sum: 56.337997
+  Converted tensor sum: 56.328655
+  Original tensor mean: 0.002751
+  Converted tensor mean: 0.002750
+ Mean difference: 0.00000345
+ Maximum pointwise difference: 0.00109446
+ Max difference location: (0, 27, 3, 8)
+  Values at max diff - Original: 1.29648387, Converted: 1.29538941
+ Biggest difference in row (0, 0, 2), sum 3.391128 vs 3.390095
+
+Layer 9, Token 1 (v padded comparison):
+  Original tensor sum: -60.833374
+  Converted tensor sum: -60.822338
+  Original tensor mean: -0.002970
+  Converted tensor mean: -0.002970
+ Mean difference: 0.00000277
+ Maximum pointwise difference: 0.00082873
+ Max difference location: (0, 4, 2, 0)
+  Values at max diff - Original: 0.17745507, Converted: 0.17828380
+ Biggest difference in row (0, 29, 3), sum -0.619908 vs -0.618863
+
+Layer 10, Token 1 (v padded comparison):
+  Original tensor sum: -61.881168
+  Converted tensor sum: -61.881893
+  Original tensor mean: -0.003022
+  Converted tensor mean: -0.003022
+ Mean difference: 0.00000326
+ Maximum pointwise difference: 0.00088513
+ Max difference location: (0, 18, 3, 1)
+  Values at max diff - Original: 0.75186056, Converted: 0.75097543
+ Biggest difference in row (0, 1, 2), sum -0.687588 vs -0.688463
+
+Layer 12, Token 1 (v padded comparison):
+  Original tensor sum: -25.326912
+  Converted tensor sum: -25.328352
+  Original tensor mean: -0.001237
+  Converted tensor mean: -0.001237
+ Mean difference: 0.00000326
+ Maximum pointwise difference: 0.00108600
+ Max difference location: (0, 26, 1, 1)
+  Values at max diff - Original: 2.54334521, Converted: 2.54225922
+ Biggest difference in row (0, 16, 2), sum 1.421780 vs 1.420637
+
+Layer 13, Token 1 (v padded comparison):
+  Original tensor sum: -76.935516
+  Converted tensor sum: -76.941040
+  Original tensor mean: -0.003757
+  Converted tensor mean: -0.003757
+ Mean difference: 0.00000263
+ Maximum pointwise difference: 0.00127554
+ Max difference location: (0, 19, 1, 3)
+  Values at max diff - Original: 2.36973763, Converted: 2.36846209
+ Biggest difference in row (0, 19, 1), sum 1.449438 vs 1.448400
+
+Layer 14, Token 1 (v padded comparison):
+  Original tensor sum: -45.008949
+  Converted tensor sum: -45.003647
+  Original tensor mean: -0.002198
+  Converted tensor mean: -0.002197
+ Mean difference: 0.00000327
+ Maximum pointwise difference: 0.00136590
+ Max difference location: (0, 28, 3, 5)
+  Values at max diff - Original: 2.56902742, Converted: 2.56766152
+ Biggest difference in row (0, 28, 3), sum 1.363533 vs 1.361795
+
+================================================================================
+SUMMARY:
+Total comparisons attempted: 876
+Successful comparisons: 875
+Failed comparisons: 1
+
+Maximum difference statistics:
+  Min max difference: 0.00000024
+ Max max difference: 235.55526733
+ Mean of max differences: 18.71273422
+  Median of max differences: 5.37744808
+  Comparisons with diff > 1e-5: 804/875

+ 15 - 1
tools/main/main.cpp

@@ -258,7 +258,21 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
             }
             save_tensor(t, data, (tensor_name + "_" + std::to_string(cb_data->tensors[t->name]) + ".bin").c_str());
         }
-        ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
+        ggml_print_tensor(data, t->type, t->ne, t->nb, 4);
+        if (std::string(tensor_name) == std::string("attn_out_reshaped-0")) {
+            LOG("\nFull output tensor: \n[ ");
+            for (int64_t i3 = 0; i3 < t->ne[3]; i3++) {
+                for (int64_t i2 = 0; i2 < t->ne[2]; i2++) {
+                    for (int64_t i1 = 0; i1 < t->ne[1]; i1++) {
+                        for (int64_t i0 = 0; i0 < t->ne[0]; i0++) {
+                            const float v = ggml_get_float_value(data, t->type, t->nb, i0, i1, i2, i3);
+                            LOG("%.4f  ", v);
+                        }
+                    }
+                }
+            }
+            LOG(" ]");
+        }
     }
 
     return true;