hace 4 meses · 8ad038c0fd
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -244,6 +244,13 @@
 
				 #define GGML_MROPE_SECTIONS   4
			
 
				 
			
 
				 #define GGML_UNUSED(x) (void)(x)
			
 
				+#ifdef __CUDACC__
			
 
				+template<typename... Args>
			
 
				+__host__ __device__ constexpr inline void ggml_unused_vars_impl(Args&&...) noexcept {}
			
 
				+#define GGML_UNUSED_VARS(...) ggml_unused_vars_impl(__VA_ARGS__)
			
 
				+#else
			
 
				+#define GGML_UNUSED_VARS(...) do { (void)sizeof((__VA_ARGS__, 0)); } while(0)
			
 
				+#endif // __CUDACC__
			
 
				 
			
 
				 #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
			
 
				 
			
--- a/ggml/src/ggml-cuda/conv-transpose-1d.cu
+++ b/ggml/src/ggml-cuda/conv-transpose-1d.cu
@@ -34,10 +34,7 @@ static  __global__ void conv_transpose_1d_kernel(
 
				         }
			
 
				     }
			
 
				     dst[global_index] = accumulator;
			
 
				-    GGML_UNUSED(p0); GGML_UNUSED(d0); GGML_UNUSED(src0_ne3);
			
 
				-    GGML_UNUSED(src1_ne3); GGML_UNUSED(dst_ne3);
			
 
				-    GGML_UNUSED(src1_ne1); GGML_UNUSED(dst_ne1);
			
 
				-    GGML_UNUSED(src1_ne2); GGML_UNUSED(dst_ne2);
			
 
				+    GGML_UNUSED_VARS(p0, d0, src0_ne3, src1_ne3, dst_ne3, src1_ne1, dst_ne1, src1_ne2, dst_ne2);
			
 
				 }
			
 
				 
			
 
				 static void conv_transpose_1d_f32_f32_cuda(
			
--- a/ggml/src/ggml-cuda/convert.cu
+++ b/ggml/src/ggml-cuda/convert.cu
@@ -71,9 +71,7 @@ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, h
 
				         y2[iy/2 + threadIdx.x] = __hmul2(make_half2(qs.x, qs.y), __half2half2(d));
			
 
				     }
			
 
				 #else
			
 
				-    GGML_UNUSED(vx);
			
 
				-    GGML_UNUSED(y);
			
 
				-    GGML_UNUSED(k);
			
 
				+    GGML_UNUSED_VARS(vx, y, k);
			
 
				     NO_DEVICE_CODE;
			
 
				 #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
			
 
				 }
			
--- a/ggml/src/ggml-cuda/cpy.cu
+++ b/ggml/src/ggml-cuda/cpy.cu
@@ -134,8 +134,7 @@ void ggml_cuda_cpy_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_des
 
				     CUDA_CHECK(cudaMemcpyAsync(cuda_graph->dest_ptrs_d, host_dest_ptrs, host_dest_ptrs_size*sizeof(char *), cudaMemcpyHostToDevice, stream));
			
 
				     cuda_graph->graph_cpynode_index = 0; // reset index
			
 
				 #else
			
 
				-    GGML_UNUSED(cuda_graph); GGML_UNUSED(host_dest_ptrs);
			
 
				-    GGML_UNUSED(host_dest_ptrs_size); GGML_UNUSED(stream);
			
 
				+    GGML_UNUSED_VARS(cuda_graph, host_dest_ptrs, host_dest_ptrs_size, stream);
			
 
				 #endif
			
 
				 }
			
 
				 
			
--- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh
+++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
@@ -767,14 +767,11 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
 
				         }
			
 
				     }
			
 
				 #else
			
 
				-    GGML_UNUSED(Q_f2); GGML_UNUSED(K_h2); GGML_UNUSED(V_h2);
			
 
				-    GGML_UNUSED(mask_h2); GGML_UNUSED(dstk); GGML_UNUSED(dstk_fixup);
			
 
				-    GGML_UNUSED(scale); GGML_UNUSED(slope); GGML_UNUSED(logit_softcap);
			
 
				-    GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(stride_K); GGML_UNUSED(stride_V);
			
 
				-    GGML_UNUSED(stride_mask); GGML_UNUSED(tile_K);
			
 
				-    GGML_UNUSED(tile_V); GGML_UNUSED(tile_mask); GGML_UNUSED(Q_B);
			
 
				-    GGML_UNUSED(VKQ_C); GGML_UNUSED(KQ_max); GGML_UNUSED(KQ_rowsum);
			
 
				-    GGML_UNUSED(kb0); GGML_UNUSED(tile_Q);
			
 
				+    GGML_UNUSED_VARS(Q_f2, K_h2, V_h2, mask_h2, dstk, dstk_fixup,
			
 
				+        scale, slope, logit_softcap, ne01, ne02,
			
 
				+        stride_K, stride_V, stride_mask,
			
 
				+        tile_Q, tile_K, tile_V, tile_mask,
			
 
				+        Q_B, VKQ_C, KQ_max, KQ_rowsum, kb0);
			
 
				     NO_DEVICE_CODE;
			
 
				 #endif // TURING_MMA_AVAILABLE
			
 
				 }
			
@@ -1236,14 +1233,10 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
 
				         }
			
 
				     }
			
 
				 #else
			
 
				-    GGML_UNUSED(Q_f2); GGML_UNUSED(K_h2); GGML_UNUSED(V_h2);
			
 
				-    GGML_UNUSED(mask_h2); GGML_UNUSED(sinks_f);
			
 
				-    GGML_UNUSED(dstk); GGML_UNUSED(dstk_fixup);
			
 
				-    GGML_UNUSED(scale); GGML_UNUSED(slope); GGML_UNUSED(logit_softcap);
			
 
				-    GGML_UNUSED(ne01); GGML_UNUSED(ne02);
			
 
				-    GGML_UNUSED(stride_Q1); GGML_UNUSED(stride_Q2);
			
 
				-    GGML_UNUSED(stride_K); GGML_UNUSED(stride_V); GGML_UNUSED(stride_mask);
			
 
				-    GGML_UNUSED(jt); GGML_UNUSED(kb0_start); GGML_UNUSED(kb0_stop);
			
 
				+    GGML_UNUSED_VARS(Q_f2, K_h2, V_h2, mask_h2, sinks_f, dstk, dstk_fixup,
			
 
				+        scale, slope, logit_softcap, ne01, ne02,
			
 
				+        stride_Q1, stride_Q2, stride_K, stride_V, stride_mask,
			
 
				+        jt, kb0_start, kb0_stop);
			
 
				     NO_DEVICE_CODE;
			
 
				 #endif // TURING_MMA_AVAILABLE
			
 
				 }
			
@@ -1397,17 +1390,15 @@ static __global__ void flash_attn_ext_f16(
 
				         (Q_f2, K_h2, V_h2, mask_h2, sinks_f, dstk, dst_meta, scale, slope, logit_softcap,
			
 
				          ne01, ne02, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, kb0_start_kernel, kb0_stop_kernel);
			
 
				 #else
			
 
				-    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
			
 
				-    GGML_UNUSED(sinks); GGML_UNUSED(KV_max); GGML_UNUSED(dst); GGML_UNUSED(dst_meta);
			
 
				-    GGML_UNUSED(scale); GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
			
 
				-    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
			
 
				-    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03);
			
 
				-    GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
			
 
				-    GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
			
 
				-    GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
			
 
				-    GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
			
 
				-    GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33);
			
 
				-    GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33);
			
 
				+    GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
			
 
				+        max_bias, m0, m1, n_head_log2, logit_softcap,
			
 
				+        ne00, ne01, ne02, ne03,
			
 
				+              nb01, nb02, nb03,
			
 
				+        ne10, ne11, ne12, ne13,
			
 
				+              nb11, nb12, nb13,
			
 
				+              nb21, nb22, nb23,
			
 
				+              ne31, ne32, ne33,
			
 
				+              nb31, nb32, nb33);
			
 
				     NO_DEVICE_CODE;
			
 
				 #endif // defined(FLASH_ATTN_AVAILABLE) && defined(TURING_MMA_AVAILABLE)
			
 
				 }
			
--- a/ggml/src/ggml-cuda/fattn-tile-f16.cu
+++ b/ggml/src/ggml-cuda/fattn-tile-f16.cu
@@ -299,17 +299,15 @@ static __global__ void flash_attn_tile_ext_f16(
 
				         }
			
 
				     }
			
 
				 #else
			
 
				-    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
			
 
				-    GGML_UNUSED(sinks); GGML_UNUSED(KV_max); GGML_UNUSED(dst); GGML_UNUSED(dst_meta);
			
 
				-    GGML_UNUSED(scale); GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
			
 
				-    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
			
 
				-    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03);
			
 
				-    GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
			
 
				-    GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
			
 
				-    GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
			
 
				-    GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
			
 
				-    GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33);
			
 
				-    GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33);
			
 
				+    GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
			
 
				+        max_bias, m0, m1, n_head_log2, logit_softcap,
			
 
				+        ne00, ne01, ne02, ne03,
			
 
				+              nb01, nb02, nb03,
			
 
				+        ne10, ne11, ne12, ne13,
			
 
				+              nb11, nb12, nb13,
			
 
				+              nb21, nb22, nb23,
			
 
				+              ne31, ne32, ne33,
			
 
				+              nb31, nb32, nb33);
			
 
				     NO_DEVICE_CODE;
			
 
				 #endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
			
 
				 }
			
--- a/ggml/src/ggml-cuda/fattn-tile-f32.cu
+++ b/ggml/src/ggml-cuda/fattn-tile-f32.cu
@@ -38,6 +38,15 @@ static __global__ void flash_attn_tile_ext_f32(
 
				     return;
			
 
				 #endif // FP16_MMA_AVAILABLE
			
 
				     if (use_logit_softcap && !(D == 128 || D == 256)) {
			
 
				+        GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
			
 
				+            max_bias, m0, m1, n_head_log2, logit_softcap,
			
 
				+            ne00, ne01, ne02, ne03,
			
 
				+                  nb01, nb02, nb03,
			
 
				+            ne10, ne11, ne12, ne13,
			
 
				+                  nb11, nb12, nb13,
			
 
				+                  nb21, nb22, nb23,
			
 
				+                  ne31, ne32, ne33,
			
 
				+                  nb31, nb32, nb33);
			
 
				         NO_DEVICE_CODE;
			
 
				         return;
			
 
				     }
			
@@ -301,17 +310,15 @@ static __global__ void flash_attn_tile_ext_f32(
 
				         }
			
 
				     }
			
 
				 #else
			
 
				-    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
			
 
				-    GGML_UNUSED(sinks); GGML_UNUSED(KV_max); GGML_UNUSED(dst); GGML_UNUSED(dst_meta);
			
 
				-    GGML_UNUSED(scale); GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
			
 
				-    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
			
 
				-    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03);
			
 
				-    GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
			
 
				-    GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
			
 
				-    GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
			
 
				-    GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
			
 
				-    GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33);
			
 
				-    GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33);
			
 
				+    GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
			
 
				+        max_bias, m0, m1, n_head_log2, logit_softcap,
			
 
				+        ne00, ne01, ne02, ne03,
			
 
				+              nb01, nb02, nb03,
			
 
				+        ne10, ne11, ne12, ne13,
			
 
				+              nb11, nb12, nb13,
			
 
				+              nb21, nb22, nb23,
			
 
				+              ne31, ne32, ne33,
			
 
				+              nb31, nb32, nb33);
			
 
				     NO_DEVICE_CODE;
			
 
				 #endif // FLASH_ATTN_AVAILABLE
			
 
				 }
			
--- a/ggml/src/ggml-cuda/fattn-vec-f16.cuh
+++ b/ggml/src/ggml-cuda/fattn-vec-f16.cuh
@@ -349,17 +349,15 @@ static __global__ void flash_attn_vec_ext_f16(
 
				         dst_meta[((sequence*ne01 + ic0 + tid)*ne02 + head)*gridDim.y + blockIdx.y] = make_float2(kqmax[tid], kqsum[tid]);
			
 
				     }
			
 
				 #else
			
 
				-    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
			
 
				-    GGML_UNUSED(sinks); GGML_UNUSED(KV_max); GGML_UNUSED(dst); GGML_UNUSED(dst_meta);
			
 
				-    GGML_UNUSED(scale); GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
			
 
				-    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
			
 
				-    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03);
			
 
				-    GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
			
 
				-    GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
			
 
				-    GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
			
 
				-    GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
			
 
				-    GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33);
			
 
				-    GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33);
			
 
				+    GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
			
 
				+        max_bias, m0, m1, n_head_log2, logit_softcap,
			
 
				+        ne00, ne01, ne02, ne03,
			
 
				+              nb01, nb02, nb03,
			
 
				+        ne10, ne11, ne12, ne13,
			
 
				+              nb11, nb12, nb13,
			
 
				+              nb21, nb22, nb23,
			
 
				+              ne31, ne32, ne33,
			
 
				+              nb31, nb32, nb33);
			
 
				     NO_DEVICE_CODE;
			
 
				 #endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
			
 
				 }
			
--- a/ggml/src/ggml-cuda/fattn-vec-f32.cuh
+++ b/ggml/src/ggml-cuda/fattn-vec-f32.cuh
@@ -37,6 +37,15 @@ static __global__ void flash_attn_vec_ext_f32(
 
				 
			
 
				     // Skip unused kernel variants for faster compilation:
			
 
				     if (use_logit_softcap && !(D == 128 || D == 256)) {
			
 
				+        GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
			
 
				+            max_bias, m0, m1, n_head_log2, logit_softcap,
			
 
				+            ne00, ne01, ne02, ne03,
			
 
				+                  nb01, nb02, nb03,
			
 
				+            ne10, ne11, ne12, ne13,
			
 
				+                  nb11, nb12, nb13,
			
 
				+                  nb21, nb22, nb23,
			
 
				+                  ne31, ne32, ne33,
			
 
				+                  nb31, nb32, nb33);
			
 
				         NO_DEVICE_CODE;
			
 
				         return;
			
 
				     }
			
@@ -334,17 +343,15 @@ static __global__ void flash_attn_vec_ext_f32(
 
				         dst_meta[((sequence*ne01 + ic0 + tid)*ne02 + head)*gridDim.y + blockIdx.y] = make_float2(kqmax[tid], kqsum[tid]);
			
 
				     }
			
 
				 #else
			
 
				-    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
			
 
				-    GGML_UNUSED(sinks); GGML_UNUSED(KV_max); GGML_UNUSED(dst); GGML_UNUSED(dst_meta);
			
 
				-    GGML_UNUSED(scale); GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
			
 
				-    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
			
 
				-    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03);
			
 
				-    GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
			
 
				-    GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33);
			
 
				-    GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33);
			
 
				-    GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
			
 
				-    GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
			
 
				-    GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
			
 
				+    GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
			
 
				+        max_bias, m0, m1, n_head_log2, logit_softcap,
			
 
				+        ne00, ne01, ne02, ne03,
			
 
				+              nb01, nb02, nb03,
			
 
				+        ne10, ne11, ne12, ne13,
			
 
				+              nb11, nb12, nb13,
			
 
				+              nb21, nb22, nb23,
			
 
				+              ne31, ne32, ne33,
			
 
				+              nb31, nb32, nb33);
			
 
				     NO_DEVICE_CODE;
			
 
				 #endif // FLASH_ATTN_AVAILABLE
			
 
				 }
			
--- a/ggml/src/ggml-cuda/fattn-wmma-f16.cu
+++ b/ggml/src/ggml-cuda/fattn-wmma-f16.cu
@@ -471,16 +471,15 @@ static __global__ void flash_attn_ext_f16(
 
				         dst_meta[j_dst_unrolled] = dst_meta_val;
			
 
				     }
			
 
				 #else
			
 
				-    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
			
 
				-    GGML_UNUSED(sinks); GGML_UNUSED(KV_max); GGML_UNUSED(dst); GGML_UNUSED(dst_meta);
			
 
				-    GGML_UNUSED(scale); GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
			
 
				-    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
			
 
				-    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03);
			
 
				-    GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
			
 
				-    GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33); GGML_UNUSED(nb31);
			
 
				-    GGML_UNUSED(nb32); GGML_UNUSED(nb33); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
			
 
				-    GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
			
 
				-    GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
			
 
				+    GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
			
 
				+        max_bias, m0, m1, n_head_log2, logit_softcap,
			
 
				+        ne00, ne01, ne02, ne03,
			
 
				+              nb01, nb02, nb03,
			
 
				+        ne10, ne11, ne12, ne13,
			
 
				+              nb11, nb12, nb13,
			
 
				+              nb21, nb22, nb23,
			
 
				+              ne31, ne32, ne33,
			
 
				+              nb31, nb32, nb33);
			
 
				     NO_DEVICE_CODE;
			
 
				 #endif // defined(FLASH_ATTN_AVAILABLE) && (__CUDA_ARCH__ == GGML_CUDA_CC_VOLTA || (defined(GGML_HIP_ROCWMMA_FATTN) && defined(FP16_MMA_AVAILABLE)))
			
 
				 }
			
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -1328,9 +1328,7 @@ static void ggml_cuda_op_mul_mat_cublas(
 
				                     &beta,  dst_dd_i,    ldc));
			
 
				     }
			
 
				 
			
 
				-    GGML_UNUSED(dst);
			
 
				-    GGML_UNUSED(src1_ddq_i);
			
 
				-    GGML_UNUSED(src1_padded_row_size);
			
 
				+    GGML_UNUSED_VARS(dst, src1_ddq_i, src1_padded_row_size);
			
 
				 }
			
 
				 
			
 
				 static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
			
--- a/ggml/src/ggml-cuda/mma.cuh
+++ b/ggml/src/ggml-cuda/mma.cuh
@@ -291,9 +291,7 @@ namespace ggml_cuda_mma {
 
				             : "=r"(xi[0]), "=r"(xi[2]), "=r"(xi[1]), "=r"(xi[3])
			
 
				             : "l"(xs));
			
 
				 #else
			
 
				-        GGML_UNUSED(t);
			
 
				-        GGML_UNUSED(xs0);
			
 
				-        GGML_UNUSED(stride);
			
 
				+        GGML_UNUSED_VARS(t, xs0, stride);
			
 
				         NO_DEVICE_CODE;
			
 
				 #endif // TURING_MMA_AVAILABLE
			
 
				     }
			
@@ -315,9 +313,7 @@ namespace ggml_cuda_mma {
 
				             : "r"(A.x[1]), "r"(B.x[0]));
			
 
				 #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
			
 
				 #else
			
 
				-        GGML_UNUSED(D);
			
 
				-        GGML_UNUSED(A);
			
 
				-        GGML_UNUSED(B);
			
 
				+        GGML_UNUSED_VARS(D, A, B);
			
 
				         NO_DEVICE_CODE;
			
 
				 #endif // TURING_MMA_AVAILABLE
			
 
				     }
			
@@ -345,9 +341,7 @@ namespace ggml_cuda_mma {
 
				             : "r"(A.x[3]), "r"(B.x[1]));
			
 
				 #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
			
 
				 #else
			
 
				-        GGML_UNUSED(D);
			
 
				-        GGML_UNUSED(A);
			
 
				-        GGML_UNUSED(B);
			
 
				+        GGML_UNUSED_VARS(D, A, B);
			
 
				         NO_DEVICE_CODE;
			
 
				 #endif // TURING_MMA_AVAILABLE
			
 
				     }
			
@@ -372,9 +366,7 @@ namespace ggml_cuda_mma {
 
				             : "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[1]));
			
 
				 #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
			
 
				 #else
			
 
				-        GGML_UNUSED(D);
			
 
				-        GGML_UNUSED(A);
			
 
				-        GGML_UNUSED(B);
			
 
				+        GGML_UNUSED_VARS(D, A, B);
			
 
				         NO_DEVICE_CODE;
			
 
				 #endif // TURING_MMA_AVAILABLE
			
 
				     }
			
@@ -408,9 +400,7 @@ namespace ggml_cuda_mma {
 
				             : "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[3]));
			
 
				 #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
			
 
				 #else
			
 
				-        GGML_UNUSED(D);
			
 
				-        GGML_UNUSED(A);
			
 
				-        GGML_UNUSED(B);
			
 
				+        GGML_UNUSED_VARS(D, A, B);
			
 
				         NO_DEVICE_CODE;
			
 
				 #endif // TURING_MMA_AVAILABLE
			
 
				     }
			
@@ -425,9 +415,7 @@ namespace ggml_cuda_mma {
 
				             : "+r"(Dxi[0]), "+r"(Dxi[1]), "+r"(Dxi[2]), "+r"(Dxi[3])
			
 
				             : "r"(Axi[0]), "r"(Axi[1]), "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[0]), "r"(Bxi[1]));
			
 
				 #else
			
 
				-        GGML_UNUSED(D);
			
 
				-        GGML_UNUSED(A);
			
 
				-        GGML_UNUSED(B);
			
 
				+        GGML_UNUSED_VARS(D, A, B);
			
 
				         NO_DEVICE_CODE;
			
 
				 #endif // AMPERE_MMA_AVAILABLE
			
 
				     }
			
@@ -452,9 +440,7 @@ namespace ggml_cuda_mma {
 
				             : "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[1]));
			
 
				 #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
			
 
				 #else
			
 
				-        GGML_UNUSED(D);
			
 
				-        GGML_UNUSED(A);
			
 
				-        GGML_UNUSED(B);
			
 
				+        GGML_UNUSED_VARS(D, A, B);
			
 
				         NO_DEVICE_CODE;
			
 
				 #endif // TURING_MMA_AVAILABLE
			
 
				     }
			
@@ -469,9 +455,7 @@ namespace ggml_cuda_mma {
 
				             : "+r"(Dxi[0]), "+r"(Dxi[1]), "+r"(Dxi[2]), "+r"(Dxi[3])
			
 
				             : "r"(Axi[0]), "r"(Axi[1]), "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[0]), "r"(Bxi[1]));
			
 
				 #else
			
 
				-        GGML_UNUSED(D);
			
 
				-        GGML_UNUSED(A);
			
 
				-        GGML_UNUSED(B);
			
 
				+        GGML_UNUSED_VARS(D, A, B);
			
 
				         NO_DEVICE_CODE;
			
 
				 #endif // AMPERE_MMA_AVAILABLE
			
 
				     }
			
@@ -505,9 +489,7 @@ namespace ggml_cuda_mma {
 
				             : "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[3]));
			
 
				 #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
			
 
				 #else
			
 
				-        GGML_UNUSED(D);
			
 
				-        GGML_UNUSED(A);
			
 
				-        GGML_UNUSED(B);
			
 
				+        GGML_UNUSED_VARS(D, A, B);
			
 
				         NO_DEVICE_CODE;
			
 
				 #endif // TURING_MMA_AVAILABLE
			
 
				     }
			
@@ -533,9 +515,7 @@ namespace ggml_cuda_mma {
 
				                                                       0, 0, 0);
			
 
				 #endif // defined(CDNA3)
			
 
				 #else
			
 
				-        GGML_UNUSED(D);
			
 
				-        GGML_UNUSED(A);
			
 
				-        GGML_UNUSED(B);
			
 
				+        GGML_UNUSED_VARS(D, A, B);
			
 
				         NO_DEVICE_CODE;
			
 
				 #endif // AMD_MFMA_AVAILABLE
			
 
				     }
			
@@ -561,9 +541,7 @@ namespace ggml_cuda_mma {
 
				                                                      0, 0, 0);
			
 
				 #endif // defined(CDNA3)
			
 
				 #else
			
 
				-        GGML_UNUSED(D);
			
 
				-        GGML_UNUSED(A);
			
 
				-        GGML_UNUSED(B);
			
 
				+        GGML_UNUSED_VARS(D, A, B);
			
 
				         NO_DEVICE_CODE;
			
 
				 #endif // AMD_MFMA_AVAILABLE
			
 
				     }
			
--- a/ggml/src/ggml-cuda/mmf.cu
+++ b/ggml/src/ggml-cuda/mmf.cu
@@ -132,11 +132,11 @@ static __global__ void mul_mat_f(
 
				         dst[j*stride_col_dst + row0 + threadIdx.x] = sum;
			
 
				     }
			
 
				 #else
			
 
				+    GGML_UNUSED_VARS(x, y, ids, dst,
			
 
				+        ncols, nchannels_y, stride_row, stride_col_y, stride_col_dst,
			
 
				+        channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
			
 
				+        sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
			
 
				     NO_DEVICE_CODE;
			
 
				-    GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(ids); GGML_UNUSED(dst);
			
 
				-    GGML_UNUSED(ncols); GGML_UNUSED(nchannels_y); GGML_UNUSED(stride_row); GGML_UNUSED(stride_col_y); GGML_UNUSED(stride_col_dst);
			
 
				-    GGML_UNUSED(channel_ratio); GGML_UNUSED(stride_channel_x); GGML_UNUSED(stride_channel_y); GGML_UNUSED(stride_channel_dst);
			
 
				-    GGML_UNUSED(sample_ratio); GGML_UNUSED(stride_sample_x); GGML_UNUSED(stride_sample_y); GGML_UNUSED(stride_sample_dst);
			
 
				 #endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
			
 
				 }
			
 
				 
			
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -266,10 +266,7 @@ void ggml_cuda_op_mul_mat_q(
 
				 
			
 
				     ggml_cuda_mul_mat_q_switch_type(ctx, args, stream);
			
 
				 
			
 
				-    GGML_UNUSED(src1);
			
 
				-    GGML_UNUSED(dst);
			
 
				-    GGML_UNUSED(src1_ddf_i);
			
 
				-    GGML_UNUSED(src1_padded_row_size);
			
 
				+    GGML_UNUSED_VARS(src1, dst, src1_ddf_i, src1_padded_row_size);
			
 
				 }
			
 
				 
			
 
				 bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
			
--- a/ggml/src/ggml-cuda/mmq.cuh
+++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -1255,7 +1255,7 @@ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma(
 
				         }
			
 
				     }
			
 
				 #else
			
 
				-    GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum); GGML_UNUSED(k00);
			
 
				+    GGML_UNUSED_VARS(x, y, sum, k00);
			
 
				     NO_DEVICE_CODE;
			
 
				 #endif // AMD_MFMA_AVAILABLE
			
 
				 }
			
@@ -1572,7 +1572,7 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma(
 
				         }
			
 
				     }
			
 
				 #else
			
 
				-    GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum); GGML_UNUSED(k00);
			
 
				+    GGML_UNUSED_VARS(x, y, sum, k00);
			
 
				     NO_DEVICE_CODE;
			
 
				 #endif // AMD_MFMA_AVAILABLE
			
 
				 }
			
@@ -2301,7 +2301,7 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma(
 
				         }
			
 
				     }
			
 
				 #else
			
 
				-    GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum); GGML_UNUSED(k00);
			
 
				+    GGML_UNUSED_VARS(x, y, sum, k00);
			
 
				     NO_DEVICE_CODE;
			
 
				 #endif // AMD_MFMA_AVAILABLE
			
 
				 }
			
--- a/ggml/src/ggml-cuda/mmvf.cu
+++ b/ggml/src/ggml-cuda/mmvf.cu
@@ -433,12 +433,7 @@ void ggml_cuda_op_mul_mat_vec_f(
 
				             GGML_ABORT("unsupported type: %s", ggml_type_name(src0->type));
			
 
				     }
			
 
				 
			
 
				-    GGML_UNUSED(ctx);
			
 
				-    GGML_UNUSED(src1);
			
 
				-    GGML_UNUSED(dst);
			
 
				-    GGML_UNUSED(src1_ddq_i);
			
 
				-    GGML_UNUSED(src1_ncols);
			
 
				-    GGML_UNUSED(src1_padded_row_size);
			
 
				+    GGML_UNUSED_VARS(ctx, src1, dst, src1_ddq_i, src1_ncols, src1_padded_row_size);
			
 
				 }
			
 
				 
			
 
				 bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0_ne, int64_t ne11) {
			
--- a/ggml/src/ggml-cuda/mmvq.cu
+++ b/ggml/src/ggml-cuda/mmvq.cu
@@ -596,9 +596,5 @@ void ggml_cuda_op_mul_mat_vec_q(
 
				         src0_dd_i, src0->type, src1_ddq_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row_x, stride_col_y, nrows_dst,
			
 
				         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, stream);
			
 
				 
			
 
				-    GGML_UNUSED(src1);
			
 
				-    GGML_UNUSED(dst);
			
 
				-    GGML_UNUSED(src1_ddf_i);
			
 
				-    GGML_UNUSED(src1_ncols);
			
 
				-    GGML_UNUSED(src1_padded_row_size);
			
 
				+    GGML_UNUSED_VARS(src1, dst, src1_ddf_i, src1_ncols, src1_padded_row_size);
			
 
				 }