1 månad sedan · c45f89d551
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -1976,9 +1976,6 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
 
				             break;
			
 
				 
			
 
				         case GGML_TYPE_F16:
			
 
				-            if (!opt_experimental) {
			
 
				-                return false;
			
 
				-            }
			
 
				             break;
			
 
				 
			
 
				         default:
			
--- a/ggml/src/ggml-hexagon/htp/matmul-ops.c
+++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c
@@ -903,7 +903,7 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri
 
				         const float * restrict vy  = (const float * restrict) y;
			
 
				 
			
 
				         for (uint32_t i = 0; i < n; i++) {
			
 
				-            rsum += vx[i] * (__fp16) vy[i];
			
 
				+            rsum += (float)vx[i] * vy[i];
			
 
				         }
			
 
				         *s = rsum;
			
 
				         return;
			
@@ -917,7 +917,7 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri
 
				 
			
 
				     // for some reason we need volatile here so that the compiler doesn't try anything funky
			
 
				     volatile HVX_Vector rsum = Q6_V_vsplat_R(0);
			
 
				-
			
 
				+    float r_sum_scalar = 0.0f;
			
 
				     uint32_t i = 0;
			
 
				 
			
 
				     for (i = 0; i < nv0; i++) {
			
@@ -926,31 +926,42 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri
 
				         HVX_Vector     x  = vx[i];
			
 
				         HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), Q6_Vh_vsplat_R(0x3C00));  // mul by 1.0
			
 
				 
			
 
				-        HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(xp)), Q6_V_hi_W(yp));
			
 
				-        HVX_Vector lo = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp));
			
 
				+        //NOTE: need volatile here to prevent compiler optimization
			
 
				+        // Seem compiler cannot guarantee read-after-write??
			
 
				+        volatile HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(xp)), Q6_V_hi_W(yp));
			
 
				+        volatile HVX_Vector lo = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp));
			
 
				 
			
 
				         HVX_Vector sum = Q6_Vqf32_vadd_Vqf32Vqf32(hi, lo);
			
 
				         rsum           = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, sum);
			
 
				     }
			
 
				 
			
 
				     if (nv1) {
			
 
				-        HVX_VectorPair yp = vy[i];
			
 
				+        // HVX_VectorPair yp = vy[i];
			
 
				 
			
 
				-        HVX_Vector     x  = vx[i];
			
 
				-        HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), Q6_Vh_vsplat_R(0x3C00));  // mul by 1.0
			
 
				+        // HVX_Vector     x  = vx[i];
			
 
				+        // HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), Q6_Vh_vsplat_R(0x3C00));  // mul by 1.0
			
 
				 
			
 
				-        if (nv1 >= 32) {
			
 
				-            HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(xp)), Q6_V_hi_W(yp));
			
 
				-            rsum          = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, hi);
			
 
				-            nv1 -= 32;
			
 
				-        }
			
 
				+        // if (nv1 >= 32) {
			
 
				+        //     volatile HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(xp)), Q6_V_hi_W(yp));
			
 
				+        //     rsum          = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, hi);
			
 
				+        //     nv1 -= 32;
			
 
				+        // }
			
 
				 
			
 
				+        // rsum = hvx_vec_qf32_reduce_sum(rsum);
			
 
				+
			
 
				+        // if (nv1) {
			
 
				+        //     volatile HVX_Vector lo  = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp));
			
 
				+        //     HVX_Vector sum = hvx_vec_qf32_reduce_sum_n(lo, nv1);
			
 
				+        //     rsum           = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, sum);
			
 
				+        // }
			
 
				+
			
 
				+        //process the remainder using scalar loop
			
 
				         rsum = hvx_vec_qf32_reduce_sum(rsum);
			
 
				+        const __fp16 * restrict sx = (const __fp16 * restrict) x;
			
 
				+        const float * restrict sy  = (const float * restrict) y;
			
 
				 
			
 
				-        if (nv1) {
			
 
				-            HVX_Vector lo  = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp));
			
 
				-            HVX_Vector sum = hvx_vec_qf32_reduce_sum_n(lo, nv1);
			
 
				-            rsum           = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, sum);
			
 
				+        for (uint32_t i = nv0 * 64; i < n; i++) {
			
 
				+            r_sum_scalar += (float) sx[i] * sy[i];
			
 
				         }
			
 
				 
			
 
				         // hvx_vec_dump_fp16("X", x);
			
@@ -961,7 +972,7 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri
 
				         rsum = hvx_vec_qf32_reduce_sum(rsum);
			
 
				     }
			
 
				 
			
 
				-    *s = hvx_vec_get_fp32(Q6_Vsf_equals_Vqf32(rsum));
			
 
				+    *s = hvx_vec_get_fp32(Q6_Vsf_equals_Vqf32(rsum)) + r_sum_scalar;
			
 
				 
			
 
				 #    ifdef HTP_DEBUG
			
 
				     {
			
@@ -1498,9 +1509,6 @@ static void matmul_f16_f32(struct htp_tensor * restrict src0,
 
				     uint64_t t1, t2;
			
 
				     t1 = HAP_perf_get_qtimer_count();
			
 
				 
			
 
				-    const size_t src0_row_size = sizeof(__fp16) * ne00;
			
 
				-    const size_t src1_row_size = sizeof(float) * ne10;
			
 
				-
			
 
				     assert(ne12 % ne02 == 0);
			
 
				     assert(ne13 % ne03 == 0);
			
 
				 
			
@@ -1510,8 +1518,6 @@ static void matmul_f16_f32(struct htp_tensor * restrict src0,
 
				     // This is the size of the rest of the dimensions of the result
			
 
				     const uint32_t nr1 = ne1 * ne2 * ne3;
			
 
				 
			
 
				-    uint32_t chunk_size = 64;
			
 
				-
			
 
				     // distribute the thread work across the inner or outer loop based on which one is larger
			
 
				     uint32_t nchunk0 = nr0 > nr1 ? nth : 1;  // parallelize by src0 rows
			
 
				     uint32_t nchunk1 = nr0 > nr1 ? 1 : nth;  // parallelize by src1 rows
			
@@ -1544,11 +1550,11 @@ static void matmul_f16_f32(struct htp_tensor * restrict src0,
 
				     const uint32_t blck_0 = 64;
			
 
				     const uint32_t blck_1 = 64;
			
 
				 
			
 
				-    float tmp[32];
			
 
				+    __attribute__((aligned(128))) float tmp[64];
			
 
				 
			
 
				     for (uint32_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) {
			
 
				         for (uint32_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) {
			
 
				-            for (uint32_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ir1++) {
			
 
				+            for (uint32_t ir1 = iir1; ir1 < MIN(iir1 + blck_1, ir1_end); ir1++) {
			
 
				                 const uint32_t i13 = (ir1 / (ne12 * ne1));
			
 
				                 const uint32_t i12 = (ir1 - i13 * ne12 * ne1) / ne1;
			
 
				                 const uint32_t i11 = (ir1 - i13 * ne12 * ne1 - i12 * ne1);
			
@@ -1561,13 +1567,16 @@ static void matmul_f16_f32(struct htp_tensor * restrict src0,
 
				                 const uint32_t i2 = i12;
			
 
				                 const uint32_t i3 = i13;
			
 
				 
			
 
				-                const uint8_t * restrict src0_row = (const uint8_t *) src0->data + (0 + i02 * nb02 + i03 * nb03);
			
 
				+                const uint8_t * restrict src0_base = (const uint8_t *) src0->data + (0 + i02 * nb02 + i03 * nb03);
			
 
				                 const uint8_t * restrict src1_col =
			
 
				-                    (const uint8_t *) src1->data + (i11 + i12 * ne11 + i13 * ne12 * ne11) * src1_row_size;
			
 
				+                    (const uint8_t *) src1->data + (i11 * nb11 + i12 * nb12 + i13 * nb13);
			
 
				                 float * dst_col = (float *) ((uint8_t * restrict) dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3));
			
 
				 
			
 
				-                for (uint32_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0++) {
			
 
				-                    vec_dot_f16_f32(ne00, &tmp[ir0 - iir0], src0_row + ir0 * src0_row_size, src1_col);
			
 
				+                const uint32_t ir0_block_end = MIN(iir0 + blck_0, ir0_end);
			
 
				+                for (uint32_t ir0 = iir0; ir0 < ir0_block_end; ir0++) {
			
 
				+                    // Use nb01 stride for non-contiguous src0 support
			
 
				+                    const uint8_t * restrict src0_row = src0_base + ir0 * nb01;
			
 
				+                    vec_dot_f16_f32(ne00, &tmp[ir0 - iir0], src0_row, src1_col);
			
 
				                 }
			
 
				 
			
 
				                 hvx_copy_fp32_ua((uint8_t *) &dst_col[iir0], (uint8_t *) tmp, MIN(iir0 + blck_0, ir0_end) - iir0);
			
--- a/scripts/snapdragon/adb/run-mtmd.sh
+++ b/scripts/snapdragon/adb/run-mtmd.sh
@@ -0,0 +1,65 @@
 
				+#!/bin/sh
			
 
				+#
			
 
				+
			
 
				+# Basedir on device
			
 
				+basedir=/data/local/tmp/llama.cpp
			
 
				+
			
 
				+cli_opts=
			
 
				+
			
 
				+branch=.
			
 
				+[ "$B" != "" ] && branch=$B
			
 
				+
			
 
				+adbserial=
			
 
				+[ "$S" != "" ] && adbserial="-s $S"
			
 
				+
			
 
				+model="gemma-3-4b-it-Q4_0.gguf"
			
 
				+[ "$M" != "" ] && model="$M"
			
 
				+
			
 
				+mmproj="mmproj-F16.gguf"
			
 
				+[ "$MMPROJ" != "" ] && mmproj="$MMPROJ"
			
 
				+
			
 
				+image=
			
 
				+[ "$IMG" != "" ] && image="$IMG"
			
 
				+
			
 
				+device="HTP0"
			
 
				+[ "$D" != "" ] && device="$D"
			
 
				+
			
 
				+verbose=
			
 
				+[ "$V" != "" ] && verbose="GGML_HEXAGON_VERBOSE=$V"
			
 
				+
			
 
				+experimental="GGML_HEXAGON_EXPERIMENTAL=1"
			
 
				+[ "$E" != "" ] && experimental="GGML_HEXAGON_EXPERIMENTAL=$E"
			
 
				+
			
 
				+sched=
			
 
				+[ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v"
			
 
				+
			
 
				+profile=
			
 
				+[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1"
			
 
				+
			
 
				+opmask=
			
 
				+[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK"
			
 
				+
			
 
				+nhvx=
			
 
				+[ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX"
			
 
				+
			
 
				+ndev=
			
 
				+[ "$NDEV" != "" ] && ndev="GGML_HEXAGON_NDEV=$NDEV"
			
 
				+
			
 
				+# MTMD backend device for vision model (defaults to CPU if not set)
			
 
				+mtmd_backend=
			
 
				+[ "$MTMD_DEVICE" != "" ] && mtmd_backend="MTMD_BACKEND_DEVICE=$MTMD_DEVICE"
			
 
				+
			
 
				+set -x
			
 
				+
			
 
				+adb $adbserial shell " \
			
 
				+  cd $basedir; ulimit -c unlimited;        \
			
 
				+    LD_LIBRARY_PATH=$basedir/$branch/lib   \
			
 
				+    ADSP_LIBRARY_PATH=$basedir/$branch/lib \
			
 
				+    $verbose $experimental $sched $opmask $profile $nhvx $ndev $mtmd_backend       \
			
 
				+      ./$branch/bin/llama-mtmd-cli --no-mmap -m $basedir/../gguf/$model   \
			
 
				+         --mmproj $basedir/../gguf/$mmproj \
			
 
				+         --image $basedir/../gguf/$image \
			
 
				+         --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1             \
			
 
				+         --ctx-size 8192 --batch-size 128 -ctk q8_0 -ctv q8_0 -fa on \
			
 
				+         -ngl 99 --device $device -v $cli_opts $@ \
			
 
				+"