ggml-hexagon: create generalized functions for cpu side op (#17500)

* refactor: replace ggml_hexagon_mul_mat with template-based binary operation for improved flexibility

* refactor: replace ggml_hexagon_mul_mat_id with template-based binary operation for improved flexibility

* refactor: initialize buffer types and streamline dspqueue_buffers_init calls for clarity

* add comment

* refactor: remove redundant buffer checks in hexagon supported operations

* wip

* add missing include to fix weak symbol warning

* add ggml_hexagon_op_generic

* refactor: simplify tensor operation initialization and buffer management in hexagon implementation

* refactor: streamline hexagon operation initialization and buffer management

* refactor: update function signatures and streamline request handling in hexagon operations

* wip

* ggml-hexagon: clean up code formatting and improve unary operation handling

* wip

* rename

* fix: add support for permuted F16 tensors and enhance quantization checks in matrix operations

* hexagon: fix merge conflicts

* hexagon: minor cleanup for buffer support checks

* hexagon: factor out op_desc and the overall op logging

* hexagon: further simplify and cleanup op dispatch logic

* snapdragon: update adb scripts to use llama-cli and llama-completion

* fix pipeline failure

---------

Co-authored-by: Max Krasnyansky <maxk@qti.qualcomm.com>
nullname, 1 month ago
commit ed75977717
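
Per the messages above, the core of this change is collapsing the per-op CPU-side entry points (ggml_hexagon_mul_mat, ggml_hexagon_mul_mat_id, and the unary/generic ops) into template-based functions that share the request-building and buffer-management code. A minimal, purely illustrative C++ sketch of that pattern (the types and names below are stand-ins, not the actual ggml-hexagon API):

```
// Illustrative sketch only: one templated CPU-side entry point for binary ops
// (mul_mat, mul_mat_id, ...) instead of one hand-written function per op.
#include <cstdint>
#include <cstdio>

struct fake_tensor { const char * name; const fake_tensor * src[2]; };  // stand-in for ggml_tensor

// The op-specific part is reduced to a compile-time id; the body holds the
// common work (validate, map buffers, build and enqueue the request).
template <uint32_t OpId>
static void hexagon_binary_op(const fake_tensor * dst) {
    const fake_tensor * src0 = dst->src[0];
    const fake_tensor * src1 = dst->src[1];
    std::printf("enqueue op %u: %s x %s -> %s\n",
                (unsigned) OpId, src0->name, src1->name, dst->name);
}

int main() {
    fake_tensor a{"a", {nullptr, nullptr}};
    fake_tensor b{"b", {nullptr, nullptr}};
    fake_tensor c{"c", {&a, &b}};
    hexagon_binary_op<1>(&c);  // e.g. 1 = "mul_mat"; other op ids reuse the same body
}
```

The actual dispatch logic lives in ggml/src/ggml-hexagon/ggml-hexagon.cpp, whose diff is suppressed below due to its size.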

+ 5 - 5
docs/backend/hexagon/README.md

@@ -106,7 +106,7 @@ Here are some examples of running various llama.cpp tools via ADB.
 Simple question for Llama-3.2-1B
 
 ```
-~/src/llama.cpp$ M=Llama-3.2-1B-Instruct-Q4_0.gguf D=HTP0 ./scripts/snapdragon/adb/run-cli.sh -no-cnv -p "what is the most popular cookie in the world?"
+~/src/llama.cpp$ M=Llama-3.2-1B-Instruct-Q4_0.gguf D=HTP0 ./scripts/snapdragon/adb/run-completion.sh -p "what is the most popular cookie in the world?"
 ...
 ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev 1
 ggml-hex: Hexagon Arch version v79
@@ -136,7 +136,7 @@ llama_memory_breakdown_print: |   - HTP0-REPACK        |                  504 =
 Summary request for OLMoE-1B-7B. This is a large model that requires two HTP sessions/devices
 
 ```
-~/src/llama.cpp$ M=OLMoE-1B-7B-0125-Instruct-Q4_0.gguf NDEV=2 D=HTP0,HTP1 ./scripts/snapdragon/adb/run-cli.sh -f surfing.txt -no-cnv
+~/src/llama.cpp$ M=OLMoE-1B-7B-0125-Instruct-Q4_0.gguf NDEV=2 D=HTP0,HTP1 ./scripts/snapdragon/adb/run-completion.sh -f surfing.txt
 ...
 ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev 1
 ggml-hex: Hexagon Arch version v81
@@ -234,6 +234,6 @@ build: 6a8cf8914 (6733)
 
   Examples:
 
-      `GGML_HEXAGON_OPMASK=0x1 llama-cli ...` - Ops are enqueued but NPU-side processing is stubbed out
-      `GGML_HEXAGON_OPMASK=0x3 llama-cli ...` - NPU performs dynamic quantization and skips the rest
-      `GGML_HEXAGON_OPMASK=0x7 llama-cli ...` - Full queuing and processing of Ops (default)
+      `GGML_HEXAGON_OPMASK=0x1 llama-completion ...` - Ops are enqueued but NPU-side processing is stubbed out
+      `GGML_HEXAGON_OPMASK=0x3 llama-completion ...` - NPU performs dynamic quantization and skips the rest
+      `GGML_HEXAGON_OPMASK=0x7 llama-completion ...` - Full queuing and processing of Ops (default)
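
The adb wrapper scripts touched by this commit forward OPMASK as GGML_HEXAGON_OPMASK, so the same masks can be applied through run-completion.sh as well; a hypothetical example in the style of the README commands above (model and prompt are placeholders):

```
M=Llama-3.2-1B-Instruct-Q4_0.gguf D=HTP0 OPMASK=0x3 ./scripts/snapdragon/adb/run-completion.sh -p "what is the most popular cookie in the world?"
```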

+ 1 - 1
docs/backend/hexagon/developer.md

@@ -49,7 +49,7 @@ Each Hexagon device behaves like a GPU from the offload and model splitting pers
 Here is an example of running GPT-OSS-20B model on a newer Snapdragon device with 16GB of DDR.
 
 ```
-M=gpt-oss-20b-Q4_0.gguf NDEV=4 D=HTP0,HTP1,HTP2,HTP3 P=surfing.txt scripts/snapdragon/adb/run-cli.sh -no-cnv -f surfing.txt -n 32
+M=gpt-oss-20b-Q4_0.gguf NDEV=4 D=HTP0,HTP1,HTP2,HTP3 P=surfing.txt scripts/snapdragon/adb/run-completion.sh -f surfing.txt -n 32
 ...
 LD_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib
 ADSP_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib

File diff suppressed because it is too large
+ 155 - 708
ggml/src/ggml-hexagon/ggml-hexagon.cpp


+ 1 - 0
ggml/src/ggml-hexagon/htp-utils.h

@@ -8,6 +8,7 @@ extern "C" {
 #include <AEEStdErr.h>
 #include <inttypes.h>
 #include <remote.h>
+#include <rpcmem.h>
 #include <stdbool.h>
 
 /* Offset to differentiate HLOS and Hexagon error codes.

+ 153 - 0
ggml/src/ggml-hexagon/op-desc.h

@@ -0,0 +1,153 @@
+#ifndef OP_DESC_H
+#define OP_DESC_H
+
+#define GGML_COMMON_IMPL_CPP
+#include "ggml-backend-impl.h"
+#include "ggml-common.h"
+
+#include <string>
+#include <stdio.h>
+
+struct op_desc {
+    char strides[64 * GGML_MAX_SRC];
+    char dims[64 * GGML_MAX_SRC];
+    char types[16 * GGML_MAX_SRC];
+    char buffs[64 * GGML_MAX_SRC];
+    char names[64 * GGML_MAX_SRC];
+
+    int format_tensor_dims(char * str, const struct ggml_tensor * t) {
+        if (t->ne[2] == 1 && t->ne[3] == 1) {
+            return sprintf(str, "%d:%d", (int) t->ne[0], (int) t->ne[1]);
+        } else {
+            return sprintf(str, "%d:%d:%d:%d", (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]);
+        }
+    }
+
+    void format_op_dims(char * str, const struct ggml_tensor * t) {
+        char * p = str;
+
+        // append src0 and src1 (if any)
+        if (t->src[0]) {
+            p += format_tensor_dims(p, t->src[0]);
+
+            for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
+                p += sprintf(p, " x ");
+                p += format_tensor_dims(p, t->src[i]);
+            }
+
+            p += sprintf(p, " -> ");
+        }
+
+        // format self dims separately for better visual alignment
+        char self[64];
+        format_tensor_dims(self, t);
+
+        p += sprintf(p, "%s", self);
+    }
+
+    int format_tensor_strides(char * str, const struct ggml_tensor * t) {
+        const char * c = ggml_is_contiguous(t) ? "" : "!";
+
+        if (t->ne[2] == 1 && t->ne[3] == 1) {
+            return sprintf(str, "%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], c);
+        } else {
+            return sprintf(str, "%zu:%zu:%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], (size_t) t->nb[2], (size_t) t->nb[3], c);
+        }
+    }
+
+    void format_op_strides(char * str, const struct ggml_tensor * t) {
+        char * p = str;
+
+        // append src0 and src1 (if any)
+        if (t->src[0]) {
+            p += format_tensor_strides(p, t->src[0]);
+
+            for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
+                p += sprintf(p, " x ");
+                p += format_tensor_strides(p, t->src[i]);
+            }
+
+            p += sprintf(p, " -> ");
+        }
+
+        // format self dims separately for better visual alignment
+        char self[64];
+        format_tensor_strides(self, t);
+
+        p += sprintf(p, "%s", self);
+    }
+
+    void format_op_types(char * str, const struct ggml_tensor * t) {
+        char * p = str;
+
+        // append src0 and src1 (if any)
+        if (t->src[0]) {
+            p += sprintf(p, "%s", ggml_type_name(t->src[0]->type));
+
+            for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
+                p += sprintf(p, " x ");
+                p += sprintf(p, "%s", ggml_type_name(t->src[i]->type));
+            }
+
+            p += sprintf(p, " -> ");
+        }
+
+        p += sprintf(p, "%s", ggml_type_name(t->type));
+    }
+
+    const char * tensor_buff_name(const struct ggml_tensor * t) {
+        if (t->buffer) {
+            return ggml_backend_buffer_name(t->buffer);
+        }
+        return "NONE";
+    }
+
+    void format_op_buffs(char * str, const struct ggml_tensor * t) {
+        char * p = str;
+
+        // append src0 and src1 (if any)
+        if (t->src[0]) {
+            p += sprintf(p, "%s", tensor_buff_name(t->src[0]));
+
+            for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
+                p += sprintf(p, " x ");
+                p += sprintf(p, "%s", tensor_buff_name(t->src[i]));
+            }
+
+            p += sprintf(p, " -> ");
+        }
+
+        p += sprintf(p, "%s", tensor_buff_name(t));
+    }
+
+    void format_op_names(char * str, const struct ggml_tensor * t) {
+        char * p = str;
+
+        // append src0 and src1 (if any)
+        if (t->src[0]) {
+            p += sprintf(p, "%s", t->src[0]->name);
+
+            for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
+                p += sprintf(p, " x ");
+                p += sprintf(p, "%s", t->src[i]->name);
+            }
+
+            p += sprintf(p, " -> ");
+        }
+
+        p += sprintf(p, "%s", t->name);
+    }
+
+    void format(const ggml_tensor * op) {
+        format_op_dims(dims, op);
+        format_op_strides(strides, op);
+        format_op_types(types, op);
+        format_op_buffs(buffs, op);
+        format_op_names(names, op);
+    }
+
+    op_desc() {}
+    op_desc(const ggml_tensor * op) { format(op); }
+};
+
+#endif // OP_DESC_H
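
op_desc pre-formats the names, dimensions, strides, types, and buffer names of an op and its sources into fixed char buffers, so the backend's op logging can print them in one go. A minimal usage sketch, assuming op-desc.h and the ggml headers are included (the log format and call site are assumptions, not the commit's actual logging code):

```
// Hypothetical call site: describe an op before it is enqueued.
// op_desc(op) runs format() in the constructor and fills the char buffers.
static void log_op(const ggml_tensor * op) {
    op_desc d(op);
    fprintf(stderr, "ggml-hex: %s : %s : %s : %s : %s\n",
            ggml_op_name(op->op), d.names, d.dims, d.types, d.buffs);
}
```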

+ 9 - 9
scripts/snapdragon/adb/run-cli.sh

@@ -18,17 +18,17 @@ model="Llama-3.2-3B-Instruct-Q4_0.gguf"
 device="HTP0"
 [ "$D" != "" ] && device="$D"
 
-verbose=
-[ "$V" != "" ] && verbose="GGML_HEXAGON_VERBOSE=$V"
-
 experimental=
 [ "$E" != "" ] && experimental="GGML_HEXAGON_EXPERIMENTAL=$E"
 
+verbose=
+[ "$V" != "" ] && verbose="GGML_HEXAGON_VERBOSE=$V" cli_opts="$cli_opts -v"
+
 sched=
 [ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v"
 
 profile=
-[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1"
+[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1" cli_opts="$cli_opts -v"
 
 opmask=
 [ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK"
@@ -45,9 +45,9 @@ adb $adbserial shell " \
   cd $basedir; ulimit -c unlimited;        \
     LD_LIBRARY_PATH=$basedir/$branch/lib   \
     ADSP_LIBRARY_PATH=$basedir/$branch/lib \
-    $verbose $experimental $sched $opmask $profile $nhvx $ndev       \
-      ./$branch/bin/llama-completion --no-mmap -m $basedir/../gguf/$model   \
-         --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1             \
-         --ctx-size 8192 --batch-size 128 -ctk q8_0 -ctv q8_0 -fa on \
-         -ngl 99 --device $device $cli_opts $@ \
+    $verbose $experimental $sched $opmask $profile $nhvx $ndev     \
+      ./$branch/bin/llama-cli --no-mmap -m $basedir/../gguf/$model \
+         --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1           \
+         --ctx-size 8192 --batch-size 128 -fa on \
+         -ngl 99 --device $device $cli_opts $@   \
 "

+ 53 - 0
scripts/snapdragon/adb/run-completion.sh

@@ -0,0 +1,53 @@
+#!/bin/sh
+#
+
+# Basedir on device
+basedir=/data/local/tmp/llama.cpp
+
+cli_opts=
+
+branch=.
+[ "$B" != "" ] && branch=$B
+
+adbserial=
+[ "$S" != "" ] && adbserial="-s $S"
+
+model="Llama-3.2-3B-Instruct-Q4_0.gguf"
+[ "$M" != "" ] && model="$M"
+
+device="HTP0"
+[ "$D" != "" ] && device="$D"
+
+experimental=
+[ "$E" != "" ] && experimental="GGML_HEXAGON_EXPERIMENTAL=$E"
+
+verbose=
+[ "$V" != "" ] && verbose="GGML_HEXAGON_VERBOSE=$V" cli_opts="$cli_opts -v"
+
+sched=
+[ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v"
+
+profile=
+[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1" cli_opts="$cli_opts -v"
+
+opmask=
+[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK"
+
+nhvx=
+[ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX"
+
+ndev=
+[ "$NDEV" != "" ] && ndev="GGML_HEXAGON_NDEV=$NDEV"
+
+set -x
+
+adb $adbserial shell " \
+  cd $basedir; ulimit -c unlimited;        \
+    LD_LIBRARY_PATH=$basedir/$branch/lib   \
+    ADSP_LIBRARY_PATH=$basedir/$branch/lib \
+    $verbose $experimental $sched $opmask $profile $nhvx $ndev            \
+      ./$branch/bin/llama-completion --no-mmap -m $basedir/../gguf/$model \
+         --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1                  \
+         --ctx-size 8192 --batch-size 128 -fa on \
+         -ngl 99 -no-cnv --device $device $cli_opts $@   \
+"

Some files were not shown because too many files changed in this diff