ggml-hexagon: create generalized functions for cpu side op (#17500)

* refactor: replace ggml_hexagon_mul_mat with template-based binary operation for improved flexibility

* refactor: replace ggml_hexagon_mul_mat_id with template-based binary operation for improved flexibility

* refactor: initialize buffer types and streamline dspqueue_buffers_init calls for clarity

* add comment

* refactor: remove redundant buffer checks in hexagon supported operations

* wip

* add missing include to fix weak symbol warning

* add ggml_hexagon_op_generic

* refactor: simplify tensor operation initialization and buffer management in hexagon implementation

* refactor: streamline hexagon operation initialization and buffer management

* refactor: update function signatures and streamline request handling in hexagon operations

* wip

* ggml-hexagon: clean up code formatting and improve unary operation handling

* wip

* rename

* fix: add support for permuted F16 tensors and enhance quantization checks in matrix operations

* hexagon: fix merge conflicts

* hexagon: minor cleanup for buffer support checks

* hexagon: factor out op_desc and the overall op logging

* hexagon: further simplify and cleanup op dispatch logic

* snapdragon: update adb scripts to use llama-cli and llama-completion

* fix pipeline failure

---------

Co-authored-by: Max Krasnyansky <maxk@qti.qualcomm.com>
nullname, 1 month ago
commit ed75977717
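
Per the messages above, the core of this change is collapsing the per-op CPU-side entry points (ggml_hexagon_mul_mat, ggml_hexagon_mul_mat_id, and the unary/generic ops) into template-based functions that share the request-building and buffer-management code. A minimal, purely illustrative C++ sketch of that pattern (the types and names below are stand-ins, not the actual ggml-hexagon API):

```
// Illustrative sketch only: one templated CPU-side entry point for binary ops
// (mul_mat, mul_mat_id, ...) instead of one hand-written function per op.
#include <cstdint>
#include <cstdio>

struct fake_tensor { const char * name; const fake_tensor * src[2]; };  // stand-in for ggml_tensor

// The op-specific part is reduced to a compile-time id; the body holds the
// common work (validate, map buffers, build and enqueue the request).
template <uint32_t OpId>
static void hexagon_binary_op(const fake_tensor * dst) {
    const fake_tensor * src0 = dst->src[0];
    const fake_tensor * src1 = dst->src[1];
    std::printf("enqueue op %u: %s x %s -> %s\n",
                (unsigned) OpId, src0->name, src1->name, dst->name);
}

int main() {
    fake_tensor a{"a", {nullptr, nullptr}};
    fake_tensor b{"b", {nullptr, nullptr}};
    fake_tensor c{"c", {&a, &b}};
    hexagon_binary_op<1>(&c);  // e.g. 1 = "mul_mat"; other op ids reuse the same body
}
```

The actual dispatch logic lives in ggml/src/ggml-hexagon/ggml-hexagon.cpp, whose diff is suppressed below due to its size.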

+ 5 - 5
docs/backend/hexagon/README.md

@@ -106,7 +106,7 @@ Here are some examples of running various llama.cpp tools via ADB.
 Simple question for Llama-3.2-1B
 
 ```
-~/src/llama.cpp$ M=Llama-3.2-1B-Instruct-Q4_0.gguf D=HTP0 ./scripts/snapdragon/adb/run-cli.sh -no-cnv -p "what is the most popular cookie in the world?"
+~/src/llama.cpp$ M=Llama-3.2-1B-Instruct-Q4_0.gguf D=HTP0 ./scripts/snapdragon/adb/run-completion.sh -p "what is the most popular cookie in the world?"
 ...
 ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev 1
 ggml-hex: Hexagon Arch version v79
@@ -136,7 +136,7 @@ llama_memory_breakdown_print: |   - HTP0-REPACK        |                  504 =
 Summary request for OLMoE-1B-7B. This is a large model that requires two HTP sessions/devices
 
 ```
-~/src/llama.cpp$ M=OLMoE-1B-7B-0125-Instruct-Q4_0.gguf NDEV=2 D=HTP0,HTP1 ./scripts/snapdragon/adb/run-cli.sh -f surfing.txt -no-cnv
+~/src/llama.cpp$ M=OLMoE-1B-7B-0125-Instruct-Q4_0.gguf NDEV=2 D=HTP0,HTP1 ./scripts/snapdragon/adb/run-completion.sh -f surfing.txt
 ...
 ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev 1
 ggml-hex: Hexagon Arch version v81
@@ -234,6 +234,6 @@ build: 6a8cf8914 (6733)
 
   Examples:
 
-      `GGML_HEXAGON_OPMASK=0x1 llama-cli ...` - Ops are enqueued but NPU-side processing is stubbed out
-      `GGML_HEXAGON_OPMASK=0x3 llama-cli ...` - NPU performs dynamic quantization and skips the rest
-      `GGML_HEXAGON_OPMASK=0x7 llama-cli ...` - Full queuing and processing of Ops (default)
+      `GGML_HEXAGON_OPMASK=0x1 llama-completion ...` - Ops are enqueued but NPU-side processing is stubbed out
+      `GGML_HEXAGON_OPMASK=0x3 llama-completion ...` - NPU performs dynamic quantization and skips the rest
+      `GGML_HEXAGON_OPMASK=0x7 llama-completion ...` - Full queuing and processing of Ops (default)
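
The adb wrapper scripts touched by this commit forward OPMASK as GGML_HEXAGON_OPMASK, so the same masks can be applied through run-completion.sh as well; a hypothetical example in the style of the README commands above (model and prompt are placeholders):

```
M=Llama-3.2-1B-Instruct-Q4_0.gguf D=HTP0 OPMASK=0x3 ./scripts/snapdragon/adb/run-completion.sh -p "what is the most popular cookie in the world?"
```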

+ 1 - 1
docs/backend/hexagon/developer.md

@@ -49,7 +49,7 @@ Each Hexagon device behaves like a GPU from the offload and model splitting pers
 Here is an example of running GPT-OSS-20B model on a newer Snapdragon device with 16GB of DDR.
 
 ```
-M=gpt-oss-20b-Q4_0.gguf NDEV=4 D=HTP0,HTP1,HTP2,HTP3 P=surfing.txt scripts/snapdragon/adb/run-cli.sh -no-cnv -f surfing.txt -n 32
+M=gpt-oss-20b-Q4_0.gguf NDEV=4 D=HTP0,HTP1,HTP2,HTP3 P=surfing.txt scripts/snapdragon/adb/run-completion.sh -f surfing.txt -n 32
 ...
 LD_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib
 ADSP_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib

File diff suppressed because it is too large
+ 155 - 708
ggml/src/ggml-hexagon/ggml-hexagon.cpp


+ 1 - 0
ggml/src/ggml-hexagon/htp-utils.h

@@ -8,6 +8,7 @@ extern "C" {
 #include <AEEStdErr.h>
 #include <inttypes.h>
 #include <remote.h>
+#include <rpcmem.h>
 #include <stdbool.h>
 
 /* Offset to differentiate HLOS and Hexagon error codes.

+ 153 - 0
ggml/src/ggml-hexagon/op-desc.h

@@ -0,0 +1,153 @@
+#ifndef OP_DESC_H
+#define OP_DESC_H
+
+#define GGML_COMMON_IMPL_CPP
+#include "ggml-backend-impl.h"
+#include "ggml-common.h"
+
+#include <string>
+#include <stdio.h>
+
+struct op_desc {
+    char strides[64 * GGML_MAX_SRC];
+    char dims[64 * GGML_MAX_SRC];
+    char types[16 * GGML_MAX_SRC];
+    char buffs[64 * GGML_MAX_SRC];
+    char names[64 * GGML_MAX_SRC];
+
+    int format_tensor_dims(char * str, const struct ggml_tensor * t) {
+        if (t->ne[2] == 1 && t->ne[3] == 1) {
+            return sprintf(str, "%d:%d", (int) t->ne[0], (int) t->ne[1]);
+        } else {
+            return sprintf(str, "%d:%d:%d:%d", (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]);
+        }
+    }
+
+    void format_op_dims(char * str, const struct ggml_tensor * t) {
+        char * p = str;
+
+        // append src0 and src1 (if any)
+        if (t->src[0]) {
+            p += format_tensor_dims(p, t->src[0]);
+
+            for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
+                p += sprintf(p, " x ");
+                p += format_tensor_dims(p, t->src[i]);
+            }
+
+            p += sprintf(p, " -> ");
+        }
+
+        // format self dims separately for better visual alignment
+        char self[64];
+        format_tensor_dims(self, t);
+
+        p += sprintf(p, "%s", self);
+    }
+
+    int format_tensor_strides(char * str, const struct ggml_tensor * t) {
+        const char * c = ggml_is_contiguous(t) ? "" : "!";
+
+        if (t->ne[2] == 1 && t->ne[3] == 1) {
+            return sprintf(str, "%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], c);
+        } else {
+            return sprintf(str, "%zu:%zu:%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], (size_t) t->nb[2], (size_t) t->nb[3], c);
+        }
+    }
+
+    void format_op_strides(char * str, const struct ggml_tensor * t) {
+        char * p = str;
+
+        // append src0 and src1 (if any)
+        if (t->src[0]) {
+            p += format_tensor_strides(p, t->src[0]);
+
+            for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
+                p += sprintf(p, " x ");
+                p += format_tensor_strides(p, t->src[i]);
+            }
+
+            p += sprintf(p, " -> ");
+        }
+
+        // format self dims separately for better visual alignment
+        char self[64];
+        format_tensor_strides(self, t);
+
+        p += sprintf(p, "%s", self);
+    }
+
+    void format_op_types(char * str, const struct ggml_tensor * t) {
+        char * p = str;
+
+        // append src0 and src1 (if any)
+        if (t->src[0]) {
+            p += sprintf(p, "%s", ggml_type_name(t->src[0]->type));
+
+            for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
+                p += sprintf(p, " x ");
+                p += sprintf(p, "%s", ggml_type_name(t->src[i]->type));
+            }
+
+            p += sprintf(p, " -> ");
+        }
+
+        p += sprintf(p, "%s", ggml_type_name(t->type));
+    }
+
+    const char * tensor_buff_name(const struct ggml_tensor * t) {
+        if (t->buffer) {
+            return ggml_backend_buffer_name(t->buffer);
+        }
+        return "NONE";
+    }
+
+    void format_op_buffs(char * str, const struct ggml_tensor * t) {
+        char * p = str;
+
+        // append src0 and src1 (if any)
+        if (t->src[0]) {
+            p += sprintf(p, "%s", tensor_buff_name(t->src[0]));
+
+            for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
+                p += sprintf(p, " x ");
+                p += sprintf(p, "%s", tensor_buff_name(t->src[i]));
+            }
+
+            p += sprintf(p, " -> ");
+        }
+
+        p += sprintf(p, "%s", tensor_buff_name(t));
+    }
+
+    void format_op_names(char * str, const struct ggml_tensor * t) {
+        char * p = str;
+
+        // append src0 and src1 (if any)
+        if (t->src[0]) {
+            p += sprintf(p, "%s", t->src[0]->name);
+
+            for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
+                p += sprintf(p, " x ");
+                p += sprintf(p, "%s", t->src[i]->name);
+            }
+
+            p += sprintf(p, " -> ");
+        }
+
+        p += sprintf(p, "%s", t->name);
+    }
+
+    void format(const ggml_tensor * op) {
+        format_op_dims(dims, op);
+        format_op_strides(strides, op);
+        format_op_types(types, op);
+        format_op_buffs(buffs, op);
+        format_op_names(names, op);
+    }
+
+    op_desc() {}
+    op_desc(const ggml_tensor * op) { format(op); }
+};
+
+#endif // OP_DESC_H
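
op_desc pre-formats the names, dimensions, strides, types, and buffer names of an op and its sources into fixed char buffers, so the backend's op logging can print them in one go. A minimal usage sketch, assuming op-desc.h and the ggml headers are included (the log format and call site are assumptions, not the commit's actual logging code):

```
// Hypothetical call site: describe an op before it is enqueued.
// op_desc(op) runs format() in the constructor and fills the char buffers.
static void log_op(const ggml_tensor * op) {
    op_desc d(op);
    fprintf(stderr, "ggml-hex: %s : %s : %s : %s : %s\n",
            ggml_op_name(op->op), d.names, d.dims, d.types, d.buffs);
}
```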

+ 9 - 9
scripts/snapdragon/adb/run-cli.sh

@@ -18,17 +18,17 @@ model="Llama-3.2-3B-Instruct-Q4_0.gguf"
 device="HTP0"
 [ "$D" != "" ] && device="$D"
 
-verbose=
-[ "$V" != "" ] && verbose="GGML_HEXAGON_VERBOSE=$V"
-
 experimental=
 [ "$E" != "" ] && experimental="GGML_HEXAGON_EXPERIMENTAL=$E"
 
+verbose=
+[ "$V" != "" ] && verbose="GGML_HEXAGON_VERBOSE=$V" cli_opts="$cli_opts -v"
+
 sched=
 [ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v"
 
 profile=
-[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1"
+[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1" cli_opts="$cli_opts -v"
 
 opmask=
 [ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK"
@@ -45,9 +45,9 @@ adb $adbserial shell " \
   cd $basedir; ulimit -c unlimited;        \
     LD_LIBRARY_PATH=$basedir/$branch/lib   \
     ADSP_LIBRARY_PATH=$basedir/$branch/lib \
-    $verbose $experimental $sched $opmask $profile $nhvx $ndev       \
-      ./$branch/bin/llama-completion --no-mmap -m $basedir/../gguf/$model   \
-         --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1             \
-         --ctx-size 8192 --batch-size 128 -ctk q8_0 -ctv q8_0 -fa on \
-         -ngl 99 --device $device $cli_opts $@ \
+    $verbose $experimental $sched $opmask $profile $nhvx $ndev     \
+      ./$branch/bin/llama-cli --no-mmap -m $basedir/../gguf/$model \
+         --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1           \
+         --ctx-size 8192 --batch-size 128 -fa on \
+         -ngl 99 --device $device $cli_opts $@   \
 "

+ 53 - 0
scripts/snapdragon/adb/run-completion.sh

@@ -0,0 +1,53 @@
+#!/bin/sh
+#
+
+# Basedir on device
+basedir=/data/local/tmp/llama.cpp
+
+cli_opts=
+
+branch=.
+[ "$B" != "" ] && branch=$B
+
+adbserial=
+[ "$S" != "" ] && adbserial="-s $S"
+
+model="Llama-3.2-3B-Instruct-Q4_0.gguf"
+[ "$M" != "" ] && model="$M"
+
+device="HTP0"
+[ "$D" != "" ] && device="$D"
+
+experimental=
+[ "$E" != "" ] && experimental="GGML_HEXAGON_EXPERIMENTAL=$E"
+
+verbose=
+[ "$V" != "" ] && verbose="GGML_HEXAGON_VERBOSE=$V" cli_opts="$cli_opts -v"
+
+sched=
+[ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v"
+
+profile=
+[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1" cli_opts="$cli_opts -v"
+
+opmask=
+[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK"
+
+nhvx=
+[ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX"
+
+ndev=
+[ "$NDEV" != "" ] && ndev="GGML_HEXAGON_NDEV=$NDEV"
+
+set -x
+
+adb $adbserial shell " \
+  cd $basedir; ulimit -c unlimited;        \
+    LD_LIBRARY_PATH=$basedir/$branch/lib   \
+    ADSP_LIBRARY_PATH=$basedir/$branch/lib \
+    $verbose $experimental $sched $opmask $profile $nhvx $ndev            \
+      ./$branch/bin/llama-completion --no-mmap -m $basedir/../gguf/$model \
+         --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1                  \
+         --ctx-size 8192 --batch-size 128 -fa on \
+         -ngl 99 -no-cnv --device $device $cli_opts $@   \
+"

Some files were not shown because too many files changed in this diff