|
|
@@ -723,6 +723,12 @@ struct ggml_backend_sched {
|
|
|
bool op_offload;
|
|
|
|
|
|
int debug;
|
|
|
+
|
|
|
+ // used for debugging graph reallocations [GGML_SCHED_DEBUG_REALLOC]
|
|
|
+ // ref: https://github.com/ggml-org/llama.cpp/pull/17617
|
|
|
+ int debug_realloc;
|
|
|
+ int debug_graph_size;
|
|
|
+ int debug_prev_graph_size;
|
|
|
};
|
|
|
|
|
|
#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
|
|
|
@@ -1289,6 +1295,11 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra
|
|
|
}
|
|
|
|
|
|
int graph_size = std::max(graph->n_nodes, graph->n_leafs) + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
|
|
|
+
|
|
|
+ // remember the actual graph_size for performing reallocation checks later [GGML_SCHED_DEBUG_REALLOC]
|
|
|
+ sched->debug_prev_graph_size = sched->debug_graph_size;
|
|
|
+ sched->debug_graph_size = graph_size;
|
|
|
+
|
|
|
if (sched->graph.size < graph_size) {
|
|
|
sched->graph.size = graph_size;
|
|
|
sched->graph.nodes = (ggml_tensor **) realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *));
|
|
|
@@ -1395,14 +1406,21 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
|
|
|
|
|
|
// allocate graph
|
|
|
if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
|
|
|
-#ifdef GGML_SCHED_NO_REALLOC
|
|
|
- GGML_ABORT("%s: failed to allocate graph, but graph re-allocation is disabled by GGML_SCHED_NO_REALLOC\n", __func__);
|
|
|
-#endif
|
|
|
-
|
|
|
#ifndef NDEBUG
|
|
|
GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
|
|
|
#endif
|
|
|
|
|
|
+ if (sched->debug_realloc > 0) {
|
|
|
+ // we are interested only in situations where the graph was reallocated even though its size remained the same; with debug_realloc > 1, abort on any reallocation [GGML_SCHED_DEBUG_REALLOC]
|
|
|
+ // example: https://github.com/ggml-org/llama.cpp/pull/17143
|
|
|
+ const bool unexpected = !backend_ids_changed && sched->debug_prev_graph_size == sched->debug_graph_size;
|
|
|
+
|
|
|
+ if (unexpected || sched->debug_realloc > 1) {
|
|
|
+ GGML_ABORT("%s: unexpected graph reallocation (graph size = %d, nodes = %d, leafs = %d), debug_realloc = %d\n", __func__,
|
|
|
+ sched->debug_graph_size, sched->graph.n_nodes, sched->graph.n_leafs, sched->debug_realloc);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
// the re-allocation may cause the split inputs to be moved to a different address
|
|
|
// synchronize without ggml_backend_sched_synchronize to avoid changing cur_copy
|
|
|
for (int i = 0; i < sched->n_backends; i++) {
|
|
|
@@ -1620,6 +1638,14 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
|
|
|
|
|
const char * GGML_SCHED_DEBUG = getenv("GGML_SCHED_DEBUG");
|
|
|
sched->debug = GGML_SCHED_DEBUG ? atoi(GGML_SCHED_DEBUG) : 0;
|
|
|
+
|
|
|
+ sched->debug_realloc = 0;
|
|
|
+#ifdef GGML_SCHED_NO_REALLOC
|
|
|
+ sched->debug_realloc = 1;
|
|
|
+#endif
|
|
|
+ const char * GGML_SCHED_DEBUG_REALLOC = getenv("GGML_SCHED_DEBUG_REALLOC");
|
|
|
+ sched->debug_realloc = GGML_SCHED_DEBUG_REALLOC ? atoi(GGML_SCHED_DEBUG_REALLOC) : sched->debug_realloc;
|
|
|
+
|
|
|
sched->n_backends = n_backends;
|
|
|
sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
|
|
|
|
|
|
@@ -1636,6 +1662,9 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
|
|
sched->prev_node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
|
|
|
sched->prev_leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
|
|
|
|
|
|
+ sched->debug_graph_size = 0;
|
|
|
+ sched->debug_prev_graph_size = 0;
|
|
|
+
|
|
|
sched->context_buffer_size = ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false);
|
|
|
sched->context_buffer = (char *) malloc(sched->context_buffer_size);
|
|
|
|