@@ -1231,6 +1231,14 @@ struct ggml_backend_vk_context {
     vk_pipeline_struct * prealloc_y_last_pipeline_used {};
     const ggml_tensor * prealloc_y_last_tensor_used {};
 
+    // Track which nodes have been used since the last sync, and whether they were written to
+    std::vector<const ggml_tensor *> unsynced_nodes_written;
+    std::vector<const ggml_tensor *> unsynced_nodes_read;
+    // Track which prealloc buffers have pending reads that need to be synchronized.
+    // These are checked before writing to the buffer (and call ggml_vk_sync_buffers if set),
+    // and set to true after the buffer contents are consumed.
+    bool prealloc_x_need_sync, prealloc_y_need_sync, prealloc_split_k_need_sync;
+
     vk_buffer buffer_pool[MAX_VK_BUFFERS];
 
     vk_context_ref compute_ctx;
@@ -1906,14 +1914,18 @@ static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) {
     return { buf, 0, VK_WHOLE_SIZE };
 }
 
-static void ggml_vk_sync_buffers(vk_context& ctx) {
+static void ggml_vk_sync_buffers(ggml_backend_vk_context* ctx, vk_context& subctx) {
     VK_LOG_DEBUG("ggml_vk_sync_buffers()");
 
-    const bool transfer_queue = ctx->p->q->transfer_only;
+    const bool transfer_queue = subctx->p->q->transfer_only;
 
-    ctx->s->buffer.pipelineBarrier(
-        ctx->p->q->stage_flags,
-        ctx->p->q->stage_flags,
+    if (ctx) {
+        ctx->prealloc_x_need_sync = ctx->prealloc_y_need_sync = ctx->prealloc_split_k_need_sync = false;
+    }
+
+    subctx->s->buffer.pipelineBarrier(
+        subctx->p->q->stage_flags,
+        subctx->p->q->stage_flags,
         {},
         { {
           { !transfer_queue ? (vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) : (vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) },
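
Note: the sketch below is editorial and not part of the patch. It illustrates the intended protocol for the new prealloc_*_need_sync flags, mirroring how the later hunks use them: a pass that writes a prealloc buffer only emits a barrier if a previous consumer may still be reading it, and a pass that reads the buffer marks it dirty afterwards. The producer/consumer function names are hypothetical placeholders.

    // Minimal sketch, assuming the flag protocol used by the hunks below.
    if (ctx->prealloc_x_need_sync) {
        ggml_vk_sync_buffers(ctx, subctx);   // also clears all prealloc_*_need_sync flags
    }
    write_prealloc_x(ctx, subctx);           // hypothetical producer (e.g. a dequant/copy pass)
    ggml_vk_sync_buffers(ctx, subctx);       // make the write visible to the consumer
    read_prealloc_x(ctx, subctx);            // hypothetical consumer (e.g. the matmul dispatch)
    ctx->prealloc_x_need_sync = true;        // a read is now pending; the next writer must sync first
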
@@ -4898,7 +4910,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
             }
         }
 
-        ggml_vk_sync_buffers(subctx);
+        ggml_vk_sync_buffers(ctx, subctx);
         subctx->s->buffer.copyBuffer(buf->buffer, dst->buffer, slices);
         return;
     }
@@ -4913,7 +4925,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
     ggml_vk_ensure_sync_staging_buffer(ctx->device, copy_size);
     VkBufferCopy buf_copy{ 0, offset, copy_size };
 
-    ggml_vk_sync_buffers(subctx);
+    ggml_vk_sync_buffers(ctx, subctx);
     vkCmdCopyBuffer(subctx->s->buffer, (VkBuffer)staging->buffer, (VkBuffer)dst->buffer, 1, &buf_copy);
 
     for (uint64_t i3 = 0; i3 < ne3; i3++) {
@@ -4967,7 +4979,7 @@ static void ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, siz
             }
         }
 
-        ggml_vk_sync_buffers(subctx);
+        ggml_vk_sync_buffers(nullptr, subctx);
         subctx->s->buffer.copyBuffer(buf->buffer, dst->buffer, slices);
         return;
     }
@@ -4988,7 +5000,7 @@ static void ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, siz
         offset,
         copy_size};
 
-    ggml_vk_sync_buffers(subctx);
+    ggml_vk_sync_buffers(nullptr, subctx);
     vkCmdCopyBuffer(subctx->s->buffer, (VkBuffer)staging_buffer->buffer, (VkBuffer)dst->buffer, 1, &buf_copy);
 
     if (width == spitch) {
@@ -5068,7 +5080,7 @@ static void ggml_vk_buffer_read_2d_async(vk_context subctx, vk_buffer& src, size
 
     if (buf != nullptr) {
         // Memory is pinned, use as staging buffer
-        ggml_vk_sync_buffers(subctx);
+        ggml_vk_sync_buffers(nullptr, subctx);
         subctx->s->buffer.copyBuffer(src->buffer, buf->buffer, slices);
 
         return;
@@ -5085,7 +5097,7 @@ static void ggml_vk_buffer_read_2d_async(vk_context subctx, vk_buffer& src, size
 
     vk_buffer& staging_buffer = src->device->sync_staging;
 
-    ggml_vk_sync_buffers(subctx);
+    ggml_vk_sync_buffers(nullptr, subctx);
     subctx->s->buffer.copyBuffer(src->buffer, staging_buffer->buffer, slices);
 
     deferred_memcpy(dst, staging_buffer->ptr, copy_size, &subctx->out_memcpys);
@@ -5275,13 +5287,16 @@ static void ggml_vk_matmul(
         uint32_t split_k, uint32_t batch, uint32_t ne02, uint32_t ne12, uint32_t broadcast2, uint32_t broadcast3,
         uint32_t padded_n) {
     VK_LOG_DEBUG("ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << (split_k_buffer.buffer != nullptr ? split_k_buffer.buffer->buffer : VK_NULL_HANDLE) << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ", padded_n: " << padded_n << ")");
-    ggml_vk_sync_buffers(subctx);
     if (split_k == 1) {
         const vk_mat_mat_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, k, ne02, ne12, broadcast2, broadcast3, padded_n };
         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d }, pc, { m, n, batch });
         return;
     }
 
+    if (ctx->prealloc_split_k_need_sync) {
+        ggml_vk_sync_buffers(ctx, subctx);
+    }
+
     GGML_ASSERT(batch_stride_d == m * n);
 
     // Round the split size up to a multiple of 256 (k-quant alignment)
@@ -5291,9 +5306,10 @@ static void ggml_vk_matmul(
     const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, k_split, ne02, ne12, broadcast2, broadcast3, padded_n };
     // Make sure enough workgroups get assigned for split k to work
     ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, pc1, { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, batch });
-    ggml_vk_sync_buffers(subctx);
+    ggml_vk_sync_buffers(ctx, subctx);
     const std::array<uint32_t, 2> pc2 = { (uint32_t)(m * n * batch), split_k };
     ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2, { m * n * batch, 1, 1 });
+    ctx->prealloc_split_k_need_sync = true;
 }
 
 static vk_pipeline ggml_vk_guess_matmul_id_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, uint32_t m, uint32_t n, bool aligned, ggml_type src0_type) {
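
Note: editorial sketch, not part of the patch. It summarizes the barrier placement around ctx->prealloc_split_k after this change: the only unconditional barrier sits between the two dispatches, while the barrier before the first write is deferred to the next user via prealloc_split_k_need_sync. The dispatch_* names are hypothetical stand-ins for the two ggml_vk_dispatch_pipeline calls above.

    if (ctx->prealloc_split_k_need_sync) {   // a previous split_k reduce may still be reading the buffer
        ggml_vk_sync_buffers(ctx, subctx);
    }
    dispatch_matmul_split_k(ctx, subctx);    // pass 1: writes partial sums into ctx->prealloc_split_k
    ggml_vk_sync_buffers(ctx, subctx);       // make the partial sums visible to the reduction
    dispatch_split_k_reduce(ctx, subctx);    // pass 2: reads the partial sums, writes d
    ctx->prealloc_split_k_need_sync = true;  // defer the next pre-write barrier
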
@@ -5338,7 +5354,6 @@ static void ggml_vk_matmul_id(
         "m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", " <<
         "batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", " <<
         "n_as: " << n_as << ", nei0: " << nei0 << ", nei1: " << nei1 << ", nbi1: " << nbi1 << ", ne11: " << ne11 << ")");
-    ggml_vk_sync_buffers(subctx);
     const vk_mat_mat_id_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d,
                                               nei0, nei1, nbi1, ne11, padded_n };
     ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d, ids }, pc, { m, nei1, n_as });
@@ -5469,8 +5484,8 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context&
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     };
     init_pushconst_fastdiv(pc);
-    ggml_vk_sync_buffers(subctx);
     ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, pc, elements);
+    ggml_vk_sync_buffers(ctx, subctx);
 }
 
 static vk_pipeline ggml_vk_get_quantize_pipeline(ggml_backend_vk_context * ctx, ggml_type type) {
@@ -5488,8 +5503,8 @@ static void ggml_vk_quantize_q8_1(ggml_backend_vk_context * ctx, vk_context& sub
 
     vk_pipeline pipeline = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1);
 
-    ggml_vk_sync_buffers(subctx);
     ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, std::array<uint32_t, 1>{ne}, { ne, 1, 1 });
+    ggml_vk_sync_buffers(ctx, subctx);
 }
 
 static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
@@ -5684,12 +5699,23 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
         GGML_ASSERT(qy_sz == y_sz);
     }
 
+    if (x_non_contig || qx_needs_dequant) {
+        if (ctx->prealloc_x_need_sync) {
+            ggml_vk_sync_buffers(ctx, subctx);
+        }
+    }
+    if (y_non_contig || quantize_y) {
+        if (ctx->prealloc_y_need_sync) {
+            ggml_vk_sync_buffers(ctx, subctx);
+        }
+    }
+
     if (x_non_contig) {
         ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
     } else if (qx_needs_dequant) {
         const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
-        ggml_vk_sync_buffers(subctx);
         ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc, { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
+        ggml_vk_sync_buffers(ctx, subctx);
     }
     if (y_non_contig) {
         if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() ||
@@ -5728,6 +5754,13 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
         ne10, ne10, ne01, stride_batch_x, stride_batch_y, ne20*ne21,
         split_k, ne12*ne13, ne02, ne12, r2, r3, padded_n
     ); // NOLINT
+
+    if (x_non_contig || qx_needs_dequant) {
+        ctx->prealloc_x_need_sync = true;
+    }
+    if (y_non_contig || quantize_y) {
+        ctx->prealloc_y_need_sync = true;
+    }
 }
 
 static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
@@ -5874,6 +5907,17 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
         GGML_ASSERT(qy_sz == y_sz);
     }
 
+    if (x_non_contig) {
+        if (ctx->prealloc_x_need_sync) {
+            ggml_vk_sync_buffers(ctx, subctx);
+        }
+    }
+    if (y_non_contig) {
+        if (ctx->prealloc_y_need_sync) {
+            ggml_vk_sync_buffers(ctx, subctx);
+        }
+    }
+
     if (x_non_contig) {
         GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment));
         ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
@@ -5917,10 +5961,16 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
         stride_batch_x, stride_batch_y, stride_batch_d,
         (uint32_t)ne02, (uint32_t)ne12, (uint32_t)r2, (uint32_t)r3,
     };
-    ggml_vk_sync_buffers(subctx);
     ggml_vk_dispatch_pipeline(ctx, subctx, dmmv,
         { vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 }, vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23} },
         pc, { groups_x, (uint32_t)(ne12 * ne13), groups_z });
+
+    if (x_non_contig) {
+        ctx->prealloc_x_need_sync = true;
+    }
+    if (y_non_contig) {
+        ctx->prealloc_y_need_sync = true;
+    }
 }
 
 static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
@@ -6007,7 +6057,6 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
         workgroups_z /= gqa_ratio;
     }
 
-    ggml_vk_sync_buffers(subctx);
     ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, pc, { 1, (uint32_t)ne01, workgroups_z });
 }
 
@@ -6094,7 +6143,6 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
 
     // compute
     const std::array<uint32_t, 12> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, channel_stride_y, (uint32_t)(ne12 / ne02), (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)), nb03, nb13, nb23 };
-    ggml_vk_sync_buffers(subctx);
     ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32,
         { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, pc, { (uint32_t)ne03, (uint32_t)ne01, (uint32_t)ne12 });
 }
@@ -6306,13 +6354,24 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
         GGML_ASSERT(qy_sz == y_sz);
     }
 
+    if (x_non_contig || qx_needs_dequant) {
+        if (ctx->prealloc_x_need_sync) {
+            ggml_vk_sync_buffers(ctx, subctx);
+        }
+    }
+    if (y_non_contig) {
+        if (ctx->prealloc_y_need_sync) {
+            ggml_vk_sync_buffers(ctx, subctx);
+        }
+    }
+
     if (x_non_contig) {
         ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
     } else if (qx_needs_dequant) {
         const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
-        ggml_vk_sync_buffers(subctx);
         ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0,
             { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc, { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
+        ggml_vk_sync_buffers(ctx, subctx);
     }
     if (y_non_contig) {
         if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() ||
@@ -6343,6 +6402,13 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
         stride_batch_x, stride_batch_y, ne20*ne21,
         n_as, nei0, nei1, nbi1 / ggml_type_size(ids->type), ne11, padded_n
     ); // NOLINT
+
+    if (x_non_contig || qx_needs_dequant) {
+        ctx->prealloc_x_need_sync = true;
+    }
+    if (y_non_contig) {
+        ctx->prealloc_y_need_sync = true;
+    }
 }
 
 static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst, bool dryrun = false) {
@@ -6502,6 +6568,17 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
         GGML_ASSERT(qy_sz == y_sz);
     }
 
+    if (x_non_contig) {
+        if (ctx->prealloc_x_need_sync) {
+            ggml_vk_sync_buffers(ctx, subctx);
+        }
+    }
+    if (y_non_contig) {
+        if (ctx->prealloc_y_need_sync) {
+            ggml_vk_sync_buffers(ctx, subctx);
+        }
+    }
+
     if (x_non_contig) {
         GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment));
         ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
@@ -6538,11 +6615,17 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
         (uint32_t)x_ne, stride_batch_y, (uint32_t)(ne20*ne21),
         (uint32_t)nei0, (uint32_t)ne11,
     };
-    ggml_vk_sync_buffers(subctx);
     ggml_vk_dispatch_pipeline(ctx, subctx, dmmv,
         { vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 },
         vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23}, vk_subbuffer{ d_ids, ids_buf_offset, ids_sz } },
         pc, { groups_x, (uint32_t)nei0, groups_z });
+
+    if (x_non_contig) {
+        ctx->prealloc_x_need_sync = true;
+    }
+    if (y_non_contig) {
+        ctx->prealloc_y_need_sync = true;
+    }
 }
 
 static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, bool dryrun = false) {
@@ -6925,9 +7008,11 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
         mask_n_head_log2, m0, m1,
         gqa_ratio, split_kv, split_k };
 
-    ggml_vk_sync_buffers(subctx);
-
     if (split_k > 1) {
+        if (ctx->prealloc_split_k_need_sync) {
+            ggml_vk_sync_buffers(ctx, subctx);
+        }
+
         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
             {
                 vk_subbuffer{d_Q, q_buf_offset, VK_WHOLE_SIZE},
@@ -6943,7 +7028,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
             // cancel out the divide by wg_denoms[0].
             pc, { workgroups_x * pipeline->wg_denoms[0], workgroups_y, workgroups_z });
 
-        ggml_vk_sync_buffers(subctx);
+        ggml_vk_sync_buffers(ctx, subctx);
         const std::array<uint32_t, 5> pc2 = { HSV, (uint32_t)ne1, (uint32_t)ne3, split_k, (sinks != nullptr) };
         ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_flash_attn_split_k_reduce,
             {
@@ -6952,6 +7037,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
                 vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE},
             },
             pc2, { (uint32_t)ne1, HSV, (uint32_t)ne3 });
+        ctx->prealloc_split_k_need_sync = true;
     } else {
         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
             {
@@ -7820,7 +7906,6 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
             subbuf_y = { d_X, 0, x_sz };
         }
 
-        ggml_vk_sync_buffers(subctx);
         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, subbuf_y, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
     } else if (op == GGML_OP_SOFT_MAX) {
         // Empty src1 and src2 is possible in soft_max, but the shader needs a buffer
@@ -7838,7 +7923,6 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
             subbuf_z = { d_X, 0, x_sz };
         }
 
-        ggml_vk_sync_buffers(subctx);
         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, subbuf_y, subbuf_z, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
     } else if (op == GGML_OP_ROPE || op == GGML_OP_ROPE_BACK) {
         // Empty src2 is possible in rope, but the shader needs a buffer
@@ -7849,30 +7933,23 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
             subbuf_z = { d_X, 0, x_sz };
         }
 
-        ggml_vk_sync_buffers(subctx);
         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, subbuf_z, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
     } else if (op == GGML_OP_IM2COL) {
         // im2col uses only src1 and dst buffers
-        ggml_vk_sync_buffers(subctx);
         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
     } else if (op == GGML_OP_COUNT_EQUAL) {
-        ggml_vk_sync_buffers(subctx);
         // count_equal assumes that destination buffer is initialized with zeroes
         ggml_vk_buffer_memset_async(subctx, d_D, d_buf_offset, 0, d_sz);
-        ggml_vk_sync_buffers(subctx);
+        ggml_vk_sync_buffers(ctx, subctx);
         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
     } else if (op == GGML_OP_OPT_STEP_SGD) {
        // OPT_STEP_SGD works on src0, it does not need dst
-        ggml_vk_sync_buffers(subctx);
         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz } }, pc, elements);
     } else if (use_src2) {
-        ggml_vk_sync_buffers(subctx);
         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
     } else if (use_src1) {
-        ggml_vk_sync_buffers(subctx);
         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
     } else {
-        ggml_vk_sync_buffers(subctx);
         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
     }
 }
@@ -7999,7 +8076,6 @@ static void ggml_vk_multi_add(ggml_backend_vk_context * ctx, vk_context& subctx,
         elements = { ne, 1, 1 };
     }
 
-    ggml_vk_sync_buffers(subctx);
     ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
         {
             vk_subbuffer{ buf[0], offset[0], VK_WHOLE_SIZE },
@@ -8112,8 +8188,6 @@ static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx
         src_buf_ctxs[i] = (ggml_backend_vk_buffer_context *)dst->src[i]->buffer->context;
     }
 
-    ggml_vk_sync_buffers(subctx);
-
     vk_buffer d_D = nullptr, d_srcs[7] = { nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr };
     size_t dst_offset = 0, src_offsets[7] = { 0, 0, 0, 0, 0, 0, 0 };
     bool dst_uma = false, srcs_uma[7] = { false, false, false, false, false, false, false };
@@ -8251,8 +8325,6 @@ static void ggml_vk_op_f32_opt_step_adamw(ggml_backend_vk_context * ctx, vk_cont
     ggml_backend_vk_buffer_context * gv_buf_ctx = (ggml_backend_vk_buffer_context *)gv->buffer->context;
     ggml_backend_vk_buffer_context * p_buf_ctx = (ggml_backend_vk_buffer_context *)p->buffer->context;
 
-    ggml_vk_sync_buffers(subctx);
-
     vk_buffer d_X = nullptr, d_G = nullptr, d_GM = nullptr, d_GV = nullptr, d_P = nullptr;
     size_t x_offset = 0, g_offset = 0, gm_offset = 0, gv_offset = 0, p_offset = 0;
     bool X_uma = false, G_uma = false, GM_uma = false, GV_uma = false, P_uma = false;
@@ -9964,6 +10036,83 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
         }
     }
 
+    if (!dryrun) {
+        // This logic detects dependencies between nodes in the graph and calls ggml_vk_sync_buffers
+        // to synchronize them. This handles most "normal" synchronization when computing the graph, and when
+        // there is no auxiliary memory use, it shouldn't be necessary to call ggml_vk_sync_buffers
+        // outside of this logic. When a node uses one of the prealloc buffers for something like
+        // dequantization or split_k, additional synchronization is needed between those passes.
+        bool need_sync = false;
+
+        // Check whether "node" requires synchronization. The node requires synchronization if it
+        // overlaps in memory with another unsynchronized node and at least one of them is a write.
+        // Destination nodes are checked against both the written/read lists. Source nodes are only
+        // checked against the written list. Two nodes overlap in memory if they come from the same
+        // buffer and the tensor or view ranges overlap.
+        auto const &overlaps_unsynced = [&](const ggml_tensor *node, const std::vector<const ggml_tensor *> &unsynced_nodes) -> bool {
+            if (unsynced_nodes.size() == 0) {
+                return false;
+            }
+            auto n_base = vk_tensor_offset(node) + node->view_offs;
+            auto n_size = ggml_nbytes(node);
+            ggml_backend_vk_buffer_context * a_buf_ctx = (ggml_backend_vk_buffer_context *)node->buffer->context;
+            vk_buffer a_buf = a_buf_ctx->dev_buffer;
+            for (auto &other : unsynced_nodes) {
+                ggml_backend_vk_buffer_context * o_buf_ctx = (ggml_backend_vk_buffer_context *)other->buffer->context;
+                vk_buffer o_buf = o_buf_ctx->dev_buffer;
+                if (a_buf == o_buf) {
+                    auto o_base = vk_tensor_offset(other) + other->view_offs;
+                    auto o_size = ggml_nbytes(other);
+
+                    if ((o_base <= n_base && n_base < o_base + o_size) ||
+                        (n_base <= o_base && o_base < n_base + n_size)) {
+                        return true;
+                    }
+                }
+            }
+            return false;
+        };
+
+        // For all fused ops, check if the destination node or any of the source
+        // nodes require synchronization.
+        for (int32_t i = 0; i < ctx->num_additional_fused_ops + 1 && !need_sync; ++i) {
+            const ggml_tensor *cur_node = cgraph->nodes[node_idx + i];
+            if (overlaps_unsynced(cur_node, ctx->unsynced_nodes_read) || overlaps_unsynced(cur_node, ctx->unsynced_nodes_written)) {
+                need_sync = true;
+                break;
+            }
+            for (uint32_t j = 0; j < GGML_MAX_SRC; ++j) {
+                if (!cur_node->src[j]) {
+                    continue;
+                }
+                if (overlaps_unsynced(cur_node->src[j], ctx->unsynced_nodes_written)) {
+                    need_sync = true;
+                    break;
+                }
+            }
+        }
+        if (need_sync) {
+            VK_LOG_DEBUG("node_idx=" << node_idx << " sync");
+            ctx->unsynced_nodes_written.clear();
+            ctx->unsynced_nodes_read.clear();
+            ggml_vk_sync_buffers(ctx, compute_ctx);
+        } else {
+            VK_LOG_DEBUG("node_idx=" << node_idx << " unsynced");
+        }
+        // Add the last fused node and all fused source nodes to the unsynchronized list.
+        const ggml_tensor * last_node = cgraph->nodes[node_idx + ctx->num_additional_fused_ops];
+        ctx->unsynced_nodes_written.push_back(last_node);
+        for (int32_t i = 0; i < ctx->num_additional_fused_ops + 1; ++i) {
+            const ggml_tensor *cur_node = cgraph->nodes[node_idx + i];
+            for (uint32_t j = 0; j < GGML_MAX_SRC; ++j) {
+                if (!cur_node->src[j]) {
+                    continue;
+                }
+                ctx->unsynced_nodes_read.push_back(cur_node->src[j]);
+            }
+        }
+    }
+
     switch (node->op) {
     case GGML_OP_REPEAT:
         ggml_vk_repeat(ctx, compute_ctx, src0, node, dryrun);
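
Note: editorial sketch, not part of the patch. The overlaps_unsynced lambda above treats two tensors as conflicting only when they live in the same vk_buffer and their byte ranges [base, base + size) intersect, where base is vk_tensor_offset(t) + t->view_offs. The helper below restates that half-open range test in isolation; the function name is hypothetical.

    static bool ranges_overlap(uint64_t a_base, uint64_t a_size, uint64_t b_base, uint64_t b_size) {
        // Same condition as in overlaps_unsynced: either range starts inside the other.
        return (a_base <= b_base && b_base < a_base + a_size) ||
               (b_base <= a_base && a_base < b_base + b_size);
    }
    // Example: a 128-byte view at offset 256 overlaps a 512-byte tensor at offset 0 of the same buffer,
    // so whichever node wrote one of them must be synchronized before the other is read or written.
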
@@ -10427,6 +10576,10 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
     ctx->gc.temp_buffers.clear();
     ctx->prealloc_y_last_pipeline_used = {};
 
+    ctx->unsynced_nodes_written.clear();
+    ctx->unsynced_nodes_read.clear();
+    ctx->prealloc_x_need_sync = ctx->prealloc_y_need_sync = ctx->prealloc_split_k_need_sync = false;
+
     ggml_vk_command_pool_cleanup(ctx->device, ctx->compute_cmd_pool);
     ggml_vk_command_pool_cleanup(ctx->device, ctx->transfer_cmd_pool);
 