2 years ago · dc68f0054c
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
--- a/ggml.c
+++ b/ggml.c
@@ -4041,7 +4041,6 @@ static struct ggml_tensor * ggml_group_norm_impl(
 
				     result->op = GGML_OP_GROUP_NORM;
			
 
				     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
			
 
				     result->src[0] = a;
			
 
				-    result->src[1] = NULL; // TODO: maybe store epsilon here?
			
 
				 
			
 
				     return result;
			
 
				 }
			
@@ -5541,7 +5540,6 @@ static struct ggml_tensor * ggml_upscale_impl(
 
				     result->op_params[0] = scale_factor;
			
 
				     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
			
 
				     result->src[0] = a;
			
 
				-    result->src[1] = NULL;
			
 
				 
			
 
				     return result;
			
 
				 }
			
@@ -5846,7 +5844,6 @@ struct ggml_tensor * ggml_get_rel_pos(
 
				     result->op   = GGML_OP_GET_REL_POS;
			
 
				     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
			
 
				     result->src[0] = a;
			
 
				-    result->src[1] = NULL;
			
 
				 
			
 
				     return result;
			
 
				 }
			
--- a/llama.cpp
+++ b/llama.cpp
@@ -9519,7 +9519,8 @@ struct llama_context * llama_new_context_with_model(
 
				             ctx->alloc = ggml_allocr_new_from_buffer(ctx->buf_alloc);
			
 
				 #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
			
 
				             if (model->n_gpu_layers > 0) {
			
 
				-                ggml_cuda_set_scratch_size(alloc_size);
			
 
				+                // the CPU buffer adds this padding in case the malloc buffer is not aligned, so we need to do the same for the GPU buffer, since we use the same offsets
			
 
				+                ggml_cuda_set_scratch_size(alloc_size + 64);
			
 
				                 LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);
			
 
				 
			
 
				                 // calculate total VRAM usage