|
@@ -3907,28 +3907,27 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
|
|
|
dst[i + 1] = x0*sin_theta + x1*cos_theta;
|
|
dst[i + 1] = x0*sin_theta + x1*cos_theta;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-// TODO: this implementation is wrong!
|
|
|
|
|
-//static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const float p0,
|
|
|
|
|
-// const float p_delta, const int p_delta_rows, const float theta_scale) {
|
|
|
|
|
-// const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
|
|
|
|
|
-//
|
|
|
|
|
-// if (col >= ncols) {
|
|
|
|
|
-// return;
|
|
|
|
|
-// }
|
|
|
|
|
-//
|
|
|
|
|
-// const int row = blockDim.x*blockIdx.x + threadIdx.x;
|
|
|
|
|
-// const int i = row*ncols + col/2;
|
|
|
|
|
-//
|
|
|
|
|
-// const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
|
|
|
|
|
-// const float sin_theta = sinf(theta);
|
|
|
|
|
-// const float cos_theta = cosf(theta);
|
|
|
|
|
-//
|
|
|
|
|
-// const float x0 = x[i + 0];
|
|
|
|
|
-// const float x1 = x[i + ncols/2];
|
|
|
|
|
-//
|
|
|
|
|
-// dst[i + 0] = x0*cos_theta - x1*sin_theta;
|
|
|
|
|
-// dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
|
|
|
|
|
-//}
|
|
|
|
|
|
|
+static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const float p0,
|
|
|
|
|
+ const float p_delta, const int p_delta_rows, const float theta_scale) {
|
|
|
|
|
+ const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
|
|
|
|
|
+
|
|
|
|
|
+ if (col >= ncols) {
|
|
|
|
|
+ return;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ const int row = blockDim.x*blockIdx.x + threadIdx.x;
|
|
|
|
|
+ const int i = row*ncols + col/2;
|
|
|
|
|
+
|
|
|
|
|
+ const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
|
|
|
|
|
+ const float sin_theta = sinf(theta);
|
|
|
|
|
+ const float cos_theta = cosf(theta);
|
|
|
|
|
+
|
|
|
|
|
+ const float x0 = x[i + 0];
|
|
|
|
|
+ const float x1 = x[i + ncols/2];
|
|
|
|
|
+
|
|
|
|
|
+ dst[i + 0] = x0*cos_theta - x1*sin_theta;
|
|
|
|
|
+ dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
|
|
|
|
|
+}
|
|
|
|
|
|
|
|
static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
|
|
static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
|
|
|
const int col = blockDim.x*blockIdx.x + threadIdx.x;
|
|
const int col = blockDim.x*blockIdx.x + threadIdx.x;
|
|
@@ -4799,13 +4798,21 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
|
|
|
|
|
|
|
|
static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
|
|
static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
|
|
|
const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
|
|
const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
|
|
|
- GGML_ASSERT(nrows % 2 == 0);
|
|
|
|
|
|
|
+ GGML_ASSERT(nrows % 2 == 0); // GG: is this assert really needed? I don't see why
|
|
|
const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1);
|
|
const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1);
|
|
|
const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
|
|
const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
|
|
|
const dim3 block_nums(nrows, num_blocks_x, 1);
|
|
const dim3 block_nums(nrows, num_blocks_x, 1);
|
|
|
rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
|
|
rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
+static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
|
|
|
|
|
+ const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
|
|
|
|
|
+ const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1);
|
|
|
|
|
+ const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
|
|
|
|
|
+ const dim3 block_nums(nrows, num_blocks_x, 1);
|
|
|
|
|
+ rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
|
|
static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
|
|
|
GGML_ASSERT(nrows % 4 == 0);
|
|
GGML_ASSERT(nrows % 4 == 0);
|
|
|
const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
|
|
const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
|
|
@@ -5548,8 +5555,9 @@ inline void ggml_cuda_op_rope(
|
|
|
const float block_p = max(p - (n_ctx - 2.f), 0.f);
|
|
const float block_p = max(p - (n_ctx - 2.f), 0.f);
|
|
|
rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
|
|
rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
|
|
|
} else if (is_neox) {
|
|
} else if (is_neox) {
|
|
|
- GGML_ASSERT(false && "RoPE NeoX not implemented yet");
|
|
|
|
|
-#pragma message("TODO: implement RoPE NeoX for CUDA")
|
|
|
|
|
|
|
+ GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
|
|
|
|
|
+ const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
|
|
|
|
|
+ rope_neox_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
|
|
|
} else {
|
|
} else {
|
|
|
const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
|
|
const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
|
|
|
rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
|
|
rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
|