@@ -1420,6 +1420,34 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in
 #endif
 }
 
+static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+
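+// per-type dispatch table: bundles the (de)quantization helpers and the
+// dot-product kernel for each quantized type, so generic ops can look
+// them up by ggml_type instead of switching on the type everywhere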
+static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
+    [GGML_TYPE_Q4_0] = {
+        .dequantize_row_q = dequantize_row_q4_0,
+        .quantize_row_q = quantize_row_q4_0,
+        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0_reference,
+        .quantize_row_q_dot = quantize_row_q8_0,
+        .vec_dot_q = ggml_vec_dot_q4_0_q8_0,
+    },
+    [GGML_TYPE_Q4_1] = {
+        .dequantize_row_q = dequantize_row_q4_1,
+        .quantize_row_q = quantize_row_q4_1,
+        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference,
+        .quantize_row_q_dot = quantize_row_q4_1,
+        .vec_dot_q = ggml_vec_dot_q4_1,
+    },
+    // TODO: GGML_TYPE_Q8_0
+};
+
+// For internal test use
+quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
+    GGML_ASSERT(i < GGML_TYPE_COUNT);
+    return quantize_fns[i];
+}
+
+
 //
 // simd mappings
 //
@@ -5588,6 +5616,26 @@ static void ggml_compute_forward_dup_f16(
                 }
             }
         }
+    } else if (dst->type == GGML_TYPE_Q4_0 || dst->type == GGML_TYPE_Q4_1) {
+        quantize_row_q_t const quantize_row_q = quantize_fns[dst->type].quantize_row_q;
+        size_t id = 0;
+        uint8_t * dst_ptr = (uint8_t *) dst->data;
+        size_t dst_row_size = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]);
+        float * src0_f32 = (float *) params->wdata;
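+        // params->wdata is the shared scratch buffer; for quantized DUP/CPY
+        // it is sized to hold one row of f32 (see ggml_graph_compute below)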
+
+        for (int i03 = 0; i03 < ne03; i03++) {
+            for (int i02 = 0; i02 < ne02; i02++) {
+                for (int i01 = 0; i01 < ne01; i01++) {
+                    const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
+                    // convert to f32 and quantize
+                    for (int i00 = 0; i00 < ne00; i00++) {
+                        src0_f32[i00] = GGML_FP16_TO_FP32(src0_ptr[i00]);
+                    }
+                    quantize_row_q(src0_f32, dst_ptr + id, ne00);
+                    id += dst_row_size;
+                }
+            }
+        }
     } else {
         GGML_ASSERT(false); // TODO: implement
     }
@@ -5780,6 +5828,21 @@ static void ggml_compute_forward_dup_f32(
                 }
             }
         }
+    } else if (dst->type == GGML_TYPE_Q4_0 || dst->type == GGML_TYPE_Q4_1) {
+        quantize_row_q_t const quantize_row_q = quantize_fns[dst->type].quantize_row_q;
+        size_t id = 0;
+        uint8_t * dst_ptr = (uint8_t *) dst->data;
+        size_t dst_row_size = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]);
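+        // a row of ne00 values packs into ne00/GGML_BLCK_SIZE[type] blocks
+        // of nb0 bytes each (Q4_0 and Q4_1 store 32 values per block)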
+
+        for (int i03 = 0; i03 < ne03; i03++) {
+            for (int i02 = 0; i02 < ne02; i02++) {
+                for (int i01 = 0; i01 < ne01; i01++) {
+                    const float * src0_ptr = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
+                    quantize_row_q(src0_ptr, dst_ptr + id, ne00);
+                    id += dst_row_size;
+                }
+            }
+        }
     } else {
         GGML_ASSERT(false); // TODO: implement
     }
@@ -5968,6 +6031,212 @@ static void ggml_compute_forward_add_f32(
     }
 }
 
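+// dst = src0 + src1, with src0 and dst in f16 and src1 in f32;
+// the sum is computed in f32 and rounded back to f16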
+static void ggml_compute_forward_add_f16_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst) {
+    GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int n = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    const size_t nb00 = src0->nb[0];
+    const size_t nb01 = src0->nb[1];
+
+    const size_t nb10 = src1->nb[0];
+    const size_t nb11 = src1->nb[1];
+
+    const size_t nb0 = dst->nb[0];
+    const size_t nb1 = dst->nb[1];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F16);
+
+    GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+
+    if (nb10 == sizeof(float)) {
+        for (int j = ith; j < n; j += nth) {
+            ggml_fp16_t * dst_ptr  = (ggml_fp16_t *) ((char *) dst->data  + j*nb1);
+            ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01);
+            for (int i = 0; i < nc; i++) {
+                float * src1_ptr = (float *) ((char *) src1->data + j*nb11 + i*nb10);
+                dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + *src1_ptr);
+            }
+        }
+    }
+    else {
+        // src1 is not contiguous
+        GGML_ASSERT(false);
+    }
+}
+
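+// same as ggml_compute_forward_add_f16_f32, but src1 is f16 as well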
+static void ggml_compute_forward_add_f16_f16(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst) {
+    GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int n = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    const size_t nb00 = src0->nb[0];
+    const size_t nb01 = src0->nb[1];
+
+    const size_t nb10 = src1->nb[0];
+    const size_t nb11 = src1->nb[1];
+
+    const size_t nb0 = dst->nb[0];
+    const size_t nb1 = dst->nb[1];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->type == GGML_TYPE_F16);
+
+    GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+
+    if (nb10 == sizeof(ggml_fp16_t)) {
+        for (int j = ith; j < n; j += nth) {
+            ggml_fp16_t * dst_ptr  = (ggml_fp16_t *) ((char *) dst->data  + j*nb1);
+            ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01);
+            for (int i = 0; i < nc; i++) {
+                ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + j*nb11 + i*nb10);
+                dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + GGML_FP16_TO_FP32(*src1_ptr));
+            }
+        }
+    }
+    else {
+        // src1 is not contiguous
+        GGML_ASSERT(false);
+    }
+}
+
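+// dst = src0 + src1 for quantized src0/dst (Q4_0/Q4_1) and f32 src1:
+// each row is dequantized into scratch, accumulated in f32, then re-quantized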
+static void ggml_compute_forward_add_q_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst) {
+    GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+
+    //const int64_t ne10 = src1->ne[0];
+    //const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
+
+    //const int64_t ne0 = dst->ne[0];
+    //const int64_t ne1 = dst->ne[1];
+    const int64_t ne2 = dst->ne[2];
+    const int64_t ne3 = dst->ne[3];
+
+    const int nb00 = src0->nb[0];
+    const int nb01 = src0->nb[1];
+    const int nb02 = src0->nb[2];
+    const int nb03 = src0->nb[3];
+
+    const int nb10 = src1->nb[0];
+    const int nb11 = src1->nb[1];
+    const int nb12 = src1->nb[2];
+    const int nb13 = src1->nb[3];
+
+    const int nb0 = dst->nb[0];
+    const int nb1 = dst->nb[1];
+    const int nb2 = dst->nb[2];
+    const int nb3 = dst->nb[3];
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    GGML_ASSERT(ne02 == ne12);
+    GGML_ASSERT(ne03 == ne13);
+    GGML_ASSERT(ne2 == ne12);
+    GGML_ASSERT(ne3 == ne13);
+
+    const enum ggml_type type = src0->type;
+    dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
+    quantize_row_q_t const quantize_row_q = quantize_fns[type].quantize_row_q;
+
+    // we don't support permuted src0 or src1
+    GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[type]);
+    GGML_ASSERT(nb10 == sizeof(float));
+
+    // dst cannot be transposed or permuted
+    GGML_ASSERT(nb0 <= nb1);
+    GGML_ASSERT(nb1 <= nb2);
+    GGML_ASSERT(nb2 <= nb3);
+
+    GGML_ASSERT(src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1);
+    GGML_ASSERT(dst->type == src0->type);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    // total rows in src0
+    const int nr = ne01*ne02*ne03;
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
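+    // per-thread scratch: ggml_graph_compute reserves ne00*nth floats for
+    // GGML_OP_ADD, so each thread gets room for one dequantized row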
+    float * wdata = (float*) params->wdata + ne00 * ith;
+
+    for (int ir = ir0; ir < ir1; ++ir) {
+        // src0 indices
+        const int i03 = ir/(ne02*ne01);
+        const int i02 = (ir - i03*ne02*ne01)/ne01;
+        const int i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+        // src1 and dst are same shape as src0 => same indices
+        const int i13 = i03;
+        const int i12 = i02;
+        const int i11 = i01;
+
+        const int i3 = i03;
+        const int i2 = i02;
+        const int i1 = i01;
+
+        void  * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03));
+        float * src1_row = (float *)((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13));
+        void  * dst_row  = (void *) ((char *)  dst->data + ( i1*nb1  +  i2*nb2  +  i3*nb3));
+
+        assert(ne00 % 32 == 0);
+
+        // unquantize row from src0 to temp buffer
+        dequantize_row_q(src0_row, wdata, ne00);
+        // add src1
+        ggml_vec_acc_f32(ne00, wdata, src1_row);
+        // quantize row to dst
+        quantize_row_q(wdata, dst_row, ne00);
+    }
+}
+
 static void ggml_compute_forward_add(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
@@ -5978,6 +6247,23 @@ static void ggml_compute_forward_add(
             {
                 ggml_compute_forward_add_f32(params, src0, src1, dst);
             } break;
+        case GGML_TYPE_F16:
+            {
+                if (src1->type == GGML_TYPE_F16) {
+                    ggml_compute_forward_add_f16_f16(params, src0, src1, dst);
+                }
+                else if (src1->type == GGML_TYPE_F32) {
+                    ggml_compute_forward_add_f16_f32(params, src0, src1, dst);
+                }
+                else {
+                    GGML_ASSERT(false);
+                }
+            } break;
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+            {
+                ggml_compute_forward_add_q_f32(params, src0, src1, dst);
+            } break;
         default:
             {
                 GGML_ASSERT(false);
@@ -7257,30 +7543,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
     //}
 }
 
-static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_Q4_0] = {
-        .dequantize_row_q = dequantize_row_q4_0,
-        .quantize_row_q = quantize_row_q4_0,
-        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0_reference,
-        .quantize_row_q_dot = quantize_row_q8_0,
-        .vec_dot_q = ggml_vec_dot_q4_0_q8_0,
-    },
-    [GGML_TYPE_Q4_1] = {
-        .dequantize_row_q = dequantize_row_q4_1,
-        .quantize_row_q = quantize_row_q4_1,
-        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference,
-        .quantize_row_q_dot = quantize_row_q4_1,
-        .vec_dot_q = ggml_vec_dot_q4_1,
-    },
-    // TODO: GGML_TYPE_Q8_0
-};
-
-// For internal test use
-quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
-    GGML_ASSERT(i < GGML_TYPE_COUNT);
-    return quantize_fns[i];
-}
-
 static void ggml_compute_forward_mul_mat_q_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
@@ -10137,13 +10399,29 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
         struct ggml_tensor * node = cgraph->nodes[i];
 
         switch (node->op) {
+            case GGML_OP_CPY:
             case GGML_OP_DUP:
                 {
                     node->n_tasks = 1;
+
+                    size_t cur = 0;
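+                    // quantized dup/cpy converts through one f32 row in wdata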
+                    if (node->type == GGML_TYPE_Q4_0 || node->type == GGML_TYPE_Q4_1) {
+                        cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0];
+                    }
+
+                    work_size = MAX(work_size, cur);
                 } break;
             case GGML_OP_ADD:
                 {
                     node->n_tasks = n_threads;
+
+                    size_t cur = 0;
+
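+                    // quantized add dequantizes one f32 row per thread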
+                    if (node->src0->type == GGML_TYPE_Q4_0 || node->src0->type == GGML_TYPE_Q4_1) {
+                        cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_threads;
+                    }
+
+                    work_size = MAX(work_size, cur);
                 } break;
             case GGML_OP_SUB:
             case GGML_OP_MUL:
@@ -10224,7 +10502,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                 {
                     node->n_tasks = n_threads;
                 } break;
-            case GGML_OP_CPY:
             case GGML_OP_CONT:
             case GGML_OP_RESHAPE:
             case GGML_OP_VIEW: