|
|
@@ -123,6 +123,8 @@ typedef void * thread_ret_t;
|
|
|
#define GGML_GELU_FP16
|
|
|
#define GGML_GELU_QUICK_FP16
|
|
|
#define GGML_SILU_FP16
|
|
|
+// #define GGML_CROSS_ENTROPY_EXP_FP16
|
|
|
+// #define GGML_FLASH_ATTN_EXP_FP16
|
|
|
|
|
|
#define GGML_SOFT_MAX_UNROLL 4
|
|
|
#define GGML_VEC_DOT_UNROLL 2
|
|
|
@@ -186,8 +188,8 @@ typedef void * thread_ret_t;
|
|
|
//
|
|
|
|
|
|
#if defined(_MSC_VER) || defined(__MINGW32__)
|
|
|
-#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
|
|
|
-#define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
|
|
|
+#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
|
|
|
+#define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
|
|
|
#else
|
|
|
inline static void * ggml_aligned_malloc(size_t size) {
|
|
|
void * aligned_memory = NULL;
|
|
|
@@ -212,8 +214,8 @@ inline static void * ggml_aligned_malloc(size_t size) {
|
|
|
}
|
|
|
return aligned_memory;
|
|
|
}
|
|
|
-#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
|
|
|
-#define GGML_ALIGNED_FREE(ptr) free(ptr)
|
|
|
+#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
|
|
|
+#define GGML_ALIGNED_FREE(ptr) free(ptr)
|
|
|
#endif
|
|
|
|
|
|
#define UNUSED GGML_UNUSED
|
|
|
@@ -5857,7 +5859,8 @@ struct ggml_tensor * ggml_rms_norm_inplace(
|
|
|
struct ggml_tensor * ggml_rms_norm_back(
|
|
|
struct ggml_context * ctx,
|
|
|
struct ggml_tensor * a,
|
|
|
- struct ggml_tensor * b) {
|
|
|
+ struct ggml_tensor * b,
|
|
|
+ float eps) {
|
|
|
bool is_node = false;
|
|
|
|
|
|
if (a->grad) {
|
|
|
@@ -5867,6 +5870,8 @@ struct ggml_tensor * ggml_rms_norm_back(
|
|
|
|
|
|
struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
|
|
|
|
|
|
+ ggml_set_op_params(result, &eps, sizeof(eps));
|
|
|
+
|
|
|
result->op = GGML_OP_RMS_NORM_BACK;
|
|
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
|
|
result->src[0] = a;
|
|
|
@@ -9443,6 +9448,8 @@ static void ggml_compute_forward_div_f32(
|
|
|
|
|
|
|
|
|
#ifdef GGML_USE_ACCELERATE
|
|
|
+ UNUSED(ggml_vec_div_f32);
|
|
|
+
|
|
|
vDSP_vdiv(
|
|
|
(float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
|
|
|
(float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
|
|
|
@@ -10749,7 +10756,8 @@ static void ggml_compute_forward_rms_norm_back_f32(
|
|
|
|
|
|
GGML_TENSOR_BINARY_OP_LOCALS;
|
|
|
|
|
|
- const float eps = 1e-6f; // TODO: make this a parameter
|
|
|
+ float eps;
|
|
|
+ memcpy(&eps, dst->op_params, sizeof(float));
|
|
|
|
|
|
// TODO: optimize
|
|
|
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
|
|
@@ -12139,6 +12147,7 @@ static void ggml_compute_forward_soft_max_back_f32(
|
|
|
// dx = J * dy
|
|
|
// dxk = sum_i(Jki * dyi)
|
|
|
// dxk = sum_i(-yk*yi * dyi) - (-yk*yk)*dyk + (yk - yk*yk)*dyk
|
|
|
+ // dxk = sum_i(-yk*yi * dyi) + yk*yk*dyk + yk*dyk - yk*yk*dyk
|
|
|
// dxk = sum_i(-yk*yi * dyi) + yk*dyk
|
|
|
// dxk = -yk * sum_i(yi * dyi) + yk*dyk
|
|
|
// dxk = -yk * dot(y, dy) + yk*dyk
|
|
|
@@ -13929,7 +13938,7 @@ static void ggml_compute_forward_flash_attn_f32(
|
|
|
vvexpf(S, S, &Mup);
|
|
|
ggml_vec_sum_f32(Mup, &sum, S);
|
|
|
#else
|
|
|
- uint16_t scvt[GGML_SOFT_MAX_UNROLL];
|
|
|
+ uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
|
|
|
ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
|
|
|
|
|
|
for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
|
|
|
@@ -13939,9 +13948,13 @@ static void ggml_compute_forward_flash_attn_f32(
|
|
|
if (SS[j] == -INFINITY) {
|
|
|
SS[j] = 0.0f;
|
|
|
} else {
|
|
|
+#ifndef GGML_FLASH_ATTN_EXP_FP16
|
|
|
+ const float val = expf(SS[j] - max);
|
|
|
+#else
|
|
|
ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
|
|
|
memcpy(&scvt[j], &s, sizeof(uint16_t));
|
|
|
const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
|
|
|
+#endif
|
|
|
sump[j] += (ggml_float)val;
|
|
|
SS[j] = val;
|
|
|
}
|
|
|
@@ -14519,7 +14532,7 @@ static void ggml_compute_forward_flash_attn_back_f32(
|
|
|
vvexpf(SM, SM, &Mup);
|
|
|
ggml_vec_sum_f32(Mup, &sum, SM);
|
|
|
#else
|
|
|
- uint16_t scvt[GGML_SOFT_MAX_UNROLL];
|
|
|
+ uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
|
|
|
ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
|
|
|
|
|
|
for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
|
|
|
@@ -14530,9 +14543,13 @@ static void ggml_compute_forward_flash_attn_back_f32(
|
|
|
if (SR[j] == -INFINITY) {
|
|
|
SW[j] = 0.0f;
|
|
|
} else {
|
|
|
+#ifndef GGML_FLASH_ATTN_EXP_FP16
|
|
|
+ const float val = expf(SR[j] - max);
|
|
|
+#else
|
|
|
ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max);
|
|
|
memcpy(&scvt[j], &s, sizeof(uint16_t));
|
|
|
const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
|
|
|
+#endif
|
|
|
sump[j] += (ggml_float)val;
|
|
|
SW[j] = val;
|
|
|
}
|
|
|
@@ -15270,6 +15287,8 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
|
|
|
const int nc = src0->ne[0];
|
|
|
const int nr = ggml_nrows(src0);
|
|
|
|
|
|
+ GGML_ASSERT(params->wsize >= sizeof(float) * (nth + nth * nc));
|
|
|
+
|
|
|
if (params->type == GGML_TASK_INIT) {
|
|
|
if (ith == 0) {
|
|
|
memset(sums, 0, sizeof(float) * (nth + nth * nc));
|
|
|
@@ -15281,7 +15300,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
|
|
|
if (ith == 0) {
|
|
|
float * dp = (float *) dst->data;
|
|
|
ggml_vec_sum_f32(nth, dp, sums);
|
|
|
- dp[0] *= -1.0f;
|
|
|
+ dp[0] *= -1.0f / (float) nr;
|
|
|
}
|
|
|
return;
|
|
|
}
|
|
|
@@ -15298,7 +15317,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
|
|
|
for (int i1 = ir0; i1 < ir1; i1++) {
|
|
|
float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]);
|
|
|
float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]);
|
|
|
- float * st = (float *) params->wdata + nth + ith*nc;
|
|
|
+ float * st = ((float *) params->wdata) + nth + ith*nc;
|
|
|
|
|
|
#ifndef NDEBUG
|
|
|
for (int i = 0; i < nc; ++i) {
|
|
|
@@ -15313,15 +15332,19 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
|
|
|
float max = -INFINITY;
|
|
|
ggml_vec_max_f32(nc, &max, s0);
|
|
|
|
|
|
- uint16_t scvt;
|
|
|
+ uint16_t scvt; UNUSED(scvt);
|
|
|
for (int i = 0; i < nc; i++) {
|
|
|
if (s0[i] == -INFINITY) {
|
|
|
st[i] = 0.0f;
|
|
|
} else {
|
|
|
- // const float val = (s0[i] == -INFINITY) ? 0.0 : exp(s0[i] - max);
|
|
|
+#ifndef GGML_CROSS_ENTROPY_EXP_FP16
|
|
|
+ const float s = s0[i] - max;
|
|
|
+ const float val = expf(s);
|
|
|
+#else
|
|
|
ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
|
|
|
memcpy(&scvt, &s, sizeof(scvt));
|
|
|
const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
|
|
|
+#endif
|
|
|
sum += (ggml_float)val;
|
|
|
st[i] = val;
|
|
|
}
|
|
|
@@ -15337,7 +15360,9 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
|
|
|
ggml_vec_log_f32(nc, st, st);
|
|
|
ggml_vec_mul_f32(nc, st, st, s1);
|
|
|
|
|
|
- ggml_vec_sum_f32(nc, sums + ith, st);
|
|
|
+ float st_sum = 0;
|
|
|
+ ggml_vec_sum_f32(nc, &st_sum, st);
|
|
|
+ sums[ith] += st_sum;
|
|
|
|
|
|
#ifndef NDEBUG
|
|
|
for (int i = 0; i < nc; ++i) {
|
|
|
@@ -15387,7 +15412,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
|
|
|
return;
|
|
|
}
|
|
|
|
|
|
- const float eps = 1e-9f;
|
|
|
+ const double eps = 1e-9;
|
|
|
|
|
|
// TODO: handle transposed/permuted matrices
|
|
|
const int64_t nc = src0->ne[0];
|
|
|
@@ -15406,7 +15431,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
|
|
|
float * ds0 = (float *)((char *) dst->data + i1*dst->nb[1]);
|
|
|
float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]);
|
|
|
float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]);
|
|
|
- float * sm = (float *) params->wdata + ith*nc;
|
|
|
|
|
|
#ifndef NDEBUG
|
|
|
for (int i = 0; i < nc; ++i) {
|
|
|
@@ -15415,54 +15439,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
|
|
|
assert(!isnan(s1[i]));
|
|
|
}
|
|
|
#endif
|
|
|
- // step by step explanation:
|
|
|
- {
|
|
|
- //float * sums = (float *) params->wdata;
|
|
|
-
|
|
|
- // forward pass with annotated gradients from backward pass
|
|
|
- // (built by going in reverse operation order, adding to gradients of current operation args)
|
|
|
- // st0 = exp(s0-max(s0)) grad[st0] = grad[st1]*(1.0 - eps)/sum
|
|
|
- // from softmax_back: grad[s0] = st1_k * (grad[st1]_k - dot(st1, grad[st1]))
|
|
|
- // ggml_vec_scale_f32(nc, st, sum); // st1 = st0*/sum = softmax(s0) grad[st1] = grad[st2]*(1.0 - eps)
|
|
|
- // ggml_vec_scale_f32(nc, st, (1.0f - eps)); // st2 = st1*(1.0 - eps) grad[st2] = grad[st3]
|
|
|
- // ggml_vec_add1_f32(nc, st, st, eps); // st3 = st2 + eps grad[st3] = grad[st4]/st3
|
|
|
- // ggml_vec_log_f32(nc, st, st); // st4 = log(st3) grad[st4] = grad[st5] * s1
|
|
|
- // ggml_vec_mul_f32(nc, st, st, s1); // st5 = st4 * s1 grad[st5] = grad[sums[ith]]
|
|
|
- // ggml_vec_sum_f32(nc, sums + ith, st); // sums[ith] = st5 grad[sums[ith]] = grad[cross_entropy_loss] = -grad[cel]
|
|
|
-
|
|
|
- // substitute into grad[st1], because we can reuse softmax_back from this point on
|
|
|
- // grad[st1] = -grad[cel]*s1*(1.0 - eps)/(eps + softmax(s0)*(1.0 - eps))
|
|
|
- // postorder:
|
|
|
- // grad[st1] := softmax(s0)
|
|
|
- // grad[st1] := grad[st1]*(1.0 - eps)
|
|
|
- // grad[st1] := grad[st1] + eps
|
|
|
- // grad[st1] := s1 / grad[st1]
|
|
|
- // grad[st1] := grad[st1]*(1.0-eps)*-grad[cel]
|
|
|
-
|
|
|
- // src0 gradients by going through softmax_back
|
|
|
- // grad[s0] = st1_k * (grad[st1]_k - dot(st1, grad[st1]))
|
|
|
- // from softmax_back:
|
|
|
- // dxk = yk * (dyk - dot(y, dy))
|
|
|
- // dot_y_dy := dot(y, dy)
|
|
|
- // dx := dy
|
|
|
- // dx := dx - dot_y_dy
|
|
|
- // dx := dx * y
|
|
|
- // postorder:
|
|
|
- // dot_st1_dst1 := dot(st1, grad[st1])
|
|
|
- // grad[s0] := grad[st1]
|
|
|
- // grad[s0] := grad[s0] - dot_st1_dst1
|
|
|
- // grad[s0] := grad[s0] * st1
|
|
|
-
|
|
|
- // prepend postorder from grad[st1] directly using grad[s0] as memory location, as we will grad[s0] := grad[st1]
|
|
|
- // sm := softmax(s0)
|
|
|
- // grad[s0] := sm*(1.0 - eps)
|
|
|
- // grad[s0] := grad[s0] + eps
|
|
|
- // grad[s0] := s1 / grad[s0]
|
|
|
- // grad[s0] := grad[s0]*(1.0-eps)*-grad[cel]
|
|
|
- // dot_st1_dst1 := dot(sm, grad[s0])
|
|
|
- // grad[s0] := grad[s0] - dot_st1_dst1
|
|
|
- // grad[s0] := grad[s0] * sm
|
|
|
- }
|
|
|
|
|
|
// soft_max
|
|
|
ggml_float sum = 0.0;
|
|
|
@@ -15470,39 +15446,37 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
|
|
|
float max = -INFINITY;
|
|
|
ggml_vec_max_f32(nc, &max, s0);
|
|
|
|
|
|
- uint16_t scvt;
|
|
|
+ uint16_t scvt; UNUSED(scvt);
|
|
|
for (int i = 0; i < nc; i++) {
|
|
|
if (s0[i] == -INFINITY) {
|
|
|
- sm[i] = 0.0f;
|
|
|
+ ds0[i] = 0.0f;
|
|
|
} else {
|
|
|
- // const float val = (s0[i] == -INFINITY) ? 0.0 : exp(s0[i] - max);
|
|
|
+#ifndef GGML_CROSS_ENTROPY_EXP_FP16
|
|
|
+ const float s = s0[i] - max;
|
|
|
+ const float val = expf(s);
|
|
|
+#else
|
|
|
ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
|
|
|
memcpy(&scvt, &s, sizeof(scvt));
|
|
|
const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
|
|
|
+#endif
|
|
|
sum += (ggml_float)val;
|
|
|
- sm[i] = val;
|
|
|
+ ds0[i] = val;
|
|
|
}
|
|
|
}
|
|
|
|
|
|
assert(sum > 0.0);
|
|
|
- sum = 1.0/sum;
|
|
|
+ sum = (1.0 - eps)/sum;
|
|
|
}
|
|
|
|
|
|
- float dot_st1_dst1 = 0;
|
|
|
- ggml_vec_scale_f32(nc, sm, sum);
|
|
|
- ggml_vec_cpy_f32 (nc, ds0, sm);
|
|
|
- ggml_vec_scale_f32(nc, ds0, (1.0f - eps));
|
|
|
- ggml_vec_add1_f32 (nc, ds0, ds0, eps);
|
|
|
- ggml_vec_div_f32 (nc, ds0, s1, ds0);
|
|
|
- ggml_vec_scale_f32(nc, ds0, -(1.0f - eps)*d[0]);
|
|
|
- ggml_vec_dot_f32 (nc, &dot_st1_dst1, sm, ds0);
|
|
|
- ggml_vec_acc1_f32 (nc, ds0, -dot_st1_dst1);
|
|
|
- ggml_vec_mul_f32 (nc, ds0, ds0, sm);
|
|
|
+ // grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr
|
|
|
+ ggml_vec_scale_f32(nc, ds0, sum);
|
|
|
+ ggml_vec_add1_f32(nc, ds0, ds0, eps);
|
|
|
+ ggml_vec_sub_f32(nc, ds0, ds0, s1);
|
|
|
+ ggml_vec_scale_f32(nc, ds0, d[0] / (float) nr);
|
|
|
+
|
|
|
|
|
|
#ifndef NDEBUG
|
|
|
for (int i = 0; i < nc; ++i) {
|
|
|
- assert(!isnan(sm[i]));
|
|
|
- assert(!isinf(sm[i]));
|
|
|
assert(!isnan(ds0[i]));
|
|
|
assert(!isinf(ds0[i]));
|
|
|
}
|
|
|
@@ -16057,9 +16031,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
|
{
|
|
|
// necessary for llama
|
|
|
if (src0->grad) {
|
|
|
+ float eps;
|
|
|
+ memcpy(&eps, tensor->op_params, sizeof(float));
|
|
|
+
|
|
|
src0->grad = ggml_add_impl(ctx,
|
|
|
src0->grad,
|
|
|
- ggml_rms_norm_back(ctx, src0, tensor->grad),
|
|
|
+ ggml_rms_norm_back(ctx, src0, tensor->grad, eps),
|
|
|
inplace);
|
|
|
}
|
|
|
} break;
|
|
|
@@ -16827,9 +16804,7 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
|
|
|
return result;
|
|
|
}
|
|
|
|
|
|
-struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) {
|
|
|
- struct ggml_cgraph result = *gf;
|
|
|
-
|
|
|
+void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep) {
|
|
|
GGML_ASSERT(gf->n_nodes > 0);
|
|
|
|
|
|
// if we are keeping the gradient graph, we have to detach the gradient nodes from the original graph
|
|
|
@@ -16853,15 +16828,19 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- for (int i = gf->n_nodes - 1; i >= 0; i--) {
|
|
|
+ for (int i = 0; i < gf->n_nodes; i++) {
|
|
|
struct ggml_tensor * node = gf->nodes[i];
|
|
|
|
|
|
if (node->is_param) {
|
|
|
GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
|
|
|
- ggml_build_forward_expand(&result, node->grad);
|
|
|
+ ggml_build_forward_expand(gb, node->grad);
|
|
|
}
|
|
|
}
|
|
|
+}
|
|
|
|
|
|
+struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) {
|
|
|
+ struct ggml_cgraph result = *gf;
|
|
|
+ ggml_build_backward_expand(ctx, gf, &result, keep);
|
|
|
return result;
|
|
|
}
|
|
|
|
|
|
@@ -17537,10 +17516,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
|
|
case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
|
|
|
{
|
|
|
n_tasks = n_threads;
|
|
|
-
|
|
|
- size_t cur = ggml_type_size(node->type)*node->src[0]->ne[0]*n_tasks;
|
|
|
-
|
|
|
- work_size = MAX(work_size, cur);
|
|
|
} break;
|
|
|
case GGML_OP_NONE:
|
|
|
{
|
|
|
@@ -18418,14 +18393,16 @@ static enum ggml_opt_result ggml_opt_adam(
|
|
|
struct ggml_opt_params params,
|
|
|
struct ggml_tensor * f,
|
|
|
struct ggml_cgraph * gf,
|
|
|
- struct ggml_cgraph * gb) {
|
|
|
+ struct ggml_cgraph * gb,
|
|
|
+ ggml_opt_callback callback,
|
|
|
+ void * callback_data) {
|
|
|
GGML_ASSERT(ggml_is_scalar(f));
|
|
|
|
|
|
// these will store the parameters we want to optimize
|
|
|
struct ggml_tensor * ps[GGML_MAX_PARAMS];
|
|
|
|
|
|
int np = 0;
|
|
|
- int nx = 0;
|
|
|
+ int64_t nx = 0;
|
|
|
for (int i = 0; i < gf->n_nodes; ++i) {
|
|
|
if (gf->nodes[i]->is_param) {
|
|
|
GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
|
|
|
@@ -18444,31 +18421,32 @@ static enum ggml_opt_result ggml_opt_adam(
|
|
|
}
|
|
|
|
|
|
// constants
|
|
|
- const float sched = params.adam.sched;
|
|
|
- const float decay = params.adam.decay * sched;
|
|
|
- const float alpha = params.adam.alpha * sched;
|
|
|
+ float sched = params.adam.sched;
|
|
|
+ const float alpha = params.adam.alpha;
|
|
|
+ const float decay = params.adam.decay * alpha;
|
|
|
const float beta1 = params.adam.beta1;
|
|
|
const float beta2 = params.adam.beta2;
|
|
|
const float eps = params.adam.eps;
|
|
|
+ const float gclip = params.adam.gclip;
|
|
|
+ const int decay_min_ndim = params.adam.decay_min_ndim;
|
|
|
|
|
|
- float * x = opt->adam.x->data; // view of the parameters
|
|
|
- float * g1 = opt->adam.g1->data; // gradient
|
|
|
- float * g2 = opt->adam.g2->data; // gradient squared
|
|
|
float * m = opt->adam.m->data; // first moment
|
|
|
float * v = opt->adam.v->data; // second moment
|
|
|
- float * mh = opt->adam.mh->data; // first moment hat
|
|
|
- float * vh = opt->adam.vh->data; // second moment hat
|
|
|
|
|
|
float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values
|
|
|
|
|
|
- // update view
|
|
|
- ggml_opt_get_params(np, ps, x);
|
|
|
+ if (callback) {
|
|
|
+ callback(callback_data, &sched);
|
|
|
+ }
|
|
|
|
|
|
// compute the function value
|
|
|
ggml_graph_reset (gf);
|
|
|
ggml_set_f32 (f->grad, 1.0f);
|
|
|
|
|
|
- ggml_graph_compute_with_ctx(ctx, gb, params.n_threads);
|
|
|
+ struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
|
|
|
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
|
|
|
+ cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
|
|
|
+ ggml_graph_compute(gb, &cplan);
|
|
|
|
|
|
opt->adam.fx_prev = ggml_get_f32_1d(f, 0);
|
|
|
opt->adam.fx_best = opt->adam.fx_prev;
|
|
|
@@ -18476,6 +18454,9 @@ static enum ggml_opt_result ggml_opt_adam(
|
|
|
pf[opt->iter % params.past] = opt->adam.fx_prev;
|
|
|
}
|
|
|
|
|
|
+ opt->loss_before = opt->adam.fx_prev;
|
|
|
+ opt->loss_after = opt->adam.fx_prev;
|
|
|
+
|
|
|
// initialize
|
|
|
if (opt->just_initialized) {
|
|
|
opt->adam.n_no_improvement = 0;
|
|
|
@@ -18508,50 +18489,55 @@ static enum ggml_opt_result ggml_opt_adam(
|
|
|
UNUSED(t_start_cpu);
|
|
|
|
|
|
{
|
|
|
- // update the gradient
|
|
|
- ggml_opt_get_grad(np, ps, g1);
|
|
|
-
|
|
|
- // m_t = beta1*m_t-1 + (1 - beta1)*g_t
|
|
|
- ggml_vec_scale_f32(nx, m, beta1);
|
|
|
- ggml_vec_mad_f32 (nx, m, g1, 1.0f - beta1);
|
|
|
-
|
|
|
- // g2 = g1^2
|
|
|
- ggml_vec_sqr_f32 (nx, g2, g1);
|
|
|
-
|
|
|
- // v_t = beta2*v_t-1 + (1 - beta2)*g_t^2
|
|
|
- ggml_vec_scale_f32(nx, v, beta2);
|
|
|
- ggml_vec_mad_f32 (nx, v, g2, 1.0f - beta2);
|
|
|
-
|
|
|
- // m^hat = m_t / (1 - beta1^t)
|
|
|
- // v^hat = v_t / (1 - beta2^t)
|
|
|
- // x_t = x_t-1 - sched*(alpha*m^hat/(sqrt(v^hat) + eps) + decay*x_t-1)
|
|
|
- // x_t = x_t-1 - sched*alpha*m^hat/(sqrt(v^hat) + eps) - sched*decay*x_t-1
|
|
|
- // x_t = x_t-1*(1-sched*decay) - sched*alpha*m^hat/(sqrt(v^hat) + eps)
|
|
|
- // x_t = x_t-1*(1-sched*decay) + sched*decay*(-alpha/decay)*m^hat/(sqrt(v^hat) + eps)
|
|
|
- // x_t = mix(x_t-1, (-alpha/decay)*m^hat/(sqrt(v^hat) + eps), sched*decay)
|
|
|
- ggml_vec_cpy_f32 (nx, mh, m);
|
|
|
- ggml_vec_cpy_f32 (nx, vh, v);
|
|
|
-
|
|
|
- ggml_vec_scale_f32(nx, mh, alpha/(1.0f - powf(beta1, opt->iter)));
|
|
|
- ggml_vec_scale_f32(nx, vh, 1.0f/(1.0f - powf(beta2, opt->iter)));
|
|
|
-
|
|
|
- ggml_vec_sqrt_f32 (nx, vh, vh);
|
|
|
- ggml_vec_acc1_f32 (nx, vh, eps);
|
|
|
-
|
|
|
- ggml_vec_div_f32 (nx, mh, mh, vh);
|
|
|
- ggml_vec_scale_f32(nx, x, 1.0f - decay);
|
|
|
- ggml_vec_sub_f32 (nx, x, x, mh);
|
|
|
+ float gnorm = 1.0f;
|
|
|
+ if (gclip > 0.0f) {
|
|
|
+ // gradient clipping
|
|
|
+ ggml_float sum = 0.0;
|
|
|
+ for (int p = 0; p < np; ++p) {
|
|
|
+ const int64_t ne = ggml_nelements(ps[p]);
|
|
|
+ for (int64_t j = 0; j < ne; ++j) {
|
|
|
+ float g = ggml_get_f32_1d(ps[p]->grad, j);
|
|
|
+ sum += (ggml_float)(g*g);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ ggml_float norm = sqrt(sum);
|
|
|
+ if (norm > (ggml_float) gclip) {
|
|
|
+ gnorm = (float) ((ggml_float) gclip / norm);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ const float beta1h = alpha*sched/(1.0f - powf(beta1, opt->iter));
|
|
|
+ const float beta2h = 1.0f/(1.0f - powf(beta2, opt->iter));
|
|
|
+ int64_t i = 0;
|
|
|
+ for (int p = 0; p < np; ++p) {
|
|
|
+ const int64_t ne = ggml_nelements(ps[p]);
|
|
|
+ const float p_decay = ((ps[p]->n_dims >= decay_min_ndim) ? decay : 0.0f) * sched;
|
|
|
+ for (int64_t j = 0; j < ne; ++j) {
|
|
|
+ float x = ggml_get_f32_1d(ps[p], j);
|
|
|
+ float g = ggml_get_f32_1d(ps[p]->grad, j)*gnorm;
|
|
|
+ m[i] = m[i]*beta1 + g*(1.0f - beta1);
|
|
|
+ v[i] = v[i]*beta2 + g*g*(1.0f - beta2);
|
|
|
+ float mh = m[i]*beta1h;
|
|
|
+ float vh = v[i]*beta2h;
|
|
|
+ vh = sqrtf(vh) + eps;
|
|
|
+ x = x*(1.0f - p_decay) - mh/vh;
|
|
|
+ ggml_set_f32_1d(ps[p], j, x);
|
|
|
+ ++i;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
|
|
|
- // update the parameters
|
|
|
- ggml_opt_set_params(np, ps, x);
|
|
|
+ if (callback) {
|
|
|
+ callback(callback_data, &sched);
|
|
|
}
|
|
|
|
|
|
ggml_graph_reset (gf);
|
|
|
ggml_set_f32 (f->grad, 1.0f);
|
|
|
|
|
|
- ggml_graph_compute_with_ctx(ctx, gb, params.n_threads);
|
|
|
+ ggml_graph_compute(gb, &cplan);
|
|
|
|
|
|
const float fx = ggml_get_f32_1d(f, 0);
|
|
|
+ opt->loss_after = fx;
|
|
|
+
|
|
|
|
|
|
// check convergence
|
|
|
if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) {
|
|
|
@@ -18620,7 +18606,6 @@ struct ggml_lbfgs_iteration_data {
|
|
|
};
|
|
|
|
|
|
static enum ggml_opt_result linesearch_backtracking(
|
|
|
- struct ggml_context * ctx,
|
|
|
const struct ggml_opt_params * params,
|
|
|
int nx,
|
|
|
float * x,
|
|
|
@@ -18632,8 +18617,11 @@ static enum ggml_opt_result linesearch_backtracking(
|
|
|
struct ggml_tensor * f,
|
|
|
struct ggml_cgraph * gf,
|
|
|
struct ggml_cgraph * gb,
|
|
|
+ struct ggml_cplan * cplan,
|
|
|
const int np,
|
|
|
- struct ggml_tensor * ps[]) {
|
|
|
+ struct ggml_tensor * ps[],
|
|
|
+ ggml_opt_callback callback,
|
|
|
+ void * callback_data) {
|
|
|
int count = 0;
|
|
|
|
|
|
float width = 0.0f;
|
|
|
@@ -18662,6 +18650,12 @@ static enum ggml_opt_result linesearch_backtracking(
|
|
|
dgtest = params->lbfgs.ftol*dginit;
|
|
|
|
|
|
while (true) {
|
|
|
+ if (callback) {
|
|
|
+ // L-BFGS does not support learning rate -> ignore learning schedule
|
|
|
+ float sched = 0;
|
|
|
+ callback(callback_data, &sched);
|
|
|
+ }
|
|
|
+
|
|
|
ggml_vec_cpy_f32(nx, x, xp);
|
|
|
ggml_vec_mad_f32(nx, x, d, *step);
|
|
|
|
|
|
@@ -18672,7 +18666,7 @@ static enum ggml_opt_result linesearch_backtracking(
|
|
|
ggml_graph_reset (gf);
|
|
|
ggml_set_f32 (f->grad, 1.0f);
|
|
|
|
|
|
- ggml_graph_compute_with_ctx(ctx, gb, params->n_threads);
|
|
|
+ ggml_graph_compute(gb, cplan);
|
|
|
|
|
|
ggml_opt_get_grad(np, ps, g);
|
|
|
|
|
|
@@ -18732,7 +18726,9 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
|
|
struct ggml_opt_params params,
|
|
|
struct ggml_tensor * f,
|
|
|
struct ggml_cgraph * gf,
|
|
|
- struct ggml_cgraph * gb) {
|
|
|
+ struct ggml_cgraph * gb,
|
|
|
+ ggml_opt_callback callback,
|
|
|
+ void * callback_data) {
|
|
|
if (params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE ||
|
|
|
params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) {
|
|
|
if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1.f <= params.lbfgs.wolfe) {
|
|
|
@@ -18764,6 +18760,10 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
|
|
opt->iter = iter;
|
|
|
}
|
|
|
|
|
|
+ struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
|
|
|
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
|
|
|
+ cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
|
|
|
+
|
|
|
float * x = opt->lbfgs.x->data; // current parameters
|
|
|
float * xp = opt->lbfgs.xp->data; // previous parameters
|
|
|
float * g = opt->lbfgs.g->data; // current gradient
|
|
|
@@ -18785,6 +18785,12 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
|
|
float * lm_s = opt->lbfgs.lms->data;
|
|
|
float * lm_y = opt->lbfgs.lmy->data;
|
|
|
|
|
|
+ if (callback) {
|
|
|
+ // L-BFGS does not support learning rate -> ignore learning schedule
|
|
|
+ float sched = 0;
|
|
|
+ callback(callback_data, &sched);
|
|
|
+ }
|
|
|
+
|
|
|
// evaluate the function value and its gradient
|
|
|
{
|
|
|
ggml_opt_set_params(np, ps, x);
|
|
|
@@ -18792,11 +18798,14 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
|
|
ggml_graph_reset (gf);
|
|
|
ggml_set_f32 (f->grad, 1.0f);
|
|
|
|
|
|
- ggml_graph_compute_with_ctx(ctx, gb, params.n_threads);
|
|
|
+ ggml_graph_compute(gb, &cplan);
|
|
|
|
|
|
ggml_opt_get_grad(np, ps, g);
|
|
|
|
|
|
fx = ggml_get_f32_1d(f, 0);
|
|
|
+
|
|
|
+ opt->loss_before = fx;
|
|
|
+ opt->loss_after = fx;
|
|
|
}
|
|
|
|
|
|
// search direction = -gradient
|
|
|
@@ -18851,7 +18860,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
|
|
ggml_vec_cpy_f32(nx, xp, x);
|
|
|
ggml_vec_cpy_f32(nx, gp, g);
|
|
|
|
|
|
- ls = linesearch_backtracking(ctx, &params, nx, x, &fx, g, d, step, xp, f, gf, gb, np, ps);
|
|
|
+ ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gf, gb, &cplan, np, ps, callback, callback_data);
|
|
|
|
|
|
if (ls < 0) {
|
|
|
// linesearch failed - go back to the previous point and return
|
|
|
@@ -18861,6 +18870,8 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
|
|
return ls;
|
|
|
}
|
|
|
|
|
|
+ opt->loss_after = fx;
|
|
|
+
|
|
|
ggml_vec_norm_f32(nx, &xnorm, x);
|
|
|
ggml_vec_norm_f32(nx, &gnorm, g);
|
|
|
|
|
|
@@ -18918,7 +18929,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
|
|
// ys = y^t \cdot s -> 1 / \rho.
|
|
|
// yy = y^t \cdot y.
|
|
|
//
|
|
|
- ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0] *nx]);
|
|
|
+ ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0]*nx]);
|
|
|
ggml_vec_dot_f32(nx, &yy, &lm_y[end[0]*nx], &lm_y[end[0]*nx]);
|
|
|
|
|
|
lm_ys[end[0]] = ys;
|
|
|
@@ -18981,13 +18992,15 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
|
|
|
.adam = {
|
|
|
.n_iter = 10000,
|
|
|
.sched = 1.000f,
|
|
|
- .decay = 0.001f,
|
|
|
+ .decay = 0.0f,
|
|
|
+ .decay_min_ndim = 2,
|
|
|
.alpha = 0.001f,
|
|
|
.beta1 = 0.9f,
|
|
|
.beta2 = 0.999f,
|
|
|
.eps = 1e-8f,
|
|
|
.eps_f = 1e-5f,
|
|
|
.eps_g = 1e-3f,
|
|
|
+ .gclip = 0.0f,
|
|
|
},
|
|
|
};
|
|
|
} break;
|
|
|
@@ -19037,23 +19050,13 @@ GGML_API void ggml_opt_init(
|
|
|
switch (opt->params.type) {
|
|
|
case GGML_OPT_ADAM:
|
|
|
{
|
|
|
- opt->adam.x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
|
|
|
- opt->adam.g1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
|
|
|
- opt->adam.g2 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
|
|
|
opt->adam.m = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
|
|
|
opt->adam.v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
|
|
|
- opt->adam.mh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
|
|
|
- opt->adam.vh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
|
|
|
opt->adam.pf = params.past > 0
|
|
|
? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past)
|
|
|
: NULL;
|
|
|
- ggml_set_zero(opt->adam.x);
|
|
|
- ggml_set_zero(opt->adam.g1);
|
|
|
- ggml_set_zero(opt->adam.g2);
|
|
|
ggml_set_zero(opt->adam.m);
|
|
|
ggml_set_zero(opt->adam.v);
|
|
|
- ggml_set_zero(opt->adam.mh);
|
|
|
- ggml_set_zero(opt->adam.vh);
|
|
|
if (opt->adam.pf) {
|
|
|
ggml_set_zero(opt->adam.pf);
|
|
|
}
|
|
|
@@ -19137,7 +19140,7 @@ enum ggml_opt_result ggml_opt_resume(
|
|
|
*gf = ggml_build_forward (f);
|
|
|
*gb = ggml_build_backward(ctx, gf, true);
|
|
|
|
|
|
- return ggml_opt_resume_g(ctx, opt, f, gf, gb);
|
|
|
+ return ggml_opt_resume_g(ctx, opt, f, gf, gb, NULL, NULL);
|
|
|
}
|
|
|
|
|
|
enum ggml_opt_result ggml_opt_resume_g(
|
|
|
@@ -19145,7 +19148,9 @@ enum ggml_opt_result ggml_opt_resume_g(
|
|
|
struct ggml_opt_context * opt,
|
|
|
struct ggml_tensor * f,
|
|
|
struct ggml_cgraph * gf,
|
|
|
- struct ggml_cgraph * gb) {
|
|
|
+ struct ggml_cgraph * gb,
|
|
|
+ ggml_opt_callback callback,
|
|
|
+ void * callback_data) {
|
|
|
|
|
|
// build forward + backward compute graphs
|
|
|
enum ggml_opt_result result = GGML_OPT_OK;
|
|
|
@@ -19153,11 +19158,11 @@ enum ggml_opt_result ggml_opt_resume_g(
|
|
|
switch (opt->params.type) {
|
|
|
case GGML_OPT_ADAM:
|
|
|
{
|
|
|
- result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb);
|
|
|
+ result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
|
|
|
} break;
|
|
|
case GGML_OPT_LBFGS:
|
|
|
{
|
|
|
- result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb);
|
|
|
+ result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
|
|
|
} break;
|
|
|
}
|
|
|
|
|
|
@@ -19612,7 +19617,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|
|
|
|
|
// read the kv pairs
|
|
|
{
|
|
|
- ctx->kv = GGML_ALIGNED_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv));
|
|
|
+ ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
|
|
|
|
|
|
for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
|
|
|
struct gguf_kv * kv = &ctx->kv[i];
|
|
|
@@ -19695,7 +19700,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|
|
|
|
|
// read the tensor infos
|
|
|
{
|
|
|
- ctx->infos = GGML_ALIGNED_MALLOC(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
|
|
|
+ ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
|
|
|
|
|
|
for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
|
|
|
struct gguf_tensor_info * info = &ctx->infos[i];
|
|
|
@@ -19896,7 +19901,7 @@ void gguf_free(struct gguf_context * ctx) {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- GGML_ALIGNED_FREE(ctx->kv);
|
|
|
+ free(ctx->kv);
|
|
|
}
|
|
|
|
|
|
if (ctx->infos) {
|
|
|
@@ -19908,7 +19913,7 @@ void gguf_free(struct gguf_context * ctx) {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- GGML_ALIGNED_FREE(ctx->infos);
|
|
|
+ free(ctx->infos);
|
|
|
}
|
|
|
|
|
|
GGML_ALIGNED_FREE(ctx);
|