// conformer.cpp

#include "models.h"

#include <cmath> // std::sqrt, used for the attention score scaling

ggml_cgraph * clip_graph_conformer::build() {
    const int n_frames = img.nx;
    const int n_pos = n_frames / 2;
    const int n_pos_embd = (((((n_frames + 1) / 2) + 1) / 2 + 1) / 2) * 2 - 1;
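    // n_pos_embd mirrors the time-axis subsampling of pre_encode below: three stride-2
    // convolutions, each mapping T frames to (T + 1) / 2, leave L output frames, and the
    // relative attention then needs 2*L - 1 position embeddings (offsets -(L-1) .. +(L-1))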
    GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos);

    ggml_tensor * pos_emb = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 512, n_pos_embd);
    ggml_set_name(pos_emb, "pos_emb");
    ggml_set_input(pos_emb);
    ggml_build_forward_expand(gf, pos_emb);

    ggml_tensor * inp = build_inp_raw(1);
    auto * cur = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
    // pre encode, conv subsampling
    {
        // layer.0 - conv2d
        cur = ggml_conv_2d(ctx0, model.pre_encode_conv_X_w[0], cur, 2, 2, 1, 1, 1, 1);
        cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[0]);
        cb(cur, "conformer.pre_encode.conv.{}", 0);

        // layer.1 - relu
        cur = ggml_relu_inplace(ctx0, cur);

        // layer.2 - conv2d dw
        cur = ggml_conv_2d_dw_direct(ctx0, model.pre_encode_conv_X_w[2], cur, 2, 2, 1, 1, 1, 1);
        cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[2]);
        cb(cur, "conformer.pre_encode.conv.{}", 2);

        // layer.3 - conv2d
        cur = ggml_conv_2d_direct(ctx0, model.pre_encode_conv_X_w[3], cur, 1, 1, 0, 0, 1, 1);
        cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[3]);
        cb(cur, "conformer.pre_encode.conv.{}", 3);

        // layer.4 - relu
        cur = ggml_relu_inplace(ctx0, cur);

        // layer.5 - conv2d dw
        cur = ggml_conv_2d_dw_direct(ctx0, model.pre_encode_conv_X_w[5], cur, 2, 2, 1, 1, 1, 1);
        cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[5]);
        cb(cur, "conformer.pre_encode.conv.{}", 5);

        // layer.6 - conv2d
        cur = ggml_conv_2d_direct(ctx0, model.pre_encode_conv_X_w[6], cur, 1, 1, 0, 0, 1, 1);
        cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[6]);
        cb(cur, "conformer.pre_encode.conv.{}", 6);

        // layer.7 - relu
        cur = ggml_relu_inplace(ctx0, cur);

        // flatten channel and frequency axis
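        // the permute groups the frequency and conv-channel axes together so the reshape collapses
        // them into a single feature vector per output frame, which the linear below projects to
        // the model dimension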
        cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 0, 2, 1, 3));
        cur = ggml_reshape_2d(ctx0, cur, cur->ne[0] * cur->ne[1], cur->ne[2]);

        // calculate out
        cur = ggml_mul_mat(ctx0, model.pre_encode_out_w, cur);
        cur = ggml_add(ctx0, cur, model.pre_encode_out_b);
        cb(cur, "conformer.pre_encode.out", -1);
    }

    // pos_emb
    cb(pos_emb, "pos_emb", -1);

    for (int il = 0; il < hparams.n_layer; il++) {
        const auto & layer = model.layers[il];

        auto * residual = cur;
        cb(cur, "layer.in", il);

        // feed_forward1
        cur = build_norm(cur, layer.ff_norm_w, layer.ff_norm_b, NORM_TYPE_NORMAL, 1e-5, il);
        cb(cur, "conformer.layers.{}.norm_feed_forward1", il);
        cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr,
                        layer.ff_down_w, layer.ff_down_b, FFN_SILU, il);
        cb(cur, "conformer.layers.{}.feed_forward1.linear2", il);
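        // Conformer sandwiches attention and conv between two Macaron-style half-step
        // feed-forward modules, so each FFN output joins the residual with a 0.5 scale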
        const auto fc_factor = 0.5f;
        residual = ggml_add(ctx0, residual, ggml_scale(ctx0, cur, fc_factor));

        // self-attention
        {
            cur = build_norm(residual, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, 1e-5, il);
            cb(cur, "conformer.layers.{}.norm_self_att", il);

            ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
            Qcur = ggml_add(ctx0, Qcur, layer.q_b);
            Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, Qcur->ne[1]);

            ggml_tensor * Q_bias_u = ggml_add(ctx0, Qcur, layer.pos_bias_u);
            Q_bias_u = ggml_permute(ctx0, Q_bias_u, 0, 2, 1, 3);

            ggml_tensor * Q_bias_v = ggml_add(ctx0, Qcur, layer.pos_bias_v);
            Q_bias_v = ggml_permute(ctx0, Q_bias_v, 0, 2, 1, 3);

            // TODO @ngxson : some cont can/should be removed when ggml_mul_mat supports these cases
            ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
            Kcur = ggml_add(ctx0, Kcur, layer.k_b);
            Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, Kcur->ne[1]);
            Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));

            ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
            Vcur = ggml_add(ctx0, Vcur, layer.v_b);
            Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, Vcur->ne[1]);
            Vcur = ggml_cont(ctx0, ggml_permute(ctx0, Vcur, 1, 2, 0, 3));

            // build_attn won't fit due to matrix_ac and matrix_bd separation
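            // Transformer-XL style relative attention: matrix_ac = (Q + pos_bias_u) @ K^T is the
            // content term, matrix_bd = (Q + pos_bias_v) @ pos^T is the position term; the two are
            // summed after the rel-shift below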
            ggml_tensor * matrix_ac = ggml_mul_mat(ctx0, Q_bias_u, Kcur);
            matrix_ac = ggml_cont(ctx0, ggml_permute(ctx0, matrix_ac, 1, 0, 2, 3));
            cb(matrix_ac, "conformer.layers.{}.self_attn.id3", il);

            auto * p = ggml_mul_mat(ctx0, layer.linear_pos_w, pos_emb);
            cb(p, "conformer.layers.{}.self_attn.linear_pos", il);
            p = ggml_reshape_3d(ctx0, p, d_head, n_head, p->ne[1]);
            p = ggml_permute(ctx0, p, 0, 2, 1, 3);

            auto * matrix_bd = ggml_mul_mat(ctx0, Q_bias_v, p);
            matrix_bd = ggml_cont(ctx0, ggml_permute(ctx0, matrix_bd, 1, 0, 2, 3));

            // rel shift
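            // the usual relative-shift trick: pad one extra column, roll it to the front, reshape
            // to (q_len, pos_len + 1, h) and view past the first row, so each query row's position
            // scores end up aligned with its own time step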
            {
                const auto pos_len = matrix_bd->ne[0];
                const auto q_len   = matrix_bd->ne[1];
                const auto h       = matrix_bd->ne[2];

                matrix_bd = ggml_pad(ctx0, matrix_bd, 1, 0, 0, 0);
                matrix_bd = ggml_roll(ctx0, matrix_bd, 1, 0, 0, 0);
                matrix_bd = ggml_reshape_3d(ctx0, matrix_bd, q_len, pos_len + 1, h);
                matrix_bd = ggml_view_3d(ctx0, matrix_bd, q_len, pos_len, h,
                                         matrix_bd->nb[1], matrix_bd->nb[2], matrix_bd->nb[0] * q_len);
                matrix_bd = ggml_cont_3d(ctx0, matrix_bd, pos_len, q_len, h);
            }

            matrix_bd = ggml_view_3d(ctx0, matrix_bd, matrix_ac->ne[0], matrix_bd->ne[1], matrix_bd->ne[2],
                                     matrix_bd->nb[1], matrix_bd->nb[2], 0);

            auto * scores = ggml_add(ctx0, matrix_ac, matrix_bd);
            scores = ggml_scale(ctx0, scores, 1.0f / std::sqrt(d_head));
            cb(scores, "conformer.layers.{}.self_attn.id0", il);

            ggml_tensor * attn = ggml_soft_max(ctx0, scores);

            ggml_tensor * x = ggml_mul_mat(ctx0, attn, Vcur);
            x = ggml_permute(ctx0, x, 2, 0, 1, 3);
            x = ggml_cont_2d(ctx0, x, x->ne[0] * x->ne[1], x->ne[2]);

            ggml_tensor * out = ggml_mul_mat(ctx0, layer.o_w, x);
            out = ggml_add(ctx0, out, layer.o_b);
            cb(out, "conformer.layers.{}.self_attn.linear_out", il);

            cur = out;
        }

        residual = ggml_add(ctx0, residual, cur);

        cur = build_norm(residual, layer.norm_conv_w, layer.norm_conv_b, NORM_TYPE_NORMAL, 1e-5, il);
        cb(cur, "conformer.layers.{}.norm_conv", il);

        // conv
        {
            auto * x = cur;
            x = ggml_mul_mat(ctx0, layer.conv_pw1_w, x);
            x = ggml_add(ctx0, x, layer.conv_pw1_b);
            cb(x, "conformer.layers.{}.conv.pointwise_conv1", il);

            // ggml_glu doesn't support sigmoid
            // TODO @ngxson : support this op in ggml
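            // manual GLU: pointwise_conv1 produced 2*d channels; the first d channels are gated
            // by the sigmoid of the last d channels, then the transpose puts the time axis first
            // so the depthwise conv below slides along time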
            {
                int64_t d = x->ne[0] / 2;
                ggml_tensor * gate = ggml_sigmoid(ctx0, ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], d * x->nb[0]));
                x = ggml_mul(ctx0, ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], 0), gate);
                x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
            }

            // use ggml_ssm_conv for f32 precision
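            // the pad/roll/pad sequence places 4 zeros on each side of the time axis, i.e. 'same'
            // padding (matching a depthwise kernel of width 9); ggml_ssm_conv then applies the
            // depthwise 1D convolution per channel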
            x = ggml_pad(ctx0, x, 4, 0, 0, 0);
            x = ggml_roll(ctx0, x, 4, 0, 0, 0);
            x = ggml_pad(ctx0, x, 4, 0, 0, 0);
            x = ggml_ssm_conv(ctx0, x, layer.conv_dw_w);
            x = ggml_add(ctx0, x, layer.conv_dw_b);

            x = ggml_add(ctx0, ggml_mul(ctx0, x, layer.conv_norm_w), layer.conv_norm_b);
            x = ggml_silu(ctx0, x);

            // pointwise_conv2
            x = ggml_mul_mat(ctx0, layer.conv_pw2_w, x);
            x = ggml_add(ctx0, x, layer.conv_pw2_b);

            cur = x;
        }

        residual = ggml_add(ctx0, residual, cur);

        cur = build_norm(residual, layer.ff_norm_1_w, layer.ff_norm_1_b, NORM_TYPE_NORMAL, 1e-5, il);
        cb(cur, "conformer.layers.{}.norm_feed_forward2", il);
        cur = build_ffn(cur, layer.ff_up_1_w, layer.ff_up_1_b, nullptr, nullptr,
                        layer.ff_down_1_w, layer.ff_down_1_b, FFN_SILU, il); // TODO(tarek): read activation for ffn from hparams
        cb(cur, "conformer.layers.{}.feed_forward2.linear2", il);

        residual = ggml_add(ctx0, residual, ggml_scale(ctx0, cur, fc_factor));
        cb(residual, "conformer.layers.{}.conv.id", il);

        cur = build_norm(residual, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, 1e-5, il);
        cb(cur, "conformer.layers.{}.norm_out", il);
    }

    // audio adapter
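    // the adapter is a LayerNorm (mm_0) followed by a two-layer GELU MLP (mm_1 -> mm_3) that
    // projects the conformer output to the embedding size expected by the language model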
    cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
    cb(cur, "audio_adapter.model.{}", 0);
    cur = build_ffn(cur, model.mm_1_w, model.mm_1_b, nullptr, nullptr, model.mm_3_w, model.mm_3_b, FFN_GELU_ERF, -1);
    cb(cur, "projected", -1);

    ggml_build_forward_expand(gf, cur);
    return gf;
}