whisper-enc.cpp

#include "models.h"

ggml_cgraph * clip_graph_whisper_enc::build() {
    const int n_frames = img.nx;
    const int n_pos    = n_frames / 2;
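    // the time dimension is halved because the second conv1d below uses
    // stride 2, so the transformer sees n_pos = n_frames / 2 positions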
    GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos);

    ggml_tensor * inp = build_inp_raw(1);
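    // single-channel raw input; for the audio path this should be the mel
    // spectrogram, with ne[0] = n_frames (time) and ne[1] = the mel bins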

    // conv1d block
    {
        // convolution + gelu
        ggml_tensor * cur = ggml_conv_1d_ph(ctx0, model.conv1d_1_w, inp, 1, 1);
        cur = ggml_add(ctx0, cur, model.conv1d_1_b);

        cur = ggml_gelu_erf(ctx0, cur);

        cur = ggml_conv_1d_ph(ctx0, model.conv1d_2_w, cur, 2, 1);
        cur = ggml_add(ctx0, cur, model.conv1d_2_b);

        cur = ggml_gelu_erf(ctx0, cur);

        // transpose
        inp = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
        cb(inp, "after_conv1d", -1);
    }
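    // after the conv block, inp is [n_embd, n_pos] in ggml order (ne[0] is the
    // embedding dim): one vector per downsampled time step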

    // sanity check (only check one layer, but it should be the same for all)
    GGML_ASSERT(model.layers[0].ln_1_w && model.layers[0].ln_1_b);
    GGML_ASSERT(model.layers[0].ln_2_w && model.layers[0].ln_2_b);
    GGML_ASSERT(model.layers[0].q_b);
    GGML_ASSERT(model.layers[0].v_b);
    GGML_ASSERT(!model.layers[0].k_b); // no bias for k

    GGML_ASSERT(model.post_ln_w && model.post_ln_b);

    ggml_tensor * pos_embd_selected = ggml_view_2d(
        ctx0, model.position_embeddings,
        model.position_embeddings->ne[0], n_pos,
        model.position_embeddings->nb[1], 0
    );
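    // the view takes only the first n_pos rows of the fixed-size position
    // embedding table, so inputs shorter than the maximum audio context reuse
    // a prefix of the table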
    ggml_tensor * cur = build_vit(
                            inp, n_pos,
                            NORM_TYPE_NORMAL,
                            hparams.ffn_op,
                            pos_embd_selected,
                            nullptr);
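    // build_vit is the shared transformer-encoder builder (the same helper the
    // vision graphs use, despite the name); it adds pos_embd_selected to the
    // input and runs the whisper encoder blocks over the n_pos positions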

    cb(cur, "after_transformer", -1);

    if (model.audio_has_stack_frames()) {
        // StackAudioFrames
        // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py
        int64_t stride     = n_embd * hparams.proj_stack_factor;
        int64_t padded_len = GGML_PAD(ggml_nelements(cur), stride);
        int64_t pad        = padded_len - ggml_nelements(cur);
        if (pad > 0) {
            cur = ggml_view_1d(ctx0, cur, ggml_nelements(cur), 0);
            cur = ggml_pad(ctx0, cur, pad, 0, 0, 0);
        }
        cur = ggml_view_2d(ctx0, cur, stride, padded_len / stride,
                           ggml_row_size(cur->type, stride), 0);
        cb(cur, "after_stacked", -1);
    }
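    // the stacking flattens the sequence, zero-pads it to a multiple of
    // n_embd * proj_stack_factor, then reinterprets it as 2D so that each new
    // token is proj_stack_factor consecutive frames concatenated along the
    // embedding dim. with hypothetical sizes n_embd = 1280 and
    // proj_stack_factor = 8: stride = 10240, so 100 frames (128000 elements)
    // pad up to 133120 elements and become 13 stacked tokens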

    if (proj_type == PROJECTOR_TYPE_ULTRAVOX) {
        // UltravoxProjector
        // pre-norm
        cur = ggml_rms_norm(ctx0, cur, 1e-6);
        cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);

        // ffn in
        cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);

        // swiglu
        // see SwiGLU in ultravox_model.py: it is the second half of the
        // projection that goes through silu, not the first half
        cur = ggml_swiglu_swapped(ctx0, cur);

        // mid-norm
        cur = ggml_rms_norm(ctx0, cur, 1e-6);
        cur = ggml_mul(ctx0, cur, model.mm_norm_mid_w);

        // ffn out
        cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);

    } else if (proj_type == PROJECTOR_TYPE_QWEN2A) {
        // projector: a single linear layer (weight + bias)
        cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur);
        cur = ggml_add(ctx0, cur, model.mm_fc_b);

    } else if (proj_type == PROJECTOR_TYPE_VOXTRAL) {
        // projector: a 2-layer MLP with gelu activation
        cur = build_ffn(cur,
            model.mm_1_w, model.mm_1_b,
            nullptr, nullptr,
            model.mm_2_w, model.mm_2_b,
            FFN_GELU_ERF,
            -1);
    } else {
        GGML_ABORT("%s: unknown projector type", __func__);
    }
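    // whichever branch ran, cur now holds one embedding per output audio
    // token, projected to the embedding size expected by the target LLM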

    cb(cur, "projected", -1);

    ggml_build_forward_expand(gf, cur);

    return gf;
}