mtmd-audio.h 3.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113
  1. #pragma once
  2. #include "ggml.h"
  3. #include "clip-model.h"
  4. #include <cstdint>
  5. #include <vector>
  6. #include <string>
  7. #define MTMD_INTERNAL_HEADER
  // Mel spectrogram buffer exchanged between the audio preprocessors and their callers.
  //
  // All scalar fields are zero-initialized so that a default-constructed object is
  // in a well-defined "empty" state instead of carrying indeterminate values.
  // The struct remains an aggregate (C++14 and later), so brace-initialization
  // in declaration order keeps working.
  struct mtmd_audio_mel {
      int n_len     = 0; // number of frames held in `data` (presumably incl. padding - TODO confirm)
      int n_len_org = 0; // NOTE(review): looks like the frame count before padding - confirm in the .cpp
      int n_mel     = 0; // number of mel bins

      // flattened spectrogram values
      // NOTE(review): presumably n_mel * n_len floats - confirm layout against the producers
      std::vector<float> data;
  };
  // Mel filterbank coefficients together with the dimensions they were built for.
  //
  // The dimensions default to zero so that an unfilled filterbank is
  // distinguishable from a populated one and never exposes indeterminate values.
  // The struct remains an aggregate (C++14 and later).
  struct mtmd_audio_mel_filters {
      int32_t n_mel = 0; // number of mel bands
      int32_t n_fft = 0; // NOTE(review): FFT size or number of FFT bins - confirm against the code that fills it

      // flattened filter weights
      // NOTE(review): presumably n_mel * n_fft floats - confirm layout
      std::vector<float> data;
  };
  19. // cache for audio processing, each processor instance owns its own cache
  20. struct mtmd_audio_cache {
  21. std::vector<float> sin_vals;
  22. std::vector<float> cos_vals;
  23. std::vector<float> hann_window;
  24. mtmd_audio_mel_filters filters;
  25. void fill_sin_cos_table(int n);
  26. void fill_hann_window(int length, bool periodic);
  27. // Build mel filterbank matrix [n_mel × n_fft_bins] at runtime.
  28. // n_fft_bins must be (N_fft / 2 + 1). Example: if N_fft=512 -> n_fft_bins=257.
  29. void fill_mel_filterbank_matrix(int n_mel,
  30. int n_fft,
  31. int sample_rate, // e.g. 16000
  32. float fmin = 0.0f, // e.g. 0.0
  33. float fmax = -1.0f, // e.g. sr/2; pass -1 for auto
  34. bool slaney_area_norm = true,
  35. float scale = 1.0f // optional extra scaling
  36. );
  37. };
  38. struct mtmd_audio_preprocessor {
  39. const clip_hparams & hparams;
  40. mtmd_audio_preprocessor(const clip_ctx * ctx): hparams(*clip_get_hparams(ctx)) {}
  41. virtual ~mtmd_audio_preprocessor() = default;
  42. virtual void initialize() = 0; // NOT thread-safe
  43. virtual bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) = 0;
  44. };
  45. struct mtmd_audio_preprocessor_whisper : mtmd_audio_preprocessor {
  46. mtmd_audio_preprocessor_whisper(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
  47. void initialize() override;
  48. bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
  49. private:
  50. mtmd_audio_cache cache;
  51. };
  52. struct mtmd_audio_preprocessor_conformer : mtmd_audio_preprocessor {
  53. mtmd_audio_preprocessor_conformer(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
  54. void initialize() override;
  55. bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
  56. private:
  57. mtmd_audio_cache cache;
  58. };
  59. //
  60. // streaming ISTFT - converts spectrogram frames back to audio one frame at a time
  61. //
  62. struct mtmd_audio_streaming_istft {
  63. mtmd_audio_streaming_istft(int n_fft, int hop_length);
  64. // reset streaming state
  65. void reset();
  66. // process a single STFT frame (streaming)
  67. // frame_spectrum: [n_fft_bins x 2] interleaved real/imag
  68. // returns: up to hop_length samples
  69. std::vector<float> process_frame(const float * frame_spectrum);
  70. // flush remaining samples at end of stream
  71. std::vector<float> flush();
  72. private:
  73. int n_fft;
  74. int hop_length;
  75. int n_fft_bins;
  76. // Own cache for output processing
  77. mtmd_audio_cache cache;
  78. // Streaming state
  79. std::vector<float> overlap_buffer;
  80. std::vector<float> window_sum_buffer;
  81. int padding_to_remove;
  82. // Working buffers for IFFT
  83. std::vector<float> ifft_in;
  84. std::vector<float> ifft_out;
  85. };