| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113 |
- #pragma once
- #include "ggml.h"
- #include "clip-model.h"
- #include <cstdint>
- #include <vector>
- #include <string>
- #define MTMD_INTERNAL_HEADER
// Mel spectrogram buffer; this is the element type of the `output` vector
// filled by mtmd_audio_preprocessor::preprocess().
// The scalar fields default to 0 so that a default-constructed instance never
// exposes indeterminate values (the struct remains an aggregate in C++14 and later).
struct mtmd_audio_mel {
    int n_len     = 0; // presumably the number of frames held in `data` - confirm against the producer
    int n_len_org = 0; // presumably the frame count before any padding - confirm against the producer
    int n_mel     = 0; // presumably the number of mel bins per frame - confirm against the producer

    std::vector<float> data; // spectrogram values; the layout is defined by the producing code
};
// Mel filterbank storage.
// The dimensions default to 0 so that a default-constructed instance (including
// the one embedded in every mtmd_audio_cache) never carries indeterminate sizes.
struct mtmd_audio_mel_filters {
    int32_t n_mel = 0; // presumably the number of mel filters (rows) - confirm against the code that fills `data`
    int32_t n_fft = 0; // presumably the number of FFT bins per filter - confirm against the code that fills `data`

    std::vector<float> data; // filter coefficients; the layout is defined by the code that fills it
};
- // cache for audio processing, each processor instance owns its own cache
- struct mtmd_audio_cache {
- std::vector<float> sin_vals;
- std::vector<float> cos_vals;
- std::vector<float> hann_window;
- mtmd_audio_mel_filters filters;
- void fill_sin_cos_table(int n);
- void fill_hann_window(int length, bool periodic);
- // Build mel filterbank matrix [n_mel × n_fft_bins] at runtime.
- // n_fft_bins must be (N_fft / 2 + 1). Example: if N_fft=512 -> n_fft_bins=257.
- void fill_mel_filterbank_matrix(int n_mel,
- int n_fft,
- int sample_rate, // e.g. 16000
- float fmin = 0.0f, // e.g. 0.0
- float fmax = -1.0f, // e.g. sr/2; pass -1 for auto
- bool slaney_area_norm = true,
- float scale = 1.0f // optional extra scaling
- );
- };
- struct mtmd_audio_preprocessor {
- const clip_hparams & hparams;
- mtmd_audio_preprocessor(const clip_ctx * ctx): hparams(*clip_get_hparams(ctx)) {}
- virtual ~mtmd_audio_preprocessor() = default;
- virtual void initialize() = 0; // NOT thread-safe
- virtual bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) = 0;
- };
- struct mtmd_audio_preprocessor_whisper : mtmd_audio_preprocessor {
- mtmd_audio_preprocessor_whisper(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
- void initialize() override;
- bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
- private:
- mtmd_audio_cache cache;
- };
- struct mtmd_audio_preprocessor_conformer : mtmd_audio_preprocessor {
- mtmd_audio_preprocessor_conformer(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
- void initialize() override;
- bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
- private:
- mtmd_audio_cache cache;
- };
- //
- // streaming ISTFT - converts spectrogram frames back to audio one frame at a time
- //
- struct mtmd_audio_streaming_istft {
- mtmd_audio_streaming_istft(int n_fft, int hop_length);
- // reset streaming state
- void reset();
- // process a single STFT frame (streaming)
- // frame_spectrum: [n_fft_bins x 2] interleaved real/imag
- // returns: up to hop_length samples
- std::vector<float> process_frame(const float * frame_spectrum);
- // flush remaining samples at end of stream
- std::vector<float> flush();
- private:
- int n_fft;
- int hop_length;
- int n_fft_bins;
- // Own cache for output processing
- mtmd_audio_cache cache;
- // Streaming state
- std::vector<float> overlap_buffer;
- std::vector<float> window_sum_buffer;
- int padding_to_remove;
- // Working buffers for IFFT
- std::vector<float> ifft_in;
- std::vector<float> ifft_out;
- };
|