mtmd-audio.cpp 26 KB


  1. // fix problem with std::min and std::max
  2. #if defined(_WIN32)
  3. #define WIN32_LEAN_AND_MEAN
  4. #ifndef NOMINMAX
  5. # define NOMINMAX
  6. #endif
  7. #include <windows.h>
  8. #endif
  9. #include "mtmd-audio.h"
  10. //#define MTMD_AUDIO_DEBUG
  11. #define MINIAUDIO_IMPLEMENTATION
  12. #ifndef MTMD_AUDIO_DEBUG
  13. # define MA_NO_ENCODING
  14. #endif
  15. #define MA_NO_DEVICE_IO
  16. #define MA_NO_RESOURCE_MANAGER
  17. #define MA_NO_NODE_GRAPH
  18. #define MA_NO_ENGINE
  19. #define MA_NO_GENERATION
  20. #define MA_API static
  21. #include "miniaudio.h"
  22. #define _USE_MATH_DEFINES // for M_PI
  23. #include <cmath>
  24. #include <cstdint>
  25. #include <cstring>
  26. #include <thread>
  27. #include <vector>
  28. #include <fstream>
  29. #include <algorithm>
  30. // most of the code here is copied from whisper.cpp
  31. // align x to upper multiple of n
  32. #define _ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
  33. namespace whisper_preprocessor {
  34. #define SIN_COS_N_COUNT WHISPER_N_FFT
  35. namespace {
  36. struct whisper_global_cache {
  37. // In FFT, we frequently use sine and cosine operations with the same values.
  38. // We can use precalculated values to speed up the process.
  39. float sin_vals[SIN_COS_N_COUNT];
  40. float cos_vals[SIN_COS_N_COUNT];
  41. // Hann window (Use cosf to eliminate difference)
  42. // ref: https://pytorch.org/docs/stable/generated/torch.hann_window.html
  43. // ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L147
  44. float hann_window[WHISPER_N_FFT];
  45. whisper_global_cache() {
  46. fill_sin_cos_table();
  47. fill_hann_window(sizeof(hann_window)/sizeof(hann_window[0]), true, hann_window);
  48. }
  49. void fill_sin_cos_table() {
  50. for (int i = 0; i < SIN_COS_N_COUNT; i++) {
  51. double theta = (2 * M_PI * i) / SIN_COS_N_COUNT;
  52. sin_vals[i] = sinf(theta);
  53. cos_vals[i] = cosf(theta);
  54. }
  55. }
  56. void fill_hann_window(int length, bool periodic, float * output) {
  57. int offset = -1;
  58. if (periodic) {
  59. offset = 0;
  60. }
  61. for (int i = 0; i < length; i++) {
  62. output[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset)));
  63. }
  64. }
  65. } global_cache;
  66. }
  67. // naive Discrete Fourier Transform
  68. // input is real-valued
  69. // output is complex-valued
  70. static void dft(const float* in, int N, float* out) {
  71. const int sin_cos_step = SIN_COS_N_COUNT / N;
  72. for (int k = 0; k < N; k++) {
  73. float re = 0;
  74. float im = 0;
  75. for (int n = 0; n < N; n++) {
  76. int idx = (k * n * sin_cos_step) % (SIN_COS_N_COUNT); // t = 2*M_PI*k*n/N
  77. re += in[n]*global_cache.cos_vals[idx]; // cos(t)
  78. im -= in[n]*global_cache.sin_vals[idx]; // sin(t)
  79. }
  80. out[k*2 + 0] = re;
  81. out[k*2 + 1] = im;
  82. }
  83. }
  84. // Cooley-Tukey FFT
  85. // poor man's implementation - use something better
  86. // input is real-valued
  87. // output is complex-valued
  88. static void fft(float* in, int N, float* out) {
  89. if (N == 1) {
  90. out[0] = in[0];
  91. out[1] = 0;
  92. return;
  93. }
  94. const int half_N = N / 2;
  95. if (N - half_N*2 == 1) {
  96. dft(in, N, out);
  97. return;
  98. }
  99. float* even = in + N;
  100. for (int i = 0; i < half_N; ++i) {
  101. even[i]= in[2*i];
  102. }
  103. float* even_fft = out + 2 * N;
  104. fft(even, half_N, even_fft);
  105. float* odd = even;
  106. for (int i = 0; i < half_N; ++i) {
  107. odd[i] = in[2*i + 1];
  108. }
  109. float* odd_fft = even_fft + N;
  110. fft(odd, half_N, odd_fft);
  111. const int sin_cos_step = SIN_COS_N_COUNT / N;
  112. for (int k = 0; k < half_N; k++) {
  113. int idx = k * sin_cos_step; // t = 2*M_PI*k/N
  114. float re = global_cache.cos_vals[idx]; // cos(t)
  115. float im = -global_cache.sin_vals[idx]; // sin(t)
  116. float re_odd = odd_fft[2*k + 0];
  117. float im_odd = odd_fft[2*k + 1];
  118. out[2*k + 0] = even_fft[2*k + 0] + re*re_odd - im*im_odd;
  119. out[2*k + 1] = even_fft[2*k + 1] + re*im_odd + im*re_odd;
  120. out[2*(k + half_N) + 0] = even_fft[2*k + 0] - re*re_odd + im*im_odd;
  121. out[2*(k + half_N) + 1] = even_fft[2*k + 1] - re*im_odd - im*re_odd;
  122. }
  123. }
  124. static void log_mel_spectrogram_worker_thread(int ith, const float * hann, const std::vector<float> & samples,
  125. int n_samples, int frame_size, int frame_step, int n_threads,
  126. const whisper_filters & filters, whisper_mel & mel) {
  127. std::vector<float> fft_in(frame_size * 2, 0.0);
  128. std::vector<float> fft_out(frame_size * 2 * 2 * 2);
  129. int n_fft = filters.n_fft;
  130. int i = ith;
  131. // make sure n_fft == 1 + (WHISPER_N_FFT / 2), bin_0 to bin_nyquist
  132. WHISPER_ASSERT(n_fft == 1 + (frame_size / 2));
  133. // calculate FFT only when fft_in are not all zero
  134. for (; i < std::min(n_samples / frame_step + 1, mel.n_len); i += n_threads) {
  135. const int offset = i * frame_step;
  136. // apply Hann window (~10% faster)
  137. for (int j = 0; j < std::min(frame_size, n_samples - offset); j++) {
  138. fft_in[j] = hann[j] * samples[offset + j];
  139. }
  140. // fill the rest with zeros
  141. if (n_samples - offset < frame_size) {
  142. std::fill(fft_in.begin() + (n_samples - offset), fft_in.end(), 0.0);
  143. }
  144. // FFT
  145. fft(fft_in.data(), frame_size, fft_out.data());
  146. // Calculate modulus^2 of complex numbers
  147. // Use pow(fft_out[2 * j + 0], 2) + pow(fft_out[2 * j + 1], 2) causes inference quality problem? Interesting.
  148. for (int j = 0; j < n_fft; j++) {
  149. fft_out[j] = (fft_out[2 * j + 0] * fft_out[2 * j + 0] + fft_out[2 * j + 1] * fft_out[2 * j + 1]);
  150. }
  151. // mel spectrogram
  152. for (int j = 0; j < mel.n_mel; j++) {
  153. double sum = 0.0;
  154. // unroll loop (suggested by GH user @lunixbochs)
  155. int k = 0;
  156. for (k = 0; k < n_fft - 3; k += 4) {
  157. sum +=
  158. fft_out[k + 0] * filters.data[j * n_fft + k + 0] +
  159. fft_out[k + 1] * filters.data[j * n_fft + k + 1] +
  160. fft_out[k + 2] * filters.data[j * n_fft + k + 2] +
  161. fft_out[k + 3] * filters.data[j * n_fft + k + 3];
  162. }
  163. // handle n_fft remainder
  164. for (; k < n_fft; k++) {
  165. sum += fft_out[k] * filters.data[j * n_fft + k];
  166. }
  167. sum = log10(std::max(sum, 1e-10));
  168. mel.data[j * mel.n_len + i] = sum;
  169. }
  170. }
  171. // Otherwise fft_out are all zero
  172. double sum = log10(1e-10);
  173. for (; i < mel.n_len; i += n_threads) {
  174. for (int j = 0; j < mel.n_mel; j++) {
  175. mel.data[j * mel.n_len + i] = sum;
  176. }
  177. }
  178. }
  179. // ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L110-L157
  180. static bool log_mel_spectrogram(
  181. const float * samples,
  182. const int n_samples,
  183. const int /*sample_rate*/,
  184. const int frame_size,
  185. const int frame_step,
  186. const int n_mel,
  187. const int n_threads,
  188. const whisper_filters & filters,
  189. const bool debug,
  190. whisper_mel & mel) {
  191. //const int64_t t_start_us = ggml_time_us();
  192. // Hann window
  193. WHISPER_ASSERT(frame_size == WHISPER_N_FFT && "Unsupported frame_size");
  194. const float * hann = global_cache.hann_window;
  195. // Calculate the length of padding
  196. int64_t stage_1_pad = WHISPER_SAMPLE_RATE * 30;
  197. int64_t stage_2_pad = frame_size / 2;
  198. // Initialize a vector and copy data from C array to it.
  199. std::vector<float> samples_padded;
  200. samples_padded.resize(n_samples + stage_1_pad + stage_2_pad * 2);
  201. std::copy(samples, samples + n_samples, samples_padded.begin() + stage_2_pad);
  202. // pad 30 seconds of zeros at the end of audio (480,000 samples) + reflective pad 200 samples at the end of audio
  203. std::fill(samples_padded.begin() + n_samples + stage_2_pad, samples_padded.begin() + n_samples + stage_1_pad + 2 * stage_2_pad, 0);
  204. // reflective pad 200 samples at the beginning of audio
  205. std::reverse_copy(samples + 1, samples + 1 + stage_2_pad, samples_padded.begin());
  206. mel.n_mel = n_mel;
  207. // https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/SpectralOps.cpp#L936
  208. // Calculate number of frames + remove the last frame
  209. mel.n_len = (samples_padded.size() - frame_size) / frame_step;
  210. // Calculate semi-padded sample length to ensure compatibility
  211. mel.n_len_org = 1 + (n_samples + stage_2_pad - frame_size) / frame_step;
  212. mel.data.resize(mel.n_mel * mel.n_len);
  213. {
  214. std::vector<std::thread> workers(n_threads - 1);
  215. for (int iw = 0; iw < n_threads - 1; ++iw) {
  216. workers[iw] = std::thread(
  217. log_mel_spectrogram_worker_thread, iw + 1, hann, std::cref(samples_padded),
  218. n_samples + stage_2_pad, frame_size, frame_step, n_threads,
  219. std::cref(filters), std::ref(mel));
  220. }
  221. // main thread
  222. log_mel_spectrogram_worker_thread(0, hann, samples_padded, n_samples + stage_2_pad, frame_size, frame_step, n_threads, filters, mel);
  223. for (int iw = 0; iw < n_threads - 1; ++iw) {
  224. workers[iw].join();
  225. }
  226. }
  227. // clamping and normalization
  228. double mmax = -1e20;
  229. for (int i = 0; i < mel.n_mel*mel.n_len; i++) {
  230. if (mel.data[i] > mmax) {
  231. mmax = mel.data[i];
  232. }
  233. }
  234. mmax -= 8.0;
  235. for (int i = 0; i < mel.n_mel*mel.n_len; i++) {
  236. if (mel.data[i] < mmax) {
  237. mel.data[i] = mmax;
  238. }
  239. mel.data[i] = (mel.data[i] + 4.0)/4.0;
  240. }
  241. // Dump log_mel_spectrogram
  242. if (debug) {
  243. std::ofstream outFile("log_mel_spectrogram.json");
  244. outFile << "[";
  245. for (uint64_t i = 0; i < mel.data.size() - 1; i++) {
  246. outFile << mel.data[i] << ", ";
  247. }
  248. outFile << mel.data[mel.data.size() - 1] << "]";
  249. outFile.close();
  250. }
  251. return true;
  252. }
  253. bool preprocess_audio(
  254. const float * samples,
  255. size_t n_samples,
  256. const whisper_filters & filters,
  257. std::vector<whisper_mel> & output) {
  258. if (n_samples == 0) {
  259. // empty audio
  260. return false;
  261. }
  262. whisper_mel out_full;
  263. bool ok = log_mel_spectrogram(
  264. samples,
  265. n_samples,
  266. COMMON_SAMPLE_RATE,
  267. WHISPER_N_FFT,
  268. WHISPER_HOP_LENGTH,
  269. filters.n_mel,
  270. 4, // n_threads
  271. filters,
  272. false, // debug
  273. out_full);
  274. if (!ok) {
  275. return false;
  276. }
  277. // because the cgraph in clip.cpp only accepts 3000 frames each, we need to split the mel
  278. // we always expect the mel to have 3000 silent frames at the end
  279. // printf("n_len %d\n", out_full.n_len);
  280. const size_t frames_per_chunk = 3000;
  281. GGML_ASSERT((size_t)out_full.n_len > frames_per_chunk);
  282. for (size_t off = 0; off < (size_t)out_full.n_len; off += frames_per_chunk) {
  283. int n_len = std::min(frames_per_chunk, (size_t)out_full.n_len - off);
  284. if ((size_t)n_len < frames_per_chunk) {
  285. break; // last uncomplete chunk will always be a padded chunk, safe to ignore
  286. }
  287. whisper_mel out_chunk;
  288. out_chunk.n_len = n_len;
  289. out_chunk.n_mel = out_full.n_mel;
  290. out_chunk.n_len_org = out_full.n_mel; // unused
  291. out_chunk.data.reserve(out_chunk.n_mel * out_chunk.n_len);
  292. for (int i = 0; i < out_full.n_mel; i++) {
  293. auto src = out_full.data.begin() + i*out_full.n_len + off;
  294. out_chunk.data.insert(out_chunk.data.end(), src, src + frames_per_chunk);
  295. }
  296. output.push_back(std::move(out_chunk));
  297. }
  298. return true;
  299. }
  300. } // namespace whisper_preprocessor
  301. namespace audio_helpers {
  302. bool is_audio_file(const char * buf, size_t len) {
  303. if (len < 12) {
  304. return false;
  305. }
  306. // RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
  307. // WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
  308. bool is_wav = memcmp(buf, "RIFF", 4) == 0 && memcmp(buf + 8, "WAVE", 4) == 0;
  309. bool is_mp3 = len >= 3 && (
  310. memcmp(buf, "ID3", 3) == 0 ||
  311. // Check for MPEG sync word (simplified check)
  312. ((unsigned char)buf[0] == 0xFF && ((unsigned char)buf[1] & 0xE0) == 0xE0)
  313. );
  314. bool is_flac = memcmp(buf, "fLaC", 4) == 0;
  315. return is_wav || is_mp3 || is_flac;
  316. }
  317. // returns true if the buffer is a valid audio file
  318. bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int target_sampler_rate, std::vector<float> & pcmf32_mono) {
  319. ma_result result;
  320. const int channels = 1;
  321. ma_decoder_config decoder_config = ma_decoder_config_init(ma_format_f32, channels, target_sampler_rate);
  322. ma_decoder decoder;
  323. result = ma_decoder_init_memory(buf_in, len, &decoder_config, &decoder);
  324. if (result != MA_SUCCESS) {
  325. return false;
  326. }
  327. ma_uint64 frame_count;
  328. ma_uint64 frames_read;
  329. result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count);
  330. if (result != MA_SUCCESS) {
  331. ma_decoder_uninit(&decoder);
  332. return false;
  333. }
  334. pcmf32_mono.resize(frame_count);
  335. result = ma_decoder_read_pcm_frames(&decoder, pcmf32_mono.data(), frame_count, &frames_read);
  336. if (result != MA_SUCCESS) {
  337. ma_decoder_uninit(&decoder);
  338. return false;
  339. }
  340. #ifdef MTMD_AUDIO_DEBUG
  341. // save audio to wav file
  342. ma_encoder_config config = ma_encoder_config_init(ma_encoding_format_wav, ma_format_f32, 1, target_sampler_rate);
  343. ma_encoder encoder;
  344. ma_encoder_init_file("output.wav", &config, &encoder);
  345. ma_encoder_write_pcm_frames(&encoder, pcmf32_mono.data(), pcmf32_mono.size(), &frames_read);
  346. ma_encoder_uninit(&encoder);
  347. #endif
  348. ma_decoder_uninit(&decoder);
  349. return true;
  350. }
} // namespace audio_helpers
// precalculated mel filter banks
// values are multiplied by 1000.0 to save space, and are divided by 1000.0 at the end of the function
//
// generated from python code:
//
// from numpy import load
// data = load('mel_filters.npz')
// lst = data.files
// for item in lst:
//     print(item)
//     print(data[item].shape)
//     n_mel = data[item].shape[0]
//     n_fft = data[item].shape[1]
//     for i, row in enumerate(data[item]):
//         for j, val in enumerate(row):
//             val = val * 1000.0
//             if val != 0:
//                 print(f"data[{i*n_fft + j}] = {val:.6f};")
  370. namespace whisper_precalc_filters {
  371. whisper_preprocessor::whisper_filters get_128_bins() {
  372. whisper_preprocessor::whisper_filters filters;
  373. filters.n_mel = 128;
  374. filters.n_fft = 201;
  375. std::vector data(filters.n_mel * filters.n_fft, 0.0f);
  376. data[1] = 12.37398665;
  377. data[202] = 30.39256483;
  378. data[404] = 24.74797331;
  379. data[605] = 18.01857911;
  380. data[807] = 37.12195903;
  381. data[1008] = 5.64459199;
  382. data[1009] = 6.72939420;
  383. data[1210] = 36.03715822;
  384. data[1412] = 19.10337992;
  385. data[1613] = 23.66316877;
  386. data[1815] = 31.47736564;
  387. data[2016] = 11.28918398;
  388. data[2017] = 1.08480197;
  389. data[2218] = 41.68175161;
  390. data[2420] = 13.45878839;
  391. data[2621] = 29.30776216;
  392. data[2823] = 25.83277412;
  393. data[3024] = 16.93377644;
  394. data[3226] = 38.20675984;
  395. data[3427] = 4.55979025;
  396. data[3428] = 7.81419594;
  397. data[3629] = 34.95235741;
  398. data[3831] = 20.18818259;
  399. data[4032] = 22.57836796;
  400. data[4234] = 32.56217018;
  401. data[4435] = 10.20438317;
  402. data[4436] = 2.16960395;
  403. data[4637] = 40.59694707;
  404. data[4839] = 14.54358920;
  405. data[5040] = 28.22295949;
  406. data[5242] = 26.91757679;
  407. data[5443] = 15.84897563;
  408. data[5645] = 39.29156065;
  409. data[5846] = 3.47498828;
  410. data[5847] = 8.89899861;
  411. data[6048] = 33.86755288;
  412. data[6250] = 21.27298526;
  413. data[6451] = 21.49356715;
  414. data[6653] = 33.64697099;
  415. data[6854] = 9.11958050;
  416. data[6855] = 3.25440569;
  417. data[7056] = 39.51214626;
  418. data[7258] = 15.62839188;
  419. data[7459] = 27.13815868;
  420. data[7661] = 28.00237760;
  421. data[7862] = 14.76417296;
  422. data[8064] = 40.37636518;
  423. data[8265] = 2.38068704;
  424. data[8266] = 10.20263787;
  425. data[8467] = 31.61146119;
  426. data[8669] = 24.54700135;
  427. data[8870] = 15.32919332;
  428. data[8871] = 1.66583748;
  429. data[9072] = 36.72905266;
  430. data[9274] = 20.09709924;
  431. data[9475] = 16.93102531;
  432. data[9476] = 2.90265540;
  433. data[9677] = 32.84499049;
  434. data[9879] = 23.52004871;
  435. data[10080] = 11.03894413;
  436. data[10081] = 10.72582975;
  437. data[10282] = 22.71829173;
  438. data[10484] = 32.27872774;
  439. data[10685] = 0.11626833;
  440. data[10686] = 22.85348251;
  441. data[10887] = 8.56344029;
  442. data[10888] = 14.97978810;
  443. data[11089] = 15.51398356;
  444. data[11090] = 8.51490628;
  445. data[11291] = 21.10680379;
  446. data[11292] = 3.32652032;
  447. data[11493] = 25.47064796;
  448. data[11695] = 27.35907957;
  449. data[11896] = 0.65853616;
  450. data[11897] = 23.83812517;
  451. data[12098] = 3.44359246;
  452. data[12099] = 21.22455277;
  453. data[12300] = 5.35842171;
  454. data[12301] = 19.42555793;
  455. data[12502] = 6.49324711;
  456. data[12503] = 18.35542172;
  457. data[12704] = 6.93138083;
  458. data[12705] = 17.93504693;
  459. data[12906] = 6.74968259;
  460. data[12907] = 18.09151843;
  461. data[13108] = 6.01899112;
  462. data[13109] = 18.75767298;
  463. data[13310] = 4.80452832;
  464. data[13311] = 19.87172849;
  465. data[13512] = 3.16627859;
  466. data[13513] = 21.37690969;
  467. data[13514] = 1.25317345;
  468. data[13714] = 1.15934468;
  469. data[13715] = 20.80361731;
  470. data[13716] = 4.04486805;
  471. data[13917] = 17.55363122;
  472. data[13918] = 7.08320038;
  473. data[14119] = 14.07538634;
  474. data[14120] = 10.32655034;
  475. data[14321] = 10.40921453;
  476. data[14322] = 13.73696327;
  477. data[14523] = 6.59187697;
  478. data[14524] = 17.27988198;
  479. data[14525] = 1.46804214;
  480. data[14725] = 2.65681883;
  481. data[14726] = 18.09193194;
  482. data[14727] = 5.85655728;
  483. data[14928] = 13.34277913;
  484. data[14929] = 10.28267574;
  485. data[15130] = 8.56800377;
  486. data[15131] = 14.72230814;
  487. data[15132] = 1.04039861;
  488. data[15332] = 3.79085587;
  489. data[15333] = 17.14678481;
  490. data[15334] = 6.11609267;
  491. data[15535] = 11.75929047;
  492. data[15536] = 11.13393717;
  493. data[15737] = 6.43857848;
  494. data[15738] = 16.07806236;
  495. data[15739] = 4.23917221;
  496. data[15939] = 1.19989377;
  497. data[15940] = 12.75671553;
  498. data[15941] = 9.65298992;
  499. data[16142] = 7.06935255;
  500. data[16143] = 14.94054683;
  501. data[16144] = 4.19024844;
  502. data[16344] = 1.51483389;
  503. data[16345] = 12.00899947;
  504. data[16346] = 9.84823331;
  505. data[16547] = 6.10224018;
  506. data[16548] = 15.33857174;
  507. data[16549] = 5.57676842;
  508. data[16749] = 0.36827257;
  509. data[16750] = 9.89749376;
  510. data[16751] = 11.35340426;
  511. data[16752] = 2.05122307;
  512. data[16952] = 3.89297144;
  513. data[16953] = 12.97352277;
  514. data[16954] = 8.06631614;
  515. data[17155] = 6.74493238;
  516. data[17156] = 13.85874674;
  517. data[17157] = 5.41190524;
  518. data[17357] = 0.74220158;
  519. data[17358] = 8.98779090;
  520. data[17359] = 11.37871388;
  521. data[17360] = 3.32958088;
  522. data[17560] = 2.82313535;
  523. data[17561] = 10.68049297;
  524. data[17562] = 9.43340641;
  525. data[17563] = 1.76325557;
  526. data[17763] = 4.39018616;
  527. data[17764] = 11.87758986;
  528. data[17765] = 7.97005836;
  529. data[17766] = 0.66104700;
  530. data[17966] = 5.49466675;
  531. data[17967] = 12.62953598;
  532. data[17968] = 6.93987962;
  533. data[18169] = 6.18401915;
  534. data[18170] = 12.93473132;
  535. data[18171] = 6.29778765;
  536. data[18371] = 0.02325210;
  537. data[18372] = 6.50206627;
  538. data[18373] = 12.32661773;
  539. data[18374] = 6.00216538;
  540. data[18574] = 0.31548753;
  541. data[18575] = 6.48925547;
  542. data[18576] = 12.04130240;
  543. data[18577] = 6.01462880;
  544. data[18777] = 0.29979556;
  545. data[18778] = 6.18288014;
  546. data[18779] = 12.04272825;
  547. data[18780] = 6.29981188;
  548. data[18781] = 0.55689598;
  549. data[18980] = 0.01120471;
  550. data[18981] = 5.61729167;
  551. data[18982] = 11.22337859;
  552. data[18983] = 6.82516303;
  553. data[18984] = 1.35264499;
  554. data[19184] = 4.82410006;
  555. data[19185] = 10.16623247;
  556. data[19186] = 7.56075513;
  557. data[19187] = 2.34590308;
  558. data[19387] = 3.83235747;
  559. data[19388] = 8.92296247;
  560. data[19389] = 8.47910438;
  561. data[19390] = 3.50978645;
  562. data[19590] = 2.66873185;
  563. data[19591] = 7.51965167;
  564. data[19592] = 9.55500547;
  565. data[19593] = 4.81966138;
  566. data[19594] = 0.08431751;
  567. data[19793] = 1.35767367;
  568. data[19794] = 5.98019501;
  569. data[19795] = 10.60271543;
  570. data[19796] = 6.25298498;
  571. data[19797] = 1.74059917;
  572. data[19997] = 4.32644226;
  573. data[19998] = 8.73131864;
  574. data[19999] = 7.78916525;
  575. data[20000] = 3.48923868;
  576. data[20200] = 2.57835095;
  577. data[20201] = 6.77582854;
  578. data[20202] = 9.40941647;
  579. data[20203] = 5.31194592;
  580. data[20204] = 1.21447595;
  581. data[20403] = 0.75411191;
  582. data[20404] = 4.75395704;
  583. data[20405] = 8.75380263;
  584. data[20406] = 7.19209015;
  585. data[20407] = 3.28754401;
  586. data[20607] = 2.68179690;
  587. data[20608] = 6.49331464;
  588. data[20609] = 9.11457930;
  589. data[20610] = 5.39387390;
  590. data[20611] = 1.67316827;
  591. data[20810] = 0.57394296;
  592. data[20811] = 4.20600036;
  593. data[20812] = 7.83805829;
  594. data[20813] = 7.52023002;
  595. data[20814] = 3.97470826;
  596. data[20815] = 0.42918732;
  597. data[21014] = 1.90464477;
  598. data[21015] = 5.36569161;
  599. data[21016] = 8.82673822;
  600. data[21017] = 6.27609482;
  601. data[21018] = 2.89750961;
  602. data[21218] = 2.89885257;
  603. data[21219] = 6.19694078;
  604. data[21220] = 8.56699049;
  605. data[21221] = 5.34748193;
  606. data[21222] = 2.12797290;
  607. data[21421] = 0.44750227;
  608. data[21422] = 3.59030394;
  609. data[21423] = 6.73310598;
  610. data[21424] = 7.77023612;
  611. data[21425] = 4.70231380;
  612. data[21426] = 1.63439126;
  613. data[21625] = 1.01536023;
  614. data[21626] = 4.01018746;
  615. data[21627] = 7.00501446;
  616. data[21628] = 7.23442994;
  617. data[21629] = 4.31095669;
  618. data[21630] = 1.38748321;
  619. data[21829] = 1.33348850;
  620. data[21830] = 4.18730825;
  621. data[21831] = 7.04112789;
  622. data[21832] = 6.93188375;
  623. data[21833] = 4.14605811;
  624. data[21834] = 1.36023236;
  625. data[22033] = 1.42879714;
  626. data[22034] = 4.14824858;
  627. data[22035] = 6.86769979;
  628. data[22036] = 6.83705276;
  629. data[22037] = 4.18239459;
  630. data[22038] = 1.52773573;
  631. data[22237] = 1.32610439;
  632. data[22238] = 3.91751388;
  633. data[22239] = 6.50892360;
  634. data[22240] = 6.92639686;
  635. data[22241] = 4.39672917;
  636. data[22242] = 1.86706171;
  637. data[22441] = 1.04827771;
  638. data[22442] = 3.51767405;
  639. data[22443] = 5.98707050;
  640. data[22444] = 7.17824046;
  641. data[22445] = 4.76767914;
  642. data[22446] = 2.35711760;
  643. data[22645] = 0.61636406;
  644. data[22646] = 2.96949223;
  645. data[22647] = 5.32262027;
  646. data[22648] = 7.57265091;
  647. data[22649] = 5.27558755;
  648. data[22650] = 2.97852419;
  649. data[22651] = 0.68146095;
  650. data[22849] = 0.04971400;
  651. data[22850] = 2.29204819;
  652. data[22851] = 4.53438237;
  653. data[22852] = 6.77671656;
  654. data[22853] = 5.90240723;
  655. data[22854] = 3.71349836;
  656. data[22855] = 1.52458926;
  657. data[23054] = 1.50285335;
  658. data[23055] = 3.63961048;
  659. data[23056] = 5.77636715;
  660. data[23057] = 6.63159089;
  661. data[23058] = 4.54574358;
  662. data[23059] = 2.45989650;
  663. data[23060] = 0.37404924;
  664. data[23258] = 0.61795861;
  665. data[23259] = 2.65410915;
  666. data[23260] = 4.69025923;
  667. data[23261] = 6.72641024;
  668. data[23262] = 5.46034705;
  669. data[23263] = 3.47270933;
  670. data[23264] = 1.48507138;
  671. data[23463] = 1.59233576;
  672. data[23464] = 3.53261665;
  673. data[23465] = 5.47289755;
  674. data[23466] = 6.44368259;
  675. data[23467] = 4.54962999;
  676. data[23468] = 2.65557761;
  677. data[23469] = 0.76152512;
  678. data[23667] = 0.46749352;
  679. data[23668] = 2.31641904;
  680. data[23669] = 4.16534441;
  681. data[23670] = 6.01426978;
  682. data[23671] = 5.67844696;
  683. data[23672] = 3.87357362;
  684. data[23673] = 2.06870004;
  685. data[23674] = 0.26382666;
  686. data[23872] = 1.05349103;
  687. data[23873] = 2.81536230;
  688. data[23874] = 4.57723346;
  689. data[23875] = 6.33910485;
  690. data[23876] = 5.12815686;
  691. data[23877] = 3.40826320;
  692. data[23878] = 1.68837002;
  693. data[24077] = 1.43350090;
  694. data[24078] = 3.11241671;
  695. data[24079] = 4.79133241;
  696. data[24080] = 6.40943693;
  697. data[24081] = 4.77052201;
  698. data[24082] = 3.13160778;
  699. data[24083] = 1.49269309;
  700. data[24281] = 0.02932359;
  701. data[24282] = 1.62918994;
  702. data[24283] = 3.22905602;
  703. data[24284] = 4.82892245;
  704. data[24285] = 6.14671456;
  705. data[24286] = 4.58496623;
  706. data[24287] = 3.02321767;
  707. data[24288] = 1.46146910;
  708. data[24486] = 0.13601698;
  709. data[24487] = 1.66055572;
  710. data[24488] = 3.18509457;
  711. data[24489] = 4.70963307;
  712. data[24490] = 6.04072399;
  713. data[24491] = 4.55250870;
  714. data[24492] = 3.06429295;
  715. data[24493] = 1.57607743;
  716. data[24494] = 0.08786193;
  717. data[24691] = 0.09328097;
  718. data[24692] = 1.54603878;
  719. data[24693] = 2.99879676;
  720. data[24694] = 4.45155473;
  721. data[24695] = 5.90431225;
  722. data[24696] = 4.65566106;
  723. data[24697] = 3.23751615;
  724. data[24698] = 1.81937125;
  725. data[24699] = 0.40122634;
  726. data[24897] = 1.30262633;
  727. data[24898] = 2.68698297;
  728. data[24899] = 4.07133950;
  729. data[24900] = 5.45569602;
  730. data[24901] = 4.87832492;
  731. data[24902] = 3.52695142;
  732. data[24903] = 2.17557792;
  733. data[24904] = 0.82420459;
  734. data[25102] = 0.94595028;
  735. data[25103] = 2.26512621;
  736. data[25104] = 3.58430226;
  737. data[25105] = 4.90347855;
  738. data[25106] = 5.20569785;
  739. data[25107] = 3.91795207;
  740. data[25108] = 2.63020652;
  741. data[25109] = 1.34246063;
  742. data[25110] = 0.05471494;
  743. data[25307] = 0.49037894;
  744. data[25308] = 1.74744334;
  745. data[25309] = 3.00450763;
  746. data[25310] = 4.26157191;
  747. data[25311] = 5.51863620;
  748. data[25312] = 4.39707236;
  749. data[25313] = 3.16995848;
  750. data[25314] = 1.94284460;
  751. data[25315] = 0.71573065;
  752. data[25513] = 1.14698056;
  753. data[25514] = 2.34485767;
  754. data[25515] = 3.54273478;
  755. data[25516] = 4.74061165;
  756. data[25517] = 4.95198462;
  757. data[25518] = 3.78264743;
  758. data[25519] = 2.61331047;
  759. data[25520] = 1.44397374;
  760. data[25521] = 0.27463681;
  761. data[25718] = 0.47569509;
  762. data[25719] = 1.61717169;
  763. data[25720] = 2.75864848;
  764. data[25721] = 3.90012516;
  765. data[25722] = 5.04160160;
  766. data[25723] = 4.45712078;
  767. data[25724] = 3.34284059;
  768. data[25725] = 2.22856039;
  769. data[25726] = 1.11428020;
  770. for (auto & val : data) {
  771. val /= 1000.0f;
  772. }
  773. filters.data = std::move(data);
  774. return filters;
  775. }
  776. } // namespace whisper_precalc_filters