7 mēneši atpakaļ · 10961339b2
--- a/.editorconfig
+++ b/.editorconfig
@@ -49,6 +49,6 @@ charset = unset
 
				 trim_trailing_whitespace = unset
			
 
				 insert_final_newline = unset
			
 
				 
			
 
				-[tools/mtmd/miniaudio.h]
			
 
				+[tools/mtmd/vendor/miniaudio.h]
			
 
				 trim_trailing_whitespace = unset
			
 
				 insert_final_newline = unset
			
--- a/tools/mtmd/CMakeLists.txt
+++ b/tools/mtmd/CMakeLists.txt
@@ -1,48 +1,54 @@
 
				 # mtmd
			
 
				 
			
 
				-# compile mtmd-audio separately to avoid long compile times with miniaudio.h
			
 
				-# TODO @ngxson : move miniaudio.h and stb_image.h to mtmd-helper.cpp, then compile the helper as a separate library
			
 
				-add_library(mtmd_audio STATIC mtmd-audio.cpp mtmd-audio.h)
			
 
				-if (BUILD_SHARED_LIBS)
			
 
				-    set_target_properties(mtmd_audio PROPERTIES POSITION_INDEPENDENT_CODE ON)
			
 
				-endif()
			
 
				-target_link_libraries(mtmd_audio PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
			
 
				-target_compile_features(mtmd_audio PRIVATE cxx_std_17)
			
 
				-target_include_directories(mtmd_audio PRIVATE .)
			
 
				-
			
 
				 add_library(mtmd OBJECT
			
 
				             mtmd.cpp
			
 
				-            mtmd-helper.cpp
			
 
				+            mtmd-audio.cpp
			
 
				             mtmd.h
			
 
				             clip.cpp
			
 
				             clip.h
			
 
				             clip-impl.h
			
 
				             )
			
 
				 
			
 
				-target_link_libraries(mtmd PRIVATE ggml llama mtmd_audio ${CMAKE_THREAD_LIBS_INIT})
			
 
				-
			
 
				+target_link_libraries(mtmd PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
			
 
				 target_include_directories(mtmd PUBLIC .)
			
 
				 target_include_directories(mtmd PRIVATE ../..)
			
 
				-target_include_directories(mtmd PRIVATE ../../common) # for stb_image.h
			
 
				-
			
 
				 target_compile_features(mtmd PRIVATE cxx_std_17)
			
 
				 
			
 
				-add_library(mtmd_static STATIC $<TARGET_OBJECTS:mtmd>)
			
 
				+# mtmd_helper: built as its own object library so that miniaudio.h and
				+# stb_image.h are compiled separately (avoids long compile times)
				+
				+add_library(mtmd_helper OBJECT mtmd-helper.cpp mtmd-helper.h)
				+
				+target_compile_features(mtmd_helper PRIVATE cxx_std_17)
				+target_include_directories(mtmd_helper
				+    PUBLIC  .
				+    PRIVATE ./vendor ../..
				+)
				+target_link_libraries(mtmd_helper PRIVATE ggml llama mtmd ${CMAKE_THREAD_LIBS_INIT})
				+
			
 
				+
			
 
				 if (BUILD_SHARED_LIBS)
				     set_target_properties(mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON)
				     target_compile_definitions(mtmd PRIVATE LLAMA_SHARED LLAMA_BUILD)
				     add_library(mtmd_shared SHARED $<TARGET_OBJECTS:mtmd>)
				-    target_link_libraries(mtmd_shared PRIVATE ggml llama mtmd_audio ${CMAKE_THREAD_LIBS_INIT})
				+    target_link_libraries(mtmd_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
				     install(TARGETS mtmd_shared LIBRARY)
				+
				+    # shared variant of the helper library; it must be assembled from the
				+    # objects of mtmd_helper (not mtmd), otherwise it would contain the core
				+    # objects and none of the helper code
				+    set_target_properties(mtmd_helper PROPERTIES POSITION_INDEPENDENT_CODE ON)
				+    target_compile_definitions(mtmd_helper PRIVATE LLAMA_SHARED LLAMA_BUILD)
				+    add_library(mtmd_helper_shared SHARED $<TARGET_OBJECTS:mtmd_helper>)
				+    target_link_libraries(mtmd_helper_shared PRIVATE ggml llama mtmd ${CMAKE_THREAD_LIBS_INIT})
				+    install(TARGETS mtmd_helper_shared LIBRARY)
				 endif()
			
 
				 
			
 
				 if (NOT MSVC)
				-    target_compile_options(mtmd PRIVATE -Wno-cast-qual) # stb_image.h
				-    target_compile_options(mtmd_audio PRIVATE -Wno-cast-qual) # miniaudio.h
				+    # for stb_image.h and miniaudio.h
				+    # (in this change both headers are included from mtmd-helper.cpp, so the
				+    # cast-qual warning is silenced on the mtmd_helper target only)
				+    target_compile_options(mtmd_helper PRIVATE -Wno-cast-qual)
				 endif()
			
 
				 
			
 
				 if(TARGET BUILD_INFO)
			
 
				     add_dependencies(mtmd BUILD_INFO)
			
 
				+    add_dependencies(mtmd_helper BUILD_INFO)
			
 
				 endif()
			
 
				 
			
 
				 add_executable(llama-llava-cli    deprecation-warning.cpp)
			
@@ -54,5 +60,5 @@ set(TARGET llama-mtmd-cli)
 
				 add_executable(${TARGET} mtmd-cli.cpp)
			
 
				 set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli)
			
 
				 install(TARGETS ${TARGET} RUNTIME)
			
 
				-target_link_libraries(${TARGET} PRIVATE common mtmd ${CMAKE_THREAD_LIBS_INIT})
			
 
				+target_link_libraries(${TARGET} PRIVATE common mtmd mtmd_helper ${CMAKE_THREAD_LIBS_INIT})
			
 
				 target_compile_features(${TARGET} PRIVATE cxx_std_17)
			
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -11,9 +11,6 @@
 
				 #include "ggml-backend.h"
			
 
				 #include "gguf.h"
			
 
				 
			
 
				-#define STB_IMAGE_IMPLEMENTATION
			
 
				-#include "stb_image.h"
			
 
				-
			
 
				 #include <cassert>
			
 
				 #include <cmath>
			
 
				 #include <cstdlib>
			
@@ -2786,30 +2783,6 @@ void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny
 
				     memcpy(img->buf.data(), rgb_pixels, img->buf.size());
			
 
				 }
			
 
				 
			
 
				-bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
			
 
				-    int nx, ny, nc;
			
 
				-    auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
			
 
				-    if (!data) {
			
 
				-        LOG_ERR("%s: failed to load image '%s'\n", __func__, fname);
			
 
				-        return false;
			
 
				-    }
			
 
				-    clip_build_img_from_pixels(data, nx, ny, img);
			
 
				-    stbi_image_free(data);
			
 
				-    return true;
			
 
				-}
			
 
				-
			
 
				-bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img) {
			
 
				-    int nx, ny, nc;
			
 
				-    auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
			
 
				-    if (!data) {
			
 
				-        LOG_ERR("%s: failed to decode image bytes\n", __func__);
			
 
				-        return false;
			
 
				-    }
			
 
				-    clip_build_img_from_pixels(data, nx, ny, img);
			
 
				-    stbi_image_free(data);
			
 
				-    return true;
			
 
				-}
			
 
				-
			
 
				 // Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not
			
 
				 static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) {
			
 
				     dst.nx = src.nx;
			
--- a/tools/mtmd/mtmd-audio.cpp
+++ b/tools/mtmd/mtmd-audio.cpp
@@ -1,28 +1,5 @@
 
				-// fix problem with std::min and std::max
			
 
				-#if defined(_WIN32)
			
 
				-#define WIN32_LEAN_AND_MEAN
			
 
				-#ifndef NOMINMAX
			
 
				-#   define NOMINMAX
			
 
				-#endif
			
 
				-#include <windows.h>
			
 
				-#endif
			
 
				-
			
 
				 #include "mtmd-audio.h"
			
 
				 
			
 
				-//#define MTMD_AUDIO_DEBUG
			
 
				-
			
 
				-#define MINIAUDIO_IMPLEMENTATION
			
 
				-#ifndef MTMD_AUDIO_DEBUG
			
 
				-#   define MA_NO_ENCODING
			
 
				-#endif
			
 
				-#define MA_NO_DEVICE_IO
			
 
				-#define MA_NO_RESOURCE_MANAGER
			
 
				-#define MA_NO_NODE_GRAPH
			
 
				-#define MA_NO_ENGINE
			
 
				-#define MA_NO_GENERATION
			
 
				-#define MA_API static
			
 
				-#include "miniaudio.h"
			
 
				-
			
 
				 #define _USE_MATH_DEFINES // for M_PI
			
 
				 #include <cmath>
			
 
				 #include <cstdint>
			
@@ -359,69 +336,6 @@ bool preprocess_audio(
 
				 } // namespace whisper_preprocessor
			
 
				 
			
 
				 
			
 
				-namespace audio_helpers {
			
 
				-
			
 
				-bool is_audio_file(const char * buf, size_t len) {
			
 
				-    if (len < 12) {
			
 
				-        return false;
			
 
				-    }
			
 
				-
			
 
				-    // RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
			
 
				-    // WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
			
 
				-    bool is_wav = memcmp(buf, "RIFF", 4) == 0 && memcmp(buf + 8, "WAVE", 4) == 0;
			
 
				-    bool is_mp3 = len >= 3 && (
			
 
				-        memcmp(buf, "ID3", 3) == 0 ||
			
 
				-        // Check for MPEG sync word (simplified check)
			
 
				-        ((unsigned char)buf[0] == 0xFF && ((unsigned char)buf[1] & 0xE0) == 0xE0)
			
 
				-    );
			
 
				-    bool is_flac = memcmp(buf, "fLaC", 4) == 0;
			
 
				-
			
 
				-    return is_wav || is_mp3 || is_flac;
			
 
				-}
			
 
				-
			
 
				-// returns true if the buffer is a valid audio file
			
 
				-bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int target_sampler_rate, std::vector<float> & pcmf32_mono) {
			
 
				-    ma_result result;
			
 
				-    const int channels = 1;
			
 
				-    ma_decoder_config decoder_config = ma_decoder_config_init(ma_format_f32, channels, target_sampler_rate);
			
 
				-    ma_decoder decoder;
			
 
				-
			
 
				-    result = ma_decoder_init_memory(buf_in, len, &decoder_config, &decoder);
			
 
				-    if (result != MA_SUCCESS) {
			
 
				-        return false;
			
 
				-    }
			
 
				-
			
 
				-    ma_uint64 frame_count;
			
 
				-    ma_uint64 frames_read;
			
 
				-    result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count);
			
 
				-    if (result != MA_SUCCESS) {
			
 
				-        ma_decoder_uninit(&decoder);
			
 
				-        return false;
			
 
				-    }
			
 
				-
			
 
				-    pcmf32_mono.resize(frame_count);
			
 
				-    result = ma_decoder_read_pcm_frames(&decoder, pcmf32_mono.data(), frame_count, &frames_read);
			
 
				-    if (result != MA_SUCCESS) {
			
 
				-        ma_decoder_uninit(&decoder);
			
 
				-        return false;
			
 
				-    }
			
 
				-
			
 
				-#ifdef MTMD_AUDIO_DEBUG
			
 
				-    // save audio to wav file
			
 
				-    ma_encoder_config config = ma_encoder_config_init(ma_encoding_format_wav, ma_format_f32, 1, target_sampler_rate);
			
 
				-    ma_encoder encoder;
			
 
				-    ma_encoder_init_file("output.wav", &config, &encoder);
			
 
				-    ma_encoder_write_pcm_frames(&encoder, pcmf32_mono.data(), pcmf32_mono.size(), &frames_read);
			
 
				-    ma_encoder_uninit(&encoder);
			
 
				-#endif
			
 
				-
			
 
				-    ma_decoder_uninit(&decoder);
			
 
				-    return true;
			
 
				-}
			
 
				-
			
 
				-} // namespace wav_utils
			
 
				-
			
 
				-
			
 
				 // precalculated mel filter banks
			
 
				 // values are multiplied by 1000.0 to save space, and will be divided by 1000.0 in the end of the function
			
 
				 //
			
--- a/tools/mtmd/mtmd-audio.h
+++ b/tools/mtmd/mtmd-audio.h
@@ -32,7 +32,7 @@ struct whisper_filters {
 
				     std::vector<float> data;
			
 
				 };
			
 
				 
			
 
				-extern bool preprocess_audio(
			
 
				+bool preprocess_audio(
			
 
				         const float * samples,
			
 
				         size_t n_samples,
			
 
				         const whisper_filters & filters,
			
@@ -40,23 +40,8 @@ extern bool preprocess_audio(
 
				 
			
 
				 } // namespace whisper_preprocessor
			
 
				 
			
 
				-
			
 
				-// TODO @ngxson : move this helper to mtmd-helpers.cpp
			
 
				-namespace audio_helpers {
			
 
				-
			
 
				-extern bool is_audio_file(const char * buf, size_t len);
			
 
				-
			
 
				-extern bool decode_audio_from_buf(
			
 
				-        const unsigned char * buf_in,
			
 
				-        size_t len,
			
 
				-        int target_sampler_rate,
			
 
				-        std::vector<float> & pcmf32_mono);
			
 
				-
			
 
				-} // namespace audio_helpers
			
 
				-
			
 
				-
			
 
				 namespace whisper_precalc_filters {
			
 
				 
			
 
				-extern whisper_preprocessor::whisper_filters get_128_bins();
			
 
				+whisper_preprocessor::whisper_filters get_128_bins();
			
 
				 
			
 
				 } // namespace whisper_precalc_filters
			
--- a/tools/mtmd/mtmd-cli.cpp
+++ b/tools/mtmd/mtmd-cli.cpp
@@ -7,6 +7,7 @@
 
				 #include "console.h"
			
 
				 #include "chat.h"
			
 
				 #include "mtmd.h"
			
 
				+#include "mtmd-helper.h"
			
 
				 
			
 
				 #include <vector>
			
 
				 #include <limits.h>
			
@@ -143,7 +144,7 @@ struct mtmd_cli_context {
 
				     }
			
 
				 
			
 
				     bool load_media(const std::string & fname) {
			
 
				-        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(fname.c_str()));
			
 
				+        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str()));
			
 
				         if (!bmp.ptr) {
			
 
				             return false;
			
 
				         }
			
--- a/tools/mtmd/mtmd-helper.cpp
+++ b/tools/mtmd/mtmd-helper.cpp
@@ -1,10 +1,37 @@
 
				+// fix problem with std::min and std::max
			
 
				+#if defined(_WIN32)
			
 
				+#define WIN32_LEAN_AND_MEAN
			
 
				+#ifndef NOMINMAX
			
 
				+#   define NOMINMAX
			
 
				+#endif
			
 
				+#include <windows.h>
			
 
				+#endif
			
 
				+
			
 
				 #include "mtmd.h"
			
 
				+#include "mtmd-helper.h"
			
 
				 #include "llama.h"
			
 
				 
			
 
				 #include <algorithm>
			
 
				 #include <cinttypes>
			
 
				 #include <vector>
			
 
				 
			
 
				+//#define MTMD_AUDIO_DEBUG
			
 
				+
			
 
				+#define MINIAUDIO_IMPLEMENTATION
			
 
				+#ifndef MTMD_AUDIO_DEBUG
			
 
				+#   define MA_NO_ENCODING
			
 
				+#endif
			
 
				+#define MA_NO_DEVICE_IO
			
 
				+#define MA_NO_RESOURCE_MANAGER
			
 
				+#define MA_NO_NODE_GRAPH
			
 
				+#define MA_NO_ENGINE
			
 
				+#define MA_NO_GENERATION
			
 
				+#define MA_API static
			
 
				+#include "vendor/miniaudio.h"
			
 
				+
			
 
				+#define STB_IMAGE_IMPLEMENTATION
			
 
				+#include "vendor/stb_image.h"
			
 
				+
			
 
				 #define LOG_INF(...) fprintf(stdout, __VA_ARGS__)
			
 
				 #define LOG_ERR(...) fprintf(stderr, __VA_ARGS__)
			
 
				 
			
@@ -315,3 +342,118 @@ int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
 
				 
			
 
				     return 0;
			
 
				 }
			
 
				+
			
 
				+namespace audio_helpers {
				+
				+// Returns true when the leading bytes of `buf` look like a WAV, MP3 or FLAC file.
				+// Buffers shorter than 12 bytes are always rejected.
				+static bool is_audio_file(const char * buf, size_t len) {
				+    if (len < 12) {
				+        return false;
				+    }
				+
				+    // RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
				+    // WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
				+    bool is_wav = memcmp(buf, "RIFF", 4) == 0 && memcmp(buf + 8, "WAVE", 4) == 0;
				+    // len >= 12 is guaranteed by the guard above, so no extra length check is needed here
				+    bool is_mp3 =
				+        memcmp(buf, "ID3", 3) == 0 ||
				+        // Check for MPEG sync word (simplified check)
				+        ((unsigned char)buf[0] == 0xFF && ((unsigned char)buf[1] & 0xE0) == 0xE0);
				+    bool is_flac = memcmp(buf, "fLaC", 4) == 0;
				+
				+    return is_wav || is_mp3 || is_flac;
				+}
				+
				+// Decodes an in-memory audio file to mono f32 PCM at `target_sampler_rate`.
				+//   buf_in / len         encoded file contents
				+//   target_sampler_rate  sample rate requested from the decoder
				+//   pcmf32_mono          output samples; sized to the number of frames actually decoded
				+// returns true if the buffer is a valid audio file
				+static bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int target_sampler_rate, std::vector<float> & pcmf32_mono) {
				+    ma_result result;
				+    const int channels = 1;
				+    ma_decoder_config decoder_config = ma_decoder_config_init(ma_format_f32, channels, target_sampler_rate);
				+    ma_decoder decoder;
				+
				+    result = ma_decoder_init_memory(buf_in, len, &decoder_config, &decoder);
				+    if (result != MA_SUCCESS) {
				+        return false;
				+    }
				+
				+    ma_uint64 frame_count = 0;
				+    ma_uint64 frames_read = 0;
				+    result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count);
				+    if (result != MA_SUCCESS) {
				+        ma_decoder_uninit(&decoder);
				+        return false;
				+    }
				+
				+    pcmf32_mono.resize(frame_count);
				+    result = ma_decoder_read_pcm_frames(&decoder, pcmf32_mono.data(), frame_count, &frames_read);
				+    if (result != MA_SUCCESS) {
				+        ma_decoder_uninit(&decoder);
				+        return false;
				+    }
				+
				+    // the decoder may deliver fewer frames than the reported length;
				+    // drop the unused tail so callers only see decoded samples
				+    if (frames_read < frame_count) {
				+        pcmf32_mono.resize(frames_read);
				+    }
				+
				+#ifdef MTMD_AUDIO_DEBUG
				+    // save audio to wav file
				+    ma_encoder_config config = ma_encoder_config_init(ma_encoding_format_wav, ma_format_f32, 1, target_sampler_rate);
				+    ma_encoder encoder;
				+    ma_encoder_init_file("output.wav", &config, &encoder);
				+    ma_encoder_write_pcm_frames(&encoder, pcmf32_mono.data(), pcmf32_mono.size(), &frames_read);
				+    ma_encoder_uninit(&encoder);
				+#endif
				+
				+    ma_decoder_uninit(&decoder);
				+    return true;
				+}
				+
				+} // namespace audio_helpers
			
 
				+
			
 
				+// Builds a bitmap from an encoded file held in memory.
				+// Buffers recognized as audio are decoded at the rate reported by
				+// mtmd_get_audio_bitrate(ctx); everything else is decoded as an RGB image.
				+// Returns nullptr on failure.
				+mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len) {
				+    const bool looks_like_audio = audio_helpers::is_audio_file((const char *)buf, len);
				+
				+    if (!looks_like_audio) {
				+        // otherwise, we assume it's an image (always decoded to 3 channels)
				+        int width, height, n_channels;
				+        unsigned char * pixels = stbi_load_from_memory(buf, len, &width, &height, &n_channels, 3);
				+        if (pixels == nullptr) {
				+            LOG_ERR("%s: failed to decode image bytes\n", __func__);
				+            return nullptr;
				+        }
				+        mtmd_bitmap * bmp = mtmd_bitmap_init(width, height, pixels);
				+        stbi_image_free(pixels);
				+        return bmp;
				+    }
				+
				+    // audio path: a negative rate means the model has no audio support
				+    const int sample_rate = mtmd_get_audio_bitrate(ctx);
				+    if (sample_rate < 0) {
				+        LOG_ERR("This model does not support audio input\n");
				+        return nullptr;
				+    }
				+
				+    std::vector<float> samples;
				+    if (!audio_helpers::decode_audio_from_buf(buf, len, sample_rate, samples)) {
				+        LOG_ERR("Unable to read WAV audio file from buffer\n");
				+        return nullptr;
				+    }
				+    return mtmd_bitmap_init_from_audio(samples.size(), samples.data());
				+}
			
 
				+
			
 
				+// Reads the whole file `fname` into memory and forwards it to
				+// mtmd_helper_bitmap_init_from_buf(). Returns nullptr if the file cannot be
				+// opened, sized or fully read, or if decoding fails.
				+mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname) {
				+    FILE * f = fopen(fname, "rb");
				+    if (!f) {
				+        LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno));
				+        return nullptr;
				+    }
				+
				+    // fseek/ftell can fail (e.g. on non-seekable streams); ftell then returns -1,
				+    // which must not reach the buffer allocation below
				+    long file_size = -1;
				+    if (fseek(f, 0, SEEK_END) == 0) {
				+        file_size = ftell(f);
				+    }
				+    if (file_size < 0 || fseek(f, 0, SEEK_SET) != 0) {
				+        LOG_ERR("Unable to determine size of file %s: %s\n", fname, strerror(errno));
				+        fclose(f);
				+        return nullptr;
				+    }
				+
				+    std::vector<unsigned char> buf((size_t)file_size);
				+    size_t n_read = fread(buf.data(), 1, buf.size(), f);
				+    fclose(f);
				+    if (n_read != buf.size()) {
				+        LOG_ERR("Failed to read entire file %s\n", fname);
				+        return nullptr;
				+    }
				+
				+    return mtmd_helper_bitmap_init_from_buf(ctx, buf.data(), buf.size());
				+}
			
--- a/tools/mtmd/mtmd-helper.h
+++ b/tools/mtmd/mtmd-helper.h
@@ -0,0 +1,91 @@
 
				+#ifndef MTMD_HELPER_H
			
 
				+#define MTMD_HELPER_H
			
 
				+
			
 
				+#include "ggml.h"
			
 
				+#include "llama.h"
			
 
				+#include "mtmd.h"
			
 
				+
			
 
				+#include <stddef.h>
			
 
				+#include <stdint.h>
			
 
				+#include <stdbool.h>
			
 
				+
			
 
				+#ifdef __cplusplus
			
 
				+extern "C" {
			
 
				+#endif
			
 
				+
			
 
				+//
			
 
				+// libmtmd helper functions
			
 
				+//
			
 
				+// Please note that these helpers are not guaranteed to be stable.
			
 
				+// BREAKING CHANGES are expected.
			
 
				+//
			
 
				+
			
 
				+// helper function to construct a mtmd_bitmap from a file
				+// it calls mtmd_helper_bitmap_init_from_buf() internally
				+// ctx is forwarded unchanged to mtmd_helper_bitmap_init_from_buf()
				+// returns nullptr on failure
				+// this function is thread-safe
				+MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname);
				+
				+// helper function to construct a mtmd_bitmap from a buffer containing a file
				+// supported formats:
				+//     image: formats supported by stb_image: jpg, png, bmp, gif, etc.
				+//     audio: formats supported by miniaudio: wav, mp3, flac
				+// note: audio files will be auto-detected based on magic bytes
				+// note: for audio input, ctx is queried with mtmd_get_audio_bitrate() to pick the
				+//       decoding rate; if that returns a negative value the call fails
				+// returns nullptr on failure
				+// this function is thread-safe
				+MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len);
			
 
				+
			
 
				+// helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
			
 
				+MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks);
			
 
				+
			
 
				+// helper to count the total position of tokens from a list of chunks, useful to keep track of n_past
			
 
				+// normally, n_pos is equal to n_tokens, but for M-RoPE it is different
			
 
				+MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks);
			
 
				+
			
 
				+// helper function that automatically:
			
 
				+// 1. run llama_decode() on text chunks
			
 
				+// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
			
 
				+// if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error
			
 
				+// otherwise, returns 0 on success
			
 
				+// this function is NOT thread-safe
			
 
				+MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
			
 
				+                                         struct llama_context * lctx,
			
 
				+                                         const mtmd_input_chunks * chunks,
			
 
				+                                         llama_pos n_past,
			
 
				+                                         llama_seq_id seq_id,
			
 
				+                                         int32_t n_batch,
			
 
				+                                         bool logits_last,
			
 
				+                                         llama_pos * new_n_past);
			
 
				+
			
 
				+// works like mtmd_helper_eval_chunks(), but only for a single chunk
			
 
				+// this function is NOT thread-safe
			
 
				+MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
			
 
				+                                               struct llama_context * lctx,
			
 
				+                                               const mtmd_input_chunk * chunk,
			
 
				+                                               llama_pos n_past,
			
 
				+                                               llama_seq_id seq_id,
			
 
				+                                               int32_t n_batch,
			
 
				+                                               bool logits_last,
			
 
				+                                               llama_pos * new_n_past);
			
 
				+
			
 
				+// helper function to decode an image whose embeddings have already been calculated
			
 
				+// this helper will handle batching and pre/post decoding setup (for ex. gemma 3 requires non-causal attention)
			
 
				+// ret 0 on success, -1 on chunk not being a valid image chunk, 1 on decode failure
			
 
				+MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx,
			
 
				+                                                struct llama_context * lctx,
			
 
				+                                                const mtmd_input_chunk * chunk,
			
 
				+                                                float * encoded_embd,
			
 
				+                                                llama_pos n_past,
			
 
				+                                                llama_seq_id seq_id,
			
 
				+                                                int32_t n_batch,
			
 
				+                                                llama_pos * new_n_past);
			
 
				+
			
 
				+#ifdef __cplusplus
			
 
				+} // extern "C"
			
 
				+#endif
			
 
				+
			
 
				+//
			
 
				+// C++ wrappers
			
 
				+//
			
 
				+
			
 
				+#endif
			
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -819,53 +819,12 @@ bool mtmd_support_audio(mtmd_context * ctx) {
 
				     return ctx->ctx_a != nullptr;
			
 
				 }
			
 
				 
			
 
				-// these 2 helpers below use internal clip_image_u8_ptr,
			
 
				-// so unfortunately they cannot moved to mtmd-helper.h
			
 
				-// however, in theory, user can decode image file to bitmap using
			
 
				-// whichever library they want, and then use mtmd_bitmap_init() to create bitmap
			
 
				-
			
 
				-mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len) {
			
 
				-    if (audio_helpers::is_audio_file((const char *)buf, len)) {
			
 
				-        std::vector<float> pcmf32;
			
 
				-        if (!audio_helpers::decode_audio_from_buf(buf, len, COMMON_SAMPLE_RATE, pcmf32)) {
			
 
				-            LOG_ERR("Unable to read WAV audio file from buffer\n");
			
 
				-            return nullptr;
			
 
				-        }
			
 
				-        return mtmd_bitmap_init_from_audio(pcmf32.size(), pcmf32.data());
			
 
				-    }
			
 
				-
			
 
				-    clip_image_u8_ptr img_u8(clip_image_u8_init());
			
 
				-    bool ok = clip_image_load_from_bytes(buf, len, img_u8.get());
			
 
				-    if (!ok) {
			
 
				-        LOG_ERR("Unable to load image from buffer\n");
			
 
				-        return nullptr;
			
 
				+int mtmd_get_audio_bitrate(mtmd_context * ctx) {
			
 
				+    if (!ctx->ctx_a) {
			
 
				+        return -1;
			
 
				     }
			
 
				-    uint32_t nx, ny;
			
 
				-    unsigned char * data = clip_image_u8_get_data(img_u8.get(), &nx, &ny);
			
 
				-    return mtmd_bitmap_init(nx, ny, data);
			
 
				-}
			
 
				-
			
 
				-mtmd_bitmap * mtmd_helper_bitmap_init_from_file(const char * fname) {
			
 
				-    std::vector<unsigned char> buf;
			
 
				-    FILE * f = fopen(fname, "rb");
			
 
				-    if (!f) {
			
 
				-        LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno));
			
 
				-        return nullptr;
			
 
				-    }
			
 
				-
			
 
				-    fseek(f, 0, SEEK_END);
			
 
				-    long file_size = ftell(f);
			
 
				-    fseek(f, 0, SEEK_SET);
			
 
				-    buf.resize(file_size);
			
 
				-
			
 
				-    size_t n_read = fread(buf.data(), 1, file_size, f);
			
 
				-    fclose(f);
			
 
				-    if (n_read != (size_t)file_size) {
			
 
				-        LOG_ERR("Failed to read entire file %s", fname);
			
 
				-        return nullptr;
			
 
				-    }
			
 
				-
			
 
				-    return mtmd_helper_bitmap_init_from_buf(buf.data(), buf.size());
			
 
				+    // for now, we assume that all audio models have the same bitrate
			
 
				+    return 16000; // 16kHz
			
 
				 }
			
 
				 
			
 
				 //
			
--- a/tools/mtmd/mtmd.h
+++ b/tools/mtmd/mtmd.h
@@ -109,6 +109,10 @@ MTMD_API bool mtmd_support_vision(mtmd_context * ctx);
 
				 // whether the current model supports audio input
			
 
				 MTMD_API bool mtmd_support_audio(mtmd_context * ctx);
			
 
				 
			
 
				+// get audio bitrate in Hz, for example 16000 for Whisper
				+// note: despite the name, the helper code passes this value to the audio decoder
				+//       as the target sample rate
				+// return -1 if audio is not supported
				+MTMD_API int mtmd_get_audio_bitrate(mtmd_context * ctx);
			
 
				+
			
 
				 // mtmd_bitmap
			
 
				 //
			
 
				 // if bitmap is image:
			
@@ -209,75 +213,6 @@ MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
 
				 
			
 
				 /////////////////////////////////////////
			
 
				 
			
 
				-//
			
 
				-// Helper functions (can be implemented based on other functions)
			
 
				-//
			
 
				-// Please note that these helpers are not guaranteed to be stable.
			
 
				-// BREAKING CHANGES are expected.
			
 
				-//
			
 
				-
			
 
				-// helper function to construct a mtmd_bitmap from a file
			
 
				-// it calls mtmd_helper_bitmap_init_from_buf() internally
			
 
				-// returns nullptr on failure
			
 
				-// this function is thread-safe
			
 
				-MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(const char * fname);
			
 
				-
			
 
				-// helper function to construct a mtmd_bitmap from a buffer containing a file
			
 
				-// supported formats:
			
 
				-//     image: formats supported by stb_image: jpg, png, bmp, gif, etc.
			
 
				-//     audio: formats supported by miniaudio: wav, mp3, flac
			
 
				-// note: audio files will be auto-detected based on magic bytes
			
 
				-// returns nullptr on failure
			
 
				-// this function is thread-safe
			
 
				-MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len);
			
 
				-
			
 
				-// helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
			
 
				-MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks);
			
 
				-
			
 
				-// helper to count the total position of tokens from a list of chunks, useful to keep track of n_past
			
 
				-// normally, n_pos is equal to n_tokens, but for M-RoPE it is different
			
 
				-MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks);
			
 
				-
			
 
				-// helper function that automatically:
			
 
				-// 1. run llama_decode() on text chunks
			
 
				-// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
			
 
				-// if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error
			
 
				-// otherwise, returns 0 on success
			
 
				-// this function is NOT thread-safe
			
 
				-MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
			
 
				-                                         struct llama_context * lctx,
			
 
				-                                         const mtmd_input_chunks * chunks,
			
 
				-                                         llama_pos n_past,
			
 
				-                                         llama_seq_id seq_id,
			
 
				-                                         int32_t n_batch,
			
 
				-                                         bool logits_last,
			
 
				-                                         llama_pos * new_n_past);
			
 
				-
			
 
				-// works like mtmd_helper_eval_chunks(), but only for a single chunk
			
 
				-// this function is NOT thread-safe
			
 
				-MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
			
 
				-                                               struct llama_context * lctx,
			
 
				-                                               const mtmd_input_chunk * chunk,
			
 
				-                                               llama_pos n_past,
			
 
				-                                               llama_seq_id seq_id,
			
 
				-                                               int32_t n_batch,
			
 
				-                                               bool logits_last,
			
 
				-                                               llama_pos * new_n_past);
			
 
				-
			
 
				-// helper function to decode an image whose embeddings have already been calculated
			
 
				-// this helper will handle batching and pre/post decoding setup (for ex. gemma 3 requires non-causal attention)
			
 
				-// ret 0 on success, -1 on chunk not being a valid image chunk, 1 on decode failure
			
 
				-MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx,
			
 
				-                                                struct llama_context * lctx,
			
 
				-                                                const mtmd_input_chunk * chunk,
			
 
				-                                                float * encoded_embd,
			
 
				-                                                llama_pos n_past,
			
 
				-                                                llama_seq_id seq_id,
			
 
				-                                                int32_t n_batch,
			
 
				-                                                llama_pos * new_n_past);
			
 
				-
			
 
				-/////////////////////////////////////////
			
 
				-
			
 
				 // test function, to be used in test-mtmd-c-api.c
			
 
				 MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void);
			
 
				 
			
--- a/tools/mtmd/vendor/miniaudio.h
+++ b/tools/mtmd/vendor/miniaudio.h
--- a/tools/mtmd/vendor/stb_image.h
+++ b/tools/mtmd/vendor/stb_image.h
--- a/tools/server/CMakeLists.txt
+++ b/tools/server/CMakeLists.txt
@@ -36,7 +36,7 @@ install(TARGETS ${TARGET} RUNTIME)
 
				 
			
 
				 target_include_directories(${TARGET} PRIVATE ../llava)
			
 
				 target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
			
 
				-target_link_libraries(${TARGET} PRIVATE common mtmd ${CMAKE_THREAD_LIBS_INIT})
			
 
				+target_link_libraries(${TARGET} PRIVATE common mtmd mtmd_helper ${CMAKE_THREAD_LIBS_INIT})
			
 
				 
			
 
				 if (LLAMA_SERVER_SSL)
			
 
				     find_package(OpenSSL REQUIRED)
			
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -9,6 +9,7 @@
 
				 #include "sampling.h"
			
 
				 #include "speculative.h"
			
 
				 #include "mtmd.h"
			
 
				+#include "mtmd-helper.h"
			
 
				 
			
 
				 // Change JSON_ASSERT from assert() to GGML_ASSERT:
			
 
				 #define JSON_ASSERT GGML_ASSERT
			
@@ -4187,7 +4188,7 @@ int main(int argc, char ** argv) {
 
				                     throw std::runtime_error("This server does not support multimodal");
			
 
				                 }
			
 
				                 for (auto & file : files) {
			
 
				-                    mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(file.data(), file.size()));
			
 
				+                    mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(ctx_server.mctx, file.data(), file.size()));
			
 
				                     if (!bmp.ptr) {
			
 
				                         throw std::runtime_error("Failed to load image or audio file");
			
 
				                     }
			
--- a/tools/server/utils.hpp
+++ b/tools/server/utils.hpp
@@ -6,6 +6,7 @@
 
				 #include "arg.h" // common_remote_get_content
			
 
				 #include "base64.hpp"
			
 
				 #include "mtmd.h"
			
 
				+#include "mtmd-helper.h"
			
 
				 
			
 
				 // increase max payload length to allow use of larger context size
			
 
				 #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576