mtmd.h 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319
  1. #ifndef MTMD_H
  2. #define MTMD_H
  3. #include "ggml.h"
  4. #include "llama.h"
  5. #include <stddef.h>
  6. #include <stdint.h>
  7. #include <stdbool.h>
  8. #ifdef __cplusplus
  9. #include <string>
  10. #include <vector>
  11. #include <cinttypes>
  12. #include <memory>
  13. #endif
  14. /**
  15. * libmtmd: A library for multimodal support in llama.cpp.
  16. *
  17. * WARNING: This API is experimental and subject to many BREAKING CHANGES.
  18. * Issues related to API usage may receive lower priority support.
  19. *
  20. * For the usage, see an example in mtmd-cli.cpp
  21. *
  22. * For contributors:
  23. * - Make sure the C API is aligned with the libllama C API (as in llama.h)
  24. * - Do not include model name (e.g., qwen, gemma) in the API, use generic terms instead
  25. * - Keep the API minimal, do not expose internal details unless necessary
  26. *
  27. * IMPORTANT: The mtmd module does NOT accept pull requests that are fully or predominantly AI-generated.
  28. * We encourage human contributors to ensure the quality and reliability of the codebase.
  29. */
  // MTMD_API: decoration placed in front of every public function declaration.
  //   - LLAMA_SHARED not defined                 -> expands to nothing
  //   - LLAMA_SHARED, _WIN32 without __MINGW32__ -> dllexport when LLAMA_BUILD is defined, dllimport otherwise
  //   - LLAMA_SHARED, any other target           -> default symbol visibility attribute
  #if !defined(LLAMA_SHARED)
  #    define MTMD_API
  #elif defined(_WIN32) && !defined(__MINGW32__)
  #    if defined(LLAMA_BUILD)
  #        define MTMD_API __declspec(dllexport)
  #    else
  #        define MTMD_API __declspec(dllimport)
  #    endif
  #else
  #    define MTMD_API __attribute__ ((visibility ("default")))
  #endif
  // Deprecated marker string; call mtmd_default_marker() instead of using this macro.
  #define MTMD_DEFAULT_IMAGE_MARKER "<__image__>"
  45. #ifdef __cplusplus
  46. extern "C" {
  47. #endif
  // Kind of an input chunk, as reported by mtmd_input_chunk_get_type().
  // The values are written out explicitly; they match the implicit numbering.
  enum mtmd_input_chunk_type {
      MTMD_INPUT_CHUNK_TYPE_TEXT  = 0,
      MTMD_INPUT_CHUNK_TYPE_IMAGE = 1,
      MTMD_INPUT_CHUNK_TYPE_AUDIO = 2,
  };
  // Opaque types: their definitions are not part of this header.
  // Each typedef below also introduces the struct tag, so both the
  // `struct mtmd_context` and the plain `mtmd_context` spellings are usable.
  typedef struct mtmd_context      mtmd_context;
  typedef struct mtmd_bitmap       mtmd_bitmap;
  typedef struct mtmd_image_tokens mtmd_image_tokens;
  typedef struct mtmd_input_chunk  mtmd_input_chunk;
  typedef struct mtmd_input_chunks mtmd_input_chunks;

  // Text input passed to mtmd_tokenize().
  // NOTE(review): add_special / parse_special are presumably forwarded to the
  // text tokenizer - confirm against the implementation.
  typedef struct mtmd_input_text {
      const char * text;
      bool         add_special;
      bool         parse_special;
  } mtmd_input_text;

  //
  // C API
  //
  73. struct mtmd_context_params {
  74. bool use_gpu;
  75. bool print_timings;
  76. int n_threads;
  77. const char * image_marker; // deprecated, use media_marker instead
  78. const char * media_marker;
  79. enum llama_flash_attn_type flash_attn_type;
  80. bool warmup; // whether to run a warmup encode pass after initialization
  81. // limit number of image tokens, only for vision models with dynamic resolution
  82. int image_min_tokens; // minimum number of tokens for image input (default: read from metadata)
  83. int image_max_tokens; // maximum number of tokens for image input (default: read from metadata)
  84. // callback function passed over to mtmd proper
  85. ggml_backend_sched_eval_callback cb_eval;
  86. void * cb_eval_user_data;
  87. };
  88. MTMD_API const char * mtmd_default_marker(void);
  89. MTMD_API struct mtmd_context_params mtmd_context_params_default(void);
  90. // initialize the mtmd context
  91. // return nullptr on failure
  92. MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
  93. const struct llama_model * text_model,
  94. const struct mtmd_context_params ctx_params);
  95. MTMD_API void mtmd_free(mtmd_context * ctx);
  96. // whether we need to set non-causal mask before llama_decode
  97. MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx);
  98. // whether the current model use M-RoPE for llama_decode
  99. MTMD_API bool mtmd_decode_use_mrope(mtmd_context * ctx);
  100. // whether the current model supports vision input
  101. MTMD_API bool mtmd_support_vision(mtmd_context * ctx);
  102. // whether the current model supports audio input
  103. MTMD_API bool mtmd_support_audio(mtmd_context * ctx);
  104. // get audio bitrate in Hz, for example 16000 for Whisper
  105. // return -1 if audio is not supported
  106. MTMD_API int mtmd_get_audio_bitrate(mtmd_context * ctx);
  107. // mtmd_bitmap
  108. //
  109. // if bitmap is image:
  110. // length of data must be nx * ny * 3
  111. // the data is in RGBRGBRGB... format
  112. // if bitmap is audio:
  113. // length of data must be n_samples * sizeof(float)
  114. // the data is in float format (PCM F32)
  115. MTMD_API mtmd_bitmap * mtmd_bitmap_init (uint32_t nx, uint32_t ny, const unsigned char * data);
  116. MTMD_API mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples, const float * data);
  117. MTMD_API uint32_t mtmd_bitmap_get_nx (const mtmd_bitmap * bitmap);
  118. MTMD_API uint32_t mtmd_bitmap_get_ny (const mtmd_bitmap * bitmap);
  119. MTMD_API const unsigned char * mtmd_bitmap_get_data (const mtmd_bitmap * bitmap);
  120. MTMD_API size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap);
  121. MTMD_API bool mtmd_bitmap_is_audio (const mtmd_bitmap * bitmap);
  122. MTMD_API void mtmd_bitmap_free (mtmd_bitmap * bitmap);
  123. // bitmap ID is optional, but useful for KV cache tracking
  124. // these getters/setters are dedicated functions, so you can for example calculate the hash of the image based on mtmd_bitmap_get_data()
  125. MTMD_API const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap);
  126. MTMD_API void mtmd_bitmap_set_id(mtmd_bitmap * bitmap, const char * id);
  127. // mtmd_input_chunks
  128. //
  129. // this is simply a list of mtmd_input_chunk
  130. // the elements can only be populated via mtmd_tokenize()
  131. MTMD_API mtmd_input_chunks * mtmd_input_chunks_init(void);
  132. MTMD_API size_t mtmd_input_chunks_size(const mtmd_input_chunks * chunks);
  133. MTMD_API const mtmd_input_chunk * mtmd_input_chunks_get (const mtmd_input_chunks * chunks, size_t idx);
  134. MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks);
  135. // mtmd_input_chunk
  136. //
  137. // the instance will be constructed via mtmd_tokenize()
  138. // it will be freed along with mtmd_input_chunks
  139. MTMD_API enum mtmd_input_chunk_type mtmd_input_chunk_get_type (const mtmd_input_chunk * chunk);
  140. MTMD_API const llama_token * mtmd_input_chunk_get_tokens_text (const mtmd_input_chunk * chunk, size_t * n_tokens_output);
  141. MTMD_API const mtmd_image_tokens * mtmd_input_chunk_get_tokens_image(const mtmd_input_chunk * chunk);
  142. MTMD_API size_t mtmd_input_chunk_get_n_tokens (const mtmd_input_chunk * chunk);
  143. // returns nullptr for ID on text chunk
  144. MTMD_API const char * mtmd_input_chunk_get_id (const mtmd_input_chunk * chunk);
  145. // number of temporal positions (equals to max(t,h,w) for M-RoPE; equals to n_tokens otherwise)
  146. MTMD_API llama_pos mtmd_input_chunk_get_n_pos (const mtmd_input_chunk * chunk);
  147. // in case you want to use custom logic to handle the chunk (i.e. KV cache management)
  148. // you can move the chunk ownership to your own code by copying it
  149. // remember to free the chunk when you are done with it
  150. MTMD_API mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk);
  151. MTMD_API void mtmd_input_chunk_free(mtmd_input_chunk * chunk);
  152. // mtmd_image_tokens
  153. //
  154. // the instance will be constructed via mtmd_tokenize()
  155. // it will be freed along with mtmd_input_chunk
  156. MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens); // TODO: deprecate
  157. MTMD_API size_t mtmd_image_tokens_get_nx (const mtmd_image_tokens * image_tokens);
  158. MTMD_API size_t mtmd_image_tokens_get_ny (const mtmd_image_tokens * image_tokens);
  159. MTMD_API const char * mtmd_image_tokens_get_id (const mtmd_image_tokens * image_tokens); // TODO: deprecate
  160. // number of temporal positions (equals to max(t,h,w) for M-RoPE; equals to n_tokens otherwise)
  161. MTMD_API llama_pos mtmd_image_tokens_get_n_pos (const mtmd_image_tokens * image_tokens); // TODO: deprecate
  162. // tokenize an input text prompt and a list of bitmaps (images/audio)
  163. // the prompt must have the input image marker (default: "<__media__>") in it
  164. // the default marker is defined by mtmd_default_marker()
  165. // the marker will be replaced with the image/audio chunk
  166. // for example:
  167. // "here is an image: <__media__>\ndescribe it in detail."
  168. // this will gives 3 chunks:
  169. // 1. "here is an image: <start_of_image>"
  170. // 2. (image/audio tokens)
  171. // 3. "<end_of_image>\ndescribe it in detail."
  172. // number of bitmaps must be equal to the number of markers in the prompt
  173. // this function is thread-safe (shared ctx)
  174. // return values:
  175. // 0 on success
  176. // 1 on number of bitmaps not matching the number of markers
  177. // 2 on image preprocessing error
  178. MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx,
  179. mtmd_input_chunks * output,
  180. const mtmd_input_text * text,
  181. const mtmd_bitmap ** bitmaps,
  182. size_t n_bitmaps);
  183. // returns 0 on success
  184. // TODO: deprecate
  185. MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
  186. const mtmd_image_tokens * image_tokens);
  187. // returns 0 on success
  188. MTMD_API int32_t mtmd_encode_chunk(mtmd_context * ctx,
  189. const mtmd_input_chunk * chunk);
  190. // get output embeddings from the last encode pass
  191. // the reading size (in bytes) is equal to:
  192. // llama_model_n_embd(model) * mtmd_input_chunk_get_n_tokens(chunk) * sizeof(float)
  193. MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
  194. // Set callback for all future logging events.
  195. // If this is not called, or NULL is supplied, everything is output on stderr.
  196. MTMD_API void mtmd_log_set(ggml_log_callback log_callback, void * user_data);
  197. /////////////////////////////////////////
  198. // test function, to be used in test-mtmd-c-api.c
  199. MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void);
  200. #ifdef __cplusplus
  201. } // extern "C"
  202. #endif
  203. //
  204. // C++ wrappers
  205. //
  206. #ifdef __cplusplus
  207. namespace mtmd {
  208. struct mtmd_context_deleter {
  209. void operator()(mtmd_context * val) { mtmd_free(val); }
  210. };
  211. using context_ptr = std::unique_ptr<mtmd_context, mtmd_context_deleter>;
  212. struct mtmd_bitmap_deleter {
  213. void operator()(mtmd_bitmap * val) { mtmd_bitmap_free(val); }
  214. };
  215. using bitmap_ptr = std::unique_ptr<mtmd_bitmap, mtmd_bitmap_deleter>;
  216. struct mtmd_input_chunks_deleter {
  217. void operator()(mtmd_input_chunks * val) { mtmd_input_chunks_free(val); }
  218. };
  219. using input_chunks_ptr = std::unique_ptr<mtmd_input_chunks, mtmd_input_chunks_deleter>;
  220. struct mtmd_input_chunk_deleter {
  221. void operator()(mtmd_input_chunk * val) { mtmd_input_chunk_free(val); }
  222. };
  223. using input_chunk_ptr = std::unique_ptr<mtmd_input_chunk, mtmd_input_chunk_deleter>;
  224. struct bitmap {
  225. bitmap_ptr ptr;
  226. bitmap() : ptr(nullptr) {}
  227. bitmap(mtmd_bitmap * bitmap) : ptr(bitmap) {}
  228. bitmap(bitmap && other) noexcept : ptr(std::move(other.ptr)) {}
  229. bitmap(uint32_t nx, uint32_t ny, const unsigned char * data) {
  230. ptr.reset(mtmd_bitmap_init(nx, ny, data));
  231. }
  232. ~bitmap() = default;
  233. uint32_t nx() const { return mtmd_bitmap_get_nx(ptr.get()); }
  234. uint32_t ny() const { return mtmd_bitmap_get_ny(ptr.get()); }
  235. const unsigned char * data() const { return mtmd_bitmap_get_data(ptr.get()); }
  236. size_t n_bytes() const { return mtmd_bitmap_get_n_bytes(ptr.get()); }
  237. std::string id() const { return mtmd_bitmap_get_id(ptr.get()); }
  238. void set_id(const char * id) const { mtmd_bitmap_set_id(ptr.get(), id); }
  239. };
  240. struct bitmaps {
  241. std::vector<bitmap> entries;
  242. ~bitmaps() = default;
  243. // return list of pointers to mtmd_bitmap
  244. // example:
  245. // auto bitmaps_c_ptr = bitmaps.c_ptr();
  246. // int32_t res = mtmd_tokenize(... bitmaps_c_ptr.data(), bitmaps_c_ptr.size());
  247. std::vector<const mtmd_bitmap *> c_ptr() {
  248. std::vector<const mtmd_bitmap *> res(entries.size());
  249. for (size_t i = 0; i < entries.size(); i++) {
  250. res[i] = entries[i].ptr.get();
  251. }
  252. return res;
  253. }
  254. };
  255. struct input_chunks {
  256. input_chunks_ptr ptr;
  257. input_chunks() = default;
  258. input_chunks(mtmd_input_chunks * chunks) : ptr(chunks) {}
  259. ~input_chunks() = default;
  260. size_t size() const { return mtmd_input_chunks_size(ptr.get()); }
  261. const mtmd_input_chunk * operator[](size_t idx) const {
  262. return mtmd_input_chunks_get(ptr.get(), idx);
  263. }
  264. };
  265. } // namespace mtmd
  266. #endif
  267. #endif