mtmd.h 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364
  1. #ifndef MTMD_H
  2. #define MTMD_H
  3. #include "ggml.h"
  4. #include "llama.h"
  5. #include "clip.h"
  6. #include <stddef.h>
  7. #include <stdint.h>
  8. #include <stdbool.h>
  9. #ifdef __cplusplus
  10. #include <string>
  11. #include <vector>
  12. #include <cinttypes>
  13. #include <memory>
  14. #endif
  15. /**
  16. * libmtmd: A library for multimodal support in llama.cpp.
  17. *
  18. * WARNING: This API is experimental and subject to many BREAKING CHANGES.
  19. * Issues related to API usage may receive lower priority support.
  20. *
  21. * For the usage, see an example in mtmd-cli.cpp
  22. */
  // MTMD_API decorates every public function declared in this header.
  // With LLAMA_SHARED defined it expands to __declspec(dllexport) (LLAMA_BUILD defined) or
  // __declspec(dllimport) on _WIN32 (excluding MinGW), and to default symbol visibility elsewhere.
  // Without LLAMA_SHARED it expands to nothing.
  23. #ifdef LLAMA_SHARED
  24. # if defined(_WIN32) && !defined(__MINGW32__)
  25. # ifdef LLAMA_BUILD
  26. # define MTMD_API __declspec(dllexport)
  27. # else
  28. # define MTMD_API __declspec(dllimport)
  29. # endif
  30. # else
  31. # define MTMD_API __attribute__ ((visibility ("default")))
  32. # endif
  33. #else
  34. # define MTMD_API
  35. #endif
  36. // deprecated marker, use mtmd_default_marker() instead
  // NOTE(review): the mtmd_tokenize() documentation in this header quotes "<__media__>" as the default
  // marker, which differs from this string; do not assume the two are interchangeable
  37. #define MTMD_DEFAULT_IMAGE_MARKER "<__image__>"
  38. #ifdef __cplusplus
  39. extern "C" {
  40. #endif
  // kind of content held by a chunk, as reported by mtmd_input_chunk_get_type()
  // (values are spelled out explicitly; they are the same ones the implicit numbering produces)
  enum mtmd_input_chunk_type {
      MTMD_INPUT_CHUNK_TYPE_TEXT  = 0,
      MTMD_INPUT_CHUNK_TYPE_IMAGE = 1,
      MTMD_INPUT_CHUNK_TYPE_AUDIO = 2,
  };
  46. // opaque types: only forward-declared in this header, so callers use them through pointers
  47. struct mtmd_context;
  48. struct mtmd_bitmap;
  49. struct mtmd_image_tokens;
  50. struct mtmd_input_chunk;
  51. struct mtmd_input_chunks;
  // text part of the input passed to mtmd_tokenize()
  52. struct mtmd_input_text {
  53. const char * text; // the prompt text
  54. bool add_special; // presumably forwarded to the text tokenizer - TODO confirm
  55. bool parse_special; // presumably forwarded to the text tokenizer - TODO confirm
  56. };
  57. //
  58. // C API
  59. //
  // typedefs so the types above can be named without the `struct` keyword
  60. typedef struct mtmd_context mtmd_context;
  61. typedef struct mtmd_bitmap mtmd_bitmap;
  62. typedef struct mtmd_image_tokens mtmd_image_tokens;
  63. typedef struct mtmd_input_chunk mtmd_input_chunk;
  64. typedef struct mtmd_input_chunks mtmd_input_chunks;
  65. typedef struct mtmd_input_text mtmd_input_text;
  // parameters taken by mtmd_init_from_file(); mtmd_context_params_default() returns an instance by value
  66. struct mtmd_context_params {
  67. bool use_gpu;
  68. bool print_timings;
  69. int n_threads;
  70. enum ggml_log_level verbosity;
  71. const char * image_marker; // deprecated, use media_marker instead
  72. const char * media_marker; // NOTE(review): presumably the marker that mtmd_tokenize() looks for in the prompt - confirm
  73. };
  // the default marker string (see the mtmd_tokenize() documentation)
  74. MTMD_API const char * mtmd_default_marker(void);
  // NOTE(review): presumably returns a struct filled with default settings; the values are not visible in this header
  75. MTMD_API struct mtmd_context_params mtmd_context_params_default(void);
  76. // initialize the mtmd context
  77. // returns nullptr on failure
  // NOTE(review): mmproj_fname is presumably the path of the multimodal projector file - confirm
  78. MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
  79. const struct llama_model * text_model,
  80. const struct mtmd_context_params ctx_params);
  // releases a context (this is the function the C++ mtmd::context_ptr deleter calls)
  81. MTMD_API void mtmd_free(mtmd_context * ctx);
  82. // whether we need to set a non-causal mask before llama_decode
  83. MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx);
  84. // whether the current model uses M-RoPE for llama_decode
  85. MTMD_API bool mtmd_decode_use_mrope(mtmd_context * ctx);
  86. // whether the current model supports vision input
  87. MTMD_API bool mtmd_support_vision(mtmd_context * ctx);
  88. // whether the current model supports audio input
  89. MTMD_API bool mtmd_support_audio(mtmd_context * ctx);
  90. // mtmd_bitmap
  91. //
  92. // if the bitmap is an image:
  93. //   the length of data must be nx * ny * 3
  94. //   the data is in RGBRGBRGB... format
  95. // if the bitmap is audio:
  96. //   the length of data must be n_samples * sizeof(float)
  97. //   the data is in float format (PCM F32)
  98. MTMD_API mtmd_bitmap * mtmd_bitmap_init (uint32_t nx, uint32_t ny, const unsigned char * data);
  99. MTMD_API mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples, const float * data);
  100. MTMD_API uint32_t mtmd_bitmap_get_nx (const mtmd_bitmap * bitmap);
  101. MTMD_API uint32_t mtmd_bitmap_get_ny (const mtmd_bitmap * bitmap);
  102. MTMD_API const unsigned char * mtmd_bitmap_get_data (const mtmd_bitmap * bitmap);
  103. MTMD_API size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap);
  104. MTMD_API bool mtmd_bitmap_is_audio (const mtmd_bitmap * bitmap);
  // releases a bitmap (this is the function the C++ mtmd::bitmap_ptr deleter calls)
  105. MTMD_API void mtmd_bitmap_free (mtmd_bitmap * bitmap);
  106. // the bitmap ID is optional, but useful for KV cache tracking
  107. // the ID getter/setter are dedicated functions, so you can, for example, compute a hash of the image from mtmd_bitmap_get_data() and store it as the ID
  108. MTMD_API const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap);
  109. MTMD_API void mtmd_bitmap_set_id(mtmd_bitmap * bitmap, const char * id);
  110. // mtmd_input_chunks
  111. //
  112. // this is simply a list of mtmd_input_chunk
  113. // the elements can only be populated via mtmd_tokenize()
  114. MTMD_API mtmd_input_chunks * mtmd_input_chunks_init(void);
  115. MTMD_API size_t mtmd_input_chunks_size(const mtmd_input_chunks * chunks);
  116. MTMD_API const mtmd_input_chunk * mtmd_input_chunks_get (const mtmd_input_chunks * chunks, size_t idx);
  // releases the list (this is the function the C++ mtmd::input_chunks_ptr deleter calls)
  117. MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks);
  118. // mtmd_input_chunk
  119. //
  120. // the instance will be constructed via mtmd_tokenize()
  121. // it will be freed along with mtmd_input_chunks
  122. MTMD_API enum mtmd_input_chunk_type mtmd_input_chunk_get_type (const mtmd_input_chunk * chunk);
  // NOTE(review): n_tokens_output presumably receives the length of the returned token array - confirm
  123. MTMD_API const llama_token * mtmd_input_chunk_get_tokens_text (const mtmd_input_chunk * chunk, size_t * n_tokens_output);
  124. MTMD_API const mtmd_image_tokens * mtmd_input_chunk_get_tokens_image(const mtmd_input_chunk * chunk);
  125. MTMD_API size_t mtmd_input_chunk_get_n_tokens (const mtmd_input_chunk * chunk);
  126. // returns nullptr for ID on text chunk
  127. MTMD_API const char * mtmd_input_chunk_get_id (const mtmd_input_chunk * chunk);
  128. // number of temporal positions (always 1 for M-RoPE, n_tokens otherwise)
  129. MTMD_API llama_pos mtmd_input_chunk_get_n_pos (const mtmd_input_chunk * chunk);
  130. // in case you want to use custom logic to handle the chunk (e.g. KV cache management)
  131. // you can move the chunk ownership to your own code by copying it
  132. // remember to free the copy with mtmd_input_chunk_free() when you are done with it
  133. MTMD_API mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk);
  134. MTMD_API void mtmd_input_chunk_free(mtmd_input_chunk * chunk);
  135. // mtmd_image_tokens
  136. //
  137. // the instance will be constructed via mtmd_tokenize()
  138. // it will be freed along with mtmd_input_chunk
  // several getters below are marked "TODO: deprecate"; mtmd_input_chunk_get_n_tokens(),
  // mtmd_input_chunk_get_id() and mtmd_input_chunk_get_n_pos() are the chunk-level functions with matching names
  139. MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens); // TODO: deprecate
  140. MTMD_API size_t mtmd_image_tokens_get_nx (const mtmd_image_tokens * image_tokens);
  141. MTMD_API size_t mtmd_image_tokens_get_ny (const mtmd_image_tokens * image_tokens);
  142. MTMD_API const char * mtmd_image_tokens_get_id (const mtmd_image_tokens * image_tokens); // TODO: deprecate
  143. // number of temporal positions (always 1 for M-RoPE, n_tokens otherwise)
  144. MTMD_API llama_pos mtmd_image_tokens_get_n_pos (const mtmd_image_tokens * image_tokens); // TODO: deprecate
  145. // tokenize an input text prompt and a list of bitmaps (images/audio)
  146. // the prompt must contain the media marker (default: "<__media__>")
  147. // the default marker is defined by mtmd_default_marker()
  148. // each marker will be replaced with the corresponding image/audio chunk
  149. // for example:
  150. //   "here is an image: <__media__>\ndescribe it in detail."
  151. // this will give 3 chunks:
  152. //   1. "here is an image: <start_of_image>"
  153. //   2. (image/audio tokens)
  154. //   3. "<end_of_image>\ndescribe it in detail."
  155. // the number of bitmaps must be equal to the number of markers in the prompt
  156. // this function is thread-safe (shared ctx)
  157. // return values:
  158. //   0 on success
  159. //   1 on number of bitmaps not matching the number of markers
  160. //   2 on image preprocessing error
  161. MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx,
  162. mtmd_input_chunks * output,
  163. const mtmd_input_text * text,
  164. const mtmd_bitmap ** bitmaps,
  165. size_t n_bitmaps);
  166. // returns 0 on success
  167. // TODO: deprecate
  168. MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
  169. const mtmd_image_tokens * image_tokens);
  170. // returns 0 on success
  171. MTMD_API int32_t mtmd_encode_chunk(mtmd_context * ctx,
  172. const mtmd_input_chunk * chunk);
  173. // get the output embeddings from the last encode pass
  174. // the readable size (in bytes) is equal to:
  175. //   llama_model_n_embd(model) * mtmd_input_chunk_get_n_tokens(chunk) * sizeof(float)
  176. MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
  177. /////////////////////////////////////////
  178. //
  179. // Helper functions (can be implemented based on other functions)
  180. //
  181. // Please note that these helpers are not guaranteed to be stable.
  182. // BREAKING CHANGES are expected.
  183. //
  184. // helper function to construct a mtmd_bitmap from a file
  185. // it calls mtmd_helper_bitmap_init_from_buf() internally
  186. // returns nullptr on failure
  187. // this function is thread-safe
  188. MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(const char * fname);
  189. // helper function to construct a mtmd_bitmap from a buffer containing a file
  190. // supported formats:
  191. //   image: formats supported by stb_image: jpg, png, bmp, gif, etc.
  192. //   audio: formats supported by miniaudio: wav, mp3, flac
  193. // note: audio files will be auto-detected based on magic bytes
  194. // returns nullptr on failure
  195. // this function is thread-safe
  196. MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len);
  197. // helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
  198. MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks);
  199. // helper to count the total number of positions from a list of chunks, useful to keep track of n_past
  200. // normally, n_pos is equal to n_tokens, but for M-RoPE it is different
  201. MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks);
  202. // helper function that automatically:
  203. //   1. runs llama_decode() on text chunks
  204. //   2. runs mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
  205. // if any of the mtmd_encode() or llama_decode() calls returns non-zero, stop and forward the error
  206. // otherwise, returns 0 on success
  207. // this function is NOT thread-safe
  // NOTE(review): new_n_past presumably receives the updated n_past after evaluation - confirm
  208. MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
  209. struct llama_context * lctx,
  210. const mtmd_input_chunks * chunks,
  211. llama_pos n_past,
  212. llama_seq_id seq_id,
  213. int32_t n_batch,
  214. bool logits_last,
  215. llama_pos * new_n_past);
  216. // works like mtmd_helper_eval_chunks(), but only for a single chunk
  217. // this function is NOT thread-safe
  218. MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
  219. struct llama_context * lctx,
  220. const mtmd_input_chunk * chunk,
  221. llama_pos n_past,
  222. llama_seq_id seq_id,
  223. int32_t n_batch,
  224. bool logits_last,
  225. llama_pos * new_n_past);
  226. // helper function to decode an image whose embeddings have already been calculated
  227. // this helper will handle batching and pre/post decoding setup (e.g. gemma 3 requires non-causal attention)
  228. // returns 0 on success, -1 if the chunk is not a valid image chunk, 1 on decode failure
  229. MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx,
  230. struct llama_context * lctx,
  231. const mtmd_input_chunk * chunk,
  232. float * encoded_embd,
  233. llama_pos n_past,
  234. llama_seq_id seq_id,
  235. int32_t n_batch,
  236. llama_pos * new_n_past);
  237. /////////////////////////////////////////
  238. // test function, to be used in test-mtmd-c-api.c
  // NOTE(review): the returned list presumably has to be released with mtmd_input_chunks_free() - confirm
  239. MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void);
  240. #ifdef __cplusplus
  241. } // extern "C"
  242. #endif
  243. //
  244. // C++ wrappers
  245. //
  246. #ifdef __cplusplus
  247. namespace mtmd {
  248. struct mtmd_context_deleter {
  249. void operator()(mtmd_context * val) { mtmd_free(val); }
  250. };
  251. using context_ptr = std::unique_ptr<mtmd_context, mtmd_context_deleter>;
  252. struct mtmd_bitmap_deleter {
  253. void operator()(mtmd_bitmap * val) { mtmd_bitmap_free(val); }
  254. };
  255. using bitmap_ptr = std::unique_ptr<mtmd_bitmap, mtmd_bitmap_deleter>;
  256. struct mtmd_input_chunks_deleter {
  257. void operator()(mtmd_input_chunks * val) { mtmd_input_chunks_free(val); }
  258. };
  259. using input_chunks_ptr = std::unique_ptr<mtmd_input_chunks, mtmd_input_chunks_deleter>;
  260. struct mtmd_input_chunk_deleter {
  261. void operator()(mtmd_input_chunk * val) { mtmd_input_chunk_free(val); }
  262. };
  263. using input_chunk_ptr = std::unique_ptr<mtmd_input_chunk, mtmd_input_chunk_deleter>;
  264. struct bitmap {
  265. bitmap_ptr ptr;
  266. bitmap() : ptr(nullptr) {}
  267. bitmap(mtmd_bitmap * bitmap) : ptr(bitmap) {}
  268. bitmap(bitmap && other) noexcept : ptr(std::move(other.ptr)) {}
  269. bitmap(uint32_t nx, uint32_t ny, const unsigned char * data) {
  270. ptr.reset(mtmd_bitmap_init(nx, ny, data));
  271. }
  272. ~bitmap() = default;
  273. uint32_t nx() { return mtmd_bitmap_get_nx(ptr.get()); }
  274. uint32_t ny() { return mtmd_bitmap_get_ny(ptr.get()); }
  275. const unsigned char * data() { return mtmd_bitmap_get_data(ptr.get()); }
  276. size_t n_bytes() { return mtmd_bitmap_get_n_bytes(ptr.get()); }
  277. std::string id() { return mtmd_bitmap_get_id(ptr.get()); }
  278. void set_id(const char * id) { mtmd_bitmap_set_id(ptr.get(), id); }
  279. };
  280. struct bitmaps {
  281. std::vector<bitmap> entries;
  282. ~bitmaps() = default;
  283. // return list of pointers to mtmd_bitmap
  284. // example:
  285. // auto bitmaps_c_ptr = bitmaps.c_ptr();
  286. // int32_t res = mtmd_tokenize(... bitmaps_c_ptr.data(), bitmaps_c_ptr.size());
  287. std::vector<const mtmd_bitmap *> c_ptr() {
  288. std::vector<const mtmd_bitmap *> res(entries.size());
  289. for (size_t i = 0; i < entries.size(); i++) {
  290. res[i] = entries[i].ptr.get();
  291. }
  292. return res;
  293. }
  294. };
  295. struct input_chunks {
  296. input_chunks_ptr ptr;
  297. input_chunks() = default;
  298. input_chunks(mtmd_input_chunks * chunks) : ptr(chunks) {}
  299. ~input_chunks() = default;
  300. size_t size() { return mtmd_input_chunks_size(ptr.get()); }
  301. const mtmd_input_chunk * operator[](size_t idx) {
  302. return mtmd_input_chunks_get(ptr.get(), idx);
  303. }
  304. };
  305. } // namespace mtmd
  306. #endif
  307. #endif