#include "clip.h"
#include "clip-impl.h"
#include "mtmd.h"
#include "mtmd-audio.h"

#include "llama.h"

// fix problem with std::min and std::max
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#    define NOMINMAX
#endif
#include <windows.h>
#endif

#include <algorithm>
#include <cerrno>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <limits>
#include <vector>

// represents raw image data, layout is RGBRGBRGB...
// length of data must be nx * ny * 3
struct mtmd_bitmap {
    uint32_t nx;
    uint32_t ny;
    std::vector<unsigned char> data;
    std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking
    bool is_audio = false; // true if the bitmap is audio
};
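
// A minimal sketch of how the RGBRGBRGB... layout above is addressed; the pixel
// at (x, y) starts at byte offset 3 * (y * nx + x). get_channel is a hypothetical
// helper, not part of this file:
//
//   static unsigned char get_channel(const mtmd_bitmap & bmp,
//                                    uint32_t x, uint32_t y, int c /* 0=R, 1=G, 2=B */) {
//       return bmp.data[3 * (y * bmp.nx + x) + c];
//   }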
struct mtmd_image_tokens {
    uint32_t nx; // number of tokens in x direction
    uint32_t ny; // number of tokens in y direction
    bool use_mrope_pos = false; // use M-RoPE position counting (the whole image is 1 temporal position)
    uint32_t n_tokens() const { return nx * ny; }
    clip_image_f32_batch batch_f32; // preprocessed image patches
    std::string id; // optional user-defined ID, useful for KV cache tracking

    mtmd_image_tokens clone() {
        return mtmd_image_tokens{
            nx,
            ny,
            use_mrope_pos,
            batch_f32.clone(),
            id
        };
    }
};

using mtmd_image_tokens_ptr = std::unique_ptr<mtmd_image_tokens>;

struct mtmd_audio_tokens {
    uint32_t n_tokens; // number of tokens
    clip_image_f32_batch batch_f32; // preprocessed audio chunks (mel spectrograms)
    std::string id; // optional user-defined ID, useful for KV cache tracking

    mtmd_audio_tokens clone() {
        return mtmd_audio_tokens{
            n_tokens,
            batch_f32.clone(),
            id
        };
    }
};

using mtmd_audio_tokens_ptr = std::unique_ptr<mtmd_audio_tokens>;

struct mtmd_input_chunk {
    mtmd_input_chunk_type type;
    std::vector<llama_token> tokens_text;
    mtmd_image_tokens_ptr tokens_image;
    mtmd_audio_tokens_ptr tokens_audio;
};

struct mtmd_input_chunks {
    std::vector<mtmd_input_chunk> entries;
};

// slice template, used by some llava-uhd models to correctly place the special tokens around image embeddings
// models not having it (llava-1.6) will process embeddings without any special tokens in-between
enum mtmd_slice_tmpl {
    MTMD_SLICE_TMPL_NONE,
    MTMD_SLICE_TMPL_MINICPMV_2_5,
    MTMD_SLICE_TMPL_MINICPMV_2_6,
    MTMD_SLICE_TMPL_LLAMA4,
    MTMD_SLICE_TMPL_IDEFICS3,
};

const char * mtmd_default_marker() {
    return "<__media__>";
}

mtmd_context_params mtmd_context_params_default() {
    mtmd_context_params params;
    params.use_gpu       = true;
    params.print_timings = true;
    params.n_threads     = 4;
    params.verbosity     = GGML_LOG_LEVEL_INFO;
    params.image_marker  = MTMD_DEFAULT_IMAGE_MARKER;
    params.media_marker  = mtmd_default_marker();
    return params;
}

struct mtmd_context {
    struct clip_ctx * ctx_v; // vision
    struct clip_ctx * ctx_a; // audio

    const struct llama_model * text_model;
    std::vector<float> image_embd_v; // image embedding vector

    bool print_timings;
    int n_threads;
    std::string media_marker;
    const int n_embd_text;
    // these are not tokens, but strings used to mark the beginning and end of image/audio embeddings
    std::string img_beg;
    std::string img_end;
    std::string aud_beg;
    std::string aud_end;

    // for llava-uhd style models, we need special tokens in-between slices
    // minicpmv calls them "slices", llama 4 calls them "tiles"
    mtmd_slice_tmpl slice_tmpl = MTMD_SLICE_TMPL_NONE;
    std::vector<llama_token> tok_ov_img_start;  // overview image
    std::vector<llama_token> tok_ov_img_end;    // overview image
    std::vector<llama_token> tok_slices_start;  // start of all slices
    std::vector<llama_token> tok_slices_end;    // end of all slices
    std::vector<llama_token> tok_sli_img_start; // single slice start
    std::vector<llama_token> tok_sli_img_end;   // single slice end
    std::vector<llama_token> tok_sli_img_mid;   // between 2 slices
    std::vector<llama_token> tok_row_end;       // end of row
    bool tok_row_end_trail = false;
    bool ov_img_first = false;

    bool use_mrope = false; // for Qwen2VL, we need to use M-RoPE

    // string template for slice image delimiters with row/col (idefics3)
    std::string sli_img_start_tmpl;

    // for whisper, we pre-calculate the mel filter bank
    whisper_preprocessor::whisper_filters w_filters;

    // TODO @ngxson : add timings

    mtmd_context(const char * mmproj_fname,
                 const llama_model * text_model,
                 const mtmd_context_params & ctx_params) :
        text_model   (text_model),
        print_timings(ctx_params.print_timings),
        n_threads    (ctx_params.n_threads),
        media_marker (ctx_params.media_marker),
        n_embd_text  (llama_model_n_embd(text_model))
    {
        if (std::string(ctx_params.image_marker) != MTMD_DEFAULT_IMAGE_MARKER) {
            throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead");
        }
        if (media_marker.empty()) {
            throw std::runtime_error("media_marker must not be empty");
        }

        clip_context_params ctx_clip_params;
        ctx_clip_params.use_gpu   = ctx_params.use_gpu;
        ctx_clip_params.verbosity = ctx_params.verbosity;
        auto res = clip_init(mmproj_fname, ctx_clip_params);
        ctx_v = res.ctx_v;
        ctx_a = res.ctx_a;
        if (!ctx_v && !ctx_a) {
            throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
        }

        // if both vision and audio mmproj are present, we need to validate their n_embd
        if (ctx_v && ctx_a) {
            int n_embd_v = clip_n_mmproj_embd(ctx_v);
            int n_embd_a = clip_n_mmproj_embd(ctx_a);
            if (n_embd_v != n_embd_a) {
                throw std::runtime_error(string_format(
                    "mismatch between vision and audio mmproj (n_embd_v = %d, n_embd_a = %d)\n",
                    n_embd_v, n_embd_a));
            }
        }

        // since we already validate n_embd of vision and audio mmproj,
        // we can safely assume that they are the same
        int n_embd_clip = clip_n_mmproj_embd(ctx_v ? ctx_v : ctx_a);
        if (n_embd_text != n_embd_clip) {
            throw std::runtime_error(string_format(
                "mismatch between text model (n_embd = %d) and mmproj (n_embd = %d)\n"
                "hint: you may be using wrong mmproj\n",
                n_embd_text, n_embd_clip));
        }

        if (ctx_v) {
            init_vision();
        }
        if (ctx_a) {
            init_audio();
        }
    }

    void init_vision() {
        GGML_ASSERT(ctx_v != nullptr);
        use_mrope = clip_is_qwen2vl(ctx_v);

        projector_type proj = clip_get_projector_type(ctx_v);
        int minicpmv_version = clip_is_minicpmv(ctx_v);
        if (minicpmv_version == 2) {
            // minicpmv 2.5 format:
            // <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
            slice_tmpl        = MTMD_SLICE_TMPL_MINICPMV_2_5;
            tok_ov_img_start  = {lookup_token("<image>")};
            tok_ov_img_end    = {lookup_token("</image>")};
            tok_slices_start  = {lookup_token("<slice>")};
            tok_slices_end    = {lookup_token("</slice>")};
            tok_sli_img_start = tok_ov_img_start;
            tok_sli_img_end   = tok_ov_img_end;
            tok_row_end       = {lookup_token("\n")};
            tok_row_end_trail = false; // no trailing end-of-row token
            ov_img_first      = true;

        } else if (minicpmv_version == 3 || minicpmv_version == 4 || minicpmv_version == 5 || minicpmv_version == 6) {
            // minicpmv 2.6 format:
            // <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
            slice_tmpl        = MTMD_SLICE_TMPL_MINICPMV_2_6;
            tok_ov_img_start  = {lookup_token("<image>")};
            tok_ov_img_end    = {lookup_token("</image>")};
            tok_sli_img_start = {lookup_token("<slice>")};
            tok_sli_img_end   = {lookup_token("</slice>")};
            tok_row_end       = {lookup_token("\n")};
            tok_row_end_trail = false; // no trailing end-of-row token
            ov_img_first      = true;

        } else if (minicpmv_version != 0) {
            GGML_ASSERT(false && "unsupported minicpmv version");

        } else if (proj == PROJECTOR_TYPE_LLAMA4) {
            // llama 4 format:
            // <|image_start|>
            //     (slice) <|tile_x_separator|> (slice) <|tile_x_separator|> ... <|tile_y_separator|>
            //     (slice) <|tile_x_separator|> (slice) <|tile_x_separator|> ... <|tile_y_separator|>
            //     ... <|tile_y_separator|>   <-- trailing end-of-row token
            //     <|image|> (overview)       <-- overview image is last
            // <|image_end|>
            slice_tmpl        = MTMD_SLICE_TMPL_LLAMA4;
            tok_ov_img_start  = {lookup_token("<|image|>")};
            tok_sli_img_mid   = {lookup_token("<|tile_x_separator|>")};
            tok_row_end       = {lookup_token("<|tile_y_separator|>")};
            tok_row_end_trail = true;  // add trailing end-of-row token
            ov_img_first      = false; // overview image is last
        }

        // set boi/eoi
        if (proj == PROJECTOR_TYPE_GEMMA3) {
            // <start_of_image> ... (image embeddings) ... <end_of_image>
            img_beg = "<start_of_image>";
            img_end = "<end_of_image>";

        } else if (proj == PROJECTOR_TYPE_IDEFICS3) {
            // https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
            slice_tmpl         = MTMD_SLICE_TMPL_IDEFICS3;
            tok_ov_img_start   = {lookup_token("\n\n"), lookup_token("<fake_token_around_image>"), lookup_token("<global-img>")};
            tok_ov_img_end     = {lookup_token("<fake_token_around_image>")};
            tok_row_end        = {lookup_token("\n")};
            sli_img_start_tmpl = "<fake_token_around_image><row_%d_col_%d>";

        } else if (proj == PROJECTOR_TYPE_PIXTRAL) {
            // https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
            img_end = "[IMG_END]";

        } else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL || proj == PROJECTOR_TYPE_QWEN3VL) {
            // <|vision_start|> ... (image embeddings) ... <|vision_end|>
            img_beg = "<|vision_start|>";
            img_end = "<|vision_end|>";

        } else if (proj == PROJECTOR_TYPE_LLAMA4) {
            // (more details in mtmd_context constructor)
            img_beg = "<|image_start|>";
            img_end = "<|image_end|>";
            LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n"
                    "    https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);

        } else if (proj == PROJECTOR_TYPE_INTERNVL) {
            // <img> ... (image embeddings) ... </img>
            img_beg = "<img>";
            img_end = "</img>";

        } else if (proj == PROJECTOR_TYPE_LIGHTONOCR) {
            // <|im_start|> ... (image embeddings) ... <|im_end|>
            img_beg = "<|im_start|>";
            img_end = "<|im_end|>";
        }
    }

    void init_audio() {
        GGML_ASSERT(ctx_a != nullptr);
        projector_type proj = clip_get_projector_type(ctx_a);

        if (clip_has_whisper_encoder(ctx_a)) {
            // TODO @ngxson : check if model n_mel is 128 or 80
            w_filters = whisper_precalc_filters::get_128_bins();
        }

        LOG_WRN("%s: audio input is in experimental stage and may have reduced quality:\n"
                "    https://github.com/ggml-org/llama.cpp/discussions/13759\n", __func__);

        if (proj == PROJECTOR_TYPE_QWEN2A) {
            // <|audio_bos|> ... (embeddings) ... <|audio_eos|>
            aud_beg = "<|audio_bos|>";
            aud_end = "<|audio_eos|>";

        } else if (proj == PROJECTOR_TYPE_ULTRAVOX) {
            // [BEGIN_AUDIO] ... (embeddings) ...
            aud_beg = "[BEGIN_AUDIO]";
        }
    }

    // get clip ctx based on chunk type
    clip_ctx * get_clip_ctx(const mtmd_input_chunk * chunk) const {
        if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
            return ctx_v;
        } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
            return ctx_a;
        }
        GGML_ABORT("unknown chunk type");
    }

    projector_type proj_type_v() const {
        return ctx_v ? clip_get_projector_type(ctx_v) : PROJECTOR_TYPE_UNKNOWN;
    }

    projector_type proj_type_a() const {
        return ctx_a ? clip_get_projector_type(ctx_a) : PROJECTOR_TYPE_UNKNOWN;
    }

    ~mtmd_context() {
        clip_free(ctx_a);
        clip_free(ctx_v);
    }

private:
    llama_token lookup_token(const std::string & token_text) {
        const llama_vocab * vocab = llama_model_get_vocab(text_model);
        const int n_vocab = llama_vocab_n_tokens(vocab);
        for (int i = 0; i < n_vocab; i++) {
            if (token_to_piece(vocab, i, true) == token_text) {
                return i;
            }
        }
        return LLAMA_TOKEN_NULL;
    }

    std::string token_to_piece(const llama_vocab * vocab, llama_token token, bool special) {
        std::string piece;
        piece.resize(piece.capacity()); // using string internal cache (SSO), typically 15 bytes + '\0'
        const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
        if (n_chars < 0) {
            piece.resize(-n_chars);
            int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
            GGML_ASSERT(check == -n_chars);
        } else {
            piece.resize(n_chars);
        }
        return piece;
    }
};

mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
                                   const struct llama_model * text_model,
                                   const struct mtmd_context_params ctx_params) {
    try {
        return new mtmd_context(mmproj_fname, text_model, ctx_params);
    } catch (const std::exception & e) {
        LOG_ERR("%s: error: %s\n", __func__, e.what());
        return nullptr;
    }
}

void mtmd_free(mtmd_context * ctx) {
    if (ctx) {
        delete ctx;
    }
}
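
// A minimal sketch of the context lifecycle, assuming a llama_model is already
// loaded and a matching mmproj file is on disk ("mmproj.gguf" is a placeholder):
//
//   mtmd_context_params params = mtmd_context_params_default();
//   params.n_threads = 8; // override any default as needed
//   mtmd_context * mctx = mtmd_init_from_file("mmproj.gguf", text_model, params);
//   if (!mctx) { /* load failed, error already logged */ }
//   ...
//   mtmd_free(mctx);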
struct mtmd_tokenizer {
    mtmd_context * ctx;
    std::vector<const mtmd_bitmap *> bitmaps;

    std::string input_text;
    bool add_special;
    bool parse_special;
    const llama_vocab * vocab;

    mtmd_input_chunks cur;

    mtmd_tokenizer(mtmd_context * ctx,
                   const mtmd_input_text * text,
                   const mtmd_bitmap ** bitmaps,
                   size_t n_bitmaps) : ctx(ctx), bitmaps(bitmaps, bitmaps + n_bitmaps) {
        add_special   = text->add_special;
        parse_special = text->parse_special;
        input_text    = text->text;
        vocab         = llama_model_get_vocab(ctx->text_model);

        // for compatibility, we convert image marker to media marker
        string_replace_all(input_text, MTMD_DEFAULT_IMAGE_MARKER, ctx->media_marker);
    }

    int32_t tokenize(mtmd_input_chunks * output) {
        cur.entries.clear();
        std::vector<std::string> parts = split_text(input_text, ctx->media_marker);
        size_t i_bm = 0; // index of the current bitmap

        for (auto & part : parts) {
            if (part == ctx->media_marker) {
                // this is a marker, we should add the next bitmap
                if (i_bm >= bitmaps.size()) {
                    LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n",
                            __func__, bitmaps.size(), parts.size() - 1);
                    return 1;
                }
                const mtmd_bitmap * bitmap = bitmaps[i_bm++];
                int32_t res = add_media(bitmap);
                if (res != 0) {
                    return res;
                }
            } else {
                // this is a text part, we should add it as text
                add_text(part, parse_special);
            }
        }

        if (add_special && llama_vocab_get_add_bos(vocab)) {
            // if first chunk is text, we add BOS token to first text chunk
            // otherwise, create a new text chunk with BOS token
            if (!cur.entries.empty() && cur.entries[0].type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
                // add BOS token to the beginning of first text chunk
                cur.entries[0].tokens_text.insert(cur.entries[0].tokens_text.begin(), llama_vocab_bos(vocab));
            } else {
                // create a new text chunk with BOS token at the beginning
                mtmd_input_chunk bos_chunk{
                    MTMD_INPUT_CHUNK_TYPE_TEXT,
                    {llama_vocab_bos(vocab)},
                    nullptr, // image tokens
                    nullptr, // audio tokens
                };
                cur.entries.insert(cur.entries.begin(), std::move(bos_chunk));
            }
        }

        if (add_special && llama_vocab_get_add_eos(vocab)) {
            // if last chunk is text, we add EOS token to it
            add_text({llama_vocab_eos(vocab)});
        }

        if (i_bm != bitmaps.size()) {
            LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n",
                    __func__, bitmaps.size(), parts.size() - 1);
            return 1;
        }

        *output = std::move(cur);

        return 0;
    }

    void add_text(const std::string & txt, bool parse_special) {
        LOG_DBG("%s: %s\n", __func__, txt.c_str());
        auto tokens = mtmd_tokenize_text_internal(vocab, txt, /* add_special */ false, parse_special);
        add_text(tokens);
    }

    void add_text(const std::vector<llama_token> & tokens) {
        if (tokens.empty()) {
            return;
        }
        // if last entry is also a text chunk, add tokens to it instead of creating new chunk
        if (!cur.entries.empty() && cur.entries.back().type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
            cur.entries.back().tokens_text.insert(
                cur.entries.back().tokens_text.end(),
                tokens.begin(),
                tokens.end());
        } else {
            mtmd_input_chunk chunk{
                MTMD_INPUT_CHUNK_TYPE_TEXT,
                tokens,
                nullptr, // image tokens
                nullptr, // audio tokens
            };
            cur.entries.emplace_back(std::move(chunk));
        }
    }

    int32_t add_media(const mtmd_bitmap * bitmap) {
        if (!bitmap->is_audio) {
            // handle image

            if (!ctx->ctx_v) {
                LOG_ERR("%s: error: model does not support vision input\n", __func__);
                return 2;
            }

            if (!ctx->img_beg.empty()) {
                add_text(ctx->img_beg, true); // add image begin token
            }

            // convert mtmd_bitmap to clip_image_u8
            clip_image_u8_ptr img_u8(clip_image_u8_init());
            img_u8->nx = bitmap->nx;
            img_u8->ny = bitmap->ny;
            img_u8->buf.resize(bitmap->data.size());
            std::memcpy(img_u8->buf.data(), bitmap->data.data(), img_u8->nx * img_u8->ny * 3);

            // preprocess image
            clip_image_f32_batch batch_f32;
            bool ok = clip_image_preprocess(ctx->ctx_v, img_u8.get(), &batch_f32);
            if (!ok) {
                LOG_ERR("Unable to preprocess image\n");
                return 2;
            }

            // handle llava-uhd style preprocessing
            if (
                ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5
                || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
                || ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
                || ctx->slice_tmpl == MTMD_SLICE_TMPL_IDEFICS3
            ) {
                const int n_col = batch_f32.grid_x;
                const int n_row = batch_f32.grid_y;
                // split batch into chunks of single images
                // NOTE: batch_f32 will be invalidated after this call
                auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmap->id);
                GGML_ASSERT(chunks.size() > 0);
                auto ov_chunk = std::move(chunks.front());
                chunks.erase(chunks.begin());

                // add overview image (first)
                if (ctx->ov_img_first) {
                    add_text(ctx->tok_ov_img_start);
                    cur.entries.emplace_back(std::move(ov_chunk));
                    add_text(ctx->tok_ov_img_end);
                }

                // add slices (or tiles)
                if (!chunks.empty()) {
                    GGML_ASSERT((int)chunks.size() == n_row * n_col);
                    add_text(ctx->tok_slices_start);
                    for (int y = 0; y < n_row; y++) {
                        for (int x = 0; x < n_col; x++) {
                            const bool is_last_in_row = (x == n_col - 1);
                            if (!ctx->tok_sli_img_start.empty()) {
                                add_text(ctx->tok_sli_img_start);
                            } else if (!ctx->sli_img_start_tmpl.empty()) {
                                // if using a template string (with row/col placeholders) to precede a slice image
                                const size_t sz = std::snprintf(nullptr, 0, ctx->sli_img_start_tmpl.c_str(), y+1, x+1) + 1;
                                std::unique_ptr<char[]> buf(new char[sz]);
                                std::snprintf(buf.get(), sz, ctx->sli_img_start_tmpl.c_str(), y+1, x+1);
                                add_text(std::string(buf.get(), buf.get() + sz - 1), true);
                            }
                            cur.entries.emplace_back(std::move(chunks[y * n_col + x]));
                            add_text(ctx->tok_sli_img_end);
                            if (!is_last_in_row) {
                                add_text(ctx->tok_sli_img_mid);
                            }
                        }
                        if ((y != n_row - 1 || ctx->tok_row_end_trail)) {
                            add_text(ctx->tok_row_end);
                        }
                    }
                    add_text(ctx->tok_slices_end);
                }

                // add overview image (last)
                if (!ctx->ov_img_first) {
                    add_text(ctx->tok_ov_img_start);
                    cur.entries.emplace_back(std::move(ov_chunk));
                    add_text(ctx->tok_ov_img_end);
                }

            } else {
                size_t n_tokens = 0;
                for (const auto & entry : batch_f32.entries) {
                    n_tokens += clip_n_output_tokens(ctx->ctx_v, entry.get());
                }

                mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
                if (ctx->use_mrope) {
                    // for Qwen2VL, we need this information for M-RoPE decoding positions
                    image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_v, batch_f32.entries[0].get());
                    image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_v, batch_f32.entries[0].get());
                    image_tokens->use_mrope_pos = true;
                } else {
                    // other models, we only need the total number of tokens
                    image_tokens->nx = n_tokens;
                    image_tokens->ny = 1;
                }
                image_tokens->batch_f32 = std::move(batch_f32);
                image_tokens->id = bitmap->id; // optional

                LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx);
                LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny);
                LOG_DBG("batch_f32 size = %d\n", (int)image_tokens->batch_f32.entries.size());

                mtmd_input_chunk chunk{
                    MTMD_INPUT_CHUNK_TYPE_IMAGE,
                    {}, // text tokens
                    std::move(image_tokens),
                    nullptr, // audio tokens
                };
                cur.entries.emplace_back(std::move(chunk));
            }

            if (!ctx->img_end.empty()) {
                add_text(ctx->img_end, true); // add image end token
            }

        } else {
            // handle audio

            if (!ctx->ctx_a) {
                LOG_ERR("%s: error: model does not support audio input\n", __func__);
                return 2;
            }
            if (bitmap->data.size() == 0) {
                LOG_ERR("%s: error: empty audio data\n", __func__);
                return 2;
            }

            if (!ctx->aud_beg.empty()) {
                add_text(ctx->aud_beg, true); // add audio begin token
            }

            // preprocess audio
            GGML_ASSERT(ctx->w_filters.n_mel); // make sure we have filter preloaded
            std::vector<whisper_preprocessor::whisper_mel> mel_spec_chunks;
            const float * samples = (const float *)bitmap->data.data();
            size_t n_samples = bitmap->data.size() / sizeof(float);
            bool ok = whisper_preprocessor::preprocess_audio(samples, n_samples, ctx->w_filters, mel_spec_chunks);
            if (!ok) {
                LOG_ERR("Unable to preprocess audio\n");
                return 2;
            }

            // consider each mel_spec as a separate audio chunk
            // TODO: maybe support batching, but this may come with memory cost
            for (auto & mel_spec : mel_spec_chunks) {
                clip_image_f32_ptr mel_f32(clip_image_f32_init());
                mel_f32->nx  = mel_spec.n_len;
                mel_f32->ny  = mel_spec.n_mel;
                mel_f32->buf = std::move(mel_spec.data);
                size_t n_tokens = clip_n_output_tokens(ctx->ctx_a, mel_f32.get());

                clip_image_f32_batch batch_f32;
                batch_f32.is_audio = true;
                batch_f32.entries.push_back(std::move(mel_f32));

                mtmd_audio_tokens_ptr audio_tokens(new mtmd_audio_tokens);
                audio_tokens->n_tokens  = n_tokens;
                audio_tokens->batch_f32 = std::move(batch_f32);
                audio_tokens->id        = bitmap->id; // optional

                LOG_DBG("audio_tokens->n_tokens = %d\n", audio_tokens->n_tokens);

                mtmd_input_chunk chunk{
                    MTMD_INPUT_CHUNK_TYPE_AUDIO,
                    {}, // text tokens
                    nullptr, // image tokens
                    std::move(audio_tokens),
                };
                cur.entries.emplace_back(std::move(chunk));
            }

            if (!ctx->aud_end.empty()) {
                add_text(ctx->aud_end, true); // add audio end token
            }
        }

        return 0;
    }

    std::vector<mtmd_input_chunk> split_batch_to_chunk(clip_image_f32_batch && batch_f32, const std::string & id) {
        std::vector<mtmd_input_chunk> chunks;

        for (auto & entry : batch_f32.entries) {
            mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
            image_tokens->nx = clip_n_output_tokens(ctx->ctx_v, entry.get());
            image_tokens->ny = 1;
            image_tokens->batch_f32.entries.push_back(std::move(entry));
            image_tokens->id = id;

            mtmd_input_chunk chunk{
                MTMD_INPUT_CHUNK_TYPE_IMAGE,
                {}, // text tokens
                std::move(image_tokens),
                nullptr, // audio tokens
            };
            chunks.emplace_back(std::move(chunk));
        }

        return chunks;
    }
    // for example: "a <__media__> b <__media__> c" --> "a ", "<__media__>", " b ", "<__media__>", " c"
    // (whitespace around the delimiter is preserved in the text parts)
    static std::vector<std::string> split_text(const std::string & input, const std::string & delimiter) {
        std::vector<std::string> result;
        if (input.empty()) {
            return result;
        }
        size_t start = 0;
        size_t pos = 0;
        while ((pos = input.find(delimiter, start)) != std::string::npos) {
            if (pos > start) {
                result.push_back(input.substr(start, pos - start));
            }
            result.push_back(delimiter);
            start = pos + delimiter.length();
        }
        if (start < input.length()) {
            result.push_back(input.substr(start));
        }
        return result;
    }
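
    // Edge cases (a consequence of the pos > start check above): delimiters at the
    // start/end of the input and back-to-back delimiters never produce empty parts,
    // e.g. "<__media__>a<__media__><__media__>" splits into exactly
    // "<__media__>", "a", "<__media__>", "<__media__>".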
    // copied from common_tokenize
    static std::vector<llama_token> mtmd_tokenize_text_internal(
            const struct llama_vocab * vocab,
                   const std::string & text,
                                  bool add_special,
                                  bool parse_special) {
        // upper limit for the number of tokens
        int n_tokens = text.length() + 2 * add_special;
        std::vector<llama_token> result(n_tokens);
        n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
        if (n_tokens < 0) {
            result.resize(-n_tokens);
            int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
            GGML_ASSERT(check == -n_tokens);
        } else {
            result.resize(n_tokens);
        }
        return result;
    }
};
int32_t mtmd_tokenize(mtmd_context * ctx,
                      mtmd_input_chunks * output,
                      const mtmd_input_text * text,
                      const mtmd_bitmap ** bitmaps,
                      size_t n_bitmaps) {
    mtmd_tokenizer tokenizer(ctx, text, bitmaps, n_bitmaps);
    return tokenizer.tokenize(output);
}
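
// A minimal sketch of the tokenize flow, assuming mctx was created as in the
// earlier sketch and rgb_data holds nx*ny*3 bytes (all names are placeholders):
//
//   mtmd_bitmap * bmp = mtmd_bitmap_init(nx, ny, rgb_data);
//   mtmd_input_text text;
//   text.text          = "describe this image: <__media__>";
//   text.add_special   = true;
//   text.parse_special = true;
//   mtmd_input_chunks * chunks = mtmd_input_chunks_init();
//   const mtmd_bitmap * bmps[] = { bmp };
//   int32_t res = mtmd_tokenize(mctx, chunks, &text, bmps, 1);
//   // on success (res == 0), chunks alternates text chunks and media chunks,
//   // one media chunk (or group of slice chunks) per marker in the text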
int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
    if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
        LOG_WRN("mtmd_encode_chunk has no effect for text chunks\n");
        return 0;
    } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
        if (!ctx->ctx_v) {
            LOG_ERR("%s: model does not support vision input\n", __func__);
            return 1;
        }
        return mtmd_encode(ctx, chunk->tokens_image.get());
    } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
        if (!ctx->ctx_a) {
            LOG_ERR("%s: model does not support audio input\n", __func__);
            return 1;
        }
        int n_mmproj_embd = ctx->n_embd_text;
        ctx->image_embd_v.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd);
        bool ok = clip_image_batch_encode(
            ctx->ctx_a,
            ctx->n_threads,
            &chunk->tokens_audio->batch_f32,
            ctx->image_embd_v.data());
        return ok ? 0 : 1;
    }

    LOG_ERR("%s: unknown chunk type %d\n", __func__, (int)chunk->type);
    return 1;
}

int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
    clip_ctx * ctx_clip = ctx->ctx_v;
    if (!ctx_clip) {
        LOG_ERR("%s: this API does not support non-vision input, please use mtmd_encode_chunk instead\n", __func__);
        return 1;
    }
    int n_mmproj_embd = clip_n_mmproj_embd(ctx_clip);
    ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
    bool ok = false;

    if (clip_is_llava(ctx_clip)
            || clip_is_minicpmv(ctx_clip)
            || clip_is_glm(ctx_clip)) {
        // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
        const auto & entries = image_tokens->batch_f32.entries;
        for (size_t i = 0; i < entries.size(); i++) {
            int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get());
            ok = clip_image_encode(
                ctx_clip,
                ctx->n_threads,
                entries[i].get(),
                ctx->image_embd_v.data() + i*n_mmproj_embd*n_tokens_per_image);
        }
    } else {
        ok = clip_image_batch_encode(
            ctx_clip,
            ctx->n_threads,
            &image_tokens->batch_f32,
            ctx->image_embd_v.data());
    }

    return ok ? 0 : 1;
}
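
// A minimal sketch of encoding one non-text chunk and reading back the result,
// assuming chunks was filled by mtmd_tokenize as in the sketch above:
//
//   const mtmd_input_chunk * chunk = mtmd_input_chunks_get(chunks, 1); // e.g. the image chunk
//   if (mtmd_encode_chunk(mctx, chunk) == 0) {
//       // layout: n_tokens rows of n_embd floats, matching the text model's n_embd
//       float * embd     = mtmd_get_output_embd(mctx);
//       size_t  n_tokens = mtmd_input_chunk_get_n_tokens(chunk);
//   }
//
// note: the output buffer is owned by the context and is overwritten by the next
// encode call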
float * mtmd_get_output_embd(mtmd_context * ctx) {
    return ctx->image_embd_v.data();
}

bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
    if (ctx->ctx_v && clip_get_projector_type(ctx->ctx_v) == PROJECTOR_TYPE_GEMMA3) {
        return true;
    }
    return false;
}

bool mtmd_decode_use_mrope(mtmd_context * ctx) {
    return ctx->use_mrope;
}

bool mtmd_support_vision(mtmd_context * ctx) {
    return ctx->ctx_v != nullptr;
}

bool mtmd_support_audio(mtmd_context * ctx) {
    return ctx->ctx_a != nullptr;
}

int mtmd_get_audio_bitrate(mtmd_context * ctx) {
    if (!ctx->ctx_a) {
        return -1;
    }
    // for now, we assume that all audio models use the same sample rate
    return 16000; // 16 kHz (the value is a sample rate in Hz, despite the function name)
}

//
// public API functions
//

// mtmd_bitmap

mtmd_bitmap * mtmd_bitmap_init(uint32_t nx,
                               uint32_t ny,
                               const unsigned char * data) {
    mtmd_bitmap * bitmap = new mtmd_bitmap;
    bitmap->nx = nx;
    bitmap->ny = ny;
    size_t data_size = (size_t)nx * ny * 3;
    bitmap->data.resize(data_size);
    std::memcpy(bitmap->data.data(), data, data_size);
    return bitmap;
}

mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples,
                                          const float * data) {
    mtmd_bitmap * bitmap = new mtmd_bitmap;
    bitmap->nx = n_samples;
    bitmap->ny = 1;
    bitmap->is_audio = true;
    size_t data_size = n_samples * sizeof(float);
    bitmap->data.resize(data_size);
    std::memcpy(bitmap->data.data(), data, data_size);
    return bitmap;
}
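
// A small worked example for sizing audio input, assuming 16 kHz mono f32 PCM as
// reported by mtmd_get_audio_bitrate above (pcm is a placeholder buffer):
//
//   // 3 seconds at 16 kHz = 48000 samples = 192000 bytes of f32 data
//   size_t n_samples = 3 * 16000;
//   mtmd_bitmap * audio = mtmd_bitmap_init_from_audio(n_samples, pcm);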
uint32_t mtmd_bitmap_get_nx(const mtmd_bitmap * bitmap) {
    return bitmap->nx;
}

uint32_t mtmd_bitmap_get_ny(const mtmd_bitmap * bitmap) {
    return bitmap->ny;
}

const unsigned char * mtmd_bitmap_get_data(const mtmd_bitmap * bitmap) {
    return bitmap->data.data();
}

size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap) {
    return bitmap->data.size();
}

bool mtmd_bitmap_is_audio(const mtmd_bitmap * bitmap) {
    return bitmap->is_audio;
}

const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap) {
    return bitmap->id.c_str();
}

void mtmd_bitmap_set_id(mtmd_bitmap * bitmap, const char * id) {
    if (id) {
        bitmap->id = std::string(id);
    } else {
        bitmap->id.clear();
    }
}

void mtmd_bitmap_free(mtmd_bitmap * bitmap) {
    if (bitmap) {
        delete bitmap;
    }
}

// mtmd_input_chunks

mtmd_input_chunks * mtmd_input_chunks_init() {
    return new mtmd_input_chunks;
}

size_t mtmd_input_chunks_size(const mtmd_input_chunks * chunks) {
    return chunks->entries.size();
}

const mtmd_input_chunk * mtmd_input_chunks_get(const mtmd_input_chunks * chunks, size_t idx) {
    if (idx >= chunks->entries.size()) {
        return nullptr;
    }
    return &chunks->entries[idx];
}

void mtmd_input_chunks_free(mtmd_input_chunks * chunks) {
    if (chunks) {
        delete chunks;
    }
}

// mtmd_input_chunk

enum mtmd_input_chunk_type mtmd_input_chunk_get_type(const mtmd_input_chunk * chunk) {
    return chunk->type;
}

const llama_token * mtmd_input_chunk_get_tokens_text(const mtmd_input_chunk * chunk, size_t * n_tokens_output) {
    if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
        *n_tokens_output = chunk->tokens_text.size();
        return chunk->tokens_text.data();
    }
    *n_tokens_output = 0;
    return nullptr;
}

const mtmd_image_tokens * mtmd_input_chunk_get_tokens_image(const mtmd_input_chunk * chunk) {
    if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
        return chunk->tokens_image.get();
    }
    return nullptr;
}

size_t mtmd_input_chunk_get_n_tokens(const mtmd_input_chunk * chunk) {
    if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
        return chunk->tokens_text.size();
    } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
        return mtmd_image_tokens_get_n_tokens(chunk->tokens_image.get());
    } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
        return chunk->tokens_audio->n_tokens;
    } else {
        GGML_ABORT("invalid chunk type");
    }
}

llama_pos mtmd_input_chunk_get_n_pos(const mtmd_input_chunk * chunk) {
    if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
        return chunk->tokens_text.size();
    } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
        return mtmd_image_tokens_get_n_pos(chunk->tokens_image.get());
    } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
        return chunk->tokens_audio->n_tokens;
    } else {
        GGML_ABORT("invalid chunk type");
    }
}

const char * mtmd_input_chunk_get_id(const mtmd_input_chunk * chunk) {
    if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
        return chunk->tokens_image->id.c_str();
    } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
        return chunk->tokens_audio->id.c_str();
    }
    return nullptr;
}

mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk) {
    mtmd_input_chunk * copy = new mtmd_input_chunk{
        chunk->type,
        chunk->tokens_text,
        nullptr,
        nullptr,
    };
    if (chunk->tokens_image) {
        // copy the image tokens
        copy->tokens_image = mtmd_image_tokens_ptr(new mtmd_image_tokens());
        *copy->tokens_image = chunk->tokens_image->clone();
    }
    if (chunk->tokens_audio) {
        // copy the audio tokens
        copy->tokens_audio = mtmd_audio_tokens_ptr(new mtmd_audio_tokens());
        *copy->tokens_audio = chunk->tokens_audio->clone();
    }
    return copy;
}

void mtmd_input_chunk_free(mtmd_input_chunk * chunk) {
    if (chunk) {
        delete chunk;
    }
}

// mtmd_image_tokens

size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens) {
    return image_tokens->n_tokens();
}

size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens) {
    return image_tokens->nx;
}

size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) {
    return image_tokens->ny;
}

const char * mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) {
    return image_tokens->id.c_str();
}

llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
    if (image_tokens->use_mrope_pos) {
        // for M-RoPE, temporal dimension = max(t,h,w)
        // t is omitted as we don't support video input
        return std::max(image_tokens->nx, image_tokens->ny);
    }
    return image_tokens->n_tokens();
}
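
// Worked example of the n_tokens vs n_pos distinction above: for an M-RoPE model
// (e.g. Qwen2VL) with a 6x4 token grid, n_tokens = 6 * 4 = 24 embeddings are
// produced, but the image advances the position counter by only max(6, 4) = 6;
// for non-M-RoPE models both values are 24.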

// test function

mtmd_input_chunks * mtmd_test_create_input_chunks() {
    mtmd_input_chunks * chunks = mtmd_input_chunks_init();
    if (!chunks) {
        return nullptr;
    }

    // create a text chunk
    std::vector<llama_token> tokens_text = { 1, 2, 3, 4, 5 };
    mtmd_input_chunk chunk_text{
        MTMD_INPUT_CHUNK_TYPE_TEXT,
        std::move(tokens_text),
        nullptr, // image tokens
        nullptr, // audio tokens
    };
    chunks->entries.emplace_back(std::move(chunk_text));

    // create an image chunk
    mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
    image_tokens->nx = 4;
    image_tokens->ny = 4;
    image_tokens->batch_f32.entries.resize(16);
    image_tokens->id = "image_1";
    mtmd_input_chunk chunk_image{
        MTMD_INPUT_CHUNK_TYPE_IMAGE,
        {}, // text tokens
        std::move(image_tokens),
        nullptr, // audio tokens
    };
    chunks->entries.emplace_back(std::move(chunk_image));

    return chunks;
}