// mtmd.h
#ifndef MTMD_H
#define MTMD_H
#include "ggml.h"
#include "llama.h"
#include "clip.h"

#include <cinttypes>
#include <memory>
#include <string>
#include <vector>
  9. #ifdef LLAMA_SHARED
  10. # if defined(_WIN32) && !defined(__MINGW32__)
  11. # ifdef LLAMA_BUILD
  12. # define MTMD_API __declspec(dllexport)
  13. # else
  14. # define MTMD_API __declspec(dllimport)
  15. # endif
  16. # else
  17. # define MTMD_API __attribute__ ((visibility ("default")))
  18. # endif
  19. #else
  20. # define MTMD_API
  21. #endif
#ifdef __cplusplus
  23. enum mtmd_input_chunk_type {
  24. MTMD_INPUT_CHUNK_TYPE_TEXT,
  25. MTMD_INPUT_CHUNK_TYPE_IMAGE,
  26. };
  27. struct mtmd_context;
  28. struct mtmd_image_tokens;
  29. // represents raw image data, layout is RGBRGBRGB...
  30. // length of data must be nx * ny * 3
  31. struct mtmd_bitmap {
  32. uint32_t nx;
  33. uint32_t ny;
  34. std::vector<unsigned char> data;
  35. };
  36. struct mtmd_input_chunk {
  37. mtmd_input_chunk_type type;
  38. std::vector<llama_token> tokens_text;
  39. mtmd_image_tokens * tokens_image = nullptr;
  40. };
  41. using mtmd_input_chunks = std::vector<mtmd_input_chunk>;
  42. struct mtmd_context_params {
  43. bool use_gpu = true;
  44. bool print_timings = true;
  45. int n_threads = 4;
  46. enum ggml_log_level verbosity = GGML_LOG_LEVEL_INFO;
  47. const char * image_marker = "<__image__>";
  48. };
  49. struct mtmd_input_text {
  50. std::string text;
  51. bool add_special;
  52. bool parse_special;
  53. };
  54. // initialize the mtmd context
  55. // return nullptr on failure
  56. MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
  57. const llama_model * text_model,
  58. const mtmd_context_params ctx_params);
  59. MTMD_API void mtmd_free(mtmd_context * ctx);
  60. // tokenize an input text prompt and an image
  61. // the prompt must have the input image marker (default: "<__image__>") in it
  62. // the marker will be replaced with the image tokens
  63. // for example:
  64. // "here is an image: <__image__>\ndescribe it in detail."
  65. // this will gives 3 chunks:
  66. // 1. "here is an image: <start_of_image>"
  67. // 2. (image tokens)
  68. // 3. "<end_of_image>\ndescribe it in detail."
  69. // number of bitmaps must be equal to the number of image markers in the prompt
  70. // this function is thread-safe (shared ctx)
  71. MTMD_API mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
  72. const mtmd_input_text & text,
  73. const std::vector<mtmd_bitmap> & bitmaps);
  74. // free image chunk data
  75. MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks);
  76. // returns 0 on success
  77. MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
  78. const mtmd_image_tokens * image_tokens);
  79. // get output embeddings from the last encode pass
  80. MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
//
// helper functions (can be implemented based on other functions)
//
  84. // helper to count the total number of tokens from a list of chunks, useful to keep track of n_past
  85. MTMD_API size_t mtmd_helper_get_n_tokens(mtmd_input_chunks * chunks);
  86. // helper function that automatically:
  87. // 1. run llama_decode() on text chunks
  88. // 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
  89. // if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error
  90. // otherwise, returns 0 on success
  91. MTMD_API int32_t mtmd_helper_eval(mtmd_context * ctx,
  92. llama_context * lctx,
  93. mtmd_input_chunks * chunks,
  94. llama_pos pos0,
  95. llama_seq_id seq_id,
  96. int32_t n_batch);
  97. // helper function to construct a mtmd_bitmap from a file
  98. // returns 0 on success
  99. // this function is thread-safe
  100. MTMD_API int32_t mtmd_helper_bitmap_init_from_file(const char * fname, mtmd_bitmap & output);
  101. // helper function to construct a mtmd_bitmap from a buffer
  102. // the buffer must be an image in format supported by stb_image (jpg, png, bmp, gif, etc.)
  103. // returns 0 on success
  104. // this function is thread-safe
  105. MTMD_API int32_t mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len, mtmd_bitmap & output);
  106. // convenient unique_ptr wrappers
  107. struct mtmd_context_deleter {
  108. void operator()(mtmd_context * val) { mtmd_free(val); }
  109. };
  110. using mtmd_context_ptr = std::unique_ptr<mtmd_context, mtmd_context_deleter>;
  111. struct mtmd_input_chunks_deleter {
  112. void operator()(mtmd_input_chunks * val) { mtmd_input_chunks_free(val); }
  113. };
  114. using mtmd_input_chunks_ptr = std::unique_ptr<mtmd_input_chunks, mtmd_input_chunks_deleter>;
#else
  116. static_assert(false && "C header is not yet supported by this library");
#endif // __cplusplus
#endif // MTMD_H