// clip-quantize-cli.cpp (1.5 KB)

  #include "arg.h"
  #include "base64.hpp"
  #include "log.h"
  #include "common.h"
  #include "sampling.h"
  #include "clip.h"
  #include "llava.h"
  #include "llama.h"
  #include "ggml.h"

  #include <cerrno>
  #include <climits>
  #include <cstdint>
  #include <cstdio>
  #include <cstdlib>
  #include <string>
  // Write the command-line synopsis, followed by the list of numeric type
  // codes and their names, to stderr.
  //
  // argc is unused; only argv[0] is read, to show the program name.
  static void print_usage(int argc, char ** argv) {
      (void) argc;

      // One entry per type line, emitted verbatim in this order.
      static const char * const type_lines[] = {
          " type = 2 - q4_0\n",
          " type = 3 - q4_1\n",
          " type = 6 - q5_0\n",
          " type = 7 - q5_1\n",
          " type = 8 - q8_0\n",
      };

      fprintf(stderr, "usage: %s /path/to/ggml-model-f32.gguf /path/to/ggml-model-quantized.gguf type\n", argv[0]);
      for (const char * line : type_lines) {
          fputs(line, stderr);
      }
  }
  19. int main(int argc, char ** argv) {
  20. if (argc != 4) {
  21. print_usage(argc, argv);
  22. return 1;
  23. }
  24. const std::string fname_inp = argv[1];
  25. const std::string fname_out = argv[2];
  26. const int itype = atoi(argv[3]);
  27. const int64_t t_main_start_us = ggml_time_us();
  28. int64_t t_quantize_us = 0;
  29. // load the model
  30. {
  31. const int64_t t_start_us = ggml_time_us();
  32. if (!clip_model_quantize(fname_inp.c_str(), fname_out.c_str(), itype)) {
  33. fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
  34. return 1;
  35. }
  36. t_quantize_us = ggml_time_us() - t_start_us;
  37. }
  38. // report timing
  39. {
  40. const int64_t t_main_end_us = ggml_time_us();
  41. printf("\n");
  42. printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us / 1000.0f);
  43. printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us) / 1000.0f);
  44. }
  45. return 0;
  46. }