#!/usr/bin/env python3
"""
This script converts Hugging Face llama models to GGML and quantizes them.

Usage:
python make-ggml.py --model {model_dir_or_hf_repo_name} [--outname {output_name}] [--outdir {output_directory}] [--quants {quant_types}] [--keep_fp16]

Arguments:
- --model: (Required) The directory of the downloaded Hugging Face model, or the name of a Hugging Face model repository. If the model directory does not exist, the model is downloaded from the Hugging Face model hub.
- --outname: (Optional) The name of the output model. If not specified, the last part of the model directory path or the Hugging Face repo name is used.
- --outdir: (Optional) The directory where the output model(s) will be stored. If not specified, '../models/{outname}' is used.
- --quants: (Optional) The quantization types to apply, as a space-separated list. The default is 'Q4_K_M Q5_K_S'.
- --keep_fp16: (Optional) If specified, the FP16 model is not deleted after the quantized models are created.

Quant types:
- Q4_0: small, very high quality loss - legacy, prefer using Q3_K_M
- Q4_1: small, substantial quality loss - legacy, prefer using Q3_K_L
- Q5_0: medium, balanced quality - legacy, prefer using Q4_K_M
- Q5_1: medium, low quality loss - legacy, prefer using Q5_K_M
- Q2_K: smallest, extreme quality loss - not recommended
- Q3_K: alias for Q3_K_M
- Q3_K_S: very small, very high quality loss
- Q3_K_M: very small, very high quality loss
- Q3_K_L: small, substantial quality loss
- Q4_K: alias for Q4_K_M
- Q4_K_S: small, significant quality loss
- Q4_K_M: medium, balanced quality - recommended
- Q5_K: alias for Q5_K_M
- Q5_K_S: large, low quality loss - recommended
- Q5_K_M: large, very low quality loss - recommended
- Q6_K: very large, extremely low quality loss
- Q8_0: very large, extremely low quality loss - not recommended
- F16: extremely large, virtually no quality loss - not recommended
- F32: absolutely huge, lossless - not recommended
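
Example (the repo name below is illustrative; any llama-architecture Hugging Face repo or local model directory works):
python make-ggml.py --model meta-llama/Llama-2-7b-hf --quants Q4_K_M Q5_K_M --keep_fp16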
"""
import subprocess

# huggingface-hub is installed up front so that the import below succeeds on a fresh checkout.
subprocess.run("pip install huggingface-hub==0.16.4", shell=True, check=True)

import argparse
import os

from huggingface_hub import snapshot_download

def main(model, outname, outdir, quants, keep_fp16):
    ggml_version = "v3"

    if not os.path.isdir(model):
        # Treat the argument as a Hugging Face repo name and fetch a snapshot.
        print(f"Model not found at {model}. Downloading...")
        try:
            if outname is None:
                outname = model.split('/')[-1]
            model = snapshot_download(repo_id=model, cache_dir='../models/hf_cache')
        except Exception as e:
            raise Exception(f"Could not download the model: {e}")
    elif outname is None:
        # A local model directory was given: derive the output name from its last path component.
        outname = os.path.basename(os.path.normpath(model))

    if outdir is None:
        outdir = f'../models/{outname}'

    if not os.path.isfile(f"{model}/config.json"):
        raise Exception(f"Could not find config.json in {model}")

    os.makedirs(outdir, exist_ok=True)

    print("Building llama.cpp")
    subprocess.run("cd .. && make quantize", shell=True, check=True)

    # All quantized models are produced from a single unquantised FP16 GGML file.
    fp16 = f"{outdir}/{outname}.ggml{ggml_version}.fp16.bin"
    if not os.path.isfile(fp16):
        print(f"Making unquantised GGML at {fp16}")
        subprocess.run(f"python3 ../convert.py {model} --outtype f16 --outfile {fp16}", shell=True, check=True)
    else:
        print(f"Unquantised GGML already exists at: {fp16}")

    print("Making quants")
    for qtype in quants:
        outfile = f"{outdir}/{outname}.ggml{ggml_version}.{qtype}.bin"
        print(f"Making {qtype} : {outfile}")
        subprocess.run(f"../quantize {fp16} {outfile} {qtype}", shell=True, check=True)

    if not keep_fp16:
        os.remove(fp16)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Convert/Quantize HF models to GGML. If you have the HF model downloaded already, pass the path to the model dir. Otherwise, pass the Hugging Face model repo name. You need to be in the /examples folder for it to work.')
    parser.add_argument('--model', required=True, help='Downloaded model dir or Hugging Face model repo name')
    parser.add_argument('--outname', default=None, help='Output model(s) name')
    parser.add_argument('--outdir', default=None, help='Output directory')
    parser.add_argument('--quants', nargs='*', default=["Q4_K_M", "Q5_K_S"], help='Quant types')
    parser.add_argument('--keep_fp16', action='store_true', help='Keep fp16 model')

    args = parser.parse_args()
    main(args.model, args.outname, args.outdir, args.quants, args.keep_fp16)