  1. #!/usr/bin/env python3
  2. from glob import glob
  3. import os
  4. TYPES_KV = ["GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0", "GGML_TYPE_F16"]
  5. SOURCE_FATTN_VEC = """// This file has been autogenerated by generate_cu_files.py, do not edit manually.
  6. #include "../fattn-vec-f{vkq_size}.cuh"
  7. DECL_FATTN_VEC_F{vkq_size}_CASE({head_size}, {type_k}, {type_v});
  8. """
  9. SOURCE_FATTN_WMMA_START = """// This file has been autogenerated by generate_cu_files.py, do not edit manually.
  10. #include "../fattn-wmma-f16.cuh"
  11. """
  12. SOURCE_FATTN_WMMA_CASE = "DECL_FATTN_WMMA_F16_CASE({head_size}, {cols_per_block}, {kq_acc_t});\n"
  13. TYPES_MMQ = [
  14. "GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0",
  15. "GGML_TYPE_Q2_K", "GGML_TYPE_Q3_K", "GGML_TYPE_Q4_K", "GGML_TYPE_Q5_K", "GGML_TYPE_Q6_K"
  16. ]
  17. SOURCE_MMQ = """// This file has been autogenerated by generate_cu_files.py, do not edit manually.
  18. #include "../mmq.cuh"
  19. DECL_MMQ_CASE({type});
  20. """
  21. def get_short_name(long_quant_name):
  22. return long_quant_name.replace("GGML_TYPE_", "").lower()
  23. def get_head_sizes(type_k, type_v):
  24. if type_k == "GGML_TYPE_F16" and type_v == "GGML_TYPE_F16":
  25. return [64, 128, 256]
  26. if type_k == "GGML_TYPE_F16":
  27. return [64, 128]
  28. return [128]
  29. for filename in glob("*.cu"):
  30. os.remove(filename)
  31. for vkq_size in [16, 32]:
  32. for type_k in TYPES_KV:
  33. for type_v in TYPES_KV:
  34. for head_size in get_head_sizes(type_k, type_v):
  35. with open(f"fattn-vec-f{vkq_size}-instance-hs{head_size}-{get_short_name(type_k)}-{get_short_name(type_v)}.cu", "w") as f:
  36. f.write(SOURCE_FATTN_VEC.format(vkq_size=vkq_size, head_size=head_size, type_k=type_k, type_v=type_v))
  37. for kq_acc_t in ["half", "float"]:
  38. for cols_per_block in [8, 16, 32]:
  39. if kq_acc_t == "float" and cols_per_block == 8:
  40. continue
  41. with open(f"fattn-wmma-f16-instance-kq{kq_acc_t}-cpb{cols_per_block}.cu", "w") as f:
  42. f.write(SOURCE_FATTN_WMMA_START)
  43. for head_size in [64, 80, 96, 112, 128, 256]:
  44. if cols_per_block == 8 and head_size % 32 != 0: # wmma fragment is 8x32
  45. continue
  46. if kq_acc_t == "float" and cols_per_block == 32 and head_size == 256: # register spilling, bad performance
  47. continue
  48. f.write(SOURCE_FATTN_WMMA_CASE.format(kq_acc_t=kq_acc_t, cols_per_block=cols_per_block, head_size=head_size))
  49. for type in TYPES_MMQ:
  50. with open(f"mmq-instance-{get_short_name(type)}.cu", "w") as f:
  51. f.write(SOURCE_MMQ.format(type=type))