@@ -3175,6 +3175,7 @@ struct llama_model_loader {
         switch (type_max) {
             case GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break;
             case GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break;
+            case GGML_TYPE_BF16: ftype = LLAMA_FTYPE_MOSTLY_BF16; break;
             case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break;
             case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break;
             case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break;
@@ -3666,6 +3667,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
     switch (ftype) {
         case LLAMA_FTYPE_ALL_F32: return "all F32";
         case LLAMA_FTYPE_MOSTLY_F16: return "F16";
+        case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
         case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
         case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
         case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
@@ -6129,6 +6131,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
             || !(
                 model.ftype == LLAMA_FTYPE_ALL_F32 ||
                 model.ftype == LLAMA_FTYPE_MOSTLY_F16 ||
+                model.ftype == LLAMA_FTYPE_MOSTLY_BF16 ||
                 model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
                 model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1
             )
@@ -14158,13 +14161,16 @@ static void llama_tensor_dequantize_internal(
         if (qtype.to_float == NULL) {
             throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
         }
-    } else if (tensor->type != GGML_TYPE_F16) {
+    } else if (tensor->type != GGML_TYPE_F16 &&
+               tensor->type != GGML_TYPE_BF16) {
         throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type)));
     }
 
     if (nthread < 2) {
         if (tensor->type == GGML_TYPE_F16) {
             ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements);
+        } else if (tensor->type == GGML_TYPE_BF16) {
+            ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements);
         } else if (ggml_is_quantized(tensor->type)) {
             qtype.to_float(tensor->data, f32_output, nelements);
         } else {
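In the single-threaded path, the new BF16 branch mirrors the F16 one, swapping `ggml_fp16_to_fp32_row` for `ggml_bf16_to_fp32_row`. As a standalone sketch (not ggml's implementation), the per-element widening amounts to a 16-bit shift, since bf16 keeps the sign, the full 8-bit exponent, and the top 7 mantissa bits of an IEEE-754 binary32:

```cpp
// Standalone sketch of bf16 -> f32 widening; illustrative only, not ggml's code.
#include <cstdint>
#include <cstdio>
#include <cstring>

static float bf16_to_f32(uint16_t h) {
    uint32_t bits = (uint32_t) h << 16; // bf16 is the upper half of a binary32
    float f;
    std::memcpy(&f, &bits, sizeof(f));  // bit-cast without aliasing UB
    return f;
}

int main() {
    // 0x3F80 widens to 0x3F800000, i.e. 1.0f
    std::printf("%f\n", bf16_to_f32(0x3F80));
    return 0;
}
```

Because the conversion is lossless in this direction, dequantizing a BF16 tensor to F32 changes only the storage width, not the values.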
@@ -14173,7 +14179,14 @@ static void llama_tensor_dequantize_internal(
         return;
     }
 
-    size_t block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type);
+    size_t block_size;
+    if (tensor->type == GGML_TYPE_F16 ||
+        tensor->type == GGML_TYPE_BF16) {
+        block_size = 1;
+    } else {
+        block_size = (size_t)ggml_blck_size(tensor->type);
+    }
+
     size_t block_size_bytes = ggml_type_size(tensor->type);
 
     GGML_ASSERT(nelements % block_size == 0);
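Both 16-bit float types take the `block_size = 1` branch because neither is block-quantized: each "block" is a single element, and `block_size_bytes` comes out as 2 via `ggml_type_size`. A minimal illustration of the block-aligned work splitting this feeds into, using hypothetical `nelements`/`nthread` values (the function's actual multi-threaded loop lies outside this hunk):

```cpp
// Illustrative only: split nelements into whole blocks per thread, with
// F16/BF16 treated as 1-element, 2-byte blocks.
#include <cstddef>
#include <cstdio>

int main() {
    const size_t nelements        = 4096; // hypothetical tensor size
    const int    nthread          = 3;    // hypothetical worker count
    const size_t block_size       = 1;    // F16/BF16: one element per block
    const size_t block_size_bytes = 2;    // both types store 2 bytes per element

    const size_t nblocks           = nelements / block_size;
    const size_t blocks_per_thread = nblocks / nthread;
    const size_t spare             = nblocks % nthread; // first `spare` threads take one extra

    for (int t = 0; t < nthread; ++t) {
        const size_t thr_blocks = blocks_per_thread + (t < (int) spare ? 1 : 0);
        std::printf("thread %d: %zu blocks = %zu input bytes -> %zu output floats\n",
                    t, thr_blocks, thr_blocks * block_size_bytes, thr_blocks * block_size);
    }
    return 0;
}
```

The `GGML_ASSERT(nelements % block_size == 0)` that follows is trivially satisfied for F16/BF16, since every element is its own block.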
@@ -14192,6 +14205,8 @@ static void llama_tensor_dequantize_internal(
     auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
         if (typ == GGML_TYPE_F16) {
             ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
+        } else if (typ == GGML_TYPE_BF16) {
+            ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels);
         } else {
             qtype.to_float(inbuf, outbuf, nels);
         }
@@ -14552,6 +14567,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break;
         case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break;
         case LLAMA_FTYPE_MOSTLY_F16: default_type = GGML_TYPE_F16; break;
+        case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break;
         case LLAMA_FTYPE_ALL_F32: default_type = GGML_TYPE_F32; break;
 
         // K-quants
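With `LLAMA_FTYPE_MOSTLY_BF16` wired into `llama_model_quantize_internal`'s type dispatch, an existing F32 or F16 GGUF can be rewritten as BF16 through the public quantization API. A hedged sketch, assuming the `llama.h` API of the same era as this patch (names and signatures may differ in other versions):

```cpp
// Sketch: convert a GGUF model to the BF16 ftype via llama_model_quantize.
#include "llama.h"
#include <cstdio>

int main(int argc, char ** argv) {
    if (argc < 3) {
        std::fprintf(stderr, "usage: %s <input.gguf> <output-bf16.gguf>\n", argv[0]);
        return 1;
    }

    llama_backend_init();

    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype = LLAMA_FTYPE_MOSTLY_BF16; // the ftype added by this patch

    const bool ok = llama_model_quantize(argv[1], argv[2], &params) == 0; // 0 on success

    llama_backend_free();
    return ok ? 0 : 1;
}
```

The quantize CLI tool exposes the same conversion once it recognizes a BF16 option; the exact binary name and flags vary across llama.cpp versions.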