|
|
@@ -35,6 +35,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
|
|
|
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
|
|
|
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", },
|
|
|
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
|
|
|
+ // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
|
|
|
+ { "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", },
|
|
|
};
|
|
|
|
|
|
|
|
|
@@ -71,12 +73,17 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std:
|
|
|
// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
|
|
|
//
|
|
|
void usage(const char * executable) {
|
|
|
- fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
|
|
|
- fprintf(stderr, " --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
|
|
|
- fprintf(stderr, " --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
|
|
|
- fprintf(stderr, "\nAllowed quantization types:\n");
|
|
|
+ printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
|
|
|
+ printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
|
|
|
+ printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
|
|
|
+ printf("\nAllowed quantization types:\n");
|
|
|
for (auto & it : QUANT_OPTIONS) {
|
|
|
- printf(" %2d or %-6s : %s\n", it.ftype, it.name.c_str(), it.desc.c_str());
|
|
|
+ if (it.name != "COPY") {
|
|
|
+ printf(" %2d or ", it.ftype);
|
|
|
+ } else {
|
|
|
+ printf(" ");
|
|
|
+ }
|
|
|
+ printf("%-6s : %s\n", it.name.c_str(), it.desc.c_str());
|
|
|
}
|
|
|
exit(1);
|
|
|
}
|
|
|
@@ -121,6 +128,9 @@ int main(int argc, char ** argv) {
|
|
|
// export as [inp path]/ggml-model-[ftype].gguf
|
|
|
fname_out = fpath + "ggml-model-" + ftype_str + ".gguf";
|
|
|
arg_idx++;
|
|
|
+ if (ftype_str == "COPY") {
|
|
|
+ params.only_copy = true;
|
|
|
+ }
|
|
|
}
|
|
|
else {
|
|
|
fname_out = argv[arg_idx];
|
|
|
@@ -133,6 +143,10 @@ int main(int argc, char ** argv) {
|
|
|
if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
|
|
|
fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]);
|
|
|
return 1;
|
|
|
+ } else {
|
|
|
+ if (ftype_str == "COPY") {
|
|
|
+ params.only_copy = true;
|
|
|
+ }
|
|
|
}
|
|
|
arg_idx++;
|
|
|
}
|