
ggml : remove bit shuffling (#1405)

* ggml : remove Q4_0 bit shuffling (ARM NEON)

* ggml : remove Q4_1 bit shuffling (ARM NEON + reference)

* ggml : nibbles_from_floats() + bytes_from_nibbles() (ARM NEON)

* ggml : remove Q4_2 bit shuffling (WIP, BROKEN)

* ggml : remove Q5_0 bit shuffling (ARM NEON)

* ggml : 2x faster scalar implementations

* ggml : remove Q5_1 bit shuffling (ARM NEON + scalar)

* ggml : simplify scalar dot

* ggml : remove WASM SIMD bit shuffling + remove vzip for ARM 32-bit

* ggml : fix Q4_1 quantization

* ggml : update cuBLAS + normalize variable names

* ggml : remove Q4_2 mode

* ggml : minor formatting

* ggml : fix Q5_0 quantization

* scripts : add script for measuring the time per token

* AVX implementations (#1370)

* ggml : uniform 5th bit extraction

* llama : produce error upon loading old model files

* llama : fix model magic/version write

* ggml : speed-up Q5_0 + Q5_1 at 4 threads

* ggml : preserve old Q4 and Q5 formats

* ggml : simplify Q8_1 - no need for low / high sums anymore

* ggml : fix Q8_0 and Q8_1 rounding

* Revert "AVX implementations (#1370)"

This reverts commit 948d124837f9d287d8490f41338e0e4cceb0814f.

* ggml : fix AVX2 implementation

* sha : update hashes for 7B and 13B

* readme : update timings + remove warning banner

* llama : update v2 PR number to 1405

* ggml : fix WASM comments

* ggml : back to original bit order

* readme : add note that Q4 and Q5 have been changed

* llama : fix return for unknown version

---------

Co-authored-by: Stephan Walter <stephan@walter.name>
Georgi Gerganov, 2 years ago
Commit b9fd7eee57
12 changed files with 298 additions and 879 deletions
  1. .gitignore (+1 -0)
  2. README.md (+13 -21)
  3. SHA256SUMS (+16 -12)
  4. examples/quantize/quantize.cpp (+5 -6)
  5. ggml-cuda.cu (+36 -95)
  6. ggml-opencl.c (+1 -29)
  7. ggml.c (+111 -697)
  8. ggml.h (+1 -3)
  9. llama.cpp (+19 -10)
  10. llama.h (+2 -2)
  11. scripts/perf-run-all.sh (+93 -0)
  12. scripts/ppl-run-all.sh (+0 -4)

.gitignore (+1 -0)

@@ -44,5 +44,6 @@ zig-cache/
 
 ppl-*.txt
 qnt-*.txt
+perf-*.txt
 
 examples/jeopardy/results.txt

README.md (+13 -21)

@@ -7,18 +7,10 @@
 
 Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 
-## ⚠️ TEMPORARY NOTICE ABOUT UPCOMING BREAKING CHANGE ⚠️
-
-**The quantization formats will soon be updated: https://github.com/ggerganov/llama.cpp/pull/1305**
-
-**All `ggml` model files using the old format will not work with the latest `llama.cpp` code after that change is merged**
-
----
-
 **Hot topics:**
 
+- Quantization formats `Q4` and `Q5` have changed - requantize any old models [(info)](https://github.com/ggerganov/llama.cpp/pull/1405)
 - [Roadmap May 2023](https://github.com/ggerganov/llama.cpp/discussions/1220)
-- [New quantization methods](https://github.com/ggerganov/llama.cpp#quantization)
 
 <details>
   <summary>Table of Contents</summary>
@@ -338,18 +330,18 @@ As the models are currently fully loaded into memory, you will need adequate dis
 
 Several quantization methods are supported. They differ in the resulting model disk size and inference speed.
 
-| Model | Measure      | F16    | Q4_0   | Q4_1   | Q4_2   | Q5_0   | Q5_1   | Q8_0   |
-|------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|-------:|
-|    7B | perplexity   | 5.9066 | 6.1620 | 6.0910 | 6.1466 | 5.9862 | 5.9481 | 5.9069 |
-|    7B | file size    |  13.0G |   4.0G |   4.8G |   4.0G |   4.4G |   4.8G |   7.1G |
-|    7B | ms/tok @ 4th |    128 |     56 |     61 |     84 |     91 |     95 |     75 |
-|    7B | ms/tok @ 8th |    128 |     47 |     55 |     48 |     53 |     59 |     75 |
-|    7B | bits/weight  |   16.0 |    5.0 |    6.0 |    5.0 |    5.5 |    6.0 |    9.0 |
-|   13B | perplexity   | 5.2543 | 5.3863 | 5.3607 | 5.3513 | 5.2856 | 5.2706 | 5.2548 |
-|   13B | file size    |  25.0G |   7.6G |   9.1G |   7.6G |   8.4G |   9.1G |    14G |
-|   13B | ms/tok @ 4th |    239 |    104 |    113 |    160 |    176 |    185 |    141 |
-|   13B | ms/tok @ 8th |    240 |     85 |     99 |     97 |    108 |    117 |    147 |
-|   13B | bits/weight  |   16.0 |    5.0 |    6.0 |    5.0 |    5.5 |    6.0 |    9.0 |
+| Model | Measure      | F16    | Q4_0   | Q4_1   | Q5_0   | Q5_1   | Q8_0   |
+|------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|
+|    7B | perplexity   | 5.9066 | 6.1620 | 6.0910 | 5.9862 | 5.9481 | 5.9069 |
+|    7B | file size    |  13.0G |   4.0G |   4.8G |   4.4G |   4.8G |   7.1G |
+|    7B | ms/tok @ 4th |    128 |     50 |     54 |     75 |     83 |     75 |
+|    7B | ms/tok @ 8th |    123 |     44 |     52 |     53 |     58 |     72 |
+|    7B | bits/weight  |   16.0 |    5.0 |    6.0 |    5.5 |    6.0 |    9.0 |
+|   13B | perplexity   | 5.2543 | 5.3863 | 5.3607 | 5.2856 | 5.2706 | 5.2548 |
+|   13B | file size    |  25.0G |   7.6G |   9.1G |   8.4G |   9.1G |    14G |
+|   13B | ms/tok @ 4th |    239 |     93 |    101 |    150 |    164 |    141 |
+|   13B | ms/tok @ 8th |    240 |     81 |     96 |     96 |    104 |    136 |
+|   13B | bits/weight  |   16.0 |    5.0 |    6.0 |    5.5 |    6.0 |    9.0 |
 
 ### Perplexity (measuring model quality)
 

SHA256SUMS (+16 -12)

@@ -1,24 +1,27 @@
 700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d  models/7B/consolidated.00.pth
 666a4bb533b303bdaf89e1b6a3b6f93535d868de31d903afdc20983dc526c847  models/7B/ggml-model-f16.bin
-99aeb35f26b577fa2732716cca4d8b5ada39a78ea9b2dca2651fc632b5d101b6  models/7B/ggml-model-q4_0.bin
-cc061458339a3eb8bcecbf0a825e9924fb7d1a8150f63cd5d091caa99215aafe  models/7B/ggml-model-q4_1.bin
-25b050337a87344da687a7f2adddc03bd99b7f6c140450e836649f3585fb6496  models/7B/ggml-model-q4_2.bin
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/7B/ggml-model-q4_0.bin
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/7B/ggml-model-q4_1.bin
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/7B/ggml-model-q5_0.bin
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/7B/ggml-model-q5_1.bin
 7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265  models/7B/params.json
 745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08  models/13B/consolidated.00.pth
 d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085  models/13B/consolidated.01.pth
 2b206e9b21fb1076f11cafc624e2af97c9e48ea09312a0962153acc20d45f808  models/13B/ggml-model-f16.bin
-eecb575d325d935157761172e2bf05984dad216eb2b06777b73463cf9b818bab  models/13B/ggml-model-q4_0.bin
-d9581b5b88e5622532fe897c9f9b0e67a317d22dd27a6f90fa4ab8c6d23ccdbb  models/13B/ggml-model-q4_1.bin
-75a218a47df03f5f96354656329864613abcb67779412b9bc2282b28c1c3cbaa  models/13B/ggml-model-q4_2.bin
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/13B/ggml-model-q4_0.bin
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/13B/ggml-model-q4_1.bin
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/13B/ggml-model-q5_0.bin
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/13B/ggml-model-q5_1.bin
 4ab77bec4d4405ccb66a97b282574c89a94417e3c32e5f68f37e2876fc21322f  models/13B/params.json
 e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067  models/30B/consolidated.00.pth
 4e077b7136c7ae2302e954860cf64930458d3076fcde9443f4d0e939e95903ff  models/30B/consolidated.01.pth
 24a87f01028cbd3a12de551dcedb712346c0b5cbdeff1454e0ddf2df9b675378  models/30B/consolidated.02.pth
 1adfcef71420886119544949767f6a56cb6339b4d5fcde755d80fe68b49de93b  models/30B/consolidated.03.pth
 7e1b524061a9f4b27c22a12d6d2a5bf13b8ebbea73e99f218809351ed9cf7d37  models/30B/ggml-model-f16.bin
-517b9e525742c42b5478a6280a4b41ec66f46298c57aba7f0453d491682fe42d  models/30B/ggml-model-q4_0.bin
-7b75ac615fa369ee593493a7e6ef87542bf0350255db928b22c5a24f6d598bcd  models/30B/ggml-model-q4_1.bin
-aadbc9cf806313a55be570f62884eed289d30c313fac3b7838717e01bd553204  models/30B/ggml-model-q4_2.bin
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/30B/ggml-model-q4_0.bin
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/30B/ggml-model-q4_1.bin
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/30B/ggml-model-q5_0.bin
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/30B/ggml-model-q5_1.bin
 2c07118ea98d69dbe7810d88520e30288fa994751b337f8fca02b171955f44cb  models/30B/params.json
 135c563f6b3938114458183afb01adc9a63bef3d8ff7cccc3977e5d3664ecafe  models/65B/consolidated.00.pth
 9a600b37b19d38c7e43809485f70d17d1dc12206c07efa83bc72bb498a568bde  models/65B/consolidated.01.pth
@@ -29,8 +32,9 @@ a287c0dfe49081626567c7fe87f74cce5831f58e459b427b5e05567641f47b78  models/65B/con
 72b4eba67a1a3b18cb67a85b70f8f1640caae9b40033ea943fb166bd80a7b36b  models/65B/consolidated.06.pth
 d27f5b0677d7ff129ceacd73fd461c4d06910ad7787cf217b249948c3f3bc638  models/65B/consolidated.07.pth
 60758f2384d74e423dffddfd020ffed9d3bb186ebc54506f9c4a787d0f5367b0  models/65B/ggml-model-f16.bin
-01672072136f8be6ca9d7cebe5f86ed316e8b85851b9fe3de951809233cea4f2  models/65B/ggml-model-q4_0.bin
-4743a28aac3e5f32a6e838a815f51d3779de44fbbe251d745251e66c23c5950f  models/65B/ggml-model-q4_1.bin
-1b6f6588d0e2ecfe6c4d849088e48e5e3083466b962daa32e3261363e21fc5e9  models/65B/ggml-model-q4_2.bin
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/65B/ggml-model-q4_0.bin
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/65B/ggml-model-q4_1.bin
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/65B/ggml-model-q5_0.bin
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/65B/ggml-model-q5_1.bin
 999ed1659b469ccc2a941714c0a9656fa571d17c9f7c8c7589817ca90edef51b  models/65B/params.json
 9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347  models/tokenizer.model

examples/quantize/quantize.cpp (+5 -6)

@@ -7,12 +7,11 @@
 #include <string>
 
 static const std::map<std::string, llama_ftype> LLAMA_FTYPE_MAP = {
-    {"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
-    {"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
-    {"q4_2", LLAMA_FTYPE_MOSTLY_Q4_2},
-    {"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0},
-    {"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1},
-    {"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},
+  {"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
+  {"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
+  {"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0},
+  {"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1},
+  {"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},
 };
 
 bool try_parse_ftype(const std::string & ftype_str, llama_ftype & ftype, std::string & ftype_str_out) {

ggml-cuda.cu (+36 -95)

@@ -49,13 +49,6 @@ typedef struct {
 } block_q4_1;
 static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
 
-#define QK4_2 16
-typedef struct {
-    half  d;                // delta
-    uint8_t qs[QK4_2 / 2];  // nibbles / quants
-} block_q4_2;
-static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding");
-
 #define QK5_0 32
 typedef struct {
     half d;                 // delta
@@ -81,29 +74,26 @@ typedef struct {
 static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
 
 static __global__ void dequantize_block_q4_0(const void * vx, float * y) {
+    static const int qk = QK4_0;
+
     const block_q4_0 * x = (const block_q4_0 *) vx;
 
     const int i = blockIdx.x;
 
     const float d = x[i].d;
 
-    const uint8_t * pp = x[i].qs;
-
-    for (int l = 0; l < QK4_0; l += 2) {
-        const uint8_t vi = pp[l/2];
-
-        const int8_t vi0 = vi & 0xf;
-        const int8_t vi1 = vi >> 4;
+    for (int j = 0; j < qk/2; ++j) {
+        const int x0 = (x[i].qs[j] & 0xf) - 8;
+        const int x1 = (x[i].qs[j] >>  4) - 8;
 
-        const float v0 = (vi0 - 8)*d;
-        const float v1 = (vi1 - 8)*d;
-
-        y[i*QK4_0 + l + 0] = v0;
-        y[i*QK4_0 + l + 1] = v1;
+        y[i*qk + j + 0   ] = x0*d;
+        y[i*qk + j + qk/2] = x1*d;
     }
 }
 
 static __global__ void dequantize_block_q4_1(const void * vx, float * y) {
+    static const int qk = QK4_1;
+
     const block_q4_1 * x = (const block_q4_1 *) vx;
 
     const int i = blockIdx.x;
@@ -111,75 +101,42 @@ static __global__ void dequantize_block_q4_1(const void * vx, float * y) {
     const float d = x[i].d;
     const float m = x[i].m;
 
-    const uint8_t * pp = x[i].qs;
-
-    for (int l = 0; l < QK4_1; l += 2) {
-        const uint8_t vi = pp[l/2];
-
-        const int8_t vi0 = vi & 0xf;
-        const int8_t vi1 = vi >> 4;
+    for (int j = 0; j < qk/2; ++j) {
+        const int x0 = (x[i].qs[j] & 0xf);
+        const int x1 = (x[i].qs[j] >>  4);
 
-        const float v0 = vi0*d + m;
-        const float v1 = vi1*d + m;
-
-        y[i*QK4_1 + l + 0] = v0;
-        y[i*QK4_1 + l + 1] = v1;
-    }
-}
-
-static __global__ void dequantize_block_q4_2(const void * vx, float * y) {
-    const block_q4_2 * x = (const block_q4_2 *) vx;
-
-    const int i = blockIdx.x;
-
-    const float d = x[i].d;
-
-    const uint8_t * pp = x[i].qs;
-
-    for (int l = 0; l < QK4_2; l += 2) {
-        const uint8_t vi = pp[l/2];
-
-        const int8_t vi0 = vi & 0xf;
-        const int8_t vi1 = vi >> 4;
-
-        const float v0 = (vi0 - 8)*d;
-        const float v1 = (vi1 - 8)*d;
-
-        y[i*QK4_2 + l + 0] = v0;
-        y[i*QK4_2 + l + 1] = v1;
+        y[i*qk + j + 0   ] = x0*d + m;
+        y[i*qk + j + qk/2] = x1*d + m;
     }
 }
 
 static __global__ void dequantize_block_q5_0(const void * vx, float * y) {
+    static const int qk = QK5_0;
+
     const block_q5_0 * x = (const block_q5_0 *) vx;
 
     const int i = blockIdx.x;
 
     const float d = x[i].d;
 
-    const uint8_t * pp = x[i].qs;
-
     uint32_t qh;
     memcpy(&qh, x[i].qh, sizeof(qh));
 
-    for (int l = 0; l < QK5_0; l += 2) {
-        const uint8_t vi = pp[l/2];
-
-        const int8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
-        const int8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
+    for (int j = 0; j < qk/2; ++j) {
+        const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;
+        const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;
 
-        const int8_t vi0 = ((vi & 0xf) | vh0);
-        const int8_t vi1 = ((vi >>  4) | vh1);
+        const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16;
+        const int32_t x1 = ((x[i].qs[j] >>  4) | xh_1) - 16;
 
-        const float v0 = (vi0 - 16)*d;
-        const float v1 = (vi1 - 16)*d;
-
-        y[i*QK5_0 + l + 0] = v0;
-        y[i*QK5_0 + l + 1] = v1;
+        y[i*qk + j + 0   ] = x0*d;
+        y[i*qk + j + qk/2] = x1*d;
     }
 }
 
 static __global__ void dequantize_block_q5_1(const void * vx, float * y) {
+    static const int qk = QK5_1;
+
     const block_q5_1 * x = (const block_q5_1 *) vx;
 
     const int i = blockIdx.x;
@@ -187,41 +144,32 @@ static __global__ void dequantize_block_q5_1(const void * vx, float * y) {
     const float d = x[i].d;
     const float m = x[i].m;
 
-    const uint8_t * pp = x[i].qs;
-
     uint32_t qh;
     memcpy(&qh, x[i].qh, sizeof(qh));
 
-    for (int l = 0; l < QK5_1; l += 2) {
-        const uint8_t vi = pp[l/2];
-
-        const int8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
-        const int8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
+    for (int j = 0; j < qk/2; ++j) {
+        const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;
+        const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;
 
-        const int8_t vi0 = (vi & 0xf) | vh0;
-        const int8_t vi1 = (vi >>  4) | vh1;
+        const int x0 = (x[i].qs[j] & 0xf) | xh_0;
+        const int x1 = (x[i].qs[j] >>  4) | xh_1;
 
-        const float v0 = vi0*d + m;
-        const float v1 = vi1*d + m;
-
-        y[i*QK5_1 + l + 0] = v0;
-        y[i*QK5_1 + l + 1] = v1;
+        y[i*qk + j + 0   ] = x0*d + m;
+        y[i*qk + j + qk/2] = x1*d + m;
     }
 }
 
 static __global__ void dequantize_block_q8_0(const void * vx, float * y) {
+    static const int qk = QK8_0;
+
     const block_q8_0 * x = (const block_q8_0 *) vx;
 
     const int i = blockIdx.x;
 
     const float d = x[i].d;
 
-    const int8_t * pp = x[i].qs;
-
-    for (int l = 0; l < QK8_0; l++) {
-        const int8_t vi = pp[l];
-
-        y[i*QK8_0 + l] = vi*d;
+    for (int j = 0; j < qk; ++j) {
+        y[i*qk + j] = x[i].qs[j]*d;
     }
 }
 
@@ -235,11 +183,6 @@ static void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStre
     dequantize_block_q4_1<<<nb, 1, 0, stream>>>(vx, y);
 }
 
-static void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
-    const int nb = k / QK4_2;
-    dequantize_block_q4_2<<<nb, 1, 0, stream>>>(vx, y);
-}
-
 static void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
     const int nb = k / QK5_0;
     dequantize_block_q5_0<<<nb, 1, 0, stream>>>(vx, y);
@@ -274,8 +217,6 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
             return dequantize_row_q4_0_cuda;
         case GGML_TYPE_Q4_1:
             return dequantize_row_q4_1_cuda;
-        case GGML_TYPE_Q4_2:
-            return dequantize_row_q4_2_cuda;
        case GGML_TYPE_Q5_0:
             return dequantize_row_q5_0_cuda;
         case GGML_TYPE_Q5_1:
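The uniform 5th-bit extraction used by the new `dequantize_block_q5_0` above can be exercised on the CPU with a small sketch. This is hypothetical reference code: the struct and function names are invented for illustration, and the real `block_q5_0` stores `d` as `half` rather than `float`.

```c
#include <assert.h>
#include <stdint.h>
#include <string.h>

#define QK5_0 32

// Hypothetical mirror of ggml's block_q5_0 (d is half in the real struct).
typedef struct {
    float   d;              // delta
    uint8_t qh[4];          // 5th bit of each of the 32 quants
    uint8_t qs[QK5_0 / 2];  // low 4 bits, two quants per byte
} block_q5_0_ref;

static void dequantize_q5_0_ref(const block_q5_0_ref *x, float *y) {
    uint32_t qh;
    memcpy(&qh, x->qh, sizeof(qh));

    for (int j = 0; j < QK5_0/2; ++j) {
        // bit j of qh is the 5th bit of element j;
        // bit j + 16 is the 5th bit of element j + QK5_0/2
        const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10; // bit j      -> bit 4
        const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10; // bit j + 16 -> bit 4

        y[j           ] = (((x->qs[j] & 0x0F) | xh_0) - 16)*x->d;
        y[j + QK5_0/2] = (((x->qs[j] >>   4) | xh_1) - 16)*x->d;
    }
}
```

The `(qh >> (j + 12)) & 0x10` form is the same shift for every `j`, which is what makes the extraction "uniform" compared with the old per-bit mask-and-shift.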

ggml-opencl.c (+1 -29)

@@ -52,26 +52,6 @@ __kernel void dequantize_row_q4_1(__global struct block_q4_1* blocks, __global f
     result[index + 1] = (vi >> 4) * d + m;
 }
 
-struct block_q4_2
-{
-    ushort d;
-    uchar qs[8];
-};
-
-__kernel void dequantize_row_q4_2(__global struct block_q4_2* blocks, __global float* result) {
-    const uint i = get_global_id(0) / 16;
-    const uint l = get_local_id(0);
-
-    const float d = vload_half(0, (__global half*) &blocks[i].d);
-
-    const uchar vi = blocks[i].qs[l];
-
-    const uint index = i*16 + l*2;
-    result[index + 0] = ((vi & 0xf) - 8)*d;
-    result[index + 1] = ((vi >> 4) - 8)*d;
-}
-
-
 struct block_q5_0
 {
     float d;
@@ -167,7 +147,7 @@ static cl_device_id device;
 static cl_context context;
 static cl_command_queue queue;
 static cl_program program;
-static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2, kernel_q5_0, kernel_q5_1, kernel_q8_0;
+static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q5_0, kernel_q5_1, kernel_q8_0;
 static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c;
 static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0;
 
@@ -238,8 +218,6 @@ void ggml_cl_init(void) {
     CL_CHECK(err, "clCreateKernel");
     kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err);
     CL_CHECK(err, "clCreateKernel");
-    kernel_q4_2 = clCreateKernel(program, "dequantize_row_q4_2", &err);
-    CL_CHECK(err, "clCreateKernel");
     kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err);
     CL_CHECK(err, "clCreateKernel");
     kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err);
@@ -292,12 +270,6 @@ void ggml_cl_sgemm_wrapper(
         local = 16;
         size_qb = global * (sizeof(float) * 2 + local) / 32;
         break;
-    case GGML_TYPE_Q4_2:
-        dequant = true;
-        kernel = kernel_q4_2;
-        local = 8;
-        size_qb = global * (sizeof(ggml_fp16_t) + local) / 16;
-        break;
     case GGML_TYPE_Q5_0:
         dequant = true;
         kernel = kernel_q5_0;

The following file diff is not shown because of its large size:
ggml.c (+111 -697)
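The ggml.c diff is suppressed, but its headline change is the simplified scalar dot product. Below is a hypothetical plain-C sketch of the Q4_0 × Q8_0 scalar path; the struct and function names are invented for illustration (the real implementation is `ggml_vec_dot_q4_0_q8_0`, with SIMD variants alongside).

```c
#include <assert.h>
#include <stdint.h>

#define QK8_0 32

// Hypothetical reference blocks for illustration.
typedef struct {
    float   d;
    uint8_t qs[QK8_0 / 2]; // Q4_0: two 4-bit quants per byte
} blk_q4_0;

typedef struct {
    float  d;
    int8_t qs[QK8_0];      // Q8_0: one 8-bit quant per element
} blk_q8_0;

// With the new layout the low/high nibbles line up directly with
// y[j] and y[j + qk/2], so the inner loop needs no bit un-shuffling.
static float dot_q4_0_q8_0_ref(int n, const blk_q4_0 *x, const blk_q8_0 *y) {
    const int nb = n / QK8_0;
    float sumf = 0.0f;
    for (int i = 0; i < nb; ++i) {
        int sumi = 0;
        for (int j = 0; j < QK8_0/2; ++j) {
            const int v0 = (x[i].qs[j] & 0x0F) - 8;
            const int v1 = (x[i].qs[j] >>   4) - 8;
            sumi += v0*y[i].qs[j] + v1*y[i].qs[j + QK8_0/2];
        }
        sumf += sumi*x[i].d*y[i].d;
    }
    return sumf;
}
```

Accumulating the products as integers and applying both scales once per block is what makes the scalar path roughly 2x faster than the old shuffled version, per the commit message.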


ggml.h (+1 -3)

@@ -231,7 +231,7 @@ extern "C" {
         GGML_TYPE_F16  = 1,
         GGML_TYPE_Q4_0 = 2,
         GGML_TYPE_Q4_1 = 3,
-        GGML_TYPE_Q4_2 = 4,
+        // GGML_TYPE_Q4_2 = 4, support has been removed
         // GGML_TYPE_Q4_3 (5) support has been removed
         GGML_TYPE_Q5_0 = 6,
         GGML_TYPE_Q5_1 = 7,
@@ -251,7 +251,6 @@ extern "C" {
         GGML_FTYPE_MOSTLY_Q4_0 = 2,  // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_1 = 3,  // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-        GGML_FTYPE_MOSTLY_Q4_2 = 5,  // except 1d tensors
         GGML_FTYPE_MOSTLY_Q8_0 = 7,  // except 1d tensors
         GGML_FTYPE_MOSTLY_Q5_0 = 8,  // except 1d tensors
         GGML_FTYPE_MOSTLY_Q5_1 = 9,  // except 1d tensors
@@ -876,7 +875,6 @@ extern "C" {
 
     GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);

llama.cpp (+19 -10)

@@ -402,6 +402,7 @@ enum llama_file_version {
     LLAMA_FILE_VERSION_GGML,
     LLAMA_FILE_VERSION_GGML,
     LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
     LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
     LLAMA_FILE_VERSION_GGJT_V1, // added padding
     LLAMA_FILE_VERSION_GGJT_V1, // added padding
+    LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
 };
 };
 
 
 struct llama_file_loader {
 struct llama_file_loader {
@@ -432,6 +433,8 @@ struct llama_file_loader {
             file_version = LLAMA_FILE_VERSION_GGMF_V1;
             file_version = LLAMA_FILE_VERSION_GGMF_V1;
         } else if (magic == 'ggjt' && version == 1) {
         } else if (magic == 'ggjt' && version == 1) {
             file_version = LLAMA_FILE_VERSION_GGJT_V1;
             file_version = LLAMA_FILE_VERSION_GGJT_V1;
+        } else if (magic == 'ggjt' && version == 2) {
+            file_version = LLAMA_FILE_VERSION_GGJT_V2;
         } else {
         } else {
             throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
             throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
                          magic, version);
                          magic, version);
@@ -482,7 +485,6 @@ struct llama_file_loader {
                 case GGML_TYPE_F16:
                 case GGML_TYPE_F16:
                 case GGML_TYPE_Q4_0:
                 case GGML_TYPE_Q4_0:
                 case GGML_TYPE_Q4_1:
                 case GGML_TYPE_Q4_1:
-                case GGML_TYPE_Q4_2:
                 case GGML_TYPE_Q5_0:
                 case GGML_TYPE_Q5_0:
                 case GGML_TYPE_Q5_1:
                 case GGML_TYPE_Q5_1:
                 case GGML_TYPE_Q8_0:
                 case GGML_TYPE_Q8_0:
@@ -527,8 +529,8 @@ struct llama_file_saver {
         write_vocab();
         write_vocab();
     }
     }
     void write_magic() {
     void write_magic() {
-        file.write_u32('ggjt'); // magic
-        file.write_u32(1); // version
+        file.write_u32(LLAMA_FILE_MAGIC);   // magic
+        file.write_u32(LLAMA_FILE_VERSION); // version
     }
     }
     void write_hparams(enum llama_ftype new_ftype) {
     void write_hparams(enum llama_ftype new_ftype) {
         const llama_hparams & hparams = any_file_loader->hparams;
         const llama_hparams & hparams = any_file_loader->hparams;
@@ -558,7 +560,6 @@ struct llama_file_saver {
             case GGML_TYPE_F16:
             case GGML_TYPE_F16:
             case GGML_TYPE_Q4_0:
             case GGML_TYPE_Q4_0:
             case GGML_TYPE_Q4_1:
             case GGML_TYPE_Q4_1:
-            case GGML_TYPE_Q4_2:
             case GGML_TYPE_Q5_0:
             case GGML_TYPE_Q5_0:
             case GGML_TYPE_Q5_1:
             case GGML_TYPE_Q5_1:
             case GGML_TYPE_Q8_0:
             case GGML_TYPE_Q8_0:
@@ -839,9 +840,11 @@ static const char *llama_file_version_name(llama_file_version version) {
     switch (version) {
     switch (version) {
         case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
         case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
         case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
         case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
-        case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (latest)";
-        default: LLAMA_ASSERT(false);
+        case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
+        case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (latest)";
     }
     }
+
+    return "unknown";
 }
 }
 
 
 static const char *llama_ftype_name(enum llama_ftype ftype) {
 static const char *llama_ftype_name(enum llama_ftype ftype) {
@@ -852,7 +855,6 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
         case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
                                       return "mostly Q4_1, some F16";
-        case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
         case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
         case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
         case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
@@ -918,6 +920,14 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: model size = %s\n",  __func__, llama_model_type_name(model.type));
     }
 
+    if (file_version != LLAMA_FILE_VERSION_GGJT_V2) {
+        if (hparams.ftype != LLAMA_FTYPE_ALL_F32     &&
+            hparams.ftype != LLAMA_FTYPE_MOSTLY_F16  &&
+            hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
+            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1305)");
+        }
+    }
+
     if (vocab_only) {
         return;
     }
@@ -1905,7 +1915,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     switch (ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
         case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
-        case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
         case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
         case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
         case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
@@ -2813,9 +2822,9 @@ void llama_print_timings(struct llama_context * ctx) {
 
     fprintf(stderr, "\n");
     fprintf(stderr, "%s:        load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
-    fprintf(stderr, "%s:      sample time = %8.2f ms / %5d runs   (%8.2f ms per run)\n",   __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
+    fprintf(stderr, "%s:      sample time = %8.2f ms / %5d runs   (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
     fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval);
-    fprintf(stderr, "%s:        eval time = %8.2f ms / %5d runs   (%8.2f ms per run)\n",   __func__, 1e-3 * ctx->t_eval_us,   n_eval,   1e-3 * ctx->t_eval_us   / n_eval);
+    fprintf(stderr, "%s:        eval time = %8.2f ms / %5d runs   (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_eval_us,   n_eval,   1e-3 * ctx->t_eval_us   / n_eval);
     fprintf(stderr, "%s:       total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
 }
 

+ 2 - 2
llama.h

@@ -19,7 +19,7 @@
 #    define LLAMA_API
 #endif
 
-#define LLAMA_FILE_VERSION           1
+#define LLAMA_FILE_VERSION           2
 #define LLAMA_FILE_MAGIC             'ggjt'
 #define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml'
 #define LLAMA_SESSION_MAGIC          'ggsn'
@@ -78,7 +78,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q4_0 = 2,  // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_1 = 3,  // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-        LLAMA_FTYPE_MOSTLY_Q4_2 = 5,  // except 1d tensors
+        // LLAMA_FTYPE_MOSTLY_Q4_2 = 5,  // support has been removed
         // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
         LLAMA_FTYPE_MOSTLY_Q8_0 = 7,  // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q5_0 = 8,  // except 1d tensors

+ 93 - 0
scripts/perf-run-all.sh

@@ -0,0 +1,93 @@
+#!/bin/bash
+#
+# Measure the performance (time per token) of the various quantization techniques
+#
+
+QUANTIZE=0
+if [ "$1" != "" ]; then
+    echo "Quantizing"
+    QUANTIZE=1
+fi
+
+if [ "$QUANTIZE" != "0" ]; then
+    #
+    # quantize
+    #
+
+    # 7B
+    time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q4_0.bin q4_0 2>&1 | tee ../qnt-7b-q4_0.txt
+    time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q4_1.bin q4_1 2>&1 | tee ../qnt-7b-q4_1.txt
+    time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q5_0.bin q5_0 2>&1 | tee ../qnt-7b-q5_0.txt
+    time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q5_1.bin q5_1 2>&1 | tee ../qnt-7b-q5_1.txt
+    time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q8_0.bin q8_0 2>&1 | tee ../qnt-7b-q8_0.txt
+
+    # 13B
+    time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q4_0.bin q4_0 2>&1 | tee ../qnt-13b-q4_0.txt
+    time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q4_1.bin q4_1 2>&1 | tee ../qnt-13b-q4_1.txt
+    time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q5_0.bin q5_0 2>&1 | tee ../qnt-13b-q5_0.txt
+    time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q5_1.bin q5_1 2>&1 | tee ../qnt-13b-q5_1.txt
+    time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q8_0.bin q8_0 2>&1 | tee ../qnt-13b-q8_0.txt
+fi
+
+#
+# perf
+# run each command twice
+#
+
+set -x
+
+# 7B - 4 threads
+     ./bin/main -m ../models/7B/ggml-model-f16.bin  -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
+time ./bin/main -m ../models/7B/ggml-model-f16.bin  -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-7b-f16.txt  | grep llama_print_timings
+     ./bin/main -m ../models/7B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
+time ./bin/main -m ../models/7B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-7b-q4_0.txt | grep llama_print_timings
+     ./bin/main -m ../models/7B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
+time ./bin/main -m ../models/7B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-7b-q4_1.txt | grep llama_print_timings
+     ./bin/main -m ../models/7B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
+time ./bin/main -m ../models/7B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-7b-q5_0.txt | grep llama_print_timings
+     ./bin/main -m ../models/7B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
+time ./bin/main -m ../models/7B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-7b-q5_1.txt | grep llama_print_timings
+     ./bin/main -m ../models/7B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
+time ./bin/main -m ../models/7B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-7b-q8_0.txt | grep llama_print_timings
+
+# 7B - 8 threads
+     ./bin/main -m ../models/7B/ggml-model-f16.bin  -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
+time ./bin/main -m ../models/7B/ggml-model-f16.bin  -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-7b-f16.txt  | grep llama_print_timings
+     ./bin/main -m ../models/7B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
+time ./bin/main -m ../models/7B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-7b-q4_0.txt | grep llama_print_timings
+     ./bin/main -m ../models/7B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
+time ./bin/main -m ../models/7B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-7b-q4_1.txt | grep llama_print_timings
+     ./bin/main -m ../models/7B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
+time ./bin/main -m ../models/7B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-7b-q5_0.txt | grep llama_print_timings
+     ./bin/main -m ../models/7B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
+time ./bin/main -m ../models/7B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-7b-q5_1.txt | grep llama_print_timings
+     ./bin/main -m ../models/7B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
+time ./bin/main -m ../models/7B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-7b-q8_0.txt | grep llama_print_timings
+
+# 13B - 4 threads
+     ./bin/main -m ../models/13B/ggml-model-f16.bin  -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
+time ./bin/main -m ../models/13B/ggml-model-f16.bin  -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-13b-f16.txt  | grep llama_print_timings
+     ./bin/main -m ../models/13B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
+time ./bin/main -m ../models/13B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-13b-q4_0.txt | grep llama_print_timings
+     ./bin/main -m ../models/13B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
+time ./bin/main -m ../models/13B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-13b-q4_1.txt | grep llama_print_timings
+     ./bin/main -m ../models/13B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
+time ./bin/main -m ../models/13B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-13b-q5_0.txt | grep llama_print_timings
+     ./bin/main -m ../models/13B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
+time ./bin/main -m ../models/13B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-13b-q5_1.txt | grep llama_print_timings
+     ./bin/main -m ../models/13B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
+time ./bin/main -m ../models/13B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-13b-q8_0.txt | grep llama_print_timings
+
+# 13B - 8 threads
+     ./bin/main -m ../models/13B/ggml-model-f16.bin  -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
+time ./bin/main -m ../models/13B/ggml-model-f16.bin  -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-13b-f16.txt  | grep llama_print_timings
+     ./bin/main -m ../models/13B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
+time ./bin/main -m ../models/13B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-13b-q4_0.txt | grep llama_print_timings
+     ./bin/main -m ../models/13B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
+time ./bin/main -m ../models/13B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-13b-q4_1.txt | grep llama_print_timings
+     ./bin/main -m ../models/13B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
+time ./bin/main -m ../models/13B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-13b-q5_0.txt | grep llama_print_timings
+     ./bin/main -m ../models/13B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
+time ./bin/main -m ../models/13B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-13b-q5_1.txt | grep llama_print_timings
+     ./bin/main -m ../models/13B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
+time ./bin/main -m ../models/13B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-13b-q8_0.txt | grep llama_print_timings
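Each timed run above tees the full output into a `perf-*.txt` log; the per-token figures can then be collected in one pass. A minimal sketch (assuming logs named `perf-*.txt` in the current directory and the `llama_print_timings` line format shown in the llama.cpp hunk above):

```shell
#!/bin/bash
# Summarize the eval-time "ms per token" figure from each perf log.
# Assumes lines of the form printed by llama_print_timings, e.g.:
#   llama_print_timings:        eval time = 1234.56 ms /    64 runs   (   19.29 ms per token)
for f in perf-*.txt; do
    ms=$(grep 'eval time' "$f" | grep -v 'prompt eval' \
         | sed -E 's/.*\([[:space:]]*([0-9.]+) ms per token\).*/\1/')
    echo "$f: $ms ms per token"
done
```

The second `grep -v 'prompt eval'` is needed because the prompt-eval line also contains the substring "eval time".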

+ 0 - 4
scripts/ppl-run-all.sh

@@ -7,7 +7,6 @@
 # 7B
 time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q4_0.bin q4_0 2>&1 | tee ../qnt-7b-q4_0.txt
 time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q4_1.bin q4_1 2>&1 | tee ../qnt-7b-q4_1.txt
-time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q4_2.bin q4_2 2>&1 | tee ../qnt-7b-q4_2.txt
 time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q5_0.bin q5_0 2>&1 | tee ../qnt-7b-q5_0.txt
 time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q5_1.bin q5_1 2>&1 | tee ../qnt-7b-q5_1.txt
 time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q8_0.bin q8_0 2>&1 | tee ../qnt-7b-q8_0.txt
@@ -15,7 +14,6 @@ time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q8_0
 # 13B
 time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q4_0.bin q4_0 2>&1 | tee ../qnt-13b-q4_0.txt
 time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q4_1.bin q4_1 2>&1 | tee ../qnt-13b-q4_1.txt
-time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q4_2.bin q4_2 2>&1 | tee ../qnt-13b-q4_2.txt
 time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q5_0.bin q5_0 2>&1 | tee ../qnt-13b-q5_0.txt
 time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q5_1.bin q5_1 2>&1 | tee ../qnt-13b-q5_1.txt
 time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q8_0.bin q8_0 2>&1 | tee ../qnt-13b-q8_0.txt
@@ -28,7 +26,6 @@ time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q8
 time ./bin/perplexity -m ../models/7B/ggml-model-f16.bin  -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-f16.txt
 time ./bin/perplexity -m ../models/7B/ggml-model-q4_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q4_0.txt
 time ./bin/perplexity -m ../models/7B/ggml-model-q4_1.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q4_1.txt
-time ./bin/perplexity -m ../models/7B/ggml-model-q4_2.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q4_2.txt
 time ./bin/perplexity -m ../models/7B/ggml-model-q5_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q5_0.txt
 time ./bin/perplexity -m ../models/7B/ggml-model-q5_1.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q5_1.txt
 time ./bin/perplexity -m ../models/7B/ggml-model-q8_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q8_0.txt
@@ -37,7 +34,6 @@ time ./bin/perplexity -m ../models/7B/ggml-model-q8_0.bin -f ./wiki.test.raw --n
 time ./bin/perplexity -m ../models/13B/ggml-model-f16.bin  -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-f16.txt
 time ./bin/perplexity -m ../models/13B/ggml-model-q4_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q4_0.txt
 time ./bin/perplexity -m ../models/13B/ggml-model-q4_1.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q4_1.txt
-time ./bin/perplexity -m ../models/13B/ggml-model-q4_2.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q4_2.txt
 time ./bin/perplexity -m ../models/13B/ggml-model-q5_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q5_0.txt
 time ./bin/perplexity -m ../models/13B/ggml-model-q5_1.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q5_1.txt
 time ./bin/perplexity -m ../models/13B/ggml-model-q8_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q8_0.txt

Some files were not shown because too many files changed in this diff