
ggml : remove bit shuffling (#1405)

* ggml : remove Q4_0 bit shuffling (ARM NEON)

* ggml : remove Q4_1 bit shuffling (ARM NEON + reference)

* ggml : nibbles_from_floats() + bytes_from_nibbles() (ARM NEON)

* ggml : remove Q4_2 bit shuffling (WIP, BROKEN)

* ggml : remove Q5_0 bit shuffling (ARM NEON)

* ggml : 2x faster scalar implementations

* ggml : remove Q5_1 bit shuffling (ARM NEON + scalar)

* ggml : simplify scalar dot

* ggml : remove WASM SIMD bit shuffling + remove vzip for ARM 32-bit

* ggml : fix Q4_1 quantization

* ggml : update cuBLAS + normalize variable names

* ggml : remove Q4_2 mode

* ggml : minor formatting

* ggml : fix Q5_0 quantization

* scripts : add script for measuring the time per token

* AVX implementations (#1370)

* ggml : uniform 5th bit extraction

* llama : produce error upon loading old model files

* llama : fix model magic/version write

* ggml : speed-up Q5_0 + Q5_1 at 4 threads

* ggml : preserve old Q4 and Q5 formats

* ggml : simplify Q8_1 - no need for low / high sums anymore

* ggml : fix Q8_0 and Q8_1 rounding

* Revert "AVX implementations (#1370)"

This reverts commit 948d124837f9d287d8490f41338e0e4cceb0814f.

* ggml : fix AVX2 implementation

* sha : update hashes for 7B and 13B

* readme : update timings + remove warning banner

* llama : update v2 PR number to 1405

* ggml : fix WASM comments

* ggml : back to original bit order

* readme : add note that Q4 and Q5 have been changed

* llama : fix return for unknown version

---------

Co-authored-by: Stephan Walter <stephan@walter.name>
Georgi Gerganov, 2 years ago
Commit b9fd7eee57
12 changed files with 298 additions and 879 deletions
  1. .gitignore (+1, -0)
  2. README.md (+13, -21)
  3. SHA256SUMS (+16, -12)
  4. examples/quantize/quantize.cpp (+5, -6)
  5. ggml-cuda.cu (+36, -95)
  6. ggml-opencl.c (+1, -29)
  7. ggml.c (+111, -697)
  8. ggml.h (+1, -3)
  9. llama.cpp (+19, -10)
  10. llama.h (+2, -2)
  11. scripts/perf-run-all.sh (+93, -0)
  12. scripts/ppl-run-all.sh (+0, -4)

+ 1 - 0
.gitignore

@@ -44,5 +44,6 @@ zig-cache/
 
 ppl-*.txt
 qnt-*.txt
+perf-*.txt
 
 examples/jeopardy/results.txt

+ 13 - 21
README.md

@@ -7,18 +7,10 @@
 
 Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 
-## ⚠️ TEMPORARY NOTICE ABOUT UPCOMING BREAKING CHANGE ⚠️
-
-**The quantization formats will soon be updated: https://github.com/ggerganov/llama.cpp/pull/1305**
-
-**All `ggml` model files using the old format will not work with the latest `llama.cpp` code after that change is merged**
-
----
-
 **Hot topics:**
 
+- Quantization formats `Q4` and `Q5` have changed - requantize any old models [(info)](https://github.com/ggerganov/llama.cpp/pull/1405)
 - [Roadmap May 2023](https://github.com/ggerganov/llama.cpp/discussions/1220)
-- [New quantization methods](https://github.com/ggerganov/llama.cpp#quantization)
 
 <details>
   <summary>Table of Contents</summary>
@@ -338,18 +330,18 @@ As the models are currently fully loaded into memory, you will need adequate dis
 
 Several quantization methods are supported. They differ in the resulting model disk size and inference speed.
 
-| Model | Measure      | F16    | Q4_0   | Q4_1   | Q4_2   | Q5_0   | Q5_1   | Q8_0   |
-|------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|-------:|
-|    7B | perplexity   | 5.9066 | 6.1620 | 6.0910 | 6.1466 | 5.9862 | 5.9481 | 5.9069 |
-|    7B | file size    |  13.0G |   4.0G |   4.8G |   4.0G |   4.4G |   4.8G |   7.1G |
-|    7B | ms/tok @ 4th |    128 |     56 |     61 |     84 |     91 |     95 |     75 |
-|    7B | ms/tok @ 8th |    128 |     47 |     55 |     48 |     53 |     59 |     75 |
-|    7B | bits/weight  |   16.0 |    5.0 |    6.0 |    5.0 |    5.5 |    6.0 |    9.0 |
-|   13B | perplexity   | 5.2543 | 5.3863 | 5.3607 | 5.3513 | 5.2856 | 5.2706 | 5.2548 |
-|   13B | file size    |  25.0G |   7.6G |   9.1G |   7.6G |   8.4G |   9.1G |    14G |
-|   13B | ms/tok @ 4th |    239 |    104 |    113 |    160 |    176 |    185 |    141 |
-|   13B | ms/tok @ 8th |    240 |     85 |     99 |     97 |    108 |    117 |    147 |
-|   13B | bits/weight  |   16.0 |    5.0 |    6.0 |    5.0 |    5.5 |    6.0 |    9.0 |
+| Model | Measure      | F16    | Q4_0   | Q4_1   | Q5_0   | Q5_1   | Q8_0   |
+|------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|
+|    7B | perplexity   | 5.9066 | 6.1620 | 6.0910 | 5.9862 | 5.9481 | 5.9069 |
+|    7B | file size    |  13.0G |   4.0G |   4.8G |   4.4G |   4.8G |   7.1G |
+|    7B | ms/tok @ 4th |    128 |     50 |     54 |     75 |     83 |     75 |
+|    7B | ms/tok @ 8th |    123 |     44 |     52 |     53 |     58 |     72 |
+|    7B | bits/weight  |   16.0 |    5.0 |    6.0 |    5.5 |    6.0 |    9.0 |
+|   13B | perplexity   | 5.2543 | 5.3863 | 5.3607 | 5.2856 | 5.2706 | 5.2548 |
+|   13B | file size    |  25.0G |   7.6G |   9.1G |   8.4G |   9.1G |    14G |
+|   13B | ms/tok @ 4th |    239 |     93 |    101 |    150 |    164 |    141 |
+|   13B | ms/tok @ 8th |    240 |     81 |     96 |     96 |    104 |    136 |
+|   13B | bits/weight  |   16.0 |    5.0 |    6.0 |    5.5 |    6.0 |    9.0 |
 
 ### Perplexity (measuring model quality)
 

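The bits/weight rows in the table above follow directly from the block layouts (see the struct definitions in ggml-cuda.cu below); each block covers 32 weights:

    Q4_0: 4 B scale (float)                       + 16 B nibbles = 20 B / 32 w = 5.0 bpw
    Q4_1: 8 B scale+min (2x float)                + 16 B nibbles = 24 B / 32 w = 6.0 bpw
    Q5_0: 2 B scale (half)     + 4 B 5th bits     + 16 B nibbles = 22 B / 32 w = 5.5 bpw
    Q5_1: 4 B scale+min (2x half) + 4 B 5th bits  + 16 B nibbles = 24 B / 32 w = 6.0 bpw
    Q8_0: 4 B scale (float)                       + 32 B int8    = 36 B / 32 w = 9.0 bpw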
+ 16 - 12
SHA256SUMS

@@ -1,24 +1,27 @@
 700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d  models/7B/consolidated.00.pth
 666a4bb533b303bdaf89e1b6a3b6f93535d868de31d903afdc20983dc526c847  models/7B/ggml-model-f16.bin
-99aeb35f26b577fa2732716cca4d8b5ada39a78ea9b2dca2651fc632b5d101b6  models/7B/ggml-model-q4_0.bin
-cc061458339a3eb8bcecbf0a825e9924fb7d1a8150f63cd5d091caa99215aafe  models/7B/ggml-model-q4_1.bin
-25b050337a87344da687a7f2adddc03bd99b7f6c140450e836649f3585fb6496  models/7B/ggml-model-q4_2.bin
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/7B/ggml-model-q4_0.bin
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/7B/ggml-model-q4_1.bin
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/7B/ggml-model-q5_0.bin
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/7B/ggml-model-q5_1.bin
 7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265  models/7B/params.json
 745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08  models/13B/consolidated.00.pth
 d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085  models/13B/consolidated.01.pth
 2b206e9b21fb1076f11cafc624e2af97c9e48ea09312a0962153acc20d45f808  models/13B/ggml-model-f16.bin
-eecb575d325d935157761172e2bf05984dad216eb2b06777b73463cf9b818bab  models/13B/ggml-model-q4_0.bin
-d9581b5b88e5622532fe897c9f9b0e67a317d22dd27a6f90fa4ab8c6d23ccdbb  models/13B/ggml-model-q4_1.bin
-75a218a47df03f5f96354656329864613abcb67779412b9bc2282b28c1c3cbaa  models/13B/ggml-model-q4_2.bin
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/13B/ggml-model-q4_0.bin
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/13B/ggml-model-q4_1.bin
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/13B/ggml-model-q5_0.bin
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/13B/ggml-model-q5_1.bin
 4ab77bec4d4405ccb66a97b282574c89a94417e3c32e5f68f37e2876fc21322f  models/13B/params.json
 e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067  models/30B/consolidated.00.pth
 4e077b7136c7ae2302e954860cf64930458d3076fcde9443f4d0e939e95903ff  models/30B/consolidated.01.pth
 24a87f01028cbd3a12de551dcedb712346c0b5cbdeff1454e0ddf2df9b675378  models/30B/consolidated.02.pth
 1adfcef71420886119544949767f6a56cb6339b4d5fcde755d80fe68b49de93b  models/30B/consolidated.03.pth
 7e1b524061a9f4b27c22a12d6d2a5bf13b8ebbea73e99f218809351ed9cf7d37  models/30B/ggml-model-f16.bin
-517b9e525742c42b5478a6280a4b41ec66f46298c57aba7f0453d491682fe42d  models/30B/ggml-model-q4_0.bin
-7b75ac615fa369ee593493a7e6ef87542bf0350255db928b22c5a24f6d598bcd  models/30B/ggml-model-q4_1.bin
-aadbc9cf806313a55be570f62884eed289d30c313fac3b7838717e01bd553204  models/30B/ggml-model-q4_2.bin
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/30B/ggml-model-q4_0.bin
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/30B/ggml-model-q4_1.bin
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/30B/ggml-model-q5_0.bin
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/30B/ggml-model-q5_1.bin
 2c07118ea98d69dbe7810d88520e30288fa994751b337f8fca02b171955f44cb  models/30B/params.json
 135c563f6b3938114458183afb01adc9a63bef3d8ff7cccc3977e5d3664ecafe  models/65B/consolidated.00.pth
 9a600b37b19d38c7e43809485f70d17d1dc12206c07efa83bc72bb498a568bde  models/65B/consolidated.01.pth
@@ -29,8 +32,9 @@ a287c0dfe49081626567c7fe87f74cce5831f58e459b427b5e05567641f47b78  models/65B/con
 72b4eba67a1a3b18cb67a85b70f8f1640caae9b40033ea943fb166bd80a7b36b  models/65B/consolidated.06.pth
 d27f5b0677d7ff129ceacd73fd461c4d06910ad7787cf217b249948c3f3bc638  models/65B/consolidated.07.pth
 60758f2384d74e423dffddfd020ffed9d3bb186ebc54506f9c4a787d0f5367b0  models/65B/ggml-model-f16.bin
-01672072136f8be6ca9d7cebe5f86ed316e8b85851b9fe3de951809233cea4f2  models/65B/ggml-model-q4_0.bin
-4743a28aac3e5f32a6e838a815f51d3779de44fbbe251d745251e66c23c5950f  models/65B/ggml-model-q4_1.bin
-1b6f6588d0e2ecfe6c4d849088e48e5e3083466b962daa32e3261363e21fc5e9  models/65B/ggml-model-q4_2.bin
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/65B/ggml-model-q4_0.bin
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/65B/ggml-model-q4_1.bin
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/65B/ggml-model-q5_0.bin
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/65B/ggml-model-q5_1.bin
 999ed1659b469ccc2a941714c0a9656fa571d17c9f7c8c7589817ca90edef51b  models/65B/params.json
 9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347  models/tokenizer.model

+ 5 - 6
examples/quantize/quantize.cpp

@@ -7,12 +7,11 @@
 #include <string>
 
 static const std::map<std::string, llama_ftype> LLAMA_FTYPE_MAP = {
-    {"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
-    {"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
-    {"q4_2", LLAMA_FTYPE_MOSTLY_Q4_2},
-    {"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0},
-    {"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1},
-    {"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},
+  {"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
+  {"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
+  {"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0},
+  {"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1},
+  {"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},
 };
 
 bool try_parse_ftype(const std::string & ftype_str, llama_ftype & ftype, std::string & ftype_str_out) {
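
With Q4_2 removed, the ftype strings the tool accepts are q4_0, q4_1, q5_0, q5_1 and q8_0; the invocation itself is unchanged, e.g.:

    ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q4_0.bin q4_0

(the same command the scripts at the end of this diff use).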

+ 36 - 95
ggml-cuda.cu

@@ -49,13 +49,6 @@ typedef struct {
 } block_q4_1;
 static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
 
-#define QK4_2 16
-typedef struct {
-    half  d;                // delta
-    uint8_t qs[QK4_2 / 2];  // nibbles / quants
-} block_q4_2;
-static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding");
-
 #define QK5_0 32
 typedef struct {
     half d;                 // delta
@@ -81,29 +74,26 @@ typedef struct {
 static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
 
 static __global__ void dequantize_block_q4_0(const void * vx, float * y) {
+    static const int qk = QK4_0;
+
     const block_q4_0 * x = (const block_q4_0 *) vx;
 
     const int i = blockIdx.x;
 
     const float d = x[i].d;
 
-    const uint8_t * pp = x[i].qs;
-
-    for (int l = 0; l < QK4_0; l += 2) {
-        const uint8_t vi = pp[l/2];
-
-        const int8_t vi0 = vi & 0xf;
-        const int8_t vi1 = vi >> 4;
+    for (int j = 0; j < qk/2; ++j) {
+        const int x0 = (x[i].qs[j] & 0xf) - 8;
+        const int x1 = (x[i].qs[j] >>  4) - 8;
 
-        const float v0 = (vi0 - 8)*d;
-        const float v1 = (vi1 - 8)*d;
-
-        y[i*QK4_0 + l + 0] = v0;
-        y[i*QK4_0 + l + 1] = v1;
+        y[i*qk + j + 0   ] = x0*d;
+        y[i*qk + j + qk/2] = x1*d;
     }
 }
 
 static __global__ void dequantize_block_q4_1(const void * vx, float * y) {
+    static const int qk = QK4_1;
+
     const block_q4_1 * x = (const block_q4_1 *) vx;
 
     const int i = blockIdx.x;
@@ -111,75 +101,42 @@ static __global__ void dequantize_block_q4_1(const void * vx, float * y) {
     const float d = x[i].d;
     const float m = x[i].m;
 
-    const uint8_t * pp = x[i].qs;
-
-    for (int l = 0; l < QK4_1; l += 2) {
-        const uint8_t vi = pp[l/2];
-
-        const int8_t vi0 = vi & 0xf;
-        const int8_t vi1 = vi >> 4;
+    for (int j = 0; j < qk/2; ++j) {
+        const int x0 = (x[i].qs[j] & 0xf);
+        const int x1 = (x[i].qs[j] >>  4);
 
-        const float v0 = vi0*d + m;
-        const float v1 = vi1*d + m;
-
-        y[i*QK4_1 + l + 0] = v0;
-        y[i*QK4_1 + l + 1] = v1;
-    }
-}
-
-static __global__ void dequantize_block_q4_2(const void * vx, float * y) {
-    const block_q4_2 * x = (const block_q4_2 *) vx;
-
-    const int i = blockIdx.x;
-
-    const float d = x[i].d;
-
-    const uint8_t * pp = x[i].qs;
-
-    for (int l = 0; l < QK4_2; l += 2) {
-        const uint8_t vi = pp[l/2];
-
-        const int8_t vi0 = vi & 0xf;
-        const int8_t vi1 = vi >> 4;
-
-        const float v0 = (vi0 - 8)*d;
-        const float v1 = (vi1 - 8)*d;
-
-        y[i*QK4_2 + l + 0] = v0;
-        y[i*QK4_2 + l + 1] = v1;
+        y[i*qk + j + 0   ] = x0*d + m;
+        y[i*qk + j + qk/2] = x1*d + m;
     }
 }
 
 static __global__ void dequantize_block_q5_0(const void * vx, float * y) {
+    static const int qk = QK5_0;
+
     const block_q5_0 * x = (const block_q5_0 *) vx;
 
     const int i = blockIdx.x;
 
     const float d = x[i].d;
 
-    const uint8_t * pp = x[i].qs;
-
     uint32_t qh;
     memcpy(&qh, x[i].qh, sizeof(qh));
 
-    for (int l = 0; l < QK5_0; l += 2) {
-        const uint8_t vi = pp[l/2];
-
-        const int8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
-        const int8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
+    for (int j = 0; j < qk/2; ++j) {
+        const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;
+        const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;
 
-        const int8_t vi0 = ((vi & 0xf) | vh0);
-        const int8_t vi1 = ((vi >>  4) | vh1);
+        const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16;
+        const int32_t x1 = ((x[i].qs[j] >>  4) | xh_1) - 16;
 
-        const float v0 = (vi0 - 16)*d;
-        const float v1 = (vi1 - 16)*d;
-
-        y[i*QK5_0 + l + 0] = v0;
-        y[i*QK5_0 + l + 1] = v1;
+        y[i*qk + j + 0   ] = x0*d;
+        y[i*qk + j + qk/2] = x1*d;
     }
 }
 
 static __global__ void dequantize_block_q5_1(const void * vx, float * y) {
+    static const int qk = QK5_1;
+
     const block_q5_1 * x = (const block_q5_1 *) vx;
 
     const int i = blockIdx.x;
@@ -187,41 +144,32 @@ static __global__ void dequantize_block_q5_1(const void * vx, float * y) {
     const float d = x[i].d;
     const float m = x[i].m;
 
-    const uint8_t * pp = x[i].qs;
-
     uint32_t qh;
     memcpy(&qh, x[i].qh, sizeof(qh));
 
-    for (int l = 0; l < QK5_1; l += 2) {
-        const uint8_t vi = pp[l/2];
-
-        const int8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
-        const int8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
+    for (int j = 0; j < qk/2; ++j) {
+        const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;
+        const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;
 
-        const int8_t vi0 = (vi & 0xf) | vh0;
-        const int8_t vi1 = (vi >>  4) | vh1;
+        const int x0 = (x[i].qs[j] & 0xf) | xh_0;
+        const int x1 = (x[i].qs[j] >>  4) | xh_1;
 
-        const float v0 = vi0*d + m;
-        const float v1 = vi1*d + m;
-
-        y[i*QK5_1 + l + 0] = v0;
-        y[i*QK5_1 + l + 1] = v1;
+        y[i*qk + j + 0   ] = x0*d + m;
+        y[i*qk + j + qk/2] = x1*d + m;
     }
 }
 
 static __global__ void dequantize_block_q8_0(const void * vx, float * y) {
+    static const int qk = QK8_0;
+
     const block_q8_0 * x = (const block_q8_0 *) vx;
 
     const int i = blockIdx.x;
 
     const float d = x[i].d;
 
-    const int8_t * pp = x[i].qs;
-
-    for (int l = 0; l < QK8_0; l++) {
-        const int8_t vi = pp[l];
-
-        y[i*QK8_0 + l] = vi*d;
+    for (int j = 0; j < qk; ++j) {
+        y[i*qk + j] = x[i].qs[j]*d;
     }
 }
 
@@ -235,11 +183,6 @@ static void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStre
     dequantize_block_q4_1<<<nb, 1, 0, stream>>>(vx, y);
 }
 
-static void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
-    const int nb = k / QK4_2;
-    dequantize_block_q4_2<<<nb, 1, 0, stream>>>(vx, y);
-}
-
 static void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
     const int nb = k / QK5_0;
     dequantize_block_q5_0<<<nb, 1, 0, stream>>>(vx, y);
@@ -274,8 +217,6 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
             return dequantize_row_q4_0_cuda;
         case GGML_TYPE_Q4_1:
             return dequantize_row_q4_1_cuda;
-        case GGML_TYPE_Q4_2:
-            return dequantize_row_q4_2_cuda;
         case GGML_TYPE_Q5_0:
             return dequantize_row_q5_0_cuda;
         case GGML_TYPE_Q5_1:
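
With the bit shuffling removed, the kernels above read the new layout directly: byte j of qs packs element j in its low nibble and element j + qk/2 in its high nibble (instead of interleaving even/odd elements), and for Q5 the fifth bits sit in qh in plain element order. Below is a minimal scalar sketch of the same dequantization for a single block, with simplified types (a plain float scale where the real structs use half / ggml_fp16_t); it mirrors the CUDA code but is not the actual ggml implementation:

    #include <stdint.h>
    #include <string.h>

    #define QK 32

    typedef struct {
        float   d;          // delta (scale); the real block_q4_0 stores this as half
        uint8_t qs[QK/2];   // byte j: element j in the low nibble, element j + QK/2 in the high nibble
    } q4_0_sketch;

    typedef struct {
        float   d;          // delta (scale)
        uint8_t qh[4];      // 32 fifth bits, one per element, in element order
        uint8_t qs[QK/2];   // low 4 bits of each quant, packed as in Q4_0
    } q5_0_sketch;

    static void dequant_q4_0(const q4_0_sketch * b, float * y) {
        for (int j = 0; j < QK/2; ++j) {
            y[j       ] = ((b->qs[j] & 0xf) - 8)*b->d;   // first half of the block
            y[j + QK/2] = ((b->qs[j] >>  4) - 8)*b->d;   // second half of the block
        }
    }

    static void dequant_q5_0(const q5_0_sketch * b, float * y) {
        uint32_t qh;
        memcpy(&qh, b->qh, sizeof(qh));
        for (int j = 0; j < QK/2; ++j) {
            // bit j of qh is the fifth bit of element j; bit j + 16 belongs to
            // element j + 16, and (qh >> (j + 12)) & 0x10 lands it at position 4
            const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;
            const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;
            y[j       ] = (((b->qs[j] & 0xf) | xh_0) - 16)*b->d;
            y[j + QK/2] = (((b->qs[j] >>  4) | xh_1) - 16)*b->d;
        }
    }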

+ 1 - 29
ggml-opencl.c

@@ -52,26 +52,6 @@ __kernel void dequantize_row_q4_1(__global struct block_q4_1* blocks, __global f
     result[index + 1] = (vi >> 4) * d + m;
 }
 
-struct block_q4_2
-{
-    ushort d;
-    uchar qs[8];
-};
-
-__kernel void dequantize_row_q4_2(__global struct block_q4_2* blocks, __global float* result) {
-    const uint i = get_global_id(0) / 16;
-    const uint l = get_local_id(0);
-
-    const float d = vload_half(0, (__global half*) &blocks[i].d);
-
-    const uchar vi = blocks[i].qs[l];
-
-    const uint index = i*16 + l*2;
-    result[index + 0] = ((vi & 0xf) - 8)*d;
-    result[index + 1] = ((vi >> 4) - 8)*d;
-}
-
-
 struct block_q5_0
 {
     float d;
@@ -167,7 +147,7 @@ static cl_device_id device;
 static cl_context context;
 static cl_command_queue queue;
 static cl_program program;
-static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2, kernel_q5_0, kernel_q5_1, kernel_q8_0;
+static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q5_0, kernel_q5_1, kernel_q8_0;
 static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c;
 static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0;
 
@@ -238,8 +218,6 @@ void ggml_cl_init(void) {
     CL_CHECK(err, "clCreateKernel");
     kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err);
     CL_CHECK(err, "clCreateKernel");
-    kernel_q4_2 = clCreateKernel(program, "dequantize_row_q4_2", &err);
-    CL_CHECK(err, "clCreateKernel");
     kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err);
     CL_CHECK(err, "clCreateKernel");
     kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err);
@@ -292,12 +270,6 @@ void ggml_cl_sgemm_wrapper(
         local = 16;
         size_qb = global * (sizeof(float) * 2 + local) / 32;
         break;
-    case GGML_TYPE_Q4_2:
-        dequant = true;
-        kernel = kernel_q4_2;
-        local = 8;
-        size_qb = global * (sizeof(ggml_fp16_t) + local) / 16;
-        break;
     case GGML_TYPE_Q5_0:
         dequant = true;
         kernel = kernel_q5_0;

File diff suppressed because it is too large
+ 111 - 697
ggml.c


+ 1 - 3
ggml.h

@@ -231,7 +231,7 @@ extern "C" {
         GGML_TYPE_F16  = 1,
         GGML_TYPE_Q4_0 = 2,
         GGML_TYPE_Q4_1 = 3,
-        GGML_TYPE_Q4_2 = 4,
+        // GGML_TYPE_Q4_2 = 4, support has been removed
         // GGML_TYPE_Q4_3 (5) support has been removed
         GGML_TYPE_Q5_0 = 6,
         GGML_TYPE_Q5_1 = 7,
@@ -251,7 +251,6 @@ extern "C" {
         GGML_FTYPE_MOSTLY_Q4_0 = 2,  // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_1 = 3,  // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-        GGML_FTYPE_MOSTLY_Q4_2 = 5,  // except 1d tensors
         GGML_FTYPE_MOSTLY_Q8_0 = 7,  // except 1d tensors
         GGML_FTYPE_MOSTLY_Q5_0 = 8,  // except 1d tensors
         GGML_FTYPE_MOSTLY_Q5_1 = 9,  // except 1d tensors
@@ -876,7 +875,6 @@ extern "C" {
 
     GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
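
The surviving quantize entry points keep a common signature. A usage sketch, assuming (as in the ggml.c of this period) that n is the total number of input floats, k the row length, hist a 16-bucket histogram of the produced quant values, and the return value the number of bytes written:

    #include <stdint.h>
    #include "ggml.h"

    static size_t quantize_example(void) {
        float   src[32] = {0};              // toy input: one row of 32 weights
        uint8_t dst[sizeof(float) + 16];    // one Q4_0 block: float scale + 16 nibble bytes
        int64_t hist[16] = {0};             // histogram of 4-bit quant values
        return ggml_quantize_q4_0(src, dst, /*n=*/32, /*k=*/32, hist);
    }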

+ 19 - 10
llama.cpp

@@ -402,6 +402,7 @@ enum llama_file_version {
     LLAMA_FILE_VERSION_GGML,
     LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
     LLAMA_FILE_VERSION_GGJT_V1, // added padding
+    LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
 };
 
 struct llama_file_loader {
@@ -432,6 +433,8 @@ struct llama_file_loader {
             file_version = LLAMA_FILE_VERSION_GGMF_V1;
         } else if (magic == 'ggjt' && version == 1) {
             file_version = LLAMA_FILE_VERSION_GGJT_V1;
+        } else if (magic == 'ggjt' && version == 2) {
+            file_version = LLAMA_FILE_VERSION_GGJT_V2;
         } else {
             throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
                          magic, version);
@@ -482,7 +485,6 @@ struct llama_file_loader {
                 case GGML_TYPE_F16:
                 case GGML_TYPE_Q4_0:
                 case GGML_TYPE_Q4_1:
-                case GGML_TYPE_Q4_2:
                 case GGML_TYPE_Q5_0:
                 case GGML_TYPE_Q5_1:
                 case GGML_TYPE_Q8_0:
@@ -527,8 +529,8 @@ struct llama_file_saver {
         write_vocab();
     }
     void write_magic() {
-        file.write_u32('ggjt'); // magic
-        file.write_u32(1); // version
+        file.write_u32(LLAMA_FILE_MAGIC);   // magic
+        file.write_u32(LLAMA_FILE_VERSION); // version
     }
     void write_hparams(enum llama_ftype new_ftype) {
         const llama_hparams & hparams = any_file_loader->hparams;
@@ -558,7 +560,6 @@ struct llama_file_saver {
             case GGML_TYPE_F16:
             case GGML_TYPE_Q4_0:
             case GGML_TYPE_Q4_1:
-            case GGML_TYPE_Q4_2:
             case GGML_TYPE_Q5_0:
             case GGML_TYPE_Q5_1:
             case GGML_TYPE_Q8_0:
@@ -839,9 +840,11 @@ static const char *llama_file_version_name(llama_file_version version) {
     switch (version) {
         case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
         case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
-        case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (latest)";
-        default: LLAMA_ASSERT(false);
+        case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
+        case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (latest)";
     }
+
+    return "unknown";
 }
 
 static const char *llama_ftype_name(enum llama_ftype ftype) {
@@ -852,7 +855,6 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
         case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
                                       return "mostly Q4_1, some F16";
-        case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
         case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
         case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
         case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
@@ -918,6 +920,14 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: model size = %s\n",  __func__, llama_model_type_name(model.type));
     }
 
+    if (file_version != LLAMA_FILE_VERSION_GGJT_V2) {
+        if (hparams.ftype != LLAMA_FTYPE_ALL_F32     &&
+            hparams.ftype != LLAMA_FTYPE_MOSTLY_F16  &&
+            hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
+            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)");
+        }
+    }
+
     if (vocab_only) {
         return;
     }
@@ -1905,7 +1915,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     switch (ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
         case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
-        case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
         case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
         case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
         case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
@@ -2813,9 +2822,9 @@ void llama_print_timings(struct llama_context * ctx) {
 
     fprintf(stderr, "\n");
     fprintf(stderr, "%s:        load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
-    fprintf(stderr, "%s:      sample time = %8.2f ms / %5d runs   (%8.2f ms per run)\n",   __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
+    fprintf(stderr, "%s:      sample time = %8.2f ms / %5d runs   (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
     fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval);
-    fprintf(stderr, "%s:        eval time = %8.2f ms / %5d runs   (%8.2f ms per run)\n",   __func__, 1e-3 * ctx->t_eval_us,   n_eval,   1e-3 * ctx->t_eval_us   / n_eval);
+    fprintf(stderr, "%s:        eval time = %8.2f ms / %5d runs   (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_eval_us,   n_eval,   1e-3 * ctx->t_eval_us   / n_eval);
     fprintf(stderr, "%s:       total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
 }
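
Restating the compatibility gate added above as a single predicate (the helper below is hypothetical, not part of the codebase): GGJT v2 files always load, while older files load only if they contain no reworked quantized tensors, i.e. F32, F16 and Q8_0 remain accepted:

    // hypothetical summary of the check in llama_model_load_internal
    static bool llama_quant_format_usable(llama_file_version ver, llama_ftype ftype) {
        if (ver == LLAMA_FILE_VERSION_GGJT_V2) {
            return true; // current format: everything loads
        }
        // pre-v2 file: only formats whose on-disk layout is unchanged
        return ftype == LLAMA_FTYPE_ALL_F32    ||
               ftype == LLAMA_FTYPE_MOSTLY_F16 ||
               ftype == LLAMA_FTYPE_MOSTLY_Q8_0;
    }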
 

+ 2 - 2
llama.h

@@ -19,7 +19,7 @@
 #    define LLAMA_API
 #endif
 
-#define LLAMA_FILE_VERSION           1
+#define LLAMA_FILE_VERSION           2
 #define LLAMA_FILE_MAGIC             'ggjt'
 #define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml'
 #define LLAMA_SESSION_MAGIC          'ggsn'
@@ -78,7 +78,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q4_0 = 2,  // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_1 = 3,  // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-        LLAMA_FTYPE_MOSTLY_Q4_2 = 5,  // except 1d tensors
+        // LLAMA_FTYPE_MOSTLY_Q4_2 = 5,  // support has been removed
         // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
         LLAMA_FTYPE_MOSTLY_Q8_0 = 7,  // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q5_0 = 8,  // except 1d tensors

+ 93 - 0
scripts/perf-run-all.sh

@@ -0,0 +1,93 @@
+#!/bin/bash
+#
+# Measure the performance (time per token) of the various quantization techniques
+#
+
+QUANTIZE=0
+if [ "$1" != "" ]; then
+    echo "Quantizing"
+    QUANTIZE=1
+fi
+
+if [ "$QUANTIZE" != "0" ]; then
+    #
+    # quantize
+    #
+
+    # 7B
+    time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q4_0.bin q4_0 2>&1 | tee ../qnt-7b-q4_0.txt
+    time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q4_1.bin q4_1 2>&1 | tee ../qnt-7b-q4_1.txt
+    time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q5_0.bin q5_0 2>&1 | tee ../qnt-7b-q5_0.txt
+    time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q5_1.bin q5_1 2>&1 | tee ../qnt-7b-q5_1.txt
+    time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q8_0.bin q8_0 2>&1 | tee ../qnt-7b-q8_0.txt
+
+    # 13B
+    time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q4_0.bin q4_0 2>&1 | tee ../qnt-13b-q4_0.txt
+    time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q4_1.bin q4_1 2>&1 | tee ../qnt-13b-q4_1.txt
+    time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q5_0.bin q5_0 2>&1 | tee ../qnt-13b-q5_0.txt
+    time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q5_1.bin q5_1 2>&1 | tee ../qnt-13b-q5_1.txt
+    time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q8_0.bin q8_0 2>&1 | tee ../qnt-13b-q8_0.txt
+fi
+
+#
+# perf
+# run each command twice
+#
+
+set -x
+
+# 7B - 4 threads
+     ./bin/main -m ../models/7B/ggml-model-f16.bin  -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
+time ./bin/main -m ../models/7B/ggml-model-f16.bin  -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-7b-f16.txt  | grep llama_print_timings
+     ./bin/main -m ../models/7B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
+time ./bin/main -m ../models/7B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-7b-q4_0.txt | grep llama_print_timings
+     ./bin/main -m ../models/7B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
+time ./bin/main -m ../models/7B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-7b-q4_1.txt | grep llama_print_timings
+     ./bin/main -m ../models/7B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
+time ./bin/main -m ../models/7B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-7b-q5_0.txt | grep llama_print_timings
+     ./bin/main -m ../models/7B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
+time ./bin/main -m ../models/7B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-7b-q5_1.txt | grep llama_print_timings
+     ./bin/main -m ../models/7B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
+time ./bin/main -m ../models/7B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-7b-q8_0.txt | grep llama_print_timings
+
+# 7B - 8 threads
+     ./bin/main -m ../models/7B/ggml-model-f16.bin  -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
+time ./bin/main -m ../models/7B/ggml-model-f16.bin  -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-7b-f16.txt  | grep llama_print_timings
+     ./bin/main -m ../models/7B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
+time ./bin/main -m ../models/7B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-7b-q4_0.txt | grep llama_print_timings
+     ./bin/main -m ../models/7B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
+time ./bin/main -m ../models/7B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-7b-q4_1.txt | grep llama_print_timings
+     ./bin/main -m ../models/7B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
+time ./bin/main -m ../models/7B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-7b-q5_0.txt | grep llama_print_timings
+     ./bin/main -m ../models/7B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
+time ./bin/main -m ../models/7B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-7b-q5_1.txt | grep llama_print_timings
+     ./bin/main -m ../models/7B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
+time ./bin/main -m ../models/7B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-7b-q8_0.txt | grep llama_print_timings
+
+# 13B - 4 threads
+     ./bin/main -m ../models/13B/ggml-model-f16.bin  -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
+time ./bin/main -m ../models/13B/ggml-model-f16.bin  -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-13b-f16.txt  | grep llama_print_timings
+     ./bin/main -m ../models/13B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
+time ./bin/main -m ../models/13B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-13b-q4_0.txt | grep llama_print_timings
+     ./bin/main -m ../models/13B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
+time ./bin/main -m ../models/13B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-13b-q4_1.txt | grep llama_print_timings
+     ./bin/main -m ../models/13B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
+time ./bin/main -m ../models/13B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-13b-q5_0.txt | grep llama_print_timings
+     ./bin/main -m ../models/13B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
+time ./bin/main -m ../models/13B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-13b-q5_1.txt | grep llama_print_timings
+     ./bin/main -m ../models/13B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
+time ./bin/main -m ../models/13B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-13b-q8_0.txt | grep llama_print_timings
+
+# 13B - 8 threads
+     ./bin/main -m ../models/13B/ggml-model-f16.bin  -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
+time ./bin/main -m ../models/13B/ggml-model-f16.bin  -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-13b-f16.txt  | grep llama_print_timings
+     ./bin/main -m ../models/13B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
+time ./bin/main -m ../models/13B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-13b-q4_0.txt | grep llama_print_timings
+     ./bin/main -m ../models/13B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
+time ./bin/main -m ../models/13B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-13b-q4_1.txt | grep llama_print_timings
+     ./bin/main -m ../models/13B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
+time ./bin/main -m ../models/13B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-13b-q5_0.txt | grep llama_print_timings
+     ./bin/main -m ../models/13B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
+time ./bin/main -m ../models/13B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-13b-q5_1.txt | grep llama_print_timings
+     ./bin/main -m ../models/13B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
+time ./bin/main -m ../models/13B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-13b-q8_0.txt | grep llama_print_timings
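
Note: the relative paths (./bin/main, ../models/...) assume the script is launched from a build directory next to models/; passing any non-empty first argument, e.g. ../scripts/perf-run-all.sh quantize, runs the quantization pass before the timing runs.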

+ 0 - 4
scripts/ppl-run-all.sh

@@ -7,7 +7,6 @@
 # 7B
 time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q4_0.bin q4_0 2>&1 | tee ../qnt-7b-q4_0.txt
 time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q4_1.bin q4_1 2>&1 | tee ../qnt-7b-q4_1.txt
-time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q4_2.bin q4_2 2>&1 | tee ../qnt-7b-q4_2.txt
 time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q5_0.bin q5_0 2>&1 | tee ../qnt-7b-q5_0.txt
 time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q5_1.bin q5_1 2>&1 | tee ../qnt-7b-q5_1.txt
 time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q8_0.bin q8_0 2>&1 | tee ../qnt-7b-q8_0.txt
@@ -15,7 +14,6 @@ time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q8_0
 # 13B
 time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q4_0.bin q4_0 2>&1 | tee ../qnt-13b-q4_0.txt
 time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q4_1.bin q4_1 2>&1 | tee ../qnt-13b-q4_1.txt
-time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q4_2.bin q4_2 2>&1 | tee ../qnt-13b-q4_2.txt
 time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q5_0.bin q5_0 2>&1 | tee ../qnt-13b-q5_0.txt
 time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q5_1.bin q5_1 2>&1 | tee ../qnt-13b-q5_1.txt
 time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q8_0.bin q8_0 2>&1 | tee ../qnt-13b-q8_0.txt
@@ -28,7 +26,6 @@ time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q8
 time ./bin/perplexity -m ../models/7B/ggml-model-f16.bin  -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-f16.txt
 time ./bin/perplexity -m ../models/7B/ggml-model-q4_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q4_0.txt
 time ./bin/perplexity -m ../models/7B/ggml-model-q4_1.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q4_1.txt
-time ./bin/perplexity -m ../models/7B/ggml-model-q4_2.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q4_2.txt
 time ./bin/perplexity -m ../models/7B/ggml-model-q5_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q5_0.txt
 time ./bin/perplexity -m ../models/7B/ggml-model-q5_1.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q5_1.txt
 time ./bin/perplexity -m ../models/7B/ggml-model-q8_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q8_0.txt
@@ -37,7 +34,6 @@ time ./bin/perplexity -m ../models/7B/ggml-model-q8_0.bin -f ./wiki.test.raw --n
 time ./bin/perplexity -m ../models/13B/ggml-model-f16.bin  -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-f16.txt
 time ./bin/perplexity -m ../models/13B/ggml-model-q4_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q4_0.txt
 time ./bin/perplexity -m ../models/13B/ggml-model-q4_1.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q4_1.txt
-time ./bin/perplexity -m ../models/13B/ggml-model-q4_2.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q4_2.txt
 time ./bin/perplexity -m ../models/13B/ggml-model-q5_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q5_0.txt
 time ./bin/perplexity -m ../models/13B/ggml-model-q5_1.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q5_1.txt
 time ./bin/perplexity -m ../models/13B/ggml-model-q8_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q8_0.txt

Some files were not shown because too many files changed in this diff