@@ -6,6 +6,8 @@
 #include <regex>
 #include <iostream>
 #include <iterator>
+#include <queue>
 #include <string>
+#include <string_view>
 #include <math.h>

@@ -294,58 +296,162 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
     return tokens;
 }

-// TODO: Calculate this constant from the vocabulary
-#define MAX_TOKEN_LEN 18
-// SentencePiece implementation after https://guillaume-be.github.io/2020-05-30/sentence_piece
-std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) {
-    std::vector<gpt_vocab::id> res;
-    std::vector<int> score;
-    std::vector<gpt_vocab::id> prev;
-    int len = text.length();
-
-    score.resize(len + 1);
-    prev.resize(len + 1);
-
-    // Forward pass
-    for (int i = 0; i < len; i++) {
-        int max_len = std::min(len - i, MAX_TOKEN_LEN);
-        for (int sub_len = 1; sub_len <= max_len; sub_len++) {
-            auto sub = text.substr(i, sub_len);
-            auto token = vocab.token_to_id.find(sub);
-            if (token != vocab.token_to_id.end()) {
-                int token_score = sub.length() * sub.length();
-                int local_score = score[i] + token_score;
-                int next = i + sub_len;
-                if (score[next] < local_score) {
-                    score[next] = local_score;
-                    prev[next] = (*token).second;
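+// Number of bytes in the utf8 sequence that starts with this byte, derived
+// from the high nibble of the leading byte (invalid bytes map to length 1).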
+static size_t utf8_len(char src) {
+    const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
+    uint8_t highbits = static_cast<uint8_t>(src) >> 4;
+    return lookup[highbits];
+}
+
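+// A symbol is a node in a doubly-linked list over slices of the input text,
+// so merging two neighbours never copies string data.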
+struct llama_sp_symbol {
+    using index = int;
+    index prev;
+    index next;
+    std::string_view text;
+};
+
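+// A candidate merge of two adjacent symbols; the priority queue yields the
+// highest-scoring candidate first and breaks ties towards the leftmost pair.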
+struct llama_sp_bigram {
+    struct comparator {
+        bool operator()(llama_sp_bigram & l, llama_sp_bigram & r) {
+            return (l.score < r.score) || (l.score == r.score && l.left > r.left);
+        }
+    };
+    using queue_storage = std::vector<llama_sp_bigram>;
+    using queue = std::priority_queue<llama_sp_bigram, queue_storage, comparator>;
+    llama_sp_symbol::index left;
+    llama_sp_symbol::index right;
+    float score;
+    size_t size;
+};
+
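+// Greedy SentencePiece-style tokenizer: split the text into single utf8
+// characters, then repeatedly merge the adjacent pair with the best score.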
+struct llama_tokenizer {
+    llama_tokenizer(const gpt_vocab & vocab): vocab_(vocab) {}
+
+    void tokenize(std::string_view text, std::vector<gpt_vocab::id> & output) {
+        // split string into utf8 chars
+        int index = 0;
+        while (!text.empty()) {
+            llama_sp_symbol sym;
+            size_t char_len = std::min(text.size(), utf8_len(text.data()[0]));
+            sym.text = std::string_view(text.data(), char_len);
+            sym.prev = index - 1;
+            text.remove_prefix(char_len);
+            sym.next = text.empty() ? -1 : index + 1;
+            index++;
+            symbols_.emplace_back(std::move(sym));
+        }
+
+        // seed the work queue with all possible 2-character tokens.
+        for (size_t i = 1; i < symbols_.size(); ++i) {
+            try_add_bigram(i - 1, i);
+        }
+
+        // keep substituting the highest-scoring pairs for as long as we can.
+        while (!work_queue_.empty()) {
+            auto bigram = work_queue_.top();
+            work_queue_.pop();
+
+            auto & left_sym = symbols_[bigram.left];
+            auto & right_sym = symbols_[bigram.right];
+
+            // if one of the symbols already got merged, skip it.
+            if (left_sym.text.empty() || right_sym.text.empty() ||
+                left_sym.text.size() + right_sym.text.size() != bigram.size) {
+                continue;
+            }
+
+            // merge the right sym into the left one
+            left_sym.text = std::string_view(left_sym.text.data(), left_sym.text.size() + right_sym.text.size());
+            right_sym.text = std::string_view("");
+
+            // remove the right sym from the chain
+            left_sym.next = right_sym.next;
+            if (right_sym.next >= 0) {
+                symbols_[right_sym.next].prev = bigram.left;
+            }
+
+            // find more substitutions
+            try_add_bigram(left_sym.prev, bigram.left);
+            try_add_bigram(bigram.left, left_sym.next);
+        }
+
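+        // walk the remaining chain and map each symbol to a vocab token id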
+        for (int i = 0; i != -1; i = symbols_[i].next) {
+            auto & symbol = symbols_[i];
+            auto token = vocab_.token_to_id.find(std::string(symbol.text));
+
+            if (token == vocab_.token_to_id.end()) {
+                // output any symbols that did not form tokens as bytes.
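+                // note: this assumes byte tokens are stored in the vocab right
+                // after the three reserved ids, i.e. at id = byte value + 3.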
+                for (size_t j = 0; j < symbol.text.size(); ++j) {
+                    gpt_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
+                    output.push_back(token_id);
                 }
+            } else {
+                output.push_back((*token).second);
             }
         }
     }

-    // Backward pass
-    int i = len;
-    while (i > 0) {
-        gpt_vocab::id token_id = prev[i];
-        if (token_id == 0) {
-            // TODO: Return error or something more meaningful
-            printf("failed to tokenize string!\n");
-            break;
+private:
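+    // add the pair (left, right) to the work queue if the concatenation of the
+    // two symbols is present in the vocabulary as a scored token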
+    void try_add_bigram(int left, int right) {
+        if (left == -1 || right == -1) {
+            return;
+        }
+
+        std::string_view text(symbols_[left].text.data(), symbols_[left].text.size() + symbols_[right].text.size());
+        auto token = vocab_.token_to_id.find(std::string(text));
+
+        if (token == vocab_.token_to_id.end()) {
+            return;
+        }
-        res.push_back(token_id);
-        auto token = (*vocab.id_to_token.find(token_id)).second;
-        i -= token.length();
+
+        auto score = vocab_.score.find((*token).second);
+
+        if (score == vocab_.score.end()) {
+            return;
+        }
+
+        llama_sp_bigram bigram;
+        bigram.left = left;
+        bigram.right = right;
+        bigram.score = (*score).second;
+        bigram.size = text.size();
+        work_queue_.push(bigram);
     }

-    if (bos) {
-        res.push_back(1); // TODO: replace with vocab.bos
+    const gpt_vocab & vocab_;
+    std::vector<llama_sp_symbol> symbols_;
+    llama_sp_bigram::queue work_queue_;
+};
+
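+// Tokenize text into a sequence of vocab ids, optionally prepending the
+// beginning-of-sentence token.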
+std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, std::string_view text, bool bos) {
+    llama_tokenizer tokenizer(vocab);
+    std::vector<gpt_vocab::id> output;
+
+    if (text.empty()) {
+        return output;
     }

-    // Pieces are in reverse order so correct that
-    std::reverse(res.begin(), res.end());
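+    // prepend the beginning-of-sentence token, id 1 in the llama vocab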
+    if (bos) {
+        output.push_back(1);
+    }

-    return res;
+    tokenizer.tokenize(text, output);
+    return output;
 }

 bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {