Georgi Gerganov 2 years ago
Parent commit: 319cdb3e1f
5 changed files with 32 additions and 32 deletions
  1. README.md (+1 -2)
  2. main.cpp (+1 -0)
  3. models/.gitignore (+0 -0)
  4. utils.cpp (+27 -27)
  5. utils.h (+3 -3)

+ 1 - 2
README.md

@@ -114,6 +114,5 @@ python3 convert-pth-to-ggml.py models/7B/ 1
   In general, it seems to work, but I think it fails for unicode character support. Hopefully, someone can help with that
 - I don't know yet how much the quantization affects the quality of the generated text
 - Probably the token sampling can be improved
-- No Windows support
 - x86 quantization support [not yet ready](https://github.com/ggerganov/ggml/pull/27). Basically, you want to run this on Apple Silicon
-  
+

+ 1 - 0
main.cpp

@@ -728,6 +728,7 @@ int main(int argc, char ** argv) {
 
         // end of text token
         if (embd.back() == 2) {
+            printf(" [end of text]\n");
             break;
         }
     }
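(Note: in LLaMA's SentencePiece vocabulary, token id 2 is the end-of-sequence marker </s> and id 1 is beginning-of-sequence <s>; that is why 2 is hard-coded here and 1 is pushed as BOS in the utils.cpp change below.)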

+ 0 - 0
models/.gitignore


+ 27 - 27
utils.cpp

@@ -231,39 +231,39 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
 }
 
 std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) {
-    auto res = gpt_tokenize(vocab, text);
+    //auto res = gpt_tokenize(vocab, text);
+
+    //if (bos) {
+    //    res.insert(res.begin(), 1); // TODO: replace with vocab.bos
+    //}
+
+    std::vector<gpt_vocab::id> res;
 
     if (bos) {
-        res.insert(res.begin(), 1); // TODO: replace with vocab.bos
+        res.push_back(1); // TODO: replace with vocab.bos
     }
 
-    //std::vector<gpt_vocab::id> res;
+    // find the longest token that matches the text
+    int pos = 0;
+    while (true) {
+        int l = 0;
+        int t = 0;
+        for (const auto & kv : vocab.id_to_token) {
+            if (kv.second.size() < l) continue;
+            if (kv.second.size() > text.size() - pos) continue;
+            if (text.substr(pos, kv.second.size()) == kv.second) {
+                l = kv.second.size();
+                t = kv.first;
+            }
+        }
 
-    //if (bos) {
-    //    res.push_back(1); // TODO: replace with vocab.bos
-    //}
+        if (l == 0 && t != 13) {
+            break;
+        }
 
-    // find the longest token that matches the text
-    //int pos = 0;
-    //while (true) {
-    //    int l = 0;
-    //    int t = 0;
-    //    for (const auto & kv : vocab.id_to_token) {
-    //        if (kv.second.size() < l) continue;
-    //        if (kv.second.size() > text.size() - pos) continue;
-    //        if (text.substr(pos, kv.second.size()) == kv.second) {
-    //            l = kv.second.size();
-    //            t = kv.first;
-    //        }
-    //    }
-
-    //    if (l == 0 && t != 13) {
-    //        break;
-    //    }
-
-    //    res.push_back(t);
-    //    pos += l;
-    //}
+        res.push_back(t);
+        pos += l;
+    }
 
     return res;
 }
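
The new code path drops the GPT-2-style gpt_tokenize in favor of a greedy longest-match scan: at each position it walks all of vocab.id_to_token and consumes the longest entry that matches. Below is a minimal, self-contained sketch of that behavior using a hypothetical toy vocabulary (the real mapping is loaded from the converted model file, where id 1 is BOS):

// Sketch of the greedy longest-match tokenizer added above.
// The vocab here is a made-up stand-in for vocab.id_to_token.
#include <cstdio>
#include <map>
#include <string>
#include <vector>

int main() {
    const std::map<int, std::string> id_to_token = {
        {4, "he"}, {5, "hello"}, {6, " world"}, {7, "l"},
    };

    const std::string text = "hello world";
    std::vector<int> res;
    res.push_back(1); // BOS, as in the patch

    size_t pos = 0;
    while (pos < text.size()) {
        // find the longest vocab entry matching at pos
        size_t l = 0;
        int    t = 0;
        for (const auto & kv : id_to_token) {
            if (kv.second.size() < l) continue;
            if (kv.second.size() > text.size() - pos) continue;
            if (text.compare(pos, kv.second.size(), kv.second) == 0) {
                l = kv.second.size();
                t = kv.first;
            }
        }
        if (l == 0) break; // nothing matches: give up on the remainder
        res.push_back(t);
        pos += l;
    }

    for (int id : res) printf("%d ", id); // prints: 1 5 6
    printf("\n");
}

Note the cost: every position rescans the full vocabulary, so tokenization is O(text length × vocab size). The patch's extra "t != 13" clause can only differ from the plain break above if a zero-length vocab entry set t, so for any normal vocabulary the two exits behave the same.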

+ 3 - 3
utils.h

@@ -15,12 +15,12 @@
 struct gpt_params {
     int32_t seed      = -1; // RNG seed
     int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
-    int32_t n_predict = 200; // new tokens to predict
+    int32_t n_predict = 128; // new tokens to predict
 
     // sampling parameters
-    int32_t top_k = 100;
+    int32_t top_k = 40;
     float   top_p = 0.95f;
-    float   temp  = 0.8f;
+    float   temp  = 0.80f;
 
     int32_t n_batch = 8; // batch size for prompt processing
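
The retuned defaults (top_k 100 → 40, n_predict 200 → 128) make generation shorter and less diffuse. Below is a hedged sketch of how such parameters are conventionally applied — top-k truncation followed by a temperature-scaled softmax. It illustrates the standard technique, not necessarily the exact implementation in utils.cpp, and the name sample_top_k is hypothetical:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <random>
#include <vector>

// Standard top-k + temperature sampling (top_p nucleus filtering, also
// configured above, would additionally trim low-probability tails).
int sample_top_k(std::vector<std::pair<float, int>> cand, int top_k, float temp, std::mt19937 & rng) {
    // keep only the top_k highest-logit candidates
    top_k = std::min(top_k, (int) cand.size());
    std::partial_sort(cand.begin(), cand.begin() + top_k, cand.end(),
                      [](const auto & a, const auto & b) { return a.first > b.first; });
    cand.resize(top_k);

    // temperature-scaled softmax over the survivors (max-shifted for stability)
    std::vector<double> probs(top_k);
    double sum = 0.0;
    for (int i = 0; i < top_k; ++i) {
        probs[i] = std::exp((cand[i].first - cand[0].first) / temp);
        sum += probs[i];
    }
    for (auto & p : probs) p /= sum;

    std::discrete_distribution<int> dist(probs.begin(), probs.end());
    return cand[dist(rng)].second; // token id of the sampled candidate
}

int main() {
    std::mt19937 rng(1234); // seed = -1 in gpt_params means "choose at random"
    std::vector<std::pair<float, int>> logits = {{2.0f, 0}, {1.0f, 1}, {0.5f, 2}, {-1.0f, 3}};
    printf("sampled token id: %d\n", sample_top_k(logits, /*top_k=*/40, /*temp=*/0.80f, rng));
}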