Kaynağa Gözat

Reduce model loading time (#43)

* Use buffering

* Use vector

* Minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
uint256_t 2 yıl önce
ebeveyn
işleme
63fd76fbb0
1 değiştirilmiş dosya ile 4 ekleme ve 0 silme
  1. 4 0
      main.cpp

+ 4 - 0
main.cpp

@@ -87,7 +87,10 @@ struct llama_model {
 bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) {
     printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
 
+    std::vector<char> f_buf(1024*1024);
+
     auto fin = std::ifstream(fname, std::ios::binary);
+    fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
     if (!fin) {
         fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
         return false;
@@ -325,6 +328,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
         printf("%s: loading model part %d/%d from '%s'\n", __func__, i+1, n_parts, fname_part.c_str());
 
         fin = std::ifstream(fname_part, std::ios::binary);
+        fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
         fin.seekg(file_offset);
 
         // load weights