2 年前 · a316a425d0
--- a/.gitignore
+++ b/.gitignore
@@ -19,6 +19,7 @@ models/*
 
				 /main
			
 
				 /quantize
			
 
				 /result
			
 
				+/perplexity
			
 
				 
			
 
				 arm_neon.h
			
 
				 compile_commands.json
			
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -211,17 +211,6 @@ endif()
 
				 # Build libraries
			
 
				 #
			
 
				 
			
 
				-add_library(utils OBJECT
			
 
				-            utils.cpp
			
 
				-            utils.h)
			
 
				-
			
 
				-target_include_directories(utils PUBLIC .)
			
 
				-target_compile_features(utils PUBLIC cxx_std_11) # don't bump
			
 
				-target_link_libraries(utils PRIVATE ${LLAMA_EXTRA_LIBS})
			
 
				-if (BUILD_SHARED_LIBS)
			
 
				-    set_target_properties(utils PROPERTIES POSITION_INDEPENDENT_CODE ON)
			
 
				-endif()
			
 
				-
			
 
				 add_library(ggml OBJECT
			
 
				             ggml.c
			
 
				             ggml.h)
			
@@ -239,22 +228,12 @@ add_library(llama
 
				 
			
 
				 target_include_directories(llama PUBLIC .)
			
 
				 target_compile_features(llama PUBLIC cxx_std_11) # don't bump
			
 
				-target_link_libraries(llama PRIVATE utils ggml ${LLAMA_EXTRA_LIBS})
			
 
				+target_link_libraries(llama PRIVATE ggml ${LLAMA_EXTRA_LIBS})
			
 
				 if (BUILD_SHARED_LIBS)
			
 
				     set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
			
 
				     target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD)
			
 
				 endif()
			
 
				 
			
 
				-#
			
 
				-# Executables
			
 
				-#
			
 
				-
			
 
				-add_executable(main main.cpp)
			
 
				-target_link_libraries(main PRIVATE llama ggml utils)
			
 
				-
			
 
				-add_executable(quantize quantize.cpp)
			
 
				-target_link_libraries(quantize PRIVATE llama ggml utils)
			
 
				-
			
 
				 #
			
 
				 # programs, examples and tests
			
 
				 #
			
@@ -264,6 +243,6 @@ if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
 
				     add_subdirectory(tests)
			
 
				 endif ()
			
 
				 
			
 
				-#if (LLAMA_BUILD_EXAMPLES)
			
 
				-#    add_subdirectory(examples)
			
 
				-#endif()
			
 
				+if (LLAMA_BUILD_EXAMPLES)
			
 
				+    add_subdirectory(examples)
			
 
				+endif()
			
--- a/Makefile
+++ b/Makefile
@@ -212,7 +212,7 @@ $(info I CC:       $(CCV))
 
				 $(info I CXX:      $(CXXV))
			
 
				 $(info )
			
 
				 
			
 
				-default: main quantize
			
 
				+default: main quantize perplexity
			
 
				 
			
 
				 #
			
 
				 # Build library
			
@@ -224,20 +224,23 @@ ggml.o: ggml.c ggml.h
 
				 llama.o: llama.cpp llama.h
			
 
				 	$(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o
			
 
				 
			
 
				-utils.o: utils.cpp utils.h
			
 
				-	$(CXX) $(CXXFLAGS) -c utils.cpp -o utils.o
			
 
				+common.o: examples/common.cpp examples/common.h
			
 
				+	$(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o
			
 
				 
			
 
				 clean:
			
 
				-	rm -f *.o main quantize
			
 
				+	rm -vf *.o main quantize perplexity
			
 
				 
			
 
				-main: main.cpp ggml.o llama.o utils.o
			
 
				-	$(CXX) $(CXXFLAGS) main.cpp ggml.o llama.o utils.o -o main $(LDFLAGS)
			
 
				+main: examples/main/main.cpp ggml.o llama.o common.o
			
 
				+	$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)
			
 
				 	@echo
			
 
				 	@echo '====  Run ./main -h for help.  ===='
			
 
				 	@echo
			
 
				 
			
 
				-quantize: quantize.cpp ggml.o llama.o utils.o
			
 
				-	$(CXX) $(CXXFLAGS) quantize.cpp ggml.o llama.o utils.o -o quantize $(LDFLAGS)
			
 
				+quantize: examples/quantize/quantize.cpp ggml.o llama.o
			
 
				+	$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS)
			
 
				+
			
 
				+perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
			
 
				+	$(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS)
			
 
				 
			
 
				 #
			
 
				 # Tests
			
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -0,0 +1,36 @@
 
				+# Build rules for the example programs.
				+#
				+# Defines the `common` object library (helper code from common.h / common.cpp)
				+# and then adds one sub-directory per example executable.
				+
				+# dependencies
				+
				+find_package(Threads REQUIRED)
				+
				+# third-party
				+
				+# ...
				+
				+# common
				+
				+set(TARGET common)
				+
				+# OBJECT library: the sources are compiled once and the resulting object
				+# files are pulled into every target that links `common`.
				+add_library(${TARGET} OBJECT
				+    common.h
				+    common.cpp
				+    )
				+
				+if (BUILD_SHARED_LIBS)
				+    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
				+endif()
				+
				+# PUBLIC so that targets linking `common` can also `#include "common.h"`
				+target_include_directories(${TARGET} PUBLIC .)
				+target_compile_features(${TARGET} PUBLIC cxx_std_11)
				+target_link_libraries(${TARGET} PRIVATE llama)
				+
				+# examples
				+
				+# NOTE(review): directory-scoped include path; it looks redundant for targets
				+# that link `common`, but `quantize` does not link it - confirm before removing.
				+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
				+
				+# No example executables are added when targeting Emscripten.
				+if (NOT EMSCRIPTEN)
				+    add_subdirectory(main)
				+    add_subdirectory(quantize)
				+    add_subdirectory(perplexity)
				+    add_subdirectory(embedding)
				+endif()
			
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -1,6 +1,6 @@
 
				-#include "ggml.h"
			
 
				+#include "common.h"
			
 
				 
			
 
				-#include "utils.h"
			
 
				+#include "ggml.h"
			
 
				 
			
 
				 #include <cassert>
			
 
				 #include <cstring>
			
--- a/examples/common.h
+++ b/examples/common.h
--- a/examples/embedding/CMakeLists.txt
+++ b/examples/embedding/CMakeLists.txt
@@ -0,0 +1,4 @@
 
				+# Builds the `embedding` example executable from embedding.cpp and links it
				+# against the example helpers (`common`), the llama library and the thread
				+# library located by find_package(Threads) in the parent directory.
				+set(TARGET embedding)
				+add_executable(${TARGET} embedding.cpp)
				+target_link_libraries(${TARGET} PRIVATE common llama Threads::Threads)
				+target_compile_features(${TARGET} PRIVATE cxx_std_11)
			
--- a/examples/embedding/README.md
+++ b/examples/embedding/README.md
@@ -0,0 +1,3 @@
 
				+# embedding

			
 
				+

			
 
				+TODO

			
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -0,0 +1,106 @@
 
				+#include "common.h"
			
 
				+#include "llama.h"
			
 
				+
			
 
				+#include <cassert>
			
 
				+#include <cinttypes>
			
 
				+#include <cmath>
			
 
				+#include <cstdio>
			
 
				+#include <cstring>
			
 
				+#include <fstream>
			
 
				+#include <string>
			
 
				+#include <vector>
			
 
				+
			
 
				+// Entry point of the embedding example.
				+//
				+// Parses the common command-line options, loads the model with embedding mode
				+// forced on, tokenizes the prompt and evaluates it once.
				+//
				+// Returns 0 on success and 1 when argument parsing, model loading or
				+// evaluation fails.
				+int main(int argc, char ** argv) {
				+    gpt_params params;
				+    params.model = "models/llama-7B/ggml-model.bin";
				+
				+    if (gpt_params_parse(argc, argv, params) == false) {
				+        return 1;
				+    }
				+
				+    // this tool always runs in embedding mode, whatever was passed on the command line
				+    params.embedding = true;
				+
				+    if (params.n_ctx > 2048) {
				+        // note the trailing space: the two literals are concatenated into one message
				+        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified); "
				+                "expect poor results\n", __func__, params.n_ctx);
				+    }
				+
				+    // a non-positive seed means "pick one from the current time"
				+    if (params.seed <= 0) {
				+        params.seed = time(NULL);
				+    }
				+
				+    fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
				+
				+    std::mt19937 rng(params.seed);
				+    if (params.random_prompt) {
				+        params.prompt = gpt_random_prompt(rng);
				+    }
				+
				+    llama_context * ctx;
				+
				+    // load the model
				+    {
				+        auto lparams = llama_context_default_params();
				+
				+        lparams.n_ctx      = params.n_ctx;
				+        lparams.n_parts    = params.n_parts;
				+        lparams.seed       = params.seed;
				+        lparams.f16_kv     = params.memory_f16;
				+        lparams.logits_all = params.perplexity;
				+        lparams.use_mlock  = params.use_mlock;
				+        lparams.embedding  = params.embedding;
				+
				+        ctx = llama_init_from_file(params.model.c_str(), lparams);
				+
				+        if (ctx == NULL) {
				+            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
				+            return 1;
				+        }
				+    }
				+
				+    // print system information
				+    {
				+        fprintf(stderr, "\n");
				+        fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
				+                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
				+    }
				+
				+    int n_past = 0;
				+
				+    // Add a space in front of the first character to match OG llama tokenizer behavior
				+    params.prompt.insert(0, 1, ' ');
				+
				+    // tokenize the prompt
				+    auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
				+
				+    if (params.verbose_prompt) {
				+        fprintf(stderr, "\n");
				+        fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
				+        fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
				+        for (int i = 0; i < (int) embd_inp.size(); i++) {
				+            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
				+        }
				+        fprintf(stderr, "\n");
				+    }
				+
				+    // always true here because the flag is forced on above
				+    if (params.embedding){
				+        if (embd_inp.size() > 0) {
				+            if (llama_eval(ctx, embd_inp.data(), embd_inp.size(), n_past, params.n_threads)) {
				+                fprintf(stderr, "%s : failed to eval\n", __func__);
				+                // release the context on the error path as well
				+                llama_free(ctx);
				+                return 1;
				+            }
				+        }
				+
				+        const auto embeddings = llama_get_embeddings(ctx);
				+
				+        // TODO: print / use the embeddings
				+        (void) embeddings; // silences the unused-variable warning until the TODO is done
				+    }
				+
				+    llama_print_timings(ctx);
				+    llama_free(ctx);
				+
				+    return 0;
				+}
			
--- a/examples/main/CMakeLists.txt
+++ b/examples/main/CMakeLists.txt
@@ -0,0 +1,4 @@
 
				+# Builds the `main` example executable from main.cpp and links it against the
				+# example helpers (`common`), the llama library and the thread library located
				+# by find_package(Threads) in the parent directory.
				+set(TARGET main)
				+add_executable(${TARGET} main.cpp)
				+target_link_libraries(${TARGET} PRIVATE common llama Threads::Threads)
				+target_compile_features(${TARGET} PRIVATE cxx_std_11)
			
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -0,0 +1,3 @@
 
				+# main

			
 
				+

			
 
				+TODO

			
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -1,5 +1,4 @@
 
				-#include "utils.h"
			
 
				-#include "ggml.h"
			
 
				+#include "common.h"
			
 
				 #include "llama.h"
			
 
				 
			
 
				 #include <cassert>
			
@@ -65,79 +64,6 @@ void set_console_state(console_state new_st)
 
				     }
			
 
				 }
			
 
				 
			
 
				-std::vector<double> softmax(const std::vector<float>& logits) {
			
 
				-    std::vector<double> probs(logits.size());
			
 
				-    float max_logit = logits[0];
			
 
				-    for (float v : logits) max_logit = std::max(max_logit, v);
			
 
				-    double sum_exp = 0.0;
			
 
				-    for (size_t i = 0; i < logits.size(); i++) {
			
 
				-        // Subtract the maximum logit value from the current logit value for numerical stability
			
 
				-        float logit = logits[i] - max_logit;
			
 
				-        double exp_logit = std::exp(logit);
			
 
				-        sum_exp += exp_logit;
			
 
				-        probs[i] = exp_logit;
			
 
				-    }
			
 
				-    for (size_t i = 0; i < probs.size(); i++) probs[i] /= sum_exp;
			
 
				-    return probs;
			
 
				-}
			
 
				-
			
 
				-void perplexity(llama_context * ctx, const gpt_params & params) {
			
 
				-    // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
			
 
				-    // Run `./main --perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
			
 
				-    // Output: `perplexity: 13.5106 [114/114]`
			
 
				-    auto tokens = ::llama_tokenize(ctx, params.prompt, true);
			
 
				-
			
 
				-    int count = 0;
			
 
				-    double nll = 0.0;
			
 
				-    int seq_count = tokens.size() / params.n_ctx;
			
 
				-
			
 
				-    fprintf(stderr, "%s : calculating perplexity over %d chunks\n", __func__, seq_count);
			
 
				-
			
 
				-    for (int i = 0; i < seq_count; ++i) {
			
 
				-        int start = i * params.n_ctx;
			
 
				-        int end = start + params.n_ctx - 1;
			
 
				-        std::vector<llama_token> embd(tokens.begin() + start, tokens.begin() + end);
			
 
				-        auto start_t = std::chrono::high_resolution_clock::now();
			
 
				-        if (llama_eval(ctx, embd.data(), embd.size(), 0, params.n_threads)) {
			
 
				-            fprintf(stderr, "%s : failed to eval\n", __func__);
			
 
				-            return;
			
 
				-        }
			
 
				-        auto end_t = std::chrono::high_resolution_clock::now();
			
 
				-        if (i == 0) {
			
 
				-            double seconds = std::chrono::duration<double>(end_t - start_t).count();
			
 
				-            printf("%.2f seconds per pass - ETA %.2f hours\n", seconds, (seconds * seq_count) / (60.0*60.0));
			
 
				-        }
			
 
				-        // We get the logits for all the tokens in the context window (params.n_ctx)
			
 
				-        // from llama_eval above.  Now, based on https://huggingface.co/docs/transformers/perplexity,
			
 
				-        // calculate the perplexity over the last half the window (so the model always has
			
 
				-        // some context to predict the token).
			
 
				-        //
			
 
				-        // We rely on the fact that attention in the forward pass only looks at previous
			
 
				-        // tokens here, so the logits returned for each token are an accurate representation
			
 
				-        // of what the model would have predicted at that point.
			
 
				-        //
			
 
				-        // Example, we have a context window of 512, we will compute perplexity for each of the
			
 
				-        // last 256 tokens.  Then, we split the input up into context window size chunks to
			
 
				-        // process the entire prompt.
			
 
				-
			
 
				-        auto logits = llama_get_logits(ctx);
			
 
				-        for (int j = params.n_ctx / 2; j < params.n_ctx - 1; ++j) {
			
 
				-            // Calculate probability of next token, given the previous ones.
			
 
				-            int n_vocab = llama_n_vocab(ctx);
			
 
				-            std::vector<float> tok_logits(
			
 
				-                logits + j * n_vocab,
			
 
				-                logits + (j + 1) * n_vocab);
			
 
				-            double prob = softmax(tok_logits)[tokens[start + j + 1]];
			
 
				-            nll += -std::log(prob);
			
 
				-            ++count;
			
 
				-        }
			
 
				-        // perplexity is e^(average negative log-likelihood)
			
 
				-        printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
			
 
				-        fflush(stdout);
			
 
				-    }
			
 
				-    printf("\n");
			
 
				-}
			
 
				-
			
 
				 static bool is_interacting = false;
			
 
				 
			
 
				 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
			
@@ -155,9 +81,6 @@ void sigint_handler(int signo) {
 
				 #endif
			
 
				 
			
 
				 int main(int argc, char ** argv) {
			
 
				-    // has to be called once at the start of the program to init ggml stuff
			
 
				-    ggml_time_init();
			
 
				-
			
 
				     gpt_params params;
			
 
				     params.model = "models/llama-7B/ggml-model.bin";
			
 
				 
			
@@ -165,6 +88,14 @@ int main(int argc, char ** argv) {
 
				         return 1;
			
 
				     }
			
 
				 
			
 
				+    if (params.perplexity) {
			
 
				+        printf("\n************\n");
			
 
				+        printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
			
 
				+        printf("************\n\n");
			
 
				+
			
 
				+        return 0;
			
 
				+    }
			
 
				+
			
 
				     if (params.n_ctx > 2048) {
			
 
				         fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
			
 
				                 "expect poor results\n", __func__, params.n_ctx);
			
@@ -198,9 +129,7 @@ int main(int argc, char ** argv) {
 
				         lparams.n_parts    = params.n_parts;
			
 
				         lparams.seed       = params.seed;
			
 
				         lparams.f16_kv     = params.memory_f16;
			
 
				-        lparams.logits_all = params.perplexity;
			
 
				         lparams.use_mlock  = params.use_mlock;
			
 
				-        lparams.embedding  = params.embedding;
			
 
				 
			
 
				         ctx = llama_init_from_file(params.model.c_str(), lparams);
			
 
				 
			
@@ -236,11 +165,6 @@ int main(int argc, char ** argv) {
 
				         return 0;
			
 
				     }
			
 
				 
			
 
				-    if (params.perplexity) {
			
 
				-        perplexity(ctx, params);
			
 
				-        exit(0);
			
 
				-    }
			
 
				-
			
 
				     int n_past = 0;
			
 
				 
			
 
				     // Add a space in front of the first character to match OG llama tokenizer behavior
			
@@ -346,27 +270,6 @@ int main(int argc, char ** argv) {
 
				     // the first thing we will do is to output the prompt, so set color accordingly
			
 
				     set_console_state(CONSOLE_STATE_PROMPT);
			
 
				 
			
 
				-    if (params.embedding){
			
 
				-        embd = embd_inp;
			
 
				-
			
 
				-        if (embd.size() > 0) {
			
 
				-            if (llama_eval(ctx, embd.data(), embd.size(), n_past, params.n_threads)) {
			
 
				-                fprintf(stderr, "%s : failed to eval\n", __func__);
			
 
				-                return 1;
			
 
				-            }
			
 
				-        }
			
 
				-
			
 
				-        const auto embeddings = llama_get_embeddings(ctx);
			
 
				-
			
 
				-        // TODO: print / use the embeddings
			
 
				-
			
 
				-        if (params.use_color) {
			
 
				-            printf(ANSI_COLOR_RESET);
			
 
				-        }
			
 
				-
			
 
				-        return 0;
			
 
				-    }
			
 
				-
			
 
				     while (remaining_tokens > 0 || params.interactive) {
			
 
				         // predict
			
 
				         if (embd.size() > 0) {
			
@@ -392,10 +295,6 @@ int main(int argc, char ** argv) {
 
				                 auto logits = llama_get_logits(ctx);
			
 
				 
			
 
				                 if (params.ignore_eos) {
			
 
				-                    // set the logit of the eos token to zero to avoid sampling it
			
 
				-                    //logits[logits.size() - n_vocab + EOS_TOKEN_ID] = 0;
			
 
				-                    // TODO: this does not work of params.logits_all == true
			
 
				-                    assert(params.perplexity == false);
			
 
				                     logits[llama_token_eos()] = 0;
			
 
				                 }
			
 
				 
			
--- a/examples/perplexity/CMakeLists.txt
+++ b/examples/perplexity/CMakeLists.txt
@@ -0,0 +1,4 @@
 
				+# Builds the `perplexity` example executable from perplexity.cpp and links it
				+# against the example helpers (`common`), the llama library and the thread
				+# library located by find_package(Threads) in the parent directory.
				+set(TARGET perplexity)
				+add_executable(${TARGET} perplexity.cpp)
				+target_link_libraries(${TARGET} PRIVATE common llama Threads::Threads)
				+target_compile_features(${TARGET} PRIVATE cxx_std_11)
			
--- a/examples/perplexity/README.md
+++ b/examples/perplexity/README.md
@@ -0,0 +1,3 @@
 
				+# perplexity

			
 
				+

			
 
				+TODO

			
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -0,0 +1,146 @@
 
				+#include "common.h"
			
 
				+#include "llama.h"
			
 
				+
			
 
				+#include <cassert>
			
 
				+#include <cinttypes>
			
 
				+#include <cmath>
			
 
				+#include <cstdio>
			
 
				+#include <cstring>
			
 
				+#include <string>
			
 
				+#include <vector>
			
 
				+
			
 
				+// Converts raw logits into a probability distribution (softmax).
				+//
				+// The largest logit is subtracted from every entry before exponentiating so
				+// that the exponentials cannot overflow.
				+//
				+// Returns a vector with one probability per input logit; an empty input
				+// yields an empty result.
				+std::vector<double> softmax(const std::vector<float>& logits) {
				+    std::vector<double> probs(logits.size());
				+    if (logits.empty()) {
				+        // nothing to normalize; also avoids reading logits[0] below
				+        return probs;
				+    }
				+    float max_logit = logits[0];
				+    for (float v : logits) max_logit = std::max(max_logit, v);
				+    double sum_exp = 0.0;
				+    for (size_t i = 0; i < logits.size(); i++) {
				+        // Subtract the maximum logit value from the current logit value for numerical stability
				+        float logit = logits[i] - max_logit;
				+        double exp_logit = std::exp(logit);
				+        sum_exp += exp_logit;
				+        probs[i] = exp_logit;
				+    }
				+    // sum_exp >= 1 because the maximum entry contributes exp(0)
				+    for (size_t i = 0; i < probs.size(); i++) probs[i] /= sum_exp;
				+    return probs;
				+}
			
 
				+
			
 
				+// Computes the perplexity of params.prompt and prints a running value.
				+//
				+// The prompt is tokenized and split into chunks of params.n_ctx tokens; a
				+// trailing partial chunk is ignored. After each chunk the perplexity
				+// accumulated so far is printed to stdout as `[chunk]value,`.
				+//
				+// Returns early, after printing a message to stderr, if the prompt is shorter
				+// than one chunk or if llama_eval fails.
				+void perplexity(llama_context * ctx, const gpt_params & params) {
				+    // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
				+    // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
				+    // Output: one `[chunk]perplexity,` entry per evaluated chunk
				+    auto tokens = ::llama_tokenize(ctx, params.prompt, true);
				+
				+    int count = 0;
				+    double nll = 0.0;
				+    int seq_count = tokens.size() / params.n_ctx;
				+
				+    fprintf(stderr, "%s : calculating perplexity over %d chunks\n", __func__, seq_count);
				+
				+    if (seq_count == 0) {
				+        // without this the function would finish silently with no result at all
				+        fprintf(stderr, "%s : prompt has only %zu tokens, at least %d are required\n",
				+                __func__, tokens.size(), params.n_ctx);
				+        return;
				+    }
				+
				+    // the vocabulary size does not change between tokens, so query it once
				+    const int n_vocab = llama_n_vocab(ctx);
				+
				+    for (int i = 0; i < seq_count; ++i) {
				+        int start = i * params.n_ctx;
				+        // the last token of the chunk is only used as a prediction target, never as input
				+        int end = start + params.n_ctx - 1;
				+        std::vector<llama_token> embd(tokens.begin() + start, tokens.begin() + end);
				+        auto start_t = std::chrono::high_resolution_clock::now();
				+        if (llama_eval(ctx, embd.data(), embd.size(), 0, params.n_threads)) {
				+            fprintf(stderr, "%s : failed to eval\n", __func__);
				+            return;
				+        }
				+        auto end_t = std::chrono::high_resolution_clock::now();
				+        if (i == 0) {
				+            // extrapolate the total run time from the first chunk
				+            double seconds = std::chrono::duration<double>(end_t - start_t).count();
				+            printf("%.2f seconds per pass - ETA %.2f hours\n", seconds, (seconds * seq_count) / (60.0*60.0));
				+        }
				+        // We get the logits for all the tokens in the context window (params.n_ctx)
				+        // from llama_eval above.  Now, based on https://huggingface.co/docs/transformers/perplexity,
				+        // calculate the perplexity over the last half of the window (so the model always has
				+        // some context to predict the token).
				+        //
				+        // We rely on the fact that attention in the forward pass only looks at previous
				+        // tokens here, so the logits returned for each token are an accurate representation
				+        // of what the model would have predicted at that point.
				+        //
				+        // Example, we have a context window of 512, we will compute perplexity for each of the
				+        // last 256 tokens.  Then, we split the input up into context window size chunks to
				+        // process the entire prompt.
				+
				+        auto logits = llama_get_logits(ctx);
				+        for (int j = params.n_ctx / 2; j < params.n_ctx - 1; ++j) {
				+            // Calculate probability of next token, given the previous ones.
				+            std::vector<float> tok_logits(
				+                logits + j * n_vocab,
				+                logits + (j + 1) * n_vocab);
				+            double prob = softmax(tok_logits)[tokens[start + j + 1]];
				+            nll += -std::log(prob);
				+            ++count;
				+        }
				+        // perplexity is e^(average negative log-likelihood)
				+        printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
				+        fflush(stdout);
				+    }
				+    printf("\n");
				+}
			
 
				+
			
 
				+// Entry point of the perplexity tool.
				+//
				+// Parses the common command-line options, loads the model with perplexity
				+// mode forced on, runs perplexity() over the prompt and prints timings.
				+//
				+// Returns 0 on success and 1 when argument parsing or model loading fails.
				+int main(int argc, char ** argv) {
				+    gpt_params params;
				+    params.model = "models/llama-7B/ggml-model.bin";
				+
				+    if (gpt_params_parse(argc, argv, params) == false) {
				+        return 1;
				+    }
				+
				+    // this tool always runs in perplexity mode, whatever was passed on the command line
				+    params.perplexity = true;
				+
				+    if (params.n_ctx > 2048) {
				+        // note the trailing space: the two literals are concatenated into one message
				+        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified); "
				+                "expect poor results\n", __func__, params.n_ctx);
				+    }
				+
				+    // a non-positive seed means "pick one from the current time"
				+    if (params.seed <= 0) {
				+        params.seed = time(NULL);
				+    }
				+
				+    fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
				+
				+    std::mt19937 rng(params.seed);
				+    if (params.random_prompt) {
				+        params.prompt = gpt_random_prompt(rng);
				+    }
				+
				+    llama_context * ctx;
				+
				+    // load the model
				+    {
				+        auto lparams = llama_context_default_params();
				+
				+        lparams.n_ctx      = params.n_ctx;
				+        lparams.n_parts    = params.n_parts;
				+        lparams.seed       = params.seed;
				+        lparams.f16_kv     = params.memory_f16;
				+        // perplexity() indexes the logits by token position; presumably this flag
				+        // is what makes llama_eval keep them for every token - confirm in llama.h
				+        lparams.logits_all = params.perplexity;
				+        lparams.use_mlock  = params.use_mlock;
				+        lparams.embedding  = params.embedding;
				+
				+        ctx = llama_init_from_file(params.model.c_str(), lparams);
				+
				+        if (ctx == NULL) {
				+            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
				+            return 1;
				+        }
				+    }
				+
				+    // print system information
				+    {
				+        fprintf(stderr, "\n");
				+        fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
				+                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
				+    }
				+
				+    perplexity(ctx, params);
				+
				+    llama_print_timings(ctx);
				+    llama_free(ctx);
				+
				+    return 0;
				+}
			
--- a/examples/quantize/CMakeLists.txt
+++ b/examples/quantize/CMakeLists.txt
@@ -0,0 +1,4 @@
 
				+# Builds the `quantize` example executable from quantize.cpp and links it
				+# against the llama library and the thread library located by
				+# find_package(Threads) in the parent directory. Unlike the other examples it
				+# does not link the `common` helpers.
				+set(TARGET quantize)
				+add_executable(${TARGET} quantize.cpp)
				+target_link_libraries(${TARGET} PRIVATE llama Threads::Threads)
				+target_compile_features(${TARGET} PRIVATE cxx_std_11)
			
--- a/examples/quantize/README.md
+++ b/examples/quantize/README.md
@@ -0,0 +1,3 @@
 
				+# quantize
			
 
				+
			
 
				+TODO
			
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
--- a/ggml.c
+++ b/ggml.c
@@ -5741,8 +5741,8 @@ static bool ggml_compute_forward_mul_mat_use_blas(
 
				         const struct ggml_tensor * src0,
			
 
				         const struct ggml_tensor * src1,
			
 
				               struct ggml_tensor * dst) {
			
 
				-    const int ne00 = src0->ne[0];
			
 
				-    const int ne01 = src0->ne[1];
			
 
				+    //const int ne00 = src0->ne[0];
			
 
				+    //const int ne01 = src0->ne[1];
			
 
				 
			
 
				     const int ne10 = src1->ne[0];
			
 
				 
			
@@ -5776,16 +5776,16 @@ static void ggml_compute_forward_mul_mat_f32(
 
				 
			
 
				     const int ne10 = src1->ne[0];
			
 
				     const int ne11 = src1->ne[1];
			
 
				-    const int ne12 = src1->ne[2];
			
 
				-    const int ne13 = src1->ne[3];
			
 
				+    //const int ne12 = src1->ne[2];
			
 
				+    //const int ne13 = src1->ne[3];
			
 
				 
			
 
				-    const int ne0  = dst->ne[0];
			
 
				-    const int ne1  = dst->ne[1];
			
 
				-    const int ne2  = dst->ne[2];
			
 
				-    const int ne3  = dst->ne[3];
			
 
				-    const int ne   = ne0*ne1*ne2*ne3;
			
 
				+    //const int ne0  = dst->ne[0];
			
 
				+    //const int ne1  = dst->ne[1];
			
 
				+    //const int ne2  = dst->ne[2];
			
 
				+    //const int ne3  = dst->ne[3];
			
 
				+    //const int ne   = ne0*ne1*ne2*ne3;
			
 
				 
			
 
				-    const int nb00 = src0->nb[0];
			
 
				+    //const int nb00 = src0->nb[0];
			
 
				     const int nb01 = src0->nb[1];
			
 
				     const int nb02 = src0->nb[2];
			
 
				     const int nb03 = src0->nb[3];
			
@@ -5947,7 +5947,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
 
				     const int ne1  = dst->ne[1];
			
 
				     const int ne2  = dst->ne[2];
			
 
				     const int ne3  = dst->ne[3];
			
 
				-    const int ne   = ne0*ne1*ne2*ne3;
			
 
				+    //const int ne   = ne0*ne1*ne2*ne3;
			
 
				 
			
 
				     const int nb00 = src0->nb[0];
			
 
				     const int nb01 = src0->nb[1];
			
@@ -6137,7 +6137,7 @@ static void ggml_compute_forward_mul_mat_q4_0_f32(
 
				     const int ne1  = dst->ne[1];
			
 
				     const int ne2  = dst->ne[2];
			
 
				     const int ne3  = dst->ne[3];
			
 
				-    const int ne   = ne0*ne1*ne2*ne3;
			
 
				+    //const int ne   = ne0*ne1*ne2*ne3;
			
 
				 
			
 
				     const int nb00 = src0->nb[0];
			
 
				     const int nb01 = src0->nb[1];
			
@@ -6322,7 +6322,7 @@ static void ggml_compute_forward_mul_mat_q4_1_f32(
 
				     const int ne1  = dst->ne[1];
			
 
				     const int ne2  = dst->ne[2];
			
 
				     const int ne3  = dst->ne[3];
			
 
				-    const int ne   = ne0*ne1*ne2*ne3;
			
 
				+    //const int ne   = ne0*ne1*ne2*ne3;
			
 
				 
			
 
				     const int nb00 = src0->nb[0];
			
 
				     const int nb01 = src0->nb[1];
			
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -1,7 +1,7 @@
 
				 function(llama_add_test source)
			
 
				     get_filename_component(TEST_TARGET ${source} NAME_WE)
			
 
				     add_executable(${TEST_TARGET} ${source})
			
 
				-    target_link_libraries(${TEST_TARGET} PRIVATE llama ggml utils)
			
 
				+    target_link_libraries(${TEST_TARGET} PRIVATE llama)
			
 
				     add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
			
 
				 endfunction()
			
 
				 
			
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0.cpp
@@ -1,9 +1,9 @@
 
				-#include "utils.h"
			
 
				 #include "llama.h"
			
 
				 
			
 
				 #include <cstdio>
			
 
				 #include <string>
			
 
				 #include <map>
			
 
				+#include <vector>
			
 
				 
			
 
				 static const std::map<std::string, std::vector<llama_token>> k_tests = {
			
 
				     { "Hello World",        { 1,  10994,   2787, }, },
			
@@ -48,7 +48,9 @@ int main(int argc, char **argv) {
 
				     }
			
 
				 
			
 
				     for (const auto & test_kv : k_tests) {
			
 
				-        const auto res = ::llama_tokenize(ctx, test_kv.first, true);
			
 
				+        std::vector<llama_token> res(test_kv.first.size());
			
 
				+        const int n = llama_tokenize(ctx, test_kv.first.c_str(), res.data(), res.size(), true);
			
 
				+        res.resize(n);
			
 
				 
			
 
				         bool correct = res.size() == test_kv.second.size();