|
|
@@ -1,5 +1,4 @@
|
|
|
-#include "utils.h"
|
|
|
-#include "ggml.h"
|
|
|
+#include "common.h"
|
|
|
#include "llama.h"
|
|
|
|
|
|
#include <cassert>
|
|
|
@@ -65,79 +64,6 @@ void set_console_state(console_state new_st)
|
|
|
}
|
|
|
}
|
|
|
|
|
|
-std::vector<double> softmax(const std::vector<float>& logits) {
|
|
|
- std::vector<double> probs(logits.size());
|
|
|
- float max_logit = logits[0];
|
|
|
- for (float v : logits) max_logit = std::max(max_logit, v);
|
|
|
- double sum_exp = 0.0;
|
|
|
- for (size_t i = 0; i < logits.size(); i++) {
|
|
|
- // Subtract the maximum logit value from the current logit value for numerical stability
|
|
|
- float logit = logits[i] - max_logit;
|
|
|
- double exp_logit = std::exp(logit);
|
|
|
- sum_exp += exp_logit;
|
|
|
- probs[i] = exp_logit;
|
|
|
- }
|
|
|
- for (size_t i = 0; i < probs.size(); i++) probs[i] /= sum_exp;
|
|
|
- return probs;
|
|
|
-}
|
|
|
-
|
|
|
-void perplexity(llama_context * ctx, const gpt_params & params) {
|
|
|
- // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
|
|
|
- // Run `./main --perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
|
|
|
- // Output: `perplexity: 13.5106 [114/114]`
|
|
|
- auto tokens = ::llama_tokenize(ctx, params.prompt, true);
|
|
|
-
|
|
|
- int count = 0;
|
|
|
- double nll = 0.0;
|
|
|
- int seq_count = tokens.size() / params.n_ctx;
|
|
|
-
|
|
|
- fprintf(stderr, "%s : calculating perplexity over %d chunks\n", __func__, seq_count);
|
|
|
-
|
|
|
- for (int i = 0; i < seq_count; ++i) {
|
|
|
- int start = i * params.n_ctx;
|
|
|
- int end = start + params.n_ctx - 1;
|
|
|
- std::vector<llama_token> embd(tokens.begin() + start, tokens.begin() + end);
|
|
|
- auto start_t = std::chrono::high_resolution_clock::now();
|
|
|
- if (llama_eval(ctx, embd.data(), embd.size(), 0, params.n_threads)) {
|
|
|
- fprintf(stderr, "%s : failed to eval\n", __func__);
|
|
|
- return;
|
|
|
- }
|
|
|
- auto end_t = std::chrono::high_resolution_clock::now();
|
|
|
- if (i == 0) {
|
|
|
- double seconds = std::chrono::duration<double>(end_t - start_t).count();
|
|
|
- printf("%.2f seconds per pass - ETA %.2f hours\n", seconds, (seconds * seq_count) / (60.0*60.0));
|
|
|
- }
|
|
|
- // We get the logits for all the tokens in the context window (params.n_ctx)
|
|
|
- // from llama_eval above. Now, based on https://huggingface.co/docs/transformers/perplexity,
|
|
|
- // calculate the perplexity over the last half the window (so the model always has
|
|
|
- // some context to predict the token).
|
|
|
- //
|
|
|
- // We rely on the fact that attention in the forward pass only looks at previous
|
|
|
- // tokens here, so the logits returned for each token are an accurate representation
|
|
|
- // of what the model would have predicted at that point.
|
|
|
- //
|
|
|
- // Example, we have a context window of 512, we will compute perplexity for each of the
|
|
|
- // last 256 tokens. Then, we split the input up into context window size chunks to
|
|
|
- // process the entire prompt.
|
|
|
-
|
|
|
- auto logits = llama_get_logits(ctx);
|
|
|
- for (int j = params.n_ctx / 2; j < params.n_ctx - 1; ++j) {
|
|
|
- // Calculate probability of next token, given the previous ones.
|
|
|
- int n_vocab = llama_n_vocab(ctx);
|
|
|
- std::vector<float> tok_logits(
|
|
|
- logits + j * n_vocab,
|
|
|
- logits + (j + 1) * n_vocab);
|
|
|
- double prob = softmax(tok_logits)[tokens[start + j + 1]];
|
|
|
- nll += -std::log(prob);
|
|
|
- ++count;
|
|
|
- }
|
|
|
- // perplexity is e^(average negative log-likelihood)
|
|
|
- printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
|
|
|
- fflush(stdout);
|
|
|
- }
|
|
|
- printf("\n");
|
|
|
-}
|
|
|
-
|
|
|
static bool is_interacting = false;
|
|
|
|
|
|
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
|
|
@@ -155,9 +81,6 @@ void sigint_handler(int signo) {
|
|
|
#endif
|
|
|
|
|
|
int main(int argc, char ** argv) {
|
|
|
- // has to be called once at the start of the program to init ggml stuff
|
|
|
- ggml_time_init();
|
|
|
-
|
|
|
gpt_params params;
|
|
|
params.model = "models/llama-7B/ggml-model.bin";
|
|
|
|
|
|
@@ -165,6 +88,14 @@ int main(int argc, char ** argv) {
|
|
|
return 1;
|
|
|
}
|
|
|
|
|
|
+ if (params.perplexity) {
|
|
|
+ printf("\n************\n");
|
|
|
+ printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
|
|
|
+ printf("************\n\n");
|
|
|
+
|
|
|
+ return 0;
|
|
|
+ }
|
|
|
+
|
|
|
if (params.n_ctx > 2048) {
|
|
|
fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
|
|
|
"expect poor results\n", __func__, params.n_ctx);
|
|
|
@@ -198,9 +129,7 @@ int main(int argc, char ** argv) {
|
|
|
lparams.n_parts = params.n_parts;
|
|
|
lparams.seed = params.seed;
|
|
|
lparams.f16_kv = params.memory_f16;
|
|
|
- lparams.logits_all = params.perplexity;
|
|
|
lparams.use_mlock = params.use_mlock;
|
|
|
- lparams.embedding = params.embedding;
|
|
|
|
|
|
ctx = llama_init_from_file(params.model.c_str(), lparams);
|
|
|
|
|
|
@@ -236,11 +165,6 @@ int main(int argc, char ** argv) {
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
|
- if (params.perplexity) {
|
|
|
- perplexity(ctx, params);
|
|
|
- exit(0);
|
|
|
- }
|
|
|
-
|
|
|
int n_past = 0;
|
|
|
|
|
|
// Add a space in front of the first character to match OG llama tokenizer behavior
|
|
|
@@ -346,27 +270,6 @@ int main(int argc, char ** argv) {
|
|
|
// the first thing we will do is to output the prompt, so set color accordingly
|
|
|
set_console_state(CONSOLE_STATE_PROMPT);
|
|
|
|
|
|
- if (params.embedding){
|
|
|
- embd = embd_inp;
|
|
|
-
|
|
|
- if (embd.size() > 0) {
|
|
|
- if (llama_eval(ctx, embd.data(), embd.size(), n_past, params.n_threads)) {
|
|
|
- fprintf(stderr, "%s : failed to eval\n", __func__);
|
|
|
- return 1;
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- const auto embeddings = llama_get_embeddings(ctx);
|
|
|
-
|
|
|
- // TODO: print / use the embeddings
|
|
|
-
|
|
|
- if (params.use_color) {
|
|
|
- printf(ANSI_COLOR_RESET);
|
|
|
- }
|
|
|
-
|
|
|
- return 0;
|
|
|
- }
|
|
|
-
|
|
|
while (remaining_tokens > 0 || params.interactive) {
|
|
|
// predict
|
|
|
if (embd.size() > 0) {
|
|
|
@@ -392,10 +295,6 @@ int main(int argc, char ** argv) {
|
|
|
auto logits = llama_get_logits(ctx);
|
|
|
|
|
|
if (params.ignore_eos) {
|
|
|
- // set the logit of the eos token to zero to avoid sampling it
|
|
|
- //logits[logits.size() - n_vocab + EOS_TOKEN_ID] = 0;
|
|
|
- // TODO: this does not work of params.logits_all == true
|
|
|
- assert(params.perplexity == false);
|
|
|
logits[llama_token_eos()] = 0;
|
|
|
}
|
|
|
|