// test-backend-sampler.cpp
#include "ggml.h"
#include "llama.h"
#include "llama-cpp.h"
#include "get-model.h"
#include "common.h"

#ifdef NDEBUG
#undef NDEBUG
#endif

#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <fstream>
#include <map>
#include <string>
#include <string_view>
#include <unordered_map>
#include <vector>

struct backend_cli_args {
    const char * model  = nullptr;
    const char * test   = nullptr;
    const char * device = "cpu";
};
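
// Helper that owns the model and context for a test, tracks the next position
// for each sequence, and records the output index of each sequence in the last
// decoded batch so tests can query sampled results per sequence.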
struct test_model_context {
    llama_model_ptr   model;
    llama_context_ptr ctx;

    int n_vocab = 0;

    std::unordered_map<llama_seq_id, int32_t> seq_positions;
    std::unordered_map<llama_seq_id, int32_t> last_batch_info;

    bool load_model(const backend_cli_args & args) {
        if (model) {
            return true;
        }

        llama_backend_init();

        auto mparams = llama_model_default_params();

        ggml_backend_dev_t devs[2];
        if (std::string_view(args.device) == "gpu") {
            ggml_backend_dev_t gpu = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
            if (gpu == nullptr) {
                fprintf(stderr, "Error: GPU requested but not available\n");
                return false;
            }
            devs[0] = gpu;
            devs[1] = nullptr; // null terminator
            mparams.devices = devs;
            mparams.n_gpu_layers = 999;
        } else if (std::string_view(args.device) == "cpu") {
            ggml_backend_dev_t cpu = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
            devs[0] = cpu;
            devs[1] = nullptr; // null terminator
            mparams.devices = devs;
        }
        fprintf(stderr, "Using device: %s\n", ggml_backend_dev_name(devs[0]));

        model.reset(llama_model_load_from_file(args.model, mparams));
        if (!model) {
            fprintf(stderr, "Warning: failed to load model '%s', skipping test\n", args.model);
            return false;
        }

        n_vocab = llama_vocab_n_tokens(get_vocab());
        fprintf(stderr, "Vocabulary size: %d\n", n_vocab);

        return true;
    }

    bool setup(const backend_cli_args & args, std::vector<llama_sampler_seq_config> & configs, int32_t n_seq_max = -1) {
        if (!model) {
            load_model(args);
        }
        if (ctx) {
            return true;
        }

        llama_context_params cparams = llama_context_default_params();
        cparams.n_ctx      = 512;
        cparams.n_batch    = 512;
        cparams.samplers   = configs.data();
        cparams.n_samplers = configs.size();

        // If n_seq_max is not specified, calculate it from the configs
        if (n_seq_max < 0) {
            int32_t max_seq_id = 0;
            for (const auto & config : configs) {
                max_seq_id = std::max(config.seq_id, max_seq_id);
            }
            cparams.n_seq_max = max_seq_id + 1;
        } else {
            cparams.n_seq_max = n_seq_max;
        }

        ctx.reset(llama_init_from_model(model.get(), cparams));
        if (!ctx) {
            fprintf(stderr, "Warning: failed to create context, skipping test\n");
            return false;
        }
        llama_set_warmup(ctx.get(), false);

        return true;
    }

    bool decode(const std::map<llama_seq_id, std::string> & prompts) {
        if (!ctx) {
            fprintf(stderr, "Error: context not initialized, call setup() first\n");
            return false;
        }
        last_batch_info.clear();

        llama_batch batch = llama_batch_init(512, 0, prompts.size());

        auto vocab = get_vocab();
        for (const auto & [seq_id, prompt] : prompts) {
            std::vector<llama_token> tokens;
            tokens.push_back(llama_vocab_bos(vocab));

            std::vector<llama_token> prompt_tokens(32);
            int n_tokens = llama_tokenize(vocab, prompt.c_str(), prompt.length(),
                                          prompt_tokens.data(), prompt_tokens.size(),
                                          false, false);
            if (n_tokens < 0) {
                fprintf(stderr, "Warning: tokenization failed for seq_id %d\n", seq_id);
                llama_batch_free(batch);
                return false;
            }
            for (int i = 0; i < n_tokens; i++) {
                tokens.push_back(prompt_tokens[i]);
            }

            if (seq_positions.find(seq_id) == seq_positions.end()) {
                seq_positions[seq_id] = 0;
            }
            int32_t start_pos = seq_positions[seq_id];
            for (size_t i = 0; i < tokens.size(); i++) {
                common_batch_add(batch, tokens[i], start_pos + i, { seq_id }, i == tokens.size() - 1);
            }
            seq_positions[seq_id] = start_pos + tokens.size();
        }

        printf("Batch contents:\n");
        printf("n_tokens: %d\n", batch.n_tokens);
        for (int i = 0; i < batch.n_tokens; i++) {
            printf("token[%d]: tok=%-5d, pos=%d, n_seq_id=%d, seq_ids=[", i, batch.token[i], batch.pos[i], batch.n_seq_id[i]);
            for (int j = 0; j < batch.n_seq_id[i]; j++) {
                printf("%d%s", batch.seq_id[i][j], j < batch.n_seq_id[i]-1 ? ", " : "");
            }
            printf("], logits=%d\n", batch.logits[i]);
        }

        if (llama_decode(ctx.get(), batch) != 0) {
            fprintf(stderr, "Warning: llama_decode failed\n");
            llama_batch_free(batch);
            return false;
        }

        // Build mapping from seq id to batch token idx
        for (int i = 0; i < batch.n_tokens; i++) {
            if (batch.logits[i]) {
                llama_seq_id seq_id = batch.seq_id[i][0];
                last_batch_info[seq_id] = i;
            }
        }

        llama_batch_free(batch);
        return true;
    }

    int32_t idx_for_seq(llama_seq_id seq_id) {
        auto it = last_batch_info.find(seq_id);
        if (it == last_batch_info.end()) {
            fprintf(stderr, "Error: no batch index found for seq_id %d\n", seq_id);
            return -1;
        }
        return it->second;
    }

    void update_batch_info(const llama_batch & batch) {
        last_batch_info.clear();
        for (int i = 0; i < batch.n_tokens; i++) {
            if (batch.logits[i]) {
                llama_seq_id cur_seq = batch.seq_id[i][0];
                last_batch_info[cur_seq] = i;
            }
        }
    }

    bool decode_token(llama_token token, llama_seq_id seq_id = 0) {
        if (ctx == nullptr) {
            fprintf(stderr, "Error: context not initialized, call setup() first\n");
            return false;
        }

        llama_batch batch = llama_batch_init(1, 0, 1);
        int32_t pos = seq_positions[seq_id];
        common_batch_add(batch, token, pos, { seq_id }, true);

        if (llama_decode(ctx.get(), batch) != 0) {
            fprintf(stderr, "Warning: llama_decode failed for token %d in seq %d\n", token, seq_id);
            llama_batch_free(batch);
            return false;
        }
        update_batch_info(batch);
        seq_positions[seq_id]++;

        llama_batch_free(batch);
        return true;
    }

    bool decode_tokens(const std::map<llama_seq_id, llama_token> & seq_tokens) {
        if (ctx == nullptr) {
            fprintf(stderr, "Error: context not initialized, call setup() first\n");
            return false;
        }

        llama_batch batch = llama_batch_init(seq_tokens.size(), 0, seq_tokens.size());
        for (const auto & [seq_id, token] : seq_tokens) {
            int32_t pos = seq_positions[seq_id];
            common_batch_add(batch, token, pos, { seq_id }, true);
        }

        if (llama_decode(ctx.get(), batch) != 0) {
            fprintf(stderr, "Warning: llama_decode failed for batch tokens\n");
            llama_batch_free(batch);
            return false;
        }
        for (const auto & [seq_id, _] : seq_tokens) {
            seq_positions[seq_id]++;
        }
        update_batch_info(batch);

        llama_batch_free(batch);
        return true;
    }

    std::string token_to_piece(llama_token token, bool special) {
        std::string piece;
        piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\0'
        const int n_chars = llama_token_to_piece(get_vocab(), token, &piece[0], piece.size(), 0, special);
        if (n_chars < 0) {
            piece.resize(-n_chars);
            int check = llama_token_to_piece(get_vocab(), token, &piece[0], piece.size(), 0, special);
            GGML_ASSERT(check == -n_chars);
        } else {
            piece.resize(n_chars);
        }
        return piece;
    }

    void reset() {
        ctx.reset();
        seq_positions.clear();
        last_batch_info.clear();
    }

    const llama_vocab * get_vocab() const {
        return model ? llama_model_get_vocab(model.get()) : nullptr;
    }
};
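
// Verifies that a greedy backend sampler attached to a single sequence returns
// valid token ids, both when queried by output index and with index -1 (last
// output), and that generation can continue for several decode steps.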
static void test_backend_greedy_sampling(const backend_cli_args & args) {
    test_model_context test_ctx;

    const int seq_id = 0;

    struct llama_sampler_chain_params backend_sampler_params = llama_sampler_chain_default_params();
    llama_sampler_ptr backend_sampler_chain(llama_sampler_chain_init(backend_sampler_params));
    llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_greedy());

    std::vector<llama_sampler_seq_config> backend_sampler_configs = {{ seq_id, backend_sampler_chain.get() }};

    if (!test_ctx.setup(args, backend_sampler_configs)) {
        return;
    }

    if (!test_ctx.decode({{seq_id, "Some"}})) {
        GGML_ASSERT(false && "Failed to decode token");
    }

    int32_t batch_idx = test_ctx.idx_for_seq(seq_id);

    llama_token token = llama_get_sampled_token_ith(test_ctx.ctx.get(), batch_idx);
    printf("greedy sampled id:%d, string:'%s'\n", token, test_ctx.token_to_piece(token, false).c_str());
    GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);

    token = llama_get_sampled_token_ith(test_ctx.ctx.get(), -1);
    printf("greedy sampled id:%d, string:'%s'\n", token, test_ctx.token_to_piece(token, false).c_str());
    GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);

    for (int i = 0; i < 10; i++) {
        int32_t loop_idx = test_ctx.idx_for_seq(seq_id);
        llama_token token = llama_get_sampled_token_ith(test_ctx.ctx.get(), loop_idx);
        printf("Generation step %d: token id:%d, string: %s\n", i, token, test_ctx.token_to_piece(token, false).c_str());
        if (!test_ctx.decode_token(token, 0)) {
            GGML_ASSERT(false && "Failed to decode token");
        }
    }
}
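
// Verifies backend top-k filtering: inspects the filtered logits and candidate
// tokens, then finishes sampling on the CPU with a dist sampler (hybrid sampling).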
static void test_backend_top_k_sampling(const backend_cli_args & args) {
    test_model_context test_ctx;

    const int seq_id = 0;
    const int32_t k = 8;

    struct llama_sampler_chain_params backend_chain_params = llama_sampler_chain_default_params();
    llama_sampler_ptr backend_sampler_chain(llama_sampler_chain_init(backend_chain_params));
    llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_top_k(k));

    std::vector<llama_sampler_seq_config> backend_sampler_configs = {{ seq_id, backend_sampler_chain.get() }};

    if (!test_ctx.setup(args, backend_sampler_configs)) {
        return;
    }

    if (!test_ctx.decode({{seq_id, "Hello"}})) {
        GGML_ASSERT(false && "Failed to decode token");
    }

    int32_t batch_idx = test_ctx.idx_for_seq(seq_id);

    float *  logits   = llama_get_sampled_logits_ith(test_ctx.ctx.get(), batch_idx);
    uint32_t n_logits = llama_get_sampled_logits_count_ith(test_ctx.ctx.get(), batch_idx);
    for (size_t i = 0; i < n_logits; ++i) {
        printf("top_k logit[%zu] = %.6f\n", i, logits[i]);
    }

    llama_token * candidates   = llama_get_sampled_candidates_ith(test_ctx.ctx.get(), batch_idx);
    uint32_t      n_candidates = llama_get_sampled_candidates_count_ith(test_ctx.ctx.get(), batch_idx);
    for (size_t i = 0; i < n_candidates; ++i) {
        printf("top_k candidate[%zu] = %d : %s\n", i, candidates[i],
               test_ctx.token_to_piece(candidates[i], false).c_str());
    }

    // Sample with a CPU sampler to verify that hybrid sampling is possible:
    // top-k on the backend followed by dist on the CPU.
    struct llama_sampler_chain_params chain_params = llama_sampler_chain_default_params();
    llama_sampler_ptr chain(llama_sampler_chain_init(chain_params));
    GGML_ASSERT(chain->iface->backend_apply != nullptr);
    llama_sampler_chain_add(chain.get(), llama_sampler_init_dist(18));

    llama_token token = llama_sampler_sample(chain.get(), test_ctx.ctx.get(), batch_idx);
    const std::string token_str = test_ctx.token_to_piece(token, false);
    GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);

    printf("backend top-k hybrid sampling test PASSED\n");
}
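
// Verifies backend temperature scaling on two sequences with different
// temperatures, then checks that non-positive temperatures collapse the
// sampled output to a single (argmax) logit.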
static void test_backend_temp_sampling(const backend_cli_args & args) {
    test_model_context test_ctx;

    {
        const float temp_0 = 0.8f;
        struct llama_sampler_chain_params backend_chain_params_0 = llama_sampler_chain_default_params();
        llama_sampler_ptr backend_sampler_chain_0(llama_sampler_chain_init(backend_chain_params_0));
        llama_sampler_chain_add(backend_sampler_chain_0.get(), llama_sampler_init_temp(temp_0));

        const float temp_1 = 0.1f;
        struct llama_sampler_chain_params backend_chain_params_1 = llama_sampler_chain_default_params();
        llama_sampler_ptr backend_sampler_chain_1(llama_sampler_chain_init(backend_chain_params_1));
        llama_sampler_chain_add(backend_sampler_chain_1.get(), llama_sampler_init_temp(temp_1));

        std::vector<llama_sampler_seq_config> backend_sampler_configs = {
            { 0, backend_sampler_chain_0.get() },
            { 1, backend_sampler_chain_1.get() }
        };

        if (!test_ctx.setup(args, backend_sampler_configs)) {
            return;
        }

        if (!test_ctx.decode({{0, "Some where over the"}, {1, "Once upon a"}})) {
            GGML_ASSERT(false && "Failed to decode token");
        }

        // Verify sequence 0
        {
            int32_t batch_idx = test_ctx.idx_for_seq(0);
            int n_logits = llama_get_sampled_logits_count_ith(test_ctx.ctx.get(), batch_idx);
            GGML_ASSERT(n_logits == test_ctx.n_vocab);

            // Sample from sequence 0 using a CPU sampler
            struct llama_sampler_chain_params chain_params = llama_sampler_chain_default_params();
            llama_sampler_ptr chain(llama_sampler_chain_init(chain_params));
            llama_sampler_chain_add(chain.get(), llama_sampler_init_dist(18));

            llama_token token = llama_sampler_sample(chain.get(), test_ctx.ctx.get(), batch_idx);
            const std::string token_str = test_ctx.token_to_piece(token, false);
            printf("Sequence 0 sampled token id:%d, string: '%s'\n", token, token_str.c_str());
            GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);
        }

        // Verify sequence 1
        {
            int32_t batch_idx = test_ctx.idx_for_seq(1);

            // Sample from sequence 1 using a CPU sampler
            struct llama_sampler_chain_params chain_params = llama_sampler_chain_default_params();
            llama_sampler_ptr chain(llama_sampler_chain_init(chain_params));
            llama_sampler_chain_add(chain.get(), llama_sampler_init_dist(18));

            llama_token token = llama_sampler_sample(chain.get(), test_ctx.ctx.get(), batch_idx);
            const std::string token_str = test_ctx.token_to_piece(token, false);
            printf("Sequence 1 sampled token id:%d, string: '%s'\n", token, token_str.c_str());
            GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);
        }
    }

    // lambda for testing non-positive temperature values.
    auto test_argmax_temp = [&](float temp) {
        printf("\nTesting temperature = %.1f\n", temp);
        test_ctx.reset();

        int seq_id = 0;
        struct llama_sampler_chain_params backend_chain_params = llama_sampler_chain_default_params();
        llama_sampler_ptr backend_sampler_chain(llama_sampler_chain_init(backend_chain_params));
        llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_temp(temp));

        std::vector<llama_sampler_seq_config> backend_sampler_configs = {
            { seq_id, backend_sampler_chain.get() },
        };

        if (!test_ctx.setup(args, backend_sampler_configs)) {
            return;
        }

        if (!test_ctx.decode({{seq_id, "Once"}})) {
            GGML_ASSERT(false && "Failed to decode token");
        }

        int32_t batch_idx = test_ctx.idx_for_seq(seq_id);
        uint32_t n_logits = llama_get_sampled_logits_count_ith(test_ctx.ctx.get(), batch_idx);
        GGML_ASSERT(n_logits == 1);
    };

    test_argmax_temp(0.0f);
    test_argmax_temp(-1.0f);

    printf("backend temp sampling test PASSED\n");
}
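
// Verifies backend extended (dynamic) temperature sampling: a positive
// temperature keeps the full logit set, while non-positive temperatures
// collapse the output to a single logit and a zero delta behaves like plain
// temperature scaling.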
static void test_backend_temp_ext_sampling(const backend_cli_args & args) {
    test_model_context test_ctx;

    {
        int seq_id = 0;
        const float temp     = 0.8f;
        const float delta    = 0.5f;
        const float exponent = 1.5f;

        struct llama_sampler_chain_params backend_chain_params = llama_sampler_chain_default_params();
        llama_sampler_ptr backend_sampler_chain(llama_sampler_chain_init(backend_chain_params));
        llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_temp_ext(temp, delta, exponent));

        std::vector<llama_sampler_seq_config> backend_sampler_configs = {
            { seq_id, backend_sampler_chain.get() },
        };

        if (!test_ctx.setup(args, backend_sampler_configs)) {
            return;
        }

        if (!test_ctx.decode({{seq_id, "Once upon a"}})) {
            GGML_ASSERT(false && "Failed to decode token");
        }

        // Verify sequence 0
        {
            int32_t batch_idx = test_ctx.idx_for_seq(seq_id);
            int n_logits = llama_get_sampled_logits_count_ith(test_ctx.ctx.get(), batch_idx);
            GGML_ASSERT(n_logits == test_ctx.n_vocab);
        }
    }

    test_ctx.reset();

    // lambda for testing non-positive temp/delta/exponent values.
    auto test_argmax_temp = [&](float temp, float delta, float exponent) {
        printf("\nTesting temperature = %.1f, delta = %.1f, exponent = %.1f\n", temp, delta, exponent);
        test_ctx.reset();

        int seq_id = 0;
        struct llama_sampler_chain_params backend_chain_params = llama_sampler_chain_default_params();
        llama_sampler_ptr backend_sampler_chain(llama_sampler_chain_init(backend_chain_params));
        llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_temp_ext(temp, delta, exponent));

        std::vector<llama_sampler_seq_config> backend_sampler_configs = {
            { seq_id, backend_sampler_chain.get() },
        };

        if (!test_ctx.setup(args, backend_sampler_configs)) {
            return;
        }

        if (!test_ctx.decode({{seq_id, "Once"}})) {
            GGML_ASSERT(false && "Failed to decode token");
        }

        int32_t batch_idx = test_ctx.idx_for_seq(seq_id);
        uint32_t n_logits = llama_get_sampled_logits_count_ith(test_ctx.ctx.get(), batch_idx);
        if (temp <= 0.0f && delta >= 0.0f) {
            GGML_ASSERT(n_logits == 1);
        } else {
            GGML_ASSERT(n_logits == (uint32_t) test_ctx.n_vocab);
        }
    };

    test_argmax_temp(0.0f, 0.3f, 1.0f);  // Greedy (temp=0)
    test_argmax_temp(-1.0f, 0.3f, 2.0f); // Greedy (temp<0)
    test_argmax_temp(0.8f, 0.0f, 2.0f);  // Temperature scaling
    printf("backend temp_ext sampling test PASSED\n");
}
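
// Verifies backend min-p filtering: checks that only a subset of the logits
// survives the threshold, then continues sampling on the CPU for a few steps.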
static void test_backend_min_p_sampling(const backend_cli_args & args) {
    test_model_context test_ctx;

    const int seq_id = 0;
    const float p = 0.1f;

    struct llama_sampler_chain_params backend_chain_params = llama_sampler_chain_default_params();
    llama_sampler_ptr backend_sampler_chain(llama_sampler_chain_init(backend_chain_params));
    llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_min_p(p, 0));

    std::vector<llama_sampler_seq_config> backend_sampler_configs = {{ seq_id, backend_sampler_chain.get() }};

    if (!test_ctx.setup(args, backend_sampler_configs)) {
        return;
    }

    if (!test_ctx.decode({{seq_id, "Hello"}})) {
        GGML_ASSERT(false && "Failed to decode token");
    }

    int32_t batch_idx = test_ctx.idx_for_seq(seq_id);

    float * logits = llama_get_sampled_logits_ith(test_ctx.ctx.get(), batch_idx);
    uint32_t n_logits = llama_get_sampled_logits_count_ith(test_ctx.ctx.get(), batch_idx);

    // Collect the logits that are above the min-p threshold
    std::vector<float> filtered_logits;
    for (size_t i = 0; i < n_logits; ++i) {
        if (logits[i] > -1e9f) {
            filtered_logits.push_back(logits[i]);
            //printf("min_p logit[%zu] = %.6f\n", i, logits[i]);
        }
    }
    GGML_ASSERT(filtered_logits.size() < (size_t) test_ctx.n_vocab);

    // Sample with a CPU sampler to verify the filtered results are reasonable
    struct llama_sampler_chain_params chain_params = llama_sampler_chain_default_params();
    llama_sampler_ptr chain(llama_sampler_chain_init(chain_params));
    llama_sampler_chain_add(chain.get(), llama_sampler_init_dist(88));

    llama_token token = llama_sampler_sample(chain.get(), test_ctx.ctx.get(), batch_idx);
    const std::string token_str = test_ctx.token_to_piece(token, false);
    printf("min-p cpu sampled token id:%d, string: '%s'\n", token, token_str.c_str());
    GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);

    // Decode and sample 10 more tokens
    for (int i = 0; i < 10; i++) {
        int32_t loop_idx = test_ctx.idx_for_seq(seq_id);
        llama_token token = llama_sampler_sample(chain.get(), test_ctx.ctx.get(), loop_idx);
        printf("min-p gen step %d: token id :%5d, string: %s\n", i, token, test_ctx.token_to_piece(token, false).c_str());
        if (!test_ctx.decode_token(token, 0)) {
            GGML_ASSERT(false && "Failed to decode token");
        }
    }
    printf("min-p sampling test PASSED\n");
}
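
// Verifies backend top-p (nucleus) filtering: checks that a non-empty, proper
// subset of logits survives, then continues sampling on the CPU for a few steps.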
static void test_backend_top_p_sampling(const backend_cli_args & args) {
    test_model_context test_ctx;

    const int seq_id = 0;
    const float p = 0.9f;

    struct llama_sampler_chain_params backend_chain_params = llama_sampler_chain_default_params();
    llama_sampler_ptr backend_sampler_chain(llama_sampler_chain_init(backend_chain_params));
    llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_top_p(p, 0));

    std::vector<llama_sampler_seq_config> backend_sampler_configs = {{ seq_id, backend_sampler_chain.get() }};

    if (!test_ctx.setup(args, backend_sampler_configs)) {
        return;
    }

    if (!test_ctx.decode({{seq_id, "Hello"}})) {
        return;
    }

    int32_t batch_idx = test_ctx.idx_for_seq(seq_id);

    float * logits = llama_get_sampled_logits_ith(test_ctx.ctx.get(), batch_idx);
    uint32_t n_logits = llama_get_sampled_logits_count_ith(test_ctx.ctx.get(), batch_idx);

    // Collect the logits that survived the top-p filter
    std::vector<float> filtered_logits;
    for (size_t i = 0; i < n_logits; ++i) {
        if (logits[i] > -1e9f) {
            filtered_logits.push_back(logits[i]);
        }
    }
    GGML_ASSERT(filtered_logits.size() < (size_t) test_ctx.n_vocab);
    GGML_ASSERT(filtered_logits.size() > 0);

    // Sample with a CPU sampler to verify the filtered results are reasonable
    struct llama_sampler_chain_params chain_params = llama_sampler_chain_default_params();
    llama_sampler_ptr chain(llama_sampler_chain_init(chain_params));
    llama_sampler_chain_add(chain.get(), llama_sampler_init_dist(88));

    llama_token token = llama_sampler_sample(chain.get(), test_ctx.ctx.get(), batch_idx);
    const std::string token_str = test_ctx.token_to_piece(token, false);
    printf("top-p cpu sampled token id:%d, string: '%s'\n", token, token_str.c_str());
    GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);

    // Decode and sample 10 more tokens
    for (int i = 0; i < 10; i++) {
        int32_t loop_idx = test_ctx.idx_for_seq(seq_id);
        llama_token token = llama_sampler_sample(chain.get(), test_ctx.ctx.get(), loop_idx);
        printf("top-p gen step %d: token id :%5d, string: %s\n", i, token, test_ctx.token_to_piece(token, false).c_str());
        test_ctx.decode_token(token, 0);
    }
    printf("top-p sampling test PASSED\n");
}
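
// Verifies that two sequences can use independent backend sampler chains in
// the same batch, and that per-sequence tokens can be generated and decoded
// together for several steps.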
static void test_backend_multi_sequence_sampling(const backend_cli_args & args) {
    test_model_context test_ctx;

    struct llama_sampler_chain_params chain_params_0 = llama_sampler_chain_default_params();
    llama_sampler_ptr sampler_chain_0(llama_sampler_chain_init(chain_params_0));
    llama_sampler_chain_add(sampler_chain_0.get(), llama_sampler_init_greedy());

    struct llama_sampler_chain_params chain_params_1 = llama_sampler_chain_default_params();
    llama_sampler_ptr sampler_chain_1(llama_sampler_chain_init(chain_params_1));
    llama_sampler_chain_add(sampler_chain_1.get(), llama_sampler_init_temp(0.8f));
    llama_sampler_chain_add(sampler_chain_1.get(), llama_sampler_init_greedy());

    std::vector<llama_sampler_seq_config> backend_sampler_configs = {
        { 0, sampler_chain_0.get() },
        { 1, sampler_chain_1.get() }
    };

    if (!test_ctx.setup(args, backend_sampler_configs)) {
        return;
    }

    std::map<llama_seq_id, std::string> prompts = {
        {0, "Hello"},
        {1, "Some"}
    };

    if (!test_ctx.decode(prompts)) {
        GGML_ASSERT(false && "Failed to decode token");
    }

    // Verify sequence 0
    {
        int32_t batch_idx = test_ctx.idx_for_seq(0);
        llama_token token = llama_get_sampled_token_ith(test_ctx.ctx.get(), batch_idx);
        const std::string token_str = test_ctx.token_to_piece(token, false);
        printf("Seq 0 sampled token id=%d, string='%s'\n", token, token_str.c_str());
        GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);
    }

    // Verify sequence 1
    {
        int32_t batch_idx = test_ctx.idx_for_seq(1);
        llama_token token = llama_get_sampled_token_ith(test_ctx.ctx.get(), batch_idx);
        const std::string token_str = test_ctx.token_to_piece(token, false);
        printf("Seq 1 sampled token id=%d, string='%s'\n", token, token_str.c_str());
        GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);
    }

    // Generate tokens for each sequence
    printf("\nMulti-sequence generation:\n");
    for (int step = 0; step < 4; step++) {
        std::map<llama_seq_id, llama_token> tokens;
        for (llama_seq_id seq_id : {0, 1}) {
            int32_t idx = test_ctx.idx_for_seq(seq_id);
            llama_token token = llama_get_sampled_token_ith(test_ctx.ctx.get(), idx);
            const std::string token_str = test_ctx.token_to_piece(token, false);
            printf(" Seq %d, step %d: token id=%d, string='%s'\n", seq_id, step, token, token_str.c_str());
            tokens[seq_id] = token;
        }
        // Decode all tokens in a single batch
        if (!test_ctx.decode_tokens(tokens)) {
            GGML_ASSERT(false && "Failed to decode token");
        }
    }
    printf("backend multi-sequence sampling test PASSED\n");
}
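
// Verifies that a backend dist sampler produces a valid token, also when the
// result is queried with index -1, and that a non-zero seq_id (189) works.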
static void test_backend_dist_sampling(const backend_cli_args & args) {
    test_model_context test_ctx;

    const int seq_id = 189;
    const int32_t seed = 88;

    struct llama_sampler_chain_params backend_chain_params = llama_sampler_chain_default_params();
    llama_sampler_ptr backend_sampler_chain(llama_sampler_chain_init(backend_chain_params));
    llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_dist(seed));

    std::vector<llama_sampler_seq_config> backend_sampler_configs = {{ seq_id, backend_sampler_chain.get() }};

    if (!test_ctx.setup(args, backend_sampler_configs)) {
        return;
    }

    if (!test_ctx.decode({{seq_id, "Some"}})) {
        GGML_ASSERT(false && "Failed to decode token");
    }

    int32_t batch_idx = test_ctx.idx_for_seq(seq_id);

    llama_token token = llama_get_sampled_token_ith(test_ctx.ctx.get(), batch_idx);
    printf("dist sampled id:%d, string:'%s'\n", token, test_ctx.token_to_piece(token, false).c_str());
    GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);
    //GGML_ASSERT(llama_get_sampled_logits_ith(test_ctx.ctx.get(), batch_idx) == nullptr);

    token = llama_get_sampled_token_ith(test_ctx.ctx.get(), -1);
    printf("dist sampled id:%d, string:'%s'\n", token, test_ctx.token_to_piece(token, false).c_str());
    GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);

    printf("backend dist sampling test PASSED\n");
}
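
// Verifies that running a CPU dist sampler on an output that was already
// sampled by a backend dist sampler returns the same token.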
static void test_backend_dist_sampling_and_cpu(const backend_cli_args & args) {
    test_model_context test_ctx;

    const int seq_id = 0;
    const int32_t seed = 88;

    struct llama_sampler_chain_params backend_chain_params = llama_sampler_chain_default_params();
    llama_sampler_ptr backend_sampler_chain(llama_sampler_chain_init(backend_chain_params));
    llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_dist(seed));

    std::vector<llama_sampler_seq_config> backend_sampler_configs = {{ seq_id, backend_sampler_chain.get() }};

    if (!test_ctx.setup(args, backend_sampler_configs)) {
        return;
    }

    if (!test_ctx.decode({{seq_id, "Some"}})) {
        GGML_ASSERT(false && "Failed to decode token");
    }

    int32_t batch_idx = test_ctx.idx_for_seq(seq_id);

    // Sample using CPU sampler
    struct llama_sampler_chain_params chain_params = llama_sampler_chain_default_params();
    llama_sampler_ptr chain(llama_sampler_chain_init(chain_params));
    llama_sampler_chain_add(chain.get(), llama_sampler_init_dist(18));

    llama_token backend_token = llama_get_sampled_token_ith(test_ctx.ctx.get(), batch_idx);
    llama_token cpu_token = llama_sampler_sample(chain.get(), test_ctx.ctx.get(), batch_idx);
    printf("dist & cpu sampled id:%d, string:'%s'\n", cpu_token, test_ctx.token_to_piece(cpu_token, false).c_str());
    GGML_ASSERT(backend_token == cpu_token);

    printf("backend dist & cpu sampling test PASSED\n");
}
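
// Verifies backend logit-bias sampling: a large positive bias on one token
// should make the backend sampler return exactly that token.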
static void test_backend_logit_bias_sampling(const backend_cli_args & args) {
    test_model_context test_ctx;

    // Call load_model first to ensure the vocab is loaded and can be accessed.
    if (!test_ctx.load_model(args)) {
        return;
    }

    const int seq_id = 0;

    // Create the logit biases vector.
    std::vector<llama_logit_bias> logit_bias;

    // Get the token for the piece "World".
    const std::string piece = "World";
    std::vector<llama_token> tokens(16);
    llama_tokenize(test_ctx.get_vocab(), piece.c_str(), piece.size(), tokens.data(), tokens.size(), false, false);
    llama_token bias_token = tokens[0];
    logit_bias.push_back({ bias_token, +100.0f });
    printf("biasing token piece '%s' -> token id %d\n", piece.c_str(), bias_token);

    struct llama_sampler_chain_params backend_chain_params = llama_sampler_chain_default_params();
    llama_sampler_ptr backend_sampler_chain(llama_sampler_chain_init(backend_chain_params));
    llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_logit_bias(
        llama_vocab_n_tokens(test_ctx.get_vocab()),
        logit_bias.size(),
        logit_bias.data()));
    llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_dist(88));

    std::vector<llama_sampler_seq_config> backend_sampler_configs = {
        { seq_id, backend_sampler_chain.get() },
    };

    if (!test_ctx.setup(args, backend_sampler_configs)) {
        return;
    }

    if (!test_ctx.decode({{seq_id, "Hello"}})) {
        GGML_ASSERT(false && "Failed to decode token");
    }

    llama_token backend_token = llama_get_sampled_token_ith(test_ctx.ctx.get(), test_ctx.idx_for_seq(seq_id));
    const std::string backend_token_str = test_ctx.token_to_piece(backend_token, false);
    printf("logit bias sampled token = %d, string='%s'\n", backend_token, backend_token_str.c_str());
    GGML_ASSERT(backend_token == bias_token);

    printf("backend logit bias sampling test PASSED\n");
}

// This test verifies that it is possible to have two different backend sampler
// configurations: one sequence that samples fully on the backend (dist), and
// another that only filters on the backend (top-k) and leaves the final
// sampling to the CPU.
static void test_backend_mixed_sampling(const backend_cli_args & args) {
    test_model_context test_ctx;

    struct llama_sampler_chain_params chain_params_0 = llama_sampler_chain_default_params();
    llama_sampler_ptr sampler_chain_0(llama_sampler_chain_init(chain_params_0));
    llama_sampler_chain_add(sampler_chain_0.get(), llama_sampler_init_dist(88));

    int k = 40;
    struct llama_sampler_chain_params chain_params_1 = llama_sampler_chain_default_params();
    llama_sampler_ptr sampler_chain_1(llama_sampler_chain_init(chain_params_1));
    llama_sampler_chain_add(sampler_chain_1.get(), llama_sampler_init_top_k(k));

    std::vector<llama_sampler_seq_config> backend_sampler_configs = {
        { 0, sampler_chain_0.get() },
        { 1, sampler_chain_1.get() }
    };

    if (!test_ctx.setup(args, backend_sampler_configs)) {
        return;
    }

    std::map<llama_seq_id, std::string> prompts = {
        {0, "Hello"},
        {1, "Some"}
    };

    if (!test_ctx.decode(prompts)) {
        GGML_ASSERT(false && "Failed to decode token");
    }

    // Verify sequence 0 that used the dist backend sampler.
    {
        int32_t batch_idx = test_ctx.idx_for_seq(0);
        llama_token token = llama_get_sampled_token_ith(test_ctx.ctx.get(), batch_idx);
        const std::string token_str = test_ctx.token_to_piece(token, false);
        printf("sampled token id=%d, string='%s'\n", token, token_str.c_str());
        GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);
        //GGML_ASSERT(llama_get_sampled_logits_ith(test_ctx.ctx.get(), batch_idx) == nullptr);
        //GGML_ASSERT(llama_get_sampled_logits_count_ith(test_ctx.ctx.get(), batch_idx) == 0);
    }

    // Verify sequence 1 that used the top-k backend sampler.
    {
        int32_t batch_idx = test_ctx.idx_for_seq(1);
        float * logits = llama_get_sampled_logits_ith(test_ctx.ctx.get(), batch_idx);
        GGML_ASSERT(logits != nullptr);
        size_t n_logits = llama_get_sampled_logits_count_ith(test_ctx.ctx.get(), batch_idx);
        GGML_ASSERT(n_logits == (size_t) k);
        GGML_ASSERT(llama_get_sampled_token_ith(test_ctx.ctx.get(), batch_idx) == LLAMA_TOKEN_NULL);
    }
    printf("backend mixed sampling test PASSED\n");
}
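
// Verifies llama_set_sampler: clearing the backend sampler for a sequence stops
// backend sampling (no sampled token or probs), and attaching a new chain
// re-enables it.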
static void test_backend_set_sampler(const backend_cli_args & args) {
    test_model_context test_ctx;

    const int32_t seed = 88;
    const int seq_id = 0;

    struct llama_sampler_chain_params backend_chain_params = llama_sampler_chain_default_params();
    llama_sampler_ptr backend_sampler_chain(llama_sampler_chain_init(backend_chain_params));
    llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_dist(seed));

    std::vector<llama_sampler_seq_config> backend_sampler_configs = {{ seq_id, backend_sampler_chain.get() }};

    if (!test_ctx.setup(args, backend_sampler_configs)) {
        return;
    }

    if (!test_ctx.decode({{seq_id, "Hello"}})) {
        GGML_ASSERT(false && "Failed to decode token");
    }

    int32_t batch_idx = test_ctx.idx_for_seq(seq_id);

    // Sample using backend sampler configured above
    llama_token backend_token = llama_get_sampled_token_ith(test_ctx.ctx.get(), batch_idx);
    const std::string backend_token_str = test_ctx.token_to_piece(backend_token, false);
    printf("dist sampled token = %d, string='%s'\n", backend_token, backend_token_str.c_str());

    // Now clear the backend sampler for this sequence.
    llama_set_sampler(test_ctx.ctx.get(), seq_id, nullptr);
    printf("Cleared backend sampler for seq_id %d\n", seq_id);

    // Sample using CPU sampler
    struct llama_sampler_chain_params chain_params = llama_sampler_chain_default_params();
    llama_sampler_ptr chain(llama_sampler_chain_init(chain_params));
    llama_sampler_chain_add(chain.get(), llama_sampler_init_dist(18));

    std::map<llama_seq_id, llama_token> tokens = { { seq_id, backend_token}, };
    if (!test_ctx.decode_tokens(tokens)) {
        GGML_ASSERT(false && "Failed to decode token");
    }

    // Should not have any sampled token or probs after clearing the backend sampler.
    const int32_t idx = test_ctx.idx_for_seq(seq_id);
    GGML_ASSERT(llama_get_sampled_token_ith(test_ctx.ctx.get(), idx) == LLAMA_TOKEN_NULL);
    GGML_ASSERT(llama_get_sampled_probs_ith(test_ctx.ctx.get(), idx) == nullptr);

    // Sample the token using the CPU sampler chain.
    llama_token token2 = llama_sampler_sample(chain.get(), test_ctx.ctx.get(), seq_id);
    const std::string token2_str = test_ctx.token_to_piece(token2, false);
    printf("CPU sampled token after clearing backend sampler: id=%d, string='%s'\n", token2, token2_str.c_str());

    std::map<llama_seq_id, llama_token> tokens2 = { { seq_id, token2}, };

    // Set a new backend sampler for the sequence.
    struct llama_sampler_chain_params new_backend_chain_params = llama_sampler_chain_default_params();
    llama_sampler_ptr new_backend_sampler_chain(llama_sampler_chain_init(new_backend_chain_params));
    llama_sampler_chain_add(new_backend_sampler_chain.get(), llama_sampler_init_top_k(20));
    llama_sampler_chain_add(new_backend_sampler_chain.get(), llama_sampler_init_dist(seed));
    llama_set_sampler(test_ctx.ctx.get(), seq_id, new_backend_sampler_chain.get());

    if (!test_ctx.decode_tokens(tokens2)) {
        GGML_ASSERT(false && "Failed to decode token");
    }

    llama_token new_backend_token = llama_get_sampled_token_ith(test_ctx.ctx.get(), test_ctx.idx_for_seq(seq_id));
    const std::string new_backend_token_str = test_ctx.token_to_piece(new_backend_token, false);
    printf("dist sampled token = %d, string='%s'\n", new_backend_token, new_backend_token_str.c_str());

    printf("backend set sampler test PASSED\n");
}
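
// Verifies a batch that mixes a backend-sampled sequence with a CPU-sampled
// one, and that the backend sampler for a sequence can be removed and later
// re-added.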
static void test_backend_cpu_mixed_batch(const backend_cli_args & args) {
    test_model_context test_ctx;

    // Sequence 0 uses backend sampling
    struct llama_sampler_chain_params chain_params_0 = llama_sampler_chain_default_params();
    llama_sampler_ptr sampler_chain_0(llama_sampler_chain_init(chain_params_0));
    llama_sampler_chain_add(sampler_chain_0.get(), llama_sampler_init_dist(88));

    std::vector<llama_sampler_seq_config> backend_sampler_configs = {
        { 0, sampler_chain_0.get() },
    };

    // We need 2 sequences: seq 0 with backend sampling, seq 1 with CPU sampling
    if (!test_ctx.setup(args, backend_sampler_configs, 2)) {
        return;
    }

    std::map<llama_seq_id, std::string> prompts = {
        {0, "Hello"}, // Will use backend sampling
        {1, "Some"}   // Will use CPU sampling
    };

    if (!test_ctx.decode(prompts)) {
        GGML_ASSERT(false && "Failed to decode token");
    }

    // Verify sequence 0 (backend sampled)
    {
        int32_t batch_idx = test_ctx.idx_for_seq(0);
        llama_token token = llama_get_sampled_token_ith(test_ctx.ctx.get(), batch_idx);
        const std::string token_str = test_ctx.token_to_piece(token, false);
        printf("Seq 0 (backend) sampled token id=%d, string='%s'\n", token, token_str.c_str());
        GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);
    }

    // Verify sequence 1 (CPU sampled)
    {
        int32_t batch_idx = test_ctx.idx_for_seq(1);
        llama_token backend_token = llama_get_sampled_token_ith(test_ctx.ctx.get(), batch_idx);
        GGML_ASSERT(backend_token == LLAMA_TOKEN_NULL);

        struct llama_sampler_chain_params chain_params = llama_sampler_chain_default_params();
        llama_sampler_ptr chain(llama_sampler_chain_init(chain_params));
        llama_sampler_chain_add(chain.get(), llama_sampler_init_greedy());

        llama_token token = llama_sampler_sample(chain.get(), test_ctx.ctx.get(), batch_idx);
        const std::string token_str = test_ctx.token_to_piece(token, false);
        printf("Seq 1 (CPU) sampled token id=%d, string='%s'\n", token, token_str.c_str());
        GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);
    }

    // Clear/remove the backend sampler, and sample again
    {
        // Clear the backend sampler for seq 0 so that there are no backend
        // samplers.
        llama_set_sampler(test_ctx.ctx.get(), 0, nullptr);

        // Create a CPU sampler and verify we can sample from it.
        struct llama_sampler_chain_params chain_params = llama_sampler_chain_default_params();
        llama_sampler_ptr chain(llama_sampler_chain_init(chain_params));
        llama_sampler_chain_add(chain.get(), llama_sampler_init_greedy());

        int32_t batch_idx = test_ctx.idx_for_seq(1);
        llama_token token = llama_sampler_sample(chain.get(), test_ctx.ctx.get(), batch_idx);
        if (!test_ctx.decode_token(token, 1)) {
            GGML_ASSERT(false && "Failed to decode token");
        }
    }

    // Set a backend sampler so that we can verify that it can be reset
    {
        struct llama_sampler_chain_params chain_params = llama_sampler_chain_default_params();
        llama_sampler_ptr sampler_chain(llama_sampler_chain_init(chain_params));
        llama_sampler_chain_add(sampler_chain.get(), llama_sampler_init_dist(88));
        llama_set_sampler(test_ctx.ctx.get(), 0, sampler_chain.get());

        if (!test_ctx.decode_token(3834, 0)) {
            GGML_ASSERT(false && "Failed to decode token");
        }

        int32_t batch_idx = test_ctx.idx_for_seq(0);
        llama_token token = llama_get_sampled_token_ith(test_ctx.ctx.get(), batch_idx);
        const std::string token_str = test_ctx.token_to_piece(token, false);
        printf("re-added backend sampled token id=%d, string='%s'\n", token, token_str.c_str());
        GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);
    }

    printf("backend-cpu mixed batch test PASSED\n");
}
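
// Verifies that llama_decode fails when more than one token in a sequence is
// marked as output while a backend sampler is attached to that sequence.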
static void test_backend_max_outputs(const backend_cli_args & args) {
    test_model_context test_ctx;

    const int seq_id = 0;
    const int32_t seed = 88;

    llama_sampler_chain_params backend_chain_params = llama_sampler_chain_default_params();
    llama_sampler_ptr backend_sampler_chain(llama_sampler_chain_init(backend_chain_params));
    llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_dist(seed));

    std::vector<llama_sampler_seq_config> backend_sampler_configs = {{ seq_id, backend_sampler_chain.get() }};

    if (!test_ctx.setup(args, backend_sampler_configs)) {
        return;
    }

    llama_batch batch = llama_batch_init(512, 0, 1);

    std::string prompt = "Hello";
    std::vector<llama_token> tokens;
    tokens.push_back(llama_vocab_bos(test_ctx.get_vocab()));

    std::vector<llama_token> prompt_tokens(32);
    int n_tokens = llama_tokenize(test_ctx.get_vocab(), prompt.c_str(), prompt.length(),
                                  prompt_tokens.data(), prompt_tokens.size(),
                                  false, false);
    for (int i = 0; i < n_tokens; i++) {
        tokens.push_back(prompt_tokens[i]);
    }

    for (size_t i = 0; i < tokens.size(); i++) {
        // set all tokens as outputs to trigger an error
        common_batch_add(batch, tokens[i], i, { seq_id }, true);
    }

    printf(">>> test_max_outputs expected error start:\n");
    const int ret = llama_decode(test_ctx.ctx.get(), batch);
    GGML_ASSERT(ret != 0 && "llama_decode should not succeed with multiple outputs per sequence");
    printf("<<< test_max_outputs expected error end.\n");

    llama_batch_free(batch);

    printf("backend max outputs test PASSED\n");
}
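
// Registry of named tests. When no --test argument is given, every test that
// is enabled by default is run.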
struct backend_test_case {
    const char * name;
    void (*fn)(const backend_cli_args &);
    bool enabled_by_default;
};

static const backend_test_case BACKEND_TESTS[] = {
    { "greedy",         test_backend_greedy_sampling,         true },
    { "logit_bias",     test_backend_logit_bias_sampling,     true },
    { "temp",           test_backend_temp_sampling,           true },
    { "temp_ext",       test_backend_temp_ext_sampling,       true },
    { "top_k",          test_backend_top_k_sampling,          true },
    { "multi_sequence", test_backend_multi_sequence_sampling, true },
    { "dist",           test_backend_dist_sampling,           true },
    { "dist_and_cpu",   test_backend_dist_sampling_and_cpu,   true },
    { "set_sampler",    test_backend_set_sampler,             true },
    { "max_outputs",    test_backend_max_outputs,             true },
    { "mixed",          test_backend_mixed_sampling,          true },
    { "min_p",          test_backend_min_p_sampling,          true },
    { "cpu_mixed",      test_backend_cpu_mixed_batch,         true },
    { "top_p",          test_backend_top_p_sampling,          true },
};

static backend_cli_args parse_backend_cli(int argc, char ** argv) {
    backend_cli_args out;

    for (int i = 1; i < argc; ++i) {
        const char * arg = argv[i];

        if (std::strcmp(arg, "--test") == 0) {
            if (i + 1 >= argc) {
                fprintf(stderr, "--test expects a value\n");
                exit(EXIT_FAILURE);
            }
            out.test = argv[++i];
            continue;
        }
        if (std::strncmp(arg, "--test=", 7) == 0) {
            out.test = arg + 7;
            continue;
        }
        if (std::strcmp(arg, "--model") == 0) {
            if (i + 1 >= argc) {
                fprintf(stderr, "--model expects a value\n");
                exit(EXIT_FAILURE);
            }
            out.model = argv[++i];
            continue;
        }
        if (std::strncmp(arg, "--model=", 8) == 0) {
            out.model = arg + 8;
            continue;
        }
        if (std::strcmp(arg, "--device") == 0) {
            if (i + 1 >= argc) {
                fprintf(stderr, "--device expects a value (cpu or gpu)\n");
                exit(EXIT_FAILURE);
            }
            out.device = argv[++i];
            continue;
        }
        if (std::strncmp(arg, "--device=", 9) == 0) {
            out.device = arg + 9;
            continue;
        }
        if (!out.model) {
            out.model = arg;
            continue;
        }

        fprintf(stderr, "Unexpected argument: %s\n", arg);
        exit(EXIT_FAILURE);
    }

    if (std::strcmp(out.device, "cpu") != 0 && std::strcmp(out.device, "gpu") != 0) {
        fprintf(stderr, "Invalid device '%s'. Must be 'cpu' or 'gpu'\n", out.device);
        exit(EXIT_FAILURE);
    }

    return out;
}

static std::vector<const backend_test_case *> collect_tests_to_run(const char * requested) {
    std::vector<const backend_test_case *> selected;

    if (requested != nullptr) {
        for (const auto & test : BACKEND_TESTS) {
            if (std::strcmp(test.name, requested) == 0) {
                selected.push_back(&test);
                break;
            }
        }
        if (selected.empty()) {
            fprintf(stderr, "Unknown test '%s'. Available tests:\n", requested);
            for (const auto & test : BACKEND_TESTS) {
                fprintf(stderr, " %s\n", test.name);
            }
            exit(EXIT_FAILURE);
        }
    } else {
        for (const auto & test : BACKEND_TESTS) {
            if (test.enabled_by_default) {
                selected.push_back(&test);
            }
        }
    }

    if (selected.empty()) {
        fprintf(stderr, "No backend sampling tests selected. Use --test=<name> to pick one.\n");
    }

    return selected;
}

static void run_tests(const std::vector<const backend_test_case *> & tests, const backend_cli_args & args) {
    for (const auto * test : tests) {
        fprintf(stderr, "\n=== %s ===\n", test->name);
        test->fn(args);
    }
}

int main(int argc, char ** argv) {
    backend_cli_args args = parse_backend_cli(argc, argv);

    if (args.model == nullptr) {
        args.model = get_model_or_exit(1, argv);
    }

    std::ifstream file(args.model);
    if (!file.is_open()) {
        fprintf(stderr, "no model '%s' found\n", args.model);
        return EXIT_FAILURE;
    }
    fprintf(stderr, "using '%s'\n", args.model);

    ggml_time_init();

    const std::vector<const backend_test_case *> tests = collect_tests_to_run(args.test);
    if (!tests.empty()) {
        run_tests(tests, args);
    }

    return 0;
}