- #pragma once
- #include "llama.h"
- #include "llama-arch.h"
- #include "llama-graph.h"
- #include "llama-hparams.h"
- #include "llama-memory.h"
- #include "llama-vocab.h"
- #include <memory>
- #include <string>
- #include <unordered_map>
- #include <vector>
- struct llama_cparams;
- struct llama_ubatch;
- struct llama_model_loader;
- // available model types (coarse size labels)
- enum llm_type {
- LLM_TYPE_UNKNOWN,
- LLM_TYPE_14M,
- LLM_TYPE_17M,
- LLM_TYPE_22M,
- LLM_TYPE_33M,
- LLM_TYPE_60M,
- LLM_TYPE_70M,
- LLM_TYPE_80M,
- LLM_TYPE_109M,
- LLM_TYPE_137M,
- LLM_TYPE_160M,
- LLM_TYPE_190M,
- LLM_TYPE_220M,
- LLM_TYPE_250M,
- LLM_TYPE_256M,
- LLM_TYPE_270M,
- LLM_TYPE_335M,
- LLM_TYPE_350M,
- LLM_TYPE_410M,
- LLM_TYPE_450M,
- LLM_TYPE_475M,
- LLM_TYPE_700M,
- LLM_TYPE_770M,
- LLM_TYPE_780M,
- LLM_TYPE_0_3B,
- LLM_TYPE_0_5B,
- LLM_TYPE_0_6B,
- LLM_TYPE_1B,
- LLM_TYPE_1_2B,
- LLM_TYPE_1_3B,
- LLM_TYPE_1_4B,
- LLM_TYPE_1_5B,
- LLM_TYPE_1_6B,
- LLM_TYPE_1_7B,
- LLM_TYPE_1_8B,
- LLM_TYPE_2B,
- LLM_TYPE_2_8B,
- LLM_TYPE_2_9B,
- LLM_TYPE_3B,
- LLM_TYPE_4B,
- LLM_TYPE_6B,
- LLM_TYPE_6_9B,
- LLM_TYPE_7B,
- LLM_TYPE_8B,
- LLM_TYPE_9B,
- LLM_TYPE_11B,
- LLM_TYPE_12B,
- LLM_TYPE_13B,
- LLM_TYPE_14B,
- LLM_TYPE_15B,
- LLM_TYPE_16B,
- LLM_TYPE_20B,
- LLM_TYPE_27B,
- LLM_TYPE_30B,
- LLM_TYPE_32B,
- LLM_TYPE_34B,
- LLM_TYPE_35B,
- LLM_TYPE_40B,
- LLM_TYPE_65B,
- LLM_TYPE_70B,
- LLM_TYPE_142B,
- LLM_TYPE_236B,
- LLM_TYPE_290B,
- LLM_TYPE_314B,
- LLM_TYPE_405B,
- LLM_TYPE_671B,
- LLM_TYPE_SMALL,
- LLM_TYPE_MEDIUM,
- LLM_TYPE_LARGE,
- LLM_TYPE_XL,
- LLM_TYPE_A1_7B,
- LLM_TYPE_A2_7B,
- LLM_TYPE_8x7B,
- LLM_TYPE_8x22B,
- LLM_TYPE_16x12B,
- LLM_TYPE_16x3_8B,
- LLM_TYPE_10B_128x3_66B,
- LLM_TYPE_57B_A14B,
- LLM_TYPE_17B_16E, // llama4 Scout
- LLM_TYPE_17B_128E, // llama4 Maverick
- LLM_TYPE_A13B,
- LLM_TYPE_30B_A3B,
- LLM_TYPE_235B_A22B,
- LLM_TYPE_E2B,
- LLM_TYPE_E4B,
- };
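- // Illustrative sketch (not part of the original header): llm_type is a coarse
- // size label rather than a specific model; llm_type_name(), declared near the
- // end of this file, maps a value to a printable string. Assuming `model` is a
- // loaded llama_model:
- //
- //     const char * size_label = llm_type_name(model.type); // e.g. "7B", "8x22B"
- //     printf("model size label: %s\n", size_label);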
- std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type);
- struct llama_layer_posnet {
- // resnet
- struct ggml_tensor * norm1 = nullptr;
- struct ggml_tensor * norm1_b = nullptr;
- struct ggml_tensor * conv1 = nullptr;
- struct ggml_tensor * conv1_b = nullptr;
- struct ggml_tensor * norm2 = nullptr;
- struct ggml_tensor * norm2_b = nullptr;
- struct ggml_tensor * conv2 = nullptr;
- struct ggml_tensor * conv2_b = nullptr;
- // attention
- struct ggml_tensor * attn_norm = nullptr;
- struct ggml_tensor * attn_norm_b = nullptr;
- struct ggml_tensor * attn_q = nullptr;
- struct ggml_tensor * attn_q_b = nullptr;
- struct ggml_tensor * attn_k = nullptr;
- struct ggml_tensor * attn_k_b = nullptr;
- struct ggml_tensor * attn_v = nullptr;
- struct ggml_tensor * attn_v_b = nullptr;
- struct ggml_tensor * attn_o = nullptr;
- struct ggml_tensor * attn_o_b = nullptr;
- // normalize
- struct ggml_tensor * norm = nullptr;
- struct ggml_tensor * norm_b = nullptr;
- };
- struct llama_layer_convnext {
- struct ggml_tensor * dw = nullptr;
- struct ggml_tensor * dw_b = nullptr;
- struct ggml_tensor * norm = nullptr;
- struct ggml_tensor * norm_b = nullptr;
- struct ggml_tensor * pw1 = nullptr;
- struct ggml_tensor * pw1_b = nullptr;
- struct ggml_tensor * pw2 = nullptr;
- struct ggml_tensor * pw2_b = nullptr;
- struct ggml_tensor * gamma = nullptr;
- };
- struct llama_layer_shortconv {
- struct ggml_tensor * in_proj = nullptr;
- struct ggml_tensor * conv = nullptr;
- struct ggml_tensor * out_proj = nullptr;
- };
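- // Note (added for clarity): llama_layer_posnet, llama_layer_convnext and
- // llama_layer_shortconv bundle the tensors of optional per-layer sub-blocks;
- // they are embedded by value in llama_layer below. Architectures that do not
- // use a sub-block leave its tensors as nullptr, so presence can be tested with
- // a pointer check, e.g. (illustrative sketch, assuming a loaded `layer`):
- //
- //     const bool has_posnet_attn = layer.posnet.attn_q    != nullptr;
- //     const bool has_shortconv   = layer.shortconv.in_proj != nullptr;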
- struct llama_layer {
- // normalization
- struct ggml_tensor * attn_norm = nullptr;
- struct ggml_tensor * attn_norm_b = nullptr;
- struct ggml_tensor * attn_norm_2 = nullptr;
- struct ggml_tensor * attn_norm_2_b = nullptr;
- struct ggml_tensor * attn_q_norm = nullptr;
- struct ggml_tensor * attn_q_norm_b = nullptr;
- struct ggml_tensor * attn_k_norm = nullptr;
- struct ggml_tensor * attn_k_norm_b = nullptr;
- struct ggml_tensor * attn_out_norm = nullptr;
- struct ggml_tensor * attn_out_norm_b = nullptr;
- struct ggml_tensor * attn_q_a_norm = nullptr;
- struct ggml_tensor * attn_kv_a_norm = nullptr;
- struct ggml_tensor * attn_sub_norm = nullptr;
- struct ggml_tensor * attn_post_norm = nullptr;
- struct ggml_tensor * ffn_sub_norm = nullptr;
- struct ggml_tensor * attn_norm_cross = nullptr;
- struct ggml_tensor * attn_norm_enc = nullptr;
- struct ggml_tensor * ssm_norm = nullptr;
- struct ggml_tensor * ssm_dt_norm = nullptr;
- struct ggml_tensor * ssm_b_norm = nullptr;
- struct ggml_tensor * ssm_c_norm = nullptr;
- // attention
- struct ggml_tensor * wq = nullptr;
- struct ggml_tensor * wk = nullptr;
- struct ggml_tensor * wv = nullptr;
- struct ggml_tensor * wo = nullptr;
- struct ggml_tensor * wqkv = nullptr;
- struct ggml_tensor * wq_a = nullptr;
- struct ggml_tensor * wq_b = nullptr;
- struct ggml_tensor * wkv_a_mqa = nullptr;
- struct ggml_tensor * wkv_b = nullptr;
- struct ggml_tensor * wk_b = nullptr;
- struct ggml_tensor * wv_b = nullptr;
- struct ggml_tensor * wq_cross = nullptr;
- struct ggml_tensor * wk_cross = nullptr;
- struct ggml_tensor * wv_cross = nullptr;
- struct ggml_tensor * wo_cross = nullptr;
- struct ggml_tensor * wq_enc = nullptr;
- struct ggml_tensor * wk_enc = nullptr;
- struct ggml_tensor * wv_enc = nullptr;
- struct ggml_tensor * wo_enc = nullptr;
- // attention bias
- struct ggml_tensor * bq = nullptr;
- struct ggml_tensor * bk = nullptr;
- struct ggml_tensor * bv = nullptr;
- struct ggml_tensor * bo = nullptr;
- struct ggml_tensor * bqkv = nullptr;
- // relative position bias
- struct ggml_tensor * attn_rel_b = nullptr;
- struct ggml_tensor * attn_rel_b_enc = nullptr;
- struct ggml_tensor * attn_rel_b_cross = nullptr;
- // normalization
- struct ggml_tensor * ffn_norm = nullptr;
- struct ggml_tensor * ffn_norm_b = nullptr;
- struct ggml_tensor * ffn_post_norm = nullptr;
- struct ggml_tensor * layer_out_norm = nullptr;
- struct ggml_tensor * layer_out_norm_b = nullptr;
- struct ggml_tensor * ffn_norm_exps = nullptr;
- struct ggml_tensor * ffn_norm_enc = nullptr;
- // ff
- struct ggml_tensor * ffn_gate = nullptr; // w1
- struct ggml_tensor * ffn_down = nullptr; // w2
- struct ggml_tensor * ffn_up = nullptr; // w3
- struct ggml_tensor * ffn_gate_enc = nullptr;
- struct ggml_tensor * ffn_down_enc = nullptr;
- struct ggml_tensor * ffn_up_enc = nullptr;
- // ff MoE
- struct ggml_tensor * ffn_gate_inp = nullptr;
- struct ggml_tensor * ffn_gate_exps = nullptr;
- struct ggml_tensor * ffn_down_exps = nullptr;
- struct ggml_tensor * ffn_up_exps = nullptr;
- // ff shared expert (shexp)
- struct ggml_tensor * ffn_gate_inp_shexp = nullptr;
- struct ggml_tensor * ffn_gate_shexp = nullptr;
- struct ggml_tensor * ffn_down_shexp = nullptr;
- struct ggml_tensor * ffn_up_shexp = nullptr;
- // ff bias
- struct ggml_tensor * ffn_gate_b = nullptr;
- struct ggml_tensor * ffn_down_b = nullptr; // b2
- struct ggml_tensor * ffn_up_b = nullptr; // b3
- struct ggml_tensor * ffn_act = nullptr;
- struct ggml_tensor * ffn_exp_probs_b = nullptr;
- // mamba proj
- struct ggml_tensor * ssm_in = nullptr;
- struct ggml_tensor * ssm_x = nullptr;
- struct ggml_tensor * ssm_dt = nullptr;
- struct ggml_tensor * ssm_out = nullptr;
- // mamba
- struct ggml_tensor * ssm_conv1d = nullptr;
- struct ggml_tensor * ssm_a = nullptr;
- struct ggml_tensor * ssm_d = nullptr;
- // mamba bias
- struct ggml_tensor * ssm_conv1d_b = nullptr;
- struct ggml_tensor * ssm_dt_b = nullptr;
- // rwkv
- struct ggml_tensor * time_mix_w1 = nullptr;
- struct ggml_tensor * time_mix_w2 = nullptr;
- struct ggml_tensor * time_mix_lerp_x = nullptr;
- struct ggml_tensor * time_mix_lerp_w = nullptr;
- struct ggml_tensor * time_mix_lerp_k = nullptr;
- struct ggml_tensor * time_mix_lerp_v = nullptr;
- struct ggml_tensor * time_mix_lerp_r = nullptr;
- struct ggml_tensor * time_mix_lerp_g = nullptr;
- struct ggml_tensor * time_mix_lerp_fused = nullptr;
- struct ggml_tensor * time_mix_first = nullptr;
- struct ggml_tensor * time_mix_decay = nullptr;
- struct ggml_tensor * time_mix_decay_w1 = nullptr;
- struct ggml_tensor * time_mix_decay_w2 = nullptr;
- struct ggml_tensor * time_mix_key = nullptr;
- struct ggml_tensor * time_mix_key_b = nullptr;
- struct ggml_tensor * time_mix_value = nullptr;
- struct ggml_tensor * time_mix_value_b = nullptr;
- struct ggml_tensor * time_mix_receptance = nullptr;
- struct ggml_tensor * time_mix_receptance_b = nullptr;
- struct ggml_tensor * time_mix_gate = nullptr;
- // rwkv7
- struct ggml_tensor * time_mix_w0 = nullptr;
- struct ggml_tensor * time_mix_a0 = nullptr;
- struct ggml_tensor * time_mix_a1 = nullptr;
- struct ggml_tensor * time_mix_a2 = nullptr;
- struct ggml_tensor * time_mix_v0 = nullptr;
- struct ggml_tensor * time_mix_v1 = nullptr;
- struct ggml_tensor * time_mix_v2 = nullptr;
- struct ggml_tensor * time_mix_g1 = nullptr;
- struct ggml_tensor * time_mix_g2 = nullptr;
- struct ggml_tensor * time_mix_k_k = nullptr;
- struct ggml_tensor * time_mix_k_a = nullptr;
- struct ggml_tensor * time_mix_r_k = nullptr;
- struct ggml_tensor * time_mix_ln = nullptr;
- struct ggml_tensor * time_mix_ln_b = nullptr;
- struct ggml_tensor * time_mix_output = nullptr;
- struct ggml_tensor * channel_mix_lerp_k = nullptr;
- struct ggml_tensor * channel_mix_lerp_r = nullptr;
- struct ggml_tensor * channel_mix_key = nullptr;
- struct ggml_tensor * channel_mix_receptance = nullptr;
- struct ggml_tensor * channel_mix_value = nullptr;
- // long rope factors
- struct ggml_tensor * rope_long = nullptr;
- struct ggml_tensor * rope_short = nullptr;
- struct ggml_tensor * rope_freqs = nullptr;
- // bitnet scale
- struct ggml_tensor * wq_scale = nullptr;
- struct ggml_tensor * wk_scale = nullptr;
- struct ggml_tensor * wv_scale = nullptr;
- struct ggml_tensor * wo_scale = nullptr;
- struct ggml_tensor * ffn_gate_scale = nullptr;
- struct ggml_tensor * ffn_up_scale = nullptr;
- struct ggml_tensor * ffn_down_scale = nullptr;
- // altup & laurel
- struct ggml_tensor * per_layer_inp_gate = nullptr;
- struct ggml_tensor * per_layer_proj = nullptr;
- struct ggml_tensor * per_layer_post_norm = nullptr;
- struct ggml_tensor * altup_correct_coef = nullptr;
- struct ggml_tensor * altup_correct_scale = nullptr;
- struct ggml_tensor * altup_predict_coef = nullptr;
- struct ggml_tensor * altup_router = nullptr;
- struct ggml_tensor * altup_router_norm = nullptr;
- struct ggml_tensor * laurel_l = nullptr;
- struct ggml_tensor * laurel_r = nullptr;
- struct ggml_tensor * laurel_post_norm = nullptr;
- struct llama_layer_posnet posnet;
- struct llama_layer_convnext convnext;
- struct llama_layer_shortconv shortconv;
- };
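- // Illustrative sketch (not part of the original header): most tensors above are
- // optional and stay nullptr when the architecture does not define them, so
- // graph-building code typically guards on the pointer before using a tensor.
- // For example, applying the attention output projection with an optional bias
- // through the ggml API:
- //
- //     ggml_tensor * cur = ggml_mul_mat(ctx, layer.wo, attn_out);
- //     if (layer.bo != nullptr) {
- //         cur = ggml_add(ctx, cur, layer.bo);
- //     }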
- struct llama_model {
- llm_type type = LLM_TYPE_UNKNOWN;
- llm_arch arch = LLM_ARCH_UNKNOWN;
- std::string name = "n/a";
- llama_hparams hparams = {};
- llama_vocab vocab;
- // for classifier models
- std::vector<std::string> classifier_labels;
- struct ggml_tensor * tok_embd = nullptr;
- struct ggml_tensor * type_embd = nullptr;
- struct ggml_tensor * pos_embd = nullptr;
- struct ggml_tensor * tok_norm = nullptr;
- struct ggml_tensor * tok_norm_b = nullptr;
- struct ggml_tensor * output_norm = nullptr;
- struct ggml_tensor * output_norm_b = nullptr;
- struct ggml_tensor * output = nullptr;
- struct ggml_tensor * output_b = nullptr;
- struct ggml_tensor * output_norm_enc = nullptr;
- // classifier
- struct ggml_tensor * cls = nullptr;
- struct ggml_tensor * cls_b = nullptr;
- struct ggml_tensor * cls_out = nullptr;
- struct ggml_tensor * cls_out_b = nullptr;
- struct ggml_tensor * conv1d = nullptr;
- struct ggml_tensor * conv1d_b = nullptr;
- // gemma3n altup
- struct ggml_tensor * tok_embd_per_layer = nullptr;
- struct ggml_tensor * altup_proj = nullptr;
- struct ggml_tensor * altup_unembd_proj = nullptr;
- struct ggml_tensor * per_layer_model_proj = nullptr;
- struct ggml_tensor * per_layer_proj_norm = nullptr;
- std::vector<llama_layer> layers;
- llama_model_params params;
- // gguf metadata
- std::unordered_map<std::string, std::string> gguf_kv;
- // list of devices used in this model
- std::vector<ggml_backend_dev_t> devices;
- // for quantize-stats only
- std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
- int64_t t_load_us = 0;
- int64_t t_start_us = 0;
- explicit llama_model(const struct llama_model_params & params);
- ~llama_model();
- void load_stats (llama_model_loader & ml);
- void load_arch (llama_model_loader & ml);
- void load_hparams(llama_model_loader & ml);
- void load_vocab (llama_model_loader & ml);
- bool load_tensors(llama_model_loader & ml); // returns false if cancelled by progress_callback
- std::string arch_name() const;
- std::string type_name() const;
- std::string desc() const;
- size_t size() const;
- size_t n_tensors() const;
- size_t n_devices() const;
- // total number of parameters in the model
- uint64_t n_elements() const;
- void print_info() const;
- ggml_backend_dev_t dev_layer(int il) const;
- ggml_backend_dev_t dev_output() const;
- ggml_backend_buffer_type_t select_buft(int il) const;
- bool has_tensor_overrides() const;
- const struct ggml_tensor * get_tensor(const char * name) const;
- float get_rope_freq_base (const llama_cparams & cparams, int il) const;
- float get_rope_freq_scale(const llama_cparams & cparams, int il) const;
- ggml_tensor * get_rope_factors(const llama_cparams & cparams, int il) const;
- // note: can mutate `cparams`
- // TODO: move this to new llm_arch_model_i interface
- llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const;
- // TODO: move this to new llm_arch_model_i interface
- llm_graph_result_ptr build_graph(
- const llm_graph_params & params,
- ggml_cgraph * gf,
- llm_graph_type type) const;
- private:
- struct impl;
- std::unique_ptr<impl> pimpl;
- };
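- // Illustrative load sequence (a sketch of one plausible call order, not a
- // prescribed contract): the load_* methods take a llama_model_loader, and
- // load_tensors() returns false when the progress callback cancels the load.
- // Assumes `mparams` is a llama_model_params and `ml` is a llama_model_loader
- // prepared by the caller:
- //
- //     llama_model model(mparams);
- //     model.load_arch   (ml);
- //     model.load_hparams(ml);
- //     model.load_vocab  (ml);
- //     model.load_stats  (ml);
- //     if (!model.load_tensors(ml)) {
- //         // cancelled by progress_callback
- //     }
- //     model.print_info();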
- const char * llm_type_name(llm_type type);
- // For internal test use
- // TODO: remove
- const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model);
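- // Illustrative sketch (testing/diagnostics only, as the comment above notes):
- // iterate the name -> tensor map, e.g. to report per-tensor element counts.
- // Assumes `model` is a loaded llama_model:
- //
- //     for (const auto & [name, t] : llama_internal_get_tensor_map(&model)) {
- //         const int64_t n = ggml_nelements(t);
- //         printf("%-48s %lld elements\n", name.c_str(), (long long) n);
- //     }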
|