// llama-cparams.h — inference-time context parameters for llama.cpp
  1. #pragma once
  2. #include "llama.h"
  3. #include <cstdint>
  4. #define LLAMA_MAX_SEQ 256
  5. struct llama_cparams {
  6. uint32_t n_ctx; // context size used during inference
  7. uint32_t n_ctx_seq; // context for a single sequence
  8. uint32_t n_batch;
  9. uint32_t n_ubatch;
  10. uint32_t n_seq_max;
  11. int32_t n_threads; // number of threads to use for generation
  12. int32_t n_threads_batch; // number of threads to use for batch processing
  13. float rope_freq_base;
  14. float rope_freq_scale;
  15. uint32_t n_ctx_orig_yarn;
  16. // These hyperparameters are not exposed in GGUF, because all
  17. // existing YaRN models use the same values for them.
  18. float yarn_ext_factor;
  19. float yarn_attn_factor;
  20. float yarn_beta_fast;
  21. float yarn_beta_slow;
  22. bool embeddings;
  23. bool causal_attn;
  24. bool offload_kqv;
  25. bool flash_attn;
  26. bool auto_fa;
  27. bool no_perf;
  28. bool warmup;
  29. bool op_offload;
  30. bool kv_unified;
  31. bool pipeline_parallel;
  32. enum llama_pooling_type pooling_type;
  33. ggml_backend_sched_eval_callback cb_eval;
  34. void * cb_eval_user_data;
  35. };