1 an în urmă · cb13ef85a4
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -20,7 +20,12 @@ else()
 
				     add_subdirectory(batched)
			
 
				     add_subdirectory(embedding)
			
 
				     add_subdirectory(eval-callback)
			
 
				-    add_subdirectory(gbnf-validator)
			
 
				+
			
 
				+    if (NOT WIN32)
			
 
				+        # disabled on Windows because it uses internal functions not exported with LLAMA_API
			
 
				+        add_subdirectory(gbnf-validator)
			
 
				+    endif()
			
 
				+
			
 
				     add_subdirectory(gguf-hash)
			
 
				     add_subdirectory(gguf-split)
			
 
				     add_subdirectory(gguf)
			
@@ -51,7 +56,10 @@ else()
 
				         add_subdirectory(convert-llama2c-to-ggml)
			
 
				         add_subdirectory(cvector-generator)
			
 
				         add_subdirectory(export-lora)
			
 
				-        add_subdirectory(quantize-stats)
			
 
				+        if (NOT WIN32)
			
 
				+            # disabled on Windows because it uses internal functions not exported with LLAMA_API
			
 
				+            add_subdirectory(quantize-stats)
			
 
				+        endif()
			
 
				         add_subdirectory(llava)
			
 
				         if (GGML_RPC)
			
 
				             add_subdirectory(rpc)
			
--- a/examples/gguf-split/gguf-split.cpp
+++ b/examples/gguf-split/gguf-split.cpp
@@ -287,7 +287,7 @@ struct split_strategy {
 
				     }
			
 
				 
			
 
				     void print_info() {
			
 
				-        printf("n_split: %ld\n", ctx_outs.size());
			
 
				+        printf("n_split: %zu\n", ctx_outs.size());
			
 
				         int i_split = 0;
			
 
				         for (auto & ctx_out : ctx_outs) {
			
 
				             // re-calculate the real gguf size for each split (= metadata size + total size of all tensors)
			
@@ -297,7 +297,7 @@ struct split_strategy {
 
				                 total_size += ggml_nbytes(t);
			
 
				             }
			
 
				             total_size = total_size / 1000 / 1000; // convert to megabytes
			
 
				-            printf("split %05d: n_tensors = %d, total_size = %ldM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
			
 
				+            printf("split %05d: n_tensors = %d, total_size = %zuM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
			
 
				             i_split++;
			
 
				         }
			
 
				     }
			
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -1521,7 +1521,7 @@ int main(int argc, char ** argv) {
 
				     for (const auto & inst : params_instances) {
			
 
				         params_idx++;
			
 
				         if (params.progress) {
			
 
				-            fprintf(stderr, "llama-bench: benchmark %d/%ld: starting\n", params_idx, params_count);
			
 
				+            fprintf(stderr, "llama-bench: benchmark %d/%zu: starting\n", params_idx, params_count);
			
 
				         }
			
 
				         // keep the same model between tests when possible
			
 
				         if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
			
@@ -1573,14 +1573,14 @@ int main(int argc, char ** argv) {
 
				         // warmup run
			
 
				         if (t.n_prompt > 0) {
			
 
				             if (params.progress) {
			
 
				-                fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup prompt run\n", params_idx, params_count);
			
 
				+                fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup prompt run\n", params_idx, params_count);
			
 
				             }
			
 
				             //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
			
 
				             test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
			
 
				         }
			
 
				         if (t.n_gen > 0) {
			
 
				             if (params.progress) {
			
 
				-                fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup generation run\n", params_idx, params_count);
			
 
				+                fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup generation run\n", params_idx, params_count);
			
 
				             }
			
 
				             test_gen(ctx, 1, t.n_threads);
			
 
				         }
			
@@ -1592,14 +1592,14 @@ int main(int argc, char ** argv) {
 
				 
			
 
				             if (t.n_prompt > 0) {
			
 
				                 if (params.progress) {
			
 
				-                    fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count,
			
 
				+                    fprintf(stderr, "llama-bench: benchmark %d/%zu: prompt run %d/%d\n", params_idx, params_count,
			
 
				                             i + 1, params.reps);
			
 
				                 }
			
 
				                 test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
			
 
				             }
			
 
				             if (t.n_gen > 0) {
			
 
				                 if (params.progress) {
			
 
				-                    fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count,
			
 
				+                    fprintf(stderr, "llama-bench: benchmark %d/%zu: generation run %d/%d\n", params_idx, params_count,
			
 
				                             i + 1, params.reps);
			
 
				                 }
			
 
				                 test_gen(ctx, t.n_gen, t.n_threads);
			
--- a/examples/retrieval/retrieval.cpp
+++ b/examples/retrieval/retrieval.cpp
@@ -143,7 +143,7 @@ int main(int argc, char ** argv) {
 
				         std::vector<chunk> file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator);
			
 
				         chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end());
			
 
				     }
			
 
				-    LOG_INF("Number of chunks: %ld\n", chunks.size());
			
 
				+    LOG_INF("Number of chunks: %zu\n", chunks.size());
			
 
				 
			
 
				     llama_backend_init();
			
 
				     llama_numa_init(params.numa);
			
--- a/examples/tokenize/tokenize.cpp
+++ b/examples/tokenize/tokenize.cpp
@@ -394,7 +394,7 @@ int main(int raw_argc, char ** raw_argv) {
 
				     }
			
 
				 
			
 
				     if (show_token_count) {
			
 
				-        printf("Total number of tokens: %ld\n", tokens.size());
			
 
				+        printf("Total number of tokens: %zu\n", tokens.size());
			
 
				     }
			
 
				     // silence valgrind
			
 
				     llama_free(ctx);
			
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -32,6 +32,13 @@ else()
 
				     endif()
			
 
				 endif()
			
 
				 
			
 
				+# remove the "lib" prefix on Windows (MinGW toolchains add it by default)
			
 
				+if (WIN32)
			
 
				+    set(CMAKE_STATIC_LIBRARY_PREFIX "")
			
 
				+    set(CMAKE_SHARED_LIBRARY_PREFIX "")
			
 
				+    set(CMAKE_SHARED_MODULE_PREFIX  "")
			
 
				+endif()
			
 
				+
			
 
				 option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
			
 
				 option(GGML_BACKEND_DL   "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)
			
 
				 
			
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -194,11 +194,6 @@ endif()
 
				 
			
 
				 if (WIN32)
			
 
				     add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
			
 
				-
			
 
				-    if (BUILD_SHARED_LIBS)
			
 
				-        # TODO: should not use this
			
 
				-        set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
			
 
				-    endif()
			
 
				 endif()
			
 
				 
			
 
				 # ggml
			
--- a/ggml/src/ggml-cpu/amx/amx.cpp
+++ b/ggml/src/ggml-cpu/amx/amx.cpp
@@ -122,7 +122,7 @@ static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_ty
 
				 }
			
 
				 
			
 
				 static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
			
 
				-    void * data = aligned_alloc(TENSOR_ALIGNMENT, size);
			
 
				+    void * data = ggml_aligned_malloc(size);
			
 
				     if (data == NULL) {
			
 
				         fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
			
 
				         return NULL;
			
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -126,8 +126,7 @@ struct ggml_arm_arch_features_type {
 
				 #endif
			
 
				 #include <windows.h>
			
 
				 
			
 
				-
			
 
				-#if !defined(__clang__)
			
 
				+#if defined(_MSC_VER) && !defined(__clang__)
			
 
				 #define GGML_CACHE_ALIGN __declspec(align(GGML_CACHE_LINE))
			
 
				 
			
 
				 typedef volatile LONG atomic_int;
			
@@ -12945,7 +12944,7 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data);
 
				 #include "windows.h"
			
 
				 
			
 
				 // TODO: support > 64 CPUs
			
 
				-bool ggml_thread_apply_affinity(bool * mask) {
			
 
				+static bool ggml_thread_apply_affinity(bool * mask) {
			
 
				     HANDLE    h = GetCurrentThread();
			
 
				     uint64_t  bitmask = 0ULL;
			
 
				 
			
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -74,8 +74,8 @@ static inline int ggml_up(int n, int m) {
 
				 //
			
 
				 
			
 
				 GGML_ATTRIBUTE_FORMAT(2, 3)
			
 
				-void ggml_log_internal        (enum ggml_log_level level, const char * format, ...);
			
 
				-void ggml_log_callback_default(enum ggml_log_level level, const char * text, void * user_data);
			
 
				+GGML_API void ggml_log_internal        (enum ggml_log_level level, const char * format, ...);
			
 
				+GGML_API void ggml_log_callback_default(enum ggml_log_level level, const char * text, void * user_data);
			
 
				 
			
 
				 #define GGML_LOG(...)       ggml_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__)
			
 
				 #define GGML_LOG_INFO(...)  ggml_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
			
@@ -304,8 +304,8 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
 
				 
			
 
				 // Memory allocation
			
 
				 
			
 
				-void * ggml_aligned_malloc(size_t size);
			
 
				-void ggml_aligned_free(void * ptr, size_t size);
			
 
				+GGML_API void * ggml_aligned_malloc(size_t size);
			
 
				+GGML_API void ggml_aligned_free(void * ptr, size_t size);
			
 
				 
			
 
				 // FP16 to FP32 conversion
			
 
				 
			
--- a/ggml/src/ggml-threading.h
+++ b/ggml/src/ggml-threading.h
@@ -1,11 +1,13 @@
 
				 #pragma once
			
 
				 
			
 
				+#include "ggml.h"
			
 
				+
			
 
				 #ifdef __cplusplus
			
 
				 extern "C" {
			
 
				 #endif
			
 
				 
			
 
				-void ggml_critical_section_start(void);
			
 
				-void ggml_critical_section_end(void);
			
 
				+GGML_API void ggml_critical_section_start(void);
			
 
				+GGML_API void ggml_critical_section_end(void);
			
 
				 
			
 
				 #ifdef __cplusplus
			
 
				 }
			
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,10 +1,3 @@
 
				-# TODO: should not use this
			
 
				-if (WIN32)
			
 
				-    if (BUILD_SHARED_LIBS)
			
 
				-        set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
			
 
				-    endif()
			
 
				-endif()
			
 
				-
			
 
				 llama_add_compile_flags()
			
 
				 
			
 
				 #
			
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -1794,7 +1794,7 @@ private:
 
				         DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
			
 
				                                     NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL);
			
 
				         if (!bufLen) {
			
 
				-            ret = format("Win32 error code: %s", error_code);
			
 
				+            ret = format("Win32 error code: %lx", error_code);
			
 
				         } else {
			
 
				             ret = lpMsgBuf;
			
 
				             LocalFree(lpMsgBuf);
			
@@ -2132,7 +2132,7 @@ struct llama_mmap {
 
				             HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");
			
 
				 
			
 
				             // may fail on pre-Windows 8 systems
			
 
				-            pPrefetchVirtualMemory = reinterpret_cast<decltype(pPrefetchVirtualMemory)> (GetProcAddress(hKernel32, "PrefetchVirtualMemory"));
			
 
				+            pPrefetchVirtualMemory = (decltype(pPrefetchVirtualMemory))(void *) GetProcAddress(hKernel32, "PrefetchVirtualMemory");
			
 
				 
			
 
				             if (pPrefetchVirtualMemory) {
			
 
				                 // advise the kernel to preload the mapped memory
			
@@ -21577,7 +21577,7 @@ float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
 
				                 throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
			
 
				             }
			
 
				         } else if ((size_t) i >= ctx->output_ids.size()) {
			
 
				-            throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
			
 
				+            throw std::runtime_error(format("out of range [0, %zu)", ctx->output_ids.size()));
			
 
				         } else {
			
 
				             j = ctx->output_ids[i];
			
 
				         }
			
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -84,38 +84,50 @@ llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2             ARGS ${CMAKE
 
				 llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact            ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
			
 
				 llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder         ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
			
 
				 
			
 
				-# build test-tokenizer-1-bpe target once and add many tests
			
 
				-add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp)
			
 
				-target_link_libraries(test-tokenizer-1-bpe PRIVATE common)
			
 
				-install(TARGETS test-tokenizer-1-bpe RUNTIME)
			
 
				-
			
 
				-# TODO: disabled due to slowness
			
 
				-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila    ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
			
 
				-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon    ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
			
 
				-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-2     ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
			
 
				-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-neox  ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
			
 
				-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf --ignore-merges)
			
 
				-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt       ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
			
 
				-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-refact    ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
			
 
				-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
			
 
				-
			
 
				-# build test-tokenizer-1-spm target once and add many tests
			
 
				-add_executable(test-tokenizer-1-spm test-tokenizer-1-spm.cpp)
			
 
				-target_link_libraries(test-tokenizer-1-spm PRIVATE common)
			
 
				-install(TARGETS test-tokenizer-1-spm RUNTIME)
			
 
				-
			
 
				-llama_test(test-tokenizer-1-spm  NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
			
 
				-#llama_test(test-tokenizer-1-spm  NAME test-tokenizer-1-baichuan  ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
			
 
				-
			
 
				-# llama_target_and_test(test-double-float.cpp) # SLOW
			
 
				+
			
 
				+if (NOT WIN32)
			
 
				+    # these tests are disabled on Windows because they use internal functions not exported with LLAMA_API
			
 
				+    llama_target_and_test(test-sampling.cpp)
			
 
				+    llama_target_and_test(test-grammar-parser.cpp)
			
 
				+    llama_target_and_test(test-grammar-integration.cpp)
			
 
				+    llama_target_and_test(test-llama-grammar.cpp)
			
 
				+    # TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
			
 
				+    # the variable is referenced by name (no ${}) so if() dereferences it itself;
			
 
				+    # an empty CMAKE_SYSTEM_PROCESSOR can then no longer turn the condition into a syntax error
			
 
				+    if (NOT CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64")
			
 
				+        llama_target_and_test(test-json-schema-to-grammar.cpp   WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
			
 
				+        target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../examples/server)
			
 
				+    endif()
			
 
				+
			
 
				+
			
 
				+    # build test-tokenizer-1-bpe target once and add many tests
			
 
				+    add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp)
			
 
				+    target_link_libraries(test-tokenizer-1-bpe PRIVATE common)
			
 
				+    install(TARGETS test-tokenizer-1-bpe RUNTIME)
			
 
				+
			
 
				+    # TODO: disabled due to slowness
			
 
				+    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila    ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
			
 
				+    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon    ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
			
 
				+    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-2     ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
			
 
				+    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-neox  ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
			
 
				+    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf --ignore-merges)
			
 
				+    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt       ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
			
 
				+    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-refact    ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
			
 
				+    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
			
 
				+
			
 
				+    # build test-tokenizer-1-spm target once and add many tests
			
 
				+    add_executable(test-tokenizer-1-spm test-tokenizer-1-spm.cpp)
			
 
				+    target_link_libraries(test-tokenizer-1-spm PRIVATE common)
			
 
				+    install(TARGETS test-tokenizer-1-spm RUNTIME)
			
 
				+
			
 
				+    llama_test(test-tokenizer-1-spm  NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
			
 
				+    #llama_test(test-tokenizer-1-spm  NAME test-tokenizer-1-baichuan  ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
			
 
				+
			
 
				+    # llama_target_and_test(test-double-float.cpp) # SLOW
			
 
				+endif()
			
 
				+
			
 
				 llama_target_and_test(test-log.cpp)
			
 
				 llama_target_and_test(test-arg-parser.cpp)
			
 
				-llama_target_and_test(test-sampling.cpp)
			
 
				 llama_target_and_test(test-chat-template.cpp)
			
 
				 
			
 
				-llama_target_and_test(test-grammar-parser.cpp)
			
 
				-llama_target_and_test(test-grammar-integration.cpp)
			
 
				-llama_target_and_test(test-llama-grammar.cpp)
			
 
				 # llama_target_and_test(test-opt.cpp) # SLOW
			
 
				 llama_target_and_test(test-backend-ops.cpp)
			
 
				 
			
@@ -130,11 +142,6 @@ if (NOT GGML_BACKEND_DL)
 
				     llama_target_and_test(test-rope.cpp)
			
 
				 endif()
			
 
				 
			
 
				-# TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
			
 
				-if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
			
 
				-    llama_target_and_test(test-json-schema-to-grammar.cpp   WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
			
 
				-    target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../examples/server)
			
 
				-endif()
			
 
				 
			
 
				 # dummy executable - not installed
			
 
				 get_filename_component(TEST_TARGET test-c.c NAME_WE)