hace 2 años · c918fe8dca
--- a/.gitignore
+++ b/.gitignore
@@ -105,3 +105,4 @@ poetry.toml
 
				 /tests/test-tokenizer-1-bpe
			
 
				 /tests/test-rope
			
 
				 /tests/test-backend-ops
			
 
				+/tests/test-autorelease
			
--- a/Makefile
+++ b/Makefile
@@ -9,7 +9,7 @@ TEST_TARGETS = \
 
				 	tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
			
 
				 	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama          \
			
 
				 	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope      \
			
 
				-	tests/test-backend-ops
			
 
				+	tests/test-backend-ops tests/test-autorelease
			
 
				 
			
 
				 # Code coverage output files
			
 
				 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
			
@@ -747,3 +747,6 @@ tests/test-c.o: tests/test-c.c llama.h
 
				 
			
 
				 tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS)
			
 
				 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
			
 
				+
			
 
				+tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
			
 
				+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
			
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -179,6 +179,8 @@ function gg_run_open_llama_3b_v2 {
 
				 
			
 
				     wiki_test_60="${path_wiki}/wiki.test-60.raw"
			
 
				 
			
 
				+    ./bin/test-autorelease ${model_f16}
			
 
				+
			
 
				     ./bin/quantize ${model_f16} ${model_q8_0} q8_0
			
 
				     ./bin/quantize ${model_f16} ${model_q4_0} q4_0
			
 
				     ./bin/quantize ${model_f16} ${model_q4_1} q4_1
			
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -303,22 +303,25 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
                 return NULL;
             }
 
-            // dictionary of preprocessor macros
-            NSMutableDictionary * prep = [NSMutableDictionary dictionary];
+            @autoreleasepool {
+                // dictionary of preprocessor macros
+                NSMutableDictionary * prep = [NSMutableDictionary dictionary];
 
 #ifdef GGML_QKK_64
-            prep[@"QK_K"] = @(64);
+                prep[@"QK_K"] = @(64);
 #endif
 
-            MTLCompileOptions* options = [MTLCompileOptions new];
-            options.preprocessorMacros = prep;
+                MTLCompileOptions* options = [MTLCompileOptions new];
+                options.preprocessorMacros = prep;
 
-            //[options setFastMathEnabled:false];
+                //[options setFastMathEnabled:false];
 
-            ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
-
-            [options release];
-            [prep release];
+                ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
+
+                // `options` comes from +new, so it is owned here and must be released explicitly;
+                // `prep` comes from +dictionary and is autoreleased, so the enclosing pool drains it
+                [options release];
+            }
         }
 
         if (error) {
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -49,6 +49,7 @@ llama_build_and_test_executable(test-llama-grammar.cpp)
 
				 llama_build_and_test_executable(test-grad0.cpp)
			
 
				 # llama_build_and_test_executable(test-opt.cpp) # SLOW
			
 
				 llama_build_and_test_executable(test-backend-ops.cpp)
			
 
				+llama_build_and_test_executable(test-autorelease.cpp)
			
 
				 
			
 
				 llama_build_and_test_executable(test-rope.cpp)
			
 
				 
			
--- a/tests/test-autorelease.cpp
+++ b/tests/test-autorelease.cpp
@@ -0,0 +1,46 @@
+// ref: https://github.com/ggerganov/llama.cpp/issues/4952#issuecomment-1892864763
+
+#include <cstdio>
+#include <string>
+#include <thread>
+
+#include "llama.h"
+
+// Creates a model and a context on a secondary thread, frees them on that same thread,
+// and then returns from main so that a clean process exit is exercised (see the ref above).
+// Exit code: 0 on success or when no model path is given (the test is then a no-op),
+// 1 when the model or the context could not be created.
+int main(int argc, char ** argv) {
+    if (argc < 2) {
+        printf("Usage: %s model.gguf\n", argv[0]);
+        return 0; // intentionally return success
+    }
+
+    const std::string fname = argv[1];
+
+    // written by the worker thread, read only after join()
+    bool ok = false;
+
+    std::thread([&fname, &ok]() {
+        llama_backend_init(false);
+
+        auto * model = llama_load_model_from_file(fname.c_str(), llama_model_default_params());
+        if (model == nullptr) {
+            fprintf(stderr, "failed to load model '%s'\n", fname.c_str());
+        } else {
+            // never hand a null model to llama_new_context_with_model
+            auto * ctx = llama_new_context_with_model(model, llama_context_default_params());
+            if (ctx == nullptr) {
+                fprintf(stderr, "failed to create context for model '%s'\n", fname.c_str());
+            } else {
+                ok = true;
+                llama_free(ctx);
+            }
+            llama_free_model(model);
+        }
+
+        llama_backend_free();
+    }).join();
+
+    return ok ? 0 : 1;
+}