%!s(int64=2) %!d(string=hai) anos · b925f1f1b0
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -355,8 +355,18 @@ cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src,
 
				 }
			
 
				 
			
 
				 void * ggml_cuda_host_malloc(size_t size) {
			
 
				-    void * ptr;
			
 
				-    CUDA_CHECK(cudaMallocHost((void **) &ptr, size));
			
 
				+    if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
			
 
				+        return nullptr;
			
 
				+    }
			
 
				+
			
 
				+    void * ptr = nullptr;
			
 
				+    cudaError_t err = cudaMallocHost((void **) &ptr, size);
			
 
				+    if (err != cudaSuccess) {
			
 
				+        fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
			
 
				+            size/1024.0/1024.0, cudaGetErrorString(err));
			
 
				+        return nullptr;
			
 
				+    }
			
 
				+
			
 
				     return ptr;
			
 
				 }
			
 
				 
			
--- a/llama-util.h
+++ b/llama-util.h
@@ -395,6 +395,8 @@ struct llama_buffer {
 
				     uint8_t * addr = NULL;
			
 
				     size_t size = 0;
			
 
				 
			
 
				+    llama_buffer() = default;
			
 
				+
			
 
				     void resize(size_t size) {
			
 
				         delete[] addr;
			
 
				         addr = new uint8_t[size];
			
@@ -404,27 +406,59 @@ struct llama_buffer {
 
				     ~llama_buffer() {
			
 
				         delete[] addr;
			
 
				     }
			
 
				+
			
 
				+    // disable copy and move
			
 
				+    llama_buffer(const llama_buffer&) = delete;
			
 
				+    llama_buffer(llama_buffer&&) = delete;
			
 
				+    llama_buffer& operator=(const llama_buffer&) = delete;
			
 
				+    llama_buffer& operator=(llama_buffer&&) = delete;
			
 
				 };
			
 
				 
			
 
				 #ifdef GGML_USE_CUBLAS
			
 
				 #include "ggml-cuda.h"
			
 
				 struct llama_ctx_buffer {
			
 
				     uint8_t * addr = NULL;
			
 
				+    bool is_cuda;
			
 
				     size_t size = 0;
			
 
				 
			
 
				+    llama_ctx_buffer() = default;
			
 
				+
			
 
				     void resize(size_t size) {
			
 
				+        free();
			
 
				+
			
 
				+        addr = (uint8_t *) ggml_cuda_host_malloc(size);
			
 
				         if (addr) {
			
 
				-            ggml_cuda_host_free(addr);
			
 
				+            is_cuda = true;
			
 
				+        }
			
 
				+        else {
			
 
				+            // fall back to pageable memory
			
 
				+            addr = new uint8_t[size];
			
 
				+            is_cuda = false;
			
 
				         }
			
 
				-        addr = (uint8_t *) ggml_cuda_host_malloc(size);
			
 
				         this->size = size;
			
 
				     }
			
 
				 
			
 
				-    ~llama_ctx_buffer() {
			
 
				+    void free() {
			
 
				         if (addr) {
			
 
				-            ggml_cuda_host_free(addr);
			
 
				+            if (is_cuda) {
			
 
				+                ggml_cuda_host_free(addr);
			
 
				+            }
			
 
				+            else {
			
 
				+                delete[] addr;
			
 
				+            }
			
 
				         }
			
 
				+        addr = NULL;
			
 
				     }
			
 
				+
			
 
				+    ~llama_ctx_buffer() {
			
 
				+        free();
			
 
				+    }
			
 
				+
			
 
				+    // disable copy and move
			
 
				+    llama_ctx_buffer(const llama_ctx_buffer&) = delete;
			
 
				+    llama_ctx_buffer(llama_ctx_buffer&&) = delete;
			
 
				+    llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
			
 
				+    llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
			
 
				 };
			
 
				 #else
			
 
				 typedef llama_buffer llama_ctx_buffer;
			
--- a/llama.cpp
+++ b/llama.cpp
@@ -727,8 +727,7 @@ struct llama_model_loader {
 
				             LLAMA_ASSERT(offset == lt.size);
			
 
				         } else if (lt.split_type == SPLIT_BY_COLUMNS) {
			
 
				             // Let's load the data into temporary buffers to ensure the OS performs large loads.
			
 
				-            std::vector<llama_buffer> tmp_bufs;
			
 
				-            tmp_bufs.resize(lt.shards.size());
			
 
				+            std::vector<llama_buffer> tmp_bufs(lt.shards.size());
			
 
				             for (size_t i = 0; i < lt.shards.size(); i++) {
			
 
				                 llama_load_tensor_shard & shard = lt.shards.at(i);
			
 
				                 llama_file & file = file_loaders.at(shard.file_idx)->file;