@@ -145,44 +145,16 @@ python3 -m pip install torch numpy sentencepiece
python3 convert-pth-to-ggml.py models/7B/ 1

# quantize the model to 4-bits
-./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin 2
+./quantize.sh 7B

# run the inference
./main -m ./models/7B/ggml-model-q4_0.bin -t 8 -n 128
```

-For the bigger models, there are a few extra quantization steps. For example, for LLaMA-13B, converting to FP16 format
-will create 2 ggml files, instead of one:
-
-```bash
-ggml-model-f16.bin
-ggml-model-f16.bin.1
-```
-
-You need to quantize each of them separately like this:
-
-```bash
-./quantize ./models/13B/ggml-model-f16.bin ./models/13B/ggml-model-q4_0.bin 2
-./quantize ./models/13B/ggml-model-f16.bin.1 ./models/13B/ggml-model-q4_0.bin.1 2
-```
-
-Everything else is the same. Simply run:
-
-```bash
-./main -m ./models/13B/ggml-model-q4_0.bin -t 8 -n 128
-```
-
-The number of files generated for each model is as follows:
-
-```
-7B -> 1 file
-13B -> 2 files
-30B -> 4 files
-65B -> 8 files
-```
-
When running the larger models, make sure you have enough disk space to store all the intermediate files.

+TODO: add model disk/mem requirements
+
### Interactive mode

If you want a more ChatGPT-like experience, you can run in interactive mode by passing `-i` as a parameter.
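The contents of the new `quantize.sh` helper are not shown in this hunk. A minimal sketch of what such a wrapper could look like, assuming it simply loops over every FP16 part of the chosen model and calls `./quantize` with the same q4_0 type (`2`) used in the per-file commands removed above:

```bash
#!/usr/bin/env bash
# Hypothetical wrapper (not necessarily the actual script from this change):
# quantize every FP16 part of the given model size (ggml-model-f16.bin,
# ggml-model-f16.bin.1, ...) so multi-part models such as 13B no longer need
# one ./quantize invocation per file.
set -e

MODEL="$1"   # e.g. 7B, 13B, 30B, 65B

for f in ./models/"$MODEL"/ggml-model-f16.bin*; do
    # ./models/13B/ggml-model-f16.bin.1 -> ./models/13B/ggml-model-q4_0.bin.1
    ./quantize "$f" "${f/f16/q4_0}" 2
done
```

With a wrapper like this, `./quantize.sh 7B` and `./quantize.sh 13B` both produce the q4_0 files that `./main` expects, regardless of how many parts the FP16 conversion produced.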