2 år sedan · 2af23d3043
--- a/.devops/full.Dockerfile
+++ b/.devops/full.Dockerfile
@@ -0,0 +1,17 @@
 
				+ARG UBUNTU_VERSION=22.04
			
 
				+
			
 
				+FROM ubuntu:$UBUNTU_VERSION AS build
			
 
				+
			
 
				+# Build toolchain for `make`, plus Python for the scripts that tools.sh runs.
			
 
				+# The apt lists are removed in the same layer so they never reach the image.
			
 
				+RUN apt-get update && \
			
 
				+    apt-get install -y build-essential python3 python3-pip && \
			
 
				+    rm -rf /var/lib/apt/lists/*
			
 
				+
			
 
				+# requests and tqdm are imported by download-pth.py; install them explicitly
			
 
				+# instead of relying on them arriving as transitive dependencies.
			
 
				+RUN pip install --no-cache-dir --upgrade pip setuptools wheel \
			
 
				+    && pip install --no-cache-dir torch torchvision torchaudio sentencepiece numpy requests tqdm
			
 
				+
			
 
				+WORKDIR /app
			
 
				+
			
 
				+COPY . .
			
 
				+
			
 
				+RUN make
			
 
				+
			
 
				+# tools.sh dispatches on its first argument (--run, --convert, --quantize, ...).
			
 
				+ENTRYPOINT ["/app/.devops/tools.sh"]
			
--- a/.devops/main.Dockerfile
+++ b/.devops/main.Dockerfile
@@ -0,0 +1,18 @@
 
				+ARG UBUNTU_VERSION=22.04
			
 
				+
			
 
				+FROM ubuntu:$UBUNTU_VERSION AS build
			
 
				+
			
 
				+RUN apt-get update && \
			
 
				+    apt-get install -y build-essential
			
 
				+
			
 
				+WORKDIR /app
			
 
				+
			
 
				+COPY . .
			
 
				+
			
 
				+RUN make
			
 
				+
			
 
				+FROM ubuntu:$UBUNTU_VERSION AS runtime
			
 
				+
			
 
				+COPY --from=build /app/main /main
			
 
				+
			
 
				+ENTRYPOINT [ "/main" ]
			
--- a/.devops/tools.sh
+++ b/.devops/tools.sh
@@ -0,0 +1,46 @@
 
				+#!/bin/bash
			
 
				+set -e
			
 
				+
			
 
				+# Read the first argument into a variable
			
 
				+arg1="$1"
			
 
				+
			
 
				+# Shift the arguments to remove the first one
			
 
				+shift
			
 
				+
			
 
				+# Join the remaining arguments into a single string
			
 
				+arg2="$@"
			
 
				+
			
 
				+if [[ $arg1 == '--convert' || $arg1 == '-c' ]]; then
			
 
				+    python3 ./convert-pth-to-ggml.py $arg2
			
 
				+elif [[ $arg1 == '--quantize' || $arg1 == '-q' ]]; then
			
 
				+    ./quantize $arg2
			
 
				+elif [[ $arg1 == '--run' || $arg1 == '-r' ]]; then
			
 
				+    ./main $arg2
			
 
				+elif [[ $arg1 == '--download' || $arg1 == '-d' ]]; then
			
 
				+    python3 ./download-pth.py $arg2
			
 
				+elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then
			
 
				+    echo "Downloading model..."
			
 
				+    python3 ./download-pth.py "$1" "$2"
			
 
				+    echo "Converting PTH to GGML..."
			
 
				+    for i in `ls $1/$2/ggml-model-f16.bin*`; do
			
 
				+        if [ -f "${i/f16/q4_0}" ]; then
			
 
				+            echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
			
 
				+        else
			
 
				+            echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
			
 
				+            ./quantize "$i" "${i/f16/q4_0}" 2
			
 
				+        fi
			
 
				+    done
			
 
				+else
			
 
				+    echo "Unknown command: $arg1"
			
 
				+    echo "Available commands: "
			
 
				+    echo "  --run (-r): Run a model previously converted into ggml"
			
 
				+    echo "              ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -t 8 -n 512"
			
 
				+    echo "  --convert (-c): Convert a llama model into ggml"
			
 
				+    echo "              ex: \"/models/7B/\" 1"
			
 
				+    echo "  --quantize (-q): Optimize with quantization process ggml"
			
 
				+    echo "              ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
			
 
				+    echo "  --download (-d): Download original llama model from CDN: https://agi.gpt4.org/llama/"
			
 
				+    echo "              ex: \"/models/\" 7B"
			
 
				+    echo "  --all-in-one (-a): Execute --download, --convert & --quantize"
			
 
				+    echo "              ex: \"/models/\" 7B"
			
 
				+fi
			
--- a/.dockerignore
+++ b/.dockerignore
@@ -0,0 +1,24 @@
 
				+*.o
			
 
				+*.a
			
 
				+.cache/
			
 
				+.vs/
			
 
				+.vscode/
			
 
				+.DS_Store
			
 
				+
			
 
				+build/
			
 
				+build-em/
			
 
				+build-debug/
			
 
				+build-release/
			
 
				+build-static/
			
 
				+build-no-accel/
			
 
				+build-sanitize-addr/
			
 
				+build-sanitize-thread/
			
 
				+
			
 
				+models/*
			
 
				+
			
 
				+/main
			
 
				+/quantize
			
 
				+
			
 
				+arm_neon.h
			
 
				+compile_commands.json
			
 
				+Dockerfile
			
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -19,7 +19,7 @@ jobs:
 
				           make
			
 
				 
			
 
				   macOS-latest:
			
 
				-    runs-on: macOS-latest
			
 
				+    runs-on: macos-latest
			
 
				 
			
 
				     steps:
			
 
				       - name: Clone
			
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -0,0 +1,61 @@
 
				+# This workflow uses actions that are not certified by GitHub.
			
 
				+# They are provided by a third-party and are governed by
			
 
				+# separate terms of service, privacy policy, and support
			
 
				+# documentation.
			
 
				+
			
 
				+# GitHub recommends pinning actions to a commit SHA.
			
 
				+# To get a newer version, you will need to update the SHA.
			
 
				+# You can also reference a tag or branch, but the action may change without warning.
			
 
				+
			
 
				+name: Publish Docker image
			
 
				+
			
 
				+on:
			
 
				+  pull_request:
			
 
				+  push:
			
 
				+    branches:
			
 
				+      - master
			
 
				+
			
 
				+jobs:
			
 
				+  push_to_registry:
			
 
				+    name: Push Docker image to GitHub Container Registry
			
 
				+    runs-on: ubuntu-latest
			
 
				+    # The images are pushed to ghcr.io with GITHUB_TOKEN, which needs package write access;
			
 
				+    # contents: read is what the checkout step requires.
			
 
				+    permissions:
			
 
				+      contents: read
			
 
				+      packages: write
			
 
				+    env:
			
 
				+      COMMIT_SHA: ${{ github.sha }}
			
 
				+    strategy:
			
 
				+      matrix:
			
 
				+        config:
			
 
				+          - { tag: "light", dockerfile: ".devops/main.Dockerfile" }
			
 
				+          - { tag: "full", dockerfile: ".devops/full.Dockerfile" }
			
 
				+    steps:
			
 
				+      - name: Check out the repo
			
 
				+        uses: actions/checkout@v3
			
 
				+
			
 
				+      - name: Set up QEMU
			
 
				+        uses: docker/setup-qemu-action@v2
			
 
				+
			
 
				+      - name: Set up Docker Buildx
			
 
				+        uses: docker/setup-buildx-action@v2
			
 
				+
			
 
				+      # Authenticates against ghcr.io (not Docker Hub) using the workflow's GITHUB_TOKEN
			
 
				+      - name: Log in to GitHub Container Registry
			
 
				+        uses: docker/login-action@v2
			
 
				+        with:
			
 
				+          registry: ghcr.io
			
 
				+          username: ${{ github.actor }}
			
 
				+          password: ${{ secrets.GITHUB_TOKEN }}
			
 
				+
			
 
				+      - name: Build and push Docker image (versioned)
			
 
				+        if: github.event_name == 'push'
			
 
				+        uses: docker/build-push-action@v4
			
 
				+        with:
			
 
				+          context: .
			
 
				+          push: true
			
 
				+          tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
			
 
				+          file: ${{ matrix.config.dockerfile }}
			
 
				+
			
 
				+      - name: Build and push Docker image (tagged)
			
 
				+        uses: docker/build-push-action@v4
			
 
				+        with:
			
 
				+          context: .
			
 
				+          push: ${{ github.event_name == 'push' }}
			
 
				+          tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}"
			
 
				+          file: ${{ matrix.config.dockerfile }}
			
--- a/README.md
+++ b/README.md
@@ -32,6 +32,7 @@ Supported platforms:
 
				 - [X] Mac OS
			
 
				 - [X] Linux
			
 
				 - [X] Windows (via CMake)
			
 
				+- [X] Docker
			
 
				 
			
 
				 ---
			
 
				 
			
@@ -194,6 +195,37 @@ Finally, copy the `llama` binary and the model files to your device storage. Her
 
				 
			
 
				 https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4
			
 
				 
			
 
				+### Docker
			
 
				+
			
 
				+#### Prerequisites
			
 
				+* Docker must be installed and running on your system.
			
 
				+* Create a folder to store the large model files and intermediate files (the examples below use `/llama/models`).
			
 
				+
			
 
				+#### Images
			
 
				+We have two Docker images available for this project:
			
 
				+
			
 
				+1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and quantize them to 4 bits.
			
 
				+2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file.
			
 
				+
			
 
				+#### Usage
			
 
				+
			
 
				+The easiest way to download the models, convert them to ggml and optimize them is the `--all-in-one` command, which is included in the full Docker image.
			
 
				+
			
 
				+ ```bash
			
 
				+docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
			
 
				+```
			
 
				+
			
 
				+Once it completes, you are ready to play!
			
 
				+
			
 
				+```bash
			
 
				+docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512
			
 
				+```
			
 
				+
			
 
				+Or with the light image:
			
 
				+
			
 
				+```bash
			
 
				+docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512
			
 
				+```
			
 
				 
			
 
				 ## Limitations
			
 
				 
			
--- a/convert-pth-to-ggml.py
+++ b/convert-pth-to-ggml.py
@@ -16,7 +16,7 @@
 
				 # At the start of the ggml file we write the model parameters
			
 
				 # and vocabulary.
			
 
				 #
			
 
				-
			
 
				+import os
			
 
				 import sys
			
 
				 import json
			
 
				 import struct
			
@@ -64,6 +64,10 @@ if len(sys.argv) > 2:
 
				         sys.exit(1)
			
 
				     fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"
			
 
				 
			
 
				+if os.path.exists(fname_out):
			
 
				+    print(f"Skip conversion, it already exists: {fname_out}")
			
 
				+    sys.exit(0)
			
 
				+
			
 
				 with open(fname_hparams, "r") as f:
			
 
				     hparams = json.load(f)
			
 
				 
			
--- a/download-pth.py
+++ b/download-pth.py
@@ -0,0 +1,66 @@
 
				+import os
			
 
				+import sys
			
 
				+from tqdm import tqdm
			
 
				+import requests
			
 
				+
			
 
				+if len(sys.argv) < 3:
			
 
				+    print("Usage: download-pth.py dir-model model-type\n")
			
 
				+    print("  model-type: Available models 7B, 13B, 30B or 65B")
			
 
				+    sys.exit(1)
			
 
				+
			
 
				+modelsDir = sys.argv[1]
			
 
				+model = sys.argv[2]
			
 
				+
			
 
				+num = {
			
 
				+    "7B": 1,
			
 
				+    "13B": 2,
			
 
				+    "30B": 4,
			
 
				+    "65B": 8,
			
 
				+}
			
 
				+
			
 
				+if model not in num:
			
 
				+    print(f"Error: model {model} is not valid, provide 7B, 13B, 30B or 65B")
			
 
				+    sys.exit(1)
			
 
				+
			
 
				+print(f"Downloading model {model}")
			
 
				+
			
 
				+files = ["checklist.chk", "params.json"]
			
 
				+
			
 
				+for i in range(num[model]):
			
 
				+    files.append(f"consolidated.0{i}.pth")
			
 
				+
			
 
				+resolved_path = os.path.abspath(os.path.join(modelsDir, model))
			
 
				+os.makedirs(resolved_path, exist_ok=True)
			
 
				+
			
 
				+for file in files:
			
 
				+    dest_path = os.path.join(resolved_path, file)
			
 
				+    
			
 
				+    if os.path.exists(dest_path):
			
 
				+        print(f"Skip file download, it already exists: {file}")
			
 
				+        continue
			
 
				+
			
 
				+    url = f"https://agi.gpt4.org/llama/LLaMA/{model}/{file}"
			
 
				+    response = requests.get(url, stream=True)
			
 
				+    with open(dest_path, 'wb') as f:
			
 
				+        with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t:
			
 
				+            for chunk in response.iter_content(chunk_size=1024):
			
 
				+                if chunk:
			
 
				+                    f.write(chunk)
			
 
				+                    t.update(len(chunk))
			
 
				+
			
 
				+files2 = ["tokenizer_checklist.chk", "tokenizer.model"]
			
 
				+for file in files2:
			
 
				+    dest_path = os.path.join(modelsDir, file)
			
 
				+    
			
 
				+    if os.path.exists(dest_path):
			
 
				+        print(f"Skip file download, it already exists: {file}")
			
 
				+        continue
			
 
				+    
			
 
				+    url = f"https://agi.gpt4.org/llama/LLaMA/{file}"
			
 
				+    response = requests.get(url, stream=True)
			
 
				+    with open(dest_path, 'wb') as f:
			
 
				+        with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t:
			
 
				+            for chunk in response.iter_content(chunk_size=1024):
			
 
				+                if chunk:
			
 
				+                    f.write(chunk)
			
 
				+                    t.update(len(chunk))