@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-

 # This script downloads the tokenizer models of the specified models from Huggingface and
-# generates the get_vocab_base_pre() function for convert-hf-to-gguf.py
+# generates the get_vocab_base_pre() function for convert_hf_to_gguf.py
 #
 # This is necessary in order to analyze the type of pre-tokenizer used by the model and
 # provide the necessary information to llama.cpp via the GGUF header in order to implement
@@ -15,9 +15,9 @@
 # - Add a new model to the "models" list
 # - Run the script with your huggingface token:
 #
-# python3 convert-hf-to-gguf-update.py <huggingface_token>
+# python3 convert_hf_to_gguf_update.py <huggingface_token>
 #
-# - Copy-paste the generated get_vocab_base_pre() function into convert-hf-to-gguf.py
+# - Copy-paste the generated get_vocab_base_pre() function into convert_hf_to_gguf.py
 # - Update llama.cpp with the new pre-tokenizer if necessary
 #
 # TODO: generate tokenizer tests for llama.cpp
@@ -37,7 +37,7 @@ from enum import IntEnum, auto
 from transformers import AutoTokenizer

 logging.basicConfig(level=logging.DEBUG)
-logger = logging.getLogger("convert-hf-to-gguf-update")
+logger = logging.getLogger("convert_hf_to_gguf_update")
 sess = requests.Session()


@@ -56,10 +56,10 @@ if len(sys.argv) == 2:
     token = sys.argv[1]
     if not token.startswith("hf_"):
         logger.info("Huggingface token seems invalid")
-        logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
+        logger.info("Usage: python convert_hf_to_gguf_update.py <huggingface_token>")
         sys.exit(1)
 else:
-    logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
+    logger.info("Usage: python convert_hf_to_gguf_update.py <huggingface_token>")
     sys.exit(1)

 # TODO: add models here, base models preferred
@@ -134,7 +134,7 @@ for model in models:
         logger.error(f"Failed to download model {model['name']}. Error: {e}")


-# generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function:
+# generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:

 src_ifs = ""
 for model in models:
@@ -201,7 +201,7 @@ src_func = f"""

         res = None

-        # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script
+        # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script
         # or pull the latest version of the model from Huggingface
         # don't edit the hashes manually!
 {src_ifs}
@@ -210,9 +210,9 @@ src_func = f"""
             logger.warning("**************************************************************************************")
             logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
             logger.warning("** There are 2 possible reasons for this:")
-            logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet")
+            logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet")
             logger.warning("** - the pre-tokenization config has changed upstream")
-            logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
+            logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
             logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
             logger.warning("**")
             logger.warning(f"** chkhsh: {{chkhsh}}")
@@ -226,7 +226,7 @@ src_func = f"""
         return res
 """

-convert_py_pth = pathlib.Path("convert-hf-to-gguf.py")
+convert_py_pth = pathlib.Path("convert_hf_to_gguf.py")
 convert_py = convert_py_pth.read_text(encoding="utf-8")
 convert_py = re.sub(
     r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
@@ -237,7 +237,7 @@ convert_py = re.sub(

 convert_py_pth.write_text(convert_py, encoding="utf-8")

-logger.info("+++ convert-hf-to-gguf.py was updated")
+logger.info("+++ convert_hf_to_gguf.py was updated")

 # generate tests for each tokenizer model

@@ -343,6 +343,6 @@ logger.info("\nRun the following commands to generate the vocab files for testin
 for model in models:
     name = model["name"]

-    print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only")  # noqa: NP100
+    print(f"python3 convert_hf_to_gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only")  # noqa: NP100

 logger.info("\n")
|
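The patching step shown in the re.sub hunk relies on marker comments inside convert_hf_to_gguf.py: the pattern captures the start and end markers and swaps out everything between them for the newly generated function, leaving the rest of the file untouched. A self-contained sketch of the same technique follows; the file contents and generated body are made up for illustration.

import re

# stand-in for the contents of convert_hf_to_gguf.py
convert_py = """\
class Model:
    # Marker: Start get_vocab_base_pre
    def get_vocab_base_pre(self, tokenizer) -> str:
        return "old body"
    # Marker: End get_vocab_base_pre
"""

# stand-in for the generated source (src_func in the script)
src_func = """
    def get_vocab_base_pre(self, tokenizer) -> str:
        return "new body"
"""

# keep the marker lines (groups 1 and 3) and replace everything between
# them; re.DOTALL is needed so (.+?) can match across newlines, and the
# lambda replacement avoids backslash-escaping issues in src_func
convert_py = re.sub(
    r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
    lambda m: m.group(1) + src_func + m.group(3),
    convert_py,
    flags=re.DOTALL,
)

print(convert_py)

Using markers this way makes the update idempotent: rerunning the script simply replaces the generated region again, so get_vocab_base_pre() can be regenerated whenever a new model is added to the "models" list.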