From 12469de58019ba06a6d8fdf84e8c6ab8e3774443 Mon Sep 17 00:00:00 2001 From: Simon Gardling Date: Sat, 11 Apr 2026 10:27:38 -0400 Subject: [PATCH] llama.cpp: things --- .../llamacpp/0003-gemma4-tokenizer-fix.patch | 88 ------------------- services/llama-cpp.nix | 5 +- 2 files changed, 2 insertions(+), 91 deletions(-) delete mode 100644 patches/llamacpp/0003-gemma4-tokenizer-fix.patch diff --git a/patches/llamacpp/0003-gemma4-tokenizer-fix.patch b/patches/llamacpp/0003-gemma4-tokenizer-fix.patch deleted file mode 100644 index e01692a..0000000 --- a/patches/llamacpp/0003-gemma4-tokenizer-fix.patch +++ /dev/null @@ -1,88 +0,0 @@ -From 320c29c2dbe3c8df56374a9ec19a7fe5c124d4f8 Mon Sep 17 00:00:00 2001 -From: Piotr Wilkin -Date: Tue, 7 Apr 2026 00:54:00 +0200 -Subject: [PATCH 1/2] YATF (Yet Another Tokenizer Fix) for Gemma 4. With tests! - ---- - convert_hf_to_gguf_update.py | 1 + - models/ggml-vocab-gemma-4.gguf | Bin 0 -> 15776467 bytes - models/ggml-vocab-gemma-4.gguf.inp | 111 +++++++++++++++++++++++++++++ - models/ggml-vocab-gemma-4.gguf.out | 46 ++++++++++++ - src/llama-vocab.cpp | 13 +++- - tests/CMakeLists.txt | 1 + - 6 files changed, 170 insertions(+), 2 deletions(-) - create mode 100644 models/ggml-vocab-gemma-4.gguf - create mode 100644 models/ggml-vocab-gemma-4.gguf.inp - create mode 100644 models/ggml-vocab-gemma-4.gguf.out - -diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py -index 086f1c22863..f1d70d62e73 100755 ---- a/convert_hf_to_gguf_update.py -+++ b/convert_hf_to_gguf_update.py -@@ -114,6 +114,7 @@ class TOKENIZER_TYPE(IntEnum): - {"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B - {"name": "gemma", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2b", }, - {"name": "gemma-2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", }, -+ {"name": "gemma-4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/google/gemma-4-E2B-it", }, - {"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", }, - {"name": "jais-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inceptionai/Jais-2-8B-Chat", }, - {"name": "t5", "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", }, -diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp -index de9a9466bc7..e9e276ab999 100644 ---- a/src/llama-vocab.cpp -+++ b/src/llama-vocab.cpp -@@ -658,9 +658,18 @@ struct llm_tokenizer_bpe_session { - const auto token = vocab.text_to_token(str); - - if (token == LLAMA_TOKEN_NULL) { -+ static const char * hex = "0123456789ABCDEF"; - for (auto j = str.begin(); j != str.end(); ++j) { -- std::string byte_str(1, *j); -- auto token_multibyte = vocab.text_to_token(byte_str); -+ llama_token token_multibyte = LLAMA_TOKEN_NULL; -+ if (tokenizer.byte_encode) { -+ std::string byte_str(1, *j); -+ token_multibyte = vocab.text_to_token(byte_str); -+ } else { -+ // For non-byte-encoded BPE (e.g. gemma-4), byte tokens use <0xXX> format -+ const uint8_t ch = (uint8_t)*j; -+ const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 }; -+ token_multibyte = vocab.text_to_token(buf); -+ } - if (token_multibyte != LLAMA_TOKEN_NULL) { - output.push_back(token_multibyte); - } -diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt -index 5e87c8b34e1..cd4bc5ef1d3 100644 ---- a/tests/CMakeLists.txt -+++ b/tests/CMakeLists.txt -@@ -124,6 +124,7 @@ llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r ARGS ${PROJE - llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-coder ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-deepseek-coder.gguf) - llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-llm ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-deepseek-llm.gguf) - llama_test(test-tokenizer-0 NAME test-tokenizer-0-falcon ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-falcon.gguf) -+llama_test(test-tokenizer-0 NAME test-tokenizer-0-gemma-4 ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-gemma-4.gguf) - llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2 ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-gpt-2.gguf) - llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-bpe ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-llama-bpe.gguf) - llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-spm ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-llama-spm.gguf) - -From 0e98596dec124c6968132ef042c21ccdb20d1304 Mon Sep 17 00:00:00 2001 -From: Piotr Wilkin -Date: Tue, 7 Apr 2026 00:58:08 +0200 -Subject: [PATCH 2/2] Remove unnecessary hash from update script. - ---- - convert_hf_to_gguf_update.py | 1 - - 1 file changed, 1 deletion(-) - -diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py -index f1d70d62e73..086f1c22863 100755 ---- a/convert_hf_to_gguf_update.py -+++ b/convert_hf_to_gguf_update.py -@@ -114,7 +114,6 @@ class TOKENIZER_TYPE(IntEnum): - {"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B - {"name": "gemma", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2b", }, - {"name": "gemma-2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", }, -- {"name": "gemma-4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/google/gemma-4-E2B-it", }, - {"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", }, - {"name": "jais-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inceptionai/Jais-2-8B-Chat", }, - {"name": "t5", "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", }, diff --git a/services/llama-cpp.nix b/services/llama-cpp.nix index 78f931f..d9f7f76 100644 --- a/services/llama-cpp.nix +++ b/services/llama-cpp.nix @@ -9,7 +9,7 @@ }: let cfg = config.services.llama-cpp; - modelUrl = "https://huggingface.co/bartowski/google_gemma-4-E2B-it-GGUF/resolve/main/google_gemma-4-E2B-it-Q4_K_M.gguf"; + modelUrl = "https://huggingface.co/bartowski/google_gemma-4-E2B-it-GGUF/resolve/main/google_gemma-4-E2B-it-IQ2_M.gguf"; modelAlias = lib.removeSuffix ".gguf" (baseNameOf modelUrl); in { @@ -25,7 +25,7 @@ in model = toString ( pkgs.fetchurl { url = modelUrl; - sha256 = "5efe645db4e1909c7a1f4a9608df18e6c14383f5e86777fc49f769f9ba7d5fdf"; + sha256 = "17e869ac54d0e59faa884d5319fc55ad84cd866f50f0b3073fbb25accc875a23"; } ); port = service_configs.ports.private.llama_cpp.port; @@ -33,7 +33,6 @@ in package = lib.optimizePackage ( inputs.llamacpp.packages.${pkgs.system}.vulkan.overrideAttrs (old: { patches = (old.patches or [ ]) ++ [ - ../patches/llamacpp/0003-gemma4-tokenizer-fix.patch ]; }) );