diff --git a/patches/llamacpp/0004-gemma4-graph-fix.patch b/patches/llamacpp/0004-gemma4-graph-fix.patch new file mode 100644 index 0000000..089efc3 --- /dev/null +++ b/patches/llamacpp/0004-gemma4-graph-fix.patch @@ -0,0 +1,24 @@ +From b934a8ca49f9e764fa21d45ff2ce1168a3a7c914 Mon Sep 17 00:00:00 2001 +From: Georgi Gerganov +Date: Mon, 6 Apr 2026 11:50:22 +0300 +Subject: [PATCH] models : set gemma 4 FFN MoE prec to F32 + +--- + src/llama-graph.cpp | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp +index 0e7d96ca10d..aa8a35721fa 100644 +--- a/src/llama-graph.cpp ++++ b/src/llama-graph.cpp +@@ -1185,8 +1185,8 @@ ggml_tensor * llm_graph_context::build_ffn( + + if (down) { + cur = build_lora_mm(down, cur); +- if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_JAIS2) { +- // GLM4, GLM4_MOE, and JAIS2 seem to have numerical issues with half-precision accumulators ++ if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_JAIS2 || arch == LLM_ARCH_GEMMA4) { ++ // certain models seem to have numerical issues with half-precision accumulators + ggml_mul_mat_set_prec(cur, GGML_PREC_F32); + } + } diff --git a/services/llama-cpp.nix b/services/llama-cpp.nix index 25015c8..93fdaaf 100644 --- a/services/llama-cpp.nix +++ b/services/llama-cpp.nix @@ -27,6 +27,7 @@ in inputs.llamacpp.packages.${pkgs.system}.vulkan.overrideAttrs (old: { patches = (old.patches or [ ]) ++ [ ../patches/llamacpp/0003-gemma4-tokenizer-fix.patch + ../patches/llamacpp/0004-gemma4-graph-fix.patch ]; }) );