llama-cpp: add gemma 4 graph fix

2026-04-07 22:59:59 -04:00
2 changed files with 25 additions and 0 deletions
--- a/patches/llamacpp/0004-gemma4-graph-fix.patch
+++ b/patches/llamacpp/0004-gemma4-graph-fix.patch
@@ -0,0 +1,24 @@
+From b934a8ca49f9e764fa21d45ff2ce1168a3a7c914 Mon Sep 17 00:00:00 2001
+From: Georgi Gerganov <ggerganov@gmail.com>
+Date: Mon, 6 Apr 2026 11:50:22 +0300
+Subject: [PATCH] models : set gemma 4 FFN MoE prec to F32
+
+---
+ src/llama-graph.cpp | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
+index 0e7d96ca10d..aa8a35721fa 100644
+--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
+@@ -1185,8 +1185,8 @@ ggml_tensor * llm_graph_context::build_ffn(
+ 
+     if (down) {
+         cur = build_lora_mm(down, cur);
+-        if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_JAIS2) {
+-            // GLM4, GLM4_MOE, and JAIS2 seem to have numerical issues with half-precision accumulators
++        if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_JAIS2 || arch == LLM_ARCH_GEMMA4) {
++            // certain models seem to have numerical issues with half-precision accumulators
+             ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
+         }
+     }
--- a/services/llama-cpp.nix
+++ b/services/llama-cpp.nix
@@ -27,6 +27,7 @@ in
      inputs.llamacpp.packages.${pkgs.system}.vulkan.overrideAttrs (old: {
        patches = (old.patches or [ ]) ++ [
          ../patches/llamacpp/0003-gemma4-tokenizer-fix.patch
+          ../patches/llamacpp/0004-gemma4-graph-fix.patch
        ];
      })
    );