Compare commits

..

1 commit

0e75c0036f  llama-cpp: add gemma 4 graph fix  (2026-04-07 22:59:59 -04:00)
All checks were successful: Build and Deploy / deploy (push), 6m42s

3 changed files with 43 additions and 41 deletions

flake.lock (generated, 36 lines changed)

@@ -304,11 +304,11 @@
         "rust-overlay": "rust-overlay"
       },
       "locked": {
-        "lastModified": 1775754862,
-        "narHash": "sha256-8y9cz8+cyeA7KtA7+Q3bXjyFJV5nM38Fc0E4qPw7WDk=",
+        "lastModified": 1775510693,
+        "narHash": "sha256-gZfJ07j/oOciDi8mF/V8QTm7YCeDcusNSMZzBFi8OUM=",
         "owner": "nix-community",
         "repo": "lanzaboote",
-        "rev": "bea51aaee00688794a877f308007590a6cc8e378",
+        "rev": "3fe0ae8cb285e0ad101a9675f4190d455fb05e85",
         "type": "github"
       },
       "original": {
@@ -325,11 +325,11 @@
         ]
       },
       "locked": {
-        "lastModified": 1775754125,
-        "narHash": "sha256-4udYhEvii0xPmRiKXYWLhPakPDd1mJppnEFY6uWdv8s=",
+        "lastModified": 1775614184,
+        "narHash": "sha256-OYwr36LLVIeEqccN1mJ2k6vCsFocboCQJnbtne415Ig=",
         "owner": "TheTom",
         "repo": "llama-cpp-turboquant",
-        "rev": "8590cbff961dbaf1d3a9793fd11d402e248869b9",
+        "rev": "eea498c42716519e58baf2d9600d2e2b41839255",
         "type": "github"
       },
       "original": {
@@ -368,11 +368,11 @@
         "systems": "systems_3"
       },
       "locked": {
-        "lastModified": 1775752089,
-        "narHash": "sha256-+psXqZ1SvQw7L8HgCQINmob9zLnvK433b2k080lBPH0=",
+        "lastModified": 1775531897,
+        "narHash": "sha256-3NIpnV1HxBCwi00iMvj9KcqXkM0VNA72KABj8g0cFFs=",
         "owner": "Infinidoge",
         "repo": "nix-minecraft",
-        "rev": "1beacd3bdadabfac884dedd56176966c141214d8",
+        "rev": "8c7693880cb861e60adeab5480f02dc3e7a390f6",
         "type": "github"
       },
       "original": {
@@ -399,11 +399,11 @@
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1775595990,
-        "narHash": "sha256-OEf7YqhF9IjJFYZJyuhAypgU+VsRB5lD4DuiMws5Ltc=",
+        "lastModified": 1775305101,
+        "narHash": "sha256-/74n1oQPtKG52Yw41cbToxspxHbYz6O3vi+XEw16Qe8=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "4e92bbcdb030f3b4782be4751dc08e6b6cb6ccf2",
+        "rev": "36a601196c4ebf49e035270e10b2d103fe39076b",
         "type": "github"
       },
       "original": {
@@ -624,11 +624,11 @@
         ]
       },
       "locked": {
-        "lastModified": 1775701952,
-        "narHash": "sha256-xj9u8fz2hTTTELMorqox0hPWrmAvGRnQUEnlj+vCjFo=",
+        "lastModified": 1775444042,
+        "narHash": "sha256-cg19ipIlZaLYgs/5ZPFcDDuOcZlGzfprB5xS4x7bVM4=",
         "owner": "nix-community",
         "repo": "srvos",
-        "rev": "f56f1053ae9f878501d3a8ae1961c73d1d7abce3",
+        "rev": "64c9cc6a274dac7d08c4d53494ffa4acf906e287",
         "type": "github"
       },
       "original": {
@@ -715,11 +715,11 @@
     "trackerlist": {
      "flake": false,
       "locked": {
-        "lastModified": 1775686189,
-        "narHash": "sha256-kzEDJKptaVToSg/wpub0bLjAVRmkYOorjPsNqlpxWdU=",
+        "lastModified": 1775599784,
+        "narHash": "sha256-ZapxbiFEYjJV2nhdowHQ/8+c8Jd5fpBIEKDiPEmyNgI=",
         "owner": "ngosang",
         "repo": "trackerslist",
-        "rev": "ce9c0afc3885d0592caa91f0d4359f315ef7428c",
+        "rev": "6cc71b5b65349081bb713719f5142c200438a327",
         "type": "github"
       },
       "original": {

patches/llamacpp/0004-gemma4-graph-fix.patch (new file, 24 lines)

@@ -0,0 +1,24 @@
From b934a8ca49f9e764fa21d45ff2ce1168a3a7c914 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 6 Apr 2026 11:50:22 +0300
Subject: [PATCH] models : set gemma 4 FFN MoE prec to F32

---
 src/llama-graph.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 0e7d96ca10d..aa8a35721fa 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -1185,8 +1185,8 @@ ggml_tensor * llm_graph_context::build_ffn(
     if (down) {
         cur = build_lora_mm(down, cur);
-        if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_JAIS2) {
-            // GLM4, GLM4_MOE, and JAIS2 seem to have numerical issues with half-precision accumulators
+        if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_JAIS2 || arch == LLM_ARCH_GEMMA4) {
+            // certain models seem to have numerical issues with half-precision accumulators
             ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
         }
     }
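For reference, ggml_mul_mat_set_prec marks a single graph node so the backend accumulates that matmul in F32 instead of half precision. A minimal standalone sketch of the same API call outside llama.cpp; the shapes and context size are arbitrary illustration values, not taken from the patch:

// Minimal sketch, not code from this repo: forcing F32 accumulation on one
// ggml matmul node, the same call the patch applies to GEMMA4's FFN.
#include "ggml.h"
#include <stddef.h>

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ (size_t) 256*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // F16 down-projection weight (4096 -> 4096) applied to an F32 activation.
    struct ggml_tensor * down = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 4096, 4096);
    struct ggml_tensor * cur  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4096);

    cur = ggml_mul_mat(ctx, down, cur);

    // Backends may accumulate F16 x F32 matmuls in half precision by default;
    // GGML_PREC_F32 trades some speed for numerical headroom on this node only.
    ggml_mul_mat_set_prec(cur, GGML_PREC_F32);

    ggml_free(ctx);
    return 0;
}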

llama-cpp service module (Nix; filename not shown)

@@ -27,6 +27,7 @@ in
       inputs.llamacpp.packages.${pkgs.system}.vulkan.overrideAttrs (old: {
         patches = (old.patches or [ ]) ++ [
           ../patches/llamacpp/0003-gemma4-tokenizer-fix.patch
+          ../patches/llamacpp/0004-gemma4-graph-fix.patch
         ];
       })
     );
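The hunk above appends the new 0004 patch to the derivation's existing patch list (or an empty list if unset) via overrideAttrs, so the flake's vulkan llama.cpp package is rebuilt with both gemma 4 fixes applied.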
@@ -50,40 +51,17 @@ in
       "4096"
       "-ub"
       "4096"
-      "--parallel"
-      "2"
     ];
   };
   # have to do this in order to get vulkan to work
   systemd.services.llama-cpp.serviceConfig.DynamicUser = lib.mkForce false;
-  # ANV driver's turbo3 shader compilation exceeds the default 8 MB thread stack.
-  systemd.services.llama-cpp.serviceConfig.LimitSTACK = lib.mkForce "67108864"; # 64 MB soft+hard
   # llama-server tries to create ~/.cache; ProtectSystem=strict + impermanent
   # root make /root read-only. Give it a writable cache dir and point HOME there.
   systemd.services.llama-cpp.serviceConfig.CacheDirectory = "llama-cpp";
   systemd.services.llama-cpp.environment.HOME = "/var/cache/llama-cpp";
-  # turbo3 KV cache quantization runs a 14-barrier WHT butterfly per 128-element
-  # workgroup in SET_ROWS. With 4 concurrent slots and batch=4096, the combined
-  # GPU dispatch can exceed the default i915 CCS engine preempt timeout (7.5s),
-  # causing GPU HANG -> ErrorDeviceLost. Increase compute engine timeouts.
-  # Note: batch<4096 is not viable -- GDN chunked mode needs a larger compute
-  # buffer at smaller batch sizes, exceeding the A380's 6 GB VRAM.
-  # '+' prefix runs as root regardless of service User=.
-  systemd.services.llama-cpp.serviceConfig.ExecStartPre = [
-    "+${pkgs.writeShellScript "set-gpu-compute-timeout" ''
-      for f in /sys/class/drm/card*/engine/ccs*/preempt_timeout_ms; do
-        [ -w "$f" ] && echo 30000 > "$f"
-      done
-      for f in /sys/class/drm/card*/engine/ccs*/heartbeat_interval_ms; do
-        [ -w "$f" ] && echo 10000 > "$f"
-      done
-    ''}"
-  ];
   # upstream module hardcodes --log-disable; override ExecStart to keep logs
   # so we can see prompt processing progress via journalctl
   systemd.services.llama-cpp.serviceConfig.ExecStart = lib.mkForce (