Compare commits

..

1 commit

0e75c0036f  llama-cpp: add gemma 4 graph fix  (2026-04-07 22:59:59 -04:00)
All checks were successful: Build and Deploy / deploy (push), 6m42s

3 changed files with 43 additions and 41 deletions

flake.lock (generated, 36 lines changed)

@@ -304,11 +304,11 @@
         "rust-overlay": "rust-overlay"
       },
       "locked": {
-        "lastModified": 1775754862,
-        "narHash": "sha256-8y9cz8+cyeA7KtA7+Q3bXjyFJV5nM38Fc0E4qPw7WDk=",
+        "lastModified": 1775510693,
+        "narHash": "sha256-gZfJ07j/oOciDi8mF/V8QTm7YCeDcusNSMZzBFi8OUM=",
         "owner": "nix-community",
         "repo": "lanzaboote",
-        "rev": "bea51aaee00688794a877f308007590a6cc8e378",
+        "rev": "3fe0ae8cb285e0ad101a9675f4190d455fb05e85",
         "type": "github"
       },
       "original": {
@@ -325,11 +325,11 @@
         ]
       },
       "locked": {
-        "lastModified": 1775754125,
-        "narHash": "sha256-4udYhEvii0xPmRiKXYWLhPakPDd1mJppnEFY6uWdv8s=",
+        "lastModified": 1775614184,
+        "narHash": "sha256-OYwr36LLVIeEqccN1mJ2k6vCsFocboCQJnbtne415Ig=",
         "owner": "TheTom",
         "repo": "llama-cpp-turboquant",
-        "rev": "8590cbff961dbaf1d3a9793fd11d402e248869b9",
+        "rev": "eea498c42716519e58baf2d9600d2e2b41839255",
         "type": "github"
       },
       "original": {
@@ -368,11 +368,11 @@
         "systems": "systems_3"
       },
       "locked": {
-        "lastModified": 1775752089,
-        "narHash": "sha256-+psXqZ1SvQw7L8HgCQINmob9zLnvK433b2k080lBPH0=",
+        "lastModified": 1775531897,
+        "narHash": "sha256-3NIpnV1HxBCwi00iMvj9KcqXkM0VNA72KABj8g0cFFs=",
         "owner": "Infinidoge",
         "repo": "nix-minecraft",
-        "rev": "1beacd3bdadabfac884dedd56176966c141214d8",
+        "rev": "8c7693880cb861e60adeab5480f02dc3e7a390f6",
         "type": "github"
       },
       "original": {
@@ -399,11 +399,11 @@
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1775595990,
-        "narHash": "sha256-OEf7YqhF9IjJFYZJyuhAypgU+VsRB5lD4DuiMws5Ltc=",
+        "lastModified": 1775305101,
+        "narHash": "sha256-/74n1oQPtKG52Yw41cbToxspxHbYz6O3vi+XEw16Qe8=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "4e92bbcdb030f3b4782be4751dc08e6b6cb6ccf2",
+        "rev": "36a601196c4ebf49e035270e10b2d103fe39076b",
         "type": "github"
       },
       "original": {
@@ -624,11 +624,11 @@
         ]
       },
       "locked": {
-        "lastModified": 1775701952,
-        "narHash": "sha256-xj9u8fz2hTTTELMorqox0hPWrmAvGRnQUEnlj+vCjFo=",
+        "lastModified": 1775444042,
+        "narHash": "sha256-cg19ipIlZaLYgs/5ZPFcDDuOcZlGzfprB5xS4x7bVM4=",
         "owner": "nix-community",
         "repo": "srvos",
-        "rev": "f56f1053ae9f878501d3a8ae1961c73d1d7abce3",
+        "rev": "64c9cc6a274dac7d08c4d53494ffa4acf906e287",
         "type": "github"
       },
       "original": {
@@ -715,11 +715,11 @@
     "trackerlist": {
      "flake": false,
       "locked": {
-        "lastModified": 1775686189,
-        "narHash": "sha256-kzEDJKptaVToSg/wpub0bLjAVRmkYOorjPsNqlpxWdU=",
+        "lastModified": 1775599784,
+        "narHash": "sha256-ZapxbiFEYjJV2nhdowHQ/8+c8Jd5fpBIEKDiPEmyNgI=",
         "owner": "ngosang",
         "repo": "trackerslist",
-        "rev": "ce9c0afc3885d0592caa91f0d4359f315ef7428c",
+        "rev": "6cc71b5b65349081bb713719f5142c200438a327",
         "type": "github"
       },
       "original": {

patches/llamacpp/0004-gemma4-graph-fix.patch (new file, 24 lines)

@@ -0,0 +1,24 @@
From b934a8ca49f9e764fa21d45ff2ce1168a3a7c914 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 6 Apr 2026 11:50:22 +0300
Subject: [PATCH] models : set gemma 4 FFN MoE prec to F32

---
 src/llama-graph.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 0e7d96ca10d..aa8a35721fa 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -1185,8 +1185,8 @@ ggml_tensor * llm_graph_context::build_ffn(
     if (down) {
         cur = build_lora_mm(down, cur);
-        if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_JAIS2) {
-            // GLM4, GLM4_MOE, and JAIS2 seem to have numerical issues with half-precision accumulators
+        if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_JAIS2 || arch == LLM_ARCH_GEMMA4) {
+            // certain models seem to have numerical issues with half-precision accumulators
             ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
         }
     }
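For reference, ggml_mul_mat_set_prec marks a single graph node so the backend accumulates that matmul in F32 instead of half precision. A minimal standalone sketch of the same API call outside llama.cpp; the shapes and context size are arbitrary illustration values, not taken from the patch:

// Minimal sketch, not code from this repo: forcing F32 accumulation on one
// ggml matmul node, the same call the patch applies to GEMMA4's FFN.
#include "ggml.h"
#include <stddef.h>

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ (size_t) 256*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // F16 down-projection weight (4096 -> 4096) applied to an F32 activation.
    struct ggml_tensor * down = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 4096, 4096);
    struct ggml_tensor * cur  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4096);

    cur = ggml_mul_mat(ctx, down, cur);

    // Backends may accumulate F16 x F32 matmuls in half precision by default;
    // GGML_PREC_F32 trades some speed for numerical headroom on this node only.
    ggml_mul_mat_set_prec(cur, GGML_PREC_F32);

    ggml_free(ctx);
    return 0;
}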

llama-cpp service module (Nix; filename not shown)

@@ -27,6 +27,7 @@ in
       inputs.llamacpp.packages.${pkgs.system}.vulkan.overrideAttrs (old: {
         patches = (old.patches or [ ]) ++ [
           ../patches/llamacpp/0003-gemma4-tokenizer-fix.patch
+          ../patches/llamacpp/0004-gemma4-graph-fix.patch
         ];
       })
     );
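The hunk above appends the new 0004 patch to the derivation's existing patch list (or an empty list if unset) via overrideAttrs, so the flake's vulkan llama.cpp package is rebuilt with both gemma 4 fixes applied.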
@@ -50,40 +51,17 @@ in
       "4096"
       "-ub"
       "4096"
-      "--parallel"
-      "2"
     ];
   };
   # have to do this in order to get vulkan to work
   systemd.services.llama-cpp.serviceConfig.DynamicUser = lib.mkForce false;
-  # ANV driver's turbo3 shader compilation exceeds the default 8 MB thread stack.
-  systemd.services.llama-cpp.serviceConfig.LimitSTACK = lib.mkForce "67108864"; # 64 MB soft+hard
   # llama-server tries to create ~/.cache; ProtectSystem=strict + impermanent
   # root make /root read-only. Give it a writable cache dir and point HOME there.
   systemd.services.llama-cpp.serviceConfig.CacheDirectory = "llama-cpp";
   systemd.services.llama-cpp.environment.HOME = "/var/cache/llama-cpp";
-  # turbo3 KV cache quantization runs a 14-barrier WHT butterfly per 128-element
-  # workgroup in SET_ROWS. With 4 concurrent slots and batch=4096, the combined
-  # GPU dispatch can exceed the default i915 CCS engine preempt timeout (7.5s),
-  # causing GPU HANG -> ErrorDeviceLost. Increase compute engine timeouts.
-  # Note: batch<4096 is not viable -- GDN chunked mode needs a larger compute
-  # buffer at smaller batch sizes, exceeding the A380's 6 GB VRAM.
-  # '+' prefix runs as root regardless of service User=.
-  systemd.services.llama-cpp.serviceConfig.ExecStartPre = [
-    "+${pkgs.writeShellScript "set-gpu-compute-timeout" ''
-      for f in /sys/class/drm/card*/engine/ccs*/preempt_timeout_ms; do
-        [ -w "$f" ] && echo 30000 > "$f"
-      done
-      for f in /sys/class/drm/card*/engine/ccs*/heartbeat_interval_ms; do
-        [ -w "$f" ] && echo 10000 > "$f"
-      done
-    ''}"
-  ];
   # upstream module hardcodes --log-disable; override ExecStart to keep logs
   # so we can see prompt processing progress via journalctl
   systemd.services.llama-cpp.serviceConfig.ExecStart = lib.mkForce (