Compare commits
1 Commits
d1e9c92423
...
0e75c0036f
| Author | SHA1 | Date | |
|---|---|---|---|
|
0e75c0036f
|
36
flake.lock
generated
36
flake.lock
generated
@@ -304,11 +304,11 @@
|
||||
"rust-overlay": "rust-overlay"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1775754862,
|
||||
"narHash": "sha256-8y9cz8+cyeA7KtA7+Q3bXjyFJV5nM38Fc0E4qPw7WDk=",
|
||||
"lastModified": 1775510693,
|
||||
"narHash": "sha256-gZfJ07j/oOciDi8mF/V8QTm7YCeDcusNSMZzBFi8OUM=",
|
||||
"owner": "nix-community",
|
||||
"repo": "lanzaboote",
|
||||
"rev": "bea51aaee00688794a877f308007590a6cc8e378",
|
||||
"rev": "3fe0ae8cb285e0ad101a9675f4190d455fb05e85",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
@@ -325,11 +325,11 @@
|
||||
]
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1775754125,
|
||||
"narHash": "sha256-4udYhEvii0xPmRiKXYWLhPakPDd1mJppnEFY6uWdv8s=",
|
||||
"lastModified": 1775614184,
|
||||
"narHash": "sha256-OYwr36LLVIeEqccN1mJ2k6vCsFocboCQJnbtne415Ig=",
|
||||
"owner": "TheTom",
|
||||
"repo": "llama-cpp-turboquant",
|
||||
"rev": "8590cbff961dbaf1d3a9793fd11d402e248869b9",
|
||||
"rev": "eea498c42716519e58baf2d9600d2e2b41839255",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
@@ -368,11 +368,11 @@
|
||||
"systems": "systems_3"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1775752089,
|
||||
"narHash": "sha256-+psXqZ1SvQw7L8HgCQINmob9zLnvK433b2k080lBPH0=",
|
||||
"lastModified": 1775531897,
|
||||
"narHash": "sha256-3NIpnV1HxBCwi00iMvj9KcqXkM0VNA72KABj8g0cFFs=",
|
||||
"owner": "Infinidoge",
|
||||
"repo": "nix-minecraft",
|
||||
"rev": "1beacd3bdadabfac884dedd56176966c141214d8",
|
||||
"rev": "8c7693880cb861e60adeab5480f02dc3e7a390f6",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
@@ -399,11 +399,11 @@
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1775595990,
|
||||
"narHash": "sha256-OEf7YqhF9IjJFYZJyuhAypgU+VsRB5lD4DuiMws5Ltc=",
|
||||
"lastModified": 1775305101,
|
||||
"narHash": "sha256-/74n1oQPtKG52Yw41cbToxspxHbYz6O3vi+XEw16Qe8=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "4e92bbcdb030f3b4782be4751dc08e6b6cb6ccf2",
|
||||
"rev": "36a601196c4ebf49e035270e10b2d103fe39076b",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
@@ -624,11 +624,11 @@
|
||||
]
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1775701952,
|
||||
"narHash": "sha256-xj9u8fz2hTTTELMorqox0hPWrmAvGRnQUEnlj+vCjFo=",
|
||||
"lastModified": 1775444042,
|
||||
"narHash": "sha256-cg19ipIlZaLYgs/5ZPFcDDuOcZlGzfprB5xS4x7bVM4=",
|
||||
"owner": "nix-community",
|
||||
"repo": "srvos",
|
||||
"rev": "f56f1053ae9f878501d3a8ae1961c73d1d7abce3",
|
||||
"rev": "64c9cc6a274dac7d08c4d53494ffa4acf906e287",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
@@ -715,11 +715,11 @@
|
||||
"trackerlist": {
|
||||
"flake": false,
|
||||
"locked": {
|
||||
"lastModified": 1775686189,
|
||||
"narHash": "sha256-kzEDJKptaVToSg/wpub0bLjAVRmkYOorjPsNqlpxWdU=",
|
||||
"lastModified": 1775599784,
|
||||
"narHash": "sha256-ZapxbiFEYjJV2nhdowHQ/8+c8Jd5fpBIEKDiPEmyNgI=",
|
||||
"owner": "ngosang",
|
||||
"repo": "trackerslist",
|
||||
"rev": "ce9c0afc3885d0592caa91f0d4359f315ef7428c",
|
||||
"rev": "6cc71b5b65349081bb713719f5142c200438a327",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
|
||||
24
patches/llamacpp/0004-gemma4-graph-fix.patch
Normal file
24
patches/llamacpp/0004-gemma4-graph-fix.patch
Normal file
@@ -0,0 +1,24 @@
|
||||
From b934a8ca49f9e764fa21d45ff2ce1168a3a7c914 Mon Sep 17 00:00:00 2001
|
||||
From: Georgi Gerganov <ggerganov@gmail.com>
|
||||
Date: Mon, 6 Apr 2026 11:50:22 +0300
|
||||
Subject: [PATCH] models : set gemma 4 FFN MoE prec to F32
|
||||
|
||||
---
|
||||
src/llama-graph.cpp | 4 ++--
|
||||
1 file changed, 2 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
|
||||
index 0e7d96ca10d..aa8a35721fa 100644
|
||||
--- a/src/llama-graph.cpp
|
||||
+++ b/src/llama-graph.cpp
|
||||
@@ -1185,8 +1185,8 @@ ggml_tensor * llm_graph_context::build_ffn(
|
||||
|
||||
if (down) {
|
||||
cur = build_lora_mm(down, cur);
|
||||
- if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_JAIS2) {
|
||||
- // GLM4, GLM4_MOE, and JAIS2 seem to have numerical issues with half-precision accumulators
|
||||
+ if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_JAIS2 || arch == LLM_ARCH_GEMMA4) {
|
||||
+ // certain models seem to have numerical issues with half-precision accumulators
|
||||
ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
|
||||
}
|
||||
}
|
||||
@@ -27,6 +27,7 @@ in
|
||||
inputs.llamacpp.packages.${pkgs.system}.vulkan.overrideAttrs (old: {
|
||||
patches = (old.patches or [ ]) ++ [
|
||||
../patches/llamacpp/0003-gemma4-tokenizer-fix.patch
|
||||
../patches/llamacpp/0004-gemma4-graph-fix.patch
|
||||
];
|
||||
})
|
||||
);
|
||||
@@ -50,40 +51,17 @@ in
|
||||
"4096"
|
||||
"-ub"
|
||||
"4096"
|
||||
"--parallel"
|
||||
"2"
|
||||
];
|
||||
};
|
||||
|
||||
# have to do this in order to get vulkan to work
|
||||
systemd.services.llama-cpp.serviceConfig.DynamicUser = lib.mkForce false;
|
||||
|
||||
# ANV driver's turbo3 shader compilation exceeds the default 8 MB thread stack.
|
||||
systemd.services.llama-cpp.serviceConfig.LimitSTACK = lib.mkForce "67108864"; # 64 MB soft+hard
|
||||
|
||||
# llama-server tries to create ~/.cache; ProtectSystem=strict + impermanent
|
||||
# root make /root read-only. Give it a writable cache dir and point HOME there.
|
||||
systemd.services.llama-cpp.serviceConfig.CacheDirectory = "llama-cpp";
|
||||
systemd.services.llama-cpp.environment.HOME = "/var/cache/llama-cpp";
|
||||
|
||||
# turbo3 KV cache quantization runs a 14-barrier WHT butterfly per 128-element
|
||||
# workgroup in SET_ROWS. With 4 concurrent slots and batch=4096, the combined
|
||||
# GPU dispatch can exceed the default i915 CCS engine preempt timeout (7.5s),
|
||||
# causing GPU HANG -> ErrorDeviceLost. Increase compute engine timeouts.
|
||||
# Note: batch<4096 is not viable -- GDN chunked mode needs a larger compute
|
||||
# buffer at smaller batch sizes, exceeding the A380's 6 GB VRAM.
|
||||
# '+' prefix runs as root regardless of service User=.
|
||||
systemd.services.llama-cpp.serviceConfig.ExecStartPre = [
|
||||
"+${pkgs.writeShellScript "set-gpu-compute-timeout" ''
|
||||
for f in /sys/class/drm/card*/engine/ccs*/preempt_timeout_ms; do
|
||||
[ -w "$f" ] && echo 30000 > "$f"
|
||||
done
|
||||
for f in /sys/class/drm/card*/engine/ccs*/heartbeat_interval_ms; do
|
||||
[ -w "$f" ] && echo 10000 > "$f"
|
||||
done
|
||||
''}"
|
||||
];
|
||||
|
||||
# upstream module hardcodes --log-disable; override ExecStart to keep logs
|
||||
# so we can see prompt processing progress via journalctl
|
||||
systemd.services.llama-cpp.serviceConfig.ExecStart = lib.mkForce (
|
||||
|
||||
Reference in New Issue
Block a user