From 4f33b16411ff9810ad3b4af0de2605a35ca19b1a Mon Sep 17 00:00:00 2001 From: Simon Gardling Date: Thu, 9 Apr 2026 14:02:53 -0400 Subject: [PATCH] llama.cpp: thing --- flake.lock | 6 +++--- services/llama-cpp.nix | 23 +++++++++++++++++++++++ 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/flake.lock b/flake.lock index 4b64d4f..429d531 100644 --- a/flake.lock +++ b/flake.lock @@ -325,11 +325,11 @@ ] }, "locked": { - "lastModified": 1775614184, - "narHash": "sha256-OYwr36LLVIeEqccN1mJ2k6vCsFocboCQJnbtne415Ig=", + "lastModified": 1775754125, + "narHash": "sha256-4udYhEvii0xPmRiKXYWLhPakPDd1mJppnEFY6uWdv8s=", "owner": "TheTom", "repo": "llama-cpp-turboquant", - "rev": "eea498c42716519e58baf2d9600d2e2b41839255", + "rev": "8590cbff961dbaf1d3a9793fd11d402e248869b9", "type": "github" }, "original": { diff --git a/services/llama-cpp.nix b/services/llama-cpp.nix index 25015c8..b7470f9 100644 --- a/services/llama-cpp.nix +++ b/services/llama-cpp.nix @@ -50,17 +50,40 @@ in "4096" "-ub" "4096" + "--parallel" + "2" ]; }; # have to do this in order to get vulkan to work systemd.services.llama-cpp.serviceConfig.DynamicUser = lib.mkForce false; + # ANV driver's turbo3 shader compilation exceeds the default 8 MB thread stack. + systemd.services.llama-cpp.serviceConfig.LimitSTACK = lib.mkForce "67108864"; # 64 MB soft+hard + # llama-server tries to create ~/.cache; ProtectSystem=strict + impermanent # root make /root read-only. Give it a writable cache dir and point HOME there. systemd.services.llama-cpp.serviceConfig.CacheDirectory = "llama-cpp"; systemd.services.llama-cpp.environment.HOME = "/var/cache/llama-cpp"; + # turbo3 KV cache quantization runs a 14-barrier WHT butterfly per 128-element + # workgroup in SET_ROWS. With 4 concurrent slots (now capped at 2 by --parallel above) and batch=4096, the combined + # GPU dispatch can exceed the default i915 CCS engine preempt timeout (7.5s), + # causing GPU HANG -> ErrorDeviceLost. Increase compute engine timeouts.
+  # Note: batch<4096 is not viable -- GDN chunked mode needs a larger compute
+  # buffer at smaller batch sizes, exceeding the A380's 6 GB VRAM.
+  # '+' prefix runs as root regardless of service User=.
+  systemd.services.llama-cpp.serviceConfig.ExecStartPre = [
+    "+${pkgs.writeShellScript "set-gpu-compute-timeout" ''
+      for f in /sys/class/drm/card*/engine/ccs*/preempt_timeout_ms; do
+        if [ -w "$f" ]; then echo 30000 > "$f"; fi # best-effort: a skipped node yields status 0
+      done
+      for f in /sys/class/drm/card*/engine/ccs*/heartbeat_interval_ms; do
+        if [ -w "$f" ]; then echo 10000 > "$f"; fi # last command: its status is the script's exit status
+      done
+    ''}"
+  ];
+
   # upstream module hardcodes --log-disable; override ExecStart to keep logs
   # so we can see prompt processing progress via journalctl
   systemd.services.llama-cpp.serviceConfig.ExecStart = lib.mkForce (