update

llama.cpp: thing
2026-04-09 14:03:34 -04:00 · 2026-04-09 14:02:53 -04:00
2 changed files with 41 additions and 18 deletions
--- a/flake.lock
+++ b/flake.lock
@@ -304,11 +304,11 @@
        "rust-overlay": "rust-overlay"
      },
      "locked": {
-        "lastModified": 1775510693,
-        "narHash": "sha256-gZfJ07j/oOciDi8mF/V8QTm7YCeDcusNSMZzBFi8OUM=",
+        "lastModified": 1775754862,
+        "narHash": "sha256-8y9cz8+cyeA7KtA7+Q3bXjyFJV5nM38Fc0E4qPw7WDk=",
        "owner": "nix-community",
        "repo": "lanzaboote",
-        "rev": "3fe0ae8cb285e0ad101a9675f4190d455fb05e85",
+        "rev": "bea51aaee00688794a877f308007590a6cc8e378",
        "type": "github"
      },
      "original": {
@@ -325,11 +325,11 @@
        ]
      },
      "locked": {
-        "lastModified": 1775614184,
-        "narHash": "sha256-OYwr36LLVIeEqccN1mJ2k6vCsFocboCQJnbtne415Ig=",
+        "lastModified": 1775754125,
+        "narHash": "sha256-4udYhEvii0xPmRiKXYWLhPakPDd1mJppnEFY6uWdv8s=",
        "owner": "TheTom",
        "repo": "llama-cpp-turboquant",
-        "rev": "eea498c42716519e58baf2d9600d2e2b41839255",
+        "rev": "8590cbff961dbaf1d3a9793fd11d402e248869b9",
        "type": "github"
      },
      "original": {
@@ -368,11 +368,11 @@
        "systems": "systems_3"
      },
      "locked": {
-        "lastModified": 1775531897,
-        "narHash": "sha256-3NIpnV1HxBCwi00iMvj9KcqXkM0VNA72KABj8g0cFFs=",
+        "lastModified": 1775752089,
+        "narHash": "sha256-+psXqZ1SvQw7L8HgCQINmob9zLnvK433b2k080lBPH0=",
        "owner": "Infinidoge",
        "repo": "nix-minecraft",
-        "rev": "8c7693880cb861e60adeab5480f02dc3e7a390f6",
+        "rev": "1beacd3bdadabfac884dedd56176966c141214d8",
        "type": "github"
      },
      "original": {
@@ -399,11 +399,11 @@
    },
    "nixpkgs": {
      "locked": {
-        "lastModified": 1775305101,
-        "narHash": "sha256-/74n1oQPtKG52Yw41cbToxspxHbYz6O3vi+XEw16Qe8=",
+        "lastModified": 1775595990,
+        "narHash": "sha256-OEf7YqhF9IjJFYZJyuhAypgU+VsRB5lD4DuiMws5Ltc=",
        "owner": "NixOS",
        "repo": "nixpkgs",
-        "rev": "36a601196c4ebf49e035270e10b2d103fe39076b",
+        "rev": "4e92bbcdb030f3b4782be4751dc08e6b6cb6ccf2",
        "type": "github"
      },
      "original": {
@@ -624,11 +624,11 @@
        ]
      },
      "locked": {
-        "lastModified": 1775444042,
-        "narHash": "sha256-cg19ipIlZaLYgs/5ZPFcDDuOcZlGzfprB5xS4x7bVM4=",
+        "lastModified": 1775701952,
+        "narHash": "sha256-xj9u8fz2hTTTELMorqox0hPWrmAvGRnQUEnlj+vCjFo=",
        "owner": "nix-community",
        "repo": "srvos",
-        "rev": "64c9cc6a274dac7d08c4d53494ffa4acf906e287",
+        "rev": "f56f1053ae9f878501d3a8ae1961c73d1d7abce3",
        "type": "github"
      },
      "original": {
@@ -715,11 +715,11 @@
    "trackerlist": {
      "flake": false,
      "locked": {
-        "lastModified": 1775599784,
-        "narHash": "sha256-ZapxbiFEYjJV2nhdowHQ/8+c8Jd5fpBIEKDiPEmyNgI=",
+        "lastModified": 1775686189,
+        "narHash": "sha256-kzEDJKptaVToSg/wpub0bLjAVRmkYOorjPsNqlpxWdU=",
        "owner": "ngosang",
        "repo": "trackerslist",
-        "rev": "6cc71b5b65349081bb713719f5142c200438a327",
+        "rev": "ce9c0afc3885d0592caa91f0d4359f315ef7428c",
        "type": "github"
      },
      "original": {
--- a/services/llama-cpp.nix
+++ b/services/llama-cpp.nix
@@ -50,17 +50,40 @@ in
      "4096"
      "-ub"
      "4096"
+      "--parallel"
+      "2"
    ];
  };

  # have to do this in order to get vulkan to work
  systemd.services.llama-cpp.serviceConfig.DynamicUser = lib.mkForce false;

+  # ANV driver's turbo3 shader compilation exceeds the default 8 MB thread stack.
+  systemd.services.llama-cpp.serviceConfig.LimitSTACK = lib.mkForce "67108864"; # 64 MB soft+hard
+
  # llama-server tries to create ~/.cache; ProtectSystem=strict + impermanent
  # root make /root read-only. Give it a writable cache dir and point HOME there.
  systemd.services.llama-cpp.serviceConfig.CacheDirectory = "llama-cpp";
  systemd.services.llama-cpp.environment.HOME = "/var/cache/llama-cpp";

+  # turbo3 KV cache quantization runs a 14-barrier WHT butterfly per 128-element
+  # workgroup in SET_ROWS. With 4 concurrent slots and batch=4096, the combined
+  # GPU dispatch can exceed the default i915 CCS engine preempt timeout (7.5s),
+  # causing GPU HANG -> ErrorDeviceLost. Increase compute engine timeouts.
+  # Note: batch<4096 is not viable -- GDN chunked mode needs a larger compute
+  # buffer at smaller batch sizes, exceeding the A380's 6 GB VRAM.
+  # '+' runs as root regardless of User=; missing/read-only knobs are skipped.
+  systemd.services.llama-cpp.serviceConfig.ExecStartPre = [
+    "+${pkgs.writeShellScript "set-gpu-compute-timeout" ''
+      for f in /sys/class/drm/card*/engine/ccs*/preempt_timeout_ms; do
+        if [ -w "$f" ]; then echo 30000 > "$f"; fi
+      done
+      for f in /sys/class/drm/card*/engine/ccs*/heartbeat_interval_ms; do
+        if [ -w "$f" ]; then echo 10000 > "$f"; fi
+      done
+    ''}"
+  ];
+
  # upstream module hardcodes --log-disable; override ExecStart to keep logs
  # so we can see prompt processing progress via journalctl
  systemd.services.llama-cpp.serviceConfig.ExecStart = lib.mkForce (
Author	SHA1	Message	Date
Simon Gardling	d1e9c92423	update Some checks failed Build and Deploy / deploy (push) Failing after 4s	2026-04-09 14:03:34 -04:00
Simon Gardling	4f33b16411	llama.cpp: thing	2026-04-09 14:02:53 -04:00