Compare commits

...

2 Commits

Author SHA1 Message Date
d1e9c92423 update
Some checks failed
Build and Deploy / deploy (push) Failing after 4s
2026-04-09 14:03:34 -04:00
4f33b16411 llama.cpp: thing 2026-04-09 14:02:53 -04:00
2 changed files with 41 additions and 18 deletions

36
flake.lock generated
View File

@@ -304,11 +304,11 @@
"rust-overlay": "rust-overlay"
},
"locked": {
"lastModified": 1775510693,
"narHash": "sha256-gZfJ07j/oOciDi8mF/V8QTm7YCeDcusNSMZzBFi8OUM=",
"lastModified": 1775754862,
"narHash": "sha256-8y9cz8+cyeA7KtA7+Q3bXjyFJV5nM38Fc0E4qPw7WDk=",
"owner": "nix-community",
"repo": "lanzaboote",
"rev": "3fe0ae8cb285e0ad101a9675f4190d455fb05e85",
"rev": "bea51aaee00688794a877f308007590a6cc8e378",
"type": "github"
},
"original": {
@@ -325,11 +325,11 @@
]
},
"locked": {
"lastModified": 1775614184,
"narHash": "sha256-OYwr36LLVIeEqccN1mJ2k6vCsFocboCQJnbtne415Ig=",
"lastModified": 1775754125,
"narHash": "sha256-4udYhEvii0xPmRiKXYWLhPakPDd1mJppnEFY6uWdv8s=",
"owner": "TheTom",
"repo": "llama-cpp-turboquant",
"rev": "eea498c42716519e58baf2d9600d2e2b41839255",
"rev": "8590cbff961dbaf1d3a9793fd11d402e248869b9",
"type": "github"
},
"original": {
@@ -368,11 +368,11 @@
"systems": "systems_3"
},
"locked": {
"lastModified": 1775531897,
"narHash": "sha256-3NIpnV1HxBCwi00iMvj9KcqXkM0VNA72KABj8g0cFFs=",
"lastModified": 1775752089,
"narHash": "sha256-+psXqZ1SvQw7L8HgCQINmob9zLnvK433b2k080lBPH0=",
"owner": "Infinidoge",
"repo": "nix-minecraft",
"rev": "8c7693880cb861e60adeab5480f02dc3e7a390f6",
"rev": "1beacd3bdadabfac884dedd56176966c141214d8",
"type": "github"
},
"original": {
@@ -399,11 +399,11 @@
},
"nixpkgs": {
"locked": {
"lastModified": 1775305101,
"narHash": "sha256-/74n1oQPtKG52Yw41cbToxspxHbYz6O3vi+XEw16Qe8=",
"lastModified": 1775595990,
"narHash": "sha256-OEf7YqhF9IjJFYZJyuhAypgU+VsRB5lD4DuiMws5Ltc=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "36a601196c4ebf49e035270e10b2d103fe39076b",
"rev": "4e92bbcdb030f3b4782be4751dc08e6b6cb6ccf2",
"type": "github"
},
"original": {
@@ -624,11 +624,11 @@
]
},
"locked": {
"lastModified": 1775444042,
"narHash": "sha256-cg19ipIlZaLYgs/5ZPFcDDuOcZlGzfprB5xS4x7bVM4=",
"lastModified": 1775701952,
"narHash": "sha256-xj9u8fz2hTTTELMorqox0hPWrmAvGRnQUEnlj+vCjFo=",
"owner": "nix-community",
"repo": "srvos",
"rev": "64c9cc6a274dac7d08c4d53494ffa4acf906e287",
"rev": "f56f1053ae9f878501d3a8ae1961c73d1d7abce3",
"type": "github"
},
"original": {
@@ -715,11 +715,11 @@
"trackerlist": {
"flake": false,
"locked": {
"lastModified": 1775599784,
"narHash": "sha256-ZapxbiFEYjJV2nhdowHQ/8+c8Jd5fpBIEKDiPEmyNgI=",
"lastModified": 1775686189,
"narHash": "sha256-kzEDJKptaVToSg/wpub0bLjAVRmkYOorjPsNqlpxWdU=",
"owner": "ngosang",
"repo": "trackerslist",
"rev": "6cc71b5b65349081bb713719f5142c200438a327",
"rev": "ce9c0afc3885d0592caa91f0d4359f315ef7428c",
"type": "github"
},
"original": {

View File

@@ -50,17 +50,40 @@ in
"4096"
"-ub"
"4096"
"--parallel"
"2"
];
};
# Vulkan only works with DynamicUser= turned off, so force it to false
# (mkForce takes priority over any other definition of this option).
systemd.services.llama-cpp.serviceConfig.DynamicUser = lib.mkForce false;
# ANV driver's turbo3 shader compilation exceeds the default 8 MB thread stack,
# so raise the unit's stack limit.
systemd.services.llama-cpp.serviceConfig.LimitSTACK = lib.mkForce "67108864"; # 64 MiB (64 * 1024 * 1024 bytes), soft+hard
# llama-server tries to create ~/.cache, but ProtectSystem=strict plus the
# impermanent root make /root read-only. CacheDirectory= gives the unit a
# writable directory under /var/cache, and HOME is pointed at that same path.
systemd.services.llama-cpp.serviceConfig.CacheDirectory = "llama-cpp";
systemd.services.llama-cpp.environment.HOME = "/var/cache/llama-cpp";
# turbo3 KV cache quantization runs a 14-barrier WHT butterfly per 128-element
# workgroup in SET_ROWS. With 4 concurrent slots and batch=4096, the combined
# GPU dispatch can exceed the default i915 CCS engine preempt timeout (7.5s),
# causing GPU HANG -> ErrorDeviceLost. Increase compute engine timeouts.
# NOTE(review): the server args in this file pass `--parallel 2`, not 4 --
# confirm which slot count the sentence above should refer to.
# Note: batch<4096 is not viable -- GDN chunked mode needs a larger compute
# buffer at smaller batch sizes, exceeding the A380's 6 GB VRAM.
# '+' prefix runs as root regardless of service User=.
systemd.services.llama-cpp.serviceConfig.ExecStartPre = [
  "+${pkgs.writeShellScript "set-gpu-compute-timeout" ''
    # Best-effort: raise the timeouts on every CCS engine that exposes a
    # writable knob and silently skip the rest.
    #
    # Use if-blocks rather than `[ -w FILE ] && echo ...`: with the short
    # form a failed test becomes the exit status of the loop, and because the
    # last loop is the last command of the script, an unmatched glob or a
    # non-writable file would make ExecStartPre fail and block the service
    # from starting.
    for f in /sys/class/drm/card*/engine/ccs*/preempt_timeout_ms; do
      if [ -w "$f" ]; then
        echo 30000 > "$f"
      fi
    done
    for f in /sys/class/drm/card*/engine/ccs*/heartbeat_interval_ms; do
      if [ -w "$f" ]; then
        echo 10000 > "$f"
      fi
    done
  ''}"
];
# upstream module hardcodes --log-disable; override ExecStart to keep logs
# so we can see prompt processing progress via journalctl
systemd.services.llama-cpp.serviceConfig.ExecStart = lib.mkForce (