phase 2: promote services/, tests/, patches/, lib/, scripts/

2026-04-18 00:47:39 -04:00
parent 99e98e39b7
commit 999ed05d9f
86 changed files with 0 additions and 0 deletions
--- a/services/llama-cpp.nix
+++ b/services/llama-cpp.nix
@@ -0,0 +1,103 @@
+{
+  config,
+  inputs,
+  lib,
+  pkgs,
+  service_configs,
+  utils,
+  ...
+}:
+let
+  cfg = config.services.llama-cpp; # final llama-cpp option values, read back by the ExecStart override
+  modelUrl = "https://huggingface.co/bartowski/google_gemma-4-E2B-it-GGUF/resolve/main/google_gemma-4-E2B-it-IQ2_M.gguf";
+  modelAlias = lib.removeSuffix ".gguf" (builtins.baseNameOf modelUrl); # GGUF file name minus its extension
+in
+{
+  imports = [
+    (lib.mkCaddyReverseProxy {
+      subdomain = "llm"; # presumably served by Caddy under the "llm" subdomain -- confirm in lib
+      inherit (service_configs.ports.private.llama_cpp) port; # same value as services.llama-cpp.port
+    })
+  ];
+
+  services.llama-cpp = {
+    enable = true;
+    model = toString (
+      pkgs.fetchurl {
+        url = modelUrl;
+        sha256 = "17e869ac54d0e59faa884d5319fc55ad84cd866f50f0b3073fbb25accc875a23";
+      }
+    ); # store path of the hash-pinned GGUF download
+    port = service_configs.ports.private.llama_cpp.port;
+    host = "0.0.0.0";
+    package = lib.optimizePackage (
+      inputs.llamacpp.packages.${pkgs.stdenv.hostPlatform.system}.vulkan.overrideAttrs (old: { # pkgs.system is a deprecated alias
+        patches = (old.patches or [ ]) ++ [
+        ];
+      })
+    );
+    extraFlags = [
+      "-ngl"
+      "999" # GPU layer count; presumably "offload everything" -- confirm against llama-server --help
+      "-c"
+      "65536" # presumably the context size in tokens -- confirm
+      "-ctk"
+      "turbo3" # turbo3 KV cache quantization (K side)
+      "-ctv"
+      "turbo3" # turbo3 KV cache quantization (V side)
+      "-fa"
+      "on"
+      "--api-key-file"
+      config.age.secrets.llama-cpp-api-key.path # key is read from the secret file, never inlined here
+      "--metrics"
+      "--alias"
+      modelAlias
+      "-b"
+      "4096" # batch size; the ExecStartPre timeout tuning assumes this value
+      "-ub"
+      "4096"
+      "--parallel"
+      "2" # number of concurrent slots
+    ];
+  };
+
+  # Vulkan does not work under DynamicUser, so force it off.
+  systemd.services.llama-cpp.serviceConfig.DynamicUser = lib.mkForce false;
+
+  # The default 8 MB thread stack is too small for the ANV driver's turbo3 shader compilation.
+  systemd.services.llama-cpp.serviceConfig.LimitSTACK = lib.mkForce (toString (64 * 1024 * 1024)); # 64 MB soft+hard
+
+  # ProtectSystem=strict plus an impermanent root leave /root read-only, yet llama-server
+  # tries to create ~/.cache: hand it a writable cache directory and use that as HOME.
+  systemd.services.llama-cpp.environment.HOME = "/var/cache/llama-cpp";
+  systemd.services.llama-cpp.serviceConfig.CacheDirectory = "llama-cpp";
+
+  # turbo3 KV cache quantization runs a 14-barrier WHT butterfly per 128-element
+  # workgroup in SET_ROWS. With concurrent slots and batch=4096, the combined GPU
+  # dispatch can exceed the default i915 CCS engine preempt timeout (7.5s), causing
+  # GPU HANG -> ErrorDeviceLost, so raise the compute engine timeouts. batch<4096
+  # is not viable: GDN chunked mode needs a larger compute buffer at smaller batch
+  # sizes, exceeding the A380's 6 GB VRAM. '+' runs as root regardless of User=.
+  # Uses `if` rather than `[ ] &&` so a missing/read-only knob still exits 0.
+  systemd.services.llama-cpp.serviceConfig.ExecStartPre = [
+    "+${pkgs.writeShellScript "set-gpu-compute-timeout" ''
+      for f in /sys/class/drm/card*/engine/ccs*/preempt_timeout_ms; do
+        if [ -w "$f" ]; then echo 30000 > "$f"; fi
+      done
+      for f in /sys/class/drm/card*/engine/ccs*/heartbeat_interval_ms; do
+        if [ -w "$f" ]; then echo 10000 > "$f"; fi
+      done
+    ''}"
+  ];
+
+  # The upstream module's ExecStart hardcodes --log-disable. Replace the command
+  # line so logging stays on and prompt processing progress shows up in journalctl.
+  systemd.services.llama-cpp.serviceConfig.ExecStart = lib.mkForce (
+    builtins.concatStringsSep " " [
+      "${cfg.package}/bin/llama-server"
+      "--host ${cfg.host} --port ${toString cfg.port} -m ${cfg.model}"
+      (utils.escapeSystemdExecArgs cfg.extraFlags)
+    ]
+  );
+
+}