2026-04-03 00:47:12 -04:00
parent 1451f902ad
commit 124d33963e
18 changed files with 43 additions and 17 deletions


@@ -0,0 +1,6 @@
{
imports = [
./llama-cpp.nix
./llama-cpp-xmrig-pause.nix
];
}

llama-cpp-xmrig-pause.nix

@@ -0,0 +1,35 @@
{
config,
lib,
pkgs,
...
}:
lib.mkIf config.services.llama-cpp.enable {
systemd.services.llama-cpp-xmrig-pause = {
description = "Pause xmrig while llama-cpp is processing requests";
after = [
"llama-cpp.service"
"xmrig.service"
];
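    # After= only orders startup relative to these units; it does not pull them in.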
wantedBy = [ "multi-user.target" ];
serviceConfig = {
ExecStart = "${pkgs.python3}/bin/python3 ${./llama-cpp-xmrig-pause.py}";
Restart = "always";
RestartSec = "10s";
# Needs /proc access (default) and AF_UNIX for systemctl
NoNewPrivileges = true;
ProtectHome = true;
ProtectSystem = "strict";
PrivateTmp = true;
RestrictAddressFamilies = [
"AF_UNIX" # systemctl talks to systemd over D-Bus unix socket
];
MemoryDenyWriteExecute = true;
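      # Safe for CPython: no JIT, and this script uses no ctypes/ffi trampolines.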
};
environment = {
POLL_INTERVAL = "3";
GRACE_PERIOD = "10";
CPU_THRESHOLD = "50";
};
};
}

llama-cpp-xmrig-pause.py

@@ -0,0 +1,123 @@
#!/usr/bin/env python3
"""
Pause xmrig while llama-cpp is processing inference requests.
Checks if the llama-server process is actively using CPU by reading
/proc/<pid>/stat. When CPU usage exceeds the threshold, stops xmrig.
When CPU drops below threshold for GRACE_PERIOD seconds, restarts xmrig.
"""

import glob
import os
import subprocess
import sys
import time

POLL_INTERVAL = int(os.environ.get("POLL_INTERVAL", "3"))
GRACE_PERIOD = float(os.environ.get("GRACE_PERIOD", "10"))
# CPU percentage (per-core) above which llama-server is considered busy.
# Idle llama-server uses ~0% CPU; active inference saturates multiple cores.
CPU_THRESHOLD = float(os.environ.get("CPU_THRESHOLD", "50"))


def log(msg):
print(f"[llama-cpp-xmrig-pause] {msg}", file=sys.stderr, flush=True)


def find_llama_pid():
"""Find the PID of the llama-server process."""
for path in glob.glob("/proc/[0-9]*/comm"):
try:
with open(path) as f:
if f.read().strip() == "llama-server":
return int(path.split("/")[2])
except (OSError, ValueError):
continue
return None


def get_cpu_times(pid):
"""Read utime + stime from /proc/<pid>/stat. Returns total ticks or None."""
try:
with open(f"/proc/{pid}/stat") as f:
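            # comm (field 2) may itself contain spaces or ')', so split on the last ')'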
fields = f.read().split(")")[-1].split()
# fields[11] = utime, fields[12] = stime (0-indexed after ')')
return int(fields[11]) + int(fields[12])
except (OSError, IndexError, ValueError):
return None


def systemctl(action, unit):
result = subprocess.run(
["systemctl", action, unit],
capture_output=True,
text=True,
)
if result.returncode != 0:
log(f"systemctl {action} {unit} failed (rc={result.returncode}): {result.stderr.strip()}")
return result.returncode == 0


def main():
xmrig_paused = False
idle_since = None
    prev_ticks = None
    prev_time = None
    prev_pid = None
hz = os.sysconf("SC_CLK_TCK")
log(f"Starting: poll={POLL_INTERVAL}s grace={GRACE_PERIOD}s threshold={CPU_THRESHOLD}%")

    while True:
pid = find_llama_pid()
if pid is None:
# llama-server not running
idle_since = None
prev_ticks = None
prev_time = None
time.sleep(POLL_INTERVAL)
continue

        if pid != prev_pid:
            # llama-server restarted: its CPU counters reset, so drop the stale sample
            prev_ticks = None
            prev_pid = pid
        ticks = get_cpu_times(pid)
now = time.monotonic()
if ticks is None or prev_ticks is None or prev_time is None:
prev_ticks = ticks
prev_time = now
time.sleep(POLL_INTERVAL)
continue
dt = now - prev_time
if dt <= 0:
prev_ticks = ticks
prev_time = now
time.sleep(POLL_INTERVAL)
continue
# CPU% = (delta_ticks / hz) / delta_seconds * 100
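        # e.g. hz=100: 600 new ticks over 3 s = 6 CPU-seconds -> 200% (two cores busy)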
cpu_pct = ((ticks - prev_ticks) / hz) / dt * 100
prev_ticks = ticks
prev_time = now
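
        # Hysteresis: stop xmrig as soon as inference load appears, but only
        # restart it after GRACE_PERIOD seconds of continuous idle.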
busy = cpu_pct > CPU_THRESHOLD
if busy:
idle_since = None
if not xmrig_paused:
log(f"llama-server busy ({cpu_pct:.0f}% CPU) — stopping xmrig")
if systemctl("stop", "xmrig"):
xmrig_paused = True
else:
if xmrig_paused:
if idle_since is None:
idle_since = now
elif now - idle_since >= GRACE_PERIOD:
log(f"llama-server idle ({cpu_pct:.0f}% CPU) past grace period — starting xmrig")
if systemctl("start", "xmrig"):
xmrig_paused = False
idle_since = None
time.sleep(POLL_INTERVAL)
if __name__ == "__main__":
main()

llama-cpp.nix

@@ -0,0 +1,46 @@
{
pkgs,
service_configs,
config,
inputs,
lib,
...
}:
{
services.llama-cpp = {
enable = false;
model = toString (
pkgs.fetchurl {
url = "https://huggingface.co/unsloth/Qwen3.5-9B-GGUF/resolve/main/Qwen3.5-9B-Q4_K_M.gguf";
sha256 = "03b74727a860a56338e042c4420bb3f04b2fec5734175f4cb9fa853daf52b7e8";
}
);
port = service_configs.ports.private.llama_cpp.port;
host = "0.0.0.0";
package = (lib.optimizePackage inputs.llamacpp.packages.${pkgs.system}.default);
extraFlags = [
# "-ngl"
# "12"
"-c"
"32768"
"-ctk"
"q8_0"
"-ctv"
"turbo4"
"-fa"
"on"
"--api-key-file"
config.age.secrets.llama-cpp-api-key.path
];
};

  # DynamicUser has to be forced off for Vulkan to work: the sandboxed
  # dynamic user can't open the GPU device nodes under /dev/dri.
  systemd.services.llama-cpp.serviceConfig.DynamicUser = lib.mkForce false;

  # Auth handled by llama-cpp --api-key-file (Bearer token).
  # No caddy_auth — the API key is the auth layer, and caddy_auth's basic
  # auth would block Bearer-only clients like oh-my-pi.
services.caddy.virtualHosts."llm.${service_configs.https.domain}".extraConfig = ''
reverse_proxy :${toString config.services.llama-cpp.port}
'';
}
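
For reference, a minimal sketch of the Bearer-token flow the comment above describes (a hypothetical client, not part of this commit; the domain and key are placeholders, and it assumes llama-server's OpenAI-compatible /v1/chat/completions route):

#!/usr/bin/env python3
# Hypothetical client sketch; not part of this commit.
import json
import urllib.request

req = urllib.request.Request(
    "https://llm.example.com/v1/chat/completions",  # placeholder host
    data=json.dumps({"messages": [{"role": "user", "content": "hello"}]}).encode(),
    headers={
        "Authorization": "Bearer <api-key>",  # the token --api-key-file validates
        "Content-Type": "application/json",
    },
)
with urllib.request.urlopen(req) as resp:
    print(json.load(resp)["choices"][0]["message"]["content"])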