llama-cpp: xmrig + grafana hooks

2026-04-02 22:47:53 -04:00
parent ab9c12cb97
commit 096ffeb943
8 changed files with 369 additions and 157 deletions

View File

@@ -1,15 +1,12 @@
{
config,
pkgs,
service_configs,
lib,
...
}:
{
systemd.services.llama-cpp-annotations = {
description = "LLM request annotation service for Grafana";
after = [
"network.target"
"grafana.service"
"llama-cpp.service"
];
@@ -31,10 +28,10 @@
MemoryDenyWriteExecute = true;
};
environment = {
LLAMA_CPP_URL = "http://127.0.0.1:${toString service_configs.ports.private.llama_cpp.port}";
GRAFANA_URL = "http://127.0.0.1:${toString service_configs.ports.private.grafana.port}";
STATE_FILE = "/var/lib/llama-cpp-annotations/state.json";
POLL_INTERVAL = "5";
CPU_THRESHOLD = "50";
};
};
}
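To confirm the service is actually writing annotations, Grafana's HTTP API can be queried directly. A hedged example (it assumes anonymous API access on the port above; a real setup may need an Authorization header with an API token):

# Hedged example: list the 10 most recent annotations from a local Grafana.
import json
import urllib.request

GRAFANA_URL = "http://127.0.0.1:3000"
with urllib.request.urlopen(f"{GRAFANA_URL}/api/annotations?limit=10") as resp:
    for a in json.loads(resp.read()):
        print(a["time"], a.get("timeEnd"), a.get("text"))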

View File

@@ -1,14 +1,42 @@
#!/usr/bin/env python3
"""
Grafana annotation service for llama-cpp inference requests.
Monitors llama-server CPU usage via /proc. Creates a Grafana annotation
when inference starts (CPU spikes), closes it when inference ends.
"""
import glob
import json
import os
import sys
import time
import urllib.request
LLAMA_CPP_URL = os.environ.get("LLAMA_CPP_URL", "http://127.0.0.1:6688")
GRAFANA_URL = os.environ.get("GRAFANA_URL", "http://127.0.0.1:3000")
STATE_FILE = os.environ.get("STATE_FILE", "/var/lib/llama-cpp-annotations/state.json")
POLL_INTERVAL = int(os.environ.get("POLL_INTERVAL", "5"))
CPU_THRESHOLD = float(os.environ.get("CPU_THRESHOLD", "50"))
def find_llama_pid():
for path in glob.glob("/proc/[0-9]*/comm"):
try:
with open(path) as f:
if f.read().strip() == "llama-server":
return int(path.split("/")[2])
except (OSError, ValueError):
continue
return None
def get_cpu_times(pid):
try:
with open(f"/proc/{pid}/stat") as f:
fields = f.read().split(")")[-1].split()
return int(fields[11]) + int(fields[12])
except (OSError, IndexError, ValueError):
return None
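The split(")") above is deliberate: the second field of /proc/<pid>/stat is the command name in parentheses and may itself contain spaces or parentheses, so splitting on the last ")" is what keeps the numeric fields aligned. A quick illustration with a fabricated stat line (values invented for the example):

# Fabricated /proc/<pid>/stat line, for illustration only. After the last
# ')', utime (field 14) and stime (field 15) land at indexes 11 and 12.
sample = ("1234 (llama-server) S 1 1234 1234 0 -1 4194304 "
          "100 0 0 0 4500 300 0 0 20 0 8 0 999999 0 0")
fields = sample.split(")")[-1].split()
print(int(fields[11]), int(fields[12]))  # -> 4500 300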
def http_json(method, url, body=None):
@@ -23,19 +51,6 @@ def http_json(method, url, body=None):
return json.loads(resp.read())
def load_state():
try:
with open(STATE_FILE) as f:
@@ -81,45 +96,58 @@ def grafana_close(grafana_id, end_ms, text=None):
def main():
    state = load_state()
    prev_ticks = None
    prev_time = None
    hz = os.sysconf("SC_CLK_TCK")
    while True:
        now_ms = int(time.time() * 1000)
        pid = find_llama_pid()
        if pid is None:
            prev_ticks = None
            prev_time = None
            time.sleep(POLL_INTERVAL)
            continue
        ticks = get_cpu_times(pid)
        now = time.monotonic()
        if ticks is None or prev_ticks is None or prev_time is None:
            prev_ticks = ticks
            prev_time = now
            time.sleep(POLL_INTERVAL)
            continue
        dt = now - prev_time
        if dt <= 0:
            prev_ticks = ticks
            prev_time = now
            time.sleep(POLL_INTERVAL)
            continue
        # CPU% = (delta_ticks / hz) / delta_seconds * 100
        cpu_pct = ((ticks - prev_ticks) / hz) / dt * 100
        prev_ticks = ticks
        prev_time = now
        busy = cpu_pct > CPU_THRESHOLD
        if busy and "active" not in state:
            grafana_id = grafana_post("LLM request", now_ms)
            if grafana_id is not None:
                state["active"] = {
                    "grafana_id": grafana_id,
                    "start_ms": now_ms,
                }
                save_state(state)
        elif not busy and "active" in state:
            info = state.pop("active")
            duration_s = (now_ms - info["start_ms"]) / 1000
            text = f"LLM request ({duration_s:.1f}s)"
            grafana_close(info["grafana_id"], now_ms, text)
            save_state(state)
        time.sleep(POLL_INTERVAL)
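The hunks above skip over grafana_post and grafana_close. For reference, a minimal sketch of what such helpers could look like against Grafana's annotations HTTP API, reusing this file's http_json helper; the "llm" tag and the error handling are assumptions, not the committed code:

def grafana_post(text, start_ms):
    # POST /api/annotations creates an annotation and returns its id.
    try:
        resp = http_json("POST", f"{GRAFANA_URL}/api/annotations",
                         {"time": start_ms, "text": text, "tags": ["llm"]})
        return resp.get("id")
    except Exception as e:
        print(f"Error creating annotation: {e}", file=sys.stderr)
        return None

def grafana_close(grafana_id, end_ms, text=None):
    # PATCH /api/annotations/<id> turns it into a region by setting timeEnd.
    body = {"timeEnd": end_ms}
    if text is not None:
        body["text"] = text
    try:
        http_json("PATCH", f"{GRAFANA_URL}/api/annotations/{grafana_id}", body)
    except Exception as e:
        print(f"Error closing annotation: {e}", file=sys.stderr)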

View File

@@ -1,13 +1,11 @@
{
pkgs,
service_configs,
...
}:
{
systemd.services.llama-cpp-xmrig-pause = {
description = "Pause xmrig while llama-cpp is processing requests";
after = [
"network.target"
"llama-cpp.service"
"xmrig.service"
];
@@ -16,20 +14,20 @@
ExecStart = "${pkgs.python3}/bin/python3 ${./llama-cpp-xmrig-pause.py}";
Restart = "always";
RestartSec = "10s";
# Needs /proc access (default) and AF_UNIX for systemctl
NoNewPrivileges = true;
ProtectHome = true;
ProtectSystem = "strict";
PrivateTmp = true;
RestrictAddressFamilies = [
"AF_INET"
"AF_INET6"
"AF_UNIX" # systemctl talks to systemd over D-Bus unix socket
];
MemoryDenyWriteExecute = true;
};
environment = {
POLL_INTERVAL = "3";
GRACE_PERIOD = "10";
CPU_THRESHOLD = "50";
};
};
}
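The three knobs form a simple hysteresis: any sample above CPU_THRESHOLD stops xmrig immediately, and xmrig only restarts once CPU has stayed below the threshold for a full GRACE_PERIOD. A self-contained sketch of that state machine with synthetic samples (illustration only, not the committed code):

# Illustration of the pause/resume hysteresis with synthetic CPU samples.
GRACE_PERIOD = 10.0
CPU_THRESHOLD = 50.0

def step(state, cpu_pct, now):
    """state = (paused, idle_since); returns (new_state, action_or_None)."""
    paused, idle_since = state
    if cpu_pct > CPU_THRESHOLD:
        return (True, None), (None if paused else "stop xmrig")
    if paused:
        if idle_since is None:
            return (True, now), None
        if now - idle_since >= GRACE_PERIOD:
            return (False, None), "start xmrig"
    return (paused, idle_since), None

state, t = (False, None), 0.0
for cpu in [5, 180, 190, 20, 10, 10, 15, 5]:  # one sample every 3 s
    state, action = step(state, cpu, t)
    if action:
        print(f"t={t:4.0f}s cpu={cpu:3d}% -> {action}")
    t += 3.0
# -> stops at t=3 (first busy sample), restarts at t=21 (idle since t=9)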

View File

@@ -2,42 +2,51 @@
"""
Pause xmrig while llama-cpp is processing inference requests.
Checks if the llama-server process is actively using CPU by reading
/proc/<pid>/stat. When CPU usage exceeds the threshold, stops xmrig.
When CPU drops below threshold for GRACE_PERIOD seconds, restarts xmrig.
"""
import glob
import os
import subprocess
import sys
import time
POLL_INTERVAL = int(os.environ.get("POLL_INTERVAL", "3"))
GRACE_PERIOD = float(os.environ.get("GRACE_PERIOD", "10"))
# CPU percentage (per-core) above which llama-server is considered busy.
# Idle llama-server uses ~0% CPU; active inference saturates multiple cores.
CPU_THRESHOLD = float(os.environ.get("CPU_THRESHOLD", "50"))
def log(msg):
print(f"[llama-cpp-xmrig-pause] {msg}", file=sys.stderr, flush=True)
def find_llama_pid():
"""Find the PID of the llama-server process."""
for path in glob.glob("/proc/[0-9]*/comm"):
try:
with open(path) as f:
if f.read().strip() == "llama-server":
return int(path.split("/")[2])
except (OSError, ValueError):
continue
return None
def get_cpu_times(pid):
    """Read utime + stime from /proc/<pid>/stat. Returns total ticks or None."""
    try:
        with open(f"/proc/{pid}/stat") as f:
            fields = f.read().split(")")[-1].split()
            # fields[11] = utime, fields[12] = stime (0-indexed after ')')
            return int(fields[11]) + int(fields[12])
    except (OSError, IndexError, ValueError):
        return None
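As a sanity check on the tick arithmetic: with the common USER_HZ of 100, a llama-server that accumulates 450 ticks of utime+stime between two polls 3 s apart is using about one and a half cores, comfortably above the 50% default threshold. The numbers here are illustrative:

# Illustrative numbers only.
hz = 100           # typical os.sysconf("SC_CLK_TCK") value on Linux
delta_ticks = 450  # utime + stime ticks accumulated between two polls
dt = 3.0           # seconds between the samples
print((delta_ticks / hz) / dt * 100)  # -> 150.0, i.e. ~1.5 cores busy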
def systemctl(action, unit):
result = subprocess.run(
["systemctl", action, unit],
@@ -51,35 +60,58 @@ def systemctl(action, unit):
def main():
    xmrig_paused = False
    idle_since = None
    prev_ticks = None
    prev_time = None
    hz = os.sysconf("SC_CLK_TCK")
    log(f"Starting: poll={POLL_INTERVAL}s grace={GRACE_PERIOD}s threshold={CPU_THRESHOLD}%")
    while True:
        pid = find_llama_pid()
        if pid is None:
            # llama-server not running
            idle_since = None
            prev_ticks = None
            prev_time = None
            time.sleep(POLL_INTERVAL)
            continue
        ticks = get_cpu_times(pid)
        now = time.monotonic()
        if ticks is None or prev_ticks is None or prev_time is None:
            prev_ticks = ticks
            prev_time = now
            time.sleep(POLL_INTERVAL)
            continue
        dt = now - prev_time
        if dt <= 0:
            prev_ticks = ticks
            prev_time = now
            time.sleep(POLL_INTERVAL)
            continue
        # CPU% = (delta_ticks / hz) / delta_seconds * 100
        cpu_pct = ((ticks - prev_ticks) / hz) / dt * 100
        prev_ticks = ticks
        prev_time = now
        busy = cpu_pct > CPU_THRESHOLD
        if busy:
            idle_since = None
            if not xmrig_paused:
                log(f"llama-server busy ({cpu_pct:.0f}% CPU) — stopping xmrig")
                if systemctl("stop", "xmrig"):
                    xmrig_paused = True
        else:
            if xmrig_paused:
                if idle_since is None:
                    idle_since = now
                elif now - idle_since >= GRACE_PERIOD:
                    log(f"llama-server idle ({cpu_pct:.0f}% CPU) past grace period — starting xmrig")
                    if systemctl("start", "xmrig"):
                        xmrig_paused = False
                        idle_since = None
        time.sleep(POLL_INTERVAL)
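The diff cuts off most of the systemctl() helper above. A minimal sketch of such a wrapper, consistent with the subprocess.run call it starts with; the timeout, the log() call, and the boolean return value are assumptions rather than the committed code:

def systemctl(action, unit):
    # Run e.g. `systemctl stop xmrig`; report success as a boolean so the
    # caller only flips its paused flag when the action actually took.
    result = subprocess.run(
        ["systemctl", action, unit],
        capture_output=True,
        text=True,
        timeout=30,
    )
    if result.returncode != 0:
        log(f"systemctl {action} {unit} failed: {result.stderr.strip()}")
        return False
    return True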