llama-cpp: xmrig + grafana hooks

2026-04-02 22:47:53 -04:00
parent ab9c12cb97
commit 096ffeb943
8 changed files with 369 additions and 157 deletions
--- a/services/llama-cpp-xmrig-pause.py
+++ b/services/llama-cpp-xmrig-pause.py
@@ -2,42 +2,51 @@
 """
 Pause xmrig while llama-cpp is processing inference requests.

-Polls llama-cpp /slots endpoint. When any slot is busy, stops xmrig.
-When all slots are idle for GRACE_PERIOD seconds, restarts xmrig.
-If llama-cpp is unreachable, does nothing (leaves xmrig in its current state).
+Finds the llama-server process and measures its CPU usage from utime + stime
+in /proc/<pid>/stat. When usage exceeds CPU_THRESHOLD, stops xmrig. Once usage
+stays at or below the threshold for GRACE_PERIOD seconds, restarts xmrig.
 """

-import json
+import glob
 import os
 import subprocess
 import sys
 import time
-import urllib.request

-LLAMA_CPP_URL = os.environ["LLAMA_CPP_URL"].rstrip("/")
 POLL_INTERVAL = int(os.environ.get("POLL_INTERVAL", "3"))
 GRACE_PERIOD = float(os.environ.get("GRACE_PERIOD", "10"))
+# Percent of ONE core (100 = one core fully used; multi-core load exceeds 100) above
+# which llama-server counts as busy. Idle is ~0% CPU; active inference saturates multiple cores.
+CPU_THRESHOLD = float(os.environ.get("CPU_THRESHOLD", "50"))


 def log(msg):
    print(f"[llama-cpp-xmrig-pause] {msg}", file=sys.stderr, flush=True)


-def get_slots():
-    """Fetch /slots from llama-cpp. Returns list of slot dicts, or None on error."""
-    req = urllib.request.Request(f"{LLAMA_CPP_URL}/slots")
+def find_llama_pid():
+    """Find the PID of the llama-server process."""
+    for path in glob.glob("/proc/[0-9]*/comm"):
+        try:
+            with open(path) as f:
+                if f.read().strip() == "llama-server":
+                    return int(path.split("/")[2])
+        except (OSError, ValueError):
+            continue
+    return None
+
+
+def get_cpu_times(pid):
+    """Read utime + stime from /proc/<pid>/stat. Returns total ticks or None."""
    try:
-        with urllib.request.urlopen(req, timeout=5) as resp:
-            return json.loads(resp.read())
-    except (urllib.error.URLError, OSError, json.JSONDecodeError, ValueError) as exc:
-        log(f"Cannot reach llama-cpp: {exc}")
+        with open(f"/proc/{pid}/stat") as f:
+            fields = f.read().split(")")[-1].split()
+            # Text after the last ')' starts at state (fields[0]); utime = fields[11], stime = fields[12].
+            return int(fields[11]) + int(fields[12])
+    except (OSError, IndexError, ValueError):
        return None


-def any_slot_busy(slots):
-    return any(s.get("is_processing", False) for s in slots)
-
-
 def systemctl(action, unit):
    result = subprocess.run(
        ["systemctl", action, unit],
@@ -51,35 +60,58 @@ def systemctl(action, unit):

 def main():
    xmrig_paused = False
-    idle_since = None  # monotonic timestamp when slots first went idle
+    idle_since = None
+    prev_ticks = None
+    prev_time = None
+    hz = os.sysconf("SC_CLK_TCK")

-    log(f"Starting: url={LLAMA_CPP_URL} poll={POLL_INTERVAL}s grace={GRACE_PERIOD}s")
+    log(f"Starting: poll={POLL_INTERVAL}s grace={GRACE_PERIOD}s threshold={CPU_THRESHOLD}%")

    while True:
-        slots = get_slots()
-
-        if slots is None:
-            # llama-cpp unreachable — leave xmrig alone, reset idle timer
+        pid = find_llama_pid()
+        if pid is None:
+            # llama-server not running — leave xmrig as-is and reset the sampling state
            idle_since = None
+            prev_ticks = None
+            prev_time = None
            time.sleep(POLL_INTERVAL)
            continue

-        busy = any_slot_busy(slots)
+        ticks = get_cpu_times(pid)
+        now = time.monotonic()
+
+        if ticks is None or prev_ticks is None or prev_time is None:
+            prev_ticks = ticks
+            prev_time = now
+            time.sleep(POLL_INTERVAL)
+            continue
+
+        dt = now - prev_time
+        if dt <= 0:
+            prev_ticks = ticks
+            prev_time = now
+            time.sleep(POLL_INTERVAL)
+            continue
+
+        # CPU% = (delta_ticks / hz) / delta_seconds * 100
+        cpu_pct = ((ticks - prev_ticks) / hz) / dt * 100
+        prev_ticks = ticks
+        prev_time = now
+
+        busy = cpu_pct > CPU_THRESHOLD

        if busy:
            idle_since = None
            if not xmrig_paused:
-                log("Slot busy — stopping xmrig")
+                log(f"llama-server busy ({cpu_pct:.0f}% CPU) — stopping xmrig")
                if systemctl("stop", "xmrig"):
                    xmrig_paused = True
        else:
-            # All slots idle
            if xmrig_paused:
-                now = time.monotonic()
                if idle_since is None:
                    idle_since = now
                elif now - idle_since >= GRACE_PERIOD:
-                    log("Slots idle past grace period — starting xmrig")
+                    log(f"llama-server idle ({cpu_pct:.0f}% CPU) past grace period — starting xmrig")
                    if systemctl("start", "xmrig"):
                        xmrig_paused = False
                    idle_since = None