llama-cpp: xmrig + grafana hooks

This commit is contained in:
2026-04-02 22:47:53 -04:00
parent ab9c12cb97
commit 096ffeb943
8 changed files with 369 additions and 157 deletions

View File

@@ -1,14 +1,42 @@
#!/usr/bin/env python3
"""
Grafana annotation service for llama-cpp inference requests.
Monitors llama-server CPU usage via /proc. Creates a Grafana annotation
when inference starts (CPU spikes), closes it when inference ends.
"""
import glob
import json
import os
import sys
import time
import urllib.request
# Base URL of the llama-server HTTP API (used by the /slots endpoint).
LLAMA_CPP_URL = os.environ.get("LLAMA_CPP_URL", "http://127.0.0.1:6688")
# Base URL of the Grafana instance that receives the annotations.
GRAFANA_URL = os.environ.get("GRAFANA_URL", "http://127.0.0.1:3000")
# Path where open-annotation state is persisted so a restart can close
# an annotation that was opened before the crash/restart.
STATE_FILE = os.environ.get("STATE_FILE", "/var/lib/llama-cpp-annotations/state.json")
# Seconds between /proc CPU samples.
POLL_INTERVAL = int(os.environ.get("POLL_INTERVAL", "5"))
# CPU percentage (of one core) above which llama-server is considered
# to be actively serving an inference request.
CPU_THRESHOLD = float(os.environ.get("CPU_THRESHOLD", "50"))
def find_llama_pid():
    """Return the PID of the first process named "llama-server", or None.

    Scans every /proc/<pid>/comm entry. Processes that disappear while
    we are scanning (OSError) are simply skipped.
    """
    for comm_path in glob.glob("/proc/[0-9]*/comm"):
        try:
            with open(comm_path) as fh:
                name = fh.read().strip()
            if name == "llama-server":
                # Path is /proc/<pid>/comm, so component 2 is the PID.
                return int(comm_path.split("/")[2])
        except (OSError, ValueError):
            continue
    return None
def get_cpu_times(pid):
    """Return utime + stime (in clock ticks) for *pid*, or None on failure.

    Parses /proc/<pid>/stat. Splitting at the last ')' discards the comm
    field, which may itself contain parentheses; after that split,
    indices 11 and 12 hold utime and stime.
    """
    try:
        with open(f"/proc/{pid}/stat") as fh:
            tail = fh.read().rsplit(")", 1)[-1].split()
        return int(tail[11]) + int(tail[12])
    except (OSError, IndexError, ValueError):
        return None
def http_json(method, url, body=None):
@@ -23,19 +51,6 @@ def http_json(method, url, body=None):
return json.loads(resp.read())
def get_slots():
    """Fetch the current slot list from llama-server's /slots endpoint.

    Returns the decoded JSON payload on success. Any failure (network
    error, timeout, bad JSON) is logged to stderr and yields None so the
    poll loop can simply retry on its next tick.
    """
    try:
        request = urllib.request.Request(
            f"{LLAMA_CPP_URL}/slots",
            headers={"Accept": "application/json"},
        )
        with urllib.request.urlopen(request, timeout=5) as response:
            return json.loads(response.read())
    except Exception as e:
        print(f"Error fetching slots: {e}", file=sys.stderr)
        return None
def load_state():
try:
with open(STATE_FILE) as f:
@@ -81,45 +96,58 @@ def grafana_close(grafana_id, end_ms, text=None):
def main():
    """Poll llama-server CPU usage and mirror busy periods as Grafana annotations.

    Every POLL_INTERVAL seconds, sample the llama-server process's
    cumulative utime+stime from /proc via get_cpu_times(). When CPU
    usage since the previous sample exceeds CPU_THRESHOLD, open a
    Grafana annotation; when it drops back below, close it with the
    measured duration. The open annotation is persisted through
    save_state() so a restart can still close it.

    NOTE(review): the rendered diff interleaved the old slot-polling
    loop with this CPU-threshold implementation; this is the coherent
    CPU-based version described by the module docstring. The slot-based
    remnants (get_slots()-driven per-slot annotations) are dropped here.
    """
    state = load_state()
    prev_ticks = None  # cumulative CPU ticks at the previous sample
    prev_time = None   # monotonic timestamp of the previous sample
    hz = os.sysconf("SC_CLK_TCK")  # ticks per second, for tick -> percent conversion
    while True:
        now_ms = int(time.time() * 1000)
        pid = find_llama_pid()
        if pid is None:
            # llama-server is not running: drop the baseline so we never
            # compute a delta against a different (future) process.
            prev_ticks = None
            prev_time = None
            time.sleep(POLL_INTERVAL)
            continue

        ticks = get_cpu_times(pid)
        now = time.monotonic()
        if ticks is None or prev_ticks is None or prev_time is None:
            # First usable sample (or a transient read failure):
            # just (re)establish the baseline.
            prev_ticks = ticks
            prev_time = now
            time.sleep(POLL_INTERVAL)
            continue

        dt = now - prev_time
        if dt <= 0:
            # Defensive: a zero/negative interval would divide by zero.
            prev_ticks = ticks
            prev_time = now
            time.sleep(POLL_INTERVAL)
            continue

        # Percent of one core consumed since the previous sample.
        cpu_pct = ((ticks - prev_ticks) / hz) / dt * 100
        prev_ticks = ticks
        prev_time = now

        busy = cpu_pct > CPU_THRESHOLD
        if busy and "active" not in state:
            # Inference just started: open an annotation in Grafana.
            grafana_id = grafana_post("LLM request", now_ms)
            if grafana_id is not None:
                state["active"] = {
                    "grafana_id": grafana_id,
                    "start_ms": now_ms,
                }
                save_state(state)
        elif not busy and "active" in state:
            # Inference just ended: close the annotation with its duration.
            info = state.pop("active")
            duration_s = (now_ms - info["start_ms"]) / 1000
            text = f"LLM request ({duration_s:.1f}s)"
            grafana_close(info["grafana_id"], now_ms, text)
            save_state(state)

        time.sleep(POLL_INTERVAL)