llama-cpp: add grafana annotations for inference requests

Poll the llama-cpp /slots endpoint, create a Grafana annotation when a slot starts processing, and close it with the token count when the slot completes. Includes a NixOS VM test with mock llama-cpp and Grafana servers. A dashboard annotation entry is also added.
2026-04-02 17:43:49 -04:00
parent 0235617627
commit 9baeaa5c23
6 changed files with 362 additions and 0 deletions
--- a/services/llama-cpp-annotations.py
+++ b/services/llama-cpp-annotations.py
@@ -0,0 +1,127 @@
+#!/usr/bin/env python3
+import json
+import os
+import sys
+import time
+import urllib.request
+
+# Runtime configuration, read from the environment with local defaults.
+# Base URL of the llama-cpp server whose /slots endpoint is polled.
+LLAMA_CPP_URL = os.environ.get("LLAMA_CPP_URL", "http://127.0.0.1:6688")
+# Base URL of the Grafana instance that receives the annotations.
+GRAFANA_URL = os.environ.get("GRAFANA_URL", "http://127.0.0.1:3000")
+# JSON file holding the slot -> open-annotation mapping between runs.
+STATE_FILE = os.environ.get("STATE_FILE", "/var/lib/llama-cpp-annotations/state.json")
+# Seconds to sleep between polls; a non-integer value raises ValueError here.
+POLL_INTERVAL = int(os.environ.get("POLL_INTERVAL", "5"))
+
+
+def http_json(method, url, body=None):
+    """Send an HTTP request and return the JSON-decoded response body.
+
+    When ``body`` is not None it is serialized to JSON and sent as the
+    request payload. Errors from ``urllib`` or from JSON decoding are not
+    handled here and propagate to the caller.
+    """
+    payload = None
+    if body is not None:
+        payload = json.dumps(body).encode()
+    headers = {
+        "Content-Type": "application/json",
+        "Accept": "application/json",
+    }
+    request = urllib.request.Request(
+        url, data=payload, headers=headers, method=method
+    )
+    # The 5 second timeout keeps one unresponsive server from blocking forever.
+    with urllib.request.urlopen(request, timeout=5) as response:
+        raw = response.read()
+    return json.loads(raw)
+
+
+def get_slots():
+    """Fetch the slot list from the llama-cpp ``/slots`` endpoint.
+
+    Returns the decoded list of slot objects, or ``None`` when the request
+    fails or the response is not a JSON list. Failures are logged to stderr
+    instead of being raised, so the caller can simply skip this poll.
+    """
+    try:
+        # Reuse the shared helper rather than duplicating request/decoding code.
+        slots = http_json("GET", f"{LLAMA_CPP_URL}/slots")
+    except Exception as e:
+        # Deliberately broad: any transport or decoding failure means
+        # "no data for this poll", not a fatal error.
+        print(f"Error fetching slots: {e}", file=sys.stderr)
+        return None
+    if not isinstance(slots, list):
+        # The caller iterates the result and indexes each element; anything
+        # other than a list (e.g. an error object) would crash it.
+        print(
+            f"Error fetching slots: unexpected response type {type(slots).__name__}",
+            file=sys.stderr,
+        )
+        return None
+    return slots
+
+
+def load_state():
+    """Load the persisted slot -> annotation mapping from ``STATE_FILE``.
+
+    Returns an empty dict when the file does not exist, cannot be decoded,
+    or does not contain a JSON object. Entries that are not objects carrying
+    a ``grafana_id`` key are dropped, because the polling loop reads that key
+    from every entry when it closes an annotation.
+    """
+    try:
+        with open(STATE_FILE, encoding="utf-8") as f:
+            state = json.load(f)
+    except FileNotFoundError:
+        # No state file yet: start with nothing tracked.
+        return {}
+    except ValueError as e:
+        # Covers json.JSONDecodeError and UnicodeDecodeError (both ValueError).
+        print(f"Ignoring unreadable state file {STATE_FILE}: {e}", file=sys.stderr)
+        return {}
+    if not isinstance(state, dict):
+        print(
+            f"Ignoring state file {STATE_FILE}: expected a JSON object",
+            file=sys.stderr,
+        )
+        return {}
+    return {
+        slot_id: info
+        for slot_id, info in state.items()
+        if isinstance(info, dict) and "grafana_id" in info
+    }
+
+
+def save_state(state):
+    """Persist ``state`` as JSON to ``STATE_FILE``.
+
+    The data is written to a sibling ``.tmp`` file first and then moved over
+    the target with ``os.replace``, so a reader never sees a half-written
+    file. ``OSError`` from the filesystem calls propagates to the caller.
+    """
+    directory = os.path.dirname(STATE_FILE)
+    # dirname is "" when STATE_FILE is a bare filename, and os.makedirs("")
+    # raises FileNotFoundError, so only create the directory when one is named.
+    if directory:
+        os.makedirs(directory, exist_ok=True)
+    tmp = STATE_FILE + ".tmp"
+    with open(tmp, "w", encoding="utf-8") as f:
+        json.dump(state, f)
+    os.replace(tmp, STATE_FILE)
+
+
+def grafana_post(text, start_ms):
+    """Create a Grafana annotation tagged ``llama-cpp`` and return its id.
+
+    ``start_ms`` is sent as the annotation's ``time`` field and ``text`` as
+    its ``text`` field. Returns the ``id`` value from the response, or
+    ``None`` when the request fails (the error is logged to stderr) or the
+    response carries no ``id``.
+    """
+    endpoint = f"{GRAFANA_URL}/api/annotations"
+    payload = {"time": start_ms, "text": text, "tags": ["llama-cpp"]}
+    try:
+        response = http_json("POST", endpoint, payload)
+        annotation_id = response.get("id")
+    except Exception as e:
+        print(f"Error posting annotation: {e}", file=sys.stderr)
+        return None
+    return annotation_id
+
+
+def grafana_close(grafana_id, end_ms, text=None):
+    """Patch an existing Grafana annotation with its end time.
+
+    Sends ``timeEnd`` set to ``end_ms`` and, when ``text`` is not None, a
+    replacement ``text``. Any failure is logged to stderr and swallowed;
+    nothing is returned.
+    """
+    try:
+        patch = {"timeEnd": end_ms}
+        if text is not None:
+            patch["text"] = text
+        endpoint = f"{GRAFANA_URL}/api/annotations/{grafana_id}"
+        http_json("PATCH", endpoint, patch)
+    except Exception as e:
+        print(f"Error closing annotation {grafana_id}: {e}", file=sys.stderr)
+
+
+def _decoded_token_count(slot):
+    """Return the slot's ``next_token.n_decoded`` as an int, or None if absent."""
+    next_token = slot.get("next_token")
+    # Accept the object either directly or wrapped in a list (first element),
+    # so an unexpected shape yields "no count" instead of an AttributeError.
+    if isinstance(next_token, list):
+        next_token = next_token[0] if next_token else None
+    if not isinstance(next_token, dict):
+        return None
+    n_decoded = next_token.get("n_decoded")
+    # bool is a subclass of int; exclude it so True is not reported as 1 token.
+    if isinstance(n_decoded, int) and not isinstance(n_decoded, bool):
+        return n_decoded
+    return None
+
+
+def main():
+    """Poll llama-cpp slots forever and mirror their activity into Grafana.
+
+    On every poll, a slot that reports ``is_processing`` and has no tracked
+    annotation gets a new annotation. Every tracked slot that is no longer
+    processing (or is missing from the response) has its annotation closed,
+    with the decoded-token count appended to the text when it is available.
+    The tracking state is saved after each change. When the slot list cannot
+    be fetched, the poll is skipped and the state is left untouched.
+    """
+    state = load_state()
+
+    while True:
+        now_ms = int(time.time() * 1000)
+        slots = get_slots()
+
+        # Only act on a real list: closing annotations because of a failed or
+        # malformed response would end them prematurely.
+        if isinstance(slots, list):
+            # Index well-formed slots by string id (the state dict is keyed by
+            # str(slot["id"])); entries without an id cannot be tracked.
+            slots_by_id = {
+                str(slot["id"]): slot
+                for slot in slots
+                if isinstance(slot, dict) and "id" in slot
+            }
+
+            # Open annotations for slots that started processing.
+            processing_ids = set()
+            for slot_id, slot in slots_by_id.items():
+                if not slot.get("is_processing", False):
+                    continue
+                processing_ids.add(slot_id)
+                if slot_id in state:
+                    continue
+                text = f"LLM request (slot {slot['id']})"
+                grafana_id = grafana_post(text, now_ms)
+                # On failure nothing is recorded, so the next poll retries.
+                if grafana_id is not None:
+                    state[slot_id] = {
+                        "grafana_id": grafana_id,
+                        "start_ms": now_ms,
+                    }
+                    save_state(state)
+
+            # Close annotations for slots that stopped processing.
+            for slot_id in [k for k in state if k not in processing_ids]:
+                info = state.pop(slot_id)
+                grafana_id = info.get("grafana_id") if isinstance(info, dict) else None
+                if grafana_id is not None:
+                    slot = slots_by_id.get(slot_id)
+                    n_decoded = (
+                        _decoded_token_count(slot) if slot is not None else None
+                    )
+                    text = f"LLM request (slot {slot_id})"
+                    if n_decoded is not None and n_decoded > 0:
+                        text += f" — {n_decoded} tokens"
+                    # grafana_close logs and swallows its own failures, so the
+                    # entry is dropped whether or not the PATCH succeeded.
+                    grafana_close(grafana_id, now_ms, text)
+                save_state(state)
+
+        time.sleep(POLL_INTERVAL)
+
+
+# Start the polling loop only when run as a script, not when imported.
+if __name__ == "__main__":
+    main()