From e9ce1ce0a24cbc72b3ace3508023cee74c91a271 Mon Sep 17 00:00:00 2001 From: Simon Gardling Date: Thu, 9 Apr 2026 19:19:58 -0400 Subject: [PATCH] grafana: replace llama-cpp-annotations daemon with prometheus query --- services/grafana/dashboard.nix | 11 +- services/grafana/default.nix | 1 - services/grafana/llama-cpp-annotations.nix | 18 --- services/grafana/llama-cpp-annotations.py | 155 --------------------- tests/llama-cpp-annotations.nix | 132 ------------------ tests/mock-llama-server-proc.py | 42 ------ tests/tests.nix | 3 - 7 files changed, 4 insertions(+), 358 deletions(-) delete mode 100644 services/grafana/llama-cpp-annotations.nix delete mode 100644 services/grafana/llama-cpp-annotations.py delete mode 100644 tests/llama-cpp-annotations.nix delete mode 100644 tests/mock-llama-server-proc.py diff --git a/services/grafana/dashboard.nix b/services/grafana/dashboard.nix index 89ea472..7f2ff07 100644 --- a/services/grafana/dashboard.nix +++ b/services/grafana/dashboard.nix @@ -50,15 +50,12 @@ let } { name = "LLM Requests"; - datasource = { - type = "grafana"; - uid = "-- Grafana --"; - }; + datasource = promDs; enable = true; iconColor = "purple"; - showIn = 0; - type = "tags"; - tags = [ "llama-cpp" ]; + expr = "llamacpp:requests_processing > 0"; + step = "10s"; + titleFormat = "LLM inference"; } ]; diff --git a/services/grafana/default.nix b/services/grafana/default.nix index 9985459..b6a4bd7 100644 --- a/services/grafana/default.nix +++ b/services/grafana/default.nix @@ -5,7 +5,6 @@ ./dashboard.nix ./exporters.nix ./jellyfin-annotations.nix - ./llama-cpp-annotations.nix ./zfs-scrub-annotations.nix ]; } diff --git a/services/grafana/llama-cpp-annotations.nix b/services/grafana/llama-cpp-annotations.nix deleted file mode 100644 index f4f6440..0000000 --- a/services/grafana/llama-cpp-annotations.nix +++ /dev/null @@ -1,18 +0,0 @@ -{ - config, - service_configs, - lib, - ... -}: -lib.mkIf (config.services.grafana.enable && config.services.llama-cpp.enable) ( - lib.mkGrafanaAnnotationService { - name = "llama-cpp"; - description = "LLM request annotation service for Grafana"; - script = ./llama-cpp-annotations.py; - after = [ "llama-cpp.service" ]; - environment = { - POLL_INTERVAL = "5"; - CPU_THRESHOLD = "50"; - }; - } -) diff --git a/services/grafana/llama-cpp-annotations.py b/services/grafana/llama-cpp-annotations.py deleted file mode 100644 index 6da2c6c..0000000 --- a/services/grafana/llama-cpp-annotations.py +++ /dev/null @@ -1,155 +0,0 @@ -#!/usr/bin/env python3 -""" -Grafana annotation service for llama-cpp inference requests. - -Monitors llama-server CPU usage via /proc. Creates a Grafana annotation -when inference starts (CPU spikes), closes it when inference ends. -""" - -import glob -import json -import os -import sys -import time -import urllib.request - -GRAFANA_URL = os.environ.get("GRAFANA_URL", "http://127.0.0.1:3000") -STATE_FILE = os.environ.get("STATE_FILE", "/var/lib/llama-cpp-annotations/state.json") -POLL_INTERVAL = int(os.environ.get("POLL_INTERVAL", "5")) -CPU_THRESHOLD = float(os.environ.get("CPU_THRESHOLD", "50")) - - -def find_llama_pid(): - for path in glob.glob("/proc/[0-9]*/comm"): - try: - with open(path) as f: - if f.read().strip() == "llama-server": - return int(path.split("/")[2]) - except (OSError, ValueError): - continue - return None - - -def get_cpu_times(pid): - try: - with open(f"/proc/{pid}/stat") as f: - fields = f.read().split(")")[-1].split() - return int(fields[11]) + int(fields[12]) - except (OSError, IndexError, ValueError): - return None - - -def http_json(method, url, body=None): - data = json.dumps(body).encode() if body is not None else None - req = urllib.request.Request( - url, - data=data, - headers={"Content-Type": "application/json", "Accept": "application/json"}, - method=method, - ) - with urllib.request.urlopen(req, timeout=5) as resp: - return json.loads(resp.read()) - - -def load_state(): - try: - with open(STATE_FILE) as f: - return json.load(f) - except (FileNotFoundError, json.JSONDecodeError): - return {} - - -def save_state(state): - os.makedirs(os.path.dirname(STATE_FILE), exist_ok=True) - tmp = STATE_FILE + ".tmp" - with open(tmp, "w") as f: - json.dump(state, f) - os.replace(tmp, STATE_FILE) - - -def grafana_post(text, start_ms): - try: - result = http_json( - "POST", - f"{GRAFANA_URL}/api/annotations", - {"time": start_ms, "text": text, "tags": ["llama-cpp"]}, - ) - return result.get("id") - except Exception as e: - print(f"Error posting annotation: {e}", file=sys.stderr) - return None - - -def grafana_close(grafana_id, end_ms, text=None): - try: - body = {"timeEnd": end_ms} - if text is not None: - body["text"] = text - http_json( - "PATCH", - f"{GRAFANA_URL}/api/annotations/{grafana_id}", - body, - ) - except Exception as e: - print(f"Error closing annotation {grafana_id}: {e}", file=sys.stderr) - - -def main(): - state = load_state() - prev_ticks = None - prev_time = None - hz = os.sysconf("SC_CLK_TCK") - - while True: - now_ms = int(time.time() * 1000) - pid = find_llama_pid() - - if pid is None: - prev_ticks = None - prev_time = None - time.sleep(POLL_INTERVAL) - continue - - ticks = get_cpu_times(pid) - now = time.monotonic() - - if ticks is None or prev_ticks is None or prev_time is None: - prev_ticks = ticks - prev_time = now - time.sleep(POLL_INTERVAL) - continue - - dt = now - prev_time - if dt <= 0: - prev_ticks = ticks - prev_time = now - time.sleep(POLL_INTERVAL) - continue - - cpu_pct = ((ticks - prev_ticks) / hz) / dt * 100 - prev_ticks = ticks - prev_time = now - - busy = cpu_pct > CPU_THRESHOLD - - if busy and "active" not in state: - grafana_id = grafana_post("LLM request", now_ms) - if grafana_id is not None: - state["active"] = { - "grafana_id": grafana_id, - "start_ms": now_ms, - } - save_state(state) - - elif not busy and "active" in state: - info = state.pop("active") - duration_s = (now_ms - info["start_ms"]) / 1000 - text = f"LLM request ({duration_s:.1f}s)" - grafana_close(info["grafana_id"], now_ms, text) - save_state(state) - - time.sleep(POLL_INTERVAL) - - -if __name__ == "__main__": - main() diff --git a/tests/llama-cpp-annotations.nix b/tests/llama-cpp-annotations.nix deleted file mode 100644 index 4dbc077..0000000 --- a/tests/llama-cpp-annotations.nix +++ /dev/null @@ -1,132 +0,0 @@ -{ - pkgs, - ... -}: -let - mockGrafana = ./mock-grafana-server.py; - script = ../services/grafana/llama-cpp-annotations.py; - python = pkgs.python3; - - mockLlamaProcess = ./mock-llama-server-proc.py; -in -pkgs.testers.runNixOSTest { - name = "llama-cpp-annotations"; - - nodes.machine = - { pkgs, ... }: - { - environment.systemPackages = [ - pkgs.python3 - pkgs.curl - pkgs.procps - ]; - }; - - testScript = '' - import json - import time - - GRAFANA_PORT = 13000 - ANNOTS_FILE = "/tmp/annotations.json" - LLAMA_STATE = "/tmp/llama-state.txt" - STATE_FILE = "/tmp/llama-annot-state.json" - PYTHON = "${python}/bin/python3" - MOCK_GRAFANA = "${mockGrafana}" - MOCK_LLAMA = "${mockLlamaProcess}" - SCRIPT = "${script}" - - def read_annotations(): - out = machine.succeed(f"cat {ANNOTS_FILE} 2>/dev/null || echo '[]'") - return json.loads(out.strip()) - - def set_busy(): - machine.succeed(f"echo busy > {LLAMA_STATE}") - - def set_idle(): - machine.succeed(f"echo idle > {LLAMA_STATE}") - - start_all() - machine.wait_for_unit("multi-user.target") - - with subtest("Start mock services"): - machine.succeed(f"echo '[]' > {ANNOTS_FILE}") - machine.succeed( - f"systemd-run --unit=mock-grafana {PYTHON} {MOCK_GRAFANA} {GRAFANA_PORT} {ANNOTS_FILE}" - ) - machine.succeed( - f"systemd-run --unit=mock-llama {PYTHON} {MOCK_LLAMA} {LLAMA_STATE}" - ) - machine.wait_until_succeeds( - f"curl -sf http://127.0.0.1:{GRAFANA_PORT}/api/annotations -X POST " - f"-H 'Content-Type: application/json' -d '{{\"text\":\"ping\",\"tags\":[]}}' | grep -q id", - timeout=10, - ) - machine.wait_until_succeeds( - "pgrep -x llama-server", - timeout=10, - ) - machine.succeed(f"echo '[]' > {ANNOTS_FILE}") - - with subtest("Start annotation service"): - machine.succeed( - f"systemd-run --unit=llama-annot " - f"--setenv=GRAFANA_URL=http://127.0.0.1:{GRAFANA_PORT} " - f"--setenv=STATE_FILE={STATE_FILE} " - f"--setenv=POLL_INTERVAL=2 " - f"--setenv=CPU_THRESHOLD=10 " - f"{PYTHON} {SCRIPT}" - ) - time.sleep(5) - - with subtest("No annotations when idle"): - annots = read_annotations() - assert annots == [], f"Expected no annotations, got: {annots}" - - with subtest("Annotation created when llama-server becomes busy"): - set_busy() - machine.wait_until_succeeds( - f"cat {ANNOTS_FILE} | {PYTHON} -c " - f"\"import sys,json; a=json.load(sys.stdin); exit(0 if a else 1)\"", - timeout=20, - ) - annots = read_annotations() - assert len(annots) == 1, f"Expected 1 annotation, got: {annots}" - assert "llama-cpp" in annots[0].get("tags", []), f"Missing tag: {annots[0]}" - assert "LLM request" in annots[0]["text"], f"Missing text: {annots[0]['text']}" - assert "timeEnd" not in annots[0], f"timeEnd should not be set: {annots[0]}" - - with subtest("Annotation closed when llama-server becomes idle"): - set_idle() - machine.wait_until_succeeds( - f"cat {ANNOTS_FILE} | {PYTHON} -c " - f"\"import sys,json; a=json.load(sys.stdin); exit(0 if a and 'timeEnd' in a[0] else 1)\"", - timeout=20, - ) - annots = read_annotations() - assert len(annots) == 1, f"Expected 1, got: {annots}" - assert "timeEnd" in annots[0], f"timeEnd missing: {annots[0]}" - assert annots[0]["timeEnd"] > annots[0]["time"], "timeEnd should be after time" - assert "s)" in annots[0].get("text", ""), f"Duration missing: {annots[0]}" - - with subtest("State survives restart"): - set_busy() - machine.wait_until_succeeds( - f"cat {ANNOTS_FILE} | {PYTHON} -c " - f"\"import sys,json; a=json.load(sys.stdin); exit(0 if len(a)==2 else 1)\"", - timeout=20, - ) - machine.succeed("systemctl stop llama-annot || true") - time.sleep(1) - machine.succeed( - f"systemd-run --unit=llama-annot-2 " - f"--setenv=GRAFANA_URL=http://127.0.0.1:{GRAFANA_PORT} " - f"--setenv=STATE_FILE={STATE_FILE} " - f"--setenv=POLL_INTERVAL=2 " - f"--setenv=CPU_THRESHOLD=10 " - f"{PYTHON} {SCRIPT}" - ) - time.sleep(6) - annots = read_annotations() - assert len(annots) == 2, f"Restart should not duplicate, got: {annots}" - ''; -} diff --git a/tests/mock-llama-server-proc.py b/tests/mock-llama-server-proc.py deleted file mode 100644 index 6119372..0000000 --- a/tests/mock-llama-server-proc.py +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env python3 -""" -Mock llama-server process for NixOS VM tests. - -Sets /proc/self/comm to "llama-server" via prctl so that monitoring scripts -(llama-cpp-annotations, llama-cpp-xmrig-pause) can discover this process -the same way they discover the real one. - -Usage: python3 mock-llama-server-proc.py - -The state file controls behavior: - "busy" -> burn CPU in a tight loop (simulates prompt processing / inference) - "idle" -> sleep (simulates waiting for requests) -""" - -import ctypes -import ctypes.util -import sys -import time - -STATE_FILE = sys.argv[1] - -# PR_SET_NAME = 15, sets /proc/self/comm -libc = ctypes.CDLL(ctypes.util.find_library("c"), use_errno=True) -libc.prctl(15, b"llama-server", 0, 0, 0) - -with open(STATE_FILE, "w") as f: - f.write("idle") - -while True: - try: - with open(STATE_FILE) as f: - state = f.read().strip() - except Exception: - state = "idle" - - if state == "busy": - end = time.monotonic() + 0.1 - while time.monotonic() < end: - _ = sum(range(10000)) - else: - time.sleep(0.5) diff --git a/tests/tests.nix b/tests/tests.nix index c5e6cd4..8493569 100644 --- a/tests/tests.nix +++ b/tests/tests.nix @@ -28,9 +28,6 @@ in # zfs scrub annotations test zfsScrubAnnotationsTest = handleTest ./zfs-scrub-annotations.nix; - # llama-cpp tests - llamaCppAnnotationsTest = handleTest ./llama-cpp-annotations.nix; - # xmrig auto-pause test xmrigAutoPauseTest = handleTest ./xmrig-auto-pause.nix; # ntfy alerts test