grafana: replace llama-cpp-annotations daemon with prometheus query

This commit is contained in:
2026-04-09 19:19:58 -04:00
parent a3a6700106
commit e9ce1ce0a2
7 changed files with 4 additions and 358 deletions

View File

@@ -1,132 +0,0 @@
# NixOS VM test for the llama-cpp-annotations daemon.
# Spins up a mock Grafana annotations API and a mock "llama-server" process,
# then verifies the daemon opens an annotation when the server gets busy,
# closes it (timeEnd) when it goes idle, and does not duplicate annotations
# after a restart.
{
pkgs,
...
}:
let
# Mock HTTP server standing in for Grafana's /api/annotations endpoint;
# it records annotations into a JSON file the test can read back.
mockGrafana = ./mock-grafana-server.py;
# The daemon under test.
script = ../services/grafana/llama-cpp-annotations.py;
python = pkgs.python3;
# Process that renames itself to "llama-server" (via prctl) and burns CPU
# or sleeps depending on a state file — see mock-llama-server-proc.py.
mockLlamaProcess = ./mock-llama-server-proc.py;
in
pkgs.testers.runNixOSTest {
name = "llama-cpp-annotations";
nodes.machine =
{ pkgs, ... }:
{
# curl drives the mock Grafana API; procps provides pgrep for process checks.
environment.systemPackages = [
pkgs.python3
pkgs.curl
pkgs.procps
];
};
testScript = ''
import json
import time
# Shared constants: mock Grafana port and scratch files used to pass state
# between the mock services, the daemon, and the assertions below.
GRAFANA_PORT = 13000
ANNOTS_FILE = "/tmp/annotations.json"
LLAMA_STATE = "/tmp/llama-state.txt"
STATE_FILE = "/tmp/llama-annot-state.json"
PYTHON = "${python}/bin/python3"
MOCK_GRAFANA = "${mockGrafana}"
MOCK_LLAMA = "${mockLlamaProcess}"
SCRIPT = "${script}"
def read_annotations():
# Returns the annotations recorded by the mock Grafana server ([] if the
# file does not exist yet).
out = machine.succeed(f"cat {ANNOTS_FILE} 2>/dev/null || echo '[]'")
return json.loads(out.strip())
def set_busy():
# Flip the mock llama-server into its CPU-burning mode.
machine.succeed(f"echo busy > {LLAMA_STATE}")
def set_idle():
# Flip the mock llama-server back to sleeping.
machine.succeed(f"echo idle > {LLAMA_STATE}")
start_all()
machine.wait_for_unit("multi-user.target")
with subtest("Start mock services"):
machine.succeed(f"echo '[]' > {ANNOTS_FILE}")
machine.succeed(
f"systemd-run --unit=mock-grafana {PYTHON} {MOCK_GRAFANA} {GRAFANA_PORT} {ANNOTS_FILE}"
)
machine.succeed(
f"systemd-run --unit=mock-llama {PYTHON} {MOCK_LLAMA} {LLAMA_STATE}"
)
# Probe the mock Grafana API with a throwaway annotation until it answers.
machine.wait_until_succeeds(
f"curl -sf http://127.0.0.1:{GRAFANA_PORT}/api/annotations -X POST "
f"-H 'Content-Type: application/json' -d '{{\"text\":\"ping\",\"tags\":[]}}' | grep -q id",
timeout=10,
)
# The mock process renames itself via prctl, so pgrep -x finds it by name.
machine.wait_until_succeeds(
"pgrep -x llama-server",
timeout=10,
)
# Discard the readiness-probe annotation before the real assertions start.
machine.succeed(f"echo '[]' > {ANNOTS_FILE}")
with subtest("Start annotation service"):
# NOTE(review): CPU_THRESHOLD/POLL_INTERVAL semantics belong to the daemon
# script, which is not visible here — presumably percent CPU and seconds.
machine.succeed(
f"systemd-run --unit=llama-annot "
f"--setenv=GRAFANA_URL=http://127.0.0.1:{GRAFANA_PORT} "
f"--setenv=STATE_FILE={STATE_FILE} "
f"--setenv=POLL_INTERVAL=2 "
f"--setenv=CPU_THRESHOLD=10 "
f"{PYTHON} {SCRIPT}"
)
# Give the daemon at least two poll intervals before asserting.
time.sleep(5)
with subtest("No annotations when idle"):
annots = read_annotations()
assert annots == [], f"Expected no annotations, got: {annots}"
with subtest("Annotation created when llama-server becomes busy"):
set_busy()
# Wait for the daemon to notice the CPU spike and post an annotation.
machine.wait_until_succeeds(
f"cat {ANNOTS_FILE} | {PYTHON} -c "
f"\"import sys,json; a=json.load(sys.stdin); exit(0 if a else 1)\"",
timeout=20,
)
annots = read_annotations()
assert len(annots) == 1, f"Expected 1 annotation, got: {annots}"
assert "llama-cpp" in annots[0].get("tags", []), f"Missing tag: {annots[0]}"
assert "LLM request" in annots[0]["text"], f"Missing text: {annots[0]['text']}"
# An open (in-progress) annotation must not carry an end timestamp yet.
assert "timeEnd" not in annots[0], f"timeEnd should not be set: {annots[0]}"
with subtest("Annotation closed when llama-server becomes idle"):
set_idle()
# Wait until the daemon patches the annotation with a timeEnd.
machine.wait_until_succeeds(
f"cat {ANNOTS_FILE} | {PYTHON} -c "
f"\"import sys,json; a=json.load(sys.stdin); exit(0 if a and 'timeEnd' in a[0] else 1)\"",
timeout=20,
)
annots = read_annotations()
assert len(annots) == 1, f"Expected 1, got: {annots}"
assert "timeEnd" in annots[0], f"timeEnd missing: {annots[0]}"
assert annots[0]["timeEnd"] > annots[0]["time"], "timeEnd should be after time"
# Closing is expected to append a "(N s)" duration to the text.
assert "s)" in annots[0].get("text", ""), f"Duration missing: {annots[0]}"
with subtest("State survives restart"):
# Open a second annotation, then restart the daemon mid-annotation: the
# persisted STATE_FILE should prevent it from posting a duplicate.
set_busy()
machine.wait_until_succeeds(
f"cat {ANNOTS_FILE} | {PYTHON} -c "
f"\"import sys,json; a=json.load(sys.stdin); exit(0 if len(a)==2 else 1)\"",
timeout=20,
)
machine.succeed("systemctl stop llama-annot || true")
time.sleep(1)
machine.succeed(
f"systemd-run --unit=llama-annot-2 "
f"--setenv=GRAFANA_URL=http://127.0.0.1:{GRAFANA_PORT} "
f"--setenv=STATE_FILE={STATE_FILE} "
f"--setenv=POLL_INTERVAL=2 "
f"--setenv=CPU_THRESHOLD=10 "
f"{PYTHON} {SCRIPT}"
)
time.sleep(6)
annots = read_annotations()
assert len(annots) == 2, f"Restart should not duplicate, got: {annots}"
'';
}

View File

@@ -1,42 +0,0 @@
#!/usr/bin/env python3
"""
Mock llama-server process for NixOS VM tests.

Renames itself to "llama-server" (prctl PR_SET_NAME sets /proc/self/comm)
so monitoring scripts (llama-cpp-annotations, llama-cpp-xmrig-pause) can
discover this process exactly like the real one.

Usage: python3 mock-llama-server-proc.py <state-file>

The state file controls behavior:
  "busy" -> burn CPU in a tight loop (simulates prompt processing / inference)
  "idle" -> sleep (simulates waiting for requests)
"""
import ctypes
import ctypes.util
import sys
import time

# prctl option PR_SET_NAME: writes the given name into /proc/self/comm.
PR_SET_NAME = 15
# How long each busy burst runs before re-reading the state file.
BUSY_SLICE = 0.1
# How long to sleep per iteration while idle.
IDLE_SLEEP = 0.5


def _rename_process(name):
    """Set this process's comm name via libc prctl so pgrep -x matches it."""
    libc = ctypes.CDLL(ctypes.util.find_library("c"), use_errno=True)
    libc.prctl(PR_SET_NAME, name, 0, 0, 0)


def _read_state(path):
    """Return the state file's contents, treating any read failure as "idle"."""
    try:
        with open(path) as handle:
            return handle.read().strip()
    except Exception:
        return "idle"


def _burn_cpu(seconds):
    """Spin doing throwaway arithmetic for roughly `seconds` of wall time."""
    deadline = time.monotonic() + seconds
    while time.monotonic() < deadline:
        _ = sum(range(10000))


def main():
    state_path = sys.argv[1]
    _rename_process(b"llama-server")
    # Initialize the control file so the first poll sees a known state.
    with open(state_path, "w") as handle:
        handle.write("idle")
    # Poll the control file forever, alternating between burning CPU and sleeping.
    while True:
        if _read_state(state_path) == "busy":
            _burn_cpu(BUSY_SLICE)
        else:
            time.sleep(IDLE_SLEEP)


main()

View File

@@ -28,9 +28,6 @@ in
# zfs scrub annotations test
zfsScrubAnnotationsTest = handleTest ./zfs-scrub-annotations.nix;
# llama-cpp tests
llamaCppAnnotationsTest = handleTest ./llama-cpp-annotations.nix;
# xmrig auto-pause test
xmrigAutoPauseTest = handleTest ./xmrig-auto-pause.nix;
# ntfy alerts test