#!/usr/bin/env python3
"""One-shot collector: sample intel_gpu_top and write Prometheus textfile metrics.

Intended to be run periodically (e.g. by a systemd timer); writes a .prom file
for the node_exporter textfile collector.
"""
import json
import os
import subprocess
import sys
import time

# Destination .prom file for the node_exporter textfile collector.
# Overridable via the environment so the systemd unit (and tests) can
# redirect the output.
TEXTFILE = os.environ.get(
    "TEXTFILE",
    "/var/lib/prometheus-node-exporter-textfiles/intel-gpu.prom",
)


def read_one_sample():
    """Return the first complete JSON object emitted by ``intel_gpu_top -J``.

    intel_gpu_top streams JSON continuously; stdout is read byte-by-byte while
    tracking brace depth until the first top-level object closes, then the
    child process is terminated. Returns the parsed dict, or None if no
    complete, parseable object was read within the 5-second deadline.

    NOTE(review): brace counting does not track JSON string context, so a
    brace inside a string value would desynchronise the parser; intel_gpu_top
    output is not expected to contain such values.
    """
    proc = subprocess.Popen(
        ["intel_gpu_top", "-J", "-s", "1000"],
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL,
    )
    buf = b""
    depth = 0
    in_obj = False
    deadline = time.monotonic() + 5.0
    try:
        while time.monotonic() < deadline:
            byte = proc.stdout.read(1)
            if not byte:
                # Child exited or closed stdout.
                break
            if byte == b"{":
                in_obj = True
                depth += 1
            if in_obj:
                buf += byte
            if in_obj and byte == b"}":
                depth -= 1
                if depth == 0:
                    break
    finally:
        proc.terminate()
        proc.wait()
    if not buf:
        return None
    try:
        return json.loads(buf)
    except ValueError:
        # Deadline hit mid-object: treat a truncated buffer as "no sample"
        # instead of crashing the oneshot unit with a traceback.
        return None


def _escape_label(value):
    """Escape a Prometheus label value per the text exposition format."""
    return str(value).replace("\\", "\\\\").replace('"', '\\"')


def write_metrics(sample, path=None):
    """Render *sample* as Prometheus text-format metrics and write them atomically.

    ``path`` defaults to TEXTFILE. The file is written via a ``.tmp`` sibling
    and ``os.replace`` so node_exporter never observes a partial file.
    Missing keys in *sample* default to 0 rather than raising.
    """
    if path is None:
        path = TEXTFILE
    lines = [
        "# HELP intel_gpu_engine_busy_percent Intel GPU engine busy percentage",
        "# TYPE intel_gpu_engine_busy_percent gauge",
    ]
    for engine, data in sample.get("engines", {}).items():
        lines.append(
            f'intel_gpu_engine_busy_percent{{engine="{_escape_label(engine)}"}}'
            f' {data.get("busy", 0)}'
        )
    freq = sample.get("frequency", {})
    lines += [
        "# HELP intel_gpu_frequency_mhz Intel GPU actual frequency in MHz",
        "# TYPE intel_gpu_frequency_mhz gauge",
        f'intel_gpu_frequency_mhz {freq.get("actual", 0)}',
        "# HELP intel_gpu_rc6_percent Intel GPU RC6 power-saving state percentage",
        "# TYPE intel_gpu_rc6_percent gauge",
        f'intel_gpu_rc6_percent {sample.get("rc6", {}).get("value", 0)}',
    ]

    tmp = path + ".tmp"
    with open(tmp, "w") as f:
        f.write("\n".join(lines) + "\n")
    os.replace(tmp, path)


def main():
    """Take one GPU sample and write it; exit 1 if no sample could be read."""
    sample = read_one_sample()
    if sample is None:
        print("Failed to read intel_gpu_top sample", file=sys.stderr)
        sys.exit(1)
    write_metrics(sample)


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
"""Poll Jellyfin for active streams and mirror them as Grafana annotations.

Each playing session gets an annotation: created with a descriptive label when
the stream appears, and closed (``timeEnd``) when it disappears. The
session-id -> annotation-id mapping is persisted in STATE_FILE so a service
restart does not create duplicate annotations.
"""
import json
import os
import sys
import time
import urllib.request
from pathlib import Path

JELLYFIN_URL = os.environ.get("JELLYFIN_URL", "http://127.0.0.1:8096")
GRAFANA_URL = os.environ.get("GRAFANA_URL", "http://127.0.0.1:3000")
STATE_FILE = os.environ.get("STATE_FILE", "/var/lib/jellyfin-annotations/state.json")
POLL_INTERVAL = int(os.environ.get("POLL_INTERVAL", "30"))


def get_api_key():
    """Return the Jellyfin API key from systemd credentials or agenix, or exit.

    Prefers the systemd ``LoadCredential`` directory; falls back to the agenix
    secret path. Exits the process if no key source is available.
    """
    cred_dir = os.environ.get("CREDENTIALS_DIRECTORY")
    if cred_dir:
        return Path(cred_dir, "jellyfin-api-key").read_text().strip()
    for p in ["/run/agenix/jellyfin-api-key"]:
        if Path(p).exists():
            return Path(p).read_text().strip()
    sys.exit("ERROR: Cannot find jellyfin-api-key")


def http_json(method, url, body=None):
    """Issue an HTTP request with an optional JSON body; return the parsed JSON reply."""
    data = json.dumps(body).encode() if body is not None else None
    req = urllib.request.Request(
        url,
        data=data,
        headers={"Content-Type": "application/json", "Accept": "application/json"},
        method=method,
    )
    with urllib.request.urlopen(req, timeout=5) as resp:
        return json.loads(resp.read())


def get_active_sessions(api_key):
    """Return Jellyfin sessions that are currently playing something.

    Returns None (not []) on any error so the caller can distinguish
    "nothing playing" from "Jellyfin unreachable" and skip the poll cycle.
    """
    try:
        req = urllib.request.Request(
            f"{JELLYFIN_URL}/Sessions?api_key={api_key}",
            headers={"Accept": "application/json"},
        )
        with urllib.request.urlopen(req, timeout=5) as resp:
            sessions = json.loads(resp.read())
        return [s for s in sessions if s.get("NowPlayingItem")]
    except Exception as e:
        print(f"Error fetching sessions: {e}", file=sys.stderr)
        return None


def format_label(session):
    """Build a human-readable annotation label for a playing session."""
    user = session.get("UserName", "Unknown")
    item = session.get("NowPlayingItem", {}) or {}
    name = item.get("Name", "Unknown")
    series = item.get("SeriesName", "")
    season = item.get("ParentIndexNumber")
    episode = item.get("IndexNumber")
    media_type = item.get("Type", "Unknown")

    # Compare against None explicitly: season 0 ("Specials" in Jellyfin) and
    # episode 0 are valid values but falsy, so a plain truthiness check would
    # mislabel them.
    if series and season is not None and episode is not None:
        return f"{user}: {series} S{season:02d}E{episode:02d} - {name}"
    elif series:
        return f"{user}: {series} - {name}"
    elif media_type == "Movie":
        return f"{user}: {name} (movie)"
    return f"{user}: {name}"


def load_state():
    """Load the persisted session->annotation map; empty dict if absent/corrupt."""
    try:
        with open(STATE_FILE) as f:
            return json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        return {}


def save_state(state):
    """Atomically persist *state* (write to .tmp then os.replace)."""
    os.makedirs(os.path.dirname(STATE_FILE), exist_ok=True)
    tmp = STATE_FILE + ".tmp"
    with open(tmp, "w") as f:
        json.dump(state, f)
    os.replace(tmp, STATE_FILE)


def grafana_post(label, start_ms):
    """Create a Grafana annotation; return its id, or None on error."""
    try:
        result = http_json(
            "POST",
            f"{GRAFANA_URL}/api/annotations",
            {"time": start_ms, "text": label, "tags": ["jellyfin"]},
        )
        return result.get("id")
    except Exception as e:
        print(f"Error posting annotation: {e}", file=sys.stderr)
        return None


def grafana_close(grafana_id, end_ms):
    """Set timeEnd on an existing annotation, turning it into a region."""
    try:
        http_json(
            "PATCH",
            f"{GRAFANA_URL}/api/annotations/{grafana_id}",
            {"timeEnd": end_ms},
        )
    except Exception as e:
        print(f"Error closing annotation {grafana_id}: {e}", file=sys.stderr)


def main():
    """Poll loop: open annotations for new streams, close them for ended ones."""
    api_key = get_api_key()
    state = load_state()

    while True:
        now_ms = int(time.time() * 1000)
        sessions = get_active_sessions(api_key)

        # On fetch error, sessions is None: skip this cycle entirely so a
        # transient Jellyfin outage does not close all open annotations.
        if sessions is not None:
            current_ids = {s["Id"] for s in sessions}

            for s in sessions:
                sid = s["Id"]
                if sid not in state:
                    label = format_label(s)
                    grafana_id = grafana_post(label, now_ms)
                    # On post failure the session stays out of state and is
                    # retried on the next poll.
                    if grafana_id is not None:
                        state[sid] = {
                            "grafana_id": grafana_id,
                            "label": label,
                            "start_ms": now_ms,
                        }
                        save_state(state)

            for sid in [k for k in state if k not in current_ids]:
                info = state.pop(sid)
                grafana_close(info["grafana_id"], now_ms)
                save_state(state)

        time.sleep(POLL_INTERVAL)


if __name__ == "__main__":
    main()
legendFormat = "{{engine}}"; + refId = "A"; + } + ]; + fieldConfig = { + defaults = { + unit = "percent"; + min = 0; + max = 100; + color.mode = "palette-classic"; + custom = { + lineWidth = 2; + fillOpacity = 10; + spanNulls = true; + }; + }; + overrides = [ ]; + }; + } + { + id = 9; + type = "stat"; + title = "GPU Frequency"; + gridPos = { + h = 4; + w = 6; + x = 18; + y = 16; + }; + datasource = promDs; + targets = [ + { + datasource = promDs; + expr = "intel_gpu_frequency_mhz"; + refId = "A"; + } + ]; + fieldConfig = { + defaults = { + unit = "megahertz"; + thresholds = { + mode = "absolute"; + steps = [ + { + color = "green"; + value = null; + } + ]; + }; + }; + overrides = [ ]; + }; + options = { + reduceOptions = { + calcs = [ "lastNotNull" ]; + fields = ""; + values = false; + }; + colorMode = "value"; + graphMode = "area"; + }; + } + { + id = 10; + type = "stat"; + title = "GPU RC6 (idle)"; + gridPos = { + h = 4; + w = 6; + x = 18; + y = 20; + }; + datasource = promDs; + targets = [ + { + datasource = promDs; + expr = "intel_gpu_rc6_percent"; + refId = "A"; + } + ]; + fieldConfig = { + defaults = { + unit = "percent"; + min = 0; + max = 100; + thresholds = { + mode = "absolute"; + steps = [ + { + color = "blue"; + value = null; + } + { + color = "green"; + value = 50; + } + ]; + }; + }; + overrides = [ ]; + }; + options = { + reduceOptions = { + calcs = [ "lastNotNull" ]; + fields = ""; + values = false; + }; + colorMode = "value"; + graphMode = "none"; + }; + } ]; }; in @@ -500,7 +654,6 @@ in root_url = "https://${service_configs.grafana.domain}"; }; - # Caddy handles auth -- disable Grafana login entirely "auth.anonymous" = { enabled = true; org_role = "Admin"; @@ -539,21 +692,17 @@ in }; }; - # Provision dashboard JSON environment.etc."grafana-dashboards/system-overview.json" = { text = builtins.toJSON dashboard; mode = "0444"; }; - # Caddy reverse proxy with auth services.caddy.virtualHosts."${service_configs.grafana.domain}".extraConfig = '' import 
${config.age.secrets.caddy_auth.path} reverse_proxy :${builtins.toString service_configs.ports.private.grafana.port} ''; - # -- Jellyfin metrics collector -- - # Queries the Jellyfin API for active streams and writes a .prom file - # for the node_exporter textfile collector. + # -- Jellyfin active-stream prometheus textfile collector -- systemd.services.jellyfin-metrics-collector = { description = "Collect Jellyfin metrics for Prometheus"; after = [ "network.target" ]; @@ -572,7 +721,24 @@ in }; }; - # Ensure textfile collector directory exists (tmpfs root -- recreated on boot) + # -- Intel GPU textfile collector -- + systemd.services.intel-gpu-collector = { + description = "Collect Intel GPU metrics for Prometheus"; + serviceConfig = { + Type = "oneshot"; + ExecStart = lib.getExe intelGpuCollector; + }; + environment.TEXTFILE = "${textfileDir}/intel-gpu.prom"; + }; + + systemd.timers.intel-gpu-collector = { + wantedBy = [ "timers.target" ]; + timerConfig = { + OnCalendar = "*:*:0/30"; + RandomizedDelaySec = "10s"; + }; + }; + systemd.tmpfiles.rules = [ "d ${textfileDir} 0755 root root -" ]; diff --git a/tests/jellyfin-annotations.nix b/tests/jellyfin-annotations.nix new file mode 100644 index 0000000..ae71865 --- /dev/null +++ b/tests/jellyfin-annotations.nix @@ -0,0 +1,213 @@ +{ + lib, + pkgs, + ... 
import http.server, json, os, sys
from urllib.parse import urlparse

# Mode ("jellyfin" or "grafana"), listen port, and backing JSON file are
# supplied on the command line by the test driver.
MODE = sys.argv[1]
PORT = int(sys.argv[2])
DATA_FILE = sys.argv[3]


def _load_data(fallback):
    """Best-effort read of the backing JSON file; return *fallback* on any error."""
    try:
        with open(DATA_FILE) as fh:
            return json.load(fh)
    except Exception:
        return fallback


class Handler(http.server.BaseHTTPRequestHandler):
    """Minimal stand-in for the Jellyfin /Sessions and Grafana annotations APIs."""

    def log_message(self, fmt, *args):
        # Silence per-request logging to keep the test output readable.
        pass

    def _read_body(self):
        size = int(self.headers.get("Content-Length", 0))
        if not size:
            return {}
        return json.loads(self.rfile.read(size))

    def _json(self, code, body):
        payload = json.dumps(body).encode()
        self.send_response(code)
        self.send_header("Content-Type", "application/json")
        self.end_headers()
        self.wfile.write(payload)

    def _not_found(self):
        self.send_response(404)
        self.end_headers()

    def do_GET(self):
        # Jellyfin mode serves the sessions list from the backing file.
        if MODE != "jellyfin" or not self.path.startswith("/Sessions"):
            self._not_found()
            return
        self._json(200, _load_data([]))

    def do_POST(self):
        # Grafana mode: append a new annotation, assigning a 1-based id.
        if MODE != "grafana" or self.path != "/api/annotations":
            self._not_found()
            return
        body = self._read_body()
        annotations = _load_data([])
        aid = len(annotations) + 1
        body["id"] = aid
        annotations.append(body)
        with open(DATA_FILE, "w") as fh:
            json.dump(annotations, fh)
        self._json(200, {"id": aid, "message": "Annotation added"})

    def do_PATCH(self):
        # Grafana mode: merge the request body into the matching annotation.
        if MODE != "grafana" or not self.path.startswith("/api/annotations/"):
            self._not_found()
            return
        aid = int(self.path.rsplit("/", 1)[-1])
        body = self._read_body()
        annotations = _load_data([])
        for entry in annotations:
            if entry["id"] == aid:
                entry.update(body)
        with open(DATA_FILE, "w") as fh:
            json.dump(annotations, fh)
        self._json(200, {"message": "Annotation patched"})


http.server.HTTPServer(("127.0.0.1", PORT), Handler).serve_forever()
../services/jellyfin-annotations.py; + python = pkgs.python3; +in +pkgs.testers.runNixOSTest { + name = "jellyfin-annotations"; + + nodes.machine = + { pkgs, ... }: + { + environment.systemPackages = [ pkgs.python3 ]; + }; + + testScript = '' + import json + import time + + JELLYFIN_PORT = 18096 + GRAFANA_PORT = 13000 + SESSIONS_FILE = "/tmp/sessions.json" + ANNOTS_FILE = "/tmp/annotations.json" + STATE_FILE = "/tmp/annotations-state.json" + CREDS_DIR = "/tmp/test-creds" + PYTHON = "${python}/bin/python3" + MOCK = "${mockServer}" + SCRIPT = "${script}" + + def read_annotations(): + out = machine.succeed(f"cat {ANNOTS_FILE} 2>/dev/null || echo '[]'") + return json.loads(out.strip()) + + start_all() + machine.wait_for_unit("multi-user.target") + + with subtest("Setup mock credentials and data files"): + machine.succeed(f"mkdir -p {CREDS_DIR} && echo 'fake-api-key' > {CREDS_DIR}/jellyfin-api-key") + machine.succeed(f"echo '[]' > {SESSIONS_FILE}") + machine.succeed(f"echo '[]' > {ANNOTS_FILE}") + + with subtest("Start mock Jellyfin and Grafana servers"): + machine.succeed( + f"systemd-run --unit=mock-jellyfin {PYTHON} {MOCK} jellyfin {JELLYFIN_PORT} {SESSIONS_FILE}" + ) + machine.succeed( + f"systemd-run --unit=mock-grafana {PYTHON} {MOCK} grafana {GRAFANA_PORT} {ANNOTS_FILE}" + ) + machine.wait_until_succeeds( + f"curl -sf http://127.0.0.1:{JELLYFIN_PORT}/Sessions", timeout=10 + ) + machine.wait_until_succeeds( + f"curl -sf -X POST http://127.0.0.1:{GRAFANA_PORT}/api/annotations " + f"-H 'Content-Type: application/json' -d '{{\"text\":\"ping\",\"tags\":[]}}' | grep -q id", + timeout=10, + ) + machine.succeed(f"echo '[]' > {ANNOTS_FILE}") + + with subtest("Start annotation service pointing at mock servers"): + machine.succeed( + f"systemd-run --unit=annotations-svc " + f"--setenv=JELLYFIN_URL=http://127.0.0.1:{JELLYFIN_PORT} " + f"--setenv=GRAFANA_URL=http://127.0.0.1:{GRAFANA_PORT} " + f"--setenv=CREDENTIALS_DIRECTORY={CREDS_DIR} " + f"--setenv=STATE_FILE={STATE_FILE} 
" + f"--setenv=POLL_INTERVAL=3 " + f"{PYTHON} {SCRIPT}" + ) + time.sleep(2) + + with subtest("No annotations pushed when no streams active"): + time.sleep(4) + annots = read_annotations() + assert annots == [], f"Expected no annotations, got: {annots}" + + with subtest("Annotation created when stream starts"): + machine.succeed( + f"""echo '[{{"Id":"sess-1","UserName":"alice","NowPlayingItem":{{"Name":"Inception","Type":"Movie"}}}}]' > {SESSIONS_FILE}""" + ) + machine.wait_until_succeeds( + f"cat {ANNOTS_FILE} | python3 -c \"import sys,json; a=json.load(sys.stdin); exit(0 if a else 1)\"", + timeout=15, + ) + annots = read_annotations() + assert len(annots) == 1, f"Expected 1 annotation, got: {annots}" + assert annots[0]["text"] == "alice: Inception (movie)", f"Unexpected label: {annots[0]}" + assert "jellyfin" in annots[0].get("tags", []), f"Missing jellyfin tag: {annots[0]}" + assert "timeEnd" not in annots[0], f"timeEnd should not be set yet: {annots[0]}" + + with subtest("Annotation closed when stream ends"): + machine.succeed(f"echo '[]' > {SESSIONS_FILE}") + machine.wait_until_succeeds( + f"cat {ANNOTS_FILE} | python3 -c \"import sys,json; a=json.load(sys.stdin); exit(0 if a and 'timeEnd' in a[0] else 1)\"", + timeout=15, + ) + annots = read_annotations() + assert len(annots) == 1, f"Expected 1 annotation, got: {annots}" + assert "timeEnd" in annots[0], f"timeEnd should be set: {annots[0]}" + assert annots[0]["timeEnd"] > annots[0]["time"], "timeEnd should be after time" + + with subtest("Multiple concurrent streams each get their own annotation"): + machine.succeed(f"echo '[]' > {ANNOTS_FILE}") + machine.succeed( + f"""echo '[ + {{"Id":"sess-2","UserName":"bob","NowPlayingItem":{{"Name":"Breaking Bad","SeriesName":"Breaking Bad","ParentIndexNumber":1,"IndexNumber":1}}}}, + {{"Id":"sess-3","UserName":"carol","NowPlayingItem":{{"Name":"Inception","Type":"Movie"}}}} + ]' > {SESSIONS_FILE}""" + ) + machine.wait_until_succeeds( + f"cat {ANNOTS_FILE} | python3 -c 
\"import sys,json; a=json.load(sys.stdin); exit(0 if len(a)==2 else 1)\"", + timeout=15, + ) + annots = read_annotations() + assert len(annots) == 2, f"Expected 2 annotations, got: {annots}" + labels = sorted(a["text"] for a in annots) + assert labels[0] == "bob: Breaking Bad S01E01 - Breaking Bad", f"Unexpected: {labels[0]}" + assert labels[1] == "carol: Inception (movie)", f"Unexpected: {labels[1]}" + + with subtest("State survives service restart (no duplicate annotations)"): + machine.succeed("systemctl stop annotations-svc || true") + time.sleep(1) + machine.succeed( + f"systemd-run --unit=annotations-svc-2 " + f"--setenv=JELLYFIN_URL=http://127.0.0.1:{JELLYFIN_PORT} " + f"--setenv=GRAFANA_URL=http://127.0.0.1:{GRAFANA_PORT} " + f"--setenv=CREDENTIALS_DIRECTORY={CREDS_DIR} " + f"--setenv=STATE_FILE={STATE_FILE} " + f"--setenv=POLL_INTERVAL=3 " + f"{PYTHON} {SCRIPT}" + ) + time.sleep(6) + annots = read_annotations() + assert len(annots) == 2, f"Restart should not create duplicates, got: {annots}" + ''; +} diff --git a/tests/tests.nix b/tests/tests.nix index 44b1db0..59906f9 100644 --- a/tests/tests.nix +++ b/tests/tests.nix @@ -22,6 +22,9 @@ in fail2banImmichTest = handleTest ./fail2ban-immich.nix; fail2banJellyfinTest = handleTest ./fail2ban-jellyfin.nix; + # jellyfin annotation service test + jellyfinAnnotationsTest = handleTest ./jellyfin-annotations.nix; + # ntfy alerts test ntfyAlertsTest = handleTest ./ntfy-alerts.nix;