Files
nixos/tests/jellyfin-failure-alert.nix

352 lines
12 KiB
Nix
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
config,
lib,
pkgs,
...
}:
let
# Mock ntfy server script that records POST requests to a JSON log.
mockNtfyScript = pkgs.writeScript "mock-ntfy.py" ''
import json
import os
from http.server import HTTPServer, BaseHTTPRequestHandler
from datetime import datetime
REQUESTS_FILE = "/tmp/ntfy-requests.json"
class MockNtfy(BaseHTTPRequestHandler):
def _respond(self, code=200, body=b"Ok"):
self.send_response(code)
self.send_header("Content-Type", "application/json")
self.end_headers()
self.wfile.write(body if isinstance(body, bytes) else body.encode())
def do_GET(self):
self._respond()
def do_POST(self):
content_length = int(self.headers.get("Content-Length", 0))
body = self.rfile.read(content_length).decode() if content_length > 0 else ""
request_data = {
"timestamp": datetime.now().isoformat(),
"path": self.path,
"headers": dict(self.headers),
"body": body,
}
requests = []
if os.path.exists(REQUESTS_FILE):
try:
with open(REQUESTS_FILE, "r") as f:
requests = json.load(f)
except:
requests = []
requests.append(request_data)
with open(REQUESTS_FILE, "w") as f:
json.dump(requests, f, indent=2)
self._respond()
def log_message(self, format, *args):
pass
HTTPServer(("0.0.0.0", 8080), MockNtfy).serve_forever()
'';
# Jellyfin log lines used in the test.
# Jellyfin log format: [YYYY-MM-DD HH:MM:SS.mmm +TZ] [LEVEL] [thread] Source: message
mkLogLine =
ts: level: thread: source: msg:
"[${ts}] [${level}] [${thread}] ${source}: ${msg}";
# Realistic error lines that should trigger alerts.
transcodeCrash =
mkLogLine "2026-05-12 14:23:01.123 +00:00" "ERR" "42"
"MediaBrowser.MediaEncoding.Transcoding.TranscodeManager"
"FFmpeg exited with code 137 after 45.2 seconds";
playbackError =
mkLogLine "2026-05-12 14:24:05.456 +00:00" "ERR" "17"
"Emby.Server.Implementations.Session.SessionManager"
"Playback error for user \"alice\" on device \"Living Room TV\"";
exceptionMiddleware =
mkLogLine "2026-05-12 14:25:10.789 +00:00" "ERR" "99"
"Jellyfin.Server.Middleware.ExceptionMiddleware"
''Error processing request. URL "GET" "/Videos/a1b2c3d4-e5f6-7890-abcd-ef1234567890/stream".'';
streamAbort =
mkLogLine "2026-05-12 14:26:00.111 +00:00" "ERR" "33"
"MediaBrowser.Api.Playback.Hls.DynamicHlsService"
"Cannot open HLS stream segment";
# Lines that should NOT trigger alerts.
authDenied =
mkLogLine "2026-05-12 14:27:00.222 +00:00" "ERR" "12"
"Jellyfin.Server.Implementations.Users.UserManager"
''Authentication request for "bob" has been denied (IP: "10.0.0.5").'';
libraryScanError =
mkLogLine "2026-05-12 14:28:00.333 +00:00" "ERR" "55" "MediaBrowser.Controller.Entities.BaseItem"
"Error refreshing item metadata for /library/some-broken-file.mkv";
# Below ERR level never triggers.
warnLine =
mkLogLine "2026-05-12 14:29:00.444 +00:00" "WRN" "77"
"MediaBrowser.MediaEncoding.Transcoding.TranscodeManager"
"Slow transcoding detected (0.95x realtime)";
infoLine =
mkLogLine "2026-05-12 14:30:00.555 +00:00" "INF" "88"
"Jellyfin.Server.Middleware.ExceptionMiddleware"
"This is informational and should not alert";
# Log file contents for each scenario.
logWithFailures = pkgs.writeText "jellyfin-failure-log.log" ''
${authDenied}
${libraryScanError}
${warnLine}
${infoLine}
${transcodeCrash}
${playbackError}
${exceptionMiddleware}
${streamAbort}
'';
logWithDedup = pkgs.writeText "jellyfin-dedup-log.log" ''
${transcodeCrash}
${transcodeCrash}
${transcodeCrash}
'';
logNoFailures = pkgs.writeText "jellyfin-clean-log.log" ''
${authDenied}
${libraryScanError}
${warnLine}
${infoLine}
'';
in
pkgs.testers.runNixOSTest {
name = "jellyfin-failure-alert";
nodes.machine =
{ pkgs, ... }:
{
imports = [
../modules/ntfy-alerts.nix
../services/jellyfin/jellyfin-failure-alert.nix
];
system.stateVersion = config.system.stateVersion;
virtualisation.memorySize = 2048;
environment.systemPackages = with pkgs; [
curl
jq
];
# Minimal jellyfin config so the guard passes. Jellyfin 10.11+
# requires 2 GiB free space, so give the VM a 4 GiB disk.
virtualisation.diskSize = 4096;
services.jellyfin = {
enable = true;
dataDir = "/var/lib/jellyfin-test";
cacheDir = "/var/cache/jellyfin-test";
user = "jellyfin";
group = "jellyfin";
};
# Jellyfin base dirs
systemd.tmpfiles.rules = [
"d /var/lib/jellyfin-test 0755 jellyfin jellyfin"
"d /var/lib/jellyfin-test/log 0755 jellyfin jellyfin"
"f /run/ntfy-test-topic 0644 root root - test-alerts"
"f /run/ntfy-test-token 0644 root root - test-token-value"
];
# Mock ntfy server
systemd.services.mock-ntfy = {
description = "Mock ntfy server for jellyfin-failure-alert test";
wantedBy = [ "multi-user.target" ];
before = [ "jellyfin-failure-alert.service" ];
serviceConfig = {
ExecStart = "${pkgs.python3}/bin/python3 ${mockNtfyScript}";
Type = "simple";
};
};
# Configure ntfy-alerts to use mock server
services.ntfyAlerts = {
enable = true;
serverUrl = "http://localhost:8080";
topicFile = "/run/ntfy-test-topic";
tokenFile = "/run/ntfy-test-token";
};
# Speed up polling for the test
systemd.services.jellyfin-failure-alert.environment.POLL_INTERVAL = lib.mkForce "2";
systemd.services.jellyfin-failure-alert.environment.DEDUP_WINDOW = lib.mkForce "10";
};
testScript = ''
import json
import time
LOG_DIR = "/var/lib/jellyfin-test/log"
REQUESTS_FILE = "/tmp/ntfy-requests.json"
start_all()
# Wait for mock ntfy server
machine.wait_for_unit("mock-ntfy.service")
machine.wait_until_succeeds("curl -sf http://localhost:8080/", timeout=30)
# ------------------------------------------------------------------
# Phase 1: Client failures trigger alerts
# ------------------------------------------------------------------
with subtest("Client failure log lines trigger ntfy notifications"):
# Place a log file with known failure lines
machine.succeed(
"cp ${logWithFailures} {}/log_test.log && chown jellyfin:jellyfin {}/log_test.log".format(
LOG_DIR, LOG_DIR
)
)
# Start the monitor
machine.succeed("systemctl start jellyfin-failure-alert.service")
# Wait for the monitor to poll and send notifications.
# Should pick up: transcodeCrash, playbackError, exceptionMiddleware, streamAbort
machine.wait_until_succeeds(
"test -f {} && test $(jq 'length' {}) -ge 4".format(REQUESTS_FILE, REQUESTS_FILE),
timeout=30,
)
result = machine.succeed("cat {}".format(REQUESTS_FILE))
requests = json.loads(result)
print(f"Phase 1: received {len(requests)} ntfy notifications")
assert len(requests) >= 4, f"Expected >= 4 notifications, got {len(requests)}"
# Verify each notification has the expected shape
for req in requests:
assert "/test-alerts" in req["path"], f"Wrong topic path: {req['path']}"
assert "Title" in req["headers"], "Missing Title header"
assert "Jellyfin" in req["headers"]["Title"], (
f"Title should mention Jellyfin: {req['headers']['Title']}"
)
assert req["headers"]["Priority"] == "high", (
f"Expected Priority 'high', got {req['headers'].get('Priority')}"
)
assert req["headers"]["Tags"] == "warning", (
f"Expected Tags 'warning', got {req['headers'].get('Tags')}"
)
assert req["headers"]["Authorization"] == "Bearer test-token-value", (
f"Missing or wrong Authorization header: {req['headers'].get('Authorization')}"
)
assert len(req["body"]) > 0, "Notification body is empty"
# Verify specific error content appears in bodies
bodies = " ".join(r["body"] for r in requests)
assert "FFmpeg" in bodies, "Missing FFmpeg error in notifications"
assert "Playback error" in bodies, "Missing playback error in notifications"
assert "ExceptionMiddleware" in bodies or "/Videos/" in bodies, (
"Missing exception middleware error in notifications"
)
print("Phase 1 passed: all client failures triggered alerts")
# ------------------------------------------------------------------
# Phase 2: Non-client errors are filtered out
# ------------------------------------------------------------------
with subtest("Non-client errors do not trigger alerts"):
# Clear previous requests and add a new log file with only non-client errors
machine.succeed("rm -f {}".format(REQUESTS_FILE))
machine.succeed(
"cp ${logNoFailures} {}/log_clean.log && chown jellyfin:jellyfin {}/log_clean.log".format(
LOG_DIR, LOG_DIR
)
)
# Wait enough poll cycles (at least 8s = 4 cycles at POLL_INTERVAL=2)
time.sleep(8)
# Assert no requests file was created. If it exists, it must be empty
# (any non-empty result means a non-client error leaked through).
rc, _ = machine.execute("test -f {}".format(REQUESTS_FILE))
if rc == 0:
machine.succeed("test $(jq 'length' {}) -eq 0".format(REQUESTS_FILE))
print("Phase 2 passed: non-client errors correctly filtered")
# ------------------------------------------------------------------
# Phase 3: Deduplication
# ------------------------------------------------------------------
with subtest("Duplicate errors are deduplicated within the window"):
# Clear and add a log file with the same error repeated
machine.succeed("rm -f {}".format(REQUESTS_FILE))
machine.succeed(
"cp ${logWithDedup} {}/log_dedup.log && chown jellyfin:jellyfin {}/log_dedup.log".format(
LOG_DIR, LOG_DIR
)
)
# Wait for the monitor to process. Should get exactly 1 notification.
machine.wait_until_succeeds(
"test -f {} && test $(jq 'length' {}) -eq 1".format(REQUESTS_FILE, REQUESTS_FILE),
timeout=30,
)
result = machine.succeed("cat {}".format(REQUESTS_FILE))
requests = json.loads(result)
print(f"Phase 3: received {len(requests)} notifications (expected 1 for dedup)")
assert len(requests) == 1, (
f"Expected exactly 1 notification for 3 identical errors, got {len(requests)}"
)
assert "FFmpeg" in requests[0]["body"], "Missing FFmpeg error in dedup notification"
print("Phase 3 passed: deduplication works correctly")
# ------------------------------------------------------------------
# Phase 4: Re-alert after dedup window expires
# ------------------------------------------------------------------
with subtest("Same error re-alerts after dedup window expires"):
# Wait for dedup window to expire (10s)
time.sleep(12)
# Write the same error again in a new log file
machine.succeed(
"cp ${logWithDedup} {}/log_dedup2.log && chown jellyfin:jellyfin {}/log_dedup2.log".format(
LOG_DIR, LOG_DIR
)
)
# Wait for the monitor to pick it up. Should now have 2 entries.
machine.wait_until_succeeds(
"test -f {} && test $(jq 'length' {}) -eq 2".format(REQUESTS_FILE, REQUESTS_FILE),
timeout=30,
)
result = machine.succeed("cat {}".format(REQUESTS_FILE))
requests = json.loads(result)
print(f"Phase 4: received {len(requests)} notifications (expected 2)")
assert len(requests) == 2, (
f"Expected 2 notifications after dedup window expired, got {len(requests)}"
)
print("Phase 4 passed: re-alert after dedup window works")
print("All jellyfin-failure-alert tests passed!")
'';
}