jellyfin: alert on failure
This commit is contained in:
@@ -3,5 +3,6 @@
|
|||||||
./jellyfin.nix
|
./jellyfin.nix
|
||||||
./jellyfin-qbittorrent-monitor.nix
|
./jellyfin-qbittorrent-monitor.nix
|
||||||
./jellyfin-deploy-guard.nix
|
./jellyfin-deploy-guard.nix
|
||||||
|
./jellyfin-failure-alert.nix
|
||||||
];
|
];
|
||||||
}
|
}
|
||||||
|
|||||||
65
services/jellyfin/jellyfin-failure-alert.nix
Normal file
65
services/jellyfin/jellyfin-failure-alert.nix
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
# NixOS module: runs jellyfin-failure-alert.py as a hardened systemd service
# that watches Jellyfin's log directory and pushes ntfy notifications.
# The service is only defined when both Jellyfin and ntfyAlerts are enabled.
{
  config,
  lib,
  pkgs,
  ...
}:
let
  jfCfg = config.services.jellyfin;
  ntfyCfg = config.services.ntfyAlerts;
in
lib.mkIf (jfCfg.enable && ntfyCfg.enable) {
  systemd.services.jellyfin-failure-alert = {
    description = "Monitor Jellyfin logs for client playback failures and alert via ntfy";
    after = [
      "network.target"
      "jellyfin.service"
    ];
    wants = [ "jellyfin.service" ];
    wantedBy = [ "multi-user.target" ];

    serviceConfig = {
      Type = "simple";
      ExecStart = pkgs.writeShellScript "jellyfin-failure-alert-start" ''
        set -euo pipefail
        # Assign first, export second: a combined `export VAR=$(cmd)` returns
        # the status of `export`, so a failed credential read would slip past
        # `set -e` and start the monitor with an empty topic.
        NTFY_TOPIC=$(tr -d '[:space:]' < "$CREDENTIALS_DIRECTORY/ntfy-topic")
        export NTFY_TOPIC
        ${lib.optionalString (ntfyCfg.tokenFile != null) ''
          export NTFY_TOKEN_FILE="$CREDENTIALS_DIRECTORY/ntfy-token"
        ''}
        exec ${pkgs.python3}/bin/python ${./jellyfin-failure-alert.py}
      '';
      Restart = "always";
      RestartSec = "10s";

      # Security hardening
      DynamicUser = true;
      NoNewPrivileges = true;
      ProtectSystem = "strict";
      ProtectHome = true;
      ProtectKernelTunables = true;
      ProtectKernelModules = true;
      ProtectControlGroups = true;
      MemoryDenyWriteExecute = true;
      RestrictRealtime = true;
      RestrictSUIDSGID = true;
      RemoveIPC = true;

      # The dynamic user joins the Jellyfin group so it can read the log dir.
      # NOTE(review): the original comment describes the log dir as 0700; group
      # membership only helps if the directory is group-readable (e.g. 0750).
      # Confirm the effective mode on the target host.
      SupplementaryGroups = [ jfCfg.group ];

      # Load credentials from agenix secrets
      LoadCredential = [
        "ntfy-topic:${ntfyCfg.topicFile}"
      ]
      ++ lib.optional (ntfyCfg.tokenFile != null) "ntfy-token:${ntfyCfg.tokenFile}";
    };

    # Settings read by jellyfin-failure-alert.py via os.environ.
    environment = {
      JELLYFIN_LOG_DIR = "${jfCfg.dataDir}/log";
      NTFY_SERVER_URL = ntfyCfg.serverUrl;
      HOSTNAME = config.networking.hostName;
      POLL_INTERVAL = "15";
      DEDUP_WINDOW = "300";
    };
  };
}
|
||||||
324
services/jellyfin/jellyfin-failure-alert.py
Normal file
324
services/jellyfin/jellyfin-failure-alert.py
Normal file
@@ -0,0 +1,324 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Monitor Jellyfin log files for client playback/transcoding failures.
|
||||||
|
|
||||||
|
Tails Jellyfin's rotating log files, matches [ERR] lines that indicate
|
||||||
|
a client-facing failure (playback error, transcode crash, stream abort),
|
||||||
|
deduplicates within a window, and pushes a ntfy notification.
|
||||||
|
|
||||||
|
Environment
|
||||||
|
JELLYFIN_LOG_DIR path to Jellyfin log directory (required)
|
||||||
|
NTFY_SERVER_URL ntfy server base URL (required)
|
||||||
|
NTFY_TOPIC ntfy topic name (required)
|
||||||
|
NTFY_TOKEN_FILE optional path to file containing ntfy auth bearer token
|
||||||
|
HOSTNAME server hostname for notification title (default: "muffin")
|
||||||
|
POLL_INTERVAL seconds between log scans (default: 15)
|
||||||
|
DEDUP_WINDOW seconds before re-alerting same signature (default: 300)
|
||||||
|
"""
|
||||||
|
|
||||||
|
import glob
|
||||||
|
import hashlib
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import signal
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
import urllib.request
|
||||||
|
import urllib.error
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Configure root logging once at import time: INFO and above, each record
# prefixed with its timestamp and level.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Module-level logger shared by every helper in this script.
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Patterns
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Prefix of a Jellyfin ERR-level log line, e.g.
#   [2024-01-01 12:00:00.000 +00:00] [ERR] [123]
# Only the ERR level is accepted; the trailing space before the source name
# is part of the match.
_LOG_PREFIX_RE = re.compile(
    r"^\[[\d\-]{10} [\d:.]{12} [+-]\d{2}:\d{2}\] \[ERR\] \[\d+\] "
)

# Logger source names (the text before the first ": " in the line body)
# whose ERR lines are always treated as client-facing failures.
_CLIENT_FAILURE_SOURCES = {
    # Transcoding engine crashes / errors
    "MediaBrowser.MediaEncoding.Transcoding.TranscodeManager",
    "MediaBrowser.MediaEncoding.Encoder.EncodingManager",
    # Playback / session errors
    "Emby.Server.Implementations.Session.SessionManager",
    # HTTP exceptions on media endpoints
    "Jellyfin.Server.Middleware.ExceptionMiddleware",
    # Streaming / live TV
    "MediaBrowser.Api.Playback.MediaInfoService",
    "MediaBrowser.Api.Playback.Progressive.ProgressiveStreamWriter",
    "MediaBrowser.Api.Playback.Hls.DynamicHlsService",
    # Direct play / stream
    "MediaBrowser.Controller.MediaEncoding.EncodingHelper",
    # DLNA / remote control (rare but client-facing)
    "Emby.Server.Implementations.HttpServer.HttpListenerHost",
}

# Case-insensitive message patterns used as a fallback when the source is
# not listed in _CLIENT_FAILURE_SOURCES but the text still points at a
# client-visible problem.
_CLIENT_FAILURE_PATTERNS = [
    re.compile(expr, re.IGNORECASE)
    for expr in (
        r"error processing request.*?(?:/Videos/|/Items/|/Audio/)",
        r"ffmpeg.*?(?:error|exited with code [1-9]|crashed|killed)",
        r"playback\s*error",
        r"transcode.*?(?:fail|error|abort)",
        r"stream.*?(?:fail|error|abort|closed)",
        r"client.*?(?:disconnect|error|timeout)",
    )
]

# Volatile tokens replaced before hashing a line into a dedup signature.
_SIGNATURE_SCRUB_RE = re.compile(
    "|".join(
        (
            # UUID
            r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}",
            # long hex hashes
            r"\b[0-9a-fA-F]{32,}\b",
            # ids / durations / sizes of at least 4 digits
            r"\b\d{4,}\b",
            # hex addresses
            r"0x[0-9a-fA-F]+",
        )
    )
)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Helpers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _read_token(token_file: str | None) -> str | None:
|
||||||
|
if not token_file or not os.path.isfile(token_file):
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
return Path(token_file).read_text().strip()
|
||||||
|
except OSError:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _send_ntfy(
    server_url: str,
    topic: str,
    title: str,
    message: str,
    token: str | None,
    priority: str = "high",
    tags: str = "warning",
) -> bool:
    """POST a ntfy notification and report whether it was accepted.

    Args:
        server_url: Base URL of the ntfy server; a trailing slash is ignored.
        topic: Topic name appended to the base URL.
        title: Value of the ``Title`` header.
        message: Notification body, sent as UTF-8 text.
        token: Bearer token for the ``Authorization`` header, or ``None``.
        priority: Value of the ``Priority`` header.
        tags: Value of the ``Tags`` header.

    Returns:
        ``True`` when the server answers with a 2xx status, ``False`` on any
        HTTP, network, protocol or request-construction error. Failures are
        logged as warnings and never raised, so one bad notification cannot
        abort the caller's batch.
    """
    import http.client  # local import: only the exception type is needed

    def _header_value(value: str) -> str:
        # http.client encodes header values as latin-1 and rejects embedded
        # line breaks; degrade such values instead of failing the request.
        single_line = value.replace("\r", " ").replace("\n", " ")
        return single_line.encode("latin-1", "replace").decode("latin-1")

    url = f"{server_url.rstrip('/')}/{topic}"
    headers = {
        "Title": _header_value(title),
        "Priority": _header_value(priority),
        "Tags": _header_value(tags),
        "Content-Type": "text/plain",
    }
    if token:
        headers["Authorization"] = f"Bearer {token}"

    try:
        req = urllib.request.Request(
            url, data=message.encode("utf-8"), headers=headers, method="POST"
        )
        with urllib.request.urlopen(req, timeout=15) as resp:
            return 200 <= resp.status < 300
    except urllib.error.HTTPError as exc:
        logger.warning("ntfy POST returned HTTP %s: %s", exc.code, exc.reason)
    except urllib.error.URLError as exc:
        logger.warning("ntfy POST failed: %s", exc.reason)
    except (OSError, ValueError, http.client.HTTPException) as exc:
        # Read timeouts, dropped connections, malformed responses, invalid
        # URLs or header values are not wrapped in URLError by urllib.
        logger.warning("ntfy POST failed: %s", exc)
    return False
|
||||||
|
|
||||||
|
|
||||||
|
def _error_signature(line: str) -> str:
    """Return a stable SHA-256 hex digest identifying a Jellyfin error line.

    The timestamp/level/thread prefix is removed, volatile tokens (UUIDs,
    long hex strings, integers of four or more digits, ``0x`` addresses) are
    replaced by ``<ID>``, and whitespace runs are collapsed, so the same
    logical error from different sessions or items hashes identically. The
    logger source name stays in the hashed text.
    """
    without_prefix = _LOG_PREFIX_RE.sub("", line, count=1)
    # If stripping the prefix leaves nothing, hash the full line instead.
    text = without_prefix if without_prefix else line

    # Replace per-occurrence identifiers with a fixed placeholder ...
    scrubbed = _SIGNATURE_SCRUB_RE.sub("<ID>", text)
    # ... then collapse repeated whitespace and trim the ends.
    canonical = re.sub(r"\s+", " ", scrubbed).strip()

    return hashlib.sha256(canonical.encode()).hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
def _is_client_failure(line: str) -> bool:
    """Tell whether a Jellyfin log line is an ERR entry about a client failure.

    A line qualifies when it carries the ERR prefix and either its source
    (the text before the first ``": "``) is listed in
    ``_CLIENT_FAILURE_SOURCES`` or its body matches one of
    ``_CLIENT_FAILURE_PATTERNS``.
    """
    prefix = _LOG_PREFIX_RE.match(line)
    if prefix is None:
        return False

    # The prefix pattern is anchored at the start, so slicing past the match
    # yields the same text as substituting it away.
    body = line[prefix.end():]
    if not body:
        return False

    # Source check applies only when a ": " separator is actually present.
    source, separator, _ = body.partition(": ")
    if separator and source in _CLIENT_FAILURE_SOURCES:
        return True

    # Otherwise fall back to the message-level patterns.
    return any(pattern.search(body) for pattern in _CLIENT_FAILURE_PATTERNS)
|
||||||
|
|
||||||
|
|
||||||
|
def _scan_log_file(path: str, seen_positions: dict[str, int]) -> list[str]:
    """Return client-failure lines appended to *path* since the last scan.

    Args:
        path: Log file to read.
        seen_positions: Read cursors keyed by ``"{inode}:{device}"``, holding
            the byte offset already consumed. Updated in place.

    Returns:
        New lines (without line terminators) for which
        ``_is_client_failure`` is true. Empty when the file is unchanged,
        missing or unreadable.

    The cursor is reset to 0 when the file is smaller than the stored offset
    (truncation). The file is read in binary mode so offsets are plain byte
    counts comparable with ``st_size``. A trailing line that does not yet
    end in a newline is left unread so a half-written entry is picked up
    complete on a later scan.
    """
    hits: list[str] = []
    # Stays None until stat() succeeds, so the FileNotFoundError handler
    # never touches an unbound name.
    inode_key: str | None = None
    try:
        st = os.stat(path)
        inode_key = f"{st.st_ino}:{st.st_dev}"
        offset = seen_positions.get(inode_key, 0)

        if st.st_size < offset:
            # File was truncated (rotation): start from the beginning.
            offset = 0

        if st.st_size == offset:
            seen_positions[inode_key] = offset
            return hits

        with open(path, "rb") as fh:
            fh.seek(offset)
            for raw in fh:
                if not raw.endswith(b"\n"):
                    # Incomplete final line: the writer is still mid-entry.
                    break
                offset += len(raw)
                line = raw.decode("utf-8", errors="replace").rstrip("\n\r")
                if _is_client_failure(line):
                    hits.append(line)
        seen_positions[inode_key] = offset
    except FileNotFoundError:
        # The file vanished between globbing and reading; drop its cursor if
        # one was identified.
        if inode_key is not None:
            seen_positions.pop(inode_key, None)
    except OSError as exc:
        logger.debug("Cannot read %s: %s", path, exc)

    return hits
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Main loop
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Run the log monitor until SIGTERM or SIGINT is received.

    Configuration comes from the environment (see the module docstring).
    Exits with status 1 when JELLYFIN_LOG_DIR, NTFY_SERVER_URL or NTFY_TOPIC
    is missing. Each poll cycle expires old dedup entries, scans every
    ``log_*.log`` file in the log directory, sends one ntfy notification per
    new error signature, and forgets read cursors of files that no longer
    exist. Errors inside a cycle are logged and the loop continues.
    """
    log_dir = os.environ.get("JELLYFIN_LOG_DIR")
    ntfy_url = os.environ.get("NTFY_SERVER_URL")
    ntfy_topic = os.environ.get("NTFY_TOPIC")
    ntfy_token_file = os.environ.get("NTFY_TOKEN_FILE")
    hostname = os.environ.get("HOSTNAME", "muffin")
    poll_interval = int(os.environ.get("POLL_INTERVAL", "15"))
    dedup_window = int(os.environ.get("DEDUP_WINDOW", "300"))

    if not log_dir:
        logger.fatal("JELLYFIN_LOG_DIR is required")
        sys.exit(1)
    if not ntfy_url:
        logger.fatal("NTFY_SERVER_URL is required")
        sys.exit(1)
    if not ntfy_topic:
        logger.fatal("NTFY_TOPIC is required")
        sys.exit(1)

    running = True

    def _handle_signal(signum: int, _frame: object) -> None:
        # Only flip the flag; the loops below notice it within about a second.
        nonlocal running
        logger.info("Received signal %s, shutting down", signum)
        running = False

    signal.signal(signal.SIGTERM, _handle_signal)
    signal.signal(signal.SIGINT, _handle_signal)

    ntfy_token = _read_token(ntfy_token_file)

    # Dedup state: signature -> timestamp of the cycle that alerted on it
    seen_signatures: dict[str, float] = {}
    # File read cursors: "{inode}:{dev}" -> byte offset
    file_positions: dict[str, int] = {}

    logger.info(
        "Starting Jellyfin failure alert monitor (log_dir=%s, poll=%ss, dedup=%ss)",
        log_dir,
        poll_interval,
        dedup_window,
    )

    # The glob pattern depends only on log_dir, so build it once.
    log_pattern = os.path.join(log_dir, "log_*.log")

    while running:
        try:
            now = time.time()

            # Expire old dedup entries
            expired = [s for s, ts in seen_signatures.items() if now - ts > dedup_window]
            for s in expired:
                del seen_signatures[s]

            # Scan all log files
            for path in sorted(glob.glob(log_pattern)):
                hits = _scan_log_file(path, file_positions)
                for line in hits:
                    sig = _error_signature(line)
                    if sig in seen_signatures:
                        logger.debug("Suppressed duplicate: %s", line[:120])
                        continue
                    # Recorded before sending, so a failing ntfy server is
                    # not retried for every repeat of the same error.
                    seen_signatures[sig] = now

                    # Build a clean title: source + short summary
                    body = _LOG_PREFIX_RE.sub("", line, count=1)
                    title = f"[{hostname}] Jellyfin client failure"
                    if ": " in body:
                        source, msg = body.split(": ", 1)
                        title = f"[{hostname}] Jellyfin: {source.split('.')[-1]}"
                        body = msg

                    # Truncate body for readability
                    if len(body) > 500:
                        body = body[:497] + "..."

                    logger.warning("Alerting: %s", body[:120])
                    _send_ntfy(
                        ntfy_url,
                        ntfy_topic,
                        title,
                        body,
                        ntfy_token,
                    )

            # Clean up stale file-position entries for rotated-out files
            current_inodes = set()
            for path in glob.glob(log_pattern):
                try:
                    st = os.stat(path)
                    current_inodes.add(f"{st.st_ino}:{st.st_dev}")
                except OSError:
                    pass
            stale = [k for k in file_positions if k not in current_inodes]
            for k in stale:
                del file_positions[k]

        except Exception:
            # Top-level boundary: log and keep the monitor alive.
            logger.exception("Unhandled error in main loop")

        # Sleep in small increments so we can react to SIGTERM promptly.
        # The remaining time is read once per step: computing it a second
        # time inside the sleep() call could yield a negative value once the
        # deadline passes, and time.sleep() raises ValueError for that.
        deadline = time.time() + poll_interval
        while running:
            remaining = deadline - time.time()
            if remaining <= 0:
                break
            time.sleep(min(1, remaining))

    logger.info("Jellyfin failure alert monitor stopped")


if __name__ == "__main__":
    main()
|
||||||
351
tests/jellyfin-failure-alert.nix
Normal file
351
tests/jellyfin-failure-alert.nix
Normal file
@@ -0,0 +1,351 @@
|
|||||||
|
# NixOS VM test for the jellyfin-failure-alert service. A mock ntfy server
# records every POST to a JSON file; the test script drops crafted Jellyfin
# log files into the log directory and checks which notifications arrive.
{
  config,
  lib,
  pkgs,
  ...
}:
let

  # Mock ntfy server script that records POST requests to a JSON log.
  mockNtfyScript = pkgs.writeScript "mock-ntfy.py" ''
    import json
    import os
    from http.server import HTTPServer, BaseHTTPRequestHandler
    from datetime import datetime

    REQUESTS_FILE = "/tmp/ntfy-requests.json"

    class MockNtfy(BaseHTTPRequestHandler):
        def _respond(self, code=200, body=b"Ok"):
            self.send_response(code)
            self.send_header("Content-Type", "application/json")
            self.end_headers()
            self.wfile.write(body if isinstance(body, bytes) else body.encode())

        def do_GET(self):
            self._respond()

        def do_POST(self):
            content_length = int(self.headers.get("Content-Length", 0))
            body = self.rfile.read(content_length).decode() if content_length > 0 else ""

            request_data = {
                "timestamp": datetime.now().isoformat(),
                "path": self.path,
                "headers": dict(self.headers),
                "body": body,
            }

            requests = []
            if os.path.exists(REQUESTS_FILE):
                try:
                    with open(REQUESTS_FILE, "r") as f:
                        requests = json.load(f)
                except (OSError, ValueError):
                    # Unreadable or corrupt log (JSONDecodeError is a
                    # ValueError): start over with an empty list.
                    requests = []

            requests.append(request_data)

            with open(REQUESTS_FILE, "w") as f:
                json.dump(requests, f, indent=2)

            self._respond()

        def log_message(self, format, *args):
            pass

    HTTPServer(("0.0.0.0", 8080), MockNtfy).serve_forever()
  '';

  # Jellyfin log lines used in the test.
  # Jellyfin log format: [YYYY-MM-DD HH:MM:SS.mmm +TZ] [LEVEL] [thread] Source: message
  mkLogLine =
    ts: level: thread: source: msg:
    "[${ts}] [${level}] [${thread}] ${source}: ${msg}";

  # Realistic error lines that should trigger alerts.
  transcodeCrash =
    mkLogLine "2026-05-12 14:23:01.123 +00:00" "ERR" "42"
      "MediaBrowser.MediaEncoding.Transcoding.TranscodeManager"
      "FFmpeg exited with code 137 after 45.2 seconds";

  playbackError =
    mkLogLine "2026-05-12 14:24:05.456 +00:00" "ERR" "17"
      "Emby.Server.Implementations.Session.SessionManager"
      "Playback error for user \"alice\" on device \"Living Room TV\"";

  exceptionMiddleware =
    mkLogLine "2026-05-12 14:25:10.789 +00:00" "ERR" "99"
      "Jellyfin.Server.Middleware.ExceptionMiddleware"
      ''Error processing request. URL "GET" "/Videos/a1b2c3d4-e5f6-7890-abcd-ef1234567890/stream".'';

  streamAbort =
    mkLogLine "2026-05-12 14:26:00.111 +00:00" "ERR" "33"
      "MediaBrowser.Api.Playback.Hls.DynamicHlsService"
      "Cannot open HLS stream segment";

  # Lines that should NOT trigger alerts.
  authDenied =
    mkLogLine "2026-05-12 14:27:00.222 +00:00" "ERR" "12"
      "Jellyfin.Server.Implementations.Users.UserManager"
      ''Authentication request for "bob" has been denied (IP: "10.0.0.5").'';

  libraryScanError =
    mkLogLine "2026-05-12 14:28:00.333 +00:00" "ERR" "55" "MediaBrowser.Controller.Entities.BaseItem"
      "Error refreshing item metadata for /library/some-broken-file.mkv";

  # Below ERR level – never triggers.
  warnLine =
    mkLogLine "2026-05-12 14:29:00.444 +00:00" "WRN" "77"
      "MediaBrowser.MediaEncoding.Transcoding.TranscodeManager"
      "Slow transcoding detected (0.95x realtime)";

  infoLine =
    mkLogLine "2026-05-12 14:30:00.555 +00:00" "INF" "88"
      "Jellyfin.Server.Middleware.ExceptionMiddleware"
      "This is informational and should not alert";

  # Log file contents for each scenario.
  logWithFailures = pkgs.writeText "jellyfin-failure-log.log" ''
    ${authDenied}
    ${libraryScanError}
    ${warnLine}
    ${infoLine}
    ${transcodeCrash}
    ${playbackError}
    ${exceptionMiddleware}
    ${streamAbort}
  '';

  logWithDedup = pkgs.writeText "jellyfin-dedup-log.log" ''
    ${transcodeCrash}
    ${transcodeCrash}
    ${transcodeCrash}
  '';

  logNoFailures = pkgs.writeText "jellyfin-clean-log.log" ''
    ${authDenied}
    ${libraryScanError}
    ${warnLine}
    ${infoLine}
  '';
in
pkgs.testers.runNixOSTest {
  name = "jellyfin-failure-alert";

  nodes.machine =
    { pkgs, ... }:
    {
      imports = [
        ../modules/ntfy-alerts.nix
        ../services/jellyfin/jellyfin-failure-alert.nix
      ];

      system.stateVersion = config.system.stateVersion;

      virtualisation.memorySize = 2048;

      environment.systemPackages = with pkgs; [
        curl
        jq
      ];
      # Minimal jellyfin config so the guard passes. Jellyfin 10.11+
      # requires 2 GiB free space, so give the VM a 4 GiB disk.
      virtualisation.diskSize = 4096;

      services.jellyfin = {
        enable = true;
        dataDir = "/var/lib/jellyfin-test";
        cacheDir = "/var/cache/jellyfin-test";
        user = "jellyfin";
        group = "jellyfin";
      };

      # Jellyfin base dirs
      systemd.tmpfiles.rules = [
        "d /var/lib/jellyfin-test 0755 jellyfin jellyfin"
        "d /var/lib/jellyfin-test/log 0755 jellyfin jellyfin"
        "f /run/ntfy-test-topic 0644 root root - test-alerts"
        "f /run/ntfy-test-token 0644 root root - test-token-value"
      ];

      # Mock ntfy server
      systemd.services.mock-ntfy = {
        description = "Mock ntfy server for jellyfin-failure-alert test";
        wantedBy = [ "multi-user.target" ];
        before = [ "jellyfin-failure-alert.service" ];
        serviceConfig = {
          ExecStart = "${pkgs.python3}/bin/python3 ${mockNtfyScript}";
          Type = "simple";
        };
      };

      # Configure ntfy-alerts to use mock server
      services.ntfyAlerts = {
        enable = true;
        serverUrl = "http://localhost:8080";
        topicFile = "/run/ntfy-test-topic";
        tokenFile = "/run/ntfy-test-token";
      };

      # Speed up polling for the test
      systemd.services.jellyfin-failure-alert.environment.POLL_INTERVAL = lib.mkForce "2";
      systemd.services.jellyfin-failure-alert.environment.DEDUP_WINDOW = lib.mkForce "10";
    };

  testScript = ''
    import json
    import time

    LOG_DIR = "/var/lib/jellyfin-test/log"
    REQUESTS_FILE = "/tmp/ntfy-requests.json"

    start_all()

    # Wait for mock ntfy server
    machine.wait_for_unit("mock-ntfy.service")
    machine.wait_until_succeeds("curl -sf http://localhost:8080/", timeout=30)

    # ------------------------------------------------------------------
    # Phase 1: Client failures trigger alerts
    # ------------------------------------------------------------------

    with subtest("Client failure log lines trigger ntfy notifications"):
        # Place a log file with known failure lines
        machine.succeed(
            "cp ${logWithFailures} {}/log_test.log && chown jellyfin:jellyfin {}/log_test.log".format(
                LOG_DIR, LOG_DIR
            )
        )

        # Start the monitor
        machine.succeed("systemctl start jellyfin-failure-alert.service")

        # Wait for the monitor to poll and send notifications.
        # Should pick up: transcodeCrash, playbackError, exceptionMiddleware, streamAbort
        machine.wait_until_succeeds(
            "test -f {} && test $(jq 'length' {}) -ge 4".format(REQUESTS_FILE, REQUESTS_FILE),
            timeout=30,
        )

        result = machine.succeed("cat {}".format(REQUESTS_FILE))
        requests = json.loads(result)
        print(f"Phase 1: received {len(requests)} ntfy notifications")

        assert len(requests) >= 4, f"Expected >= 4 notifications, got {len(requests)}"

        # Verify each notification has the expected shape
        for req in requests:
            assert "/test-alerts" in req["path"], f"Wrong topic path: {req['path']}"
            assert "Title" in req["headers"], "Missing Title header"
            assert "Jellyfin" in req["headers"]["Title"], (
                f"Title should mention Jellyfin: {req['headers']['Title']}"
            )
            assert req["headers"]["Priority"] == "high", (
                f"Expected Priority 'high', got {req['headers'].get('Priority')}"
            )
            assert req["headers"]["Tags"] == "warning", (
                f"Expected Tags 'warning', got {req['headers'].get('Tags')}"
            )
            assert req["headers"]["Authorization"] == "Bearer test-token-value", (
                f"Missing or wrong Authorization header: {req['headers'].get('Authorization')}"
            )
            assert len(req["body"]) > 0, "Notification body is empty"

        # Verify specific error content appears in bodies
        bodies = " ".join(r["body"] for r in requests)
        assert "FFmpeg" in bodies, "Missing FFmpeg error in notifications"
        assert "Playback error" in bodies, "Missing playback error in notifications"
        assert "ExceptionMiddleware" in bodies or "/Videos/" in bodies, (
            "Missing exception middleware error in notifications"
        )

        print("Phase 1 passed: all client failures triggered alerts")

    # ------------------------------------------------------------------
    # Phase 2: Non-client errors are filtered out
    # ------------------------------------------------------------------

    with subtest("Non-client errors do not trigger alerts"):
        # Clear previous requests and add a new log file with only non-client errors
        machine.succeed("rm -f {}".format(REQUESTS_FILE))
        machine.succeed(
            "cp ${logNoFailures} {}/log_clean.log && chown jellyfin:jellyfin {}/log_clean.log".format(
                LOG_DIR, LOG_DIR
            )
        )

        # Wait enough poll cycles (at least 8s = 4 cycles at POLL_INTERVAL=2)
        time.sleep(8)

        # Assert no requests file was created. If it exists, it must be empty
        # (any non-empty result means a non-client error leaked through).
        rc, _ = machine.execute("test -f {}".format(REQUESTS_FILE))
        if rc == 0:
            machine.succeed("test $(jq 'length' {}) -eq 0".format(REQUESTS_FILE))

        print("Phase 2 passed: non-client errors correctly filtered")

    # ------------------------------------------------------------------
    # Phase 3: Deduplication
    # ------------------------------------------------------------------

    with subtest("Duplicate errors are deduplicated within the window"):
        # NOTE(review): the FFmpeg line was already alerted in Phase 1, so this
        # phase only sees a notification once that signature has aged out of
        # DEDUP_WINDOW (10s). It appears to rely on the Phase 2 sleep plus
        # the poll cadence for that; confirm before shortening either.

        # Clear and add a log file with the same error repeated
        machine.succeed("rm -f {}".format(REQUESTS_FILE))
        machine.succeed(
            "cp ${logWithDedup} {}/log_dedup.log && chown jellyfin:jellyfin {}/log_dedup.log".format(
                LOG_DIR, LOG_DIR
            )
        )

        # Wait for the monitor to process. Should get exactly 1 notification.
        machine.wait_until_succeeds(
            "test -f {} && test $(jq 'length' {}) -eq 1".format(REQUESTS_FILE, REQUESTS_FILE),
            timeout=30,
        )

        result = machine.succeed("cat {}".format(REQUESTS_FILE))
        requests = json.loads(result)
        print(f"Phase 3: received {len(requests)} notifications (expected 1 for dedup)")

        assert len(requests) == 1, (
            f"Expected exactly 1 notification for 3 identical errors, got {len(requests)}"
        )
        assert "FFmpeg" in requests[0]["body"], "Missing FFmpeg error in dedup notification"

        print("Phase 3 passed: deduplication works correctly")

    # ------------------------------------------------------------------
    # Phase 4: Re-alert after dedup window expires
    # ------------------------------------------------------------------

    with subtest("Same error re-alerts after dedup window expires"):
        # Wait for dedup window to expire (10s)
        time.sleep(12)

        # Write the same error again in a new log file
        machine.succeed(
            "cp ${logWithDedup} {}/log_dedup2.log && chown jellyfin:jellyfin {}/log_dedup2.log".format(
                LOG_DIR, LOG_DIR
            )
        )

        # Wait for the monitor to pick it up. Should now have 2 entries.
        machine.wait_until_succeeds(
            "test -f {} && test $(jq 'length' {}) -eq 2".format(REQUESTS_FILE, REQUESTS_FILE),
            timeout=30,
        )

        result = machine.succeed("cat {}".format(REQUESTS_FILE))
        requests = json.loads(result)
        print(f"Phase 4: received {len(requests)} notifications (expected 2)")

        assert len(requests) == 2, (
            f"Expected 2 notifications after dedup window expired, got {len(requests)}"
        )

        print("Phase 4 passed: re-alert after dedup window works")

    print("All jellyfin-failure-alert tests passed!")
  '';
}
|
||||||
@@ -27,6 +27,8 @@ in
|
|||||||
|
|
||||||
# jellyfin annotation service test
|
# jellyfin annotation service test
|
||||||
jellyfinAnnotationsTest = handleTest ./jellyfin-annotations.nix;
|
jellyfinAnnotationsTest = handleTest ./jellyfin-annotations.nix;
|
||||||
|
# jellyfin failure alert test
|
||||||
|
jellyfinFailureAlertTest = handleTest ./jellyfin-failure-alert.nix;
|
||||||
|
|
||||||
# zfs scrub annotations test
|
# zfs scrub annotations test
|
||||||
zfsScrubAnnotationsTest = handleTest ./zfs-scrub-annotations.nix;
|
zfsScrubAnnotationsTest = handleTest ./zfs-scrub-annotations.nix;
|
||||||
|
|||||||
Reference in New Issue
Block a user