monitoring: add grafana annotations for zfs scrub events

This commit is contained in:
2026-04-01 01:49:53 -04:00
parent 3196b38db7
commit a5206b9ec6
6 changed files with 230 additions and 0 deletions

View File

@@ -49,6 +49,7 @@
./services/ups.nix
./services/monitoring.nix
./services/jellyfin-annotations.nix
./services/zfs-scrub-annotations.nix
./services/bitwarden.nix
./services/firefox-syncserver.nix

View File

@@ -108,6 +108,18 @@ let
type = "tags";
tags = [ "jellyfin" ];
}
# Annotation query for ZFS scrub events: reads from Grafana's built-in
# annotation datasource and selects annotations by tag. "zfs-scrub" is the
# tag that zfs-scrub-annotations.sh attaches to the annotations it creates.
{
name = "ZFS Scrubs";
datasource = {
type = "grafana";
uid = "-- Grafana --";
};
enable = true;
iconColor = "orange";
showIn = 0;
type = "tags";
tags = [ "zfs-scrub" ];
}
];
panels = [

View File

@@ -0,0 +1,36 @@
# Hooks Grafana annotation bookkeeping into the zfs-scrub systemd unit:
# an annotation is opened before the scrub starts and closed (with per-pool
# results) after the unit stops.
{
  config,
  pkgs,
  service_configs,
  lib,
  ...
}:
let
  # Grafana is reached over loopback on the port declared in service_configs.
  grafanaUrl = "http://127.0.0.1:${toString service_configs.ports.private.grafana.port}";

  # Wrap the shell script so that every tool it calls is on PATH, including
  # the zpool binary from the ZFS package this system is configured with.
  script = pkgs.writeShellApplication {
    name = "zfs-scrub-annotations";
    runtimeInputs = with pkgs; [
      curl
      jq
      coreutils
      gnugrep
      gnused
      config.boot.zfs.package
    ];
    text = builtins.readFile ./zfs-scrub-annotations.sh;
  };
in
{
  systemd.services.zfs-scrub = {
    environment = {
      GRAFANA_URL = grafanaUrl;
      # Must match the RuntimeDirectory below (systemd creates it under /run).
      STATE_DIR = "/run/zfs-scrub-annotations";
    };
    serviceConfig = {
      RuntimeDirectory = "zfs-scrub-annotations";
      # The leading "-" tells systemd to ignore a non-zero exit status.
      # Annotations are purely cosmetic, so neither hook may prevent the scrub
      # from running or cause the scrub unit to be reported as failed.
      ExecStartPre = [ "-${lib.getExe script} start" ];
      ExecStopPost = [ "-${lib.getExe script} stop" ];
    };
  };
}

View File

@@ -0,0 +1,55 @@
#!/usr/bin/env bash
# ZFS scrub annotation script for Grafana
#
# Usage: zfs-scrub-annotations.sh {start|stop}
#   start  POST a new annotation tagged "zfs-scrub" and store its id in
#          $STATE_DIR/annotation-id
#   stop   PATCH the stored annotation with an end time and the per-pool
#          "scan:" line from `zpool status`, then remove the state file
#
# Required env: GRAFANA_URL, STATE_DIR
# Required on PATH: zpool, curl, jq, paste, date, grep, sed
#
# Talking to Grafana is best-effort: if Grafana is unreachable or returns
# something unusable, the script exits 0 without leaving a state file.
set -euo pipefail

ACTION="${1:-}"
GRAFANA_URL="${GRAFANA_URL:?GRAFANA_URL required}"
STATE_DIR="${STATE_DIR:?STATE_DIR required}"

case "$ACTION" in
  start)
    # paste's -d takes a LIST of single-character delimiters and cycles
    # through it, so `-d ', '` would alternate "," and " " between names.
    # Join on "," alone and widen to ", " afterwards.
    POOLS=$(zpool list -H -o name | paste -sd ',' | sed 's/,/, /g')
    NOW_MS=$(date +%s%3N)
    RESPONSE=$(curl -sf --max-time 5 \
      -X POST "$GRAFANA_URL/api/annotations" \
      -H "Content-Type: application/json" \
      -d "$(jq -n --arg text "ZFS scrub: $POOLS" --argjson time "$NOW_MS" \
        '{time: $time, text: $text, tags: ["zfs-scrub"]}')" \
    ) || exit 0
    # Only persist a real id: a response without .id (or one that is not
    # JSON at all) must not leave "null" or an empty file for `stop`.
    ANN_ID=$(jq -r '.id // empty' <<<"$RESPONSE" 2>/dev/null) || exit 0
    [ -n "$ANN_ID" ] || exit 0
    printf '%s\n' "$ANN_ID" > "$STATE_DIR/annotation-id"
    ;;
  stop)
    # No state file means `start` never managed to create an annotation.
    ANN_ID=$(cat "$STATE_DIR/annotation-id" 2>/dev/null) || exit 0
    if [ -z "$ANN_ID" ] || [ "$ANN_ID" = "null" ]; then
      rm -f "$STATE_DIR/annotation-id"
      exit 0
    fi
    NOW_MS=$(date +%s%3N)
    RESULTS=""
    while IFS= read -r pool; do
      # Under `set -e -o pipefail` a pool whose status has no "scan:" line
      # would make grep fail and abort the script before the annotation is
      # closed; tolerate that and report a placeholder instead.
      scan_line=$(zpool status "$pool" | grep "scan:" | sed 's/^[[:space:]]*//' || true)
      RESULTS="${RESULTS}${pool}: ${scan_line:-no scan information}"$'\n'
    done < <(zpool list -H -o name)
    TEXT=$(printf "ZFS scrub completed\n%s" "$RESULTS")
    curl -sf --max-time 5 \
      -X PATCH "$GRAFANA_URL/api/annotations/$ANN_ID" \
      -H "Content-Type: application/json" \
      -d "$(jq -n --arg text "$TEXT" --argjson timeEnd "$NOW_MS" \
        '{timeEnd: $timeEnd, text: $text}')" || true
    # Always drop the id, even if the PATCH failed, so a later run cannot
    # touch a stale annotation.
    rm -f "$STATE_DIR/annotation-id"
    ;;
  *)
    echo "Usage: $0 {start|stop}" >&2
    exit 1
    ;;
esac

View File

@@ -25,6 +25,9 @@ in
# jellyfin annotation service test
jellyfinAnnotationsTest = handleTest ./jellyfin-annotations.nix;
# zfs scrub annotations test
zfsScrubAnnotationsTest = handleTest ./zfs-scrub-annotations.nix;
# ntfy alerts test
ntfyAlertsTest = handleTest ./ntfy-alerts.nix;

View File

@@ -0,0 +1,123 @@
# NixOS VM test for services/zfs-scrub-annotations.sh.
# The script is run directly with bash against a mock Grafana HTTP server and
# a fake `zpool` binary placed first on PATH, so no real ZFS pool or Grafana
# instance is involved.
{
lib,
pkgs,
...
}:
let
# Mock Grafana server; the test launches it with a port and a JSON file path
# and later reads that file back to inspect the recorded annotations.
mockServer = ./mock-grafana-server.py;
# Fake zpool: `list` prints two pool names (tank, hdds) and `status <pool>`
# prints a canned "scan:" line for each of them.
mockZpool = pkgs.writeShellScript "zpool" ''
case "$1" in
list)
echo "tank"
echo "hdds"
;;
status)
pool="$2"
if [ "$pool" = "tank" ]; then
echo " scan: scrub repaired 0B in 00:24:39 with 0 errors on Mon Jan 1 02:24:39 2024"
elif [ "$pool" = "hdds" ]; then
echo " scan: scrub repaired 0B in 04:12:33 with 0 errors on Mon Jan 1 06:12:33 2024"
fi
;;
esac
'';
# Script under test, referenced as a plain file and invoked via `bash`.
script = ../services/zfs-scrub-annotations.sh;
python = pkgs.python3;
in
pkgs.testers.runNixOSTest {
name = "zfs-scrub-annotations";
# Single VM with the tools the script and the mock server need.
nodes.machine =
{ pkgs, ... }:
{
environment.systemPackages = with pkgs; [
python3
curl
jq
];
};
# Test driver (Python). Subtests, in order: set up the state dir and mock
# zpool; start the mock server; `start` creates one tagged annotation naming
# both pools; the state file holds its id; `stop` sets timeEnd and per-pool
# results; the state file is removed; `stop` without a state file is a no-op;
# `start` with the server down succeeds without writing a state file.
testScript = ''
import json
GRAFANA_PORT = 13000
ANNOTS_FILE = "/tmp/annotations.json"
STATE_DIR = "/tmp/scrub-state"
PYTHON = "${python}/bin/python3"
MOCK = "${mockServer}"
SCRIPT = "${script}"
MOCK_ZPOOL = "${mockZpool}"
MOCK_BIN = "/tmp/mock-bin"
ENV_PREFIX = (
f"GRAFANA_URL=http://127.0.0.1:{GRAFANA_PORT} "
f"STATE_DIR={STATE_DIR} "
f"PATH={MOCK_BIN}:$PATH "
)
def read_annotations():
out = machine.succeed(f"cat {ANNOTS_FILE} 2>/dev/null || echo '[]'")
return json.loads(out.strip())
start_all()
machine.wait_for_unit("multi-user.target")
with subtest("Setup state directory and mock zpool"):
machine.succeed(f"mkdir -p {STATE_DIR}")
machine.succeed(f"mkdir -p {MOCK_BIN} && cp {MOCK_ZPOOL} {MOCK_BIN}/zpool && chmod +x {MOCK_BIN}/zpool")
with subtest("Start mock Grafana server"):
machine.succeed(f"echo '[]' > {ANNOTS_FILE}")
machine.succeed(
f"systemd-run --unit=mock-grafana {PYTHON} {MOCK} {GRAFANA_PORT} {ANNOTS_FILE}"
)
machine.wait_until_succeeds(
f"curl -sf -X POST http://127.0.0.1:{GRAFANA_PORT}/api/annotations "
f"-H 'Content-Type: application/json' -d '{{\"text\":\"ping\",\"tags\":[]}}' | grep -q id",
timeout=10,
)
machine.succeed(f"echo '[]' > {ANNOTS_FILE}")
with subtest("Start action creates annotation with pool names and zfs-scrub tag"):
machine.succeed(f"{ENV_PREFIX} bash {SCRIPT} start")
annots = read_annotations()
assert len(annots) == 1, f"Expected 1 annotation, got: {annots}"
assert "zfs-scrub" in annots[0].get("tags", []), f"Missing zfs-scrub tag: {annots[0]}"
assert "tank" in annots[0]["text"], f"Missing tank in text: {annots[0]['text']}"
assert "hdds" in annots[0]["text"], f"Missing hdds in text: {annots[0]['text']}"
assert "time" in annots[0], f"Missing time field: {annots[0]}"
assert "timeEnd" not in annots[0], f"timeEnd should not be set yet: {annots[0]}"
with subtest("State file contains annotation ID"):
ann_id = machine.succeed(f"cat {STATE_DIR}/annotation-id").strip()
assert ann_id == "1", f"Expected annotation ID 1, got: {ann_id}"
with subtest("Stop action closes annotation with per-pool scrub results"):
machine.succeed(f"{ENV_PREFIX} bash {SCRIPT} stop")
annots = read_annotations()
assert len(annots) == 1, f"Expected 1 annotation, got: {annots}"
assert "timeEnd" in annots[0], f"timeEnd should be set: {annots[0]}"
assert annots[0]["timeEnd"] > annots[0]["time"], "timeEnd should be after time"
text = annots[0]["text"]
assert "ZFS scrub completed" in text, f"Missing completed text: {text}"
assert "tank:" in text, f"Missing tank results: {text}"
assert "hdds:" in text, f"Missing hdds results: {text}"
assert "00:24:39" in text, f"Missing tank scrub duration: {text}"
assert "04:12:33" in text, f"Missing hdds scrub duration: {text}"
with subtest("State file cleaned up after stop"):
machine.fail(f"test -f {STATE_DIR}/annotation-id")
with subtest("Stop action handles missing state file gracefully"):
machine.succeed(f"{ENV_PREFIX} bash {SCRIPT} stop")
annots = read_annotations()
assert len(annots) == 1, f"Expected no new annotations, got: {annots}"
with subtest("Start action handles Grafana being down gracefully"):
machine.succeed("systemctl stop mock-grafana")
machine.succeed(f"{ENV_PREFIX} bash {SCRIPT} start")
machine.fail(f"test -f {STATE_DIR}/annotation-id")
'';
}