From a5206b9ec6128afa33caef5c5eb42590e1a97e69 Mon Sep 17 00:00:00 2001
From: Simon Gardling
Date: Wed, 1 Apr 2026 01:49:53 -0400
Subject: [PATCH] monitoring: add grafana annotations for zfs scrub events

---
Review notes (this section is ignored by git am):
  * start: pool names are joined with "paste -sd ','" plus sed. paste cycles
    through a multi-character delimiter list, so ', ' gave "a,b c" for three
    pools.
  * start: the annotation id is stored only when the response contains one.
  * stop: a pool without a "scan:" line no longer aborts the script, and
    ExecStopPost carries the same "-" prefix as ExecStartPre.
  * This copy was rebuilt from a whitespace-collapsed paste. Indentation of
    the context lines in configuration.nix, services/monitoring.nix and
    tests/tests.nix is a best guess, and the blob ids on the index lines of
    the three new files predate the changes above. Regenerate with
    git format-patch before applying.

 configuration.nix                  |   1 +
 services/monitoring.nix            |  12 +++
 services/zfs-scrub-annotations.nix |  41 +++++++++
 services/zfs-scrub-annotations.sh  |  72 +++++++++++++++++
 tests/tests.nix                    |   3 +
 tests/zfs-scrub-annotations.nix    | 126 +++++++++++++++++++++++++++++
 6 files changed, 255 insertions(+)
 create mode 100644 services/zfs-scrub-annotations.nix
 create mode 100644 services/zfs-scrub-annotations.sh
 create mode 100644 tests/zfs-scrub-annotations.nix

diff --git a/configuration.nix b/configuration.nix
index adbd64e..13a1582 100644
--- a/configuration.nix
+++ b/configuration.nix
@@ -49,6 +49,7 @@
     ./services/ups.nix
     ./services/monitoring.nix
     ./services/jellyfin-annotations.nix
+    ./services/zfs-scrub-annotations.nix
 
     ./services/bitwarden.nix
     ./services/firefox-syncserver.nix
diff --git a/services/monitoring.nix b/services/monitoring.nix
index adb6c94..f23f7c7 100644
--- a/services/monitoring.nix
+++ b/services/monitoring.nix
@@ -108,6 +108,18 @@ let
         type = "tags";
         tags = [ "jellyfin" ];
       }
+      {
+        name = "ZFS Scrubs";
+        datasource = {
+          type = "grafana";
+          uid = "-- Grafana --";
+        };
+        enable = true;
+        iconColor = "orange";
+        showIn = 0;
+        type = "tags";
+        tags = [ "zfs-scrub" ];
+      }
     ];
 
     panels = [
diff --git a/services/zfs-scrub-annotations.nix b/services/zfs-scrub-annotations.nix
new file mode 100644
index 0000000..e502178
--- /dev/null
+++ b/services/zfs-scrub-annotations.nix
@@ -0,0 +1,41 @@
+# Wraps the zfs-scrub unit with zfs-scrub-annotations.sh: "start" runs as
+# ExecStartPre and "stop" as ExecStopPost, with GRAFANA_URL and STATE_DIR
+# supplied through the unit environment.
+{
+  config,
+  pkgs,
+  service_configs,
+  lib,
+  ...
+}:
+let
+  grafanaUrl = "http://127.0.0.1:${toString service_configs.ports.private.grafana.port}";
+
+  script = pkgs.writeShellApplication {
+    name = "zfs-scrub-annotations";
+    runtimeInputs = with pkgs; [
+      curl
+      jq
+      coreutils
+      gnugrep
+      gnused
+      config.boot.zfs.package
+    ];
+    text = builtins.readFile ./zfs-scrub-annotations.sh;
+  };
+in
+{
+  systemd.services.zfs-scrub = {
+    environment = {
+      GRAFANA_URL = grafanaUrl;
+      STATE_DIR = "/run/zfs-scrub-annotations";
+    };
+    serviceConfig = {
+      RuntimeDirectory = "zfs-scrub-annotations";
+      # The leading "-" makes systemd ignore a non-zero exit: annotations are
+      # best effort and must not change the result of the scrub unit.
+      ExecStartPre = [ "-${lib.getExe script} start" ];
+      ExecStopPost = [ "-${lib.getExe script} stop" ];
+    };
+  };
+}
diff --git a/services/zfs-scrub-annotations.sh b/services/zfs-scrub-annotations.sh
new file mode 100644
index 0000000..237ab3d
--- /dev/null
+++ b/services/zfs-scrub-annotations.sh
@@ -0,0 +1,72 @@
+#!/usr/bin/env bash
+# ZFS scrub annotation script for Grafana
+# Usage: zfs-scrub-annotations.sh {start|stop}
+#   start  POST a "ZFS scrub: <pools>" annotation tagged zfs-scrub and save
+#          the returned annotation id to $STATE_DIR/annotation-id
+#   stop   PATCH the saved annotation with timeEnd and each pool's "scan:"
+#          line from zpool status, then delete the saved id
+# Required env: GRAFANA_URL, STATE_DIR
+# Required on PATH: zpool, curl, jq, paste, date, grep, sed
+
+set -euo pipefail
+
+ACTION="${1:-}"
+GRAFANA_URL="${GRAFANA_URL:?GRAFANA_URL required}"
+STATE_DIR="${STATE_DIR:?STATE_DIR required}"
+
+case "$ACTION" in
+  start)
+    # paste -d cycles through its delimiter list one character at a time, so
+    # ', ' would join three pools as "a,b c". Join on ',' and widen with sed.
+    POOLS=$(zpool list -H -o name | paste -sd ',' | sed 's/,/, /g')
+    NOW_MS=$(date +%s%3N)
+
+    # Best effort: if the POST fails (Grafana down, timeout, HTTP error),
+    # exit 0 without writing any state so the scrub itself is unaffected.
+    RESPONSE=$(curl -sf --max-time 5 \
+      -X POST "$GRAFANA_URL/api/annotations" \
+      -H "Content-Type: application/json" \
+      -d "$(jq -n --arg text "ZFS scrub: $POOLS" --argjson time "$NOW_MS" \
+        '{time: $time, text: $text, tags: ["zfs-scrub"]}')" \
+    ) || exit 0
+
+    # Only persist a real id: a reply without one would otherwise store the
+    # literal string "null" and make stop PATCH /api/annotations/null.
+    ANN_ID=$(jq -r '.id // empty' <<<"$RESPONSE") || exit 0
+    [ -z "$ANN_ID" ] && exit 0
+
+    echo "$ANN_ID" > "$STATE_DIR/annotation-id"
+    ;;
+
+  stop)
+    # No saved id means start never created an annotation; nothing to close.
+    ANN_ID=$(cat "$STATE_DIR/annotation-id" 2>/dev/null) || exit 0
+    [ -z "$ANN_ID" ] && exit 0
+
+    NOW_MS=$(date +%s%3N)
+
+    RESULTS=""
+    while IFS= read -r pool; do
+      # grep exits 1 when nothing matches; under pipefail that would abort
+      # the script before the annotation is closed, so fall back instead.
+      scan_line=$(zpool status "$pool" | grep "scan:" | sed 's/^[[:space:]]*//') \
+        || scan_line="scan: unavailable"
+      RESULTS="${RESULTS}${pool}: ${scan_line}"$'\n'
+    done < <(zpool list -H -o name)
+
+    TEXT=$(printf "ZFS scrub completed\n%s" "$RESULTS")
+
+    curl -sf --max-time 5 \
+      -X PATCH "$GRAFANA_URL/api/annotations/$ANN_ID" \
+      -H "Content-Type: application/json" \
+      -d "$(jq -n --arg text "$TEXT" --argjson timeEnd "$NOW_MS" \
+        '{timeEnd: $timeEnd, text: $text}')" || true
+
+    rm -f "$STATE_DIR/annotation-id"
+    ;;
+
+  *)
+    echo "Usage: $0 {start|stop}" >&2
+    exit 1
+    ;;
+esac
diff --git a/tests/tests.nix b/tests/tests.nix
index 59906f9..9762b9c 100644
--- a/tests/tests.nix
+++ b/tests/tests.nix
@@ -25,6 +25,9 @@ in
   # jellyfin annotation service test
   jellyfinAnnotationsTest = handleTest ./jellyfin-annotations.nix;
 
+  # zfs scrub annotations test
+  zfsScrubAnnotationsTest = handleTest ./zfs-scrub-annotations.nix;
+
   # ntfy alerts test
   ntfyAlertsTest = handleTest ./ntfy-alerts.nix;
 
diff --git a/tests/zfs-scrub-annotations.nix b/tests/zfs-scrub-annotations.nix
new file mode 100644
index 0000000..4bed3a7
--- /dev/null
+++ b/tests/zfs-scrub-annotations.nix
@@ -0,0 +1,126 @@
+# NixOS VM test for zfs-scrub-annotations.sh, run against a mock Grafana
+# server and a fake zpool placed first on PATH.
+{
+  lib,
+  pkgs,
+  ...
+}:
+let
+  mockServer = ./mock-grafana-server.py;
+
+  mockZpool = pkgs.writeShellScript "zpool" ''
+    case "$1" in
+      list)
+        echo "tank"
+        echo "hdds"
+        ;;
+      status)
+        pool="$2"
+        if [ "$pool" = "tank" ]; then
+          echo " scan: scrub repaired 0B in 00:24:39 with 0 errors on Mon Jan 1 02:24:39 2024"
+        elif [ "$pool" = "hdds" ]; then
+          echo " scan: scrub repaired 0B in 04:12:33 with 0 errors on Mon Jan 1 06:12:33 2024"
+        fi
+        ;;
+    esac
+  '';
+
+  script = ../services/zfs-scrub-annotations.sh;
+  python = pkgs.python3;
+in
+pkgs.testers.runNixOSTest {
+  name = "zfs-scrub-annotations";
+
+  nodes.machine =
+    { pkgs, ... }:
+    {
+      environment.systemPackages = with pkgs; [
+        python3
+        curl
+        jq
+      ];
+    };
+
+  testScript = ''
+    import json
+
+    GRAFANA_PORT = 13000
+    ANNOTS_FILE = "/tmp/annotations.json"
+    STATE_DIR = "/tmp/scrub-state"
+    PYTHON = "${python}/bin/python3"
+    MOCK = "${mockServer}"
+    SCRIPT = "${script}"
+    MOCK_ZPOOL = "${mockZpool}"
+
+    MOCK_BIN = "/tmp/mock-bin"
+    ENV_PREFIX = (
+        f"GRAFANA_URL=http://127.0.0.1:{GRAFANA_PORT} "
+        f"STATE_DIR={STATE_DIR} "
+        f"PATH={MOCK_BIN}:$PATH "
+    )
+
+    def read_annotations():
+        out = machine.succeed(f"cat {ANNOTS_FILE} 2>/dev/null || echo '[]'")
+        return json.loads(out.strip())
+
+    start_all()
+    machine.wait_for_unit("multi-user.target")
+
+    with subtest("Setup state directory and mock zpool"):
+        machine.succeed(f"mkdir -p {STATE_DIR}")
+        machine.succeed(f"mkdir -p {MOCK_BIN} && cp {MOCK_ZPOOL} {MOCK_BIN}/zpool && chmod +x {MOCK_BIN}/zpool")
+
+    with subtest("Start mock Grafana server"):
+        machine.succeed(f"echo '[]' > {ANNOTS_FILE}")
+        machine.succeed(
+            f"systemd-run --unit=mock-grafana {PYTHON} {MOCK} {GRAFANA_PORT} {ANNOTS_FILE}"
+        )
+        machine.wait_until_succeeds(
+            f"curl -sf -X POST http://127.0.0.1:{GRAFANA_PORT}/api/annotations "
+            f"-H 'Content-Type: application/json' -d '{{\"text\":\"ping\",\"tags\":[]}}' | grep -q id",
+            timeout=10,
+        )
+        machine.succeed(f"echo '[]' > {ANNOTS_FILE}")
+
+    with subtest("Start action creates annotation with pool names and zfs-scrub tag"):
+        machine.succeed(f"{ENV_PREFIX} bash {SCRIPT} start")
+        annots = read_annotations()
+        assert len(annots) == 1, f"Expected 1 annotation, got: {annots}"
+        assert "zfs-scrub" in annots[0].get("tags", []), f"Missing zfs-scrub tag: {annots[0]}"
+        assert "tank" in annots[0]["text"], f"Missing tank in text: {annots[0]['text']}"
+        assert "hdds" in annots[0]["text"], f"Missing hdds in text: {annots[0]['text']}"
+        assert "tank, hdds" in annots[0]["text"], f"Pools not comma-space joined: {annots[0]['text']}"
+        assert "time" in annots[0], f"Missing time field: {annots[0]}"
+        assert "timeEnd" not in annots[0], f"timeEnd should not be set yet: {annots[0]}"
+
+    with subtest("State file contains annotation ID"):
+        ann_id = machine.succeed(f"cat {STATE_DIR}/annotation-id").strip()
+        assert ann_id == "1", f"Expected annotation ID 1, got: {ann_id}"
+
+    with subtest("Stop action closes annotation with per-pool scrub results"):
+        machine.succeed(f"{ENV_PREFIX} bash {SCRIPT} stop")
+        annots = read_annotations()
+        assert len(annots) == 1, f"Expected 1 annotation, got: {annots}"
+        assert "timeEnd" in annots[0], f"timeEnd should be set: {annots[0]}"
+        assert annots[0]["timeEnd"] > annots[0]["time"], "timeEnd should be after time"
+        text = annots[0]["text"]
+        assert "ZFS scrub completed" in text, f"Missing completed text: {text}"
+        assert "tank:" in text, f"Missing tank results: {text}"
+        assert "hdds:" in text, f"Missing hdds results: {text}"
+        assert "00:24:39" in text, f"Missing tank scrub duration: {text}"
+        assert "04:12:33" in text, f"Missing hdds scrub duration: {text}"
+
+    with subtest("State file cleaned up after stop"):
+        machine.fail(f"test -f {STATE_DIR}/annotation-id")
+
+    with subtest("Stop action handles missing state file gracefully"):
+        machine.succeed(f"{ENV_PREFIX} bash {SCRIPT} stop")
+        annots = read_annotations()
+        assert len(annots) == 1, f"Expected no new annotations, got: {annots}"
+
+    with subtest("Start action handles Grafana being down gracefully"):
+        machine.succeed("systemctl stop mock-grafana")
+        machine.succeed(f"{ENV_PREFIX} bash {SCRIPT} start")
+        machine.fail(f"test -f {STATE_DIR}/annotation-id")
+  '';
+}