monitoring: add grafana annotations for zfs scrub events
This commit is contained in:
@@ -49,6 +49,7 @@
|
||||
./services/ups.nix
|
||||
./services/monitoring.nix
|
||||
./services/jellyfin-annotations.nix
|
||||
./services/zfs-scrub-annotations.nix
|
||||
|
||||
./services/bitwarden.nix
|
||||
./services/firefox-syncserver.nix
|
||||
|
||||
@@ -108,6 +108,18 @@ let
|
||||
type = "tags";
|
||||
tags = [ "jellyfin" ];
|
||||
}
|
||||
{
  # Dashboard annotation query for ZFS scrubs: selects Grafana's built-in
  # annotations by the "zfs-scrub" tag and draws them in orange.
  name = "ZFS Scrubs";
  type = "tags";
  tags = [ "zfs-scrub" ];
  datasource = {
    uid = "-- Grafana --";
    type = "grafana";
  };
  enable = true;
  showIn = 0;
  iconColor = "orange";
}
|
||||
];
|
||||
|
||||
panels = [
|
||||
|
||||
36
services/zfs-scrub-annotations.nix
Normal file
36
services/zfs-scrub-annotations.nix
Normal file
@@ -0,0 +1,36 @@
|
||||
# Hooks Grafana annotations onto the zfs-scrub systemd unit: an annotation is
# opened before the scrub starts and closed (with per-pool results) after the
# unit stops.
{
  config,
  pkgs,
  service_configs,
  lib,
  ...
}:
let
  # Grafana's HTTP API on the loopback interface; the port is taken from the
  # shared service_configs argument.
  grafanaUrl = "http://127.0.0.1:${toString service_configs.ports.private.grafana.port}";

  # Wraps the shell script so that every tool it calls is on PATH. The zpool
  # binary comes from the same ZFS package the system boots with.
  script = pkgs.writeShellApplication {
    name = "zfs-scrub-annotations";
    runtimeInputs = with pkgs; [
      curl
      jq
      coreutils
      gnugrep
      gnused
      config.boot.zfs.package
    ];
    text = builtins.readFile ./zfs-scrub-annotations.sh;
  };
in
{
  systemd.services.zfs-scrub = {
    # Read by the script; STATE_DIR must match RuntimeDirectory below.
    environment = {
      GRAFANA_URL = grafanaUrl;
      STATE_DIR = "/run/zfs-scrub-annotations";
    };
    serviceConfig = {
      # Creates /run/zfs-scrub-annotations for the lifetime of the unit; the
      # script stores the open annotation's id there between start and stop.
      RuntimeDirectory = "zfs-scrub-annotations";
      # The leading "-" makes systemd ignore a non-zero exit status. Both hooks
      # carry it so that an annotation failure can never mark the scrub unit
      # itself as failed.
      ExecStartPre = [ "-${lib.getExe script} start" ];
      ExecStopPost = [ "-${lib.getExe script} stop" ];
    };
  };
}
|
||||
55
services/zfs-scrub-annotations.sh
Normal file
55
services/zfs-scrub-annotations.sh
Normal file
@@ -0,0 +1,55 @@
|
||||
#!/usr/bin/env bash
# ZFS scrub annotation script for Grafana
#
# Usage: zfs-scrub-annotations.sh {start|stop}
#   start  POST an annotation tagged "zfs-scrub" that lists all pools, and
#          store the returned annotation id in $STATE_DIR/annotation-id.
#   stop   PATCH that annotation with an end time and each pool's "scan:"
#          status line, then remove the state file.
#
# Required env: GRAFANA_URL, STATE_DIR
# Required on PATH: zpool, curl, jq, paste, date, grep, sed
#
# Talking to Grafana is best-effort: if the API cannot be reached, "start"
# exits 0 without writing a state file and "stop" still cleans up.

set -euo pipefail

ACTION="${1:-}"
GRAFANA_URL="${GRAFANA_URL:?GRAFANA_URL required}"
STATE_DIR="${STATE_DIR:?STATE_DIR required}"

case "$ACTION" in
  start)
    # Drop any id left behind by an earlier run so that a failed POST below
    # cannot cause "stop" to patch an unrelated, older annotation.
    rm -f "$STATE_DIR/annotation-id"

    # paste -d takes a LIST of delimiters and cycles through it, so ', '
    # would alternate between "," and " ". Join on "," and widen afterwards.
    POOLS=$(zpool list -H -o name | paste -sd ',' | sed 's/,/, /g')
    NOW_MS=$(date +%s%3N)

    RESPONSE=$(curl -sf --max-time 5 \
      -X POST "$GRAFANA_URL/api/annotations" \
      -H "Content-Type: application/json" \
      -d "$(jq -n --arg text "ZFS scrub: $POOLS" --argjson time "$NOW_MS" \
        '{time: $time, text: $text, tags: ["zfs-scrub"]}')" \
    ) || exit 0

    # Persist only a real id: a response without one (or one that is not
    # JSON) must not leave "null" or garbage in the state file.
    ANN_ID=$(jq -r '.id // empty' <<<"$RESPONSE" 2>/dev/null) || exit 0
    [ -n "$ANN_ID" ] || exit 0
    printf '%s\n' "$ANN_ID" > "$STATE_DIR/annotation-id"
    ;;

  stop)
    # No state file (or an empty one) means "start" never opened an
    # annotation, so there is nothing to close.
    ANN_ID=$(cat "$STATE_DIR/annotation-id" 2>/dev/null) || exit 0
    [ -z "$ANN_ID" ] && exit 0

    NOW_MS=$(date +%s%3N)

    RESULTS=""
    while IFS= read -r pool; do
      # With pipefail, a failing "zpool status" or a grep without a match
      # would abort the script here, before the annotation is closed and the
      # state file removed. Fall back to a placeholder instead.
      scan_line=$(zpool status "$pool" | grep "scan:" | sed 's/^[[:space:]]*//') \
        || scan_line="scan: unavailable"
      RESULTS="${RESULTS}${pool}: ${scan_line}"$'\n'
    done < <(zpool list -H -o name)

    TEXT=$(printf "ZFS scrub completed\n%s" "$RESULTS")

    # Best-effort: a failed PATCH must not prevent the cleanup below.
    curl -sf --max-time 5 \
      -X PATCH "$GRAFANA_URL/api/annotations/$ANN_ID" \
      -H "Content-Type: application/json" \
      -d "$(jq -n --arg text "$TEXT" --argjson timeEnd "$NOW_MS" \
        '{timeEnd: $timeEnd, text: $text}')" || true

    rm -f "$STATE_DIR/annotation-id"
    ;;

  *)
    echo "Usage: $0 {start|stop}" >&2
    exit 1
    ;;
esac
|
||||
@@ -25,6 +25,9 @@ in
|
||||
# jellyfin annotation service test
|
||||
jellyfinAnnotationsTest = handleTest ./jellyfin-annotations.nix;
|
||||
|
||||
# zfs scrub annotations test
|
||||
zfsScrubAnnotationsTest = handleTest ./zfs-scrub-annotations.nix;
|
||||
|
||||
# ntfy alerts test
|
||||
ntfyAlertsTest = handleTest ./ntfy-alerts.nix;
|
||||
|
||||
|
||||
123
tests/zfs-scrub-annotations.nix
Normal file
123
tests/zfs-scrub-annotations.nix
Normal file
@@ -0,0 +1,123 @@
|
||||
# NixOS VM test for services/zfs-scrub-annotations.sh. The script is run
# directly against a mock Grafana HTTP server and a fake `zpool` binary placed
# first on PATH.
{
  lib,
  pkgs,
  ...
}:
let
  mockServer = ./mock-grafana-server.py;

  # Stand-in for zpool: "list" prints two pool names, "status <pool>" prints
  # a canned scan line for that pool.
  mockZpool = pkgs.writeShellScript "zpool" ''
    case "$1" in
      list)
        echo "tank"
        echo "hdds"
        ;;
      status)
        pool="$2"
        case "$pool" in
          tank)
            echo " scan: scrub repaired 0B in 00:24:39 with 0 errors on Mon Jan 1 02:24:39 2024"
            ;;
          hdds)
            echo " scan: scrub repaired 0B in 04:12:33 with 0 errors on Mon Jan 1 06:12:33 2024"
            ;;
        esac
        ;;
    esac
  '';

  script = ../services/zfs-scrub-annotations.sh;
  python = pkgs.python3;
in
pkgs.testers.runNixOSTest {
  name = "zfs-scrub-annotations";

  nodes.machine =
    { pkgs, ... }:
    {
      environment.systemPackages = [
        pkgs.python3
        pkgs.curl
        pkgs.jq
      ];
    };

  testScript = ''
    import json

    GRAFANA_PORT = 13000
    ANNOTS_FILE = "/tmp/annotations.json"
    STATE_DIR = "/tmp/scrub-state"
    PYTHON = "${python}/bin/python3"
    MOCK = "${mockServer}"
    SCRIPT = "${script}"
    MOCK_ZPOOL = "${mockZpool}"

    MOCK_BIN = "/tmp/mock-bin"
    ENV_PREFIX = (
        f"GRAFANA_URL=http://127.0.0.1:{GRAFANA_PORT} "
        f"STATE_DIR={STATE_DIR} "
        f"PATH={MOCK_BIN}:$PATH "
    )

    def read_annotations():
        # The mock server persists its annotations as a JSON list in ANNOTS_FILE.
        raw = machine.succeed(f"cat {ANNOTS_FILE} 2>/dev/null || echo '[]'")
        return json.loads(raw.strip())

    def reset_annotations():
        machine.succeed(f"echo '[]' > {ANNOTS_FILE}")

    def run_script(action):
        # Runs the script under test with the mock zpool first on PATH.
        machine.succeed(f"{ENV_PREFIX} bash {SCRIPT} {action}")

    start_all()
    machine.wait_for_unit("multi-user.target")

    with subtest("Setup state directory and mock zpool"):
        machine.succeed(f"mkdir -p {STATE_DIR}")
        machine.succeed(f"mkdir -p {MOCK_BIN} && cp {MOCK_ZPOOL} {MOCK_BIN}/zpool && chmod +x {MOCK_BIN}/zpool")

    with subtest("Start mock Grafana server"):
        reset_annotations()
        machine.succeed(
            f"systemd-run --unit=mock-grafana {PYTHON} {MOCK} {GRAFANA_PORT} {ANNOTS_FILE}"
        )
        # Poll until the mock answers a POST, then discard the probe annotation.
        machine.wait_until_succeeds(
            f"curl -sf -X POST http://127.0.0.1:{GRAFANA_PORT}/api/annotations "
            f"-H 'Content-Type: application/json' -d '{{\"text\":\"ping\",\"tags\":[]}}' | grep -q id",
            timeout=10,
        )
        reset_annotations()

    with subtest("Start action creates annotation with pool names and zfs-scrub tag"):
        run_script("start")
        annots = read_annotations()
        assert len(annots) == 1, f"Expected 1 annotation, got: {annots}"
        assert "zfs-scrub" in annots[0].get("tags", []), f"Missing zfs-scrub tag: {annots[0]}"
        assert "tank" in annots[0]["text"], f"Missing tank in text: {annots[0]['text']}"
        assert "hdds" in annots[0]["text"], f"Missing hdds in text: {annots[0]['text']}"
        assert "time" in annots[0], f"Missing time field: {annots[0]}"
        assert "timeEnd" not in annots[0], f"timeEnd should not be set yet: {annots[0]}"

    with subtest("State file contains annotation ID"):
        ann_id = machine.succeed(f"cat {STATE_DIR}/annotation-id").strip()
        assert ann_id == "1", f"Expected annotation ID 1, got: {ann_id}"

    with subtest("Stop action closes annotation with per-pool scrub results"):
        run_script("stop")
        annots = read_annotations()
        assert len(annots) == 1, f"Expected 1 annotation, got: {annots}"
        assert "timeEnd" in annots[0], f"timeEnd should be set: {annots[0]}"
        assert annots[0]["timeEnd"] > annots[0]["time"], "timeEnd should be after time"
        text = annots[0]["text"]
        assert "ZFS scrub completed" in text, f"Missing completed text: {text}"
        assert "tank:" in text, f"Missing tank results: {text}"
        assert "hdds:" in text, f"Missing hdds results: {text}"
        assert "00:24:39" in text, f"Missing tank scrub duration: {text}"
        assert "04:12:33" in text, f"Missing hdds scrub duration: {text}"

    with subtest("State file cleaned up after stop"):
        machine.fail(f"test -f {STATE_DIR}/annotation-id")

    with subtest("Stop action handles missing state file gracefully"):
        run_script("stop")
        annots = read_annotations()
        assert len(annots) == 1, f"Expected no new annotations, got: {annots}"

    with subtest("Start action handles Grafana being down gracefully"):
        machine.succeed("systemctl stop mock-grafana")
        run_script("start")
        machine.fail(f"test -f {STATE_DIR}/annotation-id")
  '';
}
|
||||
Reference in New Issue
Block a user