From a3a670010621bd4d63d477df016c199a7bafd2c8 Mon Sep 17 00:00:00 2001 From: Simon Gardling Date: Thu, 9 Apr 2026 19:15:54 -0400 Subject: [PATCH] grafana: replace disk-usage-collector with prometheus-zfs-exporter The custom disk-usage-collector shell script + minutely timer is replaced by prometheus-zfs-exporter (pdf/zfs_exporter, packaged in nixpkgs as services.prometheus.exporters.zfs). The exporter provides pool capacity metrics (allocated/free/size) natively. Partition metrics (/boot, /persistent, /nix) now use node_exporter's built-in filesystem collector (node_filesystem_*_bytes) which already runs and collects these metrics. Also fixes a latent race condition in serviceMountWithZpool: the -mounts service now orders after zfs-mount.service (which runs 'zfs mount -a'), not just after pool import. Without this, the mount check could run before datasets are actually mounted. --- modules/lib.nix | 8 +++-- service-configs.nix | 4 +++ services/grafana/dashboard.nix | 10 +++--- services/grafana/default.nix | 1 - services/grafana/disk-usage-collector.nix | 38 -------------------- services/grafana/disk-usage-collector.sh | 44 ----------------------- services/grafana/prometheus.nix | 12 +++++++ 7 files changed, 27 insertions(+), 90 deletions(-) delete mode 100644 services/grafana/disk-usage-collector.nix delete mode 100644 services/grafana/disk-usage-collector.sh diff --git a/modules/lib.nix b/modules/lib.nix index 85d6b92..2d85360 100644 --- a/modules/lib.nix +++ b/modules/lib.nix @@ -59,8 +59,12 @@ inputs.nixpkgs.lib.extend ( { pkgs, config, ... }: { systemd.services."${serviceName}-mounts" = { - wants = [ "zfs.target" ] ++ lib.optionals (zpool != "") [ "zfs-import-${zpool}.service" ]; - after = lib.optionals (zpool != "") [ "zfs-import-${zpool}.service" ]; + wants = [ + "zfs.target" + "zfs-mount.service" + ] + ++ lib.optionals (zpool != "") [ "zfs-import-${zpool}.service" ]; + after = [ "zfs-mount.service" ] ++ lib.optionals (zpool != "") [ "zfs-import-${zpool}.service" ]; before = [ "${serviceName}.service" ]; serviceConfig = { diff --git a/service-configs.nix b/service-configs.nix index 24d8429..4cb200d 100644 --- a/service-configs.nix +++ b/service-configs.nix @@ -189,6 +189,10 @@ rec { port = 9563; proto = "tcp"; }; + prometheus_zfs = { + port = 9134; + proto = "tcp"; + }; harmonia = { port = 5500; proto = "tcp"; diff --git a/services/grafana/dashboard.nix b/services/grafana/dashboard.nix index 94d84c1..89ea472 100644 --- a/services/grafana/dashboard.nix +++ b/services/grafana/dashboard.nix @@ -613,13 +613,13 @@ let targets = [ { datasource = promDs; - expr = "zpool_used_bytes{pool=\"tank\"} / zpool_size_bytes{pool=\"tank\"} * 100"; + expr = "zfs_pool_allocated_bytes{pool=\"tank\"} / zfs_pool_size_bytes{pool=\"tank\"} * 100"; legendFormat = "tank"; refId = "A"; } { datasource = promDs; - expr = "zpool_used_bytes{pool=\"hdds\"} / zpool_size_bytes{pool=\"hdds\"} * 100"; + expr = "zfs_pool_allocated_bytes{pool=\"hdds\"} / zfs_pool_size_bytes{pool=\"hdds\"} * 100"; legendFormat = "hdds"; refId = "B"; } @@ -653,19 +653,19 @@ let targets = [ { datasource = promDs; - expr = "partition_used_bytes{mount=\"/boot\"} / partition_size_bytes{mount=\"/boot\"} * 100"; + expr = "(node_filesystem_size_bytes{mountpoint=\"/boot\"} - node_filesystem_avail_bytes{mountpoint=\"/boot\"}) / node_filesystem_size_bytes{mountpoint=\"/boot\"} * 100"; legendFormat = "/boot"; refId = "A"; } { datasource = promDs; - expr = "partition_used_bytes{mount=\"/persistent\"} / partition_size_bytes{mount=\"/persistent\"} * 100"; + expr = "(node_filesystem_size_bytes{mountpoint=\"/persistent\"} - node_filesystem_avail_bytes{mountpoint=\"/persistent\"}) / node_filesystem_size_bytes{mountpoint=\"/persistent\"} * 100"; legendFormat = "/persistent"; refId = "B"; } { datasource = promDs; - expr = "partition_used_bytes{mount=\"/nix\"} / partition_size_bytes{mount=\"/nix\"} * 100"; + expr = "(node_filesystem_size_bytes{mountpoint=\"/nix\"} - node_filesystem_avail_bytes{mountpoint=\"/nix\"}) / node_filesystem_size_bytes{mountpoint=\"/nix\"} * 100"; legendFormat = "/nix"; refId = "C"; } diff --git a/services/grafana/default.nix b/services/grafana/default.nix index ec4fb98..9985459 100644 --- a/services/grafana/default.nix +++ b/services/grafana/default.nix @@ -5,7 +5,6 @@ ./dashboard.nix ./exporters.nix ./jellyfin-annotations.nix - ./disk-usage-collector.nix ./llama-cpp-annotations.nix ./zfs-scrub-annotations.nix ]; diff --git a/services/grafana/disk-usage-collector.nix b/services/grafana/disk-usage-collector.nix deleted file mode 100644 index 9170b36..0000000 --- a/services/grafana/disk-usage-collector.nix +++ /dev/null @@ -1,38 +0,0 @@ -{ - config, - pkgs, - lib, - ... -}: -let - textfileDir = "/var/lib/prometheus-node-exporter-textfiles"; - - diskUsageCollector = pkgs.writeShellApplication { - name = "disk-usage-collector"; - runtimeInputs = with pkgs; [ - coreutils - gawk - config.boot.zfs.package - util-linux # for mountpoint - ]; - text = builtins.readFile ./disk-usage-collector.sh; - }; -in -lib.mkIf config.services.grafana.enable { - systemd.services.disk-usage-collector = { - description = "Collect ZFS pool and partition usage metrics for Prometheus"; - serviceConfig = { - Type = "oneshot"; - ExecStart = lib.getExe diskUsageCollector; - }; - environment.TEXTFILE = "${textfileDir}/disk-usage.prom"; - }; - - systemd.timers.disk-usage-collector = { - wantedBy = [ "timers.target" ]; - timerConfig = { - OnCalendar = "minutely"; - RandomizedDelaySec = "10s"; - }; - }; -} diff --git a/services/grafana/disk-usage-collector.sh b/services/grafana/disk-usage-collector.sh deleted file mode 100644 index 3874b53..0000000 --- a/services/grafana/disk-usage-collector.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env bash -# Collects ZFS pool utilization and boot partition usage for Prometheus textfile collector -set -euo pipefail - -TEXTFILE="${TEXTFILE:?TEXTFILE env required}" -TMP="${TEXTFILE}.$$" - -{ - echo '# HELP zpool_size_bytes Total size of ZFS pool in bytes' - echo '# TYPE zpool_size_bytes gauge' - echo '# HELP zpool_used_bytes Used space in ZFS pool in bytes' - echo '# TYPE zpool_used_bytes gauge' - echo '# HELP zpool_free_bytes Free space in ZFS pool in bytes' - echo '# TYPE zpool_free_bytes gauge' - - # -Hp: scripting mode, parseable, bytes - zpool list -Hp -o name,size,alloc,free | while IFS=$'\t' read -r name size alloc free; do - echo "zpool_size_bytes{pool=\"${name}\"} ${size}" - echo "zpool_used_bytes{pool=\"${name}\"} ${alloc}" - echo "zpool_free_bytes{pool=\"${name}\"} ${free}" - done - - echo '# HELP partition_size_bytes Total size of partition in bytes' - echo '# TYPE partition_size_bytes gauge' - echo '# HELP partition_used_bytes Used space on partition in bytes' - echo '# TYPE partition_used_bytes gauge' - echo '# HELP partition_free_bytes Free space on partition in bytes' - echo '# TYPE partition_free_bytes gauge' - - # Boot drive partitions: /boot (ESP), /persistent, /nix - # Use df with 1K blocks and convert to bytes - for mount in /boot /persistent /nix; do - if mountpoint -q "$mount" 2>/dev/null; then - read -r size used avail _ <<< "$(df -k --output=size,used,avail "$mount" | tail -1)" - size_b=$((size * 1024)) - used_b=$((used * 1024)) - avail_b=$((avail * 1024)) - echo "partition_size_bytes{mount=\"${mount}\"} ${size_b}" - echo "partition_used_bytes{mount=\"${mount}\"} ${used_b}" - echo "partition_free_bytes{mount=\"${mount}\"} ${avail_b}" - fi - done -} > "$TMP" -mv "$TMP" "$TEXTFILE" diff --git a/services/grafana/prometheus.nix b/services/grafana/prometheus.nix index 680ca3d..634de9b 100644 --- a/services/grafana/prometheus.nix +++ b/services/grafana/prometheus.nix @@ -44,6 +44,12 @@ in listenAddress = "127.0.0.1"; apcupsdAddress = "127.0.0.1:3551"; }; + + zfs = { + enable = true; + port = service_configs.ports.private.prometheus_zfs.port; + listenAddress = "127.0.0.1"; + }; }; scrapeConfigs = [ @@ -89,6 +95,12 @@ in { targets = [ "127.0.0.1:${toString service_configs.ports.private.igpu_exporter.port}" ]; } ]; } + { + job_name = "zfs"; + static_configs = [ + { targets = [ "127.0.0.1:${toString service_configs.ports.private.prometheus_zfs.port}" ]; } + ]; + } ]; };