This repository has been archived on 2026-04-18. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
server-config/services/monitoring.nix
Simon Gardling bfe7a65db2 monitoring: add zpool and boot partition usage metrics
Add textfile collector for ZFS pool utilization (tank, hdds) and
boot drive partitions (/boot, /persistent, /nix). Runs every 60s.
Add two Grafana dashboard panels: ZFS Pool Utilization and Boot
Drive Partitions as Row 5.
2026-04-02 18:02:23 -04:00

999 lines
24 KiB
Nix

{
config,
pkgs,
service_configs,
lib,
...
}:
let
textfileDir = "/var/lib/prometheus-node-exporter-textfiles";
promDs = {
type = "prometheus";
uid = "prometheus";
};
jellyfinCollector = pkgs.writeShellApplication {
name = "jellyfin-metrics-collector";
runtimeInputs = with pkgs; [
curl
jq
];
text = ''
API_KEY=$(cat "$CREDENTIALS_DIRECTORY/jellyfin-api-key")
JELLYFIN="http://127.0.0.1:${toString service_configs.ports.private.jellyfin.port}"
if response=$(curl -sf --max-time 5 "''${JELLYFIN}/Sessions?api_key=''${API_KEY}"); then
active_streams=$(echo "$response" | jq '[.[] | select(.NowPlayingItem != null)] | length')
else
active_streams=0
fi
{
echo '# HELP jellyfin_active_streams Number of currently active Jellyfin streams'
echo '# TYPE jellyfin_active_streams gauge'
echo "jellyfin_active_streams $active_streams"
} > "${textfileDir}/jellyfin.prom.$$.tmp"
mv "${textfileDir}/jellyfin.prom.$$.tmp" "${textfileDir}/jellyfin.prom"
'';
};
intelGpuCollector = pkgs.writeShellApplication {
name = "intel-gpu-collector";
runtimeInputs = with pkgs; [
python3
intel-gpu-tools
];
text = ''
exec python3 ${./intel-gpu-collector.py}
'';
};
qbittorrentCollector = pkgs.writeShellApplication {
name = "qbittorrent-collector";
runtimeInputs = with pkgs; [
curl
jq
];
text = ''
QBIT="http://${config.vpnNamespaces.wg.namespaceAddress}:${toString config.services.qbittorrent.webuiPort}"
OUT="${textfileDir}/qbittorrent.prom"
if info=$(curl -sf --max-time 5 "''${QBIT}/api/v2/transfer/info"); then
dl=$(echo "$info" | jq '.dl_info_speed')
ul=$(echo "$info" | jq '.up_info_speed')
else
dl=0
ul=0
fi
{
echo '# HELP qbittorrent_download_bytes_per_second Current download speed in bytes/s'
echo '# TYPE qbittorrent_download_bytes_per_second gauge'
echo "qbittorrent_download_bytes_per_second $dl"
echo '# HELP qbittorrent_upload_bytes_per_second Current upload speed in bytes/s'
echo '# TYPE qbittorrent_upload_bytes_per_second gauge'
echo "qbittorrent_upload_bytes_per_second $ul"
} > "''${OUT}.tmp"
mv "''${OUT}.tmp" "$OUT"
'';
};
diskUsageCollector = pkgs.writeShellApplication {
name = "disk-usage-collector";
runtimeInputs = with pkgs; [
coreutils
gawk
config.boot.zfs.package
util-linux # for mountpoint
];
text = builtins.readFile ./disk-usage-collector.sh;
};
dashboard = {
editable = true;
graphTooltip = 1;
schemaVersion = 39;
tags = [
"system"
"monitoring"
];
time = {
from = "now-6h";
to = "now";
};
timezone = "browser";
title = "System Overview";
uid = "system-overview";
annotations.list = [
{
name = "Jellyfin Streams";
datasource = {
type = "grafana";
uid = "-- Grafana --";
};
enable = true;
iconColor = "green";
showIn = 0;
type = "tags";
tags = [ "jellyfin" ];
}
{
name = "ZFS Scrubs";
datasource = {
type = "grafana";
uid = "-- Grafana --";
};
enable = true;
iconColor = "orange";
showIn = 0;
type = "tags";
tags = [ "zfs-scrub" ];
}
{
name = "LLM Requests";
datasource = {
type = "grafana";
uid = "-- Grafana --";
};
enable = true;
iconColor = "purple";
showIn = 0;
type = "tags";
tags = [ "llama-cpp" ];
}
];
panels = [
# -- Row 1: UPS --
{
id = 1;
type = "timeseries";
title = "UPS Power Draw";
gridPos = {
h = 8;
w = 8;
x = 0;
y = 0;
};
datasource = promDs;
targets = [
{
datasource = promDs;
expr = "apcupsd_ups_load_percent / 100 * apcupsd_nominal_power_watts";
legendFormat = "Power (W)";
refId = "A";
}
{
datasource = promDs;
expr = "avg_over_time((apcupsd_ups_load_percent / 100 * apcupsd_nominal_power_watts + 4.5)[5m:])";
legendFormat = "5m average (W)";
refId = "B";
}
];
fieldConfig = {
defaults = {
unit = "watt";
color.mode = "palette-classic";
custom = {
lineWidth = 2;
fillOpacity = 20;
spanNulls = true;
};
};
overrides = [
{
matcher = {
id = "byFrameRefID";
options = "A";
};
properties = [
{
id = "custom.lineStyle";
value = {
fill = "dot";
};
}
{
id = "custom.fillOpacity";
value = 10;
}
{
id = "custom.lineWidth";
value = 1;
}
{
id = "custom.pointSize";
value = 1;
}
];
}
{
matcher = {
id = "byFrameRefID";
options = "B";
};
properties = [
{
id = "custom.lineWidth";
value = 4;
}
{
id = "custom.fillOpacity";
value = 0;
}
];
}
];
};
}
{
id = 7;
type = "stat";
title = "Energy Usage (24h)";
gridPos = {
h = 8;
w = 4;
x = 8;
y = 0;
};
datasource = promDs;
targets = [
{
datasource = promDs;
expr = "avg_over_time((apcupsd_ups_load_percent / 100 * apcupsd_nominal_power_watts + 4.5)[24h:]) * 24 / 1000";
legendFormat = "";
refId = "A";
}
];
fieldConfig = {
defaults = {
unit = "kwatth";
decimals = 2;
thresholds = {
mode = "absolute";
steps = [
{
color = "green";
value = null;
}
{
color = "yellow";
value = 5;
}
{
color = "red";
value = 10;
}
];
};
};
overrides = [ ];
};
options = {
reduceOptions = {
calcs = [ "lastNotNull" ];
fields = "";
values = false;
};
colorMode = "value";
graphMode = "none";
};
}
{
id = 2;
type = "gauge";
title = "UPS Load";
gridPos = {
h = 8;
w = 6;
x = 12;
y = 0;
};
datasource = promDs;
targets = [
{
datasource = promDs;
expr = "apcupsd_ups_load_percent";
refId = "A";
}
];
fieldConfig = {
defaults = {
unit = "percent";
min = 0;
max = 100;
thresholds = {
mode = "absolute";
steps = [
{
color = "green";
value = null;
}
{
color = "yellow";
value = 70;
}
{
color = "red";
value = 90;
}
];
};
};
overrides = [ ];
};
options.reduceOptions = {
calcs = [ "lastNotNull" ];
fields = "";
values = false;
};
}
{
id = 3;
type = "gauge";
title = "UPS Battery";
gridPos = {
h = 8;
w = 6;
x = 18;
y = 0;
};
datasource = promDs;
targets = [
{
datasource = promDs;
expr = "apcupsd_battery_charge_percent";
refId = "A";
}
];
fieldConfig = {
defaults = {
unit = "percent";
min = 0;
max = 100;
thresholds = {
mode = "absolute";
steps = [
{
color = "red";
value = null;
}
{
color = "yellow";
value = 20;
}
{
color = "green";
value = 50;
}
];
};
};
overrides = [ ];
};
options.reduceOptions = {
calcs = [ "lastNotNull" ];
fields = "";
values = false;
};
}
# -- Row 2: System --
{
id = 4;
type = "timeseries";
title = "CPU Temperature";
gridPos = {
h = 8;
w = 12;
x = 0;
y = 8;
};
datasource = promDs;
targets = [
{
datasource = promDs;
expr = ''node_hwmon_temp_celsius{chip=~"pci.*"}'';
legendFormat = "CPU {{sensor}}";
refId = "A";
}
];
fieldConfig = {
defaults = {
unit = "celsius";
color.mode = "palette-classic";
custom = {
lineWidth = 2;
fillOpacity = 10;
spanNulls = true;
};
};
overrides = [ ];
};
}
{
id = 5;
type = "stat";
title = "System Uptime";
gridPos = {
h = 8;
w = 6;
x = 12;
y = 8;
};
datasource = promDs;
targets = [
{
datasource = promDs;
expr = "time() - node_boot_time_seconds";
refId = "A";
}
];
fieldConfig = {
defaults = {
unit = "s";
thresholds = {
mode = "absolute";
steps = [
{
color = "green";
value = null;
}
];
};
};
overrides = [ ];
};
options = {
reduceOptions = {
calcs = [ "lastNotNull" ];
fields = "";
values = false;
};
colorMode = "value";
graphMode = "none";
};
}
{
id = 6;
type = "stat";
title = "Jellyfin Active Streams";
gridPos = {
h = 8;
w = 6;
x = 18;
y = 8;
};
datasource = promDs;
targets = [
{
datasource = promDs;
expr = "jellyfin_active_streams";
refId = "A";
}
];
fieldConfig = {
defaults = {
thresholds = {
mode = "absolute";
steps = [
{
color = "green";
value = null;
}
{
color = "yellow";
value = 3;
}
{
color = "red";
value = 6;
}
];
};
};
overrides = [ ];
};
options = {
reduceOptions = {
calcs = [ "lastNotNull" ];
fields = "";
values = false;
};
colorMode = "value";
graphMode = "area";
};
}
# -- Row 3: qBittorrent --
{
id = 11;
type = "timeseries";
title = "qBittorrent Speed";
gridPos = {
h = 8;
w = 24;
x = 0;
y = 16;
};
datasource = promDs;
targets = [
{
datasource = promDs;
expr = "qbittorrent_download_bytes_per_second";
legendFormat = "Download";
refId = "A";
}
{
datasource = promDs;
expr = "qbittorrent_upload_bytes_per_second";
legendFormat = "Upload";
refId = "B";
}
{
datasource = promDs;
expr = "avg_over_time(qbittorrent_download_bytes_per_second[10m:])";
legendFormat = "Download (10m avg)";
refId = "C";
}
{
datasource = promDs;
expr = "avg_over_time(qbittorrent_upload_bytes_per_second[10m:])";
legendFormat = "Upload (10m avg)";
refId = "D";
}
];
fieldConfig = {
defaults = {
unit = "binBps";
min = 0;
color.mode = "palette-classic";
custom = {
lineWidth = 1;
fillOpacity = 10;
spanNulls = true;
};
};
overrides = [
{
matcher = {
id = "byFrameRefID";
options = "A";
};
properties = [
{
id = "color";
value = {
fixedColor = "green";
mode = "fixed";
};
}
{
id = "custom.fillOpacity";
value = 5;
}
];
}
{
matcher = {
id = "byFrameRefID";
options = "B";
};
properties = [
{
id = "color";
value = {
fixedColor = "blue";
mode = "fixed";
};
}
{
id = "custom.fillOpacity";
value = 5;
}
];
}
{
matcher = {
id = "byFrameRefID";
options = "C";
};
properties = [
{
id = "color";
value = {
fixedColor = "green";
mode = "fixed";
};
}
{
id = "custom.lineWidth";
value = 3;
}
{
id = "custom.fillOpacity";
value = 0;
}
];
}
{
matcher = {
id = "byFrameRefID";
options = "D";
};
properties = [
{
id = "color";
value = {
fixedColor = "blue";
mode = "fixed";
};
}
{
id = "custom.lineWidth";
value = 3;
}
{
id = "custom.fillOpacity";
value = 0;
}
];
}
];
};
}
# -- Row 4: Intel GPU --
{
id = 8;
type = "timeseries";
title = "Intel GPU Utilization";
gridPos = {
h = 8;
w = 24;
x = 0;
y = 24;
};
datasource = promDs;
targets = [
{
datasource = promDs;
expr = "intel_gpu_engine_busy_percent";
legendFormat = "{{engine}}";
refId = "A";
}
];
fieldConfig = {
defaults = {
unit = "percent";
min = 0;
max = 100;
color.mode = "palette-classic";
custom = {
lineWidth = 2;
fillOpacity = 10;
spanNulls = true;
};
};
overrides = [ ];
};
}
# -- Row 5: Storage --
{
id = 12;
type = "timeseries";
title = "ZFS Pool Utilization";
gridPos = {
h = 8;
w = 12;
x = 0;
y = 32;
};
datasource = promDs;
targets = [
{
datasource = promDs;
expr = "zpool_used_bytes{pool=\"tank\"} / zpool_size_bytes{pool=\"tank\"} * 100";
legendFormat = "tank";
refId = "A";
}
{
datasource = promDs;
expr = "zpool_used_bytes{pool=\"hdds\"} / zpool_size_bytes{pool=\"hdds\"} * 100";
legendFormat = "hdds";
refId = "B";
}
];
fieldConfig = {
defaults = {
unit = "percent";
min = 0;
max = 100;
color.mode = "palette-classic";
custom = {
lineWidth = 2;
fillOpacity = 20;
spanNulls = true;
};
};
overrides = [ ];
};
}
{
id = 13;
type = "timeseries";
title = "Boot Drive Partitions";
gridPos = {
h = 8;
w = 12;
x = 12;
y = 32;
};
datasource = promDs;
targets = [
{
datasource = promDs;
expr = "partition_used_bytes{mount=\"/boot\"} / partition_size_bytes{mount=\"/boot\"} * 100";
legendFormat = "/boot";
refId = "A";
}
{
datasource = promDs;
expr = "partition_used_bytes{mount=\"/persistent\"} / partition_size_bytes{mount=\"/persistent\"} * 100";
legendFormat = "/persistent";
refId = "B";
}
{
datasource = promDs;
expr = "partition_used_bytes{mount=\"/nix\"} / partition_size_bytes{mount=\"/nix\"} * 100";
legendFormat = "/nix";
refId = "C";
}
];
fieldConfig = {
defaults = {
unit = "percent";
min = 0;
max = 100;
color.mode = "palette-classic";
custom = {
lineWidth = 2;
fillOpacity = 20;
spanNulls = true;
};
};
overrides = [ ];
};
}
];
};
in
{
imports = [
(lib.serviceMountWithZpool "grafana" service_configs.zpool_ssds [
service_configs.grafana.dir
])
(lib.serviceFilePerms "grafana" [
"Z ${service_configs.grafana.dir} 0700 grafana grafana"
])
(lib.serviceMountWithZpool "prometheus" service_configs.zpool_ssds [
"/var/lib/prometheus"
])
(lib.serviceFilePerms "prometheus" [
"Z /var/lib/prometheus 0700 prometheus prometheus"
])
];
# -- Prometheus --
services.prometheus = {
enable = true;
port = service_configs.ports.private.prometheus.port;
listenAddress = "127.0.0.1";
stateDir = "prometheus";
retentionTime = "90d";
exporters = {
node = {
enable = true;
port = service_configs.ports.private.prometheus_node.port;
listenAddress = "127.0.0.1";
enabledCollectors = [
"hwmon"
"systemd"
"textfile"
];
extraFlags = [
"--collector.textfile.directory=${textfileDir}"
];
};
apcupsd = {
enable = true;
port = service_configs.ports.private.prometheus_apcupsd.port;
listenAddress = "127.0.0.1";
apcupsdAddress = "127.0.0.1:3551";
};
};
scrapeConfigs = [
{
job_name = "prometheus";
static_configs = [
{ targets = [ "127.0.0.1:${toString service_configs.ports.private.prometheus.port}" ]; }
];
}
{
job_name = "node";
static_configs = [
{ targets = [ "127.0.0.1:${toString service_configs.ports.private.prometheus_node.port}" ]; }
];
}
{
job_name = "apcupsd";
static_configs = [
{ targets = [ "127.0.0.1:${toString service_configs.ports.private.prometheus_apcupsd.port}" ]; }
];
}
];
};
# -- Grafana --
services.grafana = {
enable = true;
dataDir = service_configs.grafana.dir;
settings = {
server = {
http_addr = "127.0.0.1";
http_port = service_configs.ports.private.grafana.port;
domain = service_configs.grafana.domain;
root_url = "https://${service_configs.grafana.domain}";
};
"auth.anonymous" = {
enabled = true;
org_role = "Admin";
};
"auth.basic".enabled = false;
"auth".disable_login_form = true;
analytics.reporting_enabled = false;
feature_toggles.enable = "dataConnectionsConsole=false";
users.default_theme = "dark";
# Disable unused built-in integrations
alerting.enabled = false;
"unified_alerting".enabled = false;
explore.enabled = false;
news.news_feed_enabled = false;
plugins = {
enable_alpha = false;
plugin_admin_enabled = false;
};
};
provision = {
datasources.settings = {
apiVersion = 1;
datasources = [
{
name = "Prometheus";
type = "prometheus";
url = "http://127.0.0.1:${toString service_configs.ports.private.prometheus.port}";
access = "proxy";
isDefault = true;
editable = false;
uid = "prometheus";
}
];
};
dashboards.settings.providers = [
{
name = "system";
type = "file";
options.path = "/etc/grafana-dashboards";
disableDeletion = true;
updateIntervalSeconds = 60;
}
];
};
};
environment.etc."grafana-dashboards/system-overview.json" = {
text = builtins.toJSON dashboard;
mode = "0444";
};
services.caddy.virtualHosts."${service_configs.grafana.domain}".extraConfig = ''
import ${config.age.secrets.caddy_auth.path}
reverse_proxy :${builtins.toString service_configs.ports.private.grafana.port}
'';
# -- Jellyfin active-stream prometheus textfile collector --
systemd.services.jellyfin-metrics-collector = {
description = "Collect Jellyfin metrics for Prometheus";
after = [ "network.target" ];
serviceConfig = {
Type = "oneshot";
ExecStart = lib.getExe jellyfinCollector;
LoadCredential = "jellyfin-api-key:${config.age.secrets.jellyfin-api-key.path}";
};
};
systemd.timers.jellyfin-metrics-collector = {
wantedBy = [ "timers.target" ];
timerConfig = {
OnCalendar = "*:*:0/30";
RandomizedDelaySec = "5s";
};
};
# -- Intel GPU textfile collector --
systemd.services.intel-gpu-collector = {
description = "Collect Intel GPU metrics for Prometheus";
serviceConfig = {
Type = "oneshot";
ExecStart = lib.getExe intelGpuCollector;
};
environment.TEXTFILE = "${textfileDir}/intel-gpu.prom";
};
systemd.timers.intel-gpu-collector = {
wantedBy = [ "timers.target" ];
timerConfig = {
OnCalendar = "*:*:0/30";
RandomizedDelaySec = "10s";
};
};
# -- qBittorrent speed textfile collector --
systemd.services.qbittorrent-collector = {
description = "Collect qBittorrent transfer metrics for Prometheus";
after = [
"network.target"
"qbittorrent.service"
];
serviceConfig = {
Type = "oneshot";
ExecStart = lib.getExe qbittorrentCollector;
};
};
systemd.timers.qbittorrent-collector = {
wantedBy = [ "timers.target" ];
timerConfig = {
OnCalendar = "*:*:0/15";
RandomizedDelaySec = "3s";
};
};
# -- Disk/pool usage textfile collector --
systemd.services.disk-usage-collector = {
description = "Collect ZFS pool and partition usage metrics for Prometheus";
serviceConfig = {
Type = "oneshot";
ExecStart = lib.getExe diskUsageCollector;
};
environment.TEXTFILE = "${textfileDir}/disk-usage.prom";
};
systemd.timers.disk-usage-collector = {
wantedBy = [ "timers.target" ];
timerConfig = {
OnCalendar = "*:*:0/60"; # every 60 seconds
RandomizedDelaySec = "10s";
};
};
systemd.tmpfiles.rules = [
"d ${textfileDir} 0755 root root -"
];
}