Files
nixos/modules/server-deploy-guard.nix
Simon Gardling aef99e7365
Some checks failed
Build and Deploy / mreow (push) Successful in 51s
Build and Deploy / yarn (push) Successful in 47s
Build and Deploy / muffin (push) Failing after 1m9s
deploy-guard: block activation while users are online
- modules/server-deploy-guard.nix: extendable aggregator registered via
  services.deployGuard.checks.<name>.{description,command}. Installs
  deploy-guard-check with per-check timeout, pass/block reporting, JSON
  output, DEPLOY_GUARD_BYPASS / /run/deploy-guard-bypass (single-shot).
- services/jellyfin/jellyfin-deploy-guard.nix: curl+jq on /Sessions,
  blocks when any session carries NowPlayingItem; soft-fails when unreachable.
- services/minecraft-deploy-guard.nix: mcstatus SLP query on 25565, blocks
  when players.online > 0; soft-fails when unreachable.
- flake.nix: wrap deploy.nodes.muffin activation with activate.custom so
  deploy-guard-check runs before switch-to-configuration. Auto-rollback
  catches the failure. dryActivate/boot branches preserved.
- deploy.sh: SSH preflight for ./deploy.sh muffin with --force /
  DEPLOY_GUARD_FORCE=1 (touches remote bypass marker). Connectivity
  failure is soft; activation still enforces.
- tests/deploy-guard.nix: aggregator contract, bypass mechanics, timeout,
  JSON output.
2026-04-22 00:36:21 -04:00

174 lines
5.6 KiB
Nix

# Extendable deploy guard. Aggregates per-service "is it safe to deploy right now?"
# checks registered under `services.deployGuard.checks.<name>` and exposes a single
# `deploy-guard-check` binary that deploy-rs (and the local deploy.sh preflight)
# invokes before switch-to-configuration.
#
# Extension contract (per-service): register a submodule entry whose `command`
# package installs a single executable (via meta.mainProgram) that exits 0 when
# the service is idle and non-zero when a live user would be disrupted by a
# deploy. Human-readable reasons go to stdout/stderr.
#
# Bypass: export DEPLOY_GUARD_BYPASS=1 or touch /run/deploy-guard-bypass before
# invoking the aggregator. The marker file is single-shot; the aggregator
# removes it after honoring it.
{
config,
lib,
pkgs,
...
}:
let
# Shorthand for this module's own option values.
cfg = config.services.deployGuard;
# builtins.attrNames yields names in alphabetical order, so checks are invoked
# (and therefore reported) in a stable order across evaluations.
checkNames = builtins.attrNames cfg.checks;
# Shell fragment spliced into the aggregator script: one
# `run_check <name> <description> <executable>` call per registered check,
# separated by newlines. Name and description are shell-escaped; the executable
# is resolved from the check's `command` package via lib.getExe.
runCheckLines =
  let
    invocationFor =
      name:
      let
        check = cfg.checks.${name};
      in
      ''
        run_check ${lib.escapeShellArg name} \
        ${lib.escapeShellArg check.description} \
        ${lib.getExe check.command}
      '';
  in
  lib.concatStringsSep "\n" (map invocationFor checkNames);
# The `deploy-guard-check` executable: runs every registered check in order and
# exits 0 only when all of them pass (or when a bypass is in effect).
#
# Usage:   deploy-guard-check [--json]
# Exit:    0 = safe to deploy / bypassed, 1 = at least one check blocked.
# Output:  human-readable PASS/BLOCK lines on stdout, or with a leading --json
#          a single JSON object {bypassed, ok, checks:[...]} on stdout.
aggregator = pkgs.writeShellApplication {
  name = "deploy-guard-check";
  # Every external tool the script invokes is listed here so it does not depend
  # on whatever PATH the caller (deploy-rs activation, SSH one-liner, test VM)
  # happens to provide.
  runtimeInputs = [
    pkgs.coreutils # timeout, rm
    pkgs.gnused # sed, used to indent check output in BLOCK reports
    pkgs.jq
  ];
  text = ''
    # Only a leading --json argument switches to machine-readable output.
    json_mode=0
    if [[ "''${1:-}" == "--json" ]]; then
      json_mode=1
    fi

    # Bypass precedence: env var first (useful in ad-hoc SSH one-liners),
    # then file marker (written by `./deploy.sh muffin --force` over SSH).
    # The marker is single-shot: it is removed as soon as it is honored.
    bypass=0
    bypass_reason=""
    if [[ "''${DEPLOY_GUARD_BYPASS:-0}" == "1" ]]; then
      bypass=1
      bypass_reason="DEPLOY_GUARD_BYPASS=1"
    elif [[ -e /run/deploy-guard-bypass ]]; then
      bypass=1
      bypass_reason="/run/deploy-guard-bypass"
      rm -f /run/deploy-guard-bypass
    fi

    if [[ "$bypass" == "1" ]]; then
      if [[ "$json_mode" == "1" ]]; then
        jq -cn --arg reason "$bypass_reason" \
          '{bypassed:true, reason:$reason, ok:true, checks:[]}'
      else
        printf 'deploy-guard: BYPASS via %s no checks executed\n' "$bypass_reason" >&2
      fi
      exit 0
    fi

    # One compact JSON object per executed check, in execution order.
    declare -a results=()
    # Becomes 1 as soon as any check blocks; used as the exit status.
    overall=0

    # run_check NAME DESCRIPTION EXE
    # Runs EXE under the per-check timeout, captures stdout+stderr, records a
    # JSON result and (outside --json mode) prints a PASS/BLOCK line.
    run_check() {
      local name="$1" description="$2" exe="$3"
      local status=0 output=""
      # SIGTERM is sent when the timeout expires; --kill-after escalates to
      # SIGKILL 5s later so a check that ignores SIGTERM cannot hang the
      # deploy. A check stopped by SIGTERM yields status 124; one that needed
      # SIGKILL yields 137 and is reported through the generic BLOCK branch.
      # shellcheck disable=SC2034
      output=$(timeout --signal=TERM --kill-after=5 ${toString cfg.timeout} "$exe" 2>&1) || status=$?
      if [[ $status -eq 0 ]]; then
        results+=("$(jq -cn \
          --arg name "$name" \
          --arg description "$description" \
          --arg output "$output" \
          '{name:$name, description:$description, ok:true, output:$output}')")
        [[ "$json_mode" == "1" ]] || printf 'PASS: %s %s\n' "$name" "$description"
      else
        overall=1
        results+=("$(jq -cn \
          --arg name "$name" \
          --arg description "$description" \
          --arg output "$output" \
          --argjson exit "$status" \
          '{name:$name, description:$description, ok:false, exit:$exit, output:$output}')")
        if [[ "$json_mode" != "1" ]]; then
          if [[ $status -eq 124 ]]; then
            printf 'BLOCK: %s %s check timed out after ${toString cfg.timeout}s\n' \
              "$name" "$description"
          else
            printf 'BLOCK: %s %s\n' "$name" "$description"
            if [[ -n "$output" ]]; then
              printf '%s\n' "$output" | sed 's/^/ /'
            fi
          fi
        fi
      fi
    }

    # Generated at evaluation time: one run_check call per registered check.
    ${runCheckLines}

    if [[ "$json_mode" == "1" ]]; then
      ok=$([[ $overall -eq 0 ]] && echo true || echo false)
      # Join the per-check objects with commas. The ":-" default keeps the
      # expansion valid under nounset when no check ran; the resulting empty
      # element is skipped.
      joined=""
      for r in "''${results[@]:-}"; do
        if [[ -z "$r" ]]; then continue; fi
        if [[ -z "$joined" ]]; then joined="$r"; else joined="$joined,$r"; fi
      done
      printf '{"bypassed":false,"ok":%s,"checks":[%s]}\n' "$ok" "$joined"
    fi

    exit "$overall"
  '';
};
in
{
# Option declarations for the deploy guard: an enable switch, the per-check
# timeout, and the attribute set of checks that other modules contribute to.
options.services.deployGuard =
  let
    inherit (lib) mkEnableOption mkOption types;

    # Schema of a single `checks.<name>` entry.
    checkSubmodule = types.submodule {
      options = {
        description = mkOption {
          type = types.str;
          description = "Short human description shown in pass/fail output.";
        };
        command = mkOption {
          type = types.package;
          description = ''
            A derivation whose meta.mainProgram is the check executable.
            Contract: exit 0 when deploys are safe, non-zero with a
            human-readable reason on stdout/stderr when blocked.
          '';
        };
      };
    };
  in
  {
    enable = mkEnableOption "deploy guard aggregator for blocking deploys on live use";

    timeout = mkOption {
      type = types.ints.positive;
      default = 10;
      description = "Per-check timeout in seconds.";
    };

    checks = mkOption {
      type = types.attrsOf checkSubmodule;
      default = { };
      description = ''
        Per-service deploy guard checks. Merged from anywhere in the config.
        Any module can register a check see modules/server-deploy-guard.nix
        for the contract.
      '';
    };
  };
# Applied only when services.deployGuard.enable is set: installs the aggregator
# system-wide and fails evaluation if the guard is enabled with nothing to check.
config = lib.mkIf cfg.enable {
  assertions = lib.singleton {
    # An enabled guard with zero checks would always pass; reject that setup.
    assertion = cfg.checks != { };
    message = ''
      services.deployGuard.enable = true but no checks are registered.
      Either disable it or register at least one check via
      services.deployGuard.checks.<name>.
    '';
  };

  environment.systemPackages = lib.singleton aggregator;
};
}