188 lines
7.8 KiB
Nix
188 lines
7.8 KiB
Nix
# Deferred deploy finalize for deploy-rs-driven hosts.
#
# When deploy-rs activates via `switch-to-configuration switch` and the gitea-
# actions runner driving the deploy lives on the same host, the runner unit
# gets restarted mid-activation — its definition changes between builds. That
# restart kills the SSH session, the CI job, and deploy-rs's magic-rollback
# handshake, so CI reports failure even when the deploy itself completed.
# This is deploy-rs#153, open since 2022.
#
# This module breaks the dependency: activation does `switch-to-configuration
# boot` (bootloader only, no service restarts), then invokes deploy-finalize
# which schedules a detached systemd transient unit that fires `delay` seconds
# later with the real `switch` (or `systemctl reboot` when the kernel, initrd,
# or kernel-modules changed since boot). The transient unit is owned by pid1,
# so it survives the runner's eventual restart — by which time the CI job has
# finished reporting.
#
# Prior art (reboot-or-switch logic, not the self-deploy detachment):
#   - nixpkgs `system.autoUpgrade` (allowReboot = true branch) is the canonical
#     source of the 3-path {initrd,kernel,kernel-modules} comparison.
#   - obsidiansystems/obelisk#957 merged the same snippet into `ob deploy` for
#     push-based remote deploys — but doesn't need detachment since its deployer
#     lives on a different machine from the target.
#   - nixpkgs#185030 tracks lifting this into switch-to-configuration proper.
#     Stale since 2025-07; until it lands, every downstream reimplements it.
#
# Bootstrap note: the activation snippet resolves deploy-finalize via
# lib.getExe (store path), not via `/run/current-system/sw/bin` — `boot` mode
# does not update `/run/current-system`, so the old binary would be resolved.
|
|
# Module entry point: a function from the standard module arguments to the
# option declarations and configuration defined in the body below.
{ config, lib, pkgs, ... }:
let
  # Shorthand for this module's own option values (`enable`, `delay`).
  cfg = config.services.deployFinalize;
|
|
|
|
# `deploy-finalize` — schedules the deferred second half of a deploy.
#
# Compares {kernel,initrd,kernel-modules} of /run/booted-system against the
# target profile and arms a transient systemd timer that, after `delay`, runs
# either `systemctl reboot` (any of the three differ) or the profile's
# `switch-to-configuration switch` (none differ).
#
# CLI: deploy-finalize [--dry-run] [--delay N] [--profile PATH]
# Exit status:
#   0  timer armed, or --dry-run / --help completed
#   1  the profile lacks a resolvable kernel, initrd, or kernel-modules
#   2  usage error (unknown option, or an option missing its value)
#   other: whatever systemd-run returns when arming the timer fails
finalize = pkgs.writeShellApplication {
  name = "deploy-finalize";
  runtimeInputs = [
    pkgs.coreutils
    pkgs.systemd
  ];
  text = ''
    delay=${toString cfg.delay}
    profile=/nix/var/nix/profiles/system
    dry_run=0

    usage() {
      cat <<EOF
    Usage: deploy-finalize [--dry-run] [--delay N] [--profile PATH]

    Compares /run/booted-system against PATH (default /nix/var/nix/profiles/system)
    and schedules either \`systemctl reboot\` (kernel, initrd, or kernel-modules
    changed) or \`switch-to-configuration switch\` (services only) via a detached
    systemd-run timer firing N seconds later.

    Options:
      --dry-run       Print the decision and would-be command without scheduling.
      --delay N       Override the delay in seconds. Default: ${toString cfg.delay}.
      --profile PATH  Override the profile path used for comparison.
    EOF
    }

    # Guard for value-taking options. Called with the remaining argv; if the
    # option is the last word, report a usage error (exit 2) instead of
    # letting nounset abort on an unbound $2.
    need_value() {
      if [[ $# -lt 2 ]]; then
        echo "deploy-finalize: option $1 requires a value" >&2
        usage >&2
        exit 2
      fi
    }

    while [[ $# -gt 0 ]]; do
      case "$1" in
        --dry-run) dry_run=1; shift ;;
        --delay) need_value "$@"; delay="$2"; shift 2 ;;
        --profile) need_value "$@"; profile="$2"; shift 2 ;;
        -h|--help) usage; exit 0 ;;
        *)
          echo "deploy-finalize: unknown option $1" >&2
          usage >&2
          exit 2
          ;;
      esac
    done

    # Comparing {kernel,initrd,kernel-modules} matches nixpkgs's canonical
    # `system.autoUpgrade` allowReboot logic. -e (not -f) so a dangling
    # symlink counts as missing: on a real NixOS profile all three exist,
    # but defensive: if a profile has bad symlinks we refuse to schedule
    # rather than scheduling against ghost paths.
    booted_kernel="$(readlink -e /run/booted-system/kernel 2>/dev/null || true)"
    booted_initrd="$(readlink -e /run/booted-system/initrd 2>/dev/null || true)"
    booted_modules="$(readlink -e /run/booted-system/kernel-modules 2>/dev/null || true)"
    new_kernel="$(readlink -e "$profile/kernel" 2>/dev/null || true)"
    new_initrd="$(readlink -e "$profile/initrd" 2>/dev/null || true)"
    new_modules="$(readlink -e "$profile/kernel-modules" 2>/dev/null || true)"

    if [[ -z "$new_kernel" || -z "$new_initrd" || -z "$new_modules" ]]; then
      echo "deploy-finalize: refusing to schedule — $profile is missing kernel, initrd, or kernel-modules" >&2
      exit 1
    fi

    changed=()
    if [[ -z "$booted_kernel" || -z "$booted_initrd" || -z "$booted_modules" ]]; then
      # Unreachable on a booted NixOS, but fail closed on reboot.
      changed+=("/run/booted-system incomplete")
    fi
    [[ "$booted_kernel" != "$new_kernel" ]] && changed+=("kernel")
    [[ "$booted_initrd" != "$new_initrd" ]] && changed+=("initrd")
    [[ "$booted_modules" != "$new_modules" ]] && changed+=("kernel-modules")

    reboot_needed=0
    reason=""
    if [[ ''${#changed[@]} -gt 0 ]]; then
      reboot_needed=1
      # Join with commas so the reason reads as e.g. `kernel,initrd changed`.
      reason="$(IFS=, ; echo "''${changed[*]}") changed"
    fi

    # The command is kept as an argv array and handed to systemd-run as-is,
    # so no intermediate shell re-parses $profile (paths containing spaces
    # or shell metacharacters stay a single argument).
    if [[ "$reboot_needed" == 1 ]]; then
      action=reboot
      cmd=(systemctl reboot)
    else
      action=switch
      reason="services only"
      cmd=("$profile/bin/switch-to-configuration" switch)
    fi

    # Nanosecond suffix so back-to-back deploys don't collide on unit names.
    unit="deploy-finalize-$(date +%s%N)"

    printf 'deploy-finalize: booted_kernel=%s\n' "$booted_kernel"
    printf 'deploy-finalize: new_kernel=%s\n' "$new_kernel"
    printf 'deploy-finalize: booted_initrd=%s\n' "$booted_initrd"
    printf 'deploy-finalize: new_initrd=%s\n' "$new_initrd"
    printf 'deploy-finalize: booted_kernel-modules=%s\n' "$booted_modules"
    printf 'deploy-finalize: new_kernel-modules=%s\n' "$new_modules"
    printf 'deploy-finalize: action=%s reason=%s delay=%ss unit=%s\n' \
      "$action" "$reason" "$delay" "$unit"

    if [[ "$dry_run" == 1 ]]; then
      printf 'deploy-finalize: dry-run — not scheduling\n'
      printf 'deploy-finalize: would run: %s\n' "''${cmd[*]}"
      printf 'deploy-finalize: would schedule: systemd-run --collect --unit=%s --on-active=%s\n' \
        "$unit" "$delay"
      exit 0
    fi

    # Cancel any still-pending finalize timers from an earlier deploy so this
    # invocation is authoritative. Without this a stale timer could fire with
    # the old profile's action (reboot/switch) against the new profile and
    # briefly run new userspace under the old kernel. Best-effort: errors
    # from the stop are deliberately ignored.
    systemctl stop 'deploy-finalize-*.timer' 2>/dev/null || true

    # --on-active arms a transient timer owned by pid1. systemd-run returns
    # once the timer is armed; the SSH session that called us can exit and
    # the gitea-runner can be restarted (by the switch the timer fires)
    # without affecting whether the finalize runs.
    systemd-run \
      --collect \
      --unit="$unit" \
      --description="Finalize NixOS deploy ($action after boot-mode activation)" \
      --on-active="$delay" \
      -- "''${cmd[@]}"
  '';
};
|
|
in
{
  # Master switch; nothing under `config` below takes effect unless it is set.
  options.services.deployFinalize.enable =
    lib.mkEnableOption "deferred deploy finalize (switch or reboot) after boot-mode activation";

  # Default delay baked into the deploy-finalize script (overridable at run
  # time with `--delay`). Must be a positive integer.
  options.services.deployFinalize.delay = lib.mkOption {
    description = ''
      Seconds between the deploy-rs activation completing and the scheduled
      finalize firing. Tuned so the CI job (or manual SSH session) has time
      to complete status reporting before the runner is restarted by the
      eventual switch-to-configuration.
    '';
    default = 60;
    type = lib.types.ints.positive;
  };

  config = lib.mkIf cfg.enable {
    # Exposed for the deploy-rs activation snippet to reference by /nix/store
    # path via lib.getExe — `boot` mode does not update /run/current-system,
    # so reading through /run/current-system/sw/bin would resolve to the OLD
    # binary on a new-feature rollout or immediately after a rollback.
    system.build.deployFinalize = finalize;

    # Also put the tool on the system PATH for manual invocation.
    environment.systemPackages = [ finalize ];
  };
}
|