deploy: potentially fix self-deploy issue?
This commit is contained in:
187
modules/server-deploy-finalize.nix
Normal file
187
modules/server-deploy-finalize.nix
Normal file
@@ -0,0 +1,187 @@
|
||||
# Deferred deploy finalize for deploy-rs-driven hosts.
|
||||
#
|
||||
# When deploy-rs activates via `switch-to-configuration switch` and the gitea-
|
||||
# actions runner driving the deploy lives on the same host, the runner unit
|
||||
# gets restarted mid-activation — its definition changes between builds. That
|
||||
# restart kills the SSH session, the CI job, and deploy-rs's magic-rollback
|
||||
# handshake, so CI reports failure even when the deploy itself completed.
|
||||
# This is deploy-rs#153, open since 2022.
|
||||
#
|
||||
# This module breaks the dependency: activation does `switch-to-configuration
|
||||
# boot` (bootloader only, no service restarts), then invokes deploy-finalize
|
||||
# which schedules a detached systemd transient unit that fires `delay` seconds
|
||||
# later with the real `switch` (or `systemctl reboot` when the kernel, initrd,
|
||||
# or kernel-modules changed since boot). The transient unit is owned by pid1,
|
||||
# so it survives the runner's eventual restart — by which time the CI job has
|
||||
# finished reporting.
|
||||
#
|
||||
# Prior art (reboot-or-switch logic, not the self-deploy detachment):
|
||||
# - nixpkgs `system.autoUpgrade` (allowReboot = true branch) is the canonical
|
||||
# source of the 3-path {initrd,kernel,kernel-modules} comparison.
|
||||
# - obsidiansystems/obelisk#957 merged the same snippet into `ob deploy` for
|
||||
# push-based remote deploys — but doesn't need detachment since its deployer
|
||||
# lives on a different machine from the target.
|
||||
# - nixpkgs#185030 tracks lifting this into switch-to-configuration proper.
|
||||
# Stale since 2025-07; until it lands, every downstream reimplements it.
|
||||
#
|
||||
# Bootstrap note: the activation snippet resolves deploy-finalize via
|
||||
# lib.getExe (store path), not via `/run/current-system/sw/bin` — `boot` mode
|
||||
# does not update `/run/current-system`, so the old binary would be resolved.
|
||||
{
|
||||
config,
|
||||
lib,
|
||||
pkgs,
|
||||
...
|
||||
}:
|
||||
let
|
||||
cfg = config.services.deployFinalize;
|
||||
|
||||
finalize = pkgs.writeShellApplication {
|
||||
name = "deploy-finalize";
|
||||
runtimeInputs = [
|
||||
pkgs.coreutils
|
||||
pkgs.systemd
|
||||
];
|
||||
text = ''
|
||||
delay=${toString cfg.delay}
|
||||
profile=/nix/var/nix/profiles/system
|
||||
dry_run=0
|
||||
|
||||
usage() {
|
||||
cat <<EOF
|
||||
Usage: deploy-finalize [--dry-run] [--delay N] [--profile PATH]
|
||||
|
||||
Compares /run/booted-system against PATH (default /nix/var/nix/profiles/system)
|
||||
and schedules either \`systemctl reboot\` (kernel or initrd changed) or
|
||||
\`switch-to-configuration switch\` (services only) via a detached systemd-run
|
||||
timer firing N seconds later.
|
||||
|
||||
Options:
|
||||
--dry-run Print the decision and would-be command without scheduling.
|
||||
--delay N Override the delay in seconds. Default: ${toString cfg.delay}.
|
||||
--profile PATH Override the profile path used for comparison.
|
||||
EOF
|
||||
}
|
||||
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case "$1" in
|
||||
--dry-run) dry_run=1; shift ;;
|
||||
--delay) delay="$2"; shift 2 ;;
|
||||
--profile) profile="$2"; shift 2 ;;
|
||||
-h|--help) usage; exit 0 ;;
|
||||
*)
|
||||
echo "deploy-finalize: unknown option $1" >&2
|
||||
usage >&2
|
||||
exit 2
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
# Comparing {kernel,initrd,kernel-modules} matches nixpkgs's canonical
|
||||
# `system.autoUpgrade` allowReboot logic. -e (not -f) so a dangling
|
||||
# symlink counts as missing: on a real NixOS profile all three exist,
|
||||
# but defensive: if a profile has bad symlinks we refuse to schedule
|
||||
# rather than scheduling against ghost paths.
|
||||
booted_kernel="$(readlink -e /run/booted-system/kernel 2>/dev/null || true)"
|
||||
booted_initrd="$(readlink -e /run/booted-system/initrd 2>/dev/null || true)"
|
||||
booted_modules="$(readlink -e /run/booted-system/kernel-modules 2>/dev/null || true)"
|
||||
new_kernel="$(readlink -e "$profile/kernel" 2>/dev/null || true)"
|
||||
new_initrd="$(readlink -e "$profile/initrd" 2>/dev/null || true)"
|
||||
new_modules="$(readlink -e "$profile/kernel-modules" 2>/dev/null || true)"
|
||||
|
||||
if [[ -z "$new_kernel" || -z "$new_initrd" || -z "$new_modules" ]]; then
|
||||
echo "deploy-finalize: refusing to schedule — $profile is missing kernel, initrd, or kernel-modules" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
changed=()
|
||||
if [[ -z "$booted_kernel" || -z "$booted_initrd" || -z "$booted_modules" ]]; then
|
||||
# Unreachable on a booted NixOS, but fail closed on reboot.
|
||||
changed+=("/run/booted-system incomplete")
|
||||
fi
|
||||
[[ "$booted_kernel" != "$new_kernel" ]] && changed+=("kernel")
|
||||
[[ "$booted_initrd" != "$new_initrd" ]] && changed+=("initrd")
|
||||
[[ "$booted_modules" != "$new_modules" ]] && changed+=("kernel-modules")
|
||||
|
||||
reboot_needed=0
|
||||
reason=""
|
||||
if [[ ''${#changed[@]} -gt 0 ]]; then
|
||||
reboot_needed=1
|
||||
# Join with commas so the reason reads as e.g. `kernel,initrd changed`.
|
||||
reason="$(IFS=, ; echo "''${changed[*]}") changed"
|
||||
fi
|
||||
|
||||
if [[ "$reboot_needed" == 1 ]]; then
|
||||
action=reboot
|
||||
cmd="systemctl reboot"
|
||||
else
|
||||
action=switch
|
||||
reason="services only"
|
||||
cmd="$profile/bin/switch-to-configuration switch"
|
||||
fi
|
||||
|
||||
# Nanosecond suffix so back-to-back deploys don't collide on unit names.
|
||||
unit="deploy-finalize-$(date +%s%N)"
|
||||
|
||||
printf 'deploy-finalize: booted_kernel=%s\n' "$booted_kernel"
|
||||
printf 'deploy-finalize: new_kernel=%s\n' "$new_kernel"
|
||||
printf 'deploy-finalize: booted_initrd=%s\n' "$booted_initrd"
|
||||
printf 'deploy-finalize: new_initrd=%s\n' "$new_initrd"
|
||||
printf 'deploy-finalize: booted_kernel-modules=%s\n' "$booted_modules"
|
||||
printf 'deploy-finalize: new_kernel-modules=%s\n' "$new_modules"
|
||||
printf 'deploy-finalize: action=%s reason=%s delay=%ss unit=%s\n' \
|
||||
"$action" "$reason" "$delay" "$unit"
|
||||
|
||||
if [[ "$dry_run" == 1 ]]; then
|
||||
printf 'deploy-finalize: dry-run — not scheduling\n'
|
||||
printf 'deploy-finalize: would run: %s\n' "$cmd"
|
||||
printf 'deploy-finalize: would schedule: systemd-run --collect --unit=%s --on-active=%s\n' \
|
||||
"$unit" "$delay"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Cancel any still-pending finalize timers from an earlier deploy so this
|
||||
# invocation is authoritative. Without this a stale timer could fire with
|
||||
# the old profile's action (reboot/switch) against the new profile and
|
||||
# briefly run new userspace under the old kernel.
|
||||
systemctl stop 'deploy-finalize-*.timer' 2>/dev/null || true
|
||||
|
||||
# --on-active arms a transient timer owned by pid1. systemd-run returns
|
||||
# once the timer is armed; the SSH session that called us can exit and
|
||||
# the gitea-runner can be restarted (by the switch the timer fires)
|
||||
# without affecting whether the finalize runs.
|
||||
systemd-run \
|
||||
--collect \
|
||||
--unit="$unit" \
|
||||
--description="Finalize NixOS deploy ($action after boot-mode activation)" \
|
||||
--on-active="$delay" \
|
||||
/bin/sh -c "$cmd"
|
||||
'';
|
||||
};
|
||||
in
|
||||
{
|
||||
options.services.deployFinalize = {
|
||||
enable = lib.mkEnableOption "deferred deploy finalize (switch or reboot) after boot-mode activation";
|
||||
|
||||
delay = lib.mkOption {
|
||||
type = lib.types.ints.positive;
|
||||
default = 60;
|
||||
description = ''
|
||||
Seconds between the deploy-rs activation completing and the scheduled
|
||||
finalize firing. Tuned so the CI job (or manual SSH session) has time
|
||||
to complete status reporting before the runner is restarted by the
|
||||
eventual switch-to-configuration.
|
||||
'';
|
||||
};
|
||||
};
|
||||
|
||||
config = lib.mkIf cfg.enable {
|
||||
environment.systemPackages = [ finalize ];
|
||||
|
||||
# Exposed for the deploy-rs activation snippet to reference by /nix/store
|
||||
# path via lib.getExe — `boot` mode does not update /run/current-system,
|
||||
# so reading through /run/current-system/sw/bin would resolve to the OLD
|
||||
# binary on a new-feature rollout or immediately after a rollback.
|
||||
system.build.deployFinalize = finalize;
|
||||
};
|
||||
}
|
||||
Reference in New Issue
Block a user