# Deferred deploy finalize for deploy-rs-driven hosts.
#
# When deploy-rs activates via `switch-to-configuration switch` and the gitea-
# actions runner driving the deploy lives on the same host, the runner unit
# gets restarted mid-activation — its definition changes between builds. That
# restart kills the SSH session, the CI job, and deploy-rs's magic-rollback
# handshake, so CI reports failure even when the deploy itself completed.
# This is deploy-rs#153, open since 2022.
#
# This module breaks the dependency: activation does `switch-to-configuration
# boot` (bootloader only, no service restarts), then invokes deploy-finalize
# which schedules a detached systemd transient unit that fires `delay` seconds
# later with the real `switch` (or `systemctl reboot` when the kernel, initrd,
# or kernel-modules changed since boot). The transient unit is owned by pid1,
# so it survives the runner's eventual restart — by which time the CI job has
# finished reporting.
#
# Prior art (reboot-or-switch logic, not the self-deploy detachment):
#   - nixpkgs `system.autoUpgrade` (allowReboot = true branch) is the canonical
#     source of the 3-path {initrd,kernel,kernel-modules} comparison.
#   - obsidiansystems/obelisk#957 merged the same snippet into `ob deploy` for
#     push-based remote deploys — but doesn't need detachment since its deployer
#     lives on a different machine from the target.
#   - nixpkgs#185030 tracks lifting this into switch-to-configuration proper.
#     Stale since 2025-07; until it lands, every downstream reimplements it.
#
# Bootstrap note: the activation snippet resolves deploy-finalize via
# lib.getExe (store path), not via `/run/current-system/sw/bin` — `boot` mode
# does not update `/run/current-system`, so the old binary would be resolved.
{
  config,
  lib,
  pkgs,
  ...
}:
let
  cfg = config.services.deployFinalize;

  # The `deploy-finalize` executable. It compares the booted system against
  # the system profile, decides between `switch` and `reboot`, and arms a
  # transient systemd timer that performs the chosen action after `delay`
  # seconds. Exit status: 0 on success (or dry-run), 1 when the profile is
  # incomplete, 2 on a command-line usage error.
  finalize = pkgs.writeShellApplication {
    name = "deploy-finalize";
    runtimeInputs = [
      pkgs.coreutils
      pkgs.systemd
    ];
    text = ''
      delay=${toString cfg.delay}
      profile=/nix/var/nix/profiles/system
      dry_run=0

      # NOTE: the heredoc body and its terminator must stay at the script's
      # base indentation so the terminator lands in column 0 after Nix strips
      # the common leading whitespace.
      usage() {
        cat <<'EOF'
      Usage: deploy-finalize [--dry-run] [--delay SECONDS] [--profile PATH]

      Schedule the deferred finalize step after a boot-mode activation:
      `switch-to-configuration switch`, or a reboot when the kernel, initrd,
      or kernel-modules differ from the booted system.

      Options:
        --dry-run         Print the decision without scheduling anything.
        --delay SECONDS   Seconds before the finalize fires (default: ${toString cfg.delay}).
        --profile PATH    System profile to finalize
                          (default: /nix/var/nix/profiles/system).
        -h, --help        Show this help and exit.
      EOF
      }

      while [[ $# -gt 0 ]]; do
        case "$1" in
          --dry-run)
            dry_run=1
            shift
            ;;
          --delay | --profile)
            # Check the value is present before touching $2: the script runs
            # under `set -u`, so a bare trailing flag would otherwise abort
            # with an unhelpful "unbound variable" error.
            if [[ $# -lt 2 ]]; then
              echo "deploy-finalize: $1 requires a value" >&2
              usage >&2
              exit 2
            fi
            if [[ "$1" == --delay ]]; then
              delay="$2"
            else
              profile="$2"
            fi
            shift 2
            ;;
          -h | --help)
            usage
            exit 0
            ;;
          *)
            echo "deploy-finalize: unknown argument: $1" >&2
            usage >&2
            exit 2
            ;;
        esac
      done

      # Mirror the module option's `ints.positive` type for overrides given
      # on the command line.
      if ! [[ "$delay" =~ ^[1-9][0-9]*$ ]]; then
        echo "deploy-finalize: --delay must be a positive integer (seconds), got: $delay" >&2
        exit 2
      fi

      # Comparing {kernel,initrd,kernel-modules} matches nixpkgs's canonical
      # `system.autoUpgrade` allowReboot logic. -e (not -f) so a dangling
      # symlink counts as missing: on a real NixOS profile all three exist,
      # but defensive: if a profile has bad symlinks we refuse to schedule
      # rather than scheduling against ghost paths.
      booted_kernel="$(readlink -e /run/booted-system/kernel 2>/dev/null || true)"
      booted_initrd="$(readlink -e /run/booted-system/initrd 2>/dev/null || true)"
      booted_modules="$(readlink -e /run/booted-system/kernel-modules 2>/dev/null || true)"
      new_kernel="$(readlink -e "$profile/kernel" 2>/dev/null || true)"
      new_initrd="$(readlink -e "$profile/initrd" 2>/dev/null || true)"
      new_modules="$(readlink -e "$profile/kernel-modules" 2>/dev/null || true)"

      if [[ -z "$new_kernel" || -z "$new_initrd" || -z "$new_modules" ]]; then
        echo "deploy-finalize: refusing to schedule — $profile is missing kernel, initrd, or kernel-modules" >&2
        exit 1
      fi

      changed=()
      if [[ -z "$booted_kernel" || -z "$booted_initrd" || -z "$booted_modules" ]]; then
        # Unreachable on a booted NixOS, but fail closed on reboot.
        changed+=("/run/booted-system incomplete")
      fi
      [[ "$booted_kernel" != "$new_kernel" ]] && changed+=("kernel")
      [[ "$booted_initrd" != "$new_initrd" ]] && changed+=("initrd")
      [[ "$booted_modules" != "$new_modules" ]] && changed+=("kernel-modules")

      reboot_needed=0
      reason=""
      if [[ ''${#changed[@]} -gt 0 ]]; then
        reboot_needed=1
        # Join with commas so the reason reads as e.g. `kernel,initrd changed`.
        reason="$(IFS=, ; echo "''${changed[*]}") changed"
      fi

      # The command is kept as an argv array and handed to systemd-run
      # verbatim, so a --profile path is never re-split by a second shell.
      if [[ "$reboot_needed" == 1 ]]; then
        action=reboot
        cmd=(systemctl reboot)
      else
        action=switch
        reason="services only"
        cmd=("$profile/bin/switch-to-configuration" switch)
      fi

      # Nanosecond suffix so back-to-back deploys don't collide on unit names.
      unit="deploy-finalize-$(date +%s%N)"

      printf 'deploy-finalize: booted_kernel=%s\n' "$booted_kernel"
      printf 'deploy-finalize: new_kernel=%s\n' "$new_kernel"
      printf 'deploy-finalize: booted_initrd=%s\n' "$booted_initrd"
      printf 'deploy-finalize: new_initrd=%s\n' "$new_initrd"
      printf 'deploy-finalize: booted_kernel-modules=%s\n' "$booted_modules"
      printf 'deploy-finalize: new_kernel-modules=%s\n' "$new_modules"
      printf 'deploy-finalize: action=%s reason=%s delay=%ss unit=%s\n' \
        "$action" "$reason" "$delay" "$unit"

      if [[ "$dry_run" == 1 ]]; then
        printf 'deploy-finalize: dry-run — not scheduling\n'
        printf 'deploy-finalize: would run: %s\n' "''${cmd[*]}"
        printf 'deploy-finalize: would schedule: systemd-run --collect --unit=%s --on-active=%s\n' \
          "$unit" "$delay"
        exit 0
      fi

      # Cancel any still-pending finalize timers from an earlier deploy so this
      # invocation is authoritative. Without this a stale timer could fire with
      # the old profile's action (reboot/switch) against the new profile and
      # briefly run new userspace under the old kernel.
      systemctl stop 'deploy-finalize-*.timer' 2>/dev/null || true

      # --on-active arms a transient timer owned by pid1. systemd-run returns
      # once the timer is armed; the SSH session that called us can exit and
      # the gitea-runner can be restarted (by the switch the timer fires)
      # without affecting whether the finalize runs.
      systemd-run \
        --collect \
        --unit="$unit" \
        --description="Finalize NixOS deploy ($action after boot-mode activation)" \
        --on-active="$delay" \
        -- "''${cmd[@]}"
    '';
  };
in
{
  options.services.deployFinalize = {
    enable = lib.mkEnableOption "deferred deploy finalize (switch or reboot) after boot-mode activation";

    # Baked into the script as the default for its `delay` variable.
    delay = lib.mkOption {
      type = lib.types.ints.positive;
      default = 60;
      description = ''
        Seconds between the deploy-rs activation completing and the
        scheduled finalize firing. Tuned so the CI job (or manual SSH
        session) has time to complete status reporting before the runner
        is restarted by the eventual switch-to-configuration.
      '';
    };
  };

  config = lib.mkIf cfg.enable {
    environment.systemPackages = [ finalize ];

    # Exposed for the deploy-rs activation snippet to reference by /nix/store
    # path via lib.getExe — `boot` mode does not update /run/current-system,
    # so reading through /run/current-system/sw/bin would resolve to the OLD
    # binary on a new-feature rollout or immediately after a rollback.
    system.build.deployFinalize = finalize;
  };
}