deploy: potentially fix self-deploy issue?

2026-04-22 23:02:38 -04:00
parent a1924849d6
commit 0901f5edf0
6 changed files with 428 additions and 1 deletions
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -191,6 +191,21 @@ lib.mkIf config.services.<service>.enable {

 Existing registrations live in `services/jellyfin/jellyfin-deploy-guard.nix` (REST `/Sessions` via curl+jq) and `services/minecraft-deploy-guard.nix` (Server List Ping via `mcstatus`). Prefer soft-fail on unreachable — a service that's already down has no users to disrupt.

+## Deploy finalize (muffin)
+
+`modules/server-deploy-finalize.nix` solves the self-deploy problem: the gitea-actions runner driving CI deploys lives on muffin itself, so a direct `switch-to-configuration switch` restarts the runner mid-activation, killing the SSH session, the CI job, and deploy-rs's magic-rollback handshake. The failure mode is visible as "deploy appears to fail even though the new config landed" (or worse, a rollback storm).
+
+The fix is a two-phase activation wired into `deploy.nodes.muffin.profiles.system.path` in `flake.nix`:
+
+1. `switch-to-configuration boot` — bootloader-only, no service restarts. The runner, SSH session, and magic-rollback survive.
+2. `deploy-finalize` — schedules a detached `systemd-run --on-active=N` transient unit (default 60s). The unit is owned by pid1, so it survives the eventual runner restart. If `/run/booted-system/{kernel,initrd,kernel-modules}` differs from the new profile's, the unit runs `systemctl reboot`; otherwise it runs `switch-to-configuration switch`.
+
+That is, reboot is dynamically gated on kernel/initrd/kernel-modules change. The 60s delay is tuned so the CI job (or manual `./deploy.sh muffin`) has time to emit status/notification steps before the runner is recycled.
+
+Back-to-back deploys supersede each other: each invocation cancels any still-pending `deploy-finalize-*.timer` before scheduling its own. `deploy-finalize --dry-run` prints the decision without scheduling anything — useful when debugging.
+
+Prior art: the 3-path `{kernel,initrd,kernel-modules}` diff is lifted from nixpkgs's `system.autoUpgrade` module (the `allowReboot = true` branch) and was packaged the same way in [obsidiansystems/obelisk#957](https://github.com/obsidiansystems/obelisk/pull/957). nixpkgs#185030 tracks lifting it into `switch-to-configuration` proper but has been stale since 2025-07. The self-deploy `systemd-run` detachment is the proposed fix from [deploy-rs#153](https://github.com/serokell/deploy-rs/issues/153), also unmerged upstream.
+
 ## Technical details

 - **Privilege escalation**: `doas` everywhere; `sudo` is disabled on every host.
--- a/flake.nix
+++ b/flake.nix
@@ -415,7 +415,27 @@
          # want to avoid when the deploy is supposed to be a no-op blocked by
          # the guard. Blocking before the deploy-rs invocation is the only
          # clean way to leave the running system untouched.
-          path = deploy-rs.lib.${system}.activate.nixos self.nixosConfigurations.muffin;
+          #
+          # Activation uses `switch-to-configuration boot` + a detached finalize
+          # (modules/server-deploy-finalize.nix) rather than the default
+          # `switch`. The gitea-actions runner driving CI deploys lives on
+          # muffin itself; a direct `switch` restarts gitea-runner-muffin mid-
+          # activation, killing the SSH session, the CI job, and deploy-rs's
+          # magic-rollback handshake. `boot` only touches the bootloader — no
+          # service restarts — and deploy-finalize schedules a pid1-owned
+          # transient unit that runs the real `switch` (or `systemctl reboot`
+          # when kernel/initrd/kernel-modules changed) ~60s later, surviving
+          # runner restart because it's decoupled from the SSH session.
+          path =
+            deploy-rs.lib.${system}.activate.custom self.nixosConfigurations.muffin.config.system.build.toplevel
+              ''
+                # matches activate.nixos's workaround for NixOS/nixpkgs#73404
+                cd /tmp
+
+                $PROFILE/bin/switch-to-configuration boot
+
+                ${nixpkgs-stable.lib.getExe self.nixosConfigurations.muffin.config.system.build.deployFinalize}
+              '';
        };
      };

--- a/hosts/muffin/default.nix
+++ b/hosts/muffin/default.nix
@@ -26,6 +26,7 @@
    ../../modules/ntfy-alerts.nix
    ../../modules/server-power.nix
    ../../modules/server-deploy-guard.nix
+    ../../modules/server-deploy-finalize.nix

    ../../services/postgresql.nix
    ../../services/jellyfin
@@ -99,6 +100,13 @@

  services.deployGuard.enable = true;

+  # Detached deploy finalize: see modules/server-deploy-finalize.nix. deploy-rs
+  # activates in `boot` mode and invokes deploy-finalize to schedule the real
+  # `switch` (or reboot, when kernel/initrd/kernel-modules changed) 60s later
+  # as a pid1-owned transient unit. Prevents the self-hosted gitea runner from
+  # being restarted mid-CI-deploy.
+  services.deployFinalize.enable = true;
+
  # Disable serial getty on ttyS0 to prevent dmesg warnings
  systemd.services."serial-getty@ttyS0".enable = false;

--- a/modules/server-deploy-finalize.nix
+++ b/modules/server-deploy-finalize.nix
@@ -0,0 +1,187 @@
+# Deferred deploy finalize for deploy-rs-driven hosts.
+#
+# When deploy-rs activates via `switch-to-configuration switch` and the gitea-
+# actions runner driving the deploy lives on the same host, the runner unit
+# gets restarted mid-activation — its definition changes between builds. That
+# restart kills the SSH session, the CI job, and deploy-rs's magic-rollback
+# handshake, so CI reports failure even when the deploy itself completed.
+# This is deploy-rs#153, open since 2022.
+#
+# This module breaks the dependency: activation does `switch-to-configuration
+# boot` (bootloader only, no service restarts), then invokes deploy-finalize
+# which schedules a detached systemd transient unit that fires `delay` seconds
+# later with the real `switch` (or `systemctl reboot` when the kernel, initrd,
+# or kernel-modules changed since boot). The transient unit is owned by pid1,
+# so it survives the runner's eventual restart — by which time the CI job has
+# finished reporting.
+#
+# Prior art (reboot-or-switch logic, not the self-deploy detachment):
+# - nixpkgs `system.autoUpgrade` (allowReboot = true branch) is the canonical
+#   source of the 3-path {initrd,kernel,kernel-modules} comparison.
+# - obsidiansystems/obelisk#957 merged the same snippet into `ob deploy` for
+#   push-based remote deploys — but doesn't need detachment since its deployer
+#   lives on a different machine from the target.
+# - nixpkgs#185030 tracks lifting this into switch-to-configuration proper.
+#   Stale since 2025-07; until it lands, every downstream reimplements it.
+#
+# Bootstrap note: the activation snippet resolves deploy-finalize via
+# lib.getExe (store path), not via `/run/current-system/sw/bin` — `boot` mode
+# does not update `/run/current-system`, so the old binary would be resolved.
+{
+  config,
+  lib,
+  pkgs,
+  ...
+}:
+let
+  cfg = config.services.deployFinalize;
+
+  finalize = pkgs.writeShellApplication {
+    name = "deploy-finalize";
+    runtimeInputs = [
+      pkgs.coreutils
+      pkgs.systemd
+    ];
+    text = ''
+      delay=${toString cfg.delay}
+      profile=/nix/var/nix/profiles/system
+      dry_run=0
+
+      usage() {
+        cat <<EOF
+      Usage: deploy-finalize [--dry-run] [--delay N] [--profile PATH]
+
+      Compares /run/booted-system against PATH (default /nix/var/nix/profiles/system)
+      and schedules either \`systemctl reboot\` (kernel or initrd changed) or
+      \`switch-to-configuration switch\` (services only) via a detached systemd-run
+      timer firing N seconds later.
+
+      Options:
+        --dry-run       Print the decision and would-be command without scheduling.
+        --delay N       Override the delay in seconds. Default: ${toString cfg.delay}.
+        --profile PATH  Override the profile path used for comparison.
+      EOF
+      }
+
+      while [[ $# -gt 0 ]]; do
+        case "$1" in
+          --dry-run) dry_run=1; shift ;;
+          --delay) delay="$2"; shift 2 ;;
+          --profile) profile="$2"; shift 2 ;;
+          -h|--help) usage; exit 0 ;;
+          *)
+            echo "deploy-finalize: unknown option $1" >&2
+            usage >&2
+            exit 2
+            ;;
+        esac
+      done
+
+      # Comparing {kernel,initrd,kernel-modules} matches nixpkgs's canonical
+      # `system.autoUpgrade` allowReboot logic. -e (not -f) so a dangling
+      # symlink counts as missing: on a real NixOS profile all three exist,
+      # but defensive: if a profile has bad symlinks we refuse to schedule
+      # rather than scheduling against ghost paths.
+      booted_kernel="$(readlink -e /run/booted-system/kernel 2>/dev/null || true)"
+      booted_initrd="$(readlink -e /run/booted-system/initrd 2>/dev/null || true)"
+      booted_modules="$(readlink -e /run/booted-system/kernel-modules 2>/dev/null || true)"
+      new_kernel="$(readlink -e "$profile/kernel" 2>/dev/null || true)"
+      new_initrd="$(readlink -e "$profile/initrd" 2>/dev/null || true)"
+      new_modules="$(readlink -e "$profile/kernel-modules" 2>/dev/null || true)"
+
+      if [[ -z "$new_kernel" || -z "$new_initrd" || -z "$new_modules" ]]; then
+        echo "deploy-finalize: refusing to schedule — $profile is missing kernel, initrd, or kernel-modules" >&2
+        exit 1
+      fi
+
+      changed=()
+      if [[ -z "$booted_kernel" || -z "$booted_initrd" || -z "$booted_modules" ]]; then
+        # Unreachable on a booted NixOS, but fail closed on reboot.
+        changed+=("/run/booted-system incomplete")
+      fi
+      [[ "$booted_kernel" != "$new_kernel" ]] && changed+=("kernel")
+      [[ "$booted_initrd" != "$new_initrd" ]] && changed+=("initrd")
+      [[ "$booted_modules" != "$new_modules" ]] && changed+=("kernel-modules")
+
+      reboot_needed=0
+      reason=""
+      if [[ ''${#changed[@]} -gt 0 ]]; then
+        reboot_needed=1
+        # Join with commas so the reason reads as e.g. `kernel,initrd changed`.
+        reason="$(IFS=, ; echo "''${changed[*]}") changed"
+      fi
+
+      if [[ "$reboot_needed" == 1 ]]; then
+        action=reboot
+        cmd="systemctl reboot"
+      else
+        action=switch
+        reason="services only"
+        cmd="$profile/bin/switch-to-configuration switch"
+      fi
+
+      # Nanosecond suffix so back-to-back deploys don't collide on unit names.
+      unit="deploy-finalize-$(date +%s%N)"
+
+      printf 'deploy-finalize: booted_kernel=%s\n'         "$booted_kernel"
+      printf 'deploy-finalize: new_kernel=%s\n'            "$new_kernel"
+      printf 'deploy-finalize: booted_initrd=%s\n'         "$booted_initrd"
+      printf 'deploy-finalize: new_initrd=%s\n'            "$new_initrd"
+      printf 'deploy-finalize: booted_kernel-modules=%s\n' "$booted_modules"
+      printf 'deploy-finalize: new_kernel-modules=%s\n'    "$new_modules"
+      printf 'deploy-finalize: action=%s reason=%s delay=%ss unit=%s\n' \
+        "$action" "$reason" "$delay" "$unit"
+
+      if [[ "$dry_run" == 1 ]]; then
+        printf 'deploy-finalize: dry-run — not scheduling\n'
+        printf 'deploy-finalize: would run: %s\n' "$cmd"
+        printf 'deploy-finalize: would schedule: systemd-run --collect --unit=%s --on-active=%s\n' \
+          "$unit" "$delay"
+        exit 0
+      fi
+
+      # Cancel any still-pending finalize timers from an earlier deploy so this
+      # invocation is authoritative. Without this a stale timer could fire with
+      # the old profile's action (reboot/switch) against the new profile and
+      # briefly run new userspace under the old kernel.
+      systemctl stop 'deploy-finalize-*.timer' 2>/dev/null || true
+
+      # --on-active arms a transient timer owned by pid1. systemd-run returns
+      # once the timer is armed; the SSH session that called us can exit and
+      # the gitea-runner can be restarted (by the switch the timer fires)
+      # without affecting whether the finalize runs.
+      systemd-run \
+        --collect \
+        --unit="$unit" \
+        --description="Finalize NixOS deploy ($action after boot-mode activation)" \
+        --on-active="$delay" \
+        /bin/sh -c "$cmd"
+    '';
+  };
+in
+{
+  options.services.deployFinalize = {
+    enable = lib.mkEnableOption "deferred deploy finalize (switch or reboot) after boot-mode activation";
+
+    delay = lib.mkOption {
+      type = lib.types.ints.positive;
+      default = 60;
+      description = ''
+        Seconds between the deploy-rs activation completing and the scheduled
+        finalize firing. Tuned so the CI job (or manual SSH session) has time
+        to complete status reporting before the runner is restarted by the
+        eventual switch-to-configuration.
+      '';
+    };
+  };
+
+  config = lib.mkIf cfg.enable {
+    environment.systemPackages = [ finalize ];
+
+    # Exposed for the deploy-rs activation snippet to reference by /nix/store
+    # path via lib.getExe — `boot` mode does not update /run/current-system,
+    # so reading through /run/current-system/sw/bin would resolve to the OLD
+    # binary on a new-feature rollout or immediately after a rollback.
+    system.build.deployFinalize = finalize;
+  };
+}
--- a/tests/deploy-finalize.nix
+++ b/tests/deploy-finalize.nix
@@ -0,0 +1,196 @@
+# Test for modules/server-deploy-finalize.nix.
+#
+# Covers the decision and scheduling logic with fabricated profile directories,
+# since spawning a second booted NixOS toplevel to diff kernels is too heavy for
+# a runNixOSTest. We rely on the shellcheck pass baked into writeShellApplication
+# to catch syntax regressions in the script itself.
+{
+  lib,
+  pkgs,
+  inputs,
+  ...
+}:
+pkgs.testers.runNixOSTest {
+  name = "deploy-finalize";
+
+  node.specialArgs = {
+    inherit inputs lib;
+    username = "testuser";
+  };
+
+  nodes.machine =
+    { ... }:
+    {
+      imports = [
+        ../modules/server-deploy-finalize.nix
+      ];
+
+      services.deployFinalize = {
+        enable = true;
+        # Shorter default in the test to make expected-substring assertions
+        # stable and reinforce that the option is wired through.
+        delay = 15;
+      };
+    };
+
+  testScript = ''
+    start_all()
+    machine.wait_for_unit("multi-user.target")
+
+    # Test fixtures: fabricated profile trees whose kernel/initrd/kernel-modules
+    # symlinks are under test control. `readlink -e` requires the targets to
+    # exist, so we point at real files in /tmp rather than non-existent paths.
+    machine.succeed(
+        "mkdir -p /tmp/profile-same /tmp/profile-changed-kernel "
+        "/tmp/profile-changed-initrd /tmp/profile-changed-modules "
+        "/tmp/profile-missing /tmp/fake-targets"
+    )
+    machine.succeed(
+        "touch /tmp/fake-targets/alt-kernel /tmp/fake-targets/alt-initrd "
+        "/tmp/fake-targets/alt-modules"
+    )
+
+    booted_kernel  = machine.succeed("readlink -e /run/booted-system/kernel").strip()
+    booted_initrd  = machine.succeed("readlink -e /run/booted-system/initrd").strip()
+    booted_modules = machine.succeed("readlink -e /run/booted-system/kernel-modules").strip()
+
+    def link_profile(path, kernel, initrd, modules):
+        machine.succeed(f"ln -sf {kernel} {path}/kernel")
+        machine.succeed(f"ln -sf {initrd} {path}/initrd")
+        machine.succeed(f"ln -sf {modules} {path}/kernel-modules")
+
+    # profile-same: matches booted exactly → should choose `switch`.
+    link_profile("/tmp/profile-same", booted_kernel, booted_initrd, booted_modules)
+    machine.succeed("mkdir -p /tmp/profile-same/bin")
+    machine.succeed(
+        "ln -sf /run/current-system/bin/switch-to-configuration "
+        "/tmp/profile-same/bin/switch-to-configuration"
+    )
+
+    # profile-changed-kernel: kernel differs only → should choose `reboot`.
+    link_profile(
+        "/tmp/profile-changed-kernel",
+        "/tmp/fake-targets/alt-kernel",
+        booted_initrd,
+        booted_modules,
+    )
+
+    # profile-changed-initrd: initrd differs only → should choose `reboot`.
+    link_profile(
+        "/tmp/profile-changed-initrd",
+        booted_kernel,
+        "/tmp/fake-targets/alt-initrd",
+        booted_modules,
+    )
+
+    # profile-changed-modules: kernel-modules differs only → should choose `reboot`.
+    # Catches the obelisk PR / nixpkgs auto-upgrade case where modules rebuild
+    # against the same kernel but ABI-incompatible.
+    link_profile(
+        "/tmp/profile-changed-modules",
+        booted_kernel,
+        booted_initrd,
+        "/tmp/fake-targets/alt-modules",
+    )
+
+    # profile-missing: no kernel/initrd/kernel-modules → should fail closed.
+
+    with subtest("dry-run against identical profile selects switch"):
+        rc, out = machine.execute(
+            "deploy-finalize --dry-run --profile /tmp/profile-same 2>&1"
+        )
+        assert rc == 0, f"rc={rc}\n{out}"
+        assert "action=switch" in out, out
+        assert "services only" in out, out
+        assert "dry-run — not scheduling" in out, out
+        assert "would run: /tmp/profile-same/bin/switch-to-configuration switch" in out, out
+        assert "would schedule: systemd-run" in out, out
+
+    with subtest("dry-run against changed-kernel profile selects reboot"):
+        rc, out = machine.execute(
+            "deploy-finalize --dry-run --profile /tmp/profile-changed-kernel 2>&1"
+        )
+        assert rc == 0, f"rc={rc}\n{out}"
+        assert "action=reboot" in out, out
+        assert "reason=kernel changed" in out, out
+        assert "systemctl reboot" in out, out
+
+    with subtest("dry-run against changed-initrd profile selects reboot"):
+        rc, out = machine.execute(
+            "deploy-finalize --dry-run --profile /tmp/profile-changed-initrd 2>&1"
+        )
+        assert rc == 0, f"rc={rc}\n{out}"
+        assert "action=reboot" in out, out
+        assert "reason=initrd changed" in out, out
+
+    with subtest("dry-run against changed-modules profile selects reboot"):
+        rc, out = machine.execute(
+            "deploy-finalize --dry-run --profile /tmp/profile-changed-modules 2>&1"
+        )
+        assert rc == 0, f"rc={rc}\n{out}"
+        assert "action=reboot" in out, out
+        assert "reason=kernel-modules changed" in out, out
+
+    with subtest("dry-run against empty profile fails closed with rc=1"):
+        rc, out = machine.execute(
+            "deploy-finalize --dry-run --profile /tmp/profile-missing 2>&1"
+        )
+        assert rc == 1, f"rc={rc}\n{out}"
+        assert "missing kernel, initrd, or kernel-modules" in out, out
+
+    with subtest("--delay override is reflected in output"):
+        rc, out = machine.execute(
+            "deploy-finalize --dry-run --delay 7 --profile /tmp/profile-same 2>&1"
+        )
+        assert rc == 0, f"rc={rc}\n{out}"
+        assert "delay=7s" in out, out
+
+    with subtest("configured default delay from module option is used"):
+        rc, out = machine.execute(
+            "deploy-finalize --dry-run --profile /tmp/profile-same 2>&1"
+        )
+        assert rc == 0, f"rc={rc}\n{out}"
+        # module option delay=15 in nodes.machine above.
+        assert "delay=15s" in out, out
+
+    with subtest("unknown option rejected with rc=2"):
+        rc, out = machine.execute("deploy-finalize --bogus 2>&1")
+        assert rc == 2, f"rc={rc}\n{out}"
+        assert "unknown option --bogus" in out, out
+
+    with subtest("non-dry run arms a transient systemd timer"):
+        # Long delay so the timer doesn't fire during the test. We stop it
+        # explicitly afterwards.
+        rc, out = machine.execute(
+            "deploy-finalize --delay 3600 --profile /tmp/profile-same 2>&1"
+        )
+        assert rc == 0, f"scheduling rc={rc}\n{out}"
+        # Confirm exactly one transient timer is active.
+        timers = machine.succeed(
+            "systemctl list-units --type=timer --no-legend 'deploy-finalize-*.timer' "
+            "--state=waiting | awk 'NF{print $1}'"
+        ).strip().splitlines()
+        assert len(timers) == 1, f"expected exactly one pending timer, got {timers}"
+        assert timers[0].startswith("deploy-finalize-"), timers
+
+    with subtest("back-to-back scheduling cancels the previous timer"):
+        # The previous subtest left one timer armed. Schedule again; the old
+        # one should be stopped before the new unit name is created.
+        machine.succeed("sleep 1")  # ensure a distinct unit-name timestamp
+        rc, out = machine.execute(
+            "deploy-finalize --delay 3600 --profile /tmp/profile-same 2>&1"
+        )
+        assert rc == 0, f"second-schedule rc={rc}\n{out}"
+        timers = machine.succeed(
+            "systemctl list-units --type=timer --no-legend 'deploy-finalize-*.timer' "
+            "--state=waiting | awk 'NF{print $1}'"
+        ).strip().splitlines()
+        assert len(timers) == 1, f"expected only the new timer, got {timers}"
+
+        # Clean up so the test's shutdown path is quiet.
+        machine.succeed(
+            "systemctl stop 'deploy-finalize-*.timer' 'deploy-finalize-*.service' "
+            "2>/dev/null || true"
+        )
+  '';
+}
--- a/tests/tests.nix
+++ b/tests/tests.nix
@@ -13,6 +13,7 @@ in
  minecraftTest = handleTest ./minecraft.nix;
  jellyfinQbittorrentMonitorTest = handleTest ./jellyfin-qbittorrent-monitor.nix;
  deployGuardTest = handleTest ./deploy-guard.nix;
+  deployFinalizeTest = handleTest ./deploy-finalize.nix;
  filePermsTest = handleTest ./file-perms.nix;

  # fail2ban tests