diff --git a/.gitea/workflows/deploy.yml b/.gitea/workflows/deploy.yml index d566da3..0a7d22f 100644 --- a/.gitea/workflows/deploy.yml +++ b/.gitea/workflows/deploy.yml @@ -63,6 +63,13 @@ jobs: - name: Build muffin run: nix build .#nixosConfigurations.muffin.config.system.build.toplevel -L + - name: Deploy guard preflight + run: | + ssh -i /run/agenix/ci-deploy-key \ + -o StrictHostKeyChecking=yes \ + -o UserKnownHostsFile=/etc/ci-known-hosts \ + root@server-public deploy-guard-check + - name: Deploy via deploy-rs run: | eval $(ssh-agent -s) diff --git a/AGENTS.md b/AGENTS.md index aaf6b55..d3533b4 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -158,10 +158,12 @@ Hard requirements that are asserted at eval time: ## Deploy guard (muffin) -`modules/server-deploy-guard.nix` blocks `./deploy.sh muffin` / deploy-rs activation when a service it covers is in active use. Two paths enforce it: +`modules/server-deploy-guard.nix` aggregates per-service "is anyone using this right now?" checks into a single `deploy-guard-check` binary on muffin. Enforcement is **preflight-only** — the guard runs over SSH *before* deploy-rs is invoked; activation itself is never gated. This matters because deploy-rs sets the new profile pointer before running the activation script, so a failed activation triggers auto-rollback, which re-runs `switch-to-configuration` on the previous generation — that re-activation rotates agenix secrets, reinstalls lanzaboote, and reloads systemd units. The only safe place to stop a deploy is before deploy-rs starts. -- **Preflight**: `./deploy.sh muffin` SSHes to `server-public` and runs `deploy-guard-check` before the build. Connectivity failure is soft (activation still enforces). `./deploy.sh muffin --force` or `DEPLOY_GUARD_FORCE=1 ./deploy.sh muffin` touches `/run/deploy-guard-bypass` remotely (single-shot) and skips the preflight. 
-- **Activation**: the custom `activate.custom` wrapper in `flake.nix` runs `$PROFILE/sw/bin/deploy-guard-check` before `switch-to-configuration switch`. A non-zero exit triggers deploy-rs auto-rollback. Same bypass: `DEPLOY_GUARD_BYPASS=1` env or pre-touched `/run/deploy-guard-bypass`. +Two drivers invoke the preflight: + +- **`./deploy.sh muffin`** SSHes to `server-public` and runs `deploy-guard-check`. An SSH connection failure (ssh exits 255) is a hard abort (the script exits 1) because there is no second gate. `./deploy.sh muffin --force` (or `DEPLOY_GUARD_FORCE=1 ./deploy.sh muffin`) skips the preflight entirely. +- **CI (`.gitea/workflows/deploy.yml`)** has a `Deploy guard preflight` step between `Build muffin` and `Deploy via deploy-rs`. A non-zero exit fails the job before any closure copy or activation. ### Adding a new check diff --git a/deploy.sh b/deploy.sh index 3e6d27a..707bcda 100755 --- a/deploy.sh +++ b/deploy.sh @@ -6,8 +6,8 @@ # ./deploy.sh switch # apply immediately on current host # ./deploy.sh test # apply without adding boot entry # ./deploy.sh build # build only, no activation -# ./deploy.sh muffin # build + deploy to muffin via deploy-rs -# ./deploy.sh muffin --force # bypass the deploy guard (active-user check) +# ./deploy.sh muffin # preflight deploy guard + deploy-rs to muffin +# ./deploy.sh muffin --force # skip the preflight deploy guard # # muffin cannot be rebuilt locally from another host — this script only issues # the remote deploy via deploy-rs when explicitly named. 
@@ -28,27 +28,26 @@ case "$arg" in if [ "${1:-}" = "--force" ]; then force=1; shift; fi if [ "$force" = "1" ]; then - echo "deploy-guard: bypass requested; setting remote marker" - ssh -o BatchMode=yes -o ConnectTimeout=3 root@server-public \ - 'touch /run/deploy-guard-bypass' \ - || echo "deploy-guard: warning: could not write remote bypass marker" >&2 + echo "deploy-guard: preflight skipped (--force)" else - # Single SSH probe — if exit 255 it's a connectivity failure (skip - # preflight; the activation-time guard still enforces). Any other - # non-zero is the guard blocking the deploy. + # Single SSH probe. Exit 255 is a connectivity failure; treat as a hard + # abort — without the preflight there is no other gate that prevents + # deploy-rs from partially activating while users are online. output=$(ssh -o BatchMode=yes -o ConnectTimeout=5 \ root@server-public deploy-guard-check 2>&1) && rc=0 || rc=$? - if [ "$rc" -eq 255 ]; then - echo "deploy-guard: muffin unreachable for preflight;" \ - "activation will still enforce" >&2 - elif [ "$rc" -ne 0 ]; then + if [ "$rc" -eq 0 ]; then + [ -n "$output" ] && printf '%s\n' "$output" + elif [ "$rc" -eq 255 ]; then + echo "deploy-guard: preflight SSH failed (rc=255)." >&2 + printf '%s\n' "$output" >&2 + echo "Re-run with --force once you've confirmed the host is idle." >&2 + exit 1 + else printf '%s\n' "$output" echo >&2 echo "Blocked by deploy guard. Bypass: ./deploy.sh muffin --force" >&2 exit 1 - elif [ -n "$output" ]; then - printf '%s\n' "$output" fi fi diff --git a/flake.nix b/flake.nix index 1e46131..c574db8 100644 --- a/flake.nix +++ b/flake.nix @@ -372,37 +372,17 @@ profiles.system = { sshUser = "root"; user = "root"; - # Wrap deploy-rs.activate.nixos so the guard runs before - # switch-to-configuration. If the guard exits non-zero, deploy-rs's - # auto-rollback restores the previous profile. Bypass via - # DEPLOY_GUARD_BYPASS=1 or by pre-touching /run/deploy-guard-bypass. 
- path = - let - base = self.nixosConfigurations.muffin; - activate = deploy-rs.lib.${system}.activate; - bootloaderDefaultCleanup = nixpkgs-stable.lib.optionalString base.config.boot.loader.systemd-boot.enable "sed -i '/^default /d' ${base.config.boot.loader.efi.efiSysMountPoint}/loader/loader.conf"; - in - ( - activate.custom - // { - dryActivate = "$PROFILE/bin/switch-to-configuration dry-activate"; - boot = "$PROFILE/bin/switch-to-configuration boot"; - } - ) - base.config.system.build.toplevel - '' - # work around https://github.com/NixOS/nixpkgs/issues/73404 - cd /tmp - - # Halt deploys while users are actively using services. - # See modules/server-deploy-guard.nix. - "$PROFILE/sw/bin/deploy-guard-check" - - $PROFILE/bin/switch-to-configuration switch - - # https://github.com/serokell/deploy-rs/issues/31 - ${bootloaderDefaultCleanup} - ''; + # Deploy guard enforcement lives in the preflight drivers (deploy.sh + # and .gitea/workflows/deploy.yml) — not in activation. Activation- + # time enforcement is unsafe: deploy-rs sets the new profile pointer + # before running deploy-rs-activate, so a non-zero activation exit + # triggers auto-rollback, which re-runs switch-to-configuration on the + # previous generation. That re-activation rotates agenix secrets, + # reinstalls lanzaboote, and reloads systemd units — side effects we + # want to avoid when the deploy is supposed to be a no-op blocked by + # the guard. Blocking before the deploy-rs invocation is the only + # clean way to leave the running system untouched. + path = deploy-rs.lib.${system}.activate.nixos self.nixosConfigurations.muffin; }; };