From a228f61d34b6266b4ceab46f5f27bad8488b7530 Mon Sep 17 00:00:00 2001 From: Simon Gardling Date: Tue, 21 Apr 2026 21:37:37 -0400 Subject: [PATCH] systemd: patch freezer stuck-state on kill-while-frozen Reset u->freezer_state to FREEZER_RUNNING when a unit transitions to inactive/failed. Without this, any SIGKILL path to a frozen unit (systemctl kill, OOM, watchdog SIGABRT-then-KILL, segfault) leaves the unit stranded at FreezerState=frozen with no recovery short of a reboot. Complements upstream PR #38528 which covers only the watchdog path. xmrig-auto-pause never calls `systemctl freeze` itself (direct cgroup.freeze writes bypass the bug class entirely), so the patch is defensive: benefits systemd-homed on lock, user-session freezing on suspend, or anything else that may freeze units on muffin. Patching systemd through an overlay would cascade udev-check-hook hash changes into fuse3 --> e2fsprogs and into fish, and would need two test-suite workarounds to ride along: drop e2fsprogs m_hugefile (4 GiB sparse file, fails on some build sandboxes) and fish doCheck=false (cargo pexpect TTY tests). Both are environmental, unrelated to the patch. To avoid them, the patch is applied via `systemd.package` only, which leaves pkgs.systemd untouched for every other consumer, so no such workarounds are part of this commit. --- flake.nix | 26 ++++++++++ ...set-freezer-state-on-inactive-failed.patch | 48 +++++++++++++++++++ 2 files changed, 74 insertions(+) create mode 100644 patches/systemd/0001-core-unit-reset-freezer-state-on-inactive-failed.patch diff --git a/flake.nix b/flake.nix index 16d0eed..67fb24a 100644 --- a/flake.nix +++ b/flake.nix @@ -304,6 +304,12 @@ prev.callPackage "${patchedStableSrc}/pkgs/by-name/sy/syncstorage-rs/package.nix" { }; }) + # NOTE: systemd patch is applied via `systemd.package` in the module + # list below, not via an overlay. An overlay replaces pkgs.systemd + # for every consumer, which cascades through udev-check-hook and + # causes the entire closure (fish, e2fsprogs, valkey, …) to rebuild + # and re-run flaky test suites in the sandbox. `systemd.package` + # only injects the patched systemd into the runtime init chain. 
]; nixpkgs.config.allowUnfreePredicate = pkg: @@ -312,6 +318,26 @@ ]; } + # Runtime-only systemd patch: reset FreezerState on inactive/failed + # transitions so a SIGKILL to a frozen unit doesn't strand + # FreezerState=frozen (unrecoverable without a reboot, upstream issue + # #38517). PR #38528 closed only the watchdog path; this closes + # systemctl kill / OOM / segfault paths too. + # + # Applied via systemd.package, not via overlay, so pkgs.systemd stays + # untouched for every other consumer — no udev-check-hook cascade, + # no fish/e2fsprogs/valkey rebuild, no flaky-test fallout. + ( + { pkgs, ... }: + { + systemd.package = pkgs.systemd.overrideAttrs (old: { + patches = (old.patches or [ ]) ++ [ + ./patches/systemd/0001-core-unit-reset-freezer-state-on-inactive-failed.patch + ]; + }); + } + ) + lanzaboote.nixosModules.lanzaboote arr-init.nixosModules.default (import "${nixpkgs-p2pool-module}/nixos/modules/services/networking/p2pool.nix") diff --git a/patches/systemd/0001-core-unit-reset-freezer-state-on-inactive-failed.patch b/patches/systemd/0001-core-unit-reset-freezer-state-on-inactive-failed.patch new file mode 100644 index 0000000..1b59719 --- /dev/null +++ b/patches/systemd/0001-core-unit-reset-freezer-state-on-inactive-failed.patch @@ -0,0 +1,48 @@ +From: Simon Gardling +Date: 2026-04-21 +Subject: [PATCH] core/unit: reset freezer state when unit becomes inactive/failed + +When a frozen unit is killed via anything that delivers SIGKILL outside +systemd's normal deactivation path (`systemctl kill --signal=KILL`, +`kill -9`, the OOM killer, a segfault), the service transitions to +failed or inactive correctly, but `u->freezer_state` is never reset. +This leaves the unit permanently stuck at FreezerState=frozen: + + - `systemctl thaw` refuses with "Unit is not active". + - `systemctl start`/`restart` refuses with + "Cannot perform operation on frozen unit". + - `reset-failed`, `daemon-reload`, `daemon-reexec`, and `revert` do + not clear the state. 
+ +The only way to recover a persistent unit in this state is to reboot. + +Reset the freezer state to FREEZER_RUNNING whenever a unit transitions +to an inactive or failed state, and cancel any pending freeze/thaw dbus +reply that will never complete. The cgroup is pruned immediately after +this point, so there is nothing left to freeze; the "running" state is +the correct neutral state for a dead unit. On restart, the unit will +pick up its freezer state from the cgroup hierarchy again. + +This complements PR #38528 (stop/reset watchdog on freeze/thaw) by +addressing the general SIGKILL path instead of just the watchdog- +initiated kill. +--- +--- a/src/core/unit.c ++++ b/src/core/unit.c +@@ -2754,6 +2754,16 @@ + SET_FLAG(u->markers, + (1u << UNIT_MARKER_NEEDS_RELOAD)|(1u << UNIT_MARKER_NEEDS_RESTART), + false); ++ ++ /* If the unit was frozen (or in a freezer transition) when it became inactive/failed, ++ * reset the freezer state to running. The cgroup is about to be pruned; leaving the ++ * state stuck at frozen would prevent the unit from ever being started, thawed, or ++ * cleared without a reboot. Also cancel any pending freeze/thaw dbus reply. */ ++ if (u->freezer_state != FREEZER_RUNNING) { ++ unit_set_freezer_state(u, FREEZER_RUNNING); ++ (void) bus_unit_send_pending_freezer_message(u, /* canceled= */ true); ++ } ++ + unit_prune_cgroup(u); + unit_unlink_state_files(u); + } else if (ns != os && ns == UNIT_RELOADING)