xmrig-auto-pause: use cgroup.freeze and thaws

This commit is contained in:
2026-04-21 14:30:03 -04:00
parent a8cf95c7dd
commit 018b590e0d
3 changed files with 492 additions and 218 deletions

View File

@@ -2,15 +2,33 @@
config,
lib,
pkgs,
service_configs,
...
}:
let
cgroupDir = "/sys/fs/cgroup/system.slice/xmrig.service";
cgroupFreeze = "${cgroupDir}/cgroup.freeze";
in
lib.mkIf config.services.xmrig.enable {
systemd.services.xmrig-auto-pause = {
description = "Auto-pause xmrig when other services need CPU";
description = "Auto-pause xmrig via cgroup freezer when other services need CPU";
after = [ "xmrig.service" ];
# PartOf cascades stop/restart: when xmrig stops (deploy, apcupsd battery,
# manual), systemd stops auto-pause first and ExecStop thaws xmrig so
# xmrig's own stop does not hang on a frozen cgroup.
partOf = [ "xmrig.service" ];
wantedBy = [ "multi-user.target" ];
serviceConfig = {
ExecStart = "${pkgs.python3}/bin/python3 ${./xmrig-auto-pause.py}";
# Safety net: any exit path (SIGTERM from PartOf cascade, systemctl stop,
# crash with Restart=) must leave xmrig thawed. The Python SIGTERM
# handler does the same thing; this covers SIGKILL / hard crash paths
# too. Idempotent.
ExecStop = pkgs.writeShellScript "xmrig-auto-pause-thaw" ''
f=${cgroupFreeze}
[ -w "$f" ] && echo 0 > "$f" || true
'';
Restart = "always";
RestartSec = "10s";
NoNewPrivileges = true;
@@ -22,6 +40,9 @@ lib.mkIf config.services.xmrig.enable {
];
MemoryDenyWriteExecute = true;
StateDirectory = "xmrig-auto-pause";
# Required so the script can write to cgroup.freeze under
# ProtectSystem=strict (which makes /sys read-only by default).
ReadWritePaths = [ cgroupDir ];
};
environment = {
POLL_INTERVAL = "3";
@@ -32,8 +53,19 @@ lib.mkIf config.services.xmrig.enable {
# steady-state floor to avoid restarting xmrig while services are active.
CPU_STOP_THRESHOLD = "40";
CPU_RESUME_THRESHOLD = "10";
STARTUP_COOLDOWN = "10";
STATE_DIR = "/var/lib/xmrig-auto-pause";
XMRIG_CGROUP_FREEZE = cgroupFreeze;
# Per-service CPU thresholds. Catches sub-threshold activity that never
# trips the system-wide gauge — a single Minecraft player uses 3-15% of
# one core (0.3-1.3% of a 12-thread host) which is pure noise in
# /proc/stat but dominant in the minecraft cgroup.
WATCHED_SERVICES = lib.concatStringsSep "," (
lib.optional config.services.minecraft-servers.enable "minecraft-server-${service_configs.minecraft.server_name}:2"
);
};
};
# Pull auto-pause along whenever xmrig starts. After= on auto-pause ensures
# correct order; Wants= here ensures it actually starts.
systemd.services.xmrig.wants = [ "xmrig-auto-pause.service" ];
}

View File

@@ -2,33 +2,54 @@
"""
Auto-pause xmrig when other services need CPU.
Monitors non-nice CPU usage from /proc/stat. Since xmrig runs at Nice=19,
its CPU time lands in the 'nice' column and is excluded from the metric.
When real workload (user + system + irq + softirq) exceeds the stop
threshold, stops xmrig. When it drops below the resume threshold for
GRACE_PERIOD seconds, restarts xmrig.
Two independent signals drive the decision; either one can trigger a pause:
This replaces per-service pause scripts with a single general-purpose
monitor that handles any CPU-intensive workload (gitea workers, llama-cpp
inference, etc.) without needing to know about specific processes.
1. System-wide non-nice CPU from /proc/stat. Catches any CPU-heavy workload
including non-systemd user work (interactive sessions, ad-hoc jobs).
Since xmrig runs at Nice=19, its CPU time lands in the 'nice' column and
is excluded from the metric.
2. Per-service CPU from cgroup cpu.stat usage_usec. Catches sub-threshold
service activity — a single Minecraft player drives the server JVM to
3-15% of one core, which is noise system-wide (0.3-1.3% of total on a
12-thread host) but dominant for the minecraft cgroup.
When either signal crosses its stop threshold, writes 1 to
/sys/fs/cgroup/system.slice/xmrig.service/cgroup.freeze. When both are quiet
for GRACE_PERIOD seconds, writes 0 to resume.
Why direct cgroup.freeze instead of systemctl freeze:
systemd 256+ has a bug class where `systemctl freeze` followed by any
process death (SIGKILL, watchdog, OOM, segfault, shutdown) strands the
unit in FreezerState=frozen ActiveState=failed with no recovery short of
a reboot. See https://github.com/systemd/systemd/issues/38517. Writing
directly to cgroup.freeze keeps systemd's FreezerState at "running" the
whole time, so there is no state machine to get stuck: if xmrig dies
while frozen, systemd transitions it to inactive normally.
Why scheduler priority alone isn't enough:
Nice=19 / SCHED_IDLE only affects which thread gets the next time slice.
RandomX's 2MB-per-thread scratchpad (24MB across 12 threads) pollutes
the shared 32MB L3 cache, and its memory access pattern saturates DRAM
bandwidth. Other services run slower even though they aren't denied CPU
time. The only fix is to stop xmrig entirely when real work is happening.
RandomX's 2MB-per-thread scratchpad (24MB across 12 threads) holds about
68% of the shared 32MB L3 cache on Zen 3, evicting hot lines from
interactive services. Measured on muffin: pointer-chase latency is 112ns
with xmrig running and 19ns with xmrig frozen — a 6x difference that
scheduler priority cannot address.
Hysteresis:
The stop threshold is set higher than the resume threshold to prevent
oscillation. When xmrig runs, its L3 cache pressure makes other processes
appear ~3-8% busier. A single threshold trips on this indirect effect,
causing stop/start thrashing. Separate thresholds break the cycle: the
resume threshold confirms the system is truly idle, while the stop
threshold requires genuine workload above xmrig's indirect pressure.
The system-wide stop threshold sits higher than the resume threshold
because background services (qbittorrent, bitmagnet, postgres) produce
15-25% non-nice CPU during normal operation, and xmrig's indirect cache
pressure inflates that by another few percent. A single threshold
thrashes on the floor; two thresholds break the cycle.
Per-service thresholds are single-valued. Per-service CPU is a clean
signal without background noise to calibrate against, so idle_since is
reset whenever any watched service is at-or-above its threshold and the
grace period only advances when every watched service is below.
"""
import os
import signal
import subprocess
import sys
import time
@@ -37,19 +58,23 @@ POLL_INTERVAL = int(os.environ.get("POLL_INTERVAL", "3"))
GRACE_PERIOD = float(os.environ.get("GRACE_PERIOD", "15"))
# Percentage of total CPU ticks that non-nice processes must use to trigger
# a pause. On a 12-thread system, one fully loaded core ≈ 8.3% of total.
# Default 15% requires roughly two busy cores, which avoids false positives
# from xmrig's L3 cache pressure inflating other processes' apparent CPU.
CPU_STOP_THRESHOLD = float(os.environ.get("CPU_STOP_THRESHOLD", "15"))
# Percentage below which the system is considered idle enough to resume
# mining. Lower than the stop threshold to provide hysteresis.
CPU_RESUME_THRESHOLD = float(os.environ.get("CPU_RESUME_THRESHOLD", "5"))
# After starting xmrig, ignore CPU spikes for this many seconds to let
# RandomX dataset initialization complete (~4s on the target hardware)
# without retriggering a stop.
STARTUP_COOLDOWN = float(os.environ.get("STARTUP_COOLDOWN", "10"))
# Per-service CPU thresholds parsed from "unit1:threshold1,unit2:threshold2".
# Thresholds are percentage of TOTAL CPU capacity (same frame as
# CPU_STOP_THRESHOLD). Empty / unset disables the per-service path.
WATCHED_SERVICES_RAW = os.environ.get("WATCHED_SERVICES", "")
# Path to xmrig's cgroup.freeze file. Direct write bypasses systemd's
# freezer state machine; see module docstring.
XMRIG_CGROUP_FREEZE = os.environ.get(
"XMRIG_CGROUP_FREEZE",
"/sys/fs/cgroup/system.slice/xmrig.service/cgroup.freeze",
)
# Directory for persisting pause state across script restarts. Without
# this, a restart while xmrig is paused loses the paused_by_us flag and
# xmrig stays stopped permanently.
# xmrig stays frozen until something else thaws it.
STATE_DIR = os.environ.get("STATE_DIR", "")
_PAUSE_FILE = os.path.join(STATE_DIR, "paused") if STATE_DIR else ""
@@ -58,6 +83,51 @@ def log(msg):
print(f"[xmrig-auto-pause] {msg}", file=sys.stderr, flush=True)
def _parse_watched(spec):
out = {}
for entry in filter(None, (s.strip() for s in spec.split(","))):
name, _, pct = entry.partition(":")
name = name.strip()
pct = pct.strip()
if not name or not pct:
log(f"WATCHED_SERVICES: ignoring malformed entry '{entry}'")
continue
try:
out[name] = float(pct)
except ValueError:
log(f"WATCHED_SERVICES: ignoring non-numeric threshold in '{entry}'")
return out
def _resolve_cgroup_cpustat(unit):
"""Look up the unit's cgroup path via systemd. Returns cpu.stat path or
None if the unit has no cgroup (service not running, unknown unit)."""
result = subprocess.run(
["systemctl", "show", "--value", "--property=ControlGroup", unit],
capture_output=True,
text=True,
)
cg = result.stdout.strip()
if not cg:
return None
path = f"/sys/fs/cgroup{cg}/cpu.stat"
if not os.path.isfile(path):
return None
return path
def _read_service_usec(path):
"""Cumulative cpu.stat usage_usec, or None if the cgroup has vanished."""
try:
with open(path) as f:
for line in f:
if line.startswith("usage_usec "):
return int(line.split()[1])
except FileNotFoundError:
return None
return None
def read_cpu_ticks():
"""Read CPU tick counters from /proc/stat.
@@ -84,123 +154,241 @@ def is_active(unit):
return result.returncode == 0
def systemctl(action, unit):
def main_pid(unit):
"""Return the unit's MainPID, or 0 if unit is not running."""
result = subprocess.run(
["systemctl", action, unit],
["systemctl", "show", "--value", "--property=MainPID", unit],
capture_output=True,
text=True,
)
if result.returncode != 0:
log(f"systemctl {action} {unit} failed (rc={result.returncode}): {result.stderr.strip()}")
return result.returncode == 0
try:
return int(result.stdout.strip() or "0")
except ValueError:
return 0
def _save_paused(paused):
"""Persist pause flag so a script restart can resume where we left off."""
def _freeze(frozen):
"""Write 1 or 0 to xmrig's cgroup.freeze. Returns True on success.
Direct kernel interface — bypasses systemd's freezer state tracking."""
try:
with open(XMRIG_CGROUP_FREEZE, "w") as f:
f.write("1" if frozen else "0")
return True
except OSError as e:
action = "freeze" if frozen else "thaw"
log(f"cgroup.freeze {action} write failed: {e}")
return False
def _is_frozen():
"""Read the actual frozen state from cgroup.events. False if cgroup absent."""
events_path = os.path.join(os.path.dirname(XMRIG_CGROUP_FREEZE), "cgroup.events")
try:
with open(events_path) as f:
for line in f:
if line.startswith("frozen "):
return line.split()[1] == "1"
except FileNotFoundError:
return False
return False
def _save_paused(pid):
"""Persist the xmrig MainPID at the time of freeze. pid=0 clears claim."""
if not _PAUSE_FILE:
return
try:
if paused:
open(_PAUSE_FILE, "w").close()
if pid:
with open(_PAUSE_FILE, "w") as f:
f.write(str(pid))
else:
os.remove(_PAUSE_FILE)
except OSError:
pass
try:
os.remove(_PAUSE_FILE)
except FileNotFoundError:
pass
except OSError as e:
log(f"state file write failed: {e}")
def _load_paused():
"""Check if a previous instance left xmrig paused."""
"""Return True iff our claim is still valid: same PID and still frozen.
Restart of the xmrig unit gives it a new PID, which invalidates any
prior claim — we can't "own" a freeze we didn't perform on this
instance. Also confirms the cgroup is actually frozen so an external
thaw drops the claim.
"""
if not _PAUSE_FILE:
return False
return os.path.isfile(_PAUSE_FILE)
try:
with open(_PAUSE_FILE) as f:
saved = int(f.read().strip() or "0")
except (FileNotFoundError, ValueError):
return False
if not saved:
return False
if saved != main_pid("xmrig.service"):
return False
return _is_frozen()
def _cleanup(signum=None, frame=None):
"""On SIGTERM/SIGINT: thaw xmrig and clear claim. Operators must never see
a frozen unit we owned after auto-pause exits."""
if _is_frozen():
_freeze(False)
_save_paused(0)
sys.exit(0)
def main():
paused_by_us = _load_paused()
idle_since = None
started_at = None # monotonic time when we last started xmrig
prev_total = None
prev_work = None
watched_services = _parse_watched(WATCHED_SERVICES_RAW)
watched_paths = {}
for name in watched_services:
path = _resolve_cgroup_cpustat(name)
if path is None:
log(f"WATCHED_SERVICES: {name} has no cgroup — ignoring until it starts")
watched_paths[name] = path
nproc = os.cpu_count() or 1
signal.signal(signal.SIGTERM, _cleanup)
signal.signal(signal.SIGINT, _cleanup)
paused_by_us = _load_paused()
if paused_by_us:
log("Recovered pause state from previous instance")
log(
f"Starting: poll={POLL_INTERVAL}s grace={GRACE_PERIOD}s "
f"stop={CPU_STOP_THRESHOLD}% resume={CPU_RESUME_THRESHOLD}% "
f"cooldown={STARTUP_COOLDOWN}s"
f"sys_stop={CPU_STOP_THRESHOLD}% sys_resume={CPU_RESUME_THRESHOLD}% "
f"watched={watched_services or '(none)'}"
)
idle_since = None
prev_total = None
prev_work = None
prev_monotonic = None
prev_service_usec = {}
while True:
total, work = read_cpu_ticks()
now = time.monotonic()
if prev_total is None:
prev_total = total
prev_work = work
prev_monotonic = now
# seed per-service baselines too
for name, path in watched_paths.items():
if path is None:
# Re-resolve in case the service has started since startup
path = _resolve_cgroup_cpustat(name)
watched_paths[name] = path
if path is not None:
usec = _read_service_usec(path)
if usec is not None:
prev_service_usec[name] = usec
time.sleep(POLL_INTERVAL)
continue
dt = total - prev_total
if dt <= 0:
dt_s = now - prev_monotonic
if dt <= 0 or dt_s <= 0:
prev_total = total
prev_work = work
prev_monotonic = now
time.sleep(POLL_INTERVAL)
continue
real_work_pct = ((work - prev_work) / dt) * 100
# Per-service CPU percentages this window. Fraction of total CPU
# capacity used by this specific service, same frame as real_work_pct.
svc_pct = {}
for name in watched_services:
path = watched_paths.get(name)
if path is None:
# Unit wasn't running at startup; try resolving again in case
# it has started since.
path = _resolve_cgroup_cpustat(name)
watched_paths[name] = path
if path is None:
prev_service_usec.pop(name, None)
continue
cur = _read_service_usec(path)
if cur is None:
# Service stopped; drop prev so it doesn't compute a huge delta
# on next start.
prev_service_usec.pop(name, None)
watched_paths[name] = None # force re-resolution next poll
continue
if name in prev_service_usec:
delta_us = cur - prev_service_usec[name]
if delta_us >= 0:
svc_pct[name] = (delta_us / 1_000_000) / (dt_s * nproc) * 100
prev_service_usec[name] = cur
prev_total = total
prev_work = work
prev_monotonic = now
# Don't act during startup cooldown — RandomX dataset init causes
# a transient CPU spike that would immediately retrigger a stop.
if started_at is not None:
if time.monotonic() - started_at < STARTUP_COOLDOWN:
time.sleep(POLL_INTERVAL)
continue
# Cooldown expired — verify xmrig survived startup. If it
# crashed during init (hugepage failure, pool unreachable, etc.),
# re-enter the pause/retry cycle rather than silently leaving
# xmrig dead.
if not is_active("xmrig.service"):
log("xmrig died during startup cooldown — will retry")
paused_by_us = True
_save_paused(True)
started_at = None
above_stop_sys = real_work_pct > CPU_STOP_THRESHOLD
below_resume_sys = real_work_pct <= CPU_RESUME_THRESHOLD
above_stop = real_work_pct > CPU_STOP_THRESHOLD
below_resume = real_work_pct <= CPU_RESUME_THRESHOLD
busy_services = [
n for n in watched_services if svc_pct.get(n, 0) > watched_services[n]
]
any_svc_at_or_above = any(
svc_pct.get(n, 0) >= watched_services[n] for n in watched_services
)
if above_stop:
stop_pressure = above_stop_sys or bool(busy_services)
fully_idle = below_resume_sys and not any_svc_at_or_above
if stop_pressure:
idle_since = None
if paused_by_us and is_active("xmrig.service"):
# Something else restarted xmrig (deploy, manual start, etc.)
# while we thought it was stopped. Reset ownership so we can
# manage it again.
log("xmrig was restarted externally while paused — reclaiming")
if paused_by_us and not _is_frozen():
# Someone thawed xmrig while we believed it paused. Reclaim
# ownership so we can re-freeze.
log("xmrig was thawed externally while paused — reclaiming")
paused_by_us = False
_save_paused(False)
if not paused_by_us:
# Only claim ownership if xmrig is actually running.
# If something else stopped it (e.g. UPS battery hook),
# don't interfere — we'd wrongly restart it later.
if is_active("xmrig.service"):
log(f"Real workload detected ({real_work_pct:.1f}% CPU) — stopping xmrig")
if systemctl("stop", "xmrig.service"):
paused_by_us = True
_save_paused(True)
_save_paused(0)
if not paused_by_us and is_active("xmrig.service"):
# Only claim ownership if xmrig is actually running. If
# something else stopped it (e.g. UPS battery hook), don't
# interfere.
if busy_services:
reasons = ", ".join(
f"{n}={svc_pct[n]:.1f}%>{watched_services[n]:.1f}%"
for n in busy_services
)
log(f"Stop: watched service(s) busy [{reasons}] — freezing xmrig")
else:
log(
f"Stop: system CPU {real_work_pct:.1f}% > "
f"{CPU_STOP_THRESHOLD:.1f}% — freezing xmrig"
)
if _freeze(True):
paused_by_us = True
_save_paused(main_pid("xmrig.service"))
elif paused_by_us:
if below_resume:
if fully_idle:
if idle_since is None:
idle_since = time.monotonic()
elif time.monotonic() - idle_since >= GRACE_PERIOD:
log(f"Workload ended ({real_work_pct:.1f}% CPU) past grace period — starting xmrig")
if systemctl("start", "xmrig.service"):
log(
f"Idle past grace period (system {real_work_pct:.1f}%) "
"— thawing xmrig"
)
if _freeze(False):
paused_by_us = False
_save_paused(False)
started_at = time.monotonic()
_save_paused(0)
idle_since = None
else:
# Between thresholds — not idle enough to resume.
# Between thresholds or a watched service is borderline — not
# idle enough to resume.
idle_since = None
time.sleep(POLL_INTERVAL)

View File

@@ -5,6 +5,15 @@
let
script = ../services/monero/xmrig-auto-pause.py;
python = pkgs.python3;
cgroupDir = "/sys/fs/cgroup/system.slice/xmrig.service";
cgroupFreeze = "${cgroupDir}/cgroup.freeze";
cgroupEvents = "${cgroupDir}/cgroup.events";
# Inline ExecStop for the transient monitor: mirrors the production .nix
# ExecStop so the PartOf cascade test exercises the same code path.
thawScript = pkgs.writeShellScript "test-thaw-xmrig" ''
f=${cgroupFreeze}
[ -w "$f" ] && echo 0 > "$f" || true
'';
in
pkgs.testers.runNixOSTest {
name = "xmrig-auto-pause";
@@ -17,13 +26,18 @@ pkgs.testers.runNixOSTest {
pkgs.procps
];
# Mock xmrig as a nice'd sleep process that can be stopped/started.
# Mock xmrig as a nice'd sleep process. Runs in the real
# /sys/fs/cgroup/system.slice/xmrig.service cgroup, which is what the
# auto-pause script writes cgroup.freeze into.
systemd.services.xmrig = {
description = "Mock xmrig miner";
serviceConfig = {
ExecStart = "${pkgs.coreutils}/bin/sleep infinity";
Type = "simple";
Nice = 19;
# Short timeout so the PartOf cascade test completes fast if the
# cascade is broken (would otherwise hit systemd's 90s default).
TimeoutStopSec = "10s";
};
wantedBy = [ "multi-user.target" ];
};
@@ -34,20 +48,39 @@ pkgs.testers.runNixOSTest {
PYTHON = "${python}/bin/python3"
SCRIPT = "${script}"
CGROUP_FREEZE = "${cgroupFreeze}"
CGROUP_EVENTS = "${cgroupEvents}"
THAW_SCRIPT = "${thawScript}"
# Tuned for test VMs (1-2 cores).
# POLL_INTERVAL=1 keeps detection latency low.
# GRACE_PERIOD=5 is long enough to verify "stays stopped" but short
# enough that the full test completes in reasonable time.
# CPU_STOP_THRESHOLD=20 catches a busy-loop on a 1-2 core VM (50-100%)
# without triggering from normal VM noise.
# CPU_RESUME_THRESHOLD=10 is the idle cutoff for a 1-2 core VM.
# GRACE_PERIOD=5 is long enough to verify hysteresis, short enough for
# reasonable total test time.
# CPU_STOP_HIGH=999 effectively disables the system-wide path (a 1-core
# VM can never exceed 100% of total CPU) so per-service subtests exercise
# that path in isolation. CPU_STOP_LOW=20 catches a bash busy-loop on a
# 1-2 core VM without tripping on normal VM noise.
POLL_INTERVAL = "1"
GRACE_PERIOD = "5"
CPU_STOP_THRESHOLD = "20"
CPU_RESUME_THRESHOLD = "10"
STARTUP_COOLDOWN = "4"
GRACE_PERIOD = "5"
CPU_STOP_HIGH = "999"
CPU_STOP_LOW = "20"
CPU_RESUME_HIGH = "950"
CPU_RESUME_LOW = "10"
STATE_DIR = "/tmp/xap-state"
WATCHED_UNIT = "watched-burn"
WATCHED_THR = "5"
def frozen():
out = machine.succeed(f"cat {CGROUP_EVENTS}")
return "frozen 1" in out
def thawed():
out = machine.succeed(f"cat {CGROUP_EVENTS}")
return "frozen 0" in out
def xmrig_pid():
return machine.succeed("systemctl show xmrig -p MainPID --value").strip()
def start_cpu_load(name):
"""Start a non-nice CPU burn as a transient systemd unit."""
machine.succeed(
@@ -58,20 +91,29 @@ pkgs.testers.runNixOSTest {
def stop_cpu_load(name):
machine.succeed(f"systemctl stop {name}")
def start_monitor(unit_name):
"""Start the auto-pause monitor as a transient unit."""
machine.succeed(
f"systemd-run --unit={unit_name} "
f"--setenv=POLL_INTERVAL={POLL_INTERVAL} "
f"--setenv=GRACE_PERIOD={GRACE_PERIOD} "
f"--setenv=CPU_STOP_THRESHOLD={CPU_STOP_THRESHOLD} "
f"--setenv=CPU_RESUME_THRESHOLD={CPU_RESUME_THRESHOLD} "
f"--setenv=STARTUP_COOLDOWN={STARTUP_COOLDOWN} "
f"--setenv=STATE_DIR={STATE_DIR} "
f"{PYTHON} {SCRIPT}"
)
# Monitor needs two consecutive polls to compute a CPU delta.
time.sleep(3)
def start_monitor(unit_name, *, watched="", cpu_stop=CPU_STOP_HIGH, cpu_resume=CPU_RESUME_HIGH):
"""Start the auto-pause monitor as a transient unit.
watched="foo:5,bar:10" enables the per-service path.
cpu_stop/cpu_resume default to values that disable the system-wide
path (999/950) so per-service behaviour is tested in isolation.
"""
parts = [
f"systemd-run --unit={unit_name}",
"--property=After=xmrig.service",
"--property=PartOf=xmrig.service",
f"--property=ExecStop={THAW_SCRIPT}",
f"--setenv=POLL_INTERVAL={POLL_INTERVAL}",
f"--setenv=GRACE_PERIOD={GRACE_PERIOD}",
f"--setenv=CPU_STOP_THRESHOLD={cpu_stop}",
f"--setenv=CPU_RESUME_THRESHOLD={cpu_resume}",
f"--setenv=STATE_DIR={STATE_DIR}",
f"--setenv=XMRIG_CGROUP_FREEZE={CGROUP_FREEZE}",
]
if watched:
parts.append(f"--setenv=WATCHED_SERVICES={watched}")
parts.append(f"{PYTHON} {SCRIPT}")
machine.succeed(" ".join(parts))
# Monitor needs two consecutive polls to compute a CPU delta.
time.sleep(3)
@@ -80,127 +122,139 @@ pkgs.testers.runNixOSTest {
machine.wait_for_unit("xmrig.service")
machine.succeed(f"mkdir -p {STATE_DIR}")
with subtest("Start auto-pause monitor"):
start_monitor("xmrig-auto-pause")
# ------------------------------------------------------------------
# Per-service path (primary signal)
# ------------------------------------------------------------------
with subtest("xmrig stays running while system is idle"):
machine.succeed("systemctl is-active xmrig")
with subtest("Idle xmrig stays thawed"):
start_monitor("ap-watched", watched=f"{WATCHED_UNIT}:{WATCHED_THR}")
assert thawed(), f"expected thawed, got: {machine.succeed(f'cat {CGROUP_EVENTS}')}"
pid0 = xmrig_pid()
assert pid0 and pid0 != "0", f"expected a real xmrig PID, got {pid0!r}"
with subtest("xmrig stopped when CPU load appears"):
start_cpu_load("cpu-load")
machine.wait_until_fails("systemctl is-active xmrig", timeout=20)
with subtest("Watched service CPU load xmrig frozen, PID preserved"):
start_cpu_load(WATCHED_UNIT)
machine.wait_until_succeeds(f"grep -q '^frozen 1' {CGROUP_EVENTS}", timeout=15)
assert xmrig_pid() == pid0, "PID must be preserved across freeze"
with subtest("xmrig remains stopped during grace period after load ends"):
stop_cpu_load("cpu-load")
# Load just stopped. Grace period is 5s. Check at 2s — well within.
time.sleep(2)
machine.fail("systemctl is-active xmrig")
with subtest("Load ends xmrig thawed after grace period, same PID"):
stop_cpu_load(WATCHED_UNIT)
# Grace period is 5s; watched service drops to 0 immediately, so the
# idle timer starts right away. Expect thaw within GRACE + 2*POLL.
machine.wait_until_succeeds(f"grep -q '^frozen 0' {CGROUP_EVENTS}", timeout=30)
assert xmrig_pid() == pid0, "PID must survive the whole cycle"
with subtest("xmrig resumes after grace period expires"):
# Already idle since previous subtest. Grace period (5s) plus
# detection delay (~2 polls) plus startup cooldown (4s) means
# xmrig should restart within ~12s.
machine.wait_until_succeeds("systemctl is-active xmrig", timeout=20)
with subtest("Intermittent load does not cause flapping"):
# First load → stop xmrig
start_cpu_load("cpu-load-1")
machine.wait_until_fails("systemctl is-active xmrig", timeout=20)
stop_cpu_load("cpu-load-1")
# Brief idle gap shorter than grace period
time.sleep(2)
# Second load arrives before grace period expires
start_cpu_load("cpu-load-2")
with subtest("Intermittent watched load does not cause flapping"):
start_cpu_load(WATCHED_UNIT)
machine.wait_until_succeeds(f"grep -q '^frozen 1' {CGROUP_EVENTS}", timeout=15)
stop_cpu_load(WATCHED_UNIT)
time.sleep(2) # shorter than grace period
start_cpu_load(WATCHED_UNIT)
time.sleep(3)
assert frozen(), "xmrig must still be frozen during intermittent load"
stop_cpu_load(WATCHED_UNIT)
machine.wait_until_succeeds(f"grep -q '^frozen 0' {CGROUP_EVENTS}", timeout=30)
# xmrig must still be stopped
machine.fail("systemctl is-active xmrig")
with subtest("Sustained watched load keeps xmrig frozen"):
start_cpu_load(WATCHED_UNIT)
machine.wait_until_succeeds(f"grep -q '^frozen 1' {CGROUP_EVENTS}", timeout=15)
time.sleep(int(GRACE_PERIOD) + 3) # past grace period
assert frozen(), "sustained load must keep xmrig frozen"
stop_cpu_load(WATCHED_UNIT)
machine.wait_until_succeeds(f"grep -q '^frozen 0' {CGROUP_EVENTS}", timeout=30)
stop_cpu_load("cpu-load-2")
machine.wait_until_succeeds("systemctl is-active xmrig", timeout=20)
with subtest("External thaw reclaimed while load present"):
start_cpu_load(WATCHED_UNIT)
machine.wait_until_succeeds(f"grep -q '^frozen 1' {CGROUP_EVENTS}", timeout=15)
# Someone manually thaws xmrig. Auto-pause must detect and re-freeze.
machine.succeed(f"echo 0 > {CGROUP_FREEZE}")
machine.wait_until_succeeds(f"grep -q '^frozen 1' {CGROUP_EVENTS}", timeout=15)
stop_cpu_load(WATCHED_UNIT)
machine.wait_until_succeeds(f"grep -q '^frozen 0' {CGROUP_EVENTS}", timeout=30)
with subtest("Sustained load keeps xmrig stopped"):
start_cpu_load("cpu-load-3")
machine.wait_until_fails("systemctl is-active xmrig", timeout=20)
with subtest("Monitor SIGTERM thaws xmrig"):
start_cpu_load(WATCHED_UNIT)
machine.wait_until_succeeds(f"grep -q '^frozen 1' {CGROUP_EVENTS}", timeout=15)
machine.succeed("systemctl stop ap-watched")
machine.wait_until_succeeds(f"grep -q '^frozen 0' {CGROUP_EVENTS}", timeout=10)
stop_cpu_load(WATCHED_UNIT)
machine.succeed("systemctl reset-failed ap-watched 2>/dev/null || true")
# Stay busy longer than the grace period to prove continuous
# activity keeps xmrig stopped indefinitely.
time.sleep(8)
machine.fail("systemctl is-active xmrig")
# ------------------------------------------------------------------
# Negative control + system-wide path
# ------------------------------------------------------------------
stop_cpu_load("cpu-load-3")
machine.wait_until_succeeds("systemctl is-active xmrig", timeout=20)
with subtest("Unwatched CPU burn does not trip per-service path"):
# High CPU_STOP_THRESHOLD + no watched service → no reason to freeze.
machine.succeed(f"rm -f {STATE_DIR}/paused")
start_monitor("ap-neg")
start_cpu_load("unwatched-neg")
time.sleep(int(GRACE_PERIOD) + 3)
assert thawed(), "unwatched load must not trip when system threshold is high and nothing is watched"
stop_cpu_load("unwatched-neg")
machine.succeed("systemctl stop ap-neg")
machine.succeed("systemctl reset-failed ap-neg 2>/dev/null || true")
with subtest("External restart detected and re-stopped under load"):
# Put system under load so auto-pause stops xmrig.
start_cpu_load("cpu-load-4")
machine.wait_until_fails("systemctl is-active xmrig", timeout=20)
with subtest("System-wide CPU path freezes xmrig when threshold is low"):
machine.succeed(f"rm -f {STATE_DIR}/paused")
start_monitor("ap-sys", cpu_stop=CPU_STOP_LOW, cpu_resume=CPU_RESUME_LOW)
start_cpu_load("sys-burn")
machine.wait_until_succeeds(f"grep -q '^frozen 1' {CGROUP_EVENTS}", timeout=20)
stop_cpu_load("sys-burn")
machine.wait_until_succeeds(f"grep -q '^frozen 0' {CGROUP_EVENTS}", timeout=30)
machine.succeed("systemctl stop ap-sys")
machine.succeed("systemctl reset-failed ap-sys 2>/dev/null || true")
# Something external starts xmrig while load is active.
# The script should detect this and re-stop it.
machine.succeed("systemctl start xmrig")
machine.succeed("systemctl is-active xmrig")
machine.wait_until_fails("systemctl is-active xmrig", timeout=20)
# ------------------------------------------------------------------
# State persistence and operational edge cases
# ------------------------------------------------------------------
stop_cpu_load("cpu-load-4")
machine.wait_until_succeeds("systemctl is-active xmrig", timeout=20)
with subtest("Monitor crash preserves pause claim; next instance resumes"):
machine.succeed(f"rm -f {STATE_DIR}/paused")
start_monitor("ap-persist", watched=f"{WATCHED_UNIT}:{WATCHED_THR}")
start_cpu_load(WATCHED_UNIT)
machine.wait_until_succeeds(f"grep -q '^frozen 1' {CGROUP_EVENTS}", timeout=15)
# State file must contain the xmrig PID we claim to have frozen.
machine.succeed(f"test -s {STATE_DIR}/paused")
saved = machine.succeed(f"cat {STATE_DIR}/paused").strip()
assert saved == xmrig_pid(), f"state file PID {saved!r} != live xmrig PID {xmrig_pid()!r}"
# Hard-kill the monitor. ExecStop does NOT run on SIGKILL, so xmrig
# stays frozen. The state file persists.
machine.succeed("systemctl kill --signal=KILL ap-persist")
machine.succeed("systemctl reset-failed ap-persist 2>/dev/null || true")
assert frozen(), "xmrig must remain frozen after monitor SIGKILL"
# Fresh monitor picks up the state file, recognises the same PID +
# still-frozen cgroup, and continues owning the claim. Ending the
# load must thaw xmrig through the normal grace path.
start_monitor("ap-persist2", watched=f"{WATCHED_UNIT}:{WATCHED_THR}")
stop_cpu_load(WATCHED_UNIT)
machine.wait_until_succeeds(f"grep -q '^frozen 0' {CGROUP_EVENTS}", timeout=30)
# State file cleared after successful resume.
machine.fail(f"test -f {STATE_DIR}/paused")
machine.succeed("systemctl stop ap-persist2")
machine.succeed("systemctl reset-failed ap-persist2 2>/dev/null || true")
# --- State persistence and crash recovery ---
machine.succeed("systemctl stop xmrig-auto-pause")
with subtest("xmrig recovers after crash during startup cooldown"):
machine.succeed(f"rm -rf {STATE_DIR} && mkdir -p {STATE_DIR}")
start_monitor("xmrig-auto-pause-crash")
# Load -> xmrig stops
start_cpu_load("cpu-crash")
machine.wait_until_fails("systemctl is-active xmrig", timeout=20)
# End load -> xmrig restarts after grace period
stop_cpu_load("cpu-crash")
machine.wait_until_succeeds("systemctl is-active xmrig", timeout=30)
# Kill xmrig immediately — simulates crash during startup cooldown.
# The script should detect the failure when cooldown expires and
# re-enter the retry cycle.
machine.succeed("systemctl kill --signal=KILL xmrig")
machine.wait_until_fails("systemctl is-active xmrig", timeout=5)
# After cooldown + grace period + restart, xmrig should be back.
machine.wait_until_succeeds("systemctl is-active xmrig", timeout=30)
machine.succeed("systemctl stop xmrig-auto-pause-crash")
machine.succeed("systemctl reset-failed xmrig.service || true")
with subtest("systemctl stop xmrig cascades via PartOf and completes quickly"):
machine.succeed(f"rm -f {STATE_DIR}/paused")
start_monitor("ap-cascade", watched=f"{WATCHED_UNIT}:{WATCHED_THR}")
start_cpu_load(WATCHED_UNIT)
machine.wait_until_succeeds(f"grep -q '^frozen 1' {CGROUP_EVENTS}", timeout=15)
# Simulate apcupsd onbattery hook: `systemctl stop xmrig` while frozen.
# Without the PartOf cascade this would hang for TimeoutStopSec (10s
# in the mock config, 90s in production) and systemd's freezer bug
# class could strand the unit. With cascade: auto-pause stops first,
# its ExecStop thaws cgroup.freeze, xmrig's SIGTERM then succeeds.
t0 = time.monotonic()
machine.succeed("systemctl stop xmrig")
dt = time.monotonic() - t0
assert dt < 5, f"systemctl stop xmrig took {dt:.1f}s, cascade broken"
machine.succeed("systemctl show xmrig -p ActiveState --value | grep -q inactive")
# auto-pause stopped as a PartOf dependent
machine.succeed("systemctl show ap-cascade -p ActiveState --value | grep -qE 'inactive|deactivating'")
# Bring xmrig back for any remaining subtests
machine.succeed("systemctl start xmrig")
machine.wait_for_unit("xmrig.service")
with subtest("Script restart preserves pause state"):
machine.succeed(f"rm -rf {STATE_DIR} && mkdir -p {STATE_DIR}")
start_monitor("xmrig-auto-pause-persist")
# Load -> xmrig stops
start_cpu_load("cpu-persist")
machine.wait_until_fails("systemctl is-active xmrig", timeout=20)
# Kill the monitor while xmrig is paused (simulates script crash)
machine.succeed("systemctl stop xmrig-auto-pause-persist")
# State file must exist — the monitor persisted the pause flag
machine.succeed(f"test -f {STATE_DIR}/paused")
# Start a fresh monitor instance (reads state file on startup)
start_monitor("xmrig-auto-pause-persist2")
# End load — the new monitor should pick up the paused state
# and restart xmrig after the grace period
stop_cpu_load("cpu-persist")
machine.wait_until_succeeds("systemctl is-active xmrig", timeout=30)
# State file should be cleaned up after successful restart
machine.fail(f"test -f {STATE_DIR}/paused")
machine.succeed("systemctl stop xmrig-auto-pause-persist2")
stop_cpu_load(WATCHED_UNIT)
machine.succeed("systemctl reset-failed ap-cascade 2>/dev/null || true")
'';
}