# NixOS VM test for the xmrig auto-pause monitor script.
#
# A mock "xmrig" service (a nice'd `sleep infinity`) runs in its real systemd
# cgroup. The monitor script is started as a transient unit with various
# thresholds, and the test asserts on the cgroup's `cgroup.events` file to
# check when the monitor freezes and thaws the service.
{
  pkgs,
  ...
}:
let
  script = ../services/monero/xmrig-auto-pause.py;
  python = pkgs.python3;
  cgroupDir = "/sys/fs/cgroup/system.slice/xmrig.service";
  cgroupFreeze = "${cgroupDir}/cgroup.freeze";
  cgroupEvents = "${cgroupDir}/cgroup.events";

  # Inline ExecStop for the transient monitor: mirrors the production .nix
  # ExecStop so the PartOf cascade test exercises the same code path.
  thawScript = pkgs.writeShellScript "test-thaw-xmrig" ''
    f=${cgroupFreeze}
    [ -w "$f" ] && echo 0 > "$f" || true
  '';
in
pkgs.testers.runNixOSTest {
  name = "xmrig-auto-pause";

  nodes.machine =
    { pkgs, ... }:
    {
      environment.systemPackages = [
        pkgs.python3
        pkgs.procps
      ];

      # Mock xmrig as a nice'd sleep process. Runs in the real
      # /sys/fs/cgroup/system.slice/xmrig.service cgroup, which is what the
      # auto-pause script writes cgroup.freeze into.
      systemd.services.xmrig = {
        description = "Mock xmrig miner";
        serviceConfig = {
          ExecStart = "${pkgs.coreutils}/bin/sleep infinity";
          Type = "simple";
          Nice = 19;
          # Short timeout so the PartOf cascade test completes fast if the
          # cascade is broken (would otherwise hit systemd's 90s default).
          TimeoutStopSec = "10s";
        };
        wantedBy = [ "multi-user.target" ];
      };
    };

  testScript = ''
    import time

    PYTHON = "${python}/bin/python3"
    SCRIPT = "${script}"
    CGROUP_FREEZE = "${cgroupFreeze}"
    CGROUP_EVENTS = "${cgroupEvents}"
    THAW_SCRIPT = "${thawScript}"

    # Tuned for test VMs (1-2 cores).
    # POLL_INTERVAL=1 keeps detection latency low.
    # GRACE_PERIOD=5 is long enough to verify hysteresis, short enough for
    # reasonable total test time.
    # CPU_STOP_HIGH=999 effectively disables the system-wide path (a 1-core
    # VM can never exceed 100% of total CPU) so per-service subtests exercise
    # that path in isolation. CPU_STOP_LOW=20 catches a bash busy-loop on a
    # 1-2 core VM without tripping on normal VM noise.
    POLL_INTERVAL = "1"
    GRACE_PERIOD = "5"
    CPU_STOP_HIGH = "999"
    CPU_STOP_LOW = "20"
    CPU_RESUME_HIGH = "950"
    CPU_RESUME_LOW = "10"
    STATE_DIR = "/tmp/xap-state"
    WATCHED_UNIT = "watched-burn"
    WATCHED_THR = "5"


    def cgroup_events():
        """Return the raw contents of the xmrig cgroup.events file."""
        return machine.succeed(f"cat {CGROUP_EVENTS}")


    def frozen():
        """True if cgroup.events currently reports the cgroup as frozen."""
        return "frozen 1" in cgroup_events()


    def thawed():
        """True if cgroup.events currently reports the cgroup as not frozen."""
        return "frozen 0" in cgroup_events()


    def wait_frozen(timeout=15):
        """Poll until cgroup.events reports 'frozen 1' or the timeout expires."""
        machine.wait_until_succeeds(f"grep -q '^frozen 1' {CGROUP_EVENTS}", timeout=timeout)


    def wait_thawed(timeout=30):
        """Poll until cgroup.events reports 'frozen 0' or the timeout expires."""
        machine.wait_until_succeeds(f"grep -q '^frozen 0' {CGROUP_EVENTS}", timeout=timeout)


    def xmrig_pid():
        """Return the MainPID of xmrig.service as a string."""
        return machine.succeed("systemctl show xmrig -p MainPID --value").strip()


    def start_cpu_load(name):
        """Start a non-nice CPU burn as a transient systemd unit."""
        machine.succeed(
            f"systemd-run --unit={name} --property=Type=exec "
            "bash -c 'while true; do :; done'"
        )


    def stop_cpu_load(name):
        """Stop a CPU burn unit started by start_cpu_load."""
        machine.succeed(f"systemctl stop {name}")


    def reset_failed(unit):
        """Best-effort reset of a transient unit's failed state (never fails)."""
        machine.succeed(f"systemctl reset-failed {unit} 2>/dev/null || true")


    def stop_monitor(unit):
        """Stop a monitor unit, then clear any failed state it left behind."""
        machine.succeed(f"systemctl stop {unit}")
        reset_failed(unit)


    def start_monitor(unit_name, *, watched="", cpu_stop=CPU_STOP_HIGH, cpu_resume=CPU_RESUME_HIGH):
        """Start the auto-pause monitor as a transient unit.

        watched="foo:5,bar:10" enables the per-service path; when empty,
        WATCHED_SERVICES is not set at all.
        cpu_stop/cpu_resume default to CPU_STOP_HIGH/CPU_RESUME_HIGH
        (999/950), which effectively disable the system-wide path so
        per-service behaviour is tested in isolation.
        """
        parts = [
            f"systemd-run --unit={unit_name}",
            "--property=After=xmrig.service",
            "--property=PartOf=xmrig.service",
            f"--property=ExecStop={THAW_SCRIPT}",
            f"--setenv=POLL_INTERVAL={POLL_INTERVAL}",
            f"--setenv=GRACE_PERIOD={GRACE_PERIOD}",
            f"--setenv=CPU_STOP_THRESHOLD={cpu_stop}",
            f"--setenv=CPU_RESUME_THRESHOLD={cpu_resume}",
            f"--setenv=STATE_DIR={STATE_DIR}",
            f"--setenv=XMRIG_CGROUP_FREEZE={CGROUP_FREEZE}",
        ]
        if watched:
            parts.append(f"--setenv=WATCHED_SERVICES={watched}")
        parts.append(f"{PYTHON} {SCRIPT}")
        machine.succeed(" ".join(parts))
        # Monitor needs two consecutive polls to compute a CPU delta.
        time.sleep(3)


    start_all()
    machine.wait_for_unit("multi-user.target")
    machine.wait_for_unit("xmrig.service")
    machine.succeed(f"mkdir -p {STATE_DIR}")

    # ------------------------------------------------------------------
    # Per-service path (primary signal)
    # ------------------------------------------------------------------

    with subtest("Idle → xmrig stays thawed"):
        start_monitor("ap-watched", watched=f"{WATCHED_UNIT}:{WATCHED_THR}")
        assert thawed(), f"expected thawed, got: {cgroup_events()}"
        pid0 = xmrig_pid()
        assert pid0 and pid0 != "0", f"expected a real xmrig PID, got {pid0!r}"

    with subtest("Watched service CPU load → xmrig frozen, PID preserved"):
        start_cpu_load(WATCHED_UNIT)
        wait_frozen()
        assert xmrig_pid() == pid0, "PID must be preserved across freeze"

    with subtest("Load ends → xmrig thawed after grace period, same PID"):
        stop_cpu_load(WATCHED_UNIT)
        # Grace period is 5s; watched service drops to 0 immediately, so the
        # idle timer starts right away. Expect thaw within GRACE + 2*POLL.
        wait_thawed()
        assert xmrig_pid() == pid0, "PID must survive the whole cycle"

    with subtest("Intermittent watched load does not cause flapping"):
        start_cpu_load(WATCHED_UNIT)
        wait_frozen()
        stop_cpu_load(WATCHED_UNIT)
        time.sleep(2)  # shorter than grace period
        start_cpu_load(WATCHED_UNIT)
        time.sleep(3)
        assert frozen(), "xmrig must still be frozen during intermittent load"
        stop_cpu_load(WATCHED_UNIT)
        wait_thawed()

    with subtest("Sustained watched load keeps xmrig frozen"):
        start_cpu_load(WATCHED_UNIT)
        wait_frozen()
        time.sleep(int(GRACE_PERIOD) + 3)  # past grace period
        assert frozen(), "sustained load must keep xmrig frozen"
        stop_cpu_load(WATCHED_UNIT)
        wait_thawed()

    with subtest("External thaw reclaimed while load present"):
        start_cpu_load(WATCHED_UNIT)
        wait_frozen()
        # Someone manually thaws xmrig. Auto-pause must detect and re-freeze.
        machine.succeed(f"echo 0 > {CGROUP_FREEZE}")
        wait_frozen()
        stop_cpu_load(WATCHED_UNIT)
        wait_thawed()

    with subtest("Monitor SIGTERM thaws xmrig"):
        start_cpu_load(WATCHED_UNIT)
        wait_frozen()
        machine.succeed("systemctl stop ap-watched")
        wait_thawed(timeout=10)
        stop_cpu_load(WATCHED_UNIT)
        reset_failed("ap-watched")

    # ------------------------------------------------------------------
    # Negative control + system-wide path
    # ------------------------------------------------------------------

    with subtest("Unwatched CPU burn does not trip per-service path"):
        # High CPU_STOP_THRESHOLD + no watched service → no reason to freeze.
        machine.succeed(f"rm -f {STATE_DIR}/paused")
        start_monitor("ap-neg")
        start_cpu_load("unwatched-neg")
        time.sleep(int(GRACE_PERIOD) + 3)
        assert thawed(), "unwatched load must not trip when system threshold is high and nothing is watched"
        stop_cpu_load("unwatched-neg")
        stop_monitor("ap-neg")

    with subtest("System-wide CPU path freezes xmrig when threshold is low"):
        machine.succeed(f"rm -f {STATE_DIR}/paused")
        start_monitor("ap-sys", cpu_stop=CPU_STOP_LOW, cpu_resume=CPU_RESUME_LOW)
        start_cpu_load("sys-burn")
        wait_frozen(timeout=20)
        stop_cpu_load("sys-burn")
        wait_thawed()
        stop_monitor("ap-sys")

    # ------------------------------------------------------------------
    # State persistence and operational edge cases
    # ------------------------------------------------------------------

    with subtest("Monitor crash preserves pause claim; next instance resumes"):
        machine.succeed(f"rm -f {STATE_DIR}/paused")
        start_monitor("ap-persist", watched=f"{WATCHED_UNIT}:{WATCHED_THR}")
        start_cpu_load(WATCHED_UNIT)
        wait_frozen()
        # State file must contain the xmrig PID we claim to have frozen.
        machine.succeed(f"test -s {STATE_DIR}/paused")
        saved = machine.succeed(f"cat {STATE_DIR}/paused").strip()
        assert saved == xmrig_pid(), f"state file PID {saved!r} != live xmrig PID {xmrig_pid()!r}"
        # Hard-kill the monitor. ExecStop does NOT run on SIGKILL, so xmrig
        # stays frozen. The state file persists.
        machine.succeed("systemctl kill --signal=KILL ap-persist")
        reset_failed("ap-persist")
        assert frozen(), "xmrig must remain frozen after monitor SIGKILL"
        # Fresh monitor picks up the state file, recognises the same PID +
        # still-frozen cgroup, and continues owning the claim. Ending the
        # load must thaw xmrig through the normal grace path.
        start_monitor("ap-persist2", watched=f"{WATCHED_UNIT}:{WATCHED_THR}")
        stop_cpu_load(WATCHED_UNIT)
        wait_thawed()
        # State file cleared after successful resume.
        machine.fail(f"test -f {STATE_DIR}/paused")
        stop_monitor("ap-persist2")

    with subtest("systemctl stop xmrig cascades via PartOf and completes quickly"):
        machine.succeed(f"rm -f {STATE_DIR}/paused")
        start_monitor("ap-cascade", watched=f"{WATCHED_UNIT}:{WATCHED_THR}")
        start_cpu_load(WATCHED_UNIT)
        wait_frozen()
        # Simulate apcupsd onbattery hook: `systemctl stop xmrig` while frozen.
        # Without the PartOf cascade this would hang for TimeoutStopSec (10s
        # in the mock config, 90s in production) and systemd's freezer bug
        # class could strand the unit. With cascade: auto-pause stops first,
        # its ExecStop thaws cgroup.freeze, xmrig's SIGTERM then succeeds.
        t0 = time.monotonic()
        machine.succeed("systemctl stop xmrig")
        dt = time.monotonic() - t0
        assert dt < 5, f"systemctl stop xmrig took {dt:.1f}s, cascade broken"
        machine.succeed("systemctl show xmrig -p ActiveState --value | grep -q inactive")
        # auto-pause stopped as a PartOf dependent
        machine.succeed("systemctl show ap-cascade -p ActiveState --value | grep -qE 'inactive|deactivating'")
        # Bring xmrig back for any remaining subtests
        machine.succeed("systemctl start xmrig")
        machine.wait_for_unit("xmrig.service")
        stop_cpu_load(WATCHED_UNIT)
        reset_failed("ap-cascade")
  '';
}