261 lines
12 KiB
Nix
261 lines
12 KiB
Nix
{
|
|
pkgs,
|
|
...
|
|
}:
|
|
let
|
|
# Auto-pause monitor under test, referenced by relative path.
script = ../services/monero/xmrig-auto-pause.py;
# Interpreter used to launch the monitor inside the VM.
python = pkgs.python3;
# cgroup directory of the mock xmrig unit and the two files the test uses:
# cgroup.freeze is written to, cgroup.events is read back.
cgroupDir = "/sys/fs/cgroup/system.slice/xmrig.service";
cgroupFreeze = cgroupDir + "/cgroup.freeze";
cgroupEvents = cgroupDir + "/cgroup.events";
|
|
# Inline ExecStop for the transient monitor: mirrors the production .nix
# ExecStop so the PartOf cascade test exercises the same code path.
# Writes 0 to cgroup.freeze when that file is writable; the trailing
# `|| true` keeps the exit status zero whether or not the write happened.
thawScript = pkgs.writeShellScript "test-thaw-xmrig" ''
  f=${cgroupFreeze}
  [ -w "$f" ] && echo 0 > "$f" || true
'';
|
|
in
|
|
pkgs.testers.runNixOSTest {
|
|
name = "xmrig-auto-pause";
|
|
|
|
nodes.machine =
  { pkgs, ... }:
  {
    # Mock xmrig as a nice'd sleep process. Runs in the real
    # /sys/fs/cgroup/system.slice/xmrig.service cgroup, which is what the
    # auto-pause script writes cgroup.freeze into.
    systemd.services.xmrig = {
      description = "Mock xmrig miner";
      wantedBy = [ "multi-user.target" ];
      serviceConfig = {
        Type = "simple";
        Nice = 19;
        ExecStart = "${pkgs.coreutils}/bin/sleep infinity";
        # Short timeout so the PartOf cascade test completes fast if the
        # cascade is broken (would otherwise hit systemd's 90s default).
        TimeoutStopSec = "10s";
      };
    };

    # Tools made available inside the test VM.
    environment.systemPackages = with pkgs; [
      python3
      procps
    ];
  };
|
|
|
|
testScript = ''
|
|
import time
|
|
|
|
# Values injected by Nix string interpolation when the test script is built.
PYTHON = "${python}/bin/python3"
SCRIPT = "${script}"
CGROUP_FREEZE = "${cgroupFreeze}"
CGROUP_EVENTS = "${cgroupEvents}"
THAW_SCRIPT = "${thawScript}"

# Tuned for test VMs (1-2 cores).
# POLL_INTERVAL=1 keeps detection latency low.
# GRACE_PERIOD=5 is long enough to verify hysteresis, short enough for
# reasonable total test time.
# CPU_STOP_HIGH=999 effectively disables the system-wide path (a 1-core
# VM can never exceed 100% of total CPU) so per-service subtests exercise
# that path in isolation. CPU_STOP_LOW=20 catches a bash busy-loop on a
# 1-2 core VM without tripping on normal VM noise.
#
# All values are strings because they are spliced into systemd-run
# --setenv arguments; GRACE_PERIOD is converted with int() where the
# subtests need it as a number.
POLL_INTERVAL = "1"
GRACE_PERIOD = "5"
CPU_STOP_HIGH = "999"
CPU_STOP_LOW = "20"
# Resume thresholds paired with the stop thresholds above (HIGH with HIGH,
# LOW with LOW) when a monitor is started.
CPU_RESUME_HIGH = "950"
CPU_RESUME_LOW = "10"
# Passed to the monitor as STATE_DIR; the subtests look for a `paused`
# file inside it.
STATE_DIR = "/tmp/xap-state"
# Transient unit that burns CPU for the per-service subtests, and the
# threshold paired with it in WATCHED_SERVICES ("<unit>:<threshold>").
# NOTE(review): threshold is presumably a CPU percentage; confirm against
# the monitor script.
WATCHED_UNIT = "watched-burn"
WATCHED_THR = "5"
|
|
|
|
def frozen():
    """Return True when cgroup.events currently contains `frozen 1`."""
    return "frozen 1" in machine.succeed(f"cat {CGROUP_EVENTS}")
|
|
|
|
def thawed():
    """Return True when cgroup.events currently contains `frozen 0`."""
    return "frozen 0" in machine.succeed(f"cat {CGROUP_EVENTS}")
|
|
|
|
def xmrig_pid():
    """Return the MainPID systemd reports for xmrig, as a stripped string."""
    raw = machine.succeed("systemctl show xmrig -p MainPID --value")
    return raw.strip()
|
|
|
|
def start_cpu_load(name):
    """Launch a bash busy-loop as the transient systemd unit *name*.

    No nice level is set on the unit, unlike the mock xmrig service.
    """
    launcher = f"systemd-run --unit={name} --property=Type=exec "
    busy_loop = "bash -c 'while true; do :; done'"
    machine.succeed(launcher + busy_loop)
|
|
|
|
def stop_cpu_load(name):
    """Stop the transient unit *name* with `systemctl stop`."""
    stop_command = f"systemctl stop {name}"
    machine.succeed(stop_command)
|
|
|
|
def start_monitor(unit_name, *, watched="", cpu_stop=CPU_STOP_HIGH, cpu_resume=CPU_RESUME_HIGH):
    """Start the auto-pause monitor as a transient unit.

    unit_name is the name given to the transient unit. The unit is ordered
    After= and made PartOf= xmrig.service, and gets THAW_SCRIPT as its
    ExecStop.

    watched="foo:5,bar:10" enables the per-service path by exporting
    WATCHED_SERVICES; when empty, the variable is not set at all.

    cpu_stop/cpu_resume default to CPU_STOP_HIGH/CPU_RESUME_HIGH, values
    high enough to disable the system-wide path so per-service behaviour
    is tested in isolation.

    Returns once the monitor has had time for its first two polls.
    """
    parts = [
        f"systemd-run --unit={unit_name}",
        "--property=After=xmrig.service",
        "--property=PartOf=xmrig.service",
        f"--property=ExecStop={THAW_SCRIPT}",
        f"--setenv=POLL_INTERVAL={POLL_INTERVAL}",
        f"--setenv=GRACE_PERIOD={GRACE_PERIOD}",
        f"--setenv=CPU_STOP_THRESHOLD={cpu_stop}",
        f"--setenv=CPU_RESUME_THRESHOLD={cpu_resume}",
        f"--setenv=STATE_DIR={STATE_DIR}",
        f"--setenv=XMRIG_CGROUP_FREEZE={CGROUP_FREEZE}",
    ]
    if watched:
        parts.append(f"--setenv=WATCHED_SERVICES={watched}")
    parts.append(f"{PYTHON} {SCRIPT}")
    machine.succeed(" ".join(parts))
    # Monitor needs two consecutive polls to compute a CPU delta. Wait two
    # poll intervals plus one second of start-up slack (3s at
    # POLL_INTERVAL=1) so the delay tracks the configured interval.
    time.sleep(2 * float(POLL_INTERVAL) + 1)
|
|
|
|
# Boot the VM and wait until both the base system and the mock miner are
# up before creating the directory the monitors use for their state file.
start_all()
for unit in ("multi-user.target", "xmrig.service"):
    machine.wait_for_unit(unit)
machine.succeed(f"mkdir -p {STATE_DIR}")
|
|
|
|
# ------------------------------------------------------------------
|
|
# Per-service path (primary signal)
|
|
# ------------------------------------------------------------------
|
|
|
|
with subtest("Idle → xmrig stays thawed"):
    # The ap-watched monitor started here keeps running for the following
    # per-service subtests, until the SIGTERM subtest stops it.
    watched_spec = f"{WATCHED_UNIT}:{WATCHED_THR}"
    start_monitor("ap-watched", watched=watched_spec)
    if not thawed():
        events = machine.succeed(f"cat {CGROUP_EVENTS}")
        raise AssertionError(f"expected thawed, got: {events}")
    # Baseline PID that later subtests compare against.
    pid0 = xmrig_pid()
    if pid0 in ("", "0"):
        raise AssertionError(f"expected a real xmrig PID, got {pid0!r}")
|
|
|
|
with subtest("Watched service CPU load → xmrig frozen, PID preserved"):
    # Burn CPU in the watched unit, then wait for cgroup.events to report
    # the freeze.
    grep_frozen = f"grep -q '^frozen 1' {CGROUP_EVENTS}"
    start_cpu_load(WATCHED_UNIT)
    machine.wait_until_succeeds(grep_frozen, timeout=15)
    # pid0 comes from the idle subtest; the freeze must not replace the process.
    pid_now = xmrig_pid()
    assert pid_now == pid0, "PID must be preserved across freeze"
|
|
|
|
with subtest("Load ends → xmrig thawed after grace period, same PID"):
    grep_thawed = f"grep -q '^frozen 0' {CGROUP_EVENTS}"
    stop_cpu_load(WATCHED_UNIT)
    # Grace period is 5s; watched service drops to 0 immediately, so the
    # idle timer starts right away. Expect thaw within GRACE + 2*POLL; the
    # 30s timeout leaves generous slack on a slow VM.
    machine.wait_until_succeeds(grep_thawed, timeout=30)
    pid_now = xmrig_pid()
    assert pid_now == pid0, "PID must survive the whole cycle"
|
|
|
|
with subtest("Intermittent watched load does not cause flapping"):
    grep_frozen = f"grep -q '^frozen 1' {CGROUP_EVENTS}"
    grep_thawed = f"grep -q '^frozen 0' {CGROUP_EVENTS}"

    # First burst: get xmrig frozen.
    start_cpu_load(WATCHED_UNIT)
    machine.wait_until_succeeds(grep_frozen, timeout=15)

    # Pause the load for less than the grace period, then resume it.
    stop_cpu_load(WATCHED_UNIT)
    time.sleep(2)  # shorter than grace period
    start_cpu_load(WATCHED_UNIT)
    time.sleep(3)
    assert frozen(), "xmrig must still be frozen during intermittent load"

    # Drop the load for good and wait for the thaw.
    stop_cpu_load(WATCHED_UNIT)
    machine.wait_until_succeeds(grep_thawed, timeout=30)
|
|
|
|
with subtest("Sustained watched load keeps xmrig frozen"):
    grep_frozen = f"grep -q '^frozen 1' {CGROUP_EVENTS}"
    grep_thawed = f"grep -q '^frozen 0' {CGROUP_EVENTS}"

    start_cpu_load(WATCHED_UNIT)
    machine.wait_until_succeeds(grep_frozen, timeout=15)
    # Hold the load past the grace period; the freeze must not lapse.
    hold_seconds = int(GRACE_PERIOD) + 3
    time.sleep(hold_seconds)
    assert frozen(), "sustained load must keep xmrig frozen"

    stop_cpu_load(WATCHED_UNIT)
    machine.wait_until_succeeds(grep_thawed, timeout=30)
|
|
|
|
with subtest("External thaw reclaimed while load present"):
    grep_frozen = f"grep -q '^frozen 1' {CGROUP_EVENTS}"
    grep_thawed = f"grep -q '^frozen 0' {CGROUP_EVENTS}"

    start_cpu_load(WATCHED_UNIT)
    machine.wait_until_succeeds(grep_frozen, timeout=15)
    # Someone manually thaws xmrig. Auto-pause must detect and re-freeze.
    manual_thaw = f"echo 0 > {CGROUP_FREEZE}"
    machine.succeed(manual_thaw)
    machine.wait_until_succeeds(grep_frozen, timeout=15)

    stop_cpu_load(WATCHED_UNIT)
    machine.wait_until_succeeds(grep_thawed, timeout=30)
|
|
|
|
with subtest("Monitor SIGTERM thaws xmrig"):
    grep_frozen = f"grep -q '^frozen 1' {CGROUP_EVENTS}"
    grep_thawed = f"grep -q '^frozen 0' {CGROUP_EVENTS}"

    start_cpu_load(WATCHED_UNIT)
    machine.wait_until_succeeds(grep_frozen, timeout=15)
    # Stop the monitor while the load is still running: xmrig has to be
    # thawed on the monitor's way out, not because the load went away.
    machine.succeed("systemctl stop ap-watched")
    machine.wait_until_succeeds(grep_thawed, timeout=10)

    stop_cpu_load(WATCHED_UNIT)
    # Best-effort cleanup; never fails the test.
    machine.succeed("systemctl reset-failed ap-watched 2>/dev/null || true")
|
|
|
|
# ------------------------------------------------------------------
|
|
# Negative control + system-wide path
|
|
# ------------------------------------------------------------------
|
|
|
|
with subtest("Unwatched CPU burn does not trip per-service path"):
    # High CPU_STOP_THRESHOLD + no watched service → no reason to freeze.
    monitor = "ap-neg"
    burner = "unwatched-neg"

    machine.succeed(f"rm -f {STATE_DIR}/paused")
    start_monitor(monitor)
    start_cpu_load(burner)
    # Give the monitor longer than a full grace period to (wrongly) react.
    time.sleep(int(GRACE_PERIOD) + 3)
    assert thawed(), "unwatched load must not trip when system threshold is high and nothing is watched"

    stop_cpu_load(burner)
    machine.succeed(f"systemctl stop {monitor}")
    machine.succeed(f"systemctl reset-failed {monitor} 2>/dev/null || true")
|
|
|
|
with subtest("System-wide CPU path freezes xmrig when threshold is low"):
    monitor = "ap-sys"
    burner = "sys-burn"
    grep_frozen = f"grep -q '^frozen 1' {CGROUP_EVENTS}"
    grep_thawed = f"grep -q '^frozen 0' {CGROUP_EVENTS}"

    machine.succeed(f"rm -f {STATE_DIR}/paused")
    # No watched services here: only the lowered system-wide thresholds
    # can cause the freeze.
    start_monitor(monitor, cpu_stop=CPU_STOP_LOW, cpu_resume=CPU_RESUME_LOW)
    start_cpu_load(burner)
    machine.wait_until_succeeds(grep_frozen, timeout=20)

    stop_cpu_load(burner)
    machine.wait_until_succeeds(grep_thawed, timeout=30)

    machine.succeed(f"systemctl stop {monitor}")
    machine.succeed(f"systemctl reset-failed {monitor} 2>/dev/null || true")
|
|
|
|
# ------------------------------------------------------------------
|
|
# State persistence and operational edge cases
|
|
# ------------------------------------------------------------------
|
|
|
|
with subtest("Monitor crash preserves pause claim; next instance resumes"):
    state_file = f"{STATE_DIR}/paused"
    watched_spec = f"{WATCHED_UNIT}:{WATCHED_THR}"
    grep_frozen = f"grep -q '^frozen 1' {CGROUP_EVENTS}"
    grep_thawed = f"grep -q '^frozen 0' {CGROUP_EVENTS}"

    machine.succeed(f"rm -f {state_file}")
    start_monitor("ap-persist", watched=watched_spec)
    start_cpu_load(WATCHED_UNIT)
    machine.wait_until_succeeds(grep_frozen, timeout=15)

    # State file must contain the xmrig PID we claim to have frozen.
    machine.succeed(f"test -s {state_file}")
    saved = machine.succeed(f"cat {state_file}").strip()
    assert saved == xmrig_pid(), f"state file PID {saved!r} != live xmrig PID {xmrig_pid()!r}"

    # Hard-kill the monitor. ExecStop does NOT run on SIGKILL, so xmrig
    # stays frozen. The state file persists.
    machine.succeed("systemctl kill --signal=KILL ap-persist")
    machine.succeed("systemctl reset-failed ap-persist 2>/dev/null || true")
    assert frozen(), "xmrig must remain frozen after monitor SIGKILL"

    # Fresh monitor picks up the state file, recognises the same PID +
    # still-frozen cgroup, and continues owning the claim. Ending the
    # load must thaw xmrig through the normal grace path.
    start_monitor("ap-persist2", watched=watched_spec)
    stop_cpu_load(WATCHED_UNIT)
    machine.wait_until_succeeds(grep_thawed, timeout=30)

    # State file cleared after successful resume.
    machine.fail(f"test -f {state_file}")

    machine.succeed("systemctl stop ap-persist2")
    machine.succeed("systemctl reset-failed ap-persist2 2>/dev/null || true")
|
|
|
|
with subtest("systemctl stop xmrig cascades via PartOf and completes quickly"):
    grep_frozen = f"grep -q '^frozen 1' {CGROUP_EVENTS}"

    machine.succeed(f"rm -f {STATE_DIR}/paused")
    start_monitor("ap-cascade", watched=f"{WATCHED_UNIT}:{WATCHED_THR}")
    start_cpu_load(WATCHED_UNIT)
    machine.wait_until_succeeds(grep_frozen, timeout=15)

    # Simulate apcupsd onbattery hook: `systemctl stop xmrig` while frozen.
    # Without the PartOf cascade this would hang for TimeoutStopSec (10s
    # in the mock config, 90s in production) and systemd's freezer bug
    # class could strand the unit. With cascade: auto-pause stops first,
    # its ExecStop thaws cgroup.freeze, xmrig's SIGTERM then succeeds.
    started = time.monotonic()
    machine.succeed("systemctl stop xmrig")
    elapsed = time.monotonic() - started
    assert elapsed < 5, f"systemctl stop xmrig took {elapsed:.1f}s, cascade broken"

    machine.succeed("systemctl show xmrig -p ActiveState --value | grep -q inactive")
    # auto-pause stopped as a PartOf dependent
    machine.succeed("systemctl show ap-cascade -p ActiveState --value | grep -qE 'inactive|deactivating'")

    # Bring xmrig back for any remaining subtests
    machine.succeed("systemctl start xmrig")
    machine.wait_for_unit("xmrig.service")
    stop_cpu_load(WATCHED_UNIT)
    machine.succeed("systemctl reset-failed ap-cascade 2>/dev/null || true")
|
|
'';
|
|
}
|