xmrig-auto-pause: fix

This commit is contained in:
2026-04-06 13:11:54 -04:00
parent a12dcb01ec
commit 7afd1f35d2
3 changed files with 111 additions and 7 deletions

View File

@@ -44,6 +44,7 @@ pkgs.testers.runNixOSTest {
# Tunables handed to the monitor through systemd-run --setenv in start_monitor.
# NOTE(review): values look like seconds / percent -- confirm against the monitor script.
GRACE_PERIOD = "5"
CPU_THRESHOLD = "10"
STARTUP_COOLDOWN = "4"
# Directory the test creates and exports to the monitor as STATE_DIR.
STATE_DIR = "/tmp/xap-state"
def start_cpu_load(name):
"""Start a non-nice CPU burn as a transient systemd unit."""
@@ -55,22 +56,28 @@ pkgs.testers.runNixOSTest {
def stop_cpu_load(name):
    """Stop the transient CPU-burn unit called *name* via systemctl."""
    stop_command = f"systemctl stop {name}"
    machine.succeed(stop_command)
start_all()
machine.wait_for_unit("multi-user.target")
machine.wait_for_unit("xmrig.service")
with subtest("Start auto-pause monitor"):
def start_monitor(unit_name):
    """Start the auto-pause monitor as a transient systemd unit.

    Args:
        unit_name: Name passed to systemd-run --unit. Callers use the same
            name later to stop this monitor instance with systemctl stop.

    The monitor is configured only through environment variables taken from
    the module-level test constants. After launching it, the call sleeps
    briefly so the monitor has time to take its first CPU samples.
    """
    # Exactly one systemd-run invocation, named after unit_name. A leftover
    # hard-coded "--unit=xmrig-auto-pause" prefix would be glued onto this
    # command by implicit string concatenation and break the unit naming.
    machine.succeed(
        f"systemd-run --unit={unit_name} "
        f"--setenv=POLL_INTERVAL={POLL_INTERVAL} "
        f"--setenv=GRACE_PERIOD={GRACE_PERIOD} "
        f"--setenv=CPU_THRESHOLD={CPU_THRESHOLD} "
        f"--setenv=STARTUP_COOLDOWN={STARTUP_COOLDOWN} "
        f"--setenv=STATE_DIR={STATE_DIR} "
        f"{PYTHON} {SCRIPT}"
    )
    # Monitor needs two consecutive polls to compute a CPU delta.
    time.sleep(3)
# Boot the VM, then block until the base system and the xmrig service are up.
start_all()
for boot_unit in ("multi-user.target", "xmrig.service"):
    machine.wait_for_unit(boot_unit)

# Create the directory that start_monitor exports to the monitor as STATE_DIR.
state_dir_setup = f"mkdir -p {STATE_DIR}"
machine.succeed(state_dir_setup)

with subtest("Start auto-pause monitor"):
    start_monitor("xmrig-auto-pause")
with subtest("xmrig stays running while system is idle"):
machine.succeed("systemctl is-active xmrig")
@@ -134,5 +141,61 @@ pkgs.testers.runNixOSTest {
stop_cpu_load("cpu-load-4")
machine.wait_until_succeeds("systemctl is-active xmrig", timeout=20)
# --- State persistence and crash recovery ---
# Stop the monitor unit started earlier ("xmrig-auto-pause"); the subtests
# below each launch their own monitor instance under a different unit name.
machine.succeed("systemctl stop xmrig-auto-pause")
with subtest("xmrig recovers after crash during startup cooldown"):
# Start from an empty state directory.
machine.succeed(f"rm -rf {STATE_DIR} && mkdir -p {STATE_DIR}")
start_monitor("xmrig-auto-pause-crash")
# Load -> xmrig stops
start_cpu_load("cpu-crash")
machine.wait_until_fails("systemctl is-active xmrig", timeout=20)
# End load -> xmrig restarts after grace period
stop_cpu_load("cpu-crash")
machine.wait_until_succeeds("systemctl is-active xmrig", timeout=30)
# Kill xmrig immediately -- this simulates a crash during startup cooldown.
# The script should detect the failure when cooldown expires and
# re-enter the retry cycle.
machine.succeed("systemctl kill --signal=KILL xmrig")
machine.wait_until_fails("systemctl is-active xmrig", timeout=5)
# After cooldown + grace period + restart, xmrig should be back.
machine.wait_until_succeeds("systemctl is-active xmrig", timeout=30)
# Cleanup for the next subtest: stop this subtest's monitor, reset a possible
# failed state of xmrig.service (errors tolerated via "|| true"), and make
# sure xmrig is running again.
machine.succeed("systemctl stop xmrig-auto-pause-crash")
machine.succeed("systemctl reset-failed xmrig.service || true")
machine.succeed("systemctl start xmrig")
machine.wait_for_unit("xmrig.service")
with subtest("Script restart preserves pause state"):
# Start from an empty state directory.
machine.succeed(f"rm -rf {STATE_DIR} && mkdir -p {STATE_DIR}")
start_monitor("xmrig-auto-pause-persist")
# Load -> xmrig stops
start_cpu_load("cpu-persist")
machine.wait_until_fails("systemctl is-active xmrig", timeout=20)
# Kill the monitor while xmrig is paused (simulates script crash)
machine.succeed("systemctl stop xmrig-auto-pause-persist")
# State file must exist -- the monitor persisted the pause flag
machine.succeed(f"test -f {STATE_DIR}/paused")
# Start a fresh monitor instance (reads state file on startup)
start_monitor("xmrig-auto-pause-persist2")
# End load -- the new monitor should pick up the paused state
# and restart xmrig after the grace period
stop_cpu_load("cpu-persist")
machine.wait_until_succeeds("systemctl is-active xmrig", timeout=30)
# State file should be cleaned up after successful restart
# NOTE(review): this check runs immediately after xmrig becomes active; if the
# monitor removes the file only after starting xmrig, this could race -- confirm
# against the monitor script.
machine.fail(f"test -f {STATE_DIR}/paused")
# Stop the second monitor instance started by this subtest.
machine.succeed("systemctl stop xmrig-auto-pause-persist2")
'';
}