xmrig's RandomX pollutes the L3 cache, making other processes appear ~3-8% busier. With a single 5% threshold for both stopping and resuming, the script oscillates: start xmrig -> cache pressure inflates CPU -> stop xmrig -> CPU drops -> restart -> repeat. Split into CPU_STOP_THRESHOLD (15%) and CPU_RESUME_THRESHOLD (5%). The stop threshold sits above xmrig's indirect pressure, so only genuine workloads trigger a pause. The resume threshold confirms the system is truly idle before restarting.
207 lines
7.6 KiB
Nix
207 lines
7.6 KiB
Nix
{
|
|
pkgs,
|
|
...
|
|
}:
|
|
let
  # Monitor script under test; interpolated into the test driver below as SCRIPT.
  script = ../services/monero/xmrig-auto-pause.py;

  # Interpreter used to launch the monitor inside the VM (see PYTHON below).
  python = pkgs.python3;
in
|
|
pkgs.testers.runNixOSTest {
|
|
name = "xmrig-auto-pause";
|
|
|
|
nodes.machine =
  { pkgs, ... }:
  {
    environment.systemPackages = [
      pkgs.python3
      pkgs.procps
    ];

    # Mock xmrig as a nice'd sleep process that can be stopped/started.
    # The test only observes whether the unit is active, so no real miner
    # is needed.
    systemd.services.xmrig = {
      description = "Mock xmrig miner";
      wantedBy = [ "multi-user.target" ];
      serviceConfig = {
        Type = "simple";
        ExecStart = "${pkgs.coreutils}/bin/sleep infinity";
        Nice = 19;
      };
    };
  };
|
|
|
|
testScript = ''
|
|
import time

# Filled in by Nix string interpolation when the test driver is built.
PYTHON = "${python}/bin/python3"
SCRIPT = "${script}"

# Monitor settings, tuned for test VMs (1-2 cores). They are kept as strings
# because start_monitor() forwards them verbatim through --setenv.

# POLL_INTERVAL=1 keeps detection latency low.
POLL_INTERVAL = "1"

# GRACE_PERIOD=5 is long enough to verify "stays stopped" but short
# enough that the full test completes in reasonable time.
GRACE_PERIOD = "5"

# CPU_STOP_THRESHOLD=20 catches a busy-loop on a 1-2 core VM (50-100%)
# without triggering from normal VM noise.
CPU_STOP_THRESHOLD = "20"

# CPU_RESUME_THRESHOLD=10 is the idle cutoff for a 1-2 core VM.
CPU_RESUME_THRESHOLD = "10"

# Startup cooldown in seconds; the crash-recovery subtest kills xmrig
# inside this window.
STARTUP_COOLDOWN = "4"

# Directory where the monitor persists its pause flag ("paused" file).
STATE_DIR = "/tmp/xap-state"
|
|
def start_cpu_load(name):
    """Start a non-nice CPU burn as a transient systemd unit.

    Args:
        name: Unit name for the transient service; pass the same name to
            stop_cpu_load() to end the load.
    """
    machine.succeed(
        f"systemd-run --unit={name} --property=Type=exec "
        "bash -c 'while true; do :; done'"
    )
|
|
|
|
def stop_cpu_load(name):
    """Stop the CPU-burn unit previously started by start_cpu_load().

    Args:
        name: Unit name that was passed to start_cpu_load().
    """
    machine.succeed(f"systemctl stop {name}")
|
|
|
|
def start_monitor(unit_name):
    """Start the auto-pause monitor as a transient systemd unit.

    The monitor is configured entirely through environment variables taken
    from the module-level test constants.

    Args:
        unit_name: Name of the transient unit; stop it later with
            "systemctl stop <unit_name>".
    """
    machine.succeed(
        f"systemd-run --unit={unit_name} "
        f"--setenv=POLL_INTERVAL={POLL_INTERVAL} "
        f"--setenv=GRACE_PERIOD={GRACE_PERIOD} "
        f"--setenv=CPU_STOP_THRESHOLD={CPU_STOP_THRESHOLD} "
        f"--setenv=CPU_RESUME_THRESHOLD={CPU_RESUME_THRESHOLD} "
        f"--setenv=STARTUP_COOLDOWN={STARTUP_COOLDOWN} "
        f"--setenv=STATE_DIR={STATE_DIR} "
        f"{PYTHON} {SCRIPT}"
    )
    # Monitor needs two consecutive polls to compute a CPU delta, so give
    # it a moment to settle before the caller starts asserting.
    time.sleep(3)
|
|
|
|
# Boot the VM and wait until the mock miner is up before testing anything.
start_all()
machine.wait_for_unit("multi-user.target")
machine.wait_for_unit("xmrig.service")

# The monitor is pointed at this directory via STATE_DIR.
machine.succeed(f"mkdir -p {STATE_DIR}")
|
|
|
|
with subtest("Start auto-pause monitor"):
    start_monitor("xmrig-auto-pause")

with subtest("xmrig stays running while system is idle"):
    machine.succeed("systemctl is-active xmrig")

with subtest("xmrig stopped when CPU load appears"):
    start_cpu_load("cpu-load")
    machine.wait_until_fails("systemctl is-active xmrig", timeout=20)

with subtest("xmrig remains stopped during grace period after load ends"):
    stop_cpu_load("cpu-load")
    # Load just stopped. Grace period is 5s. Check at 2s — well within.
    time.sleep(2)
    machine.fail("systemctl is-active xmrig")

with subtest("xmrig resumes after grace period expires"):
    # Already idle since previous subtest. Grace period (5s) plus
    # detection delay (~2 polls) plus startup cooldown (4s) means
    # xmrig should restart within ~12s.
    machine.wait_until_succeeds("systemctl is-active xmrig", timeout=20)
|
|
|
|
with subtest("Intermittent load does not cause flapping"):
    # First load — stop xmrig
    start_cpu_load("cpu-load-1")
    machine.wait_until_fails("systemctl is-active xmrig", timeout=20)
    stop_cpu_load("cpu-load-1")

    # Brief idle gap — shorter than grace period
    time.sleep(2)

    # Second load arrives before grace period expires
    start_cpu_load("cpu-load-2")
    time.sleep(3)

    # xmrig must still be stopped
    machine.fail("systemctl is-active xmrig")

    # Once the second load ends, xmrig is expected to come back.
    stop_cpu_load("cpu-load-2")
    machine.wait_until_succeeds("systemctl is-active xmrig", timeout=20)
|
|
|
|
with subtest("Sustained load keeps xmrig stopped"):
    start_cpu_load("cpu-load-3")
    machine.wait_until_fails("systemctl is-active xmrig", timeout=20)

    # Stay busy longer than the grace period to prove continuous
    # activity keeps xmrig stopped indefinitely.
    time.sleep(8)
    machine.fail("systemctl is-active xmrig")

    # Release the load so the next subtest starts with xmrig running.
    stop_cpu_load("cpu-load-3")
    machine.wait_until_succeeds("systemctl is-active xmrig", timeout=20)
|
|
|
|
with subtest("External restart detected and re-stopped under load"):
    # Put system under load so auto-pause stops xmrig.
    start_cpu_load("cpu-load-4")
    machine.wait_until_fails("systemctl is-active xmrig", timeout=20)

    # Something external starts xmrig while load is active.
    # The script should detect this and re-stop it.
    machine.succeed("systemctl start xmrig")
    # NOTE(review): this check can race with the monitor's next poll
    # (POLL_INTERVAL is 1s); if it ever flakes, consider dropping it,
    # since a successful "systemctl start" already implies the unit ran.
    machine.succeed("systemctl is-active xmrig")
    machine.wait_until_fails("systemctl is-active xmrig", timeout=20)

    stop_cpu_load("cpu-load-4")
    machine.wait_until_succeeds("systemctl is-active xmrig", timeout=20)
|
|
|
|
# --- State persistence and crash recovery ---
# The remaining subtests each launch their own monitor instance, so the
# one started at the beginning must go first.
machine.succeed("systemctl stop xmrig-auto-pause")

with subtest("xmrig recovers after crash during startup cooldown"):
    # Start from a clean state directory.
    machine.succeed(f"rm -rf {STATE_DIR} && mkdir -p {STATE_DIR}")
    start_monitor("xmrig-auto-pause-crash")

    # Load -> xmrig stops
    start_cpu_load("cpu-crash")
    machine.wait_until_fails("systemctl is-active xmrig", timeout=20)

    # End load -> xmrig restarts after grace period
    stop_cpu_load("cpu-crash")
    machine.wait_until_succeeds("systemctl is-active xmrig", timeout=30)

    # Kill xmrig immediately — simulates crash during startup cooldown.
    # The script should detect the failure when cooldown expires and
    # re-enter the retry cycle.
    machine.succeed("systemctl kill --signal=KILL xmrig")
    machine.wait_until_fails("systemctl is-active xmrig", timeout=5)

    # After cooldown + grace period + restart, xmrig should be back.
    machine.wait_until_succeeds("systemctl is-active xmrig", timeout=30)

    # Tear down this monitor and leave xmrig running for the next subtest.
    machine.succeed("systemctl stop xmrig-auto-pause-crash")
    machine.succeed("systemctl reset-failed xmrig.service || true")
    machine.succeed("systemctl start xmrig")
    machine.wait_for_unit("xmrig.service")
|
|
|
|
with subtest("Script restart preserves pause state"):
    # Start from a clean state directory.
    machine.succeed(f"rm -rf {STATE_DIR} && mkdir -p {STATE_DIR}")
    start_monitor("xmrig-auto-pause-persist")

    # Load -> xmrig stops
    start_cpu_load("cpu-persist")
    machine.wait_until_fails("systemctl is-active xmrig", timeout=20)

    # Stop the monitor while xmrig is paused (stands in for the script
    # going away mid-pause)
    machine.succeed("systemctl stop xmrig-auto-pause-persist")

    # State file must exist — the monitor persisted the pause flag
    machine.succeed(f"test -f {STATE_DIR}/paused")

    # Start a fresh monitor instance (reads state file on startup)
    start_monitor("xmrig-auto-pause-persist2")

    # End load — the new monitor should pick up the paused state
    # and restart xmrig after the grace period
    stop_cpu_load("cpu-persist")
    machine.wait_until_succeeds("systemctl is-active xmrig", timeout=30)

    # State file should be cleaned up after successful restart
    machine.fail(f"test -f {STATE_DIR}/paused")

    machine.succeed("systemctl stop xmrig-auto-pause-persist2")
|
|
'';
|
|
}
|