xmrig-auto-pause: fix stuck state after external restart, add startup cooldown
Two bugs found during live verification on the server:

1. Stuck state after external restart: if something else restarted xmrig (e.g. deploy-rs activation) while paused_by_us=True, the script never detected this and became permanently stuck — unable to stop xmrig on future load because it thought xmrig was already stopped.
   Fix: when paused_by_us=True and the system is busy, check whether xmrig is actually running. If so, reset paused_by_us=False and re-stop it.

2. Flapping on xmrig restart: RandomX dataset init takes ~3.7s of intense non-nice CPU, which the script detected as real workload, so it immediately re-stopped xmrig after every restart, creating a start-stop loop.
   Fix: add STARTUP_COOLDOWN (default 10s) — after starting xmrig, skip CPU checks until the cooldown expires.

Both bugs were present in production: the script had been stuck since Apr 3 (2+ days) with xmrig running unmanaged alongside llama-server.
This commit is contained in:
@@ -43,6 +43,7 @@ pkgs.testers.runNixOSTest {
|
||||
POLL_INTERVAL = "1"
|
||||
GRACE_PERIOD = "5"
|
||||
CPU_THRESHOLD = "10"
|
||||
STARTUP_COOLDOWN = "4"
|
||||
|
||||
def start_cpu_load(name):
|
||||
"""Start a non-nice CPU burn as a transient systemd unit."""
|
||||
@@ -64,6 +65,7 @@ pkgs.testers.runNixOSTest {
|
||||
f"--setenv=POLL_INTERVAL={POLL_INTERVAL} "
|
||||
f"--setenv=GRACE_PERIOD={GRACE_PERIOD} "
|
||||
f"--setenv=CPU_THRESHOLD={CPU_THRESHOLD} "
|
||||
f"--setenv=STARTUP_COOLDOWN={STARTUP_COOLDOWN} "
|
||||
f"{PYTHON} {SCRIPT}"
|
||||
)
|
||||
# Monitor needs two consecutive polls to compute a CPU delta.
|
||||
@@ -84,8 +86,9 @@ pkgs.testers.runNixOSTest {
|
||||
|
||||
with subtest("xmrig resumes after grace period expires"):
|
||||
# Already idle since previous subtest. Grace period (5s) plus
|
||||
# detection delay (~2 polls) means xmrig should restart within ~8s.
|
||||
machine.wait_until_succeeds("systemctl is-active xmrig", timeout=15)
|
||||
# detection delay (~2 polls) plus startup cooldown (4s) means
|
||||
# xmrig should restart within ~12s.
|
||||
machine.wait_until_succeeds("systemctl is-active xmrig", timeout=20)
|
||||
|
||||
with subtest("Intermittent load does not cause flapping"):
|
||||
# First load — stop xmrig
|
||||
@@ -104,7 +107,7 @@ pkgs.testers.runNixOSTest {
|
||||
machine.fail("systemctl is-active xmrig")
|
||||
|
||||
stop_cpu_load("cpu-load-2")
|
||||
machine.wait_until_succeeds("systemctl is-active xmrig", timeout=15)
|
||||
machine.wait_until_succeeds("systemctl is-active xmrig", timeout=20)
|
||||
|
||||
with subtest("Sustained load keeps xmrig stopped"):
|
||||
start_cpu_load("cpu-load-3")
|
||||
@@ -116,6 +119,20 @@ pkgs.testers.runNixOSTest {
|
||||
machine.fail("systemctl is-active xmrig")
|
||||
|
||||
stop_cpu_load("cpu-load-3")
|
||||
machine.wait_until_succeeds("systemctl is-active xmrig", timeout=15)
|
||||
machine.wait_until_succeeds("systemctl is-active xmrig", timeout=20)
|
||||
|
||||
with subtest("External restart detected and re-stopped under load"):
|
||||
# Put system under load so auto-pause stops xmrig.
|
||||
start_cpu_load("cpu-load-4")
|
||||
machine.wait_until_fails("systemctl is-active xmrig", timeout=20)
|
||||
|
||||
# Something external starts xmrig while load is active.
|
||||
# The script should detect this and re-stop it.
|
||||
machine.succeed("systemctl start xmrig")
|
||||
machine.succeed("systemctl is-active xmrig")
|
||||
machine.wait_until_fails("systemctl is-active xmrig", timeout=20)
|
||||
|
||||
stop_cpu_load("cpu-load-4")
|
||||
machine.wait_until_succeeds("systemctl is-active xmrig", timeout=20)
|
||||
'';
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user