xmrig-auto-pause: fix stuck state after external restart, add startup cooldown
All checks were successful
Build and Deploy / deploy (push) Successful in 8m47s
All checks were successful
Build and Deploy / deploy (push) Successful in 8m47s
Two bugs were found during live verification on the server:

1. Stuck state after external restart: if something else restarted xmrig (e.g. deploy-rs activation) while paused_by_us=True, the script never detected this and became permanently stuck — unable to stop xmrig on future load because it thought xmrig was already stopped.
   Fix: when paused_by_us=True and the system is busy, check whether xmrig is actually running. If so, reset paused_by_us=False and re-stop it.

2. Flapping on xmrig restart: RandomX dataset init takes ~3.7s of intense non-nice CPU, which the script detected as a real workload, so it immediately re-stopped xmrig after every restart, creating a start-stop loop.
   Fix: add STARTUP_COOLDOWN (default 10s) — after starting xmrig, skip CPU checks until the cooldown expires.

Both bugs were present in production: the script had been stuck since Apr 3 (2+ days), with xmrig running unmanaged alongside llama-server.
This commit is contained in:
@@ -26,6 +26,7 @@ lib.mkIf config.services.xmrig.enable {
|
|||||||
POLL_INTERVAL = "3";
|
POLL_INTERVAL = "3";
|
||||||
GRACE_PERIOD = "15";
|
GRACE_PERIOD = "15";
|
||||||
CPU_THRESHOLD = "5";
|
CPU_THRESHOLD = "5";
|
||||||
|
STARTUP_COOLDOWN = "10";
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -31,6 +31,10 @@ GRACE_PERIOD = float(os.environ.get("GRACE_PERIOD", "15"))
|
|||||||
# a pause. On a 12-thread system, one fully loaded core ≈ 8.3% of total.
|
# a pause. On a 12-thread system, one fully loaded core ≈ 8.3% of total.
|
||||||
# Default 5% catches anything using more than ~60% of a single core.
|
# Default 5% catches anything using more than ~60% of a single core.
|
||||||
CPU_THRESHOLD = float(os.environ.get("CPU_THRESHOLD", "5"))
|
CPU_THRESHOLD = float(os.environ.get("CPU_THRESHOLD", "5"))
|
||||||
|
# After starting xmrig, ignore CPU spikes for this many seconds to let
|
||||||
|
# RandomX dataset initialization complete (~4s on the target hardware)
|
||||||
|
# without retriggering a stop.
|
||||||
|
STARTUP_COOLDOWN = float(os.environ.get("STARTUP_COOLDOWN", "10"))
|
||||||
|
|
||||||
|
|
||||||
def log(msg):
|
def log(msg):
|
||||||
@@ -77,10 +81,14 @@ def systemctl(action, unit):
|
|||||||
def main():
|
def main():
|
||||||
paused_by_us = False
|
paused_by_us = False
|
||||||
idle_since = None
|
idle_since = None
|
||||||
|
started_at = None # monotonic time when we last started xmrig
|
||||||
prev_total = None
|
prev_total = None
|
||||||
prev_work = None
|
prev_work = None
|
||||||
|
|
||||||
log(f"Starting: poll={POLL_INTERVAL}s grace={GRACE_PERIOD}s threshold={CPU_THRESHOLD}%")
|
log(
|
||||||
|
f"Starting: poll={POLL_INTERVAL}s grace={GRACE_PERIOD}s "
|
||||||
|
f"threshold={CPU_THRESHOLD}% cooldown={STARTUP_COOLDOWN}s"
|
||||||
|
)
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
total, work = read_cpu_ticks()
|
total, work = read_cpu_ticks()
|
||||||
@@ -102,10 +110,24 @@ def main():
|
|||||||
prev_total = total
|
prev_total = total
|
||||||
prev_work = work
|
prev_work = work
|
||||||
|
|
||||||
|
# Don't act during startup cooldown — RandomX dataset init causes
|
||||||
|
# a transient CPU spike that would immediately retrigger a stop.
|
||||||
|
if started_at is not None:
|
||||||
|
if time.monotonic() - started_at < STARTUP_COOLDOWN:
|
||||||
|
time.sleep(POLL_INTERVAL)
|
||||||
|
continue
|
||||||
|
started_at = None
|
||||||
|
|
||||||
busy = real_work_pct > CPU_THRESHOLD
|
busy = real_work_pct > CPU_THRESHOLD
|
||||||
|
|
||||||
if busy:
|
if busy:
|
||||||
idle_since = None
|
idle_since = None
|
||||||
|
if paused_by_us and is_active("xmrig.service"):
|
||||||
|
# Something else restarted xmrig (deploy, manual start, etc.)
|
||||||
|
# while we thought it was stopped. Reset ownership so we can
|
||||||
|
# manage it again.
|
||||||
|
log("xmrig was restarted externally while paused — reclaiming")
|
||||||
|
paused_by_us = False
|
||||||
if not paused_by_us:
|
if not paused_by_us:
|
||||||
# Only claim ownership if xmrig is actually running.
|
# Only claim ownership if xmrig is actually running.
|
||||||
# If something else stopped it (e.g. UPS battery hook),
|
# If something else stopped it (e.g. UPS battery hook),
|
||||||
@@ -122,6 +144,7 @@ def main():
|
|||||||
log(f"Workload ended ({real_work_pct:.1f}% CPU) past grace period — starting xmrig")
|
log(f"Workload ended ({real_work_pct:.1f}% CPU) past grace period — starting xmrig")
|
||||||
if systemctl("start", "xmrig.service"):
|
if systemctl("start", "xmrig.service"):
|
||||||
paused_by_us = False
|
paused_by_us = False
|
||||||
|
started_at = time.monotonic()
|
||||||
idle_since = None
|
idle_since = None
|
||||||
|
|
||||||
time.sleep(POLL_INTERVAL)
|
time.sleep(POLL_INTERVAL)
|
||||||
|
|||||||
@@ -43,6 +43,7 @@ pkgs.testers.runNixOSTest {
|
|||||||
POLL_INTERVAL = "1"
|
POLL_INTERVAL = "1"
|
||||||
GRACE_PERIOD = "5"
|
GRACE_PERIOD = "5"
|
||||||
CPU_THRESHOLD = "10"
|
CPU_THRESHOLD = "10"
|
||||||
|
STARTUP_COOLDOWN = "4"
|
||||||
|
|
||||||
def start_cpu_load(name):
|
def start_cpu_load(name):
|
||||||
"""Start a non-nice CPU burn as a transient systemd unit."""
|
"""Start a non-nice CPU burn as a transient systemd unit."""
|
||||||
@@ -64,6 +65,7 @@ pkgs.testers.runNixOSTest {
|
|||||||
f"--setenv=POLL_INTERVAL={POLL_INTERVAL} "
|
f"--setenv=POLL_INTERVAL={POLL_INTERVAL} "
|
||||||
f"--setenv=GRACE_PERIOD={GRACE_PERIOD} "
|
f"--setenv=GRACE_PERIOD={GRACE_PERIOD} "
|
||||||
f"--setenv=CPU_THRESHOLD={CPU_THRESHOLD} "
|
f"--setenv=CPU_THRESHOLD={CPU_THRESHOLD} "
|
||||||
|
f"--setenv=STARTUP_COOLDOWN={STARTUP_COOLDOWN} "
|
||||||
f"{PYTHON} {SCRIPT}"
|
f"{PYTHON} {SCRIPT}"
|
||||||
)
|
)
|
||||||
# Monitor needs two consecutive polls to compute a CPU delta.
|
# Monitor needs two consecutive polls to compute a CPU delta.
|
||||||
@@ -84,8 +86,9 @@ pkgs.testers.runNixOSTest {
|
|||||||
|
|
||||||
with subtest("xmrig resumes after grace period expires"):
|
with subtest("xmrig resumes after grace period expires"):
|
||||||
# Already idle since previous subtest. Grace period (5s) plus
|
# Already idle since previous subtest. Grace period (5s) plus
|
||||||
# detection delay (~2 polls) means xmrig should restart within ~8s.
|
# detection delay (~2 polls) plus startup cooldown (4s) means
|
||||||
machine.wait_until_succeeds("systemctl is-active xmrig", timeout=15)
|
# xmrig should restart within ~12s.
|
||||||
|
machine.wait_until_succeeds("systemctl is-active xmrig", timeout=20)
|
||||||
|
|
||||||
with subtest("Intermittent load does not cause flapping"):
|
with subtest("Intermittent load does not cause flapping"):
|
||||||
# First load — stop xmrig
|
# First load — stop xmrig
|
||||||
@@ -104,7 +107,7 @@ pkgs.testers.runNixOSTest {
|
|||||||
machine.fail("systemctl is-active xmrig")
|
machine.fail("systemctl is-active xmrig")
|
||||||
|
|
||||||
stop_cpu_load("cpu-load-2")
|
stop_cpu_load("cpu-load-2")
|
||||||
machine.wait_until_succeeds("systemctl is-active xmrig", timeout=15)
|
machine.wait_until_succeeds("systemctl is-active xmrig", timeout=20)
|
||||||
|
|
||||||
with subtest("Sustained load keeps xmrig stopped"):
|
with subtest("Sustained load keeps xmrig stopped"):
|
||||||
start_cpu_load("cpu-load-3")
|
start_cpu_load("cpu-load-3")
|
||||||
@@ -116,6 +119,20 @@ pkgs.testers.runNixOSTest {
|
|||||||
machine.fail("systemctl is-active xmrig")
|
machine.fail("systemctl is-active xmrig")
|
||||||
|
|
||||||
stop_cpu_load("cpu-load-3")
|
stop_cpu_load("cpu-load-3")
|
||||||
machine.wait_until_succeeds("systemctl is-active xmrig", timeout=15)
|
machine.wait_until_succeeds("systemctl is-active xmrig", timeout=20)
|
||||||
|
|
||||||
|
with subtest("External restart detected and re-stopped under load"):
|
||||||
|
# Put system under load so auto-pause stops xmrig.
|
||||||
|
start_cpu_load("cpu-load-4")
|
||||||
|
machine.wait_until_fails("systemctl is-active xmrig", timeout=20)
|
||||||
|
|
||||||
|
# Something external starts xmrig while load is active.
|
||||||
|
# The script should detect this and re-stop it.
|
||||||
|
machine.succeed("systemctl start xmrig")
|
||||||
|
machine.succeed("systemctl is-active xmrig")
|
||||||
|
machine.wait_until_fails("systemctl is-active xmrig", timeout=20)
|
||||||
|
|
||||||
|
stop_cpu_load("cpu-load-4")
|
||||||
|
machine.wait_until_succeeds("systemctl is-active xmrig", timeout=20)
|
||||||
'';
|
'';
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user