From bbcd662c28b56f150354bb7c923e6b5dd5351061 Mon Sep 17 00:00:00 2001
From: Simon Gardling
Date: Sun, 5 Apr 2026 23:20:47 -0400
Subject: [PATCH] xmrig-auto-pause: fix stuck state after external restart, add startup cooldown
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two bugs found during live verification on the server:

1. Stuck state after external restart: if something else restarted xmrig
   (e.g. deploy-rs activation) while paused_by_us=True, the script never
   detected this and became permanently stuck — unable to stop xmrig on
   future load because it thought xmrig was already stopped.

   Fix: when paused_by_us=True and busy, check if xmrig is actually
   running. If so, reset paused_by_us=False and re-stop it.

2. Flapping on xmrig restart: RandomX dataset init takes ~3.7s of intense
   non-nice CPU, which the script detected as real workload and
   immediately re-stopped xmrig after every restart, creating a start-stop
   loop.

   Fix: add STARTUP_COOLDOWN (default 10s) — after starting xmrig, skip
   CPU checks until the cooldown expires.

Both bugs were present in production: the script had been stuck since
Apr 3 (2+ days) with xmrig running unmanaged alongside llama-server.
--- services/monero/xmrig-auto-pause.nix | 1 + services/monero/xmrig-auto-pause.py | 25 ++++++++++++++++++++++++- tests/xmrig-auto-pause.nix | 25 +++++++++++++++++++++---- 3 files changed, 46 insertions(+), 5 deletions(-) diff --git a/services/monero/xmrig-auto-pause.nix b/services/monero/xmrig-auto-pause.nix index 12dc475..1d1e79c 100644 --- a/services/monero/xmrig-auto-pause.nix +++ b/services/monero/xmrig-auto-pause.nix @@ -26,6 +26,7 @@ lib.mkIf config.services.xmrig.enable { POLL_INTERVAL = "3"; GRACE_PERIOD = "15"; CPU_THRESHOLD = "5"; + STARTUP_COOLDOWN = "10"; }; }; } diff --git a/services/monero/xmrig-auto-pause.py b/services/monero/xmrig-auto-pause.py index 2aafd7e..85c7ba0 100644 --- a/services/monero/xmrig-auto-pause.py +++ b/services/monero/xmrig-auto-pause.py @@ -31,6 +31,10 @@ GRACE_PERIOD = float(os.environ.get("GRACE_PERIOD", "15")) # a pause. On a 12-thread system, one fully loaded core ≈ 8.3% of total. # Default 5% catches anything using more than ~60% of a single core. CPU_THRESHOLD = float(os.environ.get("CPU_THRESHOLD", "5")) +# After starting xmrig, ignore CPU spikes for this many seconds to let +# RandomX dataset initialization complete (~4s on the target hardware) +# without retriggering a stop. +STARTUP_COOLDOWN = float(os.environ.get("STARTUP_COOLDOWN", "10")) def log(msg): @@ -77,10 +81,14 @@ def systemctl(action, unit): def main(): paused_by_us = False idle_since = None + started_at = None # monotonic time when we last started xmrig prev_total = None prev_work = None - log(f"Starting: poll={POLL_INTERVAL}s grace={GRACE_PERIOD}s threshold={CPU_THRESHOLD}%") + log( + f"Starting: poll={POLL_INTERVAL}s grace={GRACE_PERIOD}s " + f"threshold={CPU_THRESHOLD}% cooldown={STARTUP_COOLDOWN}s" + ) while True: total, work = read_cpu_ticks() @@ -102,10 +110,24 @@ def main(): prev_total = total prev_work = work + # Don't act during startup cooldown — RandomX dataset init causes + # a transient CPU spike that would immediately retrigger a stop. 
+ if started_at is not None: + if time.monotonic() - started_at < STARTUP_COOLDOWN: + time.sleep(POLL_INTERVAL) + continue + started_at = None + busy = real_work_pct > CPU_THRESHOLD if busy: idle_since = None + if paused_by_us and is_active("xmrig.service"): + # Something else restarted xmrig (deploy, manual start, etc.) + # while we thought it was stopped. Reset ownership so we can + # manage it again. + log("xmrig was restarted externally while paused — reclaiming") + paused_by_us = False if not paused_by_us: # Only claim ownership if xmrig is actually running. # If something else stopped it (e.g. UPS battery hook), @@ -122,6 +144,7 @@ def main(): log(f"Workload ended ({real_work_pct:.1f}% CPU) past grace period — starting xmrig") if systemctl("start", "xmrig.service"): paused_by_us = False + started_at = time.monotonic() idle_since = None time.sleep(POLL_INTERVAL) diff --git a/tests/xmrig-auto-pause.nix b/tests/xmrig-auto-pause.nix index ba8d225..20dc7d8 100644 --- a/tests/xmrig-auto-pause.nix +++ b/tests/xmrig-auto-pause.nix @@ -43,6 +43,7 @@ pkgs.testers.runNixOSTest { POLL_INTERVAL = "1" GRACE_PERIOD = "5" CPU_THRESHOLD = "10" + STARTUP_COOLDOWN = "4" def start_cpu_load(name): """Start a non-nice CPU burn as a transient systemd unit.""" @@ -64,6 +65,7 @@ pkgs.testers.runNixOSTest { f"--setenv=POLL_INTERVAL={POLL_INTERVAL} " f"--setenv=GRACE_PERIOD={GRACE_PERIOD} " f"--setenv=CPU_THRESHOLD={CPU_THRESHOLD} " + f"--setenv=STARTUP_COOLDOWN={STARTUP_COOLDOWN} " f"{PYTHON} {SCRIPT}" ) # Monitor needs two consecutive polls to compute a CPU delta. @@ -84,8 +86,9 @@ pkgs.testers.runNixOSTest { with subtest("xmrig resumes after grace period expires"): # Already idle since previous subtest. Grace period (5s) plus - # detection delay (~2 polls) means xmrig should restart within ~8s. - machine.wait_until_succeeds("systemctl is-active xmrig", timeout=15) + # detection delay (~2 polls) plus startup cooldown (4s) means + # xmrig should restart within ~12s. 
+ machine.wait_until_succeeds("systemctl is-active xmrig", timeout=20) with subtest("Intermittent load does not cause flapping"): # First load — stop xmrig @@ -104,7 +107,7 @@ pkgs.testers.runNixOSTest { machine.fail("systemctl is-active xmrig") stop_cpu_load("cpu-load-2") - machine.wait_until_succeeds("systemctl is-active xmrig", timeout=15) + machine.wait_until_succeeds("systemctl is-active xmrig", timeout=20) with subtest("Sustained load keeps xmrig stopped"): start_cpu_load("cpu-load-3") @@ -116,6 +119,20 @@ pkgs.testers.runNixOSTest { machine.fail("systemctl is-active xmrig") stop_cpu_load("cpu-load-3") - machine.wait_until_succeeds("systemctl is-active xmrig", timeout=15) + machine.wait_until_succeeds("systemctl is-active xmrig", timeout=20) + + with subtest("External restart detected and re-stopped under load"): + # Put system under load so auto-pause stops xmrig. + start_cpu_load("cpu-load-4") + machine.wait_until_fails("systemctl is-active xmrig", timeout=20) + + # Something external starts xmrig while load is active. + # The script should detect this and re-stop it. + machine.succeed("systemctl start xmrig") + machine.succeed("systemctl is-active xmrig") + machine.wait_until_fails("systemctl is-active xmrig", timeout=20) + + stop_cpu_load("cpu-load-4") + machine.wait_until_succeeds("systemctl is-active xmrig", timeout=20) ''; }