diff --git a/services/monero/xmrig-auto-pause.nix b/services/monero/xmrig-auto-pause.nix index 12dc475..1d1e79c 100644 --- a/services/monero/xmrig-auto-pause.nix +++ b/services/monero/xmrig-auto-pause.nix @@ -26,6 +26,7 @@ lib.mkIf config.services.xmrig.enable { POLL_INTERVAL = "3"; GRACE_PERIOD = "15"; CPU_THRESHOLD = "5"; + STARTUP_COOLDOWN = "10"; }; }; } diff --git a/services/monero/xmrig-auto-pause.py b/services/monero/xmrig-auto-pause.py index 2aafd7e..85c7ba0 100644 --- a/services/monero/xmrig-auto-pause.py +++ b/services/monero/xmrig-auto-pause.py @@ -31,6 +31,10 @@ GRACE_PERIOD = float(os.environ.get("GRACE_PERIOD", "15")) # a pause. On a 12-thread system, one fully loaded core ≈ 8.3% of total. # Default 5% catches anything using more than ~60% of a single core. CPU_THRESHOLD = float(os.environ.get("CPU_THRESHOLD", "5")) +# After starting xmrig, ignore CPU spikes for this many seconds to let +# RandomX dataset initialization complete (~4s on the target hardware) +# without retriggering a stop. +STARTUP_COOLDOWN = float(os.environ.get("STARTUP_COOLDOWN", "10")) def log(msg): @@ -77,10 +81,14 @@ def systemctl(action, unit): def main(): paused_by_us = False idle_since = None + started_at = None # monotonic time when we last started xmrig prev_total = None prev_work = None - log(f"Starting: poll={POLL_INTERVAL}s grace={GRACE_PERIOD}s threshold={CPU_THRESHOLD}%") + log( + f"Starting: poll={POLL_INTERVAL}s grace={GRACE_PERIOD}s " + f"threshold={CPU_THRESHOLD}% cooldown={STARTUP_COOLDOWN}s" + ) while True: total, work = read_cpu_ticks() @@ -102,10 +110,24 @@ def main(): prev_total = total prev_work = work + # Don't act during startup cooldown — RandomX dataset init causes + # a transient CPU spike that would immediately retrigger a stop. + if started_at is not None: + if time.monotonic() - started_at < STARTUP_COOLDOWN: + time.sleep(POLL_INTERVAL) + continue + started_at = None + busy = real_work_pct > CPU_THRESHOLD if busy: idle_since = None + if paused_by_us and is_active("xmrig.service"): + # Something else restarted xmrig (deploy, manual start, etc.) + # while we thought it was stopped. Reset ownership so we can + # manage it again. + log("xmrig was restarted externally while paused — reclaiming") + paused_by_us = False if not paused_by_us: # Only claim ownership if xmrig is actually running. # If something else stopped it (e.g. UPS battery hook), @@ -122,6 +144,7 @@ def main(): log(f"Workload ended ({real_work_pct:.1f}% CPU) past grace period — starting xmrig") if systemctl("start", "xmrig.service"): paused_by_us = False + started_at = time.monotonic() idle_since = None time.sleep(POLL_INTERVAL) diff --git a/tests/xmrig-auto-pause.nix b/tests/xmrig-auto-pause.nix index ba8d225..20dc7d8 100644 --- a/tests/xmrig-auto-pause.nix +++ b/tests/xmrig-auto-pause.nix @@ -43,6 +43,7 @@ pkgs.testers.runNixOSTest { POLL_INTERVAL = "1" GRACE_PERIOD = "5" CPU_THRESHOLD = "10" + STARTUP_COOLDOWN = "4" def start_cpu_load(name): """Start a non-nice CPU burn as a transient systemd unit.""" @@ -64,6 +65,7 @@ pkgs.testers.runNixOSTest { f"--setenv=POLL_INTERVAL={POLL_INTERVAL} " f"--setenv=GRACE_PERIOD={GRACE_PERIOD} " f"--setenv=CPU_THRESHOLD={CPU_THRESHOLD} " + f"--setenv=STARTUP_COOLDOWN={STARTUP_COOLDOWN} " f"{PYTHON} {SCRIPT}" ) # Monitor needs two consecutive polls to compute a CPU delta. @@ -84,8 +86,9 @@ pkgs.testers.runNixOSTest { with subtest("xmrig resumes after grace period expires"): # Already idle since previous subtest. Grace period (5s) plus - # detection delay (~2 polls) means xmrig should restart within ~8s. - machine.wait_until_succeeds("systemctl is-active xmrig", timeout=15) + # detection delay (~2 polls) plus startup cooldown (4s) means + # xmrig should restart within ~12s. + machine.wait_until_succeeds("systemctl is-active xmrig", timeout=20) with subtest("Intermittent load does not cause flapping"): # First load — stop xmrig @@ -104,7 +107,7 @@ pkgs.testers.runNixOSTest { machine.fail("systemctl is-active xmrig") stop_cpu_load("cpu-load-2") - machine.wait_until_succeeds("systemctl is-active xmrig", timeout=15) + machine.wait_until_succeeds("systemctl is-active xmrig", timeout=20) with subtest("Sustained load keeps xmrig stopped"): start_cpu_load("cpu-load-3") @@ -116,6 +119,20 @@ pkgs.testers.runNixOSTest { machine.fail("systemctl is-active xmrig") stop_cpu_load("cpu-load-3") - machine.wait_until_succeeds("systemctl is-active xmrig", timeout=15) + machine.wait_until_succeeds("systemctl is-active xmrig", timeout=20) + + with subtest("External restart detected and re-stopped under load"): + # Put system under load so auto-pause stops xmrig. + start_cpu_load("cpu-load-4") + machine.wait_until_fails("systemctl is-active xmrig", timeout=20) + + # Something external starts xmrig while load is active. + # The script should detect this and re-stop it. + machine.succeed("systemctl start xmrig") + machine.succeed("systemctl is-active xmrig") + machine.wait_until_fails("systemctl is-active xmrig", timeout=20) + + stop_cpu_load("cpu-load-4") + machine.wait_until_succeeds("systemctl is-active xmrig", timeout=20) ''; }