xmrig-auto-pause: persist pause state across restarts; retry after startup crash

Reviewer notes (text before the first file header is not part of the patch):
  * Line breaks and indentation were reconstructed from a whitespace-collapsed
    copy of this patch. Hunk line counts were checked against the @@ headers,
    but context-line indentation is re-derived from syntax; if it does not
    match the tree, apply with `git apply --ignore-whitespace`.
  * _save_paused() now logs an OSError instead of discarding it, so a missing
    or unwritable STATE_DIR no longer disables persistence silently. The
    blob hashes on the "index" line predate this change.

diff --git a/services/monero/xmrig-auto-pause.nix b/services/monero/xmrig-auto-pause.nix
index 1d1e79c..4161e34 100644
--- a/services/monero/xmrig-auto-pause.nix
+++ b/services/monero/xmrig-auto-pause.nix
@@ -21,12 +21,14 @@ lib.mkIf config.services.xmrig.enable {
         "AF_UNIX" # systemctl talks to systemd over D-Bus unix socket
       ];
       MemoryDenyWriteExecute = true;
+      StateDirectory = "xmrig-auto-pause";
     };
     environment = {
       POLL_INTERVAL = "3";
       GRACE_PERIOD = "15";
       CPU_THRESHOLD = "5";
       STARTUP_COOLDOWN = "10";
+      STATE_DIR = "/var/lib/xmrig-auto-pause";
     };
   };
 }
diff --git a/services/monero/xmrig-auto-pause.py b/services/monero/xmrig-auto-pause.py
index 85c7ba0..2abd4ac 100644
--- a/services/monero/xmrig-auto-pause.py
+++ b/services/monero/xmrig-auto-pause.py
@@ -35,6 +35,11 @@ CPU_THRESHOLD = float(os.environ.get("CPU_THRESHOLD", "5"))
 # RandomX dataset initialization complete (~4s on the target hardware)
 # without retriggering a stop.
 STARTUP_COOLDOWN = float(os.environ.get("STARTUP_COOLDOWN", "10"))
+# Directory for persisting pause state across script restarts. Without
+# this, a restart while xmrig is paused loses the paused_by_us flag and
+# xmrig stays stopped permanently.
+STATE_DIR = os.environ.get("STATE_DIR", "")
+_PAUSE_FILE = os.path.join(STATE_DIR, "paused") if STATE_DIR else ""
 
 
 def log(msg):
@@ -78,13 +83,36 @@ def systemctl(action, unit):
     return result.returncode == 0
 
 
+def _save_paused(paused):
+    """Persist pause flag so a script restart can resume where we left off."""
+    if not _PAUSE_FILE:
+        return
+    try:
+        if paused:
+            open(_PAUSE_FILE, "w").close()
+        else:
+            os.remove(_PAUSE_FILE)
+    except OSError as e:
+        log(f"Failed to update pause state file {_PAUSE_FILE}: {e}")
+
+
+def _load_paused():
+    """Check if a previous instance left xmrig paused."""
+    if not _PAUSE_FILE:
+        return False
+    return os.path.isfile(_PAUSE_FILE)
+
+
 def main():
-    paused_by_us = False
+    paused_by_us = _load_paused()
     idle_since = None
     started_at = None  # monotonic time when we last started xmrig
     prev_total = None
     prev_work = None
 
+    if paused_by_us:
+        log("Recovered pause state from previous instance")
+
     log(
         f"Starting: poll={POLL_INTERVAL}s grace={GRACE_PERIOD}s "
         f"threshold={CPU_THRESHOLD}% cooldown={STARTUP_COOLDOWN}s"
@@ -116,6 +144,14 @@ def main():
             if time.monotonic() - started_at < STARTUP_COOLDOWN:
                 time.sleep(POLL_INTERVAL)
                 continue
+            # Cooldown expired — verify xmrig survived startup. If it
+            # crashed during init (hugepage failure, pool unreachable, etc.),
+            # re-enter the pause/retry cycle rather than silently leaving
+            # xmrig dead.
+            if not is_active("xmrig.service"):
+                log("xmrig died during startup cooldown — will retry")
+                paused_by_us = True
+                _save_paused(True)
             started_at = None
 
         busy = real_work_pct > CPU_THRESHOLD
@@ -128,6 +164,7 @@ def main():
                 # manage it again.
                 log("xmrig was restarted externally while paused — reclaiming")
                 paused_by_us = False
+                _save_paused(False)
             if not paused_by_us:
                 # Only claim ownership if xmrig is actually running.
                 # If something else stopped it (e.g. UPS battery hook),
@@ -136,6 +173,7 @@ def main():
                     log(f"Real workload detected ({real_work_pct:.1f}% CPU) — stopping xmrig")
                     if systemctl("stop", "xmrig.service"):
                         paused_by_us = True
+                        _save_paused(True)
         else:
             if paused_by_us:
                 if idle_since is None:
@@ -144,6 +182,7 @@ def main():
                     log(f"Workload ended ({real_work_pct:.1f}% CPU) past grace period — starting xmrig")
                     if systemctl("start", "xmrig.service"):
                         paused_by_us = False
+                        _save_paused(False)
                         started_at = time.monotonic()
                         idle_since = None
 
diff --git a/tests/xmrig-auto-pause.nix b/tests/xmrig-auto-pause.nix
index 20dc7d8..33796e6 100644
--- a/tests/xmrig-auto-pause.nix
+++ b/tests/xmrig-auto-pause.nix
@@ -44,6 +44,7 @@ pkgs.testers.runNixOSTest {
     GRACE_PERIOD = "5"
     CPU_THRESHOLD = "10"
     STARTUP_COOLDOWN = "4"
+    STATE_DIR = "/tmp/xap-state"
 
     def start_cpu_load(name):
         """Start a non-nice CPU burn as a transient systemd unit."""
@@ -55,22 +56,28 @@ pkgs.testers.runNixOSTest {
     def stop_cpu_load(name):
         machine.succeed(f"systemctl stop {name}")
 
-    start_all()
-    machine.wait_for_unit("multi-user.target")
-    machine.wait_for_unit("xmrig.service")
-
-    with subtest("Start auto-pause monitor"):
+    def start_monitor(unit_name):
+        """Start the auto-pause monitor as a transient unit."""
         machine.succeed(
-            f"systemd-run --unit=xmrig-auto-pause "
+            f"systemd-run --unit={unit_name} "
             f"--setenv=POLL_INTERVAL={POLL_INTERVAL} "
             f"--setenv=GRACE_PERIOD={GRACE_PERIOD} "
             f"--setenv=CPU_THRESHOLD={CPU_THRESHOLD} "
             f"--setenv=STARTUP_COOLDOWN={STARTUP_COOLDOWN} "
+            f"--setenv=STATE_DIR={STATE_DIR} "
             f"{PYTHON} {SCRIPT}"
         )
         # Monitor needs two consecutive polls to compute a CPU delta.
         time.sleep(3)
 
+    start_all()
+    machine.wait_for_unit("multi-user.target")
+    machine.wait_for_unit("xmrig.service")
+    machine.succeed(f"mkdir -p {STATE_DIR}")
+
+    with subtest("Start auto-pause monitor"):
+        start_monitor("xmrig-auto-pause")
+
     with subtest("xmrig stays running while system is idle"):
         machine.succeed("systemctl is-active xmrig")
 
@@ -134,5 +141,61 @@ pkgs.testers.runNixOSTest {
 
         stop_cpu_load("cpu-load-4")
         machine.wait_until_succeeds("systemctl is-active xmrig", timeout=20)
+
+    # --- State persistence and crash recovery ---
+    machine.succeed("systemctl stop xmrig-auto-pause")
+
+    with subtest("xmrig recovers after crash during startup cooldown"):
+        machine.succeed(f"rm -rf {STATE_DIR} && mkdir -p {STATE_DIR}")
+        start_monitor("xmrig-auto-pause-crash")
+
+        # Load -> xmrig stops
+        start_cpu_load("cpu-crash")
+        machine.wait_until_fails("systemctl is-active xmrig", timeout=20)
+
+        # End load -> xmrig restarts after grace period
+        stop_cpu_load("cpu-crash")
+        machine.wait_until_succeeds("systemctl is-active xmrig", timeout=30)
+
+        # Kill xmrig immediately — simulates crash during startup cooldown.
+        # The script should detect the failure when cooldown expires and
+        # re-enter the retry cycle.
+        machine.succeed("systemctl kill --signal=KILL xmrig")
+        machine.wait_until_fails("systemctl is-active xmrig", timeout=5)
+
+        # After cooldown + grace period + restart, xmrig should be back.
+        machine.wait_until_succeeds("systemctl is-active xmrig", timeout=30)
+
+        machine.succeed("systemctl stop xmrig-auto-pause-crash")
+        machine.succeed("systemctl reset-failed xmrig.service || true")
+        machine.succeed("systemctl start xmrig")
+        machine.wait_for_unit("xmrig.service")
+
+    with subtest("Script restart preserves pause state"):
+        machine.succeed(f"rm -rf {STATE_DIR} && mkdir -p {STATE_DIR}")
+        start_monitor("xmrig-auto-pause-persist")
+
+        # Load -> xmrig stops
+        start_cpu_load("cpu-persist")
+        machine.wait_until_fails("systemctl is-active xmrig", timeout=20)
+
+        # Kill the monitor while xmrig is paused (simulates script crash)
+        machine.succeed("systemctl stop xmrig-auto-pause-persist")
+
+        # State file must exist — the monitor persisted the pause flag
+        machine.succeed(f"test -f {STATE_DIR}/paused")
+
+        # Start a fresh monitor instance (reads state file on startup)
+        start_monitor("xmrig-auto-pause-persist2")
+
+        # End load — the new monitor should pick up the paused state
+        # and restart xmrig after the grace period
+        stop_cpu_load("cpu-persist")
+        machine.wait_until_succeeds("systemctl is-active xmrig", timeout=30)
+
+        # State file should be cleaned up after successful restart
+        machine.fail(f"test -f {STATE_DIR}/paused")
+
+        machine.succeed("systemctl stop xmrig-auto-pause-persist2")
   '';
 }