xmrig-auto-pause: fix

This commit is contained in:
2026-04-06 13:11:54 -04:00
parent a12dcb01ec
commit 7afd1f35d2
3 changed files with 111 additions and 7 deletions

View File

@@ -21,12 +21,14 @@ lib.mkIf config.services.xmrig.enable {
"AF_UNIX" # systemctl talks to systemd over D-Bus unix socket "AF_UNIX" # systemctl talks to systemd over D-Bus unix socket
]; ];
MemoryDenyWriteExecute = true; MemoryDenyWriteExecute = true;
StateDirectory = "xmrig-auto-pause";
}; };
environment = { environment = {
POLL_INTERVAL = "3"; POLL_INTERVAL = "3";
GRACE_PERIOD = "15"; GRACE_PERIOD = "15";
CPU_THRESHOLD = "5"; CPU_THRESHOLD = "5";
STARTUP_COOLDOWN = "10"; STARTUP_COOLDOWN = "10";
STATE_DIR = "/var/lib/xmrig-auto-pause";
}; };
}; };
} }

View File

@@ -35,6 +35,11 @@ CPU_THRESHOLD = float(os.environ.get("CPU_THRESHOLD", "5"))
# RandomX dataset initialization complete (~4s on the target hardware) # RandomX dataset initialization complete (~4s on the target hardware)
# without retriggering a stop. # without retriggering a stop.
STARTUP_COOLDOWN = float(os.environ.get("STARTUP_COOLDOWN", "10")) STARTUP_COOLDOWN = float(os.environ.get("STARTUP_COOLDOWN", "10"))
# Directory for persisting pause state across script restarts. Without
# this, a restart while xmrig is paused loses the paused_by_us flag and
# xmrig stays stopped permanently.
STATE_DIR = os.environ.get("STATE_DIR", "")
_PAUSE_FILE = os.path.join(STATE_DIR, "paused") if STATE_DIR else ""
def log(msg): def log(msg):
@@ -78,13 +83,36 @@ def systemctl(action, unit):
return result.returncode == 0 return result.returncode == 0
def _save_paused(paused):
    """Persist the pause flag so a restarted script can resume where it left off.

    Creates the marker file at ``_PAUSE_FILE`` when *paused* is true and
    removes it otherwise. Does nothing when no state directory is
    configured (``_PAUSE_FILE`` is empty).

    Persistence is best-effort: failures are logged, never raised, so a
    problem with the state directory cannot take the monitor loop down.
    """
    if not _PAUSE_FILE:
        return
    try:
        if paused:
            # The marker carries no content; the file is created empty and
            # closed immediately.
            with open(_PAUSE_FILE, "w"):
                pass
        else:
            os.remove(_PAUSE_FILE)
    except OSError as exc:
        # Clearing a marker that is already gone is the desired end state,
        # so stay quiet about it. Everything else (missing state directory,
        # permissions, read-only filesystem, ...) means the flag was NOT
        # persisted and a restart may lose it -- make that visible.
        if paused or not isinstance(exc, FileNotFoundError):
            log(f"Failed to update pause state file {_PAUSE_FILE}: {exc}")
def _load_paused():
    """Report whether the pause marker file is present.

    Returns False when no state directory is configured (``_PAUSE_FILE``
    is empty); otherwise True exactly when ``_PAUSE_FILE`` exists as a
    regular file.
    """
    return bool(_PAUSE_FILE) and os.path.isfile(_PAUSE_FILE)
def main(): def main():
paused_by_us = False paused_by_us = _load_paused()
idle_since = None idle_since = None
started_at = None # monotonic time when we last started xmrig started_at = None # monotonic time when we last started xmrig
prev_total = None prev_total = None
prev_work = None prev_work = None
if paused_by_us:
log("Recovered pause state from previous instance")
log( log(
f"Starting: poll={POLL_INTERVAL}s grace={GRACE_PERIOD}s " f"Starting: poll={POLL_INTERVAL}s grace={GRACE_PERIOD}s "
f"threshold={CPU_THRESHOLD}% cooldown={STARTUP_COOLDOWN}s" f"threshold={CPU_THRESHOLD}% cooldown={STARTUP_COOLDOWN}s"
@@ -116,6 +144,14 @@ def main():
if time.monotonic() - started_at < STARTUP_COOLDOWN: if time.monotonic() - started_at < STARTUP_COOLDOWN:
time.sleep(POLL_INTERVAL) time.sleep(POLL_INTERVAL)
continue continue
# Cooldown expired — verify xmrig survived startup. If it
# crashed during init (hugepage failure, pool unreachable, etc.),
# re-enter the pause/retry cycle rather than silently leaving
# xmrig dead.
if not is_active("xmrig.service"):
log("xmrig died during startup cooldown — will retry")
paused_by_us = True
_save_paused(True)
started_at = None started_at = None
busy = real_work_pct > CPU_THRESHOLD busy = real_work_pct > CPU_THRESHOLD
@@ -128,6 +164,7 @@ def main():
# manage it again. # manage it again.
log("xmrig was restarted externally while paused — reclaiming") log("xmrig was restarted externally while paused — reclaiming")
paused_by_us = False paused_by_us = False
_save_paused(False)
if not paused_by_us: if not paused_by_us:
# Only claim ownership if xmrig is actually running. # Only claim ownership if xmrig is actually running.
# If something else stopped it (e.g. UPS battery hook), # If something else stopped it (e.g. UPS battery hook),
@@ -136,6 +173,7 @@ def main():
log(f"Real workload detected ({real_work_pct:.1f}% CPU) — stopping xmrig") log(f"Real workload detected ({real_work_pct:.1f}% CPU) — stopping xmrig")
if systemctl("stop", "xmrig.service"): if systemctl("stop", "xmrig.service"):
paused_by_us = True paused_by_us = True
_save_paused(True)
else: else:
if paused_by_us: if paused_by_us:
if idle_since is None: if idle_since is None:
@@ -144,6 +182,7 @@ def main():
log(f"Workload ended ({real_work_pct:.1f}% CPU) past grace period — starting xmrig") log(f"Workload ended ({real_work_pct:.1f}% CPU) past grace period — starting xmrig")
if systemctl("start", "xmrig.service"): if systemctl("start", "xmrig.service"):
paused_by_us = False paused_by_us = False
_save_paused(False)
started_at = time.monotonic() started_at = time.monotonic()
idle_since = None idle_since = None

View File

@@ -44,6 +44,7 @@ pkgs.testers.runNixOSTest {
GRACE_PERIOD = "5" GRACE_PERIOD = "5"
CPU_THRESHOLD = "10" CPU_THRESHOLD = "10"
STARTUP_COOLDOWN = "4" STARTUP_COOLDOWN = "4"
STATE_DIR = "/tmp/xap-state"
def start_cpu_load(name): def start_cpu_load(name):
"""Start a non-nice CPU burn as a transient systemd unit.""" """Start a non-nice CPU burn as a transient systemd unit."""
@@ -55,22 +56,28 @@ pkgs.testers.runNixOSTest {
def stop_cpu_load(name): def stop_cpu_load(name):
machine.succeed(f"systemctl stop {name}") machine.succeed(f"systemctl stop {name}")
start_all() def start_monitor(unit_name):
machine.wait_for_unit("multi-user.target") """Start the auto-pause monitor as a transient unit."""
machine.wait_for_unit("xmrig.service")
with subtest("Start auto-pause monitor"):
machine.succeed( machine.succeed(
f"systemd-run --unit=xmrig-auto-pause " f"systemd-run --unit={unit_name} "
f"--setenv=POLL_INTERVAL={POLL_INTERVAL} " f"--setenv=POLL_INTERVAL={POLL_INTERVAL} "
f"--setenv=GRACE_PERIOD={GRACE_PERIOD} " f"--setenv=GRACE_PERIOD={GRACE_PERIOD} "
f"--setenv=CPU_THRESHOLD={CPU_THRESHOLD} " f"--setenv=CPU_THRESHOLD={CPU_THRESHOLD} "
f"--setenv=STARTUP_COOLDOWN={STARTUP_COOLDOWN} " f"--setenv=STARTUP_COOLDOWN={STARTUP_COOLDOWN} "
f"--setenv=STATE_DIR={STATE_DIR} "
f"{PYTHON} {SCRIPT}" f"{PYTHON} {SCRIPT}"
) )
# Monitor needs two consecutive polls to compute a CPU delta. # Monitor needs two consecutive polls to compute a CPU delta.
time.sleep(3) time.sleep(3)
start_all()
machine.wait_for_unit("multi-user.target")
machine.wait_for_unit("xmrig.service")
machine.succeed(f"mkdir -p {STATE_DIR}")
with subtest("Start auto-pause monitor"):
start_monitor("xmrig-auto-pause")
with subtest("xmrig stays running while system is idle"): with subtest("xmrig stays running while system is idle"):
machine.succeed("systemctl is-active xmrig") machine.succeed("systemctl is-active xmrig")
@@ -134,5 +141,61 @@ pkgs.testers.runNixOSTest {
stop_cpu_load("cpu-load-4") stop_cpu_load("cpu-load-4")
machine.wait_until_succeeds("systemctl is-active xmrig", timeout=20) machine.wait_until_succeeds("systemctl is-active xmrig", timeout=20)
# --- State persistence and crash recovery ---
machine.succeed("systemctl stop xmrig-auto-pause")
with subtest("xmrig recovers after crash during startup cooldown"):
machine.succeed(f"rm -rf {STATE_DIR} && mkdir -p {STATE_DIR}")
start_monitor("xmrig-auto-pause-crash")
# Load -> xmrig stops
start_cpu_load("cpu-crash")
machine.wait_until_fails("systemctl is-active xmrig", timeout=20)
# End load -> xmrig restarts after grace period
stop_cpu_load("cpu-crash")
machine.wait_until_succeeds("systemctl is-active xmrig", timeout=30)
# Kill xmrig immediately — this simulates a crash during startup cooldown.
# The script should detect the failure when cooldown expires and
# re-enter the retry cycle.
machine.succeed("systemctl kill --signal=KILL xmrig")
machine.wait_until_fails("systemctl is-active xmrig", timeout=5)
# After cooldown + grace period + restart, xmrig should be back.
machine.wait_until_succeeds("systemctl is-active xmrig", timeout=30)
machine.succeed("systemctl stop xmrig-auto-pause-crash")
machine.succeed("systemctl reset-failed xmrig.service || true")
machine.succeed("systemctl start xmrig")
machine.wait_for_unit("xmrig.service")
with subtest("Script restart preserves pause state"):
machine.succeed(f"rm -rf {STATE_DIR} && mkdir -p {STATE_DIR}")
start_monitor("xmrig-auto-pause-persist")
# Load -> xmrig stops
start_cpu_load("cpu-persist")
machine.wait_until_fails("systemctl is-active xmrig", timeout=20)
# Kill the monitor while xmrig is paused (simulates script crash)
machine.succeed("systemctl stop xmrig-auto-pause-persist")
# State file must exist — the monitor persisted the pause flag
machine.succeed(f"test -f {STATE_DIR}/paused")
# Start a fresh monitor instance (reads state file on startup)
start_monitor("xmrig-auto-pause-persist2")
# End load — the new monitor should pick up the paused state
# and restart xmrig after the grace period
stop_cpu_load("cpu-persist")
machine.wait_until_succeeds("systemctl is-active xmrig", timeout=30)
# State file should be cleaned up after successful restart
machine.fail(f"test -f {STATE_DIR}/paused")
machine.succeed("systemctl stop xmrig-auto-pause-persist2")
''; '';
} }