diff --git a/services/monero/xmrig-auto-pause.nix b/services/monero/xmrig-auto-pause.nix index 4161e34..758353b 100644 --- a/services/monero/xmrig-auto-pause.nix +++ b/services/monero/xmrig-auto-pause.nix @@ -26,7 +26,8 @@ lib.mkIf config.services.xmrig.enable { environment = { POLL_INTERVAL = "3"; GRACE_PERIOD = "15"; - CPU_THRESHOLD = "5"; + CPU_STOP_THRESHOLD = "15"; + CPU_RESUME_THRESHOLD = "5"; STARTUP_COOLDOWN = "10"; STATE_DIR = "/var/lib/xmrig-auto-pause"; }; diff --git a/services/monero/xmrig-auto-pause.py b/services/monero/xmrig-auto-pause.py index 2abd4ac..4e11f84 100644 --- a/services/monero/xmrig-auto-pause.py +++ b/services/monero/xmrig-auto-pause.py @@ -4,9 +4,9 @@ Auto-pause xmrig when other services need CPU. Monitors non-nice CPU usage from /proc/stat. Since xmrig runs at Nice=19, its CPU time lands in the 'nice' column and is excluded from the metric. -When real workload (user + system + irq + softirq) exceeds the threshold, -stops xmrig. When it drops below threshold for GRACE_PERIOD seconds, -restarts xmrig. +When real workload (user + system + irq + softirq) exceeds the stop +threshold, stops xmrig. When it drops below the resume threshold for +GRACE_PERIOD seconds, restarts xmrig. This replaces per-service pause scripts with a single general-purpose monitor that handles any CPU-intensive workload (gitea workers, llama-cpp @@ -18,6 +18,14 @@ Why scheduler priority alone isn't enough: the shared 32MB L3 cache, and its memory access pattern saturates DRAM bandwidth. Other services run slower even though they aren't denied CPU time. The only fix is to stop xmrig entirely when real work is happening. + +Hysteresis: + The stop threshold is set higher than the resume threshold to prevent + oscillation. When xmrig runs, its L3 cache pressure makes other processes + appear ~3-8% busier. A single threshold trips on this indirect effect, + causing stop/start thrashing. Separate thresholds break the cycle: the + resume threshold confirms the system is truly idle, while the stop + threshold requires genuine workload above xmrig's indirect pressure. """ import os @@ -29,8 +37,12 @@ POLL_INTERVAL = int(os.environ.get("POLL_INTERVAL", "3")) GRACE_PERIOD = float(os.environ.get("GRACE_PERIOD", "15")) # Percentage of total CPU ticks that non-nice processes must use to trigger # a pause. On a 12-thread system, one fully loaded core ≈ 8.3% of total. -# Default 5% catches anything using more than ~60% of a single core. -CPU_THRESHOLD = float(os.environ.get("CPU_THRESHOLD", "5")) +# Default 15% requires roughly two busy cores, which avoids false positives +# from xmrig's L3 cache pressure inflating other processes' apparent CPU. +CPU_STOP_THRESHOLD = float(os.environ.get("CPU_STOP_THRESHOLD", "15")) +# Percentage below which the system is considered idle enough to resume +# mining. Lower than the stop threshold to provide hysteresis. +CPU_RESUME_THRESHOLD = float(os.environ.get("CPU_RESUME_THRESHOLD", "5")) # After starting xmrig, ignore CPU spikes for this many seconds to let # RandomX dataset initialization complete (~4s on the target hardware) # without retriggering a stop. @@ -115,7 +127,8 @@ def main(): log( f"Starting: poll={POLL_INTERVAL}s grace={GRACE_PERIOD}s " - f"threshold={CPU_THRESHOLD}% cooldown={STARTUP_COOLDOWN}s" + f"stop={CPU_STOP_THRESHOLD}% resume={CPU_RESUME_THRESHOLD}% " + f"cooldown={STARTUP_COOLDOWN}s" ) while True: @@ -154,9 +167,10 @@ def main(): _save_paused(True) started_at = None - busy = real_work_pct > CPU_THRESHOLD + above_stop = real_work_pct > CPU_STOP_THRESHOLD + below_resume = real_work_pct <= CPU_RESUME_THRESHOLD - if busy: + if above_stop: idle_since = None if paused_by_us and is_active("xmrig.service"): # Something else restarted xmrig (deploy, manual start, etc.) @@ -174,8 +188,8 @@ def main(): if systemctl("stop", "xmrig.service"): paused_by_us = True _save_paused(True) - else: - if paused_by_us: + elif paused_by_us: + if below_resume: if idle_since is None: idle_since = time.monotonic() elif time.monotonic() - idle_since >= GRACE_PERIOD: @@ -185,6 +199,9 @@ def main(): _save_paused(False) started_at = time.monotonic() idle_since = None + else: + # Between thresholds — not idle enough to resume. + idle_since = None time.sleep(POLL_INTERVAL) diff --git a/tests/xmrig-auto-pause.nix b/tests/xmrig-auto-pause.nix index 33796e6..ca52d77 100644 --- a/tests/xmrig-auto-pause.nix +++ b/tests/xmrig-auto-pause.nix @@ -39,13 +39,15 @@ pkgs.testers.runNixOSTest { # POLL_INTERVAL=1 keeps detection latency low. # GRACE_PERIOD=5 is long enough to verify "stays stopped" but short # enough that the full test completes in reasonable time. - # CPU_THRESHOLD=10 catches a single busy-loop on a 1-2 core VM. + # CPU_STOP_THRESHOLD=20 catches a busy-loop on a 1-2 core VM (50-100%) + # without triggering from normal VM noise. + # CPU_RESUME_THRESHOLD=10 is the idle cutoff for a 1-2 core VM. POLL_INTERVAL = "1" GRACE_PERIOD = "5" - CPU_THRESHOLD = "10" + CPU_STOP_THRESHOLD = "20" + CPU_RESUME_THRESHOLD = "10" STARTUP_COOLDOWN = "4" STATE_DIR = "/tmp/xap-state" - def start_cpu_load(name): """Start a non-nice CPU burn as a transient systemd unit.""" machine.succeed( @@ -62,13 +64,16 @@ pkgs.testers.runNixOSTest { f"systemd-run --unit={unit_name} " f"--setenv=POLL_INTERVAL={POLL_INTERVAL} " f"--setenv=GRACE_PERIOD={GRACE_PERIOD} " - f"--setenv=CPU_THRESHOLD={CPU_THRESHOLD} " + f"--setenv=CPU_STOP_THRESHOLD={CPU_STOP_THRESHOLD} " + f"--setenv=CPU_RESUME_THRESHOLD={CPU_RESUME_THRESHOLD} " f"--setenv=STARTUP_COOLDOWN={STARTUP_COOLDOWN} " f"--setenv=STATE_DIR={STATE_DIR} " f"{PYTHON} {SCRIPT}" ) # Monitor needs two consecutive polls to compute a CPU delta. time.sleep(3) + # Monitor needs two consecutive polls to compute a CPU delta. + time.sleep(3) start_all() machine.wait_for_unit("multi-user.target")