#!/usr/bin/env python3
"""
Auto-pause xmrig when other services need CPU.

Monitors non-nice CPU usage from /proc/stat. Since xmrig runs at Nice=19,
its CPU time lands in the 'nice' column and is excluded from the metric.
When real workload (user + system + irq + softirq) exceeds the threshold,
stops xmrig. When it drops below threshold for GRACE_PERIOD seconds,
restarts xmrig.

This replaces per-service pause scripts with a single general-purpose
monitor that handles any CPU-intensive workload (gitea workers, llama-cpp
inference, etc.) without needing to know about specific processes.

Why scheduler priority alone isn't enough:
  Nice=19 / SCHED_IDLE only affects which thread gets the next time slice.
  RandomX's 2MB-per-thread scratchpad (24MB across 12 threads) pollutes
  the shared 32MB L3 cache, and its memory access pattern saturates DRAM
  bandwidth. Other services run slower even though they aren't denied CPU
  time. The only fix is to stop xmrig entirely when real work is happening.
"""

import os
import subprocess
import sys
import time

POLL_INTERVAL = int(os.environ.get("POLL_INTERVAL", "3"))
GRACE_PERIOD = float(os.environ.get("GRACE_PERIOD", "15"))
# Percentage of total CPU ticks that non-nice processes must use to trigger
# a pause. On a 12-thread system, one fully loaded core ≈ 8.3% of total.
# Default 5% catches anything using more than ~60% of a single core.
CPU_THRESHOLD = float(os.environ.get("CPU_THRESHOLD", "5"))
# After starting xmrig, ignore CPU spikes for this many seconds to let
# RandomX dataset initialization complete (~4s on the target hardware)
# without retriggering a stop.
STARTUP_COOLDOWN = float(os.environ.get("STARTUP_COOLDOWN", "10"))

# The systemd unit this monitor stops and starts.
XMRIG_UNIT = "xmrig.service"

# Number of counters consumed from the aggregate "cpu" line:
# user nice system idle iowait irq softirq steal
_CPU_FIELD_COUNT = 8


def log(msg):
    """Write a prefixed status line to stderr, flushed immediately."""
    print(f"[xmrig-auto-pause] {msg}", file=sys.stderr, flush=True)


def read_cpu_ticks(path="/proc/stat"):
    """Read the aggregate CPU tick counters from a /proc/stat-style file.

    Args:
        path: File whose first line is the aggregate ``cpu`` line.
            Defaults to ``/proc/stat``.

    Returns:
        ``(total_ticks, real_work_ticks)`` where ``total_ticks`` is the sum
        of the first eight counters and ``real_work_ticks`` is
        user + system + irq + softirq, i.e. it excludes the 'nice' column
        (xmrig) as well as idle, iowait and steal.

    Raises:
        OSError: If ``path`` cannot be opened.
        ValueError: If one of the consumed counters is not an integer.
    """
    with open(path) as f:
        fields = f.readline().split()

    # fields[0] is the "cpu" label; any columns past the eighth counter
    # (guest, guest_nice) are ignored.
    counters = [int(x) for x in fields[1:1 + _CPU_FIELD_COUNT]]
    # A line that reports fewer columns is padded with zeros instead of
    # blowing up the unpacking below.
    counters.extend([0] * (_CPU_FIELD_COUNT - len(counters)))
    user, nice, system, idle, iowait, irq, softirq, steal = counters

    total = user + nice + system + idle + iowait + irq + softirq + steal
    real_work = user + system + irq + softirq
    return total, real_work


def is_active(unit):
    """Return True if ``systemctl is-active --quiet <unit>`` exits with 0.

    If systemctl cannot be executed at all (OSError), the failure is logged
    and False is returned so the caller simply retries on its next poll.
    """
    try:
        result = subprocess.run(
            ["systemctl", "is-active", "--quiet", unit],
            capture_output=True,
        )
    except OSError as exc:
        log(f"systemctl is-active {unit} could not be run: {exc}")
        return False
    return result.returncode == 0


def systemctl(action, unit):
    """Run ``systemctl <action> <unit>`` and report whether it succeeded.

    A non-zero exit status is logged together with systemctl's stderr.
    If systemctl cannot be executed at all (OSError), that is logged too.

    Returns:
        True if the command ran and exited with status 0, False otherwise.
    """
    try:
        result = subprocess.run(
            ["systemctl", action, unit],
            capture_output=True,
            text=True,
        )
    except OSError as exc:
        log(f"systemctl {action} {unit} could not be run: {exc}")
        return False
    if result.returncode != 0:
        log(f"systemctl {action} {unit} failed (rc={result.returncode}): {result.stderr.strip()}")
    return result.returncode == 0


def main():
    """Poll CPU usage forever, stopping and restarting xmrig as needed.

    State kept across iterations:
      paused_by_us -- True while xmrig is stopped because *we* stopped it.
      idle_since   -- monotonic time of the first idle sample after a
                      pause, or None while busy / not paused.
      started_at   -- monotonic time when we last started xmrig, or None
                      once the startup cooldown has elapsed.
    """
    paused_by_us = False
    idle_since = None
    started_at = None  # monotonic time when we last started xmrig
    prev_total = None
    prev_work = None

    log(
        f"Starting: poll={POLL_INTERVAL}s grace={GRACE_PERIOD}s "
        f"threshold={CPU_THRESHOLD}% cooldown={STARTUP_COOLDOWN}s"
    )

    while True:
        total, work = read_cpu_ticks()

        # The first sample has nothing to diff against, and a non-positive
        # tick delta cannot yield a percentage: record the sample and wait.
        dt = None if prev_total is None else total - prev_total
        if dt is None or dt <= 0:
            prev_total = total
            prev_work = work
            time.sleep(POLL_INTERVAL)
            continue

        real_work_pct = ((work - prev_work) / dt) * 100
        prev_total = total
        prev_work = work

        # Don't act during startup cooldown — RandomX dataset init causes
        # a transient CPU spike that would immediately retrigger a stop.
        if started_at is not None:
            if time.monotonic() - started_at < STARTUP_COOLDOWN:
                time.sleep(POLL_INTERVAL)
                continue
            started_at = None

        busy = real_work_pct > CPU_THRESHOLD

        if busy:
            idle_since = None
            # One query per poll; both decisions below use the same answer.
            xmrig_running = is_active(XMRIG_UNIT)

            if paused_by_us and xmrig_running:
                # Something else restarted xmrig (deploy, manual start, etc.)
                # while we thought it was stopped. Reset ownership so we can
                # manage it again.
                log("xmrig was restarted externally while paused — reclaiming")
                paused_by_us = False

            # Only claim ownership if xmrig is actually running.
            # If something else stopped it (e.g. UPS battery hook),
            # don't interfere — we'd wrongly restart it later.
            if not paused_by_us and xmrig_running:
                log(f"Real workload detected ({real_work_pct:.1f}% CPU) — stopping xmrig")
                if systemctl("stop", XMRIG_UNIT):
                    paused_by_us = True
        elif paused_by_us:
            if idle_since is None:
                # First idle sample after the pause: start the grace timer.
                idle_since = time.monotonic()
            elif time.monotonic() - idle_since >= GRACE_PERIOD:
                log(f"Workload ended ({real_work_pct:.1f}% CPU) past grace period — starting xmrig")
                # On failure the state is left untouched so the start is
                # retried on the next idle poll.
                if systemctl("start", XMRIG_UNIT):
                    paused_by_us = False
                    started_at = time.monotonic()
                    idle_since = None

        time.sleep(POLL_INTERVAL)


if __name__ == "__main__":
    main()