server-config/services/monero/xmrig-auto-pause.py

#!/usr/bin/env python3
"""
Auto-pause xmrig when other services need CPU.

Monitors non-nice CPU usage from /proc/stat. Since xmrig runs at Nice=19,
its CPU time lands in the 'nice' column and is excluded from the metric.
When real workload (user + system + irq + softirq) exceeds the stop
threshold, stops xmrig. When it drops below the resume threshold for
GRACE_PERIOD seconds, restarts xmrig.

This replaces per-service pause scripts with a single general-purpose
monitor that handles any CPU-intensive workload (gitea workers, llama-cpp
inference, etc.) without needing to know about specific processes.

Why scheduler priority alone isn't enough:
  Nice=19 / SCHED_IDLE only affects which thread gets the next time slice.
  RandomX's 2MB-per-thread scratchpad (24MB across 12 threads) pollutes
  the shared 32MB L3 cache, and its memory access pattern saturates DRAM
  bandwidth. Other services run slower even though they aren't denied CPU
  time. The only fix is to stop xmrig entirely when real work is happening.

Hysteresis:
  The stop threshold is set higher than the resume threshold to prevent
  oscillation. When xmrig runs, its L3 cache pressure makes other processes
  appear ~3-8% busier. A single threshold trips on this indirect effect,
  causing stop/start thrashing. Separate thresholds break the cycle: the
  resume threshold confirms the system is truly idle, while the stop
  threshold requires genuine workload above xmrig's indirect pressure.
"""

import os
import subprocess
import sys
import time

# Seconds to sleep between consecutive /proc/stat samples (whole number).
POLL_INTERVAL = int(os.getenv("POLL_INTERVAL", "3"))
# Seconds the load has to stay at or below the resume threshold before
# xmrig is started again.
GRACE_PERIOD = float(os.getenv("GRACE_PERIOD", "15"))
# Stop threshold: share of total CPU ticks (in percent) that non-nice
# processes must consume before xmrig is paused.  With 12 hardware threads
# a single saturated core is about 8.3% of the total, so the default of
# 15% corresponds to roughly two busy cores.  That margin keeps the
# apparent CPU inflation caused by xmrig's L3 cache pressure from being
# mistaken for real workload.
CPU_STOP_THRESHOLD = float(os.getenv("CPU_STOP_THRESHOLD", "15"))
# Resume threshold: at or below this percentage the machine counts as idle
# enough to mine again.  It is deliberately lower than the stop threshold
# so the two together form a hysteresis band.
CPU_RESUME_THRESHOLD = float(os.getenv("CPU_RESUME_THRESHOLD", "5"))
# Seconds after xmrig has been started during which CPU spikes are
# ignored, so that RandomX dataset initialization (~4s on the target
# hardware) can finish without retriggering a stop.
STARTUP_COOLDOWN = float(os.getenv("STARTUP_COOLDOWN", "10"))
# Directory in which the pause state is persisted across restarts of this
# script.  Empty (the default) disables persistence; in that case a
# restart while xmrig is paused loses the paused_by_us flag and xmrig
# stays stopped permanently.
STATE_DIR = os.getenv("STATE_DIR", "")
# Marker file whose existence records "paused by us"; empty string when
# persistence is disabled.
if STATE_DIR:
    _PAUSE_FILE = os.path.join(STATE_DIR, "paused")
else:
    _PAUSE_FILE = ""


def log(msg):
    """Write one diagnostic line, tagged with the script name, to stderr.

    The stream is flushed on every call so messages show up immediately
    even when stderr is not attached to a terminal.
    """
    line = f"[xmrig-auto-pause] {msg}"
    print(line, file=sys.stderr, flush=True)


def read_cpu_ticks():
    """Sample the aggregate CPU counters from the first line of /proc/stat.

    Returns:
        A ``(total_ticks, real_work_ticks)`` tuple of ints.  ``total_ticks``
        is the sum of the first eight counters on the line;
        ``real_work_ticks`` is user + system + irq + softirq, i.e. it leaves
        out the 'nice' column (where xmrig's time is accounted) as well as
        idle, iowait and steal.

    Raises:
        ValueError: if the line has fewer than eight counters or one of
            them is not an integer.
    """
    with open("/proc/stat") as stat_file:
        fields = stat_file.readline().split()
    # Line layout: "cpu" user nice system idle iowait irq softirq steal ...
    counters = [int(token) for token in fields[1:9]]
    user, _nice, system, _idle, _iowait, irq, softirq, _steal = counters
    real_work = user + system + irq + softirq
    return sum(counters), real_work


def is_active(unit):
    """Return True when ``systemctl is-active --quiet <unit>`` exits with 0.

    The command's output is captured and discarded; only the exit status
    is used.
    """
    probe = ["systemctl", "is-active", "--quiet", unit]
    return subprocess.run(probe, capture_output=True).returncode == 0


def systemctl(action, unit):
    """Run ``systemctl <action> <unit>`` and report whether it succeeded.

    Args:
        action: systemctl verb, e.g. "start" or "stop".
        unit: name of the systemd unit to act on.

    Returns:
        True if the command exited with status 0, False otherwise.  On
        failure the exit status and the stripped stderr text are logged.
    """
    result = subprocess.run(["systemctl", action, unit], capture_output=True, text=True)
    if result.returncode == 0:
        return True
    log(f"systemctl {action} {unit} failed (rc={result.returncode}): {result.stderr.strip()}")
    return False


def _save_paused(paused):
    """Persist or clear the pause marker so a restarted script can recover.

    Does nothing when persistence is disabled (``_PAUSE_FILE`` is empty).
    The operation is best-effort: filesystem errors never propagate to the
    caller.  They are logged, though, because a marker that cannot be
    written silently defeats restart recovery, and a marker that cannot be
    removed makes a later instance believe it had paused xmrig.

    Args:
        paused: True to create the marker file, False to remove it.
    """
    if not _PAUSE_FILE:
        return
    try:
        if paused:
            # Only the file's existence matters, so an empty file suffices.
            with open(_PAUSE_FILE, "w"):
                pass
        else:
            os.remove(_PAUSE_FILE)
    except OSError as exc:
        if not paused and isinstance(exc, FileNotFoundError):
            # Marker already absent — that is the state we wanted.
            return
        action = "create" if paused else "remove"
        log(f"Failed to {action} pause marker {_PAUSE_FILE}: {exc}")


def _load_paused():
    """Report whether a pause marker file from an earlier run exists.

    Returns False when persistence is disabled (``_PAUSE_FILE`` is empty).
    """
    return bool(_PAUSE_FILE) and os.path.isfile(_PAUSE_FILE)


def main():
    """Run the monitoring loop until the process is terminated.

    Every POLL_INTERVAL seconds the CPU counters are sampled and the delta
    against the previous sample is turned into the percentage of ticks
    spent on non-nice work.  That percentage drives three rules:

    * Above CPU_STOP_THRESHOLD: stop xmrig.service if it is running and
      remember (in memory and via _save_paused) that this script paused it.
    * At or below CPU_RESUME_THRESHOLD for at least GRACE_PERIOD seconds
      while paused by this script: start xmrig.service again.
    * For STARTUP_COOLDOWN seconds after such a start no decision is made;
      once the cooldown is over the unit is checked for still being active.
    """
    unit = "xmrig.service"
    we_paused = _load_paused()
    quiet_since = None  # monotonic time of the first idle sample in the current streak
    launch_time = None  # monotonic time of our most recent successful start
    last_total = last_work = None

    if we_paused:
        log("Recovered pause state from previous instance")

    log(
        f"Starting: poll={POLL_INTERVAL}s grace={GRACE_PERIOD}s "
        f"stop={CPU_STOP_THRESHOLD}% resume={CPU_RESUME_THRESHOLD}% "
        f"cooldown={STARTUP_COOLDOWN}s"
    )

    while True:
        total, work = read_cpu_ticks()

        # Without a baseline, or when the counters did not advance, there
        # is no usable delta: remember this sample and try again later.
        if last_total is None or total - last_total <= 0:
            last_total, last_work = total, work
            time.sleep(POLL_INTERVAL)
            continue

        real_work_pct = ((work - last_work) / (total - last_total)) * 100
        last_total, last_work = total, work

        # Stay passive while xmrig is warming up — RandomX dataset init
        # produces a short CPU spike that would otherwise retrigger a stop.
        if launch_time is not None:
            if time.monotonic() - launch_time < STARTUP_COOLDOWN:
                time.sleep(POLL_INTERVAL)
                continue
            # Warm-up is over: make sure xmrig is still alive.  A crash
            # during init (hugepage failure, pool unreachable, etc.) puts
            # us back into the paused state so the start is retried
            # instead of leaving xmrig dead.
            if not is_active(unit):
                log("xmrig died during startup cooldown — will retry")
                we_paused = True
                _save_paused(True)
            launch_time = None

        idle_enough = real_work_pct <= CPU_RESUME_THRESHOLD

        if real_work_pct > CPU_STOP_THRESHOLD:
            quiet_since = None
            if we_paused and is_active(unit):
                # xmrig came back behind our back (deploy, manual start,
                # etc.); drop the stale flag so it can be managed again.
                log("xmrig was restarted externally while paused — reclaiming")
                we_paused = False
                _save_paused(False)
            # Take ownership only of a running xmrig.  If something else
            # stopped it (e.g. UPS battery hook) we must not claim the
            # pause, or we would wrongly restart it later.
            if not we_paused and is_active(unit):
                log(f"Real workload detected ({real_work_pct:.1f}% CPU) — stopping xmrig")
                if systemctl("stop", unit):
                    we_paused = True
                    _save_paused(True)
        elif we_paused:
            if not idle_enough:
                # Inside the hysteresis band — not idle enough to resume.
                quiet_since = None
            elif quiet_since is None:
                quiet_since = time.monotonic()
            elif time.monotonic() - quiet_since >= GRACE_PERIOD:
                log(f"Workload ended ({real_work_pct:.1f}% CPU) past grace period — starting xmrig")
                if systemctl("start", unit):
                    we_paused = False
                    _save_paused(False)
                    launch_time = time.monotonic()
                quiet_since = None

        time.sleep(POLL_INTERVAL)


# Script entry point: start the monitor loop only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()