nixos/services/monero/xmrig-auto-pause.py

#!/usr/bin/env python3
"""
Auto-pause xmrig when other services need CPU.

Two independent signals drive the decision; either one can trigger a pause:

1. System-wide non-nice CPU from /proc/stat. Catches any CPU-heavy workload
   including non-systemd user work (interactive sessions, ad-hoc jobs).
   Since xmrig runs at Nice=19, its CPU time lands in the 'nice' column and
   is excluded from the metric.

2. Per-service CPU from cgroup cpu.stat usage_usec. Catches sub-threshold
   service activity — a single Minecraft player drives the server JVM to
   3-15% of one core, which is noise system-wide (0.3-1.3% of total on a
   12-thread host) but dominant for the minecraft cgroup.

When either signal crosses its stop threshold, writes 1 to
/sys/fs/cgroup/system.slice/xmrig.service/cgroup.freeze. When both are quiet
for GRACE_PERIOD seconds, writes 0 to resume.

Why direct cgroup.freeze instead of systemctl freeze:
  systemd 256+ has a bug class where `systemctl freeze` followed by any
  process death (SIGKILL, watchdog, OOM, segfault, shutdown) strands the
  unit in FreezerState=frozen ActiveState=failed with no recovery short of
  a reboot. See https://github.com/systemd/systemd/issues/38517. Writing
  directly to cgroup.freeze keeps systemd's FreezerState at "running" the
  whole time, so there is no state machine to get stuck: if xmrig dies
  while frozen, systemd transitions it to inactive normally.

Why scheduler priority alone isn't enough:
  Nice=19 / SCHED_IDLE only affects which thread gets the next time slice.
  RandomX's 2MB-per-thread scratchpad (24MB across 12 threads) holds about
  68% of the shared 32MB L3 cache on Zen 3, evicting hot lines from
  interactive services. Measured on muffin: pointer-chase latency is 112ns
  with xmrig running and 19ns with xmrig frozen — a 6x difference that
  scheduler priority cannot address.

Hysteresis:
  The system-wide stop threshold sits higher than the resume threshold
  because background services (qbittorrent, bitmagnet, postgres) produce
  15-25% non-nice CPU during normal operation, and xmrig's indirect cache
  pressure inflates that by another few percent. A single threshold
  thrashes on the floor; two thresholds break the cycle.

  Per-service thresholds are single-valued. Per-service CPU is a clean
  signal without background noise to calibrate against, so idle_since is
  reset whenever any watched service is at-or-above its threshold and the
  grace period only advances when every watched service is below.
"""

import os
import signal
import subprocess
import sys
import time

# Seconds to sleep between two consecutive samples.
POLL_INTERVAL = int(os.getenv("POLL_INTERVAL", "3"))
# Seconds the system must stay quiet before a pause is lifted.
GRACE_PERIOD = float(os.getenv("GRACE_PERIOD", "15"))
# Share of total CPU ticks (percent) that non-nice work has to exceed before
# xmrig is paused. For scale: on a 12-thread machine a single saturated core
# is roughly 8.3% of the total.
CPU_STOP_THRESHOLD = float(os.getenv("CPU_STOP_THRESHOLD", "15"))
# Share of total CPU ticks (percent) at or below which the machine counts as
# idle enough to resume mining. Kept under the stop threshold so the two
# values form a hysteresis band.
CPU_RESUME_THRESHOLD = float(os.getenv("CPU_RESUME_THRESHOLD", "5"))
# Per-service limits in the form "unit1:threshold1,unit2:threshold2".
# Each threshold is a percentage of TOTAL CPU capacity, i.e. the same frame
# of reference as CPU_STOP_THRESHOLD. Leaving this empty or unset switches
# the per-service signal off.
WATCHED_SERVICES_RAW = os.getenv("WATCHED_SERVICES", "")
# Location of xmrig's cgroup.freeze control file. It is written directly so
# that systemd's freezer state machine is never involved; the module
# docstring explains why.
XMRIG_CGROUP_FREEZE = os.getenv(
    "XMRIG_CGROUP_FREEZE",
    "/sys/fs/cgroup/system.slice/xmrig.service/cgroup.freeze",
)
# Directory in which the pause claim is persisted across restarts of this
# script. If it is not configured, restarting the script while xmrig is
# paused forgets the paused_by_us flag and xmrig remains frozen until
# something else thaws it.
STATE_DIR = os.getenv("STATE_DIR", "")
# Full path of the claim file, or "" when persistence is disabled.
_PAUSE_FILE = STATE_DIR and os.path.join(STATE_DIR, "paused")


def log(msg):
    """Emit *msg* on stderr, tagged with the script name and flushed at once."""
    tagged = f"[xmrig-auto-pause] {msg}"
    print(tagged, file=sys.stderr, flush=True)


def _parse_watched(spec):
    """Parse a WATCHED_SERVICES spec into a ``{unit: threshold}`` dict.

    *spec* is a comma-separated list of ``unit:threshold`` entries.
    Surrounding whitespace is ignored and empty entries are skipped.
    Entries with a missing unit name, a missing threshold, or a threshold
    that is not a valid float are logged and dropped. If a unit is listed
    more than once, the last entry wins.

    Returns a dict mapping unit name (str) to threshold (float); empty when
    *spec* is empty.
    """
    out = {}
    for raw in spec.split(","):
        entry = raw.strip()
        if not entry:
            continue
        # Split on the LAST colon: systemd unit names may themselves contain
        # ':' characters, while the numeric threshold never does.
        name, _, pct = entry.rpartition(":")
        name = name.strip()
        pct = pct.strip()
        if not name or not pct:
            log(f"WATCHED_SERVICES: ignoring malformed entry '{entry}'")
            continue
        try:
            out[name] = float(pct)
        except ValueError:
            log(f"WATCHED_SERVICES: ignoring non-numeric threshold in '{entry}'")
    return out


def _resolve_cgroup_cpustat(unit):
    """Map a systemd unit to the cpu.stat file of its cgroup.

    Asks ``systemctl show`` for the unit's ControlGroup property and builds
    the path under /sys/fs/cgroup from it. Returns that path, or None when
    systemd reports no cgroup (unit not running or unknown) or the cpu.stat
    file does not exist.
    """
    query = ["systemctl", "show", "--value", "--property=ControlGroup", unit]
    proc = subprocess.run(query, capture_output=True, text=True)
    cgroup = proc.stdout.strip()
    if cgroup:
        candidate = f"/sys/fs/cgroup{cgroup}/cpu.stat"
        if os.path.isfile(candidate):
            return candidate
    return None


def _read_service_usec(path):
    """Return the cumulative ``usage_usec`` counter from a cgroup cpu.stat.

    *path* is the cpu.stat file to read. Returns the counter as an int, or
    None when the file cannot be read or has no ``usage_usec`` line.
    """
    try:
        with open(path) as f:
            for line in f:
                if line.startswith("usage_usec "):
                    return int(line.split()[1])
    except OSError:
        # The cgroup can vanish at any moment. A missing file raises
        # FileNotFoundError, but a cgroup removed between open() and read()
        # surfaces as a different OSError (ENODEV). Both mean "service is
        # gone", and neither should take down the polling loop.
        return None
    return None


def read_cpu_ticks():
    """Sample the aggregate CPU line of /proc/stat.

    Returns a ``(total_ticks, real_work_ticks)`` tuple. ``total_ticks`` sums
    the first eight counters (user, nice, system, idle, iowait, irq,
    softirq, steal). ``real_work_ticks`` counts only user, system, irq and
    softirq, so niced work (xmrig), idle, iowait and steal are left out.
    """
    with open("/proc/stat") as stat:
        fields = stat.readline().split()
    # Column order: cpu  user nice system idle iowait irq softirq steal
    counters = [int(tok) for tok in fields[1:9]]
    user, nice, system, idle, iowait, irq, softirq, steal = counters
    busy = user + system + irq + softirq
    everything = busy + nice + idle + iowait + steal
    return everything, busy


def is_active(unit):
    """Return True when ``systemctl is-active --quiet`` exits 0 for *unit*."""
    query = ["systemctl", "is-active", "--quiet", unit]
    exit_code = subprocess.run(query, capture_output=True).returncode
    return exit_code == 0


def main_pid(unit):
    """Look up *unit*'s MainPID through ``systemctl show``.

    Returns the PID as an int. Empty or non-numeric output yields 0, which
    callers treat as "unit is not running".
    """
    query = ["systemctl", "show", "--value", "--property=MainPID", unit]
    proc = subprocess.run(query, capture_output=True, text=True)
    reported = proc.stdout.strip() or "0"
    try:
        pid = int(reported)
    except ValueError:
        pid = 0
    return pid


def _freeze(frozen):
    """Set xmrig's cgroup.freeze to "1" (freeze) or "0" (thaw).

    The value is written straight to the file named by XMRIG_CGROUP_FREEZE,
    so systemd's own freezer bookkeeping is not involved. Returns True when
    the write succeeded; on OSError the failure is logged and False is
    returned.
    """
    payload = "1" if frozen else "0"
    try:
        with open(XMRIG_CGROUP_FREEZE, "w") as ctl:
            ctl.write(payload)
    except OSError as e:
        action = "freeze" if frozen else "thaw"
        log(f"cgroup.freeze {action} write failed: {e}")
        return False
    return True


def _is_frozen():
    """Report whether xmrig's cgroup is frozen according to cgroup.events.

    Reads the ``frozen`` key from the cgroup.events file that sits next to
    XMRIG_CGROUP_FREEZE. Returns True only when that key is "1"; returns
    False when the key is absent or the file cannot be read.
    """
    events_path = os.path.join(os.path.dirname(XMRIG_CGROUP_FREEZE), "cgroup.events")
    try:
        with open(events_path) as f:
            for line in f:
                if line.startswith("frozen "):
                    return line.split()[1] == "1"
    except OSError:
        # A missing cgroup raises FileNotFoundError, but one removed between
        # open() and read() raises a different OSError (ENODEV). This runs
        # inside the polling loop and the signal handler, so an unreadable
        # file is reported as "not frozen" rather than allowed to crash.
        return False
    return False


def _save_paused(pid):
    """Record or drop the pause claim in the state file.

    A truthy *pid* (xmrig's MainPID at freeze time) is written to
    _PAUSE_FILE; a falsy *pid* deletes the file, ignoring the case where it
    is already gone. Does nothing when persistence is disabled (_PAUSE_FILE
    empty). Any other OSError is logged, not raised.
    """
    if _PAUSE_FILE:
        try:
            if not pid:
                try:
                    os.remove(_PAUSE_FILE)
                except FileNotFoundError:
                    pass  # no claim on disk, nothing to clear
            else:
                with open(_PAUSE_FILE, "w") as state:
                    state.write(str(pid))
        except OSError as e:
            log(f"state file write failed: {e}")


def _load_paused():
    """Decide whether a pause claim left by an earlier run still holds.

    The claim is valid only if the state file contains a non-zero PID, that
    PID equals xmrig.service's current MainPID, and the cgroup is still
    frozen. A restarted xmrig gets a new PID, so a freeze applied to the old
    instance is never treated as ours; an external thaw drops the claim as
    well. Returns False when persistence is disabled or the file is missing
    or unparsable.
    """
    if not _PAUSE_FILE:
        return False
    try:
        with open(_PAUSE_FILE) as state:
            claimed_pid = int(state.read().strip() or "0")
    except (FileNotFoundError, ValueError):
        return False
    if not claimed_pid:
        return False
    return claimed_pid == main_pid("xmrig.service") and _is_frozen()


def _cleanup(signum=None, frame=None):
    """Signal handler for SIGTERM/SIGINT: thaw, drop the claim, exit 0.

    If the cgroup is currently frozen it is thawed, then the persisted pause
    claim is cleared, so no frozen xmrig is left behind once auto-pause has
    exited. *signum* and *frame* are the standard handler arguments and are
    unused.
    """
    still_frozen = _is_frozen()
    if still_frozen:
        _freeze(False)
    _save_paused(0)
    sys.exit(0)


def main():
    """Run the polling loop that freezes and thaws xmrig.

    Setup: parses WATCHED_SERVICES_RAW, resolves each watched unit's
    cpu.stat path (None while the unit has no cgroup), installs _cleanup as
    the SIGTERM/SIGINT handler and recovers a pause claim left by a previous
    instance.

    Each poll computes two signals over the window since the previous poll:
    the system-wide non-nice CPU percentage from /proc/stat tick deltas, and
    a per-service percentage from each unit's usage_usec delta divided by
    wall-clock time times the CPU count.

    xmrig is frozen when the system percentage exceeds CPU_STOP_THRESHOLD or
    any watched service exceeds its own threshold, provided xmrig.service is
    active and this process does not already hold the pause. It is thawed
    once the system percentage has been at or below CPU_RESUME_THRESHOLD and
    every watched service below its threshold for GRACE_PERIOD seconds.

    The loop never returns; the process ends through _cleanup (sys.exit) or
    an unhandled exception.
    """
    watched_services = _parse_watched(WATCHED_SERVICES_RAW)
    # unit name -> cpu.stat path, or None while the unit has no cgroup.
    watched_paths = {}
    for name in watched_services:
        path = _resolve_cgroup_cpustat(name)
        if path is None:
            log(f"WATCHED_SERVICES: {name} has no cgroup — ignoring until it starts")
        watched_paths[name] = path

    # Used to express per-service CPU time as a share of total capacity.
    nproc = os.cpu_count() or 1

    signal.signal(signal.SIGTERM, _cleanup)
    signal.signal(signal.SIGINT, _cleanup)

    # True while the current freeze was applied by this script.
    paused_by_us = _load_paused()
    if paused_by_us:
        log("Recovered pause state from previous instance")

    log(
        f"Starting: poll={POLL_INTERVAL}s grace={GRACE_PERIOD}s "
        f"sys_stop={CPU_STOP_THRESHOLD}% sys_resume={CPU_RESUME_THRESHOLD}% "
        f"watched={watched_services or '(none)'}"
    )

    # Monotonic timestamp at which the current quiet streak began, or None.
    idle_since = None
    # Previous sample, used to turn cumulative counters into per-window deltas.
    prev_total = None
    prev_work = None
    prev_monotonic = None
    prev_service_usec = {}

    while True:
        total, work = read_cpu_ticks()
        now = time.monotonic()

        if prev_total is None:
            # First iteration: nothing to diff against yet, so only record
            # baselines and wait for the next poll.
            prev_total = total
            prev_work = work
            prev_monotonic = now
            # seed per-service baselines too
            for name, path in watched_paths.items():
                if path is None:
                    # Re-resolve in case the service has started since startup
                    path = _resolve_cgroup_cpustat(name)
                    watched_paths[name] = path
                if path is not None:
                    usec = _read_service_usec(path)
                    if usec is not None:
                        prev_service_usec[name] = usec
            time.sleep(POLL_INTERVAL)
            continue

        dt = total - prev_total
        dt_s = now - prev_monotonic
        if dt <= 0 or dt_s <= 0:
            # Degenerate window (no ticks elapsed or counters went backwards):
            # reset the system baselines and skip this sample to avoid a
            # division by zero or a negative percentage.
            # NOTE(review): prev_service_usec is not refreshed here, so the
            # next window's per-service delta covers more wall time than its
            # dt_s and may read high — confirm whether that is acceptable.
            prev_total = total
            prev_work = work
            prev_monotonic = now
            time.sleep(POLL_INTERVAL)
            continue

        # Share of all CPU ticks in this window spent on non-nice work.
        real_work_pct = ((work - prev_work) / dt) * 100

        # Per-service CPU percentages this window. Fraction of total CPU
        # capacity used by this specific service, same frame as real_work_pct.
        svc_pct = {}
        for name in watched_services:
            path = watched_paths.get(name)
            if path is None:
                # Unit wasn't running at startup; try resolving again in case
                # it has started since.
                path = _resolve_cgroup_cpustat(name)
                watched_paths[name] = path
                if path is None:
                    prev_service_usec.pop(name, None)
                    continue
            cur = _read_service_usec(path)
            if cur is None:
                # Service stopped; drop prev so it doesn't compute a huge delta
                # on next start.
                prev_service_usec.pop(name, None)
                watched_paths[name] = None  # force re-resolution next poll
                continue
            if name in prev_service_usec:
                delta_us = cur - prev_service_usec[name]
                # A negative delta means the counter restarted; no percentage
                # is reported for this window, only a fresh baseline.
                if delta_us >= 0:
                    svc_pct[name] = (delta_us / 1_000_000) / (dt_s * nproc) * 100
            prev_service_usec[name] = cur

        prev_total = total
        prev_work = work
        prev_monotonic = now

        # System-wide hysteresis: strictly above stop to pause, at or below
        # resume to count as idle.
        above_stop_sys = real_work_pct > CPU_STOP_THRESHOLD
        below_resume_sys = real_work_pct <= CPU_RESUME_THRESHOLD

        # Services without a sample this window count as 0%. Strictly above
        # the threshold triggers a pause; at-or-above blocks the resume.
        busy_services = [
            n for n in watched_services if svc_pct.get(n, 0) > watched_services[n]
        ]
        any_svc_at_or_above = any(
            svc_pct.get(n, 0) >= watched_services[n] for n in watched_services
        )

        stop_pressure = above_stop_sys or bool(busy_services)
        fully_idle = below_resume_sys and not any_svc_at_or_above

        if stop_pressure:
            # Any pressure restarts the grace countdown.
            idle_since = None
            if paused_by_us and not _is_frozen():
                # Someone thawed xmrig while we believed it paused. Reclaim
                # ownership so we can re-freeze.
                log("xmrig was thawed externally while paused — reclaiming")
                paused_by_us = False
                _save_paused(0)
            if not paused_by_us and is_active("xmrig.service"):
                # Only claim ownership if xmrig is actually running. If
                # something else stopped it (e.g. UPS battery hook), don't
                # interfere.
                if busy_services:
                    reasons = ", ".join(
                        f"{n}={svc_pct[n]:.1f}%>{watched_services[n]:.1f}%"
                        for n in busy_services
                    )
                    log(f"Stop: watched service(s) busy [{reasons}] — freezing xmrig")
                else:
                    log(
                        f"Stop: system CPU {real_work_pct:.1f}% > "
                        f"{CPU_STOP_THRESHOLD:.1f}% — freezing xmrig"
                    )
                # Take ownership only if the freeze write succeeded, and
                # persist the PID so a restarted instance can validate it.
                if _freeze(True):
                    paused_by_us = True
                    _save_paused(main_pid("xmrig.service"))
        elif paused_by_us:
            if fully_idle:
                if idle_since is None:
                    # First quiet poll: start the grace countdown.
                    idle_since = time.monotonic()
                elif time.monotonic() - idle_since >= GRACE_PERIOD:
                    log(
                        f"Idle past grace period (system {real_work_pct:.1f}%) "
                        "— thawing xmrig"
                    )
                    # Keep the claim if the thaw write failed; the countdown
                    # restarts either way and the thaw is retried later.
                    if _freeze(False):
                        paused_by_us = False
                        _save_paused(0)
                    idle_since = None
            else:
                # Between thresholds or a watched service is borderline — not
                # idle enough to resume.
                idle_since = None

        time.sleep(POLL_INTERVAL)


if __name__ == "__main__":
    main()