# server-config/services/llama-cpp-xmrig-pause.py

#!/usr/bin/env python3
"""
Pause xmrig while llama-cpp is processing inference requests.

Checks if the llama-server process is actively using CPU by reading
/proc/<pid>/stat. When CPU usage exceeds the threshold, stops xmrig.
When CPU drops below threshold for GRACE_PERIOD seconds, restarts xmrig.
"""

import glob
import os
import subprocess
import sys
import time

# Seconds to sleep between CPU samples; parsed with int(), so whole seconds only.
POLL_INTERVAL = int(os.getenv("POLL_INTERVAL", "3"))
# Seconds llama-server must stay below the threshold before xmrig is restarted.
GRACE_PERIOD = float(os.getenv("GRACE_PERIOD", "10"))
# Busy threshold, expressed as a percentage of a single core (so a
# multi-threaded process can exceed 100). Idle llama-server uses ~0% CPU;
# active inference saturates multiple cores.
CPU_THRESHOLD = float(os.getenv("CPU_THRESHOLD", "50"))


def log(msg):
    """Write *msg* to stderr with the service prefix and flush immediately."""
    stream = sys.stderr
    stream.write(f"[llama-cpp-xmrig-pause] {msg}\n")
    # Flush so each message is emitted right away rather than sitting in a buffer.
    stream.flush()


def find_llama_pid():
    """Return the PID of a process whose comm name is ``llama-server``.

    Scans ``/proc/<pid>/comm`` for every numeric-looking entry and returns the
    first match in glob order, or ``None`` when no such process is found.
    """
    wanted = "llama-server"
    for comm_path in glob.glob("/proc/[0-9]*/comm"):
        try:
            with open(comm_path) as handle:
                name = handle.read().strip()
            if name != wanted:
                continue
            # Path looks like "/proc/<pid>/comm"; component 2 is the PID.
            pid_text = comm_path.split("/")[2]
            return int(pid_text)
        except (OSError, ValueError):
            # Process vanished mid-scan or the entry was unreadable; skip it.
            continue
    return None


def get_cpu_times(pid):
    """Return utime + stime (in clock ticks) from ``/proc/<pid>/stat``.

    Returns ``None`` if the file cannot be read or does not have the
    expected fields.
    """
    stat_path = f"/proc/{pid}/stat"
    try:
        with open(stat_path) as stat_file:
            raw = stat_file.read()
        # The comm field is wrapped in parentheses and may itself contain
        # spaces or ')', so only parse what follows the last ')'.
        tail = raw.rpartition(")")[2].split()
        # Counting from 0 after the ')': index 11 is utime, index 12 is stime.
        utime, stime = tail[11:13]
        return int(utime) + int(stime)
    except (OSError, IndexError, ValueError):
        return None


def systemctl(action, unit):
    """Run ``systemctl <action> <unit>`` and report whether it succeeded.

    A non-zero exit status is logged together with the command's stderr.
    Returns ``True`` on exit status 0, ``False`` otherwise.
    """
    command = ["systemctl", action, unit]
    proc = subprocess.run(command, capture_output=True, text=True)
    succeeded = proc.returncode == 0
    if not succeeded:
        log(f"systemctl {action} {unit} failed (rc={proc.returncode}): {proc.stderr.strip()}")
    return succeeded


def main():
    """Poll llama-server CPU usage forever, stopping/starting xmrig to match.

    Every POLL_INTERVAL seconds the loop samples llama-server's cumulative
    CPU ticks and converts the delta since the previous sample into a
    percentage of one core. Above CPU_THRESHOLD, xmrig is stopped. Once
    llama-server has been below the threshold (or absent) for GRACE_PERIOD
    seconds, xmrig is started again.

    xmrig is only ever started by this loop if the loop stopped it first.
    This function never returns.
    """
    xmrig_paused = False
    idle_since = None
    prev_pid = None
    prev_ticks = None
    prev_time = None
    hz = os.sysconf("SC_CLK_TCK")

    log(f"Starting: poll={POLL_INTERVAL}s grace={GRACE_PERIOD}s threshold={CPU_THRESHOLD}%")

    while True:
        pid = find_llama_pid()
        ticks = None if pid is None else get_cpu_times(pid)
        now = time.monotonic()

        if ticks is None:
            # llama-server is not running (or exited before its stat file
            # could be read). Treat that as idle rather than skipping the
            # sample, so that a server that dies mid-inference does not leave
            # xmrig stopped indefinitely.
            prev_pid = prev_ticks = prev_time = None
            busy = False
            state = "not running"
        elif pid != prev_pid or prev_ticks is None or prev_time is None or now <= prev_time:
            # First sample for this process (startup, or llama-server was
            # restarted under a new PID), or no time has elapsed: tick
            # counters of different processes are not comparable, so only
            # record a baseline and wait for the next sample.
            prev_pid, prev_ticks, prev_time = pid, ticks, now
            time.sleep(POLL_INTERVAL)
            continue
        else:
            # CPU% = (delta_ticks / hz) / delta_seconds * 100
            cpu_pct = ((ticks - prev_ticks) / hz) / (now - prev_time) * 100
            prev_ticks, prev_time = ticks, now
            busy = cpu_pct > CPU_THRESHOLD
            state = f"idle ({cpu_pct:.0f}% CPU)"

        if busy:
            idle_since = None
            if not xmrig_paused:
                log(f"llama-server busy ({cpu_pct:.0f}% CPU) — stopping xmrig")
                # Only mark as paused on success so a failed stop is retried
                # on the next busy sample.
                if systemctl("stop", "xmrig"):
                    xmrig_paused = True
        elif xmrig_paused:
            if idle_since is None:
                idle_since = now
            elif now - idle_since >= GRACE_PERIOD:
                log(f"llama-server {state} past grace period — starting xmrig")
                if systemctl("start", "xmrig"):
                    xmrig_paused = False
                # Restart the grace timer either way; a failed start is
                # retried after another full grace period.
                idle_since = None

        time.sleep(POLL_INTERVAL)


# Start the polling loop only when executed as a script, not when imported.
if __name__ == "__main__":
    main()