#!/usr/bin/env python3
"""
Pause xmrig while llama-cpp is processing inference requests.

Checks if the llama-server process is actively using CPU by reading
/proc/<pid>/stat. When CPU usage exceeds the threshold, stops xmrig.
When CPU drops below threshold for GRACE_PERIOD seconds, restarts xmrig.

Configuration is read from the environment at import time:

    POLL_INTERVAL   seconds between samples (integer, default 3)
    GRACE_PERIOD    seconds llama-server must stay below the threshold
                    before xmrig is started again (default 10)
    CPU_THRESHOLD   CPU percentage above which llama-server counts as busy
                    (default 50)
"""

import glob
import os
import subprocess
import sys
import time

POLL_INTERVAL = int(os.environ.get("POLL_INTERVAL", "3"))
GRACE_PERIOD = float(os.environ.get("GRACE_PERIOD", "10"))
# CPU percentage (per-core) above which llama-server is considered busy.
# Idle llama-server uses ~0% CPU; active inference saturates multiple cores.
CPU_THRESHOLD = float(os.environ.get("CPU_THRESHOLD", "50"))


def log(msg):
    """Write *msg* to stderr with the daemon's prefix, flushing immediately."""
    print(f"[llama-cpp-xmrig-pause] {msg}", file=sys.stderr, flush=True)


def find_llama_pid():
    """Find the PID of the llama-server process.

    Scans every /proc/<pid>/comm entry and returns the PID of the first one
    whose command name is exactly ``llama-server``, or ``None`` when no such
    process is found.
    """
    for path in glob.glob("/proc/[0-9]*/comm"):
        try:
            with open(path) as f:
                if f.read().strip() == "llama-server":
                    # path is "/proc/<pid>/comm"; component 2 is the pid.
                    return int(path.split("/")[2])
        except (OSError, ValueError):
            # The process may have exited between the glob and the open.
            continue
    return None


def get_cpu_times(pid):
    """Read utime + stime from /proc/<pid>/stat.

    Returns the total number of clock ticks as an int, or ``None`` when the
    file cannot be read or does not have the expected layout.
    """
    try:
        with open(f"/proc/{pid}/stat") as f:
            # The command name is wrapped in parentheses and may itself
            # contain spaces or ')', so only parse what follows the last ')'.
            fields = f.read().rpartition(")")[2].split()
        # fields[11] = utime, fields[12] = stime (0-indexed after ')')
        return int(fields[11]) + int(fields[12])
    except (OSError, IndexError, ValueError):
        return None


def systemctl(action, unit):
    """Run ``systemctl <action> <unit>`` and report whether it succeeded.

    Returns ``True`` when the command exits with status 0. Any failure is
    logged and reported as ``False``, including the case where systemctl
    itself cannot be executed, so a broken environment does not crash the
    polling loop.
    """
    try:
        result = subprocess.run(
            ["systemctl", action, unit],
            capture_output=True,
            text=True,
        )
    except OSError as exc:
        # e.g. systemctl is not installed or not executable.
        log(f"systemctl {action} {unit} could not be executed: {exc}")
        return False

    if result.returncode != 0:
        log(f"systemctl {action} {unit} failed (rc={result.returncode}): {result.stderr.strip()}")
        return False
    return True


def main():
    """Poll llama-server's CPU usage forever and stop/start xmrig accordingly.

    xmrig is stopped as soon as one sample exceeds CPU_THRESHOLD. It is
    started again once llama-server has been continuously idle (or absent)
    for at least GRACE_PERIOD seconds. This function never returns.
    """
    xmrig_paused = False
    idle_since = None
    # Baseline sample used to turn the cumulative tick counter into a rate.
    prev_pid = None
    prev_ticks = None
    prev_time = None
    hz = os.sysconf("SC_CLK_TCK")

    log(f"Starting: poll={POLL_INTERVAL}s grace={GRACE_PERIOD}s threshold={CPU_THRESHOLD}%")

    while True:
        pid = find_llama_pid()
        now = time.monotonic()

        if pid is None:
            # llama-server not running: nothing can be doing inference, so
            # treat this as idle. Otherwise an xmrig that we stopped would
            # stay stopped for as long as llama-server is gone.
            prev_pid = prev_ticks = prev_time = None
            running = False
            cpu_pct = 0.0
        else:
            running = True
            ticks = get_cpu_times(pid)
            # A rate needs two readable samples of the *same* process taken
            # at different times; tick counters of different PIDs (after a
            # llama-server restart) are unrelated to each other.
            have_baseline = (
                ticks is not None
                and pid == prev_pid
                and prev_ticks is not None
                and prev_time is not None
                and now > prev_time
            )
            if have_baseline:
                # CPU% = (delta_ticks / hz) / delta_seconds * 100
                cpu_pct = ((ticks - prev_ticks) / hz) / (now - prev_time) * 100
            prev_pid, prev_ticks, prev_time = pid, ticks, now
            if not have_baseline:
                time.sleep(POLL_INTERVAL)
                continue

        if cpu_pct > CPU_THRESHOLD:
            idle_since = None
            if not xmrig_paused:
                log(f"llama-server busy ({cpu_pct:.0f}% CPU) — stopping xmrig")
                if systemctl("stop", "xmrig"):
                    xmrig_paused = True
        elif xmrig_paused:
            if idle_since is None:
                # First idle sample: start the grace-period clock.
                idle_since = now
            elif now - idle_since >= GRACE_PERIOD:
                if running:
                    log(f"llama-server idle ({cpu_pct:.0f}% CPU) past grace period — starting xmrig")
                else:
                    log("llama-server not running past grace period — starting xmrig")
                # On failure keep the paused state so the start is retried
                # on the next poll.
                if systemctl("start", "xmrig"):
                    xmrig_paused = False
                    idle_since = None

        time.sleep(POLL_INTERVAL)


if __name__ == "__main__":
    main()