fix xmrig pause

This commit is contained in:
2026-04-03 14:39:20 -04:00
parent d4d01d63f1
commit daf82c16ba
8 changed files with 263 additions and 298 deletions

View File

@@ -1,6 +1,5 @@
{
imports = [
./llama-cpp.nix
./llama-cpp-xmrig-pause.nix
];
}

View File

@@ -1,123 +0,0 @@
#!/usr/bin/env python3
"""
Pause xmrig while llama-cpp is processing inference requests.
Checks if the llama-server process is actively using CPU by reading
/proc/<pid>/stat. When CPU usage exceeds the threshold, stops xmrig.
When CPU drops below threshold for GRACE_PERIOD seconds, restarts xmrig.
"""
import glob
import os
import subprocess
import sys
import time
# Runtime tunables. Each one can be overridden through an environment
# variable of the same name; the second argument is the built-in default.

# Seconds to sleep between two consecutive samples (parsed as an integer).
POLL_INTERVAL = int(os.getenv("POLL_INTERVAL", "3"))

# Seconds llama-server must stay at or below the threshold before xmrig is
# started again.
GRACE_PERIOD = float(os.getenv("GRACE_PERIOD", "10"))

# Per-core CPU percentage above which llama-server counts as busy (100 means
# one fully used core). An idle llama-server sits near 0%, while active
# inference saturates several cores.
CPU_THRESHOLD = float(os.getenv("CPU_THRESHOLD", "50"))
def log(msg):
    """Write *msg* to stderr as one line carrying this script's tag.

    The stream is flushed on every call so the line is emitted immediately
    rather than sitting in a buffer.
    """
    line = f"[llama-cpp-xmrig-pause] {msg}"
    print(line, file=sys.stderr, flush=True)
def find_llama_pid(comm_name="llama-server"):
    """Return the PID of the first process whose command name is *comm_name*.

    Every numeric entry under ``/proc`` is inspected by reading its ``comm``
    file and comparing the stripped contents with *comm_name*.

    Args:
        comm_name: Command name to look for. Defaults to ``"llama-server"``,
            so existing zero-argument callers behave as before.
            NOTE(review): per proc(5) the kernel truncates ``comm`` to 15
            characters, so longer names would never match - confirm before
            passing one.

    Returns:
        The PID as an ``int``, or ``None`` when no process matches.
    """
    # iglob yields paths lazily, so the scan stops at the first match instead
    # of materialising the full process list on every poll.
    for comm_path in glob.iglob("/proc/[0-9]*/comm"):
        try:
            with open(comm_path) as f:
                if f.read().strip() == comm_name:
                    # Path has the form /proc/<pid>/comm; element 2 is <pid>.
                    return int(comm_path.split("/")[2])
        except (OSError, ValueError):
            # The process may have exited between listing and reading, or
            # the entry was not readable/parsable; move on to the next one.
            continue
    return None
def get_cpu_times(pid):
    """Return the cumulative CPU ticks (utime + stime) consumed by *pid*.

    The value is taken from ``/proc/<pid>/stat``. ``None`` is returned when
    the file cannot be read or does not contain the expected fields.
    """
    stat_path = f"/proc/{pid}/stat"
    try:
        with open(stat_path) as stat_file:
            raw = stat_file.read()
        # Only the text after the last ')' is tokenised, which skips the
        # parenthesised command name that precedes the numeric fields.
        tail = raw.rpartition(")")[2].split()
        # Counting from 0 after the ')': index 11 is utime, index 12 is stime.
        utime, stime = tail[11], tail[12]
        return int(utime) + int(stime)
    except (OSError, IndexError, ValueError):
        return None
def systemctl(action, unit):
    """Run ``systemctl <action> <unit>`` and report whether it succeeded.

    Output is captured rather than inherited. On a non-zero exit status the
    return code and the captured stderr are logged.

    Returns:
        ``True`` if systemctl exited with status 0, ``False`` otherwise.
    """
    command = ["systemctl", action, unit]
    proc = subprocess.run(command, capture_output=True, text=True)
    succeeded = proc.returncode == 0
    if not succeeded:
        log(f"systemctl {action} {unit} failed (rc={proc.returncode}): {proc.stderr.strip()}")
    return succeeded
def main():
    """Poll llama-server's CPU usage forever, pausing xmrig while it is busy.

    Each iteration samples the cumulative CPU ticks of the llama-server
    process and converts the delta since the previous sample into a per-core
    percentage (100 means one fully used core).

    * Above ``CPU_THRESHOLD``: xmrig is stopped, if this loop has not already
      stopped it.
    * At or below the threshold while xmrig is paused: a grace timer starts,
      and xmrig is started again once ``GRACE_PERIOD`` seconds have passed
      without a busy sample.

    A llama-server that is not running is treated as idle (0% CPU), so a
    paused xmrig is resumed after the grace period instead of staying stopped
    until llama-server comes back. A sample is only compared with the
    previous one when both belong to the same PID; after a restart the first
    sample merely establishes a new baseline.

    This function never returns.
    """
    xmrig_paused = False
    idle_since = None
    prev_pid = None
    prev_ticks = None
    prev_time = None
    # Kernel clock ticks per second, needed to turn tick deltas into seconds.
    hz = os.sysconf("SC_CLK_TCK")

    log(f"Starting: poll={POLL_INTERVAL}s grace={GRACE_PERIOD}s threshold={CPU_THRESHOLD}%")

    while True:
        pid = find_llama_pid()
        ticks = None if pid is None else get_cpu_times(pid)
        now = time.monotonic()

        if ticks is None:
            # llama-server is not running, or it vanished before its stat
            # file could be read. Drop the baseline and treat it as idle so
            # the grace-period logic below can still resume a paused xmrig.
            prev_pid = prev_ticks = prev_time = None
            cpu_pct = 0.0
        else:
            # A delta is only meaningful against an earlier sample of the
            # same process taken at a strictly earlier time. prev_time is
            # always set whenever prev_pid is, so the comparison is safe.
            comparable = pid == prev_pid and now > prev_time
            if comparable:
                # CPU% = (delta_ticks / hz) / delta_seconds * 100
                cpu_pct = ((ticks - prev_ticks) / hz) / (now - prev_time) * 100
            prev_pid, prev_ticks, prev_time = pid, ticks, now
            if not comparable:
                # First sample for this process: baseline only, no decision.
                time.sleep(POLL_INTERVAL)
                continue

        if cpu_pct > CPU_THRESHOLD:
            idle_since = None
            if not xmrig_paused:
                log(f"llama-server busy ({cpu_pct:.0f}% CPU) — stopping xmrig")
                if systemctl("stop", "xmrig"):
                    xmrig_paused = True
        elif xmrig_paused:
            if idle_since is None:
                # First idle sample since the last busy one: start the timer.
                idle_since = now
            elif now - idle_since >= GRACE_PERIOD:
                log(f"llama-server idle ({cpu_pct:.0f}% CPU) past grace period — starting xmrig")
                # Stay in the paused state if the start fails, so the next
                # poll retries it.
                if systemctl("start", "xmrig"):
                    xmrig_paused = False
                    idle_since = None

        time.sleep(POLL_INTERVAL)


if __name__ == "__main__":
    main()

View File

@@ -4,19 +4,15 @@
pkgs,
...
}:
lib.mkIf config.services.llama-cpp.enable {
systemd.services.llama-cpp-xmrig-pause = {
description = "Pause xmrig while llama-cpp is processing requests";
after = [
"llama-cpp.service"
"xmrig.service"
];
lib.mkIf config.services.xmrig.enable {
systemd.services.xmrig-auto-pause = {
description = "Auto-pause xmrig when other services need CPU";
after = [ "xmrig.service" ];
wantedBy = [ "multi-user.target" ];
serviceConfig = {
ExecStart = "${pkgs.python3}/bin/python3 ${./llama-cpp-xmrig-pause.py}";
ExecStart = "${pkgs.python3}/bin/python3 ${./xmrig-auto-pause.py}";
Restart = "always";
RestartSec = "10s";
# Needs /proc access (default) and AF_UNIX for systemctl
NoNewPrivileges = true;
ProtectHome = true;
ProtectSystem = "strict";
@@ -28,8 +24,8 @@ lib.mkIf config.services.llama-cpp.enable {
};
environment = {
POLL_INTERVAL = "3";
GRACE_PERIOD = "10";
CPU_THRESHOLD = "50";
GRACE_PERIOD = "15";
CPU_THRESHOLD = "5";
};
};
}

View File

@@ -0,0 +1,131 @@
#!/usr/bin/env python3
"""
Auto-pause xmrig when other services need CPU.
Monitors non-nice CPU usage from /proc/stat. Since xmrig runs at Nice=19,
its CPU time lands in the 'nice' column and is excluded from the metric.
When real workload (user + system + irq + softirq) exceeds the threshold,
stops xmrig. When it drops below threshold for GRACE_PERIOD seconds,
restarts xmrig.
This replaces per-service pause scripts with a single general-purpose
monitor that handles any CPU-intensive workload (gitea workers, llama-cpp
inference, etc.) without needing to know about specific processes.
Why scheduler priority alone isn't enough:
Nice=19 / SCHED_IDLE only affects which thread gets the next time slice.
RandomX's 2MB-per-thread scratchpad (24MB across 12 threads) pollutes
the shared 32MB L3 cache, and its memory access pattern saturates DRAM
bandwidth. Other services run slower even though they aren't denied CPU
time. The only fix is to stop xmrig entirely when real work is happening.
"""
import os
import subprocess
import sys
import time
# Runtime tunables. Each one can be overridden through an environment
# variable of the same name; the second argument is the built-in default.

# Seconds to sleep between two consecutive samples (parsed as an integer).
POLL_INTERVAL = int(os.getenv("POLL_INTERVAL", "3"))

# Seconds the real workload must stay at or below the threshold before xmrig
# is started again.
GRACE_PERIOD = float(os.getenv("GRACE_PERIOD", "15"))

# Share of all CPU ticks (in percent) that non-nice work must exceed to
# trigger a pause. On a 12-thread machine one fully loaded core is about
# 8.3% of the total, so the 5% default reacts to anything using more than
# roughly 60% of a single core.
CPU_THRESHOLD = float(os.getenv("CPU_THRESHOLD", "5"))
def log(msg):
    """Write *msg* to stderr as one line carrying this script's tag.

    The stream is flushed on every call so the line is emitted immediately
    rather than sitting in a buffer.
    """
    tagged = f"[xmrig-auto-pause] {msg}"
    print(tagged, file=sys.stderr, flush=True)
def read_cpu_ticks(stat_path="/proc/stat"):
    """Read the aggregate CPU tick counters from a ``/proc/stat``-style file.

    Only the first line is used. Its first token is skipped and the next
    eight are parsed as the user, nice, system, idle, iowait, irq, softirq
    and steal counters.

    Args:
        stat_path: File to read. Defaults to the live ``/proc/stat``, so
            existing zero-argument callers behave as before.

    Returns:
        A ``(total_ticks, real_work_ticks)`` tuple. ``total_ticks`` is the
        sum of all eight counters. ``real_work_ticks`` is
        user + system + irq + softirq, i.e. it leaves out nice (where
        xmrig's time is accounted because it runs at Nice=19), idle, iowait
        and steal.

    Raises:
        OSError: If *stat_path* cannot be opened.
        ValueError: If the first line has fewer than eight counters or one
            of them is not an integer.
    """
    # Close the file before parsing; only the first line is needed.
    with open(stat_path) as f:
        parts = f.readline().split()
    # cpu user nice system idle iowait irq softirq steal
    user, nice, system, idle, iowait, irq, softirq, steal = map(int, parts[1:9])
    total = user + nice + system + idle + iowait + irq + softirq + steal
    real_work = user + system + irq + softirq
    return total, real_work
def is_active(unit):
    """Return ``True`` when ``systemctl is-active --quiet <unit>`` exits 0.

    Output is captured and discarded; only the exit status is used.
    """
    command = ["systemctl", "is-active", "--quiet", unit]
    exit_status = subprocess.run(command, capture_output=True).returncode
    return exit_status == 0
def systemctl(action, unit):
    """Run ``systemctl <action> <unit>`` and report whether it succeeded.

    Output is captured rather than inherited. On a non-zero exit status the
    return code and the captured stderr are logged.

    Returns:
        ``True`` if systemctl exited with status 0, ``False`` otherwise.
    """
    completed = subprocess.run(
        ["systemctl", action, unit], capture_output=True, text=True
    )
    if completed.returncode == 0:
        return True
    log(f"systemctl {action} {unit} failed (rc={completed.returncode}): {completed.stderr.strip()}")
    return False
def main():
    """Poll system-wide CPU usage forever, pausing xmrig under real load.

    Each iteration reads the aggregate tick counters and computes which
    share of the ticks elapsed since the previous poll went to real
    (non-nice) work.

    * Above ``CPU_THRESHOLD``: xmrig is stopped, but only if it is currently
      active and this loop has not already stopped it.
    * At or below the threshold while xmrig was stopped by this loop: a grace
      timer starts, and xmrig is started again once ``GRACE_PERIOD`` seconds
      have passed without a busy sample.

    This function never returns.
    """
    paused_by_us = False
    idle_since = None
    last_sample = None  # (total_ticks, real_work_ticks) from the previous poll

    log(f"Starting: poll={POLL_INTERVAL}s grace={GRACE_PERIOD}s threshold={CPU_THRESHOLD}%")

    while True:
        sample = read_cpu_ticks()
        previous, last_sample = last_sample, sample

        # Without a baseline, or when the total counter did not advance,
        # there is nothing to measure; just wait for the next poll.
        elapsed_ticks = 0 if previous is None else sample[0] - previous[0]
        if elapsed_ticks <= 0:
            time.sleep(POLL_INTERVAL)
            continue

        real_work_pct = ((sample[1] - previous[1]) / elapsed_ticks) * 100

        if real_work_pct > CPU_THRESHOLD:
            idle_since = None
            # Take ownership only when xmrig is really running. If something
            # else stopped it (e.g. a UPS battery hook), leave it alone;
            # otherwise this loop would wrongly start it again later.
            if not paused_by_us and is_active("xmrig.service"):
                log(f"Real workload detected ({real_work_pct:.1f}% CPU) — stopping xmrig")
                if systemctl("stop", "xmrig.service"):
                    paused_by_us = True
        elif paused_by_us:
            if idle_since is None:
                # First quiet sample since the last busy one: start the timer.
                idle_since = time.monotonic()
            elif time.monotonic() - idle_since >= GRACE_PERIOD:
                log(f"Workload ended ({real_work_pct:.1f}% CPU) past grace period — starting xmrig")
                if systemctl("start", "xmrig.service"):
                    paused_by_us = False
                    idle_since = None

        time.sleep(POLL_INTERVAL)


if __name__ == "__main__":
    main()