fix xmrig pause

This commit is contained in:
2026-04-03 14:39:20 -04:00
parent d4d01d63f1
commit daf82c16ba
8 changed files with 263 additions and 298 deletions

View File

@@ -63,6 +63,7 @@
./services/monero.nix
./services/p2pool.nix
./services/xmrig.nix
./services/xmrig-auto-pause.nix
./services/graphing-calculator.nix

View File

@@ -1,6 +1,5 @@
{
imports = [
./llama-cpp.nix
./llama-cpp-xmrig-pause.nix
];
}

View File

@@ -1,123 +0,0 @@
#!/usr/bin/env python3
"""
Pause xmrig while llama-cpp is processing inference requests.
Checks if the llama-server process is actively using CPU by reading
/proc/<pid>/stat. When CPU usage exceeds the threshold, stops xmrig.
When CPU drops below threshold for GRACE_PERIOD seconds, restarts xmrig.
"""
import glob
import os
import subprocess
import sys
import time
# Seconds to sleep between two consecutive CPU samples.
POLL_INTERVAL = int(os.getenv("POLL_INTERVAL", "3"))
# Seconds llama-server must stay below the threshold before xmrig restarts.
GRACE_PERIOD = float(os.getenv("GRACE_PERIOD", "10"))
# Busy threshold as a percentage of one core. An idle llama-server sits at
# roughly 0% CPU, whereas active inference saturates several cores at once.
CPU_THRESHOLD = float(os.getenv("CPU_THRESHOLD", "50"))
def log(msg):
    """Write *msg* to stderr behind the service tag and flush immediately."""
    sys.stderr.write(f"[llama-cpp-xmrig-pause] {msg}\n")
    sys.stderr.flush()
def find_llama_pid():
    """Return the PID of a process whose comm name is ``llama-server``.

    Scans every ``/proc/<pid>/comm`` entry and returns the first match as
    an int, or ``None`` when no such process is found. Entries that cannot
    be read or parsed (for example because the process exited mid-scan)
    are skipped.
    """
    for comm_path in glob.glob("/proc/[0-9]*/comm"):
        try:
            with open(comm_path) as handle:
                name = handle.read().strip()
            if name != "llama-server":
                continue
            # Path looks like /proc/<pid>/comm, so component 2 is the PID.
            return int(comm_path.split("/")[2])
        except (OSError, ValueError):
            continue
    return None
def get_cpu_times(pid):
    """Return utime + stime (in clock ticks) for *pid*, or ``None``.

    The value is read from ``/proc/<pid>/stat``. ``None`` is returned when
    the file cannot be read or does not have the expected layout.
    """
    stat_path = f"/proc/{pid}/stat"
    try:
        with open(stat_path) as handle:
            raw = handle.read()
        # Parse only what follows the last ')' so that a process name
        # containing spaces or parentheses cannot shift the field offsets.
        tail = raw.rpartition(")")[2].split()
        # Counting from 0 after the ')': index 11 is utime, 12 is stime.
        utime, stime = tail[11], tail[12]
        return int(utime) + int(stime)
    except (OSError, IndexError, ValueError):
        return None
def systemctl(action, unit):
    """Run ``systemctl <action> <unit>`` and report whether it succeeded.

    On a non-zero exit status the return code and captured stderr are
    logged. Returns ``True`` on exit status 0, ``False`` otherwise.
    """
    proc = subprocess.run(["systemctl", action, unit], capture_output=True, text=True)
    ok = proc.returncode == 0
    if not ok:
        log(f"systemctl {action} {unit} failed (rc={proc.returncode}): {proc.stderr.strip()}")
    return ok
def main():
    """Poll llama-server CPU usage forever and stop/start xmrig accordingly.

    Each cycle samples the llama-server process's accumulated CPU ticks and
    converts the delta since the previous sample into a per-core CPU
    percentage. Above CPU_THRESHOLD xmrig is stopped; once usage has stayed
    at or below the threshold for GRACE_PERIOD seconds, xmrig is started
    again. Only an xmrig that this loop stopped is ever restarted.

    A missing llama-server counts as idle (0% CPU), so an xmrig paused by
    this loop is still resumed after the grace period if llama-server exits
    mid-pause. A sample is only compared against the previous one when both
    belong to the same PID, so a restarted llama-server cannot produce a
    bogus delta.

    Never returns.
    """
    xmrig_paused = False
    idle_since = None
    prev_pid = None
    prev_ticks = None
    prev_time = None
    hz = os.sysconf("SC_CLK_TCK")
    log(f"Starting: poll={POLL_INTERVAL}s grace={GRACE_PERIOD}s threshold={CPU_THRESHOLD}%")
    while True:
        pid = find_llama_pid()
        if pid is None:
            # llama-server not running: drop the baseline and treat the
            # absent process as idle so a paused xmrig can be resumed.
            now = time.monotonic()
            prev_pid = prev_ticks = prev_time = None
            cpu_pct = 0.0
        else:
            ticks = get_cpu_times(pid)
            now = time.monotonic()
            comparable = (
                ticks is not None
                and prev_ticks is not None
                and prev_time is not None
                and pid == prev_pid
                and now > prev_time
            )
            if comparable:
                # CPU% = (delta_ticks / hz) / delta_seconds * 100
                cpu_pct = ((ticks - prev_ticks) / hz) / (now - prev_time) * 100
            else:
                # First sample for this PID (or unreadable stat): only
                # record a baseline this cycle.
                cpu_pct = None
            prev_pid, prev_ticks, prev_time = pid, ticks, now
        if cpu_pct is None:
            time.sleep(POLL_INTERVAL)
            continue
        if cpu_pct > CPU_THRESHOLD:
            idle_since = None
            if not xmrig_paused:
                log(f"llama-server busy ({cpu_pct:.0f}% CPU) — stopping xmrig")
                if systemctl("stop", "xmrig"):
                    xmrig_paused = True
        elif xmrig_paused:
            if idle_since is None:
                # First idle sample after a busy period: start the grace timer.
                idle_since = now
            elif now - idle_since >= GRACE_PERIOD:
                log(f"llama-server idle ({cpu_pct:.0f}% CPU) past grace period — starting xmrig")
                # On failure the flags are left untouched so the start is
                # retried on the next poll.
                if systemctl("start", "xmrig"):
                    xmrig_paused = False
                    idle_since = None
        time.sleep(POLL_INTERVAL)


if __name__ == "__main__":
    main()

View File

@@ -4,19 +4,15 @@
pkgs,
...
}:
lib.mkIf config.services.llama-cpp.enable {
systemd.services.llama-cpp-xmrig-pause = {
description = "Pause xmrig while llama-cpp is processing requests";
after = [
"llama-cpp.service"
"xmrig.service"
];
lib.mkIf config.services.xmrig.enable {
systemd.services.xmrig-auto-pause = {
description = "Auto-pause xmrig when other services need CPU";
after = [ "xmrig.service" ];
wantedBy = [ "multi-user.target" ];
serviceConfig = {
ExecStart = "${pkgs.python3}/bin/python3 ${./llama-cpp-xmrig-pause.py}";
ExecStart = "${pkgs.python3}/bin/python3 ${./xmrig-auto-pause.py}";
Restart = "always";
RestartSec = "10s";
# Needs /proc access (default) and AF_UNIX for systemctl
NoNewPrivileges = true;
ProtectHome = true;
ProtectSystem = "strict";
@@ -28,8 +24,8 @@ lib.mkIf config.services.llama-cpp.enable {
};
environment = {
POLL_INTERVAL = "3";
GRACE_PERIOD = "10";
CPU_THRESHOLD = "50";
GRACE_PERIOD = "15";
CPU_THRESHOLD = "5";
};
};
}

View File

@@ -0,0 +1,131 @@
#!/usr/bin/env python3
"""
Auto-pause xmrig when other services need CPU.
Monitors non-nice CPU usage from /proc/stat. Since xmrig runs at Nice=19,
its CPU time lands in the 'nice' column and is excluded from the metric.
When real workload (user + system + irq + softirq) exceeds the threshold,
stops xmrig. When it drops below threshold for GRACE_PERIOD seconds,
restarts xmrig.
This replaces per-service pause scripts with a single general-purpose
monitor that handles any CPU-intensive workload (gitea workers, llama-cpp
inference, etc.) without needing to know about specific processes.
Why scheduler priority alone isn't enough:
Nice=19 / SCHED_IDLE only affects which thread gets the next time slice.
RandomX's 2MB-per-thread scratchpad (24MB across 12 threads) pollutes
the shared 32MB L3 cache, and its memory access pattern saturates DRAM
bandwidth. Other services run slower even though they aren't denied CPU
time. The only fix is to stop xmrig entirely when real work is happening.
"""
import os
import subprocess
import sys
import time
# Seconds to sleep between two consecutive /proc/stat samples.
POLL_INTERVAL = int(os.getenv("POLL_INTERVAL", "3"))
# Seconds the workload must stay below the threshold before xmrig restarts.
GRACE_PERIOD = float(os.getenv("GRACE_PERIOD", "15"))
# Share of all CPU ticks (in percent) that non-nice processes must consume
# to trigger a pause. On a 12-thread machine one fully loaded core is about
# 8.3% of the total, so the default of 5% trips on anything using more than
# roughly 60% of a single core.
CPU_THRESHOLD = float(os.getenv("CPU_THRESHOLD", "5"))
def log(msg):
    """Write *msg* to stderr behind the service tag and flush immediately."""
    sys.stderr.write(f"[xmrig-auto-pause] {msg}\n")
    sys.stderr.flush()
def read_cpu_ticks():
    """Sample the aggregate CPU counters from the first line of /proc/stat.

    Returns a ``(total_ticks, real_work_ticks)`` tuple. ``total_ticks`` is
    the sum of the first eight counters; ``real_work_ticks`` is
    user + system + irq + softirq, leaving out 'nice' (where xmrig's time
    is accounted), idle, iowait and steal.
    """
    with open("/proc/stat") as stat_file:
        fields = stat_file.readline().split()
    # Column order: cpu user nice system idle iowait irq softirq steal
    counters = [int(tok) for tok in fields[1:9]]
    user, _nice, system, _idle, _iowait, irq, softirq, _steal = counters
    total = sum(counters)
    real_work = user + system + irq + softirq
    return total, real_work
def is_active(unit):
    """Return ``True`` when ``systemctl is-active --quiet <unit>`` exits 0."""
    cmd = ["systemctl", "is-active", "--quiet", unit]
    return subprocess.run(cmd, capture_output=True).returncode == 0
def systemctl(action, unit):
    """Run ``systemctl <action> <unit>`` and report whether it succeeded.

    On a non-zero exit status the return code and captured stderr are
    logged. Returns ``True`` on exit status 0, ``False`` otherwise.
    """
    proc = subprocess.run(["systemctl", action, unit], capture_output=True, text=True)
    ok = proc.returncode == 0
    if not ok:
        log(f"systemctl {action} {unit} failed (rc={proc.returncode}): {proc.stderr.strip()}")
    return ok
def main():
    """Poll system-wide non-nice CPU usage forever and stop/start xmrig.

    Each cycle samples /proc/stat and converts the tick deltas since the
    previous sample into the percentage of CPU time spent on real work.
    Above CPU_THRESHOLD xmrig is stopped, but only if it is currently
    active. Once usage has stayed at or below the threshold for
    GRACE_PERIOD seconds, an xmrig that this loop stopped is started again.

    Never returns.
    """
    paused_by_us = False
    idle_since = None
    last_total = None
    last_work = None
    log(f"Starting: poll={POLL_INTERVAL}s grace={GRACE_PERIOD}s threshold={CPU_THRESHOLD}%")
    while True:
        total, work = read_cpu_ticks()
        # The first sample, or one with no elapsed ticks, only refreshes
        # the baseline below.
        if last_total is not None and total - last_total > 0:
            real_work_pct = ((work - last_work) / (total - last_total)) * 100
            if real_work_pct > CPU_THRESHOLD:
                idle_since = None
                # Take ownership only when xmrig is really running. If
                # something else stopped it (e.g. a UPS battery hook), stay
                # out of the way so it is not wrongly restarted later.
                if not paused_by_us and is_active("xmrig.service"):
                    log(f"Real workload detected ({real_work_pct:.1f}% CPU) — stopping xmrig")
                    if systemctl("stop", "xmrig.service"):
                        paused_by_us = True
            elif paused_by_us:
                if idle_since is None:
                    # First quiet sample after a busy period: start the timer.
                    idle_since = time.monotonic()
                elif time.monotonic() - idle_since >= GRACE_PERIOD:
                    log(f"Workload ended ({real_work_pct:.1f}% CPU) past grace period — starting xmrig")
                    if systemctl("start", "xmrig.service"):
                        paused_by_us = False
                        idle_since = None
        last_total, last_work = total, work
        time.sleep(POLL_INTERVAL)


if __name__ == "__main__":
    main()

View File

@@ -1,162 +0,0 @@
# NixOS VM test for llama-cpp-xmrig-pause.py.
# Runs a real llama-server with a tiny model next to a mock xmrig unit and
# checks that the monitor stops xmrig during inference and starts it again
# once the grace period has passed.
{
pkgs,
...
}:
let
# Monitor script under test (path relative to this test file).
script = ../services/llama-cpp/llama-cpp-xmrig-pause.py;
# Interpreter used to launch the monitor inside the VM.
python = pkgs.python3;
# SmolLM-135M Q2_K: 85MB, modern GGUFv3, generates ~30 tok/s on one CPU
# thread — slow enough that a 200-token request keeps the process busy for
# several seconds, fast enough that tests don't crawl.
tinyModel = pkgs.fetchurl {
url = "https://huggingface.co/QuantFactory/SmolLM-135M-GGUF/resolve/main/SmolLM-135M.Q2_K.gguf";
hash = "sha256-DX46drPNJILNba21xfY2tyE0/yPWgOhz43gJdeSYKh4=";
};
in
pkgs.testers.runNixOSTest {
name = "llama-cpp-xmrig-pause";
# Single VM. The packages below provide the commands the test script
# invokes: python3 (monitor), pgrep (procps), curl and llama-server.
nodes.machine =
{ pkgs, ... }:
{
environment.systemPackages = [
pkgs.python3
pkgs.procps
pkgs.curl
pkgs.llama-cpp
];
# Mock xmrig as a simple sleep process that can be stopped/started.
systemd.services.xmrig = {
description = "Mock xmrig miner";
serviceConfig = {
ExecStart = "${pkgs.coreutils}/bin/sleep infinity";
Type = "simple";
};
wantedBy = [ "multi-user.target" ];
};
};
# Python test driver. Scenarios, in order: idle baseline, stop on
# inference, stay stopped through the grace period, resume afterwards,
# back-to-back prompts, and sustained inference.
testScript = ''
import time
PORT = 18088
MODEL = "${tinyModel}"
PYTHON = "${python}/bin/python3"
SCRIPT = "${script}"
# Tuned for test speed while remaining realistic.
# POLL_INTERVAL=1 keeps detection latency low.
# GRACE_PERIOD=5 is long enough to verify "stays stopped" but short enough
# that the full test completes in ~2 minutes.
# CPU_THRESHOLD=10 is low because the VM has limited cores and the model
# is small but any active inference still saturates a core.
POLL_INTERVAL = "1"
GRACE_PERIOD = "5"
CPU_THRESHOLD = "10"
infer_counter = 0
def send_completion(n_predict=200):
"""Fire a completion request in the background via a transient systemd unit."""
global infer_counter
infer_counter += 1
name = f"infer-{infer_counter}"
machine.succeed(
f"systemd-run --unit={name} --property=Type=exec "
f"curl -sf -X POST http://127.0.0.1:{PORT}/completion "
f"-H 'Content-Type: application/json' "
f"-d '{{\"prompt\": \"Once upon a time in a land far away there lived\", \"n_predict\": {n_predict}}}'"
)
return name
def wait_inference_done(unit_name, timeout=60):
"""Wait for a background inference request to finish."""
machine.wait_until_fails(
f"systemctl is-active {unit_name}",
timeout=timeout,
)
start_all()
machine.wait_for_unit("multi-user.target")
machine.wait_for_unit("xmrig.service")
with subtest("Start llama-server"):
machine.succeed(
f"systemd-run --unit=llama-server "
# Single inference thread to maximise per-core CPU%, which is
# what the monitor measures. Keeps token generation slow enough
# (~30 tok/s) that a 200-token request sustains load for seconds.
f"llama-server --model {MODEL} --port {PORT} --ctx-size 512 -t 1 -np 1"
)
machine.wait_until_succeeds(
f"curl -sf http://127.0.0.1:{PORT}/health",
timeout=30,
)
machine.succeed("pgrep -x llama-server")
with subtest("Start pause monitor"):
machine.succeed(
f"systemd-run --unit=llama-xmrig-pause "
f"--setenv=POLL_INTERVAL={POLL_INTERVAL} "
f"--setenv=GRACE_PERIOD={GRACE_PERIOD} "
f"--setenv=CPU_THRESHOLD={CPU_THRESHOLD} "
f"{PYTHON} {SCRIPT}"
)
# The monitor needs two consecutive polls to compute a CPU delta.
# Wait for baseline to stabilise.
time.sleep(3)
with subtest("xmrig stays running while llama-server is idle"):
machine.succeed("systemctl is-active xmrig")
with subtest("xmrig stopped during prompt processing"):
unit = send_completion(n_predict=200)
machine.wait_until_fails("systemctl is-active xmrig", timeout=20)
with subtest("xmrig remains stopped during grace period after inference ends"):
wait_inference_done(unit)
# Inference just finished. The monitor will need 1-2 polls to detect
# idle, then the grace period starts. Checking 2s after completion
# is well within the 5s grace window.
time.sleep(2)
machine.fail("systemctl is-active xmrig")
with subtest("xmrig resumes after grace period expires"):
# Already idle since previous subtest. Grace period (5s) plus
# detection delay (~2 polls) means xmrig should restart within ~8s.
machine.wait_until_succeeds("systemctl is-active xmrig", timeout=15)
with subtest("Sequential prompts do not cause xmrig flapping"):
# First prompt stop xmrig
unit1 = send_completion(n_predict=200)
machine.wait_until_fails("systemctl is-active xmrig", timeout=20)
wait_inference_done(unit1)
# Brief idle gap shorter than grace period
time.sleep(2)
# Second prompt arrives before grace period expires, resetting it
unit2 = send_completion(n_predict=200)
time.sleep(3)
# xmrig must still be stopped
machine.fail("systemctl is-active xmrig")
wait_inference_done(unit2)
machine.wait_until_succeeds("systemctl is-active xmrig", timeout=15)
with subtest("xmrig stays stopped during sustained inference"):
unit = send_completion(n_predict=500)
machine.wait_until_fails("systemctl is-active xmrig", timeout=20)
# Stay busy longer than the grace period to prove continuous
# activity keeps xmrig stopped indefinitely.
time.sleep(8)
machine.fail("systemctl is-active xmrig")
wait_inference_done(unit)
machine.wait_until_succeeds("systemctl is-active xmrig", timeout=15)
'';
}

View File

@@ -30,7 +30,9 @@ in
# llama-cpp tests
llamaCppAnnotationsTest = handleTest ./llama-cpp-annotations.nix;
llamaCppXmrigPauseTest = handleTest ./llama-cpp-xmrig-pause.nix;
# xmrig auto-pause test
xmrigAutoPauseTest = handleTest ./xmrig-auto-pause.nix;
# ntfy alerts test
ntfyAlertsTest = handleTest ./ntfy-alerts.nix;

121
tests/xmrig-auto-pause.nix Normal file
View File

@@ -0,0 +1,121 @@
# NixOS VM test for xmrig-auto-pause.py.
# Runs the monitor next to a mock xmrig unit and uses a shell busy-loop as
# synthetic non-nice CPU load to check that xmrig is stopped under load and
# started again once the grace period has passed.
{
pkgs,
...
}:
let
# Monitor script under test (path relative to this test file).
script = ../services/xmrig-auto-pause.py;
# Interpreter used to launch the monitor inside the VM.
python = pkgs.python3;
in
pkgs.testers.runNixOSTest {
name = "xmrig-auto-pause";
# Single VM with python3 and procps installed system-wide.
nodes.machine =
{ pkgs, ... }:
{
environment.systemPackages = [
pkgs.python3
pkgs.procps
];
# Mock xmrig as a nice'd sleep process that can be stopped/started.
systemd.services.xmrig = {
description = "Mock xmrig miner";
serviceConfig = {
ExecStart = "${pkgs.coreutils}/bin/sleep infinity";
Type = "simple";
Nice = 19;
};
wantedBy = [ "multi-user.target" ];
};
};
# Python test driver. Scenarios, in order: idle baseline, stop on load,
# stay stopped through the grace period, resume afterwards, intermittent
# load, and sustained load.
testScript = ''
import time
PYTHON = "${python}/bin/python3"
SCRIPT = "${script}"
# Tuned for test VMs (1-2 cores).
# POLL_INTERVAL=1 keeps detection latency low.
# GRACE_PERIOD=5 is long enough to verify "stays stopped" but short
# enough that the full test completes in reasonable time.
# CPU_THRESHOLD=10 catches a single busy-loop on a 1-2 core VM.
POLL_INTERVAL = "1"
GRACE_PERIOD = "5"
CPU_THRESHOLD = "10"
def start_cpu_load(name):
"""Start a non-nice CPU burn as a transient systemd unit."""
machine.succeed(
f"systemd-run --unit={name} --property=Type=exec "
f"bash -c 'while true; do :; done'"
)
def stop_cpu_load(name):
machine.succeed(f"systemctl stop {name}")
start_all()
machine.wait_for_unit("multi-user.target")
machine.wait_for_unit("xmrig.service")
with subtest("Start auto-pause monitor"):
machine.succeed(
f"systemd-run --unit=xmrig-auto-pause "
f"--setenv=POLL_INTERVAL={POLL_INTERVAL} "
f"--setenv=GRACE_PERIOD={GRACE_PERIOD} "
f"--setenv=CPU_THRESHOLD={CPU_THRESHOLD} "
f"{PYTHON} {SCRIPT}"
)
# Monitor needs two consecutive polls to compute a CPU delta.
time.sleep(3)
with subtest("xmrig stays running while system is idle"):
machine.succeed("systemctl is-active xmrig")
with subtest("xmrig stopped when CPU load appears"):
start_cpu_load("cpu-load")
machine.wait_until_fails("systemctl is-active xmrig", timeout=20)
with subtest("xmrig remains stopped during grace period after load ends"):
stop_cpu_load("cpu-load")
# Load just stopped. Grace period is 5s. Check at 2s well within.
time.sleep(2)
machine.fail("systemctl is-active xmrig")
with subtest("xmrig resumes after grace period expires"):
# Already idle since previous subtest. Grace period (5s) plus
# detection delay (~2 polls) means xmrig should restart within ~8s.
machine.wait_until_succeeds("systemctl is-active xmrig", timeout=15)
with subtest("Intermittent load does not cause flapping"):
# First load stop xmrig
start_cpu_load("cpu-load-1")
machine.wait_until_fails("systemctl is-active xmrig", timeout=20)
stop_cpu_load("cpu-load-1")
# Brief idle gap shorter than grace period
time.sleep(2)
# Second load arrives before grace period expires
start_cpu_load("cpu-load-2")
time.sleep(3)
# xmrig must still be stopped
machine.fail("systemctl is-active xmrig")
stop_cpu_load("cpu-load-2")
machine.wait_until_succeeds("systemctl is-active xmrig", timeout=15)
with subtest("Sustained load keeps xmrig stopped"):
start_cpu_load("cpu-load-3")
machine.wait_until_fails("systemctl is-active xmrig", timeout=20)
# Stay busy longer than the grace period to prove continuous
# activity keeps xmrig stopped indefinitely.
time.sleep(8)
machine.fail("systemctl is-active xmrig")
stop_cpu_load("cpu-load-3")
machine.wait_until_succeeds("systemctl is-active xmrig", timeout=15)
'';
}