llama-cpp: xmrig + grafana hooks
This commit is contained in:
@@ -2,42 +2,51 @@
|
||||
"""
|
||||
Pause xmrig while llama-cpp is processing inference requests.
|
||||
|
||||
Polls llama-cpp /slots endpoint. When any slot is busy, stops xmrig.
|
||||
When all slots are idle for GRACE_PERIOD seconds, restarts xmrig.
|
||||
If llama-cpp is unreachable, does nothing (leaves xmrig in its current state).
|
||||
Checks if the llama-server process is actively using CPU by reading
|
||||
/proc/<pid>/stat. When CPU usage exceeds the threshold, stops xmrig.
|
||||
When CPU drops below threshold for GRACE_PERIOD seconds, restarts xmrig.
|
||||
"""
|
||||
|
||||
import json
|
||||
import glob
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
import urllib.request
|
||||
|
||||
# Base URL of the llama-cpp server, read from the required LLAMA_CPP_URL
# environment variable (KeyError at import time if unset); trailing "/" removed.
LLAMA_CPP_URL = os.environ["LLAMA_CPP_URL"].rstrip("/")
|
||||
# Seconds to sleep between polls; parsed as an int (default 3).
POLL_INTERVAL = int(os.environ.get("POLL_INTERVAL", "3"))
|
||||
# Seconds llama-server must stay idle before xmrig is started again;
# parsed as a float (default 10).
GRACE_PERIOD = float(os.environ.get("GRACE_PERIOD", "10"))
|
||||
# CPU percentage (per-core) above which llama-server is considered busy.
|
||||
# Idle llama-server uses ~0% CPU; active inference saturates multiple cores.
|
||||
# Overridable through the CPU_THRESHOLD environment variable (default 50).
CPU_THRESHOLD = float(os.environ.get("CPU_THRESHOLD", "50"))
|
||||
|
||||
|
||||
def log(msg):
    """Write *msg* to stderr with the daemon's tag prefix, flushing at once."""
    line = f"[llama-cpp-xmrig-pause] {msg}"
    print(line, file=sys.stderr, flush=True)
|
||||
|
||||
|
||||
def get_slots():
|
||||
"""Fetch /slots from llama-cpp. Returns list of slot dicts, or None on error."""
|
||||
req = urllib.request.Request(f"{LLAMA_CPP_URL}/slots")
|
||||
def find_llama_pid(name="llama-server", proc_root="/proc"):
    """Find the PID of the llama-server process.

    Scans ``<proc_root>/<pid>/comm`` for entries whose content, stripped of
    surrounding whitespace, equals *name*.

    Args:
        name: Command name to look for. Defaults to ``"llama-server"``.
            The kernel truncates ``comm`` to 15 characters (see proc(5)),
            so a longer name can never match.
        proc_root: Directory laid out like procfs. Defaults to ``"/proc"``.

    Returns:
        The lowest matching PID as an int, or None when nothing matches.
        Picking the lowest PID (rather than whichever entry glob happens to
        yield first) keeps the answer stable between calls when several
        matching processes exist.
    """
    pattern = os.path.join(glob.escape(proc_root), "[0-9]*", "comm")
    matches = []
    for path in glob.glob(pattern):
        try:
            # The parent directory name is the PID; a name that merely starts
            # with a digit raises ValueError and is skipped below.
            pid = int(os.path.basename(os.path.dirname(path)))
            with open(path) as f:
                if f.read().strip() == name:
                    matches.append(pid)
        except (OSError, ValueError):
            # Process exited mid-scan, entry unreadable, or not a PID.
            continue
    return min(matches, default=None)
|
||||
|
||||
|
||||
def get_cpu_times(pid, proc_root="/proc"):
    """Read utime + stime from /proc/<pid>/stat. Returns total ticks or None.

    Args:
        pid: Process ID whose stat file is read.
        proc_root: Directory laid out like procfs. Defaults to ``"/proc"``.

    Returns:
        ``utime + stime`` as an int (clock ticks), or None when the file
        cannot be read or does not have the expected layout.
    """
    try:
        with open(f"{proc_root}/{pid}/stat") as f:
            content = f.read()
    except (OSError, ValueError):
        # Process gone, permission denied, or undecodable content.
        return None

    # The command name is wrapped in parentheses and may itself contain
    # spaces or ')', so only the text after the LAST ')' is parsed.
    _, closing, tail = content.rpartition(")")
    if not closing:
        # No ')' at all: not a stat line, indices below would be meaningless.
        return None

    fields = tail.split()
    try:
        # fields[11] = utime, fields[12] = stime (0-indexed after ')')
        return int(fields[11]) + int(fields[12])
    except (IndexError, ValueError):
        return None
|
||||
|
||||
|
||||
def any_slot_busy(slots):
    """Return True if at least one slot dict has a truthy ``is_processing``."""
    for slot in slots:
        if slot.get("is_processing", False):
            return True
    return False
|
||||
|
||||
|
||||
def systemctl(action, unit):
|
||||
result = subprocess.run(
|
||||
["systemctl", action, unit],
|
||||
@@ -51,35 +60,58 @@ def systemctl(action, unit):
|
||||
|
||||
def main():
|
||||
xmrig_paused = False
|
||||
idle_since = None # monotonic timestamp when slots first went idle
|
||||
idle_since = None
|
||||
prev_ticks = None
|
||||
prev_time = None
|
||||
hz = os.sysconf("SC_CLK_TCK")
|
||||
|
||||
log(f"Starting: url={LLAMA_CPP_URL} poll={POLL_INTERVAL}s grace={GRACE_PERIOD}s")
|
||||
log(f"Starting: poll={POLL_INTERVAL}s grace={GRACE_PERIOD}s threshold={CPU_THRESHOLD}%")
|
||||
|
||||
while True:
|
||||
slots = get_slots()
|
||||
|
||||
if slots is None:
|
||||
# llama-cpp unreachable — leave xmrig alone, reset idle timer
|
||||
pid = find_llama_pid()
|
||||
if pid is None:
|
||||
# llama-server not running
|
||||
idle_since = None
|
||||
prev_ticks = None
|
||||
prev_time = None
|
||||
time.sleep(POLL_INTERVAL)
|
||||
continue
|
||||
|
||||
busy = any_slot_busy(slots)
|
||||
ticks = get_cpu_times(pid)
|
||||
now = time.monotonic()
|
||||
|
||||
if ticks is None or prev_ticks is None or prev_time is None:
|
||||
prev_ticks = ticks
|
||||
prev_time = now
|
||||
time.sleep(POLL_INTERVAL)
|
||||
continue
|
||||
|
||||
dt = now - prev_time
|
||||
if dt <= 0:
|
||||
prev_ticks = ticks
|
||||
prev_time = now
|
||||
time.sleep(POLL_INTERVAL)
|
||||
continue
|
||||
|
||||
# CPU% = (delta_ticks / hz) / delta_seconds * 100
|
||||
cpu_pct = ((ticks - prev_ticks) / hz) / dt * 100
|
||||
prev_ticks = ticks
|
||||
prev_time = now
|
||||
|
||||
busy = cpu_pct > CPU_THRESHOLD
|
||||
|
||||
if busy:
|
||||
idle_since = None
|
||||
if not xmrig_paused:
|
||||
log("Slot busy — stopping xmrig")
|
||||
log(f"llama-server busy ({cpu_pct:.0f}% CPU) — stopping xmrig")
|
||||
if systemctl("stop", "xmrig"):
|
||||
xmrig_paused = True
|
||||
else:
|
||||
# llama-server is idle (CPU usage at or below CPU_THRESHOLD)
|
||||
if xmrig_paused:
|
||||
now = time.monotonic()
|
||||
if idle_since is None:
|
||||
idle_since = now
|
||||
elif now - idle_since >= GRACE_PERIOD:
|
||||
log("Slots idle past grace period — starting xmrig")
|
||||
log(f"llama-server idle ({cpu_pct:.0f}% CPU) past grace period — starting xmrig")
|
||||
if systemctl("start", "xmrig"):
|
||||
xmrig_paused = False
|
||||
idle_since = None
|
||||
|
||||
Reference in New Issue
Block a user