llama-cpp: pause xmrig during active inference requests
Add sidecar service that polls llama-cpp /slots endpoint every 3s. When any slot is processing, stops xmrig. Restarts xmrig after 10s grace period when all slots are idle. Handles unreachable llama-cpp gracefully (leaves xmrig untouched).
This commit is contained in:
@@ -65,6 +65,8 @@
|
|||||||
./services/p2pool.nix
|
./services/p2pool.nix
|
||||||
./services/xmrig.nix
|
./services/xmrig.nix
|
||||||
|
|
||||||
|
./services/llama-cpp-xmrig-pause.nix
|
||||||
|
|
||||||
# KEEP UNTIL 2028
|
# KEEP UNTIL 2028
|
||||||
./services/caddy_senior_project.nix
|
./services/caddy_senior_project.nix
|
||||||
|
|
||||||
|
|||||||
35
services/llama-cpp-xmrig-pause.nix
Normal file
35
services/llama-cpp-xmrig-pause.nix
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
{
  pkgs,
  service_configs,
  ...
}:
{
  # Sidecar that pauses xmrig while llama-cpp is serving inference requests.
  # The actual polling logic lives in ./llama-cpp-xmrig-pause.py.
  systemd.services.llama-cpp-xmrig-pause = {
    description = "Pause xmrig while llama-cpp is processing requests";
    after = [
      "network.target"
      "llama-cpp.service"
      "xmrig.service"
    ];
    wantedBy = [ "multi-user.target" ];

    # The script shells out to a bare `systemctl`; make sure it resolves on
    # the unit's PATH regardless of the default service environment.
    path = [ pkgs.systemd ];

    serviceConfig = {
      ExecStart = "${pkgs.python3}/bin/python3 ${./llama-cpp-xmrig-pause.py}";
      Restart = "always";
      RestartSec = "10s";

      # Hardening: the script only needs to talk HTTP to localhost and to
      # systemd; it never touches home dirs or writes to the filesystem.
      NoNewPrivileges = true;
      ProtectHome = true;
      ProtectSystem = "strict";
      PrivateTmp = true;
      RestrictAddressFamilies = [
        "AF_INET"
        "AF_INET6"
      ];
      MemoryDenyWriteExecute = true;
    };

    environment = {
      LLAMA_CPP_URL = "http://127.0.0.1:${toString service_configs.ports.private.llama_cpp.port}";
      POLL_INTERVAL = "3"; # seconds between /slots polls
      GRACE_PERIOD = "10"; # seconds of idle before xmrig is restarted
    };
  };
}
|
||||||
91
services/llama-cpp-xmrig-pause.py
Normal file
91
services/llama-cpp-xmrig-pause.py
Normal file
@@ -0,0 +1,91 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Pause xmrig while llama-cpp is processing inference requests.
|
||||||
|
|
||||||
|
Polls llama-cpp /slots endpoint. When any slot is busy, stops xmrig.
|
||||||
|
When all slots are idle for GRACE_PERIOD seconds, restarts xmrig.
|
||||||
|
If llama-cpp is unreachable, does nothing (leaves xmrig in its current state).
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
import urllib.request
|
||||||
|
|
||||||
|
LLAMA_CPP_URL = os.environ["LLAMA_CPP_URL"].rstrip("/")
|
||||||
|
POLL_INTERVAL = int(os.environ.get("POLL_INTERVAL", "3"))
|
||||||
|
GRACE_PERIOD = float(os.environ.get("GRACE_PERIOD", "10"))
|
||||||
|
|
||||||
|
|
||||||
|
def log(msg):
    """Emit a tagged status line on stderr (captured by journald)."""
    sys.stderr.write(f"[llama-cpp-xmrig-pause] {msg}\n")
    sys.stderr.flush()
|
||||||
|
|
||||||
|
|
||||||
|
def get_slots():
    """Fetch /slots from llama-cpp.

    Returns the decoded JSON payload (a list of per-slot dicts) on success,
    or None when llama-cpp is unreachable or returns an invalid body.
    """
    req = urllib.request.Request(f"{LLAMA_CPP_URL}/slots")
    try:
        with urllib.request.urlopen(req, timeout=5) as resp:
            return json.loads(resp.read())
    # URLError/HTTPError subclass OSError and json.JSONDecodeError subclasses
    # ValueError, so (OSError, ValueError) covers every expected failure.
    # The original caught urllib.error.URLError, but urllib.error is never
    # imported here — it only resolved because urllib.request imports it as a
    # side effect; this form removes that fragile dependency.
    except (OSError, ValueError) as exc:
        log(f"Cannot reach llama-cpp: {exc}")
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def any_slot_busy(slots):
    """Return True when at least one slot reports it is processing a request."""
    for slot in slots:
        if slot.get("is_processing", False):
            return True
    return False
|
||||||
|
|
||||||
|
|
||||||
|
def systemctl(action, unit):
    """Run `systemctl <action> <unit>`; log on failure, return True iff rc == 0."""
    proc = subprocess.run(
        ["systemctl", action, unit],
        capture_output=True,
        text=True,
    )
    ok = proc.returncode == 0
    if not ok:
        log(f"systemctl {action} {unit} failed (rc={proc.returncode}): {proc.stderr.strip()}")
    return ok
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Supervision loop.

    Poll llama-cpp every POLL_INTERVAL seconds. Stop xmrig as soon as a slot
    is busy; start it again only after the slots have been idle for a full
    GRACE_PERIOD. When llama-cpp is unreachable, leave xmrig untouched.
    """
    paused = False       # True while we have stopped xmrig ourselves
    idle_start = None    # monotonic time when the slots last became idle

    log(f"Starting: url={LLAMA_CPP_URL} poll={POLL_INTERVAL}s grace={GRACE_PERIOD}s")

    while True:
        slots = get_slots()

        if slots is None:
            # Unreachable: don't touch xmrig, and restart the idle clock.
            idle_start = None
        elif any_slot_busy(slots):
            idle_start = None
            if not paused:
                log("Slot busy — stopping xmrig")
                # Only mark paused once the stop actually succeeded; on
                # failure we retry on the next poll.
                paused = systemctl("stop", "xmrig")
        elif paused:
            # Idle and currently paused: wait out the grace period, then resume.
            now = time.monotonic()
            if idle_start is None:
                idle_start = now
            elif now - idle_start >= GRACE_PERIOD:
                log("Slots idle past grace period — starting xmrig")
                if systemctl("start", "xmrig"):
                    paused = False
                    idle_start = None

        time.sleep(POLL_INTERVAL)
|
||||||
|
|
||||||
|
|
||||||
|
# Entry point when executed as a script (systemd runs this via ExecStart).
if __name__ == "__main__":
    main()
|
||||||
Reference in New Issue
Block a user