From df15be01eac8b411b24226ad86ecad6f5d09e35b Mon Sep 17 00:00:00 2001
From: Simon Gardling
Date: Thu, 2 Apr 2026 17:43:07 -0400
Subject: [PATCH] llama-cpp: pause xmrig during active inference requests

Add sidecar service that polls llama-cpp /slots endpoint every 3s.
When any slot is processing, stops xmrig. Restarts xmrig after 10s
grace period when all slots are idle. Handles unreachable llama-cpp
gracefully (leaves xmrig untouched).

The unit allows AF_UNIX in addition to AF_INET/AF_INET6 because the
script drives xmrig through systemctl, which talks to the service
manager over a Unix socket.
---
NOTE(review): this patch was rebuilt from a whitespace-collapsed copy.
Indentation and the blank context lines in the configuration.nix hunk are
inferred - verify them against the tree before applying. The blob ids on
the "index" lines predate the review changes and are stale.

 configuration.nix                  |   2 +
 services/llama-cpp-xmrig-pause.nix |  37 ++++++++++
 services/llama-cpp-xmrig-pause.py  | 114 ++++++++++++++++++++++++++++++
 3 files changed, 153 insertions(+)
 create mode 100644 services/llama-cpp-xmrig-pause.nix
 create mode 100644 services/llama-cpp-xmrig-pause.py

diff --git a/configuration.nix b/configuration.nix
index a49e9a7..676c9ef 100644
--- a/configuration.nix
+++ b/configuration.nix
@@ -65,6 +65,8 @@
     ./services/p2pool.nix
     ./services/xmrig.nix
 
+    ./services/llama-cpp-xmrig-pause.nix
+
     # KEEP UNTIL 2028
     ./services/caddy_senior_project.nix
 
diff --git a/services/llama-cpp-xmrig-pause.nix b/services/llama-cpp-xmrig-pause.nix
new file mode 100644
index 0000000..c5ee1e2
--- /dev/null
+++ b/services/llama-cpp-xmrig-pause.nix
@@ -0,0 +1,37 @@
+{
+  pkgs,
+  service_configs,
+  ...
+}:
+{
+  systemd.services.llama-cpp-xmrig-pause = {
+    description = "Pause xmrig while llama-cpp is processing requests";
+    after = [
+      "network.target"
+      "llama-cpp.service"
+      "xmrig.service"
+    ];
+    wantedBy = [ "multi-user.target" ];
+    serviceConfig = {
+      ExecStart = "${pkgs.python3}/bin/python3 ${./llama-cpp-xmrig-pause.py}";
+      Restart = "always";
+      RestartSec = "10s";
+      NoNewPrivileges = true;
+      ProtectHome = true;
+      ProtectSystem = "strict";
+      PrivateTmp = true;
+      RestrictAddressFamilies = [
+        "AF_INET"
+        "AF_INET6"
+        # systemctl reaches the service manager over a Unix socket
+        "AF_UNIX"
+      ];
+      MemoryDenyWriteExecute = true;
+    };
+    environment = {
+      LLAMA_CPP_URL = "http://127.0.0.1:${toString service_configs.ports.private.llama_cpp.port}";
+      POLL_INTERVAL = "3";
+      GRACE_PERIOD = "10";
+    };
+  };
+}
diff --git a/services/llama-cpp-xmrig-pause.py b/services/llama-cpp-xmrig-pause.py
new file mode 100644
index 0000000..7f816f2
--- /dev/null
+++ b/services/llama-cpp-xmrig-pause.py
@@ -0,0 +1,114 @@
+#!/usr/bin/env python3
+"""
+Pause xmrig while llama-cpp is processing inference requests.
+
+Polls llama-cpp /slots endpoint. When any slot is busy, stops xmrig.
+When all slots are idle for GRACE_PERIOD seconds, restarts xmrig.
+If llama-cpp is unreachable, does nothing (leaves xmrig in its current state).
+"""
+
+import http.client
+import json
+import os
+import subprocess
+import sys
+import time
+import urllib.error
+import urllib.request
+
+LLAMA_CPP_URL = os.environ["LLAMA_CPP_URL"].rstrip("/")
+POLL_INTERVAL = int(os.environ.get("POLL_INTERVAL", "3"))
+GRACE_PERIOD = float(os.environ.get("GRACE_PERIOD", "10"))
+
+
+def log(msg):
+    """Write a tagged message to stderr."""
+    print(f"[llama-cpp-xmrig-pause] {msg}", file=sys.stderr, flush=True)
+
+
+def get_slots():
+    """Fetch /slots from llama-cpp.
+
+    Returns the list of slot objects, or None when the request fails or the
+    response is not a JSON list.
+    """
+    req = urllib.request.Request(f"{LLAMA_CPP_URL}/slots")
+    try:
+        with urllib.request.urlopen(req, timeout=5) as resp:
+            slots = json.loads(resp.read())
+    except (
+        urllib.error.URLError,
+        http.client.HTTPException,
+        OSError,
+        ValueError,
+    ) as exc:
+        log(f"Cannot reach llama-cpp: {exc}")
+        return None
+    # Anything but a list (e.g. a JSON error object) is not slot data; report
+    # it as unavailable rather than letting the caller crash while iterating.
+    if not isinstance(slots, list):
+        log(f"Unexpected /slots payload: {type(slots).__name__}")
+        return None
+    return slots
+
+
+def any_slot_busy(slots):
+    """Return True if any slot dict reports is_processing as truthy."""
+    return any(
+        isinstance(s, dict) and s.get("is_processing", False) for s in slots
+    )
+
+
+def systemctl(action, unit):
+    """Run `systemctl <action> <unit>`; log failures. Returns True on success."""
+    result = subprocess.run(
+        ["systemctl", action, unit],
+        capture_output=True,
+        text=True,
+    )
+    if result.returncode != 0:
+        log(f"systemctl {action} {unit} failed (rc={result.returncode}): {result.stderr.strip()}")
+    return result.returncode == 0
+
+
+def main():
+    """Poll llama-cpp forever, stopping and restarting xmrig as slots change."""
+    xmrig_paused = False
+    idle_since = None  # monotonic timestamp when slots first went idle
+
+    log(f"Starting: url={LLAMA_CPP_URL} poll={POLL_INTERVAL}s grace={GRACE_PERIOD}s")
+
+    while True:
+        slots = get_slots()
+
+        if slots is None:
+            # llama-cpp unreachable — leave xmrig alone, reset idle timer
+            idle_since = None
+            time.sleep(POLL_INTERVAL)
+            continue
+
+        busy = any_slot_busy(slots)
+
+        if busy:
+            idle_since = None
+            if not xmrig_paused:
+                log("Slot busy — stopping xmrig")
+                if systemctl("stop", "xmrig"):
+                    xmrig_paused = True
+        else:
+            # All slots idle
+            if xmrig_paused:
+                now = time.monotonic()
+                if idle_since is None:
+                    idle_since = now
+                elif now - idle_since >= GRACE_PERIOD:
+                    log("Slots idle past grace period — starting xmrig")
+                    if systemctl("start", "xmrig"):
+                        xmrig_paused = False
+                        idle_since = None
+
+        time.sleep(POLL_INTERVAL)
+
+
+if __name__ == "__main__":
+    main()