llama-cpp: pause xmrig during active inference requests
Add sidecar service that polls llama-cpp /slots endpoint every 3s. When any slot is processing, stops xmrig. Restarts xmrig after 10s grace period when all slots are idle. Handles unreachable llama-cpp gracefully (leaves xmrig untouched).
This commit is contained in:
@@ -65,6 +65,8 @@
|
|||||||
./services/p2pool.nix
|
./services/p2pool.nix
|
||||||
./services/xmrig.nix
|
./services/xmrig.nix
|
||||||
|
|
||||||
|
./services/llama-cpp-xmrig-pause.nix
|
||||||
|
|
||||||
# KEEP UNTIL 2028
|
# KEEP UNTIL 2028
|
||||||
./services/caddy_senior_project.nix
|
./services/caddy_senior_project.nix
|
||||||
|
|
||||||
|
|||||||
35
services/llama-cpp-xmrig-pause.nix
Normal file
35
services/llama-cpp-xmrig-pause.nix
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
{
  pkgs,
  service_configs,
  ...
}:
{
  # Sidecar that pauses xmrig while llama-cpp is serving inference requests.
  # The actual polling logic lives in ./llama-cpp-xmrig-pause.py.
  systemd.services.llama-cpp-xmrig-pause = {
    description = "Pause xmrig while llama-cpp is processing requests";
    after = [
      "network.target"
      "llama-cpp.service"
      "xmrig.service"
    ];
    wantedBy = [ "multi-user.target" ];

    # The script shells out to a bare `systemctl`; make sure it resolves on
    # the unit's PATH regardless of the default service environment.
    path = [ pkgs.systemd ];

    serviceConfig = {
      ExecStart = "${pkgs.python3}/bin/python3 ${./llama-cpp-xmrig-pause.py}";
      Restart = "always";
      RestartSec = "10s";

      # Hardening: the script only needs to talk HTTP to localhost and to
      # systemd; it never touches home dirs or writes to the filesystem.
      NoNewPrivileges = true;
      ProtectHome = true;
      ProtectSystem = "strict";
      PrivateTmp = true;
      RestrictAddressFamilies = [
        "AF_INET"
        "AF_INET6"
      ];
      MemoryDenyWriteExecute = true;
    };

    environment = {
      LLAMA_CPP_URL = "http://127.0.0.1:${toString service_configs.ports.private.llama_cpp.port}";
      POLL_INTERVAL = "3"; # seconds between /slots polls
      GRACE_PERIOD = "10"; # seconds of idle before xmrig is restarted
    };
  };
}
|
||||||
91
services/llama-cpp-xmrig-pause.py
Normal file
91
services/llama-cpp-xmrig-pause.py
Normal file
@@ -0,0 +1,91 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Pause xmrig while llama-cpp is processing inference requests.
|
||||||
|
|
||||||
|
Polls llama-cpp /slots endpoint. When any slot is busy, stops xmrig.
|
||||||
|
When all slots are idle for GRACE_PERIOD seconds, restarts xmrig.
|
||||||
|
If llama-cpp is unreachable, does nothing (leaves xmrig in its current state).
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
import urllib.request
|
||||||
|
|
||||||
|
LLAMA_CPP_URL = os.environ["LLAMA_CPP_URL"].rstrip("/")
|
||||||
|
POLL_INTERVAL = int(os.environ.get("POLL_INTERVAL", "3"))
|
||||||
|
GRACE_PERIOD = float(os.environ.get("GRACE_PERIOD", "10"))
|
||||||
|
|
||||||
|
|
||||||
|
def log(msg):
    """Emit a tagged status line on stderr (captured by journald)."""
    sys.stderr.write(f"[llama-cpp-xmrig-pause] {msg}\n")
    sys.stderr.flush()
|
||||||
|
|
||||||
|
|
||||||
|
def get_slots():
    """Fetch /slots from llama-cpp.

    Returns the decoded JSON payload (a list of per-slot dicts) on success,
    or None when llama-cpp is unreachable or returns an invalid body.
    """
    req = urllib.request.Request(f"{LLAMA_CPP_URL}/slots")
    try:
        with urllib.request.urlopen(req, timeout=5) as resp:
            return json.loads(resp.read())
    # URLError/HTTPError subclass OSError and json.JSONDecodeError subclasses
    # ValueError, so (OSError, ValueError) covers every expected failure.
    # The original caught urllib.error.URLError, but urllib.error is never
    # imported here — it only resolved because urllib.request imports it as a
    # side effect; this form removes that fragile dependency.
    except (OSError, ValueError) as exc:
        log(f"Cannot reach llama-cpp: {exc}")
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def any_slot_busy(slots):
    """Return True when at least one slot reports it is processing a request."""
    for slot in slots:
        if slot.get("is_processing", False):
            return True
    return False
|
||||||
|
|
||||||
|
|
||||||
|
def systemctl(action, unit):
    """Run `systemctl <action> <unit>`; log on failure, return True iff rc == 0."""
    proc = subprocess.run(
        ["systemctl", action, unit],
        capture_output=True,
        text=True,
    )
    ok = proc.returncode == 0
    if not ok:
        log(f"systemctl {action} {unit} failed (rc={proc.returncode}): {proc.stderr.strip()}")
    return ok
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Supervision loop.

    Poll llama-cpp every POLL_INTERVAL seconds. Stop xmrig as soon as a slot
    is busy; start it again only after the slots have been idle for a full
    GRACE_PERIOD. When llama-cpp is unreachable, leave xmrig untouched.
    """
    paused = False       # True while we have stopped xmrig ourselves
    idle_start = None    # monotonic time when the slots last became idle

    log(f"Starting: url={LLAMA_CPP_URL} poll={POLL_INTERVAL}s grace={GRACE_PERIOD}s")

    while True:
        slots = get_slots()

        if slots is None:
            # Unreachable: don't touch xmrig, and restart the idle clock.
            idle_start = None
        elif any_slot_busy(slots):
            idle_start = None
            if not paused:
                log("Slot busy — stopping xmrig")
                # Only mark paused once the stop actually succeeded; on
                # failure we retry on the next poll.
                paused = systemctl("stop", "xmrig")
        elif paused:
            # Idle and currently paused: wait out the grace period, then resume.
            now = time.monotonic()
            if idle_start is None:
                idle_start = now
            elif now - idle_start >= GRACE_PERIOD:
                log("Slots idle past grace period — starting xmrig")
                if systemctl("start", "xmrig"):
                    paused = False
                    idle_start = None

        time.sleep(POLL_INTERVAL)
|
||||||
|
|
||||||
|
|
||||||
|
# Entry point when executed as a script (systemd runs this via ExecStart).
if __name__ == "__main__":
    main()
|
||||||
Reference in New Issue
Block a user