From df15be01eac8b411b24226ad86ecad6f5d09e35b Mon Sep 17 00:00:00 2001
From: Simon Gardling <titaniumtown@proton.me>
Date: Thu, 2 Apr 2026 17:43:07 -0400
Subject: [PATCH] llama-cpp: pause xmrig during active inference requests

Add sidecar service that polls llama-cpp /slots endpoint every 3s.
When any slot is processing, stops xmrig. Restarts xmrig after 10s
grace period when all slots are idle. Handles unreachable llama-cpp
gracefully (leaves xmrig untouched).
---
 configuration.nix                  |  2 +
 services/llama-cpp-xmrig-pause.nix | 35 ++++++++++++
 services/llama-cpp-xmrig-pause.py  | 91 ++++++++++++++++++++++++++++++
 3 files changed, 128 insertions(+)
 create mode 100644 services/llama-cpp-xmrig-pause.nix
 create mode 100644 services/llama-cpp-xmrig-pause.py

diff --git a/configuration.nix b/configuration.nix
index a49e9a7..676c9ef 100644
--- a/configuration.nix
+++ b/configuration.nix
@@ -65,6 +65,8 @@
     ./services/p2pool.nix
     ./services/xmrig.nix
 
+    ./services/llama-cpp-xmrig-pause.nix
+
     # KEEP UNTIL 2028
     ./services/caddy_senior_project.nix
 
diff --git a/services/llama-cpp-xmrig-pause.nix b/services/llama-cpp-xmrig-pause.nix
new file mode 100644
index 0000000..c5ee1e2
--- /dev/null
+++ b/services/llama-cpp-xmrig-pause.nix
@@ -0,0 +1,35 @@
+{
+  pkgs,
+  service_configs,
+  ...
+}:
+{
+  # Sidecar unit: runs llama-cpp-xmrig-pause.py, which stops/starts xmrig via systemctl.
+  systemd.services.llama-cpp-xmrig-pause = {
+    description = "Pause xmrig while llama-cpp is processing requests";
+    after = [
+      "network.target"
+      "llama-cpp.service"
+      "xmrig.service"
+    ];
+    wantedBy = [ "multi-user.target" ];
+    serviceConfig = {
+      ExecStart = "${pkgs.python3}/bin/python3 ${./llama-cpp-xmrig-pause.py}";
+      Restart = "always";
+      RestartSec = "10s";
+      NoNewPrivileges = true;
+      ProtectHome = true;
+      ProtectSystem = "strict";
+      PrivateTmp = true;
+      # systemctl reaches the service manager over a unix socket, so AF_UNIX
+      # must be allowed alongside the inet families used to poll llama-cpp.
+      RestrictAddressFamilies = [ "AF_INET" "AF_INET6" "AF_UNIX" ];
+      MemoryDenyWriteExecute = true;
+    };
+    environment = {
+      LLAMA_CPP_URL = "http://127.0.0.1:${toString service_configs.ports.private.llama_cpp.port}";
+      POLL_INTERVAL = "3";
+      GRACE_PERIOD = "10";
+    };
+  };
+}
diff --git a/services/llama-cpp-xmrig-pause.py b/services/llama-cpp-xmrig-pause.py
new file mode 100644
index 0000000..7f816f2
--- /dev/null
+++ b/services/llama-cpp-xmrig-pause.py
@@ -0,0 +1,91 @@
+#!/usr/bin/env python3
+"""
+Pause xmrig while llama-cpp is processing inference requests.
+
+Polls llama-cpp /slots endpoint. When any slot is busy, stops xmrig.
+When all slots are idle for GRACE_PERIOD seconds, restarts xmrig.
+If llama-cpp is unreachable, does nothing (leaves xmrig in its current state).
+"""
+
+import json
+import os
+import subprocess
+import sys
+import time
+import urllib.request
+
+LLAMA_CPP_URL = os.environ["LLAMA_CPP_URL"].rstrip("/")  # required; KeyError if unset
+POLL_INTERVAL = int(os.getenv("POLL_INTERVAL", "3"))  # seconds slept between /slots polls
+GRACE_PERIOD = float(os.getenv("GRACE_PERIOD", "10"))  # idle seconds before xmrig restarts
+
+
+def log(msg):  # write one prefixed line to stderr and flush it immediately
+    print("[llama-cpp-xmrig-pause]", msg, file=sys.stderr, flush=True)
+
+
+def get_slots():
+    """Query llama-cpp's /slots endpoint; return the decoded JSON, or None on failure."""
+    endpoint = f"{LLAMA_CPP_URL}/slots"
+    try:
+        with urllib.request.urlopen(endpoint, timeout=5) as response:
+            return json.loads(response.read())
+    except (OSError, ValueError) as err:  # URLError is an OSError, JSONDecodeError a ValueError
+        log(f"Cannot reach llama-cpp: {err}")
+        return None
+
+
+def any_slot_busy(slots):  # True when at least one slot dict has a truthy "is_processing"
+    return any(slot.get("is_processing") for slot in slots)
+
+
+def systemctl(action, unit):
+    """Run `systemctl <action> <unit>`; log stderr on failure and return True on success."""
+    cmd = ["systemctl", action, unit]
+    proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+    ok = proc.returncode == 0
+    if not ok:
+        failure = f"systemctl {action} {unit} failed (rc={proc.returncode}): {proc.stderr.strip()}"
+        log(failure)
+    return ok
+
+
+def main():
+    """Run the polling loop forever.
+
+    Each iteration queries /slots, acts on the result, then sleeps POLL_INTERVAL:
+      * unreachable -> xmrig is left alone and the idle timer is cleared;
+      * any slot busy -> xmrig is stopped (once) and the idle timer is cleared;
+      * all idle -> if this process stopped xmrig, it is started again once
+        the slots have stayed idle for GRACE_PERIOD seconds.
+    """
+    paused = False  # True only after this process successfully stopped xmrig
+    idle_start = None  # monotonic time of the first idle poll while paused
+
+    log(f"Starting: url={LLAMA_CPP_URL} poll={POLL_INTERVAL}s grace={GRACE_PERIOD}s")
+
+    while True:
+        slots = get_slots()
+
+        if slots is None:
+            # llama-cpp unreachable: do not touch xmrig, restart the grace countdown
+            idle_start = None
+        elif any_slot_busy(slots):
+            idle_start = None
+            if not paused:
+                log("Slot busy — stopping xmrig")
+                paused = systemctl("stop", "xmrig")
+        elif paused:
+            # every slot is idle and we are the ones who stopped xmrig
+            now = time.monotonic()
+            if idle_start is None:
+                idle_start = now
+            elif now - idle_start >= GRACE_PERIOD:
+                log("Slots idle past grace period — starting xmrig")
+                paused = not systemctl("start", "xmrig")
+                idle_start = None
+
+        time.sleep(POLL_INTERVAL)
+
+
+if __name__ == "__main__":  # start the polling loop only when run as a script
+    main()