fix xmrig pause

2026-04-03 14:39:20 -04:00
parent d4d01d63f1
commit daf82c16ba
8 changed files with 263 additions and 298 deletions
--- a/configuration.nix
+++ b/configuration.nix
@@ -63,6 +63,7 @@
    ./services/monero.nix
    ./services/p2pool.nix
    ./services/xmrig.nix
    ./services/xmrig-auto-pause.nix
    ./services/graphing-calculator.nix
--- a/services/llama-cpp/default.nix
+++ b/services/llama-cpp/default.nix
@@ -1,6 +1,5 @@
 {
  imports = [
    ./llama-cpp.nix
    ./llama-cpp-xmrig-pause.nix
  ];
 }
--- a/services/llama-cpp/llama-cpp-xmrig-pause.py
+++ b/services/llama-cpp/llama-cpp-xmrig-pause.py
@@ -1,123 +0,0 @@
 #!/usr/bin/env python3
 """
 Pause xmrig while llama-cpp is processing inference requests.
 Checks if the llama-server process is actively using CPU by reading
 /proc/<pid>/stat. When CPU usage exceeds the threshold, stops xmrig.
 When CPU drops below threshold for GRACE_PERIOD seconds, restarts xmrig.
 """
 import glob
 import os
 import subprocess
 import sys
 import time
 # Seconds to sleep between polls of llama-server's CPU counters.
 # Overridable through the POLL_INTERVAL environment variable.
 POLL_INTERVAL = int(os.environ.get("POLL_INTERVAL", "3"))
 # Seconds llama-server must stay at or below CPU_THRESHOLD before a paused
 # xmrig is started again. Overridable through GRACE_PERIOD.
 GRACE_PERIOD = float(os.environ.get("GRACE_PERIOD", "10"))
 # CPU percentage (per-core) above which llama-server is considered busy.
 # Idle llama-server uses ~0% CPU; active inference saturates multiple cores.
 CPU_THRESHOLD = float(os.environ.get("CPU_THRESHOLD", "50"))
 def log(msg):
    """Emit *msg* on stderr, tagged with this script's name, and flush at once."""
    tagged = f"[llama-cpp-xmrig-pause] {msg}"
    print(tagged, file=sys.stderr, flush=True)
 def find_llama_pid(name="llama-server", proc_root="/proc"):
    """Find the PID of a running process by its command name.

    Scans every ``<proc_root>/<pid>/comm`` file and returns the PID of the
    first entry whose command name equals *name*.

    Args:
        name: Command name to look for. The kernel truncates ``comm`` to
            15 characters, so only the first 15 characters are compared.
        proc_root: Directory laid out like procfs; defaults to ``/proc``.

    Returns:
        The matching PID as an int, or None when no process matches.
    """
    wanted = name[:15]
    # Escape the root so glob metacharacters in it are taken literally.
    pattern = os.path.join(glob.escape(proc_root), "[0-9]*", "comm")
    for path in glob.glob(pattern):
        try:
            with open(path) as f:
                if f.read().strip() != wanted:
                    continue
            # The PID is the name of the directory that holds the comm file.
            return int(os.path.basename(os.path.dirname(path)))
        except (OSError, ValueError):
            # The process exited mid-scan or the entry is not a PID directory.
            continue
    return None
 def get_cpu_times(pid):
    """Return the CPU ticks (utime + stime) a process has consumed so far.

    The value is read from ``/proc/<pid>/stat``. None is returned when the
    file cannot be read or does not have the expected layout.
    """
    stat_path = f"/proc/{pid}/stat"
    try:
        with open(stat_path) as stat_file:
            raw = stat_file.read()
        # Only parse what follows the final ')': the command name before it
        # is free-form text. Counting from there, utime is field 11 and
        # stime is field 12 (0-indexed).
        tail = raw.rpartition(")")[2].split()
        utime, stime = tail[11], tail[12]
        return int(utime) + int(stime)
    except (OSError, IndexError, ValueError):
        return None
 def systemctl(action, unit):
    """Run ``systemctl <action> <unit>`` and report whether it succeeded.

    A non-zero exit status is logged together with the command's stderr.
    Returns True when the exit status is 0, False otherwise.
    """
    proc = subprocess.run(
        ["systemctl", action, unit],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    ok = proc.returncode == 0
    if not ok:
        log(f"systemctl {action} {unit} failed (rc={proc.returncode}): {proc.stderr.strip()}")
    return ok
 def main():
    """Poll llama-server's CPU usage forever, pausing xmrig while it is busy.

    Each iteration samples the utime + stime ticks of the llama-server
    process and converts the delta since the previous sample into a per-core
    CPU percentage. Above CPU_THRESHOLD xmrig is stopped. Once usage has
    stayed at or below the threshold for GRACE_PERIOD seconds, xmrig is
    started again.

    A missing llama-server counts as idle, so a pause taken before the
    server exited is still released after the grace period. Never returns.
    """
    xmrig_paused = False
    idle_since = None
    prev_pid = None
    prev_ticks = None
    prev_time = None
    hz = os.sysconf("SC_CLK_TCK")
    log(f"Starting: poll={POLL_INTERVAL}s grace={GRACE_PERIOD}s threshold={CPU_THRESHOLD}%")
    while True:
        pid = find_llama_pid()
        ticks = get_cpu_times(pid) if pid is not None else None
        now = time.monotonic()
        if pid is None:
            # llama-server not running: it cannot be busy. Report 0% rather
            # than skipping the iteration, otherwise an xmrig we stopped
            # earlier would stay stopped until llama-server came back.
            prev_pid = None
            prev_ticks = None
            prev_time = None
            cpu_pct = 0.0
        else:
            # A delta is only meaningful against an earlier sample of the
            # same process; a restarted llama-server starts its tick
            # counter over under a new PID.
            comparable = (
                ticks is not None
                and pid == prev_pid
                and prev_ticks is not None
                and prev_time is not None
                and now > prev_time
            )
            cpu_pct = None
            if comparable:
                # CPU% = (delta_ticks / hz) / delta_seconds * 100
                cpu_pct = ((ticks - prev_ticks) / hz) / (now - prev_time) * 100
            prev_pid = pid
            prev_ticks = ticks
            prev_time = now
            if cpu_pct is None:
                # No usable baseline yet; wait for the next sample.
                time.sleep(POLL_INTERVAL)
                continue
        if cpu_pct > CPU_THRESHOLD:
            # Any busy sample cancels a grace period in progress.
            idle_since = None
            if not xmrig_paused:
                log(f"llama-server busy ({cpu_pct:.0f}% CPU) — stopping xmrig")
                if systemctl("stop", "xmrig"):
                    xmrig_paused = True
        elif xmrig_paused:
            if idle_since is None:
                idle_since = now
            elif now - idle_since >= GRACE_PERIOD:
                log(f"llama-server idle ({cpu_pct:.0f}% CPU) past grace period — starting xmrig")
                if systemctl("start", "xmrig"):
                    xmrig_paused = False
                # Restart the grace timer so a failed start is retried later.
                idle_since = None
        time.sleep(POLL_INTERVAL)
 if __name__ == "__main__":
    main()
--- a/services/llama-cpp/llama-cpp-xmrig-pause.nix
+++ b/services/llama-cpp/llama-cpp-xmrig-pause.nix
@@ -4,19 +4,15 @@
  pkgs,
  ...
 }:
-lib.mkIf config.services.llama-cpp.enable {
+lib.mkIf config.services.xmrig.enable {
-  systemd.services.llama-cpp-xmrig-pause = {
+  systemd.services.xmrig-auto-pause = {
-    description = "Pause xmrig while llama-cpp is processing requests";
+    description = "Auto-pause xmrig when other services need CPU";
-    after = [
+    after = [ "xmrig.service" ];
      "llama-cpp.service"
      "xmrig.service"
    ];
    wantedBy = [ "multi-user.target" ];
    serviceConfig = {
-      ExecStart = "${pkgs.python3}/bin/python3 ${./llama-cpp-xmrig-pause.py}";
+      ExecStart = "${pkgs.python3}/bin/python3 ${./xmrig-auto-pause.py}";
      Restart = "always";
      RestartSec = "10s";
      # Needs /proc access (default) and AF_UNIX for systemctl
      NoNewPrivileges = true;
      ProtectHome = true;
      ProtectSystem = "strict";
@@ -28,8 +24,8 @@ lib.mkIf config.services.llama-cpp.enable {
    };
    environment = {
      POLL_INTERVAL = "3";
-      GRACE_PERIOD = "10";
+      GRACE_PERIOD = "15";
-      CPU_THRESHOLD = "50";
+      CPU_THRESHOLD = "5";
    };
  };
 }
--- a/services/xmrig-auto-pause.py
+++ b/services/xmrig-auto-pause.py
@@ -0,0 +1,131 @@
 #!/usr/bin/env python3
 """
 Auto-pause xmrig when other services need CPU.
 Monitors non-nice CPU usage from /proc/stat. Since xmrig runs at Nice=19,
 its CPU time lands in the 'nice' column and is excluded from the metric.
 When real workload (user + system + irq + softirq) exceeds the threshold,
 stops xmrig. When it drops below threshold for GRACE_PERIOD seconds,
 restarts xmrig.
 This replaces per-service pause scripts with a single general-purpose
 monitor that handles any CPU-intensive workload (gitea workers, llama-cpp
 inference, etc.) without needing to know about specific processes.
 Why scheduler priority alone isn't enough:
  Nice=19 / SCHED_IDLE only affects which thread gets the next time slice.
  RandomX's 2MB-per-thread scratchpad (24MB across 12 threads) pollutes
  the shared 32MB L3 cache, and its memory access pattern saturates DRAM
  bandwidth. Other services run slower even though they aren't denied CPU
  time. The only fix is to stop xmrig entirely when real work is happening.
 """
 import os
 import subprocess
 import sys
 import time
 # Seconds to sleep between samples of /proc/stat.
 # Overridable through the POLL_INTERVAL environment variable.
 POLL_INTERVAL = int(os.environ.get("POLL_INTERVAL", "3"))
 # Seconds the real workload must stay at or below CPU_THRESHOLD before an
 # xmrig stopped by this script is started again. Overridable through
 # GRACE_PERIOD.
 GRACE_PERIOD = float(os.environ.get("GRACE_PERIOD", "15"))
 # Percentage of total CPU ticks that non-nice processes must use to trigger
 # a pause. On a 12-thread system, one fully loaded core ≈ 8.3% of total.
 # Default 5% catches anything using more than ~60% of a single core.
 CPU_THRESHOLD = float(os.environ.get("CPU_THRESHOLD", "5"))
 def log(msg):
    """Emit *msg* on stderr, tagged with this script's name, and flush at once."""
    tagged = f"[xmrig-auto-pause] {msg}"
    print(tagged, file=sys.stderr, flush=True)
 def read_cpu_ticks(stat_path="/proc/stat"):
    """Read the aggregate CPU tick counters from a procfs ``stat`` file.

    Args:
        stat_path: File to read; defaults to ``/proc/stat``. Its first line
            must be the aggregate ``cpu`` row.

    Returns:
        A ``(total_ticks, real_work_ticks)`` tuple. ``total_ticks`` sums the
        user, nice, system, idle, iowait, irq, softirq and steal columns.
        ``real_work_ticks`` is user + system + irq + softirq, i.e. it
        excludes the 'nice' column (xmrig) as well as idle, iowait and steal.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the first line is not a ``cpu`` row with at least
            eight numeric columns.
    """
    with open(stat_path) as f:
        parts = f.readline().split()
    # cpu  user nice system idle iowait irq softirq steal
    if len(parts) < 9 or parts[0] != "cpu":
        raise ValueError(f"unexpected first line in {stat_path}: {' '.join(parts)!r}")
    user, nice, system, idle, iowait, irq, softirq, steal = (
        int(x) for x in parts[1:9]
    )
    total = user + nice + system + idle + iowait + irq + softirq + steal
    real_work = user + system + irq + softirq
    return total, real_work
 def is_active(unit):
    """Report whether ``systemctl is-active`` exits with status 0 for *unit*."""
    cmd = ["systemctl", "is-active", "--quiet", unit]
    # Output is captured only to keep it out of this script's own streams.
    completed = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    return not completed.returncode
 def systemctl(action, unit):
    """Run ``systemctl <action> <unit>`` and report whether it succeeded.

    A non-zero exit status is logged together with the command's stderr.
    Returns True when the exit status is 0, False otherwise.
    """
    proc = subprocess.run(
        ["systemctl", action, unit],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    ok = proc.returncode == 0
    if not ok:
        log(f"systemctl {action} {unit} failed (rc={proc.returncode}): {proc.stderr.strip()}")
    return ok
 def main():
    """Watch system-wide non-nice CPU usage forever and pause xmrig under load.

    Every POLL_INTERVAL seconds the tick counters are sampled and the share
    of real work within the elapsed ticks is computed. Above CPU_THRESHOLD
    a running xmrig is stopped. Once the share has stayed at or below the
    threshold for GRACE_PERIOD seconds, an xmrig stopped by this script is
    started again. Never returns.
    """
    we_paused = False
    quiet_since = None
    last_total = last_work = None
    log(f"Starting: poll={POLL_INTERVAL}s grace={GRACE_PERIOD}s threshold={CPU_THRESHOLD}%")
    while True:
        total, work = read_cpu_ticks()
        base_total, base_work = last_total, last_work
        last_total, last_work = total, work
        # The first sample has nothing to compare against, and a
        # non-positive tick delta cannot be turned into a percentage.
        if base_total is None or total - base_total <= 0:
            time.sleep(POLL_INTERVAL)
            continue
        real_work_pct = ((work - base_work) / (total - base_total)) * 100
        if real_work_pct > CPU_THRESHOLD:
            # Any busy sample cancels a grace period in progress.
            quiet_since = None
            # Only claim ownership if xmrig is actually running.
            # If something else stopped it (e.g. UPS battery hook),
            # don't interfere — we'd wrongly restart it later.
            if not we_paused and is_active("xmrig.service"):
                log(f"Real workload detected ({real_work_pct:.1f}% CPU) — stopping xmrig")
                if systemctl("stop", "xmrig.service"):
                    we_paused = True
        elif we_paused:
            now = time.monotonic()
            if quiet_since is None:
                quiet_since = now
            elif now - quiet_since >= GRACE_PERIOD:
                log(f"Workload ended ({real_work_pct:.1f}% CPU) past grace period — starting xmrig")
                if systemctl("start", "xmrig.service"):
                    we_paused = False
                # Restart the grace timer so a failed start is retried later.
                quiet_since = None
        time.sleep(POLL_INTERVAL)
 if __name__ == "__main__":
    main()
--- a/tests/llama-cpp-xmrig-pause.nix
+++ b/tests/llama-cpp-xmrig-pause.nix
@@ -1,162 +0,0 @@
 {
  pkgs,
  ...
 }:
 let
  script = ../services/llama-cpp/llama-cpp-xmrig-pause.py;
  python = pkgs.python3;
  # SmolLM-135M Q2_K: 85MB, modern GGUFv3, generates ~30 tok/s on one CPU
  # thread — slow enough that a 200-token request keeps the process busy for
  # several seconds, fast enough that tests don't crawl.
  tinyModel = pkgs.fetchurl {
    url = "https://huggingface.co/QuantFactory/SmolLM-135M-GGUF/resolve/main/SmolLM-135M.Q2_K.gguf";
    hash = "sha256-DX46drPNJILNba21xfY2tyE0/yPWgOhz43gJdeSYKh4=";
  };
 in
 pkgs.testers.runNixOSTest {
  name = "llama-cpp-xmrig-pause";
  nodes.machine =
    { pkgs, ... }:
    {
      environment.systemPackages = [
        pkgs.python3
        pkgs.procps
        pkgs.curl
        pkgs.llama-cpp
      ];
      # Mock xmrig as a simple sleep process that can be stopped/started.
      systemd.services.xmrig = {
        description = "Mock xmrig miner";
        serviceConfig = {
          ExecStart = "${pkgs.coreutils}/bin/sleep infinity";
          Type = "simple";
        };
        wantedBy = [ "multi-user.target" ];
      };
    };
  testScript = ''
    import time
    PORT = 18088
    MODEL = "${tinyModel}"
    PYTHON = "${python}/bin/python3"
    SCRIPT = "${script}"
    # Tuned for test speed while remaining realistic.
    # POLL_INTERVAL=1 keeps detection latency low.
    # GRACE_PERIOD=5 is long enough to verify "stays stopped" but short enough
    # that the full test completes in ~2 minutes.
    # CPU_THRESHOLD=10 is low because the VM has limited cores and the model
    # is small — but any active inference still saturates a core.
    POLL_INTERVAL = "1"
    GRACE_PERIOD = "5"
    CPU_THRESHOLD = "10"
    infer_counter = 0
    def send_completion(n_predict=200):
        """Fire a completion request in the background via a transient systemd unit.

        n_predict is forwarded as the "n_predict" field of the JSON request
        body. Returns the name of the transient unit that runs curl, so the
        caller can wait for it with wait_inference_done().
        """
        global infer_counter
        # Number the units so every request runs under a distinct unit name.
        infer_counter += 1
        name = f"infer-{infer_counter}"
        # Doubled braces in the -d payload are f-string escapes that produce
        # the literal braces of the JSON object.
        machine.succeed(
            f"systemd-run --unit={name} --property=Type=exec "
            f"curl -sf -X POST http://127.0.0.1:{PORT}/completion "
            f"-H 'Content-Type: application/json' "
            f"-d '{{\"prompt\": \"Once upon a time in a land far away there lived\", \"n_predict\": {n_predict}}}'"
        )
        return name
    def wait_inference_done(unit_name, timeout=60):
        """Wait for a background inference request to finish.

        Blocks until "systemctl is-active" fails for unit_name, i.e. the
        transient unit is no longer active. timeout is passed straight to
        machine.wait_until_fails (presumably seconds; confirm against the
        test driver).
        """
        machine.wait_until_fails(
            f"systemctl is-active {unit_name}",
            timeout=timeout,
        )
    start_all()
    machine.wait_for_unit("multi-user.target")
    machine.wait_for_unit("xmrig.service")
    with subtest("Start llama-server"):
        machine.succeed(
            f"systemd-run --unit=llama-server "
            # Single inference thread to maximise per-core CPU%, which is
            # what the monitor measures.  Keeps token generation slow enough
            # (~30 tok/s) that a 200-token request sustains load for seconds.
            f"llama-server --model {MODEL} --port {PORT} --ctx-size 512 -t 1 -np 1"
        )
        machine.wait_until_succeeds(
            f"curl -sf http://127.0.0.1:{PORT}/health",
            timeout=30,
        )
        machine.succeed("pgrep -x llama-server")
    with subtest("Start pause monitor"):
        machine.succeed(
            f"systemd-run --unit=llama-xmrig-pause "
            f"--setenv=POLL_INTERVAL={POLL_INTERVAL} "
            f"--setenv=GRACE_PERIOD={GRACE_PERIOD} "
            f"--setenv=CPU_THRESHOLD={CPU_THRESHOLD} "
            f"{PYTHON} {SCRIPT}"
        )
        # The monitor needs two consecutive polls to compute a CPU delta.
        # Wait for baseline to stabilise.
        time.sleep(3)
    with subtest("xmrig stays running while llama-server is idle"):
        machine.succeed("systemctl is-active xmrig")
    with subtest("xmrig stopped during prompt processing"):
        unit = send_completion(n_predict=200)
        machine.wait_until_fails("systemctl is-active xmrig", timeout=20)
    with subtest("xmrig remains stopped during grace period after inference ends"):
        wait_inference_done(unit)
        # Inference just finished.  The monitor will need 1-2 polls to detect
        # idle, then the grace period starts.  Checking 2s after completion
        # is well within the 5s grace window.
        time.sleep(2)
        machine.fail("systemctl is-active xmrig")
    with subtest("xmrig resumes after grace period expires"):
        # Already idle since previous subtest.  Grace period (5s) plus
        # detection delay (~2 polls) means xmrig should restart within ~8s.
        machine.wait_until_succeeds("systemctl is-active xmrig", timeout=15)
    with subtest("Sequential prompts do not cause xmrig flapping"):
        # First prompt — stop xmrig
        unit1 = send_completion(n_predict=200)
        machine.wait_until_fails("systemctl is-active xmrig", timeout=20)
        wait_inference_done(unit1)
        # Brief idle gap — shorter than grace period
        time.sleep(2)
        # Second prompt arrives before grace period expires, resetting it
        unit2 = send_completion(n_predict=200)
        time.sleep(3)
        # xmrig must still be stopped
        machine.fail("systemctl is-active xmrig")
        wait_inference_done(unit2)
        machine.wait_until_succeeds("systemctl is-active xmrig", timeout=15)
    with subtest("xmrig stays stopped during sustained inference"):
        unit = send_completion(n_predict=500)
        machine.wait_until_fails("systemctl is-active xmrig", timeout=20)
        # Stay busy longer than the grace period to prove continuous
        # activity keeps xmrig stopped indefinitely.
        time.sleep(8)
        machine.fail("systemctl is-active xmrig")
        wait_inference_done(unit)
        machine.wait_until_succeeds("systemctl is-active xmrig", timeout=15)
  '';
 }
--- a/tests/tests.nix
+++ b/tests/tests.nix
@@ -30,7 +30,9 @@ in
  # llama-cpp tests
  llamaCppAnnotationsTest = handleTest ./llama-cpp-annotations.nix;
-  llamaCppXmrigPauseTest = handleTest ./llama-cpp-xmrig-pause.nix;
+
  # xmrig auto-pause test
  xmrigAutoPauseTest = handleTest ./xmrig-auto-pause.nix;
  # ntfy alerts test
  ntfyAlertsTest = handleTest ./ntfy-alerts.nix;
--- a/tests/xmrig-auto-pause.nix
+++ b/tests/xmrig-auto-pause.nix
@@ -0,0 +1,121 @@
 {
  pkgs,
  ...
 }:
 let
  script = ../services/xmrig-auto-pause.py;
  python = pkgs.python3;
 in
 pkgs.testers.runNixOSTest {
  name = "xmrig-auto-pause";
  nodes.machine =
    { pkgs, ... }:
    {
      environment.systemPackages = [
        pkgs.python3
        pkgs.procps
      ];
      # Mock xmrig as a nice'd sleep process that can be stopped/started.
      systemd.services.xmrig = {
        description = "Mock xmrig miner";
        serviceConfig = {
          ExecStart = "${pkgs.coreutils}/bin/sleep infinity";
          Type = "simple";
          Nice = 19;
        };
        wantedBy = [ "multi-user.target" ];
      };
    };
  testScript = ''
    import time
    PYTHON = "${python}/bin/python3"
    SCRIPT = "${script}"
    # Tuned for test VMs (1-2 cores).
    # POLL_INTERVAL=1 keeps detection latency low.
    # GRACE_PERIOD=5 is long enough to verify "stays stopped" but short
    # enough that the full test completes in reasonable time.
    # CPU_THRESHOLD=10 catches a single busy-loop on a 1-2 core VM.
    POLL_INTERVAL = "1"
    GRACE_PERIOD = "5"
    CPU_THRESHOLD = "10"
    def start_cpu_load(name):
        """Start a non-nice CPU burn as a transient systemd unit.

        The unit is called name and runs an endless bash busy-loop until it
        is stopped, e.g. with stop_cpu_load(name).
        """
        machine.succeed(
            f"systemd-run --unit={name} --property=Type=exec "
            f"bash -c 'while true; do :; done'"
        )
    def stop_cpu_load(name):
        """Stop the transient CPU-burn unit called name via systemctl stop."""
        machine.succeed(f"systemctl stop {name}")
    start_all()
    machine.wait_for_unit("multi-user.target")
    machine.wait_for_unit("xmrig.service")
    with subtest("Start auto-pause monitor"):
        machine.succeed(
            f"systemd-run --unit=xmrig-auto-pause "
            f"--setenv=POLL_INTERVAL={POLL_INTERVAL} "
            f"--setenv=GRACE_PERIOD={GRACE_PERIOD} "
            f"--setenv=CPU_THRESHOLD={CPU_THRESHOLD} "
            f"{PYTHON} {SCRIPT}"
        )
        # Monitor needs two consecutive polls to compute a CPU delta.
        time.sleep(3)
    with subtest("xmrig stays running while system is idle"):
        machine.succeed("systemctl is-active xmrig")
    with subtest("xmrig stopped when CPU load appears"):
        start_cpu_load("cpu-load")
        machine.wait_until_fails("systemctl is-active xmrig", timeout=20)
    with subtest("xmrig remains stopped during grace period after load ends"):
        stop_cpu_load("cpu-load")
        # Load just stopped. Grace period is 5s. Check at 2s — well within.
        time.sleep(2)
        machine.fail("systemctl is-active xmrig")
    with subtest("xmrig resumes after grace period expires"):
        # Already idle since previous subtest. Grace period (5s) plus
        # detection delay (~2 polls) means xmrig should restart within ~8s.
        machine.wait_until_succeeds("systemctl is-active xmrig", timeout=15)
    with subtest("Intermittent load does not cause flapping"):
        # First load — stop xmrig
        start_cpu_load("cpu-load-1")
        machine.wait_until_fails("systemctl is-active xmrig", timeout=20)
        stop_cpu_load("cpu-load-1")
        # Brief idle gap — shorter than grace period
        time.sleep(2)
        # Second load arrives before grace period expires
        start_cpu_load("cpu-load-2")
        time.sleep(3)
        # xmrig must still be stopped
        machine.fail("systemctl is-active xmrig")
        stop_cpu_load("cpu-load-2")
        machine.wait_until_succeeds("systemctl is-active xmrig", timeout=15)
    with subtest("Sustained load keeps xmrig stopped"):
        start_cpu_load("cpu-load-3")
        machine.wait_until_fails("systemctl is-active xmrig", timeout=20)
        # Stay busy longer than the grace period to prove continuous
        # activity keeps xmrig stopped indefinitely.
        time.sleep(8)
        machine.fail("systemctl is-active xmrig")
        stop_cpu_load("cpu-load-3")
        machine.wait_until_succeeds("systemctl is-active xmrig", timeout=15)
  '';
 }