# NixOS VM test for the llama-cpp / xmrig pause monitor script.
#
# The test boots one machine with a mock `xmrig` service, starts a real
# `llama-server` with a tiny model, launches the monitor script with test-tuned
# settings, and then asserts on `systemctl is-active xmrig` while completion
# requests are in flight, shortly after they finish, and after the grace
# period has elapsed.
{ pkgs, ... }:
let
  script = ../services/llama-cpp/llama-cpp-xmrig-pause.py;
  python = pkgs.python3;

  # SmolLM-135M Q2_K: 85MB, modern GGUFv3, generates ~30 tok/s on one CPU
  # thread — slow enough that a 200-token request keeps the process busy for
  # several seconds, fast enough that tests don't crawl.
  tinyModel = pkgs.fetchurl {
    url = "https://huggingface.co/QuantFactory/SmolLM-135M-GGUF/resolve/main/SmolLM-135M.Q2_K.gguf";
    hash = "sha256-DX46drPNJILNba21xfY2tyE0/yPWgOhz43gJdeSYKh4=";
  };
in
pkgs.testers.runNixOSTest {
  name = "llama-cpp-xmrig-pause";

  nodes.machine =
    { pkgs, ... }:
    {
      environment.systemPackages = [
        pkgs.python3
        pkgs.procps
        pkgs.curl
        pkgs.llama-cpp
      ];

      # Mock xmrig as a simple sleep process that can be stopped/started.
      systemd.services.xmrig = {
        description = "Mock xmrig miner";
        serviceConfig = {
          ExecStart = "${pkgs.coreutils}/bin/sleep infinity";
          Type = "simple";
        };
        wantedBy = [ "multi-user.target" ];
      };
    };

  testScript = ''
    import time

    PORT = 18088
    MODEL = "${tinyModel}"
    PYTHON = "${python}/bin/python3"
    SCRIPT = "${script}"

    # Tuned for test speed while remaining realistic.
    # POLL_INTERVAL=1 keeps detection latency low.
    # GRACE_PERIOD=5 is long enough to verify "stays stopped" but short enough
    # that the full test completes in ~2 minutes.
    # CPU_THRESHOLD=10 is low because the VM has limited cores and the model
    # is small — but any active inference still saturates a core.
    POLL_INTERVAL = "1"
    GRACE_PERIOD = "5"
    CPU_THRESHOLD = "10"

    infer_counter = 0


    def send_completion(n_predict=200):
        """Fire a completion request in the background via a transient systemd unit.

        Each call uses a fresh unit name (infer-1, infer-2, ...) and returns
        it so the caller can later wait on it with wait_inference_done().
        """
        global infer_counter
        infer_counter += 1
        name = f"infer-{infer_counter}"
        machine.succeed(
            f"systemd-run --unit={name} --property=Type=exec "
            f"curl -sf -X POST http://127.0.0.1:{PORT}/completion "
            f"-H 'Content-Type: application/json' "
            # ignore_eos makes the server generate the full n_predict tokens
            # even if the model samples end-of-sequence early. Without it the
            # duration of the CPU load depends on sampling luck, and every
            # timing assertion below assumes several seconds of load.
            f"-d '{{\"prompt\": \"Once upon a time in a land far away there lived\", \"n_predict\": {n_predict}, \"ignore_eos\": true}}'"
        )
        return name


    def wait_inference_done(unit_name, timeout=60):
        """Wait for a background inference request to finish successfully.

        Blocks until the transient unit is no longer active, then asserts that
        it did not end in the failed state.
        """
        machine.wait_until_fails(
            f"systemctl is-active {unit_name}",
            timeout=timeout,
        )
        # curl runs with -f, so an HTTP error or a refused connection leaves
        # the transient unit in the failed state. Treat that as a test failure
        # instead of mistaking it for a completed inference.
        machine.fail(f"systemctl is-failed {unit_name}")


    start_all()
    machine.wait_for_unit("multi-user.target")
    machine.wait_for_unit("xmrig.service")

    with subtest("Start llama-server"):
        machine.succeed(
            f"systemd-run --unit=llama-server "
            # Single inference thread to maximise per-core CPU%, which is
            # what the monitor measures. Keeps token generation slow enough
            # (~30 tok/s) that a 200-token request sustains load for seconds.
            f"llama-server --model {MODEL} --port {PORT} --ctx-size 512 -t 1 -np 1"
        )
        machine.wait_until_succeeds(
            f"curl -sf http://127.0.0.1:{PORT}/health",
            timeout=30,
        )
        machine.succeed("pgrep -x llama-server")

    with subtest("Start pause monitor"):
        machine.succeed(
            f"systemd-run --unit=llama-xmrig-pause "
            f"--setenv=POLL_INTERVAL={POLL_INTERVAL} "
            f"--setenv=GRACE_PERIOD={GRACE_PERIOD} "
            f"--setenv=CPU_THRESHOLD={CPU_THRESHOLD} "
            f"{PYTHON} {SCRIPT}"
        )
        # The monitor needs two consecutive polls to compute a CPU delta.
        # Wait for baseline to stabilise.
        time.sleep(3)
        # If the monitor crashed on startup, fail here with a clear cause
        # rather than later with a misleading "xmrig never stopped" timeout.
        machine.succeed("systemctl is-active llama-xmrig-pause")

    with subtest("xmrig stays running while llama-server is idle"):
        machine.succeed("systemctl is-active xmrig")

    with subtest("xmrig stopped during prompt processing"):
        unit = send_completion(n_predict=200)
        machine.wait_until_fails("systemctl is-active xmrig", timeout=20)

    with subtest("xmrig remains stopped during grace period after inference ends"):
        wait_inference_done(unit)
        # Inference just finished. The monitor will need 1-2 polls to detect
        # idle, then the grace period starts. Checking 2s after completion
        # is well within the 5s grace window.
        time.sleep(2)
        machine.fail("systemctl is-active xmrig")

    with subtest("xmrig resumes after grace period expires"):
        # Already idle since previous subtest. Grace period (5s) plus
        # detection delay (~2 polls) means xmrig should restart within ~8s.
        machine.wait_until_succeeds("systemctl is-active xmrig", timeout=15)

    with subtest("Sequential prompts do not cause xmrig flapping"):
        # First prompt — stop xmrig
        unit1 = send_completion(n_predict=200)
        machine.wait_until_fails("systemctl is-active xmrig", timeout=20)
        wait_inference_done(unit1)

        # Brief idle gap — shorter than grace period
        time.sleep(2)

        # Second prompt arrives before grace period expires, resetting it
        unit2 = send_completion(n_predict=200)
        time.sleep(3)

        # xmrig must still be stopped
        machine.fail("systemctl is-active xmrig")

        wait_inference_done(unit2)
        machine.wait_until_succeeds("systemctl is-active xmrig", timeout=15)

    with subtest("xmrig stays stopped during sustained inference"):
        unit = send_completion(n_predict=500)
        machine.wait_until_fails("systemctl is-active xmrig", timeout=20)

        # Stay busy longer than the grace period to prove continuous
        # activity keeps xmrig stopped indefinitely.
        time.sleep(8)
        machine.fail("systemctl is-active xmrig")

        wait_inference_done(unit)
        machine.wait_until_succeeds("systemctl is-active xmrig", timeout=15)
  '';
}