Compare commits
3 Commits
274ef40ccc
...
e57c9cb83b
| Author | SHA1 | Date | |
|---|---|---|---|
|
e57c9cb83b
|
|||
|
d48f27701f
|
|||
|
738861fd53
|
11
flake.lock
generated
11
flake.lock
generated
@@ -304,16 +304,15 @@
|
|||||||
"rust-overlay": "rust-overlay"
|
"rust-overlay": "rust-overlay"
|
||||||
},
|
},
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1775504408,
|
"lastModified": 1775510693,
|
||||||
"narHash": "sha256-0OueSEJk/BZlanFXEexVQg2Jy1pGXA2DCO9ZgBkWCTM=",
|
"narHash": "sha256-gZfJ07j/oOciDi8mF/V8QTm7YCeDcusNSMZzBFi8OUM=",
|
||||||
"owner": "Titaniumtown",
|
"owner": "nix-community",
|
||||||
"repo": "lanzaboote",
|
"repo": "lanzaboote",
|
||||||
"rev": "ba00703759608f49a094c40d09ed39ed98c2b8bb",
|
"rev": "3fe0ae8cb285e0ad101a9675f4190d455fb05e85",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
"owner": "Titaniumtown",
|
"owner": "nix-community",
|
||||||
"ref": "pr/fix-pcrlock-reinstall-systemd-boot",
|
|
||||||
"repo": "lanzaboote",
|
"repo": "lanzaboote",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -5,7 +5,7 @@
|
|||||||
nixpkgs.url = "github:NixOS/nixpkgs/nixos-25.11";
|
nixpkgs.url = "github:NixOS/nixpkgs/nixos-25.11";
|
||||||
|
|
||||||
lanzaboote = {
|
lanzaboote = {
|
||||||
url = "github:Titaniumtown/lanzaboote/pr/fix-pcrlock-reinstall-systemd-boot";
|
url = "github:nix-community/lanzaboote";
|
||||||
inputs.nixpkgs.follows = "nixpkgs";
|
inputs.nixpkgs.follows = "nixpkgs";
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@@ -26,7 +26,11 @@ lib.mkIf config.services.xmrig.enable {
|
|||||||
environment = {
|
environment = {
|
||||||
POLL_INTERVAL = "3";
|
POLL_INTERVAL = "3";
|
||||||
GRACE_PERIOD = "15";
|
GRACE_PERIOD = "15";
|
||||||
CPU_THRESHOLD = "5";
|
# This server's background services (qbittorrent, monero, bazarr, etc.)
|
||||||
|
# produce 5-14% non-nice CPU during normal operation. Thresholds must
|
||||||
|
# sit above that noise floor.
|
||||||
|
CPU_STOP_THRESHOLD = "40";
|
||||||
|
CPU_RESUME_THRESHOLD = "30";
|
||||||
STARTUP_COOLDOWN = "10";
|
STARTUP_COOLDOWN = "10";
|
||||||
STATE_DIR = "/var/lib/xmrig-auto-pause";
|
STATE_DIR = "/var/lib/xmrig-auto-pause";
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -4,9 +4,9 @@ Auto-pause xmrig when other services need CPU.
|
|||||||
|
|
||||||
Monitors non-nice CPU usage from /proc/stat. Since xmrig runs at Nice=19,
|
Monitors non-nice CPU usage from /proc/stat. Since xmrig runs at Nice=19,
|
||||||
its CPU time lands in the 'nice' column and is excluded from the metric.
|
its CPU time lands in the 'nice' column and is excluded from the metric.
|
||||||
When real workload (user + system + irq + softirq) exceeds the threshold,
|
When real workload (user + system + irq + softirq) exceeds the stop
|
||||||
stops xmrig. When it drops below threshold for GRACE_PERIOD seconds,
|
threshold, stops xmrig. When it stays at or below the resume threshold for
|
||||||
restarts xmrig.
|
GRACE_PERIOD seconds, restarts xmrig.
|
||||||
|
|
||||||
This replaces per-service pause scripts with a single general-purpose
|
This replaces per-service pause scripts with a single general-purpose
|
||||||
monitor that handles any CPU-intensive workload (gitea workers, llama-cpp
|
monitor that handles any CPU-intensive workload (gitea workers, llama-cpp
|
||||||
@@ -18,6 +18,14 @@ Why scheduler priority alone isn't enough:
|
|||||||
the shared 32MB L3 cache, and its memory access pattern saturates DRAM
|
the shared 32MB L3 cache, and its memory access pattern saturates DRAM
|
||||||
bandwidth. Other services run slower even though they aren't denied CPU
|
bandwidth. Other services run slower even though they aren't denied CPU
|
||||||
time. The only fix is to stop xmrig entirely when real work is happening.
|
time. The only fix is to stop xmrig entirely when real work is happening.
|
||||||
|
|
||||||
|
Hysteresis:
|
||||||
|
The stop threshold is set higher than the resume threshold to prevent
|
||||||
|
oscillation. When xmrig runs, its L3 cache pressure makes other processes
|
||||||
|
appear ~3-8% busier. A single threshold trips on this indirect effect,
|
||||||
|
causing stop/start thrashing. Separate thresholds break the cycle: the
|
||||||
|
resume threshold confirms the system is truly idle, while the stop
|
||||||
|
threshold requires genuine workload above xmrig's indirect pressure.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
@@ -29,8 +37,12 @@ POLL_INTERVAL = int(os.environ.get("POLL_INTERVAL", "3"))
|
|||||||
GRACE_PERIOD = float(os.environ.get("GRACE_PERIOD", "15"))
|
GRACE_PERIOD = float(os.environ.get("GRACE_PERIOD", "15"))
|
||||||
# Percentage of total CPU ticks that non-nice processes must use to trigger
|
# Percentage of total CPU ticks that non-nice processes must use to trigger
|
||||||
# a pause. On a 12-thread system, one fully loaded core ≈ 8.3% of total.
|
# a pause. On a 12-thread system, one fully loaded core ≈ 8.3% of total.
|
||||||
# Default 5% catches anything using more than ~60% of a single core.
|
# Default 15% requires roughly two busy cores, which avoids false positives
|
||||||
CPU_THRESHOLD = float(os.environ.get("CPU_THRESHOLD", "5"))
|
# from xmrig's L3 cache pressure inflating other processes' apparent CPU.
|
||||||
|
CPU_STOP_THRESHOLD = float(os.environ.get("CPU_STOP_THRESHOLD", "15"))
|
||||||
|
# Percentage at or below which the system is considered idle enough to resume
|
||||||
|
# mining. Lower than the stop threshold to provide hysteresis.
|
||||||
|
CPU_RESUME_THRESHOLD = float(os.environ.get("CPU_RESUME_THRESHOLD", "5"))
|
||||||
# After starting xmrig, ignore CPU spikes for this many seconds to let
|
# After starting xmrig, ignore CPU spikes for this many seconds to let
|
||||||
# RandomX dataset initialization complete (~4s on the target hardware)
|
# RandomX dataset initialization complete (~4s on the target hardware)
|
||||||
# without retriggering a stop.
|
# without retriggering a stop.
|
||||||
@@ -115,7 +127,8 @@ def main():
|
|||||||
|
|
||||||
log(
|
log(
|
||||||
f"Starting: poll={POLL_INTERVAL}s grace={GRACE_PERIOD}s "
|
f"Starting: poll={POLL_INTERVAL}s grace={GRACE_PERIOD}s "
|
||||||
f"threshold={CPU_THRESHOLD}% cooldown={STARTUP_COOLDOWN}s"
|
f"stop={CPU_STOP_THRESHOLD}% resume={CPU_RESUME_THRESHOLD}% "
|
||||||
|
f"cooldown={STARTUP_COOLDOWN}s"
|
||||||
)
|
)
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
@@ -154,9 +167,10 @@ def main():
|
|||||||
_save_paused(True)
|
_save_paused(True)
|
||||||
started_at = None
|
started_at = None
|
||||||
|
|
||||||
busy = real_work_pct > CPU_THRESHOLD
|
above_stop = real_work_pct > CPU_STOP_THRESHOLD
|
||||||
|
below_resume = real_work_pct <= CPU_RESUME_THRESHOLD
|
||||||
|
|
||||||
if busy:
|
if above_stop:
|
||||||
idle_since = None
|
idle_since = None
|
||||||
if paused_by_us and is_active("xmrig.service"):
|
if paused_by_us and is_active("xmrig.service"):
|
||||||
# Something else restarted xmrig (deploy, manual start, etc.)
|
# Something else restarted xmrig (deploy, manual start, etc.)
|
||||||
@@ -174,8 +188,8 @@ def main():
|
|||||||
if systemctl("stop", "xmrig.service"):
|
if systemctl("stop", "xmrig.service"):
|
||||||
paused_by_us = True
|
paused_by_us = True
|
||||||
_save_paused(True)
|
_save_paused(True)
|
||||||
else:
|
elif paused_by_us:
|
||||||
if paused_by_us:
|
if below_resume:
|
||||||
if idle_since is None:
|
if idle_since is None:
|
||||||
idle_since = time.monotonic()
|
idle_since = time.monotonic()
|
||||||
elif time.monotonic() - idle_since >= GRACE_PERIOD:
|
elif time.monotonic() - idle_since >= GRACE_PERIOD:
|
||||||
@@ -185,6 +199,9 @@ def main():
|
|||||||
_save_paused(False)
|
_save_paused(False)
|
||||||
started_at = time.monotonic()
|
started_at = time.monotonic()
|
||||||
idle_since = None
|
idle_since = None
|
||||||
|
else:
|
||||||
|
# Between thresholds — not idle enough to resume.
|
||||||
|
idle_since = None
|
||||||
|
|
||||||
time.sleep(POLL_INTERVAL)
|
time.sleep(POLL_INTERVAL)
|
||||||
|
|
||||||
|
|||||||
@@ -39,13 +39,15 @@ pkgs.testers.runNixOSTest {
|
|||||||
# POLL_INTERVAL=1 keeps detection latency low.
|
# POLL_INTERVAL=1 keeps detection latency low.
|
||||||
# GRACE_PERIOD=5 is long enough to verify "stays stopped" but short
|
# GRACE_PERIOD=5 is long enough to verify "stays stopped" but short
|
||||||
# enough that the full test completes in reasonable time.
|
# enough that the full test completes in reasonable time.
|
||||||
# CPU_THRESHOLD=10 catches a single busy-loop on a 1-2 core VM.
|
# CPU_STOP_THRESHOLD=20 catches a busy-loop on a 1-2 core VM (50-100%)
|
||||||
|
# without triggering from normal VM noise.
|
||||||
|
# CPU_RESUME_THRESHOLD=10 is the idle cutoff for a 1-2 core VM.
|
||||||
POLL_INTERVAL = "1"
|
POLL_INTERVAL = "1"
|
||||||
GRACE_PERIOD = "5"
|
GRACE_PERIOD = "5"
|
||||||
CPU_THRESHOLD = "10"
|
CPU_STOP_THRESHOLD = "20"
|
||||||
|
CPU_RESUME_THRESHOLD = "10"
|
||||||
STARTUP_COOLDOWN = "4"
|
STARTUP_COOLDOWN = "4"
|
||||||
STATE_DIR = "/tmp/xap-state"
|
STATE_DIR = "/tmp/xap-state"
|
||||||
|
|
||||||
def start_cpu_load(name):
|
def start_cpu_load(name):
|
||||||
"""Start a non-nice CPU burn as a transient systemd unit."""
|
"""Start a non-nice CPU burn as a transient systemd unit."""
|
||||||
machine.succeed(
|
machine.succeed(
|
||||||
@@ -62,13 +64,16 @@ pkgs.testers.runNixOSTest {
|
|||||||
f"systemd-run --unit={unit_name} "
|
f"systemd-run --unit={unit_name} "
|
||||||
f"--setenv=POLL_INTERVAL={POLL_INTERVAL} "
|
f"--setenv=POLL_INTERVAL={POLL_INTERVAL} "
|
||||||
f"--setenv=GRACE_PERIOD={GRACE_PERIOD} "
|
f"--setenv=GRACE_PERIOD={GRACE_PERIOD} "
|
||||||
f"--setenv=CPU_THRESHOLD={CPU_THRESHOLD} "
|
f"--setenv=CPU_STOP_THRESHOLD={CPU_STOP_THRESHOLD} "
|
||||||
|
f"--setenv=CPU_RESUME_THRESHOLD={CPU_RESUME_THRESHOLD} "
|
||||||
f"--setenv=STARTUP_COOLDOWN={STARTUP_COOLDOWN} "
|
f"--setenv=STARTUP_COOLDOWN={STARTUP_COOLDOWN} "
|
||||||
f"--setenv=STATE_DIR={STATE_DIR} "
|
f"--setenv=STATE_DIR={STATE_DIR} "
|
||||||
f"{PYTHON} {SCRIPT}"
|
f"{PYTHON} {SCRIPT}"
|
||||||
)
|
)
|
||||||
# Monitor needs two consecutive polls to compute a CPU delta.
|
# Monitor needs two consecutive polls to compute a CPU delta.
|
||||||
time.sleep(3)
|
time.sleep(3)
|
||||||
|
# Monitor needs two consecutive polls to compute a CPU delta (NOTE(review): duplicates the wait directly above; confirm the extra sleep is intentional).
|
||||||
|
time.sleep(3)
|
||||||
|
|
||||||
start_all()
|
start_all()
|
||||||
machine.wait_for_unit("multi-user.target")
|
machine.wait_for_unit("multi-user.target")
|
||||||
|
|||||||
Reference in New Issue
Block a user