Two bugs found during live verification on the server:

1. Stuck state after external restart: if something else restarted xmrig (e.g. deploy-rs activation) while paused_by_us=True, the script never detected this and became permanently stuck — unable to stop xmrig on future load because it thought xmrig was already stopped. Fix: when paused_by_us=True and the system is busy, check whether xmrig is actually running. If so, reset paused_by_us=False and re-stop it.

2. Flapping on xmrig restart: RandomX dataset init takes ~3.7s of intense non-nice CPU, which the script detected as real workload and immediately re-stopped xmrig after every restart, creating a start-stop loop. Fix: add STARTUP_COOLDOWN (default 10s) — after starting xmrig, skip CPU checks until the cooldown expires.

Both bugs were present in production: the script had been stuck since Apr 3 (2+ days) with xmrig running unmanaged alongside llama-server.
33 lines
820 B
Nix
33 lines
820 B
Nix
# NixOS module: runs xmrig-auto-pause.py as a long-lived systemd service.
# Everything below is gated on services.xmrig.enable, so the unit only
# exists on hosts where xmrig itself is enabled.
{
  config,
  lib,
  pkgs,
  ...
}:
let
  # Script that lives next to this module; interpolating the path into
  # ExecStart copies it into the Nix store.
  pauseScript = ./xmrig-auto-pause.py;

  # Sandboxing applied to the watcher process.
  hardening = {
    NoNewPrivileges = true;
    ProtectHome = true;
    ProtectSystem = "strict";
    PrivateTmp = true;
    MemoryDenyWriteExecute = true;
    RestrictAddressFamilies = [
      "AF_UNIX" # systemctl talks to systemd over D-Bus unix socket
    ];
  };

  # Tunables passed to the script through its environment.
  # NOTE(review): units are not visible here — presumably seconds for the
  # interval/period/cooldown values and percent for the threshold; confirm
  # against xmrig-auto-pause.py.
  tunables = {
    POLL_INTERVAL = "3";
    GRACE_PERIOD = "15";
    CPU_THRESHOLD = "5";
    STARTUP_COOLDOWN = "10";
  };
in
lib.mkIf config.services.xmrig.enable {
  systemd.services.xmrig-auto-pause = {
    description = "Auto-pause xmrig when other services need CPU";

    # Start at boot, ordered after the miner's own unit.
    wantedBy = [ "multi-user.target" ];
    after = [ "xmrig.service" ];

    environment = tunables;

    serviceConfig = hardening // {
      ExecStart = "${pkgs.python3}/bin/python3 ${pauseScript}";
      # Bring the watcher back 10s after any exit.
      Restart = "always";
      RestartSec = "10s";
    };
  };
}
|