Compare commits
4 Commits
6d47f02a0f
...
960259b0d0
| Author | SHA1 | Date | |
|---|---|---|---|
|
960259b0d0
|
|||
|
5fa6f37b28
|
|||
|
7afd1f35d2
|
|||
|
a12dcb01ec
|
@@ -46,7 +46,7 @@
|
|||||||
|
|
||||||
./services/soulseek.nix
|
./services/soulseek.nix
|
||||||
|
|
||||||
./services/llama-cpp
|
# ./services/llama-cpp.nix
|
||||||
./services/trilium.nix
|
./services/trilium.nix
|
||||||
|
|
||||||
./services/ups.nix
|
./services/ups.nix
|
||||||
|
|||||||
42
flake.lock
generated
42
flake.lock
generated
@@ -238,11 +238,11 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1775077333,
|
"lastModified": 1775425411,
|
||||||
"narHash": "sha256-OXcxobt7lBkh1B8AjwreU+24myhtKpqeLfAeIyNLFY8=",
|
"narHash": "sha256-KY6HsebJHEe5nHOWP7ur09mb0drGxYSzE3rQxy62rJo=",
|
||||||
"owner": "nix-community",
|
"owner": "nix-community",
|
||||||
"repo": "home-manager",
|
"repo": "home-manager",
|
||||||
"rev": "49ca96b2714c5931e17401eff87f3edd42d2b0f2",
|
"rev": "0d02ec1d0a05f88ef9e74b516842900c41f0f2fe",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
@@ -304,11 +304,11 @@
|
|||||||
"rust-overlay": "rust-overlay"
|
"rust-overlay": "rust-overlay"
|
||||||
},
|
},
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1774858933,
|
"lastModified": 1775494882,
|
||||||
"narHash": "sha256-rgHUoE4QhOvK3Rcl9cbuIVdjPjFjfhcTm/uPs8Y7+2w=",
|
"narHash": "sha256-bOUFAWjD95Au+K1LkEhV4u3ulsiVfIXbbOFPxnEgSv8=",
|
||||||
"owner": "nix-community",
|
"owner": "nix-community",
|
||||||
"repo": "lanzaboote",
|
"repo": "lanzaboote",
|
||||||
"rev": "45338aab3013924c75305f5cb3543b9cda993183",
|
"rev": "40970afc8d8f1c122f3d282d3e7329d9faf65bec",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
@@ -368,11 +368,11 @@
|
|||||||
"systems": "systems_3"
|
"systems": "systems_3"
|
||||||
},
|
},
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1775185059,
|
"lastModified": 1775446111,
|
||||||
"narHash": "sha256-3d9gBmLMfI9d5xwfbd9Zr5JwpQzZ27qw9NiRjJ2aB28=",
|
"narHash": "sha256-3W1RFYoJgpC9N7Oezj3r4ILOzBP4LSob8QZV0/vuxhc=",
|
||||||
"owner": "Infinidoge",
|
"owner": "Infinidoge",
|
||||||
"repo": "nix-minecraft",
|
"repo": "nix-minecraft",
|
||||||
"rev": "f5d7077eb578b9e321b74329bd0625d5569dc90e",
|
"rev": "059dc0e19a275112ba0a396f0d7d2c4cda062d10",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
@@ -383,11 +383,11 @@
|
|||||||
},
|
},
|
||||||
"nixos-hardware": {
|
"nixos-hardware": {
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1774933469,
|
"lastModified": 1775490113,
|
||||||
"narHash": "sha256-OrnCQeUO2bqaWUl0lkDWyGWjKsOhtCyd7JSfTedQNUE=",
|
"narHash": "sha256-2ZBhDNZZwYkRmefK5XLOusCJHnoeKkoN95hoSGgMxWM=",
|
||||||
"owner": "NixOS",
|
"owner": "NixOS",
|
||||||
"repo": "nixos-hardware",
|
"repo": "nixos-hardware",
|
||||||
"rev": "f4c4c2c0c923d7811ac2a63ccc154767e4195337",
|
"rev": "c775c2772ba56e906cbeb4e0b2db19079ef11ff7",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
@@ -399,11 +399,11 @@
|
|||||||
},
|
},
|
||||||
"nixpkgs": {
|
"nixpkgs": {
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1775002709,
|
"lastModified": 1775305101,
|
||||||
"narHash": "sha256-d3Yx83vSrN+2z/loBh4mJpyRqr9aAJqlke4TkpFmRJA=",
|
"narHash": "sha256-/74n1oQPtKG52Yw41cbToxspxHbYz6O3vi+XEw16Qe8=",
|
||||||
"owner": "NixOS",
|
"owner": "NixOS",
|
||||||
"repo": "nixpkgs",
|
"repo": "nixpkgs",
|
||||||
"rev": "bcd464ccd2a1a7cd09aa2f8d4ffba83b761b1d0e",
|
"rev": "36a601196c4ebf49e035270e10b2d103fe39076b",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
@@ -624,11 +624,11 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1775095870,
|
"lastModified": 1775444042,
|
||||||
"narHash": "sha256-C15ZVObWmLOKOme4VkJru8+1an5xRZE0R0/t3AuIEKM=",
|
"narHash": "sha256-cg19ipIlZaLYgs/5ZPFcDDuOcZlGzfprB5xS4x7bVM4=",
|
||||||
"owner": "nix-community",
|
"owner": "nix-community",
|
||||||
"repo": "srvos",
|
"repo": "srvos",
|
||||||
"rev": "8677ae9b6569964e5a27e27abfb707a49a6b827f",
|
"rev": "64c9cc6a274dac7d08c4d53494ffa4acf906e287",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
@@ -715,11 +715,11 @@
|
|||||||
"trackerlist": {
|
"trackerlist": {
|
||||||
"flake": false,
|
"flake": false,
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1775167783,
|
"lastModified": 1775426970,
|
||||||
"narHash": "sha256-Tus994D/cxp3HDFRJ2057eBw5wHJ7EncOXyodiwUCwU=",
|
"narHash": "sha256-MXs6xRTFxCvXnhShHMTCSw70nFeIkY1L20YWXso0xyo=",
|
||||||
"owner": "ngosang",
|
"owner": "ngosang",
|
||||||
"repo": "trackerslist",
|
"repo": "trackerslist",
|
||||||
"rev": "74023c1466f7ad7b777a3047d10cca83e005c111",
|
"rev": "00634b20e7c805cffcde71f280324ef6ab45607f",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
|
|||||||
@@ -1,5 +0,0 @@
|
|||||||
{
|
|
||||||
imports = [
|
|
||||||
./llama-cpp.nix
|
|
||||||
];
|
|
||||||
}
|
|
||||||
@@ -21,12 +21,14 @@ lib.mkIf config.services.xmrig.enable {
|
|||||||
"AF_UNIX" # systemctl talks to systemd over D-Bus unix socket
|
"AF_UNIX" # systemctl talks to systemd over D-Bus unix socket
|
||||||
];
|
];
|
||||||
MemoryDenyWriteExecute = true;
|
MemoryDenyWriteExecute = true;
|
||||||
|
StateDirectory = "xmrig-auto-pause";
|
||||||
};
|
};
|
||||||
environment = {
|
environment = {
|
||||||
POLL_INTERVAL = "3";
|
POLL_INTERVAL = "3";
|
||||||
GRACE_PERIOD = "15";
|
GRACE_PERIOD = "15";
|
||||||
CPU_THRESHOLD = "5";
|
CPU_THRESHOLD = "5";
|
||||||
STARTUP_COOLDOWN = "10";
|
STARTUP_COOLDOWN = "10";
|
||||||
|
STATE_DIR = "/var/lib/xmrig-auto-pause";
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -35,6 +35,11 @@ CPU_THRESHOLD = float(os.environ.get("CPU_THRESHOLD", "5"))
|
|||||||
# RandomX dataset initialization complete (~4s on the target hardware)
|
# RandomX dataset initialization complete (~4s on the target hardware)
|
||||||
# without retriggering a stop.
|
# without retriggering a stop.
|
||||||
STARTUP_COOLDOWN = float(os.environ.get("STARTUP_COOLDOWN", "10"))
|
STARTUP_COOLDOWN = float(os.environ.get("STARTUP_COOLDOWN", "10"))
|
||||||
|
# Directory for persisting pause state across script restarts. Without
|
||||||
|
# this, a restart while xmrig is paused loses the paused_by_us flag and
|
||||||
|
# xmrig stays stopped permanently.
|
||||||
|
STATE_DIR = os.environ.get("STATE_DIR", "")
|
||||||
|
_PAUSE_FILE = os.path.join(STATE_DIR, "paused") if STATE_DIR else ""
|
||||||
|
|
||||||
|
|
||||||
def log(msg):
|
def log(msg):
|
||||||
@@ -78,13 +83,36 @@ def systemctl(action, unit):
|
|||||||
return result.returncode == 0
|
return result.returncode == 0
|
||||||
|
|
||||||
|
|
||||||
|
def _save_paused(paused):
|
||||||
|
"""Persist pause flag so a script restart can resume where we left off."""
|
||||||
|
if not _PAUSE_FILE:
|
||||||
|
return
|
||||||
|
try:
|
||||||
|
if paused:
|
||||||
|
open(_PAUSE_FILE, "w").close()
|
||||||
|
else:
|
||||||
|
os.remove(_PAUSE_FILE)
|
||||||
|
except OSError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def _load_paused():
|
||||||
|
"""Check if a previous instance left xmrig paused."""
|
||||||
|
if not _PAUSE_FILE:
|
||||||
|
return False
|
||||||
|
return os.path.isfile(_PAUSE_FILE)
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
paused_by_us = False
|
paused_by_us = _load_paused()
|
||||||
idle_since = None
|
idle_since = None
|
||||||
started_at = None # monotonic time when we last started xmrig
|
started_at = None # monotonic time when we last started xmrig
|
||||||
prev_total = None
|
prev_total = None
|
||||||
prev_work = None
|
prev_work = None
|
||||||
|
|
||||||
|
if paused_by_us:
|
||||||
|
log("Recovered pause state from previous instance")
|
||||||
|
|
||||||
log(
|
log(
|
||||||
f"Starting: poll={POLL_INTERVAL}s grace={GRACE_PERIOD}s "
|
f"Starting: poll={POLL_INTERVAL}s grace={GRACE_PERIOD}s "
|
||||||
f"threshold={CPU_THRESHOLD}% cooldown={STARTUP_COOLDOWN}s"
|
f"threshold={CPU_THRESHOLD}% cooldown={STARTUP_COOLDOWN}s"
|
||||||
@@ -116,6 +144,14 @@ def main():
|
|||||||
if time.monotonic() - started_at < STARTUP_COOLDOWN:
|
if time.monotonic() - started_at < STARTUP_COOLDOWN:
|
||||||
time.sleep(POLL_INTERVAL)
|
time.sleep(POLL_INTERVAL)
|
||||||
continue
|
continue
|
||||||
|
# Cooldown expired — verify xmrig survived startup. If it
|
||||||
|
# crashed during init (hugepage failure, pool unreachable, etc.),
|
||||||
|
# re-enter the pause/retry cycle rather than silently leaving
|
||||||
|
# xmrig dead.
|
||||||
|
if not is_active("xmrig.service"):
|
||||||
|
log("xmrig died during startup cooldown — will retry")
|
||||||
|
paused_by_us = True
|
||||||
|
_save_paused(True)
|
||||||
started_at = None
|
started_at = None
|
||||||
|
|
||||||
busy = real_work_pct > CPU_THRESHOLD
|
busy = real_work_pct > CPU_THRESHOLD
|
||||||
@@ -128,6 +164,7 @@ def main():
|
|||||||
# manage it again.
|
# manage it again.
|
||||||
log("xmrig was restarted externally while paused — reclaiming")
|
log("xmrig was restarted externally while paused — reclaiming")
|
||||||
paused_by_us = False
|
paused_by_us = False
|
||||||
|
_save_paused(False)
|
||||||
if not paused_by_us:
|
if not paused_by_us:
|
||||||
# Only claim ownership if xmrig is actually running.
|
# Only claim ownership if xmrig is actually running.
|
||||||
# If something else stopped it (e.g. UPS battery hook),
|
# If something else stopped it (e.g. UPS battery hook),
|
||||||
@@ -136,6 +173,7 @@ def main():
|
|||||||
log(f"Real workload detected ({real_work_pct:.1f}% CPU) — stopping xmrig")
|
log(f"Real workload detected ({real_work_pct:.1f}% CPU) — stopping xmrig")
|
||||||
if systemctl("stop", "xmrig.service"):
|
if systemctl("stop", "xmrig.service"):
|
||||||
paused_by_us = True
|
paused_by_us = True
|
||||||
|
_save_paused(True)
|
||||||
else:
|
else:
|
||||||
if paused_by_us:
|
if paused_by_us:
|
||||||
if idle_since is None:
|
if idle_since is None:
|
||||||
@@ -144,6 +182,7 @@ def main():
|
|||||||
log(f"Workload ended ({real_work_pct:.1f}% CPU) past grace period — starting xmrig")
|
log(f"Workload ended ({real_work_pct:.1f}% CPU) past grace period — starting xmrig")
|
||||||
if systemctl("start", "xmrig.service"):
|
if systemctl("start", "xmrig.service"):
|
||||||
paused_by_us = False
|
paused_by_us = False
|
||||||
|
_save_paused(False)
|
||||||
started_at = time.monotonic()
|
started_at = time.monotonic()
|
||||||
idle_since = None
|
idle_since = None
|
||||||
|
|
||||||
|
|||||||
@@ -44,6 +44,7 @@ pkgs.testers.runNixOSTest {
|
|||||||
GRACE_PERIOD = "5"
|
GRACE_PERIOD = "5"
|
||||||
CPU_THRESHOLD = "10"
|
CPU_THRESHOLD = "10"
|
||||||
STARTUP_COOLDOWN = "4"
|
STARTUP_COOLDOWN = "4"
|
||||||
|
STATE_DIR = "/tmp/xap-state"
|
||||||
|
|
||||||
def start_cpu_load(name):
|
def start_cpu_load(name):
|
||||||
"""Start a non-nice CPU burn as a transient systemd unit."""
|
"""Start a non-nice CPU burn as a transient systemd unit."""
|
||||||
@@ -55,22 +56,28 @@ pkgs.testers.runNixOSTest {
|
|||||||
def stop_cpu_load(name):
|
def stop_cpu_load(name):
|
||||||
machine.succeed(f"systemctl stop {name}")
|
machine.succeed(f"systemctl stop {name}")
|
||||||
|
|
||||||
start_all()
|
def start_monitor(unit_name):
|
||||||
machine.wait_for_unit("multi-user.target")
|
"""Start the auto-pause monitor as a transient unit."""
|
||||||
machine.wait_for_unit("xmrig.service")
|
|
||||||
|
|
||||||
with subtest("Start auto-pause monitor"):
|
|
||||||
machine.succeed(
|
machine.succeed(
|
||||||
f"systemd-run --unit=xmrig-auto-pause "
|
f"systemd-run --unit={unit_name} "
|
||||||
f"--setenv=POLL_INTERVAL={POLL_INTERVAL} "
|
f"--setenv=POLL_INTERVAL={POLL_INTERVAL} "
|
||||||
f"--setenv=GRACE_PERIOD={GRACE_PERIOD} "
|
f"--setenv=GRACE_PERIOD={GRACE_PERIOD} "
|
||||||
f"--setenv=CPU_THRESHOLD={CPU_THRESHOLD} "
|
f"--setenv=CPU_THRESHOLD={CPU_THRESHOLD} "
|
||||||
f"--setenv=STARTUP_COOLDOWN={STARTUP_COOLDOWN} "
|
f"--setenv=STARTUP_COOLDOWN={STARTUP_COOLDOWN} "
|
||||||
|
f"--setenv=STATE_DIR={STATE_DIR} "
|
||||||
f"{PYTHON} {SCRIPT}"
|
f"{PYTHON} {SCRIPT}"
|
||||||
)
|
)
|
||||||
# Monitor needs two consecutive polls to compute a CPU delta.
|
# Monitor needs two consecutive polls to compute a CPU delta.
|
||||||
time.sleep(3)
|
time.sleep(3)
|
||||||
|
|
||||||
|
start_all()
|
||||||
|
machine.wait_for_unit("multi-user.target")
|
||||||
|
machine.wait_for_unit("xmrig.service")
|
||||||
|
machine.succeed(f"mkdir -p {STATE_DIR}")
|
||||||
|
|
||||||
|
with subtest("Start auto-pause monitor"):
|
||||||
|
start_monitor("xmrig-auto-pause")
|
||||||
|
|
||||||
with subtest("xmrig stays running while system is idle"):
|
with subtest("xmrig stays running while system is idle"):
|
||||||
machine.succeed("systemctl is-active xmrig")
|
machine.succeed("systemctl is-active xmrig")
|
||||||
|
|
||||||
@@ -134,5 +141,61 @@ pkgs.testers.runNixOSTest {
|
|||||||
|
|
||||||
stop_cpu_load("cpu-load-4")
|
stop_cpu_load("cpu-load-4")
|
||||||
machine.wait_until_succeeds("systemctl is-active xmrig", timeout=20)
|
machine.wait_until_succeeds("systemctl is-active xmrig", timeout=20)
|
||||||
|
|
||||||
|
# --- State persistence and crash recovery ---
|
||||||
|
machine.succeed("systemctl stop xmrig-auto-pause")
|
||||||
|
|
||||||
|
with subtest("xmrig recovers after crash during startup cooldown"):
|
||||||
|
machine.succeed(f"rm -rf {STATE_DIR} && mkdir -p {STATE_DIR}")
|
||||||
|
start_monitor("xmrig-auto-pause-crash")
|
||||||
|
|
||||||
|
# Load -> xmrig stops
|
||||||
|
start_cpu_load("cpu-crash")
|
||||||
|
machine.wait_until_fails("systemctl is-active xmrig", timeout=20)
|
||||||
|
|
||||||
|
# End load -> xmrig restarts after grace period
|
||||||
|
stop_cpu_load("cpu-crash")
|
||||||
|
machine.wait_until_succeeds("systemctl is-active xmrig", timeout=30)
|
||||||
|
|
||||||
|
# Kill xmrig immediately — simulates crash during startup cooldown.
|
||||||
|
# The script should detect the failure when cooldown expires and
|
||||||
|
# re-enter the retry cycle.
|
||||||
|
machine.succeed("systemctl kill --signal=KILL xmrig")
|
||||||
|
machine.wait_until_fails("systemctl is-active xmrig", timeout=5)
|
||||||
|
|
||||||
|
# After cooldown + grace period + restart, xmrig should be back.
|
||||||
|
machine.wait_until_succeeds("systemctl is-active xmrig", timeout=30)
|
||||||
|
|
||||||
|
machine.succeed("systemctl stop xmrig-auto-pause-crash")
|
||||||
|
machine.succeed("systemctl reset-failed xmrig.service || true")
|
||||||
|
machine.succeed("systemctl start xmrig")
|
||||||
|
machine.wait_for_unit("xmrig.service")
|
||||||
|
|
||||||
|
with subtest("Script restart preserves pause state"):
|
||||||
|
machine.succeed(f"rm -rf {STATE_DIR} && mkdir -p {STATE_DIR}")
|
||||||
|
start_monitor("xmrig-auto-pause-persist")
|
||||||
|
|
||||||
|
# Load -> xmrig stops
|
||||||
|
start_cpu_load("cpu-persist")
|
||||||
|
machine.wait_until_fails("systemctl is-active xmrig", timeout=20)
|
||||||
|
|
||||||
|
# Kill the monitor while xmrig is paused (simulates script crash)
|
||||||
|
machine.succeed("systemctl stop xmrig-auto-pause-persist")
|
||||||
|
|
||||||
|
# State file must exist — the monitor persisted the pause flag
|
||||||
|
machine.succeed(f"test -f {STATE_DIR}/paused")
|
||||||
|
|
||||||
|
# Start a fresh monitor instance (reads state file on startup)
|
||||||
|
start_monitor("xmrig-auto-pause-persist2")
|
||||||
|
|
||||||
|
# End load — the new monitor should pick up the paused state
|
||||||
|
# and restart xmrig after the grace period
|
||||||
|
stop_cpu_load("cpu-persist")
|
||||||
|
machine.wait_until_succeeds("systemctl is-active xmrig", timeout=30)
|
||||||
|
|
||||||
|
# State file should be cleaned up after successful restart
|
||||||
|
machine.fail(f"test -f {STATE_DIR}/paused")
|
||||||
|
|
||||||
|
machine.succeed("systemctl stop xmrig-auto-pause-persist2")
|
||||||
'';
|
'';
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user