diff --git a/modules/zfs.nix b/modules/zfs.nix
index c447c34..4e5da01 100644
--- a/modules/zfs.nix
+++ b/modules/zfs.nix
@@ -1,15 +1,39 @@
 {
   config,
+  lib,
   service_configs,
   pkgs,
   ...
 }:
+let
+  # Total RAM in bytes (from /proc/meminfo: 65775836 KiB).
+  totalRamBytes = 65775836 * 1024;
+
+  # Hugepage reservations that the kernel carves out before ZFS can use them.
+  hugepages2mBytes = service_configs.hugepages_2m.total_pages * 2 * 1024 * 1024;
+  hugepages1gBytes = 3 * 1024 * 1024 * 1024; # 3x 1G pages for RandomX (xmrig.nix)
+  totalHugepageBytes = hugepages2mBytes + hugepages1gBytes;
+
+  # ARC max: 60% of RAM remaining after hugepages. Leaves headroom for
+  # application RSS (PostgreSQL, qBittorrent, Jellyfin, Grafana, etc.),
+  # kernel slabs, and page cache.
+  arcMaxBytes = (totalRamBytes - totalHugepageBytes) * 60 / 100;
+in
 {
-  boot.zfs.package = pkgs.zfs;
+  boot.zfs.package = pkgs.zfs_2_4;
   boot.initrd.kernelModules = [ "zfs" ];
 
   boot.kernelParams = [
-    "zfs.zfs_txg_timeout=120" # longer TXG open time = larger sequential writes
+    # 120s TXG timeout: batch more dirty data per transaction group so the
+    # HDD pool (hdds) writes larger, sequential I/Os instead of many small syncs.
+    # This is a global setting (no per-pool control); the SSD pool (tank) syncs
+    # infrequently but handles it fine since SSDs don't suffer from seek overhead.
+    "zfs.zfs_txg_timeout=120"
+
+    # Cap ARC to prevent it from claiming memory reserved for hugepages.
+    # Without this, ZFS auto-sizes c_max to ~62 GiB on a 64 GiB system,
+    # ignoring the 11.5 GiB of hugepage reservations.
+    "zfs.zfs_arc_max=${toString arcMaxBytes}"
 
     # vdev I/O scheduler: feed more concurrent reads to the block scheduler so
     # mq-deadline has a larger pool of requests to sort and merge into elevator sweeps.