{ config, lib, service_configs, pkgs, ... }:

let
  # Total RAM in bytes (from /proc/meminfo: 65775836 KiB).
  totalRamBytes = 65775836 * 1024;

  # Hugepage reservations that the kernel carves out before ZFS can use them.
  hugepages2mBytes = service_configs.hugepages_2m.total_pages * 2 * 1024 * 1024;
  hugepages1gBytes = 3 * 1024 * 1024 * 1024; # 3x 1G pages for RandomX (xmrig.nix)
  totalHugepageBytes = hugepages2mBytes + hugepages1gBytes;

  # ARC max: 60% of RAM remaining after hugepages. Leaves headroom for
  # application RSS (PostgreSQL, qBittorrent, Jellyfin, Grafana, etc.),
  # kernel slabs, and page cache. (Nix integer arithmetic: multiply before
  # divide so the truncating division loses at most 1 byte of precision.)
  arcMaxBytes = (totalRamBytes - totalHugepageBytes) * 60 / 100;

  # Sanoid retention policy shared by every dataset below: all policies use
  # recursive snapshots with auto-prune/auto-snap and never keep yearlies;
  # only the hourly/daily/monthly counts differ per dataset.
  sanoidPolicy = { hourly, daily, monthly }: {
    recursive = true;
    autoprune = true;
    autosnap = true;
    inherit hourly daily monthly;
    yearly = 0;
  };

  # Zero-retention policy: dataset stays under sanoid management (so stale
  # snapshots are pruned) but no new snapshots are kept.
  noRetention = sanoidPolicy { hourly = 0; daily = 0; monthly = 0; };
in {
  # remove inline dbuf_evict_one call so the dedicated eviction thread
  # handles cache pressure instead of stalling txg_sync on 6.14+.
  # https://github.com/openzfs/zfs/issues/18426
  boot.zfs.package = pkgs.zfs_2_4.overrideAttrs (old: {
    patches = (old.patches or [ ])
      ++ [ ../patches/zfs/0001-remove-dbuf_evict_one-call.patch ];
  });
  boot.zfs.modulePackage = config.boot.zfs.package;

  boot.initrd.kernelModules = [ "zfs" ];

  boot.kernelParams = [
    # 120s TXG timeout: batch more dirty data per transaction group so the
    # HDD pool (hdds) writes larger, sequential I/Os instead of many small syncs.
    # This is a global setting (no per-pool control); the SSD pool (tank) syncs
    # infrequently but handles it fine since SSDs don't suffer from seek overhead.
    "zfs.zfs_txg_timeout=120"

    # Cap ARC to prevent it from claiming memory reserved for hugepages.
    # Without this, ZFS auto-sizes c_max to ~62 GiB on a 64 GiB system,
    # ignoring the 11.5 GiB of hugepage reservations.
    "zfs.zfs_arc_max=${toString arcMaxBytes}"

    # vdev I/O scheduler: feed more concurrent reads to the block scheduler so
    # mq-deadline has a larger pool of requests to sort and merge into elevator sweeps.
    # Default async_read_max is 3 — far too few for effective coalescence.
    # 32 was empirically optimal (64 overwhelmed the drives, 3 gave near-zero merges).
    "zfs.zfs_vdev_async_read_max_active=32"
    "zfs.zfs_vdev_async_read_min_active=4"

    # Merge reads within 128 KiB of each other (default 32 KiB). On HDDs, reading a
    # 128 KiB gap is far cheaper than a mechanical seek (~8 ms).
    "zfs.zfs_vdev_read_gap_limit=131072"

    # Allow ZFS to aggregate I/Os up to 4 MiB (default 1 MiB), matching the
    # libtorrent piece extent size for larger sequential disk operations.
    "zfs.zfs_vdev_aggregation_limit=4194304"
  ];

  boot.supportedFilesystems = [ "zfs" ];
  boot.zfs.extraPools = [
    service_configs.zpool_ssds
    service_configs.zpool_hdds
  ];

  services.sanoid = {
    enable = true;
    datasets = {
      # SSD pool root: general-purpose retention.
      "${service_configs.zpool_ssds}" =
        sanoidPolicy { hourly = 5; daily = 7; monthly = 3; };

      # Database data: frequent hourlies, short tail.
      "${service_configs.zpool_ssds}/services/sql" =
        sanoidPolicy { hourly = 12; daily = 2; monthly = 0; };

      # Transient / reproducible data: managed but never retained.
      "${service_configs.zpool_ssds}/services/jellyfin/cache" = noRetention;
      "${service_configs.zpool_ssds}/services/monero" = noRetention;
      "${service_configs.zpool_ssds}/services/p2pool" = noRetention;
      "${service_configs.zpool_hdds}" = noRetention;
    };
  };

  services.zfs = {
    autoScrub.enable = true;
    trim.enable = true;
  };
}