nixos/modules/zfs.nix

{
  config,
  lib,
  service_configs,
  pkgs,
  ...
}:
let
  # Total RAM in bytes (from /proc/meminfo: MemTotal = 65775836 kB).
  totalRamBytes = 65775836 * 1024;
  # Hugepage reservations the kernel carves out up front; that memory is
  # never available to ZFS.
  hugepages2mBytes = service_configs.hugepages_2m.total_pages * 2 * 1024 * 1024;
  hugepages1gBytes = 3 * 1024 * 1024 * 1024; # 3x 1 GiB pages for RandomX (xmrig.nix)
  totalHugepageBytes = hugepages2mBytes + hugepages1gBytes;
  # ARC max: 60% of the RAM remaining after hugepages. Leaves headroom for
  # application RSS (PostgreSQL, qBittorrent, Jellyfin, Grafana, etc.),
  # kernel slabs, and page cache.
  arcMaxBytes = (totalRamBytes - totalHugepageBytes) * 60 / 100;
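  # Worked example (taking the 11.5 GiB hugepage total cited in the
  # kernelParams comment below): (62.7 - 11.5) GiB * 0.6 ≈ 30.7 GiB,
  # i.e. roughly half of the ~62 GiB ZFS would otherwise claim.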
in
{
  # Remove the inline dbuf_evict_one call so the dedicated eviction thread
  # handles dbuf cache pressure instead of stalling txg_sync on 6.14+ kernels.
  # https://github.com/openzfs/zfs/issues/18426
  boot.zfs.package = pkgs.zfs_2_4.overrideAttrs (old: {
    patches = (old.patches or [ ]) ++ [
      ../patches/zfs/0001-remove-dbuf_evict_one-call.patch
    ];
  });
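  # Runtime check (assumes the stock OpenZFS sysfs layout): after a reboot,
  # `cat /sys/module/zfs/version` should report the 2.4 build; the patch itself
  # only shows up as the absence of txg_sync stalls under dbuf cache pressure.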
  boot.initrd.kernelModules = [ "zfs" ];
  boot.kernelParams = [
    # 120s TXG timeout: batch more dirty data per transaction group so the
    # HDD pool (hdds) writes larger, sequential I/Os instead of many small syncs.
    # This is a global setting (no per-pool control); the SSD pool (tank) syncs
    # infrequently but handles it fine since SSDs don't suffer from seek overhead.
    "zfs.zfs_txg_timeout=120"
    # Cap ARC to prevent it from claiming memory reserved for hugepages.
    # Without this, ZFS auto-sizes c_max to ~62 GiB on a 64 GiB system,
    # ignoring the 11.5 GiB of hugepage reservations.
    "zfs.zfs_arc_max=${toString arcMaxBytes}"
    # vdev I/O scheduler: feed more concurrent reads to the block scheduler so
    # mq-deadline has a larger pool of requests to sort and merge into elevator sweeps.
    # The default async_read_max of 3 is far too few for effective coalescence;
    # 32 was empirically optimal (64 overwhelmed the drives, 3 gave near-zero merges).
    "zfs.zfs_vdev_async_read_max_active=32"
    "zfs.zfs_vdev_async_read_min_active=4"
    # Merge reads within 128 KiB of each other (default 32 KiB). On HDDs, reading a
    # 128 KiB gap is far cheaper than a mechanical seek (~8 ms).
    "zfs.zfs_vdev_read_gap_limit=131072"
    # Allow ZFS to aggregate I/Os up to 4 MiB (default 1 MiB), matching the
    # libtorrent piece extent size for larger sequential disk operations.
    "zfs.zfs_vdev_aggregation_limit=4194304"
  ];
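  # Sanity check after boot (standard OpenZFS module-parameter paths):
  #   cat /sys/module/zfs/parameters/zfs_txg_timeout
  #   cat /sys/module/zfs/parameters/zfs_arc_max
  #   cat /sys/module/zfs/parameters/zfs_vdev_async_read_max_active
  # The effective ARC ceiling also appears as c_max in /proc/spl/kstat/zfs/arcstats.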
  boot.supportedFilesystems = [ "zfs" ];
  boot.zfs.extraPools = [
    service_configs.zpool_ssds
    service_configs.zpool_hdds
  ];
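  # extraPools imports these by name at boot; after activation `zpool list`
  # should show both the SSD pool (tank) and the HDD pool (hdds) referenced
  # in the comments above.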
  services.sanoid = {
    enable = true;
    datasets."${service_configs.zpool_ssds}" = {
      recursive = true;
      autoprune = true;
      autosnap = true;
      hourly = 5;
      daily = 7;
      monthly = 3;
      yearly = 0;
    };
    datasets."${service_configs.zpool_ssds}/services/sql" = {
      recursive = true;
      autoprune = true;
      autosnap = true;
      hourly = 12;
      daily = 2;
      monthly = 0;
      yearly = 0;
    };
    datasets."${service_configs.zpool_ssds}/services/jellyfin/cache" = {
      recursive = true;
      autoprune = true;
      autosnap = true;
      hourly = 0;
      daily = 0;
      monthly = 0;
      yearly = 0;
    };
    datasets."${service_configs.zpool_ssds}/services/monero" = {
      recursive = true;
      autoprune = true;
      autosnap = true;
      hourly = 0;
      daily = 0;
      monthly = 0;
      yearly = 0;
    };
    datasets."${service_configs.zpool_ssds}/services/p2pool" = {
      recursive = true;
      autoprune = true;
      autosnap = true;
      hourly = 0;
      daily = 0;
      monthly = 0;
      yearly = 0;
    };
    datasets."${service_configs.zpool_hdds}" = {
      recursive = true;
      autoprune = true;
      autosnap = true;
      hourly = 0;
      daily = 0;
      monthly = 0;
      yearly = 0;
    };
  };
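  # The four zero-retention stanzas above could be collapsed with a sanoid
  # template. A sketch, assuming the NixOS module's template options
  # (not verified against this config):
  #   services.sanoid.templates.ignore = {
  #     autoprune = true;
  #     autosnap = true;
  #     hourly = 0; daily = 0; monthly = 0; yearly = 0;
  #   };
  # with each dataset then setting `useTemplate = [ "ignore" ];`.
  # Verify the resulting snapshots with `zfs list -t snapshot`.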
  services.zfs = {
    autoScrub.enable = true;
    trim.enable = true;
  };
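  # autoScrub and trim each install a systemd timer; `systemctl list-timers`
  # after activation should show the scrub and trim units (names per the
  # NixOS zfs module, e.g. a zfs-scrub timer and a zpool-trim timer).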
}