nixos/modules/zfs.nix

{
  config,
  lib,
  service_configs,
  pkgs,
  ...
}:
let
  # Total RAM in bytes (from /proc/meminfo: MemTotal = 65775836 kB).
  totalRamBytes = 65775836 * 1024;
  # Hugepage reservations the kernel carves out up front; that memory is
  # never available to ZFS.
  hugepages2mBytes = service_configs.hugepages_2m.total_pages * 2 * 1024 * 1024;
  hugepages1gBytes = 3 * 1024 * 1024 * 1024; # 3x 1 GiB pages for RandomX (xmrig.nix)
  totalHugepageBytes = hugepages2mBytes + hugepages1gBytes;
  # ARC max: 60% of the RAM remaining after hugepages. Leaves headroom for
  # application RSS (PostgreSQL, qBittorrent, Jellyfin, Grafana, etc.),
  # kernel slabs, and page cache.
  arcMaxBytes = (totalRamBytes - totalHugepageBytes) * 60 / 100;
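  # Worked example (taking the 11.5 GiB hugepage total cited in the
  # kernelParams comment below): (62.7 - 11.5) GiB * 0.6 ≈ 30.7 GiB,
  # i.e. roughly half of the ~62 GiB ZFS would otherwise claim.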
in
{
  # Remove the inline dbuf_evict_one call so the dedicated eviction thread
  # handles dbuf cache pressure instead of stalling txg_sync on 6.14+ kernels.
  # https://github.com/openzfs/zfs/issues/18426
  boot.zfs.package = pkgs.zfs_2_4.overrideAttrs (old: {
    patches = (old.patches or [ ]) ++ [
      ../patches/zfs/0001-remove-dbuf_evict_one-call.patch
    ];
  });
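  # Runtime check (assumes the stock OpenZFS sysfs layout): after a reboot,
  # `cat /sys/module/zfs/version` should report the 2.4 build; the patch itself
  # only shows up as the absence of txg_sync stalls under dbuf cache pressure.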
  boot.initrd.kernelModules = [ "zfs" ];
  boot.kernelParams = [
    # 120s TXG timeout: batch more dirty data per transaction group so the
    # HDD pool (hdds) writes larger, sequential I/Os instead of many small syncs.
    # This is a global setting (no per-pool control); the SSD pool (tank) syncs
    # infrequently but handles it fine since SSDs don't suffer from seek overhead.
    "zfs.zfs_txg_timeout=120"
    # Cap ARC to prevent it from claiming memory reserved for hugepages.
    # Without this, ZFS auto-sizes c_max to ~62 GiB on a 64 GiB system,
    # ignoring the 11.5 GiB of hugepage reservations.
    "zfs.zfs_arc_max=${toString arcMaxBytes}"
    # vdev I/O scheduler: feed more concurrent reads to the block scheduler so
    # mq-deadline has a larger pool of requests to sort and merge into elevator sweeps.
    # The default async_read_max of 3 is far too few for effective coalescence;
    # 32 was empirically optimal (64 overwhelmed the drives, 3 gave near-zero merges).
    "zfs.zfs_vdev_async_read_max_active=32"
    "zfs.zfs_vdev_async_read_min_active=4"
    # Merge reads within 128 KiB of each other (default 32 KiB). On HDDs, reading a
    # 128 KiB gap is far cheaper than a mechanical seek (~8 ms).
    "zfs.zfs_vdev_read_gap_limit=131072"
    # Allow ZFS to aggregate I/Os up to 4 MiB (default 1 MiB), matching the
    # libtorrent piece extent size for larger sequential disk operations.
    "zfs.zfs_vdev_aggregation_limit=4194304"
  ];
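  # Sanity check after boot (standard OpenZFS module-parameter paths):
  #   cat /sys/module/zfs/parameters/zfs_txg_timeout
  #   cat /sys/module/zfs/parameters/zfs_arc_max
  #   cat /sys/module/zfs/parameters/zfs_vdev_async_read_max_active
  # The effective ARC ceiling also appears as c_max in /proc/spl/kstat/zfs/arcstats.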
  boot.supportedFilesystems = [ "zfs" ];
  boot.zfs.extraPools = [
    service_configs.zpool_ssds
    service_configs.zpool_hdds
  ];
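  # extraPools imports these by name at boot; after activation `zpool list`
  # should show both the SSD pool (tank) and the HDD pool (hdds) referenced
  # in the comments above.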
  services.sanoid = {
    enable = true;
    datasets."${service_configs.zpool_ssds}" = {
      recursive = true;
      autoprune = true;
      autosnap = true;
      hourly = 5;
      daily = 7;
      monthly = 3;
      yearly = 0;
    };
    datasets."${service_configs.zpool_ssds}/services/sql" = {
      recursive = true;
      autoprune = true;
      autosnap = true;
      hourly = 12;
      daily = 2;
      monthly = 0;
      yearly = 0;
    };
    datasets."${service_configs.zpool_ssds}/services/jellyfin/cache" = {
      recursive = true;
      autoprune = true;
      autosnap = true;
      hourly = 0;
      daily = 0;
      monthly = 0;
      yearly = 0;
    };
    datasets."${service_configs.zpool_ssds}/services/monero" = {
      recursive = true;
      autoprune = true;
      autosnap = true;
      hourly = 0;
      daily = 0;
      monthly = 0;
      yearly = 0;
    };
    datasets."${service_configs.zpool_ssds}/services/p2pool" = {
      recursive = true;
      autoprune = true;
      autosnap = true;
      hourly = 0;
      daily = 0;
      monthly = 0;
      yearly = 0;
    };
    datasets."${service_configs.zpool_hdds}" = {
      recursive = true;
      autoprune = true;
      autosnap = true;
      hourly = 0;
      daily = 0;
      monthly = 0;
      yearly = 0;
    };
  };
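  # The four zero-retention stanzas above could be collapsed with a sanoid
  # template. A sketch, assuming the NixOS module's template options
  # (not verified against this config):
  #   services.sanoid.templates.ignore = {
  #     autoprune = true;
  #     autosnap = true;
  #     hourly = 0; daily = 0; monthly = 0; yearly = 0;
  #   };
  # with each dataset then setting `useTemplate = [ "ignore" ];`.
  # Verify the resulting snapshots with `zfs list -t snapshot`.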
  services.zfs = {
    autoScrub.enable = true;
    trim.enable = true;
  };
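  # autoScrub and trim each install a systemd timer; `systemctl list-timers`
  # after activation should show the scrub and trim units (names per the
  # NixOS zfs module, e.g. a zfs-scrub timer and a zpool-trim timer).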
}