diff --git a/services/llama-cpp.nix b/services/llama-cpp.nix index d9f7f76..ce6de79 100644 --- a/services/llama-cpp.nix +++ b/services/llama-cpp.nix @@ -20,6 +20,48 @@ in }) ]; + # Per-vhost Caddy access log for fail2ban to tail. llama.cpp's own + # "Invalid API Key" warning has no client IP, and behind Caddy the + # llama-server access log only sees 127.0.0.1. Caddy's JSON log has + # the real client IP via request.remote_ip. + services.caddy.virtualHosts."llm.${service_configs.https.domain}".extraConfig = '' + log { + output file /var/log/caddy/access-llama-cpp.log + format json + } + ''; + + # Ensure the log file exists on boot so fail2ban can start before Caddy + # has received its first request. + systemd.tmpfiles.rules = [ + "d /var/log/caddy 755 caddy caddy" + "f /var/log/caddy/access-llama-cpp.log 644 caddy caddy" + ]; + + # Ban IPs that repeatedly fail API key validation. llama.cpp's public + # endpoints (/, /index.html, /bundle.{js,css}, /health, /v1/models, + # /v1/health, /models, /api/tags, /props) bypass auth, so any 401 on + # this vhost is an authenticated-endpoint failure -- no need to filter + # on the Authorization header the way caddy-auth does. + services.fail2ban.jails.llama-cpp = { + enabled = true; + settings = { + backend = "auto"; + port = "http,https"; + logpath = "/var/log/caddy/access-llama-cpp.log"; + # defaults: maxretry=5, findtime=10m, bantime=10m + + # NAT hairpinning sends LAN traffic via the router IP. Don't ban + # 192.168.1.0/24 or we lock ourselves out. + ignoreip = "127.0.0.1/8 ::1 192.168.1.0/24"; + }; + filter.Definition = { + failregex = ''^.*"remote_ip":"<HOST>".*"status":401.*$''; # <HOST> is fail2ban's mandatory IP-capture token -- without it the filter has no address to ban + ignoreregex = ""; + datepattern = ''"ts":{Epoch}\.''; + }; + }; + + services.llama-cpp = { enable = true; model = toString ( diff --git a/tests/fail2ban-llama-cpp.nix b/tests/fail2ban-llama-cpp.nix new file mode 100644 index 0000000..33f65b4 --- /dev/null +++ b/tests/fail2ban-llama-cpp.nix @@ -0,0 +1,103 @@ +{ + config, + lib, + pkgs, + ... 
+}: +pkgs.testers.runNixOSTest { + name = "fail2ban-llama-cpp"; + + nodes = { + server = + { + config, + pkgs, + lib, + ... + }: + { + imports = [ + ../modules/server-security.nix + ]; + + # Minimal Caddy that stands in for the llama-cpp reverse_proxy. + # Every request returns 401, mimicking llama.cpp's api-key middleware + # on an invalid key. We only care that Caddy writes the 401 with the + # real client IP to the same access log the production jail tails. + services.caddy = { + enable = true; + virtualHosts.":80".extraConfig = '' + log { + output file /var/log/caddy/access-llama-cpp.log + format json + } + respond "Invalid API Key" 401 + ''; + }; + + # Jail definition mirrors services/llama-cpp.nix. ignoreip omitted + # so the test VM subnet isn't exempted; maxretry lowered for speed. + services.fail2ban.jails.llama-cpp = { + enabled = true; + settings = { + backend = "auto"; + port = "http,https"; + logpath = "/var/log/caddy/access-llama-cpp.log"; + maxretry = 3; + }; + filter.Definition = { + failregex = ''^.*"remote_ip":"<HOST>".*"status":401.*$''; # <HOST> captures the client IP to ban, matching the production filter + ignoreregex = ""; + datepattern = ''"ts":{Epoch}\.''; + }; + }; + + systemd.tmpfiles.rules = [ + "d /var/log/caddy 755 caddy caddy" + "f /var/log/caddy/access-llama-cpp.log 644 caddy caddy" + ]; + + networking.firewall.allowedTCPPorts = [ 80 ]; + }; + + client = { + environment.systemPackages = [ pkgs.curl ]; + }; + }; + + testScript = '' + import time + import re + + start_all() + server.wait_for_unit("caddy.service") + server.wait_for_unit("fail2ban.service") + server.wait_for_open_port(80) + time.sleep(2) + + with subtest("Verify llama-cpp jail is active"): + status = server.succeed("fail2ban-client status") + assert "llama-cpp" in status, f"llama-cpp jail not found in: {status}" + + with subtest("Generate failed API key attempts"): + # Force IPv4 for consistent IP tracking across the NAT fabric. 
+ for i in range(4): + client.execute( + "curl -4 -s -H 'Authorization: Bearer badkey' http://server/v1/chat/completions || true" + ) + time.sleep(1)  # pace requests so each 401 lands as a separate timestamped log entry + + with subtest("Verify IP is banned"): + time.sleep(5)  # allow fail2ban's log polling to observe the failures before querying + status = server.succeed("fail2ban-client status llama-cpp") + print(f"llama-cpp jail status: {status}") + match = re.search(r"Currently banned:\s*(\d+)", status) + assert match and int(match.group(1)) >= 1, ( + f"Expected at least 1 banned IP, got: {status}" + ) + + with subtest("Verify banned client cannot connect"): + exit_code = client.execute("curl -4 -s --max-time 3 http://server/ 2>&1")[0] + assert exit_code != 0, "Connection should be blocked" + ''; +} diff --git a/tests/tests.nix b/tests/tests.nix index 8493569..dda877a 100644 --- a/tests/tests.nix +++ b/tests/tests.nix @@ -21,6 +21,7 @@ in fail2banVaultwardenTest = handleTest ./fail2ban-vaultwarden.nix; fail2banImmichTest = handleTest ./fail2ban-immich.nix; fail2banJellyfinTest = handleTest ./fail2ban-jellyfin.nix; + fail2banLlamaCppTest = handleTest ./fail2ban-llama-cpp.nix; # jellyfin annotation service test jellyfinAnnotationsTest = handleTest ./jellyfin-annotations.nix;