llama-cpp: integrate native Prometheus /metrics endpoint

The llama.cpp server has a built-in /metrics endpoint that exposes, under the `llamacpp:` prefix, prompt_tokens_seconds, predicted_tokens_seconds, tokens_predicted_total, n_decode_total, and n_busy_slots_per_decode. Enable it with --metrics and add a Prometheus scrape target for it, which removes the need for any external metric collector for LLM inference monitoring.
2026-04-03 15:19:11 -04:00
parent 37ac88fc0f
commit 479ec43b8f
2 changed files with 7 additions and 0 deletions
--- a/services/grafana/prometheus.nix
+++ b/services/grafana/prometheus.nix
@@ -65,6 +65,12 @@ in
          { targets = [ "127.0.0.1:${toString service_configs.ports.private.prometheus_apcupsd.port}" ]; }
        ];
      }
+      {
+        job_name = "llama-cpp";
+        static_configs = [
+          { targets = [ "127.0.0.1:${toString service_configs.ports.private.llama_cpp.port}" ]; }
+        ];
+      }
    ];
  };

--- a/services/llama-cpp/llama-cpp.nix
+++ b/services/llama-cpp/llama-cpp.nix
@@ -35,6 +35,7 @@ in
      "on"
      "--api-key-file"
      config.age.secrets.llama-cpp-api-key.path
+      "--metrics"
    ];
  };