llama-cpp: integrate native prometheus /metrics endpoint

llama.cpp server has a built-in /metrics endpoint exposing
prompt_tokens_seconds, predicted_tokens_seconds, tokens_predicted_total,
n_decode_total, and n_busy_slots_per_decode. Enable it with --metrics
and add a Prometheus scrape target, replacing the need for any external
metric collection for LLM inference monitoring.
This commit is contained in:
2026-04-03 15:19:11 -04:00
parent 37ac88fc0f
commit 479ec43b8f
2 changed files with 7 additions and 0 deletions

View File

@@ -65,6 +65,12 @@ in
{ targets = [ "127.0.0.1:${toString service_configs.ports.private.prometheus_apcupsd.port}" ]; }
];
}
# Scrape llama.cpp's built-in Prometheus /metrics endpoint (served on the
# same port as the API once the server is started with --metrics).
# NOTE(review): assumes the scrape path is the default "/metrics" — confirm
# no metrics_path override is needed for this exporter.
{
job_name = "llama-cpp";
static_configs = [
{ targets = [ "127.0.0.1:${toString service_configs.ports.private.llama_cpp.port}" ]; }
];
}
];
};

View File

@@ -35,6 +35,7 @@ in
"on"
"--api-key-file"
config.age.secrets.llama-cpp-api-key.path
# Enable llama.cpp's native Prometheus /metrics endpoint
# (prompt_tokens_seconds, predicted_tokens_seconds, tokens_predicted_total,
# n_decode_total, n_busy_slots_per_decode).
"--metrics"
];
};