llama-cpp: integrate native prometheus /metrics endpoint
llama.cpp server has a built-in /metrics endpoint exposing prompt_tokens_seconds, predicted_tokens_seconds, tokens_predicted_total, n_decode_total, and n_busy_slots_per_decode. Enable it with --metrics and add a Prometheus scrape target, replacing the need for any external metric collection for LLM inference monitoring.
This commit is contained in:
@@ -65,6 +65,12 @@ in
|
||||
{ targets = [ "127.0.0.1:${toString service_configs.ports.private.prometheus_apcupsd.port}" ]; }
|
||||
];
|
||||
}
|
||||
{
|
||||
job_name = "llama-cpp";
|
||||
static_configs = [
|
||||
{ targets = [ "127.0.0.1:${toString service_configs.ports.private.llama_cpp.port}" ]; }
|
||||
];
|
||||
}
|
||||
];
|
||||
};
|
||||
|
||||
|
||||
@@ -35,6 +35,7 @@ in
|
||||
"on"
|
||||
"--api-key-file"
|
||||
config.age.secrets.llama-cpp-api-key.path
|
||||
"--metrics"
|
||||
];
|
||||
};
|
||||
|
||||
|
||||
Reference in New Issue
Block a user