llama-cpp: add grafana annotations for inference requests

Poll /slots endpoint, create annotations when slots start processing,
close with token count when complete. Includes NixOS VM test with
mock llama-cpp and grafana servers. Dashboard annotation entry added.
This commit is contained in:
2026-04-02 17:43:49 -04:00
parent 0235617627
commit 9baeaa5c23
6 changed files with 362 additions and 0 deletions

View File

@@ -120,6 +120,18 @@ let
type = "tags";
tags = [ "zfs-scrub" ];
}
# Dashboard annotation source for llama-cpp inference requests.
# Mirrors the zfs-scrub entry directly above: annotations are matched by
# tag and stored in Grafana's built-in annotation store (the special
# "-- Grafana --" datasource). The poller described in the commit message
# creates/closes annotations tagged "llama-cpp", which this entry surfaces.
{
name = "LLM Requests";
datasource = {
type = "grafana";
uid = "-- Grafana --";
};
enable = true;
iconColor = "purple";
# NOTE(review): showIn = 0 appears to mean "show on all panels"; it is a
# legacy annotation field — confirm the deployed Grafana version honors it.
showIn = 0;
type = "tags";
tags = [ "llama-cpp" ];
}
];
panels = [