monitoring: improve intel-gpu-collector crash resilience

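Wrap the entire body of read_one_sample() in try/except so that any failure (missing binary, permission errors, malformed JSON, timeouts) is logged to stderr and reported as None instead of raising. When no sample is available, main() now writes zero-valued metrics via the new write_empty_metrics() and exits 0 rather than exiting non-zero, so Prometheus does not keep serving stale data. Increase the sample-read timeout from 5s to 8s to allow for slower GPU initialization.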
2026-04-02 17:43:13 -04:00
parent df15be01ea
commit 0235617627
1 changed files with 55 additions and 26 deletions
--- a/services/intel-gpu-collector.py
+++ b/services/intel-gpu-collector.py
@@ -12,33 +12,61 @@ TEXTFILE = os.environ.get(


 def read_one_sample():
-    proc = subprocess.Popen(
-        ["intel_gpu_top", "-J", "-s", "1000"],
-        stdout=subprocess.PIPE,
-        stderr=subprocess.DEVNULL,
-    )
-    buf = b""
-    depth = 0
-    in_obj = False
-    deadline = time.monotonic() + 5.0
    try:
-        while time.monotonic() < deadline:
-            byte = proc.stdout.read(1)
-            if not byte:
-                break
-            if byte == b"{":
-                in_obj = True
-                depth += 1
-            if in_obj:
-                buf += byte
-            if in_obj and byte == b"}":
-                depth -= 1
-                if depth == 0:
+        proc = subprocess.Popen(
+            ["intel_gpu_top", "-J", "-s", "1000"],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.DEVNULL,
+        )
+        buf = b""
+        depth = 0
+        in_obj = False
+        deadline = time.monotonic() + 8.0
+        try:
+            while time.monotonic() < deadline:
+                byte = proc.stdout.read(1)
+                if not byte:
                    break
-    finally:
-        proc.terminate()
-        proc.wait()
-    return json.loads(buf) if buf else None
+                if byte == b"{":
+                    in_obj = True
+                    depth += 1
+                if in_obj:
+                    buf += byte
+                if in_obj and byte == b"}":
+                    depth -= 1
+                    if depth == 0:
+                        break
+        finally:
+            proc.terminate()
+            proc.wait()
+        if not buf:
+            return None
+        try:
+            return json.loads(buf)
+        except json.JSONDecodeError:
+            print("Malformed JSON from intel_gpu_top", file=sys.stderr)
+            return None
+    except Exception as e:
+        print(f"intel_gpu_top unavailable: {e}", file=sys.stderr)
+        return None
+
+
+def write_empty_metrics():
+    """Write zero-valued metrics so Prometheus doesn't see stale data."""
+    lines = [
+        "# HELP intel_gpu_engine_busy_percent Intel GPU engine busy percentage",
+        "# TYPE intel_gpu_engine_busy_percent gauge",
+        "# HELP intel_gpu_frequency_mhz Intel GPU actual frequency in MHz",
+        "# TYPE intel_gpu_frequency_mhz gauge",
+        "intel_gpu_frequency_mhz 0",
+        "# HELP intel_gpu_rc6_percent Intel GPU RC6 power-saving state percentage",
+        "# TYPE intel_gpu_rc6_percent gauge",
+        "intel_gpu_rc6_percent 0",
+    ]
+    tmp = TEXTFILE + ".tmp"
+    with open(tmp, "w") as f:
+        f.write("\n".join(lines) + "\n")
+    os.replace(tmp, TEXTFILE)


 def write_metrics(sample):
@@ -70,7 +98,8 @@ def main():
    sample = read_one_sample()
    if sample is None:
        print("Failed to read intel_gpu_top sample", file=sys.stderr)
-        sys.exit(1)
+        write_empty_metrics()
+        sys.exit(0)
    write_metrics(sample)