From 0235617627a7860ad713ffb0196ed9aeeb796b89 Mon Sep 17 00:00:00 2001
From: Simon Gardling <titaniumtown@proton.me>
Date: Thu, 2 Apr 2026 17:43:13 -0400
Subject: [PATCH] monitoring: fix intel-gpu-collector crash resilience

Wrap entire read_one_sample() in try/except to handle all failures
(missing binary, permission errors, malformed JSON, timeouts).
Write zero-valued metrics on failure instead of exiting non-zero.
Increase timeout from 5s to 8s for slower GPU initialization.
---
 services/intel-gpu-collector.py | 81 ++++++++++++++++++++++-----------
 1 file changed, 55 insertions(+), 26 deletions(-)

diff --git a/services/intel-gpu-collector.py b/services/intel-gpu-collector.py
index 97aacec..70a5560 100644
--- a/services/intel-gpu-collector.py
+++ b/services/intel-gpu-collector.py
@@ -12,33 +12,61 @@ TEXTFILE = os.environ.get(
 
 
 def read_one_sample():
-    proc = subprocess.Popen(
-        ["intel_gpu_top", "-J", "-s", "1000"],
-        stdout=subprocess.PIPE,
-        stderr=subprocess.DEVNULL,
-    )
-    buf = b""
-    depth = 0
-    in_obj = False
-    deadline = time.monotonic() + 5.0
     try:
-        while time.monotonic() < deadline:
-            byte = proc.stdout.read(1)
-            if not byte:
-                break
-            if byte == b"{":
-                in_obj = True
-                depth += 1
-            if in_obj:
-                buf += byte
-            if in_obj and byte == b"}":
-                depth -= 1
-                if depth == 0:
+        proc = subprocess.Popen(  # launched inside the outer try so a failed start is logged, not raised
+            ["intel_gpu_top", "-J", "-s", "1000"],  # presumably JSON output, 1000 ms period; confirm in intel_gpu_top(1)
+            stdout=subprocess.PIPE,
+            stderr=subprocess.DEVNULL,
+        )
+        buf = b""
+        depth = 0
+        in_obj = False
+        deadline = time.monotonic() + 8.0  # stop collecting bytes 8 s after launch
+        try:
+            while time.monotonic() < deadline:
+                byte = proc.stdout.read(1)  # NOTE(review): blocking read; the deadline is only checked between bytes
+                if not byte:  # empty read means the child closed stdout (EOF)
                     break
-    finally:
-        proc.terminate()
-        proc.wait()
-    return json.loads(buf) if buf else None
+                if byte == b"{":  # NOTE(review): braces inside JSON strings are counted too
+                    in_obj = True
+                    depth += 1
+                if in_obj:
+                    buf += byte
+                if in_obj and byte == b"}":
+                    depth -= 1
+                    if depth == 0:  # outermost object closed: stop after the first complete object
+                        break
+        finally:  # always stop and reap the child, even if reading raised
+            proc.terminate()
+            proc.wait()
+        if not buf:  # no "{" was seen before EOF or the deadline
+            return None
+        try:
+            return json.loads(buf)
+        except json.JSONDecodeError:  # e.g. an object cut short by the deadline or EOF
+            print("Malformed JSON from intel_gpu_top", file=sys.stderr)
+            return None
+    except Exception as e:  # any other failure, e.g. Popen being unable to start the binary
+        print(f"intel_gpu_top unavailable: {e}", file=sys.stderr)
+        return None
+
+
+def write_empty_metrics():
+    """Overwrite TEXTFILE with zeroed frequency/RC6 gauges so Prometheus sees no stale data."""
+    placeholder = (
+        "# HELP intel_gpu_engine_busy_percent Intel GPU engine busy percentage",
+        "# TYPE intel_gpu_engine_busy_percent gauge",
+        "# HELP intel_gpu_frequency_mhz Intel GPU actual frequency in MHz",
+        "# TYPE intel_gpu_frequency_mhz gauge",
+        "intel_gpu_frequency_mhz 0",
+        "# HELP intel_gpu_rc6_percent Intel GPU RC6 power-saving state percentage",
+        "# TYPE intel_gpu_rc6_percent gauge",
+        "intel_gpu_rc6_percent 0",
+    )
+    staging = TEXTFILE + ".tmp"
+    with open(staging, "w") as out:
+        out.writelines(row + "\n" for row in placeholder)
+    os.replace(staging, TEXTFILE)
 
 
 def write_metrics(sample):
@@ -70,7 +98,8 @@ def main():
     sample = read_one_sample()
     if sample is None:
         print("Failed to read intel_gpu_top sample", file=sys.stderr)
-        sys.exit(1)
+        write_empty_metrics()
+        sys.exit(0)
     write_metrics(sample)