monitoring: fix intel-gpu-collector crash resilience

Wrap entire read_one_sample() in try/except to handle all failures (missing binary, permission errors, malformed JSON, timeouts). Write zero-valued metrics on failure instead of exiting non-zero. Increase timeout from 5s to 8s for slower GPU initialization.
2026-04-02 17:43:13 -04:00
parent df15be01ea
commit 0235617627
1 changed files with 55 additions and 26 deletions
--- a/services/intel-gpu-collector.py
+++ b/services/intel-gpu-collector.py
@@ -12,33 +12,61 @@ TEXTFILE = os.environ.get(
 def read_one_sample():
    proc = subprocess.Popen(
        ["intel_gpu_top", "-J", "-s", "1000"],
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL,
    )
    buf = b""
    depth = 0
    in_obj = False
    deadline = time.monotonic() + 5.0
    try:
-        while time.monotonic() < deadline:
+        proc = subprocess.Popen(
-            byte = proc.stdout.read(1)
+            ["intel_gpu_top", "-J", "-s", "1000"],
-            if not byte:
+            stdout=subprocess.PIPE,
-                break
+            stderr=subprocess.DEVNULL,
-            if byte == b"{":
+        )
-                in_obj = True
+        buf = b""
-                depth += 1
+        depth = 0
-            if in_obj:
+        in_obj = False
-                buf += byte
+        deadline = time.monotonic() + 8.0
-            if in_obj and byte == b"}":
+        try:
-                depth -= 1
+            while time.monotonic() < deadline:
-                if depth == 0:
+                byte = proc.stdout.read(1)
                if not byte:
                    break
-    finally:
+                if byte == b"{":
-        proc.terminate()
+                    in_obj = True
-        proc.wait()
+                    depth += 1
-    return json.loads(buf) if buf else None
+                if in_obj:
                    buf += byte
                if in_obj and byte == b"}":
                    depth -= 1
                    if depth == 0:
                        break
        finally:
            proc.terminate()
            proc.wait()
        if not buf:
            return None
        try:
            return json.loads(buf)
        except json.JSONDecodeError:
            print("Malformed JSON from intel_gpu_top", file=sys.stderr)
            return None
    except Exception as e:
        print(f"intel_gpu_top unavailable: {e}", file=sys.stderr)
        return None
 def write_empty_metrics():
    """Publish a placeholder metrics file when no GPU sample is available.

    The frequency and RC6 gauges are emitted with a value of 0; the
    engine-busy gauge is only declared (HELP/TYPE) and gets no sample
    line. The text is written to ``TEXTFILE + ".tmp"`` and then moved
    over ``TEXTFILE`` with ``os.replace`` -- presumably so a reader never
    picks up a half-written file (confirm against the consumer).
    """
    payload = "\n".join(
        (
            # Declared only: no sample line follows for this metric.
            "# HELP intel_gpu_engine_busy_percent Intel GPU engine busy percentage",
            "# TYPE intel_gpu_engine_busy_percent gauge",
            "# HELP intel_gpu_frequency_mhz Intel GPU actual frequency in MHz",
            "# TYPE intel_gpu_frequency_mhz gauge",
            "intel_gpu_frequency_mhz 0",
            "# HELP intel_gpu_rc6_percent Intel GPU RC6 power-saving state percentage",
            "# TYPE intel_gpu_rc6_percent gauge",
            "intel_gpu_rc6_percent 0",
        )
    ) + "\n"
    staging_path = TEXTFILE + ".tmp"
    with open(staging_path, "w") as out:
        out.write(payload)
    # Swap the finished file into place in a single rename step.
    os.replace(staging_path, TEXTFILE)
 def write_metrics(sample):
@@ -70,7 +98,8 @@ def main():
    sample = read_one_sample()
    if sample is None:
        print("Failed to read intel_gpu_top sample", file=sys.stderr)
-        sys.exit(1)
+        write_empty_metrics()
        sys.exit(0)
    write_metrics(sample)