From 0235617627a7860ad713ffb0196ed9aeeb796b89 Mon Sep 17 00:00:00 2001
From: Simon Gardling
Date: Thu, 2 Apr 2026 17:43:13 -0400
Subject: [PATCH] monitoring: fix intel-gpu-collector crash resilience

Wrap entire read_one_sample() in try/except to handle all failures
(missing binary, permission errors, malformed JSON, timeouts). Write
zero-valued metrics on failure instead of exiting non-zero. Increase
timeout from 5s to 8s for slower GPU initialization.
---
Reviewer notes (text between the three-dash line and the diff is not
part of the commit message):

* read_one_sample(): Popen, the byte-reading loop and the JSON decoding
  now sit inside one outer "try"; "except Exception" prints the error to
  stderr and returns None. Unparsable output is reported separately via
  "except json.JSONDecodeError", which also returns None.
* The read-loop deadline changes from 5.0 to 8.0 seconds.
* write_empty_metrics() is new: it writes HELP/TYPE lines for three
  gauges plus zero-valued samples for intel_gpu_frequency_mhz and
  intel_gpu_rc6_percent to TEXTFILE + ".tmp", then os.replace()s that
  file over TEXTFILE. No sample line is written for
  intel_gpu_engine_busy_percent.
* main(): when read_one_sample() returns None, the script now calls
  write_empty_metrics() and exits 0 instead of exiting 1.
* NOTE(review): the deadline is only tested between proc.stdout.read(1)
  calls, so a read that blocks would presumably outlast it; confirm.
* NOTE(review): proc.wait() in the "finally" has no timeout; presumably
  it relies on intel_gpu_top exiting on terminate(); confirm.
* NOTE(review): line breaks, indentation and blank context lines in the
  hunks below were restored from the hunk line counts; verify against
  the target file before applying.

 services/intel-gpu-collector.py | 81 ++++++++++++++++++++++-----------
 1 file changed, 55 insertions(+), 26 deletions(-)

diff --git a/services/intel-gpu-collector.py b/services/intel-gpu-collector.py
index 97aacec..70a5560 100644
--- a/services/intel-gpu-collector.py
+++ b/services/intel-gpu-collector.py
@@ -12,33 +12,61 @@ TEXTFILE = os.environ.get(
 
 
 def read_one_sample():
-    proc = subprocess.Popen(
-        ["intel_gpu_top", "-J", "-s", "1000"],
-        stdout=subprocess.PIPE,
-        stderr=subprocess.DEVNULL,
-    )
-    buf = b""
-    depth = 0
-    in_obj = False
-    deadline = time.monotonic() + 5.0
     try:
-        while time.monotonic() < deadline:
-            byte = proc.stdout.read(1)
-            if not byte:
-                break
-            if byte == b"{":
-                in_obj = True
-                depth += 1
-            if in_obj:
-                buf += byte
-            if in_obj and byte == b"}":
-                depth -= 1
-                if depth == 0:
+        proc = subprocess.Popen(
+            ["intel_gpu_top", "-J", "-s", "1000"],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.DEVNULL,
+        )
+        buf = b""
+        depth = 0
+        in_obj = False
+        deadline = time.monotonic() + 8.0
+        try:
+            while time.monotonic() < deadline:
+                byte = proc.stdout.read(1)
+                if not byte:
                     break
-    finally:
-        proc.terminate()
-        proc.wait()
-    return json.loads(buf) if buf else None
+                if byte == b"{":
+                    in_obj = True
+                    depth += 1
+                if in_obj:
+                    buf += byte
+                if in_obj and byte == b"}":
+                    depth -= 1
+                    if depth == 0:
+                        break
+        finally:
+            proc.terminate()
+            proc.wait()
+        if not buf:
+            return None
+        try:
+            return json.loads(buf)
+        except json.JSONDecodeError:
+            print("Malformed JSON from intel_gpu_top", file=sys.stderr)
+            return None
+    except Exception as e:
+        print(f"intel_gpu_top unavailable: {e}", file=sys.stderr)
+        return None
+
+
+def write_empty_metrics():
+    """Write zero-valued metrics so Prometheus doesn't see stale data."""
+    lines = [
+        "# HELP intel_gpu_engine_busy_percent Intel GPU engine busy percentage",
+        "# TYPE intel_gpu_engine_busy_percent gauge",
+        "# HELP intel_gpu_frequency_mhz Intel GPU actual frequency in MHz",
+        "# TYPE intel_gpu_frequency_mhz gauge",
+        "intel_gpu_frequency_mhz 0",
+        "# HELP intel_gpu_rc6_percent Intel GPU RC6 power-saving state percentage",
+        "# TYPE intel_gpu_rc6_percent gauge",
+        "intel_gpu_rc6_percent 0",
+    ]
+    tmp = TEXTFILE + ".tmp"
+    with open(tmp, "w") as f:
+        f.write("\n".join(lines) + "\n")
+    os.replace(tmp, TEXTFILE)
 
 
 def write_metrics(sample):
@@ -70,7 +98,8 @@ def main():
     sample = read_one_sample()
     if sample is None:
         print("Failed to read intel_gpu_top sample", file=sys.stderr)
-        sys.exit(1)
+        write_empty_metrics()
+        sys.exit(0)
     write_metrics(sample)
 
 