monitoring: fix intel-gpu-collector crash resilience

Wrap entire read_one_sample() in try/except to handle all failures
(missing binary, permission errors, malformed JSON, timeouts).
Write placeholder metrics on failure (frequency and RC6 set to 0; engine-busy
emits only its HELP/TYPE header) and exit 0 instead of exiting non-zero.
Increase timeout from 5s to 8s for slower GPU initialization.
This commit is contained in:
2026-04-02 17:43:13 -04:00
parent df15be01ea
commit 0235617627

View File

@@ -12,33 +12,61 @@ TEXTFILE = os.environ.get(
def _read_first_json_object(fd, deadline):
    """Return the bytes of the first balanced ``{...}`` object read from *fd*.

    Reading stops at the first closing brace that balances the first opening
    brace, at end-of-file, or once ``time.monotonic()`` reaches *deadline*.
    In the last two cases whatever was collected so far (possibly ``b""`` or
    an incomplete object) is returned and the caller decides what to do.

    NOTE: braces are counted without regard to JSON string quoting, exactly
    as the byte-at-a-time scanner this replaces did.
    """
    # Local import: ``select`` is only needed here and the file's top-level
    # import block is not visible from this function.
    import select

    collected = bytearray()
    depth = 0
    while True:
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        # Wait for data with a timeout so a silent child process cannot
        # block us past the deadline (a plain read() would).
        ready, _, _ = select.select([fd], [], [], remaining)
        if not ready:
            break
        chunk = os.read(fd, 4096)
        if not chunk:
            break  # EOF: the child closed its stdout.
        for value in chunk:
            if value == 0x7B:  # "{"
                depth += 1
            if depth:
                collected.append(value)
            if value == 0x7D and depth:  # "}"
                depth -= 1
                if depth == 0:
                    return bytes(collected)
    return bytes(collected)


def read_one_sample():
    """Run ``intel_gpu_top -J -s 1000`` and return its first JSON sample.

    Returns:
        The parsed first JSON object from the tool's stdout, or ``None`` when
        no output arrived within 8 seconds, the output could not be parsed,
        or the tool could not be run at all.  Failures are reported on
        stderr; this function does not raise for them.
    """
    try:
        with subprocess.Popen(
            ["intel_gpu_top", "-J", "-s", "1000"],
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
        ) as proc:
            try:
                raw = _read_first_json_object(
                    proc.stdout.fileno(), time.monotonic() + 8.0
                )
            finally:
                # intel_gpu_top streams forever; stop it once we have (or
                # have given up on) one sample.  Escalate to SIGKILL if it
                # does not exit promptly so we never wait indefinitely.
                proc.terminate()
                try:
                    proc.wait(timeout=2.0)
                except subprocess.TimeoutExpired:
                    proc.kill()
                    proc.wait()
        if not raw:
            return None
        try:
            return json.loads(raw)
        except ValueError:
            # Covers json.JSONDecodeError and undecodable (non-UTF) bytes.
            print("Malformed JSON from intel_gpu_top", file=sys.stderr)
            return None
    except Exception as e:
        # Top-level boundary: missing binary, permission errors, etc.
        print(f"intel_gpu_top unavailable: {e}", file=sys.stderr)
        return None
def write_empty_metrics():
    """Replace TEXTFILE with placeholder metrics so Prometheus doesn't see stale data.

    The frequency and RC6 gauges are written with value 0; the engine-busy
    gauge gets only its HELP/TYPE header and no samples.  The content is
    written to ``TEXTFILE + ".tmp"`` first and then moved over TEXTFILE with
    ``os.replace``.
    """
    placeholder = (
        "# HELP intel_gpu_engine_busy_percent Intel GPU engine busy percentage",
        "# TYPE intel_gpu_engine_busy_percent gauge",
        "# HELP intel_gpu_frequency_mhz Intel GPU actual frequency in MHz",
        "# TYPE intel_gpu_frequency_mhz gauge",
        "intel_gpu_frequency_mhz 0",
        "# HELP intel_gpu_rc6_percent Intel GPU RC6 power-saving state percentage",
        "# TYPE intel_gpu_rc6_percent gauge",
        "intel_gpu_rc6_percent 0",
    )
    staging_path = f"{TEXTFILE}.tmp"
    with open(staging_path, "w") as out:
        # One newline-terminated line per entry, including the last.
        out.writelines(f"{entry}\n" for entry in placeholder)
    os.replace(staging_path, TEXTFILE)
def write_metrics(sample):
@@ -70,7 +98,8 @@ def main():
sample = read_one_sample()
if sample is None:
print("Failed to read intel_gpu_top sample", file=sys.stderr)
sys.exit(1)
write_empty_metrics()
sys.exit(0)
write_metrics(sample)