monitoring: fix intel-gpu-collector crash resilience
Wrap entire read_one_sample() in try/except to handle all failures (missing binary, permission errors, malformed JSON, timeouts). Write zero-valued metrics on failure instead of exiting non-zero. Increase timeout from 5s to 8s for slower GPU initialization.
This commit is contained in:
@@ -12,33 +12,61 @@ TEXTFILE = os.environ.get(
|
||||
|
||||
|
||||
def read_one_sample():
    """Run ``intel_gpu_top`` briefly and return its first JSON sample.

    Spawns ``intel_gpu_top -J -s 1000``, scans its stdout for the first
    brace-balanced ``{...}`` object and decodes it with ``json.loads``.

    Returns:
        The decoded sample, or ``None`` when the tool cannot be started,
        no object arrives before the 8-second deadline, or the captured
        text is not valid JSON. Start-up and decode failures print a
        diagnostic to stderr.

    Any ``Exception`` raised while spawning or reading is caught and
    reported, so the caller only has to check for ``None``.
    """
    # Function-scope import so the module's import block stays untouched.
    # NOTE: select() on a pipe is POSIX-only.
    import select

    open_brace = ord("{")
    close_brace = ord("}")

    try:
        proc = subprocess.Popen(
            ["intel_gpu_top", "-J", "-s", "1000"],
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
        )
        buf = bytearray()
        depth = 0
        in_obj = False
        complete = False
        deadline = time.monotonic() + 8.0
        try:
            fd = proc.stdout.fileno()
            while not complete:
                remaining = deadline - time.monotonic()
                if remaining <= 0:
                    break
                # Wait for data with a timeout: a plain blocking read would
                # ignore the deadline if the tool never writes anything.
                ready, _, _ = select.select([fd], [], [], remaining)
                if not ready:
                    break
                chunk = os.read(fd, 4096)
                if not chunk:
                    # EOF: the tool exited before finishing an object.
                    break
                for code in chunk:
                    if code == open_brace:
                        in_obj = True
                        depth += 1
                    if in_obj:
                        buf.append(code)
                        if code == close_brace:
                            depth -= 1
                            if depth == 0:
                                # First top-level object is complete; bytes
                                # after it in this chunk are discarded.
                                complete = True
                                break
        finally:
            proc.terminate()
            try:
                proc.wait(timeout=2.0)
            except subprocess.TimeoutExpired:
                # SIGTERM was not honoured in time; force the exit so the
                # collector cannot hang in cleanup.
                proc.kill()
                proc.wait()
            proc.stdout.close()

        if not buf:
            return None
        try:
            return json.loads(buf)
        except ValueError:
            # Covers json.JSONDecodeError and UnicodeDecodeError alike
            # (both are ValueError subclasses), e.g. a truncated object.
            print("Malformed JSON from intel_gpu_top", file=sys.stderr)
            return None
    except Exception as e:
        print(f"intel_gpu_top unavailable: {e}", file=sys.stderr)
        return None
|
||||
|
||||
|
||||
def write_empty_metrics():
    """Publish a placeholder metrics file when no GPU sample is available.

    Emits the HELP/TYPE headers for the engine-busy, frequency and RC6
    metric families, with a literal ``0`` sample for frequency and RC6
    (engine-busy gets headers only). The intent is that a scraper does
    not keep serving values left over from an earlier run.

    The text is written to ``TEXTFILE + ".tmp"`` first and then moved
    over ``TEXTFILE`` with ``os.replace``.
    """
    placeholder_rows = (
        "# HELP intel_gpu_engine_busy_percent Intel GPU engine busy percentage",
        "# TYPE intel_gpu_engine_busy_percent gauge",
        "# HELP intel_gpu_frequency_mhz Intel GPU actual frequency in MHz",
        "# TYPE intel_gpu_frequency_mhz gauge",
        "intel_gpu_frequency_mhz 0",
        "# HELP intel_gpu_rc6_percent Intel GPU RC6 power-saving state percentage",
        "# TYPE intel_gpu_rc6_percent gauge",
        "intel_gpu_rc6_percent 0",
    )
    scratch_path = TEXTFILE + ".tmp"
    with open(scratch_path, "w") as out:
        # One newline-terminated row per entry, same bytes as a joined write.
        out.writelines(row + "\n" for row in placeholder_rows)
    os.replace(scratch_path, TEXTFILE)
|
||||
|
||||
|
||||
def write_metrics(sample):
|
||||
@@ -70,7 +98,8 @@ def main():
|
||||
sample = read_one_sample()
|
||||
if sample is None:
|
||||
print("Failed to read intel_gpu_top sample", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
write_empty_metrics()
|
||||
sys.exit(0)
|
||||
write_metrics(sample)
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user