monitoring: fix intel-gpu-collector crash resilience
Wrap entire read_one_sample() in try/except to handle all failures (missing binary, permission errors, malformed JSON, timeouts). Write zero-valued metrics on failure instead of exiting non-zero. Increase timeout from 5s to 8s for slower GPU initialization.
This commit is contained in:
@@ -12,33 +12,61 @@ TEXTFILE = os.environ.get(
|
|||||||
|
|
||||||
|
|
||||||
def read_one_sample(timeout=8.0):
    """Run ``intel_gpu_top`` and return the first JSON object it prints.

    The tool is started in JSON mode (``-J``) and its stdout is scanned for
    the first complete top-level ``{...}`` object; anything printed before
    the opening brace is skipped.  The child process is always stopped
    before returning.

    Args:
        timeout: Maximum number of seconds to wait for a complete sample.
            A watchdog kills the child when it expires, so a child that
            never writes anything cannot block the collector.

    Returns:
        The decoded JSON object, or ``None`` when no sample could be read
        (binary missing, permission error, timeout, truncated or malformed
        output).  Failures are reported on stderr instead of being raised.
    """
    # Local import: only this function needs it, and the file-level import
    # block is left untouched.
    import threading

    try:
        proc = subprocess.Popen(
            ["intel_gpu_top", "-J", "-s", "1000"],
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
        )
        # read(1) blocks until data arrives, so checking a deadline between
        # reads cannot enforce a timeout.  Killing the child from a timer
        # closes its end of the pipe and turns the blocked read into EOF.
        watchdog = threading.Timer(timeout, proc.kill)
        watchdog.daemon = True
        watchdog.start()

        buf = bytearray()
        depth = 0
        in_string = False
        escaped = False
        try:
            while True:
                byte = proc.stdout.read(1)
                if not byte:
                    # EOF: the child exited or the watchdog killed it.
                    break
                if depth == 0:
                    # Still looking for the start of the first object.
                    if byte == b"{":
                        depth = 1
                        buf += byte
                    continue
                buf += byte
                if in_string:
                    # Braces inside string values must not affect nesting.
                    if escaped:
                        escaped = False
                    elif byte == b"\\":
                        escaped = True
                    elif byte == b'"':
                        in_string = False
                elif byte == b'"':
                    in_string = True
                elif byte == b"{":
                    depth += 1
                elif byte == b"}":
                    depth -= 1
                    if depth == 0:
                        break
        finally:
            watchdog.cancel()
            proc.terminate()
            try:
                proc.wait(timeout=2.0)
            except subprocess.TimeoutExpired:
                # The child ignored SIGTERM; do not hang on cleanup.
                proc.kill()
                proc.wait()
            proc.stdout.close()

        if not buf:
            return None
        try:
            return json.loads(bytes(buf))
        except ValueError:
            # Covers json.JSONDecodeError (truncated or invalid JSON) and
            # UnicodeDecodeError (output that is not valid UTF-8).
            print("Malformed JSON from intel_gpu_top", file=sys.stderr)
            return None
    except Exception as e:
        # Top-level boundary: the collector must degrade to "no sample"
        # rather than crash, whatever went wrong while running the tool.
        print(f"intel_gpu_top unavailable: {e}", file=sys.stderr)
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def write_empty_metrics():
    """Publish a placeholder metrics file when no GPU sample is available.

    The file carries the HELP/TYPE headers for the engine-busy, frequency
    and RC6 gauges, with the frequency and RC6 gauges set to 0 and no
    engine-busy samples.  The text is written to ``TEXTFILE + ".tmp"``
    first and then moved over ``TEXTFILE`` with ``os.replace``.
    """
    rows = (
        "# HELP intel_gpu_engine_busy_percent Intel GPU engine busy percentage",
        "# TYPE intel_gpu_engine_busy_percent gauge",
        "# HELP intel_gpu_frequency_mhz Intel GPU actual frequency in MHz",
        "# TYPE intel_gpu_frequency_mhz gauge",
        "intel_gpu_frequency_mhz 0",
        "# HELP intel_gpu_rc6_percent Intel GPU RC6 power-saving state percentage",
        "# TYPE intel_gpu_rc6_percent gauge",
        "intel_gpu_rc6_percent 0",
    )
    # Every row, including the last, is terminated by a newline.
    payload = "".join(f"{row}\n" for row in rows)

    staging_path = f"{TEXTFILE}.tmp"
    with open(staging_path, "w") as out:
        out.write(payload)
    os.replace(staging_path, TEXTFILE)
|
||||||
|
|
||||||
|
|
||||||
def write_metrics(sample):
|
def write_metrics(sample):
|
||||||
@@ -70,7 +98,8 @@ def main():
|
|||||||
sample = read_one_sample()
|
sample = read_one_sample()
|
||||||
if sample is None:
|
if sample is None:
|
||||||
print("Failed to read intel_gpu_top sample", file=sys.stderr)
|
print("Failed to read intel_gpu_top sample", file=sys.stderr)
|
||||||
sys.exit(1)
|
write_empty_metrics()
|
||||||
|
sys.exit(0)
|
||||||
write_metrics(sample)
|
write_metrics(sample)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user