108 lines
3.2 KiB
Python
108 lines
3.2 KiB
Python
#!/usr/bin/env python3
|
|
import json
|
|
import os
|
|
import subprocess
|
|
import sys
|
|
import time
|
|
|
|
# Path of the metrics file this script writes. It can be overridden by
# setting the TEXTFILE environment variable.
_DEFAULT_TEXTFILE = "/var/lib/prometheus-node-exporter-textfiles/intel-gpu.prom"
TEXTFILE = os.getenv("TEXTFILE", _DEFAULT_TEXTFILE)
|
|
|
|
|
|
def _read_first_json_object(stream, deadline):
    """Return the bytes of the first complete top-level ``{...}`` in *stream*.

    Everything before the first ``{`` (for example the ``[`` that opens a
    JSON array, commas and whitespace) is skipped.  Braces that appear
    inside JSON string literals are ignored, so a value such as ``"}"``
    cannot terminate the object early.

    Args:
        stream: Binary file-like object supporting ``read(1)``.
        deadline: ``time.monotonic()`` value after which reading stops.

    Returns:
        The collected bytes.  They form a complete object only if the
        closing brace was seen; on EOF or deadline the result may be
        partial or empty.
    """
    collected = bytearray()  # bytearray avoids quadratic bytes concatenation
    depth = 0
    in_string = False
    escaped = False
    while time.monotonic() < deadline:
        byte = stream.read(1)
        if not byte:
            break  # EOF: the producer exited or was killed.
        if depth == 0:
            # Still looking for the start of the first object.
            if byte == b"{":
                depth = 1
                collected += byte
            continue
        collected += byte
        if in_string:
            if escaped:
                escaped = False
            elif byte == b"\\":
                escaped = True
            elif byte == b'"':
                in_string = False
        elif byte == b'"':
            in_string = True
        elif byte == b"{":
            depth += 1
        elif byte == b"}":
            depth -= 1
            if depth == 0:
                break
    return bytes(collected)


def read_one_sample(command=("intel_gpu_top", "-J", "-s", "1000"), timeout=8.0):
    """Run ``intel_gpu_top`` and return the first JSON object it prints.

    The process is started, its stdout is scanned for the first complete
    top-level JSON object, and the process is then stopped.  This function
    never raises: every failure is reported on stderr and yields ``None``.

    Args:
        command: Argument vector to execute.  Defaults to the original
            hard-coded ``intel_gpu_top -J -s 1000`` invocation.
        timeout: Maximum number of seconds to wait for a sample.  A watchdog
            kills the process when it expires, so a process that produces no
            output cannot block the blocking ``read`` forever.

    Returns:
        The decoded sample (a ``dict``), or ``None`` when the command cannot
        be started, produces no object in time, or emits malformed JSON.
    """
    # Imported locally because the file's import block does not provide it.
    import threading

    try:
        # The context manager closes the stdout pipe and reaps the child.
        with subprocess.Popen(
            list(command),
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
        ) as proc:
            deadline = time.monotonic() + timeout
            # read() blocks, so the deadline check inside the reader alone is
            # not enough: killing the child forces EOF on the pipe.
            watchdog = threading.Timer(timeout, proc.kill)
            watchdog.daemon = True
            watchdog.start()
            try:
                raw = _read_first_json_object(proc.stdout, deadline)
            finally:
                watchdog.cancel()
                proc.terminate()
                try:
                    proc.wait(timeout=2.0)
                except subprocess.TimeoutExpired:
                    # SIGTERM was ignored; do not wait indefinitely.
                    proc.kill()
        if not raw:
            return None
        try:
            return json.loads(raw)
        except (json.JSONDecodeError, UnicodeDecodeError):
            print("Malformed JSON from intel_gpu_top", file=sys.stderr)
            return None
    except Exception as e:
        # Deliberate best-effort boundary: the caller only distinguishes
        # "got a sample" from "did not".
        print(f"intel_gpu_top unavailable: {e}", file=sys.stderr)
        return None
|
|
|
|
|
|
def write_empty_metrics():
    """Replace the textfile with placeholder metrics.

    The engine-busy metric is declared (HELP/TYPE) without any samples,
    while the frequency and RC6 gauges are written with the value 0.  The
    content goes to ``TEXTFILE + ".tmp"`` first and is then moved over
    ``TEXTFILE`` with ``os.replace``.
    """
    rows = (
        "# HELP intel_gpu_engine_busy_percent Intel GPU engine busy percentage",
        "# TYPE intel_gpu_engine_busy_percent gauge",
        "# HELP intel_gpu_frequency_mhz Intel GPU actual frequency in MHz",
        "# TYPE intel_gpu_frequency_mhz gauge",
        "intel_gpu_frequency_mhz 0",
        "# HELP intel_gpu_rc6_percent Intel GPU RC6 power-saving state percentage",
        "# TYPE intel_gpu_rc6_percent gauge",
        "intel_gpu_rc6_percent 0",
    )
    scratch = TEXTFILE + ".tmp"
    with open(scratch, "w") as handle:
        # One newline-terminated line per row.
        handle.writelines(row + "\n" for row in rows)
    os.replace(scratch, TEXTFILE)
|
|
|
|
|
|
def _escape_label_value(value):
    """Escape *value* for use inside a double-quoted Prometheus label value."""
    return (
        str(value)
        .replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
    )


def _numeric_or_zero(raw):
    """Return *raw* when it is a number, a float parsed from it, or 0."""
    if isinstance(raw, bool):
        # bool is an int subclass but would be rendered as "True"/"False".
        return 0
    if isinstance(raw, (int, float)):
        return raw
    try:
        return float(raw)
    except (TypeError, ValueError):
        return 0


def _mapping_or_empty(obj):
    """Return *obj* if it is a dict, otherwise an empty dict."""
    return obj if isinstance(obj, dict) else {}


def write_metrics(sample, path=None):
    """Render *sample* as Prometheus text metrics and write them to a file.

    Emits one ``intel_gpu_engine_busy_percent`` line per entry of
    ``sample["engines"]`` (labelled with the engine name, value taken from
    its ``"busy"`` key), ``intel_gpu_frequency_mhz`` from
    ``sample["frequency"]["actual"]`` and ``intel_gpu_rc6_percent`` from
    ``sample["rc6"]["value"]``.  Missing, non-dict or non-numeric entries
    fall back to 0 so the output always stays parseable.

    Args:
        sample: Decoded sample dict.
        path: Destination file.  Defaults to the module-level ``TEXTFILE``.

    The text is written to ``<destination>.tmp`` and then moved into place
    with ``os.replace`` so readers never observe a partially written file.
    """
    target = TEXTFILE if path is None else path
    sample = _mapping_or_empty(sample)

    lines = [
        "# HELP intel_gpu_engine_busy_percent Intel GPU engine busy percentage",
        "# TYPE intel_gpu_engine_busy_percent gauge",
    ]
    for engine, data in _mapping_or_empty(sample.get("engines")).items():
        busy = _numeric_or_zero(_mapping_or_empty(data).get("busy", 0))
        label = _escape_label_value(engine)
        lines.append(f'intel_gpu_engine_busy_percent{{engine="{label}"}} {busy}')

    actual_freq = _numeric_or_zero(
        _mapping_or_empty(sample.get("frequency")).get("actual", 0)
    )
    rc6 = _numeric_or_zero(_mapping_or_empty(sample.get("rc6")).get("value", 0))
    lines += [
        "# HELP intel_gpu_frequency_mhz Intel GPU actual frequency in MHz",
        "# TYPE intel_gpu_frequency_mhz gauge",
        f"intel_gpu_frequency_mhz {actual_freq}",
        "# HELP intel_gpu_rc6_percent Intel GPU RC6 power-saving state percentage",
        "# TYPE intel_gpu_rc6_percent gauge",
        f"intel_gpu_rc6_percent {rc6}",
    ]

    tmp = target + ".tmp"
    with open(tmp, "w", encoding="utf-8") as f:
        f.write("\n".join(lines) + "\n")
    os.replace(tmp, target)
|
|
|
|
|
|
def main():
    """Take one GPU sample and publish it, or publish placeholders on failure.

    When no sample could be read, a message is printed to stderr, the
    placeholder metrics are written, and the process exits with status 0.
    """
    reading = read_one_sample()
    if reading is not None:
        write_metrics(reading)
        return

    print("Failed to read intel_gpu_top sample", file=sys.stderr)
    write_empty_metrics()
    sys.exit(0)
|
|
|
|
|
|
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|