grafana: re-organize
This commit is contained in:
107
services/grafana/intel-gpu-collector.py
Normal file
107
services/grafana/intel-gpu-collector.py
Normal file
@@ -0,0 +1,107 @@
|
||||
#!/usr/bin/env python3
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
|
||||
# Destination of the generated metrics file. The TEXTFILE environment
# variable, when set, overrides the built-in default path.
TEXTFILE = os.getenv(
    "TEXTFILE",
    "/var/lib/prometheus-node-exporter-textfiles/intel-gpu.prom",
)
|
||||
|
||||
|
||||
def read_one_sample(cmd=("intel_gpu_top", "-J", "-s", "1000"), timeout=8.0):
    """Run ``intel_gpu_top`` and return the first JSON object it prints.

    The child's stdout is scanned for the first complete top-level
    ``{...}`` object; anything before it (for example a leading ``[``) is
    skipped. The child is terminated as soon as that object is complete,
    the stream ends, or the deadline passes.

    Args:
        cmd: Command line to execute. Defaults to ``intel_gpu_top -J -s 1000``.
        timeout: Maximum number of seconds to wait for a complete object.

    Returns:
        The decoded JSON object, or ``None`` when the command cannot be
        started, no object arrives before the deadline or end of stream, or
        the captured text is not valid JSON. Failures are reported on
        stderr instead of being raised.
    """
    # Function-scope import keeps this block self-contained. select() on a
    # pipe file descriptor is POSIX-only.
    import select

    open_brace, close_brace = ord("{"), ord("}")
    quote, backslash = ord('"'), ord("\\")

    try:
        proc = subprocess.Popen(
            list(cmd),
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
        )
        fd = proc.stdout.fileno()
        buf = bytearray()
        depth = 0  # 0 means the first object has not started yet
        in_string = False
        escaped = False
        complete = False
        deadline = time.monotonic() + timeout
        try:
            while not complete:
                remaining = deadline - time.monotonic()
                if remaining <= 0:
                    break
                # Wait for data with a timeout: a bare read() would block
                # past the deadline if the child hangs without output.
                readable, _, _ = select.select([fd], [], [], remaining)
                if not readable:
                    break  # deadline reached with no data
                chunk = os.read(fd, 4096)
                if not chunk:
                    break  # end of stream
                for byte in chunk:
                    if depth == 0:
                        if byte == open_brace:
                            depth = 1
                            buf.append(byte)
                        continue
                    buf.append(byte)
                    if in_string:
                        # Braces inside a JSON string must not affect depth.
                        if escaped:
                            escaped = False
                        elif byte == backslash:
                            escaped = True
                        elif byte == quote:
                            in_string = False
                    elif byte == quote:
                        in_string = True
                    elif byte == open_brace:
                        depth += 1
                    elif byte == close_brace:
                        depth -= 1
                        if depth == 0:
                            complete = True
                            break
        finally:
            proc.terminate()
            try:
                proc.wait(timeout=2.0)
            except subprocess.TimeoutExpired:
                # The child ignored SIGTERM; do not let cleanup hang.
                proc.kill()
                proc.wait()
            proc.stdout.close()
        if not buf:
            return None
        try:
            return json.loads(bytes(buf))
        except json.JSONDecodeError:
            print("Malformed JSON from intel_gpu_top", file=sys.stderr)
            return None
    except Exception as e:
        # Best-effort collector: any failure (missing binary, permission
        # error, ...) is reported and turned into "no sample".
        print(f"intel_gpu_top unavailable: {e}", file=sys.stderr)
        return None
|
||||
|
||||
|
||||
def write_empty_metrics():
    """Publish a metrics file with zero values when no sample is available.

    The file carries the HELP/TYPE headers of all three metric families,
    zero for frequency and RC6, and no per-engine series. It is staged at
    ``TEXTFILE + ".tmp"`` and then moved over ``TEXTFILE`` with
    ``os.replace``.
    """
    exposition = (
        "# HELP intel_gpu_engine_busy_percent Intel GPU engine busy percentage",
        "# TYPE intel_gpu_engine_busy_percent gauge",
        "# HELP intel_gpu_frequency_mhz Intel GPU actual frequency in MHz",
        "# TYPE intel_gpu_frequency_mhz gauge",
        "intel_gpu_frequency_mhz 0",
        "# HELP intel_gpu_rc6_percent Intel GPU RC6 power-saving state percentage",
        "# TYPE intel_gpu_rc6_percent gauge",
        "intel_gpu_rc6_percent 0",
    )
    staging_path = TEXTFILE + ".tmp"
    with open(staging_path, "w") as out:
        out.write("".join(entry + "\n" for entry in exposition))
    os.replace(staging_path, TEXTFILE)
|
||||
|
||||
|
||||
def write_metrics(sample, path=None):
    """Render one GPU sample as Prometheus text and publish it to a file.

    Args:
        sample: Mapping decoded from the collector's JSON. The keys read are
            ``engines`` (engine name -> mapping with ``busy``),
            ``frequency`` (mapping with ``actual``) and ``rc6`` (mapping
            with ``value``). Any missing entry is reported as 0.
        path: Destination file. Defaults to the module-level ``TEXTFILE``.

    The text is first written to ``<path>.tmp`` and then moved into place
    with ``os.replace`` so the destination is swapped in a single step.
    """
    target = TEXTFILE if path is None else path

    def escape_label(value):
        # The text exposition format requires backslash, double quote and
        # newline to be escaped inside label values; backslash goes first so
        # the escapes added afterwards are not doubled.
        text = str(value)
        text = text.replace("\\", "\\\\")
        text = text.replace('"', '\\"')
        return text.replace("\n", "\\n")

    lines = [
        "# HELP intel_gpu_engine_busy_percent Intel GPU engine busy percentage",
        "# TYPE intel_gpu_engine_busy_percent gauge",
    ]
    lines.extend(
        f'intel_gpu_engine_busy_percent{{engine="{escape_label(engine)}"}} '
        f'{data.get("busy", 0)}'
        for engine, data in sample.get("engines", {}).items()
    )
    freq = sample.get("frequency", {})
    rc6 = sample.get("rc6", {})
    lines += [
        "# HELP intel_gpu_frequency_mhz Intel GPU actual frequency in MHz",
        "# TYPE intel_gpu_frequency_mhz gauge",
        f'intel_gpu_frequency_mhz {freq.get("actual", 0)}',
        "# HELP intel_gpu_rc6_percent Intel GPU RC6 power-saving state percentage",
        "# TYPE intel_gpu_rc6_percent gauge",
        f'intel_gpu_rc6_percent {rc6.get("value", 0)}',
    ]

    tmp = target + ".tmp"
    with open(tmp, "w", encoding="utf-8") as f:
        f.write("\n".join(lines) + "\n")
    os.replace(tmp, target)
|
||||
|
||||
|
||||
def main():
    """Collect one GPU sample and publish it as a metrics file.

    When no sample can be read, a message is printed to stderr, the
    zero-valued metrics file is written instead, and the process exits with
    status 0.
    """
    gpu_sample = read_one_sample()
    if gpu_sample is not None:
        write_metrics(gpu_sample)
        return

    print("Failed to read intel_gpu_top sample", file=sys.stderr)
    write_empty_metrics()
    sys.exit(0)
|
||||
|
||||
|
||||
# Run the collector only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user