jellyfin-annotations: preserve state on grafana failure, add grace period
Three edge cases broke annotations on reboot or interrupted sessions:

- state.pop() ran before grafana_close(), so a failed PATCH (Grafana still restarting after a reboot) permanently lost the grafana_id and left the annotation open forever in Grafana.
- A single poll with no sessions closed every active annotation, so Jellyfin restarts or client reconnects produced spurious close + duplicate-open pairs.
- timeEnd was always now_ms, so a reboot during playback wrote an annotation that read as if the user had watched through the outage.

Fix: track last_seen_ms and missing_count in state; retain entries until grafana_close succeeds (retrying indefinitely); require MISSING_THRESHOLD consecutive absent polls before closing; clamp close_time to at most last_seen_ms + (MISSING_THRESHOLD + 1) * POLL_INTERVAL.

Adds three subtests in tests/jellyfin-annotations.nix that each fail on the old code and pass on the new.
This commit is contained in:
@@ -10,6 +10,11 @@ JELLYFIN_URL = os.environ.get("JELLYFIN_URL", "http://127.0.0.1:8096")
|
||||
GRAFANA_URL = os.environ.get("GRAFANA_URL", "http://127.0.0.1:3000")
|
||||
STATE_FILE = os.environ.get("STATE_FILE", "/var/lib/jellyfin-annotations/state.json")
|
||||
POLL_INTERVAL = int(os.environ.get("POLL_INTERVAL", "30"))
|
||||
# Consecutive polls a session must be absent from /Sessions before we close
|
||||
# its annotation. Smooths over Jellyfin restarts, client reconnects, and
|
||||
# brief network hiccups that would otherwise spuriously close + reopen an
|
||||
# annotation every time /Sessions returns an empty list for a single poll.
|
||||
MISSING_THRESHOLD = int(os.environ.get("MISSING_THRESHOLD", "2"))
|
||||
|
||||
|
||||
def get_api_key():
|
||||
@@ -193,8 +198,77 @@ def grafana_close(grafana_id, end_ms):
|
||||
f"{GRAFANA_URL}/api/annotations/{grafana_id}",
|
||||
{"timeEnd": end_ms},
|
||||
)
|
||||
return True
|
||||
except Exception as e:
|
||||
print(f"Error closing annotation {grafana_id}: {e}", file=sys.stderr)
|
||||
return False
|
||||
|
||||
|
||||
def reconcile(state, sessions, now_ms):
    """Fold the current /Sessions snapshot into state in place.

    Args:
        state: dict mapping a session id to its entry dict. Entries written
            here carry "grafana_id", "label", "start_ms", "last_seen_ms" and
            "missing_count"; entries persisted by the prior implementation
            may lack the last two keys, so they are read defensively.
        sessions: iterable of session dicts; each must have an "Id" key and
            is passed to format_label() when a new annotation is opened.
        now_ms: timestamp of this poll in milliseconds.

    Returns:
        True iff state was mutated (caller should persist).

    Invariants this preserves, which the prior implementation violated:
      - A state entry is only removed after grafana_close succeeds. A failed
        PATCH (Grafana restarting, network blip) must leave the entry so we
        can retry on the next poll rather than orphan the open annotation.
      - Closing a session uses last_seen_ms clamped by grace window, not
        now_ms. After a reboot or long outage, now_ms is wildly later than
        when playback actually stopped, and using it paints the annotation
        as if the user watched through the outage.
      - A single missed poll does not close; a session must be absent for
        MISSING_THRESHOLD consecutive polls. Absorbs Jellyfin restarts and
        brief /Sessions empties without duplicating annotations.
    """
    dirty = False
    current_ids = {s["Id"] for s in sessions}

    # Active sessions: create new entries or refresh existing ones.
    for s in sessions:
        sid = s["Id"]
        entry = state.get(sid)
        if entry is None:
            label = format_label(s)
            grafana_id = grafana_post(label, now_ms)
            if grafana_id is None:
                # Grafana unreachable; retry on next poll. Do not persist a
                # half-open entry.
                continue
            state[sid] = {
                "grafana_id": grafana_id,
                "label": label,
                "start_ms": now_ms,
                "last_seen_ms": now_ms,
                "missing_count": 0,
            }
            dirty = True
        else:
            # Seen again: reset the miss counter so a later absence has to
            # accumulate MISSING_THRESHOLD fresh misses before closing.
            entry["last_seen_ms"] = now_ms
            entry["missing_count"] = 0
            dirty = True

    # Absent sessions: increment miss counter; close only after threshold.
    # `grace_ms` caps how far timeEnd drifts from last_seen, so a reboot or
    # long outage closes the annotation near when playback actually stopped
    # rather than at service-recovery time.
    grace_ms = (MISSING_THRESHOLD + 1) * POLL_INTERVAL * 1000
    # Iterate over a snapshot of the keys because entries are deleted below.
    for sid in list(state):
        if sid in current_ids:
            continue
        entry = state[sid]
        entry["missing_count"] = entry.get("missing_count", 0) + 1
        # Entries without last_seen_ms (written before that field existed)
        # get pinned to the first poll that notices them missing. Storing
        # the fallback, instead of recomputing it from now_ms each poll,
        # keeps close_time bounded across repeated grafana_close failures.
        last_seen_ms = entry.setdefault("last_seen_ms", now_ms)
        dirty = True
        if entry["missing_count"] < MISSING_THRESHOLD:
            continue
        close_time = min(now_ms, last_seen_ms + grace_ms)
        if grafana_close(entry["grafana_id"], close_time):
            del state[sid]
        # On failure the entry stays with a bumped missing_count; next poll
        # will retry with a close_time still bounded by last_seen_ms + grace.

    return dirty
|
||||
|
||||
|
||||
def main():
|
||||
@@ -205,25 +279,11 @@ def main():
|
||||
now_ms = int(time.time() * 1000)
|
||||
sessions = get_active_sessions(api_key)
|
||||
|
||||
# sessions is None on a Jellyfin API error — skip the whole pass
|
||||
# rather than risk reinterpreting a transport failure as "nothing
|
||||
# is playing" and closing every open annotation.
|
||||
if sessions is not None:
|
||||
current_ids = {s["Id"] for s in sessions}
|
||||
|
||||
for s in sessions:
|
||||
sid = s["Id"]
|
||||
if sid not in state:
|
||||
label = format_label(s)
|
||||
grafana_id = grafana_post(label, now_ms)
|
||||
if grafana_id is not None:
|
||||
state[sid] = {
|
||||
"grafana_id": grafana_id,
|
||||
"label": label,
|
||||
"start_ms": now_ms,
|
||||
}
|
||||
save_state(state)
|
||||
|
||||
for sid in [k for k in state if k not in current_ids]:
|
||||
info = state.pop(sid)
|
||||
grafana_close(info["grafana_id"], now_ms)
|
||||
if reconcile(state, sessions, now_ms):
|
||||
save_state(state)
|
||||
|
||||
time.sleep(POLL_INTERVAL)
|
||||
|
||||
Reference in New Issue
Block a user