diff --git a/admin-system/wan-monitor.yaml b/admin-system/wan-monitor.yaml
index 9fb0082..94e9f19 100644
--- a/admin-system/wan-monitor.yaml
+++ b/admin-system/wan-monitor.yaml
@@ -75,31 +75,61 @@ data:
     #!/usr/bin/env python3
     # irtt JSON (stdin) -> Prometheus sample lines (no HELP/TYPE; header is static).
     # args:
-    import json, sys, time
-    cond = sys.argv[1] if len(sys.argv) > 1 else "idle"
+    # Hardened: tolerates missing/null/NaN/Inf fields and non-dict sub-objects, and clamps
+    # ratios to 0..1, so a pathological irtt run can never emit an out-of-range or invalid sample.
+    import json, sys, time, math
+
+    cond = sys.argv[1] if len(sys.argv) > 1 else "idle"
     target = sys.argv[2] if len(sys.argv) > 2 else "hetzner"
-    L = f'target="{target}",condition="{cond}"'
+    L = f'target="{target}",condition="{cond}"'
     ts = f'{time.time():.0f}'
-    try:
-        s = json.load(sys.stdin)["stats"]
-    except Exception:
+
+    def num(x, default=0.0):
+        # finite float or default (handles None / str / missing / NaN / Inf)
+        try:
+            v = float(x)
+        except (TypeError, ValueError):
+            return default
+        return v if math.isfinite(v) else default
+
+    def pct_ratio(x):
+        # percent (0..100, possibly garbage) -> ratio clamped to 0..1
+        return max(0.0, min(1.0, num(x) / 100.0))
+
+    def fail():  # report the probe as failed (success=0 + timestamp) and exit with status 0
         print(f'wan_irtt_success{{{L}}} 0')
         print(f'wan_probe_last_run_timestamp_seconds{{probe="irtt",{L}}} {ts}')
         sys.exit(0)
-    rtt = s["rtt"]
+
+    try:
+        s = json.load(sys.stdin).get("stats")
+    except Exception:  # unparsable input, or top-level JSON value that is not an object
+        fail()
+    if not isinstance(s, dict):
+        fail()
+
+    rtt = s.get("rtt") if isinstance(s.get("rtt"), dict) else {}  # any non-dict -> zeros, not a crash
     for k in ("min", "mean", "median", "max", "stddev"):
-        print(f'wan_irtt_rtt_seconds{{{L},stat="{k}"}} {rtt[k]/1e9}')
-    print(f'wan_irtt_jitter_seconds{{{L},direction="round_trip"}} {s["ipdv_round_trip"]["mean"]/1e9}')
-    print(f'wan_irtt_jitter_seconds{{{L},direction="send"}} {s["ipdv_send"]["mean"]/1e9}')
-    print(f'wan_irtt_jitter_seconds{{{L},direction="receive"}} {s["ipdv_receive"]["mean"]/1e9}')
-    print(f'wan_irtt_loss_ratio{{{L},direction="round_trip"}} {s["packet_loss_percent"]/100.0}')
-    print(f'wan_irtt_loss_ratio{{{L},direction="upstream"}} {s["upstream_loss_percent"]/100.0}')
-    print(f'wan_irtt_loss_ratio{{{L},direction="downstream"}} {s["downstream_loss_percent"]/100.0}')
-    print(f'wan_irtt_late_ratio{{{L}}} {s["late_packets_percent"]/100.0}')
-    print(f'wan_irtt_duplicate_ratio{{{L}}} {s["duplicate_percent"]/100.0}')
-    print(f'wan_irtt_packets{{{L},kind="sent"}} {s["packets_sent"]}')
-    print(f'wan_irtt_packets{{{L},kind="received"}} {s["packets_received"]}')
-    print(f'wan_irtt_packets{{{L},kind="server_received"}} {s["server_packets_received"]}')
+        print(f'wan_irtt_rtt_seconds{{{L},stat="{k}"}} {num(rtt.get(k)) / 1e9}')
+
+    def ipdv(key):
+        d = s.get(key)  # may be missing, null, or a non-dict value in a malformed run
+        return num(d.get("mean") if isinstance(d, dict) else None) / 1e9
+
+    print(f'wan_irtt_jitter_seconds{{{L},direction="round_trip"}} {ipdv("ipdv_round_trip")}')
+    print(f'wan_irtt_jitter_seconds{{{L},direction="send"}} {ipdv("ipdv_send")}')
+    print(f'wan_irtt_jitter_seconds{{{L},direction="receive"}} {ipdv("ipdv_receive")}')
+
+    print(f'wan_irtt_loss_ratio{{{L},direction="round_trip"}} {pct_ratio(s.get("packet_loss_percent"))}')
+    print(f'wan_irtt_loss_ratio{{{L},direction="upstream"}} {pct_ratio(s.get("upstream_loss_percent"))}')
+    print(f'wan_irtt_loss_ratio{{{L},direction="downstream"}} {pct_ratio(s.get("downstream_loss_percent"))}')
+    print(f'wan_irtt_late_ratio{{{L}}} {pct_ratio(s.get("late_packets_percent"))}')
+    print(f'wan_irtt_duplicate_ratio{{{L}}} {pct_ratio(s.get("duplicate_percent"))}')
+
+    print(f'wan_irtt_packets{{{L},kind="sent"}} {int(num(s.get("packets_sent")))}')
+    print(f'wan_irtt_packets{{{L},kind="received"}} {int(num(s.get("packets_received")))}')
+    print(f'wan_irtt_packets{{{L},kind="server_received"}} {int(num(s.get("server_packets_received")))}')
+
     print(f'wan_irtt_success{{{L}}} 1')
     print(f'wan_probe_last_run_timestamp_seconds{{probe="irtt",{L}}} {ts}')
 
@@ -129,8 +159,8 @@ data:
     IPERF_PORT="${IPERF_PORT:-5201}"
     IRTT_INTERVAL="${IRTT_INTERVAL:-20ms}"
     IRTT_DURATION="${IRTT_DURATION:-60}" # seconds (numeric, for timeout math)
-    TPUT_EVERY="${TPUT_EVERY:-900}" # seconds between throughput tests
-    TPUT_TIME="${TPUT_TIME:-10}" # iperf3 seconds per direction
+    TPUT_EVERY="${TPUT_EVERY:-900}" # seconds between throughput tests
+    TPUT_TIME="${TPUT_TIME:-10}" # iperf3 seconds per direction
     IRTT_TARGET="${IRTT_TARGET:-hetzner}"
     TPUT_TARGET="${TPUT_TARGET:-hetzner}"
     HMAC_OPT=""
@@ -140,26 +170,32 @@ data:
     : > "$SHARED/.irtt.prom"; : > "$SHARED/.irttload.prom"; : > "$SHARED/.tput.prom"
     cp "$HDR" "$SHARED/metrics" # serve header immediately so first scrapes don't 404
 
+    # Concatenate header + fragments into a temp file, then rename it over the served file.
     assemble() {
       cat "$HDR" "$SHARED/.irtt.prom" "$SHARED/.irttload.prom" "$SHARED/.tput.prom" \
         > "$SHARED/.metrics.tmp" 2>/dev/null
       mv "$SHARED/.metrics.tmp" "$SHARED/metrics"
     }
 
-    run_irtt() {
+    # Each fragment is written to a staging file and then renamed, so assemble() never
+    # cats a partially written file (the cause of the impossible loss spikes).
+    run_irtt() { # $1 condition $2 outfile $3 duration(seconds)
       timeout "$(( $3 + 25 ))" irtt client -i "$IRTT_INTERVAL" -d "${3}s" -q $HMAC_OPT \
         -o - "${HETZNER}:${IRTT_PORT}" 2>/dev/null \
         | python3 /scripts/irtt_to_prom.py "$1" "$IRTT_TARGET" > "$2.tmp"
       mv "$2.tmp" "$2"
     }
-
+
     run_tput() {
-      P="${IPERF_PARALLEL:-4}" # parallel streams: a single stream can't fill 1 Gbps over the RTT
+      P="${IPERF_PARALLEL:-4}" # parallel streams: a single stream can't fill the pipe over the RTT
       TO="$(( TPUT_TIME + 20 ))"
+      TMP="$SHARED/.tput.prom.partial" # staging file; download and upload results are collected here
+      : > "$TMP" # start from an empty staging file
       timeout "$TO" iperf3 -c "$HETZNER" -p "$IPERF_PORT" -t "$TPUT_TIME" -P "$P" --connect-timeout 5000 -R -J 2>/dev/null \
-        | python3 /scripts/tput_to_prom.py download "$TPUT_TARGET" > "$SHARED/.tput.prom"
+        | python3 /scripts/tput_to_prom.py download "$TPUT_TARGET" > "$TMP"
       timeout "$TO" iperf3 -c "$HETZNER" -p "$IPERF_PORT" -t "$TPUT_TIME" -P "$P" --connect-timeout 5000 -J 2>/dev/null \
-        | python3 /scripts/tput_to_prom.py upload "$TPUT_TARGET" >> "$SHARED/.tput.prom"
+        | python3 /scripts/tput_to_prom.py upload "$TPUT_TARGET" >> "$TMP"
+      mv "$TMP" "$SHARED/.tput.prom" # publish both directions at once via rename
     }
 
     last_tput=0