---
# ============================================================================
# wan-monitor — internet connection quality monitoring
# Single pod (3 containers) in admin-system:
#   - blackbox     : prometheus blackbox-exporter (HTTP phases, ICMP, DNS) :9115
#   - wan-probe    : irtt (UDP quality) + iperf3 (throughput) loop -> /shared
#   - metrics-http : busybox httpd serving /shared/metrics :9116
# Prometheus scrapes :9115 (blackbox relabel jobs) and :9116 (textfile metrics).
# Scrape jobs live in prometheus-wan-scrape-jobs.yaml (merge into monitoring.yaml).
# ============================================================================
apiVersion: v1
kind: ConfigMap
metadata:
  name: wan-monitor-blackbox
  namespace: admin-system
  labels:
    app: wan-monitor
data:
  # blackbox-exporter configuration: three probe modules (HTTP, ICMP, DNS/UDP).
  # The HTTP and ICMP modules are pinned to IPv4 with fallback disabled; the
  # DNS module prefers IPv4 and resolves an A record for a fixed name.
  blackbox.yml: |
    modules:
      http_2xx:
        prober: http
        timeout: 10s
        http:
          preferred_ip_protocol: ip4
          ip_protocol_fallback: false
          method: GET
          fail_if_not_ssl: false
      icmp:
        prober: icmp
        timeout: 5s
        icmp:
          preferred_ip_protocol: ip4
          ip_protocol_fallback: false
      dns_udp:
        prober: dns
        timeout: 5s
        dns:
          transport_protocol: udp
          preferred_ip_protocol: ip4
          query_name: "telex.hu"
          query_type: "A"
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: wan-monitor-scripts
  namespace: admin-system
  labels:
    app: wan-monitor
data:
  # Static HELP/TYPE header for the textfile metrics. The converter scripts
  # below emit sample lines only, so every metric they print is declared here.
  metrics-header.prom: |
    # HELP wan_irtt_rtt_seconds irtt round-trip time by statistic (seconds)
    # TYPE wan_irtt_rtt_seconds gauge
    # HELP wan_irtt_jitter_seconds irtt IPDV jitter mean by direction (seconds)
    # TYPE wan_irtt_jitter_seconds gauge
    # HELP wan_irtt_loss_ratio irtt packet loss ratio by direction (0-1)
    # TYPE wan_irtt_loss_ratio gauge
    # HELP wan_irtt_late_ratio irtt late/reordered packet ratio (0-1)
    # TYPE wan_irtt_late_ratio gauge
    # HELP wan_irtt_duplicate_ratio irtt duplicate packet ratio (0-1)
    # TYPE wan_irtt_duplicate_ratio gauge
    # HELP wan_irtt_packets irtt packet counters for the run
    # TYPE wan_irtt_packets gauge
    # HELP wan_irtt_success 1 if the irtt run produced stats
    # TYPE wan_irtt_success gauge
    # HELP wan_throughput_bits_per_second achieved throughput (bits/sec)
    # TYPE wan_throughput_bits_per_second gauge
    # HELP wan_throughput_success 1 if the throughput test succeeded
    # TYPE wan_throughput_success gauge
    # HELP wan_probe_last_run_timestamp_seconds unix time of last probe run
    # TYPE wan_probe_last_run_timestamp_seconds gauge
  irtt_to_prom.py: |
    #!/usr/bin/env python3
    # irtt JSON (stdin) -> Prometheus sample lines (no HELP/TYPE; header is static).
    # args: <condition> <target>   (defaults: "idle", "hetzner")
    # Hardened: tolerates missing/null/NaN/Inf/oversized fields and non-object
    # sub-sections, and clamps ratios to 0..1, so a pathological irtt run can
    # never emit an out-of-range or invalid sample.
    # Always exits 0; failure is reported through wan_irtt_success 0.
    import json, sys, time, math

    def esc(value):
        # Escape a label value for the Prometheus text exposition format:
        # backslash, double quote and newline must be backslash-escaped.
        return str(value).replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")

    cond = sys.argv[1] if len(sys.argv) > 1 else "idle"
    target = sys.argv[2] if len(sys.argv) > 2 else "hetzner"
    # Label set shared by every sample this run prints.
    L = f'target="{esc(target)}",condition="{esc(cond)}"'
    ts = f'{time.time():.0f}'

    def num(x, default=0.0):
        # finite float or default (handles None / str / missing / NaN / Inf).
        # OverflowError covers JSON integers too large to convert to a float.
        try:
            v = float(x)
        except (TypeError, ValueError, OverflowError):
            return default
        return v if math.isfinite(v) else default

    def pct_ratio(x):
        # percent (0..100, possibly garbage) -> ratio clamped to 0..1
        return max(0.0, min(1.0, num(x) / 100.0))

    def fail():
        # Report an unusable run and stop; sys.exit raises SystemExit, which
        # the `except Exception` below deliberately does not catch.
        print(f'wan_irtt_success{{{L}}} 0')
        print(f'wan_probe_last_run_timestamp_seconds{{probe="irtt",{L}}} {ts}')
        sys.exit(0)

    try:
        s = json.load(sys.stdin).get("stats")
    except Exception:
        fail()
    if not isinstance(s, dict):
        fail()

    def section(key):
        # Sub-object of "stats", or {} when it is missing, null or not an object.
        d = s.get(key)
        return d if isinstance(d, dict) else {}

    # The converter treats irtt durations as nanoseconds and divides by 1e9
    # to emit seconds.
    rtt = section("rtt")
    for k in ("min", "mean", "median", "max", "stddev"):
        print(f'wan_irtt_rtt_seconds{{{L},stat="{k}"}} {num(rtt.get(k)) / 1e9}')

    def ipdv(key):
        # Mean of the given IPDV (jitter) section, converted to seconds.
        return num(section(key).get("mean")) / 1e9

    print(f'wan_irtt_jitter_seconds{{{L},direction="round_trip"}} {ipdv("ipdv_round_trip")}')
    print(f'wan_irtt_jitter_seconds{{{L},direction="send"}} {ipdv("ipdv_send")}')
    print(f'wan_irtt_jitter_seconds{{{L},direction="receive"}} {ipdv("ipdv_receive")}')
    print(f'wan_irtt_loss_ratio{{{L},direction="round_trip"}} {pct_ratio(s.get("packet_loss_percent"))}')
    print(f'wan_irtt_loss_ratio{{{L},direction="upstream"}} {pct_ratio(s.get("upstream_loss_percent"))}')
    print(f'wan_irtt_loss_ratio{{{L},direction="downstream"}} {pct_ratio(s.get("downstream_loss_percent"))}')
    print(f'wan_irtt_late_ratio{{{L}}} {pct_ratio(s.get("late_packets_percent"))}')
    print(f'wan_irtt_duplicate_ratio{{{L}}} {pct_ratio(s.get("duplicate_percent"))}')
    print(f'wan_irtt_packets{{{L},kind="sent"}} {int(num(s.get("packets_sent")))}')
    print(f'wan_irtt_packets{{{L},kind="received"}} {int(num(s.get("packets_received")))}')
    print(f'wan_irtt_packets{{{L},kind="server_received"}} {int(num(s.get("server_packets_received")))}')
    print(f'wan_irtt_success{{{L}}} 1')
    print(f'wan_probe_last_run_timestamp_seconds{{probe="irtt",{L}}} {ts}')
  tput_to_prom.py: |
    #!/usr/bin/env python3
    # iperf3 JSON (stdin) -> Prometheus sample lines. args: <direction> <target>
    # (defaults: "download", "hetzner")
    # Any unreadable, missing, non-finite or negative throughput value is
    # reported as wan_throughput_success 0 instead of an invalid sample.
    import json, sys, time, math

    def esc(value):
        # Escape a label value for the Prometheus text exposition format:
        # backslash, double quote and newline must be backslash-escaped.
        return str(value).replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")

    direction = sys.argv[1] if len(sys.argv) > 1 else "download"
    target = sys.argv[2] if len(sys.argv) > 2 else "hetzner"
    L = f'target="{esc(target)}",direction="{esc(direction)}"'
    ts = f'{time.time():.0f}'
    try:
        # Throughput is read from end.sum_received.bits_per_second.
        bps = float(json.load(sys.stdin)["end"]["sum_received"]["bits_per_second"])
        if not math.isfinite(bps) or bps < 0:
            raise ValueError("bits_per_second is not a finite non-negative number")
        print(f'wan_throughput_bits_per_second{{{L}}} {bps:.0f}')
        print(f'wan_throughput_success{{{L}}} 1')
    except Exception:
        # Best effort by design: bad or missing input becomes success 0.
        print(f'wan_throughput_success{{{L}}} 0')
    # The timestamp is emitted whether or not the test succeeded.
    print(f'wan_probe_last_run_timestamp_seconds{{probe="throughput",{L}}} {ts}')
  probe-loop.sh: |
    #!/bin/sh
    set -u
    SHARED=/shared
    HDR=/scripts/metrics-header.prom
    # Space-separated endpoint labels. For each LABEL there must be an env var
    #