---
# ============================================================================
# wan-monitor — internet connection quality monitoring
# Single pod (3 containers) in admin-system:
#   - blackbox     : prometheus blackbox-exporter (HTTP phases, ICMP, DNS)  :9115
#   - wan-probe    : irtt (UDP quality) + iperf3 (throughput) loop -> /shared
#   - metrics-http : busybox httpd serving /shared/metrics                  :9116
# Prometheus scrapes :9115 (blackbox relabel jobs) and :9116 (textfile metrics).
# Scrape jobs live in prometheus-wan-scrape-jobs.yaml (merge into monitoring.yaml).
#
# NOTE(review): busybox httpd presumably serves the extensionless /metrics file
# with a generic Content-Type. Prometheus 3.x fails scrapes whose Content-Type
# it does not recognise unless the scrape job sets `fallback_scrape_protocol`
# — verify the :9116 job in prometheus-wan-scrape-jobs.yaml.
# ============================================================================
apiVersion: v1
kind: ConfigMap
metadata:
  name: wan-monitor-blackbox
  namespace: admin-system
  labels:
    app: wan-monitor
data:
  # blackbox-exporter module definitions; mounted at /etc/blackbox/blackbox.yml.
  blackbox.yml: |
    modules:
      http_2xx:
        prober: http
        timeout: 10s
        http:
          preferred_ip_protocol: ip4
          ip_protocol_fallback: false
          method: GET
          fail_if_not_ssl: false
      icmp:
        prober: icmp
        timeout: 5s
        icmp:
          preferred_ip_protocol: ip4
          ip_protocol_fallback: false
      dns_udp:
        prober: dns
        timeout: 5s
        dns:
          transport_protocol: udp
          preferred_ip_protocol: ip4
          query_name: "telex.hu"
          query_type: "A"
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: wan-monitor-scripts
  namespace: admin-system
  labels:
    app: wan-monitor
data:
  # Static HELP/TYPE header; probe-loop.sh prepends it to the sample fragments
  # every time it assembles /shared/metrics.
  metrics-header.prom: |
    # HELP wan_irtt_rtt_seconds irtt round-trip time by statistic (seconds)
    # TYPE wan_irtt_rtt_seconds gauge
    # HELP wan_irtt_jitter_seconds irtt IPDV jitter mean by direction (seconds)
    # TYPE wan_irtt_jitter_seconds gauge
    # HELP wan_irtt_loss_ratio irtt packet loss ratio by direction (0-1)
    # TYPE wan_irtt_loss_ratio gauge
    # HELP wan_irtt_late_ratio irtt late/reordered packet ratio (0-1)
    # TYPE wan_irtt_late_ratio gauge
    # HELP wan_irtt_duplicate_ratio irtt duplicate packet ratio (0-1)
    # TYPE wan_irtt_duplicate_ratio gauge
    # HELP wan_irtt_packets irtt packet counters for the run
    # TYPE wan_irtt_packets gauge
    # HELP wan_irtt_success 1 if the irtt run produced stats
    # TYPE wan_irtt_success gauge
    # HELP wan_throughput_bits_per_second achieved throughput (bits/sec)
    # TYPE wan_throughput_bits_per_second gauge
    # HELP wan_throughput_success 1 if the throughput test succeeded
    # TYPE wan_throughput_success gauge
    # HELP wan_probe_last_run_timestamp_seconds unix time of last probe run
    # TYPE wan_probe_last_run_timestamp_seconds gauge
  irtt_to_prom.py: |
    #!/usr/bin/env python3
    # irtt JSON (stdin) -> Prometheus sample lines (no HELP/TYPE; header is static).
    # args: <condition> <target>   (defaults: "idle" "hetzner")
    # Hardened: tolerates missing/null/NaN/Inf/oversized/wrongly-typed fields and
    # clamps ratios to 0..1 so a pathological irtt run can never emit an
    # out-of-range or invalid sample, nor crash half-way through the output.
    import json, sys, time, math

    cond = sys.argv[1] if len(sys.argv) > 1 else "idle"
    target = sys.argv[2] if len(sys.argv) > 2 else "hetzner"
    L = f'target="{target}",condition="{cond}"'
    ts = f'{time.time():.0f}'

    def num(x, default=0.0):
        # finite float or default (handles None / str / missing / NaN / Inf,
        # and integers too large for a float, which raise OverflowError)
        try:
            v = float(x)
        except (TypeError, ValueError, OverflowError):
            return default
        return v if math.isfinite(v) else default

    def obj(x):
        # dict or {} — sub-objects may be missing, null, or of the wrong type
        return x if isinstance(x, dict) else {}

    def pct_ratio(x):
        # percent (0..100, possibly garbage) -> ratio clamped to 0..1
        return max(0.0, min(1.0, num(x) / 100.0))

    def fail():
        # no usable stats: report failure plus the run timestamp, then stop
        print(f'wan_irtt_success{{{L}}} 0')
        print(f'wan_probe_last_run_timestamp_seconds{{probe="irtt",{L}}} {ts}')
        sys.exit(0)

    try:
        s = json.load(sys.stdin).get("stats")
    except Exception:
        fail()
    if not isinstance(s, dict):
        fail()

    # the values are divided by 1e9 to produce the *_seconds metrics
    rtt = obj(s.get("rtt"))
    for k in ("min", "mean", "median", "max", "stddev"):
        print(f'wan_irtt_rtt_seconds{{{L},stat="{k}"}} {num(rtt.get(k)) / 1e9}')

    def ipdv(key):
        d = obj(s.get(key))
        return num(d.get("mean")) / 1e9

    print(f'wan_irtt_jitter_seconds{{{L},direction="round_trip"}} {ipdv("ipdv_round_trip")}')
    print(f'wan_irtt_jitter_seconds{{{L},direction="send"}} {ipdv("ipdv_send")}')
    print(f'wan_irtt_jitter_seconds{{{L},direction="receive"}} {ipdv("ipdv_receive")}')
    print(f'wan_irtt_loss_ratio{{{L},direction="round_trip"}} {pct_ratio(s.get("packet_loss_percent"))}')
    print(f'wan_irtt_loss_ratio{{{L},direction="upstream"}} {pct_ratio(s.get("upstream_loss_percent"))}')
    print(f'wan_irtt_loss_ratio{{{L},direction="downstream"}} {pct_ratio(s.get("downstream_loss_percent"))}')
    print(f'wan_irtt_late_ratio{{{L}}} {pct_ratio(s.get("late_packets_percent"))}')
    print(f'wan_irtt_duplicate_ratio{{{L}}} {pct_ratio(s.get("duplicate_percent"))}')
    print(f'wan_irtt_packets{{{L},kind="sent"}} {int(num(s.get("packets_sent")))}')
    print(f'wan_irtt_packets{{{L},kind="received"}} {int(num(s.get("packets_received")))}')
    print(f'wan_irtt_packets{{{L},kind="server_received"}} {int(num(s.get("server_packets_received")))}')
    print(f'wan_irtt_success{{{L}}} 1')
    print(f'wan_probe_last_run_timestamp_seconds{{probe="irtt",{L}}} {ts}')
  tput_to_prom.py: |
    #!/usr/bin/env python3
    # iperf3 JSON (stdin) -> Prometheus sample lines.
    # args: <direction> <target>   (defaults: "download" "hetzner")
    import json, sys, time

    direction = sys.argv[1] if len(sys.argv) > 1 else "download"
    target = sys.argv[2] if len(sys.argv) > 2 else "hetzner"
    L = f'target="{target}",direction="{direction}"'
    ts = f'{time.time():.0f}'
    # Any parse/lookup/format error (empty stdin, missing keys) -> success 0.
    try:
        bps = json.load(sys.stdin)["end"]["sum_received"]["bits_per_second"]
        print(f'wan_throughput_bits_per_second{{{L}}} {bps:.0f}')
        print(f'wan_throughput_success{{{L}}} 1')
    except Exception:
        print(f'wan_throughput_success{{{L}}} 0')
    print(f'wan_probe_last_run_timestamp_seconds{{probe="throughput",{L}}} {ts}')
  probe-loop.sh: |
    #!/bin/sh
    # Endless probe loop: runs irtt every cycle and iperf3 every TPUT_EVERY
    # seconds, converts the JSON results to Prometheus samples and publishes
    # them as $SHARED/metrics.
    set -u
    SHARED=/shared
    HDR=/scripts/metrics-header.prom
    HETZNER="${HETZNER_HOST:?set HETZNER_HOST}"
    IRTT_PORT="${IRTT_PORT:-2112}"
    IPERF_PORT="${IPERF_PORT:-5201}"
    IRTT_INTERVAL="${IRTT_INTERVAL:-20ms}"
    IRTT_DURATION="${IRTT_DURATION:-60}"   # seconds (numeric, for timeout math)
    TPUT_EVERY="${TPUT_EVERY:-900}"        # seconds between throughput tests
    TPUT_TIME="${TPUT_TIME:-10}"           # iperf3 seconds per direction
    IRTT_TARGET="${IRTT_TARGET:-hetzner}"
    TPUT_TARGET="${TPUT_TARGET:-hetzner}"
    MIN_CYCLE="${MIN_CYCLE:-5}"            # floor (seconds) for one loop pass

    mkdir -p "$SHARED"
    : > "$SHARED/.irtt.prom"; : > "$SHARED/.irttload.prom"; : > "$SHARED/.tput.prom"
    cp "$HDR" "$SHARED/metrics"   # serve header immediately so first scrapes don't 404

    # Concatenate fragments into the served file via temp + atomic rename.
    assemble() {
      cat "$HDR" "$SHARED/.irtt.prom" "$SHARED/.irttload.prom" "$SHARED/.tput.prom" \
        > "$SHARED/.metrics.tmp" 2>/dev/null
      mv "$SHARED/.metrics.tmp" "$SHARED/metrics"
    }

    # Each fragment is written to .tmp then renamed, so assemble() never
    # cats a partially written file (the cause of the impossible loss spikes).
    # The HMAC flag uses ${VAR:+"..."} so it expands to exactly one word when
    # IRTT_HMAC is set (even if the key holds spaces or glob characters) and
    # to nothing at all when it is unset or empty.
    run_irtt() {   # $1 condition  $2 outfile  $3 duration(seconds)
      timeout "$(( $3 + 25 ))" irtt client -i "$IRTT_INTERVAL" -d "${3}s" -q \
          ${IRTT_HMAC:+"--hmac=${IRTT_HMAC}"} \
          -o - "${HETZNER}:${IRTT_PORT}" 2>/dev/null \
        | python3 /scripts/irtt_to_prom.py "$1" "$IRTT_TARGET" > "$2.tmp"
      mv "$2.tmp" "$2"
    }

    # Download (-R) then upload; both results land in one fragment that is
    # renamed into place only after the second test has finished.
    run_tput() {
      P="${IPERF_PARALLEL:-4}"   # parallel streams: a single stream can't fill the pipe over the RTT
      TO="$(( TPUT_TIME + 20 ))"
      TMP="$SHARED/.tput.prom.partial"
      : > "$TMP"
      timeout "$TO" iperf3 -c "$HETZNER" -p "$IPERF_PORT" -t "$TPUT_TIME" -P "$P" --connect-timeout 5000 -R -J 2>/dev/null \
        | python3 /scripts/tput_to_prom.py download "$TPUT_TARGET" > "$TMP"
      timeout "$TO" iperf3 -c "$HETZNER" -p "$IPERF_PORT" -t "$TPUT_TIME" -P "$P" --connect-timeout 5000 -J 2>/dev/null \
        | python3 /scripts/tput_to_prom.py upload "$TPUT_TARGET" >> "$TMP"
      mv "$TMP" "$SHARED/.tput.prom"
    }

    last_tput=0
    while true; do
      cycle_start=$(date +%s)
      run_irtt idle "$SHARED/.irtt.prom" "$IRTT_DURATION"   # blocks ~IRTT_DURATION = loop cadence
      assemble
      now=$(date +%s)
      if [ $(( now - last_tput )) -ge "$TPUT_EVERY" ]; then
        LOAD_DUR=$(( 2 * TPUT_TIME + 4 ))
        run_irtt under_load "$SHARED/.irttload.prom" "$LOAD_DUR" &   # concurrent = bufferbloat
        LOADPID=$!
        run_tput
        wait "$LOADPID" 2>/dev/null
        last_tput="$now"
        assemble
      fi
      # Back off when the pass ended almost immediately (irtt could not start,
      # name resolution failed, ...); otherwise the loop would spin at full
      # speed re-spawning irtt and python.
      if [ $(( $(date +%s) - cycle_start )) -lt "$MIN_CYCLE" ]; then
        sleep "$MIN_CYCLE"
      fi
    done
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: wan-monitor
  namespace: admin-system
  labels:
    app: wan-monitor
spec:
  replicas: 1
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: wan-monitor
  template:
    metadata:
      labels:
        app: wan-monitor
      annotations:
        enable.version-checker.io/blackbox: "true"
        enable.version-checker.io/metrics-http: "true"
        enable.version-checker.io/wan-probe: "true"
        match-regex.version-checker.io/blackbox: "^v[0-9]+\\.[0-9]+\\.[0-9]+$"
        match-regex.version-checker.io/metrics-http: "^[0-9]+\\.[0-9]+\\.[0-9]+$"
        match-regex.version-checker.io/wan-probe: "^[0-9]+\\.[0-9]+\\.[0-9]+$"
    spec:
      enableServiceLinks: false
      containers:
        - name: blackbox
          image: quay.io/prometheus/blackbox-exporter:v0.28.0
          args:
            - --config.file=/etc/blackbox/blackbox.yml
            - --web.listen-address=:9115
          ports:
            - name: blackbox
              containerPort: 9115
          securityContext:
            capabilities:
              add: ["NET_RAW"]   # required for the ICMP prober
          resources:
            requests: { cpu: 10m, memory: 32Mi }
            limits: { memory: 64Mi }
          volumeMounts:
            - name: blackbox-config
              mountPath: /etc/blackbox
              readOnly: true
        - name: wan-probe
          # Build + push from Dockerfile.wan-probe (adjust registry/tag to taste)
          image: gitea.dooplex.hu/admin/wan-probe:0.1.0
          command: ["/bin/sh", "/scripts/probe-loop.sh"]
          env:
            - name: HETZNER_HOST
              # MUST be the Hetzner origin: a DNS-only (grey-cloud) record or raw IP.
              # NOT the Cloudflare-proxied jarrs.eu — CF only forwards HTTP/HTTPS, so
              # UDP 2112 (irtt) / TCP 5201 (iperf3) never reach the origin behind it.
              value: "metrics.jarrs.eu"   # DNS-only A record -> Hetzner IPv4
            - name: IRTT_PORT
              value: "2112"
            - name: IPERF_PORT
              value: "5201"
            - name: IRTT_INTERVAL
              value: "20ms"
            - name: IRTT_DURATION
              value: "60"    # seconds (numeric)
            - name: TPUT_EVERY
              value: "900"   # 15 min
            - name: TPUT_TIME
              value: "10"
            - name: IPERF_PARALLEL
              value: "4"
            - name: IRTT_HMAC   # shared key; apply via secret (see below)
              valueFrom:
                secretKeyRef:
                  name: wan-monitor-irtt
                  key: hmac
                  optional: true
          resources:
            requests: { cpu: 20m, memory: 48Mi }
            limits: { memory: 96Mi }
          volumeMounts:
            - name: scripts
              mountPath: /scripts
              readOnly: true
            - name: shared
              mountPath: /shared
        - name: metrics-http
          # Pinned to a full x.y.z release: reproducible pulls, and the tag now
          # has the three-part shape the match-regex annotation above expects.
          image: busybox:1.36.1
          command: ["httpd", "-f", "-v", "-p", "9116", "-h", "/shared"]
          ports:
            - name: metrics
              containerPort: 9116
          resources:
            requests: { cpu: 5m, memory: 8Mi }
            limits: { memory: 24Mi }
          volumeMounts:
            - name: shared
              mountPath: /shared
              readOnly: true
      volumes:
        - name: blackbox-config
          configMap:
            name: wan-monitor-blackbox
        - name: scripts
          configMap:
            name: wan-monitor-scripts
        - name: shared
          emptyDir: {}
---
apiVersion: v1
kind: Service
metadata:
  name: wan-monitor
  namespace: admin-system
  labels:
    app: wan-monitor
spec:
  type: ClusterIP
  selector:
    app: wan-monitor
  ports:
    - name: blackbox
      port: 9115
      targetPort: 9115
    - name: metrics
      port: 9116
      targetPort: 9116