From 998cd150a1c86b84b9c0815588a03a3188e28013 Mon Sep 17 00:00:00 2001
From: kisfenyo
Date: Sun, 7 Jun 2026 10:34:41 +0200
Subject: [PATCH] added wan-monitor

---
 admin-system/wan-monitor.yaml | 316 ++++++++++++++++++++++++++++++++++
 1 file changed, 316 insertions(+)
 create mode 100644 admin-system/wan-monitor.yaml

diff --git a/admin-system/wan-monitor.yaml b/admin-system/wan-monitor.yaml
new file mode 100644
index 0000000..c33c459
--- /dev/null
+++ b/admin-system/wan-monitor.yaml
@@ -0,0 +1,316 @@
+---
+# ============================================================================
+# wan-monitor — internet connection quality monitoring
+# Single pod (3 containers) in admin-system:
+#   - blackbox     : prometheus blackbox-exporter (HTTP phases, ICMP, DNS)  :9115
+#   - wan-probe    : irtt (UDP quality) + iperf3 (throughput) loop -> /shared
+#   - metrics-http : busybox httpd serving /shared/metrics                  :9116
+# Prometheus scrapes :9115 (blackbox relabel jobs) and :9116 (textfile metrics).
+# Scrape jobs live in prometheus-wan-scrape-jobs.yaml (merge into monitoring.yaml).
+# NOTE(review): the metrics file has no extension, so busybox httpd presumably
+# serves it without a text/plain Content-Type; Prometheus 3.x rejects that unless
+# the :9116 scrape job sets fallback_scrape_protocol — confirm in the scrape job.
+# ============================================================================
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: wan-monitor-blackbox
+  namespace: admin-system
+  labels:
+    app: wan-monitor
+data:
+  blackbox.yml: |
+    modules:
+      http_2xx:
+        prober: http
+        timeout: 10s
+        http:
+          preferred_ip_protocol: ip4
+          ip_protocol_fallback: false
+          method: GET
+          fail_if_not_ssl: false
+      icmp:
+        prober: icmp
+        timeout: 5s
+        icmp:
+          preferred_ip_protocol: ip4
+          ip_protocol_fallback: false
+      dns_udp:
+        prober: dns
+        timeout: 5s
+        dns:
+          transport_protocol: udp
+          preferred_ip_protocol: ip4
+          query_name: "telex.hu"
+          query_type: "A"
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: wan-monitor-scripts
+  namespace: admin-system
+  labels:
+    app: wan-monitor
+data:
+  metrics-header.prom: |
+    # HELP wan_irtt_rtt_seconds irtt round-trip time by statistic (seconds)
+    # TYPE wan_irtt_rtt_seconds gauge
+    # HELP wan_irtt_jitter_seconds irtt IPDV jitter mean by direction (seconds)
+    # TYPE wan_irtt_jitter_seconds gauge
+    # HELP wan_irtt_loss_ratio irtt packet loss ratio by direction (0-1)
+    # TYPE wan_irtt_loss_ratio gauge
+    # HELP wan_irtt_late_ratio irtt late/reordered packet ratio (0-1)
+    # TYPE wan_irtt_late_ratio gauge
+    # HELP wan_irtt_duplicate_ratio irtt duplicate packet ratio (0-1)
+    # TYPE wan_irtt_duplicate_ratio gauge
+    # HELP wan_irtt_packets irtt packet counters for the run
+    # TYPE wan_irtt_packets gauge
+    # HELP wan_irtt_success 1 if the irtt run produced stats
+    # TYPE wan_irtt_success gauge
+    # HELP wan_throughput_bits_per_second achieved throughput (bits/sec)
+    # TYPE wan_throughput_bits_per_second gauge
+    # HELP wan_throughput_success 1 if the throughput test succeeded
+    # TYPE wan_throughput_success gauge
+    # HELP wan_probe_last_run_timestamp_seconds unix time of last probe run
+    # TYPE wan_probe_last_run_timestamp_seconds gauge
+
+  irtt_to_prom.py: |
+    #!/usr/bin/env python3
+    # irtt JSON (stdin) -> Prometheus sample lines (no HELP/TYPE; header is static).
+    # args: [condition] [target]   (defaults: idle, hetzner)
+    # Output is all-or-nothing: if the JSON is missing/unparseable or any expected
+    # stat is absent, only wan_irtt_success=0 plus the timestamp is emitted, so a
+    # truncated sample set is never written to the output file.
+    import json, sys, time
+    cond = sys.argv[1] if len(sys.argv) > 1 else "idle"
+    target = sys.argv[2] if len(sys.argv) > 2 else "hetzner"
+    L = f'target="{target}",condition="{cond}"'
+    ts = f'{time.time():.0f}'
+    out = []
+    try:
+        s = json.load(sys.stdin)["stats"]
+        rtt = s["rtt"]
+        for k in ("min", "mean", "median", "max", "stddev"):
+            out.append(f'wan_irtt_rtt_seconds{{{L},stat="{k}"}} {rtt[k]/1e9}')
+        out.append(f'wan_irtt_jitter_seconds{{{L},direction="round_trip"}} {s["ipdv_round_trip"]["mean"]/1e9}')
+        out.append(f'wan_irtt_jitter_seconds{{{L},direction="send"}} {s["ipdv_send"]["mean"]/1e9}')
+        out.append(f'wan_irtt_jitter_seconds{{{L},direction="receive"}} {s["ipdv_receive"]["mean"]/1e9}')
+        out.append(f'wan_irtt_loss_ratio{{{L},direction="round_trip"}} {s["packet_loss_percent"]/100.0}')
+        out.append(f'wan_irtt_loss_ratio{{{L},direction="upstream"}} {s["upstream_loss_percent"]/100.0}')
+        out.append(f'wan_irtt_loss_ratio{{{L},direction="downstream"}} {s["downstream_loss_percent"]/100.0}')
+        out.append(f'wan_irtt_late_ratio{{{L}}} {s["late_packets_percent"]/100.0}')
+        out.append(f'wan_irtt_duplicate_ratio{{{L}}} {s["duplicate_percent"]/100.0}')
+        out.append(f'wan_irtt_packets{{{L},kind="sent"}} {s["packets_sent"]}')
+        out.append(f'wan_irtt_packets{{{L},kind="received"}} {s["packets_received"]}')
+        out.append(f'wan_irtt_packets{{{L},kind="server_received"}} {s["server_packets_received"]}')
+        out.append(f'wan_irtt_success{{{L}}} 1')
+    except Exception:
+        out = [f'wan_irtt_success{{{L}}} 0']
+    out.append(f'wan_probe_last_run_timestamp_seconds{{probe="irtt",{L}}} {ts}')
+    print("\n".join(out))
+
+  tput_to_prom.py: |
+    #!/usr/bin/env python3
+    # iperf3 JSON (stdin) -> Prometheus sample lines. args: [direction] [target]
+    import json, sys, time
+    direction = sys.argv[1] if len(sys.argv) > 1 else "download"
+    target = sys.argv[2] if len(sys.argv) > 2 else "hetzner"
+    L = f'target="{target}",direction="{direction}"'
+    ts = f'{time.time():.0f}'
+    try:
+        bps = json.load(sys.stdin)["end"]["sum_received"]["bits_per_second"]
+        print(f'wan_throughput_bits_per_second{{{L}}} {bps:.0f}')
+        print(f'wan_throughput_success{{{L}}} 1')
+    except Exception:
+        print(f'wan_throughput_success{{{L}}} 0')
+    print(f'wan_probe_last_run_timestamp_seconds{{probe="throughput",{L}}} {ts}')
+
+  probe-loop.sh: |
+    #!/bin/sh
+    set -u
+    SHARED=/shared
+    HDR=/scripts/metrics-header.prom
+    HETZNER="${HETZNER_HOST:?set HETZNER_HOST}"
+    IRTT_PORT="${IRTT_PORT:-2112}"
+    IPERF_PORT="${IPERF_PORT:-5201}"
+    IRTT_INTERVAL="${IRTT_INTERVAL:-20ms}"
+    IRTT_DURATION="${IRTT_DURATION:-60s}"
+    TPUT_EVERY="${TPUT_EVERY:-900}"       # seconds between throughput tests
+    TPUT_TIME="${TPUT_TIME:-10}"          # iperf3 seconds per direction
+    IRTT_TARGET="${IRTT_TARGET:-hetzner}"
+    TPUT_TARGET="${TPUT_TARGET:-hetzner}"
+    MIN_CYCLE="${MIN_CYCLE:-5}"           # seconds; floor per loop pass if irtt exits early
+    HMAC_OPT=""
+    [ -n "${IRTT_HMAC:-}" ] && HMAC_OPT="--hmac=${IRTT_HMAC}"
+
+    mkdir -p "$SHARED"
+    : > "$SHARED/.irtt.prom"; : > "$SHARED/.irttload.prom"; : > "$SHARED/.tput.prom"
+    cp "$HDR" "$SHARED/metrics"   # serve header immediately so first scrapes don't 404
+
+    # Concatenate header + per-probe fragments into a temp file, then mv it into place.
+    assemble() {
+      cat "$HDR" "$SHARED/.irtt.prom" "$SHARED/.irttload.prom" "$SHARED/.tput.prom" \
+        > "$SHARED/.metrics.tmp" 2>/dev/null
+      mv "$SHARED/.metrics.tmp" "$SHARED/metrics"
+    }
+
+    run_irtt() {  # $1 condition  $2 outfile  $3 duration
+      # ${HMAC_OPT:+"$HMAC_OPT"} expands to nothing when no key is set and to exactly
+      # one word otherwise, so the key is never word-split or glob-expanded.
+      irtt client -i "$IRTT_INTERVAL" -d "$3" -q ${HMAC_OPT:+"$HMAC_OPT"} -o - "${HETZNER}:${IRTT_PORT}" 2>/dev/null \
+        | python3 /scripts/irtt_to_prom.py "$1" "$IRTT_TARGET" > "$2"
+    }
+
+    run_tput() {
+      P="${IPERF_PARALLEL:-4}"   # parallel streams: a single stream can't fill 1 Gbps over the RTT
+      iperf3 -c "$HETZNER" -p "$IPERF_PORT" -t "$TPUT_TIME" -P "$P" -R -J 2>/dev/null \
+        | python3 /scripts/tput_to_prom.py download "$TPUT_TARGET" > "$SHARED/.tput.prom"
+      iperf3 -c "$HETZNER" -p "$IPERF_PORT" -t "$TPUT_TIME" -P "$P" -J 2>/dev/null \
+        | python3 /scripts/tput_to_prom.py upload "$TPUT_TARGET" >> "$SHARED/.tput.prom"
+    }
+
+    last_tput=0
+    while true; do
+      cycle_start=$(date +%s)
+      run_irtt idle "$SHARED/.irtt.prom" "$IRTT_DURATION"   # blocks ~IRTT_DURATION = loop cadence
+      assemble
+      now=$(date +%s)
+      if [ $(( now - last_tput )) -ge "$TPUT_EVERY" ]; then
+        LOAD_DUR=$(( 2 * TPUT_TIME + 4 ))s
+        run_irtt under_load "$SHARED/.irttload.prom" "$LOAD_DUR" &   # concurrent = bufferbloat
+        LOADPID=$!
+        run_tput
+        wait "$LOADPID" 2>/dev/null
+        last_tput="$now"
+        assemble
+      fi
+      # The cadence above relies on irtt blocking; if it exited almost immediately,
+      # pause so the loop does not spin at full speed.
+      [ $(( $(date +%s) - cycle_start )) -ge "$MIN_CYCLE" ] || sleep "$MIN_CYCLE"
+    done
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: wan-monitor
+  namespace: admin-system
+  labels:
+    app: wan-monitor
+spec:
+  replicas: 1
+  strategy:
+    type: Recreate
+  selector:
+    matchLabels:
+      app: wan-monitor
+  template:
+    metadata:
+      labels:
+        app: wan-monitor
+      annotations:
+        enable.version-checker.io/blackbox: "true"
+        enable.version-checker.io/metrics-http: "true"
+        enable.version-checker.io/wan-probe: "true"
+        match-regex.version-checker.io/blackbox: "^v[0-9]+\\.[0-9]+\\.[0-9]+$"
+        match-regex.version-checker.io/metrics-http: "^[0-9]+\\.[0-9]+\\.[0-9]+$"
+        match-regex.version-checker.io/wan-probe: "^[0-9]+\\.[0-9]+\\.[0-9]+$"
+    spec:
+      enableServiceLinks: false
+      containers:
+        - name: blackbox
+          image: quay.io/prometheus/blackbox-exporter:v0.25.0
+          args:
+            - --config.file=/etc/blackbox/blackbox.yml
+            - --web.listen-address=:9115
+          ports:
+            - name: blackbox
+              containerPort: 9115
+          securityContext:
+            capabilities:
+              add: ["NET_RAW"]   # required for the ICMP prober
+          resources:
+            requests: { cpu: 10m, memory: 32Mi }
+            limits: { memory: 64Mi }
+          volumeMounts:
+            - name: blackbox-config
+              mountPath: /etc/blackbox
+              readOnly: true
+
+        - name: wan-probe
+          # Build + push from Dockerfile.wan-probe (adjust registry/tag to taste)
+          image: gitea.dooplex.hu/viktor/wan-probe:0.1.0
+          command: ["/bin/sh", "/scripts/probe-loop.sh"]
+          env:
+            - name: HETZNER_HOST
+              value: "jarrs.eu"   # irtt + iperf3 server (your Hetzner box)
+            - name: IRTT_PORT
+              value: "2112"
+            - name: IPERF_PORT
+              value: "5201"
+            - name: IRTT_INTERVAL
+              value: "20ms"
+            - name: IRTT_DURATION
+              value: "60s"
+            - name: TPUT_EVERY
+              value: "900"   # 15 min
+            - name: TPUT_TIME
+              value: "10"
+            - name: IPERF_PARALLEL
+              value: "4"
+            - name: IRTT_HMAC   # shared key; apply via secret (see below)
+              valueFrom:
+                secretKeyRef:
+                  name: wan-monitor-irtt
+                  key: hmac
+                  optional: true
+          resources:
+            requests: { cpu: 20m, memory: 48Mi }
+            limits: { memory: 96Mi }
+          volumeMounts:
+            - name: scripts
+              mountPath: /scripts
+              readOnly: true
+            - name: shared
+              mountPath: /shared
+
+        - name: metrics-http
+          image: busybox:1.36
+          command: ["httpd", "-f", "-v", "-p", "9116", "-h", "/shared"]
+          ports:
+            - name: metrics
+              containerPort: 9116
+          resources:
+            requests: { cpu: 5m, memory: 8Mi }
+            limits: { memory: 24Mi }
+          volumeMounts:
+            - name: shared
+              mountPath: /shared
+              readOnly: true
+      volumes:
+        - name: blackbox-config
+          configMap:
+            name: wan-monitor-blackbox
+        - name: scripts
+          configMap:
+            name: wan-monitor-scripts
+        - name: shared
+          emptyDir: {}
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: wan-monitor
+  namespace: admin-system
+  labels:
+    app: wan-monitor
+spec:
+  type: ClusterIP
+  selector:
+    app: wan-monitor
+  ports:
+    - name: blackbox
+      port: 9115
+      targetPort: 9115
+    - name: metrics
+      port: 9116
+      targetPort: 9116