307 lines
11 KiB
YAML
307 lines
11 KiB
YAML
---
|
|
# ============================================================================
# wan-monitor — internet connection quality monitoring
# Single pod (3 containers) in admin-system:
#   - blackbox     : prometheus blackbox-exporter (HTTP phases, ICMP, DNS)  :9115
#   - wan-probe    : irtt (UDP quality) + iperf3 (throughput) loop -> /shared
#   - metrics-http : busybox httpd serving /shared/metrics                  :9116
# Prometheus scrapes :9115 (blackbox relabel jobs) and :9116 (textfile metrics).
# Scrape jobs live in prometheus-wan-scrape-jobs.yaml (merge into monitoring.yaml).
# ============================================================================
|
|
# blackbox-exporter configuration, mounted read-only at /etc/blackbox by the
# "blackbox" container of the wan-monitor Deployment.
apiVersion: v1
kind: ConfigMap
metadata:
  namespace: admin-system
  name: wan-monitor-blackbox
  labels:
    app: wan-monitor
data:
  # Three probe modules (HTTP, ICMP, DNS over UDP). Each one is pinned to IPv4;
  # the HTTP and ICMP modules additionally disable protocol fallback.
  blackbox.yml: |
    modules:
      http_2xx:
        prober: http
        timeout: 10s
        http:
          preferred_ip_protocol: ip4
          ip_protocol_fallback: false
          method: GET
          fail_if_not_ssl: false
      icmp:
        prober: icmp
        timeout: 5s
        icmp:
          preferred_ip_protocol: ip4
          ip_protocol_fallback: false
      dns_udp:
        prober: dns
        timeout: 5s
        dns:
          transport_protocol: udp
          preferred_ip_protocol: ip4
          query_name: "telex.hu"
          query_type: "A"
|
|
---
|
|
apiVersion: v1
|
|
kind: ConfigMap
|
|
metadata:
|
|
name: wan-monitor-scripts
|
|
namespace: admin-system
|
|
labels:
|
|
app: wan-monitor
|
|
data:
|
|
  # Static HELP/TYPE preamble for the textfile metrics. probe-loop.sh copies it
  # to /shared/metrics at startup and places it first in every assembled
  # metrics file; the sample lines themselves are produced by irtt_to_prom.py
  # and tput_to_prom.py, which deliberately emit no HELP/TYPE lines.
  metrics-header.prom: |
    # HELP wan_irtt_rtt_seconds irtt round-trip time by statistic (seconds)
    # TYPE wan_irtt_rtt_seconds gauge
    # HELP wan_irtt_jitter_seconds irtt IPDV jitter mean by direction (seconds)
    # TYPE wan_irtt_jitter_seconds gauge
    # HELP wan_irtt_loss_ratio irtt packet loss ratio by direction (0-1)
    # TYPE wan_irtt_loss_ratio gauge
    # HELP wan_irtt_late_ratio irtt late/reordered packet ratio (0-1)
    # TYPE wan_irtt_late_ratio gauge
    # HELP wan_irtt_duplicate_ratio irtt duplicate packet ratio (0-1)
    # TYPE wan_irtt_duplicate_ratio gauge
    # HELP wan_irtt_packets irtt packet counters for the run
    # TYPE wan_irtt_packets gauge
    # HELP wan_irtt_success 1 if the irtt run produced stats
    # TYPE wan_irtt_success gauge
    # HELP wan_throughput_bits_per_second achieved throughput (bits/sec)
    # TYPE wan_throughput_bits_per_second gauge
    # HELP wan_throughput_success 1 if the throughput test succeeded
    # TYPE wan_throughput_success gauge
    # HELP wan_probe_last_run_timestamp_seconds unix time of last probe run
    # TYPE wan_probe_last_run_timestamp_seconds gauge
|
|
|
|
irtt_to_prom.py: |
|
|
#!/usr/bin/env python3
|
|
# irtt JSON (stdin) -> Prometheus sample lines (no HELP/TYPE; header is static).
|
|
# args: <condition> <target>
|
|
import json, sys, time
|
|
cond = sys.argv[1] if len(sys.argv) > 1 else "idle"
|
|
target = sys.argv[2] if len(sys.argv) > 2 else "hetzner"
|
|
L = f'target="{target}",condition="{cond}"'
|
|
ts = f'{time.time():.0f}'
|
|
try:
|
|
s = json.load(sys.stdin)["stats"]
|
|
except Exception:
|
|
print(f'wan_irtt_success{{{L}}} 0')
|
|
print(f'wan_probe_last_run_timestamp_seconds{{probe="irtt",{L}}} {ts}')
|
|
sys.exit(0)
|
|
rtt = s["rtt"]
|
|
for k in ("min", "mean", "median", "max", "stddev"):
|
|
print(f'wan_irtt_rtt_seconds{{{L},stat="{k}"}} {rtt[k]/1e9}')
|
|
print(f'wan_irtt_jitter_seconds{{{L},direction="round_trip"}} {s["ipdv_round_trip"]["mean"]/1e9}')
|
|
print(f'wan_irtt_jitter_seconds{{{L},direction="send"}} {s["ipdv_send"]["mean"]/1e9}')
|
|
print(f'wan_irtt_jitter_seconds{{{L},direction="receive"}} {s["ipdv_receive"]["mean"]/1e9}')
|
|
print(f'wan_irtt_loss_ratio{{{L},direction="round_trip"}} {s["packet_loss_percent"]/100.0}')
|
|
print(f'wan_irtt_loss_ratio{{{L},direction="upstream"}} {s["upstream_loss_percent"]/100.0}')
|
|
print(f'wan_irtt_loss_ratio{{{L},direction="downstream"}} {s["downstream_loss_percent"]/100.0}')
|
|
print(f'wan_irtt_late_ratio{{{L}}} {s["late_packets_percent"]/100.0}')
|
|
print(f'wan_irtt_duplicate_ratio{{{L}}} {s["duplicate_percent"]/100.0}')
|
|
print(f'wan_irtt_packets{{{L},kind="sent"}} {s["packets_sent"]}')
|
|
print(f'wan_irtt_packets{{{L},kind="received"}} {s["packets_received"]}')
|
|
print(f'wan_irtt_packets{{{L},kind="server_received"}} {s["server_packets_received"]}')
|
|
print(f'wan_irtt_success{{{L}}} 1')
|
|
print(f'wan_probe_last_run_timestamp_seconds{{probe="irtt",{L}}} {ts}')
|
|
|
|
tput_to_prom.py: |
|
|
#!/usr/bin/env python3
|
|
# iperf3 JSON (stdin) -> Prometheus sample lines. args: <direction> <target>
|
|
import json, sys, time
|
|
direction = sys.argv[1] if len(sys.argv) > 1 else "download"
|
|
target = sys.argv[2] if len(sys.argv) > 2 else "hetzner"
|
|
L = f'target="{target}",direction="{direction}"'
|
|
ts = f'{time.time():.0f}'
|
|
try:
|
|
bps = json.load(sys.stdin)["end"]["sum_received"]["bits_per_second"]
|
|
print(f'wan_throughput_bits_per_second{{{L}}} {bps:.0f}')
|
|
print(f'wan_throughput_success{{{L}}} 1')
|
|
except Exception:
|
|
print(f'wan_throughput_success{{{L}}} 0')
|
|
print(f'wan_probe_last_run_timestamp_seconds{{probe="throughput",{L}}} {ts}')
|
|
|
|
probe-loop.sh: |
|
|
#!/bin/sh
|
|
set -u
|
|
SHARED=/shared
|
|
HDR=/scripts/metrics-header.prom
|
|
HETZNER="${HETZNER_HOST:?set HETZNER_HOST}"
|
|
IRTT_PORT="${IRTT_PORT:-2112}"
|
|
IPERF_PORT="${IPERF_PORT:-5201}"
|
|
IRTT_INTERVAL="${IRTT_INTERVAL:-20ms}"
|
|
IRTT_DURATION="${IRTT_DURATION:-60}" # seconds (numeric, for timeout math)
|
|
TPUT_EVERY="${TPUT_EVERY:-900}" # seconds between throughput tests
|
|
TPUT_TIME="${TPUT_TIME:-10}" # iperf3 seconds per direction
|
|
IRTT_TARGET="${IRTT_TARGET:-hetzner}"
|
|
TPUT_TARGET="${TPUT_TARGET:-hetzner}"
|
|
HMAC_OPT=""
|
|
[ -n "${IRTT_HMAC:-}" ] && HMAC_OPT="--hmac=${IRTT_HMAC}"
|
|
|
|
mkdir -p "$SHARED"
|
|
: > "$SHARED/.irtt.prom"; : > "$SHARED/.irttload.prom"; : > "$SHARED/.tput.prom"
|
|
cp "$HDR" "$SHARED/metrics" # serve header immediately so first scrapes don't 404
|
|
|
|
assemble() {
|
|
cat "$HDR" "$SHARED/.irtt.prom" "$SHARED/.irttload.prom" "$SHARED/.tput.prom" \
|
|
> "$SHARED/.metrics.tmp" 2>/dev/null
|
|
mv "$SHARED/.metrics.tmp" "$SHARED/metrics"
|
|
}
|
|
|
|
run_irtt() { # $1 condition $2 outfile $3 duration(seconds)
|
|
timeout "$(( $3 + 25 ))" irtt client -i "$IRTT_INTERVAL" -d "${3}s" -q $HMAC_OPT \
|
|
-o - "${HETZNER}:${IRTT_PORT}" 2>/dev/null \
|
|
| python3 /scripts/irtt_to_prom.py "$1" "$IRTT_TARGET" > "$2"
|
|
}
|
|
|
|
run_tput() {
|
|
P="${IPERF_PARALLEL:-4}" # parallel streams: a single stream can't fill 1 Gbps over the RTT
|
|
TO="$(( TPUT_TIME + 20 ))"
|
|
timeout "$TO" iperf3 -c "$HETZNER" -p "$IPERF_PORT" -t "$TPUT_TIME" -P "$P" --connect-timeout 5000 -R -J 2>/dev/null \
|
|
| python3 /scripts/tput_to_prom.py download "$TPUT_TARGET" > "$SHARED/.tput.prom"
|
|
timeout "$TO" iperf3 -c "$HETZNER" -p "$IPERF_PORT" -t "$TPUT_TIME" -P "$P" --connect-timeout 5000 -J 2>/dev/null \
|
|
| python3 /scripts/tput_to_prom.py upload "$TPUT_TARGET" >> "$SHARED/.tput.prom"
|
|
}
|
|
|
|
last_tput=0
|
|
while true; do
|
|
run_irtt idle "$SHARED/.irtt.prom" "$IRTT_DURATION" # blocks ~IRTT_DURATION = loop cadence
|
|
assemble
|
|
now=$(date +%s)
|
|
if [ $(( now - last_tput )) -ge "$TPUT_EVERY" ]; then
|
|
LOAD_DUR=$(( 2 * TPUT_TIME + 4 ))
|
|
run_irtt under_load "$SHARED/.irttload.prom" "$LOAD_DUR" & # concurrent = bufferbloat
|
|
LOADPID=$!
|
|
run_tput
|
|
wait "$LOADPID" 2>/dev/null
|
|
last_tput="$now"
|
|
assemble
|
|
fi
|
|
done
|
|
---
|
|
# Single-replica pod with three containers sharing an emptyDir:
# blackbox (exporter :9115), wan-probe (writes /shared), metrics-http (serves
# /shared on :9116).
apiVersion: apps/v1
kind: Deployment
metadata:
  namespace: admin-system
  name: wan-monitor
  labels:
    app: wan-monitor
spec:
  replicas: 1
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: wan-monitor
  template:
    metadata:
      labels:
        app: wan-monitor
      annotations:
        # version-checker: one enable + tag-regex pair per container
        enable.version-checker.io/blackbox: 'true'
        enable.version-checker.io/metrics-http: 'true'
        enable.version-checker.io/wan-probe: 'true'
        match-regex.version-checker.io/blackbox: '^v[0-9]+\.[0-9]+\.[0-9]+$'
        match-regex.version-checker.io/metrics-http: '^[0-9]+\.[0-9]+\.[0-9]+$'
        match-regex.version-checker.io/wan-probe: '^[0-9]+\.[0-9]+\.[0-9]+$'
    spec:
      enableServiceLinks: false
      containers:
        - name: blackbox
          image: quay.io/prometheus/blackbox-exporter:v0.28.0
          args:
            - --config.file=/etc/blackbox/blackbox.yml
            - --web.listen-address=:9115
          ports:
            - name: blackbox
              containerPort: 9115
          securityContext:
            capabilities:
              # required for the ICMP prober
              add:
                - NET_RAW
          resources:
            requests:
              cpu: 10m
              memory: 32Mi
            limits:
              memory: 64Mi
          volumeMounts:
            - name: blackbox-config
              mountPath: /etc/blackbox
              readOnly: true

        - name: wan-probe
          # Build + push from Dockerfile.wan-probe (adjust registry/tag to taste)
          image: gitea.dooplex.hu/admin/wan-probe:0.1.0
          command:
            - /bin/sh
            - /scripts/probe-loop.sh
          env:
            # HETZNER_HOST MUST be the Hetzner origin: a DNS-only (grey-cloud)
            # record or raw IP. NOT the Cloudflare-proxied jarrs.eu — CF only
            # forwards HTTP/HTTPS, so UDP 2112 (irtt) / TCP 5201 (iperf3) never
            # reach the origin behind it.
            - name: HETZNER_HOST
              value: 'metrics.jarrs.eu'  # DNS-only A record -> Hetzner IPv4
            - name: IRTT_PORT
              value: '2112'
            - name: IPERF_PORT
              value: '5201'
            - name: IRTT_INTERVAL
              value: '20ms'
            - name: IRTT_DURATION
              value: '60'  # seconds (numeric)
            - name: TPUT_EVERY
              value: '900'  # 15 min
            - name: TPUT_TIME
              value: '10'
            - name: IPERF_PARALLEL
              value: '4'
            # Shared irtt key, taken from an optional Secret; probe-loop.sh
            # only passes --hmac when the variable is non-empty.
            - name: IRTT_HMAC
              valueFrom:
                secretKeyRef:
                  name: wan-monitor-irtt
                  key: hmac
                  optional: true
          resources:
            requests:
              cpu: 20m
              memory: 48Mi
            limits:
              memory: 96Mi
          volumeMounts:
            - name: scripts
              mountPath: /scripts
              readOnly: true
            - name: shared
              mountPath: /shared

        - name: metrics-http
          image: busybox:1.36
          command:
            - httpd
            - -f
            - -v
            - -p
            - '9116'
            - -h
            - /shared
          ports:
            - name: metrics
              containerPort: 9116
          resources:
            requests:
              cpu: 5m
              memory: 8Mi
            limits:
              memory: 24Mi
          volumeMounts:
            - name: shared
              mountPath: /shared
              readOnly: true
      volumes:
        - name: blackbox-config
          configMap:
            name: wan-monitor-blackbox
        - name: scripts
          configMap:
            name: wan-monitor-scripts
        # scratch space written by wan-probe and served by metrics-http
        - name: shared
          emptyDir: {}
|
|
---
|
|
# Cluster-internal Service exposing both scrape ports of the wan-monitor pod.
apiVersion: v1
kind: Service
metadata:
  namespace: admin-system
  name: wan-monitor
  labels:
    app: wan-monitor
spec:
  type: ClusterIP
  selector:
    app: wan-monitor
  ports:
    # blackbox-exporter
    - port: 9115
      targetPort: 9115
      name: blackbox
    # busybox httpd serving /shared/metrics
    - port: 9116
      targetPort: 9116
      name: metrics