added wan-monitor
This commit is contained in:
@@ -0,0 +1,302 @@
|
||||
---
|
||||
# ============================================================================
# wan-monitor — internet connection quality monitoring
# Single pod (3 containers) in admin-system:
#   - blackbox     : prometheus blackbox-exporter (HTTP phases, ICMP, DNS)  :9115
#   - wan-probe    : irtt (UDP quality) + iperf3 (throughput) loop -> /shared
#   - metrics-http : busybox httpd serving /shared/metrics                  :9116
# Prometheus scrapes :9115 (blackbox relabel jobs) and :9116 (textfile metrics).
# Scrape jobs live in prometheus-wan-scrape-jobs.yaml (merge into monitoring.yaml).
# ============================================================================
|
||||
# blackbox-exporter configuration for the "blackbox" container of the
# wan-monitor Deployment (mounted read-only under /etc/blackbox).
apiVersion: v1
kind: ConfigMap
metadata:
  name: wan-monitor-blackbox
  namespace: admin-system
  labels:
    app: wan-monitor
data:
  blackbox.yml: |
    # Every module is pinned to IPv4 with fallback disabled, so all three
    # probe types measure the same address family.
    modules:
      # HTTP GET probe; plain-HTTP targets are accepted (fail_if_not_ssl off).
      http_2xx:
        prober: http
        timeout: 10s
        http:
          preferred_ip_protocol: ip4
          ip_protocol_fallback: false
          method: GET
          fail_if_not_ssl: false
      # ICMP echo probe.
      icmp:
        prober: icmp
        timeout: 5s
        icmp:
          preferred_ip_protocol: ip4
          ip_protocol_fallback: false
      # DNS-over-UDP probe: resolves the A record of a fixed name against the
      # probed resolver.
      dns_udp:
        prober: dns
        timeout: 5s
        dns:
          transport_protocol: udp
          preferred_ip_protocol: ip4
          # Matches http_2xx and icmp: never fall back to IPv6.
          ip_protocol_fallback: false
          query_name: "telex.hu"
          query_type: "A"
|
||||
---
|
||||
# Scripts and static metric metadata for the "wan-probe" container of the
# wan-monitor Deployment (mounted read-only under /scripts).
#   metrics-header.prom : HELP/TYPE lines prepended to every assembled metrics file
#   irtt_to_prom.py     : irtt JSON    -> Prometheus sample lines
#   tput_to_prom.py     : iperf3 JSON  -> Prometheus sample lines
#   probe-loop.sh       : container entry point; runs the probes and publishes
#                         /shared/metrics
apiVersion: v1
kind: ConfigMap
metadata:
  name: wan-monitor-scripts
  namespace: admin-system
  labels:
    app: wan-monitor
data:
  metrics-header.prom: |
    # HELP wan_irtt_rtt_seconds irtt round-trip time by statistic (seconds)
    # TYPE wan_irtt_rtt_seconds gauge
    # HELP wan_irtt_jitter_seconds irtt IPDV jitter mean by direction (seconds)
    # TYPE wan_irtt_jitter_seconds gauge
    # HELP wan_irtt_loss_ratio irtt packet loss ratio by direction (0-1)
    # TYPE wan_irtt_loss_ratio gauge
    # HELP wan_irtt_late_ratio irtt late/reordered packet ratio (0-1)
    # TYPE wan_irtt_late_ratio gauge
    # HELP wan_irtt_duplicate_ratio irtt duplicate packet ratio (0-1)
    # TYPE wan_irtt_duplicate_ratio gauge
    # HELP wan_irtt_packets irtt packet counters for the run
    # TYPE wan_irtt_packets gauge
    # HELP wan_irtt_success 1 if the irtt run produced stats
    # TYPE wan_irtt_success gauge
    # HELP wan_throughput_bits_per_second achieved throughput (bits/sec)
    # TYPE wan_throughput_bits_per_second gauge
    # HELP wan_throughput_success 1 if the throughput test succeeded
    # TYPE wan_throughput_success gauge
    # HELP wan_probe_last_run_timestamp_seconds unix time of last probe run
    # TYPE wan_probe_last_run_timestamp_seconds gauge

  irtt_to_prom.py: |
    #!/usr/bin/env python3
    # irtt JSON (stdin) -> Prometheus sample lines (no HELP/TYPE; header is static).
    # args: <condition> <target>
    #
    # Samples are buffered and written only after the whole "stats" object has
    # been converted. Unreadable JSON or a missing field therefore produces a
    # clean `wan_irtt_success ... 0` instead of a traceback after partial output.
    # The script always exits 0 and always emits the last-run timestamp.
    import json, sys, time

    cond = sys.argv[1] if len(sys.argv) > 1 else "idle"
    target = sys.argv[2] if len(sys.argv) > 2 else "hetzner"
    L = f'target="{target}",condition="{cond}"'
    ts = f'{time.time():.0f}'


    def convert(s):
        """Return the sample lines for one irtt "stats" object.

        Durations are divided by 1e9 and percentages by 100 so the emitted
        values match the seconds / 0-1 ratio units declared in the header.
        Raises KeyError/TypeError when an expected field is absent or malformed.
        """
        out = []
        rtt = s["rtt"]
        for k in ("min", "mean", "median", "max", "stddev"):
            out.append(f'wan_irtt_rtt_seconds{{{L},stat="{k}"}} {rtt[k]/1e9}')
        out.append(f'wan_irtt_jitter_seconds{{{L},direction="round_trip"}} {s["ipdv_round_trip"]["mean"]/1e9}')
        out.append(f'wan_irtt_jitter_seconds{{{L},direction="send"}} {s["ipdv_send"]["mean"]/1e9}')
        out.append(f'wan_irtt_jitter_seconds{{{L},direction="receive"}} {s["ipdv_receive"]["mean"]/1e9}')
        out.append(f'wan_irtt_loss_ratio{{{L},direction="round_trip"}} {s["packet_loss_percent"]/100.0}')
        out.append(f'wan_irtt_loss_ratio{{{L},direction="upstream"}} {s["upstream_loss_percent"]/100.0}')
        out.append(f'wan_irtt_loss_ratio{{{L},direction="downstream"}} {s["downstream_loss_percent"]/100.0}')
        out.append(f'wan_irtt_late_ratio{{{L}}} {s["late_packets_percent"]/100.0}')
        out.append(f'wan_irtt_duplicate_ratio{{{L}}} {s["duplicate_percent"]/100.0}')
        out.append(f'wan_irtt_packets{{{L},kind="sent"}} {s["packets_sent"]}')
        out.append(f'wan_irtt_packets{{{L},kind="received"}} {s["packets_received"]}')
        out.append(f'wan_irtt_packets{{{L},kind="server_received"}} {s["server_packets_received"]}')
        return out


    try:
        lines = convert(json.load(sys.stdin)["stats"])
        lines.append(f'wan_irtt_success{{{L}}} 1')
    except Exception:
        # Empty stdin (irtt produced nothing), invalid JSON, or an incomplete
        # stats object: report the run as failed, drop any partial samples.
        lines = [f'wan_irtt_success{{{L}}} 0']
    lines.append(f'wan_probe_last_run_timestamp_seconds{{probe="irtt",{L}}} {ts}')
    print("\n".join(lines))

  tput_to_prom.py: |
    #!/usr/bin/env python3
    # iperf3 JSON (stdin) -> Prometheus sample lines. args: <direction> <target>
    #
    # Reads end.sum_received.bits_per_second; if it cannot be read (no JSON,
    # or no "end" section) only wan_throughput_success 0 is emitted.
    # The last-run timestamp is emitted in both cases.
    import json, sys, time

    direction = sys.argv[1] if len(sys.argv) > 1 else "download"
    target = sys.argv[2] if len(sys.argv) > 2 else "hetzner"
    L = f'target="{target}",direction="{direction}"'
    ts = f'{time.time():.0f}'

    try:
        bps = json.load(sys.stdin)["end"]["sum_received"]["bits_per_second"]
        lines = [
            f'wan_throughput_bits_per_second{{{L}}} {bps:.0f}',
            f'wan_throughput_success{{{L}}} 1',
        ]
    except Exception:
        lines = [f'wan_throughput_success{{{L}}} 0']
    lines.append(f'wan_probe_last_run_timestamp_seconds{{probe="throughput",{L}}} {ts}')
    print("\n".join(lines))

  probe-loop.sh: |
    #!/bin/sh
    # Entry point of the wan-probe container.
    #
    # Loop: run an idle irtt measurement, publish, and every TPUT_EVERY seconds
    # additionally run iperf3 (download, then upload) with a second irtt run in
    # parallel to capture latency under load. Results are written as fragment
    # files in $SHARED and concatenated behind the static header into
    # $SHARED/metrics, which the metrics-http container serves.
    #
    # Environment (all optional except HETZNER_HOST):
    #   HETZNER_HOST   irtt + iperf3 server host                  (required)
    #   IRTT_PORT      irtt server port                           (2112)
    #   IPERF_PORT     iperf3 server port                         (5201)
    #   IRTT_INTERVAL  irtt send interval                         (20ms)
    #   IRTT_DURATION  idle irtt run length = normal loop cadence (60s)
    #   TPUT_EVERY     seconds between throughput tests           (900)
    #   TPUT_TIME      iperf3 seconds per direction               (10)
    #   IPERF_PARALLEL iperf3 parallel streams                    (4)
    #   IRTT_TARGET    value of the target label on irtt samples  (hetzner)
    #   TPUT_TARGET    value of the target label on iperf samples (hetzner)
    #   IRTT_HMAC      shared irtt HMAC key                       (unset = no HMAC)
    #   RETRY_DELAY    seconds to pause after a failed idle run   (10)
    set -u
    SHARED=/shared
    HDR=/scripts/metrics-header.prom
    HETZNER="${HETZNER_HOST:?set HETZNER_HOST}"
    IRTT_PORT="${IRTT_PORT:-2112}"
    IPERF_PORT="${IPERF_PORT:-5201}"
    IRTT_INTERVAL="${IRTT_INTERVAL:-20ms}"
    IRTT_DURATION="${IRTT_DURATION:-60s}"
    TPUT_EVERY="${TPUT_EVERY:-900}"       # seconds between throughput tests
    TPUT_TIME="${TPUT_TIME:-10}"          # iperf3 seconds per direction
    IRTT_TARGET="${IRTT_TARGET:-hetzner}"
    TPUT_TARGET="${TPUT_TARGET:-hetzner}"
    RETRY_DELAY="${RETRY_DELAY:-10}"      # seconds to pause after a failed idle irtt run

    mkdir -p "$SHARED"
    # Start with empty fragments so assemble() always finds all of its inputs.
    : > "$SHARED/.irtt.prom"; : > "$SHARED/.irttload.prom"; : > "$SHARED/.tput.prom"
    cp "$HDR" "$SHARED/metrics"   # serve header immediately so first scrapes don't 404

    # Rebuild $SHARED/metrics from the header plus all fragments. The result is
    # written to a temp file and renamed so a scrape never reads a half-written file.
    assemble() {
      cat "$HDR" "$SHARED/.irtt.prom" "$SHARED/.irttload.prom" "$SHARED/.tput.prom" \
        > "$SHARED/.metrics.tmp" 2>/dev/null
      mv "$SHARED/.metrics.tmp" "$SHARED/metrics"
    }

    # Run one irtt measurement and write its samples to a fragment file.
    #   $1 condition label   $2 output file   $3 duration
    run_irtt() {
      # ${IRTT_HMAC:+"..."} expands to exactly one word when a key is set and to
      # nothing when it is unset or empty, so a key containing whitespace or
      # glob characters is passed through intact.
      irtt client -i "$IRTT_INTERVAL" -d "$3" -q ${IRTT_HMAC:+"--hmac=${IRTT_HMAC}"} -o - "${HETZNER}:${IRTT_PORT}" 2>/dev/null \
        | python3 /scripts/irtt_to_prom.py "$1" "$IRTT_TARGET" > "$2"
    }

    # Run iperf3 download (-R) then upload; both results end up in .tput.prom.
    run_tput() {
      P="${IPERF_PARALLEL:-4}"   # parallel streams: a single stream can't fill 1 Gbps over the RTT
      iperf3 -c "$HETZNER" -p "$IPERF_PORT" -t "$TPUT_TIME" -P "$P" -R -J 2>/dev/null \
        | python3 /scripts/tput_to_prom.py download "$TPUT_TARGET" > "$SHARED/.tput.prom"
      iperf3 -c "$HETZNER" -p "$IPERF_PORT" -t "$TPUT_TIME" -P "$P" -J 2>/dev/null \
        | python3 /scripts/tput_to_prom.py upload "$TPUT_TARGET" >> "$SHARED/.tput.prom"
    }

    last_tput=0
    while true; do
      run_irtt idle "$SHARED/.irtt.prom" "$IRTT_DURATION"   # blocks ~IRTT_DURATION = loop cadence
      assemble
      now=$(date +%s)
      if [ $(( now - last_tput )) -ge "$TPUT_EVERY" ]; then
        # Long enough to span both iperf3 directions plus a small margin.
        LOAD_DUR=$(( 2 * TPUT_TIME + 4 ))s
        run_irtt under_load "$SHARED/.irttload.prom" "$LOAD_DUR" &   # concurrent = bufferbloat
        LOADPID=$!
        run_tput
        wait "$LOADPID" 2>/dev/null
        last_tput="$now"
        assemble
      fi
      # The loop is paced only by irtt blocking for IRTT_DURATION. A run that
      # fails may return almost immediately, which would otherwise turn this
      # into a busy loop, so pause whenever the idle run did not succeed.
      if ! grep -q '^wan_irtt_success{.*} 1$' "$SHARED/.irtt.prom"; then
        sleep "$RETRY_DELAY"
      fi
    done
|
||||
---
|
||||
# Single-replica pod running the three wan-monitor containers side by side.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: wan-monitor
  namespace: admin-system
  labels:
    app: wan-monitor
spec:
  replicas: 1
  # Replace the old pod before starting the new one instead of rolling.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: wan-monitor
  template:
    metadata:
      labels:
        app: wan-monitor
      # version-checker: one enable + tag-pattern annotation per container.
      annotations:
        enable.version-checker.io/blackbox: "true"
        enable.version-checker.io/metrics-http: "true"
        enable.version-checker.io/wan-probe: "true"
        match-regex.version-checker.io/blackbox: '^v[0-9]+\.[0-9]+\.[0-9]+$'
        match-regex.version-checker.io/metrics-http: '^[0-9]+\.[0-9]+\.[0-9]+$'
        match-regex.version-checker.io/wan-probe: '^[0-9]+\.[0-9]+\.[0-9]+$'
    spec:
      enableServiceLinks: false
      containers:
        # --- blackbox-exporter: HTTP / ICMP / DNS probes on :9115 -----------
        - name: blackbox
          image: quay.io/prometheus/blackbox-exporter:v0.25.0
          args:
            - --config.file=/etc/blackbox/blackbox.yml
            - --web.listen-address=:9115
          ports:
            - name: blackbox
              containerPort: 9115
          securityContext:
            capabilities:
              # The ICMP prober needs NET_RAW.
              add:
                - "NET_RAW"
          resources:
            requests:
              cpu: 10m
              memory: 32Mi
            limits:
              memory: 64Mi
          volumeMounts:
            - name: blackbox-config
              mountPath: /etc/blackbox
              readOnly: true

        # --- wan-probe: irtt + iperf3 loop writing metrics into /shared -----
        - name: wan-probe
          # Image is built and pushed from Dockerfile.wan-probe; adjust the
          # registry/tag as needed.
          image: gitea.dooplex.hu/viktor/wan-probe:0.1.0
          command:
            - "/bin/sh"
            - "/scripts/probe-loop.sh"
          env:
            # Host running the irtt and iperf3 servers (the Hetzner box).
            - name: HETZNER_HOST
              value: "jarrs.eu"
            - name: IRTT_PORT
              value: "2112"
            - name: IPERF_PORT
              value: "5201"
            - name: IRTT_INTERVAL
              value: "20ms"
            - name: IRTT_DURATION
              value: "60s"
            # Seconds between throughput tests (900 = 15 min).
            - name: TPUT_EVERY
              value: "900"
            - name: TPUT_TIME
              value: "10"
            - name: IPERF_PARALLEL
              value: "4"
            # Shared irtt key, read from Secret wan-monitor-irtt (key "hmac");
            # optional, so the pod still starts when the Secret is absent.
            - name: IRTT_HMAC
              valueFrom:
                secretKeyRef:
                  name: wan-monitor-irtt
                  key: hmac
                  optional: true
          resources:
            requests:
              cpu: 20m
              memory: 48Mi
            limits:
              memory: 96Mi
          volumeMounts:
            - name: scripts
              mountPath: /scripts
              readOnly: true
            - name: shared
              mountPath: /shared

        # --- metrics-http: busybox httpd serving /shared on :9116 -----------
        - name: metrics-http
          image: busybox:1.36
          command:
            - "httpd"
            - "-f"
            - "-v"
            - "-p"
            - "9116"
            - "-h"
            - "/shared"
          ports:
            - name: metrics
              containerPort: 9116
          resources:
            requests:
              cpu: 5m
              memory: 8Mi
            limits:
              memory: 24Mi
          volumeMounts:
            - name: shared
              mountPath: /shared
              readOnly: true
      volumes:
        - name: blackbox-config
          configMap:
            name: wan-monitor-blackbox
        - name: scripts
          configMap:
            name: wan-monitor-scripts
        # Scratch space shared between wan-probe (writer) and metrics-http (reader).
        - name: shared
          emptyDir: {}
|
||||
---
|
||||
# In-cluster endpoint for the wan-monitor pod: exposes both scrape ports.
apiVersion: v1
kind: Service
metadata:
  name: wan-monitor
  namespace: admin-system
  labels:
    app: wan-monitor
spec:
  type: ClusterIP
  selector:
    app: wan-monitor
  ports:
    # blackbox-exporter probe endpoint
    - name: blackbox
      port: 9115
      targetPort: 9115
    # assembled irtt / iperf3 metrics file served by metrics-http
    - name: metrics
      port: 9116
      targetPort: 9116
|
||||
Reference in New Issue
Block a user