Compare commits

..

15 Commits

Author SHA1 Message Date
admin 4e86091f7d updates scripts 2026-06-07 12:59:09 +02:00
admin 754564167f updated probe-loop.sh 2026-06-07 12:50:41 +02:00
admin 40f5532570 added memoty to grafana 2026-06-07 12:01:19 +02:00
admin 0a2efb86ac fixed image 2026-06-07 11:52:28 +02:00
admin b40090dec1 fixed dns 2026-06-07 11:51:35 +02:00
admin 2370f005c6 added fallback_scrape_protocol 2026-06-07 11:32:54 +02:00
admin 05fa40ff5d prom targets for wan 2026-06-07 11:24:29 +02:00
admin ef77ab9285 updated wan rules 2026-06-07 11:20:30 +02:00
admin e0fd669f7c fix 2026-06-07 11:15:44 +02:00
admin 877cda7be1 updated configmap 2026-06-07 11:14:48 +02:00
admin 0887848d29 changed linebreak 2026-06-07 11:12:23 +02:00
admin 565c4c8bd0 fixed repo, added prometheus rules 2026-06-07 11:00:31 +02:00
admin 998cd150a1 added wan-monitor 2026-06-07 10:34:41 +02:00
admin 1a1cded065 outlint 1.8.1 2026-06-06 15:44:24 +02:00
admin a66cef8a9e Merge pull request 'feat: migrate seerr from fallenbagel/jellyseerr:preview-OIDC -> seerr-team/seerr:v3.3.0' (#88) from feat/seerr-migrate-to-seerr-team into main 2026-06-06 13:37:22 +00:00
3 changed files with 516 additions and 2 deletions
+344
View File
@@ -0,0 +1,344 @@
---
# ============================================================================
# wan-monitor — internet connection quality monitoring
# Single pod (3 containers) in admin-system:
#   - blackbox     : Prometheus blackbox-exporter (HTTP phases, ICMP, DNS)  :9115
#   - wan-probe    : irtt (UDP quality) + iperf3 (throughput) loop -> /shared
#   - metrics-http : busybox httpd serving /shared/metrics                  :9116
# Prometheus scrapes :9115 (blackbox relabel jobs) and :9116 (textfile metrics).
# Scrape jobs live in prometheus-wan-scrape-jobs.yaml (merge into monitoring.yaml).
# ============================================================================
apiVersion: v1
kind: ConfigMap
metadata:
  name: wan-monitor-blackbox
  namespace: admin-system
  labels:
    app: wan-monitor
data:
  # blackbox-exporter module definitions; the exporter reads this file through
  # its --config.file argument. Every module is pinned to IPv4 with fallback
  # disabled so a probe never silently measures the IPv6 path instead.
  blackbox.yml: |
    modules:
      # Plain GET; HTTPS is not enforced by the module (targets choose the scheme).
      http_2xx:
        prober: http
        timeout: 10s
        http:
          preferred_ip_protocol: ip4
          ip_protocol_fallback: false
          method: GET
          fail_if_not_ssl: false
      # ICMP echo for end-to-end latency / loss.
      icmp:
        prober: icmp
        timeout: 5s
        icmp:
          preferred_ip_protocol: ip4
          ip_protocol_fallback: false
      # A-record lookup of a fixed name against the probed resolver, over UDP.
      dns_udp:
        prober: dns
        timeout: 5s
        dns:
          transport_protocol: udp
          preferred_ip_protocol: ip4
          # Same IPv4-only policy as http_2xx and icmp above.
          ip_protocol_fallback: false
          query_name: "telex.hu"
          query_type: "A"
---
# ConfigMap carrying the static Prometheus HELP/TYPE header and the helper
# scripts used by the wan-probe container (mounted read-only at /scripts).
apiVersion: v1
kind: ConfigMap
metadata:
name: wan-monitor-scripts
namespace: admin-system
labels:
app: wan-monitor
data:
# Static HELP/TYPE preamble: probe-loop.sh concatenates this file ahead of the
# sample fragments, so the converter scripts emit sample lines only.
metrics-header.prom: |
# HELP wan_irtt_rtt_seconds irtt round-trip time by statistic (seconds)
# TYPE wan_irtt_rtt_seconds gauge
# HELP wan_irtt_jitter_seconds irtt IPDV jitter mean by direction (seconds)
# TYPE wan_irtt_jitter_seconds gauge
# HELP wan_irtt_loss_ratio irtt packet loss ratio by direction (0-1)
# TYPE wan_irtt_loss_ratio gauge
# HELP wan_irtt_late_ratio irtt late/reordered packet ratio (0-1)
# TYPE wan_irtt_late_ratio gauge
# HELP wan_irtt_duplicate_ratio irtt duplicate packet ratio (0-1)
# TYPE wan_irtt_duplicate_ratio gauge
# HELP wan_irtt_packets irtt packet counters for the run
# TYPE wan_irtt_packets gauge
# HELP wan_irtt_success 1 if the irtt run produced stats
# TYPE wan_irtt_success gauge
# HELP wan_throughput_bits_per_second achieved throughput (bits/sec)
# TYPE wan_throughput_bits_per_second gauge
# HELP wan_throughput_success 1 if the throughput test succeeded
# TYPE wan_throughput_success gauge
# HELP wan_probe_last_run_timestamp_seconds unix time of last probe run
# TYPE wan_probe_last_run_timestamp_seconds gauge
irtt_to_prom.py: |
#!/usr/bin/env python3
"""irtt JSON (stdin) -> Prometheus sample lines (stdout).

Usage: irtt_to_prom.py <condition> <target>

Only sample lines are written (no HELP/TYPE; the header is a static file).
Hardened: tolerates missing/null/NaN/Inf fields and wrongly-typed sections,
clamps ratios to 0..1 and escapes label values, so a pathological irtt run or
an unusual target name can never emit an out-of-range or unparseable sample.
The process always exits 0; failure is reported through wan_irtt_success 0.
"""
import json
import math
import sys
import time

NS_PER_SECOND = 1e9  # irtt durations are divided by this to get seconds
RTT_STATS = ("min", "mean", "median", "max", "stddev")
# (label value, irtt stats key) pairs, in output order.
JITTER_KEYS = (
    ("round_trip", "ipdv_round_trip"),
    ("send", "ipdv_send"),
    ("receive", "ipdv_receive"),
)
LOSS_KEYS = (
    ("round_trip", "packet_loss_percent"),
    ("upstream", "upstream_loss_percent"),
    ("downstream", "downstream_loss_percent"),
)
PACKET_KEYS = (
    ("sent", "packets_sent"),
    ("received", "packets_received"),
    ("server_received", "server_packets_received"),
)


def escape_label(value):
    """Escape backslash, double quote and newline inside a label value."""
    text = str(value)
    return text.replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")


def num(x, default=0.0):
    """Return x as a finite float, or default for None / str / NaN / Inf."""
    try:
        v = float(x)
    except (TypeError, ValueError):
        return default
    return v if math.isfinite(v) else default


def pct_ratio(x):
    """Convert a percent (0..100, possibly garbage) to a ratio clamped to 0..1."""
    return max(0.0, min(1.0, num(x) / 100.0))


def section(stats, key):
    """Return stats[key] when it is a dict, else an empty dict."""
    value = stats.get(key)
    return value if isinstance(value, dict) else {}


def render(stats, labels):
    """Build the measurement sample lines for one irtt "stats" dict.

    stats  -- the decoded "stats" object (must be a dict)
    labels -- pre-rendered label pairs shared by every sample
    Returns a list of lines without the success / timestamp samples.
    """
    rtt = section(stats, "rtt")
    lines = [
        f'wan_irtt_rtt_seconds{{{labels},stat="{k}"}} {num(rtt.get(k)) / NS_PER_SECOND}'
        for k in RTT_STATS
    ]
    for direction, key in JITTER_KEYS:
        jitter = num(section(stats, key).get("mean")) / NS_PER_SECOND
        lines.append(f'wan_irtt_jitter_seconds{{{labels},direction="{direction}"}} {jitter}')
    for direction, key in LOSS_KEYS:
        ratio = pct_ratio(stats.get(key))
        lines.append(f'wan_irtt_loss_ratio{{{labels},direction="{direction}"}} {ratio}')
    late = pct_ratio(stats.get("late_packets_percent"))
    lines.append(f'wan_irtt_late_ratio{{{labels}}} {late}')
    dup = pct_ratio(stats.get("duplicate_percent"))
    lines.append(f'wan_irtt_duplicate_ratio{{{labels}}} {dup}')
    for kind, key in PACKET_KEYS:
        count = int(num(stats.get(key)))
        lines.append(f'wan_irtt_packets{{{labels},kind="{kind}"}} {count}')
    return lines


def main(argv=None, stdin=None, stdout=None):
    """Read irtt JSON and write samples; returns the process exit code (0).

    argv   -- argument vector including the program name (default sys.argv)
    stdin  -- text stream holding the irtt JSON (default sys.stdin)
    stdout -- text stream receiving the samples (default sys.stdout)
    """
    argv = sys.argv if argv is None else argv
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout

    cond = argv[1] if len(argv) > 1 else "idle"
    target = argv[2] if len(argv) > 2 else "hetzner"
    labels = f'target="{escape_label(target)}",condition="{escape_label(cond)}"'
    ts = f"{time.time():.0f}"

    # Deliberately broad: empty input, invalid JSON or a non-object document
    # all mean "the run produced no stats" and are reported as success 0.
    try:
        stats = json.load(stdin).get("stats")
    except Exception:
        stats = None

    ok = isinstance(stats, dict)
    lines = render(stats, labels) if ok else []
    lines.append(f'wan_irtt_success{{{labels}}} {1 if ok else 0}')
    lines.append(f'wan_probe_last_run_timestamp_seconds{{probe="irtt",{labels}}} {ts}')
    stdout.write("\n".join(lines) + "\n")
    return 0


if __name__ == "__main__":
    sys.exit(main())
tput_to_prom.py: |
#!/usr/bin/env python3
"""iperf3 JSON (stdin) -> Prometheus sample lines (stdout).

Usage: tput_to_prom.py <direction> <target>

Reads end.sum_received.bits_per_second from the iperf3 report. A missing,
non-numeric, non-finite or negative value is reported as
wan_throughput_success 0 with no throughput sample. The timestamp sample is
always written, and the process always exits 0.
"""
import json
import math
import sys
import time


def escape_label(value):
    """Escape backslash, double quote and newline inside a label value."""
    text = str(value)
    return text.replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")


def read_bps(stream):
    """Return received bits/sec from an iperf3 JSON stream, or None if unusable."""
    # Deliberately broad: empty input, invalid JSON or a missing key all mean
    # "the throughput test failed".
    try:
        bps = json.load(stream)["end"]["sum_received"]["bits_per_second"]
    except Exception:
        return None
    if isinstance(bps, bool) or not isinstance(bps, (int, float)):
        return None
    if not math.isfinite(bps) or bps < 0:
        return None
    return bps


def main(argv=None, stdin=None, stdout=None):
    """Write throughput samples; returns the process exit code (0).

    argv   -- argument vector including the program name (default sys.argv)
    stdin  -- text stream holding the iperf3 JSON (default sys.stdin)
    stdout -- text stream receiving the samples (default sys.stdout)
    """
    argv = sys.argv if argv is None else argv
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout

    direction = argv[1] if len(argv) > 1 else "download"
    target = argv[2] if len(argv) > 2 else "hetzner"
    labels = f'target="{escape_label(target)}",direction="{escape_label(direction)}"'
    ts = f"{time.time():.0f}"

    lines = []
    bps = read_bps(stdin)
    if bps is None:
        lines.append(f'wan_throughput_success{{{labels}}} 0')
    else:
        lines.append(f'wan_throughput_bits_per_second{{{labels}}} {bps:.0f}')
        lines.append(f'wan_throughput_success{{{labels}}} 1')
    lines.append(f'wan_probe_last_run_timestamp_seconds{{probe="throughput",{labels}}} {ts}')
    stdout.write("\n".join(lines) + "\n")
    return 0


if __name__ == "__main__":
    sys.exit(main())
probe-loop.sh: |
#!/bin/sh
# probe-loop.sh — endless measurement loop for the wan-probe container.
#
# Every cycle runs an idle irtt measurement and republishes $SHARED/metrics.
# Every TPUT_EVERY seconds it also runs iperf3 (download, then upload) with a
# second irtt running concurrently to capture latency under load.
#
# Environment:
#   HETZNER_HOST   (required) measurement server host
#   IRTT_PORT / IPERF_PORT    server ports                 (2112 / 5201)
#   IRTT_INTERVAL             irtt send interval           (20ms)
#   IRTT_DURATION             idle irtt run length, s      (60)
#   TPUT_EVERY                seconds between throughput tests (900)
#   TPUT_TIME                 iperf3 seconds per direction (10)
#   IPERF_PARALLEL            iperf3 parallel streams      (4)
#   IRTT_TARGET / TPUT_TARGET value of the "target" label  (hetzner)
#   IRTT_HMAC                 optional irtt HMAC key
#   MIN_CYCLE                 minimum seconds per loop cycle (5)
set -u

SHARED=/shared
HDR=/scripts/metrics-header.prom
HETZNER="${HETZNER_HOST:?set HETZNER_HOST}"
IRTT_PORT="${IRTT_PORT:-2112}"
IPERF_PORT="${IPERF_PORT:-5201}"
IRTT_INTERVAL="${IRTT_INTERVAL:-20ms}"
IRTT_DURATION="${IRTT_DURATION:-60}"   # seconds (numeric, for timeout math)
TPUT_EVERY="${TPUT_EVERY:-900}"        # seconds between throughput tests
TPUT_TIME="${TPUT_TIME:-10}"           # iperf3 seconds per direction
IRTT_TARGET="${IRTT_TARGET:-hetzner}"
TPUT_TARGET="${TPUT_TARGET:-hetzner}"
# Floor on one cycle. A healthy irtt run blocks for ~IRTT_DURATION, so this
# only matters when irtt exits at once (DNS failure, bad option, missing
# binary); without it the loop would spin with no pause between attempts.
MIN_CYCLE="${MIN_CYCLE:-5}"

HMAC_OPT=""
[ -n "${IRTT_HMAC:-}" ] && HMAC_OPT="--hmac=${IRTT_HMAC}"

mkdir -p "$SHARED"
: > "$SHARED/.irtt.prom"; : > "$SHARED/.irttload.prom"; : > "$SHARED/.tput.prom"
cp "$HDR" "$SHARED/metrics"   # serve header immediately so first scrapes don't 404

# Concatenate fragments into the served file via temp + atomic rename.
assemble() {
  cat "$HDR" "$SHARED/.irtt.prom" "$SHARED/.irttload.prom" "$SHARED/.tput.prom" \
    > "$SHARED/.metrics.tmp" 2>/dev/null
  mv "$SHARED/.metrics.tmp" "$SHARED/metrics"
}

# Each fragment is written to <file>.tmp then renamed, so assemble() never
# cats a partially written file (the cause of the impossible loss spikes).
run_irtt() {   # $1 condition  $2 outfile  $3 duration(seconds)
  # $HMAC_OPT is intentionally unquoted: when empty it must expand to no
  # argument at all. The timeout adds 25 s of slack on top of the run length.
  timeout "$(( $3 + 25 ))" irtt client -i "$IRTT_INTERVAL" -d "${3}s" -q $HMAC_OPT \
    -o - "${HETZNER}:${IRTT_PORT}" 2>/dev/null \
    | python3 /scripts/irtt_to_prom.py "$1" "$IRTT_TARGET" > "$2.tmp"
  mv "$2.tmp" "$2"
}

# Download (-R) then upload; both results land in one fragment that is renamed
# into place only after the second test has finished.
run_tput() {
  P="${IPERF_PARALLEL:-4}"   # parallel streams: a single stream can't fill the pipe over the RTT
  TO="$(( TPUT_TIME + 20 ))"
  TMP="$SHARED/.tput.prom.partial"
  : > "$TMP"
  timeout "$TO" iperf3 -c "$HETZNER" -p "$IPERF_PORT" -t "$TPUT_TIME" -P "$P" --connect-timeout 5000 -R -J 2>/dev/null \
    | python3 /scripts/tput_to_prom.py download "$TPUT_TARGET" > "$TMP"
  timeout "$TO" iperf3 -c "$HETZNER" -p "$IPERF_PORT" -t "$TPUT_TIME" -P "$P" --connect-timeout 5000 -J 2>/dev/null \
    | python3 /scripts/tput_to_prom.py upload "$TPUT_TARGET" >> "$TMP"
  mv "$TMP" "$SHARED/.tput.prom"
}

last_tput=0
while true; do
  cycle_start=$(date +%s)
  run_irtt idle "$SHARED/.irtt.prom" "$IRTT_DURATION"   # blocks ~IRTT_DURATION = loop cadence
  assemble
  now=$(date +%s)
  if [ $(( now - last_tput )) -ge "$TPUT_EVERY" ]; then
    LOAD_DUR=$(( 2 * TPUT_TIME + 4 ))
    run_irtt under_load "$SHARED/.irttload.prom" "$LOAD_DUR" &   # concurrent = bufferbloat
    LOADPID=$!
    run_tput
    wait "$LOADPID" 2>/dev/null
    last_tput="$now"
    assemble
  fi
  # Back off when the whole cycle finished suspiciously fast (see MIN_CYCLE).
  elapsed=$(( $(date +%s) - cycle_start ))
  if [ "$elapsed" -lt "$MIN_CYCLE" ]; then
    sleep "$(( MIN_CYCLE - elapsed ))"
  fi
done
---
# Single-replica pod with three cooperating containers. wan-probe writes
# metric fragments into the "shared" emptyDir and metrics-http serves that
# directory; blackbox is an independent blackbox-exporter instance.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: wan-monitor
  namespace: admin-system
  labels:
    app: wan-monitor
spec:
  replicas: 1
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: wan-monitor
  template:
    metadata:
      labels:
        app: wan-monitor
      annotations:
        # version-checker: one enable flag and one tag pattern per container.
        enable.version-checker.io/blackbox: "true"
        enable.version-checker.io/metrics-http: "true"
        enable.version-checker.io/wan-probe: "true"
        match-regex.version-checker.io/blackbox: "^v[0-9]+\\.[0-9]+\\.[0-9]+$"
        match-regex.version-checker.io/metrics-http: "^[0-9]+\\.[0-9]+\\.[0-9]+$"
        match-regex.version-checker.io/wan-probe: "^[0-9]+\\.[0-9]+\\.[0-9]+$"
    spec:
      enableServiceLinks: false
      containers:
        # --- blackbox-exporter: HTTP / ICMP / DNS probes on :9115 ---
        - name: blackbox
          image: quay.io/prometheus/blackbox-exporter:v0.28.0
          args:
            - --config.file=/etc/blackbox/blackbox.yml
            - --web.listen-address=:9115
          ports:
            - name: blackbox
              containerPort: 9115
          securityContext:
            capabilities:
              add:
                - "NET_RAW"   # required for the ICMP prober
          resources:
            requests:
              cpu: 10m
              memory: 32Mi
            limits:
              memory: 64Mi
          volumeMounts:
            - name: blackbox-config
              mountPath: /etc/blackbox
              readOnly: true

        # --- wan-probe: irtt + iperf3 loop writing fragments to /shared ---
        - name: wan-probe
          # Build + push from Dockerfile.wan-probe (adjust registry/tag to taste)
          image: gitea.dooplex.hu/admin/wan-probe:0.1.0
          command:
            - "/bin/sh"
            - "/scripts/probe-loop.sh"
          env:
            # MUST be the Hetzner origin: a DNS-only (grey-cloud) record or raw IP.
            # NOT the Cloudflare-proxied jarrs.eu — CF only forwards HTTP/HTTPS, so
            # UDP 2112 (irtt) / TCP 5201 (iperf3) never reach the origin behind it.
            - name: HETZNER_HOST
              value: "metrics.jarrs.eu"   # DNS-only A record -> Hetzner IPv4
            - name: IRTT_PORT
              value: "2112"
            - name: IPERF_PORT
              value: "5201"
            - name: IRTT_INTERVAL
              value: "20ms"
            - name: IRTT_DURATION
              value: "60"    # seconds (numeric)
            - name: TPUT_EVERY
              value: "900"   # 15 min
            - name: TPUT_TIME
              value: "10"
            - name: IPERF_PARALLEL
              value: "4"
            # Shared irtt key, read from a Secret; optional, so the pod still
            # starts when the Secret does not exist.
            - name: IRTT_HMAC
              valueFrom:
                secretKeyRef:
                  name: wan-monitor-irtt
                  key: hmac
                  optional: true
          resources:
            requests:
              cpu: 20m
              memory: 48Mi
            limits:
              memory: 96Mi
          volumeMounts:
            - name: scripts
              mountPath: /scripts
              readOnly: true
            - name: shared
              mountPath: /shared

        # --- metrics-http: serves /shared (incl. /metrics) on :9116 ---
        - name: metrics-http
          image: busybox:1.36
          command:
            - "httpd"
            - "-f"
            - "-v"
            - "-p"
            - "9116"
            - "-h"
            - "/shared"
          ports:
            - name: metrics
              containerPort: 9116
          resources:
            requests:
              cpu: 5m
              memory: 8Mi
            limits:
              memory: 24Mi
          volumeMounts:
            - name: shared
              mountPath: /shared
              readOnly: true
      volumes:
        - name: blackbox-config
          configMap:
            name: wan-monitor-blackbox
        - name: scripts
          configMap:
            name: wan-monitor-scripts
        # Scratch space shared between wan-probe (writer) and metrics-http (reader).
        - name: shared
          emptyDir: {}
---
# Cluster-internal Service exposing both scrape endpoints of the wan-monitor
# pod: 9115 (blackbox-exporter) and 9116 (textfile metrics over httpd).
apiVersion: v1
kind: Service
metadata:
  name: wan-monitor
  namespace: admin-system
  labels:
    app: wan-monitor
spec:
  type: ClusterIP
  selector:
    app: wan-monitor
  ports:
    # blackbox-exporter /probe endpoint
    - name: blackbox
      port: 9115
      targetPort: 9115
    # busybox httpd serving /metrics
    - name: metrics
      port: 9116
      targetPort: 9116
+70 -1
View File
@@ -237,6 +237,75 @@ data:
regex: 'ak-outpost-(.*)-outpost' regex: 'ak-outpost-(.*)-outpost'
replacement: '$1' replacement: '$1'
# --- end-to-end latency + loss (ICMP) to many destinations ---
# Scraped through the blackbox exporter: each listed target becomes the
# ?target= parameter and the "instance" label, while the scrape itself is sent
# to the exporter's Service address.
- job_name: 'wan-icmp'
  metrics_path: /probe
  params:
    module: [icmp]
  scrape_interval: 15s
  static_configs:
    - targets:
        - 8.8.8.8
        - 1.1.1.1
        - jarrs.eu                  # Cloudflare-proxied name: measures the CF edge, not the origin
        - metrics.jarrs.eu          # DNS-only record -> Hetzner origin (same host irtt/iperf3 use)
        - telex.hu
        - store.steampowered.com
        - 192.168.0.1               # gateway
        - 37.191.56.193             # your public IP (update if it changes)
  relabel_configs:
    - source_labels: [__address__]
      target_label: __param_target
    - source_labels: [__param_target]
      target_label: instance
    - target_label: __address__
      replacement: wan-monitor.admin-system:9115
# --- HTTP phase breakdown (dns/connect/tls/processing/transfer) ---
# Blackbox http_2xx probes; the relabel rules turn each URL into the probe
# target and the "instance" label, then redirect the scrape to the exporter.
- job_name: 'wan-http'
  scrape_interval: 30s
  metrics_path: /probe
  params:
    module:
      - http_2xx
  static_configs:
    - targets:
        - https://telex.hu
        - https://store.steampowered.com
        - https://jarrs.eu
  relabel_configs:
    - target_label: __param_target
      source_labels:
        - __address__
    - target_label: instance
      source_labels:
        - __param_target
    - target_label: __address__
      replacement: wan-monitor.admin-system:9115
# --- DNS resolution time per resolver (Pi-hole vs public) ---
# Each target is a resolver address queried by the blackbox dns_udp module;
# the relabel rules pass it as the probe target and keep it as "instance".
- job_name: 'wan-dns'
  scrape_interval: 30s
  metrics_path: /probe
  params:
    module:
      - dns_udp
  static_configs:
    - targets:
        - 192.168.0.250   # Pi-hole
        - 1.1.1.1
        - 8.8.8.8
  relabel_configs:
    - target_label: __param_target
      source_labels:
        - __address__
    - target_label: instance
      source_labels:
        - __param_target
    - target_label: __address__
      replacement: wan-monitor.admin-system:9115
# --- irtt (UDP quality) + iperf3 (throughput) textfile metrics ---
# Direct scrape of the file served on :9116. fallback_scrape_protocol names
# the format Prometheus assumes when the response's Content-Type is missing or
# unusable.
- job_name: 'wan-probe'
  scrape_interval: 30s
  metrics_path: /metrics
  fallback_scrape_protocol: PrometheusText0.0.4
  static_configs:
    - targets:
        - 'wan-monitor.admin-system:9116'
# CloudNativePG - Postgres metrics per instance # CloudNativePG - Postgres metrics per instance
- job_name: 'cloudnativepg' - job_name: 'cloudnativepg'
kubernetes_sd_configs: kubernetes_sd_configs:
@@ -621,7 +690,7 @@ spec:
memory: 128Mi memory: 128Mi
limits: limits:
cpu: 500m cpu: 500m
memory: 256Mi memory: 768Mi
volumeMounts: volumeMounts:
- name: data - name: data
mountPath: /var/lib/grafana mountPath: /var/lib/grafana
+102 -1
View File
@@ -13,7 +13,7 @@ metadata:
labels: labels:
app: prometheus app: prometheus
data: data:
authentik-alerts.yml: | authentik-alerts.yml: |
groups: groups:
- name: authentik-availability - name: authentik-availability
@@ -210,3 +210,104 @@ data:
annotations: annotations:
summary: "Longhorn node {{ $labels.node }} storage pressure" summary: "Longhorn node {{ $labels.node }} storage pressure"
description: "Node {{ $labels.node }} disk usage is at {{ printf \"%.1f\" $value }}%." description: "Node {{ $labels.node }} disk usage is at {{ printf \"%.1f\" $value }}%."
# WAN quality alert rules: a data key (wan-alerts.yml) of the existing
# prometheus-rules ConfigMap (mon-system). Thresholds anchored to One.hu's
# "normal conditions" figures: 700 Mbit/s down / 28 Mbit/s up.
# Throughput is sampled every ~15 min, so `for:` spans >=2 samples to avoid
# firing on a single fluke. Recalibrate floors after a week of baseline data.
# NOTE: uses Prometheus template funcs (humanize/humanizePercentage/humanizeDuration);
# mul/div are NOT valid Prometheus template functions.
wan-alerts.yml: |
  groups:
    - name: wan-quality-alerts
      rules:
        # --- upstream loss: the prime suspect for dropped calls / WireGuard ---
        - alert: WanUpstreamPacketLoss
          expr: wan_irtt_loss_ratio{direction="upstream",condition="idle"} > 0.01
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: "WAN upstream packet loss to {{ $labels.target }}"
            description: "irtt upstream loss {{ $value | humanizePercentage }} (>1%) for 2m. Cable-upstream symptom; capture for ISP."
        - alert: WanDownstreamPacketLoss
          expr: wan_irtt_loss_ratio{direction="downstream",condition="idle"} > 0.01
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: "WAN downstream packet loss to {{ $labels.target }}"
            description: "irtt downstream loss {{ $value | humanizePercentage }} (>1%) for 2m."

        # --- latency / jitter ---
        - alert: WanLatencyHigh
          expr: wan_irtt_rtt_seconds{stat="max",condition="idle"} > 0.08
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "WAN RTT spikes to {{ $labels.target }}"
            description: "irtt max RTT {{ $value | humanizeDuration }} (>80 ms) for 5m (idle). Real-time apps will feel this."
        - alert: WanJitterHigh
          expr: wan_irtt_jitter_seconds{direction="round_trip",condition="idle"} > 0.03
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "WAN jitter high to {{ $labels.target }}"
            description: "Round-trip jitter {{ $value | humanizeDuration }} (>30 ms) for 5m. Degrades VoIP/video."

        # --- bufferbloat: latency added while the line is saturated ---
        # on(target) pairs the under_load series with the idle series of the
        # same target; the result carries only the target label.
        - alert: WanBufferbloat
          expr: |
            (
              wan_irtt_rtt_seconds{stat="mean",condition="under_load"}
              - on(target) wan_irtt_rtt_seconds{stat="mean",condition="idle"}
            ) > 0.1
          for: 0m
          labels:
            severity: info
          annotations:
            summary: "WAN bufferbloat on {{ $labels.target }}"
            description: "RTT rises {{ $value | humanizeDuration }} under load (>100 ms). Line buckles when saturated."

        # --- throughput vs One.hu "normal" 700/28 (alert below 50%) ---
        - alert: WanDownloadDegraded
          expr: wan_throughput_bits_per_second{direction="download"} < 350e6
          for: 20m
          labels:
            severity: warning
          annotations:
            summary: "WAN download below half of plan"
            description: "Download {{ $value | humanize }}bit/s (< 350M, half of 700 normal) for 20m."
        - alert: WanUploadDegraded
          expr: wan_throughput_bits_per_second{direction="upload"} < 14e6
          for: 20m
          labels:
            severity: warning
          annotations:
            summary: "WAN upload below half of plan"
            description: "Upload {{ $value | humanize }}bit/s (< 14M, half of 28 normal) for 20m."

        # --- the monitor itself stopped producing data ---
        # Staleness limits are per probe because the cadences differ: irtt runs
        # about once a minute, throughput only every 900 s. A single 300 s
        # limit would keep this alert firing for the throughput probe between
        # runs. 2100 s = two missed throughput runs plus 5 min of slack.
        - alert: WanProbeStalled
          expr: |
            (time() - max by(probe) (wan_probe_last_run_timestamp_seconds{probe="irtt"}) > 300)
            or
            (time() - max by(probe) (wan_probe_last_run_timestamp_seconds{probe="throughput"}) > 2100)
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: "WAN probe '{{ $labels.probe }}' stalled"
            description: "Last '{{ $labels.probe }}' run was {{ $value | humanizeDuration }} ago (limits: irtt 5 min, throughput 35 min). Check the wan-monitor pod / Hetzner endpoint."
        # When the scrape itself fails the timestamp series go stale and the
        # rule above returns nothing, so watch the scrape target directly.
        - alert: WanProbeScrapeDown
          expr: up{job="wan-probe"} == 0
          for: 3m
          labels:
            severity: warning
          annotations:
            summary: "WAN probe metrics endpoint {{ $labels.instance }} down"
            description: "Prometheus could not scrape {{ $labels.instance }} (job wan-probe) for 3m. Check the wan-monitor pod."
        - alert: WanBlackboxTargetDown
          expr: probe_success{job=~"wan-.*"} == 0
          for: 3m
          labels:
            severity: warning
          annotations:
            summary: "WAN probe to {{ $labels.instance }} failing"
            description: "blackbox {{ $labels.job }} to {{ $labels.instance }} unreachable for 3m."