238 lines
8.5 KiB
YAML
238 lines
8.5 KiB
YAML
---
# Deployment for the Időkép scraper API.
# The application source is mounted from the idokep-scraper-app ConfigMap at
# /app; its Python dependencies are installed with pip on container start.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: idokep-scraper
  namespace: glance-system
spec:
  replicas: 1
  selector:
    matchLabels:
      app: idokep-scraper
  template:
    metadata:
      labels:
        app: idokep-scraper
    spec:
      containers:
        - name: idokep-scraper
          image: python:3.12-slim
          imagePullPolicy: IfNotPresent
          env:
            - name: IDOKEP_URL
              value: "https://www.idokep.hu/idojaras/Budapest%20VII.%20ker"
            - name: PLACE_NAME
              value: "Budapest VII. ker"
          ports:
            - containerPort: 8000
          command: ["/bin/sh", "-lc"]
          args:
            # `exec` replaces the shell with the Python process so uvicorn
            # receives SIGTERM directly and can shut down on pod termination.
            - |
              pip install --no-cache-dir fastapi uvicorn requests beautifulsoup4 prometheus-client &&
              exec python -c "import uvicorn; uvicorn.run('app:APP', host='0.0.0.0', port=8000)"
          # The port only opens after `pip install` has finished; without a
          # readiness probe the pod would receive Service traffic before then.
          # /metrics is used because it does not trigger an upstream scrape.
          readinessProbe:
            httpGet:
              path: /metrics
              port: 8000
            periodSeconds: 10
            failureThreshold: 3
          volumeMounts:
            - name: app
              mountPath: /app
          workingDir: /app
      volumes:
        - name: app
          configMap:
            name: idokep-scraper-app
|
|
---
|
|
# ConfigMap holding the scraper's Python source; the Deployment mounts it
# as a volume at /app so the container can import it as module `app`.
apiVersion: v1
kind: ConfigMap
metadata:
  namespace: glance-system
  name: idokep-scraper-app
data:
  # Literal block scalar: the file content follows, newlines preserved.
  app.py: |
|
|
import os
|
|
import time
|
|
import re
|
|
from typing import List, Dict, Any, Optional
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
from fastapi import FastAPI, Response
|
|
from prometheus_client import Counter, Histogram, Gauge, generate_latest, CONTENT_TYPE_LATEST
|
|
|
|
# ASGI application object that the route decorators below attach to.
APP = FastAPI()

# Scrape target and display name; each can be overridden via the environment.
IDOKEP_URL = os.environ.get(
    "IDOKEP_URL", "https://www.idokep.hu/idojaras/Budapest%20VIII.%20ker"
)
PLACE_NAME = os.environ.get("PLACE_NAME", "Budapest VIII. ker")
SOURCE_NAME = "Időkép"

# User-Agent header sent with the upstream request (override with USER_AGENT).
UA = os.environ.get(
    "USER_AGENT",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome Safari",
)

# Prometheus metrics (optional).
# Scrape bookkeeping: outcome counter and duration histogram.
SCRAPES = Counter(
    "idokep_scrapes_total",
    "Total Időkép scrapes",
    labelnames=["place", "status"],
)
SCRAPE_SECONDS = Histogram(
    "idokep_scrape_seconds",
    "Időkép scrape duration in seconds",
    labelnames=["place"],
)

# Most recently scraped temperatures, exported as gauges.
CURRENT_TEMP = Gauge(
    "idokep_current_temp_c",
    "Current temperature in Celsius",
    labelnames=["place"],
)
DAILY_TMIN = Gauge(
    "idokep_daily_tmin_c",
    "Daily minimum temperature in Celsius",
    labelnames=["place", "dow"],
)
DAILY_TMAX = Gauge(
    "idokep_daily_tmax_c",
    "Daily maximum temperature in Celsius",
    labelnames=["place", "dow"],
)
HOURLY_TEMP = Gauge(
    "idokep_hourly_temp_c",
    "Hourly temperature in Celsius",
    labelnames=["place", "time"],
)
|
|
|
|
|
|
def _abs_url(maybe_relative: Optional[str]) -> Optional[str]:
|
|
if not maybe_relative:
|
|
return None
|
|
if maybe_relative.startswith("http://") or maybe_relative.startswith("https://"):
|
|
return maybe_relative
|
|
# Időkép uses /assets/... paths
|
|
return "https://www.idokep.hu" + maybe_relative
|
|
|
|
|
|
def _to_int_temp(s: str) -> Optional[float]:
|
|
if not s:
|
|
return None
|
|
s = s.strip().replace("˚C", "").replace("°C", "").replace("°", "")
|
|
try:
|
|
return float(s)
|
|
except Exception:
|
|
return None
|
|
|
|
|
|
def _text_of(el) -> str:
    """Return the stripped text of a parsed element, or "" when it is missing."""
    return el.get_text(strip=True) if el else ""


def _src_of(el) -> Optional[str]:
    """Return the absolute URL of an element's ``src`` attribute, or None when missing."""
    return _abs_url(el.get("src") if el else None)


def _parse_current(soup) -> Dict[str, Any]:
    """Extract the current temperature, condition text and icon from the page."""
    return {
        "temp_c": _to_int_temp(_text_of(soup.select_one(".current-temperature"))),
        "condition": _text_of(soup.select_one(".current-weather")),
        "icon_url": _src_of(soup.select_one(".forecast-bigicon")),
    }


def _parse_hourly(soup, limit: int = 8) -> List[Dict[str, Any]]:
    """Extract up to ``limit`` hourly cards (.ik.hourly-forecast-card)."""
    rows: List[Dict[str, Any]] = []
    for card in soup.select(".ik.hourly-forecast-card")[:limit]:
        hour = _text_of(card.select_one(".ik.hourly-forecast-hour"))
        temp = _to_int_temp(_text_of(card.select_one(".ik.temperature-circled")))
        icon = _src_of(card.select_one("img.ik.forecast-icon"))

        # Skip cards that lack either the hour label or a parseable temperature.
        if hour and temp is not None:
            rows.append(
                {
                    "time": hour,  # e.g. "18:00"
                    "temp_c": temp,  # e.g. -2
                    "icon_url": icon,  # absolute URL
                }
            )
    return rows


def _parse_daily(soup, scan_limit: int = 15, keep: int = 5) -> List[Dict[str, Any]]:
    """Extract daily columns, scanning ``scan_limit`` columns and keeping the first ``keep`` valid ones."""
    rows: List[Dict[str, Any]] = []
    columns = soup.select(".ik.daily-forecast-container .ik.dailyForecastCol")
    for col in columns[:scan_limit]:
        daynum = _text_of(col.select_one(".ik.dfDayNum"))
        dow = _text_of(col.select_one(".ik.dfDay"))
        icon = _src_of(col.select_one("img.ik.forecast-icon"))

        # Normal structure (most days).
        tmax = _to_int_temp(_text_of(col.select_one("div.ik.max")))
        tmin = _to_int_temp(_text_of(col.select_one("div.ik.min")))

        # Fallback structure (e.g. "vacation" days) where div.ik.max/min are
        # missing: take the first two integer-looking <a> texts inside
        # .ik.min-max-container, in the order max, min.
        if tmax is None or tmin is None:
            numeric = [
                txt
                for txt in (
                    a.get_text(strip=True)
                    for a in col.select(".ik.min-max-container a")
                )
                if re.fullmatch(r"-?\d+", txt or "")
            ]
            if len(numeric) >= 2:
                tmax = _to_int_temp(numeric[0])
                tmin = _to_int_temp(numeric[1])

        # Keep only rows that look valid.
        if dow and (tmin is not None) and (tmax is not None):
            rows.append(
                {
                    "daynum": daynum,
                    "dow": dow,  # e.g. "Cs", "P", "Sz"
                    "tmin_c": tmin,
                    "tmax_c": tmax,
                    "icon_url": icon,
                }
            )

    # The widget shows a fixed number of days: the first valid columns.
    return rows[:keep]


def scrape() -> Dict[str, Any]:
    """Fetch the Időkép page at ``IDOKEP_URL`` and extract the forecast data.

    Returns:
        A dict with the keys ``source``, ``location``, ``current``,
        ``hourly`` (at most 8 entries), ``daily`` (at most 5 entries) and
        ``fetched_at_unix`` (integer Unix timestamp).

    Raises:
        requests.RequestException: if the request fails or times out, or if
            the response status is an error (via ``raise_for_status``).
    """
    response = requests.get(IDOKEP_URL, headers={"User-Agent": UA}, timeout=15)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    return {
        "source": {"name": SOURCE_NAME, "url": IDOKEP_URL},
        "location": {"name": PLACE_NAME},
        "current": _parse_current(soup),
        "hourly": _parse_hourly(soup),
        "daily": _parse_daily(soup),
        "fetched_at_unix": int(time.time()),
    }
|
|
|
|
|
|
@APP.get("/api")
def api():
    """Scrape the forecast, refresh the Prometheus metrics and return it as JSON.

    The scrape is timed with ``SCRAPE_SECONDS`` and counted in ``SCRAPES``
    under status ``ok`` or ``error``. A scrape failure is re-raised unchanged
    after being counted. Updating the gauges is best-effort: a failure there
    is logged and never fails the request.

    Returns:
        A ``Response`` whose body is the JSON-encoded scrape result.
    """
    # Function-scope imports: these names are not imported at file level.
    import json
    import logging

    with SCRAPE_SECONDS.labels(place=PLACE_NAME).time():
        try:
            data = scrape()
        except Exception:
            SCRAPES.labels(place=PLACE_NAME, status="error").inc()
            raise

    SCRAPES.labels(place=PLACE_NAME, status="ok").inc()

    # Update Prometheus gauges (best-effort).
    try:
        current_temp = data.get("current", {}).get("temp_c")
        if current_temp is not None:
            CURRENT_TEMP.labels(place=PLACE_NAME).set(float(current_temp))

        # Drop label series from earlier scrapes first: otherwise a weekday
        # or hour that is no longer in the forecast keeps exporting its old
        # value indefinitely.
        DAILY_TMIN.clear()
        DAILY_TMAX.clear()
        for day in data.get("daily", []):
            DAILY_TMIN.labels(place=PLACE_NAME, dow=day["dow"]).set(float(day["tmin_c"]))
            DAILY_TMAX.labels(place=PLACE_NAME, dow=day["dow"]).set(float(day["tmax_c"]))

        HOURLY_TEMP.clear()
        for hour in data.get("hourly", []):
            HOURLY_TEMP.labels(place=PLACE_NAME, time=hour["time"]).set(float(hour["temp_c"]))
    except Exception:
        # Still best-effort, but leave a trace instead of failing silently.
        logging.getLogger(__name__).exception("Failed to update Prometheus gauges")

    # IMPORTANT: force JSON content-type so Glance exposes `.JSON`
    return Response(
        content=json.dumps(data, ensure_ascii=False),
        media_type="application/json; charset=utf-8",
    )
|
|
|
|
|
|
@APP.get("/metrics")
def metrics():
    """Serve the current Prometheus metrics in the text exposition format."""
    payload = generate_latest()
    return Response(content=payload, media_type=CONTENT_TYPE_LATEST)
|
|
---
|
|
# Service exposing port 8000 of the pods labelled `app: idokep-scraper`.
apiVersion: v1
kind: Service
metadata:
  namespace: glance-system
  name: idokep-scraper
spec:
  # Must match the pod template labels of the idokep-scraper Deployment.
  selector:
    app: idokep-scraper
  ports:
    - name: http
      # Service port and container port are the same number.
      targetPort: 8000
      port: 8000