"""Recipe scraper — parses Hungarian recipe sites into a structured dict.
Currently supported: mindmegette.hu
"""
import re
import requests
from bs4 import BeautifulSoup
_HEADERS = {
"User-Agent": "RecipeImporter/1.0 (Hungarian recipe scraper)",
"Accept-Language": "hu-HU,hu;q=0.9,en;q=0.5",
}
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def scrape(url: str) -> dict:
    """Download the page at *url* and extract a recipe from it.

    The page is fetched with the module's default headers and a 30 second
    timeout, parsed with BeautifulSoup's ``lxml`` backend, and handed to a
    site-specific parser chosen from the URL's host. Hosts containing
    ``"mindmegette"`` use the dedicated parser; every other host goes through
    the generic fallback parser.

    The result is expected to have this shape::

        {
            "title": str,
            "description": str,
            "image_url": str | None,
            "ingredients": [str, ...],
            "instructions": [str, ...],
            "original_url": str,
        }

    Raises:
        requests.HTTPError: if the server answers with an error status
            (via ``raise_for_status``).
        ValueError: documented as raised on unsupported sites or parse
            failures; this would come from the parsers, which are not
            verified here -- TODO confirm.
    """
    response = requests.get(url, headers=_HEADERS, timeout=30)
    response.raise_for_status()

    # Decode the body with the charset detected from its content, falling
    # back to UTF-8 when detection yields nothing.
    response.encoding = response.apparent_encoding or "utf-8"
    document = BeautifulSoup(response.text, "lxml")

    # Dedicated parser for mindmegette; otherwise the generic fallback parser.
    is_mindmegette = "mindmegette" in _host(url)
    parser = _parse_mindmegette if is_mindmegette else _parse_generic
    return parser(document, url)
# ---------------------------------------------------------------------------
# mindmegette.hu
# ---------------------------------------------------------------------------
def _parse_mindmegette(soup: BeautifulSoup, url: str) -> dict:
title = _og(soup, "og:title") or _text(soup.find("title"))
# Strip " | Mindmegette.hu" suffix
if title:
title = re.sub(r"\s*\|\s*Mindmegette\.hu$", "", title).strip()
description = _og(soup, "og:description") or ""
image_url = _og(soup, "og:image")
# --- Ingredients ---
ingredients = []
ing_container = soup.find("div", class_="ingredients")
if ing_container:
for row in ing_container.find_all("div", class_="ingredients-meta"):
parts = []
# Actual HTML: <strong>qty</strong> <span>unit</span>
# <a class="ingredients-link">name</a>
qty_el = row.find("strong")
# Unit: first plain <span> (i.e. one without a class attribute, not one
# with a specific class like "ingredients-checkbox" etc.)
unit_el = None
for sp in row.find_all("span"):
if not sp.get("class"):
unit_el = sp
break
name_el = row.find("a", class_="ingredients-link")
# Extra info: (darált) or