"""Recipe scraper — parses Hungarian recipe sites into a structured dict.

Each supported site has a parser registered via _PARSERS.
Unsupported sites fall back to generic schema.org / og-tag extraction.
"""

import json
import re
from collections.abc import Callable

import requests
from bs4 import BeautifulSoup

_HEADERS = {
    "User-Agent": "RecipeImporter/1.0 (Hungarian recipe scraper)",
    "Accept-Language": "hu-HU,hu;q=0.9,en;q=0.5",
}

# Call signature shared by every site parser: (soup, url) -> recipe dict.
_Parser = Callable[[BeautifulSoup, str], dict]

# Maps a substring of the hostname to a parser function.
# Order matters: first match wins.
_PARSERS: list[tuple[str, _Parser]] = []


def _register(host_substring: str):
    """Decorator: register a parser for URLs whose hostname contains *host_substring*.

    The decorated function is appended to ``_PARSERS`` at import time and is
    returned unchanged, so it stays callable under its own name.
    """

    def decorator(fn):
        _PARSERS.append((host_substring, fn))
        return fn

    return decorator


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------


def scrape(url: str) -> dict:
    """Fetch *url* and return a recipe dict.

    The first parser in ``_PARSERS`` whose host substring occurs in the URL's
    host is used. If no parser matches, or the matching parser returns
    ``None``, generic schema.org / og-tag extraction is used instead.

    Returns::

        {
            "title": str,
            "description": str,
            "image_url": str | None,
            "ingredients": [{"quantity": str, "unit": str, "food": str, "extra": str}, ...],
            "instructions": [str, ...],
            "tags": [str, ...],
            "original_url": str,
        }

    Raises:
        requests.HTTPError: if the server responds with an error status.
        ValueError: documented contract for pages that cannot be parsed; it is
            not raised in this function itself.
            NOTE(review): confirm against the parser implementations.
    """
    resp = requests.get(url, headers=_HEADERS, timeout=30)
    resp.raise_for_status()
    # Trust the detected encoding over the declared one; fall back to UTF-8
    # when detection yields nothing.
    resp.encoding = resp.apparent_encoding or "utf-8"
    soup = BeautifulSoup(resp.text, "lxml")

    host = _host(url)
    result = None
    for substring, parser in _PARSERS:
        if substring in host:
            result = parser(soup, url)
            break

    if result is None:
        # Fallback: try generic schema.org / og-tag extraction
        result = _parse_generic(soup, url)

    # Post-process: extract parenthesized comments from food into extra
    _extract_ingredient_comments(result)

    # Strip trailing "recept*" from title (e.g. "receptje", "recept").
    # The title is stripped first so trailing whitespace cannot stop the
    # end-anchored pattern from matching.
    title = result.get("title", "")
    if title:
        result["title"] = re.sub(
            r"\s+recept\w*$", "", title.strip(), flags=re.IGNORECASE
        ).strip()

    return result


def supported_sites() -> list[dict]:
    """Return list of supported sites with name and URL.

    One entry is produced per registered parser, in registration order. The
    name is the registered host substring plus ``".hu"``; the URL is ``"#"``
    when the substring has no entry in the lookup table below.
    """
    _SITE_URLS = {
        "mindmegette": "https://www.mindmegette.hu",
        "streetkitchen": "https://streetkitchen.hu",
        "nosalty": "https://www.nosalty.hu",
        "sobors": "https://sobors.hu",
        "kiskegyed": "https://www.kiskegyed.hu",
        "gastrohobbi": "https://gastrohobbi.hu",
    }
    return [{"name": s + ".hu", "url": _SITE_URLS.get(s, "#")} for s, _ in _PARSERS]


# ---------------------------------------------------------------------------
# mindmegette.hu
# ---------------------------------------------------------------------------


@_register("mindmegette")
def _parse_mindmegette(soup: BeautifulSoup, url: str) -> dict:
    # Prefer h1 (clean meal name) over og:title (often has "receptje" suffix)
    h1 = soup.find("h1")
    title = _text(h1) if h1 else ""
    if not title:
        title = _og(soup, "og:title") or _text(soup.find("title"))
    # Strip " | Mindmegette.hu" or " - Mindmegette.hu" suffix
    if title:
        title = re.sub(r"\s*[-–|]\s*Mindmegette\.hu$", "", title).strip()

    description = _og(soup, "og:description") or ""
    image_url = _og(soup, "og:image")

    # --- Ingredients ---
    # Format A (regular /recept/ pages): div.ingredients containers with structured rows
    # Format B (alt /alapetelek/ pages): h3 "Hozzávalók" →